epub-translator 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. epub_translator/__init__.py +3 -2
  2. epub_translator/data/format.jinja +33 -0
  3. epub_translator/data/translate.jinja +15 -0
  4. epub_translator/epub/__init__.py +2 -3
  5. epub_translator/epub/content_parser.py +2 -2
  6. epub_translator/epub/html/__init__.py +1 -1
  7. epub_translator/epub/html/file.py +56 -41
  8. epub_translator/epub/html/texts_searcher.py +2 -1
  9. epub_translator/llm/__init__.py +1 -0
  10. epub_translator/llm/error.py +49 -0
  11. epub_translator/llm/executor.py +147 -0
  12. epub_translator/llm/increasable.py +35 -0
  13. epub_translator/llm/node.py +197 -0
  14. epub_translator/template.py +50 -0
  15. epub_translator/translation/__init__.py +2 -0
  16. epub_translator/translation/chunk.py +120 -0
  17. epub_translator/translation/splitter.py +77 -0
  18. epub_translator/translation/store.py +37 -0
  19. epub_translator/translation/translation.py +192 -0
  20. epub_translator/translation/types.py +23 -0
  21. epub_translator/translation/utils.py +11 -0
  22. epub_translator/translator.py +169 -0
  23. epub_translator/xml/__init__.py +3 -0
  24. epub_translator/xml/decoder.py +71 -0
  25. epub_translator/xml/encoder.py +95 -0
  26. epub_translator/xml/parser.py +172 -0
  27. epub_translator/xml/tag.py +93 -0
  28. epub_translator/xml/transform.py +34 -0
  29. epub_translator/xml/utils.py +12 -0
  30. epub_translator/zip_context.py +74 -0
  31. {epub_translator-0.0.1.dist-info → epub_translator-0.0.3.dist-info}/METADATA +5 -7
  32. epub_translator-0.0.3.dist-info/RECORD +36 -0
  33. epub_translator/epub/types.py +0 -4
  34. epub_translator/file.py +0 -124
  35. epub_translator/translator/__init__.py +0 -1
  36. epub_translator/translator/group.py +0 -140
  37. epub_translator/translator/llm.py +0 -58
  38. epub_translator/translator/nlp.py +0 -36
  39. epub_translator/translator/translator.py +0 -159
  40. epub_translator-0.0.1.dist-info/RECORD +0 -19
  41. {epub_translator-0.0.1.dist-info → epub_translator-0.0.3.dist-info}/LICENSE +0 -0
  42. {epub_translator-0.0.1.dist-info → epub_translator-0.0.3.dist-info}/WHEEL +0 -0
epub_translator/template.py
@@ -0,0 +1,50 @@
+import re
+
+from typing import Tuple, Callable
+from pathlib import Path
+from jinja2 import select_autoescape, Environment, BaseLoader, TemplateNotFound
+
+
+def create_env(dir_path: Path) -> Environment:
+    return Environment(
+        loader=_DSLoader(dir_path),
+        autoescape=select_autoescape(),
+        trim_blocks=True,
+        keep_trailing_newline=True,
+    )
+
+_LoaderResult = Tuple[str, str | None, Callable[[], bool] | None]
+
+class _DSLoader(BaseLoader):
+    def __init__(self, dir_path: Path):
+        super().__init__()
+        self._dir_path: Path = dir_path
+
+    def get_source(self, _: Environment, template: str) -> _LoaderResult:
+        template = self._norm_template(template)
+        target_path = (self._dir_path / template).resolve()
+
+        if not target_path.exists():
+            raise TemplateNotFound(f"cannot find {template}")
+
+        return self._get_source_with_path(target_path)
+
+    def _norm_template(self, template: str) -> str:
+        if bool(re.match(r"^\.+/", template)):
+            raise TemplateNotFound(f"invalid path {template}")
+
+        template = re.sub(r"^/", "", template)
+        template = re.sub(r"\.jinja$", "", template, flags=re.IGNORECASE)
+        template = f"{template}.jinja"
+
+        return template
+
+    def _get_source_with_path(self, path: Path) -> _LoaderResult:
+        mtime = path.stat().st_mtime
+        with open(path, "r", encoding="utf-8") as f:
+            source = f.read()
+
+        def is_updated() -> bool:
+            return mtime == path.stat().st_mtime
+
+        return source, path, is_updated
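
For orientation, a minimal sketch of how this loader could be exercised (illustrative only, not part of the diff). The template name comes from the `epub_translator/data/` files listed above; the variables passed to `render()` are an assumption, inferred from the `params` dict that `translation.py` passes for the `translate` template.

```python
from pathlib import Path

from epub_translator.template import create_env

# _DSLoader resolves names relative to the directory given to create_env and
# accepts the template name with or without the ".jinja" suffix.
env = create_env(Path("epub_translator") / "data")
template = env.get_template("translate")

# Hypothetical variables; the actual render context is controlled by the LLM wrapper.
print(template.render(target_language="简体中文", user_prompt=None))
```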
epub_translator/translation/__init__.py
@@ -0,0 +1,2 @@
+from .types import *
+from .translation import translate, ProgressReporter
epub_translator/translation/chunk.py
@@ -0,0 +1,120 @@
+from dataclasses import dataclass
+from typing import Iterator, Iterable, Generator
+from hashlib import sha512
+from ..llm import LLM
+from .types import Fragment
+
+
+@dataclass
+class Chunk:
+    index: int
+    hash: bytes
+    head: list[str]
+    body: list[str]
+    tail: list[str]
+    tokens_count: int
+
+@dataclass
+class ChunkRange:
+    index: int
+    head_remain_tokens: int
+    tail_remain_tokens: int
+    head_index: int
+    body_index: int
+    tail_index: int
+    fragments_count: int
+    tokens_count: int
+
+    def match(self, index: int) -> bool:
+        return self.head_index <= index < self.head_index + self.fragments_count
+
+def match_fragments(
+    llm: LLM,
+    chunk_ranges_iter: Iterator[ChunkRange],
+    fragments_iter: Iterator[Fragment],
+) -> Generator[Chunk, None, None]:
+
+    for range, texts in _match_range_and_texts(
+        chunk_range_iter=chunk_ranges_iter,
+        fragments_iter=fragments_iter,
+    ):
+        head_length = range.body_index - range.head_index
+        body_length = range.tail_index - range.body_index
+        head = texts[:head_length]
+        body = texts[head_length:head_length + body_length]
+        tail = texts[head_length + body_length:]
+
+        hash = _hash_texts_list((head, body, tail))
+        head = _crop_extra_texts(llm, head, True, range.head_remain_tokens)
+        tail = _crop_extra_texts(llm, tail, False, range.tail_remain_tokens)
+
+        yield Chunk(
+            hash=hash,
+            head=head,
+            body=body,
+            tail=tail,
+            index=range.index,
+            tokens_count=range.tokens_count,
+        )
+
+def _match_range_and_texts(
+    chunk_range_iter: Iterator[ChunkRange],
+    fragments_iter: Iterator[Fragment],
+) -> Generator[tuple[ChunkRange, list[str]], None, None]:
+
+    next_chunk_range: ChunkRange | None = None
+    matched_chunk_ranges: list[tuple[ChunkRange, list[str]]] = []
+
+    for index, fragment in enumerate(fragments_iter):
+        while True:
+            if next_chunk_range is None:
+                next_chunk_range = next(chunk_range_iter, None)
+                if next_chunk_range is None:
+                    break
+            if not next_chunk_range.match(index):
+                break
+            matched_chunk_ranges.append((next_chunk_range, []))
+            next_chunk_range = None
+
+        if matched_chunk_ranges:
+            next_matched_chunks: list[tuple[ChunkRange, list[str]]] = []
+            for chunk_range, texts in matched_chunk_ranges:
+                if chunk_range.match(index):
+                    texts.append(fragment.text)
+                    next_matched_chunks.append((chunk_range, texts))
+                else:
+                    yield chunk_range, texts
+            matched_chunk_ranges = next_matched_chunks
+
+    yield from matched_chunk_ranges
+
+def _hash_texts_list(texts_iterable: Iterable[list[str]]) -> bytes:
+    is_first = True
+    m = sha512()
+    for texts in texts_iterable:
+        for text in texts:
+            if is_first:
+                is_first = False
+            else:
+                m.update(b"\x00")
+            m.update(text.encode("utf-8"))
+    return m.digest()
+
+def _crop_extra_texts(llm: LLM, texts: list[str], crop_left: bool, remain_tokens_count: int):
+    tokens_list: list[list[int]] = [llm.encode_tokens(text) for text in texts]
+    remain_texts: list[str] = []
+
+    for tokens in (reversed(tokens_list) if crop_left else tokens_list):
+        tokens_count = len(tokens)
+        if remain_tokens_count >= tokens_count:
+            remain_tokens_count -= tokens_count
+            remain_texts.append(llm.decode_tokens(tokens))
+            if remain_tokens_count == 0:
+                break
+        else:
+            remain_tokens = tokens[-remain_tokens_count:] if crop_left else tokens[:remain_tokens_count]
+            remain_texts.append(llm.decode_tokens(remain_tokens))
+
+    if crop_left:
+        remain_texts.reverse()
+    return remain_texts
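
A small illustration of the cache key that `_hash_texts_list()` produces (sketch only, not part of the diff): every head, body, and tail text is UTF-8 encoded and fed to SHA-512 with a single NUL byte between consecutive texts, which is equivalent to hashing a NUL-joined string.

```python
from hashlib import sha512

head, body, tail = ["Chapter 1"], ["Hello", "world"], []

# Equivalent to Chunk.hash for these texts: NUL-separated, UTF-8, SHA-512.
key = sha512("\x00".join(head + body + tail).encode("utf-8")).digest()

# store.py (below) uses this digest as the cache file name: f"{key.hex()}.chunk"
print(key.hex())
```

Note that the hash is computed before the head and tail are cropped to their remaining token budgets, so the cache key covers the chunk's full context.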
epub_translator/translation/splitter.py
@@ -0,0 +1,77 @@
+from typing import Iterator, Generator
+from resource_segmentation import split, Resource, Segment
+
+from ..llm import LLM
+from .types import Fragment
+from .chunk import ChunkRange
+
+
+def split_into_chunks(llm: LLM, fragments_iter: Iterator[Fragment], max_chunk_tokens_count: int):
+    for index, group in enumerate(split(
+        resources=_gen_resources(llm, fragments_iter),
+        max_segment_count=max_chunk_tokens_count,
+        gap_rate=0.15,
+        tail_rate=0.5,
+    )):
+        head_index: int
+        tail_index: int
+        fragments_count: int
+        body_index, body_end_index, body_tokens_count = _range_of_group_part(group.body)
+
+        if group.head:
+            head_index, head_end_index, _ = _range_of_group_part(group.head)
+            assert head_end_index + 1 == body_index, "Head must be continuous with body"
+        else:
+            head_index = body_index
+
+        if group.tail:
+            tail_index, tail_end_index, _ = _range_of_group_part(group.tail)
+            fragments_count = tail_end_index - head_index + 1
+            assert body_end_index + 1 == tail_index, "Body must be continuous with tail"
+        else:
+            tail_index = body_end_index + 1
+            fragments_count = tail_index - head_index
+
+        yield ChunkRange(
+            index=index,
+            head_remain_tokens=group.head_remain_count,
+            tail_remain_tokens=group.tail_remain_count,
+            head_index=head_index,
+            body_index=body_index,
+            tail_index=tail_index,
+            fragments_count=fragments_count,
+            tokens_count=body_tokens_count,
+        )
+
+def _gen_resources(llm: LLM, fragments_iter: Iterator[Fragment]) -> Generator[Resource[int], None, None]:
+    for index, fragment in enumerate(fragments_iter):
+        yield Resource(
+            count=llm.count_tokens_count(fragment.text),
+            start_incision=fragment.start_incision,
+            end_incision=fragment.end_incision,
+            payload=index,
+        )
+
+def _range_of_group_part(target: list[Resource[int] | Segment[int]]) -> tuple[int, int]:
+    start_index: int | None = None
+    previous_index: int = 0
+    tokens_count: int = 0
+    for resource in _iter_group_part(target):
+        index = resource.payload
+        if start_index is None:
+            start_index = index
+        else:
+            assert index == previous_index + 1, "Resources in group part must be continuous"
+        previous_index = index
+        tokens_count += resource.count
+
+    assert start_index is not None, "Group part must contain at least one resource"
+    return start_index, previous_index, tokens_count
+
+def _iter_group_part(target: list[Resource[int] | Segment[int]]) -> Generator[Resource[int], None, None]:
+    for item in target:
+        if isinstance(item, Resource):
+            yield item
+        elif isinstance(item, Segment):
+            for resource in item.resources:
+                yield resource
epub_translator/translation/store.py
@@ -0,0 +1,37 @@
+from shutil import rmtree
+from pathlib import Path
+from typing import Iterator
+from .utils import clean_spaces
+
+
+class Store:
+    def __init__(self, directory: Path):
+        self._directory = directory
+
+    def get(self, chunk_hash: bytes) -> list[str] | None:
+        file_path = self._file_path(chunk_hash)
+        if not file_path.exists() or not file_path.is_file():
+            return None
+        with file_path.open("r", encoding="utf-8") as file:
+            return list(line for line in file if line.strip())
+
+    def put(self, chunk_hash: bytes, lines_iter: Iterator[str]):
+        file_path = self._file_path(chunk_hash)
+        if file_path.exists():
+            if file_path.is_file():
+                file_path.unlink()
+            else:
+                rmtree(file_path)
+
+        file_path.parent.mkdir(parents=True, exist_ok=True)
+        with file_path.open("w", encoding="utf-8") as file:
+            is_first_line = True
+            for line in lines_iter:
+                if is_first_line:
+                    is_first_line = False
+                else:
+                    file.write("\n")
+                file.write(clean_spaces(line))
+
+    def _file_path(self, chunk_hash: bytes) -> Path:
+        return self._directory / f"{chunk_hash.hex()}.chunk"
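
Illustrative usage of the cache (a sketch, not part of the diff): keys are the `Chunk.hash` digests from `chunk.py`, and each stored line holds one translated fragment.

```python
from pathlib import Path

from epub_translator.translation.store import Store

store = Store(Path("cache"))

# In the real pipeline the key is Chunk.hash; this 64-byte value is a stand-in.
key = bytes.fromhex("ab" * 64)

store.put(key, iter(["Bonjour", "le monde"]))
print(store.get(key))  # read back from cache/<hex digest>.chunk
```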
epub_translator/translation/translation.py
@@ -0,0 +1,192 @@
+from typing import Callable, Iterator, Generator
+from pathlib import Path
+from concurrent.futures import as_completed, ThreadPoolExecutor
+from xml.etree.ElementTree import Element
+
+from ..llm import LLM
+from ..xml import encode_friendly
+
+from .types import Fragment, Language
+from .store import Store
+from .splitter import split_into_chunks
+from .chunk import match_fragments, Chunk
+from .utils import is_empty, clean_spaces
+
+
+ProgressReporter = Callable[[float], None]
+
+def translate(
+    llm: LLM,
+    gen_fragments_iter: Callable[[], Iterator[Fragment]],
+    cache_path: Path | None,
+    target_language: Language,
+    user_prompt: str | None,
+    max_chunk_tokens_count: int,
+    max_threads_count: int,
+    report_progress: ProgressReporter,
+) -> Generator[str, None, None]:
+
+    if user_prompt is not None:
+        user_prompt = _normalize_user_input(user_prompt.splitlines())
+
+    store = Store(cache_path) if cache_path else None
+    chunk_ranges = list(split_into_chunks(
+        llm=llm,
+        fragments_iter=gen_fragments_iter(),
+        max_chunk_tokens_count=max_chunk_tokens_count,
+    ))
+    with ThreadPoolExecutor(max_workers=max_threads_count) as executor:
+        futures = [
+            executor.submit(lambda chunk=chunk: (chunk, _translate_chunk(
+                llm=llm,
+                store=store,
+                chunk=chunk,
+                target_language=target_language,
+                user_prompt=user_prompt,
+            )))
+            for chunk in match_fragments(
+                llm=llm,
+                chunk_ranges_iter=iter(chunk_ranges),
+                fragments_iter=gen_fragments_iter(),
+            )
+        ]
+        yield from _sort_translated_texts_by_chunk(
+            target=(f.result() for f in as_completed(futures)),
+            total_tokens_count=sum(chunk.tokens_count for chunk in chunk_ranges),
+            report_progress=report_progress,
+        )
+
+def _sort_translated_texts_by_chunk(
+    target: Iterator[tuple[Chunk, list[str]]],
+    total_tokens_count: int,
+    report_progress: ProgressReporter,
+) -> Iterator[list[str]]:
+
+    buffer: list[tuple[Chunk, list[str]]] = []
+    wanna_next_index: int = 0
+    translated_tokens_count: int = 0
+
+    for chunk, translated_texts in target:
+        buffer.append((chunk, translated_texts))
+        if wanna_next_index == chunk.index:
+            buffer.sort(key=lambda e: e[0].index)
+            to_clear: list[list[str]] = []
+
+            for chunk, translated_texts in buffer:
+                if chunk.index > wanna_next_index:
+                    break
+                to_clear.append(translated_texts)
+                if chunk.index == wanna_next_index:
+                    wanna_next_index += 1
+
+            if to_clear:
+                buffer = buffer[len(to_clear):]
+                for translated_texts in to_clear:
+                    yield from translated_texts
+
+        translated_tokens_count += chunk.tokens_count
+        report_progress(float(translated_tokens_count) / total_tokens_count)
+
+def _translate_chunk(
+    llm: LLM,
+    store: Store,
+    chunk: Chunk,
+    target_language: Language,
+    user_prompt: str | None,
+) -> list[str]:
+
+    translated_texts: list[str] | None = None
+    if store is not None:
+        translated_texts = store.get(chunk.hash)
+
+    if translated_texts is None:
+        translated_texts = _translate_texts(
+            llm=llm,
+            texts=chunk.head + chunk.body + chunk.tail,
+            target_language=target_language,
+            user_prompt=user_prompt,
+        )
+        if store is not None:
+            store.put(chunk.hash, translated_texts)
+
+    head_length = len(chunk.head)
+    translated_texts = translated_texts[head_length:head_length + len(chunk.body)]
+
+    return translated_texts
+
+def _translate_texts(
+    llm: LLM,
+    texts: list[str],
+    target_language: Language,
+    user_prompt: str | None,
+) -> list[str]:
+
+    original_text = _normalize_user_input(texts)
+    if original_text is None:
+        return [""] * len(texts)
+
+    user_data = original_text
+    if user_prompt is not None:
+        user_data = f"<rules>{user_prompt}</rules>\n\n{original_text}"
+
+    translated_text = llm.request_text(
+        template_name="translate",
+        text_tag="TXT",
+        user_data=user_data,
+        parser=lambda r: r,
+        params={
+            "target_language": target_language.value,
+            "user_prompt": user_prompt,
+        },
+    )
+    request_element = Element("request")
+
+    for i, fragment in enumerate(texts):
+        fragment_element = Element("fragment", attrib={
+            "id": str(i + 1),
+        })
+        fragment_element.text = clean_spaces(fragment)
+        request_element.append(fragment_element)
+
+    request_element_text = encode_friendly(request_element)
+    request_text = f"```XML\n{request_element_text}\n```\n\n{translated_text}"
+
+    return llm.request_xml(
+        template_name="format",
+        user_data=request_text,
+        params={ "target_language": target_language.value },
+        parser=lambda r: _parse_translated_response(r, len(texts)),
+    )
+
+def _parse_translated_response(resp_element: Element, sources_count: int) -> list[str]:
+    translated_fragments = [""] * sources_count
+    for fragment_element in resp_element:
+        if fragment_element.text is None:
+            continue
+        id = fragment_element.get("id", None)
+        if id is None:
+            continue
+        index = int(id) - 1
+        if index < 0 or index >= len(translated_fragments):
+            raise ValueError(f"invalid fragment id: {id}")
+        translated_fragments[index] = fragment_element.text.strip()
+
+    return translated_fragments
+
+def _normalize_user_input(user_lines: list[str]) -> str | None:
+    empty_lines_count: int = 0
+    lines: list[str] = []
+    for line in user_lines:
+        if is_empty(line):
+            empty_lines_count += 1
+        else:
+            if lines:
+                if empty_lines_count >= 2:
+                    lines.append("")
+                    lines.append("")
+                elif empty_lines_count == 1:
+                    lines.append("")
+            lines.append(clean_spaces(line))
+    if not lines:
+        return None
+    return "\n".join(lines)
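
The translation itself is a two-step LLM exchange: `request_text` with the `translate` template produces a free-form draft translation, then `request_xml` with the `format` template maps that draft back onto numbered fragments. The sketch below (not part of the diff) reproduces the intermediate `<request>` document that `_translate_texts()` builds for the second step, using only the standard library:

```python
from xml.etree.ElementTree import Element, tostring

texts = ["Once upon a time,", "there  was   a cat."]

request = Element("request")
for i, text in enumerate(texts):
    fragment = Element("fragment", attrib={"id": str(i + 1)})  # ids are 1-based
    fragment.text = " ".join(text.split())  # same effect as clean_spaces()
    request.append(fragment)

# The package serializes with its own encode_friendly(); tostring() is used
# here only to show the shape of the payload sent alongside the draft text.
print(tostring(request, encoding="unicode"))
# <request><fragment id="1">Once upon a time,</fragment>
#          <fragment id="2">there was a cat.</fragment></request>
```

The formatted response is then parsed by `_parse_translated_response()`, which fills a list indexed by `id` and leaves missing ids as empty strings.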
epub_translator/translation/types.py
@@ -0,0 +1,23 @@
+from enum import Enum
+from dataclasses import dataclass
+from resource_segmentation import Incision
+
+
+@dataclass
+class Fragment:
+    text: str
+    start_incision: Incision
+    end_incision: Incision
+
+class Language(Enum):
+    SIMPLIFIED_CHINESE = "简体中文"
+    TRADITIONAL_CHINESE = "繁体中文"
+    ENGLISH = "英语"
+    FRENCH = "法语"
+    GERMAN = "德语"
+    SPANISH = "西班牙语"
+    RUSSIAN = "俄语"
+    ITALIAN = "意大利语"
+    PORTUGUESE = "葡萄牙语"
+    JAPANESE = "日语"
+    KOREAN = "韩语"
epub_translator/translation/utils.py
@@ -0,0 +1,11 @@
+import re
+
+
+_EMPTY_LINE = re.compile(r"^\s*$")
+_SPACE = re.compile(r"\s+")
+
+def is_empty(text: str) -> bool:
+    return bool(_EMPTY_LINE.match(text))
+
+def clean_spaces(text: str) -> str:
+    return _SPACE.sub(" ", text.strip())
epub_translator/translator.py
@@ -0,0 +1,169 @@
+from os import PathLike
+from pathlib import Path
+from tempfile import mkdtemp
+from shutil import rmtree
+
+from .llm import LLM
+from .epub import HTMLFile
+from .zip_context import ZipContext
+from .translation import translate as _translate, Fragment, Incision, Language, ProgressReporter
+
+
+def translate(
+    llm: LLM,
+    source_path: PathLike,
+    translated_path: PathLike,
+    target_language: Language,
+    user_prompt: str | None = None,
+    working_path: PathLike | None = None,
+    max_chunk_tokens_count: int = 3000,
+    max_threads_count: int = 1,
+    report_progress: ProgressReporter | None = None,
+) -> None:
+
+    source_path = Path(source_path)
+    translated_path = Path(translated_path)
+    working_path = Path(working_path) if working_path else None
+    report_progress = report_progress or (lambda _: None)
+
+    _Translator(
+        llm=llm,
+        target_language=target_language,
+        user_prompt=user_prompt,
+        max_chunk_tokens_count=max_chunk_tokens_count,
+        max_threads_count=max_threads_count,
+        report_progress=report_progress,
+    ).do(
+        source_path=source_path,
+        translated_path=translated_path,
+        working_path=working_path,
+    )
+
+class _Translator:
+    def __init__(
+        self,
+        llm: LLM,
+        target_language: Language,
+        user_prompt: str | None,
+        max_chunk_tokens_count: int,
+        max_threads_count: int,
+        report_progress: ProgressReporter,
+    ) -> None:
+
+        self._llm: LLM = llm
+        self._target_language: Language = target_language
+        self._user_prompt: str | None = user_prompt
+        self._max_chunk_tokens_count: int = max_chunk_tokens_count
+        self._max_threads_count: int = max_threads_count
+        self._report_progress: ProgressReporter = report_progress
+
+    def do(self, source_path: Path, translated_path: Path, working_path: Path | None) -> None:
+        is_temp_workspace = not bool(working_path)
+        working_path = working_path or Path(mkdtemp())
+        try:
+            temp_dir = _clean_path(working_path / "temp")
+            temp_dir.mkdir(parents=True, exist_ok=True)
+            cache_path = working_path / "cache"
+
+            context = ZipContext(
+                epub_path=Path(source_path),
+                temp_dir=temp_dir,
+            )
+            context.replace_ncx(lambda texts: self._translate_ncx(
+                texts=texts,
+                cache_path=cache_path,
+                report_progress=lambda p: self._report_progress(p * 0.1)),
+            )
+            self._translate_spine(
+                context=context,
+                cache_path=cache_path,
+                report_progress=lambda p: self._report_progress(0.1 + p * 0.8),
+            )
+            context.archive(translated_path)
+            self._report_progress(1.0)
+
+        finally:
+            if is_temp_workspace:
+                rmtree(working_path, ignore_errors=True)
+
+    def _translate_ncx(self, texts: list[str], cache_path: Path, report_progress: ProgressReporter) -> list[str]:
+        return list(_translate(
+            llm=self._llm,
+            cache_path=cache_path,
+            max_chunk_tokens_count=self._max_chunk_tokens_count,
+            max_threads_count=1,
+            target_language=self._target_language,
+            user_prompt=self._user_prompt,
+            report_progress=report_progress,
+            gen_fragments_iter=lambda: (
+                Fragment(
+                    text=text,
+                    start_incision=Incision.IMPOSSIBLE,
+                    end_incision=Incision.IMPOSSIBLE,
+                )
+                for text in texts
+            ),
+        ))
+
+    def _translate_spine(self, context: ZipContext, cache_path: Path, report_progress: ProgressReporter):
+        spine_paths_iter = iter(list(context.search_spine_paths()))
+        spine_file: HTMLFile | None = None
+        translated_texts: list[str] = []
+        translated_count: int = 0
+
+        for translated_text in _translate(
+            llm=self._llm,
+            gen_fragments_iter=lambda: _gen_fragments(context),
+            cache_path=cache_path,
+            max_chunk_tokens_count=self._max_chunk_tokens_count,
+            max_threads_count=self._max_threads_count,
+            target_language=self._target_language,
+            user_prompt=self._user_prompt,
+            report_progress=report_progress,
+        ):
+            did_touch_end = False
+
+            if spine_file is not None and \
+               translated_count >= len(translated_texts):
+                spine_file.write_texts(translated_texts)
+                spine_file = None
+
+            while spine_file is None:
+                spine_path = next(spine_paths_iter, None)
+                if spine_path is None:
+                    did_touch_end = True
+                    break
+                spine_file = context.read_spine_file(spine_path)
+                if spine_file.texts_length == 0:
+                    spine_file = None
+                    continue
+                translated_texts = [""] * spine_file.texts_length
+                translated_count = 0
+
+            translated_texts[translated_count] = translated_text
+            translated_count += 1
+
+            if did_touch_end:
+                break
+        if spine_file and translated_count > 0:
+            spine_file.write_texts(translated_texts)
+
+            context.write_spine_file(spine_path, spine_file)
+
+def _gen_fragments(context: ZipContext):
+    for spine_path in context.search_spine_paths():
+        spine_file = context.read_spine_file(spine_path)
+        for text in spine_file.read_texts():
+            yield Fragment(
+                text=text,
+                start_incision=Incision.IMPOSSIBLE,
+                end_incision=Incision.IMPOSSIBLE,
+            )

+def _clean_path(path: Path) -> Path:
+    if path.exists():
+        if path.is_file():
+            path.unlink()
+        elif path.is_dir():
+            rmtree(path, ignore_errors=True)
+    return path
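
Taken together with the removed `epub_translator/translator/` package above, this module is now the public entry point. A minimal end-to-end sketch follows (the `LLM` class lives in `epub_translator/llm`, whose contents are not reproduced in the hunks above, so its constructor arguments are a placeholder):

```python
from pathlib import Path

from epub_translator.llm import LLM
from epub_translator.translation import Language
from epub_translator.translator import translate

llm = LLM(...)  # placeholder: the LLM constructor is not shown in this diff

translate(
    llm=llm,
    source_path=Path("book.epub"),
    translated_path=Path("book.zh.epub"),
    target_language=Language.SIMPLIFIED_CHINESE,
    working_path=Path("./workspace"),  # optional; keeps the chunk cache between runs
    max_chunk_tokens_count=3000,
    max_threads_count=4,
    report_progress=lambda p: print(f"{p:.0%}"),
)
```

When `working_path` is omitted, a temporary directory is created with `mkdtemp()` and removed after the run, so the translation cache is not reused.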
epub_translator/xml/__init__.py
@@ -0,0 +1,3 @@
+from .encoder import encode, encode_friendly
+from .decoder import decode_friendly
+from .utils import clone