epub-translator 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/__init__.py +3 -2
- epub_translator/data/format.jinja +33 -0
- epub_translator/data/translate.jinja +15 -0
- epub_translator/epub/__init__.py +2 -3
- epub_translator/epub/content_parser.py +2 -2
- epub_translator/epub/html/__init__.py +1 -1
- epub_translator/epub/html/file.py +56 -41
- epub_translator/epub/html/texts_searcher.py +2 -1
- epub_translator/llm/__init__.py +1 -0
- epub_translator/llm/error.py +49 -0
- epub_translator/llm/executor.py +147 -0
- epub_translator/llm/increasable.py +35 -0
- epub_translator/llm/node.py +197 -0
- epub_translator/template.py +50 -0
- epub_translator/translation/__init__.py +2 -0
- epub_translator/translation/chunk.py +120 -0
- epub_translator/translation/splitter.py +77 -0
- epub_translator/translation/store.py +37 -0
- epub_translator/translation/translation.py +192 -0
- epub_translator/translation/types.py +23 -0
- epub_translator/translation/utils.py +11 -0
- epub_translator/translator.py +169 -0
- epub_translator/xml/__init__.py +3 -0
- epub_translator/xml/decoder.py +71 -0
- epub_translator/xml/encoder.py +95 -0
- epub_translator/xml/parser.py +172 -0
- epub_translator/xml/tag.py +93 -0
- epub_translator/xml/transform.py +34 -0
- epub_translator/xml/utils.py +12 -0
- epub_translator/zip_context.py +74 -0
- {epub_translator-0.0.1.dist-info → epub_translator-0.0.3.dist-info}/METADATA +5 -7
- epub_translator-0.0.3.dist-info/RECORD +36 -0
- epub_translator/epub/types.py +0 -4
- epub_translator/file.py +0 -124
- epub_translator/translator/__init__.py +0 -1
- epub_translator/translator/group.py +0 -140
- epub_translator/translator/llm.py +0 -58
- epub_translator/translator/nlp.py +0 -36
- epub_translator/translator/translator.py +0 -159
- epub_translator-0.0.1.dist-info/RECORD +0 -19
- {epub_translator-0.0.1.dist-info → epub_translator-0.0.3.dist-info}/LICENSE +0 -0
- {epub_translator-0.0.1.dist-info → epub_translator-0.0.3.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
from typing import Tuple, Callable
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from jinja2 import select_autoescape, Environment, BaseLoader, TemplateNotFound
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def create_env(dir_path: Path) -> Environment:
  """Build a Jinja2 environment that loads ``*.jinja`` templates from ``dir_path``."""
  environment = Environment(
    loader=_DSLoader(dir_path),
    autoescape=select_autoescape(),
    trim_blocks=True,
    keep_trailing_newline=True,
  )
  return environment
|
|
15
|
+
|
|
16
|
+
# Return shape of jinja2.BaseLoader.get_source(): (source, filename, uptodate-callable).
_LoaderResult = Tuple[str, str | None, Callable[[], bool] | None]
|
|
17
|
+
|
|
18
|
+
class _DSLoader(BaseLoader):
  """Jinja2 loader that resolves template names inside one fixed directory."""

  def __init__(self, dir_path: Path):
    super().__init__()
    self._dir_path: Path = dir_path

  def get_source(self, _: Environment, template: str) -> _LoaderResult:
    """Load a template's source, filename and freshness callback.

    Raises TemplateNotFound when the name is invalid, escapes the template
    directory, or does not exist.
    """
    template = self._norm_template(template)
    target_path = (self._dir_path / template).resolve()

    # Reject any name that resolves outside the template directory
    # (e.g. "a/../../secret"); the regex in _norm_template only catches
    # names that literally start with "../".
    if not target_path.is_relative_to(self._dir_path.resolve()):
      raise TemplateNotFound(f"invalid path {template}")
    if not target_path.exists():
      raise TemplateNotFound(f"cannot find {template}")

    return self._get_source_with_path(target_path)

  def _norm_template(self, template: str) -> str:
    """Normalize a template name to a relative "<name>.jinja" path."""
    if bool(re.match(r"^\.+/", template)):
      raise TemplateNotFound(f"invalid path {template}")

    template = re.sub(r"^/", "", template)
    # Drop a ".jinja" suffix if present, then re-append it so both
    # "foo" and "foo.jinja" resolve to the same file.
    template = re.sub(r"\.jinja$", "", template, flags=re.IGNORECASE)
    template = f"{template}.jinja"

    return template

  def _get_source_with_path(self, path: Path) -> _LoaderResult:
    mtime = path.stat().st_mtime
    with open(path, "r", encoding="utf-8") as f:
      source = f.read()

    def is_updated() -> bool:
      # Jinja2 "uptodate" contract: True means the cached template is still fresh.
      return mtime == path.stat().st_mtime

    # _LoaderResult declares the filename as str | None, so convert the Path.
    return source, str(path), is_updated
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Iterator, Iterable, Generator
|
|
3
|
+
from hashlib import sha512
|
|
4
|
+
from ..llm import LLM
|
|
5
|
+
from .types import Fragment
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class Chunk:
  """One translation unit: a body of texts plus surrounding context."""
  # Ordinal of this chunk within the whole document's chunk sequence.
  index: int
  # SHA-512 digest of (head, body, tail) computed before head/tail cropping;
  # used as the cache key in Store.
  hash: bytes
  # Context texts preceding the body (possibly cropped to a token budget).
  head: list[str]
  # The texts this chunk is responsible for translating.
  body: list[str]
  # Context texts following the body (possibly cropped to a token budget).
  tail: list[str]
  # Token count of the body part.
  tokens_count: int
|
|
16
|
+
|
|
17
|
+
@dataclass
class ChunkRange:
  """Index span of one chunk over the global fragment sequence."""
  # Ordinal of this chunk.
  index: int
  # Remaining token budget for the head context (from the splitter).
  head_remain_tokens: int
  # Remaining token budget for the tail context (from the splitter).
  tail_remain_tokens: int
  # Index of the first fragment in the head part.
  head_index: int
  # Index of the first fragment in the body part.
  body_index: int
  # Index of the first fragment in the tail part (one past the body's end).
  tail_index: int
  # Total number of fragments covered: head + body + tail.
  fragments_count: int
  # Token count of the body part.
  tokens_count: int

  def match(self, index: int) -> bool:
    """Return True when fragment ``index`` lies inside this range."""
    return self.head_index <= index < self.head_index + self.fragments_count
|
|
30
|
+
|
|
31
|
+
def match_fragments(
  llm: LLM,
  chunk_ranges_iter: Iterator[ChunkRange],
  fragments_iter: Iterator[Fragment],
) -> Generator[Chunk, None, None]:
  """Pair each ChunkRange with its fragment texts and emit Chunks.

  The texts covered by a range are sliced into head/body/tail using the
  range's indexes. The hash is computed BEFORE head/tail are cropped to
  their token budgets, so the cache key covers the full context.

  (Locals renamed from ``range``/``hash``, which shadowed the builtins.)
  """
  for chunk_range, texts in _match_range_and_texts(
    chunk_range_iter=chunk_ranges_iter,
    fragments_iter=fragments_iter,
  ):
    head_length = chunk_range.body_index - chunk_range.head_index
    body_length = chunk_range.tail_index - chunk_range.body_index
    head = texts[:head_length]
    body = texts[head_length:head_length + body_length]
    tail = texts[head_length + body_length:]

    chunk_hash = _hash_texts_list((head, body, tail))
    head = _crop_extra_texts(llm, head, True, chunk_range.head_remain_tokens)
    tail = _crop_extra_texts(llm, tail, False, chunk_range.tail_remain_tokens)

    yield Chunk(
      hash=chunk_hash,
      head=head,
      body=body,
      tail=tail,
      index=chunk_range.index,
      tokens_count=chunk_range.tokens_count,
    )
|
|
59
|
+
|
|
60
|
+
def _match_range_and_texts(
  chunk_range_iter: Iterator[ChunkRange],
  fragments_iter: Iterator[Fragment],
) -> Generator[tuple[ChunkRange, list[str]], None, None]:
  """Stream fragments once, collecting the texts belonging to each range.

  Ranges may overlap (head/tail context is shared between neighbors), so
  several "open" ranges are tracked at once. A range is yielded, together
  with its collected texts, as soon as the current fragment index passes
  its end. Assumes ``chunk_range_iter`` yields ranges ordered by their
  starting index.
  """

  next_chunk_range: ChunkRange | None = None
  matched_chunk_ranges: list[tuple[ChunkRange, list[str]]] = []

  for index, fragment in enumerate(fragments_iter):
    # Open every range whose span includes the current fragment index.
    while True:
      if next_chunk_range is None:
        next_chunk_range = next(chunk_range_iter, None)
        if next_chunk_range is None:
          break
      if not next_chunk_range.match(index):
        # Not started yet; keep it pending for a later index.
        break
      matched_chunk_ranges.append((next_chunk_range, []))
      next_chunk_range = None

    if matched_chunk_ranges:
      next_matched_chunks: list[tuple[ChunkRange, list[str]]] = []
      for chunk_range, texts in matched_chunk_ranges:
        if chunk_range.match(index):
          # Still inside this range: accumulate the fragment's text.
          texts.append(fragment.text)
          next_matched_chunks.append((chunk_range, texts))
        else:
          # The range ended before this fragment: emit it complete.
          yield chunk_range, texts
      matched_chunk_ranges = next_matched_chunks

  # Emit ranges that were still open when the fragments ran out.
  yield from matched_chunk_ranges
|
|
90
|
+
|
|
91
|
+
def _hash_texts_list(texts_iterable: Iterable[list[str]]) -> bytes:
|
|
92
|
+
is_first = True
|
|
93
|
+
m = sha512()
|
|
94
|
+
for texts in texts_iterable:
|
|
95
|
+
for text in texts:
|
|
96
|
+
if is_first:
|
|
97
|
+
is_first = False
|
|
98
|
+
else:
|
|
99
|
+
m.update(b"\x00")
|
|
100
|
+
m.update(text.encode("utf-8"))
|
|
101
|
+
return m.digest()
|
|
102
|
+
|
|
103
|
+
def _crop_extra_texts(llm: LLM, texts: list[str], crop_left: bool, remain_tokens_count: int):
|
|
104
|
+
tokens_list: list[list[int]] = [llm.encode_tokens(text) for text in texts]
|
|
105
|
+
remain_texts: list[str] = []
|
|
106
|
+
|
|
107
|
+
for tokens in (reversed(tokens_list) if crop_left else tokens_list):
|
|
108
|
+
tokens_count = len(tokens)
|
|
109
|
+
if remain_tokens_count >= tokens_count:
|
|
110
|
+
remain_tokens_count -= tokens_count
|
|
111
|
+
remain_texts.append(llm.decode_tokens(tokens))
|
|
112
|
+
if remain_tokens_count == 0:
|
|
113
|
+
break
|
|
114
|
+
else:
|
|
115
|
+
remain_tokens = tokens[-remain_tokens_count:] if crop_left else tokens[:remain_tokens_count]
|
|
116
|
+
remain_texts.append(llm.decode_tokens(remain_tokens))
|
|
117
|
+
|
|
118
|
+
if crop_left:
|
|
119
|
+
remain_texts.reverse()
|
|
120
|
+
return remain_texts
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from typing import Iterator, Generator
|
|
2
|
+
from resource_segmentation import split, Resource, Segment
|
|
3
|
+
|
|
4
|
+
from ..llm import LLM
|
|
5
|
+
from .types import Fragment
|
|
6
|
+
from .chunk import ChunkRange
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def split_into_chunks(
  llm: LLM,
  fragments_iter: Iterator[Fragment],
  max_chunk_tokens_count: int,
) -> Generator[ChunkRange, None, None]:
  """Split the fragment stream into token-bounded ChunkRanges.

  Delegates the actual segmentation to ``resource_segmentation.split``;
  each resulting group carries a body plus optional head/tail overlap.
  This function converts a group into fragment-index coordinates and
  asserts the parts are contiguous.
  """
  for index, group in enumerate(split(
    resources=_gen_resources(llm, fragments_iter),
    max_segment_count=max_chunk_tokens_count,
    gap_rate=0.15,
    tail_rate=0.5,
  )):
    head_index: int
    tail_index: int
    fragments_count: int
    body_index, body_end_index, body_tokens_count = _range_of_group_part(group.body)

    if group.head:
      head_index, head_end_index, _ = _range_of_group_part(group.head)
      assert head_end_index + 1 == body_index, "Head must be continuous with body"
    else:
      # No head context: the chunk starts at the body.
      head_index = body_index

    if group.tail:
      tail_index, tail_end_index, _ = _range_of_group_part(group.tail)
      fragments_count = tail_end_index - head_index + 1
      assert body_end_index + 1 == tail_index, "Body must be continuous with tail"
    else:
      # No tail context: tail_index is one past the body's last fragment.
      tail_index = body_end_index + 1
      fragments_count = tail_index - head_index

    yield ChunkRange(
      index=index,
      head_remain_tokens=group.head_remain_count,
      tail_remain_tokens=group.tail_remain_count,
      head_index=head_index,
      body_index=body_index,
      tail_index=tail_index,
      fragments_count=fragments_count,
      tokens_count=body_tokens_count,
    )
|
|
45
|
+
|
|
46
|
+
def _gen_resources(llm: LLM, fragments_iter: Iterator[Fragment]) -> Generator[Resource[int], None, None]:
  """Wrap each fragment as a Resource whose payload is its ordinal index."""
  position = 0
  for fragment in fragments_iter:
    resource = Resource(
      count=llm.count_tokens_count(fragment.text),
      start_incision=fragment.start_incision,
      end_incision=fragment.end_incision,
      payload=position,
    )
    yield resource
    position += 1
|
|
54
|
+
|
|
55
|
+
def _range_of_group_part(target: list[Resource[int] | Segment[int]]) -> tuple[int, int, int]:
  """Return (first index, last index, total token count) of a group part.

  The return annotation previously said ``tuple[int, int]`` although three
  values are returned. Raises AssertionError when the part is empty or its
  resources are not consecutively indexed.
  """
  start_index: int | None = None
  previous_index: int = 0
  tokens_count: int = 0
  for resource in _iter_group_part(target):
    index = resource.payload
    if start_index is None:
      start_index = index
    else:
      assert index == previous_index + 1, "Resources in group part must be continuous"
    previous_index = index
    tokens_count += resource.count

  assert start_index is not None, "Group part must contain at least one resource"
  return start_index, previous_index, tokens_count
|
|
70
|
+
|
|
71
|
+
def _iter_group_part(target: list[Resource[int] | Segment[int]]) -> Generator[Resource[int], None, None]:
  """Flatten a group part, expanding each Segment into its resources."""
  for entry in target:
    if isinstance(entry, Resource):
      yield entry
    elif isinstance(entry, Segment):
      yield from entry.resources
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from shutil import rmtree
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Iterator
|
|
4
|
+
from .utils import clean_spaces
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Store:
|
|
8
|
+
def __init__(self, directory: Path):
|
|
9
|
+
self._directory = directory
|
|
10
|
+
|
|
11
|
+
def get(self, chunk_hash: bytes) -> list[str] | None:
|
|
12
|
+
file_path = self._file_path(chunk_hash)
|
|
13
|
+
if not file_path.exists() or not file_path.is_file():
|
|
14
|
+
return None
|
|
15
|
+
with file_path.open("r", encoding="utf-8") as file:
|
|
16
|
+
return list(line for line in file if line.strip())
|
|
17
|
+
|
|
18
|
+
def put(self, chunk_hash: bytes, lines_iter: Iterator[str]):
|
|
19
|
+
file_path = self._file_path(chunk_hash)
|
|
20
|
+
if file_path.exists():
|
|
21
|
+
if file_path.is_file():
|
|
22
|
+
file_path.unlink()
|
|
23
|
+
else:
|
|
24
|
+
rmtree(file_path)
|
|
25
|
+
|
|
26
|
+
file_path.parent.mkdir(parents=True, exist_ok=True)
|
|
27
|
+
with file_path.open("w", encoding="utf-8") as file:
|
|
28
|
+
is_first_line = True
|
|
29
|
+
for line in lines_iter:
|
|
30
|
+
if is_first_line:
|
|
31
|
+
is_first_line = False
|
|
32
|
+
else:
|
|
33
|
+
file.write("\n")
|
|
34
|
+
file.write(clean_spaces(line))
|
|
35
|
+
|
|
36
|
+
def _file_path(self, chunk_hash: bytes) -> Path:
|
|
37
|
+
return self._directory / f"{chunk_hash.hex()}.chunk"
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
from typing import Callable, Iterator, Generator
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from concurrent.futures import as_completed, ThreadPoolExecutor
|
|
4
|
+
from xml.etree.ElementTree import Element
|
|
5
|
+
|
|
6
|
+
from ..llm import LLM
|
|
7
|
+
from ..xml import encode_friendly
|
|
8
|
+
|
|
9
|
+
from .types import Fragment, Language
|
|
10
|
+
from .store import Store
|
|
11
|
+
from .splitter import split_into_chunks
|
|
12
|
+
from .chunk import match_fragments, Chunk
|
|
13
|
+
from .utils import is_empty, clean_spaces
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Callback receiving overall progress as a fraction in [0.0, 1.0].
ProgressReporter = Callable[[float], None]
|
|
17
|
+
|
|
18
|
+
def translate(
  llm: LLM,
  gen_fragments_iter: Callable[[], Iterator[Fragment]],
  cache_path: Path | None,
  target_language: Language,
  user_prompt: str | None,
  max_chunk_tokens_count: int,
  max_threads_count: int,
  report_progress: ProgressReporter,
) -> Generator[str, None, None]:
  """Translate all fragments and yield translated body texts in order.

  ``gen_fragments_iter`` is called twice (once for splitting, once for
  matching), so it must return a fresh iterator over the same fragments
  each time. Chunks are translated concurrently on a thread pool; results
  are re-ordered before being yielded. When ``cache_path`` is None, no
  Store is used and every chunk is translated from scratch.
  """

  if user_prompt is not None:
    user_prompt = _normalize_user_input(user_prompt.splitlines())

  store = Store(cache_path) if cache_path else None
  chunk_ranges = list(split_into_chunks(
    llm=llm,
    fragments_iter=gen_fragments_iter(),
    max_chunk_tokens_count=max_chunk_tokens_count,
  ))
  with ThreadPoolExecutor(max_workers=max_threads_count) as executor:
    futures = [
      # ``chunk=chunk`` binds the loop variable as a default to avoid the
      # late-binding closure pitfall.
      executor.submit(lambda chunk=chunk: (chunk, _translate_chunk(
        llm=llm,
        store=store,
        chunk=chunk,
        target_language=target_language,
        user_prompt=user_prompt,
      )))
      for chunk in match_fragments(
        llm=llm,
        chunk_ranges_iter=iter(chunk_ranges),
        fragments_iter=gen_fragments_iter(),
      )
    ]
    # Futures complete out of order; re-sequence by chunk index while
    # reporting token-weighted progress.
    yield from _sort_translated_texts_by_chunk(
      target=(f.result() for f in as_completed(futures)),
      total_tokens_count=sum(chunk.tokens_count for chunk in chunk_ranges),
      report_progress=report_progress,
    )
|
|
58
|
+
|
|
59
|
+
def _sort_translated_texts_by_chunk(
|
|
60
|
+
target: Iterator[tuple[Chunk, list[str]]],
|
|
61
|
+
total_tokens_count: int,
|
|
62
|
+
report_progress: ProgressReporter,
|
|
63
|
+
) -> Iterator[list[str]]:
|
|
64
|
+
|
|
65
|
+
buffer: list[tuple[Chunk, list[str]]] = []
|
|
66
|
+
wanna_next_index: int = 0
|
|
67
|
+
translated_tokens_count: int = 0
|
|
68
|
+
|
|
69
|
+
for chunk, translated_texts in target:
|
|
70
|
+
buffer.append((chunk, translated_texts))
|
|
71
|
+
if wanna_next_index == chunk.index:
|
|
72
|
+
buffer.sort(key=lambda e: e[0].index)
|
|
73
|
+
to_clear: list[list[str]] = []
|
|
74
|
+
|
|
75
|
+
for chunk, translated_texts in buffer:
|
|
76
|
+
if chunk.index > wanna_next_index:
|
|
77
|
+
break
|
|
78
|
+
to_clear.append(translated_texts)
|
|
79
|
+
if chunk.index == wanna_next_index:
|
|
80
|
+
wanna_next_index += 1
|
|
81
|
+
|
|
82
|
+
if to_clear:
|
|
83
|
+
buffer = buffer[len(to_clear):]
|
|
84
|
+
for translated_texts in to_clear:
|
|
85
|
+
yield from translated_texts
|
|
86
|
+
|
|
87
|
+
translated_tokens_count += chunk.tokens_count
|
|
88
|
+
report_progress(float(translated_tokens_count) / total_tokens_count)
|
|
89
|
+
|
|
90
|
+
def _translate_chunk(
  llm: LLM,
  store: Store | None,
  chunk: Chunk,
  target_language: Language,
  user_prompt: str | None,
) -> list[str]:
  """Translate one chunk, consulting/updating the optional cache store.

  ``store`` was previously annotated as ``Store`` even though callers pass
  None when no cache directory is configured. Returns only the translated
  body texts; head/tail context translations are discarded.
  """

  translated_texts: list[str] | None = None
  if store is not None:
    translated_texts = store.get(chunk.hash)

  if translated_texts is None:
    translated_texts = _translate_texts(
      llm=llm,
      texts=chunk.head + chunk.body + chunk.tail,
      target_language=target_language,
      user_prompt=user_prompt,
    )
    if store is not None:
      store.put(chunk.hash, translated_texts)

  # Keep only the body's translations; context was sent just to help the LLM.
  head_length = len(chunk.head)
  translated_texts = translated_texts[head_length:head_length + len(chunk.body)]

  return translated_texts
|
|
116
|
+
|
|
117
|
+
def _translate_texts(
  llm: LLM,
  texts: list[str],
  target_language: Language,
  user_prompt: str | None,
) -> list[str]:
  """Translate ``texts`` in two LLM passes: free-form, then structured.

  Pass 1 ("translate" template) produces a plain translated text. Pass 2
  ("format" template) receives the original fragments as numbered XML plus
  that translation, and must return per-fragment XML that is mapped back
  by id. Returns one translated string per input text ("" for fragments
  the model omitted or that were empty).
  """

  original_text = _normalize_user_input(texts)
  if original_text is None:
    # Nothing but blank input: skip both LLM calls entirely.
    return [""] * len(texts)

  user_data = original_text
  if user_prompt is not None:
    user_data = f"<rules>{user_prompt}</rules>\n\n{original_text}"

  translated_text = llm.request_text(
    template_name="translate",
    text_tag="TXT",
    user_data=user_data,
    parser=lambda r: r,
    params={
      "target_language": target_language.value,
      "user_prompt": user_prompt,
    },
  )
  request_element = Element("request")

  # Number fragments from 1 so the formatting pass can address each one by id.
  for i, fragment in enumerate(texts):
    fragment_element = Element("fragment", attrib={
      "id": str(i + 1),
    })
    fragment_element.text = clean_spaces(fragment)
    request_element.append(fragment_element)

  request_element_text = encode_friendly(request_element)
  request_text = f"```XML\n{request_element_text}\n```\n\n{translated_text}"

  return llm.request_xml(
    template_name="format",
    user_data=request_text,
    params={ "target_language": target_language.value },
    parser=lambda r: _parse_translated_response(r, len(texts)),
  )
|
|
160
|
+
|
|
161
|
+
def _parse_translated_response(resp_element: Element, sources_count: int) -> list[str]:
|
|
162
|
+
translated_fragments = [""] * sources_count
|
|
163
|
+
for fragment_element in resp_element:
|
|
164
|
+
if fragment_element.text is None:
|
|
165
|
+
continue
|
|
166
|
+
id = fragment_element.get("id", None)
|
|
167
|
+
if id is None:
|
|
168
|
+
continue
|
|
169
|
+
index = int(id) - 1
|
|
170
|
+
if index < 0 or index >= len(translated_fragments):
|
|
171
|
+
raise ValueError(f"invalid fragment id: {id}")
|
|
172
|
+
translated_fragments[index] = fragment_element.text.strip()
|
|
173
|
+
|
|
174
|
+
return translated_fragments
|
|
175
|
+
|
|
176
|
+
def _normalize_user_input(user_lines: list[str]) -> str | None:
  """Join non-blank lines, normalizing blank-line runs between them.

  A run of 2+ blank lines becomes a double break, a single blank line a
  single break; leading/trailing blanks are dropped. Returns None when
  there is no non-blank content at all.

  Fix vs. the previous version: the blank-line counter is now reset after
  each non-blank line — before, one early blank line caused a spurious
  break between EVERY later pair of adjacent lines.
  """
  empty_lines_count: int = 0
  lines: list[str] = []
  for line in user_lines:
    if is_empty(line):
      empty_lines_count += 1
    else:
      if lines:
        if empty_lines_count >= 2:
          lines.append("")
          lines.append("")
        elif empty_lines_count == 1:
          lines.append("")
      empty_lines_count = 0
      lines.append(clean_spaces(line))
  if not lines:
    return None
  return "\n".join(lines)
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from resource_segmentation import Incision
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class Fragment:
  """A single translatable text plus its segmentation boundaries."""
  # The raw text extracted from the EPUB.
  text: str
  # Incision (resource_segmentation) describing how splittable the boundary
  # BEFORE this fragment is.
  start_incision: Incision
  # Incision describing how splittable the boundary AFTER this fragment is.
  end_incision: Incision
|
|
11
|
+
|
|
12
|
+
class Language(Enum):
  """Supported target languages.

  Each value is the language's name written in Chinese; the raw value is
  passed as the ``target_language`` parameter to the prompt templates, so
  these strings must not be altered.
  """
  SIMPLIFIED_CHINESE = "简体中文"
  TRADITIONAL_CHINESE = "繁体中文"
  ENGLISH = "英语"
  FRENCH = "法语"
  GERMAN = "德语"
  SPANISH = "西班牙语"
  RUSSIAN = "俄语"
  ITALIAN = "意大利语"
  PORTUGUESE = "葡萄牙语"
  JAPANESE = "日语"
  KOREAN = "韩语"
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
from os import PathLike
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from tempfile import mkdtemp
|
|
4
|
+
from shutil import rmtree
|
|
5
|
+
|
|
6
|
+
from .llm import LLM
|
|
7
|
+
from .epub import HTMLFile
|
|
8
|
+
from .zip_context import ZipContext
|
|
9
|
+
from .translation import translate as _translate, Fragment, Incision, Language, ProgressReporter
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def translate(
  llm: LLM,
  source_path: PathLike,
  translated_path: PathLike,
  target_language: Language,
  user_prompt: str | None = None,
  working_path: PathLike | None = None,
  max_chunk_tokens_count: int = 3000,
  max_threads_count: int = 1,
  report_progress: ProgressReporter | None = None,
) -> None:
  """Translate the EPUB at ``source_path`` and write it to ``translated_path``.

  ``working_path`` keeps intermediate files and the translation cache; when
  omitted a temporary workspace is created and removed afterwards.
  """
  if report_progress is None:
    report_progress = lambda _: None

  translator = _Translator(
    llm=llm,
    target_language=target_language,
    user_prompt=user_prompt,
    max_chunk_tokens_count=max_chunk_tokens_count,
    max_threads_count=max_threads_count,
    report_progress=report_progress,
  )
  translator.do(
    source_path=Path(source_path),
    translated_path=Path(translated_path),
    working_path=Path(working_path) if working_path else None,
  )
|
|
41
|
+
|
|
42
|
+
class _Translator:
  """Drives one end-to-end EPUB translation (NCX, then spine documents)."""

  def __init__(
    self,
    llm: LLM,
    target_language: Language,
    user_prompt: str | None,
    max_chunk_tokens_count: int,
    max_threads_count: int,
    report_progress: ProgressReporter,
  ) -> None:

    self._llm: LLM = llm
    self._target_language: Language = target_language
    self._user_prompt: str | None = user_prompt
    self._max_chunk_tokens_count: int = max_chunk_tokens_count
    self._max_threads_count: int = max_threads_count
    self._report_progress: ProgressReporter = report_progress

  def do(self, source_path: Path, translated_path: Path, working_path: Path | None) -> None:
    """Translate source_path into translated_path.

    Progress split: NCX gets 0.0–0.1, spine 0.1–0.9, 1.0 after archiving.
    A caller-supplied working_path is kept; a temporary one is deleted.
    """
    is_temp_workspace = not bool(working_path)
    working_path = working_path or Path(mkdtemp())
    try:
      temp_dir = _clean_path(working_path / "temp")
      temp_dir.mkdir(parents=True, exist_ok=True)
      cache_path = working_path / "cache"

      context = ZipContext(
        epub_path=Path(source_path),
        temp_dir=temp_dir,
      )
      context.replace_ncx(lambda texts: self._translate_ncx(
        texts=texts,
        cache_path=cache_path,
        report_progress=lambda p: self._report_progress(p * 0.1)),
      )
      self._translate_spine(
        context=context,
        cache_path=cache_path,
        report_progress=lambda p: self._report_progress(0.1 + p * 0.8),
      )
      context.archive(translated_path)
      self._report_progress(1.0)

    finally:
      if is_temp_workspace:
        rmtree(working_path, ignore_errors=True)

  def _translate_ncx(self, texts: list[str], cache_path: Path, report_progress: ProgressReporter) -> list[str]:
    """Translate the table-of-contents texts sequentially (single thread)."""
    return list(_translate(
      llm=self._llm,
      cache_path=cache_path,
      max_chunk_tokens_count=self._max_chunk_tokens_count,
      max_threads_count=1,
      target_language=self._target_language,
      user_prompt=self._user_prompt,
      report_progress=report_progress,
      # NCX entries are short labels; mark both boundaries unsplittable.
      gen_fragments_iter=lambda: (
        Fragment(
          text=text,
          start_incision=Incision.IMPOSSIBLE,
          end_incision=Incision.IMPOSSIBLE,
        )
        for text in texts
      ),
    ))

  def _translate_spine(self, context: ZipContext, cache_path: Path, report_progress: ProgressReporter):
    """Stream translated texts back into the spine files, in document order.

    The translation stream is flat; this walks spine files in parallel with
    it, filling each file's slots (texts_length of them) before moving on.
    NOTE(review): completed files inside the loop only get write_texts();
    context.write_spine_file appears to run only for the final file —
    confirm whether intermediate spine files are persisted elsewhere.
    """
    spine_paths_iter = iter(list(context.search_spine_paths()))
    spine_file: HTMLFile | None = None
    translated_texts: list[str] = []
    translated_count: int = 0

    for translated_text in _translate(
      llm=self._llm,
      gen_fragments_iter=lambda: _gen_fragments(context),
      cache_path=cache_path,
      max_chunk_tokens_count=self._max_chunk_tokens_count,
      max_threads_count=self._max_threads_count,
      target_language=self._target_language,
      user_prompt=self._user_prompt,
      report_progress=report_progress,
    ):
      did_touch_end = False

      # Current file full: flush its texts and look for the next file.
      if spine_file is not None and \
          translated_count >= len(translated_texts):
        spine_file.write_texts(translated_texts)
        spine_file = None

      # Advance to the next spine file that actually contains texts.
      while spine_file is None:
        spine_path = next(spine_paths_iter, None)
        if spine_path is None:
          did_touch_end = True
          break
        spine_file = context.read_spine_file(spine_path)
        if spine_file.texts_length == 0:
          spine_file = None
          continue
        translated_texts = [""] * spine_file.texts_length
        translated_count = 0

      translated_texts[translated_count] = translated_text
      translated_count += 1

      if did_touch_end:
        break
    # Flush the partially-filled final file, if any.
    if spine_file and translated_count > 0:
      spine_file.write_texts(translated_texts)

      context.write_spine_file(spine_path, spine_file)
|
|
152
|
+
|
|
153
|
+
def _gen_fragments(context: ZipContext):
  """Yield every text of every spine file as an unsplittable Fragment."""
  for path in context.search_spine_paths():
    html_file = context.read_spine_file(path)
    for fragment_text in html_file.read_texts():
      yield Fragment(
        text=fragment_text,
        start_incision=Incision.IMPOSSIBLE,
        end_incision=Incision.IMPOSSIBLE,
      )
|
|
162
|
+
|
|
163
|
+
def _clean_path(path: Path) -> Path:
|
|
164
|
+
if path.exists():
|
|
165
|
+
if path.is_file():
|
|
166
|
+
path.unlink()
|
|
167
|
+
elif path.is_dir():
|
|
168
|
+
rmtree(path, ignore_errors=True)
|
|
169
|
+
return path
|