epub-translator 0.0.6__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/__init__.py +3 -1
- epub_translator/data/fill.jinja +66 -0
- epub_translator/data/mmltex/README.md +67 -0
- epub_translator/data/mmltex/cmarkup.xsl +1106 -0
- epub_translator/data/mmltex/entities.xsl +459 -0
- epub_translator/data/mmltex/glayout.xsl +222 -0
- epub_translator/data/mmltex/mmltex.xsl +36 -0
- epub_translator/data/mmltex/scripts.xsl +375 -0
- epub_translator/data/mmltex/tables.xsl +130 -0
- epub_translator/data/mmltex/tokens.xsl +328 -0
- epub_translator/data/translate.jinja +15 -12
- epub_translator/epub/__init__.py +4 -2
- epub_translator/epub/common.py +43 -0
- epub_translator/epub/math.py +193 -0
- epub_translator/epub/placeholder.py +53 -0
- epub_translator/epub/spines.py +42 -0
- epub_translator/epub/toc.py +505 -0
- epub_translator/epub/zip.py +67 -0
- epub_translator/iter_sync.py +24 -0
- epub_translator/language.py +23 -0
- epub_translator/llm/__init__.py +2 -1
- epub_translator/llm/core.py +175 -0
- epub_translator/llm/error.py +38 -35
- epub_translator/llm/executor.py +159 -136
- epub_translator/llm/increasable.py +28 -28
- epub_translator/llm/types.py +17 -0
- epub_translator/serial/__init__.py +2 -0
- epub_translator/serial/chunk.py +52 -0
- epub_translator/serial/segment.py +17 -0
- epub_translator/serial/splitter.py +50 -0
- epub_translator/template.py +35 -33
- epub_translator/translator.py +205 -168
- epub_translator/utils.py +7 -0
- epub_translator/xml/__init__.py +4 -3
- epub_translator/xml/deduplication.py +38 -0
- epub_translator/xml/firendly/__init__.py +2 -0
- epub_translator/xml/firendly/decoder.py +75 -0
- epub_translator/xml/firendly/encoder.py +84 -0
- epub_translator/xml/firendly/parser.py +177 -0
- epub_translator/xml/firendly/tag.py +118 -0
- epub_translator/xml/firendly/transform.py +36 -0
- epub_translator/xml/xml.py +52 -0
- epub_translator/xml/xml_like.py +176 -0
- epub_translator/xml_translator/__init__.py +3 -0
- epub_translator/xml_translator/const.py +2 -0
- epub_translator/xml_translator/fill.py +128 -0
- epub_translator/xml_translator/format.py +282 -0
- epub_translator/xml_translator/fragmented.py +125 -0
- epub_translator/xml_translator/group.py +183 -0
- epub_translator/xml_translator/progressive_locking.py +256 -0
- epub_translator/xml_translator/submitter.py +102 -0
- epub_translator/xml_translator/text_segment.py +263 -0
- epub_translator/xml_translator/translator.py +178 -0
- epub_translator/xml_translator/utils.py +29 -0
- epub_translator-0.1.0.dist-info/METADATA +283 -0
- epub_translator-0.1.0.dist-info/RECORD +58 -0
- epub_translator/data/format.jinja +0 -33
- epub_translator/epub/content_parser.py +0 -162
- epub_translator/epub/html/__init__.py +0 -1
- epub_translator/epub/html/dom_operator.py +0 -62
- epub_translator/epub/html/empty_tags.py +0 -23
- epub_translator/epub/html/file.py +0 -80
- epub_translator/epub/html/texts_searcher.py +0 -46
- epub_translator/llm/node.py +0 -201
- epub_translator/translation/__init__.py +0 -2
- epub_translator/translation/chunk.py +0 -118
- epub_translator/translation/splitter.py +0 -78
- epub_translator/translation/store.py +0 -36
- epub_translator/translation/translation.py +0 -231
- epub_translator/translation/types.py +0 -45
- epub_translator/translation/utils.py +0 -11
- epub_translator/xml/decoder.py +0 -71
- epub_translator/xml/encoder.py +0 -95
- epub_translator/xml/parser.py +0 -172
- epub_translator/xml/tag.py +0 -93
- epub_translator/xml/transform.py +0 -34
- epub_translator/xml/utils.py +0 -12
- epub_translator/zip_context.py +0 -74
- epub_translator-0.0.6.dist-info/METADATA +0 -170
- epub_translator-0.0.6.dist-info/RECORD +0 -36
- {epub_translator-0.0.6.dist-info → epub_translator-0.1.0.dist-info}/LICENSE +0 -0
- {epub_translator-0.0.6.dist-info → epub_translator-0.1.0.dist-info}/WHEEL +0 -0
epub_translator/serial/chunk.py
ADDED

@@ -0,0 +1,52 @@
+from collections.abc import Generator, Iterable
+from dataclasses import dataclass
+from typing import Generic
+
+from resource_segmentation import Resource, Segment, split
+
+from .segment import ST
+
+_INCISION = 0
+
+
+@dataclass
+class Chunk(Generic[ST]):
+    head_remain_tokens: int
+    tail_remain_tokens: int
+    head: list[ST]
+    body: list[ST]
+    tail: list[ST]
+
+
+def split_into_chunks(segments: Iterable[ST], max_group_tokens: int) -> Generator[Chunk[ST], None, None]:
+    for group in split(
+        max_segment_count=max_group_tokens,
+        gap_rate=0.07,
+        tail_rate=0.5,
+        border_incision=_INCISION,
+        resources=(
+            Resource(
+                count=segment.tokens,
+                start_incision=_INCISION,
+                end_incision=_INCISION,
+                payload=segment,
+            )
+            for segment in segments
+        ),
+    ):
+        yield Chunk(
+            head_remain_tokens=group.head_remain_count,
+            tail_remain_tokens=group.tail_remain_count,
+            head=list(_expand_payloads(group.head)),
+            body=list(_expand_payloads(group.body)),
+            tail=list(_expand_payloads(group.tail)),
+        )
+
+
+def _expand_payloads(target: list[Resource[ST] | Segment[ST]]) -> Generator[ST, None, None]:
+    for item in target:
+        if isinstance(item, Resource):
+            yield item.payload
+        elif isinstance(item, Segment):
+            for resource in item.resources:
+                yield resource.payload
epub_translator/serial/segment.py
ADDED

@@ -0,0 +1,17 @@
+from typing import Generic, Protocol, Self, TypeVar, runtime_checkable
+
+S = TypeVar("S", covariant=True)
+T = TypeVar("T")
+ST = TypeVar("ST", bound="Segment")
+
+
+@runtime_checkable
+class Segment(Protocol, Generic[S]):
+    @property
+    def tokens(self) -> int: ...
+
+    @property
+    def payload(self) -> S: ...
+
+    def truncate_after_head(self, remain_tokens: int) -> Self: ...
+    def truncate_before_tail(self, remain_tokens: int) -> Self: ...
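Because the protocol is marked @runtime_checkable, any class exposing these four members satisfies it structurally. A minimal sketch of a conforming segment type follows; this TextSegment is illustrative only (not part of the package), and the truncate semantics (keep the first / last remain_tokens tokens) are inferred from how splitter.py calls these methods:

from dataclasses import dataclass

from epub_translator.serial.segment import Segment


@dataclass(frozen=True)
class TextSegment:
    text: str

    @property
    def tokens(self) -> int:
        # crude token count: whitespace-separated words
        return len(self.text.split())

    @property
    def payload(self) -> str:
        return self.text

    def truncate_after_head(self, remain_tokens: int) -> "TextSegment":
        # keep only the first remain_tokens tokens
        return TextSegment(" ".join(self.text.split()[:remain_tokens]))

    def truncate_before_tail(self, remain_tokens: int) -> "TextSegment":
        # keep only the last remain_tokens tokens
        return TextSegment(" ".join(self.text.split()[-remain_tokens:]))


assert isinstance(TextSegment("hello world"), Segment)  # structural check passes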
epub_translator/serial/splitter.py
ADDED

@@ -0,0 +1,50 @@
+from collections.abc import Callable, Generator, Iterable
+
+from .chunk import split_into_chunks
+from .segment import ST, T
+
+
+def split(
+    segments: Iterable[ST],
+    transform: Callable[[list[ST]], list[T]],
+    max_group_tokens: int,
+) -> Generator[T, None, None]:
+    for group in split_into_chunks(segments, max_group_tokens):
+        head = list(
+            _truncate_extra_content(
+                segments=group.head,
+                remain_left=False,
+                remain_tokens=group.head_remain_tokens,
+            )
+        )
+        tail = list(
+            _truncate_extra_content(
+                segments=group.tail,
+                remain_left=True,
+                remain_tokens=group.tail_remain_tokens,
+            )
+        )
+        transformed = transform(head + group.body + tail)
+
+        if len(tail) > 0:  # avoid the target[N:-0] slicing pitfall
+            yield from transformed[len(head) : -len(tail)]
+        else:
+            yield from transformed[len(head) :]
+
+
+def _truncate_extra_content(segments: list[ST], remain_left: bool, remain_tokens: int):
+    tokens_list: list[int] = [segment.tokens for segment in segments]
+    segments = list(segments)
+    for tokens in tokens_list if remain_left else reversed(tokens_list):
+        if remain_tokens <= 0:
+            break
+        next_segment = segments.pop(0) if remain_left else segments.pop()
+        if remain_tokens < tokens:
+            if remain_left:
+                next_segment = next_segment.truncate_after_head(remain_tokens)
+            else:
+                next_segment = next_segment.truncate_before_tail(remain_tokens)
+            remain_tokens = 0
+        else:
+            remain_tokens -= tokens
+        yield next_segment
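Putting the three files together: split hands each group's head and tail to transform as surrounding context, then slices the context back off so every segment's result is yielded exactly once. A rough usage sketch, reusing the TextSegment class sketched above; fake_translate is a stand-in for a real LLM call:

from epub_translator.serial.splitter import split


def fake_translate(batch: list[TextSegment]) -> list[str]:
    # must return one result per input segment; head/tail context
    # is transformed too, but split drops those positions afterwards
    return [segment.text.upper() for segment in batch]


segments = [TextSegment("a b c"), TextSegment("d e"), TextSegment("f g h i")]
results = list(split(segments, transform=fake_translate, max_group_tokens=4))
# one result per original segment, context windows excluded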
epub_translator/template.py
CHANGED
@@ -1,50 +1,52 @@
 import re
-… (old line not shown)
-from typing import Tuple, Callable
+from collections.abc import Callable
 from pathlib import Path
-… (old line not shown)
+
+from jinja2 import BaseLoader, Environment, TemplateNotFound, select_autoescape
 
 
 def create_env(dir_path: Path) -> Environment:
-… (6 old lines not shown)
+    return Environment(
+        loader=_DSLoader(dir_path),
+        autoescape=select_autoescape(),
+        trim_blocks=True,
+        keep_trailing_newline=True,
+    )
+
+
+_LoaderResult = tuple[str, str | None, Callable[[], bool] | None]
 
-_LoaderResult = Tuple[str, str | None, Callable[[], bool] | None]
 
 class _DSLoader(BaseLoader):
-… (3 old lines not shown)
+    def __init__(self, dir_path: Path):
+        super().__init__()
+        self._dir_path: Path = dir_path
 
-… (3 old lines not shown)
+    def get_source(self, environment: Environment, template: str) -> _LoaderResult:
+        template = self._norm_template(template)
+        target_path = (self._dir_path / template).resolve()
 
-… (2 old lines not shown)
+        if not target_path.exists():
+            raise TemplateNotFound(f"cannot find {template}")
 
-… (old line not shown)
+        return self._get_source_with_path(target_path)
 
-… (3 old lines not shown)
+    def _norm_template(self, template: str) -> str:
+        if bool(re.match(r"^\.+/", template)):
+            raise TemplateNotFound(f"invalid path {template}")
 
-… (3 old lines not shown)
+        template = re.sub(r"^/", "", template)
+        template = re.sub(r"\.jinja$", "", template, flags=re.IGNORECASE)
+        template = f"{template}.jinja"
 
-… (old line not shown)
+        return template
 
-… (4 old lines not shown)
+    def _get_source_with_path(self, path: Path) -> _LoaderResult:
+        mtime = path.stat().st_mtime
+        with open(path, encoding="utf-8") as f:
+            source = f.read()
 
-… (2 old lines not shown)
+        def is_updated() -> bool:
+            return mtime == path.stat().st_mtime
 
-… (old line not shown)
+        return source, str(path), is_updated
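The loader resolves template names against the package's data directory and normalizes them to .jinja files, so rendering a prompt looks roughly like this sketch (the directory and the render variables are illustrative):

from pathlib import Path

from epub_translator.template import create_env

env = create_env(Path("epub_translator") / "data")
# "translate", "/translate", and "translate.jinja" all resolve to data/translate.jinja
template = env.get_template("translate")
prompt = template.render(target_language="French")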
epub_translator/translator.py
CHANGED
@@ -1,174 +1,211 @@
-from …
+from collections.abc import Callable
 from pathlib import Path
-from …
-from shutil import rmtree
+from xml.etree.ElementTree import Element
 
+from .epub import Placeholder, Zip, is_placeholder_tag, read_toc, search_spine_paths, write_toc
+from .epub.common import find_opf_path
 from .llm import LLM
-from .…
-from .…
-from .translation import translate as _translate, Incision, Fragment, Language, ProgressReporter
+from .xml import XMLLikeNode, deduplicate_ids_in_element, find_first, plain_text
+from .xml_translator import XMLGroupContext, XMLTranslator, submit_text_segments
 
 
 def translate(
-… (162 old lines not shown in this diff view)
+    llm: LLM,
+    source_path: Path,
+    target_path: Path,
+    target_language: str,
+    user_prompt: str | None = None,
+    max_retries: int = 5,
+    max_group_tokens: int = 1200,
+    on_progress: Callable[[float], None] | None = None,
+) -> None:
+    translator = XMLTranslator(
+        llm=llm,
+        target_language=target_language,
+        user_prompt=user_prompt,
+        ignore_translated_error=False,
+        max_retries=max_retries,
+        max_fill_displaying_errors=10,
+        group_context=XMLGroupContext(
+            encoding=llm.encoding,
+            max_group_tokens=max_group_tokens,
+        ),
+    )
+    with Zip(source_path, target_path) as zip:
+        # Progress distribution: TOC 3%, metadata 2%, chapters 95%
+        TOC_PROGRESS = 0.03
+        METADATA_PROGRESS = 0.02
+        CHAPTERS_PROGRESS = 0.95
+
+        # Count total chapters for progress calculation (lightweight, no content loading)
+        total_chapters = _count_chapters(zip)
+        chapter_progress_step = CHAPTERS_PROGRESS / total_chapters if total_chapters > 0 else 0
+
+        current_progress = 0.0
+
+        # Translate TOC
+        _translate_toc(translator, zip)
+        current_progress += TOC_PROGRESS
+        if on_progress:
+            on_progress(current_progress)
+
+        # Translate metadata
+        _translate_metadata(translator, zip)
+        current_progress += METADATA_PROGRESS
+        if on_progress:
+            on_progress(current_progress)
+
+        # Translate chapters
+        processed_chapters = 0
+        for element, text_segments, (chapter_path, xml, placeholder) in translator.translate_to_text_segments(
+            items=_search_chapter_items(zip),
+        ):
+            submit_text_segments(
+                element=element,
+                text_segments=(
+                    segment
+                    for segment in text_segments
+                    if not any(is_placeholder_tag(e.tag) for e in segment.parent_stack)
+                ),
+            )
+            placeholder.recover()
+            deduplicate_ids_in_element(xml.element)
+            with zip.replace(chapter_path) as target_file:
+                xml.save(target_file, is_html_like=True)
+
+            # Update progress after each chapter
+            processed_chapters += 1
+            current_progress = TOC_PROGRESS + METADATA_PROGRESS + (processed_chapters * chapter_progress_step)
+            if on_progress:
+                on_progress(current_progress)
+
+
+def _translate_toc(translator: XMLTranslator, zip: Zip):
+    """Translate TOC (Table of Contents) titles."""
+    toc_list = read_toc(zip)
+    if not toc_list:
+        return
+
+    # Collect all titles recursively
+    titles_to_translate: list[str] = []
+
+    def collect_titles(items):
+        for item in items:
+            titles_to_translate.append(item.title)
+            if item.children:
+                collect_titles(item.children)
+
+    collect_titles(toc_list)
+
+    # Create XML elements for translation
+    elements_to_translate = Element("toc")
+    elements_to_translate.extend(_create_text_element(title) for title in titles_to_translate)
+
+    # Translate all titles at once
+    translated_element = translator.translate_to_element(elements_to_translate)
+
+    # Extract translated texts
+    from builtins import zip as builtin_zip
+
+    translated_titles = [
+        plain_text(elem) if elem is not None else original
+        for elem, original in builtin_zip(translated_element, titles_to_translate)
+    ]
+
+    # Fill back translated titles
+    title_index = 0
+
+    def fill_titles(items):
+        nonlocal title_index
+        for item in items:
+            item.title = translated_titles[title_index]
+            title_index += 1
+            if item.children:
+                fill_titles(item.children)
+
+    fill_titles(toc_list)
+
+    # Write back the translated TOC
+    write_toc(zip, toc_list)
+
+
+def _translate_metadata(translator: XMLTranslator, zip: Zip):
+    """Translate metadata fields in OPF file."""
+    opf_path = find_opf_path(zip)
+
+    with zip.read(opf_path) as f:
+        xml = XMLLikeNode(f)
+
+    # Find metadata element
+    metadata_elem = None
+    for child in xml.element:
+        if child.tag.endswith("metadata"):
+            metadata_elem = child
+            break
+
+    if metadata_elem is None:
+        return
+
+    # Collect metadata fields to translate
+    # Skip fields that should not be translated
+    skip_fields = {
+        "language",
+        "identifier",
+        "date",
+        "meta",
+        "contributor",  # Usually technical information
+    }
+
+    fields_to_translate: list[tuple[Element, str]] = []
+
+    for elem in metadata_elem:
+        # Get tag name without namespace
+        tag_name = elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag
+
+        # Check if element has text content and should be translated
+        if elem.text and elem.text.strip() and tag_name not in skip_fields:
+            fields_to_translate.append((elem, elem.text.strip()))
+
+    if not fields_to_translate:
+        return
+
+    # Create XML elements for translation
+    elements_to_translate = Element("metadata")
+    elements_to_translate.extend(_create_text_element(text) for _, text in fields_to_translate)
+
+    # Translate all metadata at once
+    translated_element = translator.translate_to_element(elements_to_translate)
+
+    # Fill back translated texts
+    from builtins import zip as builtin_zip
+
+    for (elem, _), translated_elem in builtin_zip(fields_to_translate, translated_element, strict=True):
+        if translated_elem is not None:
+            translated_text = plain_text(translated_elem)
+            if translated_text:
+                elem.text = translated_text
+
+    # Write back the modified OPF file
+    with zip.replace(opf_path) as f:
+        xml.save(f)
+
+
+def _count_chapters(zip: Zip) -> int:
+    """Count total chapters without loading content (lightweight)."""
+    return sum(1 for _ in search_spine_paths(zip))
+
+
+def _search_chapter_items(zip: Zip):
+    for chapter_path in search_spine_paths(zip):
+        with zip.read(chapter_path) as chapter_file:
+            xml = XMLLikeNode(chapter_file)
+            body_element = find_first(xml.element, "body")
+            if body_element is not None:
+                placeholder = Placeholder(body_element)
+                yield body_element, (chapter_path, xml, placeholder)
+
+
+def _create_text_element(text: str) -> Element:
+    elem = Element("text")
+    elem.text = text
+    return elem
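Taken together, the rewritten entry point now owns TOC, metadata, and chapter translation plus progress reporting end to end. A call might look like the sketch below; only translate's signature comes from this diff, while the LLM constructor arguments are assumptions about the package's API rather than anything shown here:

from pathlib import Path

from epub_translator import LLM, translate

llm = LLM(
    key="sk-...",                     # assumed: OpenAI-compatible credentials
    url="https://api.openai.com/v1",  # assumed endpoint
    model="gpt-4o",                   # assumed model name
    token_encoding="o200k_base",      # assumed tiktoken encoding name
)
translate(
    llm=llm,
    source_path=Path("book.epub"),
    target_path=Path("book.zh.epub"),
    target_language="Simplified Chinese",
    max_group_tokens=1200,
    on_progress=lambda p: print(f"progress: {p:.0%}"),
)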
epub_translator/utils.py
ADDED
epub_translator/xml/__init__.py
CHANGED
@@ -1,3 +1,4 @@
-from .…
-from .…
-from .…
+from .deduplication import *
+from .firendly import *
+from .xml import *
+from .xml_like import *

epub_translator/xml/deduplication.py
ADDED
@@ -0,0 +1,38 @@
+from xml.etree.ElementTree import Element
+
+from .xml import iter_with_stack
+
+_ID_KEY = "id"
+_SUFFIX = "__translated"
+
+
+def deduplicate_ids_in_element(element: Element) -> Element:
+    seen_ids: set[str] = set()
+    original_id_count: dict[str, int] = {}
+
+    for _, sub_element in iter_with_stack(element):
+        if _ID_KEY not in sub_element.attrib:
+            continue
+        original_id = sub_element.attrib[_ID_KEY]
+
+        if original_id not in seen_ids:
+            seen_ids.add(original_id)
+            original_id_count[original_id] = 1
+        else:
+            original_id_count[original_id] = original_id_count.get(original_id, 1) + 1
+            occurrence = original_id_count[original_id]
+
+            if occurrence == 2:
+                new_id = f"{original_id}{_SUFFIX}"
+            else:
+                new_id = f"{original_id}{_SUFFIX}_{occurrence - 1}"
+
+            counter = occurrence - 1
+            while new_id in seen_ids:
+                counter += 1
+                new_id = f"{original_id}{_SUFFIX}_{counter}"
+
+            sub_element.attrib["id"] = new_id
+            seen_ids.add(new_id)
+
+    return element
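The __translated suffix suggests this pass handles ids duplicated when translated copies of nodes are inserted alongside their originals: the first occurrence keeps its id, later ones are renamed. A small sketch of the renaming behavior, importing through the package's xml module (which re-exports it per the __init__.py change above) and assuming iter_with_stack walks the whole subtree:

from xml.etree.ElementTree import Element, SubElement

from epub_translator.xml import deduplicate_ids_in_element

root = Element("body")
for _ in range(3):
    SubElement(root, "p", {"id": "intro"})  # e.g. original plus translated copies

deduplicate_ids_in_element(root)
print([p.get("id") for p in root])
# ['intro', 'intro__translated', 'intro__translated_2']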