epub-translator 0.0.6__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. epub_translator/__init__.py +3 -1
  2. epub_translator/data/fill.jinja +66 -0
  3. epub_translator/data/mmltex/README.md +67 -0
  4. epub_translator/data/mmltex/cmarkup.xsl +1106 -0
  5. epub_translator/data/mmltex/entities.xsl +459 -0
  6. epub_translator/data/mmltex/glayout.xsl +222 -0
  7. epub_translator/data/mmltex/mmltex.xsl +36 -0
  8. epub_translator/data/mmltex/scripts.xsl +375 -0
  9. epub_translator/data/mmltex/tables.xsl +130 -0
  10. epub_translator/data/mmltex/tokens.xsl +328 -0
  11. epub_translator/data/translate.jinja +15 -12
  12. epub_translator/epub/__init__.py +4 -2
  13. epub_translator/epub/common.py +43 -0
  14. epub_translator/epub/math.py +193 -0
  15. epub_translator/epub/placeholder.py +53 -0
  16. epub_translator/epub/spines.py +42 -0
  17. epub_translator/epub/toc.py +505 -0
  18. epub_translator/epub/zip.py +67 -0
  19. epub_translator/iter_sync.py +24 -0
  20. epub_translator/language.py +23 -0
  21. epub_translator/llm/__init__.py +2 -1
  22. epub_translator/llm/core.py +175 -0
  23. epub_translator/llm/error.py +38 -35
  24. epub_translator/llm/executor.py +159 -136
  25. epub_translator/llm/increasable.py +28 -28
  26. epub_translator/llm/types.py +17 -0
  27. epub_translator/serial/__init__.py +2 -0
  28. epub_translator/serial/chunk.py +52 -0
  29. epub_translator/serial/segment.py +17 -0
  30. epub_translator/serial/splitter.py +50 -0
  31. epub_translator/template.py +35 -33
  32. epub_translator/translator.py +205 -168
  33. epub_translator/utils.py +7 -0
  34. epub_translator/xml/__init__.py +4 -3
  35. epub_translator/xml/deduplication.py +38 -0
  36. epub_translator/xml/firendly/__init__.py +2 -0
  37. epub_translator/xml/firendly/decoder.py +75 -0
  38. epub_translator/xml/firendly/encoder.py +84 -0
  39. epub_translator/xml/firendly/parser.py +177 -0
  40. epub_translator/xml/firendly/tag.py +118 -0
  41. epub_translator/xml/firendly/transform.py +36 -0
  42. epub_translator/xml/xml.py +52 -0
  43. epub_translator/xml/xml_like.py +176 -0
  44. epub_translator/xml_translator/__init__.py +3 -0
  45. epub_translator/xml_translator/const.py +2 -0
  46. epub_translator/xml_translator/fill.py +128 -0
  47. epub_translator/xml_translator/format.py +282 -0
  48. epub_translator/xml_translator/fragmented.py +125 -0
  49. epub_translator/xml_translator/group.py +183 -0
  50. epub_translator/xml_translator/progressive_locking.py +256 -0
  51. epub_translator/xml_translator/submitter.py +102 -0
  52. epub_translator/xml_translator/text_segment.py +263 -0
  53. epub_translator/xml_translator/translator.py +178 -0
  54. epub_translator/xml_translator/utils.py +29 -0
  55. epub_translator-0.1.0.dist-info/METADATA +283 -0
  56. epub_translator-0.1.0.dist-info/RECORD +58 -0
  57. epub_translator/data/format.jinja +0 -33
  58. epub_translator/epub/content_parser.py +0 -162
  59. epub_translator/epub/html/__init__.py +0 -1
  60. epub_translator/epub/html/dom_operator.py +0 -62
  61. epub_translator/epub/html/empty_tags.py +0 -23
  62. epub_translator/epub/html/file.py +0 -80
  63. epub_translator/epub/html/texts_searcher.py +0 -46
  64. epub_translator/llm/node.py +0 -201
  65. epub_translator/translation/__init__.py +0 -2
  66. epub_translator/translation/chunk.py +0 -118
  67. epub_translator/translation/splitter.py +0 -78
  68. epub_translator/translation/store.py +0 -36
  69. epub_translator/translation/translation.py +0 -231
  70. epub_translator/translation/types.py +0 -45
  71. epub_translator/translation/utils.py +0 -11
  72. epub_translator/xml/decoder.py +0 -71
  73. epub_translator/xml/encoder.py +0 -95
  74. epub_translator/xml/parser.py +0 -172
  75. epub_translator/xml/tag.py +0 -93
  76. epub_translator/xml/transform.py +0 -34
  77. epub_translator/xml/utils.py +0 -12
  78. epub_translator/zip_context.py +0 -74
  79. epub_translator-0.0.6.dist-info/METADATA +0 -170
  80. epub_translator-0.0.6.dist-info/RECORD +0 -36
  81. {epub_translator-0.0.6.dist-info → epub_translator-0.1.0.dist-info}/LICENSE +0 -0
  82. {epub_translator-0.0.6.dist-info → epub_translator-0.1.0.dist-info}/WHEEL +0 -0
epub_translator/serial/chunk.py
@@ -0,0 +1,52 @@
+ from collections.abc import Generator, Iterable
+ from dataclasses import dataclass
+ from typing import Generic
+
+ from resource_segmentation import Resource, Segment, split
+
+ from .segment import ST
+
+ _INCISION = 0
+
+
+ @dataclass
+ class Chunk(Generic[ST]):
+     head_remain_tokens: int
+     tail_remain_tokens: int
+     head: list[ST]
+     body: list[ST]
+     tail: list[ST]
+
+
+ def split_into_chunks(segments: Iterable[ST], max_group_tokens: int) -> Generator[Chunk[ST], None, None]:
+     for group in split(
+         max_segment_count=max_group_tokens,
+         gap_rate=0.07,
+         tail_rate=0.5,
+         border_incision=_INCISION,
+         resources=(
+             Resource(
+                 count=segment.tokens,
+                 start_incision=_INCISION,
+                 end_incision=_INCISION,
+                 payload=segment,
+             )
+             for segment in segments
+         ),
+     ):
+         yield Chunk(
+             head_remain_tokens=group.head_remain_count,
+             tail_remain_tokens=group.tail_remain_count,
+             head=list(_expand_payloads(group.head)),
+             body=list(_expand_payloads(group.body)),
+             tail=list(_expand_payloads(group.tail)),
+         )
+
+
+ def _expand_payloads(target: list[Resource[ST] | Segment[ST]]) -> Generator[ST, None, None]:
+     for item in target:
+         if isinstance(item, Resource):
+             yield item.payload
+         elif isinstance(item, Segment):
+             for resource in item.resources:
+                 yield resource.payload
epub_translator/serial/segment.py
@@ -0,0 +1,17 @@
+ from typing import Generic, Protocol, Self, TypeVar, runtime_checkable
+
+ S = TypeVar("S", covariant=True)
+ T = TypeVar("T")
+ ST = TypeVar("ST", bound="Segment")
+
+
+ @runtime_checkable
+ class Segment(Protocol, Generic[S]):
+     @property
+     def tokens(self) -> int: ...
+
+     @property
+     def payload(self) -> S: ...
+
+     def truncate_after_head(self, remain_tokens: int) -> Self: ...
+     def truncate_before_tail(self, remain_tokens: int) -> Self: ...
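Note: the Segment protocol above is structural, so any object exposing tokens, payload, and the two truncate methods can flow through the serial package. A minimal sketch of a conforming type (the TextSegment name and the whitespace-based token counting are illustrative assumptions, not part of this release):

from dataclasses import dataclass, replace


@dataclass(frozen=True)
class TextSegment:
    """Hypothetical segment whose size is its whitespace-separated word count."""

    text: str

    @property
    def tokens(self) -> int:
        return len(self.text.split())

    @property
    def payload(self) -> str:
        return self.text

    def truncate_after_head(self, remain_tokens: int) -> "TextSegment":
        # Keep only the first remain_tokens words.
        return replace(self, text=" ".join(self.text.split()[:remain_tokens]))

    def truncate_before_tail(self, remain_tokens: int) -> "TextSegment":
        # Keep only the last remain_tokens words.
        return replace(self, text=" ".join(self.text.split()[-remain_tokens:]))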
epub_translator/serial/splitter.py
@@ -0,0 +1,50 @@
+ from collections.abc import Callable, Generator, Iterable
+
+ from .chunk import split_into_chunks
+ from .segment import ST, T
+
+
+ def split(
+     segments: Iterable[ST],
+     transform: Callable[[list[ST]], list[T]],
+     max_group_tokens: int,
+ ) -> Generator[T, None, None]:
+     for group in split_into_chunks(segments, max_group_tokens):
+         head = list(
+             _truncate_extra_content(
+                 segments=group.head,
+                 remain_left=False,
+                 remain_tokens=group.head_remain_tokens,
+             )
+         )
+         tail = list(
+             _truncate_extra_content(
+                 segments=group.tail,
+                 remain_left=True,
+                 remain_tokens=group.tail_remain_tokens,
+             )
+         )
+         transformed = transform(head + group.body + tail)
+
+         if len(tail) > 0:  # avoid the target[N:-0] slicing error
+             yield from transformed[len(head) : -len(tail)]
+         else:
+             yield from transformed[len(head) :]
+
+
+ def _truncate_extra_content(segments: list[ST], remain_left: bool, remain_tokens: int):
+     tokens_list: list[int] = [segment.tokens for segment in segments]
+     segments = list(segments)
+     for tokens in tokens_list if remain_left else reversed(tokens_list):
+         if remain_tokens <= 0:
+             break
+         next_segment = segments.pop(0) if remain_left else segments.pop()
+         if remain_tokens < tokens:
+             if remain_left:
+                 next_segment = next_segment.truncate_after_head(remain_tokens)
+             else:
+                 next_segment = next_segment.truncate_before_tail(remain_tokens)
+             remain_tokens = 0
+         else:
+             remain_tokens -= tokens
+         yield next_segment
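For orientation: split_into_chunks groups segments with overlapping head/tail context, and split runs transform over head + body + tail of each group while yielding only the body results, so every segment is produced exactly once. A rough usage sketch built on the hypothetical TextSegment above (the upper-casing transform stands in for the real LLM call; any length-preserving transform works):

from epub_translator.serial.splitter import split

segments = [
    TextSegment("The quick brown fox jumps over the lazy dog."),
    TextSegment("Pack my box with five dozen liquor jugs."),
]

# transform receives head + body + tail and must return one result per input segment.
results = list(
    split(
        segments=segments,
        transform=lambda group: [seg.payload.upper() for seg in group],
        max_group_tokens=1200,
    )
)
print(results)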
epub_translator/template.py
@@ -1,50 +1,52 @@
  import re
-
- from typing import Tuple, Callable
+ from collections.abc import Callable
  from pathlib import Path
- from jinja2 import select_autoescape, Environment, BaseLoader, TemplateNotFound
+
+ from jinja2 import BaseLoader, Environment, TemplateNotFound, select_autoescape


  def create_env(dir_path: Path) -> Environment:
-   return Environment(
-     loader=_DSLoader(dir_path),
-     autoescape=select_autoescape(),
-     trim_blocks=True,
-     keep_trailing_newline=True,
-   )
+     return Environment(
+         loader=_DSLoader(dir_path),
+         autoescape=select_autoescape(),
+         trim_blocks=True,
+         keep_trailing_newline=True,
+     )
+
+
+ _LoaderResult = tuple[str, str | None, Callable[[], bool] | None]

- _LoaderResult = Tuple[str, str | None, Callable[[], bool] | None]

  class _DSLoader(BaseLoader):
-   def __init__(self, dir_path: Path):
-     super().__init__()
-     self._dir_path: Path = dir_path
+     def __init__(self, dir_path: Path):
+         super().__init__()
+         self._dir_path: Path = dir_path

-   def get_source(self, _: Environment, template: str) -> _LoaderResult:
-     template = self._norm_template(template)
-     target_path = (self._dir_path / template).resolve()
+     def get_source(self, environment: Environment, template: str) -> _LoaderResult:
+         template = self._norm_template(template)
+         target_path = (self._dir_path / template).resolve()

-     if not target_path.exists():
-       raise TemplateNotFound(f"cannot find {template}")
+         if not target_path.exists():
+             raise TemplateNotFound(f"cannot find {template}")

-     return self._get_source_with_path(target_path)
+         return self._get_source_with_path(target_path)

-   def _norm_template(self, template: str) -> str:
-     if bool(re.match(r"^\.+/", template)):
-       raise TemplateNotFound(f"invalid path {template}")
+     def _norm_template(self, template: str) -> str:
+         if bool(re.match(r"^\.+/", template)):
+             raise TemplateNotFound(f"invalid path {template}")

-     template = re.sub(r"^/", "", template)
-     template = re.sub(r"\.jinja$", "", template, flags=re.IGNORECASE)
-     template = f"{template}.jinja"
+         template = re.sub(r"^/", "", template)
+         template = re.sub(r"\.jinja$", "", template, flags=re.IGNORECASE)
+         template = f"{template}.jinja"

-     return template
+         return template

-   def _get_source_with_path(self, path: Path) -> _LoaderResult:
-     mtime = path.stat().st_mtime
-     with open(path, "r", encoding="utf-8") as f:
-       source = f.read()
+     def _get_source_with_path(self, path: Path) -> _LoaderResult:
+         mtime = path.stat().st_mtime
+         with open(path, encoding="utf-8") as f:
+             source = f.read()

-     def is_updated() -> bool:
-       return mtime == path.stat().st_mtime
+         def is_updated() -> bool:
+             return mtime == path.stat().st_mtime

-     return source, path, is_updated
+         return source, str(path), is_updated
epub_translator/translator.py
@@ -1,174 +1,211 @@
- from os import PathLike
+ from collections.abc import Callable
  from pathlib import Path
- from tempfile import mkdtemp
- from shutil import rmtree
+ from xml.etree.ElementTree import Element

+ from .epub import Placeholder, Zip, is_placeholder_tag, read_toc, search_spine_paths, write_toc
+ from .epub.common import find_opf_path
  from .llm import LLM
- from .epub import HTMLFile
- from .zip_context import ZipContext
- from .translation import translate as _translate, Incision, Fragment, Language, ProgressReporter
+ from .xml import XMLLikeNode, deduplicate_ids_in_element, find_first, plain_text
+ from .xml_translator import XMLGroupContext, XMLTranslator, submit_text_segments


  def translate(
-   llm: LLM,
-   source_path: PathLike,
-   translated_path: PathLike,
-   target_language: Language,
-   user_prompt: str | None = None,
-   working_path: PathLike | None = None,
-   max_chunk_tokens_count: int = 3000,
-   max_threads_count: int = 1,
-   report_progress: ProgressReporter | None = None,
- ) -> None:
-
-   source_path = Path(source_path)
-   translated_path = Path(translated_path)
-   working_path = Path(working_path) if working_path else None
-   report_progress = report_progress or (lambda _: None)
-
-   _Translator(
-     llm=llm,
-     target_language=target_language,
-     user_prompt=user_prompt,
-     max_chunk_tokens_count=max_chunk_tokens_count,
-     max_threads_count=max_threads_count,
-     report_progress=report_progress,
-   ).do(
-     source_path=source_path,
-     translated_path=translated_path,
-     working_path=working_path,
-   )
-
- class _Translator:
-   def __init__(
-     self,
-     llm: LLM,
-     target_language: Language,
-     user_prompt: str | None,
-     max_chunk_tokens_count: int,
-     max_threads_count: int,
-     report_progress: ProgressReporter,
-   ) -> None:
-
-     self._llm: LLM = llm
-     self._target_language: Language = target_language
-     self._user_prompt: str | None = user_prompt
-     self._max_chunk_tokens_count: int = max_chunk_tokens_count
-     self._max_threads_count: int = max_threads_count
-     self._report_progress: ProgressReporter = report_progress
-
-   def do(self, source_path: Path, translated_path: Path, working_path: Path | None) -> None:
-     is_temp_workspace = not bool(working_path)
-     working_path = working_path or Path(mkdtemp())
-     try:
-       temp_dir = _clean_path(working_path / "temp")
-       temp_dir.mkdir(parents=True, exist_ok=True)
-       cache_path = working_path / "cache"
-
-       context = ZipContext(
-         epub_path=Path(source_path),
-         temp_dir=temp_dir,
-       )
-       context.replace_ncx(lambda texts: self._translate_ncx(
-         texts=texts,
-         cache_path=cache_path,
-         report_progress=lambda p: self._report_progress(p * 0.1)),
-       )
-       self._translate_spine(
-         context=context,
-         cache_path=cache_path,
-         report_progress=lambda p: self._report_progress(0.1 + p * 0.8),
-       )
-       context.archive(translated_path)
-       self._report_progress(1.0)
-
-     finally:
-       if is_temp_workspace:
-         rmtree(working_path, ignore_errors=True)
-
-   def _translate_ncx(self, texts: list[str], cache_path: Path, report_progress: ProgressReporter) -> list[str]:
-     return list(_translate(
-       llm=self._llm,
-       cache_path=cache_path,
-       max_chunk_tokens_count=self._max_chunk_tokens_count,
-       max_threads_count=1,
-       target_language=self._target_language,
-       user_prompt=self._user_prompt,
-       report_progress=report_progress,
-       gen_fragments_iter=lambda: (
-         Fragment(
-           text=text,
-           start_incision=Incision.IMPOSSIBLE,
-           end_incision=Incision.IMPOSSIBLE,
-         )
-         for text in texts
-       ),
-     ))
-
-   def _translate_spine(self, context: ZipContext, cache_path: Path, report_progress: ProgressReporter):
-     spine_paths_iter = iter(list(context.search_spine_paths()))
-     spine: tuple[Path, HTMLFile] | None = None
-     translated_texts: list[str] = []
-     translated_count: int = 0
-
-     for translated_text in _translate(
-       llm=self._llm,
-       gen_fragments_iter=lambda: _gen_fragments(context),
-       cache_path=cache_path,
-       max_chunk_tokens_count=self._max_chunk_tokens_count,
-       max_threads_count=self._max_threads_count,
-       target_language=self._target_language,
-       user_prompt=self._user_prompt,
-       report_progress=report_progress,
-     ):
-       did_touch_end = False
-
-       if spine and translated_count >= len(translated_texts):
-         spine_path, spine_file = spine
-         spine_file.write_texts(translated_texts)
-         context.write_spine_file(spine_path, spine_file)
-         spine = None
-
-       while not spine:
-         spine_path = next(spine_paths_iter, None)
-         if spine_path is None:
-           spine = None
-           did_touch_end = True
-           break
-         spine_file = context.read_spine_file(spine_path)
-         if spine_file.texts_length == 0:
-           continue
-         spine = (spine_path, spine_file)
-         translated_texts = [""] * spine_file.texts_length
-         translated_count = 0
-         break
-
-       translated_texts[translated_count] = translated_text
-       translated_count += 1
-
-       if did_touch_end:
-         break
-
-     if spine:
-       spine_path, spine_file = spine
-       if translated_count > 0:
-         spine_file.write_texts(translated_texts)
-         context.write_spine_file(spine_path, spine_file)
-
- def _gen_fragments(context: ZipContext):
-   for spine_path in context.search_spine_paths():
-     spine_file = context.read_spine_file(spine_path)
-     for text in spine_file.read_texts():
-       yield Fragment(
-         text=text,
-         start_incision=Incision.IMPOSSIBLE,
-         end_incision=Incision.IMPOSSIBLE,
-       )
-
- def _clean_path(path: Path) -> Path:
-   if path.exists():
-     if path.is_file():
-       path.unlink()
-     elif path.is_dir():
-       rmtree(path, ignore_errors=True)
-   return path
+     llm: LLM,
+     source_path: Path,
+     target_path: Path,
+     target_language: str,
+     user_prompt: str | None = None,
+     max_retries: int = 5,
+     max_group_tokens: int = 1200,
+     on_progress: Callable[[float], None] | None = None,
+ ) -> None:
+     translator = XMLTranslator(
+         llm=llm,
+         target_language=target_language,
+         user_prompt=user_prompt,
+         ignore_translated_error=False,
+         max_retries=max_retries,
+         max_fill_displaying_errors=10,
+         group_context=XMLGroupContext(
+             encoding=llm.encoding,
+             max_group_tokens=max_group_tokens,
+         ),
+     )
+     with Zip(source_path, target_path) as zip:
+         # Progress distribution: TOC 3%, metadata 2%, chapters 95%
+         TOC_PROGRESS = 0.03
+         METADATA_PROGRESS = 0.02
+         CHAPTERS_PROGRESS = 0.95
+
+         # Count total chapters for progress calculation (lightweight, no content loading)
+         total_chapters = _count_chapters(zip)
+         chapter_progress_step = CHAPTERS_PROGRESS / total_chapters if total_chapters > 0 else 0
+
+         current_progress = 0.0
+
+         # Translate TOC
+         _translate_toc(translator, zip)
+         current_progress += TOC_PROGRESS
+         if on_progress:
+             on_progress(current_progress)
+
+         # Translate metadata
+         _translate_metadata(translator, zip)
+         current_progress += METADATA_PROGRESS
+         if on_progress:
+             on_progress(current_progress)
+
+         # Translate chapters
+         processed_chapters = 0
+         for element, text_segments, (chapter_path, xml, placeholder) in translator.translate_to_text_segments(
+             items=_search_chapter_items(zip),
+         ):
+             submit_text_segments(
+                 element=element,
+                 text_segments=(
+                     segment
+                     for segment in text_segments
+                     if not any(is_placeholder_tag(e.tag) for e in segment.parent_stack)
+                 ),
+             )
+             placeholder.recover()
+             deduplicate_ids_in_element(xml.element)
+             with zip.replace(chapter_path) as target_file:
+                 xml.save(target_file, is_html_like=True)
+
+             # Update progress after each chapter
+             processed_chapters += 1
+             current_progress = TOC_PROGRESS + METADATA_PROGRESS + (processed_chapters * chapter_progress_step)
+             if on_progress:
+                 on_progress(current_progress)
+
+
+ def _translate_toc(translator: XMLTranslator, zip: Zip):
+     """Translate TOC (Table of Contents) titles."""
+     toc_list = read_toc(zip)
+     if not toc_list:
+         return
+
+     # Collect all titles recursively
+     titles_to_translate: list[str] = []
+
+     def collect_titles(items):
+         for item in items:
+             titles_to_translate.append(item.title)
+             if item.children:
+                 collect_titles(item.children)
+
+     collect_titles(toc_list)
+
+     # Create XML elements for translation
+     elements_to_translate = Element("toc")
+     elements_to_translate.extend(_create_text_element(title) for title in titles_to_translate)
+
+     # Translate all titles at once
+     translated_element = translator.translate_to_element(elements_to_translate)
+
+     # Extract translated texts
+     from builtins import zip as builtin_zip
+
+     translated_titles = [
+         plain_text(elem) if elem is not None else original
+         for elem, original in builtin_zip(translated_element, titles_to_translate)
+     ]
+
+     # Fill back translated titles
+     title_index = 0
+
+     def fill_titles(items):
+         nonlocal title_index
+         for item in items:
+             item.title = translated_titles[title_index]
+             title_index += 1
+             if item.children:
+                 fill_titles(item.children)
+
+     fill_titles(toc_list)
+
+     # Write back the translated TOC
+     write_toc(zip, toc_list)
+
+
+ def _translate_metadata(translator: XMLTranslator, zip: Zip):
+     """Translate metadata fields in OPF file."""
+     opf_path = find_opf_path(zip)
+
+     with zip.read(opf_path) as f:
+         xml = XMLLikeNode(f)
+
+     # Find metadata element
+     metadata_elem = None
+     for child in xml.element:
+         if child.tag.endswith("metadata"):
+             metadata_elem = child
+             break
+
+     if metadata_elem is None:
+         return
+
+     # Collect metadata fields to translate
+     # Skip fields that should not be translated
+     skip_fields = {
+         "language",
+         "identifier",
+         "date",
+         "meta",
+         "contributor",  # Usually technical information
+     }
+
+     fields_to_translate: list[tuple[Element, str]] = []
+
+     for elem in metadata_elem:
+         # Get tag name without namespace
+         tag_name = elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag
+
+         # Check if element has text content and should be translated
+         if elem.text and elem.text.strip() and tag_name not in skip_fields:
+             fields_to_translate.append((elem, elem.text.strip()))
+
+     if not fields_to_translate:
+         return
+
+     # Create XML elements for translation
+     elements_to_translate = Element("metadata")
+     elements_to_translate.extend(_create_text_element(text) for _, text in fields_to_translate)
+
+     # Translate all metadata at once
+     translated_element = translator.translate_to_element(elements_to_translate)
+
+     # Fill back translated texts
+     from builtins import zip as builtin_zip
+
+     for (elem, _), translated_elem in builtin_zip(fields_to_translate, translated_element, strict=True):
+         if translated_elem is not None:
+             translated_text = plain_text(translated_elem)
+             if translated_text:
+                 elem.text = translated_text
+
+     # Write back the modified OPF file
+     with zip.replace(opf_path) as f:
+         xml.save(f)
+
+
+ def _count_chapters(zip: Zip) -> int:
+     """Count total chapters without loading content (lightweight)."""
+     return sum(1 for _ in search_spine_paths(zip))
+
+
+ def _search_chapter_items(zip: Zip):
+     for chapter_path in search_spine_paths(zip):
+         with zip.read(chapter_path) as chapter_file:
+             xml = XMLLikeNode(chapter_file)
+             body_element = find_first(xml.element, "body")
+             if body_element is not None:
+                 placeholder = Placeholder(body_element)
+                 yield body_element, (chapter_path, xml, placeholder)
+
+
+ def _create_text_element(text: str) -> Element:
+     elem = Element("text")
+     elem.text = text
+     return elem
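For reference, the reworked top-level translate call now takes plain Path arguments, a free-form target_language string, and an optional on_progress callback. A minimal sketch of a call (assuming the package root still re-exports LLM and translate, as in 0.0.6; the LLM constructor arguments are elided because llm/core.py is not reproduced here):

from pathlib import Path

from epub_translator import LLM, translate

llm = LLM(...)  # configure per the package README; constructor arguments are not shown in this diff

translate(
    llm=llm,
    source_path=Path("book.epub"),
    target_path=Path("book.translated.epub"),
    target_language="English",
    user_prompt=None,
    max_retries=5,
    max_group_tokens=1200,
    on_progress=lambda progress: print(f"{progress * 100:.1f}%"),
)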
epub_translator/utils.py
@@ -0,0 +1,7 @@
+ import re
+
+ _WHITESPACE_PATTERN = re.compile(r"\s+")
+
+
+ def normalize_whitespace(text: str) -> str:
+     return _WHITESPACE_PATTERN.sub(" ", text)
epub_translator/xml/__init__.py
@@ -1,3 +1,4 @@
- from .encoder import encode, encode_friendly
- from .decoder import decode_friendly
- from .utils import clone
+ from .deduplication import *
+ from .firendly import *
+ from .xml import *
+ from .xml_like import *
epub_translator/xml/deduplication.py
@@ -0,0 +1,38 @@
+ from xml.etree.ElementTree import Element
+
+ from .xml import iter_with_stack
+
+ _ID_KEY = "id"
+ _SUFFIX = "__translated"
+
+
+ def deduplicate_ids_in_element(element: Element) -> Element:
+     seen_ids: set[str] = set()
+     original_id_count: dict[str, int] = {}
+
+     for _, sub_element in iter_with_stack(element):
+         if _ID_KEY not in sub_element.attrib:
+             continue
+         original_id = sub_element.attrib[_ID_KEY]
+
+         if original_id not in seen_ids:
+             seen_ids.add(original_id)
+             original_id_count[original_id] = 1
+         else:
+             original_id_count[original_id] = original_id_count.get(original_id, 1) + 1
+             occurrence = original_id_count[original_id]
+
+             if occurrence == 2:
+                 new_id = f"{original_id}{_SUFFIX}"
+             else:
+                 new_id = f"{original_id}{_SUFFIX}_{occurrence - 1}"
+
+             counter = occurrence - 1
+             while new_id in seen_ids:
+                 counter += 1
+                 new_id = f"{original_id}{_SUFFIX}_{counter}"
+
+             sub_element.attrib["id"] = new_id
+             seen_ids.add(new_id)
+
+     return element
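As a quick illustration of the renaming scheme above: the first occurrence of an id is kept, the second gets the plain __translated suffix, and later duplicates get a numbered suffix (the sample markup is hypothetical):

from xml.etree.ElementTree import fromstring, tostring

from epub_translator.xml.deduplication import deduplicate_ids_in_element

root = fromstring('<div><p id="note"/><p id="note"/><p id="note"/></div>')
deduplicate_ids_in_element(root)
print(tostring(root).decode())
# <div><p id="note" /><p id="note__translated" /><p id="note__translated_2" /></div>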
epub_translator/xml/firendly/__init__.py
@@ -0,0 +1,2 @@
+ from .decoder import decode_friendly
+ from .encoder import encode_friendly