epub-translator 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/__init__.py +2 -2
- epub_translator/data/fill.jinja +143 -38
- epub_translator/epub/__init__.py +1 -1
- epub_translator/epub/metadata.py +122 -0
- epub_translator/epub/spines.py +3 -2
- epub_translator/epub/zip.py +11 -9
- epub_translator/epub_transcode.py +108 -0
- epub_translator/llm/__init__.py +1 -0
- epub_translator/llm/context.py +109 -0
- epub_translator/llm/core.py +32 -113
- epub_translator/llm/executor.py +25 -31
- epub_translator/llm/increasable.py +1 -1
- epub_translator/llm/types.py +0 -3
- epub_translator/segment/__init__.py +26 -0
- epub_translator/segment/block_segment.py +124 -0
- epub_translator/segment/common.py +29 -0
- epub_translator/segment/inline_segment.py +356 -0
- epub_translator/{xml_translator → segment}/text_segment.py +8 -8
- epub_translator/segment/utils.py +43 -0
- epub_translator/translator.py +147 -183
- epub_translator/utils.py +33 -0
- epub_translator/xml/__init__.py +2 -0
- epub_translator/xml/const.py +1 -0
- epub_translator/xml/deduplication.py +3 -3
- epub_translator/xml/self_closing.py +182 -0
- epub_translator/xml/utils.py +42 -0
- epub_translator/xml/xml.py +7 -0
- epub_translator/xml/xml_like.py +8 -33
- epub_translator/xml_interrupter.py +165 -0
- epub_translator/xml_translator/__init__.py +1 -2
- epub_translator/xml_translator/callbacks.py +34 -0
- epub_translator/xml_translator/{const.py → common.py} +0 -1
- epub_translator/xml_translator/hill_climbing.py +104 -0
- epub_translator/xml_translator/stream_mapper.py +253 -0
- epub_translator/xml_translator/submitter.py +26 -72
- epub_translator/xml_translator/translator.py +162 -113
- epub_translator/xml_translator/validation.py +458 -0
- {epub_translator-0.1.1.dist-info → epub_translator-0.1.3.dist-info}/METADATA +72 -9
- epub_translator-0.1.3.dist-info/RECORD +66 -0
- epub_translator/epub/placeholder.py +0 -53
- epub_translator/iter_sync.py +0 -24
- epub_translator/xml_translator/fill.py +0 -128
- epub_translator/xml_translator/format.py +0 -282
- epub_translator/xml_translator/fragmented.py +0 -125
- epub_translator/xml_translator/group.py +0 -183
- epub_translator/xml_translator/progressive_locking.py +0 -256
- epub_translator/xml_translator/utils.py +0 -29
- epub_translator-0.1.1.dist-info/RECORD +0 -58
- {epub_translator-0.1.1.dist-info → epub_translator-0.1.3.dist-info}/LICENSE +0 -0
- {epub_translator-0.1.1.dist-info → epub_translator-0.1.3.dist-info}/WHEEL +0 -0
epub_translator/translator.py
CHANGED
|
@@ -1,214 +1,178 @@
|
|
|
1
1
|
from collections.abc import Callable
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from enum import Enum, auto
|
|
4
|
+
from importlib.metadata import version as get_package_version
|
|
5
|
+
from os import PathLike
|
|
2
6
|
from pathlib import Path
|
|
3
|
-
from xml.etree.ElementTree import Element
|
|
4
7
|
|
|
5
|
-
from .epub import
|
|
6
|
-
|
|
8
|
+
from .epub import (
|
|
9
|
+
Zip,
|
|
10
|
+
read_metadata,
|
|
11
|
+
read_toc,
|
|
12
|
+
search_spine_paths,
|
|
13
|
+
write_metadata,
|
|
14
|
+
write_toc,
|
|
15
|
+
)
|
|
16
|
+
from .epub_transcode import decode_metadata, decode_toc_list, encode_metadata, encode_toc_list
|
|
7
17
|
from .llm import LLM
|
|
8
|
-
from .xml import XMLLikeNode, deduplicate_ids_in_element, find_first
|
|
9
|
-
from .
|
|
18
|
+
from .xml import XMLLikeNode, deduplicate_ids_in_element, find_first
|
|
19
|
+
from .xml_interrupter import XMLInterrupter
|
|
20
|
+
from .xml_translator import FillFailedEvent, XMLTranslator
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class _ElementType(Enum):
    """Kind of book content an element submitted for translation was built from."""

    TOC = auto()  # element produced by encode_toc_list()
    METADATA = auto()  # element produced by encode_metadata()
    CHAPTER = auto()  # <body> element of a spine chapter document
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class _ElementContext:
    """Bookkeeping attached to one element that was handed to the translator.

    Instances are stored in a dict keyed by ``id(element)`` so that a translated
    element can be routed back to the matching writer.
    """

    # Which part of the book the element represents.
    element_type: _ElementType
    # For CHAPTER elements: (chapter path as passed to zip.read / zip.replace,
    # parsed chapter document). Left as None for TOC and METADATA elements.
    chapter_data: tuple[Path, XMLLikeNode] | None = None
|
|
10
33
|
|
|
11
34
|
|
|
12
35
|
def translate(
    source_path: PathLike | str,
    target_path: PathLike | str,
    target_language: str,
    user_prompt: str | None = None,
    max_retries: int = 5,
    max_group_tokens: int = 1200,
    llm: LLM | None = None,
    translation_llm: LLM | None = None,
    fill_llm: LLM | None = None,
    on_progress: Callable[[float], None] | None = None,
    on_fill_failed: Callable[[FillFailedEvent], None] | None = None,
) -> None:
    """Translate the EPUB at ``source_path`` and write the result to ``target_path``.

    The table of contents, the metadata fields and the ``<body>`` of every spine
    chapter are fed to a single ``XMLTranslator``; each translated element is
    written back into the target archive as soon as it is produced.

    Args:
        source_path: Path of the EPUB to read.
        target_path: Path of the EPUB to write.
        target_language: Forwarded to ``XMLTranslator`` and included in its cache seed.
        user_prompt: Forwarded to ``XMLTranslator``.
        max_retries: Forwarded to ``XMLTranslator``.
        max_group_tokens: Forwarded to ``XMLTranslator``.
        llm: Fallback model used for whichever of ``translation_llm`` /
            ``fill_llm`` is not given.
        translation_llm: Model passed to ``XMLTranslator`` as ``translation_llm``.
        fill_llm: Model passed to ``XMLTranslator`` as ``fill_llm``.
        on_progress: Called with the cumulative progress after each handled
            element. TOC and metadata contribute 0.05 each when present; the
            chapters share the remainder equally.
        on_fill_failed: Forwarded to ``XMLTranslator.translate_elements``.

    Raises:
        ValueError: If no translation model or no fill model can be resolved.
    """
    # A dedicated model wins; ``llm`` is the shared fallback for both roles
    translation_llm = translation_llm or llm
    fill_llm = fill_llm or llm
    if translation_llm is None:
        raise ValueError("Either translation_llm or llm must be provided")
    if fill_llm is None:
        raise ValueError("Either fill_llm or llm must be provided")

    translator = XMLTranslator(
        translation_llm=translation_llm,
        fill_llm=fill_llm,
        target_language=target_language,
        user_prompt=user_prompt,
        ignore_translated_error=False,
        max_retries=max_retries,
        max_fill_displaying_errors=10,
        max_group_tokens=max_group_tokens,
        # Seed combines package version and target language
        cache_seed_content=f"{_get_version()}:{target_language}",
    )
    with Zip(
        source_path=Path(source_path).resolve(),
        target_path=Path(target_path).resolve(),
    ) as zip:
        # mimetype should be the first file in the EPUB ZIP
        zip.migrate(Path("mimetype"))

        # Count chapters up front so each one gets an equal share of the progress
        total_chapters = sum(1 for _, _ in search_spine_paths(zip))
        toc_list = read_toc(zip)
        metadata_fields = read_metadata(zip)

        # Calculate weights: TOC (5%), Metadata (5%), Chapters (90%)
        toc_has_items = len(toc_list) > 0
        metadata_has_items = len(metadata_fields) > 0
        total_items = (1 if toc_has_items else 0) + (1 if metadata_has_items else 0) + total_chapters

        # Nothing to translate at all
        if total_items == 0:
            return

        interrupter = XMLInterrupter()
        # Filled lazily by _generate_elements_from_book: id(element) -> context
        element_contexts: dict[int, _ElementContext] = {}

        toc_weight = 0.05 if toc_has_items else 0
        metadata_weight = 0.05 if metadata_has_items else 0
        chapters_weight = 1.0 - toc_weight - metadata_weight
        progress_per_chapter = chapters_weight / total_chapters if total_chapters > 0 else 0
        current_progress = 0.0

        for translated_elem in translator.translate_elements(
            interrupt_source_text_segments=interrupter.interrupt_source_text_segments,
            interrupt_translated_text_segments=interrupter.interrupt_translated_text_segments,
            interrupt_block_element=interrupter.interrupt_block_element,
            on_fill_failed=on_fill_failed,
            elements=_generate_elements_from_book(
                zip=zip,
                toc_list=toc_list,
                metadata_fields=metadata_fields,
                element_contexts=element_contexts,
            ),
        ):
            # Contexts are looked up by object identity.
            # NOTE(review): presumably translate_elements yields the same element
            # objects it was given; an unknown element is silently skipped below.
            elem_id = id(translated_elem)
            context = element_contexts.pop(elem_id, None)

            if context is None:
                continue

            if context.element_type == _ElementType.TOC:
                decoded_toc = decode_toc_list(translated_elem)
                write_toc(zip, decoded_toc)

                current_progress += toc_weight
                if on_progress:
                    on_progress(current_progress)

            elif context.element_type == _ElementType.METADATA:
                decoded_metadata = decode_metadata(translated_elem)
                write_metadata(zip, decoded_metadata)

                current_progress += metadata_weight
                if on_progress:
                    on_progress(current_progress)

            elif context.element_type == _ElementType.CHAPTER:
                if context.chapter_data is not None:
                    chapter_path, xml = context.chapter_data
                    # The whole chapter document is saved, not just the yielded body
                    deduplicate_ids_in_element(xml.element)
                    with zip.replace(chapter_path) as target_file:
                        xml.save(target_file)

                current_progress += progress_per_chapter
                if on_progress:
                    on_progress(current_progress)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _generate_elements_from_book(
    zip: Zip,
    toc_list: list,
    metadata_fields: list,
    element_contexts: dict[int, _ElementContext],
):
    """Yield every translatable element of the book, registering its context first.

    Order: the encoded TOC (if ``toc_list`` is non-empty), the encoded metadata
    (if ``metadata_fields`` is non-empty), then the ``<body>`` of each spine
    chapter that has one. Just before an element is yielded, an
    ``_ElementContext`` describing it is stored in ``element_contexts`` under
    ``id(element)``.
    """
    if toc_list:
        toc_root = encode_toc_list(toc_list)
        element_contexts[id(toc_root)] = _ElementContext(element_type=_ElementType.TOC)
        yield toc_root

    if metadata_fields:
        metadata_root = encode_metadata(metadata_fields)
        element_contexts[id(metadata_root)] = _ElementContext(element_type=_ElementType.METADATA)
        yield metadata_root

    for chapter_path, media_type in search_spine_paths(zip):
        with zip.read(chapter_path) as chapter_file:
            document = XMLLikeNode(
                file=chapter_file,
                is_html_like=(media_type == "text/html"),
            )

        body = find_first(document.element, "body")
        if body is None:
            # Chapters without a <body> are not offered for translation
            continue

        # Keep path and parsed document together so the caller can save the chapter
        element_contexts[id(body)] = _ElementContext(
            element_type=_ElementType.CHAPTER,
            chapter_data=(chapter_path, document),
        )
        yield body
|
|
209
172
|
|
|
210
173
|
|
|
211
|
-
def
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
174
|
+
def _get_version() -> str:
|
|
175
|
+
try:
|
|
176
|
+
return get_package_version("epub-translator")
|
|
177
|
+
except Exception:
|
|
178
|
+
return "development"
|
epub_translator/utils.py
CHANGED
|
@@ -1,7 +1,40 @@
|
|
|
1
1
|
import re
|
|
2
|
+
from collections.abc import Iterable
|
|
3
|
+
from typing import TypeVar
|
|
4
|
+
|
|
5
|
+
K = TypeVar("K")
|
|
6
|
+
T = TypeVar("T")
|
|
2
7
|
|
|
3
8
|
# One or more consecutive whitespace characters
_WHITESPACE_PATTERN = re.compile(r"\s+")


def normalize_whitespace(text: str) -> str:
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing runs are collapsed as well, not stripped.
    """
    collapsed = re.sub(_WHITESPACE_PATTERN, " ", text)
    return collapsed
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def is_the_same(elements: Iterable[T]) -> bool:
    """Return True when every item of ``elements`` equals the first one.

    An empty iterable counts as uniform. Items are compared with ``!=`` against
    the first item, and iteration stops at the first mismatch.
    """
    missing = object()
    remaining = iter(elements)
    reference = next(remaining, missing)
    if reference is missing:
        return True
    return not any(candidate != reference for candidate in remaining)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def nest(items: Iterable[tuple[K, T]]) -> dict[K, list[T]]:
    """Group ``(key, value)`` pairs into a dict mapping each key to the list of its values.

    Keys appear in first-seen order and each list keeps the encounter order of its values.
    """
    grouped: dict[K, list[T]] = {}
    for key, value in items:
        grouped.setdefault(key, []).append(value)
    return grouped
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def ensure_list(target: dict[K, list[T]], key: K) -> list[T]:
    """Return the list stored under ``key`` in ``target``, creating it if needed.

    When the key is absent (or currently maps to ``None``), a new empty list is
    stored under it and returned; otherwise the existing list object is returned.
    """
    existing = target.get(key)
    if existing is not None:
        return existing
    created = []
    target[key] = created
    return created
|
epub_translator/xml/__init__.py
CHANGED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Name of the XML attribute that holds an element's identifier.
ID_KEY: str = "id"
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
from xml.etree.ElementTree import Element
|
|
2
2
|
|
|
3
|
+
from .const import ID_KEY
|
|
3
4
|
from .xml import iter_with_stack
|
|
4
5
|
|
|
5
|
-
_ID_KEY = "id"
|
|
6
6
|
_SUFFIX = "__translated"
|
|
7
7
|
|
|
8
8
|
|
|
@@ -11,9 +11,9 @@ def deduplicate_ids_in_element(element: Element) -> Element:
|
|
|
11
11
|
original_id_count: dict[str, int] = {}
|
|
12
12
|
|
|
13
13
|
for _, sub_element in iter_with_stack(element):
|
|
14
|
-
if
|
|
14
|
+
if ID_KEY not in sub_element.attrib:
|
|
15
15
|
continue
|
|
16
|
-
original_id = sub_element.attrib[
|
|
16
|
+
original_id = sub_element.attrib[ID_KEY]
|
|
17
17
|
|
|
18
18
|
if original_id not in seen_ids:
|
|
19
19
|
seen_ids.add(original_id)
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
# Some non-standard EPUB generators emit HTML-style void tags without the
# self-closing syntax. Such markup has to be converted to an XML-compatible
# form before it can be parsed. The names below are HTML5 void elements,
# which must be written as self-closing tags in XHTML.
_VOID_TAGS = (
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def self_close_void_elements(xml_content: str) -> str:
    """
    Rewrite unclosed HTML void elements as self-closing tags so the text parses as XML.

    Every tag name in ``_VOID_TAGS`` is processed in turn. An illegal pair such
    as <meta>content</meta> is reduced to a single self-closing tag, dropping
    the content in between.

    Args:
        xml_content: HTML/XHTML content string

    Returns:
        The content with void elements written in self-closing form

    Example:
        <meta charset="utf-8"> -> <meta charset="utf-8" />
        <br> -> <br />
        <meta>illegal</meta> -> <meta />
    """
    fixed = xml_content
    for void_tag in _VOID_TAGS:
        fixed = _fix_void_element(fixed, void_tag)
    return fixed
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _fix_void_element(content: str, tag_name: str) -> str:
|
|
48
|
+
"""
|
|
49
|
+
Fix a specific void element in the content.
|
|
50
|
+
|
|
51
|
+
Strategy:
|
|
52
|
+
1. Find <tag ...> (not already self-closed)
|
|
53
|
+
2. Check if there's a matching </tag>
|
|
54
|
+
3. If yes, remove everything between them and make it self-closing
|
|
55
|
+
4. If no, just make the opening tag self-closing
|
|
56
|
+
"""
|
|
57
|
+
result = []
|
|
58
|
+
pos = 0
|
|
59
|
+
|
|
60
|
+
while pos < len(content):
|
|
61
|
+
tag_start = content.find(f"<{tag_name}", pos)
|
|
62
|
+
if tag_start == -1:
|
|
63
|
+
result.append(content[pos:])
|
|
64
|
+
break
|
|
65
|
+
|
|
66
|
+
# Verify it's a complete tag match (not a prefix like <br matching <brain>)
|
|
67
|
+
# The character after tag_name must be >, /, or whitespace
|
|
68
|
+
check_pos = tag_start + len(f"<{tag_name}")
|
|
69
|
+
if check_pos < len(content):
|
|
70
|
+
next_char = content[check_pos]
|
|
71
|
+
if next_char not in (">", "/", " ", "\t", "\n", "\r"):
|
|
72
|
+
result.append(content[pos:check_pos])
|
|
73
|
+
pos = check_pos
|
|
74
|
+
continue
|
|
75
|
+
|
|
76
|
+
result.append(content[pos:tag_start])
|
|
77
|
+
tag_end = _find_tag_end(content, tag_start)
|
|
78
|
+
if tag_end == -1:
|
|
79
|
+
result.append(content[tag_start:])
|
|
80
|
+
break
|
|
81
|
+
|
|
82
|
+
opening_tag = content[tag_start : tag_end + 1]
|
|
83
|
+
|
|
84
|
+
if opening_tag.rstrip().endswith("/>"):
|
|
85
|
+
result.append(opening_tag)
|
|
86
|
+
pos = tag_end + 1
|
|
87
|
+
continue
|
|
88
|
+
|
|
89
|
+
if not opening_tag.endswith(">"):
|
|
90
|
+
result.append(opening_tag)
|
|
91
|
+
pos = tag_end + 1
|
|
92
|
+
continue
|
|
93
|
+
|
|
94
|
+
closing_tag = f"</{tag_name}>"
|
|
95
|
+
closing_pos = content.find(closing_tag, tag_end + 1)
|
|
96
|
+
|
|
97
|
+
if closing_pos != -1:
|
|
98
|
+
attrs_part = opening_tag[len(f"<{tag_name}") : -1].rstrip()
|
|
99
|
+
if attrs_part:
|
|
100
|
+
result.append(f"<{tag_name}{attrs_part} />")
|
|
101
|
+
else:
|
|
102
|
+
result.append(f"<{tag_name} />")
|
|
103
|
+
pos = closing_pos + len(closing_tag)
|
|
104
|
+
else:
|
|
105
|
+
attrs_part = opening_tag[len(f"<{tag_name}") : -1].rstrip()
|
|
106
|
+
if attrs_part:
|
|
107
|
+
result.append(f"<{tag_name}{attrs_part} />")
|
|
108
|
+
else:
|
|
109
|
+
result.append(f"<{tag_name} />")
|
|
110
|
+
pos = tag_end + 1
|
|
111
|
+
|
|
112
|
+
return "".join(result)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _find_tag_end(content: str, start_pos: int) -> int:
|
|
116
|
+
"""
|
|
117
|
+
Find the end of an HTML tag (the position of >).
|
|
118
|
+
|
|
119
|
+
Handles quotes: ignores > inside quoted attribute values.
|
|
120
|
+
"""
|
|
121
|
+
pos = start_pos
|
|
122
|
+
in_quote = None # None, '"', or "'"
|
|
123
|
+
|
|
124
|
+
while pos < len(content):
|
|
125
|
+
char = content[pos]
|
|
126
|
+
|
|
127
|
+
if in_quote:
|
|
128
|
+
if char == in_quote:
|
|
129
|
+
if pos > 0 and content[pos - 1] == "\\":
|
|
130
|
+
pos += 1
|
|
131
|
+
continue
|
|
132
|
+
else:
|
|
133
|
+
in_quote = None
|
|
134
|
+
else:
|
|
135
|
+
if char in ('"', "'"):
|
|
136
|
+
in_quote = char
|
|
137
|
+
elif char == ">":
|
|
138
|
+
return pos
|
|
139
|
+
|
|
140
|
+
pos += 1
|
|
141
|
+
|
|
142
|
+
return -1 # Not found
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# For saving: match self-closing void tags like <br /> or <br/>.
# Group 1 is the tag name, group 2 is everything between the tag name and "/>".
# The lookahead demands whitespace or "/" right after the name, so a longer tag
# that merely starts with a void tag name (e.g. <colgroup/>, <metadata/>) is
# left untouched.
_VOID_TAG_CLOSE_PATTERN = re.compile(r"<(" + "|".join(_VOID_TAGS) + r")(?=[\s/])([^>]*?)\s*/>")


def unclose_void_elements(xml_content: str) -> str:
    """
    Convert void elements from self-closing to unclosed format for HTML compatibility.

    Transforms self-closed void elements like <br /> back to <br> for
    compatibility with HTML parsers that don't support XHTML syntax.
    Only tags whose name is exactly one of ``_VOID_TAGS`` are rewritten.

    Args:
        xml_content: HTML/XHTML content string

    Returns:
        Content with void elements in unclosed format

    Example:
        <meta charset="utf-8" /> -> <meta charset="utf-8">
        <br /> -> <br>
        <img src="test.png" /> -> <img src="test.png">
    """

    def replacer(m: re.Match) -> str:
        # Drop whitespace left in front of the removed "/"
        attrs = m.group(2).rstrip()
        return f"<{m.group(1)}{attrs}>"

    return _VOID_TAG_CLOSE_PATTERN.sub(replacer, xml_content)
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from collections.abc import Generator
|
|
2
|
+
from xml.etree.ElementTree import Element
|
|
3
|
+
|
|
4
|
+
from ..utils import normalize_whitespace
|
|
5
|
+
from .const import ID_KEY
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def normalize_text_in_element(text: str | None) -> str | None:
    """Return ``text`` with whitespace normalized, or None when nothing but whitespace remains.

    ``None`` is passed through unchanged.
    """
    if text is None:
        return None
    normalized = normalize_whitespace(text)
    return normalized if normalized.strip() else None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def append_text_in_element(origin_text: str | None, append_text: str) -> str:
|
|
18
|
+
if origin_text is None:
|
|
19
|
+
return append_text
|
|
20
|
+
else:
|
|
21
|
+
return origin_text + append_text
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def index_of_parent(parent: Element, checked_element: Element) -> int:
    """Return the position of ``checked_element`` among the direct children of ``parent``.

    Raises:
        ValueError: If no child of ``parent`` compares equal to ``checked_element``.
    """
    position = next(
        (i for i, child in enumerate(parent) if child == checked_element),
        None,
    )
    if position is None:
        raise ValueError("Element not found in parent.")
    return position
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def expand_left_element_texts(element: Element) -> Generator[str, None, None]:
    """Yield the text pieces of an opening tag for ``element`` carrying a fixed ``"99"`` id attribute."""
    yield "<"
    yield from (element.tag, " ", ID_KEY, '="99">')
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def expand_right_element_texts(element: Element) -> Generator[str, None, None]:
    """Yield the text pieces of the closing tag for ``element``: ``"</"``, the tag name, ``">"``."""
    yield "</"
    yield from (element.tag, ">")
|
epub_translator/xml/xml.py
CHANGED
|
@@ -12,6 +12,13 @@ def find_first(element: Element, tag: str) -> Element | None:
|
|
|
12
12
|
return None
|
|
13
13
|
|
|
14
14
|
|
|
15
|
+
def index_in_parent(parent: Element, element: Element) -> int | None:
|
|
16
|
+
for i, child in enumerate(parent):
|
|
17
|
+
if child is element:
|
|
18
|
+
return i
|
|
19
|
+
return None
|
|
20
|
+
|
|
21
|
+
|
|
15
22
|
def iter_with_stack(element: Element) -> Generator[tuple[list[Element], Element], None, None]:
|
|
16
23
|
"""先序遍历:yield parent_path, element"""
|
|
17
24
|
stack: list[list[Element]] = [[element]]
|