epub-translator 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/epub/__init__.py +2 -2
- epub_translator/epub/metadata.py +29 -66
- epub_translator/epub/toc.py +76 -94
- epub_translator/translation/translator.py +16 -6
- epub_translator/xml/self_closing.py +5 -4
- epub_translator/xml/xml_like.py +23 -1
- {epub_translator-0.1.8.dist-info → epub_translator-0.1.9.dist-info}/METADATA +7 -16
- {epub_translator-0.1.8.dist-info → epub_translator-0.1.9.dist-info}/RECORD +10 -10
- {epub_translator-0.1.8.dist-info → epub_translator-0.1.9.dist-info}/LICENSE +0 -0
- {epub_translator-0.1.8.dist-info → epub_translator-0.1.9.dist-info}/WHEEL +0 -0
epub_translator/epub/__init__.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from .metadata import read_metadata, write_metadata
|
|
1
|
+
from .metadata import MetadataContext, read_metadata, write_metadata
|
|
2
2
|
from .spines import search_spine_paths
|
|
3
|
-
from .toc import Toc, read_toc, write_toc
|
|
3
|
+
from .toc import Toc, TocContext, read_toc, write_toc
|
|
4
4
|
from .zip import Zip
|
epub_translator/epub/metadata.py
CHANGED
|
@@ -1,91 +1,63 @@
|
|
|
1
1
|
from dataclasses import dataclass
|
|
2
|
+
from pathlib import Path
|
|
2
3
|
|
|
4
|
+
from ..xml import XMLLikeNode
|
|
3
5
|
from .common import find_opf_path
|
|
4
6
|
from .zip import Zip
|
|
5
7
|
|
|
6
8
|
|
|
7
9
|
@dataclass
|
|
8
10
|
class MetadataField:
|
|
9
|
-
"""
|
|
10
|
-
表示 EPUB OPF 文件中的元数据字段
|
|
11
|
-
|
|
12
|
-
- tag_name: 标签名(不带命名空间)
|
|
13
|
-
- text: 文本内容
|
|
14
|
-
"""
|
|
15
|
-
|
|
16
11
|
tag_name: str
|
|
17
12
|
text: str
|
|
18
13
|
|
|
19
14
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
"meta",
|
|
26
|
-
"contributor", # Usually technical information
|
|
27
|
-
}
|
|
15
|
+
@dataclass
|
|
16
|
+
class MetadataContext:
|
|
17
|
+
opf_path: Path # OPF 文件路径
|
|
18
|
+
xml_node: XMLLikeNode # XMLLikeNode 对象,保留原始文件信息
|
|
19
|
+
|
|
28
20
|
|
|
21
|
+
SKIP_FIELDS = frozenset(
|
|
22
|
+
(
|
|
23
|
+
"language",
|
|
24
|
+
"identifier",
|
|
25
|
+
"date",
|
|
26
|
+
"meta",
|
|
27
|
+
"contributor", # Usually technical information
|
|
28
|
+
)
|
|
29
|
+
)
|
|
29
30
|
|
|
30
|
-
def read_metadata(zip: Zip) -> list[MetadataField]:
|
|
31
|
-
"""
|
|
32
|
-
从 EPUB 的 OPF 文件中读取所有可翻译的元数据字段。
|
|
33
31
|
|
|
34
|
-
|
|
35
|
-
自动过滤掉不应该翻译的字段(language, identifier, date, meta, contributor 等)。
|
|
36
|
-
"""
|
|
32
|
+
def read_metadata(zip: Zip) -> tuple[list[MetadataField], MetadataContext]:
|
|
37
33
|
opf_path = find_opf_path(zip)
|
|
38
34
|
|
|
39
35
|
with zip.read(opf_path) as f:
|
|
40
|
-
|
|
36
|
+
xml_node = XMLLikeNode(f, is_html_like=False)
|
|
41
37
|
|
|
42
|
-
from xml.etree import ElementTree as ET
|
|
43
|
-
|
|
44
|
-
root = ET.fromstring(content)
|
|
45
|
-
|
|
46
|
-
# Find metadata element
|
|
47
38
|
metadata_elem = None
|
|
48
|
-
for child in
|
|
39
|
+
for child in xml_node.element:
|
|
49
40
|
if child.tag.endswith("metadata"):
|
|
50
41
|
metadata_elem = child
|
|
51
42
|
break
|
|
52
43
|
|
|
53
44
|
if metadata_elem is None:
|
|
54
|
-
|
|
45
|
+
context = MetadataContext(opf_path=opf_path, xml_node=xml_node)
|
|
46
|
+
return [], context
|
|
55
47
|
|
|
56
|
-
# Collect metadata fields to translate
|
|
57
48
|
fields: list[MetadataField] = []
|
|
58
|
-
|
|
59
49
|
for elem in metadata_elem:
|
|
60
|
-
|
|
61
|
-
tag_name = elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag
|
|
62
|
-
|
|
63
|
-
# Check if element has text content and should be translated
|
|
50
|
+
tag_name = elem.tag
|
|
64
51
|
if elem.text and elem.text.strip() and tag_name not in SKIP_FIELDS:
|
|
65
52
|
fields.append(MetadataField(tag_name=tag_name, text=elem.text.strip()))
|
|
66
53
|
|
|
67
|
-
|
|
54
|
+
context = MetadataContext(opf_path=opf_path, xml_node=xml_node)
|
|
55
|
+
return fields, context
|
|
68
56
|
|
|
69
57
|
|
|
70
|
-
def write_metadata(zip: Zip, fields: list[MetadataField]) -> None:
|
|
71
|
-
"""
|
|
72
|
-
将翻译后的元数据字段写回 EPUB 的 OPF 文件。
|
|
73
|
-
|
|
74
|
-
根据 tag_name 匹配对应的元素,并更新其文本内容。
|
|
75
|
-
匹配策略:按照 tag_name 和在文件中出现的顺序依次匹配。
|
|
76
|
-
"""
|
|
77
|
-
opf_path = find_opf_path(zip)
|
|
78
|
-
|
|
79
|
-
with zip.read(opf_path) as f:
|
|
80
|
-
content = f.read()
|
|
81
|
-
|
|
82
|
-
from xml.etree import ElementTree as ET
|
|
83
|
-
|
|
84
|
-
root = ET.fromstring(content)
|
|
85
|
-
|
|
86
|
-
# Find metadata element
|
|
58
|
+
def write_metadata(zip: Zip, fields: list[MetadataField], context: MetadataContext) -> None:
|
|
87
59
|
metadata_elem = None
|
|
88
|
-
for child in
|
|
60
|
+
for child in context.xml_node.element:
|
|
89
61
|
if child.tag.endswith("metadata"):
|
|
90
62
|
metadata_elem = child
|
|
91
63
|
break
|
|
@@ -93,30 +65,21 @@ def write_metadata(zip: Zip, fields: list[MetadataField]) -> None:
|
|
|
93
65
|
if metadata_elem is None:
|
|
94
66
|
return
|
|
95
67
|
|
|
96
|
-
# Build a mapping: tag_name -> list of fields with that tag_name
|
|
97
68
|
fields_by_tag: dict[str, list[str]] = {}
|
|
98
69
|
for field in fields:
|
|
99
70
|
if field.tag_name not in fields_by_tag:
|
|
100
71
|
fields_by_tag[field.tag_name] = []
|
|
101
72
|
fields_by_tag[field.tag_name].append(field.text)
|
|
102
73
|
|
|
103
|
-
# Create a counter for each tag to track which occurrence we're at
|
|
104
74
|
tag_counters: dict[str, int] = {tag: 0 for tag in fields_by_tag}
|
|
105
75
|
|
|
106
|
-
# Update elements in metadata
|
|
107
76
|
for elem in metadata_elem:
|
|
108
|
-
|
|
109
|
-
tag_name = elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag
|
|
110
|
-
|
|
111
|
-
# Check if this tag has translated text
|
|
77
|
+
tag_name = elem.tag
|
|
112
78
|
if tag_name in fields_by_tag and elem.text and elem.text.strip():
|
|
113
79
|
counter = tag_counters[tag_name]
|
|
114
80
|
if counter < len(fields_by_tag[tag_name]):
|
|
115
|
-
# Update the text with translated version
|
|
116
81
|
elem.text = fields_by_tag[tag_name][counter]
|
|
117
82
|
tag_counters[tag_name] += 1
|
|
118
83
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
with zip.replace(opf_path) as f:
|
|
122
|
-
tree.write(f, encoding="utf-8", xml_declaration=True)
|
|
84
|
+
with zip.replace(context.opf_path) as f:
|
|
85
|
+
context.xml_node.save(f)
|
epub_translator/epub/toc.py
CHANGED
|
@@ -3,8 +3,8 @@ from pathlib import Path
|
|
|
3
3
|
from xml.etree import ElementTree as ET
|
|
4
4
|
from xml.etree.ElementTree import Element
|
|
5
5
|
|
|
6
|
-
from ..xml
|
|
7
|
-
from .common import
|
|
6
|
+
from ..xml import XMLLikeNode, plain_text
|
|
7
|
+
from .common import find_opf_path, strip_namespace
|
|
8
8
|
from .zip import Zip
|
|
9
9
|
|
|
10
10
|
|
|
@@ -41,30 +41,40 @@ class Toc:
|
|
|
41
41
|
return self.href
|
|
42
42
|
|
|
43
43
|
|
|
44
|
-
|
|
44
|
+
@dataclass
|
|
45
|
+
class TocContext:
|
|
46
|
+
version: int
|
|
47
|
+
toc_path: Path
|
|
48
|
+
xml_node: XMLLikeNode
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def read_toc(zip: Zip) -> tuple[list[Toc], TocContext]:
|
|
45
52
|
version = _detect_epub_version(zip)
|
|
46
53
|
toc_path = _find_toc_path(zip, version)
|
|
47
54
|
|
|
48
55
|
if toc_path is None:
|
|
49
|
-
|
|
56
|
+
raise ValueError("Cannot find TOC file in EPUB")
|
|
50
57
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
else:
|
|
54
|
-
return _read_nav_toc(zip, toc_path)
|
|
58
|
+
with zip.read(toc_path) as f:
|
|
59
|
+
xml_node = XMLLikeNode(f, is_html_like=False)
|
|
55
60
|
|
|
61
|
+
if version == 3:
|
|
62
|
+
toc_list = _read_nav_toc(xml_node.element)
|
|
63
|
+
else:
|
|
64
|
+
toc_list = _read_ncx_toc(xml_node.element)
|
|
56
65
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
toc_path = _find_toc_path(zip, version)
|
|
66
|
+
context = TocContext(version=version, toc_path=toc_path, xml_node=xml_node)
|
|
67
|
+
return toc_list, context
|
|
60
68
|
|
|
61
|
-
if toc_path is None:
|
|
62
|
-
raise ValueError("Cannot find TOC file in EPUB")
|
|
63
69
|
|
|
64
|
-
|
|
65
|
-
|
|
70
|
+
def write_toc(zip: Zip, toc: list[Toc], context: TocContext) -> None:
|
|
71
|
+
if context.version == 2:
|
|
72
|
+
_update_ncx_toc(context.xml_node.element, toc)
|
|
66
73
|
else:
|
|
67
|
-
|
|
74
|
+
_update_nav_toc(context.xml_node.element, toc)
|
|
75
|
+
|
|
76
|
+
with zip.replace(context.toc_path) as f:
|
|
77
|
+
context.xml_node.save(f)
|
|
68
78
|
|
|
69
79
|
|
|
70
80
|
def _detect_epub_version(zip: Zip) -> int:
|
|
@@ -72,8 +82,6 @@ def _detect_epub_version(zip: Zip) -> int:
|
|
|
72
82
|
with zip.read(opf_path) as f:
|
|
73
83
|
content = f.read()
|
|
74
84
|
root = ET.fromstring(content)
|
|
75
|
-
|
|
76
|
-
# 检查 package 元素的 version 属性
|
|
77
85
|
version_str = root.get("version", "2.0")
|
|
78
86
|
|
|
79
87
|
if version_str.startswith("3"):
|
|
@@ -89,7 +97,7 @@ def _find_toc_path(zip: Zip, version: int) -> Path | None:
|
|
|
89
97
|
with zip.read(opf_path) as f:
|
|
90
98
|
content = f.read()
|
|
91
99
|
root = ET.fromstring(content)
|
|
92
|
-
strip_namespace(root)
|
|
100
|
+
strip_namespace(root)
|
|
93
101
|
|
|
94
102
|
manifest = root.find(".//manifest")
|
|
95
103
|
if manifest is None:
|
|
@@ -115,23 +123,18 @@ def _find_toc_path(zip: Zip, version: int) -> Path | None:
|
|
|
115
123
|
return None
|
|
116
124
|
|
|
117
125
|
|
|
118
|
-
def _read_ncx_toc(
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
strip_namespace(root) # 移除命名空间前缀以简化 XPath
|
|
123
|
-
|
|
124
|
-
nav_map = root.find(".//navMap")
|
|
125
|
-
if nav_map is None:
|
|
126
|
-
return []
|
|
126
|
+
def _read_ncx_toc(root: Element) -> list[Toc]:
|
|
127
|
+
nav_map = root.find(".//navMap")
|
|
128
|
+
if nav_map is None:
|
|
129
|
+
return []
|
|
127
130
|
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
131
|
+
result = []
|
|
132
|
+
for nav_point in nav_map.findall("navPoint"):
|
|
133
|
+
toc_item = _parse_nav_point(nav_point)
|
|
134
|
+
if toc_item:
|
|
135
|
+
result.append(toc_item)
|
|
133
136
|
|
|
134
|
-
|
|
137
|
+
return result
|
|
135
138
|
|
|
136
139
|
|
|
137
140
|
def _parse_nav_point(nav_point: Element) -> Toc | None:
|
|
@@ -172,18 +175,11 @@ def _parse_nav_point(nav_point: Element) -> Toc | None:
|
|
|
172
175
|
)
|
|
173
176
|
|
|
174
177
|
|
|
175
|
-
def
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
nav_map = root.find(f".//{{{ns}}}navMap" if ns else ".//navMap")
|
|
181
|
-
if nav_map is None:
|
|
182
|
-
raise ValueError("Cannot find navMap in NCX file")
|
|
183
|
-
_update_nav_points(nav_map, toc_list, ns)
|
|
184
|
-
tree = ET.ElementTree(root)
|
|
185
|
-
with zip.replace(ncx_path) as out:
|
|
186
|
-
tree.write(out, encoding="utf-8", xml_declaration=True)
|
|
178
|
+
def _update_ncx_toc(root: Element, toc_list: list[Toc]) -> None:
|
|
179
|
+
nav_map = root.find(".//navMap")
|
|
180
|
+
if nav_map is None:
|
|
181
|
+
raise ValueError("Cannot find navMap in NCX file")
|
|
182
|
+
_update_nav_points(nav_map, toc_list, None)
|
|
187
183
|
|
|
188
184
|
|
|
189
185
|
def _update_nav_points(parent: Element, toc_list: list[Toc], ns: str | None, start_play_order: int = 1) -> int:
|
|
@@ -255,34 +251,28 @@ def _create_nav_point(toc: Toc, ns: str | None, play_order: int) -> Element:
|
|
|
255
251
|
return nav_point
|
|
256
252
|
|
|
257
253
|
|
|
258
|
-
def _read_nav_toc(
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
nav_elem = None
|
|
266
|
-
for nav in root.findall(".//nav"):
|
|
267
|
-
epub_type = nav.get("{http://www.idpf.org/2007/ops}type") or nav.get("type")
|
|
268
|
-
if epub_type == "toc":
|
|
269
|
-
nav_elem = nav
|
|
270
|
-
break
|
|
254
|
+
def _read_nav_toc(root: Element) -> list[Toc]:
|
|
255
|
+
nav_elem = None
|
|
256
|
+
for nav in root.findall(".//nav"):
|
|
257
|
+
epub_type = nav.get("type")
|
|
258
|
+
if epub_type == "toc":
|
|
259
|
+
nav_elem = nav
|
|
260
|
+
break
|
|
271
261
|
|
|
272
|
-
|
|
273
|
-
|
|
262
|
+
if nav_elem is None:
|
|
263
|
+
return []
|
|
274
264
|
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
265
|
+
ol = nav_elem.find(".//ol")
|
|
266
|
+
if ol is None:
|
|
267
|
+
return []
|
|
278
268
|
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
269
|
+
result = []
|
|
270
|
+
for li in ol.findall("li"):
|
|
271
|
+
toc_item = _parse_nav_li(li)
|
|
272
|
+
if toc_item:
|
|
273
|
+
result.append(toc_item)
|
|
284
274
|
|
|
285
|
-
|
|
275
|
+
return result
|
|
286
276
|
|
|
287
277
|
|
|
288
278
|
def _parse_nav_li(li: Element) -> Toc | None:
|
|
@@ -331,30 +321,22 @@ def _parse_nav_li(li: Element) -> Toc | None:
|
|
|
331
321
|
)
|
|
332
322
|
|
|
333
323
|
|
|
334
|
-
def
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
if ol is None:
|
|
351
|
-
raise ValueError("Cannot find ol in nav element")
|
|
352
|
-
|
|
353
|
-
_update_nav_lis(ol, toc_list, ns)
|
|
354
|
-
|
|
355
|
-
tree = ET.ElementTree(root)
|
|
356
|
-
with zip.replace(nav_path) as out:
|
|
357
|
-
tree.write(out, encoding="utf-8", xml_declaration=True)
|
|
324
|
+
def _update_nav_toc(root: Element, toc_list: list[Toc]) -> None:
|
|
325
|
+
nav_elem = None
|
|
326
|
+
for nav in root.findall(".//nav"):
|
|
327
|
+
epub_type = nav.get("type")
|
|
328
|
+
if epub_type == "toc":
|
|
329
|
+
nav_elem = nav
|
|
330
|
+
break
|
|
331
|
+
|
|
332
|
+
if nav_elem is None:
|
|
333
|
+
raise ValueError("Cannot find nav element with type='toc'")
|
|
334
|
+
|
|
335
|
+
ol = nav_elem.find(".//ol")
|
|
336
|
+
if ol is None:
|
|
337
|
+
raise ValueError("Cannot find ol in nav element")
|
|
338
|
+
|
|
339
|
+
_update_nav_lis(ol, toc_list, None)
|
|
358
340
|
|
|
359
341
|
|
|
360
342
|
def _update_nav_lis(ol: Element, toc_list: list[Toc], ns: str | None) -> None:
|
|
epub_translator/translation/translator.py
CHANGED
|
@@ -6,6 +6,8 @@ from os import PathLike
|
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
|
|
8
8
|
from ..epub import (
|
|
9
|
+
MetadataContext,
|
|
10
|
+
TocContext,
|
|
9
11
|
Zip,
|
|
10
12
|
read_metadata,
|
|
11
13
|
read_toc,
|
|
@@ -31,6 +33,8 @@ class _ElementType(Enum):
|
|
|
31
33
|
class _ElementContext:
|
|
32
34
|
element_type: _ElementType
|
|
33
35
|
chapter_data: tuple[Path, XMLLikeNode] | None = None
|
|
36
|
+
toc_context: TocContext | None = None
|
|
37
|
+
metadata_context: MetadataContext | None = None
|
|
34
38
|
|
|
35
39
|
|
|
36
40
|
def translate(
|
|
@@ -74,8 +78,8 @@ def translate(
|
|
|
74
78
|
zip.migrate(Path("mimetype"))
|
|
75
79
|
|
|
76
80
|
total_chapters = sum(1 for _, _ in search_spine_paths(zip))
|
|
77
|
-
toc_list = read_toc(zip)
|
|
78
|
-
metadata_fields = read_metadata(zip)
|
|
81
|
+
toc_list, toc_context = read_toc(zip)
|
|
82
|
+
metadata_fields, metadata_context = read_metadata(zip)
|
|
79
83
|
|
|
80
84
|
# Calculate weights: TOC (5%), Metadata (5%), Chapters (90%)
|
|
81
85
|
toc_has_items = len(toc_list) > 0
|
|
@@ -101,14 +105,17 @@ def translate(
|
|
|
101
105
|
tasks=_generate_tasks_from_book(
|
|
102
106
|
zip=zip,
|
|
103
107
|
toc_list=toc_list,
|
|
108
|
+
toc_context=toc_context,
|
|
104
109
|
metadata_fields=metadata_fields,
|
|
110
|
+
metadata_context=metadata_context,
|
|
105
111
|
submit=submit,
|
|
106
112
|
),
|
|
107
113
|
):
|
|
108
114
|
if context.element_type == _ElementType.TOC:
|
|
109
115
|
translated_elem = unwrap_french_quotes(translated_elem)
|
|
110
116
|
decoded_toc = decode_toc_list(translated_elem)
|
|
111
|
-
|
|
117
|
+
if context.toc_context is not None:
|
|
118
|
+
write_toc(zip, decoded_toc, context.toc_context)
|
|
112
119
|
|
|
113
120
|
current_progress += toc_weight
|
|
114
121
|
if on_progress:
|
|
@@ -117,7 +124,8 @@ def translate(
|
|
|
117
124
|
elif context.element_type == _ElementType.METADATA:
|
|
118
125
|
translated_elem = unwrap_french_quotes(translated_elem)
|
|
119
126
|
decoded_metadata = decode_metadata(translated_elem)
|
|
120
|
-
|
|
127
|
+
if context.metadata_context is not None:
|
|
128
|
+
write_metadata(zip, decoded_metadata, context.metadata_context)
|
|
121
129
|
|
|
122
130
|
current_progress += metadata_weight
|
|
123
131
|
if on_progress:
|
|
@@ -138,7 +146,9 @@ def translate(
|
|
|
138
146
|
def _generate_tasks_from_book(
|
|
139
147
|
zip: Zip,
|
|
140
148
|
toc_list: list,
|
|
149
|
+
toc_context: TocContext,
|
|
141
150
|
metadata_fields: list,
|
|
151
|
+
metadata_context: MetadataContext,
|
|
142
152
|
submit: SubmitKind,
|
|
143
153
|
) -> Generator[TranslationTask[_ElementContext], None, None]:
|
|
144
154
|
head_submit = submit
|
|
@@ -149,14 +159,14 @@ def _generate_tasks_from_book(
|
|
|
149
159
|
yield TranslationTask(
|
|
150
160
|
element=encode_toc_list(toc_list),
|
|
151
161
|
action=head_submit,
|
|
152
|
-
payload=_ElementContext(element_type=_ElementType.TOC),
|
|
162
|
+
payload=_ElementContext(element_type=_ElementType.TOC, toc_context=toc_context),
|
|
153
163
|
)
|
|
154
164
|
|
|
155
165
|
if metadata_fields:
|
|
156
166
|
yield TranslationTask(
|
|
157
167
|
element=encode_metadata(metadata_fields),
|
|
158
168
|
action=head_submit,
|
|
159
|
-
payload=_ElementContext(element_type=_ElementType.METADATA),
|
|
169
|
+
payload=_ElementContext(element_type=_ElementType.METADATA, metadata_context=metadata_context),
|
|
160
170
|
)
|
|
161
171
|
|
|
162
172
|
for chapter_path, media_type in search_spine_paths(zip):
|
|
epub_translator/xml/self_closing.py
CHANGED
|
@@ -3,6 +3,8 @@ import re
|
|
|
3
3
|
# Some non-standard EPUB generators use HTML-style tags without self-closing syntax
|
|
4
4
|
# We need to convert them to XML-compatible format before parsing
|
|
5
5
|
# These are HTML5 void elements that must be self-closing in XHTML
|
|
6
|
+
# Note: "meta" is excluded because OPF files have <meta property="...">content</meta>
|
|
7
|
+
# which is NOT a void element (different namespace, different rules)
|
|
6
8
|
_VOID_TAGS = (
|
|
7
9
|
"area",
|
|
8
10
|
"base",
|
|
@@ -13,7 +15,6 @@ _VOID_TAGS = (
|
|
|
13
15
|
"img",
|
|
14
16
|
"input",
|
|
15
17
|
"link",
|
|
16
|
-
"meta",
|
|
17
18
|
"param",
|
|
18
19
|
"source",
|
|
19
20
|
"track",
|
|
@@ -26,7 +27,8 @@ def self_close_void_elements(xml_content: str) -> str:
|
|
|
26
27
|
Convert void HTML elements to self-closing format for XML parsing.
|
|
27
28
|
|
|
28
29
|
This function handles non-standard HTML where void elements are not self-closed.
|
|
29
|
-
|
|
30
|
+
Note: "meta" is excluded from processing because EPUB OPF files have
|
|
31
|
+
<meta property="...">content</meta> which is NOT a void element.
|
|
30
32
|
|
|
31
33
|
Args:
|
|
32
34
|
xml_content: HTML/XHTML content string
|
|
@@ -35,9 +37,8 @@ def self_close_void_elements(xml_content: str) -> str:
|
|
|
35
37
|
Content with void elements in self-closing format
|
|
36
38
|
|
|
37
39
|
Example:
|
|
38
|
-
<meta charset="utf-8"> → <meta charset="utf-8" />
|
|
39
40
|
<br> → <br />
|
|
40
|
-
<
|
|
41
|
+
<link rel="stylesheet" href="style.css"> → <link rel="stylesheet" href="style.css" />
|
|
41
42
|
"""
|
|
42
43
|
for tag in _VOID_TAGS:
|
|
43
44
|
xml_content = _fix_void_element(xml_content, tag)
|
epub_translator/xml/xml_like.py
CHANGED
|
@@ -32,6 +32,25 @@ _ENCODING_PATTERN = re.compile(r'encoding\s*=\s*["\']([^"\']+)["\']', re.IGNOREC
|
|
|
32
32
|
_FIRST_ELEMENT_PATTERN = re.compile(r"<(?![?!])[a-zA-Z]")
|
|
33
33
|
_NAMESPACE_IN_TAG = re.compile(r"\{([^}]+)\}")
|
|
34
34
|
|
|
35
|
+
# When an attribute name exists in multiple namespaces (e.g., 'type' in XHTML and EPUB ops),
|
|
36
|
+
# _attr_to_namespace only records ONE namespace per attribute name. During serialization,
|
|
37
|
+
# the global string replacement wrongly adds namespace prefixes to ALL occurrences of that
|
|
38
|
+
# attribute, including ones that should remain unprefixed (e.g., <link type="text/css">).
|
|
39
|
+
#
|
|
40
|
+
# Example problem:
|
|
41
|
+
# Original file has:
|
|
42
|
+
# - <link type="text/css"> (no namespace, standard HTML attribute)
|
|
43
|
+
# - <nav epub:type="toc"> (EPUB ops namespace)
|
|
44
|
+
# After parsing, _attr_to_namespace records: {'type': 'http://www.idpf.org/2007/ops'}
|
|
45
|
+
# During serialization, ALL ' type="' get replaced to ' epub:type="', breaking <link>
|
|
46
|
+
#
|
|
47
|
+
# This workaround fixes specific known cases where HTML standard attributes should not
|
|
48
|
+
# be prefixed, even if the same attribute name appears with a namespace elsewhere.
|
|
49
|
+
_STANDARD_HTML_ATTRS = (
|
|
50
|
+
(re.compile(r'<link([^>]*?) epub:type="'), r'<link\1 type="'), # <link type="...">
|
|
51
|
+
(re.compile(r'<link([^>]*?) epub:rel="'), r'<link\1 rel="'), # <link rel="...">
|
|
52
|
+
)
|
|
53
|
+
|
|
35
54
|
|
|
36
55
|
class XMLLikeNode:
|
|
37
56
|
def __init__(self, file: IO[bytes], is_html_like: bool = False) -> None:
|
|
@@ -197,10 +216,13 @@ class XMLLikeNode:
|
|
|
197
216
|
xml_string = xml_string.replace(f"</{tag_name}>", f"</{prefix}:{tag_name}>")
|
|
198
217
|
xml_string = xml_string.replace(f"<{tag_name}/>", f"<{prefix}:{tag_name}/>")
|
|
199
218
|
|
|
200
|
-
# Similarly for attributes (though less common in EPUB)
|
|
201
219
|
for attr_name, namespace_uri in self._attr_to_namespace.items():
|
|
202
220
|
if namespace_uri not in _ROOT_NAMESPACES:
|
|
203
221
|
prefix = self._namespaces[namespace_uri]
|
|
204
222
|
xml_string = xml_string.replace(f' {attr_name}="', f' {prefix}:{attr_name}="')
|
|
205
223
|
|
|
224
|
+
# Apply workaround to fix standard HTML attributes (see _STANDARD_HTML_ATTRS comment)
|
|
225
|
+
for pattern, replacement in _STANDARD_HTML_ATTRS:
|
|
226
|
+
xml_string = pattern.sub(replacement, xml_string)
|
|
227
|
+
|
|
206
228
|
return xml_string
|
|
{epub_translator-0.1.8.dist-info → epub_translator-0.1.9.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: epub-translator
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.9
|
|
4
4
|
Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
|
|
5
5
|
License: MIT
|
|
6
6
|
Keywords: epub,llm,translation,translator
|
|
@@ -46,26 +46,17 @@ Description-Content-Type: text/markdown
|
|
|
46
46
|
</div>
|
|
47
47
|
|
|
48
48
|
|
|
49
|
-
|
|
49
|
+
Want to read a book in a foreign language without losing the original context? EPUB Translator transforms any EPUB into a bilingual edition with AI-powered translations displayed side-by-side with the original text.
|
|
50
50
|
|
|
51
|
-
|
|
51
|
+
Whether you're learning a new language, conducting academic research, or simply enjoying foreign literature, you get both versions in one book - preserving all formatting, images, and structure.
|
|
52
52
|
|
|
53
|
-
|
|
53
|
+

|
|
54
54
|
|
|
55
|
-
|
|
56
|
-
- **LLM-Powered**: Leverages large language models for high-quality, context-aware translations
|
|
57
|
-
- **Format Preservation**: Maintains EPUB structure, styles, images, and formatting
|
|
58
|
-
- **Complete Translation**: Translates chapter content, table of contents, and metadata
|
|
59
|
-
- **Progress Tracking**: Monitor translation progress with built-in callbacks
|
|
60
|
-
- **Flexible LLM Support**: Works with any OpenAI-compatible API endpoint
|
|
61
|
-
- **Caching**: Built-in caching for progress recovery when translation fails
|
|
55
|
+
### Online Demo
|
|
62
56
|
|
|
63
|
-
|
|
57
|
+
We provide an [online demo platform](https://hub.oomol.com/package/books-translator) where you can try EPUB Translator's bilingual translation capabilities without any installation. Simply upload your EPUB file and get a translated bilingual edition.
|
|
64
58
|
|
|
65
|
-
|
|
66
|
-
- **Academic Research**: Access foreign literature with bilingual references
|
|
67
|
-
- **Content Localization**: Prepare books for international audiences
|
|
68
|
-
- **Cross-Cultural Reading**: Enjoy literature while understanding cultural nuances
|
|
59
|
+
[](https://hub.oomol.com/package/books-translator)
|
|
69
60
|
|
|
70
61
|
## Installation
|
|
71
62
|
|
|
{epub_translator-0.1.8.dist-info → epub_translator-0.1.9.dist-info}/RECORD
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
epub_translator/__init__.py,sha256=JsiOUPpk5k7q8mXIgnRQWdVVnkJww_KDTg7jXsP7_C4,222
|
|
2
2
|
epub_translator/data/fill.jinja,sha256=zSytA8Vhp2i6YBZ09F1z9iPJq1-jUaiphoXqTNZwnvo,6964
|
|
3
3
|
epub_translator/data/translate.jinja,sha256=MVAWvgO9kybEFi0zMiZLEWwuRUL3l8PrwJdsoueQeCs,855
|
|
4
|
-
epub_translator/epub/__init__.py,sha256=
|
|
4
|
+
epub_translator/epub/__init__.py,sha256=4kIHrFUvFBemqg4lpjOpa9mmvWZGycgWdiQUaJ4JmL4,183
|
|
5
5
|
epub_translator/epub/common.py,sha256=4-SpTe8iot9hMfyXILmlUFvYVNYqPAHL5hn1fr2wgis,1180
|
|
6
6
|
epub_translator/epub/math.py,sha256=-Q2LJQxxjgQZQUe_WlJA9tjzLqgqtw2ZmbGbHsPRp2U,5422
|
|
7
|
-
epub_translator/epub/metadata.py,sha256=
|
|
7
|
+
epub_translator/epub/metadata.py,sha256=Ddhq-kDtYz2yy41ayXtFxEL-_Lsvn-_vf8hm76HUbRE,2387
|
|
8
8
|
epub_translator/epub/spines.py,sha256=bP2IsobZm7zs4z10iXGc9SmgAFSIq9pJc8HE-V0aW9Y,1331
|
|
9
|
-
epub_translator/epub/toc.py,sha256=
|
|
9
|
+
epub_translator/epub/toc.py,sha256=N-tBR9Pv0FyCWq8swtSI93mCywN7mIXqweDBBmcDYJ8,13815
|
|
10
10
|
epub_translator/epub/zip.py,sha256=-3LI8f-ksgU8xCy28NjBOKyQPE8PhPEUPqIKZE1p8dw,2364
|
|
11
11
|
epub_translator/llm/__init__.py,sha256=YcFYYnXmXyX0RUyC-PDbj5k7Woygp_XOpTI3vDiNSPM,75
|
|
12
12
|
epub_translator/llm/context.py,sha256=8-0UnrZIaNshR_imy_ed_UpOK7H1a6dOsG-boaYOX8k,4186
|
|
@@ -31,7 +31,7 @@ epub_translator/translation/__init__.py,sha256=R0c0ZngocOC-Qczs0a8JYAdAcCu2gv3FL
|
|
|
31
31
|
epub_translator/translation/epub_transcode.py,sha256=_pRzmQgDrlfsibalkUogVi0F0Qy_uuYfKhZk3nP5pkA,2747
|
|
32
32
|
epub_translator/translation/language.py,sha256=88osG0JNYxOkxBjg5Pm-P0Mhiyxf6GqdxoPW12HW0PE,493
|
|
33
33
|
epub_translator/translation/punctuation.py,sha256=TPCGjEmlAyN3G11VuXdHn-pvUkuWDwWqbTNzw-ij60E,813
|
|
34
|
-
epub_translator/translation/translator.py,sha256=
|
|
34
|
+
epub_translator/translation/translator.py,sha256=rly6hXwZ0bylV0-5LVeEEHrZSJ6xKaZlEbrjnG4kkOE,7033
|
|
35
35
|
epub_translator/translation/xml_interrupter.py,sha256=7TRGskn_OxRZT5mvKfjL0VMtU2VCgl1d9ElmfhFG0pM,8628
|
|
36
36
|
epub_translator/utils.py,sha256=BfZWrYjzDNQ4cFrgvRNzd4i1CKLtPxS8Z4LBHhqEV78,914
|
|
37
37
|
epub_translator/xml/__init__.py,sha256=qluFTfZYlPmOie8nR2C5O0tZ3UbCQEoEoR-Fq-__79c,160
|
|
@@ -44,10 +44,10 @@ epub_translator/xml/friendly/parser.py,sha256=QlMHA0nfPJbNyx6IwRFrYVw7okuvzDB42N
|
|
|
44
44
|
epub_translator/xml/friendly/tag.py,sha256=ahaGoYttuAlnFxLFFgTV51KUZSpUiHho-COZX14nxN8,3308
|
|
45
45
|
epub_translator/xml/friendly/transform.py,sha256=5tG1MJmzrXIR_Z5gmRxwcoKvXBzJBVH0ELeaRsG-8w0,1201
|
|
46
46
|
epub_translator/xml/inline.py,sha256=VcaNEF2ebVl2fogVk2yV3f4vOP4rePsPTV_qU3fJCE0,3108
|
|
47
|
-
epub_translator/xml/self_closing.py,sha256=
|
|
47
|
+
epub_translator/xml/self_closing.py,sha256=gA3wI4axhx281iMnK7Eu81mSXfOhaGsHAVYCYKTXAoQ,5446
|
|
48
48
|
epub_translator/xml/utils.py,sha256=7tQ6L5P0_JXhxONeG64hEeeL5mKjA6NKS1H1Q9B1Cac,1062
|
|
49
49
|
epub_translator/xml/xml.py,sha256=qQ5Wk1-KVVHE4TX25zGOR7fINsGkXnoq-qyKKNl5no4,1675
|
|
50
|
-
epub_translator/xml/xml_like.py,sha256=
|
|
50
|
+
epub_translator/xml/xml_like.py,sha256=oW8JhpdihlayOxDLFlM29uA2HTjpHk7r85jxTcqajME,10142
|
|
51
51
|
epub_translator/xml_translator/__init__.py,sha256=lqts1mJL_WfojDnMAQ5OM7TbT6u9X3H-X4C_avHzvXM,128
|
|
52
52
|
epub_translator/xml_translator/callbacks.py,sha256=IoZrsaivd2W76cHFupwv6auVxgEWHcBN2MHQJYcWoJ8,1324
|
|
53
53
|
epub_translator/xml_translator/common.py,sha256=hSPptgPp7j6dm47imELB5DgmEbzTEyJD6WEeELOOc50,38
|
|
@@ -58,7 +58,7 @@ epub_translator/xml_translator/stream_mapper.py,sha256=nk8iRUHAUQA2B35_y-JOCo6il
|
|
|
58
58
|
epub_translator/xml_translator/submitter.py,sha256=_ic2_JBPdEd2nMSu2mtQ5OzqpGv0zGrvYaicVUXAiUQ,14159
|
|
59
59
|
epub_translator/xml_translator/translator.py,sha256=7Ja1jFbmjIgHcmI9V6gg_K0t7qb6in9mhRn54a7qhZ8,9497
|
|
60
60
|
epub_translator/xml_translator/validation.py,sha256=-OKlSZuD__sjAiEpGAO93YQme4ZDSPmoPjRsAMOCEjc,16668
|
|
61
|
-
epub_translator-0.1.
|
|
62
|
-
epub_translator-0.1.
|
|
63
|
-
epub_translator-0.1.
|
|
64
|
-
epub_translator-0.1.
|
|
61
|
+
epub_translator-0.1.9.dist-info/LICENSE,sha256=5RF32sL3LtMOJIErdDKp1ZEYPGXS8WPpsiSz_jMBnGI,1066
|
|
62
|
+
epub_translator-0.1.9.dist-info/METADATA,sha256=0Av_UtT49b-yCrurxxzXxMS-KGnraqPLzQCOdxzLh9U,18274
|
|
63
|
+
epub_translator-0.1.9.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
64
|
+
epub_translator-0.1.9.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|