epub-translator 0.0.7__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/__init__.py +4 -2
- epub_translator/data/fill.jinja +66 -0
- epub_translator/data/mmltex/README.md +67 -0
- epub_translator/data/mmltex/cmarkup.xsl +1106 -0
- epub_translator/data/mmltex/entities.xsl +459 -0
- epub_translator/data/mmltex/glayout.xsl +222 -0
- epub_translator/data/mmltex/mmltex.xsl +36 -0
- epub_translator/data/mmltex/scripts.xsl +375 -0
- epub_translator/data/mmltex/tables.xsl +130 -0
- epub_translator/data/mmltex/tokens.xsl +328 -0
- epub_translator/data/translate.jinja +15 -12
- epub_translator/epub/__init__.py +4 -2
- epub_translator/epub/common.py +43 -0
- epub_translator/epub/math.py +193 -0
- epub_translator/epub/placeholder.py +53 -0
- epub_translator/epub/spines.py +42 -0
- epub_translator/epub/toc.py +505 -0
- epub_translator/epub/zip.py +67 -0
- epub_translator/iter_sync.py +24 -0
- epub_translator/language.py +23 -0
- epub_translator/llm/__init__.py +2 -1
- epub_translator/llm/core.py +233 -0
- epub_translator/llm/error.py +38 -35
- epub_translator/llm/executor.py +159 -136
- epub_translator/llm/increasable.py +28 -28
- epub_translator/llm/types.py +17 -0
- epub_translator/serial/__init__.py +2 -0
- epub_translator/serial/chunk.py +52 -0
- epub_translator/serial/segment.py +17 -0
- epub_translator/serial/splitter.py +50 -0
- epub_translator/template.py +35 -33
- epub_translator/translator.py +208 -178
- epub_translator/utils.py +7 -0
- epub_translator/xml/__init__.py +4 -3
- epub_translator/xml/deduplication.py +38 -0
- epub_translator/xml/firendly/__init__.py +2 -0
- epub_translator/xml/firendly/decoder.py +75 -0
- epub_translator/xml/firendly/encoder.py +84 -0
- epub_translator/xml/firendly/parser.py +177 -0
- epub_translator/xml/firendly/tag.py +118 -0
- epub_translator/xml/firendly/transform.py +36 -0
- epub_translator/xml/xml.py +52 -0
- epub_translator/xml/xml_like.py +231 -0
- epub_translator/xml_translator/__init__.py +3 -0
- epub_translator/xml_translator/const.py +2 -0
- epub_translator/xml_translator/fill.py +128 -0
- epub_translator/xml_translator/format.py +282 -0
- epub_translator/xml_translator/fragmented.py +125 -0
- epub_translator/xml_translator/group.py +183 -0
- epub_translator/xml_translator/progressive_locking.py +256 -0
- epub_translator/xml_translator/submitter.py +102 -0
- epub_translator/xml_translator/text_segment.py +263 -0
- epub_translator/xml_translator/translator.py +179 -0
- epub_translator/xml_translator/utils.py +29 -0
- epub_translator-0.1.1.dist-info/METADATA +283 -0
- epub_translator-0.1.1.dist-info/RECORD +58 -0
- epub_translator/data/format.jinja +0 -33
- epub_translator/epub/content_parser.py +0 -162
- epub_translator/epub/html/__init__.py +0 -1
- epub_translator/epub/html/dom_operator.py +0 -68
- epub_translator/epub/html/empty_tags.py +0 -23
- epub_translator/epub/html/file.py +0 -80
- epub_translator/epub/html/texts_searcher.py +0 -46
- epub_translator/llm/node.py +0 -201
- epub_translator/translation/__init__.py +0 -2
- epub_translator/translation/chunk.py +0 -118
- epub_translator/translation/splitter.py +0 -78
- epub_translator/translation/store.py +0 -36
- epub_translator/translation/translation.py +0 -231
- epub_translator/translation/types.py +0 -45
- epub_translator/translation/utils.py +0 -11
- epub_translator/xml/decoder.py +0 -71
- epub_translator/xml/encoder.py +0 -95
- epub_translator/xml/parser.py +0 -172
- epub_translator/xml/tag.py +0 -93
- epub_translator/xml/transform.py +0 -34
- epub_translator/xml/utils.py +0 -12
- epub_translator/zip_context.py +0 -74
- epub_translator-0.0.7.dist-info/METADATA +0 -170
- epub_translator-0.0.7.dist-info/RECORD +0 -36
- {epub_translator-0.0.7.dist-info → epub_translator-0.1.1.dist-info}/LICENSE +0 -0
- {epub_translator-0.0.7.dist-info → epub_translator-0.1.1.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
from collections.abc import Generator, Iterable
|
|
2
|
+
from enum import Enum, auto
|
|
3
|
+
from io import StringIO
|
|
4
|
+
|
|
5
|
+
from .tag import Tag, TagKind, is_valid_name_char, is_valid_value_char
|
|
6
|
+
|
|
7
|
+
# Characters accepted as whitespace after a tag name and between attributes.
# XML's `S` production allows space, tab, carriage return and line feed, so all
# four are listed; otherwise a tag such as `<a\thref="x">` is rejected as text.
_SPACES = (" ", "\n", "\t", "\r")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class _Phase(Enum):
    """Position of the tag parser relative to the tag it is currently reading."""

    # Not inside a tag; characters are collected as plain text.
    OUTSIDE = auto()
    # A "<" has just been consumed.
    LEFT_BRACKET = auto()
    # "</" has been consumed; a tag name must follow.
    LEFT_SLASH = auto()
    # Reading the characters of the tag name.
    TAG_NAME = auto()
    # Whitespace after the tag name or after an attribute value.
    TAG_GAP = auto()
    # Reading the characters of an attribute name.
    ATTRIBUTE_NAME = auto()
    # "=" has been consumed; an opening double quote must follow.
    ATTRIBUTE_NAME_EQUAL = auto()
    # Inside a double-quoted attribute value.
    ATTRIBUTE_VALUE = auto()
    # The "/" of a self-closing tag has been consumed; only ">" is accepted.
    MUST_CLOSING_SIGN = auto()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class _ParsedResult(Enum):
    """Outcome of feeding a single character to the tag parser."""

    # The character was consumed; nothing is ready to be emitted yet.
    Continue = auto()
    # The character completed a candidate tag.
    Success = auto()
    # The candidate tag was rejected; its characters fall back to plain text.
    Failed = auto()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def parse_tags(chars: Iterable[str]) -> Generator[str | Tag, None, None]:
    """Split a character stream into plain-text runs and ``Tag`` objects.

    A fresh parser is created for every call, so calls do not share state.
    Items are yielded in the order they occur in ``chars``.
    """
    parser = _XMLTagsParser()
    yield from parser.do(chars)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class _XMLTagsParser:
    """Character-driven state machine that splits text into strings and tags.

    Characters are fed one at a time. Text outside of tags accumulates in
    ``_outside_buffer``. The raw characters of a candidate tag accumulate in
    ``_tag_buffer`` so they can be handed back as plain text when the
    candidate turns out not to be a well-formed tag.
    """

    def __init__(self):
        self._outside_buffer: StringIO = StringIO()
        self._tag_buffer: StringIO = StringIO()
        self._tag: Tag | None = None
        self._phase: _Phase = _Phase.OUTSIDE

    def do(self, chars: Iterable[str]) -> Generator[str | Tag, None, None]:
        """Parse ``chars`` and yield text runs and tags in input order.

        A candidate tag that is still unfinished when the input ends is
        emitted as part of the trailing text.
        """
        for char in chars:
            yield from self._generate_by_result(self._parse_char(char))

        # Input ended in the middle of a candidate tag: hand it back as text.
        self._outside_buffer.write(self._tag_buffer.getvalue())
        outside_text = self._outside_buffer.getvalue()
        if outside_text != "":
            yield outside_text

    def _parse_char(self, char: str) -> _ParsedResult:
        """Advance the state machine by one character and report the outcome."""
        if self._phase == _Phase.OUTSIDE:
            if char != "<":
                self._outside_buffer.write(char)
            else:
                # Start a candidate tag; it is assumed to be an opening tag
                # until a "/" proves otherwise.
                self._phase = _Phase.LEFT_BRACKET
                self._tag_buffer.write(char)
                self._tag = Tag(
                    kind=TagKind.OPENING,
                    name="",
                    proto="",
                    attributes=[],
                )
            return _ParsedResult.Continue

        assert self._tag is not None
        # Keep the raw text so a rejected candidate can be restored verbatim.
        self._tag_buffer.write(char)
        return self._parse_tag_char(self._tag, char)

    def _parse_tag_char(self, tag: Tag, char: str) -> _ParsedResult:
        """Handle one character that belongs to the candidate ``tag``."""
        phase = self._phase

        if phase == _Phase.LEFT_BRACKET:
            if char == "/":
                tag.kind = TagKind.CLOSING
                self._phase = _Phase.LEFT_SLASH
            elif is_valid_name_char(char):
                tag.name += char
                self._phase = _Phase.TAG_NAME
            else:
                return _ParsedResult.Failed

        elif phase == _Phase.LEFT_SLASH:
            if not is_valid_name_char(char):
                return _ParsedResult.Failed
            tag.name += char
            self._phase = _Phase.TAG_NAME

        elif phase == _Phase.TAG_NAME:
            if char in _SPACES:
                self._phase = _Phase.TAG_GAP
            elif is_valid_name_char(char):
                tag.name += char
            elif char == ">":
                return _ParsedResult.Success
            elif char == "/" and tag.kind == TagKind.OPENING:
                self._begin_self_closing(tag)
            else:
                return _ParsedResult.Failed

        elif phase == _Phase.TAG_GAP:
            if char in _SPACES:
                pass
            elif is_valid_name_char(char):
                tag.attributes.append((char, ""))
                self._phase = _Phase.ATTRIBUTE_NAME
            elif char == ">":
                return _ParsedResult.Success
            elif char == "/" and tag.kind == TagKind.OPENING:
                self._begin_self_closing(tag)
            else:
                return _ParsedResult.Failed

        elif phase == _Phase.ATTRIBUTE_NAME:
            if is_valid_name_char(char):
                self._extend_last_attribute(tag, name_suffix=char)
            elif char == "=":
                self._phase = _Phase.ATTRIBUTE_NAME_EQUAL
            else:
                return _ParsedResult.Failed

        elif phase == _Phase.ATTRIBUTE_NAME_EQUAL:
            # Only double-quoted attribute values are accepted.
            if char != '"':
                return _ParsedResult.Failed
            self._phase = _Phase.ATTRIBUTE_VALUE

        elif phase == _Phase.ATTRIBUTE_VALUE:
            if is_valid_value_char(char):
                self._extend_last_attribute(tag, value_suffix=char)
            elif char == '"':
                self._phase = _Phase.TAG_GAP
            else:
                return _ParsedResult.Failed

        elif phase == _Phase.MUST_CLOSING_SIGN:
            if char == ">":
                return _ParsedResult.Success
            return _ParsedResult.Failed

        return _ParsedResult.Continue

    def _begin_self_closing(self, tag: Tag) -> None:
        """Record that ``tag`` is self-closing; only ">" may follow."""
        tag.kind = TagKind.SELF_CLOSING
        self._phase = _Phase.MUST_CLOSING_SIGN

    @staticmethod
    def _extend_last_attribute(tag: Tag, name_suffix: str = "", value_suffix: str = "") -> None:
        """Append characters to the name and/or value of the newest attribute."""
        attr_name, attr_value = tag.attributes[-1]
        tag.attributes[-1] = (attr_name + name_suffix, attr_value + value_suffix)

    def _generate_by_result(self, parsed_result: _ParsedResult) -> Generator[str | Tag, None, None]:
        """Emit pending text and/or a finished tag according to ``parsed_result``."""
        if parsed_result == _ParsedResult.Success:
            tag = self._tag
            assert tag is not None
            self._tag = None
            self._phase = _Phase.OUTSIDE

            if self._is_tag_valid(tag):
                outside_text = self._outside_buffer.getvalue()
                self._clear_buffer(self._outside_buffer)
                self._clear_buffer(self._tag_buffer)
                if outside_text != "":
                    yield outside_text
                yield tag
            else:
                # Syntactically complete but semantically invalid: treat the
                # raw characters as ordinary text.
                self._outside_buffer.write(self._tag_buffer.getvalue())
                self._clear_buffer(self._tag_buffer)

        elif parsed_result == _ParsedResult.Failed:
            rejected = self._tag_buffer.getvalue()
            self._clear_buffer(self._tag_buffer)
            self._tag = None
            self._phase = _Phase.OUTSIDE

            if len(rejected) > 1 and rejected.endswith("<"):
                # "<" is never legal inside a tag, so it both ends the broken
                # candidate and may begin a new one (e.g. "<<b>"). Return the
                # broken part as text and restart tag parsing at this "<".
                self._outside_buffer.write(rejected[:-1])
                self._parse_char("<")
            else:
                self._outside_buffer.write(rejected)

    def _is_tag_valid(self, tag: Tag) -> bool:
        """Return whether a completed ``tag`` may be emitted as a tag."""
        if tag.kind == TagKind.CLOSING and len(tag.attributes) > 0:
            return False
        if tag.find_invalid_name() is not None:
            return False
        return True

    def _clear_buffer(self, buffer: StringIO):
        """Empty ``buffer`` and move its write position back to the start."""
        buffer.truncate(0)
        buffer.seek(0)
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
from collections.abc import Generator
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from enum import Enum, auto
|
|
4
|
+
from io import StringIO
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class TagKind(Enum):
    """Syntactic form of a tag."""

    # Rendered as "<name ...>".
    OPENING = auto()
    # Rendered as "</name>".
    CLOSING = auto()
    # Rendered as "<name .../>".
    SELF_CLOSING = auto()
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class Tag:
    """A single XML-like tag: its kind, name, source text and attributes."""

    kind: TagKind
    name: str
    # Text associated with the tag by its producer; not used when rendering.
    proto: str
    # (name, value) pairs in the order they are rendered.
    attributes: list[tuple[str, str]]

    def __str__(self):
        """Render the tag as markup with double-quoted attribute values."""
        opening = "</" if self.kind == TagKind.CLOSING else "<"
        closing = "/>" if self.kind == TagKind.SELF_CLOSING else ">"
        rendered = " ".join(f'{attr_name}="{attr_value}"' for attr_name, attr_value in self.attributes)
        separator = " " if self.attributes else ""
        return f"{opening}{self.name}{separator}{rendered}{closing}"

    def find_invalid_name(self) -> str | None:
        """Return the first invalid tag or attribute name, or ``None``.

        The tag name is checked first, then the attribute names in order.
        This enforces a subset of the XML naming rules
        (https://www.w3schools.com/xml/xml_elements.asp):

        - Names must not be empty.
        - Names may only contain characters accepted by ``is_valid_name_char``.
        - Names must start with a letter (a-z, A-Z) or an underscore (_).
        """
        for name in self._iter_tag_names():
            if name == "":
                return name
            # Names are checked against the *name* alphabet. The value
            # alphabet also contains characters such as " ", "/", "&" and "=",
            # which can never be part of a name.
            if not all(is_valid_name_char(c) for c in name):
                return name
            first = name[0]
            if first == "_" or "a" <= first <= "z" or "A" <= first <= "Z":
                continue
            return name

        return None

    def find_invalid_attr_value(self) -> tuple[str, str] | None:
        """Return the first ``(name, value)`` pair with an invalid value, or ``None``."""
        for attr_name, attr_value in self.attributes:
            if not all(is_valid_value_char(c) for c in attr_value):
                return attr_name, attr_value
        return None

    def _iter_tag_names(self) -> Generator[str, None, None]:
        """Yield the tag name followed by every attribute name."""
        yield self.name
        for attr_name, _ in self.attributes:
            yield attr_name
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# XML Attribute Values: https://www.w3.org/TR/xml/#NT-AttValue
|
|
74
|
+
# URI Syntax: https://www.rfc-editor.org/rfc/rfc3986
|
|
75
|
+
# HTML Attributes: https://html.spec.whatwg.org/multipage/syntax.html#attributes-2
|
|
76
|
+
# Punctuation accepted inside attribute values in addition to name characters.
# Iterating the string yields one member per character.
_VALID_VALUE_CHARS = frozenset(",./#?&=:%; ")
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# XML Names: https://www.w3.org/TR/xml/#NT-Name
|
|
94
|
+
# XML Namespaces: https://www.w3.org/TR/xml-names/#ns-qualnames
|
|
95
|
+
# HTML Custom Data Attributes: https://html.spec.whatwg.org/multipage/dom.html#custom-data-attribute
|
|
96
|
+
# Punctuation accepted inside tag and attribute names, one member per character.
_VALID_NAME_CHARS = frozenset("-_:.")
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def is_valid_value_char(char: str) -> bool:
    """Return whether ``char`` may appear inside an attribute value.

    Every valid name character is also a valid value character; the extra
    punctuation listed in ``_VALID_VALUE_CHARS`` is accepted on top of that.
    """
    return is_valid_name_char(char) or char in _VALID_VALUE_CHARS
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def is_valid_name_char(char: str) -> bool:
    """Return whether ``char`` may appear inside a tag or attribute name.

    Accepted are the punctuation in ``_VALID_NAME_CHARS`` plus ASCII letters
    and digits, a restricted subset of https://www.w3.org/TR/xml/#NT-Name.
    """
    return (
        char in _VALID_NAME_CHARS
        or "a" <= char <= "z"
        or "A" <= char <= "Z"
        or "0" <= char <= "9"
    )
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from xml.etree.ElementTree import Element
|
|
2
|
+
|
|
3
|
+
from .tag import Tag, TagKind
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def tag_to_element(tag: Tag) -> Element:
    """Build an ``Element`` carrying the name and attributes of ``tag``.

    When an attribute name occurs more than once, the last value wins.
    """
    return Element(tag.name, dict(tag.attributes))
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def element_to_tag(element: Element, kind: TagKind, proto: str = "") -> Tag:
    """Describe ``element`` as a ``Tag`` of the requested ``kind``.

    Attributes are copied in sorted name order; closing tags carry none.

    Raises:
        ValueError: if the tag name, an attribute name or an attribute value
            contains characters the tag syntax does not allow.
    """
    attributes: list[tuple[str, str]] = []
    if kind != TagKind.CLOSING:
        attributes = [(key, element.get(key, "")) for key in sorted(element.keys())]

    tag = Tag(kind=kind, name=element.tag, proto=proto, attributes=attributes)

    # To make it easier for the LLM to understand, naming is restricted more
    # strictly here than XML itself requires.
    # https://github.com/oomol-lab/pdf-craft/issues/149
    bad_name = tag.find_invalid_name()
    if bad_name is not None:
        raise ValueError(f"find invalid tag name or attribute name: {bad_name}")

    bad_pair = tag.find_invalid_attr_value()
    if bad_pair is not None:
        bad_attr_name, bad_attr_value = bad_pair
        raise ValueError(f'find invalid attribute value: {bad_attr_name}="{bad_attr_value}"')

    return tag
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from collections.abc import Generator
|
|
2
|
+
from xml.etree.ElementTree import Element
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def find_first(element: Element, tag: str) -> Element | None:
|
|
6
|
+
if element.tag == tag:
|
|
7
|
+
return element
|
|
8
|
+
for child in element:
|
|
9
|
+
result = find_first(child, tag)
|
|
10
|
+
if result is not None:
|
|
11
|
+
return result
|
|
12
|
+
return None
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def iter_with_stack(element: Element) -> Generator[tuple[list[Element], Element], None, None]:
    """Walk the tree in pre-order, yielding ``(parent_path, element)`` pairs.

    ``parent_path`` lists the ancestors from the root down to the direct
    parent; it is empty for the root itself.
    """
    pending: list[list[Element]] = [[element]]
    while pending:
        path = pending.pop()
        *ancestors, node = path
        yield ancestors, node

        # Children are read after the consumer resumes the generator. They are
        # pushed in reverse so the first child is popped, and visited, first.
        pending.extend([*path, child] for child in reversed(list(node)))
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def clone_element(element: Element) -> Element:
    """Return a deep copy of ``element``: tag, attributes, text and children.

    Tails of descendants are copied; the tail of ``element`` itself is not.
    """
    duplicate = Element(element.tag, element.attrib)
    duplicate.text = element.text
    for source_child in element:
        copied_child = clone_element(source_child)
        # The recursive call leaves the tail unset, so restore it here.
        copied_child.tail = source_child.tail
        duplicate.append(copied_child)
    return duplicate
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def plain_text(element: Element) -> str:
    """Return all text inside ``element``, concatenated in document order."""
    pieces = _iter_text_in(element)
    return "".join(pieces)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _iter_text_in(element: Element) -> Generator[str, None, None]:
    """Yield the non-empty text fragments inside ``element`` in document order.

    The tail of ``element`` itself is not included; tails of its descendants are.
    """
    leading = element.text
    if leading:
        yield leading
    for child in element:
        yield from _iter_text_in(child)
        trailing = child.tail
        if trailing:
            yield trailing
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
import io
|
|
2
|
+
import re
|
|
3
|
+
import warnings
|
|
4
|
+
from typing import IO
|
|
5
|
+
from xml.etree.ElementTree import Element, fromstring, tostring
|
|
6
|
+
|
|
7
|
+
from .xml import iter_with_stack
|
|
8
|
+
|
|
9
|
+
# URI of the reserved `xml` namespace; it never needs an explicit declaration.
_XML_NAMESPACE_URI = "http://www.w3.org/XML/1998/namespace"

# Preferred prefixes for namespaces commonly found in EPUB documents.
_COMMON_NAMESPACES = {
    "http://www.w3.org/1999/xhtml": "xhtml",
    "http://www.idpf.org/2007/ops": "epub",
    "http://www.w3.org/1998/Math/MathML": "m",
    "http://purl.org/dc/elements/1.1/": "dc",
    "http://www.daisy.org/z3986/2005/ncx/": "ncx",
    "http://www.idpf.org/2007/opf": "opf",
    "http://www.w3.org/2000/svg": "svg",
    "urn:oasis:names:tc:opendocument:xmlns:container": "container",
    "http://www.w3.org/XML/1998/namespace": "xml",  # Reserved XML namespace
}

# Namespaces that are declared as the default (`xmlns="..."`) namespace on the
# root element instead of being bound to a prefix.
_ROOT_NAMESPACES = {
    "http://www.w3.org/1999/xhtml",  # XHTML
    "http://www.daisy.org/z3986/2005/ncx/",  # NCX
    "http://www.idpf.org/2007/opf",  # OPF
    "urn:oasis:names:tc:opendocument:xmlns:container",  # Container
}

_ENCODING_PATTERN = re.compile(r'encoding\s*=\s*["\']([^"\']+)["\']', re.IGNORECASE)
_FIRST_ELEMENT_PATTERN = re.compile(r"<(?![?!])[a-zA-Z]")
_NAMESPACE_IN_TAG = re.compile(r"\{([^}]+)\}")

# Some non-standard EPUB generators use HTML-style tags without self-closing syntax
# We need to convert them to XML-compatible format before parsing
_EMPTY_TAGS = (
    "br",
    "hr",
    "input",
    "col",
    "base",
    "meta",
    "area",
)

# For reading: match tags like <br> or <br class="x"> (but not <br/> or <body>).
# - Group 2 is the attribute text, or "" for a bare tag such as <br>.
# - `(?<!/)>` rejects tags that are already self-closed while still allowing
#   "/" inside attribute values (e.g. content="text/html").
# - The trailing lookahead skips tags that are explicitly closed right away
#   (<br></br>), which are already well-formed XML.
_EMPTY_TAG_OPEN_PATTERN = re.compile(
    r"<(" + "|".join(_EMPTY_TAGS) + r")(\s[^>]*?|)(?<!/)>(?!\s*</\1\s*>)"
)

# For saving: match self-closing tags like <br /> or <br class="x" />.
# Group 2 is the attribute text without trailing whitespace, or "" when the tag
# has no attributes, so that <br /> is written back as <br> rather than <br >.
_EMPTY_TAG_CLOSE_PATTERN = re.compile(r"<(" + "|".join(_EMPTY_TAGS) + r")(|\s[^>]*?)\s*/>")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class XMLLikeNode:
    """An XML-like document parsed into an element tree with plain tag names.

    On load, the text encoding is detected, everything before the first
    element is kept verbatim as a header, and namespace URIs are stripped from
    tag and attribute names (the mapping is remembered). ``save`` writes the
    header back and restores namespace declarations and prefixes.

    Attributes:
        element: Root of the parsed tree, with namespace-free names.
    """

    def __init__(self, file: IO[bytes], is_html_like: bool = False) -> None:
        """Read and parse ``file``.

        Args:
            file: Binary stream holding the whole document.
            is_html_like: When true, HTML-style empty tags such as ``<br>``
                are converted to self-closing form before parsing and
                converted back on ``save``.

        Raises:
            ValueError: if the content cannot be parsed.
        """
        raw_content = file.read()
        self._encoding: str = self._detect_encoding(raw_content)
        content = raw_content.decode(self._encoding)
        self._header, xml_content = self._extract_header(content)
        self._namespaces: dict[str, str] = {}
        self._tag_to_namespace: dict[str, str] = {}
        self._attr_to_namespace: dict[str, str] = {}

        # For non-standard HTML files, convert <br> to <br/> before parsing
        self._is_html_like = is_html_like
        if is_html_like:
            xml_content = _EMPTY_TAG_OPEN_PATTERN.sub(
                # `or ""` guards against an attribute group that did not take part in the match.
                lambda m: f"<{m.group(1)}{m.group(2) or ''} />",
                xml_content,
            )

        try:
            self.element = self._extract_and_clean_namespaces(
                element=fromstring(xml_content),
            )
        except Exception as error:
            raise ValueError("Failed to parse XML-like content") from error

    @property
    def encoding(self) -> str:
        """Name of the codec used to decode, and later encode, the document."""
        return self._encoding

    @property
    def namespaces(self) -> list[str]:
        """Namespace URIs found in the document, in order of first appearance."""
        return list(self._namespaces.keys())

    def save(self, file: IO[bytes]) -> None:
        """Serialize the document to ``file`` using the detected encoding.

        Characters that the encoding cannot represent are written as XML
        numeric character references instead of raising
        ``UnicodeEncodeError``; this matters when the tree now contains text
        outside the repertoire of a legacy source encoding. Line endings are
        written exactly as they are, without platform-specific translation.

        ``file`` is left open.
        """
        writer = io.TextIOWrapper(
            file,
            encoding=self._encoding,
            errors="xmlcharrefreplace",
            newline="",
            write_through=True,
        )
        try:
            if self._header:
                writer.write(self._header)

            content = self._serialize_with_namespaces(self.element)

            # For non-standard HTML files, convert back from <br/> to <br>
            if self._is_html_like:
                content = _EMPTY_TAG_CLOSE_PATTERN.sub(
                    lambda m: f"<{m.group(1)}{m.group(2) or ''}>",
                    content,
                )

            writer.write(content)

        finally:
            # Detach instead of closing so the caller's stream stays usable.
            writer.detach()

    def _detect_encoding(self, raw_content: bytes) -> str:
        """Return the codec name to use for ``raw_content``.

        A byte-order mark wins; otherwise the encoding declared in the XML
        declaration is used if it can decode the content; otherwise UTF-8 is
        tried, with ISO-8859-1 as the last resort.
        """
        if raw_content.startswith(b"\xef\xbb\xbf"):
            return "utf-8-sig"
        elif raw_content.startswith(b"\xff\xfe"):
            return "utf-16-le"
        elif raw_content.startswith(b"\xfe\xff"):
            return "utf-16-be"

        # Try to extract the encoding from the XML declaration; only the first
        # 1024 bytes are inspected to find it.
        header_bytes = raw_content[:1024]
        for try_encoding in ("utf-8", "utf-16-le", "utf-16-be", "iso-8859-1"):
            try:
                header_str = header_bytes.decode(try_encoding)
            except UnicodeDecodeError:
                continue

            match = _ENCODING_PATTERN.search(header_str)
            if not match:
                continue

            declared_encoding = match.group(1).lower()
            try:
                # Trust the declaration only if it decodes the whole content.
                raw_content.decode(declared_encoding)
                return declared_encoding
            except (LookupError, UnicodeDecodeError):
                pass

        try:
            raw_content.decode("utf-8")
            return "utf-8"
        except UnicodeDecodeError:
            pass
        return "iso-8859-1"

    def _extract_header(self, content: str) -> tuple[str, str]:
        """Split ``content`` into ``(header, xml)`` at the first element tag.

        The header holds everything before the first ``<`` that is followed by
        an ASCII letter. If no such tag exists the header is empty.
        """
        match = _FIRST_ELEMENT_PATTERN.search(content)
        if match:
            split_pos = match.start()
            return content[:split_pos], content[split_pos:]
        return "", content

    def _register_namespace(self, namespace_uri: str) -> None:
        """Assign a prefix to ``namespace_uri`` the first time it is seen."""
        if namespace_uri not in self._namespaces:
            prefix = _COMMON_NAMESPACES.get(namespace_uri, f"ns{len(self._namespaces)}")
            self._namespaces[namespace_uri] = prefix

    def _extract_and_clean_namespaces(self, element: Element) -> Element:
        """Strip ``{uri}`` prefixes from all names under ``element``, in place.

        For every stripped name the namespace it belonged to is recorded so
        that ``_serialize_with_namespaces`` can restore it. When one name
        occurs in several namespaces, a warning is issued and the first
        namespace is kept.
        """
        for _, elem in iter_with_stack(element):
            match = _NAMESPACE_IN_TAG.match(elem.tag)
            if match:
                namespace_uri = match.group(1)
                self._register_namespace(namespace_uri)
                tag_name = elem.tag[len(match.group(0)) :]

                # Record tag -> namespace mapping (warn if conflict)
                if tag_name in self._tag_to_namespace and self._tag_to_namespace[tag_name] != namespace_uri:
                    warnings.warn(
                        f"Tag '{tag_name}' has multiple namespaces: "
                        f"{self._tag_to_namespace[tag_name]} and {namespace_uri}. "
                        f"Using the first one.",
                        stacklevel=2,
                    )
                else:
                    self._tag_to_namespace[tag_name] = namespace_uri

                # Clean: remove namespace URI completely
                elem.tag = tag_name

            # Iterate over a snapshot because the attribute dict is modified below.
            for attr_key in list(elem.attrib.keys()):
                match = _NAMESPACE_IN_TAG.match(attr_key)
                if match:
                    namespace_uri = match.group(1)
                    self._register_namespace(namespace_uri)
                    attr_name = attr_key[len(match.group(0)) :]
                    attr_value = elem.attrib.pop(attr_key)

                    # Record attr -> namespace mapping (warn if conflict)
                    if attr_name in self._attr_to_namespace and self._attr_to_namespace[attr_name] != namespace_uri:
                        warnings.warn(
                            f"Attribute '{attr_name}' has multiple namespaces: "
                            f"{self._attr_to_namespace[attr_name]} and {namespace_uri}. "
                            f"Using the first one.",
                            stacklevel=2,
                        )
                    else:
                        self._attr_to_namespace[attr_name] = namespace_uri

                    # Clean: remove namespace URI completely
                    elem.attrib[attr_name] = attr_value
        return element

    def _serialize_with_namespaces(self, element: Element) -> str:
        """Serialize ``element`` and restore namespace declarations and prefixes.

        Namespace declarations are added to ``element`` itself, so the root
        element is modified by this call.
        """
        # First, add namespace declarations to root element (before serialization)
        for namespace_uri, prefix in self._namespaces.items():
            # Skip the reserved xml namespace - it's implicit
            if namespace_uri == _XML_NAMESPACE_URI:
                continue
            if namespace_uri in _ROOT_NAMESPACES:
                element.attrib["xmlns"] = namespace_uri
            else:
                element.attrib[f"xmlns:{prefix}"] = namespace_uri

        # Serialize the element tree as-is (tags are simple names without prefixes)
        xml_string = tostring(element, encoding="unicode")

        # Now restore namespace prefixes in the serialized string.
        # NOTE(review): this is plain text substitution keyed on the local
        # name, so every occurrence of a name is prefixed regardless of which
        # element it belongs to - confirm this is acceptable for the inputs.
        for tag_name, namespace_uri in self._tag_to_namespace.items():
            if namespace_uri not in _ROOT_NAMESPACES:
                prefix = self._namespaces[namespace_uri]
                # Replace opening and closing tags
                xml_string = xml_string.replace(f"<{tag_name} ", f"<{prefix}:{tag_name} ")
                xml_string = xml_string.replace(f"<{tag_name}>", f"<{prefix}:{tag_name}>")
                xml_string = xml_string.replace(f"</{tag_name}>", f"</{prefix}:{tag_name}>")
                xml_string = xml_string.replace(f"<{tag_name}/>", f"<{prefix}:{tag_name}/>")

        # Similarly for attributes (though less common in EPUB)
        for attr_name, namespace_uri in self._attr_to_namespace.items():
            if namespace_uri not in _ROOT_NAMESPACES:
                prefix = self._namespaces[namespace_uri]
                xml_string = xml_string.replace(f' {attr_name}="', f' {prefix}:{attr_name}="')

        return xml_string
|