epub-translator 0.0.6__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/__init__.py +3 -1
- epub_translator/data/fill.jinja +66 -0
- epub_translator/data/mmltex/README.md +67 -0
- epub_translator/data/mmltex/cmarkup.xsl +1106 -0
- epub_translator/data/mmltex/entities.xsl +459 -0
- epub_translator/data/mmltex/glayout.xsl +222 -0
- epub_translator/data/mmltex/mmltex.xsl +36 -0
- epub_translator/data/mmltex/scripts.xsl +375 -0
- epub_translator/data/mmltex/tables.xsl +130 -0
- epub_translator/data/mmltex/tokens.xsl +328 -0
- epub_translator/data/translate.jinja +15 -12
- epub_translator/epub/__init__.py +4 -2
- epub_translator/epub/common.py +43 -0
- epub_translator/epub/math.py +193 -0
- epub_translator/epub/placeholder.py +53 -0
- epub_translator/epub/spines.py +42 -0
- epub_translator/epub/toc.py +505 -0
- epub_translator/epub/zip.py +67 -0
- epub_translator/iter_sync.py +24 -0
- epub_translator/language.py +23 -0
- epub_translator/llm/__init__.py +2 -1
- epub_translator/llm/core.py +175 -0
- epub_translator/llm/error.py +38 -35
- epub_translator/llm/executor.py +159 -136
- epub_translator/llm/increasable.py +28 -28
- epub_translator/llm/types.py +17 -0
- epub_translator/serial/__init__.py +2 -0
- epub_translator/serial/chunk.py +52 -0
- epub_translator/serial/segment.py +17 -0
- epub_translator/serial/splitter.py +50 -0
- epub_translator/template.py +35 -33
- epub_translator/translator.py +205 -168
- epub_translator/utils.py +7 -0
- epub_translator/xml/__init__.py +4 -3
- epub_translator/xml/deduplication.py +38 -0
- epub_translator/xml/firendly/__init__.py +2 -0
- epub_translator/xml/firendly/decoder.py +75 -0
- epub_translator/xml/firendly/encoder.py +84 -0
- epub_translator/xml/firendly/parser.py +177 -0
- epub_translator/xml/firendly/tag.py +118 -0
- epub_translator/xml/firendly/transform.py +36 -0
- epub_translator/xml/xml.py +52 -0
- epub_translator/xml/xml_like.py +176 -0
- epub_translator/xml_translator/__init__.py +3 -0
- epub_translator/xml_translator/const.py +2 -0
- epub_translator/xml_translator/fill.py +128 -0
- epub_translator/xml_translator/format.py +282 -0
- epub_translator/xml_translator/fragmented.py +125 -0
- epub_translator/xml_translator/group.py +183 -0
- epub_translator/xml_translator/progressive_locking.py +256 -0
- epub_translator/xml_translator/submitter.py +102 -0
- epub_translator/xml_translator/text_segment.py +263 -0
- epub_translator/xml_translator/translator.py +178 -0
- epub_translator/xml_translator/utils.py +29 -0
- epub_translator-0.1.0.dist-info/METADATA +283 -0
- epub_translator-0.1.0.dist-info/RECORD +58 -0
- epub_translator/data/format.jinja +0 -33
- epub_translator/epub/content_parser.py +0 -162
- epub_translator/epub/html/__init__.py +0 -1
- epub_translator/epub/html/dom_operator.py +0 -62
- epub_translator/epub/html/empty_tags.py +0 -23
- epub_translator/epub/html/file.py +0 -80
- epub_translator/epub/html/texts_searcher.py +0 -46
- epub_translator/llm/node.py +0 -201
- epub_translator/translation/__init__.py +0 -2
- epub_translator/translation/chunk.py +0 -118
- epub_translator/translation/splitter.py +0 -78
- epub_translator/translation/store.py +0 -36
- epub_translator/translation/translation.py +0 -231
- epub_translator/translation/types.py +0 -45
- epub_translator/translation/utils.py +0 -11
- epub_translator/xml/decoder.py +0 -71
- epub_translator/xml/encoder.py +0 -95
- epub_translator/xml/parser.py +0 -172
- epub_translator/xml/tag.py +0 -93
- epub_translator/xml/transform.py +0 -34
- epub_translator/xml/utils.py +0 -12
- epub_translator/zip_context.py +0 -74
- epub_translator-0.0.6.dist-info/METADATA +0 -170
- epub_translator-0.0.6.dist-info/RECORD +0 -36
- {epub_translator-0.0.6.dist-info → epub_translator-0.1.0.dist-info}/LICENSE +0 -0
- {epub_translator-0.0.6.dist-info → epub_translator-0.1.0.dist-info}/WHEEL +0 -0

--- /dev/null
+++ epub_translator/xml/xml_like.py
@@ -0,0 +1,176 @@
+import io
+import re
+from typing import IO
+from xml.etree.ElementTree import Element, fromstring, tostring
+
+from .xml import iter_with_stack
+
+_COMMON_NAMESPACES = {
+    "http://www.w3.org/1999/xhtml": "xhtml",
+    "http://www.idpf.org/2007/ops": "epub",
+    "http://www.w3.org/1998/Math/MathML": "m",
+    "http://purl.org/dc/elements/1.1/": "dc",
+    "http://www.daisy.org/z3986/2005/ncx/": "ncx",
+    "http://www.idpf.org/2007/opf": "opf",
+    "http://www.w3.org/2000/svg": "svg",
+    "urn:oasis:names:tc:opendocument:xmlns:container": "container",
+}
+
+_ROOT_NAMESPACES = {
+    "http://www.w3.org/1999/xhtml",  # XHTML
+    "http://www.daisy.org/z3986/2005/ncx/",  # NCX
+    "http://www.idpf.org/2007/opf",  # OPF
+    "urn:oasis:names:tc:opendocument:xmlns:container",  # Container
+}
+
+_ENCODING_PATTERN = re.compile(r'encoding\s*=\s*["\']([^"\']+)["\']', re.IGNORECASE)
+_FIRST_ELEMENT_PATTERN = re.compile(r"<(?![?!])[a-zA-Z]")
+_NAMESPACE_IN_TAG = re.compile(r"\{([^}]+)\}")
+
+# HTML defines a set of self-closing tags; these must be rewritten as non-self-closing because EPUB does not support them.
+# https://www.tutorialspoint.com/which-html-tags-are-self-closing
+_EMPTY_TAGS = (
+    "br",
+    "hr",
+    "input",
+    "col",
+    "base",
+    "meta",
+    "area",
+)
+
+_EMPTY_TAG_PATTERN = re.compile(r"<(" + "|".join(_EMPTY_TAGS) + r")(\s[^>]*?)\s*/?>")
+
+
+class XMLLikeNode:
+    def __init__(self, file: IO[bytes]) -> None:
+        raw_content = file.read()
+        self._encoding: str = _detect_encoding(raw_content)
+        content = raw_content.decode(self._encoding)
+        self._header, xml_content = _extract_header(content)
+        try:
+            self.element = fromstring(xml_content)
+        except Exception as error:
+            raise ValueError("Failed to parse XML-like content") from error
+        self._namespaces: dict[str, str] = _extract_and_clean_namespaces(self.element)
+
+    @property
+    def encoding(self) -> str:
+        return self._encoding
+
+    @property
+    def namespaces(self) -> list[str]:
+        return list(self._namespaces.keys())
+
+    def save(self, file: IO[bytes], is_html_like: bool = False) -> None:
+        writer = io.TextIOWrapper(file, encoding=self._encoding, write_through=True)
+        try:
+            if self._header:
+                writer.write(self._header)
+
+            content = _serialize_with_namespaces(element=self.element, namespaces=self._namespaces)
+            if is_html_like:
+                content = re.sub(
+                    pattern=_EMPTY_TAG_PATTERN,
+                    repl=lambda m: f"<{m.group(1)}{m.group(2)}>",
+                    string=content,
+                )
+            else:
+                content = re.sub(
+                    pattern=_EMPTY_TAG_PATTERN,
+                    repl=lambda m: f"<{m.group(1)}{m.group(2)} />",
+                    string=content,
+                )
+            writer.write(content)
+
+        finally:
+            writer.detach()
+
+
+def _detect_encoding(raw_content: bytes) -> str:
+    if raw_content.startswith(b"\xef\xbb\xbf"):
+        return "utf-8-sig"
+    elif raw_content.startswith(b"\xff\xfe"):
+        return "utf-16-le"
+    elif raw_content.startswith(b"\xfe\xff"):
+        return "utf-16-be"
+
+    # Try to extract the encoding from the XML declaration: read only the first 1024 bytes to find it
+    header_bytes = raw_content[:1024]
+    for try_encoding in ("utf-8", "utf-16-le", "utf-16-be", "iso-8859-1"):
+        try:
+            header_str = header_bytes.decode(try_encoding)
+            match = _ENCODING_PATTERN.search(header_str)
+            if match:
+                declared_encoding = match.group(1).lower()
+                try:
+                    raw_content.decode(declared_encoding)
+                    return declared_encoding
+                except (LookupError, UnicodeDecodeError):
+                    pass
+        except UnicodeDecodeError:
+            continue
+
+    try:
+        raw_content.decode("utf-8")
+        return "utf-8"
+    except UnicodeDecodeError:
+        pass
+    return "iso-8859-1"
+
+
+def _extract_header(content: str) -> tuple[str, str]:
+    match = _FIRST_ELEMENT_PATTERN.search(content)
+    if match:
+        split_pos = match.start()
+        header = content[:split_pos]
+        xml_content = content[split_pos:]
+        return header, xml_content
+    return "", content
+
+
+def _extract_and_clean_namespaces(element: Element):
+    namespaces: dict[str, str] = {}
+    for _, elem in iter_with_stack(element):
+        match = _NAMESPACE_IN_TAG.match(elem.tag)
+        if match:
+            namespace_uri = match.group(1)
+            if namespace_uri not in namespaces:
+                prefix = _COMMON_NAMESPACES.get(namespace_uri, f"ns{len(namespaces)}")
+                namespaces[namespace_uri] = prefix
+
+            tag_name = elem.tag[len(match.group(0)) :]
+            elem.tag = tag_name
+
+        for attr_key in list(elem.attrib.keys()):
+            match = _NAMESPACE_IN_TAG.match(attr_key)
+            if match:
+                namespace_uri = match.group(1)
+                if namespace_uri not in namespaces:
+                    prefix = _COMMON_NAMESPACES.get(namespace_uri, f"ns{len(namespaces)}")
+                    namespaces[namespace_uri] = prefix
+
+                attr_name = attr_key[len(match.group(0)) :]
+                attr_value = elem.attrib.pop(attr_key)
+                elem.attrib[attr_name] = attr_value
+    return namespaces
+
+
+def _serialize_with_namespaces(
+    element: Element,
+    namespaces: dict[str, str],
+) -> str:
+    for namespace_uri, prefix in namespaces.items():
+        if namespace_uri in _ROOT_NAMESPACES:
+            element.attrib["xmlns"] = namespace_uri
+        else:
+            element.attrib[f"xmlns:{prefix}"] = namespace_uri
+    xml_string = tostring(element, encoding="unicode")
+    for namespace_uri, prefix in namespaces.items():
+        if namespace_uri in _ROOT_NAMESPACES:
+            xml_string = xml_string.replace(f"{{{namespace_uri}}}", "")
+        else:
+            xml_string = xml_string.replace(f"{{{namespace_uri}}}", f"{prefix}:")
+            pattern = r'\s+xmlns:(ns\d+)="' + re.escape(namespace_uri) + r'"'
+            xml_string = re.sub(pattern, "", xml_string)
+    return xml_string
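
The new XMLLikeNode round-trips an EPUB document file: it detects the encoding (BOM first, then the XML declaration), keeps everything before the root element as an opaque header, strips namespace prefixes for in-memory editing, and reapplies them on save. A minimal usage sketch, assuming the 0.1.0 package is installed; the file paths are hypothetical:

    # Parse an XHTML chapter and write it back with EPUB-safe empty tags.
    from epub_translator.xml.xml_like import XMLLikeNode

    with open("chapter1.xhtml", "rb") as source:      # hypothetical input
        node = XMLLikeNode(source)

    print(node.encoding)    # e.g. "utf-8", from BOM or XML declaration
    print(node.namespaces)  # e.g. ["http://www.w3.org/1999/xhtml"]

    with open("chapter1.out.xhtml", "wb") as target:  # hypothetical output
        node.save(target, is_html_like=False)         # emits "<br />"-style tags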

--- /dev/null
+++ epub_translator/xml_translator/fill.py
@@ -0,0 +1,128 @@
+from xml.etree.ElementTree import Element
+
+from ..utils import normalize_whitespace
+from ..xml import plain_text
+from .const import DATA_ORIGIN_LEN_KEY, ID_KEY
+from .format import format
+from .text_segment import TextSegment, combine_text_segments
+
+
+class XMLFill:
+    def __init__(self, text_segments: list[TextSegment]) -> None:
+        self._request_element = Element("xml")
+        self._text_segments: dict[tuple[int, ...], list[TextSegment]] = {}  # generated id stack -> text segments
+
+        raw2generated: dict[int, Element] = {}
+        raw2generated_ids: dict[int, int] = {}
+
+        for combined_element, sub_raw2generated in combine_text_segments(text_segments):
+            unwrapped_parent_ids: set[int] = set()
+            sub_element, parents = self._unwrap_parents(combined_element)
+            self._request_element.append(sub_element)
+            for parent in parents:
+                unwrapped_parent_ids.add(id(parent))
+
+            for raw_id, generated_element in sub_raw2generated.items():
+                if raw_id in unwrapped_parent_ids:
+                    continue
+                if id(generated_element) in unwrapped_parent_ids:
+                    continue
+                generated_id = len(raw2generated)
+                raw2generated[raw_id] = generated_element
+                raw2generated_ids[raw_id] = generated_id
+
+                generated_text = normalize_whitespace(
+                    text=plain_text(generated_element),
+                )
+                generated_element.attrib = {
+                    ID_KEY: str(generated_id),
+                    DATA_ORIGIN_LEN_KEY: str(len(generated_text)),
+                }
+
+        for text_segment in text_segments:
+            generated_id_stack: list[int] = []
+            for parent in text_segment.parent_stack:
+                generated_id = raw2generated_ids.get(id(parent), None)
+                if generated_id is not None:
+                    generated_id_stack.append(generated_id)
+            generated_key = tuple(generated_id_stack)
+            text_segments_stack = self._text_segments.get(generated_key, None)
+            if text_segments_stack is None:
+                text_segments_stack = []
+                self._text_segments[generated_key] = text_segments_stack
+            text_segments_stack.append(text_segment)
+
+        for text_segments_stack in self._text_segments.values():
+            text_segments_stack.reverse()  # so they can be consumed with .pop()
+
+    def _unwrap_parents(self, element: Element):
+        parents: list[Element] = []
+        while True:
+            if len(element) != 1:
+                break
+            child = element[0]
+            if not element.text:
+                break
+            if not child.tail:
+                break
+            parents.append(element)
+            element = child
+        element.tail = None
+        return element, parents
+
+    @property
+    def request_element(self) -> Element:
+        return self._request_element
+
+    def submit_response_text(self, text: str, errors_limit: int) -> Element:
+        submitted_element = format(
+            template_ele=self._request_element,
+            validated_text=text,
+            errors_limit=errors_limit,
+        )
+        self._fill_submitted_texts(
+            generated_ids_stack=[],
+            element=submitted_element,
+        )
+        return submitted_element
+
+    def _fill_submitted_texts(self, generated_ids_stack: list[int], element: Element):
+        current_stack = generated_ids_stack
+        generated_id = self._generated_id(element)
+        if generated_id >= 0:
+            current_stack = generated_ids_stack + [generated_id]
+
+        generated_key = tuple(current_stack)
+        text_segments_stack = self._text_segments.get(generated_key, None)
+        text = self._normalize_text(element.text)
+
+        if text_segments_stack and text is not None:
+            text_segment = text_segments_stack.pop()
+            text_segment.text = text
+
+        for child_element in element:
+            self._fill_submitted_texts(
+                generated_ids_stack=current_stack,
+                element=child_element,
+            )
+            tail = self._normalize_text(child_element.tail)
+            if text_segments_stack and tail is not None:
+                text_segment = text_segments_stack.pop()
+                text_segment.text = tail
+
+    def _generated_id(self, element: Element) -> int:
+        str_id = element.get(ID_KEY, None)
+        if str_id is None:
+            return -1
+        try:
+            return int(str_id)
+        except ValueError:
+            return -1
+
+    def _normalize_text(self, text: str | None) -> str | None:
+        if text is None:
+            return None
+        text = normalize_whitespace(text)
+        if not text.strip():
+            return None
+        return text
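
XMLFill builds the <xml> request sent to the model and later writes the translated strings back into the original TextSegment objects. The bookkeeping is the subtle part: segments are bucketed by the tuple of generated ancestor ids, each bucket is reversed once, and .pop() then yields segments in document order while the response tree is walked. A runnable toy of just that trick (the data shapes are illustrative; real keys come from TextSegment.parent_stack):

    from collections import defaultdict

    # Bucket texts by their ancestor-id path, as XMLFill.__init__ does.
    buckets: dict[tuple[int, ...], list[str]] = defaultdict(list)
    for path, text in [((0,), "first"), ((0,), "second"), ((0, 1), "nested")]:
        buckets[path].append(text)

    for bucket in buckets.values():
        bucket.reverse()  # reversed once, so .pop() restores document order

    assert buckets[(0,)].pop() == "first"
    assert buckets[(0,)].pop() == "second"
    assert buckets[(0, 1)].pop() == "nested"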

--- /dev/null
+++ epub_translator/xml_translator/format.py
@@ -0,0 +1,282 @@
+from xml.etree.ElementTree import Element
+
+from ..utils import normalize_whitespace
+from ..xml import decode_friendly
+from .const import ID_KEY
+
+
+def format(template_ele: Element, validated_text: str, errors_limit: int) -> Element:
+    context = _ValidationContext()
+    validated_ele = _extract_xml_element(validated_text)
+    context.validate(raw_ele=template_ele, validated_ele=validated_ele)
+    error_message = context.errors(limit=errors_limit)
+    if error_message:
+        raise ValidationError(message=error_message, validated_ele=validated_ele)
+    return validated_ele
+
+
+class ValidationError(Exception):
+    def __init__(self, message: str, validated_ele: Element | None = None) -> None:
+        super().__init__(message)
+        self.validated_ele = validated_ele
+
+
+def _extract_xml_element(text: str) -> Element:
+    first_xml_element: Element | None = None
+    all_xml_elements: int = 0
+
+    for xml_element in decode_friendly(text, tags="xml"):
+        if first_xml_element is None:
+            first_xml_element = xml_element
+        all_xml_elements += 1
+
+    if first_xml_element is None:
+        raise ValidationError(
+            "No complete <xml>...</xml> block found. Please ensure you have properly closed the XML with </xml> tag."
+        )
+    if all_xml_elements > 1:
+        raise ValidationError(
+            f"Found {all_xml_elements} <xml>...</xml> blocks. "
+            "Please return only one XML block without any examples or explanations."
+        )
+    return first_xml_element
+
+
+class _ValidationContext:
+    def __init__(self) -> None:
+        self._tag_text_dict: dict[int, str] = {}
+        self._errors: dict[tuple[int, ...], list[str]] = {}
+
+    def validate(self, raw_ele: Element, validated_ele: Element):
+        self._validate_ele(ids_path=[], raw_ele=raw_ele, validated_ele=validated_ele)
+
+    def errors(self, limit: int) -> str | None:
+        if not self._errors:
+            return
+
+        keys = list(self._errors.keys())
+        keys.sort(key=lambda k: (len(k), k))  # AI correction should fix shallow errors before deep ones
+        keys = keys[:limit]
+        max_len_key = max((len(key) for key in keys), default=0)
+
+        for i in range(len(keys)):
+            key = keys[i]
+            if len(key) < max_len_key:
+                key_list = list(key)
+                while len(key_list) < max_len_key:
+                    key_list.append(-1)
+                keys[i] = tuple(key_list)
+
+        content: list[str] = []
+        total_errors = sum(len(messages) for messages in self._errors.values())
+        remain_errors = total_errors
+
+        for key in sorted(keys):  # switched to depth-first ordering; related errors read closer together
+            raw_key = tuple(k for k in key if k >= 0)
+            indent: str = f"{'  ' * len(raw_key)}"
+            errors_list = self._errors[raw_key]
+            parent_text: str
+
+            if len(raw_key) > 0:
+                parent_text = self._tag_text_dict[raw_key[-1]]
+            else:
+                parent_text = "the root tag"
+
+            if len(errors_list) == 1:
+                error = errors_list[0]
+                content.append(f"{indent}- errors in {parent_text}: {error}.")
+            else:
+                content.append(f"{indent}- errors in {parent_text}:")
+                for error in errors_list:
+                    content.append(f"{indent}  - {error}.")
+            remain_errors -= len(errors_list)
+
+        content.insert(0, f"Found {total_errors} error(s) in your response XML structure.")
+        if remain_errors > 0:
+            content.append(f"\n... and {remain_errors} more error(s).")
+
+        return "\n".join(content)
+
+    def _validate_ele(self, ids_path: list[int], raw_ele: Element, validated_ele: Element):
+        raw_id_map = self._build_id_map(raw_ele)
+        validated_id_map = self._build_id_map(validated_ele)
+        lost_ids: list[int] = []
+        extra_ids: list[int] = []
+
+        for id, sub_raw in raw_id_map.items():
+            sub_validated = validated_id_map.get(id, None)
+            if sub_validated is None:
+                lost_ids.append(id)
+            else:
+                self._validate_id_ele(
+                    id=id,
+                    ids_path=ids_path,
+                    raw_ele=sub_raw,
+                    validated_ele=sub_validated,
+                )
+
+        for id in validated_id_map.keys():
+            if id not in raw_id_map:
+                extra_ids.append(id)
+
+        if lost_ids or extra_ids:
+            messages: list[str] = []
+            lost_ids.sort()
+            extra_ids.sort()
+
+            if lost_ids:
+                tags = [self._str_tag(raw_id_map[id]) for id in lost_ids]
+                # Provide context from source XML
+                context_info = self._get_source_context(raw_ele, lost_ids)
+                messages.append(f"lost sub-tags {' '.join(tags)}")
+                if context_info:
+                    messages.append(f"Source structure was: {context_info}")
+
+            if extra_ids:
+                tags = [self._str_tag(validated_id_map[id]) for id in extra_ids]
+                messages.append(f"extra sub-tags {' '.join(tags)}")
+
+            if messages:
+                self._add_error(
+                    ids_path=ids_path,
+                    message="find " + " and ".join(messages),
+                )
+        else:
+            raw_element_empty = not self._has_text_content(raw_ele)
+            validated_ele_empty = not self._has_text_content(validated_ele)
+
+            if raw_element_empty and not validated_ele_empty:
+                self._add_error(
+                    ids_path=ids_path,
+                    message="shouldn't have text content",
+                )
+            elif not raw_element_empty and validated_ele_empty:
+                self._add_error(
+                    ids_path=ids_path,
+                    message="text content is missing",
+                )
+
+    def _validate_id_ele(self, ids_path: list[int], id: int, raw_ele: Element, validated_ele: Element):
+        if raw_ele.tag == validated_ele.tag:
+            self._tag_text_dict[id] = self._str_tag(raw_ele)
+            raw_has_text = self._has_direct_text(raw_ele.text)
+            validated_has_text = self._has_direct_text(validated_ele.text)
+
+            if raw_has_text and not validated_has_text:
+                self._add_error(
+                    ids_path=ids_path + [id],
+                    message="missing text content before child elements",
+                )
+            elif not raw_has_text and validated_has_text:
+                self._add_error(
+                    ids_path=ids_path + [id],
+                    message="shouldn't have text content before child elements",
+                )
+            raw_has_tail = self._has_direct_text(raw_ele.tail)
+            validated_has_tail = self._has_direct_text(validated_ele.tail)
+
+            if raw_has_tail and not validated_has_tail:
+                self._add_error(
+                    ids_path=ids_path + [id],
+                    message="missing text content after the element",
+                )
+            elif not raw_has_tail and validated_has_tail:
+                self._add_error(
+                    ids_path=ids_path + [id],
+                    message="shouldn't have text content after the element",
+                )
+
+            self._validate_ele(
+                ids_path=ids_path + [id],
+                raw_ele=raw_ele,
+                validated_ele=validated_ele,
+            )
+        else:
+            self._add_error(
+                ids_path=ids_path,
+                message=f'got <{validated_ele.tag} id="{id}">',
+            )
+
+    def _add_error(self, ids_path: list[int], message: str):
+        key = tuple(ids_path)
+        if key not in self._errors:
+            self._errors[key] = []
+        self._errors[key].append(message)
+
+    def _build_id_map(self, ele: Element):
+        id_map: dict[int, Element] = {}
+        for child_ele in ele:
+            id_text = child_ele.get(ID_KEY, None)
+            if id_text is not None:
+                id = int(id_text)
+                if id < 0:
+                    raise ValueError(f"Invalid id {id} found. IDs must be non-negative integers.")
+            if id_text is not None:
+                id_map[id] = child_ele
+        return id_map
+
+    def _has_text_content(self, ele: Element) -> bool:
+        text = "".join(self._plain_text(ele))
+        text = normalize_whitespace(text)
+        text = text.strip()
+        return len(text) > 0
+
+    def _has_direct_text(self, text: str | None) -> bool:
+        if text is None:
+            return False
+        normalized = normalize_whitespace(text).strip()
+        return len(normalized) > 0
+
+    def _plain_text(self, ele: Element):
+        if ele.text:
+            yield ele.text
+        for child in ele:
+            if child.get(ID_KEY, None) is not None:
+                yield from self._plain_text(child)
+            if child.tail:
+                yield child.tail
+
+    def _str_tag(self, ele: Element) -> str:
+        ele_id = ele.get(ID_KEY)
+        content: str
+        if ele_id is not None:
+            content = f'<{ele.tag} id="{ele_id}"'
+        else:
+            content = f"<{ele.tag}"
+        if len(ele) > 0:
+            content += f"> ... </{ele.tag}>"
+        else:
+            content += " />"
+        return content
+
+    def _get_source_context(self, parent: Element, lost_ids: list[int]) -> str:
+        """Generate context showing where lost tags appeared in source XML."""
+        if not lost_ids:
+            return ""
+
+        # Build a simple representation of the source structure
+        children_with_ids = []
+        for child in parent:
+            child_id_str = child.get(ID_KEY)
+            if child_id_str is not None:
+                child_id = int(child_id_str)
+                is_lost = child_id in lost_ids
+                tag_str = f'<{child.tag} id="{child_id}">'
+
+                # Show text before/inside/after
+                parts = []
+                if child.text and child.text.strip():
+                    preview = child.text.strip()[:20]
+                    if is_lost:
+                        parts.append(f'[{preview}...]')
+                    else:
+                        parts.append(f'{preview}...')
+
+                if is_lost:
+                    children_with_ids.append(f'{tag_str}*MISSING*')
+                else:
+                    children_with_ids.append(tag_str)
+
+        if children_with_ids:
+            return f"[{' '.join(children_with_ids)}]"
+        return ""