epub-translator 0.0.6__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. epub_translator/__init__.py +3 -1
  2. epub_translator/data/fill.jinja +66 -0
  3. epub_translator/data/mmltex/README.md +67 -0
  4. epub_translator/data/mmltex/cmarkup.xsl +1106 -0
  5. epub_translator/data/mmltex/entities.xsl +459 -0
  6. epub_translator/data/mmltex/glayout.xsl +222 -0
  7. epub_translator/data/mmltex/mmltex.xsl +36 -0
  8. epub_translator/data/mmltex/scripts.xsl +375 -0
  9. epub_translator/data/mmltex/tables.xsl +130 -0
  10. epub_translator/data/mmltex/tokens.xsl +328 -0
  11. epub_translator/data/translate.jinja +15 -12
  12. epub_translator/epub/__init__.py +4 -2
  13. epub_translator/epub/common.py +43 -0
  14. epub_translator/epub/math.py +193 -0
  15. epub_translator/epub/placeholder.py +53 -0
  16. epub_translator/epub/spines.py +42 -0
  17. epub_translator/epub/toc.py +505 -0
  18. epub_translator/epub/zip.py +67 -0
  19. epub_translator/iter_sync.py +24 -0
  20. epub_translator/language.py +23 -0
  21. epub_translator/llm/__init__.py +2 -1
  22. epub_translator/llm/core.py +175 -0
  23. epub_translator/llm/error.py +38 -35
  24. epub_translator/llm/executor.py +159 -136
  25. epub_translator/llm/increasable.py +28 -28
  26. epub_translator/llm/types.py +17 -0
  27. epub_translator/serial/__init__.py +2 -0
  28. epub_translator/serial/chunk.py +52 -0
  29. epub_translator/serial/segment.py +17 -0
  30. epub_translator/serial/splitter.py +50 -0
  31. epub_translator/template.py +35 -33
  32. epub_translator/translator.py +205 -168
  33. epub_translator/utils.py +7 -0
  34. epub_translator/xml/__init__.py +4 -3
  35. epub_translator/xml/deduplication.py +38 -0
  36. epub_translator/xml/firendly/__init__.py +2 -0
  37. epub_translator/xml/firendly/decoder.py +75 -0
  38. epub_translator/xml/firendly/encoder.py +84 -0
  39. epub_translator/xml/firendly/parser.py +177 -0
  40. epub_translator/xml/firendly/tag.py +118 -0
  41. epub_translator/xml/firendly/transform.py +36 -0
  42. epub_translator/xml/xml.py +52 -0
  43. epub_translator/xml/xml_like.py +176 -0
  44. epub_translator/xml_translator/__init__.py +3 -0
  45. epub_translator/xml_translator/const.py +2 -0
  46. epub_translator/xml_translator/fill.py +128 -0
  47. epub_translator/xml_translator/format.py +282 -0
  48. epub_translator/xml_translator/fragmented.py +125 -0
  49. epub_translator/xml_translator/group.py +183 -0
  50. epub_translator/xml_translator/progressive_locking.py +256 -0
  51. epub_translator/xml_translator/submitter.py +102 -0
  52. epub_translator/xml_translator/text_segment.py +263 -0
  53. epub_translator/xml_translator/translator.py +178 -0
  54. epub_translator/xml_translator/utils.py +29 -0
  55. epub_translator-0.1.0.dist-info/METADATA +283 -0
  56. epub_translator-0.1.0.dist-info/RECORD +58 -0
  57. epub_translator/data/format.jinja +0 -33
  58. epub_translator/epub/content_parser.py +0 -162
  59. epub_translator/epub/html/__init__.py +0 -1
  60. epub_translator/epub/html/dom_operator.py +0 -62
  61. epub_translator/epub/html/empty_tags.py +0 -23
  62. epub_translator/epub/html/file.py +0 -80
  63. epub_translator/epub/html/texts_searcher.py +0 -46
  64. epub_translator/llm/node.py +0 -201
  65. epub_translator/translation/__init__.py +0 -2
  66. epub_translator/translation/chunk.py +0 -118
  67. epub_translator/translation/splitter.py +0 -78
  68. epub_translator/translation/store.py +0 -36
  69. epub_translator/translation/translation.py +0 -231
  70. epub_translator/translation/types.py +0 -45
  71. epub_translator/translation/utils.py +0 -11
  72. epub_translator/xml/decoder.py +0 -71
  73. epub_translator/xml/encoder.py +0 -95
  74. epub_translator/xml/parser.py +0 -172
  75. epub_translator/xml/tag.py +0 -93
  76. epub_translator/xml/transform.py +0 -34
  77. epub_translator/xml/utils.py +0 -12
  78. epub_translator/zip_context.py +0 -74
  79. epub_translator-0.0.6.dist-info/METADATA +0 -170
  80. epub_translator-0.0.6.dist-info/RECORD +0 -36
  81. {epub_translator-0.0.6.dist-info → epub_translator-0.1.0.dist-info}/LICENSE +0 -0
  82. {epub_translator-0.0.6.dist-info → epub_translator-0.1.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,176 @@
1
+ import io
2
+ import re
3
+ from typing import IO
4
+ from xml.etree.ElementTree import Element, fromstring, tostring
5
+
6
+ from .xml import iter_with_stack
7
+
8
+ _COMMON_NAMESPACES = {
9
+ "http://www.w3.org/1999/xhtml": "xhtml",
10
+ "http://www.idpf.org/2007/ops": "epub",
11
+ "http://www.w3.org/1998/Math/MathML": "m",
12
+ "http://purl.org/dc/elements/1.1/": "dc",
13
+ "http://www.daisy.org/z3986/2005/ncx/": "ncx",
14
+ "http://www.idpf.org/2007/opf": "opf",
15
+ "http://www.w3.org/2000/svg": "svg",
16
+ "urn:oasis:names:tc:opendocument:xmlns:container": "container",
17
+ }
18
+
19
+ _ROOT_NAMESPACES = {
20
+ "http://www.w3.org/1999/xhtml", # XHTML
21
+ "http://www.daisy.org/z3986/2005/ncx/", # NCX
22
+ "http://www.idpf.org/2007/opf", # OPF
23
+ "urn:oasis:names:tc:opendocument:xmlns:container", # Container
24
+ }
25
+
26
+ _ENCODING_PATTERN = re.compile(r'encoding\s*=\s*["\']([^"\']+)["\']', re.IGNORECASE)
27
+ _FIRST_ELEMENT_PATTERN = re.compile(r"<(?![?!])[a-zA-Z]")
28
+ _NAMESPACE_IN_TAG = re.compile(r"\{([^}]+)\}")
29
+
30
+ # HTML 规定了一系列自闭标签,这些标签需要改成非自闭的,因为 EPub 格式不支持
31
+ # https://www.tutorialspoint.com/which-html-tags-are-self-closing
32
+ _EMPTY_TAGS = (
33
+ "br",
34
+ "hr",
35
+ "input",
36
+ "col",
37
+ "base",
38
+ "meta",
39
+ "area",
40
+ )
41
+
42
+ _EMPTY_TAG_PATTERN = re.compile(r"<(" + "|".join(_EMPTY_TAGS) + r")(\s[^>]*?)\s*/?>")
43
+
44
+
45
class XMLLikeNode:
    """An XML-ish document (XHTML / NCX / OPF / container) parsed from bytes.

    Remembers the detected text encoding and any pre-root header (XML
    declaration, DOCTYPE, ...) so that save() can write the document back
    faithfully to a byte stream.
    """

    def __init__(self, file: IO[bytes]) -> None:
        data = file.read()
        self._encoding: str = _detect_encoding(data)
        decoded = data.decode(self._encoding)
        self._header, body = _extract_header(decoded)
        try:
            self.element = fromstring(body)
        except Exception as error:
            raise ValueError("Failed to parse XML-like content") from error
        self._namespaces: dict[str, str] = _extract_and_clean_namespaces(self.element)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def namespaces(self) -> list[str]:
        return list(self._namespaces.keys())

    def save(self, file: IO[bytes], is_html_like: bool = False) -> None:
        """Serialize the tree into *file* using the original encoding.

        When is_html_like is true, void tags are written without the
        self-closing slash; otherwise they are normalized to " />".
        """
        if is_html_like:
            def rewrite(m: "re.Match[str]") -> str:
                return f"<{m.group(1)}{m.group(2)}>"
        else:
            def rewrite(m: "re.Match[str]") -> str:
                return f"<{m.group(1)}{m.group(2)} />"

        writer = io.TextIOWrapper(file, encoding=self._encoding, write_through=True)
        try:
            if self._header:
                writer.write(self._header)
            serialized = _serialize_with_namespaces(element=self.element, namespaces=self._namespaces)
            writer.write(re.sub(pattern=_EMPTY_TAG_PATTERN, repl=rewrite, string=serialized))
        finally:
            # Detach rather than close so the caller's byte stream stays usable.
            writer.detach()
88
+
89
+
90
+ def _detect_encoding(raw_content: bytes) -> str:
91
+ if raw_content.startswith(b"\xef\xbb\xbf"):
92
+ return "utf-8-sig"
93
+ elif raw_content.startswith(b"\xff\xfe"):
94
+ return "utf-16-le"
95
+ elif raw_content.startswith(b"\xfe\xff"):
96
+ return "utf-16-be"
97
+
98
+ # 尝试从 XML 声明中提取编码:只读取前 1024 字节来查找 XML 声明
99
+ header_bytes = raw_content[:1024]
100
+ for try_encoding in ("utf-8", "utf-16-le", "utf-16-be", "iso-8859-1"):
101
+ try:
102
+ header_str = header_bytes.decode(try_encoding)
103
+ match = _ENCODING_PATTERN.search(header_str)
104
+ if match:
105
+ declared_encoding = match.group(1).lower()
106
+ try:
107
+ raw_content.decode(declared_encoding)
108
+ return declared_encoding
109
+ except (LookupError, UnicodeDecodeError):
110
+ pass
111
+ except UnicodeDecodeError:
112
+ continue
113
+
114
+ try:
115
+ raw_content.decode("utf-8")
116
+ return "utf-8"
117
+ except UnicodeDecodeError:
118
+ pass
119
+ return "iso-8859-1"
120
+
121
+
122
+ def _extract_header(content: str) -> tuple[str, str]:
123
+ match = _FIRST_ELEMENT_PATTERN.search(content)
124
+ if match:
125
+ split_pos = match.start()
126
+ header = content[:split_pos]
127
+ xml_content = content[split_pos:]
128
+ return header, xml_content
129
+ return "", content
130
+
131
+
132
def _extract_and_clean_namespaces(element: Element) -> dict[str, str]:
    """Strip Clark-notation "{uri}" prefixes from every tag and attribute in
    the tree, returning a mapping of namespace URI -> chosen prefix.

    Known URIs get their conventional prefix from _COMMON_NAMESPACES; unknown
    ones get a generated "nsN" prefix in discovery order.
    """
    namespaces: dict[str, str] = {}

    def register(namespace_uri: str) -> None:
        # Assign a prefix the first time a URI is seen; keep it stable after.
        if namespace_uri not in namespaces:
            namespaces[namespace_uri] = _COMMON_NAMESPACES.get(namespace_uri, f"ns{len(namespaces)}")

    for _, elem in iter_with_stack(element):
        match = _NAMESPACE_IN_TAG.match(elem.tag)
        if match:
            register(match.group(1))
            elem.tag = elem.tag[len(match.group(0)) :]

        # Materialize the key list since attributes are mutated during the loop.
        for attr_key in list(elem.attrib.keys()):
            match = _NAMESPACE_IN_TAG.match(attr_key)
            if match:
                register(match.group(1))
                attr_value = elem.attrib.pop(attr_key)
                elem.attrib[attr_key[len(match.group(0)) :]] = attr_value
    return namespaces
157
+
158
+
159
def _serialize_with_namespaces(
    element: Element,
    namespaces: dict[str, str],
) -> str:
    """Serialize *element* to a unicode string, re-declaring the namespaces
    that _extract_and_clean_namespaces previously stripped from the tree.

    NOTE(review): this mutates element.attrib in place — the xmlns attributes
    remain on the element after serialization. If more than one URI from
    _ROOT_NAMESPACES is present, the single default "xmlns" attribute is
    overwritten by the last one.
    """
    for namespace_uri, prefix in namespaces.items():
        if namespace_uri in _ROOT_NAMESPACES:
            # Root document namespaces become the default (un-prefixed) namespace.
            element.attrib["xmlns"] = namespace_uri
        else:
            element.attrib[f"xmlns:{prefix}"] = namespace_uri
    xml_string = tostring(element, encoding="unicode")
    for namespace_uri, prefix in namespaces.items():
        # Nodes may still carry Clark notation ("{uri}name"); rewrite those to
        # the declared prefix form (or strip them for the default namespace).
        if namespace_uri in _ROOT_NAMESPACES:
            xml_string = xml_string.replace(f"{{{namespace_uri}}}", "")
        else:
            xml_string = xml_string.replace(f"{{{namespace_uri}}}", f"{prefix}:")
        # Drop auto-generated duplicate xmlns:nsN declarations for this URI.
        # NOTE(review): when the chosen prefix itself is a generated "nsN",
        # this regex also removes the declaration added above — verify that
        # generated-prefix namespaces survive serialization.
        pattern = r'\s+xmlns:(ns\d+)="' + re.escape(namespace_uri) + r'"'
        xml_string = re.sub(pattern, "", xml_string)
    return xml_string
@@ -0,0 +1,3 @@
1
+ from .group import XMLGroupContext
2
+ from .submitter import submit_text_segments
3
+ from .translator import XMLTranslator
@@ -0,0 +1,2 @@
1
# Attribute name used to tag generated elements with a sequential id so the
# translated response can be matched back to the request template.
ID_KEY: str = "id"
# Attribute name recording the character count of the element's original text.
DATA_ORIGIN_LEN_KEY: str = "data-orig-len"
@@ -0,0 +1,128 @@
1
+ from xml.etree.ElementTree import Element
2
+
3
+ from ..utils import normalize_whitespace
4
+ from ..xml import plain_text
5
+ from .const import DATA_ORIGIN_LEN_KEY, ID_KEY
6
+ from .format import format
7
+ from .text_segment import TextSegment, combine_text_segments
8
+
9
+
10
class XMLFill:
    """Builds an <xml> request template from a batch of TextSegments and, once
    the LLM answers, writes the translated strings back into those segments.

    Each tracked element in the template gets a sequential id attribute and a
    data-orig-len attribute (the length of its normalized source text).
    """

    def __init__(self, text_segments: list[TextSegment]) -> None:
        self._request_element = Element("xml")
        self._text_segments: dict[tuple[int, ...], list[TextSegment]] = {} # generated id stack -> text segments

        # id() of original elements -> generated counterpart / sequential id.
        raw2generated: dict[int, Element] = {}
        raw2generated_ids: dict[int, int] = {}

        for combined_element, sub_raw2generated in combine_text_segments(text_segments):
            unwrapped_parent_ids: set[int] = set()
            sub_element, parents = self._unwrap_parents(combined_element)
            self._request_element.append(sub_element)
            for parent in parents:
                unwrapped_parent_ids.add(id(parent))

            for raw_id, generated_element in sub_raw2generated.items():
                # Skip elements peeled off by _unwrap_parents: they are no
                # longer part of the request tree.
                if raw_id in unwrapped_parent_ids:
                    continue
                if id(generated_element) in unwrapped_parent_ids:
                    continue
                generated_id = len(raw2generated)
                raw2generated[raw_id] = generated_element
                raw2generated_ids[raw_id] = generated_id

                generated_text = normalize_whitespace(
                    text=plain_text(generated_element),
                )
                # Replace all attributes: only the id and the original text
                # length are sent to the model.
                generated_element.attrib = {
                    ID_KEY: str(generated_id),
                    DATA_ORIGIN_LEN_KEY: str(len(generated_text)),
                }

        # Group segments by the tuple of generated ids along their parent
        # chain; the same key is re-derived while walking the response tree.
        for text_segment in text_segments:
            generated_id_stack: list[int] = []
            for parent in text_segment.parent_stack:
                generated_id = raw2generated_ids.get(id(parent), None)
                if generated_id is not None:
                    generated_id_stack.append(generated_id)
            generated_key = tuple(generated_id_stack)
            text_segments_stack = self._text_segments.get(generated_key, None)
            if text_segments_stack is None:
                text_segments_stack = []
                self._text_segments[generated_key] = text_segments_stack
            text_segments_stack.append(text_segment)

        for text_segments_stack in self._text_segments.values():
            text_segments_stack.reverse() # reversed so .pop() yields segments in document order

    def _unwrap_parents(self, element: Element):
        """Peel off single-child wrapper elements, returning the innermost
        element plus the list of removed wrappers.

        NOTE(review): descent continues only while the wrapper has .text and
        its child has .tail, and the child's tail is cleared on the way down
        while the wrapper's own text is discarded with the wrapper — confirm
        against combine_text_segments that this is the intended polarity.
        """
        parents: list[Element] = []
        while True:
            if len(element) != 1:
                break
            child = element[0]
            if not element.text:
                break
            if not child.tail:
                break
            parents.append(element)
            element = child
            element.tail = None
        return element, parents

    @property
    def request_element(self) -> Element:
        # The assembled <xml> template to send to the model.
        return self._request_element

    def submit_response_text(self, text: str, errors_limit: int) -> Element:
        """Validate the model's response against the request template and copy
        its texts back into the TextSegments.

        Raises ValidationError (from format) when the structure mismatches.
        """
        submitted_element = format(
            template_ele=self._request_element,
            validated_text=text,
            errors_limit=errors_limit,
        )
        self._fill_submitted_texts(
            generated_ids_stack=[],
            element=submitted_element,
        )
        return submitted_element

    def _fill_submitted_texts(self, generated_ids_stack: list[int], element: Element):
        """Recursively assign response texts/tails to the queued segments that
        share this element's generated-id path."""
        current_stack = generated_ids_stack
        generated_id = self._generated_id(element)
        if generated_id >= 0:
            current_stack = generated_ids_stack + [generated_id]

        generated_key = tuple(current_stack)
        text_segments_stack = self._text_segments.get(generated_key, None)
        text = self._normalize_text(element.text)

        if text_segments_stack and text is not None:
            text_segment = text_segments_stack.pop()
            text_segment.text = text

        for child_element in element:
            self._fill_submitted_texts(
                generated_ids_stack=current_stack,
                element=child_element,
            )
            # A child's tail text belongs to this element's segment queue.
            tail = self._normalize_text(child_element.tail)
            if text_segments_stack and tail is not None:
                text_segment = text_segments_stack.pop()
                text_segment.text = tail

    def _generated_id(self, element: Element) -> int:
        """Return the element's integer id attribute, or -1 when absent or
        not parseable as an int."""
        str_id = element.get(ID_KEY, None)
        if str_id is None:
            return -1
        try:
            return int(str_id)
        except ValueError:
            return -1

    def _normalize_text(self, text: str | None) -> str | None:
        """Collapse whitespace; return None for missing or whitespace-only text."""
        if text is None:
            return None
        text = normalize_whitespace(text)
        if not text.strip():
            return None
        return text
@@ -0,0 +1,282 @@
1
+ from xml.etree.ElementTree import Element
2
+
3
+ from ..utils import normalize_whitespace
4
+ from ..xml import decode_friendly
5
+ from .const import ID_KEY
6
+
7
+
8
def format(template_ele: Element, validated_text: str, errors_limit: int) -> Element:
    """Parse the single <xml> block out of *validated_text* and check it
    against *template_ele*, raising ValidationError (with at most
    *errors_limit* error groups in the message) on any structural mismatch.
    """
    parsed_ele = _extract_xml_element(validated_text)
    checker = _ValidationContext()
    checker.validate(raw_ele=template_ele, validated_ele=parsed_ele)
    message = checker.errors(limit=errors_limit)
    if message:
        raise ValidationError(message=message, validated_ele=parsed_ele)
    return parsed_ele
16
+
17
+
18
class ValidationError(Exception):
    """Raised when the LLM response XML cannot be extracted or does not match
    the template structure."""

    def __init__(self, message: str, validated_ele: Element | None = None) -> None:
        super().__init__(message)
        # The parsed response element when parsing itself succeeded (None when
        # no usable <xml> block could be extracted); kept so callers can
        # inspect the partially-valid response.
        self.validated_ele = validated_ele
22
+
23
+
24
def _extract_xml_element(text: str) -> Element:
    """Extract exactly one <xml>...</xml> block from *text*.

    Raises ValidationError when no block (or more than one) is found.
    """
    found: list[Element] = []
    for candidate in decode_friendly(text, tags="xml"):
        found.append(candidate)

    if not found:
        raise ValidationError(
            "No complete <xml>...</xml> block found. Please ensure you have properly closed the XML with </xml> tag."
        )
    if len(found) > 1:
        raise ValidationError(
            f"Found {len(found)} <xml>...</xml> blocks. "
            "Please return only one XML block without any examples or explanations."
        )
    return found[0]
43
+
44
+
45
class _ValidationContext:
    """Compares the template XML sent to the LLM against the XML it returned,
    accumulating structural mismatches which are then rendered as a single
    correction message for the model.
    """

    def __init__(self) -> None:
        # id -> printable tag text (e.g. '<p id="3"> ... </p>') for tags whose
        # names matched; used when rendering error headings.
        self._tag_text_dict: dict[int, str] = {}
        # id path (root-first) -> error messages recorded at that element.
        self._errors: dict[tuple[int, ...], list[str]] = {}

    def validate(self, raw_ele: Element, validated_ele: Element):
        """Walk both trees in parallel, recording every mismatch found."""
        self._validate_ele(ids_path=[], raw_ele=raw_ele, validated_ele=validated_ele)

    def errors(self, limit: int) -> str | None:
        """Render at most *limit* error groups as text, or None when clean."""
        if not self._errors:
            return None

        keys = list(self._errors.keys())
        keys.sort(key=lambda k: (len(k), k))  # shallow errors should be corrected before deep ones
        keys = keys[:limit]
        max_len_key = max((len(key) for key in keys), default=0)

        # Pad shorter keys with -1 so the plain tuple sort below becomes a
        # depth-first ordering (a parent sorts immediately before its children).
        for i in range(len(keys)):
            key = keys[i]
            if len(key) < max_len_key:
                key_list = list(key)
                while len(key_list) < max_len_key:
                    key_list.append(-1)
                keys[i] = tuple(key_list)

        content: list[str] = []
        total_errors = sum(len(messages) for messages in self._errors.values())
        remain_errors = total_errors

        for key in sorted(keys):  # depth-first order keeps related errors adjacent
            raw_key = tuple(k for k in key if k >= 0)
            indent: str = f"{' ' * len(raw_key)}"
            errors_list = self._errors[raw_key]
            parent_text: str

            if len(raw_key) > 0:
                parent_text = self._tag_text_dict[raw_key[-1]]
            else:
                parent_text = "the root tag"

            if len(errors_list) == 1:
                error = errors_list[0]
                content.append(f"{indent}- errors in {parent_text}: {error}.")
            else:
                content.append(f"{indent}- errors in {parent_text}:")
                for error in errors_list:
                    content.append(f"{indent} - {error}.")
            remain_errors -= len(errors_list)

        content.insert(0, f"Found {total_errors} error(s) in your response XML structure.")
        if remain_errors > 0:
            content.append(f"\n... and {remain_errors} more error(s).")

        return "\n".join(content)

    def _validate_ele(self, ids_path: list[int], raw_ele: Element, validated_ele: Element):
        """Compare the id-bearing direct children of two matched elements."""
        raw_id_map = self._build_id_map(raw_ele)
        validated_id_map = self._build_id_map(validated_ele)
        lost_ids: list[int] = []
        extra_ids: list[int] = []

        for child_id, sub_raw in raw_id_map.items():
            sub_validated = validated_id_map.get(child_id, None)
            if sub_validated is None:
                lost_ids.append(child_id)
            else:
                self._validate_id_ele(
                    ele_id=child_id,
                    ids_path=ids_path,
                    raw_ele=sub_raw,
                    validated_ele=sub_validated,
                )

        for child_id in validated_id_map.keys():
            if child_id not in raw_id_map:
                extra_ids.append(child_id)

        if lost_ids or extra_ids:
            messages: list[str] = []
            lost_ids.sort()
            extra_ids.sort()

            if lost_ids:
                tags = [self._str_tag(raw_id_map[child_id]) for child_id in lost_ids]
                # Show the source layout so the model can see where tags went missing.
                context_info = self._get_source_context(raw_ele, lost_ids)
                messages.append(f"lost sub-tags {' '.join(tags)}")
                if context_info:
                    messages.append(f"Source structure was: {context_info}")

            if extra_ids:
                tags = [self._str_tag(validated_id_map[child_id]) for child_id in extra_ids]
                messages.append(f"extra sub-tags {' '.join(tags)}")

            if messages:
                self._add_error(
                    ids_path=ids_path,
                    message="find " + " and ".join(messages),
                )
        else:
            # Children line up; check text content presence at this level.
            raw_element_empty = not self._has_text_content(raw_ele)
            validated_ele_empty = not self._has_text_content(validated_ele)

            if raw_element_empty and not validated_ele_empty:
                self._add_error(
                    ids_path=ids_path,
                    message="shouldn't have text content",
                )
            elif not raw_element_empty and validated_ele_empty:
                self._add_error(
                    ids_path=ids_path,
                    message="text content is missing",
                )

    def _validate_id_ele(self, ids_path: list[int], ele_id: int, raw_ele: Element, validated_ele: Element):
        """Compare one matched pair of id-bearing elements (tag name, leading
        text, trailing tail), then recurse into their children."""
        if raw_ele.tag == validated_ele.tag:
            self._tag_text_dict[ele_id] = self._str_tag(raw_ele)
            raw_has_text = self._has_direct_text(raw_ele.text)
            validated_has_text = self._has_direct_text(validated_ele.text)

            if raw_has_text and not validated_has_text:
                self._add_error(
                    ids_path=ids_path + [ele_id],
                    message="missing text content before child elements",
                )
            elif not raw_has_text and validated_has_text:
                self._add_error(
                    ids_path=ids_path + [ele_id],
                    message="shouldn't have text content before child elements",
                )
            raw_has_tail = self._has_direct_text(raw_ele.tail)
            validated_has_tail = self._has_direct_text(validated_ele.tail)

            if raw_has_tail and not validated_has_tail:
                self._add_error(
                    ids_path=ids_path + [ele_id],
                    message="missing text content after the element",
                )
            elif not raw_has_tail and validated_has_tail:
                self._add_error(
                    ids_path=ids_path + [ele_id],
                    message="shouldn't have text content after the element",
                )

            self._validate_ele(
                ids_path=ids_path + [ele_id],
                raw_ele=raw_ele,
                validated_ele=validated_ele,
            )
        else:
            self._add_error(
                ids_path=ids_path,
                message=f'got <{validated_ele.tag} id="{ele_id}">',
            )

    def _add_error(self, ids_path: list[int], message: str):
        """Record *message* under the tuple form of *ids_path*."""
        key = tuple(ids_path)
        if key not in self._errors:
            self._errors[key] = []
        self._errors[key].append(message)

    def _build_id_map(self, ele: Element) -> dict[int, Element]:
        """Map each direct child's integer id attribute to the child element.

        Children without an id attribute are skipped. Raises ValueError for a
        negative id (and, via int(), for a non-numeric one).
        """
        id_map: dict[int, Element] = {}
        for child_ele in ele:
            id_text = child_ele.get(ID_KEY, None)
            if id_text is None:
                continue
            child_id = int(id_text)
            if child_id < 0:
                raise ValueError(f"Invalid id {child_id} found. IDs must be non-negative integers.")
            id_map[child_id] = child_ele
        return id_map

    def _has_text_content(self, ele: Element) -> bool:
        """True when the element holds any non-whitespace text (descending
        only through id-bearing children)."""
        text = "".join(self._plain_text(ele))
        text = normalize_whitespace(text)
        text = text.strip()
        return len(text) > 0

    def _has_direct_text(self, text: str | None) -> bool:
        """True when a .text/.tail value contains non-whitespace characters."""
        if text is None:
            return False
        normalized = normalize_whitespace(text).strip()
        return len(normalized) > 0

    def _plain_text(self, ele: Element):
        """Yield text chunks; recurse only into children carrying an id, but
        always include every child's tail."""
        if ele.text:
            yield ele.text
        for child in ele:
            if child.get(ID_KEY, None) is not None:
                yield from self._plain_text(child)
            if child.tail:
                yield child.tail

    def _str_tag(self, ele: Element) -> str:
        """Render a compact, human-readable form of the element's start tag."""
        ele_id = ele.get(ID_KEY)
        content: str
        if ele_id is not None:
            content = f'<{ele.tag} id="{ele_id}"'
        else:
            content = f"<{ele.tag}"
        if len(ele) > 0:
            content += f"> ... </{ele.tag}>"
        else:
            content += " />"
        return content

    def _get_source_context(self, parent: Element, lost_ids: list[int]) -> str:
        """Render the id-children of *parent* in source order, marking the
        lost ones with *MISSING*; returns "" when there is nothing to show."""
        if not lost_ids:
            return ""

        children_with_ids = []
        for child in parent:
            child_id_str = child.get(ID_KEY)
            if child_id_str is None:
                continue
            child_id = int(child_id_str)
            tag_str = f'<{child.tag} id="{child_id}">'
            if child_id in lost_ids:
                children_with_ids.append(f'{tag_str}*MISSING*')
            else:
                children_with_ids.append(tag_str)

        if children_with_ids:
            return f"[{' '.join(children_with_ids)}]"
        return ""