epub-translator 0.0.7__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. epub_translator/__init__.py +4 -2
  2. epub_translator/data/fill.jinja +66 -0
  3. epub_translator/data/mmltex/README.md +67 -0
  4. epub_translator/data/mmltex/cmarkup.xsl +1106 -0
  5. epub_translator/data/mmltex/entities.xsl +459 -0
  6. epub_translator/data/mmltex/glayout.xsl +222 -0
  7. epub_translator/data/mmltex/mmltex.xsl +36 -0
  8. epub_translator/data/mmltex/scripts.xsl +375 -0
  9. epub_translator/data/mmltex/tables.xsl +130 -0
  10. epub_translator/data/mmltex/tokens.xsl +328 -0
  11. epub_translator/data/translate.jinja +15 -12
  12. epub_translator/epub/__init__.py +4 -2
  13. epub_translator/epub/common.py +43 -0
  14. epub_translator/epub/math.py +193 -0
  15. epub_translator/epub/placeholder.py +53 -0
  16. epub_translator/epub/spines.py +42 -0
  17. epub_translator/epub/toc.py +505 -0
  18. epub_translator/epub/zip.py +67 -0
  19. epub_translator/iter_sync.py +24 -0
  20. epub_translator/language.py +23 -0
  21. epub_translator/llm/__init__.py +2 -1
  22. epub_translator/llm/core.py +233 -0
  23. epub_translator/llm/error.py +38 -35
  24. epub_translator/llm/executor.py +159 -136
  25. epub_translator/llm/increasable.py +28 -28
  26. epub_translator/llm/types.py +17 -0
  27. epub_translator/serial/__init__.py +2 -0
  28. epub_translator/serial/chunk.py +52 -0
  29. epub_translator/serial/segment.py +17 -0
  30. epub_translator/serial/splitter.py +50 -0
  31. epub_translator/template.py +35 -33
  32. epub_translator/translator.py +208 -178
  33. epub_translator/utils.py +7 -0
  34. epub_translator/xml/__init__.py +4 -3
  35. epub_translator/xml/deduplication.py +38 -0
  36. epub_translator/xml/firendly/__init__.py +2 -0
  37. epub_translator/xml/firendly/decoder.py +75 -0
  38. epub_translator/xml/firendly/encoder.py +84 -0
  39. epub_translator/xml/firendly/parser.py +177 -0
  40. epub_translator/xml/firendly/tag.py +118 -0
  41. epub_translator/xml/firendly/transform.py +36 -0
  42. epub_translator/xml/xml.py +52 -0
  43. epub_translator/xml/xml_like.py +231 -0
  44. epub_translator/xml_translator/__init__.py +3 -0
  45. epub_translator/xml_translator/const.py +2 -0
  46. epub_translator/xml_translator/fill.py +128 -0
  47. epub_translator/xml_translator/format.py +282 -0
  48. epub_translator/xml_translator/fragmented.py +125 -0
  49. epub_translator/xml_translator/group.py +183 -0
  50. epub_translator/xml_translator/progressive_locking.py +256 -0
  51. epub_translator/xml_translator/submitter.py +102 -0
  52. epub_translator/xml_translator/text_segment.py +263 -0
  53. epub_translator/xml_translator/translator.py +179 -0
  54. epub_translator/xml_translator/utils.py +29 -0
  55. epub_translator-0.1.1.dist-info/METADATA +283 -0
  56. epub_translator-0.1.1.dist-info/RECORD +58 -0
  57. epub_translator/data/format.jinja +0 -33
  58. epub_translator/epub/content_parser.py +0 -162
  59. epub_translator/epub/html/__init__.py +0 -1
  60. epub_translator/epub/html/dom_operator.py +0 -68
  61. epub_translator/epub/html/empty_tags.py +0 -23
  62. epub_translator/epub/html/file.py +0 -80
  63. epub_translator/epub/html/texts_searcher.py +0 -46
  64. epub_translator/llm/node.py +0 -201
  65. epub_translator/translation/__init__.py +0 -2
  66. epub_translator/translation/chunk.py +0 -118
  67. epub_translator/translation/splitter.py +0 -78
  68. epub_translator/translation/store.py +0 -36
  69. epub_translator/translation/translation.py +0 -231
  70. epub_translator/translation/types.py +0 -45
  71. epub_translator/translation/utils.py +0 -11
  72. epub_translator/xml/decoder.py +0 -71
  73. epub_translator/xml/encoder.py +0 -95
  74. epub_translator/xml/parser.py +0 -172
  75. epub_translator/xml/tag.py +0 -93
  76. epub_translator/xml/transform.py +0 -34
  77. epub_translator/xml/utils.py +0 -12
  78. epub_translator/zip_context.py +0 -74
  79. epub_translator-0.0.7.dist-info/METADATA +0 -170
  80. epub_translator-0.0.7.dist-info/RECORD +0 -36
  81. {epub_translator-0.0.7.dist-info → epub_translator-0.1.1.dist-info}/LICENSE +0 -0
  82. {epub_translator-0.0.7.dist-info → epub_translator-0.1.1.dist-info}/WHEEL +0 -0
@@ -0,0 +1,177 @@
1
+ from collections.abc import Generator, Iterable
2
+ from enum import Enum, auto
3
+ from io import StringIO
4
+
5
+ from .tag import Tag, TagKind, is_valid_name_char, is_valid_value_char
6
+
7
# Characters the tag parser treats as whitespace inside a tag: they end the
# tag name and separate attributes (see the TAG_NAME / TAG_GAP handling).
_SPACES = (" ", "\n")
8
+
9
+
10
class _Phase(Enum):
    """Position of the tag parser relative to the tag it is currently reading."""

    # Plain text; no tag candidate is open.
    OUTSIDE = 1
    # A "<" has just been read.
    LEFT_BRACKET = 2
    # "</" has been read; the name of a closing tag must follow.
    LEFT_SLASH = 3
    # Reading the characters of the tag name.
    TAG_NAME = 4
    # Whitespace after the tag name or after a closed attribute value.
    TAG_GAP = 5
    # Reading the characters of an attribute name.
    ATTRIBUTE_NAME = 6
    # "=" has been read; only an opening double quote may follow.
    ATTRIBUTE_NAME_EQUAL = 7
    # Reading a double-quoted attribute value.
    ATTRIBUTE_VALUE = 8
    # The "/" of a self-closing tag has been read; only ">" may follow.
    MUST_CLOSING_SIGN = 9
20
+
21
+
22
class _ParsedResult(Enum):
    """Outcome of feeding one character to the tag parser."""

    # The character was consumed; nothing is ready to be emitted yet.
    Continue = 1
    # The character completed a tag (a ">" was read in a legal position).
    Success = 2
    # The character cannot continue the current tag candidate.
    Failed = 3
26
+
27
+
28
def parse_tags(chars: Iterable[str]) -> Generator[str | Tag, None, None]:
    """Split a character stream into plain-text runs and recognised tags.

    A fresh parser is created for every call, so independent streams never
    share state. Items are yielded lazily, in the order they appear in *chars*.
    """
    parser = _XMLTagsParser()
    yield from parser.do(chars)
30
+
31
+
32
class _XMLTagsParser:
    """Character-driven state machine that separates text from XML-like tags.

    Characters are fed one at a time. Text outside tags is accumulated and
    emitted as ``str``; every well-formed tag is emitted as a ``Tag``. A tag
    candidate that turns out to be malformed is not an error: its raw
    characters are simply treated as plain text.

    A "<" read while a tag candidate is open can never continue that
    candidate, so the candidate is flushed as text and a new candidate starts
    at that "<". This keeps the tag in inputs such as ``1<2<br/>`` from being
    swallowed into the surrounding text.
    """

    def __init__(self):
        # Text read since the last emitted item that belongs to no tag.
        self._outside_buffer: StringIO = StringIO()
        # Raw characters of the tag candidate currently being read.
        self._tag_buffer: StringIO = StringIO()
        self._tag: Tag | None = None
        self._phase: _Phase = _Phase.OUTSIDE

    def do(self, chars: Iterable[str]) -> Generator[str | Tag, None, None]:
        """Consume *chars* and lazily yield text runs and tags in input order.

        The parser state is not reset afterwards; use one instance per stream.
        """
        for char in chars:
            parsed_result = self._parse_char(char)
            yield from self._generate_by_result(parsed_result)

        # The input may end inside an unfinished tag: its raw characters are
        # then ordinary text and are emitted together with any pending text.
        self._outside_buffer.write(self._tag_buffer.getvalue())
        outside_text = self._outside_buffer.getvalue()
        if outside_text != "":
            yield outside_text

    def _begin_tag(self, char: str) -> None:
        """Open a new tag candidate whose first raw character is *char* ("<")."""
        self._phase = _Phase.LEFT_BRACKET
        self._tag_buffer.write(char)
        self._tag = Tag(
            kind=TagKind.OPENING,
            name="",
            proto="",
            attributes=[],
        )

    def _parse_char(self, char: str) -> _ParsedResult:
        """Advance the state machine by one character and report the outcome."""
        if self._phase == _Phase.OUTSIDE:
            if char == "<":
                self._begin_tag(char)
            else:
                self._outside_buffer.write(char)
            return _ParsedResult.Continue

        assert self._tag is not None
        if char == "<":
            # No in-tag phase accepts "<". Hand the abandoned candidate over
            # to the text buffer and restart from this bracket, instead of
            # failing and turning the bracket itself into text.
            self._outside_buffer.write(self._tag_buffer.getvalue())
            self._clear_buffer(self._tag_buffer)
            self._begin_tag(char)
            return _ParsedResult.Continue

        self._tag_buffer.write(char)
        parsed_result: _ParsedResult = _ParsedResult.Continue

        if self._phase == _Phase.LEFT_BRACKET:
            if char == "/":
                self._tag.kind = TagKind.CLOSING
                self._phase = _Phase.LEFT_SLASH
            elif is_valid_name_char(char):
                self._tag.name += char
                self._phase = _Phase.TAG_NAME
            else:
                parsed_result = _ParsedResult.Failed

        elif self._phase == _Phase.LEFT_SLASH:
            if is_valid_name_char(char):
                self._tag.name += char
                self._phase = _Phase.TAG_NAME
            else:
                parsed_result = _ParsedResult.Failed

        elif self._phase == _Phase.TAG_NAME:
            if char in _SPACES:
                self._phase = _Phase.TAG_GAP
            elif is_valid_name_char(char):
                self._tag.name += char
            elif char == ">":
                parsed_result = _ParsedResult.Success
            elif char == "/" and self._tag.kind == TagKind.OPENING:
                # "</name/>" is rejected: only an opening tag may self-close.
                self._tag.kind = TagKind.SELF_CLOSING
                self._phase = _Phase.MUST_CLOSING_SIGN
            else:
                parsed_result = _ParsedResult.Failed

        elif self._phase == _Phase.TAG_GAP:
            if char in _SPACES:
                pass
            elif is_valid_name_char(char):
                # First character of a new attribute name; its value is
                # filled in while the ATTRIBUTE_VALUE phase runs.
                self._tag.attributes.append((char, ""))
                self._phase = _Phase.ATTRIBUTE_NAME
            elif char == ">":
                parsed_result = _ParsedResult.Success
            elif char == "/" and self._tag.kind == TagKind.OPENING:
                self._tag.kind = TagKind.SELF_CLOSING
                self._phase = _Phase.MUST_CLOSING_SIGN
            else:
                parsed_result = _ParsedResult.Failed

        elif self._phase == _Phase.ATTRIBUTE_NAME:
            if is_valid_name_char(char):
                attr_name, attr_value = self._tag.attributes[-1]
                self._tag.attributes[-1] = (attr_name + char, attr_value)
            elif char == "=":
                self._phase = _Phase.ATTRIBUTE_NAME_EQUAL
            else:
                parsed_result = _ParsedResult.Failed

        elif self._phase == _Phase.ATTRIBUTE_NAME_EQUAL:
            # Only double-quoted attribute values are accepted.
            if char == '"':
                self._phase = _Phase.ATTRIBUTE_VALUE
            else:
                parsed_result = _ParsedResult.Failed

        elif self._phase == _Phase.ATTRIBUTE_VALUE:
            if is_valid_value_char(char):
                attr_name, attr_value = self._tag.attributes[-1]
                self._tag.attributes[-1] = (attr_name, attr_value + char)
            elif char == '"':
                self._phase = _Phase.TAG_GAP
            else:
                parsed_result = _ParsedResult.Failed

        elif self._phase == _Phase.MUST_CLOSING_SIGN:
            if char == ">":
                parsed_result = _ParsedResult.Success
            else:
                parsed_result = _ParsedResult.Failed

        return parsed_result

    def _generate_by_result(self, parsed_result: _ParsedResult) -> Generator[str | Tag, None, None]:
        """Emit whatever became ready after the last character, then reset.

        On success the pending text (if any) is yielded before the tag. A
        syntactically complete but invalid tag, and any failed candidate, is
        appended to the pending text instead.
        """
        if parsed_result == _ParsedResult.Success:
            assert self._tag is not None
            if self._is_tag_valid(self._tag):
                # NOTE(review): proto is only assigned on the rejected path
                # below, so emitted tags keep proto == "" - confirm whether
                # callers expect the raw tag text here.
                outside_text = self._outside_buffer.getvalue()
                self._clear_buffer(self._outside_buffer)
                self._clear_buffer(self._tag_buffer)
                if outside_text != "":
                    yield outside_text
                yield self._tag
            else:
                self._tag.proto = self._tag_buffer.getvalue()
                self._outside_buffer.write(self._tag.proto)
                self._clear_buffer(self._tag_buffer)
            self._tag = None
            self._phase = _Phase.OUTSIDE

        elif parsed_result == _ParsedResult.Failed:
            self._outside_buffer.write(self._tag_buffer.getvalue())
            self._clear_buffer(self._tag_buffer)
            self._phase = _Phase.OUTSIDE

    def _is_tag_valid(self, tag: Tag) -> bool:
        """Reject closing tags that carry attributes and tags with bad names."""
        if tag.kind == TagKind.CLOSING and len(tag.attributes) > 0:
            return False
        if tag.find_invalid_name() is not None:
            return False
        return True

    def _clear_buffer(self, buffer: StringIO):
        """Empty *buffer* and move its write position back to the start."""
        buffer.truncate(0)
        buffer.seek(0)
@@ -0,0 +1,118 @@
1
+ from collections.abc import Generator
2
+ from dataclasses import dataclass
3
+ from enum import Enum, auto
4
+ from io import StringIO
5
+
6
+
7
class TagKind(Enum):
    """Syntactic form of a tag."""

    # <name ...>
    OPENING = 1
    # </name>
    CLOSING = 2
    # <name .../>
    SELF_CLOSING = 3
11
+
12
+
13
@dataclass
class Tag:
    """A single XML-like tag: its kind, name and ordered attribute pairs."""

    kind: TagKind
    name: str
    # Raw text associated with the tag; this class only stores it.
    proto: str
    # (name, value) pairs in the order they are written out.
    attributes: list[tuple[str, str]]

    def __str__(self):
        """Render the tag as markup, e.g. ``<a href="x">``, ``</a>`` or ``<br/>``.

        Attribute values are written verbatim between double quotes; no
        escaping is applied.
        """
        opener = "</" if self.kind == TagKind.CLOSING else "<"
        closer = "/>" if self.kind == TagKind.SELF_CLOSING else ">"
        rendered_attributes = " ".join(f'{attr_name}="{attr_value}"' for attr_name, attr_value in self.attributes)
        if self.attributes:
            rendered_attributes = " " + rendered_attributes
        return f"{opener}{self.name}{rendered_attributes}{closer}"

    def find_invalid_name(self) -> str | None:
        """Return the first unacceptable tag or attribute name, or ``None``.

        The check enforces a subset of the XML naming rules
        (https://www.w3schools.com/xml/xml_elements.asp):

        - names must not be empty;
        - every character must be a valid *name* character;
        - names must start with a letter (a-z, A-Z) or an underscore.
        """
        for name in self._iter_tag_names():
            if name == "":
                return name
            # Names are checked against the name alphabet, not the wider
            # attribute-value alphabet: a name containing e.g. a space or "="
            # would render into markup that cannot be parsed back.
            if not all(is_valid_name_char(c) for c in name):
                return name
            first = name[0]
            if first == "_" or "a" <= first <= "z" or "A" <= first <= "Z":
                continue
            return name

        return None

    def find_invalid_attr_value(self) -> tuple[str, str] | None:
        """Return the first ``(name, value)`` pair whose value has an illegal character."""
        for attr_name, attr_value in self.attributes:
            if not all(is_valid_value_char(c) for c in attr_value):
                return attr_name, attr_value
        return None

    def _iter_tag_names(self) -> Generator[str, None, None]:
        """Yield the tag name followed by every attribute name."""
        yield self.name
        for attr_name, _ in self.attributes:
            yield attr_name
71
+
72
+
73
# Punctuation accepted inside attribute values in addition to the name
# characters. References:
# XML Attribute Values: https://www.w3.org/TR/xml/#NT-AttValue
# URI Syntax: https://www.rfc-editor.org/rfc/rfc3986
# HTML Attributes: https://html.spec.whatwg.org/multipage/syntax.html#attributes-2
_VALID_VALUE_CHARS = frozenset(",./#?&=:%; ")
91
+
92
+
93
# Punctuation accepted inside tag and attribute names. References:
# XML Names: https://www.w3.org/TR/xml/#NT-Name
# XML Namespaces: https://www.w3.org/TR/xml-names/#ns-qualnames
# HTML Custom Data Attributes: https://html.spec.whatwg.org/multipage/dom.html#custom-data-attribute
_VALID_NAME_CHARS = frozenset("-_:.")
97
+
98
+
99
def is_valid_value_char(char: str) -> bool:
    """Tell whether *char* may appear inside an attribute value.

    Every valid name character is accepted, plus the punctuation listed in
    ``_VALID_VALUE_CHARS``.
    """
    return is_valid_name_char(char) or char in _VALID_VALUE_CHARS
105
+
106
+
107
def is_valid_name_char(char: str) -> bool:
    """Tell whether *char* may appear inside a tag or attribute name.

    Accepted are the punctuation in ``_VALID_NAME_CHARS`` and the ASCII
    letters and digits (https://www.w3.org/TR/xml/#NT-Name).
    """
    if char in _VALID_NAME_CHARS:
        return True
    return "a" <= char <= "z" or "A" <= char <= "Z" or "0" <= char <= "9"
@@ -0,0 +1,36 @@
1
+ from xml.etree.ElementTree import Element
2
+
3
+ from .tag import Tag, TagKind
4
+
5
+
6
def tag_to_element(tag: Tag) -> Element:
    """Build a childless ``Element`` carrying the name and attributes of *tag*.

    The tag kind and ``proto`` are not represented in the result.
    """
    # Element copies the mapping; a repeated attribute name keeps its last value.
    return Element(tag.name, dict(tag.attributes))
11
+
12
+
13
def element_to_tag(element: Element, kind: TagKind, proto: str = "") -> Tag:
    """Describe *element* as a ``Tag`` of the requested *kind*.

    Attributes are copied in sorted name order; closing tags carry none.

    Raises:
        ValueError: if the tag name, an attribute name or an attribute value
            contains characters the tag syntax does not accept.
    """
    attributes: list[tuple[str, str]] = []
    if kind != TagKind.CLOSING:
        attributes = [(key, element.get(key, "")) for key in sorted(element.keys())]

    tag = Tag(kind=kind, name=element.tag, proto=proto, attributes=attributes)

    # To make LLM easier to understand, the naming here is restricted in a more strict way.
    # https://github.com/oomol-lab/pdf-craft/issues/149
    rejected_name = tag.find_invalid_name()
    if rejected_name is not None:
        raise ValueError(f"find invalid tag name or attribute name: {rejected_name}")

    rejected_pair = tag.find_invalid_attr_value()
    if rejected_pair is not None:
        attr_name, attr_value = rejected_pair
        raise ValueError(f'find invalid attribute value: {attr_name}="{attr_value}"')

    return tag
@@ -0,0 +1,52 @@
1
+ from collections.abc import Generator
2
+ from xml.etree.ElementTree import Element
3
+
4
+
5
+ def find_first(element: Element, tag: str) -> Element | None:
6
+ if element.tag == tag:
7
+ return element
8
+ for child in element:
9
+ result = find_first(child, tag)
10
+ if result is not None:
11
+ return result
12
+ return None
13
+
14
+
15
def iter_with_stack(element: Element) -> Generator[tuple[list[Element], Element], None, None]:
    """Pre-order traversal yielding ``(parent_path, element)`` pairs.

    ``parent_path`` lists the ancestors from the root down to the direct
    parent; it is a fresh list for every yielded pair.
    """
    pending: list[list[Element]] = [[element]]
    while pending:
        path = pending.pop()
        *ancestors, node = path
        yield ancestors, node
        # Children are pushed right-to-left so the leftmost one is popped first.
        pending.extend([*path, child] for child in reversed(list(node)))
30
+
31
+
32
def clone_element(element: Element) -> Element:
    """Return a deep copy of *element* (tag, attributes, text and descendants).

    The tails of all descendants are preserved; the tail of *element* itself
    is not copied.
    """
    root_copy = Element(element.tag, element.attrib)
    root_copy.text = element.text

    # Each entry pairs a source node with the copy that receives its children.
    pending = [(element, root_copy)]
    while pending:
        source, target = pending.pop()
        for child in source:
            child_copy = Element(child.tag, child.attrib)
            child_copy.text = child.text
            child_copy.tail = child.tail
            target.append(child_copy)
            pending.append((child, child_copy))
    return root_copy
40
+
41
+
42
def plain_text(element: Element) -> str:
    """Concatenate all text inside *element* in document order, markup removed."""
    fragments = _iter_text_in(element)
    return "".join(fragments)
44
+
45
+
46
def _iter_text_in(element: Element) -> Generator[str, None, None]:
    """Yield every non-empty text fragment inside *element* in document order.

    That is the element's own text, then for each child its fragments followed
    by the child's tail. The tail of *element* itself is not included.
    """
    # The stack holds elements still to be expanded and tail strings still to
    # be emitted, stored right-to-left so pops come out in document order.
    pending: list[Element | str] = [element]
    while pending:
        item = pending.pop()
        if isinstance(item, str):
            yield item
            continue
        if item.text:
            yield item.text
        followers: list[Element | str] = []
        for child in item:
            followers.append(child)
            if child.tail:
                followers.append(child.tail)
        pending.extend(reversed(followers))
@@ -0,0 +1,231 @@
1
+ import io
2
+ import re
3
+ import warnings
4
+ from typing import IO
5
+ from xml.etree.ElementTree import Element, fromstring, tostring
6
+
7
+ from .xml import iter_with_stack
8
+
9
# The reserved "xml" namespace; it is never declared when serializing.
_XML_NAMESPACE_URI = "http://www.w3.org/XML/1998/namespace"

# Preferred prefix for well-known namespace URIs. Namespaces that are not
# listed here receive a generated "ns<N>" prefix.
_COMMON_NAMESPACES = {
    "http://www.w3.org/1999/xhtml": "xhtml",
    "http://www.idpf.org/2007/ops": "epub",
    "http://www.w3.org/1998/Math/MathML": "m",
    "http://purl.org/dc/elements/1.1/": "dc",
    "http://www.daisy.org/z3986/2005/ncx/": "ncx",
    "http://www.idpf.org/2007/opf": "opf",
    "http://www.w3.org/2000/svg": "svg",
    "urn:oasis:names:tc:opendocument:xmlns:container": "container",
    "http://www.w3.org/XML/1998/namespace": "xml",  # Reserved XML namespace
}

# Namespaces written back as the default namespace (plain "xmlns") of the
# root element; their tags and attributes are serialized without a prefix.
_ROOT_NAMESPACES = {
    "http://www.w3.org/1999/xhtml",  # XHTML
    "http://www.daisy.org/z3986/2005/ncx/",  # NCX
    "http://www.idpf.org/2007/opf",  # OPF
    "urn:oasis:names:tc:opendocument:xmlns:container",  # Container
}

# Captures the encoding name of an encoding="..." declaration.
_ENCODING_PATTERN = re.compile(r'encoding\s*=\s*["\']([^"\']+)["\']', re.IGNORECASE)
# Finds the start of the first element, i.e. a "<" directly followed by a
# letter; everything before it is kept as the document header.
_FIRST_ELEMENT_PATTERN = re.compile(r"<(?![?!])[a-zA-Z]")
# Captures the URI of an ElementTree "{uri}local" qualified name.
_NAMESPACE_IN_TAG = re.compile(r"\{([^}]+)\}")
33
+
34
# Some non-standard EPUB generators use HTML-style tags without self-closing syntax
# We need to convert them to XML-compatible format before parsing
_EMPTY_TAGS = (
    "br",
    "hr",
    "input",
    "col",
    "base",
    "meta",
    "area",
)

# For reading: match tags like <br> or <br class="x"> (but not <br/> or <body>).
# Group 1 is the tag name, group 2 the attribute text (possibly empty, never
# None). The attribute text is optional so that a bare <br> matches, and it
# may contain "/" (e.g. content="text/html"); the lookbehind rejects tags
# that already end in "/>".
_EMPTY_TAG_OPEN_PATTERN = re.compile(r"<(" + "|".join(_EMPTY_TAGS) + r")((?:\s[^>]*)?)(?<!/)>")

# For saving: match self-closing tags like <br /> or <br class="x" />.
# Group 2 only captures real attribute text, so the whitespace in front of
# "/>" is dropped and <br /> is written back as <br> rather than <br >.
_EMPTY_TAG_CLOSE_PATTERN = re.compile(r"<(" + "|".join(_EMPTY_TAGS) + r")((?:\s+[^\s/>][^>]*?)?)\s*/>")
51
+
52
+
53
class XMLLikeNode:
    """An XML (or XML-like HTML) document with namespaces stripped for editing.

    On load, the text before the first element (XML declaration, doctype, ...)
    is kept aside verbatim as the header, and every ``{uri}name`` tag or
    attribute is reduced to its bare local name while the URI is remembered.
    ``save`` writes the header back, re-declares the namespaces on the root
    element and restores the prefixes of non-root namespaces.

    Attributes:
        element: root element of the parsed tree, with namespace-free names.
    """

    def __init__(self, file: IO[bytes], is_html_like: bool = False) -> None:
        """Read and parse the whole of *file*.

        Args:
            file: binary stream holding the document.
            is_html_like: treat HTML-style empty tags such as ``<br>`` as
                self-closing when parsing, and write them back in HTML style.

        Raises:
            ValueError: if the content cannot be parsed.
        """
        raw_content = file.read()
        self._encoding: str = self._detect_encoding(raw_content)
        content = raw_content.decode(self._encoding)
        self._header, xml_content = self._extract_header(content)
        # namespace URI -> prefix used when serializing
        self._namespaces: dict[str, str] = {}
        # local tag / attribute name -> namespace URI it was first seen with
        self._tag_to_namespace: dict[str, str] = {}
        self._attr_to_namespace: dict[str, str] = {}

        # For non-standard HTML files, convert <br> to <br/> before parsing
        self._is_html_like = is_html_like
        if is_html_like:
            xml_content = _EMPTY_TAG_OPEN_PATTERN.sub(
                lambda m: f"<{m.group(1)}{m.group(2)} />",
                xml_content,
            )

        try:
            self.element = self._extract_and_clean_namespaces(
                element=fromstring(xml_content),
            )
        except Exception as error:
            raise ValueError("Failed to parse XML-like content") from error

    @property
    def encoding(self) -> str:
        """Encoding detected on load; also used by ``save``."""
        return self._encoding

    @property
    def namespaces(self) -> list[str]:
        """Namespace URIs found in the document, in order of first appearance."""
        return list(self._namespaces.keys())

    def save(self, file: IO[bytes]) -> None:
        """Serialize the document into *file* using the detected encoding.

        *file* is left open; the temporary text wrapper is detached from it.
        """
        # newline="" turns off newline translation. With the default, every
        # "\n" would be rewritten to os.linesep, which makes the output
        # platform dependent and turns a header's "\r\n" into "\r\r\n" on
        # Windows.
        writer = io.TextIOWrapper(file, encoding=self._encoding, newline="", write_through=True)
        try:
            if self._header:
                writer.write(self._header)

            content = self._serialize_with_namespaces(self.element)

            # For non-standard HTML files, convert back from <br/> to <br>
            if self._is_html_like:
                content = _EMPTY_TAG_CLOSE_PATTERN.sub(
                    lambda m: f"<{m.group(1)}{m.group(2)}>",
                    content,
                )

            writer.write(content)

        finally:
            writer.detach()

    def _detect_encoding(self, raw_content: bytes) -> str:
        """Guess the text encoding of *raw_content*.

        Order of preference: byte-order mark, then an ``encoding="..."``
        declaration that really decodes the content, then UTF-8, and finally
        ISO-8859-1, which accepts any byte sequence.
        """
        if raw_content.startswith(b"\xef\xbb\xbf"):
            return "utf-8-sig"
        elif raw_content.startswith(b"\xff\xfe"):
            return "utf-16-le"
        elif raw_content.startswith(b"\xfe\xff"):
            return "utf-16-be"

        # Try to read the encoding from the XML declaration; only the first
        # 1024 bytes are inspected.
        header_bytes = raw_content[:1024]
        for try_encoding in ("utf-8", "utf-16-le", "utf-16-be", "iso-8859-1"):
            try:
                header_str = header_bytes.decode(try_encoding)
            except UnicodeDecodeError:
                continue
            match = _ENCODING_PATTERN.search(header_str)
            if match:
                declared_encoding = match.group(1).lower()
                try:
                    raw_content.decode(declared_encoding)
                    return declared_encoding
                except (LookupError, UnicodeDecodeError):
                    # Unknown or wrong declaration: keep looking.
                    pass

        try:
            raw_content.decode("utf-8")
            return "utf-8"
        except UnicodeDecodeError:
            pass
        return "iso-8859-1"

    def _extract_header(self, content: str) -> tuple[str, str]:
        """Split *content* into ``(header, xml)`` at the first element.

        When no element start is found the header is empty.
        """
        match = _FIRST_ELEMENT_PATTERN.search(content)
        if match:
            split_pos = match.start()
            return content[:split_pos], content[split_pos:]
        return "", content

    def _register_namespace(self, namespace_uri: str) -> None:
        """Assign a prefix to *namespace_uri* the first time it is seen."""
        if namespace_uri not in self._namespaces:
            prefix = _COMMON_NAMESPACES.get(namespace_uri, f"ns{len(self._namespaces)}")
            self._namespaces[namespace_uri] = prefix

    def _extract_and_clean_namespaces(self, element: Element) -> Element:
        """Strip ``{uri}`` from all tag and attribute names, in place.

        The URI of each local name is recorded for serialization. When a local
        name occurs with more than one namespace, a warning is issued and the
        first namespace is kept.
        """
        for _, elem in iter_with_stack(element):
            match = _NAMESPACE_IN_TAG.match(elem.tag)
            if match:
                namespace_uri = match.group(1)
                self._register_namespace(namespace_uri)
                tag_name = elem.tag[len(match.group(0)) :]

                # Record tag -> namespace mapping (warn if conflict)
                if tag_name in self._tag_to_namespace and self._tag_to_namespace[tag_name] != namespace_uri:
                    warnings.warn(
                        f"Tag '{tag_name}' has multiple namespaces: "
                        f"{self._tag_to_namespace[tag_name]} and {namespace_uri}. "
                        f"Using the first one.",
                        stacklevel=2,
                    )
                else:
                    self._tag_to_namespace[tag_name] = namespace_uri

                # Clean: remove namespace URI completely
                elem.tag = tag_name

            # Iterate over a snapshot: the mapping is modified inside the loop.
            for attr_key in list(elem.attrib.keys()):
                match = _NAMESPACE_IN_TAG.match(attr_key)
                if match:
                    namespace_uri = match.group(1)
                    self._register_namespace(namespace_uri)
                    attr_name = attr_key[len(match.group(0)) :]
                    attr_value = elem.attrib.pop(attr_key)

                    # Record attr -> namespace mapping (warn if conflict)
                    if attr_name in self._attr_to_namespace and self._attr_to_namespace[attr_name] != namespace_uri:
                        warnings.warn(
                            f"Attribute '{attr_name}' has multiple namespaces: "
                            f"{self._attr_to_namespace[attr_name]} and {namespace_uri}. "
                            f"Using the first one.",
                            stacklevel=2,
                        )
                    else:
                        self._attr_to_namespace[attr_name] = namespace_uri

                    # Clean: remove namespace URI completely
                    elem.attrib[attr_name] = attr_value
        return element

    def _serialize_with_namespaces(self, element: Element) -> str:
        """Serialize *element* and put namespace declarations and prefixes back.

        The namespace declarations are stored as attributes on *element*
        itself, so the root keeps them after this call.
        """
        # First, add namespace declarations to root element (before serialization)
        for namespace_uri, prefix in self._namespaces.items():
            # Skip the reserved xml namespace - it's implicit
            if namespace_uri == _XML_NAMESPACE_URI:
                continue
            if namespace_uri in _ROOT_NAMESPACES:
                element.attrib["xmlns"] = namespace_uri
            else:
                element.attrib[f"xmlns:{prefix}"] = namespace_uri

        # Serialize the element tree as-is (tags are simple names without prefixes)
        xml_string = tostring(element, encoding="unicode")

        # Now restore namespace prefixes in the serialized string by plain
        # text replacement of every tag that belongs to a non-root namespace
        for tag_name, namespace_uri in self._tag_to_namespace.items():
            if namespace_uri not in _ROOT_NAMESPACES:
                prefix = self._namespaces[namespace_uri]
                # Replace opening and closing tags
                xml_string = xml_string.replace(f"<{tag_name} ", f"<{prefix}:{tag_name} ")
                xml_string = xml_string.replace(f"<{tag_name}>", f"<{prefix}:{tag_name}>")
                xml_string = xml_string.replace(f"</{tag_name}>", f"</{prefix}:{tag_name}>")
                xml_string = xml_string.replace(f"<{tag_name}/>", f"<{prefix}:{tag_name}/>")

        # Similarly for attributes (though less common in EPUB)
        for attr_name, namespace_uri in self._attr_to_namespace.items():
            if namespace_uri not in _ROOT_NAMESPACES:
                prefix = self._namespaces[namespace_uri]
                xml_string = xml_string.replace(f' {attr_name}="', f' {prefix}:{attr_name}="')

        return xml_string
@@ -0,0 +1,3 @@
1
+ from .group import XMLGroupContext
2
+ from .submitter import submit_text_segments
3
+ from .translator import XMLTranslator
@@ -0,0 +1,2 @@
1
# Attribute-name constants shared inside the xml_translator package.
# NOTE(review): the code that reads them is not visible here; judging by the
# names, "id" identifies an element and "data-orig-len" stores the length of
# some original content - confirm against the callers.
ID_KEY: str = "id"
DATA_ORIGIN_LEN_KEY: str = "data-orig-len"