xmlcst 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xmlcst/__init__.py +42 -0
- xmlcst/_api.py +47 -0
- xmlcst/_builder.py +319 -0
- xmlcst/_entities.py +41 -0
- xmlcst/_errors.py +19 -0
- xmlcst/_nodes.py +811 -0
- xmlcst/_tokenizer.py +315 -0
- xmlcst/_tokens.py +47 -0
- xmlcst/py.typed +0 -0
- xmlcst-0.1.0.dist-info/METADATA +217 -0
- xmlcst-0.1.0.dist-info/RECORD +13 -0
- xmlcst-0.1.0.dist-info/WHEEL +4 -0
- xmlcst-0.1.0.dist-info/licenses/LICENSE +21 -0
xmlcst/__init__.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Full-fidelity XML concrete syntax tree -- parse, edit, and serialize with zero formatting loss."""
|
|
2
|
+
|
|
3
|
+
from ._api import parse, parse_bytes, parse_file
|
|
4
|
+
from ._errors import ParseError
|
|
5
|
+
from ._nodes import (
|
|
6
|
+
Attribute,
|
|
7
|
+
AttributeList,
|
|
8
|
+
CData,
|
|
9
|
+
Comment,
|
|
10
|
+
Doctype,
|
|
11
|
+
Document,
|
|
12
|
+
Element,
|
|
13
|
+
Node,
|
|
14
|
+
ProcessingInstruction,
|
|
15
|
+
SerializationMode,
|
|
16
|
+
Text,
|
|
17
|
+
Whitespace,
|
|
18
|
+
XmlDeclaration,
|
|
19
|
+
)
|
|
20
|
+
from ._tokens import Token, TokenType
|
|
21
|
+
|
|
22
|
+
# Names exported by ``from xmlcst import *``: the three parse entry points,
# the error type, the node classes and the token types imported above.
__all__ = [
    "parse",
    "parse_bytes",
    "parse_file",
    "ParseError",
    "Attribute",
    "AttributeList",
    "CData",
    "Comment",
    "Doctype",
    "Document",
    "Element",
    "Node",
    "ProcessingInstruction",
    "SerializationMode",
    "Text",
    "Token",
    "TokenType",
    "Whitespace",
    "XmlDeclaration",
]
|
xmlcst/_api.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from ._builder import build_tree
|
|
6
|
+
from ._nodes import Document
|
|
7
|
+
from ._tokenizer import tokenize
|
|
8
|
+
|
|
9
|
+
# UTF-8 byte-order mark; parse_bytes() checks for it at the start of its input.
_UTF8_BOM = b"\xef\xbb\xbf"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def parse(text: str) -> Document:
    """Turn XML source text into a ``Document``.

    The text is tokenized and the resulting tokens are handed straight to
    the tree builder.

    Args:
        text: The XML source.

    Returns:
        The document built from ``text``.

    Raises:
        ParseError: If the input is not well-formed XML.
    """
    return build_tree(tokenize(text))
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def parse_bytes(data: bytes) -> Document:
    """Decode UTF-8 XML bytes and parse them into a ``Document``.

    If ``data`` starts with the UTF-8 byte-order mark, the mark is removed
    before decoding and remembered on the returned document (``doc.bom``)
    so that it can be preserved on round-trip.

    Args:
        data: UTF-8 encoded XML, optionally prefixed with a BOM.

    Returns:
        The parsed document, with ``bom`` set to whether a BOM was present.

    Raises:
        ParseError: If the input is not well-formed XML.
        UnicodeDecodeError: If ``data`` is not valid UTF-8 (decoding is strict).
    """
    has_bom = data.startswith(_UTF8_BOM)
    payload = data[len(_UTF8_BOM):] if has_bom else data
    document = build_tree(tokenize(payload.decode("utf-8")))
    document.bom = has_bom
    return document
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def parse_file(path: str | Path) -> Document:
    """Parse the XML stored in the file at ``path``.

    The file is read as raw bytes and delegated to ``parse_bytes``.

    Args:
        path: Location of the file to read.

    Returns:
        The parsed document.

    Raises:
        ParseError: If the file contents are not well-formed XML.
    """
    return parse_bytes(Path(path).read_bytes())
|
xmlcst/_builder.py
ADDED
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
from ._errors import ParseError
|
|
6
|
+
from ._nodes import (
|
|
7
|
+
Attribute,
|
|
8
|
+
CData,
|
|
9
|
+
Comment,
|
|
10
|
+
Doctype,
|
|
11
|
+
Document,
|
|
12
|
+
Element,
|
|
13
|
+
Node,
|
|
14
|
+
ProcessingInstruction,
|
|
15
|
+
Text,
|
|
16
|
+
Whitespace,
|
|
17
|
+
XmlDeclaration,
|
|
18
|
+
)
|
|
19
|
+
from ._tokens import Token, TokenType
|
|
20
|
+
|
|
21
|
+
# Pseudo-attributes of an XML declaration: a mandatory ``version`` followed by
# an optional ``encoding`` and an optional ``standalone``, in that order.
# Each value must be closed by the quote character that opened it; the
# numbered backreferences (\1, \3, \5) point at the opening-quote groups.
# The values are exposed through the named groups.
_XMLDECL_RE = re.compile(
    r"""
\s+version\s*=\s*(['"])(?P<version>[^'"]*)\1
(?:\s+encoding\s*=\s*(['"])(?P<encoding>[^'"]*)\3)?
(?:\s+standalone\s*=\s*(['"])(?P<standalone>[^'"]*)\5)?
\s*
""",
    re.VERBOSE,
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def build_tree(tokens: list[Token]) -> Document:
    """Assemble a flat token list into a ``Document`` tree.

    The builder is a recursive-descent walk over ``tokens``.  A single cursor
    (``pos``) is shared by the nested helper closures, one per node kind, and
    each helper leaves the cursor just past the tokens it consumed.  Every
    node records the half-open ``(start, end)`` token span it was built from
    together with a reference to ``tokens``.

    Args:
        tokens: The tokens to assemble, in source order.

    Returns:
        A ``Document`` holding the top-level nodes, created with ``bom=False``.

    Raises:
        ParseError: If the token sequence is not well-formed: an unexpected
            token, an unterminated declaration/comment/PI/CDATA section, a
            malformed attribute, or a missing or mismatched end tag.  The
            error carries the line and column computed from the offending
            token's offset.
    """
    pos = 0
    length = len(tokens)
    text_types = (TokenType.Text, TokenType.EntityRef, TokenType.CharRef)

    def _cur() -> Token:
        return tokens[pos]

    def _at_end() -> bool:
        return pos >= length

    def _error(message: str, offset: int) -> ParseError:
        """Create a ParseError whose line/column are derived from ``offset``."""
        # Assumes the tokens are in source order and that ``Token.offset`` is
        # the 0-based character offset of the token's first character, which
        # is how the offsets are already reported through ParseError.offset.
        line = 1
        line_start = 0
        for tok in tokens:
            if tok.offset >= offset:
                break
            # Only the part of this token that precedes ``offset`` counts.
            before = tok.text[: offset - tok.offset]
            last_newline = before.rfind("\n")
            if last_newline != -1:
                line += before.count("\n")
                line_start = tok.offset + last_newline + 1
        return ParseError(message, line, offset - line_start + 1, offset)

    def _read_delimited(content_type, close_type, error_message: str) -> tuple[int, str]:
        """Consume ``open [content] close`` and return (start index, content)."""
        nonlocal pos
        start = pos
        pos += 1  # the opening token
        content = ""
        if not _at_end() and _cur().type == content_type:
            content = _cur().text
            pos += 1
        if _at_end() or _cur().type != close_type:
            raise _error(error_message, tokens[start].offset)
        pos += 1  # the closing token
        return start, content

    def _build_children() -> list[Node]:
        """Build sibling nodes until an end tag or the end of input."""
        children: list[Node] = []
        while not _at_end() and _cur().type != TokenType.EndTag:
            children.append(_build_node())
        return children

    def _build_node() -> Node:
        """Dispatch on the current token's type to the matching builder."""
        tok = _cur()
        builder = builders.get(tok.type)
        if builder is None:
            raise _error(f"unexpected token {tok.type.value}", tok.offset)
        return builder()

    def _build_xml_decl() -> XmlDeclaration:
        # The text between the open and close tokens arrives as a single
        # Whitespace-typed token; the regex extracts the pseudo-attributes.
        start, decl_body = _read_delimited(
            TokenType.Whitespace,
            TokenType.XmlDeclClose,
            "unterminated XML declaration",
        )
        raw_text = "".join(t.text for t in tokens[start:pos])

        # Defaults used when the declaration body does not match the pattern.
        version = "1.0"
        encoding = None
        standalone = None
        m = _XMLDECL_RE.match(decl_body)
        if m:
            version = m.group("version")
            encoding = m.group("encoding")
            standalone = m.group("standalone")

        return XmlDeclaration(
            version=version,
            encoding=encoding,
            standalone=standalone,
            raw_text=raw_text,
            token_span=(start, pos),
            tokens_ref=tokens,
        )

    def _build_doctype() -> Doctype:
        nonlocal pos
        start = pos
        raw_text = _cur().text
        pos += 1
        return Doctype(raw_text=raw_text, token_span=(start, pos), tokens_ref=tokens)

    def _build_comment() -> Comment:
        start, content = _read_delimited(
            TokenType.CommentContent, TokenType.CommentClose, "unterminated comment"
        )
        return Comment(content=content, token_span=(start, pos), tokens_ref=tokens)

    def _build_pi() -> ProcessingInstruction:
        # The target is the opening token's text minus its first two characters.
        target = _cur().text[2:]
        start, content = _read_delimited(
            TokenType.PiContent, TokenType.PiClose, "unterminated PI"
        )
        return ProcessingInstruction(
            target=target, content=content, token_span=(start, pos), tokens_ref=tokens
        )

    def _build_cdata() -> CData:
        start, content = _read_delimited(
            TokenType.CDataContent, TokenType.CDataClose, "unterminated CDATA"
        )
        return CData(content=content, token_span=(start, pos), tokens_ref=tokens)

    def _build_whitespace() -> Whitespace:
        nonlocal pos
        start = pos
        content = _cur().text
        pos += 1
        return Whitespace(content=content, token_span=(start, pos), tokens_ref=tokens)

    def _build_text() -> Text:
        """Merge a run of text / entity-ref / char-ref tokens into one node."""
        nonlocal pos
        start = pos
        parts: list[str] = []
        while not _at_end() and _cur().type in text_types:
            parts.append(_cur().text)
            pos += 1
        return Text(content="".join(parts), token_span=(start, pos), tokens_ref=tokens)

    def _build_element() -> Element:
        nonlocal pos
        start = pos
        # Tag name is the opening token's text minus its first character.
        tag = _cur().text[1:]
        pos += 1

        attrs: list[Attribute] = []
        self_closing = False
        empty_close_text = "/>"
        trailing_ws = ""

        while not _at_end():
            tok = _cur()
            tt = tok.type
            if tt == TokenType.StartTagClose:
                # NOTE(review): whitespace collected in ``trailing_ws`` is not
                # stored for a plain ">" close, only for "/>" -- confirm that
                # the close token or the token span preserves it.
                pos += 1
                break
            if tt == TokenType.EmptyTagClose:
                self_closing = True
                # Keep any whitespace that preceded "/>" with the closer.
                empty_close_text = trailing_ws + tok.text
                pos += 1
                break
            if tt == TokenType.Whitespace:
                pos += 1
                if not _at_end() and _cur().type == TokenType.AttrName:
                    # Whitespace directly before an attribute belongs to it.
                    attrs.append(_build_attribute(tok.text))
                    trailing_ws = ""
                else:
                    trailing_ws = tok.text
                continue
            if tt == TokenType.AttrName:
                attrs.append(_build_attribute(""))
                trailing_ws = ""
                continue
            raise _error(f"unexpected token in start tag: {tt.value}", tok.offset)

        # The end of the span is unknown until the element has been closed.
        elem = Element(tag=tag, token_span=(start, -1), tokens_ref=tokens)
        elem._self_closing = self_closing
        elem._empty_close_text = empty_close_text
        for attr in attrs:
            elem._attributes._append_parsed(attr)

        if not self_closing:
            elem._children = _build_children()

            if _at_end() or _cur().type != TokenType.EndTag:
                raise _error(f"missing end tag for <{tag}>", tokens[start].offset)

            end_tok = _cur()
            # Strip "</" and ">" plus any whitespace around the name.
            end_tag_name = end_tok.text[2:-1].strip()
            if end_tag_name != tag:
                raise _error(
                    f"mismatched tags: <{tag}> and </{end_tag_name}>", end_tok.offset
                )
            elem._end_tag_text = end_tok.text
            pos += 1

        elem._token_span = (start, pos)
        # Reset last, after all the assignments above have been made.
        elem._dirty = False
        return elem

    def _build_attribute(leading_ws: str) -> Attribute:
        """Consume ``name = quote value-parts quote`` at the cursor."""
        nonlocal pos
        name = _cur().text
        pos += 1

        if _at_end() or _cur().type != TokenType.AttrEq:
            raise _error(
                f"expected '=' after attribute {name!r}", tokens[pos - 1].offset
            )

        # The "=" token also carries the whitespace on either side of it.
        before_eq, _, after_eq = _cur().text.partition("=")
        pos += 1

        if _at_end() or _cur().type != TokenType.AttrValueOpen:
            raise _error(
                f"expected quote for attribute {name!r}", tokens[pos - 1].offset
            )
        quote = _cur().text
        pos += 1

        value_parts: list[str] = []
        while not _at_end() and _cur().type != TokenType.AttrValueClose:
            value_parts.append(_cur().text)
            pos += 1

        if _at_end():
            raise _error(
                f"unterminated attribute value for {name!r}", tokens[pos - 1].offset
            )
        pos += 1  # the closing quote

        return Attribute(
            name=name,
            raw_value="".join(value_parts),
            quote=quote,
            leading_whitespace=leading_ws,
            eq_whitespace=(before_eq, after_eq),
        )

    # Token type that can start a node -> builder for that node.
    builders = {
        TokenType.XmlDeclOpen: _build_xml_decl,
        TokenType.Doctype: _build_doctype,
        TokenType.CommentOpen: _build_comment,
        TokenType.PiOpen: _build_pi,
        TokenType.CDataOpen: _build_cdata,
        TokenType.StartTagOpen: _build_element,
        TokenType.Whitespace: _build_whitespace,
        TokenType.Text: _build_text,
        TokenType.EntityRef: _build_text,
        TokenType.CharRef: _build_text,
    }

    # Top level: every remaining token must start a node; a stray end tag has
    # no builder and is therefore reported as an unexpected token.
    children: list[Node] = []
    while not _at_end():
        children.append(_build_node())

    return Document(children=children, bom=False, tokens=tokens)
|
xmlcst/_entities.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
# The five entities predefined by XML, keyed by name (without "&" and ";").
_BUILTIN_ENTITIES: dict[str, str] = {
    "amp": "&",
    "lt": "<",
    "gt": ">",
    "apos": "'",
    "quot": '"',
}

# A single reference: hexadecimal character reference (group 1), decimal
# character reference (group 2) or named entity reference (group 3).
_REF_RE = re.compile(r"&(?:#x([0-9a-fA-F]+)|#([0-9]+)|([a-zA-Z_][\w.-]*));")


def decode_references(raw: str) -> str:
    """Replace character and predefined entity references with their text.

    Numeric references (``&#65;``, ``&#x41;``) become the character with that
    code point and the five predefined entities become their character.
    Anything that cannot be resolved -- an unknown entity name or a numeric
    reference outside the Unicode range -- is left in the output verbatim.

    Args:
        raw: Text that may contain references.

    Returns:
        ``raw`` with every resolvable reference decoded.
    """

    def _replace(m: re.Match[str]) -> str:
        hex_digits, dec_digits, name = m.groups()
        if name is not None:
            return _BUILTIN_ENTITIES.get(name, m.group(0))
        try:
            if hex_digits is not None:
                return chr(int(hex_digits, 16))
            return chr(int(dec_digits))
        except (ValueError, OverflowError):
            # Code point beyond U+10FFFF (or too many digits to convert):
            # keep the reference as written instead of aborting the decode.
            return m.group(0)

    return _REF_RE.sub(_replace, raw)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def escape_for_attr(decoded: str, quote: str) -> str:
    """Escape decoded text so it can be written inside an attribute value.

    ``&`` and ``<`` are always replaced by ``&amp;`` and ``&lt;``.  Only the
    quote character that delimits the value is escaped (``"`` as ``&quot;``
    or ``'`` as ``&apos;``); the other quote character is left alone.

    Args:
        decoded: The attribute value as plain text.
        quote: The delimiter in use, ``'"'`` or ``"'"``.  Any other value
            means no quote character is escaped.

    Returns:
        The escaped text.
    """
    replacements = {"&": "&amp;", "<": "&lt;"}
    if quote == '"':
        replacements['"'] = "&quot;"
    elif quote == "'":
        replacements["'"] = "&apos;"
    # One pass over the input, so the "&" that starts an inserted entity
    # reference can never be escaped a second time.
    return decoded.translate(str.maketrans(replacements))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def escape_for_text(decoded: str) -> str:
    """Escape decoded text so it can be written as element character data.

    ``&`` and ``<`` are replaced by ``&amp;`` and ``&lt;``.  ``>`` is left
    alone except in the sequence ``]]>``, which may not appear literally in
    character data and is therefore written as ``]]&gt;``.

    Args:
        decoded: The text content as plain text.

    Returns:
        The escaped text.
    """
    # "&" must be handled first so the entities inserted afterwards are not
    # escaped again.
    result = decoded.replace("&", "&amp;")
    result = result.replace("<", "&lt;")
    return result.replace("]]>", "]]&gt;")
|
xmlcst/_errors.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class ParseError(Exception):
    """Raised when the XML source is malformed.

    The exception's string form is ``"<message> at line <line>, column
    <column>"``.

    Attributes:
        message: Human-readable description of the error.
        line: 1-based line number where the error was detected.
        column: 1-based column number where the error was detected.
        offset: 0-based character offset from the start of the source.
    """

    def __init__(self, message: str, line: int, column: int, offset: int) -> None:
        self.message = message
        self.line = line
        self.column = column
        self.offset = offset
        super().__init__(f"{message} at line {line}, column {column}")

    def __reduce__(self) -> tuple:
        """Support ``copy`` and ``pickle``.

        The default implementation re-creates the exception from
        ``self.args``, which holds only the formatted string, and so fails
        because ``__init__`` requires four arguments.  Rebuild from the
        original arguments instead.
        """
        return (
            type(self),
            (self.message, self.line, self.column, self.offset),
            self.__dict__,
        )
|