xmlcst 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xmlcst/__init__.py ADDED
@@ -0,0 +1,42 @@
1
+ """Full-fidelity XML concrete syntax tree -- parse, edit, and serialize with zero formatting loss."""
2
+
3
+ from ._api import parse, parse_bytes, parse_file
4
+ from ._errors import ParseError
5
+ from ._nodes import (
6
+ Attribute,
7
+ AttributeList,
8
+ CData,
9
+ Comment,
10
+ Doctype,
11
+ Document,
12
+ Element,
13
+ Node,
14
+ ProcessingInstruction,
15
+ SerializationMode,
16
+ Text,
17
+ Whitespace,
18
+ XmlDeclaration,
19
+ )
20
+ from ._tokens import Token, TokenType
21
+
22
+ __all__ = [
23
+ "parse",
24
+ "parse_bytes",
25
+ "parse_file",
26
+ "ParseError",
27
+ "Attribute",
28
+ "AttributeList",
29
+ "CData",
30
+ "Comment",
31
+ "Doctype",
32
+ "Document",
33
+ "Element",
34
+ "Node",
35
+ "ProcessingInstruction",
36
+ "SerializationMode",
37
+ "Text",
38
+ "Token",
39
+ "TokenType",
40
+ "Whitespace",
41
+ "XmlDeclaration",
42
+ ]
xmlcst/_api.py ADDED
@@ -0,0 +1,47 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ from ._builder import build_tree
6
+ from ._nodes import Document
7
+ from ._tokenizer import tokenize
8
+
9
+ _UTF8_BOM = b"\xef\xbb\xbf"
10
+
11
+
12
def parse(text: str) -> Document:
    """Build a Document from XML source text.

    The text is tokenized and the resulting token list is handed to the
    tree builder.

    Raises:
        ParseError: If the input is not well-formed XML.
    """
    return build_tree(tokenize(text))
20
+
21
+
22
def parse_bytes(data: bytes) -> Document:
    """Decode UTF-8 XML bytes and parse them into a Document.

    A leading UTF-8 byte-order mark is stripped before decoding, and its
    presence is recorded on the returned document's ``bom`` attribute.

    Raises:
        ParseError: If the input is not well-formed XML.
        UnicodeDecodeError: If the bytes (after any BOM) are not valid UTF-8.
    """
    has_bom = data.startswith(_UTF8_BOM)
    # Drop exactly the BOM prefix; everything after it is the XML payload.
    payload = data[len(_UTF8_BOM):] if has_bom else data
    document = build_tree(tokenize(payload.decode("utf-8")))
    document.bom = has_bom
    return document
38
+
39
+
40
def parse_file(path: str | Path) -> Document:
    """Load the file at *path* as raw bytes and parse it as XML.

    The bytes are passed to ``parse_bytes``, so the file is read in binary
    mode rather than through a text decoder.

    Raises:
        ParseError: If the file contents are not well-formed XML.
    """
    source = Path(path)
    return parse_bytes(source.read_bytes())
xmlcst/_builder.py ADDED
@@ -0,0 +1,319 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+ from ._errors import ParseError
6
+ from ._nodes import (
7
+ Attribute,
8
+ CData,
9
+ Comment,
10
+ Doctype,
11
+ Document,
12
+ Element,
13
+ Node,
14
+ ProcessingInstruction,
15
+ Text,
16
+ Whitespace,
17
+ XmlDeclaration,
18
+ )
19
+ from ._tokens import Token, TokenType
20
+
21
+ _XMLDECL_RE = re.compile(
22
+ r"""
23
+ \s+version\s*=\s*(['"])(?P<version>[^'"]*)\1
24
+ (?:\s+encoding\s*=\s*(['"])(?P<encoding>[^'"]*)\3)?
25
+ (?:\s+standalone\s*=\s*(['"])(?P<standalone>[^'"]*)\5)?
26
+ \s*
27
+ """,
28
+ re.VERBOSE,
29
+ )
30
+
31
+
32
def build_tree(tokens: list[Token]) -> Document:
    """Assemble a flat token list into a Document tree by recursive descent.

    Every node that is created keeps a reference to *tokens* together with
    the ``(start, end)`` index span of the tokens it was built from.

    Args:
        tokens: Token sequence to consume, in source order.

    Returns:
        A Document holding the top-level nodes. It is always created with
        ``bom=False``.

    Raises:
        ParseError: If the token sequence does not form a valid tree
            (unexpected token, unterminated construct, missing or
            mismatched end tag, malformed attribute).
    """
    pos = 0
    length = len(tokens)

    def _cur() -> Token:
        return tokens[pos]

    def _at_end() -> bool:
        return pos >= length

    def _error(message: str, index: int) -> ParseError:
        """Create a ParseError positioned at the token ``tokens[index]``."""
        # Line and column are derived from the text of all preceding tokens.
        # NOTE(review): this assumes the token stream is contiguous and
        # lossless (token texts concatenate back to the source) -- confirm
        # against the tokenizer.
        preceding = "".join(t.text for t in tokens[:index])
        line = preceding.count("\n") + 1
        # rfind() yields -1 when there is no newline, which makes the
        # expression below a 1-based column in both cases.
        column = len(preceding) - preceding.rfind("\n")
        return ParseError(message, line, column, tokens[index].offset)

    def _build_children(_parent_tag: str | None) -> tuple[list[Node], int]:
        nonlocal pos
        children: list[Node] = []
        while not _at_end():
            tt = _cur().type
            # An end tag closes the enclosing element; the caller validates it.
            if tt == TokenType.EndTag:
                break
            node, _end_pos = _build_node()
            if node is not None:
                children.append(node)
        return children, pos

    def _build_node() -> tuple[Node | None, int]:
        nonlocal pos
        tok = _cur()
        tt = tok.type

        if tt == TokenType.XmlDeclOpen:
            return _build_xml_decl()
        if tt == TokenType.Doctype:
            return _build_doctype()
        if tt == TokenType.CommentOpen:
            return _build_comment()
        if tt == TokenType.PiOpen:
            return _build_pi()
        if tt == TokenType.CDataOpen:
            return _build_cdata()
        if tt == TokenType.StartTagOpen:
            return _build_element()
        if tt == TokenType.Whitespace:
            return _build_whitespace()
        if tt in (TokenType.Text, TokenType.EntityRef, TokenType.CharRef):
            return _build_text()
        raise _error(f"unexpected token {tt.value}", pos)

    def _build_xml_decl() -> tuple[XmlDeclaration, int]:
        nonlocal pos
        start = pos
        pos += 1

        # The declaration body is read from a single Whitespace-typed token
        # sitting between the open and close tokens.
        ws_content = ""
        if not _at_end() and _cur().type == TokenType.Whitespace:
            ws_content = _cur().text
            pos += 1

        if _at_end() or _cur().type != TokenType.XmlDeclClose:
            raise _error("unterminated XML declaration", start)
        pos += 1

        raw_text = "".join(t.text for t in tokens[start:pos])

        # Defaults used when the body does not match the expected layout.
        version = "1.0"
        encoding = None
        standalone = None

        m = _XMLDECL_RE.match(ws_content)
        if m:
            version = m.group("version")
            encoding = m.group("encoding")
            standalone = m.group("standalone")

        node = XmlDeclaration(
            version=version,
            encoding=encoding,
            standalone=standalone,
            raw_text=raw_text,
            token_span=(start, pos),
            tokens_ref=tokens,
        )
        return node, pos

    def _build_doctype() -> tuple[Doctype, int]:
        nonlocal pos
        start = pos
        raw_text = _cur().text
        pos += 1
        node = Doctype(raw_text=raw_text, token_span=(start, pos), tokens_ref=tokens)
        return node, pos

    def _build_comment() -> tuple[Comment, int]:
        nonlocal pos
        start = pos
        pos += 1
        content = ""
        if not _at_end() and _cur().type == TokenType.CommentContent:
            content = _cur().text
            pos += 1
        if _at_end() or _cur().type != TokenType.CommentClose:
            raise _error("unterminated comment", start)
        pos += 1
        node = Comment(content=content, token_span=(start, pos), tokens_ref=tokens)
        return node, pos

    def _build_pi() -> tuple[ProcessingInstruction, int]:
        nonlocal pos
        start = pos
        open_text = _cur().text
        # Drop the two leading characters of the opening token to get the target.
        target = open_text[2:]
        pos += 1
        content = ""
        if not _at_end() and _cur().type == TokenType.PiContent:
            content = _cur().text
            pos += 1
        if _at_end() or _cur().type != TokenType.PiClose:
            raise _error("unterminated PI", start)
        pos += 1
        node = ProcessingInstruction(
            target=target, content=content, token_span=(start, pos), tokens_ref=tokens
        )
        return node, pos

    def _build_cdata() -> tuple[CData, int]:
        nonlocal pos
        start = pos
        pos += 1
        content = ""
        if not _at_end() and _cur().type == TokenType.CDataContent:
            content = _cur().text
            pos += 1
        if _at_end() or _cur().type != TokenType.CDataClose:
            raise _error("unterminated CDATA", start)
        pos += 1
        node = CData(content=content, token_span=(start, pos), tokens_ref=tokens)
        return node, pos

    def _build_whitespace() -> tuple[Whitespace, int]:
        nonlocal pos
        start = pos
        content = _cur().text
        pos += 1
        node = Whitespace(content=content, token_span=(start, pos), tokens_ref=tokens)
        return node, pos

    def _build_text() -> tuple[Text, int]:
        nonlocal pos
        start = pos
        parts: list[str] = []
        # Adjacent text, entity-reference and char-reference tokens are
        # merged into one Text node holding their raw source text.
        while not _at_end() and _cur().type in (
            TokenType.Text,
            TokenType.EntityRef,
            TokenType.CharRef,
        ):
            parts.append(_cur().text)
            pos += 1
        content = "".join(parts)
        node = Text(content=content, token_span=(start, pos), tokens_ref=tokens)
        return node, pos

    def _build_element() -> tuple[Element, int]:
        nonlocal pos
        start = pos
        open_tok = _cur()
        # Drop the first character of the opening token to get the tag name.
        tag = open_tok.text[1:]
        pos += 1

        attrs: list[Attribute] = []
        self_closing = False
        empty_close_text = "/>"
        trailing_ws = ""

        while not _at_end():
            tt = _cur().type
            if tt == TokenType.StartTagClose:
                # NOTE(review): whitespace seen just before a plain '>' is not
                # stored on the element here -- presumably it is recovered
                # from the token span; confirm.
                pos += 1
                break
            if tt == TokenType.EmptyTagClose:
                self_closing = True
                # Keep any whitespace that preceded the empty-tag close.
                empty_close_text = trailing_ws + _cur().text
                pos += 1
                break
            if tt == TokenType.Whitespace:
                ws_text = _cur().text
                pos += 1
                if not _at_end() and _cur().type == TokenType.AttrName:
                    # Whitespace followed by a name belongs to that attribute.
                    attr = _build_attribute(ws_text)
                    attrs.append(attr)
                    trailing_ws = ""
                else:
                    trailing_ws = ws_text
                continue
            if tt == TokenType.AttrName:
                attr = _build_attribute("")
                attrs.append(attr)
                trailing_ws = ""
                continue
            raise _error(f"unexpected token in start tag: {tt.value}", pos)

        # The span end is unknown until the end tag has been consumed.
        elem = Element(tag=tag, token_span=(start, -1), tokens_ref=tokens)
        elem._self_closing = self_closing
        elem._empty_close_text = empty_close_text
        for attr in attrs:
            elem._attributes._append_parsed(attr)

        if not self_closing:
            children, _ = _build_children(tag)
            elem._children = children

            if _at_end() or _cur().type != TokenType.EndTag:
                raise _error(f"missing end tag for <{tag}>", start)

            end_tok = _cur()
            # Strip two leading characters and one trailing character from
            # the end-tag token, then any surrounding whitespace.
            end_tag_name = end_tok.text[2:-1].strip()
            if end_tag_name != tag:
                raise _error(
                    f"mismatched tags: <{tag}> and </{end_tag_name}>", pos
                )
            elem._end_tag_text = end_tok.text
            pos += 1

        elem._token_span = (start, pos)
        elem._dirty = False
        return elem, pos

    def _build_attribute(leading_ws: str) -> Attribute:
        nonlocal pos
        name = _cur().text
        pos += 1

        if _at_end() or _cur().type != TokenType.AttrEq:
            raise _error(f"expected '=' after attribute {name!r}", pos - 1)

        eq_text = _cur().text
        pos += 1
        # Split the token around '=' to keep the whitespace on either side.
        idx = eq_text.index("=")
        before_eq = eq_text[:idx]
        after_eq = eq_text[idx + 1 :]
        eq_whitespace = (before_eq, after_eq)

        if _at_end() or _cur().type != TokenType.AttrValueOpen:
            raise _error(f"expected quote for attribute {name!r}", pos - 1)
        quote = _cur().text
        pos += 1

        # Everything up to the closing quote token is the raw value text.
        value_parts: list[str] = []
        while not _at_end() and _cur().type != TokenType.AttrValueClose:
            value_parts.append(_cur().text)
            pos += 1

        if _at_end():
            raise _error(f"unterminated attribute value for {name!r}", pos - 1)
        pos += 1

        raw_value = "".join(value_parts)

        return Attribute(
            name=name,
            raw_value=raw_value,
            quote=quote,
            leading_whitespace=leading_ws,
            eq_whitespace=eq_whitespace,
        )

    children: list[Node] = []
    while not _at_end():
        node, _ = _build_node()
        if node is not None:
            children.append(node)

    return Document(children=children, bom=False, tokens=tokens)
xmlcst/_entities.py ADDED
@@ -0,0 +1,41 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
# The five entities predefined by XML, keyed by name without '&' and ';'.
_BUILTIN_ENTITIES: dict[str, str] = {
    "amp": "&",
    "lt": "<",
    "gt": ">",
    "apos": "'",
    "quot": '"',
}

# Group 1: hex digits of "&#x...;", group 2: decimal digits of "&#...;",
# group 3: the name of a named reference "&name;".
_REF_RE = re.compile(r"&(?:#x([0-9a-fA-F]+)|#([0-9]+)|([a-zA-Z_][\w.-]*));")


def decode_references(raw: str) -> str:
    """Replace character and predefined entity references in *raw*.

    Numeric references (``&#65;``, ``&#x41;``) become the character with
    that code point; the five predefined entities become their character.
    Any reference that cannot be resolved -- an unknown entity name, or a
    numeric value that is not a valid code point -- is left exactly as
    written.

    Args:
        raw: Text that may contain ``&...;`` references.

    Returns:
        The text with every resolvable reference replaced.
    """

    def _replace(m: re.Match[str]) -> str:
        hex_digits, dec_digits, name = m.groups()
        if name is not None:
            return _BUILTIN_ENTITIES.get(name, m.group(0))
        try:
            if hex_digits is not None:
                return chr(int(hex_digits, 16))
            return chr(int(dec_digits))
        except (ValueError, OverflowError):
            # chr() rejects values above U+10FFFF and int() may reject very
            # long digit strings; keep the reference text rather than crash.
            return m.group(0)

    return _REF_RE.sub(_replace, raw)
26
+
27
+
28
def escape_for_attr(decoded: str, quote: str) -> str:
    """Escape *decoded* for use inside an attribute value delimited by *quote*.

    ``&`` and ``<`` are always escaped. The delimiter itself is escaped only
    when it is ``"`` or ``'``; any other *quote* value adds no further
    escaping.

    Args:
        decoded: Attribute value with references already resolved.
        quote: The quote character that will surround the value.

    Returns:
        The escaped attribute text.
    """
    # '&' must be handled first so the entities added later stay intact.
    substitutions = [("&", "&amp;"), ("<", "&lt;")]
    quote_entity = {'"': "&quot;", "'": "&apos;"}.get(quote)
    if quote_entity is not None:
        substitutions.append((quote, quote_entity))

    escaped = decoded
    for needle, entity in substitutions:
        escaped = escaped.replace(needle, entity)
    return escaped
36
+
37
+
38
def escape_for_text(decoded: str) -> str:
    """Escape *decoded* for use as element character data.

    ``&`` and ``<`` are always escaped. ``>`` is left alone except where it
    would complete the sequence ``]]>``, which XML does not allow to appear
    literally in character data; there it is written as ``&gt;``.

    Args:
        decoded: Text content with references already resolved.

    Returns:
        The escaped text.
    """
    # '&' must be handled first so the entities added later stay intact.
    result = decoded.replace("&", "&amp;")
    result = result.replace("<", "&lt;")
    return result.replace("]]>", "]]&gt;")
xmlcst/_errors.py ADDED
@@ -0,0 +1,19 @@
1
+ from __future__ import annotations
2
+
3
+
4
class ParseError(Exception):
    """Raised when the XML source is malformed.

    Attributes:
        message: Human-readable description of the error.
        line: 1-based line number where the error was detected.
        column: 1-based column number where the error was detected.
        offset: 0-based character offset from the start of the source.
    """

    def __init__(self, message: str, line: int, column: int, offset: int) -> None:
        self.message = message
        self.line = line
        self.column = column
        self.offset = offset
        super().__init__(f"{message} at line {line}, column {column}")

    def __reduce__(self) -> tuple[object, ...]:
        """Describe how to rebuild this error for ``copy`` and ``pickle``.

        The default exception reduction re-calls the constructor with
        ``self.args``, which holds only the formatted message and so does
        not match the four-argument ``__init__``. Supplying the original
        arguments lets copies and unpickled instances be reconstructed.
        """
        return (type(self), (self.message, self.line, self.column, self.offset))