justhtml 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
justhtml/entities.py ADDED
@@ -0,0 +1,342 @@
1
+ """HTML5 character entity decoding.
2
+
3
+ Implements HTML5 character reference (entity) decoding per WHATWG spec §13.2.5.
4
+ Supports both named entities (&amp;, &nbsp;) and numeric references (&#60;, &#x3C;).
5
+ """
6
+
7
+ import html.entities
8
+
9
# Python's html.entities.html5 table maps HTML5 named character references to
# their replacement text. Its keys normally carry the trailing semicolon
# (e.g. "amp;", "lang;"); some names are also listed without it.
_HTML5_ENTITIES = html.entities.html5

# Normalized lookup keyed by the bare entity name (trailing ";" removed) so
# that "&amp;" and "&amp" resolve through the same key. Built with a
# comprehension so no loop variables are left behind in the module namespace.
NAMED_ENTITIES = {
    (key[:-1] if key.endswith(";") else key): value
    for key, value in _HTML5_ENTITIES.items()
}
22
+
23
# Named character references that may be written without the trailing
# semicolon. Per the HTML5 spec these are primarily the ISO-8859-1 (Latin-1)
# entities inherited from HTML4; newer names such as "prod" or "notin" always
# require the semicolon. Several names exist in both cases (COPY/copy, GT/gt).
LEGACY_ENTITIES = set(
    """
    gt lt amp quot nbsp AMP QUOT GT LT COPY REG
    AElig Aacute Acirc Agrave Aring Atilde Auml Ccedil ETH
    Eacute Ecirc Egrave Euml Iacute Icirc Igrave Iuml Ntilde
    Oacute Ocirc Ograve Oslash Otilde Ouml THORN
    Uacute Ucirc Ugrave Uuml Yacute
    aacute acirc acute aelig agrave aring atilde auml
    brvbar ccedil cedil cent copy curren deg divide
    eacute ecirc egrave eth euml frac12 frac14 frac34
    iacute icirc iexcl igrave iquest iuml laquo
    macr micro middot not ntilde
    oacute ocirc ograve ordf ordm oslash otilde ouml
    para plusmn pound raquo reg sect shy sup1 sup2 sup3 szlig
    thorn times uacute ucirc ugrave uml uuml yacute yen yuml
    """.split()
)
135
+
136
# HTML5 numeric character reference replacements: references to these code
# points decode to the mapped character instead of the code point itself.
# Each pair below is (referenced code point, replacement code point).
NUMERIC_REPLACEMENTS = {
    source: chr(target)
    for source, target in (
        (0x00, 0xFFFD),  # NULL -> REPLACEMENT CHARACTER
        (0x80, 0x20AC),  # EURO SIGN
        (0x82, 0x201A),  # SINGLE LOW-9 QUOTATION MARK
        (0x83, 0x0192),  # LATIN SMALL LETTER F WITH HOOK
        (0x84, 0x201E),  # DOUBLE LOW-9 QUOTATION MARK
        (0x85, 0x2026),  # HORIZONTAL ELLIPSIS
        (0x86, 0x2020),  # DAGGER
        (0x87, 0x2021),  # DOUBLE DAGGER
        (0x88, 0x02C6),  # MODIFIER LETTER CIRCUMFLEX ACCENT
        (0x89, 0x2030),  # PER MILLE SIGN
        (0x8A, 0x0160),  # LATIN CAPITAL LETTER S WITH CARON
        (0x8B, 0x2039),  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        (0x8C, 0x0152),  # LATIN CAPITAL LIGATURE OE
        (0x8E, 0x017D),  # LATIN CAPITAL LETTER Z WITH CARON
        (0x91, 0x2018),  # LEFT SINGLE QUOTATION MARK
        (0x92, 0x2019),  # RIGHT SINGLE QUOTATION MARK
        (0x93, 0x201C),  # LEFT DOUBLE QUOTATION MARK
        (0x94, 0x201D),  # RIGHT DOUBLE QUOTATION MARK
        (0x95, 0x2022),  # BULLET
        (0x96, 0x2013),  # EN DASH
        (0x97, 0x2014),  # EM DASH
        (0x98, 0x02DC),  # SMALL TILDE
        (0x99, 0x2122),  # TRADE MARK SIGN
        (0x9A, 0x0161),  # LATIN SMALL LETTER S WITH CARON
        (0x9B, 0x203A),  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        (0x9C, 0x0153),  # LATIN SMALL LIGATURE OE
        (0x9E, 0x017E),  # LATIN SMALL LETTER Z WITH CARON
        (0x9F, 0x0178),  # LATIN CAPITAL LETTER Y WITH DIAERESIS
    )
}
167
+
168
+
169
def decode_numeric_entity(text, is_hex=False):
    """Decode the digits of a numeric character reference like &#60; or &#x3C;.

    Args:
        text: The numeric part only (without the leading "&#" / "&#x" and
            without the trailing ";").
        is_hex: Whether the digits are hexadecimal (&#x...) or decimal (&#...).

    Returns:
        The decoded character. Code points listed in NUMERIC_REPLACEMENTS are
        substituted; values above U+10FFFF and surrogates (U+D800-U+DFFF)
        decode to U+FFFD REPLACEMENT CHARACTER.

    Raises:
        ValueError: If text is empty or is not a number in the selected base.
    """
    if is_hex:
        base, max_digits = 16, 6  # 0x10FFFF has 6 hex digits
    else:
        base, max_digits = 10, 7  # 0x10FFFF == 1114111 has 7 decimal digits

    # More significant digits than U+10FFFF has means the value is out of
    # range no matter what the digits are. Answer without parsing: int()
    # raises ValueError on very long decimal strings (CPython's integer string
    # conversion limit), which would otherwise crash on hostile input.
    significant = text.lstrip("0")
    if len(significant) > max_digits:
        return "\ufffd"

    try:
        codepoint = int(text, base)
    except ValueError:
        # Either the text is malformed (the retry raises again), or a long run
        # of leading zeros tripped the conversion limit; retry on the
        # significant digits. text[:1] stands in for an all-zero input.
        codepoint = int(significant or text[:1], base)

    # Apply HTML5 replacements for specific code points
    if codepoint in NUMERIC_REPLACEMENTS:
        return NUMERIC_REPLACEMENTS[codepoint]

    # Out-of-range values and surrogates are not valid characters
    if codepoint > 0x10FFFF or 0xD800 <= codepoint <= 0xDFFF:
        return "\ufffd"

    return chr(codepoint)
193
+
194
+
195
# Character classes used while scanning references. The HTML syntax is defined
# in terms of ASCII digits/letters, so str.isdigit()/isalpha()/isalnum() (which
# also accept non-ASCII characters such as "²" or "é") must not be used.
_ASCII_DIGITS = frozenset("0123456789")
_ASCII_HEX_DIGITS = frozenset("0123456789abcdefABCDEF")
_ASCII_ALPHANUMERICS = frozenset(
    "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
)


def _longest_legacy_prefix(entity_name):
    """Return the longest prefix of entity_name that is a legacy entity, or None."""
    for end in range(len(entity_name), 0, -1):
        prefix = entity_name[:end]
        if prefix in LEGACY_ENTITIES and prefix in NAMED_ENTITIES:
            return prefix
    return None


def decode_entities_in_text(text, in_attribute=False):
    """Decode all HTML character references in text.

    Handles:
    - Named entities: &amp; &lt; &gt; &quot; &nbsp; etc.
    - Decimal numeric: &#60; &#160; etc.
    - Hex numeric: &#x3C; &#xA0; etc.

    Legacy entities (LEGACY_ENTITIES) are also recognized without a trailing
    semicolon, including as the longest matching prefix of a longer run of
    ASCII alphanumerics (e.g. "&notit" decodes "&not" and keeps "it").
    Anything that is not a recognizable reference is copied through verbatim.

    Args:
        text: Input text potentially containing entities.
        in_attribute: Whether text is an attribute value. In attributes a
            legacy entity without semicolon is left undecoded when it is
            followed by "=" or an ASCII alphanumeric.

    Returns:
        Text with entities decoded.
    """
    result = []
    i = 0
    length = len(text)
    while i < length:
        next_amp = text.find("&", i)
        if next_amp == -1:
            # No more references: copy the tail and stop
            result.append(text[i:])
            break

        if next_amp > i:
            result.append(text[i:next_amp])

        i = next_amp
        j = i + 1

        # Numeric reference: "&#" digits or "&#x" hex digits
        if j < length and text[j] == "#":
            j += 1
            is_hex = j < length and text[j] in "xX"
            if is_hex:
                j += 1

            # ASCII digits only: a character like "²" satisfies str.isdigit()
            # but cannot be parsed by int() and is not part of a reference.
            allowed = _ASCII_HEX_DIGITS if is_hex else _ASCII_DIGITS
            digit_start = j
            while j < length and text[j] in allowed:
                j += 1

            digit_text = text[digit_start:j]
            # A directly following ";" belongs to the reference
            end = j + 1 if j < length and text[j] == ";" else j

            if digit_text:
                result.append(decode_numeric_entity(digit_text, is_hex=is_hex))
            else:
                # "&#" / "&#x" without digits is not a reference, keep as-is
                result.append(text[i:end])
            i = end
            continue

        # Named reference: names are case-sensitive ASCII alphanumerics
        while j < length and text[j] in _ASCII_ALPHANUMERICS:
            j += 1

        entity_name = text[i + 1 : j]
        has_semicolon = j < length and text[j] == ";"

        if not entity_name:
            result.append("&")
            i += 1
            continue

        # Exact match terminated by a semicolon
        if has_semicolon and entity_name in NAMED_ENTITIES:
            result.append(NAMED_ENTITIES[entity_name])
            i = j + 1
            continue

        # Exact legacy name without a semicolon
        if entity_name in LEGACY_ENTITIES and entity_name in NAMED_ENTITIES:
            # In attributes, do not decode when "=" follows. (The scan above
            # stopped at text[j], so it cannot be an ASCII alphanumeric.)
            if in_attribute and j < length and text[j] == "=":
                result.append("&")
                i += 1
                continue

            result.append(NAMED_ENTITIES[entity_name])
            i = j
            continue

        # Longest legacy prefix, e.g. "&notit" where only "&not" is an entity
        prefix = _longest_legacy_prefix(entity_name)
        if prefix is not None:
            if in_attribute:
                # The character after the prefix is an ASCII alphanumeric
                # (it is part of entity_name), so attributes never decode here.
                result.append("&")
                i += 1
                continue

            result.append(NAMED_ENTITIES[prefix])
            i = i + 1 + len(prefix)
            continue

        # No match found: keep the text unchanged
        if has_semicolon:
            result.append(text[i : j + 1])
            i = j + 1
        else:
            result.append("&")
            i += 1

    return "".join(result)
justhtml/errors.py ADDED
@@ -0,0 +1,138 @@
1
+ """Centralized error message definitions and helpers for HTML parsing errors.
2
+
3
+ This module provides human-readable error messages for all parse error codes
4
+ emitted by both the tokenizer and tree builder during HTML parsing.
5
+ """
6
+
7
+
8
# Message templates keyed by parse error code. Built once at import time
# instead of on every call. Templates that mention the offending tag contain a
# "{tag_name}" placeholder; every template is passed through str.format(), so
# a literal brace in a future message must be doubled ("{{" / "}}").
_ERROR_MESSAGE_TEMPLATES = {
    # ================================================================
    # TOKENIZER ERRORS
    # ================================================================
    # DOCTYPE errors
    "eof-in-doctype": "Unexpected end of file in DOCTYPE declaration",
    "eof-in-doctype-name": "Unexpected end of file while reading DOCTYPE name",
    "eof-in-doctype-public-identifier": "Unexpected end of file in DOCTYPE public identifier",
    "eof-in-doctype-system-identifier": "Unexpected end of file in DOCTYPE system identifier",
    "expected-doctype-name-but-got-right-bracket": "Expected DOCTYPE name but got >",
    "missing-whitespace-before-doctype-name": "Missing whitespace after <!DOCTYPE",
    "abrupt-doctype-public-identifier": "DOCTYPE public identifier ended abruptly",
    "abrupt-doctype-system-identifier": "DOCTYPE system identifier ended abruptly",
    "missing-quote-before-doctype-public-identifier": "Missing quote before DOCTYPE public identifier",
    "missing-quote-before-doctype-system-identifier": "Missing quote before DOCTYPE system identifier",
    "missing-doctype-public-identifier": "Missing DOCTYPE public identifier",
    "missing-doctype-system-identifier": "Missing DOCTYPE system identifier",
    "missing-whitespace-before-doctype-public-identifier": "Missing whitespace before DOCTYPE public identifier",
    "missing-whitespace-after-doctype-public-identifier": "Missing whitespace after DOCTYPE public identifier",
    "missing-whitespace-between-doctype-public-and-system-identifiers": "Missing whitespace between DOCTYPE identifiers",
    "missing-whitespace-after-doctype-name": "Missing whitespace after DOCTYPE name",
    "unexpected-character-after-doctype-public-keyword": "Unexpected character after PUBLIC keyword",
    "unexpected-character-after-doctype-system-keyword": "Unexpected character after SYSTEM keyword",
    "unexpected-character-after-doctype-public-identifier": "Unexpected character after public identifier",
    "unexpected-character-after-doctype-system-identifier": "Unexpected character after system identifier",
    # Comment errors
    "eof-in-comment": "Unexpected end of file in comment",
    "abrupt-closing-of-empty-comment": "Comment ended abruptly with -->",
    "incorrectly-closed-comment": "Comment ended with --!> instead of -->",
    # Tag errors
    "eof-in-tag": "Unexpected end of file in tag",
    "eof-before-tag-name": "Unexpected end of file before tag name",
    "empty-end-tag": "Empty end tag </> is not allowed",
    "invalid-first-character-of-tag-name": "Invalid first character of tag name",
    "unexpected-question-mark-instead-of-tag-name": "Unexpected ? instead of tag name",
    "unexpected-character-after-solidus-in-tag": "Unexpected character after / in tag",
    # Attribute errors
    "duplicate-attribute": "Duplicate attribute name",
    "missing-attribute-value": "Missing attribute value",
    "unexpected-character-in-attribute-name": "Unexpected character in attribute name",
    "unexpected-character-in-unquoted-attribute-value": "Unexpected character in unquoted attribute value",
    "missing-whitespace-between-attributes": "Missing whitespace between attributes",
    "unexpected-equals-sign-before-attribute-name": "Unexpected = before attribute name",
    # Script errors
    "eof-in-script-html-comment-like-text": "Unexpected end of file in script with HTML-like comment",
    "eof-in-script-in-script": "Unexpected end of file in nested script tag",
    # CDATA errors
    "eof-in-cdata": "Unexpected end of file in CDATA section",
    "cdata-in-html-content": "CDATA section only allowed in SVG/MathML content",
    # NULL character errors
    "unexpected-null-character": "Unexpected NULL character (U+0000)",
    # Markup declaration errors
    "incorrectly-opened-comment": "Incorrectly opened comment",
    # Character reference errors
    "control-character-reference": "Invalid control character in character reference",
    "illegal-codepoint-for-numeric-entity": "Invalid codepoint in numeric character reference",
    "missing-semicolon-after-character-reference": "Missing semicolon after character reference",
    "named-entity-without-semicolon": "Named entity used without semicolon",
    # ================================================================
    # TREE BUILDER ERRORS
    # ================================================================
    # DOCTYPE errors
    "unexpected-doctype": "Unexpected DOCTYPE declaration",
    "unknown-doctype": "Unknown DOCTYPE (expected <!DOCTYPE html>)",
    "expected-doctype-but-got-chars": "Expected DOCTYPE but got text content",
    "expected-doctype-but-got-eof": "Expected DOCTYPE but reached end of file",
    "expected-doctype-but-got-start-tag": "Expected DOCTYPE but got <{tag_name}> tag",
    "expected-doctype-but-got-end-tag": "Expected DOCTYPE but got </{tag_name}> tag",
    "unexpected-doctype-in-foreign-content": "Unexpected DOCTYPE in SVG/MathML content",
    # Unexpected tag errors
    "unexpected-start-tag": "Unexpected <{tag_name}> start tag",
    "unexpected-end-tag": "Unexpected </{tag_name}> end tag",
    "unexpected-end-tag-before-html": "Unexpected </{tag_name}> end tag before <html>",
    "unexpected-end-tag-before-head": "Unexpected </{tag_name}> end tag before <head>",
    "unexpected-end-tag-after-head": "Unexpected </{tag_name}> end tag after <head>",
    "unexpected-start-tag-ignored": "<{tag_name}> start tag ignored in current context",
    "unexpected-start-tag-implies-end-tag": "<{tag_name}> start tag implicitly closes previous element",
    # EOF errors
    "expected-closing-tag-but-got-eof": "Expected </{tag_name}> closing tag but reached end of file",
    "expected-named-closing-tag-but-got-eof": "Expected </{tag_name}> closing tag but reached end of file",
    # Invalid character errors
    "invalid-codepoint": "Invalid character (U+0000 NULL or U+000C FORM FEED)",
    "invalid-codepoint-before-head": "Invalid character before <head>",
    "invalid-codepoint-in-body": "Invalid character in <body>",
    "invalid-codepoint-in-table-text": "Invalid character in table text",
    "invalid-codepoint-in-select": "Invalid character in <select>",
    "invalid-codepoint-in-foreign-content": "Invalid character in SVG/MathML content",
    # Foster parenting / table errors
    "foster-parenting-character": "Text content in table requires foster parenting",
    "foster-parenting-start-tag": "Start tag in table requires foster parenting",
    "unexpected-start-tag-implies-table-voodoo": "<{tag_name}> start tag in table triggers foster parenting",
    "unexpected-end-tag-implies-table-voodoo": "</{tag_name}> end tag in table triggers foster parenting",
    "unexpected-cell-in-table-body": "Unexpected table cell outside of table row",
    "unexpected-form-in-table": "Form element not allowed in table context",
    "unexpected-hidden-input-in-table": "Hidden input in table triggers foster parenting",
    # Context-specific errors
    "unexpected-hidden-input-after-head": "Unexpected hidden input after <head>",
    "unexpected-token-in-frameset": "Unexpected content in <frameset>",
    "unexpected-token-after-frameset": "Unexpected content after <frameset>",
    "unexpected-token-after-after-frameset": "Unexpected content after frameset closed",
    "unexpected-token-after-body": "Unexpected content after </body>",
    "unexpected-char-after-body": "Unexpected character after </body>",
    "unexpected-characters-in-column-group": "Text not allowed in <colgroup>",
    "unexpected-characters-in-template-column-group": "Text not allowed in template column group",
    "unexpected-start-tag-in-column-group": "<{tag_name}> start tag not allowed in <colgroup>",
    "unexpected-start-tag-in-template-column-group": "<{tag_name}> start tag not allowed in template column group",
    "unexpected-start-tag-in-template-table-context": "<{tag_name}> start tag not allowed in template table context",
    "unexpected-start-tag-in-cell-fragment": "<{tag_name}> start tag not allowed in cell fragment context",
    # Foreign content errors
    "unexpected-html-element-in-foreign-content": "HTML element breaks out of SVG/MathML content",
    "unexpected-end-tag-in-foreign-content": "Mismatched </{tag_name}> end tag in SVG/MathML content",
    "unexpected-end-tag-in-fragment-context": "</{tag_name}> end tag not allowed in fragment parsing context",
    # Miscellaneous errors
    "end-tag-too-early": "</{tag_name}> end tag closed early (unclosed children)",
    "adoption-agency-1.3": "Misnested tags require adoption agency algorithm",
    "non-void-html-element-start-tag-with-trailing-solidus": "<{tag_name}/> self-closing syntax on non-void element",
    "image-start-tag": "Deprecated <{tag_name}> tag (use <img> instead)",
}


def generate_error_message(code, tag_name=None):
    """Generate human-readable error message from error code.

    Args:
        code: The error code string (kebab-case format)
        tag_name: Optional tag name to include in the message for context.
            When omitted, tag-specific messages render it as "None".

    Returns:
        Human-readable error message string, or code itself when the code
        is not known.
    """
    template = _ERROR_MESSAGE_TEMPLATES.get(code)
    if template is None:
        # Unknown code: fall back to the code itself
        return code
    return template.format(tag_name=tag_name)
justhtml/node.py ADDED
@@ -0,0 +1,208 @@
1
+ from .selector import query
2
+ from .serialize import to_html
3
+
4
+
5
class SimpleDomNode:
    """General-purpose tree node identified by its name.

    Nodes named "#comment" or "!doctype" are leaves: their children and
    attrs are None. Every other node starts with an empty child list and
    an attribute dict. Names starting with "#" (and "!doctype") keep the
    namespace exactly as passed; all other names default to "html".
    """

    __slots__ = ("attrs", "children", "data", "name", "namespace", "parent")

    def __init__(self, name, attrs=None, data=None, namespace=None):
        self.name = name
        self.parent = None
        self.data = data

        is_special = name.startswith("#") or name == "!doctype"
        self.namespace = namespace if is_special else (namespace or "html")

        if name in ("#comment", "!doctype"):
            # Leaf kinds carry neither children nor attributes
            self.children = None
            self.attrs = None
        else:
            self.children = []
            self.attrs = {} if attrs is None else attrs

    def append_child(self, node):
        """Add node as the last child and set its parent to this node."""
        self.children.append(node)
        node.parent = self

    def remove_child(self, node):
        """Remove node from the children and clear its parent."""
        self.children.remove(node)
        node.parent = None

    def to_html(self, indent=0, indent_size=2, pretty=True):
        """Serialize this node by delegating to the module-level to_html()."""
        return to_html(self, indent, indent_size, pretty=pretty)

    def query(self, selector):
        """
        Run a CSS selector against this subtree.

        Args:
            selector: A CSS selector string

        Returns:
            A list of matching nodes

        Raises:
            ValueError: If the selector is invalid
        """
        return query(self, selector)

    @property
    def text(self):
        """Text content of this node concatenated with that of its descendants."""
        if self.name == "#text":
            return self.data if self.data else ""
        pieces = []
        for child in self.children or ():
            pieces.append(child.text)
        return "".join(pieces)

    def insert_before(self, node, reference_node):
        """
        Insert node immediately before reference_node.

        Args:
            node: The node to insert
            reference_node: The child to insert before. If None, node is
                appended at the end.

        Raises:
            ValueError: If this node cannot have children, or if
                reference_node is not a child of this node
        """
        siblings = self.children
        if siblings is None:
            raise ValueError(f"Node {self.name} cannot have children")

        if reference_node is None:
            self.append_child(node)
            return

        try:
            position = siblings.index(reference_node)
        except ValueError:
            raise ValueError("Reference node is not a child of this node") from None

        siblings.insert(position, node)
        node.parent = self

    def replace_child(self, new_node, old_node):
        """
        Put new_node in the position currently held by old_node.

        Args:
            new_node: The node to insert
            old_node: The child to replace

        Returns:
            The replaced node (old_node), now without a parent

        Raises:
            ValueError: If this node cannot have children, or if old_node
                is not a child of this node
        """
        siblings = self.children
        if siblings is None:
            raise ValueError(f"Node {self.name} cannot have children")

        try:
            position = siblings.index(old_node)
        except ValueError:
            raise ValueError("The node to be replaced is not a child of this node") from None

        siblings[position] = new_node
        new_node.parent = self
        old_node.parent = None
        return old_node

    def has_child_nodes(self):
        """Return True if this node has at least one child."""
        return True if self.children else False

    def clone_node(self, deep=False):
        """
        Create a copy of this node.

        The copy has no parent and gets its own attribute dict.

        Args:
            deep: If True, children are cloned recursively as well.

        Returns:
            A new SimpleDomNode copied from this node.
        """
        attrs_copy = self.attrs.copy() if self.attrs else None
        duplicate = SimpleDomNode(self.name, attrs_copy, self.data, self.namespace)
        if not (deep and self.children):
            return duplicate
        for child in self.children:
            duplicate.append_child(child.clone_node(deep=True))
        return duplicate
138
+
139
+
140
class ElementNode(SimpleDomNode):
    """Element node whose constructor assigns every slot directly.

    SimpleDomNode.__init__ is not called, so none of its defaulting applies:
    attrs and namespace are stored exactly as passed, data is always None
    and the child list starts empty.
    """

    __slots__ = ()

    def __init__(self, name, attrs, namespace):
        self.name = name
        self.namespace = namespace
        self.attrs = attrs
        self.children = []
        self.data = None
        self.parent = None

    def clone_node(self, deep=False):
        """Return a parentless copy of this element.

        The attribute dict is copied (an empty or missing one becomes a new
        empty dict). With deep=True the children are cloned recursively.
        """
        attrs_copy = self.attrs.copy() if self.attrs else {}
        duplicate = ElementNode(self.name, attrs_copy, self.namespace)
        if not deep:
            return duplicate
        for child in self.children:
            duplicate.append_child(child.clone_node(deep=True))
        return duplicate
157
+
158
+
159
class TemplateNode(ElementNode):
    """Element node that additionally owns a template_content fragment.

    In the "html" namespace template_content is a "#document-fragment"
    SimpleDomNode; in any other namespace it is None.
    """

    __slots__ = ("template_content",)

    def __init__(self, name, attrs=None, data=None, namespace=None):
        # ElementNode stores attrs verbatim, so normalize the None default
        # here; otherwise the node would end up with attrs = None, unlike
        # nodes created through SimpleDomNode.
        super().__init__(name, {} if attrs is None else attrs, namespace)
        # ElementNode always resets data to None; keep the caller's value so
        # the argument (which clone_node passes along) is not silently lost.
        self.data = data
        # NOTE(review): namespace is stored as given, so the default of None
        # yields no template_content — confirm callers always pass "html".
        if self.namespace == "html":
            self.template_content = SimpleDomNode("#document-fragment")
        else:
            self.template_content = None

    def clone_node(self, deep=False):
        """Return a parentless copy of this template element.

        A shallow clone receives a fresh, empty template_content (when in the
        "html" namespace). With deep=True both the template_content fragment
        and the children are cloned recursively.
        """
        clone = TemplateNode(
            self.name,
            self.attrs.copy() if self.attrs else {},
            self.data,
            self.namespace,
        )
        if deep:
            if self.template_content is not None:
                clone.template_content = self.template_content.clone_node(deep=True)
            for child in self.children:
                clone.append_child(child.clone_node(deep=True))
        return clone
182
+
183
+
184
class TextNode:
    """Leaf node holding character data; its name is always "#text"."""

    __slots__ = ("data", "name", "namespace", "parent")

    def __init__(self, data):
        self.name = "#text"
        self.namespace = None
        self.parent = None
        self.data = data

    @property
    def text(self):
        """The node's character data, or "" when data is empty or None."""
        content = self.data
        return content if content else ""

    @property
    def children(self):
        """A new empty list on every access (a text node has no children)."""
        return list()

    def has_child_nodes(self):
        """Always False: a text node is a leaf."""
        return False

    def clone_node(self, deep=False):
        """Return a new parentless TextNode with the same data; deep is ignored."""
        duplicate = TextNode(self.data)
        return duplicate