justhtml 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- justhtml/__init__.py +17 -0
- justhtml/__main__.py +29 -0
- justhtml/constants.py +441 -0
- justhtml/context.py +6 -0
- justhtml/entities.py +342 -0
- justhtml/errors.py +138 -0
- justhtml/node.py +208 -0
- justhtml/parser.py +86 -0
- justhtml/selector.py +925 -0
- justhtml/serialize.py +201 -0
- justhtml/stream.py +83 -0
- justhtml/tokenizer.py +2590 -0
- justhtml/tokens.py +175 -0
- justhtml/treebuilder.py +1231 -0
- justhtml/treebuilder_modes.py +2012 -0
- justhtml/treebuilder_utils.py +86 -0
- justhtml-0.6.0.dist-info/METADATA +126 -0
- justhtml-0.6.0.dist-info/RECORD +20 -0
- justhtml-0.6.0.dist-info/WHEEL +4 -0
- justhtml-0.6.0.dist-info/licenses/LICENSE +21 -0
justhtml/entities.py
ADDED
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
"""HTML5 character entity decoding.
|
|
2
|
+
|
|
3
|
+
Implements HTML5 character reference (entity) decoding per WHATWG spec §13.2.5.
|
|
4
|
+
Supports both named entities (&amp;, &nbsp;) and numeric references (&#60;, &#x3C;).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import html.entities
|
|
8
|
+
|
|
9
|
+
# Python's standard library ships the complete HTML5 entity table (2231 entries).
# Its keys carry the trailing semicolon where one is required (e.g. "amp;",
# "lang;"); the semicolon is dropped below so both spellings share one key.
_HTML5_ENTITIES = html.entities.html5

# Lookup table keyed by the bare entity name (no trailing semicolon).
NAMED_ENTITIES = {}
for key, value in _HTML5_ENTITIES.items():
    NAMED_ENTITIES[key[:-1] if key.endswith(";") else key] = value
|
|
22
|
+
|
|
23
|
+
# Named character references that are recognized even without a trailing
# semicolon. Per the HTML5 spec these are essentially the ISO-8859-1 (Latin-1)
# entities inherited from HTML4; newer names such as "prod" or "notin" always
# need the semicolon. Several names exist in both upper- and lowercase
# spellings (e.g. COPY/copy, GT/gt).
LEGACY_ENTITIES = {
    # Markup escapes and their uppercase spellings
    "gt", "lt", "amp", "quot", "nbsp",
    "AMP", "QUOT", "GT", "LT", "COPY", "REG",
    # Names starting with an uppercase letter
    "AElig", "Aacute", "Acirc", "Agrave", "Aring", "Atilde", "Auml",
    "Ccedil", "ETH", "Eacute", "Ecirc", "Egrave", "Euml",
    "Iacute", "Icirc", "Igrave", "Iuml", "Ntilde",
    "Oacute", "Ocirc", "Ograve", "Oslash", "Otilde", "Ouml",
    "THORN", "Uacute", "Ucirc", "Ugrave", "Uuml", "Yacute",
    # Names starting with a lowercase letter
    "aacute", "acirc", "acute", "aelig", "agrave", "aring", "atilde", "auml",
    "brvbar", "ccedil", "cedil", "cent", "copy", "curren",
    "deg", "divide", "eacute", "ecirc", "egrave", "eth", "euml",
    "frac12", "frac14", "frac34",
    "iacute", "icirc", "iexcl", "igrave", "iquest", "iuml",
    "laquo", "macr", "micro", "middot", "not", "ntilde",
    "oacute", "ocirc", "ograve", "ordf", "ordm", "oslash", "otilde", "ouml",
    "para", "plusmn", "pound", "raquo", "reg",
    "sect", "shy", "sup1", "sup2", "sup3", "szlig",
    "thorn", "times", "uacute", "ucirc", "ugrave", "uml", "uuml",
    "yacute", "yen", "yuml",
}
|
|
135
|
+
|
|
136
|
+
# Replacement table for numeric character references (HTML5 §13.2.5.73).
# Code point 0 maps to U+FFFD; the remaining keys are C1 control code points
# that are remapped to the printable characters named on each line.
NUMERIC_REPLACEMENTS = {
    0x00: "\N{REPLACEMENT CHARACTER}",  # substituted for NULL
    0x80: "\N{EURO SIGN}",
    0x82: "\N{SINGLE LOW-9 QUOTATION MARK}",
    0x83: "\N{LATIN SMALL LETTER F WITH HOOK}",
    0x84: "\N{DOUBLE LOW-9 QUOTATION MARK}",
    0x85: "\N{HORIZONTAL ELLIPSIS}",
    0x86: "\N{DAGGER}",
    0x87: "\N{DOUBLE DAGGER}",
    0x88: "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}",
    0x89: "\N{PER MILLE SIGN}",
    0x8A: "\N{LATIN CAPITAL LETTER S WITH CARON}",
    0x8B: "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}",
    0x8C: "\N{LATIN CAPITAL LIGATURE OE}",
    0x8E: "\N{LATIN CAPITAL LETTER Z WITH CARON}",
    0x91: "\N{LEFT SINGLE QUOTATION MARK}",
    0x92: "\N{RIGHT SINGLE QUOTATION MARK}",
    0x93: "\N{LEFT DOUBLE QUOTATION MARK}",
    0x94: "\N{RIGHT DOUBLE QUOTATION MARK}",
    0x95: "\N{BULLET}",
    0x96: "\N{EN DASH}",
    0x97: "\N{EM DASH}",
    0x98: "\N{SMALL TILDE}",
    0x99: "\N{TRADE MARK SIGN}",
    0x9A: "\N{LATIN SMALL LETTER S WITH CARON}",
    0x9B: "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}",
    0x9C: "\N{LATIN SMALL LIGATURE OE}",
    0x9E: "\N{LATIN SMALL LETTER Z WITH CARON}",
    0x9F: "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}",
}
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def decode_numeric_entity(text, is_hex=False):
    """Decode the digits of a numeric character reference (&#60; or &#x3C;).

    Args:
        text: The numeric part only (without "&#", "&#x" or ";").
        is_hex: Whether the digits are hexadecimal (&#x...) or decimal (&#...).

    Returns:
        The decoded character. Code points listed in NUMERIC_REPLACEMENTS are
        mapped to their replacement; surrogates and values above U+10FFFF
        yield U+FFFD (REPLACEMENT CHARACTER).

    Raises:
        ValueError: If text is empty or is rejected by int() for the chosen base.
    """
    # More than 7 significant digits is above 0x10FFFF in both base 10 and
    # base 16, so the result is known without parsing. This also keeps
    # arbitrarily long digit runs away from int(), which rejects very long
    # decimal strings with ValueError on current CPython versions.
    if len(text.lstrip("0")) > 7:
        return "\ufffd"

    base = 16 if is_hex else 10
    codepoint = int(text, base)

    # Apply the fixed HTML5 replacements first (NULL and the remapped C1 range).
    if codepoint in NUMERIC_REPLACEMENTS:
        return NUMERIC_REPLACEMENTS[codepoint]

    # Out-of-range values and surrogates cannot be represented as characters.
    if codepoint > 0x10FFFF:
        return "\ufffd"
    if 0xD800 <= codepoint <= 0xDFFF:
        return "\ufffd"

    return chr(codepoint)
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
# Numeric character references consist of ASCII digits only. str.isdigit()
# also accepts characters such as superscripts or Arabic-Indic digits, which
# either crash int() or decode text that is not a character reference at all.
_DECIMAL_DIGITS = "0123456789"
_HEX_DIGITS = "0123456789abcdefABCDEF"


def _longest_legacy_prefix(entity_name):
    """Return (replacement, prefix_length) for the longest legacy-entity prefix.

    Returns (None, 0) when no prefix of entity_name is a legacy entity.
    """
    for k in range(len(entity_name), 0, -1):
        prefix = entity_name[:k]
        if prefix in LEGACY_ENTITIES and prefix in NAMED_ENTITIES:
            return NAMED_ENTITIES[prefix], k
    return None, 0


def decode_entities_in_text(text, in_attribute=False):
    """Decode all HTML entities in text.

    This is a simple implementation that handles:
    - Named entities: &amp; &lt; &gt; &quot; etc.
    - Decimal numeric: &#60; &#160; etc.
    - Hex numeric: &#x3C; &#xA0; etc.

    Anything that does not form a recognized reference is copied through
    unchanged.

    Args:
        text: Input text potentially containing entities
        in_attribute: Whether this is an attribute value (stricter rules for
            legacy entities written without a semicolon)

    Returns:
        Text with entities decoded
    """
    result = []
    i = 0
    length = len(text)
    while i < length:
        next_amp = text.find("&", i)
        if next_amp == -1:
            # No more references: copy the tail and stop.
            result.append(text[i:])
            break

        if next_amp > i:
            result.append(text[i:next_amp])

        i = next_amp
        j = i + 1

        # Numeric reference: "&#" [xX] digits [";"]
        if j < length and text[j] == "#":
            j += 1
            is_hex = False

            if j < length and text[j] in "xX":
                is_hex = True
                j += 1

            digit_start = j
            allowed_digits = _HEX_DIGITS if is_hex else _DECIMAL_DIGITS
            while j < length and text[j] in allowed_digits:
                j += 1

            has_semicolon = j < length and text[j] == ";"
            digit_text = text[digit_start:j]
            end = j + 1 if has_semicolon else j

            if digit_text:
                result.append(decode_numeric_entity(digit_text, is_hex=is_hex))
            else:
                # No digits at all ("&#;", "&#x"): keep the text as-is.
                result.append(text[i:end])
            i = end
            continue

        # Named reference: collect the candidate name (names are case-sensitive).
        while j < length and (text[j].isalpha() or text[j].isdigit()):
            j += 1

        entity_name = text[i + 1 : j]
        has_semicolon = j < length and text[j] == ";"

        if not entity_name:
            # A bare "&" that is not followed by a name.
            result.append("&")
            i += 1
            continue

        # Exact match terminated by a semicolon.
        if has_semicolon and entity_name in NAMED_ENTITIES:
            result.append(NAMED_ENTITIES[entity_name])
            i = j + 1
            continue

        # Semicolon present but no exact match: outside attributes a legacy
        # entity may still match as a prefix of the collected name.
        if has_semicolon and not in_attribute:
            best_match, best_match_len = _longest_legacy_prefix(entity_name)
            if best_match:
                result.append(best_match)
                i = i + 1 + best_match_len
                continue

        # Whole name is a legacy entity written without a semicolon.
        if entity_name in LEGACY_ENTITIES and entity_name in NAMED_ENTITIES:
            # In attribute values such a reference is left undecoded when it
            # is followed by an alphanumeric character or "=".
            next_char = text[j] if j < length else None
            if in_attribute and next_char and (next_char.isalnum() or next_char == "="):
                result.append("&")
                i += 1
                continue

            result.append(NAMED_ENTITIES[entity_name])
            i = j
            continue

        # Longest legacy prefix without a semicolon, e.g. "&notit" where
        # "&not" is a legacy entity but "&notit" is not an entity.
        best_match, best_match_len = _longest_legacy_prefix(entity_name)
        if best_match:
            if in_attribute:
                # The prefix is followed by more of entity_name, i.e. by an
                # alphanumeric character, so attributes keep the text literal.
                result.append("&")
                i += 1
                continue

            result.append(best_match)
            i = i + 1 + best_match_len
            continue

        # No match found: emit the unknown reference unchanged.
        if has_semicolon:
            result.append(text[i : j + 1])
            i = j + 1
        else:
            result.append("&")
            i += 1

    return "".join(result)
|
justhtml/errors.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""Centralized error message definitions and helpers for HTML parsing errors.
|
|
2
|
+
|
|
3
|
+
This module provides human-readable error messages for all parse error codes
|
|
4
|
+
emitted by both the tokenizer and tree builder during HTML parsing.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Message templates keyed by parse error code. Built once at import time
# instead of on every call. "{tag_name}" is substituted via str.format, so a
# literal brace added to any message must be doubled ("{{" / "}}").
_ERROR_MESSAGE_TEMPLATES = {
    # ================================================================
    # TOKENIZER ERRORS
    # ================================================================
    # DOCTYPE errors
    "eof-in-doctype": "Unexpected end of file in DOCTYPE declaration",
    "eof-in-doctype-name": "Unexpected end of file while reading DOCTYPE name",
    "eof-in-doctype-public-identifier": "Unexpected end of file in DOCTYPE public identifier",
    "eof-in-doctype-system-identifier": "Unexpected end of file in DOCTYPE system identifier",
    "expected-doctype-name-but-got-right-bracket": "Expected DOCTYPE name but got >",
    "missing-whitespace-before-doctype-name": "Missing whitespace after <!DOCTYPE",
    "abrupt-doctype-public-identifier": "DOCTYPE public identifier ended abruptly",
    "abrupt-doctype-system-identifier": "DOCTYPE system identifier ended abruptly",
    "missing-quote-before-doctype-public-identifier": "Missing quote before DOCTYPE public identifier",
    "missing-quote-before-doctype-system-identifier": "Missing quote before DOCTYPE system identifier",
    "missing-doctype-public-identifier": "Missing DOCTYPE public identifier",
    "missing-doctype-system-identifier": "Missing DOCTYPE system identifier",
    "missing-whitespace-before-doctype-public-identifier": "Missing whitespace before DOCTYPE public identifier",
    "missing-whitespace-after-doctype-public-identifier": "Missing whitespace after DOCTYPE public identifier",
    "missing-whitespace-between-doctype-public-and-system-identifiers": "Missing whitespace between DOCTYPE identifiers",
    "missing-whitespace-after-doctype-name": "Missing whitespace after DOCTYPE name",
    "unexpected-character-after-doctype-public-keyword": "Unexpected character after PUBLIC keyword",
    "unexpected-character-after-doctype-system-keyword": "Unexpected character after SYSTEM keyword",
    "unexpected-character-after-doctype-public-identifier": "Unexpected character after public identifier",
    "unexpected-character-after-doctype-system-identifier": "Unexpected character after system identifier",
    # Comment errors
    "eof-in-comment": "Unexpected end of file in comment",
    "abrupt-closing-of-empty-comment": "Comment ended abruptly with -->",
    "incorrectly-closed-comment": "Comment ended with --!> instead of -->",
    # Tag errors
    "eof-in-tag": "Unexpected end of file in tag",
    "eof-before-tag-name": "Unexpected end of file before tag name",
    "empty-end-tag": "Empty end tag </> is not allowed",
    "invalid-first-character-of-tag-name": "Invalid first character of tag name",
    "unexpected-question-mark-instead-of-tag-name": "Unexpected ? instead of tag name",
    "unexpected-character-after-solidus-in-tag": "Unexpected character after / in tag",
    # Attribute errors
    "duplicate-attribute": "Duplicate attribute name",
    "missing-attribute-value": "Missing attribute value",
    "unexpected-character-in-attribute-name": "Unexpected character in attribute name",
    "unexpected-character-in-unquoted-attribute-value": "Unexpected character in unquoted attribute value",
    "missing-whitespace-between-attributes": "Missing whitespace between attributes",
    "unexpected-equals-sign-before-attribute-name": "Unexpected = before attribute name",
    # Script errors
    "eof-in-script-html-comment-like-text": "Unexpected end of file in script with HTML-like comment",
    "eof-in-script-in-script": "Unexpected end of file in nested script tag",
    # CDATA errors
    "eof-in-cdata": "Unexpected end of file in CDATA section",
    "cdata-in-html-content": "CDATA section only allowed in SVG/MathML content",
    # NULL character errors
    "unexpected-null-character": "Unexpected NULL character (U+0000)",
    # Markup declaration errors
    "incorrectly-opened-comment": "Incorrectly opened comment",
    # Character reference errors
    "control-character-reference": "Invalid control character in character reference",
    "illegal-codepoint-for-numeric-entity": "Invalid codepoint in numeric character reference",
    "missing-semicolon-after-character-reference": "Missing semicolon after character reference",
    "named-entity-without-semicolon": "Named entity used without semicolon",
    # ================================================================
    # TREE BUILDER ERRORS
    # ================================================================
    # DOCTYPE errors
    "unexpected-doctype": "Unexpected DOCTYPE declaration",
    "unknown-doctype": "Unknown DOCTYPE (expected <!DOCTYPE html>)",
    "expected-doctype-but-got-chars": "Expected DOCTYPE but got text content",
    "expected-doctype-but-got-eof": "Expected DOCTYPE but reached end of file",
    "expected-doctype-but-got-start-tag": "Expected DOCTYPE but got <{tag_name}> tag",
    "expected-doctype-but-got-end-tag": "Expected DOCTYPE but got </{tag_name}> tag",
    "unexpected-doctype-in-foreign-content": "Unexpected DOCTYPE in SVG/MathML content",
    # Unexpected tag errors
    "unexpected-start-tag": "Unexpected <{tag_name}> start tag",
    "unexpected-end-tag": "Unexpected </{tag_name}> end tag",
    "unexpected-end-tag-before-html": "Unexpected </{tag_name}> end tag before <html>",
    "unexpected-end-tag-before-head": "Unexpected </{tag_name}> end tag before <head>",
    "unexpected-end-tag-after-head": "Unexpected </{tag_name}> end tag after <head>",
    "unexpected-start-tag-ignored": "<{tag_name}> start tag ignored in current context",
    "unexpected-start-tag-implies-end-tag": "<{tag_name}> start tag implicitly closes previous element",
    # EOF errors
    "expected-closing-tag-but-got-eof": "Expected </{tag_name}> closing tag but reached end of file",
    "expected-named-closing-tag-but-got-eof": "Expected </{tag_name}> closing tag but reached end of file",
    # Invalid character errors
    "invalid-codepoint": "Invalid character (U+0000 NULL or U+000C FORM FEED)",
    "invalid-codepoint-before-head": "Invalid character before <head>",
    "invalid-codepoint-in-body": "Invalid character in <body>",
    "invalid-codepoint-in-table-text": "Invalid character in table text",
    "invalid-codepoint-in-select": "Invalid character in <select>",
    "invalid-codepoint-in-foreign-content": "Invalid character in SVG/MathML content",
    # Foster parenting / table errors
    "foster-parenting-character": "Text content in table requires foster parenting",
    "foster-parenting-start-tag": "Start tag in table requires foster parenting",
    "unexpected-start-tag-implies-table-voodoo": "<{tag_name}> start tag in table triggers foster parenting",
    "unexpected-end-tag-implies-table-voodoo": "</{tag_name}> end tag in table triggers foster parenting",
    "unexpected-cell-in-table-body": "Unexpected table cell outside of table row",
    "unexpected-form-in-table": "Form element not allowed in table context",
    "unexpected-hidden-input-in-table": "Hidden input in table triggers foster parenting",
    # Context-specific errors
    "unexpected-hidden-input-after-head": "Unexpected hidden input after <head>",
    "unexpected-token-in-frameset": "Unexpected content in <frameset>",
    "unexpected-token-after-frameset": "Unexpected content after <frameset>",
    "unexpected-token-after-after-frameset": "Unexpected content after frameset closed",
    "unexpected-token-after-body": "Unexpected content after </body>",
    "unexpected-char-after-body": "Unexpected character after </body>",
    "unexpected-characters-in-column-group": "Text not allowed in <colgroup>",
    "unexpected-characters-in-template-column-group": "Text not allowed in template column group",
    "unexpected-start-tag-in-column-group": "<{tag_name}> start tag not allowed in <colgroup>",
    "unexpected-start-tag-in-template-column-group": "<{tag_name}> start tag not allowed in template column group",
    "unexpected-start-tag-in-template-table-context": "<{tag_name}> start tag not allowed in template table context",
    "unexpected-start-tag-in-cell-fragment": "<{tag_name}> start tag not allowed in cell fragment context",
    # Foreign content errors
    "unexpected-html-element-in-foreign-content": "HTML element breaks out of SVG/MathML content",
    "unexpected-end-tag-in-foreign-content": "Mismatched </{tag_name}> end tag in SVG/MathML content",
    "unexpected-end-tag-in-fragment-context": "</{tag_name}> end tag not allowed in fragment parsing context",
    # Miscellaneous errors
    "end-tag-too-early": "</{tag_name}> end tag closed early (unclosed children)",
    "adoption-agency-1.3": "Misnested tags require adoption agency algorithm",
    "non-void-html-element-start-tag-with-trailing-solidus": "<{tag_name}/> self-closing syntax on non-void element",
    "image-start-tag": "Deprecated <{tag_name}> tag (use <img> instead)",
}


def generate_error_message(code, tag_name=None):
    """Generate human-readable error message from error code.

    Args:
        code: The error code string (kebab-case format)
        tag_name: Optional tag name to include in the message for context.
            When omitted, tag-specific messages render it as "None".

    Returns:
        Human-readable error message string, or code itself when the code
        is not a known error code.
    """
    template = _ERROR_MESSAGE_TEMPLATES.get(code)
    if template is None:
        # Unknown code: fall back to the code itself, unformatted.
        return code
    return template.format(tag_name=tag_name)
|
justhtml/node.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
from .selector import query
|
|
2
|
+
from .serialize import to_html
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class SimpleDomNode:
    """Generic tree node identified by ``name``.

    "#comment" and "!doctype" nodes are leaves: their ``children`` and
    ``attrs`` are None. Every other node gets a children list and an attrs
    dict. Nodes whose name does not start with "#" default to the "html"
    namespace.
    """

    __slots__ = ("attrs", "children", "data", "name", "namespace", "parent")

    def __init__(self, name, attrs=None, data=None, namespace=None):
        self.name = name
        self.parent = None
        self.data = data

        if name == "#comment" or name == "!doctype":
            # Leaf node kinds: no child list and no attribute dict.
            self.namespace = namespace
            self.children = None
            self.attrs = None
            return

        # Other "#..." nodes keep the namespace they were given; everything
        # else falls back to "html" when no namespace is supplied.
        self.namespace = namespace if name.startswith("#") else (namespace or "html")
        self.children = []
        self.attrs = {} if attrs is None else attrs

    def append_child(self, node):
        """Add node as the last child and point its parent at this node."""
        self.children.append(node)
        node.parent = self

    def remove_child(self, node):
        """Remove node from the children list and clear its parent."""
        self.children.remove(node)
        node.parent = None

    def to_html(self, indent=0, indent_size=2, pretty=True):
        """Serialize this node via the module-level ``to_html`` helper."""
        return to_html(self, indent, indent_size, pretty=pretty)

    def query(self, selector):
        """
        Run a CSS selector against this subtree.

        Args:
            selector: A CSS selector string

        Returns:
            A list of matching nodes

        Raises:
            ValueError: If the selector is invalid
        """
        return query(self, selector)

    @property
    def text(self):
        """Concatenated text of this node and all of its descendants."""
        if self.name == "#text":
            return self.data if self.data else ""
        kids = self.children
        return "".join(kid.text for kid in kids) if kids else ""

    def insert_before(self, node, reference_node):
        """
        Insert node immediately before reference_node.

        Args:
            node: The node to insert
            reference_node: The existing child to insert before. If None,
                node is appended at the end.

        Raises:
            ValueError: If this node cannot have children, or if
                reference_node is not a child of this node
        """
        if self.children is None:
            raise ValueError(f"Node {self.name} cannot have children")

        if reference_node is None:
            self.append_child(node)
            return

        try:
            position = self.children.index(reference_node)
        except ValueError:
            raise ValueError("Reference node is not a child of this node") from None
        self.children.insert(position, node)
        node.parent = self

    def replace_child(self, new_node, old_node):
        """
        Swap old_node for new_node in the children list.

        Args:
            new_node: The node taking old_node's position
            old_node: The child node being replaced

        Returns:
            The replaced node (old_node)

        Raises:
            ValueError: If this node cannot have children, or if old_node is
                not a child of this node
        """
        siblings = self.children
        if siblings is None:
            raise ValueError(f"Node {self.name} cannot have children")

        try:
            slot = siblings.index(old_node)
        except ValueError:
            raise ValueError("The node to be replaced is not a child of this node") from None

        siblings[slot] = new_node
        new_node.parent = self
        old_node.parent = None
        return old_node

    def has_child_nodes(self):
        """Report whether this node has at least one child."""
        return True if self.children else False

    def clone_node(self, deep=False):
        """
        Create a copy of this node.

        Args:
            deep: If True, children are cloned recursively as well.

        Returns:
            A new, parentless SimpleDomNode with a copied attrs dict.
        """
        attrs_copy = self.attrs.copy() if self.attrs else None
        duplicate = SimpleDomNode(self.name, attrs=attrs_copy, data=self.data, namespace=self.namespace)
        if deep and self.children:
            for kid in self.children:
                duplicate.append_child(kid.clone_node(deep=True))
        return duplicate
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class ElementNode(SimpleDomNode):
    """Element node whose constructor stores its arguments without defaulting."""

    __slots__ = ()

    def __init__(self, name, attrs, namespace):
        # Fields are assigned directly and SimpleDomNode.__init__ is not
        # called, so attrs and namespace are stored exactly as passed in.
        self.attrs = attrs
        self.children = []
        self.namespace = namespace
        self.data = None
        self.parent = None
        self.name = name

    def clone_node(self, deep=False):
        """Return a parentless copy; with deep=True children are cloned too."""
        copied_attrs = self.attrs.copy() if self.attrs else {}
        duplicate = ElementNode(self.name, copied_attrs, self.namespace)
        if not deep:
            return duplicate
        for kid in self.children:
            duplicate.append_child(kid.clone_node(deep=True))
        return duplicate
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
class TemplateNode(ElementNode):
    """Element node for <template>, carrying a separate content fragment.

    ``template_content`` is a "#document-fragment" SimpleDomNode when the
    node's namespace is "html", and None for any other namespace value
    (including the default None).
    """

    __slots__ = ("template_content",)

    def __init__(self, name, attrs=None, data=None, namespace=None):
        # ElementNode stores attrs verbatim, so normalize the None default to
        # an empty dict here; an element should always expose a dict.
        super().__init__(name, attrs if attrs is not None else {}, namespace)
        # ElementNode resets data to None; keep the caller-supplied value so
        # the parameter is not silently discarded.
        self.data = data
        if self.namespace == "html":
            self.template_content = SimpleDomNode("#document-fragment")
        else:
            self.template_content = None

    def clone_node(self, deep=False):
        """Return a parentless copy of this template node.

        Args:
            deep: If True, the template content fragment and the children
                are cloned recursively; otherwise the clone keeps the fresh
                content fragment created by its constructor.
        """
        clone = TemplateNode(
            self.name,
            self.attrs.copy() if self.attrs else {},
            self.data,
            self.namespace,
        )
        if deep:
            if self.template_content:
                clone.template_content = self.template_content.clone_node(deep=True)
            for child in self.children:
                clone.append_child(child.clone_node(deep=True))
        return clone
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
class TextNode:
    """Leaf node holding character data; its name is always "#text"."""

    __slots__ = ("data", "name", "namespace", "parent")

    def __init__(self, data):
        self.name = "#text"
        self.namespace = None
        self.parent = None
        self.data = data

    @property
    def text(self):
        """The node's character data, or "" when data is empty or None."""
        content = self.data
        return content if content else ""

    @property
    def children(self):
        """Always a new empty list: text nodes are leaves."""
        return list()

    def has_child_nodes(self):
        """Text nodes never have children."""
        return False

    def clone_node(self, deep=False):
        """Return a new, parentless TextNode with the same data (deep is ignored)."""
        duplicate = TextNode(self.data)
        return duplicate
|