justhtml 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic. Click here for more details.

justhtml/entities.py ADDED
@@ -0,0 +1,344 @@
1
+ """HTML5 character entity decoding.
2
+
3
+ Implements HTML5 character reference (entity) decoding per WHATWG spec §13.2.5.
4
+ Supports both named entities (&amp;, &nbsp;) and numeric references (&#60;, &#x3C;).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import html.entities
10
+
11
# The standard library already ships the HTML5 named character reference
# table. Its keys carry the trailing semicolon ("amp;", "lang;"); names that
# are also accepted without one appear a second time in their bare form.
_HTML5_ENTITIES: dict[str, str] = html.entities.html5

# Semicolon-free view of that table, so a name can be looked up the same way
# whether or not the input spelled the semicolon.
NAMED_ENTITIES: dict[str, str] = {}
for _key, _value in _HTML5_ENTITIES.items():
    # Drop exactly one trailing ";" when present; bare keys are stored as-is.
    NAMED_ENTITIES[_key[:-1] if _key.endswith(";") else _key] = _value
24
+
25
# Named character references that are still recognised when the trailing
# semicolon is missing. Per the HTML5 spec these are primarily the
# ISO-8859-1 (Latin-1) entities inherited from HTML4; newer names such as
# "prod" or "notin" always require the semicolon.
# Several names exist in both cases (COPY/copy, GT/gt, ...), and lookups are
# case-sensitive, so both spellings are listed.
LEGACY_ENTITIES: set[str] = set(
    """
    gt lt amp quot nbsp
    AMP QUOT GT LT COPY REG
    AElig Aacute Acirc Agrave Aring Atilde Auml
    Ccedil
    ETH Eacute Ecirc Egrave Euml
    Iacute Icirc Igrave Iuml
    Ntilde
    Oacute Ocirc Ograve Oslash Otilde Ouml
    THORN
    Uacute Ucirc Ugrave Uuml
    Yacute
    aacute acirc acute aelig agrave aring atilde auml
    brvbar
    ccedil cedil cent copy curren
    deg divide
    eacute ecirc egrave eth euml
    frac12 frac14 frac34
    iacute icirc iexcl igrave iquest iuml
    laquo
    macr micro middot
    not ntilde
    oacute ocirc ograve ordf ordm oslash otilde ouml
    para plusmn pound
    raquo reg
    sect shy sup1 sup2 sup3 szlig
    thorn times
    uacute ucirc ugrave uml uuml
    yacute yen yuml
    """.split()
)
137
+
138
# Code points that HTML5 substitutes when they appear as a numeric character
# reference (the table used by the spec's "numeric character reference end
# state"): NULL becomes U+FFFD, and the listed values in 0x80-0x9F are mapped
# to the characters named below.
# Each pair is (referenced code point, replacement code point).
NUMERIC_REPLACEMENTS: dict[int, str] = {
    source: chr(target)
    for source, target in (
        (0x00, 0xFFFD),  # NULL -> REPLACEMENT CHARACTER
        (0x80, 0x20AC),  # EURO SIGN
        (0x82, 0x201A),  # SINGLE LOW-9 QUOTATION MARK
        (0x83, 0x0192),  # LATIN SMALL LETTER F WITH HOOK
        (0x84, 0x201E),  # DOUBLE LOW-9 QUOTATION MARK
        (0x85, 0x2026),  # HORIZONTAL ELLIPSIS
        (0x86, 0x2020),  # DAGGER
        (0x87, 0x2021),  # DOUBLE DAGGER
        (0x88, 0x02C6),  # MODIFIER LETTER CIRCUMFLEX ACCENT
        (0x89, 0x2030),  # PER MILLE SIGN
        (0x8A, 0x0160),  # LATIN CAPITAL LETTER S WITH CARON
        (0x8B, 0x2039),  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        (0x8C, 0x0152),  # LATIN CAPITAL LIGATURE OE
        (0x8E, 0x017D),  # LATIN CAPITAL LETTER Z WITH CARON
        (0x91, 0x2018),  # LEFT SINGLE QUOTATION MARK
        (0x92, 0x2019),  # RIGHT SINGLE QUOTATION MARK
        (0x93, 0x201C),  # LEFT DOUBLE QUOTATION MARK
        (0x94, 0x201D),  # RIGHT DOUBLE QUOTATION MARK
        (0x95, 0x2022),  # BULLET
        (0x96, 0x2013),  # EN DASH
        (0x97, 0x2014),  # EM DASH
        (0x98, 0x02DC),  # SMALL TILDE
        (0x99, 0x2122),  # TRADE MARK SIGN
        (0x9A, 0x0161),  # LATIN SMALL LETTER S WITH CARON
        (0x9B, 0x203A),  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        (0x9C, 0x0153),  # LATIN SMALL LIGATURE OE
        (0x9E, 0x017E),  # LATIN SMALL LETTER Z WITH CARON
        (0x9F, 0x0178),  # LATIN CAPITAL LETTER Y WITH DIAERESIS
    )
}
169
+
170
+
171
def decode_numeric_entity(text: str, is_hex: bool = False) -> str:
    """Decode the digits of a numeric character reference such as ``&#60;`` or ``&#x3C;``.

    Args:
        text: The digits only, without the leading ``&#`` / ``&#x`` and
            without the trailing ``;``.
        is_hex: True for a hexadecimal reference (``&#x...``), False for a
            decimal one (``&#...``).

    Returns:
        The decoded character. U+FFFD (REPLACEMENT CHARACTER) is returned for
        values above U+10FFFF and for surrogates; code points listed in
        NUMERIC_REPLACEMENTS (NULL and the remapped 0x80-0x9F values) yield
        their replacement. This function never returns None.

    Raises:
        ValueError: If ``text`` is empty or is not a number in the given base.
    """
    base = 16 if is_hex else 10
    try:
        codepoint = int(text, base)
    except ValueError:
        # CPython 3.11+ refuses to convert very long decimal strings (see
        # sys.set_int_max_str_digits). A long run of valid digits is still a
        # well-formed reference, so evaluate it without the big conversion.
        significant = text.lstrip("0")
        allowed = "0123456789abcdefABCDEF" if is_hex else "0123456789"
        if not text or not (set(significant) <= set(allowed)):
            # Genuinely malformed input: keep the original failure mode.
            raise
        if len(significant) > 7:
            # Eight or more significant digits exceed 0x10FFFF in both bases
            # (0x10FFFF is 1114111 in decimal: seven digits).
            return "\ufffd"
        codepoint = int(significant or "0", base)

    # Outside the Unicode range, or a surrogate: not representable.
    if codepoint > 0x10FFFF or 0xD800 <= codepoint <= 0xDFFF:
        return "\ufffd"

    # HTML5-mandated substitutions (NULL and the remapped 0x80-0x9F values).
    replacement = NUMERIC_REPLACEMENTS.get(codepoint)
    return chr(codepoint) if replacement is None else replacement
195
+
196
+
197
_ASCII_DECIMAL_DIGITS = "0123456789"
_ASCII_HEX_DIGITS = "0123456789abcdefABCDEF"


def _is_ascii_alnum(char: str) -> bool:
    """Return True if *char* is an ASCII letter or ASCII digit."""
    return char.isascii() and char.isalnum()


def _longest_legacy_prefix(name: str) -> str:
    """Return the longest prefix of *name* that is a legacy entity, or "" if none."""
    for end in range(len(name), 0, -1):
        prefix = name[:end]
        if prefix in LEGACY_ENTITIES and prefix in NAMED_ENTITIES:
            return prefix
    return ""


def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
    """Decode all HTML character references in ``text``.

    Handles:
    - Named entities: &amp; &lt; &gt; &quot; &nbsp; etc.
    - Decimal numeric: &#60; &#160; etc.
    - Hex numeric: &#x3C; &#xA0; etc.

    Only ASCII digits and ASCII letters are treated as part of a reference,
    as the HTML tokenizer requires; other Unicode digits or letters end the
    reference and are copied through unchanged.

    Args:
        text: Input text potentially containing entities.
        in_attribute: Whether ``text`` is an attribute value. In attributes a
            legacy entity written without its semicolon is left undecoded
            when it is followed by ``=`` or an ASCII alphanumeric.

    Returns:
        The text with every recognised reference replaced by its character(s);
        unrecognised references are copied through verbatim.
    """
    result: list[str] = []
    i = 0
    length = len(text)
    while i < length:
        next_amp = text.find("&", i)
        if next_amp == -1:
            # No more references: flush the tail and stop.
            result.append(text[i:])
            break

        if next_amp > i:
            result.append(text[i:next_amp])

        i = next_amp  # index of the "&"
        j = i + 1  # scan cursor just past the "&"

        # ---- Numeric reference: &#123; or &#x1F; -------------------------
        if j < length and text[j] == "#":
            j += 1
            is_hex = j < length and text[j] in "xX"
            if is_hex:
                j += 1

            digit_start = j
            # ASCII digits only: str.isdigit() would also accept characters
            # such as superscript two, which int() cannot parse.
            digits = _ASCII_HEX_DIGITS if is_hex else _ASCII_DECIMAL_DIGITS
            while j < length and text[j] in digits:
                j += 1

            has_semicolon = j < length and text[j] == ";"
            digit_text = text[digit_start:j]
            end = j + 1 if has_semicolon else j

            if digit_text:
                result.append(decode_numeric_entity(digit_text, is_hex=is_hex))
            else:
                # "&#" / "&#x" with no digits: keep the source text as-is.
                result.append(text[i:end])
            i = end
            continue

        # ---- Named reference ---------------------------------------------
        # Entity names are case-sensitive ASCII alphanumerics.
        while j < length and _is_ascii_alnum(text[j]):
            j += 1

        entity_name = text[i + 1 : j]
        has_semicolon = j < length and text[j] == ";"

        if not entity_name:
            # A bare "&" that does not start a reference.
            result.append("&")
            i += 1
            continue

        # Exact match terminated by a semicolon: always decoded.
        if has_semicolon and entity_name in NAMED_ENTITIES:
            result.append(NAMED_ENTITIES[entity_name])
            i = j + 1
            continue

        # Whole name is a legacy entity written without its semicolon.
        if entity_name in LEGACY_ENTITIES and entity_name in NAMED_ENTITIES:
            # The name scan stopped at text[j], so it cannot be an ASCII
            # alphanumeric; "=" is the only remaining character that keeps
            # the reference literal inside an attribute value.
            if in_attribute and j < length and text[j] == "=":
                result.append("&")
                i += 1
                continue

            result.append(NAMED_ENTITIES[entity_name])
            i = j
            continue

        # Longest legacy prefix, e.g. "&notit" where "&not" is valid but
        # "&notit" is not. The prefix is always shorter than entity_name
        # here, because a full-name match was handled above.
        prefix = _longest_legacy_prefix(entity_name)
        if prefix:
            if in_attribute:
                # The character after the prefix belongs to entity_name and
                # is therefore an ASCII alphanumeric: stay literal.
                result.append("&")
                i += 1
                continue

            result.append(NAMED_ENTITIES[prefix])
            i += 1 + len(prefix)
            continue

        # No match: copy the source through unchanged.
        if has_semicolon:
            result.append(text[i : j + 1])
            i = j + 1
        else:
            result.append("&")
            i += 1

    return "".join(result)
justhtml/errors.py ADDED
@@ -0,0 +1,140 @@
1
+ """Centralized error message definitions and helpers for HTML parsing errors.
2
+
3
+ This module provides human-readable error messages for all parse error codes
4
+ emitted by both the tokenizer and tree builder during HTML parsing.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+
10
+ def generate_error_message(code: str, tag_name: str | None = None) -> str:
11
+ """Generate human-readable error message from error code.
12
+
13
+ Args:
14
+ code: The error code string (kebab-case format)
15
+ tag_name: Optional tag name to include in the message for context
16
+
17
+ Returns:
18
+ Human-readable error message string
19
+ """
20
+ messages = {
21
+ # ================================================================
22
+ # TOKENIZER ERRORS
23
+ # ================================================================
24
+ # DOCTYPE errors
25
+ "eof-in-doctype": "Unexpected end of file in DOCTYPE declaration",
26
+ "eof-in-doctype-name": "Unexpected end of file while reading DOCTYPE name",
27
+ "eof-in-doctype-public-identifier": "Unexpected end of file in DOCTYPE public identifier",
28
+ "eof-in-doctype-system-identifier": "Unexpected end of file in DOCTYPE system identifier",
29
+ "expected-doctype-name-but-got-right-bracket": "Expected DOCTYPE name but got >",
30
+ "missing-whitespace-before-doctype-name": "Missing whitespace after <!DOCTYPE",
31
+ "abrupt-doctype-public-identifier": "DOCTYPE public identifier ended abruptly",
32
+ "abrupt-doctype-system-identifier": "DOCTYPE system identifier ended abruptly",
33
+ "missing-quote-before-doctype-public-identifier": "Missing quote before DOCTYPE public identifier",
34
+ "missing-quote-before-doctype-system-identifier": "Missing quote before DOCTYPE system identifier",
35
+ "missing-doctype-public-identifier": "Missing DOCTYPE public identifier",
36
+ "missing-doctype-system-identifier": "Missing DOCTYPE system identifier",
37
+ "missing-whitespace-before-doctype-public-identifier": "Missing whitespace before DOCTYPE public identifier",
38
+ "missing-whitespace-after-doctype-public-identifier": "Missing whitespace after DOCTYPE public identifier",
39
+ "missing-whitespace-between-doctype-public-and-system-identifiers": "Missing whitespace between DOCTYPE identifiers",
40
+ "missing-whitespace-after-doctype-name": "Missing whitespace after DOCTYPE name",
41
+ "unexpected-character-after-doctype-public-keyword": "Unexpected character after PUBLIC keyword",
42
+ "unexpected-character-after-doctype-system-keyword": "Unexpected character after SYSTEM keyword",
43
+ "unexpected-character-after-doctype-public-identifier": "Unexpected character after public identifier",
44
+ "unexpected-character-after-doctype-system-identifier": "Unexpected character after system identifier",
45
+ # Comment errors
46
+ "eof-in-comment": "Unexpected end of file in comment",
47
+ "abrupt-closing-of-empty-comment": "Comment ended abruptly with -->",
48
+ "incorrectly-closed-comment": "Comment ended with --!> instead of -->",
49
+ # Tag errors
50
+ "eof-in-tag": "Unexpected end of file in tag",
51
+ "eof-before-tag-name": "Unexpected end of file before tag name",
52
+ "empty-end-tag": "Empty end tag </> is not allowed",
53
+ "invalid-first-character-of-tag-name": "Invalid first character of tag name",
54
+ "unexpected-question-mark-instead-of-tag-name": "Unexpected ? instead of tag name",
55
+ "unexpected-character-after-solidus-in-tag": "Unexpected character after / in tag",
56
+ # Attribute errors
57
+ "duplicate-attribute": "Duplicate attribute name",
58
+ "missing-attribute-value": "Missing attribute value",
59
+ "unexpected-character-in-attribute-name": "Unexpected character in attribute name",
60
+ "unexpected-character-in-unquoted-attribute-value": "Unexpected character in unquoted attribute value",
61
+ "missing-whitespace-between-attributes": "Missing whitespace between attributes",
62
+ "unexpected-equals-sign-before-attribute-name": "Unexpected = before attribute name",
63
+ # Script errors
64
+ "eof-in-script-html-comment-like-text": "Unexpected end of file in script with HTML-like comment",
65
+ "eof-in-script-in-script": "Unexpected end of file in nested script tag",
66
+ # CDATA errors
67
+ "eof-in-cdata": "Unexpected end of file in CDATA section",
68
+ "cdata-in-html-content": "CDATA section only allowed in SVG/MathML content",
69
+ # NULL character errors
70
+ "unexpected-null-character": "Unexpected NULL character (U+0000)",
71
+ # Markup declaration errors
72
+ "incorrectly-opened-comment": "Incorrectly opened comment",
73
+ # Character reference errors
74
+ "control-character-reference": "Invalid control character in character reference",
75
+ "illegal-codepoint-for-numeric-entity": "Invalid codepoint in numeric character reference",
76
+ "missing-semicolon-after-character-reference": "Missing semicolon after character reference",
77
+ "named-entity-without-semicolon": "Named entity used without semicolon",
78
+ # ================================================================
79
+ # TREE BUILDER ERRORS
80
+ # ================================================================
81
+ # DOCTYPE errors
82
+ "unexpected-doctype": "Unexpected DOCTYPE declaration",
83
+ "unknown-doctype": "Unknown DOCTYPE (expected <!DOCTYPE html>)",
84
+ "expected-doctype-but-got-chars": "Expected DOCTYPE but got text content",
85
+ "expected-doctype-but-got-eof": "Expected DOCTYPE but reached end of file",
86
+ "expected-doctype-but-got-start-tag": f"Expected DOCTYPE but got <{tag_name}> tag",
87
+ "expected-doctype-but-got-end-tag": f"Expected DOCTYPE but got </{tag_name}> tag",
88
+ "unexpected-doctype-in-foreign-content": "Unexpected DOCTYPE in SVG/MathML content",
89
+ # Unexpected tag errors
90
+ "unexpected-start-tag": f"Unexpected <{tag_name}> start tag",
91
+ "unexpected-end-tag": f"Unexpected </{tag_name}> end tag",
92
+ "unexpected-end-tag-before-html": f"Unexpected </{tag_name}> end tag before <html>",
93
+ "unexpected-end-tag-before-head": f"Unexpected </{tag_name}> end tag before <head>",
94
+ "unexpected-end-tag-after-head": f"Unexpected </{tag_name}> end tag after <head>",
95
+ "unexpected-start-tag-ignored": f"<{tag_name}> start tag ignored in current context",
96
+ "unexpected-start-tag-implies-end-tag": f"<{tag_name}> start tag implicitly closes previous element",
97
+ # EOF errors
98
+ "expected-closing-tag-but-got-eof": f"Expected </{tag_name}> closing tag but reached end of file",
99
+ "expected-named-closing-tag-but-got-eof": f"Expected </{tag_name}> closing tag but reached end of file",
100
+ # Invalid character errors
101
+ "invalid-codepoint": "Invalid character (U+0000 NULL or U+000C FORM FEED)",
102
+ "invalid-codepoint-before-head": "Invalid character before <head>",
103
+ "invalid-codepoint-in-body": "Invalid character in <body>",
104
+ "invalid-codepoint-in-table-text": "Invalid character in table text",
105
+ "invalid-codepoint-in-select": "Invalid character in <select>",
106
+ "invalid-codepoint-in-foreign-content": "Invalid character in SVG/MathML content",
107
+ # Foster parenting / table errors
108
+ "foster-parenting-character": "Text content in table requires foster parenting",
109
+ "foster-parenting-start-tag": "Start tag in table requires foster parenting",
110
+ "unexpected-start-tag-implies-table-voodoo": f"<{tag_name}> start tag in table triggers foster parenting",
111
+ "unexpected-end-tag-implies-table-voodoo": f"</{tag_name}> end tag in table triggers foster parenting",
112
+ "unexpected-cell-in-table-body": "Unexpected table cell outside of table row",
113
+ "unexpected-form-in-table": "Form element not allowed in table context",
114
+ "unexpected-hidden-input-in-table": "Hidden input in table triggers foster parenting",
115
+ # Context-specific errors
116
+ "unexpected-hidden-input-after-head": "Unexpected hidden input after <head>",
117
+ "unexpected-token-in-frameset": "Unexpected content in <frameset>",
118
+ "unexpected-token-after-frameset": "Unexpected content after <frameset>",
119
+ "unexpected-token-after-after-frameset": "Unexpected content after frameset closed",
120
+ "unexpected-token-after-body": "Unexpected content after </body>",
121
+ "unexpected-char-after-body": "Unexpected character after </body>",
122
+ "unexpected-characters-in-column-group": "Text not allowed in <colgroup>",
123
+ "unexpected-characters-in-template-column-group": "Text not allowed in template column group",
124
+ "unexpected-start-tag-in-column-group": f"<{tag_name}> start tag not allowed in <colgroup>",
125
+ "unexpected-start-tag-in-template-column-group": f"<{tag_name}> start tag not allowed in template column group",
126
+ "unexpected-start-tag-in-template-table-context": f"<{tag_name}> start tag not allowed in template table context",
127
+ "unexpected-start-tag-in-cell-fragment": f"<{tag_name}> start tag not allowed in cell fragment context",
128
+ # Foreign content errors
129
+ "unexpected-html-element-in-foreign-content": "HTML element breaks out of SVG/MathML content",
130
+ "unexpected-end-tag-in-foreign-content": f"Mismatched </{tag_name}> end tag in SVG/MathML content",
131
+ "unexpected-end-tag-in-fragment-context": f"</{tag_name}> end tag not allowed in fragment parsing context",
132
+ # Miscellaneous errors
133
+ "end-tag-too-early": f"</{tag_name}> end tag closed early (unclosed children)",
134
+ "adoption-agency-1.3": "Misnested tags require adoption agency algorithm",
135
+ "non-void-html-element-start-tag-with-trailing-solidus": f"<{tag_name}/> self-closing syntax on non-void element",
136
+ "image-start-tag": f"Deprecated <{tag_name}> tag (use <img> instead)",
137
+ }
138
+
139
+ # Return message or fall back to the code itself if not found
140
+ return messages.get(code, code)