PyPI - justhtml - Versions diffs - 0.12.0__py3-none-any.whl - Mend

justhtml 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of justhtml might be problematic. Click here for more details.

Files changed (23)

justhtml/__init__.py +17 -0
justhtml/__main__.py +144 -0
justhtml/constants.py +445 -0
justhtml/context.py +12 -0
justhtml/encoding.py +405 -0
justhtml/entities.py +344 -0
justhtml/errors.py +140 -0
justhtml/node.py +632 -0
justhtml/parser.py +131 -0
justhtml/py.typed +0 -0
justhtml/selector.py +965 -0
justhtml/serialize.py +258 -0
justhtml/stream.py +107 -0
justhtml/tokenizer.py +2647 -0
justhtml/tokens.py +223 -0
justhtml/treebuilder.py +1279 -0
justhtml/treebuilder_modes.py +2016 -0
justhtml/treebuilder_utils.py +93 -0
justhtml-0.12.0.dist-info/METADATA +164 -0
justhtml-0.12.0.dist-info/RECORD +23 -0
justhtml-0.12.0.dist-info/WHEEL +4 -0
justhtml-0.12.0.dist-info/entry_points.txt +2 -0
justhtml-0.12.0.dist-info/licenses/LICENSE +21 -0

justhtml/entities.py ADDED Viewed

@@ -0,0 +1,344 @@
+"""HTML5 character entity decoding.
+Implements HTML5 character reference (entity) decoding per WHATWG spec §13.2.5.
+Supports both named entities (&amp;, &nbsp;) and numeric references (&#60;, &#x3C;).
+"""
+from __future__ import annotations
+import html.entities
+# html.entities.html5 is the standard library's complete HTML5 entity table
+# (2231 entries). Most keys end with ";" (e.g. "amp;", "lang;"); keys without
+# one are handled as well when the lookup table below is built.
+_HTML5_ENTITIES: dict[str, str] = html.entities.html5
+# Semicolon-free view of that table, so a name is looked up the same way
+# whether or not the reference in the input was terminated with ";".
+NAMED_ENTITIES: dict[str, str] = {}
+for _key, _value in _HTML5_ENTITIES.items():
+    NAMED_ENTITIES[_key[:-1] if _key.endswith(";") else _key] = _value
+# Named character references that are still recognised when the trailing ";"
+# is missing. Per the HTML5 spec these are mainly the ISO-8859-1 (Latin-1)
+# entities inherited from HTML4; newer names such as "prod" or "notin" are
+# only valid with a semicolon. Several names exist in two spellings
+# (COPY/copy, GT/gt, ...), and both are listed.
+LEGACY_ENTITIES: set[str] = {
+    # Markup-significant characters and the non-breaking space
+    "amp", "gt", "lt", "nbsp", "quot",
+    # All-uppercase spellings
+    "AMP", "COPY", "GT", "LT", "QUOT", "REG",
+    # Capital letters
+    "AElig", "Aacute", "Acirc", "Agrave", "Aring", "Atilde", "Auml",
+    "Ccedil", "ETH", "Eacute", "Ecirc", "Egrave", "Euml",
+    "Iacute", "Icirc", "Igrave", "Iuml", "Ntilde",
+    "Oacute", "Ocirc", "Ograve", "Oslash", "Otilde", "Ouml",
+    "THORN", "Uacute", "Ucirc", "Ugrave", "Uuml", "Yacute",
+    # Lowercase letters and symbols
+    "aacute", "acirc", "acute", "aelig", "agrave", "aring", "atilde", "auml",
+    "brvbar", "ccedil", "cedil", "cent", "copy", "curren",
+    "deg", "divide", "eacute", "ecirc", "egrave", "eth", "euml",
+    "frac12", "frac14", "frac34",
+    "iacute", "icirc", "iexcl", "igrave", "iquest", "iuml",
+    "laquo", "macr", "micro", "middot", "not", "ntilde",
+    "oacute", "ocirc", "ograve", "ordf", "ordm", "oslash", "otilde", "ouml",
+    "para", "plusmn", "pound", "raquo", "reg",
+    "sect", "shy", "sup1", "sup2", "sup3", "szlig",
+    "thorn", "times", "uacute", "ucirc", "ugrave", "uml", "uuml",
+    "yacute", "yen", "yuml",
+}
+# Numeric character references that HTML5 remaps instead of decoding literally
+# (the table used by the spec's "numeric character reference end state").
+# Code points in 0x80-0x9F that are absent here (0x81, 0x8D, 0x8F, 0x90, 0x9D)
+# are not remapped and decode to themselves.
+NUMERIC_REPLACEMENTS: dict[int, str] = {
+    0x00: "\ufffd",  # NULL
+    0x80: "\u20ac",  # EURO SIGN
+    0x82: "\u201a",  # SINGLE LOW-9 QUOTATION MARK
+    0x83: "\u0192",  # LATIN SMALL LETTER F WITH HOOK
+    0x84: "\u201e",  # DOUBLE LOW-9 QUOTATION MARK
+    0x85: "\u2026",  # HORIZONTAL ELLIPSIS
+    0x86: "\u2020",  # DAGGER
+    0x87: "\u2021",  # DOUBLE DAGGER
+    0x88: "\u02c6",  # MODIFIER LETTER CIRCUMFLEX ACCENT
+    0x89: "\u2030",  # PER MILLE SIGN
+    0x8A: "\u0160",  # LATIN CAPITAL LETTER S WITH CARON
+    0x8B: "\u2039",  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+    0x8C: "\u0152",  # LATIN CAPITAL LIGATURE OE
+    0x8E: "\u017d",  # LATIN CAPITAL LETTER Z WITH CARON
+    0x91: "\u2018",  # LEFT SINGLE QUOTATION MARK
+    0x92: "\u2019",  # RIGHT SINGLE QUOTATION MARK
+    0x93: "\u201c",  # LEFT DOUBLE QUOTATION MARK
+    0x94: "\u201d",  # RIGHT DOUBLE QUOTATION MARK
+    0x95: "\u2022",  # BULLET
+    0x96: "\u2013",  # EN DASH
+    0x97: "\u2014",  # EM DASH
+    0x98: "\u02dc",  # SMALL TILDE
+    0x99: "\u2122",  # TRADE MARK SIGN
+    0x9A: "\u0161",  # LATIN SMALL LETTER S WITH CARON
+    0x9B: "\u203a",  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+    0x9C: "\u0153",  # LATIN SMALL LIGATURE OE
+    0x9E: "\u017e",  # LATIN SMALL LETTER Z WITH CARON
+    0x9F: "\u0178",  # LATIN CAPITAL LETTER Y WITH DIAERESIS
+}
+def decode_numeric_entity(text: str, is_hex: bool = False) -> str:
+    """Decode the digits of a numeric character reference like &#60; or &#x3C;.
+    Args:
+        text: The numeric part (without &#, the x marker, or ;)
+        is_hex: Whether this is hexadecimal (&#x) or decimal (&#)
+    Returns:
+        The decoded character. Code points listed in NUMERIC_REPLACEMENTS are
+        remapped; values above U+10FFFF and surrogates (U+D800-U+DFFF) yield
+        U+FFFD REPLACEMENT CHARACTER.
+    Raises:
+        ValueError: If text is empty or is not a valid number in the chosen base.
+    """
+    base = 16 if is_hex else 10
+    try:
+        codepoint = int(text, base)
+    except ValueError:
+        # Python 3.11+ refuses to convert very long *decimal* strings (the
+        # int/str conversion length limit). A string made only of ASCII digits
+        # cannot fail for any other reason, so handle that case here and let
+        # genuinely malformed input keep raising.
+        if is_hex or not (text.isascii() and text.isdigit()):
+            raise
+        significant = text.lstrip("0")
+        # 0x10FFFF is 1114111: anything with more than 7 significant decimal
+        # digits is out of range without needing to be converted.
+        if len(significant) > 7:
+            return "\ufffd"
+        codepoint = int(significant or "0")
+    # Spec-mandated remappings take priority over the range checks.
+    replacement = NUMERIC_REPLACEMENTS.get(codepoint)
+    if replacement is not None:
+        return replacement
+    # Out of Unicode range, or a surrogate that cannot stand alone.
+    if codepoint > 0x10FFFF or 0xD800 <= codepoint <= 0xDFFF:
+        return "\ufffd"  # REPLACEMENT CHARACTER
+    return chr(codepoint)
+# ASCII-only character classes. str.isdigit()/isalpha()/isalnum() also accept
+# non-ASCII characters (superscript digits, Arabic-Indic digits, accented
+# letters, ...), which are never part of an HTML character reference and which
+# int() may reject or misread.
+_DECIMAL_DIGITS = frozenset("0123456789")
+_HEX_DIGITS = frozenset("0123456789abcdefABCDEF")
+_ASCII_ALPHANUMERICS = frozenset(
+    "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
+)
+def _longest_legacy_prefix(name: str) -> str | None:
+    """Return the longest prefix of name that is a legacy entity, or None."""
+    for end in range(len(name), 0, -1):
+        prefix = name[:end]
+        if prefix in LEGACY_ENTITIES and prefix in NAMED_ENTITIES:
+            return prefix
+    return None
+def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
+    """Decode all HTML character references in text.
+    Handles:
+    - Named entities: &amp; &lt; &gt; &quot; &nbsp; etc.
+    - Decimal numeric: &#60; &#160; etc.
+    - Hex numeric: &#x3C; &#xA0; etc.
+    Legacy entities (LEGACY_ENTITIES) are also decoded without a trailing
+    semicolon, including as the longest prefix of a longer name ("&notit"
+    decodes the "&not" part). Anything that is not a recognised reference is
+    copied through unchanged.
+    Args:
+        text: Input text potentially containing entities
+        in_attribute: Whether this is an attribute value. In attributes a
+            legacy entity without semicolon is left undecoded when it is
+            followed by "=" or an ASCII alphanumeric.
+    Returns:
+        Text with entities decoded
+    """
+    result: list[str] = []
+    i = 0
+    length = len(text)
+    while i < length:
+        next_amp = text.find("&", i)
+        if next_amp == -1:
+            result.append(text[i:])
+            break
+        if next_amp > i:
+            result.append(text[i:next_amp])
+        i = next_amp
+        j = i + 1
+        # Numeric reference: "&#" digits or "&#x" hex digits
+        if j < length and text[j] == "#":
+            j += 1
+            is_hex = j < length and text[j] in "xX"
+            if is_hex:
+                j += 1
+            allowed = _HEX_DIGITS if is_hex else _DECIMAL_DIGITS
+            digit_start = j
+            while j < length and text[j] in allowed:
+                j += 1
+            digit_text = text[digit_start:j]
+            # A terminating ";" is consumed when present but is not required.
+            end = j + 1 if (j < length and text[j] == ";") else j
+            if digit_text:
+                result.append(decode_numeric_entity(digit_text, is_hex=is_hex))
+            else:
+                # No digits at all ("&#;", "&#x"): keep the text as-is
+                result.append(text[i:end])
+            i = end
+            continue
+        # Named reference: collect ASCII alphanumerics (names are case-sensitive)
+        while j < length and text[j] in _ASCII_ALPHANUMERICS:
+            j += 1
+        entity_name = text[i + 1 : j]
+        has_semicolon = j < length and text[j] == ";"
+        if not entity_name:
+            result.append("&")
+            i += 1
+            continue
+        # Exact match terminated by a semicolon
+        if has_semicolon and entity_name in NAMED_ENTITIES:
+            result.append(NAMED_ENTITIES[entity_name])
+            i = j + 1
+            continue
+        # The whole name is a legacy entity. Every legacy name is also in
+        # NAMED_ENTITIES, so the exact-match branch above has already handled
+        # the semicolon case; this is the no-semicolon form.
+        if entity_name in LEGACY_ENTITIES and entity_name in NAMED_ENTITIES:
+            # In attributes, don't decode when followed by "=" or an ASCII
+            # alphanumeric (named character reference state in the spec). The
+            # name scan above stops at the first non-alphanumeric, so only "="
+            # can occur here.
+            if in_attribute and j < length and text[j] == "=":
+                result.append("&")
+                i += 1
+                continue
+            result.append(NAMED_ENTITIES[entity_name])
+            i = j
+            continue
+        # Longest legacy prefix, e.g. "&notit" where "&not" is valid but
+        # "&notit" is not. Applies with or without a trailing semicolon.
+        prefix = _longest_legacy_prefix(entity_name)
+        if prefix is not None:
+            if in_attribute:
+                # The prefix is shorter than the scanned name, so it is always
+                # followed by an ASCII alphanumeric: leave it undecoded.
+                result.append("&")
+                i += 1
+                continue
+            result.append(NAMED_ENTITIES[prefix])
+            i += 1 + len(prefix)
+            continue
+        # No match found
+        if has_semicolon:
+            result.append(text[i : j + 1])
+            i = j + 1
+        else:
+            result.append("&")
+            i += 1
+    return "".join(result)

justhtml/errors.py ADDED Viewed

@@ -0,0 +1,140 @@
+"""Centralized error message definitions and helpers for HTML parsing errors.
+This module provides human-readable error messages for all parse error codes
+emitted by both the tokenizer and tree builder during HTML parsing.
+"""
+from __future__ import annotations
+# Both tables are built once at import time instead of on every call.
+# Messages that never mention a tag name.
+_STATIC_MESSAGES: dict[str, str] = {
+    # ================================================================
+    # TOKENIZER ERRORS
+    # ================================================================
+    # DOCTYPE errors
+    "eof-in-doctype": "Unexpected end of file in DOCTYPE declaration",
+    "eof-in-doctype-name": "Unexpected end of file while reading DOCTYPE name",
+    "eof-in-doctype-public-identifier": "Unexpected end of file in DOCTYPE public identifier",
+    "eof-in-doctype-system-identifier": "Unexpected end of file in DOCTYPE system identifier",
+    "expected-doctype-name-but-got-right-bracket": "Expected DOCTYPE name but got >",
+    "missing-whitespace-before-doctype-name": "Missing whitespace after <!DOCTYPE",
+    "abrupt-doctype-public-identifier": "DOCTYPE public identifier ended abruptly",
+    "abrupt-doctype-system-identifier": "DOCTYPE system identifier ended abruptly",
+    "missing-quote-before-doctype-public-identifier": "Missing quote before DOCTYPE public identifier",
+    "missing-quote-before-doctype-system-identifier": "Missing quote before DOCTYPE system identifier",
+    "missing-doctype-public-identifier": "Missing DOCTYPE public identifier",
+    "missing-doctype-system-identifier": "Missing DOCTYPE system identifier",
+    "missing-whitespace-before-doctype-public-identifier": "Missing whitespace before DOCTYPE public identifier",
+    "missing-whitespace-after-doctype-public-identifier": "Missing whitespace after DOCTYPE public identifier",
+    "missing-whitespace-between-doctype-public-and-system-identifiers": "Missing whitespace between DOCTYPE identifiers",
+    "missing-whitespace-after-doctype-name": "Missing whitespace after DOCTYPE name",
+    "unexpected-character-after-doctype-public-keyword": "Unexpected character after PUBLIC keyword",
+    "unexpected-character-after-doctype-system-keyword": "Unexpected character after SYSTEM keyword",
+    "unexpected-character-after-doctype-public-identifier": "Unexpected character after public identifier",
+    "unexpected-character-after-doctype-system-identifier": "Unexpected character after system identifier",
+    # Comment errors
+    "eof-in-comment": "Unexpected end of file in comment",
+    "abrupt-closing-of-empty-comment": "Comment ended abruptly with -->",
+    "incorrectly-closed-comment": "Comment ended with --!> instead of -->",
+    # Tag errors
+    "eof-in-tag": "Unexpected end of file in tag",
+    "eof-before-tag-name": "Unexpected end of file before tag name",
+    "empty-end-tag": "Empty end tag </> is not allowed",
+    "invalid-first-character-of-tag-name": "Invalid first character of tag name",
+    "unexpected-question-mark-instead-of-tag-name": "Unexpected ? instead of tag name",
+    "unexpected-character-after-solidus-in-tag": "Unexpected character after / in tag",
+    # Attribute errors
+    "duplicate-attribute": "Duplicate attribute name",
+    "missing-attribute-value": "Missing attribute value",
+    "unexpected-character-in-attribute-name": "Unexpected character in attribute name",
+    "unexpected-character-in-unquoted-attribute-value": "Unexpected character in unquoted attribute value",
+    "missing-whitespace-between-attributes": "Missing whitespace between attributes",
+    "unexpected-equals-sign-before-attribute-name": "Unexpected = before attribute name",
+    # Script errors
+    "eof-in-script-html-comment-like-text": "Unexpected end of file in script with HTML-like comment",
+    "eof-in-script-in-script": "Unexpected end of file in nested script tag",
+    # CDATA errors
+    "eof-in-cdata": "Unexpected end of file in CDATA section",
+    "cdata-in-html-content": "CDATA section only allowed in SVG/MathML content",
+    # NULL character errors
+    "unexpected-null-character": "Unexpected NULL character (U+0000)",
+    # Markup declaration errors
+    "incorrectly-opened-comment": "Incorrectly opened comment",
+    # Character reference errors
+    "control-character-reference": "Invalid control character in character reference",
+    "illegal-codepoint-for-numeric-entity": "Invalid codepoint in numeric character reference",
+    "missing-semicolon-after-character-reference": "Missing semicolon after character reference",
+    "named-entity-without-semicolon": "Named entity used without semicolon",
+    # ================================================================
+    # TREE BUILDER ERRORS
+    # ================================================================
+    # DOCTYPE errors
+    "unexpected-doctype": "Unexpected DOCTYPE declaration",
+    "unknown-doctype": "Unknown DOCTYPE (expected <!DOCTYPE html>)",
+    "expected-doctype-but-got-chars": "Expected DOCTYPE but got text content",
+    "expected-doctype-but-got-eof": "Expected DOCTYPE but reached end of file",
+    "unexpected-doctype-in-foreign-content": "Unexpected DOCTYPE in SVG/MathML content",
+    # Invalid character errors
+    "invalid-codepoint": "Invalid character (U+0000 NULL or U+000C FORM FEED)",
+    "invalid-codepoint-before-head": "Invalid character before <head>",
+    "invalid-codepoint-in-body": "Invalid character in <body>",
+    "invalid-codepoint-in-table-text": "Invalid character in table text",
+    "invalid-codepoint-in-select": "Invalid character in <select>",
+    "invalid-codepoint-in-foreign-content": "Invalid character in SVG/MathML content",
+    # Foster parenting / table errors
+    "foster-parenting-character": "Text content in table requires foster parenting",
+    "foster-parenting-start-tag": "Start tag in table requires foster parenting",
+    "unexpected-cell-in-table-body": "Unexpected table cell outside of table row",
+    "unexpected-form-in-table": "Form element not allowed in table context",
+    "unexpected-hidden-input-in-table": "Hidden input in table triggers foster parenting",
+    # Context-specific errors
+    "unexpected-hidden-input-after-head": "Unexpected hidden input after <head>",
+    "unexpected-token-in-frameset": "Unexpected content in <frameset>",
+    "unexpected-token-after-frameset": "Unexpected content after <frameset>",
+    "unexpected-token-after-after-frameset": "Unexpected content after frameset closed",
+    "unexpected-token-after-body": "Unexpected content after </body>",
+    "unexpected-char-after-body": "Unexpected character after </body>",
+    "unexpected-characters-in-column-group": "Text not allowed in <colgroup>",
+    "unexpected-characters-in-template-column-group": "Text not allowed in template column group",
+    # Foreign content errors
+    "unexpected-html-element-in-foreign-content": "HTML element breaks out of SVG/MathML content",
+    # Miscellaneous errors
+    "adoption-agency-1.3": "Misnested tags require adoption agency algorithm",
+}
+# Messages that embed the tag name; "{tag_name}" is filled in with str.format().
+_TAG_MESSAGE_TEMPLATES: dict[str, str] = {
+    # DOCTYPE errors
+    "expected-doctype-but-got-start-tag": "Expected DOCTYPE but got <{tag_name}> tag",
+    "expected-doctype-but-got-end-tag": "Expected DOCTYPE but got </{tag_name}> tag",
+    # Unexpected tag errors
+    "unexpected-start-tag": "Unexpected <{tag_name}> start tag",
+    "unexpected-end-tag": "Unexpected </{tag_name}> end tag",
+    "unexpected-end-tag-before-html": "Unexpected </{tag_name}> end tag before <html>",
+    "unexpected-end-tag-before-head": "Unexpected </{tag_name}> end tag before <head>",
+    "unexpected-end-tag-after-head": "Unexpected </{tag_name}> end tag after <head>",
+    "unexpected-start-tag-ignored": "<{tag_name}> start tag ignored in current context",
+    "unexpected-start-tag-implies-end-tag": "<{tag_name}> start tag implicitly closes previous element",
+    # EOF errors
+    "expected-closing-tag-but-got-eof": "Expected </{tag_name}> closing tag but reached end of file",
+    "expected-named-closing-tag-but-got-eof": "Expected </{tag_name}> closing tag but reached end of file",
+    # Foster parenting / table errors
+    "unexpected-start-tag-implies-table-voodoo": "<{tag_name}> start tag in table triggers foster parenting",
+    "unexpected-end-tag-implies-table-voodoo": "</{tag_name}> end tag in table triggers foster parenting",
+    # Context-specific errors
+    "unexpected-start-tag-in-column-group": "<{tag_name}> start tag not allowed in <colgroup>",
+    "unexpected-start-tag-in-template-column-group": "<{tag_name}> start tag not allowed in template column group",
+    "unexpected-start-tag-in-template-table-context": "<{tag_name}> start tag not allowed in template table context",
+    "unexpected-start-tag-in-cell-fragment": "<{tag_name}> start tag not allowed in cell fragment context",
+    # Foreign content errors
+    "unexpected-end-tag-in-foreign-content": "Mismatched </{tag_name}> end tag in SVG/MathML content",
+    "unexpected-end-tag-in-fragment-context": "</{tag_name}> end tag not allowed in fragment parsing context",
+    # Miscellaneous errors
+    "end-tag-too-early": "</{tag_name}> end tag closed early (unclosed children)",
+    "non-void-html-element-start-tag-with-trailing-solidus": "<{tag_name}/> self-closing syntax on non-void element",
+    "image-start-tag": "Deprecated <{tag_name}> tag (use <img> instead)",
+}
+def generate_error_message(code: str, tag_name: str | None = None) -> str:
+    """Generate human-readable error message from error code.
+    Args:
+        code: The error code string (kebab-case format)
+        tag_name: Optional tag name to include in the message for context.
+            Only messages that mention a tag use it; when it is None those
+            messages contain the literal text "None".
+    Returns:
+        Human-readable error message string, or the code itself when the
+        code is not known
+    """
+    template = _TAG_MESSAGE_TEMPLATES.get(code)
+    if template is not None:
+        return template.format(tag_name=tag_name)
+    # Return message or fall back to the code itself if not found
+    return _STATIC_MESSAGES.get(code, code)