PyPI - justhtml - Versions diffs - 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl - Mend

justhtml 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of justhtml might be problematic. Click here for more details.

Files changed (21) hide show

justhtml/__init__.py +44 -2
justhtml/__main__.py +45 -9
justhtml/constants.py +12 -0
justhtml/errors.py +8 -3
justhtml/linkify.py +438 -0
justhtml/node.py +54 -35
justhtml/parser.py +105 -38
justhtml/sanitize.py +511 -282
justhtml/selector.py +3 -1
justhtml/serialize.py +398 -72
justhtml/tokenizer.py +121 -21
justhtml/tokens.py +21 -3
justhtml/transforms.py +2568 -0
justhtml/treebuilder.py +247 -190
justhtml/treebuilder_modes.py +108 -102
{justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/METADATA +28 -7
justhtml-0.38.0.dist-info/RECORD +26 -0
{justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/licenses/LICENSE +1 -1
justhtml-0.24.0.dist-info/RECORD +0 -24
{justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/WHEEL +0 -0
{justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/entry_points.txt +0 -0

justhtml/selector.py CHANGED Viewed

@@ -651,7 +651,9 @@ class SelectorMatcher:
         attr_value: str | None = None
         for name, value in attrs.items():
             if name.lower() == attr_name:
-                attr_value = value
+                # Attributes can be boolean (represented as None in JustHTML).
+                # For selector matching, presence should still count.
+                attr_value = "" if value is None else str(value)
                 break
         if attr_value is None:

justhtml/serialize.py CHANGED Viewed

@@ -4,17 +4,22 @@
 from __future__ import annotations
+import re
 from typing import Any
-from .constants import FOREIGN_ATTRIBUTE_ADJUSTMENTS, SPECIAL_ELEMENTS, VOID_ELEMENTS
-from .sanitize import DEFAULT_DOCUMENT_POLICY, DEFAULT_POLICY, SanitizationPolicy, sanitize
+from .constants import FOREIGN_ATTRIBUTE_ADJUSTMENTS, SPECIAL_ELEMENTS, VOID_ELEMENTS, WHITESPACE_PRESERVING_ELEMENTS
+# Matches characters that prevent an attribute value from being unquoted.
+# Note: This matches the logic of the previous loop-based implementation.
+# It checks for space characters, quotes, equals sign, and greater-than.
+_UNQUOTED_ATTR_VALUE_INVALID = re.compile(r'[ \t\n\f\r"\'=>]')
 def _escape_text(text: str | None) -> str:
     if not text:
         return ""
     # Minimal, but matches html5lib serializer expectations in core cases.
-    return str(text).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
+    return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
 def _choose_attr_quote(value: str | None, forced_quote_char: str | None = None) -> str:
@@ -22,7 +27,7 @@ def _choose_attr_quote(value: str | None, forced_quote_char: str | None = None)
         return forced_quote_char
     if value is None:
         return '"'
-    value = str(value)
+    # value is assumed to be a string
     if '"' in value and "'" not in value:
         return "'"
     return '"'
@@ -31,7 +36,7 @@ def _choose_attr_quote(value: str | None, forced_quote_char: str | None = None)
 def _escape_attr_value(value: str | None, quote_char: str, *, escape_lt_in_attrs: bool = False) -> str:
     if value is None:
         return ""
-    value = str(value)
+    # value is assumed to be a string
     value = value.replace("&", "&amp;")
     if escape_lt_in_attrs:
         value = value.replace("<", "&lt;")
@@ -44,15 +49,8 @@ def _escape_attr_value(value: str | None, quote_char: str, *, escape_lt_in_attrs
 def _can_unquote_attr_value(value: str | None) -> bool:
     if value is None:
         return False
-    value = str(value)
-    for ch in value:
-        if ch == ">":
-            return False
-        if ch in {'"', "'", "="}:
-            return False
-        if ch in {" ", "\t", "\n", "\f", "\r"}:
-            return False
-    return True
+    # Optimization: use regex instead of loop
+    return not _UNQUOTED_ATTR_VALUE_INVALID.search(value)
 def _serializer_minimize_attr_value(name: str, value: str | None, minimize_boolean_attributes: bool) -> bool:
@@ -60,7 +58,9 @@ def _serializer_minimize_attr_value(name: str, value: str | None, minimize_boole
         return False
     if value is None or value == "":
         return True
-    return str(value).lower() == str(name).lower()
+    if value == name:
+        return True
+    return value.lower() == name
 def serialize_start_tag(
@@ -86,7 +86,8 @@ def serialize_start_tag(
                 parts.extend([" ", key, '=""'])
                 continue
-            value_str = str(value)
+            # value is guaranteed to be a string here because attrs is dict[str, str | None]
+            value_str = value
             if value_str == "":
                 parts.extend([" ", key, '=""'])
                 continue
@@ -118,15 +119,8 @@ def to_html(
     indent_size: int = 2,
     *,
     pretty: bool = True,
-    safe: bool = True,
-    policy: SanitizationPolicy | None = None,
 ) -> str:
     """Convert node to HTML string."""
-    if safe:
-        if policy is None and node.name == "#document":
-            node = sanitize(node, policy=DEFAULT_DOCUMENT_POLICY)
-        else:
-            node = sanitize(node, policy=policy or DEFAULT_POLICY)
     if node.name == "#document":
         # Document root - just render children
         parts: list[str] = []
@@ -136,12 +130,6 @@ def to_html(
     return _node_to_html(node, indent, indent_size, pretty, in_pre=False)
-_PREFORMATTED_ELEMENTS: set[str] = {"pre", "textarea", "code"}
-# Elements whose text content must not be normalized (e.g. scripts/styles).
-_RAWTEXT_ELEMENTS: set[str] = {"script", "style"}
 def _collapse_html_whitespace(text: str) -> str:
     """Collapse HTML whitespace runs to a single space and trim edges.
@@ -151,20 +139,26 @@ def _collapse_html_whitespace(text: str) -> str:
     if not text:
         return ""
-    parts: list[str] = []
-    in_whitespace = False
-    for ch in text:
-        if ch in {" ", "\t", "\n", "\f", "\r"}:
-            if not in_whitespace:
-                parts.append(" ")
-                in_whitespace = True
-            continue
-        parts.append(ch)
+    # Optimization: split() handles whitespace collapsing efficiently.
+    # Note: split() treats \v as whitespace, which is not HTML whitespace.
+    # But \v is extremely rare in HTML.
+    if "\v" in text:
+        parts: list[str] = []
         in_whitespace = False
+        for ch in text:
+            if ch in {" ", "\t", "\n", "\f", "\r"}:
+                if not in_whitespace:
+                    parts.append(" ")
+                    in_whitespace = True
+                continue
+            parts.append(ch)
+            in_whitespace = False
-    collapsed = "".join(parts)
-    return collapsed.strip(" ")
+        collapsed = "".join(parts)
+        return collapsed.strip(" ")
+    return " ".join(text.split())
 def _normalize_formatting_whitespace(text: str) -> str:
@@ -226,6 +220,149 @@ def _is_whitespace_text_node(node: Any) -> bool:
     return node.name == "#text" and (node.data or "").strip() == ""
+def _is_blocky_element(node: Any) -> bool:
+    # Treat elements as block-ish if they are block-level *or* contain any block-level
+    # descendants. This keeps pretty-printing readable for constructs like <a><div>...</div></a>.
+    try:
+        name = node.name
+    except AttributeError:
+        return False
+    if name in {"#text", "#comment", "!doctype"}:
+        return False
+    if name in SPECIAL_ELEMENTS:
+        return True
+    try:
+        children = node.children or []
+    except AttributeError:
+        return False
+    if not children:
+        return False
+    stack: list[Any] = list(children)
+    while stack:
+        child = stack.pop()
+        if child is None:
+            continue
+        child_name = child.name
+        if child_name in SPECIAL_ELEMENTS:
+            return True
+        if child_name in {"#text", "#comment", "!doctype"}:
+            continue
+        grand_children = child.children
+        if grand_children:
+            stack.extend(grand_children)
+    return False
+_LAYOUT_BLOCK_ELEMENTS = {
+    "address",
+    "article",
+    "aside",
+    "blockquote",
+    "body",
+    "caption",
+    "center",
+    "dd",
+    "details",
+    "dialog",
+    "dir",
+    "div",
+    "dl",
+    "dt",
+    "fieldset",
+    "figcaption",
+    "figure",
+    "footer",
+    "form",
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "header",
+    "hgroup",
+    "hr",
+    "html",
+    "iframe",
+    "li",
+    "listing",
+    "main",
+    "marquee",
+    "menu",
+    "nav",
+    "noframes",
+    "noscript",
+    "ol",
+    "p",
+    "plaintext",
+    "pre",
+    "search",
+    "section",
+    "summary",
+    "table",
+    "tbody",
+    "td",
+    "tfoot",
+    "th",
+    "thead",
+    "tr",
+    "ul",
+}
+_FORMAT_SEP = object()
+def _is_layout_blocky_element(node: Any) -> bool:
+    # Similar to _is_blocky_element(), but limited to actual layout blocks.
+    # This avoids turning inline-ish "special" elements like <script> into
+    # multiline pretty-print breaks in contexts like <p>.
+    try:
+        name = node.name
+    except AttributeError:
+        return False
+    if name in {"#text", "#comment", "!doctype"}:
+        return False
+    if name in _LAYOUT_BLOCK_ELEMENTS:
+        return True
+    try:
+        children = node.children or []
+    except AttributeError:
+        return False
+    if not children:
+        return False
+    stack: list[Any] = list(children)
+    while stack:
+        child = stack.pop()
+        if child is None:
+            continue
+        child_name = child.name
+        if child_name in _LAYOUT_BLOCK_ELEMENTS:
+            return True
+        if child_name in {"#text", "#comment", "!doctype"}:
+            continue
+        grand_children = child.children
+        if grand_children:
+            stack.extend(grand_children)
+    return False
+def _is_formatting_whitespace_text(data: str) -> bool:
+    # Formatting whitespace is something users typically don't intend to preserve
+    # exactly (e.g. newlines/indentation, or large runs of spaces).
+    if not data:
+        return False
+    if "\n" in data or "\r" in data or "\t" in data or "\f" in data:
+        return True
+    return len(data) > 2
 def _should_pretty_indent_children(children: list[Any]) -> bool:
     for child in children:
         if child is None:
@@ -243,26 +380,18 @@ def _should_pretty_indent_children(children: list[Any]) -> bool:
         return True
     if len(element_children) == 1:
         only_child = element_children[0]
-        if only_child.name in SPECIAL_ELEMENTS:
+        if _is_blocky_element(only_child):
             return True
-        if only_child.name == "a":
-            # If an anchor wraps block-ish content (valid HTML5), treat it as block-ish
-            # for pretty-printing so the parent can indent it on its own line.
-            for grandchild in only_child.children or []:
-                if grandchild is None:
-                    continue
-                if grandchild.name in SPECIAL_ELEMENTS:
-                    return True
         return False
     # Safe indentation rule: only insert inter-element whitespace when we won't
     # be placing it between two adjacent inline/phrasing elements.
-    prev_is_special = element_children[0].name in SPECIAL_ELEMENTS
+    prev_is_blocky = _is_blocky_element(element_children[0])
     for child in element_children[1:]:
-        current_is_special = child.name in SPECIAL_ELEMENTS
-        if not prev_is_special and not current_is_special:
+        current_is_blocky = _is_blocky_element(child)
+        if not prev_is_blocky and not current_is_blocky:
             return False
-        prev_is_special = current_is_special
+        prev_is_blocky = current_is_blocky
     return True
@@ -270,7 +399,7 @@ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool
     """Helper to convert a node to HTML."""
     prefix = " " * (indent * indent_size) if pretty and not in_pre else ""
     name: str = node.name
-    content_pre = in_pre or name in _PREFORMATTED_ELEMENTS
+    content_pre = in_pre or name in WHITESPACE_PRESERVING_ELEMENTS
     newline = "\n" if pretty and not content_pre else ""
     # Text node
@@ -320,14 +449,19 @@ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool
         return f"{prefix}{open_tag}{serialize_end_tag(name)}"
     # Check if all children are text-only (inline rendering)
-    all_text = all(c.name == "#text" for c in children)
+    all_text = True
+    for child in children:
+        if child is None:
+            continue
+        if child.name != "#text":
+            all_text = False
+            break
     if all_text and pretty and not content_pre:
         # Serializer controls sanitization at the to_html() entry point; avoid
         # implicit re-sanitization during rendering.
-        text_content = node.to_text(separator="", strip=False, safe=False)
-        if name not in _RAWTEXT_ELEMENTS:
-            text_content = _collapse_html_whitespace(text_content)
+        text_content = node.to_text(separator="", strip=False)
+        text_content = _collapse_html_whitespace(text_content)
         return f"{prefix}{open_tag}{_escape_text(text_content)}{serialize_end_tag(name)}"
     if pretty and content_pre:
@@ -338,11 +472,204 @@ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool
         )
         return f"{prefix}{open_tag}{inner}{serialize_end_tag(name)}"
+    if pretty and not content_pre and name in SPECIAL_ELEMENTS:
+        # For block-ish containers that only have element children (and/or
+        # whitespace-only text nodes), prefer a multiline layout for readability
+        # even when children are inline elements.
+        can_indent = True
+        for child in children:
+            if child is None:
+                continue
+            if child.name == "#comment":
+                can_indent = False
+                break
+            if child.name == "#text" and (child.data or "").strip():
+                can_indent = False
+                break
+        if can_indent:
+            inner_lines: list[str] = []
+            for child in children:
+                if child is None:
+                    continue
+                if _is_whitespace_text_node(child):
+                    continue
+                child_html = _node_to_html(child, indent + 1, indent_size, pretty, in_pre=content_pre)
+                if child_html:
+                    inner_lines.append(child_html)
+            if inner_lines:
+                parts = [f"{prefix}{open_tag}"]
+                parts.extend(inner_lines)
+                parts.append(f"{prefix}{serialize_end_tag(name)}")
+                return "\n".join(parts)
+        # Smart pretty-printing: if the author already inserted formatting whitespace
+        # between siblings, we can split into "inline runs" and put each run on its
+        # own line without introducing new inter-token whitespace.
+        has_comment = any(child is not None and child.name == "#comment" for child in children)
+        if not has_comment:
+            non_none_children: list[Any] = [child for child in children if child is not None]
+            # Only enable this mode if there is at least one formatting whitespace text node
+            # between non-whitespace siblings.
+            has_separator = False
+            for child in non_none_children[1:-1]:
+                if child.name != "#text":
+                    continue
+                data = child.data or ""
+                if data.strip() != "":
+                    continue
+                if _is_formatting_whitespace_text(data):
+                    has_separator = True
+                    break
+            if has_separator:
+                # Build runs by splitting on formatting whitespace text nodes.
+                # Keep small spacing nodes (" " or "  ") inside runs.
+                items: list[Any] = []
+                last_was_sep = False
+                for child in non_none_children:
+                    if child.name == "#text":
+                        data = child.data or ""
+                        if data.strip() == "" and _is_formatting_whitespace_text(data):
+                            if not last_was_sep:
+                                items.append(_FORMAT_SEP)
+                                last_was_sep = True
+                            continue
+                    items.append(child)
+                    last_was_sep = False
+                while items and items[0] is _FORMAT_SEP:
+                    items.pop(0)
+                while items and items[-1] is _FORMAT_SEP:
+                    items.pop()
+                runs: list[list[Any]] = []
+                current_run: list[Any] = []
+                for item in items:
+                    if item is _FORMAT_SEP:
+                        runs.append(current_run)
+                        current_run = []
+                        continue
+                    current_run.append(item)
+                runs.append(current_run)
+                runs = [run for run in runs if run]
+                # Only apply if we can render each run either as a single blocky element
+                # (possibly multiline) or as a single-line inline run.
+                smart_lines: list[str] = []
+                can_apply = True
+                for run in runs:
+                    blocky_elements = [c for c in run if c.name not in {"#text", "#comment"} and _is_blocky_element(c)]
+                    if blocky_elements and len(run) != 1:
+                        can_apply = False
+                        break
+                    if len(run) == 1 and run[0].name != "#text":
+                        child_html = _node_to_html(run[0], indent + 1, indent_size, pretty=True, in_pre=content_pre)
+                        smart_lines.append(child_html)
+                        continue
+                    # Inline run: render on one line.
+                    run_parts: list[str] = []
+                    for c in run:
+                        if c.name == "#text":
+                            data = c.data or ""
+                            if not data.strip():
+                                # Formatting whitespace never appears inside runs (it is used as a separator).
+                                # Preserve intentional tiny spacing.
+                                run_parts.append(data)
+                                continue
+                            run_parts.append(_escape_text(_normalize_formatting_whitespace(data)))
+                            continue
+                        # Render inline elements without their own leading indentation.
+                        child_html = _node_to_html(c, 0, indent_size, pretty=True, in_pre=content_pre)
+                        run_parts.append(child_html)
+                    smart_lines.append(f"{' ' * ((indent + 1) * indent_size)}{''.join(run_parts)}")
+                if can_apply and smart_lines:
+                    return f"{prefix}{open_tag}\n" + "\n".join(smart_lines) + f"\n{prefix}{serialize_end_tag(name)}"
     if pretty and not content_pre and not _should_pretty_indent_children(children):
         # For block-ish elements that contain only element children and whitespace-only
         # text nodes, we can still format each child on its own line (only when there
         # is already whitespace separating element siblings).
         if name in SPECIAL_ELEMENTS:
+            # Mixed content in block-ish containers: if we encounter a blocky child
+            # (e.g. <ul>) adjacent to inline text, printing everything on one line
+            # both hurts readability and can lose indentation inside the block subtree.
+            # In that case, put inline runs and blocky children on their own lines.
+            has_comment = any(child is not None and child.name == "#comment" for child in children)
+            if not has_comment:
+                has_blocky_child = any(
+                    child is not None and child.name not in {"#text", "#comment"} and _is_layout_blocky_element(child)
+                    for child in children
+                )
+                has_non_whitespace_text = any(
+                    child is not None and child.name == "#text" and (child.data or "").strip() for child in children
+                )
+                if has_blocky_child and has_non_whitespace_text:
+                    mixed_multiline_lines: list[str] = []
+                    inline_parts: list[str] = []
+                    mixed_first_non_none_index: int | None = None
+                    mixed_last_non_none_index: int | None = None
+                    for i, child in enumerate(children):
+                        if child is None:
+                            continue
+                        if mixed_first_non_none_index is None:
+                            mixed_first_non_none_index = i
+                        mixed_last_non_none_index = i
+                    def flush_inline() -> None:
+                        if not inline_parts:
+                            return
+                        line = "".join(inline_parts).strip(" ")
+                        inline_parts.clear()
+                        if line:
+                            mixed_multiline_lines.append(f"{' ' * ((indent + 1) * indent_size)}{line}")
+                    for i, child in enumerate(children):
+                        if child is None:
+                            continue
+                        if child.name == "#text":
+                            data = child.data or ""
+                            if not data.strip():
+                                # Drop leading/trailing formatting whitespace.
+                                if i == mixed_first_non_none_index or i == mixed_last_non_none_index:
+                                    continue
+                                # Preserve intentional small spacing, but treat formatting whitespace
+                                # as a separator between inline runs (new line).
+                                if "\n" in data or "\r" in data or "\t" in data or len(data) > 2:
+                                    flush_inline()
+                                else:
+                                    inline_parts.append(data)
+                                continue
+                            data = _normalize_formatting_whitespace(data)
+                            inline_parts.append(_escape_text(data))
+                            continue
+                        if _is_layout_blocky_element(child):
+                            flush_inline()
+                            mixed_multiline_lines.append(
+                                _node_to_html(child, indent + 1, indent_size, pretty=True, in_pre=content_pre)
+                            )
+                            continue
+                        # Inline element: keep it in the current line without leading indentation.
+                        inline_parts.append(_node_to_html(child, 0, indent_size, pretty=True, in_pre=content_pre))
+                    flush_inline()
+                    inner = "\n".join(line for line in mixed_multiline_lines if line)
+                    return f"{prefix}{open_tag}\n{inner}\n{prefix}{serialize_end_tag(name)}"
             has_comment = False
             has_element = False
             has_whitespace_between_elements = False
@@ -388,32 +715,32 @@ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool
                         break
             if has_element and has_whitespace_between_elements and not has_comment and can_indent_non_whitespace_text:
-                inner_lines: list[str] = []
+                element_multiline_lines: list[str] = []
                 for child in children:
                     if child is None:
                         continue
                     if child.name == "#text":
                         text = _collapse_html_whitespace(child.data or "")
                         if text:
-                            inner_lines.append(f"{' ' * ((indent + 1) * indent_size)}{_escape_text(text)}")
+                            element_multiline_lines.append(f"{' ' * ((indent + 1) * indent_size)}{_escape_text(text)}")
                         continue
                     child_html = _node_to_html(child, indent + 1, indent_size, pretty=True, in_pre=content_pre)
                     if child_html:
-                        inner_lines.append(child_html)
-                if inner_lines:
-                    inner = "\n".join(inner_lines)
+                        element_multiline_lines.append(child_html)
+                if element_multiline_lines:
+                    inner = "\n".join(element_multiline_lines)
                     return f"{prefix}{open_tag}\n{inner}\n{prefix}{serialize_end_tag(name)}"
         inner_parts: list[str] = []
-        first_non_none_index: int | None = None
-        last_non_none_index: int | None = None
+        compact_first_non_none_index: int | None = None
+        compact_last_non_none_index: int | None = None
         for i, child in enumerate(children):
             if child is None:
                 continue
-            if first_non_none_index is None:
-                first_non_none_index = i
-            last_non_none_index = i
+            if compact_first_non_none_index is None:
+                compact_first_non_none_index = i
+            compact_last_non_none_index = i
         for i, child in enumerate(children):
             if child is None:
@@ -423,15 +750,14 @@ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool
                 data = child.data or ""
                 if not data.strip():
                     # Drop leading/trailing formatting whitespace in compact mode.
-                    if i == first_non_none_index or i == last_non_none_index:
+                    if i == compact_first_non_none_index or i == compact_last_non_none_index:
                         continue
                     # Preserve intentional small spacing, but collapse large formatting gaps.
                     if "\n" in data or "\r" in data or "\t" in data or len(data) > 2:
                         inner_parts.append(" ")
                         continue
-                if not content_pre and name not in _RAWTEXT_ELEMENTS:
-                    data = _normalize_formatting_whitespace(data)
+                data = _normalize_formatting_whitespace(data)
                 child_html = _escape_text(data) if data else ""
             else:
                 # Even when we can't safely insert whitespace *between* siblings, we can

justhtml 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl

Potentially problematic release.

justhtml 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl