PyPI - justhtml - Versions diffs - 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl - Mend

justhtml 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of justhtml might be problematic. Click here for more details.

Files changed (20)

justhtml/__init__.py +6 -0
justhtml/__main__.py +49 -16
justhtml/entities.py +45 -7
justhtml/errors.py +9 -0
justhtml/node.py +358 -89
justhtml/parser.py +70 -14
justhtml/sanitize.py +763 -0
justhtml/selector.py +114 -18
justhtml/serialize.py +332 -28
justhtml/tokenizer.py +249 -179
justhtml/tokens.py +8 -3
justhtml/treebuilder.py +50 -14
justhtml/treebuilder_modes.py +100 -36
justhtml-0.24.0.dist-info/METADATA +192 -0
justhtml-0.24.0.dist-info/RECORD +24 -0
{justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/licenses/LICENSE +4 -1
justhtml-0.12.0.dist-info/METADATA +0 -164
justhtml-0.12.0.dist-info/RECORD +0 -23
{justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/WHEEL +0 -0
{justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/entry_points.txt +0 -0

justhtml/selector.py CHANGED Viewed

@@ -3,6 +3,7 @@
 from __future__ import annotations
+from functools import lru_cache
 from typing import Any
@@ -529,6 +530,14 @@ class SelectorMatcher:
     __slots__ = ()
+    def _unquote_pseudo_arg(self, arg: str) -> str:
+        arg = arg.strip()
+        if len(arg) >= 2 and arg[0] == arg[-1] and arg[0] in ('"', "'"):
+            quote = arg[0]
+            # Minimal unescaping for common cases like :contains("click me")
+            return arg[1:-1].replace("\\" + quote, quote).replace("\\\\", "\\")
+        return arg
     def matches(self, node: Any, selector: ParsedSelector | CompoundSelector | SimpleSelector) -> bool:
         """Check if a node matches a parsed selector."""
         if isinstance(selector, SelectorList):
@@ -724,6 +733,17 @@ class SelectorMatcher:
                 return parent.name in ("#document", "#document-fragment")
             return False
+        if name == "contains":
+            if selector.arg is None:
+                raise SelectorError(":contains() requires a string argument")
+            needle = self._unquote_pseudo_arg(selector.arg)
+            if needle == "":
+                return True
+            # Non-standard (jQuery-style) pseudo-class: match elements whose descendant
+            # text contains the substring. We use `to_text()` to approximate textContent.
+            haystack: str = node.to_text(separator=" ", strip=True)
+            return needle in haystack
         if name == "first-of-type":
             return self._is_first_of_type(node)
@@ -743,7 +763,7 @@ class SelectorMatcher:
         """Get only element children (exclude text, comments, etc.)."""
         if not parent or not parent.has_child_nodes():
             return []
-        return [c for c in parent.children if hasattr(c, "name") and not c.name.startswith("#")]
+        return [c for c in parent.children if not c.name.startswith("#")]
     def _get_previous_sibling(self, node: Any) -> Any | None:
         """Get the previous element sibling. Returns None if node is first or not found."""
@@ -755,7 +775,7 @@ class SelectorMatcher:
         for child in parent.children:
             if child is node:
                 return prev
-            if hasattr(child, "name") and not child.name.startswith("#"):
+            if not child.name.startswith("#"):
                 prev = child
         return None  # node not in parent.children (detached)
@@ -903,7 +923,12 @@ def parse_selector(selector_string: str) -> ParsedSelector:
     if not selector_string or not selector_string.strip():
         raise SelectorError("Empty selector")
-    tokenizer = SelectorTokenizer(selector_string.strip())
+    return _parse_selector_cached(selector_string.strip())
+@lru_cache(maxsize=512)
+def _parse_selector_cached(selector_string: str) -> ParsedSelector:
+    tokenizer = SelectorTokenizer(selector_string)
     tokens = tokenizer.tokenize()
     parser = SelectorParser(tokens)
     return parser.parse()
@@ -913,6 +938,51 @@ def parse_selector(selector_string: str) -> ParsedSelector:
 _matcher: SelectorMatcher = SelectorMatcher()
+def _is_simple_tag_selector(selector: str) -> bool:
+    if not selector:
+        return False
+    ch0 = selector[0]
+    if not (ch0.isalpha() or ch0 == "_" or ch0 == "-" or ord(ch0) > 127):
+        return False
+    for ch in selector[1:]:
+        if ch.isalnum() or ch == "_" or ch == "-" or ord(ch) > 127:
+            continue
+        return False
+    return True
+def _query_descendants_tag(node: Any, tag_lower: str, results: list[Any]) -> None:
+    results_append = results.append
+    stack: list[Any] = []
+    root_children = node.children
+    if root_children:
+        stack.extend(reversed(root_children))
+    if node.name == "template" and node.namespace == "html":
+        template_content = node.template_content
+        if template_content:
+            stack.append(template_content)
+    while stack:
+        current = stack.pop()
+        name = current.name
+        if not name.startswith("#"):
+            if name == tag_lower or name.lower() == tag_lower:
+                results_append(current)
+        children = current.children
+        if children:
+            stack.extend(reversed(children))
+        if name == "template" and current.namespace == "html":
+            template_content = current.template_content
+            if template_content:
+                stack.append(template_content)
 def query(root: Any, selector_string: str) -> list[Any]:
     """
     Query the DOM tree starting from root, returning all matching elements.
@@ -927,27 +997,53 @@ def query(root: Any, selector_string: str) -> list[Any]:
     Returns:
         A list of matching nodes
     """
-    selector = parse_selector(selector_string)
+    selector_string = selector_string.strip()
+    if not selector_string:
+        raise SelectorError("Empty selector")
     results: list[Any] = []
+    if _is_simple_tag_selector(selector_string):
+        _query_descendants_tag(root, selector_string.lower(), results)
+        return results
+    selector = _parse_selector_cached(selector_string)
     _query_descendants(root, selector, results)
     return results
 def _query_descendants(node: Any, selector: ParsedSelector, results: list[Any]) -> None:
-    """Recursively search for matching nodes in descendants."""
-    # Only recurse into children (not the node itself)
-    if node.has_child_nodes():
-        for child in node.children:
-            # Check if this child matches
-            if hasattr(child, "name") and not child.name.startswith("#"):
-                if _matcher.matches(child, selector):
-                    results.append(child)
-            # Recurse into child's descendants
-            _query_descendants(child, selector, results)
-    # Also check template content if present
-    if hasattr(node, "template_content") and node.template_content:
-        _query_descendants(node.template_content, selector, results)
+    """Search for matching nodes in descendants."""
+    matcher_matches = _matcher.matches
+    results_append = results.append
+    # querySelectorAll searches descendants of root, not including root itself.
+    stack: list[Any] = []
+    root_children = node.children
+    if root_children:
+        stack.extend(reversed(root_children))
+    if node.name == "template" and node.namespace == "html":
+        template_content = node.template_content
+        if template_content:
+            stack.append(template_content)
+    while stack:
+        current = stack.pop()
+        name = current.name
+        if not name.startswith("#") and matcher_matches(current, selector):
+            results_append(current)
+        children = current.children
+        if children:
+            stack.extend(reversed(children))
+        if name == "template" and current.namespace == "html":
+            template_content = current.template_content
+            if template_content:
+                stack.append(template_content)
 def matches(node: Any, selector_string: str) -> bool:

justhtml/serialize.py CHANGED Viewed

@@ -6,7 +6,8 @@ from __future__ import annotations
 from typing import Any
-from .constants import FOREIGN_ATTRIBUTE_ADJUSTMENTS, VOID_ELEMENTS
+from .constants import FOREIGN_ATTRIBUTE_ADJUSTMENTS, SPECIAL_ELEMENTS, VOID_ELEMENTS
+from .sanitize import DEFAULT_DOCUMENT_POLICY, DEFAULT_POLICY, SanitizationPolicy, sanitize
 def _escape_text(text: str | None) -> str:
@@ -16,7 +17,9 @@ def _escape_text(text: str | None) -> str:
     return str(text).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
-def _choose_attr_quote(value: str | None) -> str:
+def _choose_attr_quote(value: str | None, forced_quote_char: str | None = None) -> str:
+    if forced_quote_char in {'"', "'"}:
+        return forced_quote_char
     if value is None:
         return '"'
     value = str(value)
@@ -25,11 +28,13 @@ def _choose_attr_quote(value: str | None) -> str:
     return '"'
-def _escape_attr_value(value: str | None, quote_char: str) -> str:
+def _escape_attr_value(value: str | None, quote_char: str, *, escape_lt_in_attrs: bool = False) -> str:
     if value is None:
         return ""
     value = str(value)
     value = value.replace("&", "&amp;")
+    if escape_lt_in_attrs:
+        value = value.replace("<", "&lt;")
     # Note: html5lib's default serializer does not escape '>' in attrs.
     if quote_char == '"':
         return value.replace('"', "&quot;")
@@ -40,8 +45,6 @@ def _can_unquote_attr_value(value: str | None) -> bool:
     if value is None:
         return False
     value = str(value)
-    # html5lib's serializer unquotes aggressively; match fixture expectations.
-    # Disallow whitespace and characters that would terminate/ambiguate the value.
     for ch in value:
         if ch == ">":
             return False
@@ -52,22 +55,56 @@ def _can_unquote_attr_value(value: str | None) -> bool:
     return True
-def serialize_start_tag(name: str, attrs: dict[str, str | None] | None) -> str:
+def _serializer_minimize_attr_value(name: str, value: str | None, minimize_boolean_attributes: bool) -> bool:
+    if not minimize_boolean_attributes:
+        return False
+    if value is None or value == "":
+        return True
+    return str(value).lower() == str(name).lower()
+def serialize_start_tag(
+    name: str,
+    attrs: dict[str, str | None] | None,
+    *,
+    quote_attr_values: bool = True,
+    minimize_boolean_attributes: bool = True,
+    quote_char: str | None = None,
+    escape_lt_in_attrs: bool = False,
+    use_trailing_solidus: bool = False,
+    is_void: bool = False,
+) -> str:
     attrs = attrs or {}
     parts: list[str] = ["<", name]
     if attrs:
         for key, value in attrs.items():
-            if value is None or value == "":
+            if _serializer_minimize_attr_value(key, value, minimize_boolean_attributes):
                 parts.extend([" ", key])
+                continue
+            if value is None:
+                parts.extend([" ", key, '=""'])
+                continue
+            value_str = str(value)
+            if value_str == "":
+                parts.extend([" ", key, '=""'])
+                continue
+            if not quote_attr_values and _can_unquote_attr_value(value_str):
+                escaped = value_str.replace("&", "&amp;")
+                if escape_lt_in_attrs:
+                    escaped = escaped.replace("<", "&lt;")
+                parts.extend([" ", key, "=", escaped])
             else:
-                if _can_unquote_attr_value(value):
-                    escaped = str(value).replace("&", "&amp;")
-                    parts.extend([" ", key, "=", escaped])
-                else:
-                    quote = _choose_attr_quote(value)
-                    escaped = _escape_attr_value(value, quote)
-                    parts.extend([" ", key, "=", quote, escaped, quote])
-    parts.append(">")
+                quote = _choose_attr_quote(value_str, quote_char)
+                escaped = _escape_attr_value(value_str, quote, escape_lt_in_attrs=escape_lt_in_attrs)
+                parts.extend([" ", key, "=", quote, escaped, quote])
+    if use_trailing_solidus and is_void:
+        parts.append(" />")
+    else:
+        parts.append(">")
     return "".join(parts)
@@ -75,27 +112,171 @@ def serialize_end_tag(name: str) -> str:
     return f"</{name}>"
-def to_html(node: Any, indent: int = 0, indent_size: int = 2, *, pretty: bool = True) -> str:
+def to_html(
+    node: Any,
+    indent: int = 0,
+    indent_size: int = 2,
+    *,
+    pretty: bool = True,
+    safe: bool = True,
+    policy: SanitizationPolicy | None = None,
+) -> str:
     """Convert node to HTML string."""
+    if safe:
+        if policy is None and node.name == "#document":
+            node = sanitize(node, policy=DEFAULT_DOCUMENT_POLICY)
+        else:
+            node = sanitize(node, policy=policy or DEFAULT_POLICY)
     if node.name == "#document":
         # Document root - just render children
         parts: list[str] = []
         for child in node.children or []:
-            parts.append(_node_to_html(child, indent, indent_size, pretty))
+            parts.append(_node_to_html(child, indent, indent_size, pretty, in_pre=False))
         return "\n".join(parts) if pretty else "".join(parts)
-    return _node_to_html(node, indent, indent_size, pretty)
+    return _node_to_html(node, indent, indent_size, pretty, in_pre=False)
+_PREFORMATTED_ELEMENTS: set[str] = {"pre", "textarea", "code"}
+# Elements whose text content must not be normalized (e.g. scripts/styles).
+_RAWTEXT_ELEMENTS: set[str] = {"script", "style"}
+def _collapse_html_whitespace(text: str) -> str:
+    """Collapse HTML whitespace runs to a single space and trim edges.
+    This matches how HTML rendering treats most whitespace in text nodes, and is
+    used only for pretty-printing in non-preformatted contexts.
+    """
+    if not text:
+        return ""
+    parts: list[str] = []
+    in_whitespace = False
+    for ch in text:
+        if ch in {" ", "\t", "\n", "\f", "\r"}:
+            if not in_whitespace:
+                parts.append(" ")
+                in_whitespace = True
+            continue
+        parts.append(ch)
+        in_whitespace = False
+    collapsed = "".join(parts)
+    return collapsed.strip(" ")
+def _normalize_formatting_whitespace(text: str) -> str:
+    """Normalize formatting whitespace within a text node.
+    Converts newlines/tabs/CR/FF to regular spaces and collapses runs that
+    include such formatting whitespace to a single space.
-def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool = True) -> str:
+    Pure space runs are preserved as-is (so existing double-spaces remain).
+    """
+    if not text:
+        return ""
+    if "\n" not in text and "\r" not in text and "\t" not in text and "\f" not in text:
+        return text
+    starts_with_formatting = text[0] in {"\n", "\r", "\t", "\f"}
+    ends_with_formatting = text[-1] in {"\n", "\r", "\t", "\f"}
+    out: list[str] = []
+    in_ws = False
+    saw_formatting_ws = False
+    for ch in text:
+        if ch == " ":
+            if in_ws:
+                # Only collapse if this whitespace run included formatting whitespace.
+                if saw_formatting_ws:
+                    continue
+                out.append(" ")
+                continue
+            in_ws = True
+            saw_formatting_ws = False
+            out.append(" ")
+            continue
+        if ch in {"\n", "\r", "\t", "\f"}:
+            if in_ws:
+                saw_formatting_ws = True
+                continue
+            in_ws = True
+            saw_formatting_ws = True
+            out.append(" ")
+            continue
+        in_ws = False
+        saw_formatting_ws = False
+        out.append(ch)
+    normalized = "".join(out)
+    if starts_with_formatting and normalized.startswith(" "):
+        normalized = normalized[1:]
+    if ends_with_formatting and normalized.endswith(" "):
+        normalized = normalized[:-1]
+    return normalized
+def _is_whitespace_text_node(node: Any) -> bool:
+    return node.name == "#text" and (node.data or "").strip() == ""
+def _should_pretty_indent_children(children: list[Any]) -> bool:
+    for child in children:
+        if child is None:
+            continue
+        name = child.name
+        if name == "#comment":
+            return False
+        if name == "#text" and (child.data or "").strip():
+            return False
+    element_children: list[Any] = [
+        child for child in children if child is not None and child.name not in {"#text", "#comment"}
+    ]
+    if not element_children:
+        return True
+    if len(element_children) == 1:
+        only_child = element_children[0]
+        if only_child.name in SPECIAL_ELEMENTS:
+            return True
+        if only_child.name == "a":
+            # If an anchor wraps block-ish content (valid HTML5), treat it as block-ish
+            # for pretty-printing so the parent can indent it on its own line.
+            for grandchild in only_child.children or []:
+                if grandchild is None:
+                    continue
+                if grandchild.name in SPECIAL_ELEMENTS:
+                    return True
+        return False
+    # Safe indentation rule: only insert inter-element whitespace when we won't
+    # be placing it between two adjacent inline/phrasing elements.
+    prev_is_special = element_children[0].name in SPECIAL_ELEMENTS
+    for child in element_children[1:]:
+        current_is_special = child.name in SPECIAL_ELEMENTS
+        if not prev_is_special and not current_is_special:
+            return False
+        prev_is_special = current_is_special
+    return True
+def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool = True, *, in_pre: bool) -> str:
     """Helper to convert a node to HTML."""
-    prefix = " " * (indent * indent_size) if pretty else ""
-    newline = "\n" if pretty else ""
+    prefix = " " * (indent * indent_size) if pretty and not in_pre else ""
     name: str = node.name
+    content_pre = in_pre or name in _PREFORMATTED_ELEMENTS
+    newline = "\n" if pretty and not content_pre else ""
     # Text node
     if name == "#text":
         text: str | None = node.data
-        if pretty:
+        if pretty and not in_pre:
             text = text.strip() if text else ""
             if text:
                 return f"{prefix}{_escape_text(text)}"
@@ -114,7 +295,7 @@ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool
     if name == "#document-fragment":
         parts: list[str] = []
         for child in node.children or []:
-            child_html = _node_to_html(child, indent, indent_size, pretty)
+            child_html = _node_to_html(child, indent, indent_size, pretty, in_pre=in_pre)
             if child_html:
                 parts.append(child_html)
         return newline.join(parts) if pretty else "".join(parts)
@@ -130,20 +311,143 @@ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool
         return f"{prefix}{open_tag}"
     # Elements with children
-    children: list[Any] = node.children or []
+    # Template special handling: HTML templates store contents in `template_content`.
+    if name == "template" and node.namespace in {None, "html"} and node.template_content is not None:
+        children: list[Any] = node.template_content.children or []
+    else:
+        children = node.children or []
     if not children:
         return f"{prefix}{open_tag}{serialize_end_tag(name)}"
     # Check if all children are text-only (inline rendering)
     all_text = all(c.name == "#text" for c in children)
-    if all_text and pretty:
-        return f"{prefix}{open_tag}{_escape_text(node.to_text(separator='', strip=False))}{serialize_end_tag(name)}"
+    if all_text and pretty and not content_pre:
+        # Serializer controls sanitization at the to_html() entry point; avoid
+        # implicit re-sanitization during rendering.
+        text_content = node.to_text(separator="", strip=False, safe=False)
+        if name not in _RAWTEXT_ELEMENTS:
+            text_content = _collapse_html_whitespace(text_content)
+        return f"{prefix}{open_tag}{_escape_text(text_content)}{serialize_end_tag(name)}"
+    if pretty and content_pre:
+        inner = "".join(
+            _node_to_html(child, indent + 1, indent_size, pretty, in_pre=True)
+            for child in children
+            if child is not None
+        )
+        return f"{prefix}{open_tag}{inner}{serialize_end_tag(name)}"
+    if pretty and not content_pre and not _should_pretty_indent_children(children):
+        # For block-ish elements that contain only element children and whitespace-only
+        # text nodes, we can still format each child on its own line (only when there
+        # is already whitespace separating element siblings).
+        if name in SPECIAL_ELEMENTS:
+            has_comment = False
+            has_element = False
+            has_whitespace_between_elements = False
+            first_element_index: int | None = None
+            last_element_index: int | None = None
+            previous_was_element = False
+            saw_whitespace_since_last_element = False
+            for i, child in enumerate(children):
+                if child is None:
+                    continue
+                if child.name == "#comment":
+                    has_comment = True
+                    break
+                if child.name == "#text":
+                    # Track whether there is already whitespace between element siblings.
+                    if previous_was_element and not (child.data or "").strip():
+                        saw_whitespace_since_last_element = True
+                    continue
+                has_element = True
+                if first_element_index is None:
+                    first_element_index = i
+                last_element_index = i
+                if previous_was_element and saw_whitespace_since_last_element:
+                    has_whitespace_between_elements = True
+                previous_was_element = True
+                saw_whitespace_since_last_element = False
+            can_indent_non_whitespace_text = True
+            if has_element and first_element_index is not None and last_element_index is not None:
+                for i, child in enumerate(children):
+                    if child is None or child.name != "#text":
+                        continue
+                    if not (child.data or "").strip():
+                        continue
+                    # Only allow non-whitespace text *after* the last element.
+                    # Leading text or text between elements could gain new spaces
+                    # due to indentation/newlines.
+                    if i < first_element_index or first_element_index < i < last_element_index:
+                        can_indent_non_whitespace_text = False
+                        break
+            if has_element and has_whitespace_between_elements and not has_comment and can_indent_non_whitespace_text:
+                inner_lines: list[str] = []
+                for child in children:
+                    if child is None:
+                        continue
+                    if child.name == "#text":
+                        text = _collapse_html_whitespace(child.data or "")
+                        if text:
+                            inner_lines.append(f"{' ' * ((indent + 1) * indent_size)}{_escape_text(text)}")
+                        continue
+                    child_html = _node_to_html(child, indent + 1, indent_size, pretty=True, in_pre=content_pre)
+                    if child_html:
+                        inner_lines.append(child_html)
+                if inner_lines:
+                    inner = "\n".join(inner_lines)
+                    return f"{prefix}{open_tag}\n{inner}\n{prefix}{serialize_end_tag(name)}"
+        inner_parts: list[str] = []
+        first_non_none_index: int | None = None
+        last_non_none_index: int | None = None
+        for i, child in enumerate(children):
+            if child is None:
+                continue
+            if first_non_none_index is None:
+                first_non_none_index = i
+            last_non_none_index = i
+        for i, child in enumerate(children):
+            if child is None:
+                continue
+            if child.name == "#text":
+                data = child.data or ""
+                if not data.strip():
+                    # Drop leading/trailing formatting whitespace in compact mode.
+                    if i == first_non_none_index or i == last_non_none_index:
+                        continue
+                    # Preserve intentional small spacing, but collapse large formatting gaps.
+                    if "\n" in data or "\r" in data or "\t" in data or len(data) > 2:
+                        inner_parts.append(" ")
+                        continue
+                if not content_pre and name not in _RAWTEXT_ELEMENTS:
+                    data = _normalize_formatting_whitespace(data)
+                child_html = _escape_text(data) if data else ""
+            else:
+                # Even when we can't safely insert whitespace *between* siblings, we can
+                # still pretty-print each element subtree to improve readability.
+                child_html = _node_to_html(child, 0, indent_size, pretty=True, in_pre=content_pre)
+            if child_html:
+                inner_parts.append(child_html)
+        return f"{prefix}{open_tag}{''.join(inner_parts)}{serialize_end_tag(name)}"
     # Render with child indentation
     parts = [f"{prefix}{open_tag}"]
     for child in children:
-        child_html = _node_to_html(child, indent + 1, indent_size, pretty)
+        if pretty and not content_pre and _is_whitespace_text_node(child):
+            continue
+        child_html = _node_to_html(child, indent + 1, indent_size, pretty, in_pre=content_pre)
         if child_html:
             parts.append(child_html)
     parts.append(f"{prefix}{serialize_end_tag(name)}")
@@ -180,7 +484,7 @@ def _node_to_test_format(node: Any, indent: int) -> str:
     attribute_lines = _attrs_to_test_format(node, indent)
     # Template special handling (only HTML namespace templates have template_content)
-    if node.name == "template" and node.namespace in {None, "html"} and node.template_content:
+    if node.name == "template" and node.namespace in {None, "html"} and node.template_content is not None:
         sections: list[str] = [line]
         if attribute_lines:
             sections.extend(attribute_lines)

justhtml 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

Potentially problematic release.

justhtml 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl