PyPI - justhtml - Versions diffs - 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl - Mend

justhtml 0.24.0 (py3-none-any.whl) → 0.38.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of justhtml might be problematic. Click here for more details.

Files changed (21)

justhtml/__init__.py +44 -2
justhtml/__main__.py +45 -9
justhtml/constants.py +12 -0
justhtml/errors.py +8 -3
justhtml/linkify.py +438 -0
justhtml/node.py +54 -35
justhtml/parser.py +105 -38
justhtml/sanitize.py +511 -282
justhtml/selector.py +3 -1
justhtml/serialize.py +398 -72
justhtml/tokenizer.py +121 -21
justhtml/tokens.py +21 -3
justhtml/transforms.py +2568 -0
justhtml/treebuilder.py +247 -190
justhtml/treebuilder_modes.py +108 -102
{justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/METADATA +28 -7
justhtml-0.38.0.dist-info/RECORD +26 -0
{justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/licenses/LICENSE +1 -1
justhtml-0.24.0.dist-info/RECORD +0 -24
{justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/WHEEL +0 -0
{justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/entry_points.txt +0 -0

justhtml/sanitize.py CHANGED

@@ -10,12 +10,38 @@ from __future__ import annotations
 from collections.abc import Callable, Collection, Mapping
 from dataclasses import dataclass, field
-from typing import Any
+from typing import Any, Literal, cast
 from urllib.parse import quote, urlsplit
+from .tokens import ParseError
 UrlFilter = Callable[[str, str, str], str | None]
+class UnsafeHtmlError(ValueError):
+    """Raised when unsafe HTML is encountered and unsafe_handling='raise'."""
+UnsafeHandling = Literal["strip", "raise", "collect"]
+DisallowedTagHandling = Literal["unwrap", "escape", "drop"]
+UrlHandling = Literal["allow", "strip", "proxy"]
+@dataclass(frozen=True, slots=True)
+class UrlProxy:
+    url: str
+    param: str = "url"
+    def __post_init__(self) -> None:
+        proxy_url = str(self.url)
+        if not proxy_url:
+            raise ValueError("UrlProxy.url must be a non-empty string")
+        object.__setattr__(self, "url", proxy_url)
+        object.__setattr__(self, "param", str(self.param))
 @dataclass(frozen=True, slots=True)
 class UrlRule:
     """Rule for a single URL-valued attribute (e.g. a[href], img[src]).
@@ -27,9 +53,6 @@ class UrlRule:
         want to block remote loads by default.
     """
-    # Allow relative URLs (including /path, ./path, ../path, ?query).
-    allow_relative: bool = True
     # Allow same-document fragments (#foo). Typically safe.
     allow_fragment: bool = True
@@ -46,13 +69,17 @@ class UrlRule:
     # allowlist.
     allowed_hosts: Collection[str] | None = None
-    # Optional proxy rewrite for allowed absolute/protocol-relative URLs.
-    # Example: proxy_url="/proxy" -> https://google.com becomes
-    # /proxy?url=https%3A%2F%2Fgoogle.com
-    proxy_url: str | None = None
+    # Optional per-rule handling override.
+    # If None, the URL is kept ("allow") after it passes validation.
+    handling: UrlHandling | None = None
+    # Optional per-rule override of UrlPolicy.default_allow_relative.
+    # If None, UrlPolicy.default_allow_relative is used.
+    allow_relative: bool | None = None
-    # Query parameter name used when proxy_url is set.
-    proxy_param: str = "url"
+    # Optional proxy override for absolute/protocol-relative URLs.
+    # Used when the effective URL handling is "proxy".
+    proxy: UrlProxy | None = None
     def __post_init__(self) -> None:
         # Accept lists/tuples from user code, normalize for internal use.
@@ -61,15 +88,158 @@ class UrlRule:
         if self.allowed_hosts is not None and not isinstance(self.allowed_hosts, set):
             object.__setattr__(self, "allowed_hosts", set(self.allowed_hosts))
-        if self.proxy_url is not None:
-            proxy_url = str(self.proxy_url)
-            object.__setattr__(self, "proxy_url", proxy_url if proxy_url else None)
-        object.__setattr__(self, "proxy_param", str(self.proxy_param))
+        if self.proxy is not None and not isinstance(self.proxy, UrlProxy):
+            raise TypeError("UrlRule.proxy must be a UrlProxy or None")
+        if self.handling is not None:
+            mode = str(self.handling)
+            if mode not in {"allow", "strip", "proxy"}:
+                raise ValueError("Invalid UrlRule.handling. Expected one of: 'allow', 'strip', 'proxy'")
+            object.__setattr__(self, "handling", mode)
-def _proxy_url_value(*, proxy_url: str, proxy_param: str, value: str) -> str:
-    sep = "&" if "?" in proxy_url else "?"
-    return f"{proxy_url}{sep}{proxy_param}={quote(value, safe='')}"
+        if self.allow_relative is not None:
+            object.__setattr__(self, "allow_relative", bool(self.allow_relative))
+@dataclass(frozen=True, slots=True)
+class UrlPolicy:
+    # Default handling for URL-like attributes after they pass UrlRule checks.
+    # - "allow": keep the URL as-is
+    # - "strip": drop the attribute
+    # - "proxy": rewrite the URL through a proxy (UrlPolicy.proxy or UrlRule.proxy)
+    default_handling: UrlHandling = "strip"
+    # Default allowance for relative URLs (including /path, ./path, ../path, ?query)
+    # for URL-like attributes that have a matching UrlRule.
+    default_allow_relative: bool = True
+    # Rule configuration for URL-valued attributes.
+    allow_rules: Mapping[tuple[str, str], UrlRule] = field(default_factory=dict)
+    # Optional hook that can drop or rewrite URLs.
+    # url_filter(tag, attr, value) should return:
+    # - a replacement string to keep (possibly rewritten), or
+    # - None to drop the attribute.
+    url_filter: UrlFilter | None = None
+    # Default proxy config used when a rule is handled with "proxy" and
+    # the rule does not specify its own UrlRule.proxy override.
+    proxy: UrlProxy | None = None
+    def __post_init__(self) -> None:
+        mode = str(self.default_handling)
+        if mode not in {"allow", "strip", "proxy"}:
+            raise ValueError("Invalid default_handling. Expected one of: 'allow', 'strip', 'proxy'")
+        object.__setattr__(self, "default_handling", mode)
+        object.__setattr__(self, "default_allow_relative", bool(self.default_allow_relative))
+        if not isinstance(self.allow_rules, dict):
+            object.__setattr__(self, "allow_rules", dict(self.allow_rules))
+        if self.proxy is not None and not isinstance(self.proxy, UrlProxy):
+            raise TypeError("UrlPolicy.proxy must be a UrlProxy or None")
+        # Validate proxy configuration for any rules that are in proxy mode.
+        for rule in self.allow_rules.values():
+            if not isinstance(rule, UrlRule):
+                raise TypeError("UrlPolicy.allow_rules values must be UrlRule")
+            if rule.handling == "proxy" and self.proxy is None and rule.proxy is None:
+                raise ValueError("UrlRule.handling='proxy' requires a UrlPolicy.proxy or a per-rule UrlRule.proxy")
+def _proxy_url_value(*, proxy: UrlProxy, value: str) -> str:
+    sep = "&" if "?" in proxy.url else "?"
+    return f"{proxy.url}{sep}{proxy.param}={quote(value, safe='')}"
+@dataclass(slots=True)
+class UnsafeHandler:
+    """Centralized handler for security findings.
+    This is intentionally a small stateful object so multiple sanitization-
+    related passes/transforms can share the same unsafe-handling behavior and
+    (in collect mode) append into the same error list.
+    """
+    unsafe_handling: UnsafeHandling
+    # Optional external sink (e.g. a JustHTML document's .errors list).
+    # When set and unsafe_handling == "collect", security findings are written
+    # into that list so multiple components can share a single sink.
+    sink: list[ParseError] | None = None
+    _errors: list[ParseError] | None = None
+    def reset(self) -> None:
+        if self.unsafe_handling != "collect":
+            self._errors = None
+            return
+        if self.sink is None:
+            self._errors = []
+            return
+        # Remove previously collected security findings from the shared sink to
+        # avoid accumulating duplicates across multiple runs.
+        errors = self.sink
+        write_i = 0
+        for e in errors:
+            if e.category == "security":
+                continue
+            errors[write_i] = e
+            write_i += 1
+        del errors[write_i:]
+    def collected(self) -> list[ParseError]:
+        src = self.sink if self.sink is not None else self._errors
+        if not src:
+            return []
+        if self.sink is not None:
+            out = [e for e in src if e.category == "security"]
+        else:
+            out = list(src)
+        out.sort(
+            key=lambda e: (
+                e.line if e.line is not None else 1_000_000_000,
+                e.column if e.column is not None else 1_000_000_000,
+            )
+        )
+        return out
+    def handle(self, msg: str, *, node: Any | None = None) -> None:
+        mode = self.unsafe_handling
+        if mode == "strip":
+            return
+        if mode == "raise":
+            raise UnsafeHtmlError(msg)
+        if mode == "collect":
+            dest = self.sink
+            if dest is None:
+                if self._errors is None:
+                    self._errors = []
+                dest = self._errors
+            line: int | None = None
+            column: int | None = None
+            if node is not None:
+                # Best-effort: use node origin metadata when enabled.
+                # This stays allocation-light and avoids any input re-parsing.
+                line = node.origin_line
+                column = node.origin_col
+            dest.append(
+                ParseError(
+                    "unsafe-html",
+                    line=line,
+                    column=column,
+                    category="security",
+                    message=msg,
+                )
+            )
+            return
+        raise AssertionError(f"Unhandled unsafe_handling: {mode!r}")
 @dataclass(frozen=True, slots=True)
@@ -90,24 +260,13 @@ class SanitizationPolicy:
     allowed_tags: Collection[str]
     allowed_attributes: Mapping[str, Collection[str]]
-    # URL handling:
-    # - `url_rules` is the data-driven allowlist for URL-valued attributes.
-    # - `url_filter` is an optional hook that can drop or rewrite URLs.
-    #
-    # `url_filter(tag, attr, value)` should return:
-    # - a replacement string to keep (possibly rewritten), or
-    # - None to drop the attribute.
-    url_rules: Mapping[tuple[str, str], UrlRule]
-    url_filter: UrlFilter | None = None
+    # URL handling.
+    url_policy: UrlPolicy = field(default_factory=UrlPolicy)
     drop_comments: bool = True
     drop_doctype: bool = True
     drop_foreign_namespaces: bool = True
-    # If True, disallowed elements are removed but their children may be kept
-    # (except for tags in `drop_content_tags`).
-    strip_disallowed_tags: bool = True
     # Dangerous containers whose text payload should not be preserved.
     drop_content_tags: Collection[str] = field(default_factory=lambda: {"script", "style"})
@@ -121,6 +280,52 @@ class SanitizationPolicy:
     # (The sanitizer will merge tokens; it will not remove existing ones.)
     force_link_rel: Collection[str] = field(default_factory=set)
+    # Determines how unsafe input is handled.
+    #
+    # - "strip": Default. Remove/drop unsafe constructs and keep going.
+    # - "raise": Raise UnsafeHtmlError on the first unsafe construct.
+    #
+    # This is intentionally a string mode (instead of a boolean) so we can add
+    # more behaviors over time without changing the API shape.
+    unsafe_handling: UnsafeHandling = "strip"
+    # Determines how disallowed tags are handled.
+    #
+    # - "unwrap": Default. Drop the tag but keep/sanitize its children.
+    # - "escape": Emit original tag tokens as text, keep/sanitize children.
+    # - "drop": Drop the entire disallowed subtree.
+    disallowed_tag_handling: DisallowedTagHandling = "unwrap"
+    _unsafe_handler: UnsafeHandler = field(
+        default_factory=lambda: UnsafeHandler("strip"),
+        init=False,
+        repr=False,
+        compare=False,
+    )
+    # Internal caches to avoid per-node allocations in hot paths.
+    _allowed_attrs_global: frozenset[str] = field(
+        default_factory=frozenset,
+        init=False,
+        repr=False,
+        compare=False,
+    )
+    _allowed_attrs_by_tag: dict[str, frozenset[str]] = field(
+        default_factory=dict,
+        init=False,
+        repr=False,
+        compare=False,
+    )
+    # Cache for the compiled `Sanitize(policy=...)` transform pipeline.
+    # This lets safe serialization reuse the same compiled transforms.
+    _compiled_sanitize_transforms: list[Any] | None = field(
+        default=None,
+        init=False,
+        repr=False,
+        compare=False,
+    )
     def __post_init__(self) -> None:
         # Normalize to sets so the sanitizer can do fast membership checks.
         if not isinstance(self.allowed_tags, set):
@@ -143,6 +348,57 @@ class SanitizationPolicy:
         if not isinstance(self.force_link_rel, set):
             object.__setattr__(self, "force_link_rel", set(self.force_link_rel))
+        unsafe_handling = str(self.unsafe_handling)
+        if unsafe_handling not in {"strip", "raise", "collect"}:
+            raise ValueError("Invalid unsafe_handling. Expected one of: 'strip', 'raise', 'collect'")
+        object.__setattr__(self, "unsafe_handling", unsafe_handling)
+        disallowed_tag_handling = str(self.disallowed_tag_handling)
+        if disallowed_tag_handling not in {"unwrap", "escape", "drop"}:
+            raise ValueError("Invalid disallowed_tag_handling. Expected one of: 'unwrap', 'escape', 'drop'")
+        object.__setattr__(self, "disallowed_tag_handling", disallowed_tag_handling)
+        # Centralize unsafe-handling logic so multiple passes can share it.
+        handler = UnsafeHandler(cast("UnsafeHandling", unsafe_handling))
+        handler.reset()
+        object.__setattr__(self, "_unsafe_handler", handler)
+        # Normalize rel tokens once so downstream sanitization can stay allocation-light.
+        # (Downstream code expects lowercase tokens and ignores empty/whitespace.)
+        if self.force_link_rel:
+            normalized_force_link_rel = {t.strip().lower() for t in self.force_link_rel if str(t).strip()}
+            object.__setattr__(self, "force_link_rel", normalized_force_link_rel)
+        style_allowed = any("style" in attrs for attrs in self.allowed_attributes.values())
+        if style_allowed and not self.allowed_css_properties:
+            raise ValueError(
+                "SanitizationPolicy allows the 'style' attribute but allowed_css_properties is empty. "
+                "Either remove 'style' from allowed_attributes or set allowed_css_properties (for example CSS_PRESET_TEXT)."
+            )
+        allowed_attributes = self.allowed_attributes
+        allowed_global = frozenset(allowed_attributes.get("*", ()))
+        by_tag: dict[str, frozenset[str]] = {}
+        for tag, attrs in allowed_attributes.items():
+            if tag == "*":
+                continue
+            by_tag[tag] = frozenset(allowed_global.union(attrs))
+        object.__setattr__(self, "_allowed_attrs_global", allowed_global)
+        object.__setattr__(self, "_allowed_attrs_by_tag", by_tag)
+    def reset_collected_security_errors(self) -> None:
+        self._unsafe_handler.reset()
+    def collected_security_errors(self) -> list[ParseError]:
+        return self._unsafe_handler.collected()
+    def handle_unsafe(self, msg: str, *, node: Any | None = None) -> None:
+        self._unsafe_handler.handle(msg, node=node)
+_URL_NORMALIZE_STRIP_TABLE = {i: None for i in range(0x21)}
+_URL_NORMALIZE_STRIP_TABLE[0x7F] = None
 DEFAULT_POLICY: SanitizationPolicy = SanitizationPolicy(
     allowed_tags=[
@@ -199,32 +455,53 @@ DEFAULT_POLICY: SanitizationPolicy = SanitizationPolicy(
         "th": ["colspan", "rowspan"],
         "td": ["colspan", "rowspan"],
     },
-    # Default URL stance:
-    # - Links may point to http/https/mailto/tel and relative URLs.
-    # - Images may point to relative URLs only.
-    url_rules={
-        ("a", "href"): UrlRule(
-            allowed_schemes=["http", "https", "mailto", "tel"],
-            resolve_protocol_relative="https",
-        ),
-        ("img", "src"): UrlRule(
-            allowed_schemes=["http", "https"],
-            resolve_protocol_relative="https",
-        ),
-    },
+    url_policy=UrlPolicy(
+        default_handling="allow",
+        allow_rules={
+            ("a", "href"): UrlRule(
+                allowed_schemes=["http", "https", "mailto", "tel"],
+                resolve_protocol_relative="https",
+            ),
+            ("img", "src"): UrlRule(
+                allowed_schemes=[],
+                resolve_protocol_relative=None,
+            ),
+        },
+    ),
     allowed_css_properties=set(),
 )
+# A conservative preset for allowing a small amount of inline styling.
+# This is intentionally focused on text-level styling and avoids layout/
+# positioning properties that are commonly abused for UI redress.
+CSS_PRESET_TEXT: frozenset[str] = frozenset(
+    {
+        "background-color",
+        "color",
+        "font-size",
+        "font-style",
+        "font-weight",
+        "letter-spacing",
+        "line-height",
+        "text-align",
+        "text-decoration",
+        "text-transform",
+        "white-space",
+        "word-break",
+        "word-spacing",
+        "word-wrap",
+    }
+)
 DEFAULT_DOCUMENT_POLICY: SanitizationPolicy = SanitizationPolicy(
     allowed_tags=sorted(set(DEFAULT_POLICY.allowed_tags) | {"html", "head", "body", "title"}),
     allowed_attributes=DEFAULT_POLICY.allowed_attributes,
-    url_rules=DEFAULT_POLICY.url_rules,
-    url_filter=DEFAULT_POLICY.url_filter,
+    url_policy=DEFAULT_POLICY.url_policy,
     drop_comments=DEFAULT_POLICY.drop_comments,
     drop_doctype=DEFAULT_POLICY.drop_doctype,
     drop_foreign_namespaces=DEFAULT_POLICY.drop_foreign_namespaces,
-    strip_disallowed_tags=DEFAULT_POLICY.strip_disallowed_tags,
     drop_content_tags=DEFAULT_POLICY.drop_content_tags,
     allowed_css_properties=DEFAULT_POLICY.allowed_css_properties,
     force_link_rel=DEFAULT_POLICY.force_link_rel,
@@ -372,8 +649,8 @@ def _css_value_may_load_external_resource(value: str) -> bool:
     return False
-def _sanitize_inline_style(*, policy: SanitizationPolicy, value: str) -> str | None:
-    allowed = policy.allowed_css_properties
+def _sanitize_inline_style(*, allowed_css_properties: Collection[str], value: str) -> str | None:
+    allowed = allowed_css_properties
     if not allowed:
         return None
@@ -414,13 +691,7 @@ def _normalize_url_for_checking(value: str) -> str:
     # Strip whitespace/control chars commonly used for scheme obfuscation.
     # Note: do not strip backslashes; they are not whitespace/control chars,
     # and removing them can turn invalid schemes into valid ones.
-    out: list[str] = []
-    for ch in value:
-        o = ord(ch)
-        if o <= 0x20 or o == 0x7F:
-            continue
-        out.append(ch)
-    return "".join(out)
+    return value.translate(_URL_NORMALIZE_STRIP_TABLE)
 def _is_valid_scheme(scheme: str) -> bool:
@@ -467,15 +738,46 @@ def _has_invalid_scheme_like_prefix(value: str) -> bool:
 def _sanitize_url_value(
     *,
-    policy: SanitizationPolicy,
+    url_policy: UrlPolicy,
+    rule: UrlRule,
+    tag: str,
+    attr: str,
+    value: str,
+) -> str | None:
+    return _sanitize_url_value_inner(
+        url_policy=url_policy, rule=rule, tag=tag, attr=attr, value=value, apply_filter=True
+    )
+def _effective_proxy(*, url_policy: UrlPolicy, rule: UrlRule) -> UrlProxy | None:
+    return rule.proxy if rule.proxy is not None else url_policy.proxy
+def _effective_url_handling(*, url_policy: UrlPolicy, rule: UrlRule) -> UrlHandling:
+    # URL-like attributes are allowlisted via UrlPolicy.allow_rules. When they are
+    # allowlisted and the URL passes validation, the default action is to keep the URL.
+    return rule.handling if rule.handling is not None else "allow"
+def _effective_allow_relative(*, url_policy: UrlPolicy, rule: UrlRule) -> bool:
+    return rule.allow_relative if rule.allow_relative is not None else url_policy.default_allow_relative
+def _sanitize_url_value_inner(
+    *,
+    url_policy: UrlPolicy,
     rule: UrlRule,
     tag: str,
     attr: str,
     value: str,
+    apply_filter: bool,
 ) -> str | None:
     v = value
-    if policy.url_filter is not None:
-        rewritten = policy.url_filter(tag, attr, v)
+    mode = _effective_url_handling(url_policy=url_policy, rule=rule)
+    allow_relative = _effective_allow_relative(url_policy=url_policy, rule=rule)
+    if apply_filter and url_policy.url_filter is not None:
+        rewritten = url_policy.url_filter(tag, attr, v)
         if rewritten is None:
             return None
         v = rewritten
@@ -488,11 +790,18 @@ def _sanitize_url_value(
         return None
     if normalized.startswith("#"):
-        return stripped if rule.allow_fragment else None
-    # If proxying is enabled, do not treat scheme-obfuscation as a relative URL.
-    # Some user agents normalize backslashes and other characters during navigation.
-    if rule.proxy_url and _has_invalid_scheme_like_prefix(normalized):
+        if not rule.allow_fragment:
+            return None
+        if mode == "strip":
+            return None
+        if mode == "proxy":
+            proxy = _effective_proxy(url_policy=url_policy, rule=rule)
+            return None if proxy is None else _proxy_url_value(proxy=proxy, value=stripped)
+        return stripped
+    if mode == "proxy" and _has_invalid_scheme_like_prefix(normalized):
+        # If proxying is enabled, do not treat scheme-obfuscation as a relative URL.
+        # Some user agents normalize backslashes and other characters during navigation.
         return None
     if normalized.startswith("//"):
@@ -513,12 +822,12 @@ def _sanitize_url_value(
             if not host or host not in rule.allowed_hosts:
                 return None
-        # Return the resolved URL.
-        return (
-            _proxy_url_value(proxy_url=rule.proxy_url, proxy_param=rule.proxy_param, value=resolved_url)
-            if rule.proxy_url
-            else resolved_url
-        )
+        if mode == "strip":
+            return None
+        if mode == "proxy":
+            proxy = _effective_proxy(url_policy=url_policy, rule=rule)
+            return None if proxy is None else _proxy_url_value(proxy=proxy, value=resolved_url)
+        return resolved_url
     if _has_scheme(normalized):
         parsed = urlsplit(normalized)
@@ -529,235 +838,155 @@ def _sanitize_url_value(
             host = (parsed.hostname or "").lower()
             if not host or host not in rule.allowed_hosts:
                 return None
-        return (
-            _proxy_url_value(proxy_url=rule.proxy_url, proxy_param=rule.proxy_param, value=stripped)
-            if rule.proxy_url
-            else stripped
-        )
+        if mode == "strip":
+            return None
+        if mode == "proxy":
+            proxy = _effective_proxy(url_policy=url_policy, rule=rule)
+            return None if proxy is None else _proxy_url_value(proxy=proxy, value=stripped)
+        return stripped
-    return stripped if rule.allow_relative else None
+    if not allow_relative:
+        return None
+    if mode == "strip":
+        return None
+    if mode == "proxy":
+        proxy = _effective_proxy(url_policy=url_policy, rule=rule)
+        return None if proxy is None else _proxy_url_value(proxy=proxy, value=stripped)
+    return stripped
-def _sanitize_attrs(
+def _sanitize_srcset_value(
     *,
-    policy: SanitizationPolicy,
+    url_policy: UrlPolicy,
+    rule: UrlRule,
     tag: str,
-    attrs: dict[str, str | None] | None,
-) -> dict[str, str | None]:
-    if not attrs:
-        attrs = {}
-    allowed_global = set(policy.allowed_attributes.get("*", ()))
-    allowed_tag = set(policy.allowed_attributes.get(tag, ()))
-    allowed = allowed_global | allowed_tag
-    out: dict[str, str | None] = {}
-    for raw_name, raw_value in attrs.items():
-        if not raw_name:
-            continue
-        name = str(raw_name).strip().lower()
-        if not name:
-            continue
-        # Disallow namespace-ish attributes by default.
-        if ":" in name:
-            continue
-        # Always drop event handlers.
-        if name.startswith("on"):
-            continue
-        # Dangerous attribute contexts.
-        if name == "srcdoc":
-            continue
-        if name not in allowed and not (tag == "a" and name == "rel" and policy.force_link_rel):
-            continue
-        if raw_value is None:
-            out[name] = None
-            continue
-        value = str(raw_value)
-        rule = policy.url_rules.get((tag, name))
-        if rule is not None:
-            sanitized = _sanitize_url_value(policy=policy, rule=rule, tag=tag, attr=name, value=value)
-            if sanitized is None:
-                continue
-            out[name] = sanitized
-        elif name == "style":
-            sanitized_style = _sanitize_inline_style(policy=policy, value=value)
-            if sanitized_style is None:
-                continue
-            out[name] = sanitized_style
-        else:
-            out[name] = value
-    # Link hardening (merge tokens; do not remove existing ones).
-    forced_tokens = [t.strip().lower() for t in policy.force_link_rel if str(t).strip()]
-    if tag == "a" and forced_tokens:
-        existing_raw = out.get("rel")
-        existing: list[str] = []
-        if isinstance(existing_raw, str) and existing_raw:
-            for tok in existing_raw.split():
-                t = tok.strip().lower()
-                if t and t not in existing:
-                    existing.append(t)
-        for tok in sorted(forced_tokens):
-            if tok not in existing:
-                existing.append(tok)
-        out["rel"] = " ".join(existing)
-    return out
-def _append_sanitized_subtree(*, policy: SanitizationPolicy, original: Any, parent_out: Any) -> None:
-    stack: list[tuple[Any, Any]] = [(original, parent_out)]
-    while stack:
-        current, out_parent = stack.pop()
-        name: str = current.name
-        if name == "#text":
-            out_parent.append_child(current.clone_node(deep=False))
-            continue
-        if name == "#comment":
-            if policy.drop_comments:
-                continue
-            out_parent.append_child(current.clone_node(deep=False))
-            continue
-        if name == "!doctype":
-            if policy.drop_doctype:
-                continue
-            out_parent.append_child(current.clone_node(deep=False))
-            continue
-        # Document containers.
-        if name.startswith("#"):
-            clone = current.clone_node(deep=False)
-            clone.children.clear()
-            out_parent.append_child(clone)
-            children = current.children or []
-            stack.extend((child, clone) for child in reversed(children))
-            continue
-        # Element.
-        tag = str(name).lower()
-        if policy.drop_foreign_namespaces:
-            ns = current.namespace
-            if ns not in (None, "html"):
-                continue
-        if tag in policy.drop_content_tags:
-            continue
+    attr: str,
+    value: str,
+) -> str | None:
+    # Apply the URL filter once to the whole attribute value.
+    v = value
+    if url_policy.url_filter is not None:
+        rewritten = url_policy.url_filter(tag, attr, v)
+        if rewritten is None:
+            return None
+        v = rewritten
-        if tag not in policy.allowed_tags:
-            if policy.strip_disallowed_tags:
-                children = current.children or []
-                stack.extend((child, out_parent) for child in reversed(children))
+    stripped = str(v).strip()
+    if not stripped:
+        return None
-                if tag == "template" and current.namespace in (None, "html") and current.template_content:
-                    tc_children = current.template_content.children or []
-                    stack.extend((child, out_parent) for child in reversed(tc_children))
+    out_candidates: list[str] = []
+    for raw_candidate in stripped.split(","):
+        c = raw_candidate.strip()
+        if not c:
             continue
-        clone = current.clone_node(deep=False)
-        # Ensure children list is empty before we append sanitized descendants.
-        clone.children.clear()
-        # Filter attributes.
-        clone.attrs = _sanitize_attrs(policy=policy, tag=tag, attrs=current.attrs)
-        out_parent.append_child(clone)
-        # Template content is a separate subtree.
-        if tag == "template" and current.namespace in (None, "html"):
-            if current.template_content and clone.template_content:
-                clone.template_content.children.clear()
-                tc_children = current.template_content.children or []
-                stack.extend((child, clone.template_content) for child in reversed(tc_children))
+        parts = c.split(None, 1)
+        url_token = parts[0]
+        desc = parts[1].strip() if len(parts) == 2 else ""
+        sanitized_url = _sanitize_url_value_inner(
+            url_policy=url_policy,
+            rule=rule,
+            tag=tag,
+            attr=attr,
+            value=url_token,
+            apply_filter=False,
+        )
+        if sanitized_url is None:
+            return None
-        children = current.children or []
-        stack.extend((child, clone) for child in reversed(children))
+        out_candidates.append(f"{sanitized_url} {desc}".strip())
+    return None if not out_candidates else ", ".join(out_candidates)
+_URL_LIKE_ATTRS: frozenset[str] = frozenset(
+    {
+        # Common URL-valued attributes.
+        "href",
+        "src",
+        "srcset",
+        "poster",
+        "action",
+        "formaction",
+        "data",
+        "cite",
+        "background",
+        # Can trigger requests/pings.
+        "ping",
+    }
+)
-def sanitize(node: Any, *, policy: SanitizationPolicy | None = None) -> Any:
+def _sanitize(node: Any, *, policy: SanitizationPolicy | None = None) -> Any:
     """Return a sanitized clone of `node`.
-    If `policy` is not provided, JustHTML uses a conservative default policy.
-    For full documents (`#document` roots) it preserves `<html>`, `<head>`, and
-    `<body>` wrappers; for fragments it prefers snippet-shaped output.
+    This returns a sanitized clone without mutating the original tree.
+    For performance, it builds the sanitized clone in a single pass.
     """
     if policy is None:
         policy = DEFAULT_DOCUMENT_POLICY if node.name == "#document" else DEFAULT_POLICY
-    # Root handling.
-    root_name: str = node.name
-    if root_name == "#text":
-        return node.clone_node(deep=False)
-    if root_name == "#comment":
-        out_root = node.clone_node(deep=False)
-        if policy.drop_comments:
-            out_root.name = "#document-fragment"
-        return out_root
-    if root_name == "!doctype":
-        out_root = node.clone_node(deep=False)
-        if policy.drop_doctype:
-            out_root.name = "#document-fragment"
-        return out_root
-    # Containers.
-    if root_name.startswith("#"):
-        out_root = node.clone_node(deep=False)
-        out_root.children.clear()
-        for child in node.children or []:
-            _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root)
-        return out_root
-    # Element root: keep element if allowed, otherwise unwrap into a fragment.
-    tag = str(root_name).lower()
-    if policy.drop_foreign_namespaces and node.namespace not in (None, "html"):
-        out_root = node.clone_node(deep=False)
-        out_root.name = "#document-fragment"
-        out_root.children.clear()
-        out_root.attrs.clear()
-        return out_root
-    if tag in policy.drop_content_tags or (tag not in policy.allowed_tags and not policy.strip_disallowed_tags):
-        out_root = node.clone_node(deep=False)
-        out_root.name = "#document-fragment"
-        out_root.children.clear()
-        out_root.attrs.clear()
-        return out_root
-    if tag not in policy.allowed_tags and policy.strip_disallowed_tags:
-        out_root = node.clone_node(deep=False)
-        out_root.name = "#document-fragment"
-        out_root.children.clear()
-        out_root.attrs.clear()
-        for child in node.children or []:
-            _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root)
-        if tag == "template" and node.namespace in (None, "html") and node.template_content:
-            for child in node.template_content.children or []:
-                _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root)
-        return out_root
-    out_root = node.clone_node(deep=False)
-    out_root.children.clear()
-    out_root.attrs = _sanitize_attrs(policy=policy, tag=tag, attrs=node.attrs)
-    for child in node.children or []:
-        _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root)
-    if tag == "template" and node.namespace in (None, "html"):
-        if node.template_content and out_root.template_content:
-            out_root.template_content.children.clear()
-            for child in node.template_content.children or []:
-                _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root.template_content)
-    return out_root
+    # Escape-mode tag reconstruction may need access to the original source HTML.
+    # Historically we allow a child element to inherit _source_html from an
+    # ancestor container; keep that behavior even though we sanitize a clone.
+    if policy.disallowed_tag_handling == "escape":
+        root_source_html = getattr(node, "_source_html", None)
+        if root_source_html:
+            from .node import TemplateNode  # noqa: PLC0415
+            stack: list[Any] = [node]
+            while stack:
+                current = stack.pop()
+                current_source_html = getattr(current, "_source_html", None) or root_source_html
+                children = getattr(current, "children", None) or []
+                for child in children:
+                    # TextNode does not have _source_html.
+                    if getattr(child, "name", "") == "#text":
+                        continue
+                    if getattr(child, "_source_html", None) is None:
+                        child._source_html = current_source_html
+                    stack.append(child)
+                if type(current) is TemplateNode and current.template_content is not None:
+                    tc = current.template_content
+                    if getattr(tc, "_source_html", None) is None:
+                        tc._source_html = current_source_html
+                    stack.append(tc)
+    # We intentionally implement safe-output sanitization by applying the
+    # `Sanitize(policy=...)` transform pipeline to a clone of the node.
+    # This keeps a single canonical sanitization algorithm.
+    from .transforms import Sanitize, apply_compiled_transforms, compile_transforms  # noqa: PLC0415
+    compiled = policy._compiled_sanitize_transforms
+    if compiled is None:
+        compiled = compile_transforms((Sanitize(policy=policy),))
+        object.__setattr__(policy, "_compiled_sanitize_transforms", compiled)
+    # Container-root rule: transforms walk children of the provided root.
+    # For non-container roots, wrap the cloned node in a document fragment so
+    # the sanitizer can act on the root node itself.
+    if node.name in {"#document", "#document-fragment"}:
+        cloned = node.clone_node(deep=True)
+        apply_compiled_transforms(cloned, compiled, errors=None)
+        return cloned
+    from .node import SimpleDomNode  # noqa: PLC0415
+    wrapper = SimpleDomNode("#document-fragment")
+    wrapper.append_child(node.clone_node(deep=True))
+    apply_compiled_transforms(wrapper, compiled, errors=None)
+    children = wrapper.children or []
+    if len(children) == 1:
+        only = children[0]
+        only.parent = None
+        wrapper.children = []
+        return only
+    return wrapper

justhtml 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl

Potentially problematic release.

justhtml 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl