PyPI - justhtml - Versions diffs - 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl - Mend

justhtml 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of justhtml might be problematic. Click here for more details.

Files changed (21)

justhtml/__init__.py +44 -2
justhtml/__main__.py +45 -9
justhtml/constants.py +12 -0
justhtml/errors.py +8 -3
justhtml/linkify.py +438 -0
justhtml/node.py +54 -35
justhtml/parser.py +105 -38
justhtml/sanitize.py +511 -282
justhtml/selector.py +3 -1
justhtml/serialize.py +398 -72
justhtml/tokenizer.py +121 -21
justhtml/tokens.py +21 -3
justhtml/transforms.py +2568 -0
justhtml/treebuilder.py +247 -190
justhtml/treebuilder_modes.py +108 -102
{justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/METADATA +28 -7
justhtml-0.38.0.dist-info/RECORD +26 -0
{justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/licenses/LICENSE +1 -1
justhtml-0.24.0.dist-info/RECORD +0 -24
{justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/WHEEL +0 -0
{justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/entry_points.txt +0 -0

justhtml/node.py CHANGED Viewed

@@ -3,12 +3,10 @@ from __future__ import annotations
 from typing import TYPE_CHECKING, Any
 from urllib.parse import quote
-from .sanitize import sanitize
 from .selector import query
 from .serialize import to_html
 if TYPE_CHECKING:
-    from .sanitize import SanitizationPolicy
     from .tokens import Doctype
@@ -192,6 +190,7 @@ class SimpleDomNode:
         "_origin_col",
         "_origin_line",
         "_origin_pos",
+        "_source_html",
         "attrs",
         "children",
         "data",
@@ -209,6 +208,7 @@ class SimpleDomNode:
     _origin_pos: int | None
     _origin_line: int | None
     _origin_col: int | None
+    _source_html: str | None
     def __init__(
         self,
@@ -220,6 +220,7 @@ class SimpleDomNode:
         self.name = name
         self.parent = None
         self.data = data
+        self._source_html = None
         self._origin_pos = None
         self._origin_line = None
         self._origin_col = None
@@ -271,12 +272,9 @@ class SimpleDomNode:
         indent: int = 0,
         indent_size: int = 2,
         pretty: bool = True,
-        *,
-        safe: bool = True,
-        policy: SanitizationPolicy | None = None,
     ) -> str:
         """Convert node to HTML string."""
-        return to_html(self, indent, indent_size, pretty=pretty, safe=safe, policy=policy)
+        return to_html(self, indent, indent_size, pretty=pretty)
     def query(self, selector: str) -> list[Any]:
         """
@@ -312,39 +310,27 @@ class SimpleDomNode:
         self,
         separator: str = " ",
         strip: bool = True,
-        *,
-        safe: bool = True,
-        policy: SanitizationPolicy | None = None,
     ) -> str:
         """Return the concatenated text of this node's descendants.
         - `separator` controls how text nodes are joined (default: a single space).
         - `strip=True` strips each text node and drops empty segments.
-        - `safe=True` sanitizes untrusted HTML before extracting text.
-        - `policy` overrides the default sanitization policy.
         Template element contents are included via `template_content`.
         """
-        node: Any = sanitize(self, policy=policy) if safe else self
+        node: Any = self
         parts: list[str] = []
         _to_text_collect(node, parts, strip=strip)
         if not parts:
             return ""
         return separator.join(parts)
-    def to_markdown(self, *, safe: bool = True, policy: SanitizationPolicy | None = None) -> str:
+    def to_markdown(self) -> str:
         """Return a GitHub Flavored Markdown representation of this subtree.
         This is a pragmatic HTML->Markdown converter intended for readability.
         - Tables and images are preserved as raw HTML.
         - Unknown elements fall back to rendering their children.
         """
-        if safe:
-            node = sanitize(self, policy=policy)
-            builder = _MarkdownBuilder()
-            _to_markdown_walk(node, builder, preserve_whitespace=False, list_depth=0)
-            return builder.finish()
         builder = _MarkdownBuilder()
         _to_markdown_walk(self, builder, preserve_whitespace=False, list_depth=0)
         return builder.finish()
@@ -405,22 +391,25 @@ class SimpleDomNode:
         """Return True if this node has children."""
         return bool(self.children)
-    def clone_node(self, deep: bool = False) -> SimpleDomNode:
+    def clone_node(self, deep: bool = False, override_attrs: dict[str, str | None] | None = None) -> SimpleDomNode:
         """
         Clone this node.
         Args:
             deep: If True, recursively clone children.
+            override_attrs: Optional dictionary to use as attributes for the clone.
         Returns:
             A new node that is a copy of this node.
         """
+        attrs = override_attrs if override_attrs is not None else (self.attrs.copy() if self.attrs else None)
         clone = SimpleDomNode(
             self.name,
-            self.attrs.copy() if self.attrs else None,
+            attrs,
             self.data,
             self.namespace,
         )
+        clone._source_html = self._source_html
         clone._origin_pos = self._origin_pos
         clone._origin_line = self._origin_line
         clone._origin_col = self._origin_col
@@ -431,11 +420,25 @@ class SimpleDomNode:
 class ElementNode(SimpleDomNode):
-    __slots__ = ("template_content",)
+    __slots__ = (
+        "_end_tag_end",
+        "_end_tag_present",
+        "_end_tag_start",
+        "_self_closing",
+        "_start_tag_end",
+        "_start_tag_start",
+        "template_content",
+    )
     template_content: SimpleDomNode | None
     children: list[Any]
     attrs: dict[str, str | None]
+    _start_tag_start: int | None
+    _start_tag_end: int | None
+    _end_tag_start: int | None
+    _end_tag_end: int | None
+    _end_tag_present: bool
+    _self_closing: bool
     def __init__(self, name: str, attrs: dict[str, str | None] | None, namespace: str | None) -> None:
         self.name = name
@@ -445,15 +448,30 @@ class ElementNode(SimpleDomNode):
         self.children = []
         self.attrs = attrs if attrs is not None else {}
         self.template_content = None
+        self._source_html = None
         self._origin_pos = None
         self._origin_line = None
         self._origin_col = None
-    def clone_node(self, deep: bool = False) -> ElementNode:
-        clone = ElementNode(self.name, self.attrs.copy() if self.attrs else {}, self.namespace)
+        self._start_tag_start = None
+        self._start_tag_end = None
+        self._end_tag_start = None
+        self._end_tag_end = None
+        self._end_tag_present = False
+        self._self_closing = False
+    def clone_node(self, deep: bool = False, override_attrs: dict[str, str | None] | None = None) -> ElementNode:
+        attrs = override_attrs if override_attrs is not None else (self.attrs.copy() if self.attrs else {})
+        clone = ElementNode(self.name, attrs, self.namespace)
+        clone._source_html = self._source_html
         clone._origin_pos = self._origin_pos
         clone._origin_line = self._origin_line
         clone._origin_col = self._origin_col
+        clone._start_tag_start = self._start_tag_start
+        clone._start_tag_end = self._start_tag_end
+        clone._end_tag_start = self._end_tag_start
+        clone._end_tag_end = self._end_tag_end
+        clone._end_tag_present = self._end_tag_present
+        clone._self_closing = self._self_closing
         if deep:
             for child in self.children:
                 clone.append_child(child.clone_node(deep=True))
@@ -476,16 +494,24 @@ class TemplateNode(ElementNode):
         else:
             self.template_content = None
-    def clone_node(self, deep: bool = False) -> TemplateNode:
+    def clone_node(self, deep: bool = False, override_attrs: dict[str, str | None] | None = None) -> TemplateNode:
+        attrs = override_attrs if override_attrs is not None else (self.attrs.copy() if self.attrs else {})
         clone = TemplateNode(
             self.name,
-            self.attrs.copy() if self.attrs else {},
+            attrs,
             None,
             self.namespace,
         )
+        clone._source_html = self._source_html
         clone._origin_pos = self._origin_pos
         clone._origin_line = self._origin_line
         clone._origin_col = self._origin_col
+        clone._start_tag_start = self._start_tag_start
+        clone._start_tag_end = self._start_tag_end
+        clone._end_tag_start = self._end_tag_start
+        clone._end_tag_end = self._end_tag_end
+        clone._end_tag_present = self._end_tag_present
+        clone._self_closing = self._self_closing
         if deep:
             if self.template_content:
                 clone.template_content = self.template_content.clone_node(deep=True)
@@ -542,15 +568,8 @@ class TextNode:
         self,
         separator: str = " ",
         strip: bool = True,
-        *,
-        safe: bool = True,
-        policy: SanitizationPolicy | None = None,
     ) -> str:
-        # Parameters are accepted for API consistency; they don't affect leaf nodes.
         _ = separator
-        _ = safe
-        _ = policy
         if self.data is None:
             return ""
         if strip:

justhtml/parser.py CHANGED Viewed

@@ -7,12 +7,14 @@ from typing import TYPE_CHECKING, Any
 from .context import FragmentContext
 from .encoding import decode_html
 from .tokenizer import Tokenizer, TokenizerOpts
+from .transforms import apply_compiled_transforms, compile_transforms
 from .treebuilder import TreeBuilder
 if TYPE_CHECKING:
     from .node import SimpleDomNode
     from .sanitize import SanitizationPolicy
     from .tokens import ParseError
+    from .transforms import TransformSpec
 class StrictModeError(SyntaxError):
@@ -53,6 +55,8 @@ class JustHTML:
         self,
         html: str | bytes | bytearray | memoryview | None,
         *,
+        safe: bool = True,
+        policy: SanitizationPolicy | None = None,
         collect_errors: bool = False,
         track_node_locations: bool = False,
         debug: bool = False,
@@ -63,6 +67,7 @@ class JustHTML:
         strict: bool = False,
         tokenizer_opts: TokenizerOpts | None = None,
         tree_builder: TreeBuilder | None = None,
+        transforms: list[TransformSpec] | None = None,
     ) -> None:
         if fragment_context is not None:
             fragment = True
@@ -70,6 +75,29 @@ class JustHTML:
         if fragment and fragment_context is None:
             fragment_context = FragmentContext("div")
+        track_tag_spans = False
+        has_sanitize_transform = False
+        needs_escape_incomplete_tags = False
+        if transforms:
+            from .sanitize import DEFAULT_POLICY  # noqa: PLC0415
+            from .transforms import Sanitize  # noqa: PLC0415
+            for t in transforms:
+                if isinstance(t, Sanitize):
+                    has_sanitize_transform = True
+                    effective = t.policy or DEFAULT_POLICY
+                    if effective.disallowed_tag_handling == "escape":
+                        track_tag_spans = True
+                        needs_escape_incomplete_tags = True
+                        break
+        # If we will auto-sanitize (safe=True and no Sanitize in transforms),
+        # escape-mode tag reconstruction may require tracking tag spans.
+        if safe and not has_sanitize_transform and policy is not None:
+            if policy.disallowed_tag_handling == "escape":
+                track_tag_spans = True
+                needs_escape_incomplete_tags = True
         self.debug = bool(debug)
         self.fragment_context = fragment_context
         self.encoding = None
@@ -91,8 +119,11 @@ class JustHTML:
             fragment_context=fragment_context,
             iframe_srcdoc=iframe_srcdoc,
             collect_errors=should_collect,
+            track_tag_spans=track_tag_spans,
         )
         opts = tokenizer_opts or TokenizerOpts()
+        if needs_escape_incomplete_tags:
+            opts.emit_bogus_markup_as_text = True
         # For RAWTEXT fragment contexts, set initial tokenizer state and rawtext tag
         if fragment_context and not fragment_context.namespace:
@@ -109,6 +140,7 @@ class JustHTML:
             opts,
             collect_errors=should_collect,
             track_node_locations=bool(track_node_locations),
+            track_tag_positions=bool(track_node_locations) or track_tag_spans,
         )
         # Link tokenizer to tree_builder for position info
         self.tree_builder.tokenizer = self.tokenizer
@@ -116,11 +148,73 @@ class JustHTML:
         self.tokenizer.run(html_str)
         self.root = self.tree_builder.finish()
-        # Merge errors from both tokenizer and tree builder.
-        # Public API: users expect errors to be ordered by input position.
-        merged_errors = self.tokenizer.errors + self.tree_builder.errors
-        indexed_errors = enumerate(merged_errors)
-        self.errors = [
+        transform_errors: list[ParseError] = []
+        # Apply transforms after parse.
+        # Safety model: when safe=True, the in-memory tree is sanitized exactly once
+        # during construction by ensuring a Sanitize transform runs.
+        if transforms or safe:
+            from .sanitize import DEFAULT_DOCUMENT_POLICY, DEFAULT_POLICY  # noqa: PLC0415
+            from .transforms import Sanitize  # noqa: PLC0415
+            final_transforms: list[TransformSpec] = list(transforms or [])
+            # Normalize explicit Sanitize() transforms to use the same default policy
+            # choice as the old safe-output sanitizer (document vs fragment).
+            if final_transforms:
+                default_mode_policy = DEFAULT_DOCUMENT_POLICY if self.root.name == "#document" else DEFAULT_POLICY
+                for i, t in enumerate(final_transforms):
+                    if isinstance(t, Sanitize) and t.policy is None:
+                        final_transforms[i] = Sanitize(
+                            policy=default_mode_policy, enabled=t.enabled, callback=t.callback, report=t.report
+                        )
+            # Auto-append a final Sanitize step only if the user didn't include
+            # Sanitize anywhere in their transform list.
+            if safe and not any(isinstance(t, Sanitize) for t in final_transforms):
+                effective_policy = (
+                    policy
+                    if policy is not None
+                    else (DEFAULT_DOCUMENT_POLICY if self.root.name == "#document" else DEFAULT_POLICY)
+                )
+                # Avoid stale collected errors on reused policy objects.
+                if effective_policy.unsafe_handling == "collect":
+                    effective_policy.reset_collected_security_errors()
+                final_transforms.append(Sanitize(policy=effective_policy))
+            if final_transforms:
+                compiled_transforms = compile_transforms(tuple(final_transforms))
+                apply_compiled_transforms(self.root, compiled_transforms, errors=transform_errors)
+                # Merge collected security errors into the document error list.
+                # This mirrors the old behavior where safe output could feed
+                # security findings into doc.errors.
+                for t in final_transforms:
+                    if isinstance(t, Sanitize):
+                        t_policy = t.policy
+                        if t_policy is not None and t_policy.unsafe_handling == "collect":
+                            transform_errors.extend(t_policy.collected_security_errors())
+        if should_collect:
+            # Merge errors from both tokenizer and tree builder.
+            # Public API: users expect errors to be ordered by input position.
+            merged_errors = self.tokenizer.errors + self.tree_builder.errors + transform_errors
+            self.errors = self._sorted_errors(merged_errors)
+        else:
+            self.errors = transform_errors
+        # In strict mode, raise on first error
+        if strict and self.errors:
+            raise StrictModeError(self.errors[0])
+    def query(self, selector: str) -> list[Any]:
+        """Query the document using a CSS selector. Delegates to root.query()."""
+        return self.root.query(selector)
+    @staticmethod
+    def _sorted_errors(errors: list[ParseError]) -> list[ParseError]:
+        indexed_errors = enumerate(errors)
+        return [
             e
             for _, e in sorted(
                 indexed_errors,
@@ -132,56 +226,29 @@ class JustHTML:
             )
         ]
-        # In strict mode, raise on first error
-        if strict and self.errors:
-            raise StrictModeError(self.errors[0])
-    def query(self, selector: str) -> list[Any]:
-        """Query the document using a CSS selector. Delegates to root.query()."""
-        return self.root.query(selector)
     def to_html(
         self,
         pretty: bool = True,
         indent_size: int = 2,
-        *,
-        safe: bool = True,
-        policy: SanitizationPolicy | None = None,
     ) -> str:
         """Serialize the document to HTML.
-        - `safe=True` sanitizes untrusted content before serialization.
-        - `policy` overrides the default sanitization policy.
+        Sanitization (when enabled) happens during construction.
         """
         return self.root.to_html(
             indent=0,
             indent_size=indent_size,
             pretty=pretty,
-            safe=safe,
-            policy=policy,
         )
     def to_text(
         self,
         separator: str = " ",
         strip: bool = True,
-        *,
-        safe: bool = True,
-        policy: SanitizationPolicy | None = None,
     ) -> str:
-        """Return the document's concatenated text.
-        - `safe=True` sanitizes untrusted content before text extraction.
-        - `policy` overrides the default sanitization policy.
+        """Return the document's concatenated text."""
+        return self.root.to_text(separator=separator, strip=strip)
-        Delegates to `root.to_text(...)`.
-        """
-        return self.root.to_text(separator=separator, strip=strip, safe=safe, policy=policy)
-    def to_markdown(self, *, safe: bool = True, policy: SanitizationPolicy | None = None) -> str:
-        """Return a GitHub Flavored Markdown representation.
-        - `safe=True` sanitizes untrusted content before conversion.
-        - `policy` overrides the default sanitization policy.
-        """
-        return self.root.to_markdown(safe=safe, policy=policy)
+    def to_markdown(self) -> str:
+        """Return a GitHub Flavored Markdown representation."""
+        return self.root.to_markdown()

justhtml 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl

Potentially problematic release.

justhtml 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl