PyPI - justhtml - Versions diffs - 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl - Mend

justhtml 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of justhtml might be problematic. Click here for more details.

Files changed (20)

justhtml/__init__.py +6 -0
justhtml/__main__.py +49 -16
justhtml/entities.py +45 -7
justhtml/errors.py +9 -0
justhtml/node.py +358 -89
justhtml/parser.py +70 -14
justhtml/sanitize.py +763 -0
justhtml/selector.py +114 -18
justhtml/serialize.py +332 -28
justhtml/tokenizer.py +249 -179
justhtml/tokens.py +8 -3
justhtml/treebuilder.py +50 -14
justhtml/treebuilder_modes.py +100 -36
justhtml-0.24.0.dist-info/METADATA +192 -0
justhtml-0.24.0.dist-info/RECORD +24 -0
{justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/licenses/LICENSE +4 -1
justhtml-0.12.0.dist-info/METADATA +0 -164
justhtml-0.12.0.dist-info/RECORD +0 -23
{justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/WHEEL +0 -0
{justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/entry_points.txt +0 -0

justhtml/node.py CHANGED

@@ -1,11 +1,14 @@
 from __future__ import annotations
 from typing import TYPE_CHECKING, Any
+from urllib.parse import quote
+from .sanitize import sanitize
 from .selector import query
 from .serialize import to_html
 if TYPE_CHECKING:
+    from .sanitize import SanitizationPolicy
     from .tokens import Doctype
@@ -43,6 +46,30 @@ def _markdown_code_span(s: str | None) -> str:
     return f"{fence}{s}{fence}"
+def _markdown_link_destination(url: str) -> str:
+    """Return a Markdown-safe link destination.
+    We primarily care about avoiding Markdown formatting injection and broken
+    parsing for URLs that contain whitespace or parentheses.
+    CommonMark supports destinations wrapped in angle brackets:
+    `[text](<https://example.com/a(b)c>)`
+    """
+    u = (url or "").strip()
+    if not u:
+        return ""
+    # If the destination contains characters that can terminate or confuse
+    # the Markdown destination parser, wrap in <...> and percent-encode
+    # whitespace and angle brackets.
+    if any(ch in u for ch in (" ", "\t", "\n", "\r", "(", ")", "<", ">")):
+        u = quote(u, safe=":/?#[]@!$&'*+,;=%-._~()")
+        return f"<{u}>"
+    return u
 class _MarkdownBuilder:
     __slots__ = ("_buf", "_newline_count", "_pending_space")
@@ -133,29 +160,45 @@ NodeType = "SimpleDomNode | ElementNode | TemplateNode | TextNode"
 def _to_text_collect(node: Any, parts: list[str], strip: bool) -> None:
-    name: str = node.name
-    if name == "#text":
-        data: str | None = node.data
-        if not data:
-            return
-        if strip:
-            data = data.strip()
+    # Iterative traversal avoids recursion overhead on large documents.
+    stack: list[Any] = [node]
+    while stack:
+        current = stack.pop()
+        name: str = current.name
+        if name == "#text":
+            data: str | None = current.data
             if not data:
-                return
-        parts.append(data)
-        return
+                continue
+            if strip:
+                data = data.strip()
+                if not data:
+                    continue
+            parts.append(data)
+            continue
-    if node.children:
-        for child in node.children:
-            _to_text_collect(child, parts, strip=strip)
+        # Preserve the same traversal order as the recursive implementation:
+        # children first, then template content.
+        if type(current) is TemplateNode and current.template_content:
+            stack.append(current.template_content)
-    if isinstance(node, ElementNode) and node.template_content:
-        _to_text_collect(node.template_content, parts, strip=strip)
+        children = current.children
+        if children:
+            stack.extend(reversed(children))
 class SimpleDomNode:
-    __slots__ = ("attrs", "children", "data", "name", "namespace", "parent")
+    __slots__ = (
+        "_origin_col",
+        "_origin_line",
+        "_origin_pos",
+        "attrs",
+        "children",
+        "data",
+        "name",
+        "namespace",
+        "parent",
+    )
     name: str
     parent: SimpleDomNode | ElementNode | TemplateNode | None
@@ -163,6 +206,9 @@ class SimpleDomNode:
     children: list[Any] | None
     data: str | Doctype | None
     namespace: str | None
+    _origin_pos: int | None
+    _origin_line: int | None
+    _origin_col: int | None
     def __init__(
         self,
@@ -174,6 +220,9 @@ class SimpleDomNode:
         self.name = name
         self.parent = None
         self.data = data
+        self._origin_pos = None
+        self._origin_line = None
+        self._origin_col = None
         if name.startswith("#") or name == "!doctype":
             self.namespace = namespace
@@ -193,14 +242,41 @@ class SimpleDomNode:
             self.children.append(node)
             node.parent = self
+    @property
+    def origin_offset(self) -> int | None:
+        """Best-effort origin offset (0-indexed) in the source HTML, if known."""
+        return self._origin_pos
+    @property
+    def origin_line(self) -> int | None:
+        return self._origin_line
+    @property
+    def origin_col(self) -> int | None:
+        return self._origin_col
+    @property
+    def origin_location(self) -> tuple[int, int] | None:
+        if self._origin_line is None or self._origin_col is None:
+            return None
+        return (self._origin_line, self._origin_col)
     def remove_child(self, node: Any) -> None:
         if self.children is not None:
             self.children.remove(node)
             node.parent = None
-    def to_html(self, indent: int = 0, indent_size: int = 2, pretty: bool = True) -> str:
+    def to_html(
+        self,
+        indent: int = 0,
+        indent_size: int = 2,
+        pretty: bool = True,
+        *,
+        safe: bool = True,
+        policy: SanitizationPolicy | None = None,
+    ) -> str:
         """Convert node to HTML string."""
-        return to_html(self, indent, indent_size, pretty=pretty)
+        return to_html(self, indent, indent_size, pretty=pretty, safe=safe, policy=policy)
     def query(self, selector: str) -> list[Any]:
         """
@@ -232,27 +308,43 @@ class SimpleDomNode:
             return ""
         return ""
-    def to_text(self, separator: str = " ", strip: bool = True) -> str:
+    def to_text(
+        self,
+        separator: str = " ",
+        strip: bool = True,
+        *,
+        safe: bool = True,
+        policy: SanitizationPolicy | None = None,
+    ) -> str:
         """Return the concatenated text of this node's descendants.
         - `separator` controls how text nodes are joined (default: a single space).
         - `strip=True` strips each text node and drops empty segments.
+        - `safe=True` sanitizes untrusted HTML before extracting text.
+        - `policy` overrides the default sanitization policy.
         Template element contents are included via `template_content`.
         """
+        node: Any = sanitize(self, policy=policy) if safe else self
         parts: list[str] = []
-        _to_text_collect(self, parts, strip=strip)
+        _to_text_collect(node, parts, strip=strip)
         if not parts:
             return ""
         return separator.join(parts)
-    def to_markdown(self) -> str:
+    def to_markdown(self, *, safe: bool = True, policy: SanitizationPolicy | None = None) -> str:
         """Return a GitHub Flavored Markdown representation of this subtree.
         This is a pragmatic HTML->Markdown converter intended for readability.
         - Tables and images are preserved as raw HTML.
         - Unknown elements fall back to rendering their children.
         """
+        if safe:
+            node = sanitize(self, policy=policy)
+            builder = _MarkdownBuilder()
+            _to_markdown_walk(node, builder, preserve_whitespace=False, list_depth=0)
+            return builder.finish()
         builder = _MarkdownBuilder()
         _to_markdown_walk(self, builder, preserve_whitespace=False, list_depth=0)
         return builder.finish()
@@ -329,6 +421,9 @@ class SimpleDomNode:
             self.data,
             self.namespace,
         )
+        clone._origin_pos = self._origin_pos
+        clone._origin_line = self._origin_line
+        clone._origin_col = self._origin_col
         if deep and self.children:
             for child in self.children:
                 clone.append_child(child.clone_node(deep=True))
@@ -350,9 +445,15 @@ class ElementNode(SimpleDomNode):
         self.children = []
         self.attrs = attrs if attrs is not None else {}
         self.template_content = None
+        self._origin_pos = None
+        self._origin_line = None
+        self._origin_col = None
     def clone_node(self, deep: bool = False) -> ElementNode:
         clone = ElementNode(self.name, self.attrs.copy() if self.attrs else {}, self.namespace)
+        clone._origin_pos = self._origin_pos
+        clone._origin_line = self._origin_line
+        clone._origin_col = self._origin_col
         if deep:
             for child in self.children:
                 clone.append_child(child.clone_node(deep=True))
@@ -382,6 +483,9 @@ class TemplateNode(ElementNode):
             None,
             self.namespace,
         )
+        clone._origin_pos = self._origin_pos
+        clone._origin_line = self._origin_line
+        clone._origin_col = self._origin_col
         if deep:
             if self.template_content:
                 clone.template_content = self.template_content.clone_node(deep=True)
@@ -391,26 +495,62 @@ class TemplateNode(ElementNode):
 class TextNode:
-    __slots__ = ("data", "name", "namespace", "parent")
+    __slots__ = ("_origin_col", "_origin_line", "_origin_pos", "data", "name", "namespace", "parent")
     data: str | None
     name: str
     namespace: None
     parent: SimpleDomNode | ElementNode | TemplateNode | None
+    _origin_pos: int | None
+    _origin_line: int | None
+    _origin_col: int | None
     def __init__(self, data: str | None) -> None:
         self.data = data
         self.parent = None
         self.name = "#text"
         self.namespace = None
+        self._origin_pos = None
+        self._origin_line = None
+        self._origin_col = None
+    @property
+    def origin_offset(self) -> int | None:
+        """Best-effort origin offset (0-indexed) in the source HTML, if known."""
+        return self._origin_pos
+    @property
+    def origin_line(self) -> int | None:
+        return self._origin_line
+    @property
+    def origin_col(self) -> int | None:
+        return self._origin_col
+    @property
+    def origin_location(self) -> tuple[int, int] | None:
+        if self._origin_line is None or self._origin_col is None:
+            return None
+        return (self._origin_line, self._origin_col)
     @property
     def text(self) -> str:
         """Return the text content of this node."""
         return self.data or ""
-    def to_text(self, separator: str = " ", strip: bool = True) -> str:
+    def to_text(
+        self,
+        separator: str = " ",
+        strip: bool = True,
+        *,
+        safe: bool = True,
+        policy: SanitizationPolicy | None = None,
+    ) -> str:
         # Parameters are accepted for API consistency; they don't affect leaf nodes.
+        _ = separator
+        _ = safe
+        _ = policy
         if self.data is None:
             return ""
         if strip:
@@ -432,7 +572,11 @@ class TextNode:
         return False
     def clone_node(self, deep: bool = False) -> TextNode:
-        return TextNode(self.data)
+        clone = TextNode(self.data)
+        clone._origin_pos = self._origin_pos
+        clone._origin_line = self._origin_line
+        clone._origin_col = self._origin_col
+        return clone
 _MARKDOWN_BLOCK_ELEMENTS: frozenset[str] = frozenset(
@@ -463,7 +607,13 @@ _MARKDOWN_BLOCK_ELEMENTS: frozenset[str] = frozenset(
 )
-def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace: bool, list_depth: int) -> None:
+def _to_markdown_walk(
+    node: Any,
+    builder: _MarkdownBuilder,
+    preserve_whitespace: bool,
+    list_depth: int,
+    in_link: bool = False,
+) -> None:
     name: str = node.name
     if name == "#text":
@@ -474,7 +624,10 @@ def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace:
         return
     if name == "br":
-        builder.newline(1)
+        if in_link:
+            builder.text(" ", preserve_whitespace=False)
+        else:
+            builder.newline(1)
         return
     # Comments/doctype don't contribute.
@@ -485,52 +638,80 @@ def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace:
     if name.startswith("#"):
         if node.children:
             for child in node.children:
-                _to_markdown_walk(child, builder, preserve_whitespace, list_depth)
+                _to_markdown_walk(
+                    child,
+                    builder,
+                    preserve_whitespace,
+                    list_depth,
+                    in_link=in_link,
+                )
         return
     tag = name.lower()
+    # Metadata containers don't contribute to body text.
+    if tag == "head" or tag == "title":
+        return
     # Preserve <img> and <table> as HTML.
     if tag == "img":
         builder.raw(node.to_html(indent=0, indent_size=2, pretty=False))
         return
     if tag == "table":
-        builder.ensure_newlines(2 if builder._buf else 0)
+        if not in_link:
+            builder.ensure_newlines(2 if builder._buf else 0)
         builder.raw(node.to_html(indent=0, indent_size=2, pretty=False))
-        builder.ensure_newlines(2)
+        if not in_link:
+            builder.ensure_newlines(2)
         return
     # Headings.
     if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}:
-        builder.ensure_newlines(2 if builder._buf else 0)
-        level = int(tag[1])
-        builder.raw("#" * level)
-        builder.raw(" ")
+        if not in_link:
+            builder.ensure_newlines(2 if builder._buf else 0)
+            level = int(tag[1])
+            builder.raw("#" * level)
+            builder.raw(" ")
         if node.children:
             for child in node.children:
-                _to_markdown_walk(child, builder, preserve_whitespace=False, list_depth=list_depth)
-        builder.ensure_newlines(2)
+                _to_markdown_walk(
+                    child,
+                    builder,
+                    preserve_whitespace=False,
+                    list_depth=list_depth,
+                    in_link=in_link,
+                )
+        if not in_link:
+            builder.ensure_newlines(2)
         return
     # Horizontal rule.
     if tag == "hr":
-        builder.ensure_newlines(2 if builder._buf else 0)
-        builder.raw("---")
-        builder.ensure_newlines(2)
+        if not in_link:
+            builder.ensure_newlines(2 if builder._buf else 0)
+            builder.raw("---")
+            builder.ensure_newlines(2)
         return
     # Code blocks.
     if tag == "pre":
-        builder.ensure_newlines(2 if builder._buf else 0)
-        code = node.to_text(separator="", strip=False)
-        builder.raw("```")
-        builder.newline(1)
-        if code:
-            builder.raw(code.rstrip("\n"))
+        if not in_link:
+            builder.ensure_newlines(2 if builder._buf else 0)
+            code = node.to_text(separator="", strip=False)
+            builder.raw("```")
             builder.newline(1)
-        builder.raw("```")
-        builder.ensure_newlines(2)
+            if code:
+                builder.raw(code.rstrip("\n"))
+                builder.newline(1)
+            builder.raw("```")
+            builder.ensure_newlines(2)
+        else:
+            # Inside link, render as inline code or text
+            code = node.to_text(separator="", strip=False)
+            builder.raw(_markdown_code_span(code))
         return
     # Inline code.
@@ -541,64 +722,126 @@ def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace:
     # Paragraph-like blocks.
     if tag == "p":
-        builder.ensure_newlines(2 if builder._buf else 0)
+        if not in_link:
+            builder.ensure_newlines(2 if builder._buf else 0)
         if node.children:
             for child in node.children:
-                _to_markdown_walk(child, builder, preserve_whitespace=False, list_depth=list_depth)
-        builder.ensure_newlines(2)
+                _to_markdown_walk(
+                    child,
+                    builder,
+                    preserve_whitespace=False,
+                    list_depth=list_depth,
+                    in_link=in_link,
+                )
+        if not in_link:
+            builder.ensure_newlines(2)
+        else:
+            builder.text(" ", preserve_whitespace=False)
         return
     # Blockquotes.
     if tag == "blockquote":
-        builder.ensure_newlines(2 if builder._buf else 0)
-        inner = _MarkdownBuilder()
-        if node.children:
-            for child in node.children:
-                _to_markdown_walk(child, inner, preserve_whitespace=False, list_depth=list_depth)
-        text = inner.finish()
-        if text:
-            lines = text.split("\n")
-            for i, line in enumerate(lines):
-                if i:
-                    builder.newline(1)
-                builder.raw("> ")
-                builder.raw(line)
-        builder.ensure_newlines(2)
+        if not in_link:
+            builder.ensure_newlines(2 if builder._buf else 0)
+            inner = _MarkdownBuilder()
+            if node.children:
+                for child in node.children:
+                    _to_markdown_walk(
+                        child,
+                        inner,
+                        preserve_whitespace=False,
+                        list_depth=list_depth,
+                        in_link=in_link,
+                    )
+            text = inner.finish()
+            if text:
+                lines = text.split("\n")
+                for i, line in enumerate(lines):
+                    if i:
+                        builder.newline(1)
+                    builder.raw("> ")
+                    builder.raw(line)
+            builder.ensure_newlines(2)
+        else:
+            if node.children:
+                for child in node.children:
+                    _to_markdown_walk(
+                        child,
+                        builder,
+                        preserve_whitespace=False,
+                        list_depth=list_depth,
+                        in_link=in_link,
+                    )
         return
     # Lists.
     if tag in {"ul", "ol"}:
-        builder.ensure_newlines(2 if builder._buf else 0)
-        ordered = tag == "ol"
-        idx = 1
-        for child in node.children or []:
-            if child.name.lower() != "li":
-                continue
-            if idx > 1:
-                builder.newline(1)
-            indent = "  " * list_depth
-            marker = f"{idx}. " if ordered else "- "
-            builder.raw(indent)
-            builder.raw(marker)
-            # Render list item content inline-ish.
-            for li_child in child.children or []:
-                _to_markdown_walk(li_child, builder, preserve_whitespace=False, list_depth=list_depth + 1)
-            idx += 1
-        builder.ensure_newlines(2)
+        if not in_link:
+            builder.ensure_newlines(2 if builder._buf else 0)
+            ordered = tag == "ol"
+            idx = 1
+            for child in node.children or []:
+                if child.name.lower() != "li":
+                    continue
+                if idx > 1:
+                    builder.newline(1)
+                indent = "  " * list_depth
+                marker = f"{idx}. " if ordered else "- "
+                builder.raw(indent)
+                builder.raw(marker)
+                # Render list item content inline-ish.
+                for li_child in child.children or []:
+                    _to_markdown_walk(
+                        li_child,
+                        builder,
+                        preserve_whitespace=False,
+                        list_depth=list_depth + 1,
+                        in_link=in_link,
+                    )
+                idx += 1
+            builder.ensure_newlines(2)
+        else:
+            # Flatten list inside link
+            for child in node.children or []:
+                if child.name.lower() != "li":
+                    continue
+                builder.raw(" ")
+                for li_child in child.children or []:
+                    _to_markdown_walk(
+                        li_child,
+                        builder,
+                        preserve_whitespace=False,
+                        list_depth=list_depth + 1,
+                        in_link=in_link,
+                    )
         return
     # Emphasis/strong.
     if tag in {"em", "i"}:
         builder.raw("*")
         for child in node.children or []:
-            _to_markdown_walk(child, builder, preserve_whitespace=False, list_depth=list_depth)
+            _to_markdown_walk(
+                child,
+                builder,
+                preserve_whitespace=False,
+                list_depth=list_depth,
+                in_link=in_link,
+            )
         builder.raw("*")
         return
     if tag in {"strong", "b"}:
         builder.raw("**")
         for child in node.children or []:
-            _to_markdown_walk(child, builder, preserve_whitespace=False, list_depth=list_depth)
+            _to_markdown_walk(
+                child,
+                builder,
+                preserve_whitespace=False,
+                list_depth=list_depth,
+                in_link=in_link,
+            )
         builder.raw("**")
         return
@@ -608,13 +851,24 @@ def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace:
         if node.attrs and "href" in node.attrs and node.attrs["href"] is not None:
             href = str(node.attrs["href"])
-        builder.raw("[")
+        # Capture inner text to strip whitespace.
+        inner_builder = _MarkdownBuilder()
         for child in node.children or []:
-            _to_markdown_walk(child, builder, preserve_whitespace=False, list_depth=list_depth)
+            _to_markdown_walk(
+                child,
+                inner_builder,
+                preserve_whitespace=False,
+                list_depth=list_depth,
+                in_link=True,
+            )
+        link_text = inner_builder.finish()
+        builder.raw("[")
+        builder.raw(link_text)
         builder.raw("]")
         if href:
             builder.raw("(")
-            builder.raw(href)
+            builder.raw(_markdown_link_destination(href))
             builder.raw(")")
         return
@@ -622,11 +876,26 @@ def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace:
     next_preserve = preserve_whitespace or (tag in {"textarea", "script", "style"})
     if node.children:
         for child in node.children:
-            _to_markdown_walk(child, builder, next_preserve, list_depth)
+            _to_markdown_walk(
+                child,
+                builder,
+                next_preserve,
+                list_depth,
+                in_link=in_link,
+            )
     if isinstance(node, ElementNode) and node.template_content:
-        _to_markdown_walk(node.template_content, builder, next_preserve, list_depth)
+        _to_markdown_walk(
+            node.template_content,
+            builder,
+            next_preserve,
+            list_depth,
+            in_link=in_link,
+        )
     # Add spacing after block containers to keep output readable.
     if tag in _MARKDOWN_BLOCK_ELEMENTS:
-        builder.ensure_newlines(2)
+        if not in_link:
+            builder.ensure_newlines(2)
+        else:
+            builder.text(" ", preserve_whitespace=False)

justhtml 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

Potentially problematic release.

justhtml 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl