PyPI - justhtml - Versions diffs - 0.12.0__py3-none-any.whl → 0.38.0__py3-none-any.whl - Mend

justhtml 0.12.0__py3-none-any.whl → 0.38.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of justhtml might be problematic. Click here for more details.

Files changed (23) hide show

justhtml/__init__.py +48 -0
justhtml/__main__.py +86 -17
justhtml/constants.py +12 -0
justhtml/entities.py +45 -7
justhtml/errors.py +17 -3
justhtml/linkify.py +438 -0
justhtml/node.py +385 -97
justhtml/parser.py +139 -16
justhtml/sanitize.py +992 -0
justhtml/selector.py +117 -19
justhtml/serialize.py +671 -41
justhtml/tokenizer.py +364 -194
justhtml/tokens.py +28 -5
justhtml/transforms.py +2568 -0
justhtml/treebuilder.py +297 -204
justhtml/treebuilder_modes.py +208 -138
justhtml-0.38.0.dist-info/METADATA +213 -0
justhtml-0.38.0.dist-info/RECORD +26 -0
{justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/licenses/LICENSE +4 -1
justhtml-0.12.0.dist-info/METADATA +0 -164
justhtml-0.12.0.dist-info/RECORD +0 -23
{justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/WHEEL +0 -0
{justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/entry_points.txt +0 -0

justhtml/node.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from __future__ import annotations
 from typing import TYPE_CHECKING, Any
+from urllib.parse import quote
 from .selector import query
 from .serialize import to_html
@@ -43,6 +44,30 @@ def _markdown_code_span(s: str | None) -> str:
     return f"{fence}{s}{fence}"
+def _markdown_link_destination(url: str) -> str:
+    """Return a Markdown-safe link destination.
+    We primarily care about avoiding Markdown formatting injection and broken
+    parsing for URLs that contain whitespace or parentheses.
+    CommonMark supports destinations wrapped in angle brackets:
+    `[text](<https://example.com/a(b)c>)`
+    """
+    u = (url or "").strip()
+    if not u:
+        return ""
+    # If the destination contains characters that can terminate or confuse
+    # the Markdown destination parser, wrap in <...> and percent-encode
+    # whitespace and angle brackets.
+    if any(ch in u for ch in (" ", "\t", "\n", "\r", "(", ")", "<", ">")):
+        u = quote(u, safe=":/?#[]@!$&'*+,;=%-._~()")
+        return f"<{u}>"
+    return u
 class _MarkdownBuilder:
     __slots__ = ("_buf", "_newline_count", "_pending_space")
@@ -133,29 +158,46 @@ NodeType = "SimpleDomNode | ElementNode | TemplateNode | TextNode"
 def _to_text_collect(node: Any, parts: list[str], strip: bool) -> None:
-    name: str = node.name
-    if name == "#text":
-        data: str | None = node.data
-        if not data:
-            return
-        if strip:
-            data = data.strip()
+    # Iterative traversal avoids recursion overhead on large documents.
+    stack: list[Any] = [node]
+    while stack:
+        current = stack.pop()
+        name: str = current.name
+        if name == "#text":
+            data: str | None = current.data
             if not data:
-                return
-        parts.append(data)
-        return
+                continue
+            if strip:
+                data = data.strip()
+                if not data:
+                    continue
+            parts.append(data)
+            continue
-    if node.children:
-        for child in node.children:
-            _to_text_collect(child, parts, strip=strip)
+        # Preserve the same traversal order as the recursive implementation:
+        # children first, then template content.
+        if type(current) is TemplateNode and current.template_content:
+            stack.append(current.template_content)
-    if isinstance(node, ElementNode) and node.template_content:
-        _to_text_collect(node.template_content, parts, strip=strip)
+        children = current.children
+        if children:
+            stack.extend(reversed(children))
 class SimpleDomNode:
-    __slots__ = ("attrs", "children", "data", "name", "namespace", "parent")
+    __slots__ = (
+        "_origin_col",
+        "_origin_line",
+        "_origin_pos",
+        "_source_html",
+        "attrs",
+        "children",
+        "data",
+        "name",
+        "namespace",
+        "parent",
+    )
     name: str
     parent: SimpleDomNode | ElementNode | TemplateNode | None
@@ -163,6 +205,10 @@ class SimpleDomNode:
     children: list[Any] | None
     data: str | Doctype | None
     namespace: str | None
+    _origin_pos: int | None
+    _origin_line: int | None
+    _origin_col: int | None
+    _source_html: str | None
     def __init__(
         self,
@@ -174,6 +220,10 @@ class SimpleDomNode:
         self.name = name
         self.parent = None
         self.data = data
+        self._source_html = None
+        self._origin_pos = None
+        self._origin_line = None
+        self._origin_col = None
         if name.startswith("#") or name == "!doctype":
             self.namespace = namespace
@@ -193,12 +243,36 @@ class SimpleDomNode:
             self.children.append(node)
             node.parent = self
+    @property
+    def origin_offset(self) -> int | None:
+        """Best-effort origin offset (0-indexed) in the source HTML, if known."""
+        return self._origin_pos
+    @property
+    def origin_line(self) -> int | None:
+        return self._origin_line
+    @property
+    def origin_col(self) -> int | None:
+        return self._origin_col
+    @property
+    def origin_location(self) -> tuple[int, int] | None:
+        if self._origin_line is None or self._origin_col is None:
+            return None
+        return (self._origin_line, self._origin_col)
     def remove_child(self, node: Any) -> None:
         if self.children is not None:
             self.children.remove(node)
             node.parent = None
-    def to_html(self, indent: int = 0, indent_size: int = 2, pretty: bool = True) -> str:
+    def to_html(
+        self,
+        indent: int = 0,
+        indent_size: int = 2,
+        pretty: bool = True,
+    ) -> str:
         """Convert node to HTML string."""
         return to_html(self, indent, indent_size, pretty=pretty)
@@ -232,16 +306,20 @@ class SimpleDomNode:
             return ""
         return ""
-    def to_text(self, separator: str = " ", strip: bool = True) -> str:
+    def to_text(
+        self,
+        separator: str = " ",
+        strip: bool = True,
+    ) -> str:
         """Return the concatenated text of this node's descendants.
         - `separator` controls how text nodes are joined (default: a single space).
         - `strip=True` strips each text node and drops empty segments.
         Template element contents are included via `template_content`.
         """
+        node: Any = self
         parts: list[str] = []
-        _to_text_collect(self, parts, strip=strip)
+        _to_text_collect(node, parts, strip=strip)
         if not parts:
             return ""
         return separator.join(parts)
@@ -313,22 +391,28 @@ class SimpleDomNode:
         """Return True if this node has children."""
         return bool(self.children)
-    def clone_node(self, deep: bool = False) -> SimpleDomNode:
+    def clone_node(self, deep: bool = False, override_attrs: dict[str, str | None] | None = None) -> SimpleDomNode:
         """
         Clone this node.
         Args:
             deep: If True, recursively clone children.
+            override_attrs: Optional dictionary to use as attributes for the clone.
         Returns:
             A new node that is a copy of this node.
         """
+        attrs = override_attrs if override_attrs is not None else (self.attrs.copy() if self.attrs else None)
         clone = SimpleDomNode(
             self.name,
-            self.attrs.copy() if self.attrs else None,
+            attrs,
             self.data,
             self.namespace,
         )
+        clone._source_html = self._source_html
+        clone._origin_pos = self._origin_pos
+        clone._origin_line = self._origin_line
+        clone._origin_col = self._origin_col
         if deep and self.children:
             for child in self.children:
                 clone.append_child(child.clone_node(deep=True))
@@ -336,11 +420,25 @@ class SimpleDomNode:
 class ElementNode(SimpleDomNode):
-    __slots__ = ("template_content",)
+    __slots__ = (
+        "_end_tag_end",
+        "_end_tag_present",
+        "_end_tag_start",
+        "_self_closing",
+        "_start_tag_end",
+        "_start_tag_start",
+        "template_content",
+    )
     template_content: SimpleDomNode | None
     children: list[Any]
     attrs: dict[str, str | None]
+    _start_tag_start: int | None
+    _start_tag_end: int | None
+    _end_tag_start: int | None
+    _end_tag_end: int | None
+    _end_tag_present: bool
+    _self_closing: bool
     def __init__(self, name: str, attrs: dict[str, str | None] | None, namespace: str | None) -> None:
         self.name = name
@@ -350,9 +448,30 @@ class ElementNode(SimpleDomNode):
         self.children = []
         self.attrs = attrs if attrs is not None else {}
         self.template_content = None
-    def clone_node(self, deep: bool = False) -> ElementNode:
-        clone = ElementNode(self.name, self.attrs.copy() if self.attrs else {}, self.namespace)
+        self._source_html = None
+        self._origin_pos = None
+        self._origin_line = None
+        self._origin_col = None
+        self._start_tag_start = None
+        self._start_tag_end = None
+        self._end_tag_start = None
+        self._end_tag_end = None
+        self._end_tag_present = False
+        self._self_closing = False
+    def clone_node(self, deep: bool = False, override_attrs: dict[str, str | None] | None = None) -> ElementNode:
+        attrs = override_attrs if override_attrs is not None else (self.attrs.copy() if self.attrs else {})
+        clone = ElementNode(self.name, attrs, self.namespace)
+        clone._source_html = self._source_html
+        clone._origin_pos = self._origin_pos
+        clone._origin_line = self._origin_line
+        clone._origin_col = self._origin_col
+        clone._start_tag_start = self._start_tag_start
+        clone._start_tag_end = self._start_tag_end
+        clone._end_tag_start = self._end_tag_start
+        clone._end_tag_end = self._end_tag_end
+        clone._end_tag_present = self._end_tag_present
+        clone._self_closing = self._self_closing
         if deep:
             for child in self.children:
                 clone.append_child(child.clone_node(deep=True))
@@ -375,13 +494,24 @@ class TemplateNode(ElementNode):
         else:
             self.template_content = None
-    def clone_node(self, deep: bool = False) -> TemplateNode:
+    def clone_node(self, deep: bool = False, override_attrs: dict[str, str | None] | None = None) -> TemplateNode:
+        attrs = override_attrs if override_attrs is not None else (self.attrs.copy() if self.attrs else {})
         clone = TemplateNode(
             self.name,
-            self.attrs.copy() if self.attrs else {},
+            attrs,
             None,
             self.namespace,
         )
+        clone._source_html = self._source_html
+        clone._origin_pos = self._origin_pos
+        clone._origin_line = self._origin_line
+        clone._origin_col = self._origin_col
+        clone._start_tag_start = self._start_tag_start
+        clone._start_tag_end = self._start_tag_end
+        clone._end_tag_start = self._end_tag_start
+        clone._end_tag_end = self._end_tag_end
+        clone._end_tag_present = self._end_tag_present
+        clone._self_closing = self._self_closing
         if deep:
             if self.template_content:
                 clone.template_content = self.template_content.clone_node(deep=True)
@@ -391,26 +521,55 @@ class TemplateNode(ElementNode):
 class TextNode:
-    __slots__ = ("data", "name", "namespace", "parent")
+    __slots__ = ("_origin_col", "_origin_line", "_origin_pos", "data", "name", "namespace", "parent")
     data: str | None
     name: str
     namespace: None
     parent: SimpleDomNode | ElementNode | TemplateNode | None
+    _origin_pos: int | None
+    _origin_line: int | None
+    _origin_col: int | None
     def __init__(self, data: str | None) -> None:
         self.data = data
         self.parent = None
         self.name = "#text"
         self.namespace = None
+        self._origin_pos = None
+        self._origin_line = None
+        self._origin_col = None
+    @property
+    def origin_offset(self) -> int | None:
+        """Best-effort origin offset (0-indexed) in the source HTML, if known."""
+        return self._origin_pos
+    @property
+    def origin_line(self) -> int | None:
+        return self._origin_line
+    @property
+    def origin_col(self) -> int | None:
+        return self._origin_col
+    @property
+    def origin_location(self) -> tuple[int, int] | None:
+        if self._origin_line is None or self._origin_col is None:
+            return None
+        return (self._origin_line, self._origin_col)
     @property
     def text(self) -> str:
         """Return the text content of this node."""
         return self.data or ""
-    def to_text(self, separator: str = " ", strip: bool = True) -> str:
-        # Parameters are accepted for API consistency; they don't affect leaf nodes.
+    def to_text(
+        self,
+        separator: str = " ",
+        strip: bool = True,
+    ) -> str:
+        _ = separator
         if self.data is None:
             return ""
         if strip:
@@ -432,7 +591,11 @@ class TextNode:
         return False
     def clone_node(self, deep: bool = False) -> TextNode:
-        return TextNode(self.data)
+        clone = TextNode(self.data)
+        clone._origin_pos = self._origin_pos
+        clone._origin_line = self._origin_line
+        clone._origin_col = self._origin_col
+        return clone
 _MARKDOWN_BLOCK_ELEMENTS: frozenset[str] = frozenset(
@@ -463,7 +626,13 @@ _MARKDOWN_BLOCK_ELEMENTS: frozenset[str] = frozenset(
 )
-def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace: bool, list_depth: int) -> None:
+def _to_markdown_walk(
+    node: Any,
+    builder: _MarkdownBuilder,
+    preserve_whitespace: bool,
+    list_depth: int,
+    in_link: bool = False,
+) -> None:
     name: str = node.name
     if name == "#text":
@@ -474,7 +643,10 @@ def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace:
         return
     if name == "br":
-        builder.newline(1)
+        if in_link:
+            builder.text(" ", preserve_whitespace=False)
+        else:
+            builder.newline(1)
         return
     # Comments/doctype don't contribute.
@@ -485,52 +657,80 @@ def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace:
     if name.startswith("#"):
         if node.children:
             for child in node.children:
-                _to_markdown_walk(child, builder, preserve_whitespace, list_depth)
+                _to_markdown_walk(
+                    child,
+                    builder,
+                    preserve_whitespace,
+                    list_depth,
+                    in_link=in_link,
+                )
         return
     tag = name.lower()
+    # Metadata containers don't contribute to body text.
+    if tag == "head" or tag == "title":
+        return
     # Preserve <img> and <table> as HTML.
     if tag == "img":
         builder.raw(node.to_html(indent=0, indent_size=2, pretty=False))
         return
     if tag == "table":
-        builder.ensure_newlines(2 if builder._buf else 0)
+        if not in_link:
+            builder.ensure_newlines(2 if builder._buf else 0)
         builder.raw(node.to_html(indent=0, indent_size=2, pretty=False))
-        builder.ensure_newlines(2)
+        if not in_link:
+            builder.ensure_newlines(2)
         return
     # Headings.
     if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}:
-        builder.ensure_newlines(2 if builder._buf else 0)
-        level = int(tag[1])
-        builder.raw("#" * level)
-        builder.raw(" ")
+        if not in_link:
+            builder.ensure_newlines(2 if builder._buf else 0)
+            level = int(tag[1])
+            builder.raw("#" * level)
+            builder.raw(" ")
         if node.children:
             for child in node.children:
-                _to_markdown_walk(child, builder, preserve_whitespace=False, list_depth=list_depth)
-        builder.ensure_newlines(2)
+                _to_markdown_walk(
+                    child,
+                    builder,
+                    preserve_whitespace=False,
+                    list_depth=list_depth,
+                    in_link=in_link,
+                )
+        if not in_link:
+            builder.ensure_newlines(2)
         return
     # Horizontal rule.
     if tag == "hr":
-        builder.ensure_newlines(2 if builder._buf else 0)
-        builder.raw("---")
-        builder.ensure_newlines(2)
+        if not in_link:
+            builder.ensure_newlines(2 if builder._buf else 0)
+            builder.raw("---")
+            builder.ensure_newlines(2)
         return
     # Code blocks.
     if tag == "pre":
-        builder.ensure_newlines(2 if builder._buf else 0)
-        code = node.to_text(separator="", strip=False)
-        builder.raw("```")
-        builder.newline(1)
-        if code:
-            builder.raw(code.rstrip("\n"))
+        if not in_link:
+            builder.ensure_newlines(2 if builder._buf else 0)
+            code = node.to_text(separator="", strip=False)
+            builder.raw("```")
             builder.newline(1)
-        builder.raw("```")
-        builder.ensure_newlines(2)
+            if code:
+                builder.raw(code.rstrip("\n"))
+                builder.newline(1)
+            builder.raw("```")
+            builder.ensure_newlines(2)
+        else:
+            # Inside link, render as inline code or text
+            code = node.to_text(separator="", strip=False)
+            builder.raw(_markdown_code_span(code))
         return
     # Inline code.
@@ -541,64 +741,126 @@ def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace:
     # Paragraph-like blocks.
     if tag == "p":
-        builder.ensure_newlines(2 if builder._buf else 0)
+        if not in_link:
+            builder.ensure_newlines(2 if builder._buf else 0)
         if node.children:
             for child in node.children:
-                _to_markdown_walk(child, builder, preserve_whitespace=False, list_depth=list_depth)
-        builder.ensure_newlines(2)
+                _to_markdown_walk(
+                    child,
+                    builder,
+                    preserve_whitespace=False,
+                    list_depth=list_depth,
+                    in_link=in_link,
+                )
+        if not in_link:
+            builder.ensure_newlines(2)
+        else:
+            builder.text(" ", preserve_whitespace=False)
         return
     # Blockquotes.
     if tag == "blockquote":
-        builder.ensure_newlines(2 if builder._buf else 0)
-        inner = _MarkdownBuilder()
-        if node.children:
-            for child in node.children:
-                _to_markdown_walk(child, inner, preserve_whitespace=False, list_depth=list_depth)
-        text = inner.finish()
-        if text:
-            lines = text.split("\n")
-            for i, line in enumerate(lines):
-                if i:
-                    builder.newline(1)
-                builder.raw("> ")
-                builder.raw(line)
-        builder.ensure_newlines(2)
+        if not in_link:
+            builder.ensure_newlines(2 if builder._buf else 0)
+            inner = _MarkdownBuilder()
+            if node.children:
+                for child in node.children:
+                    _to_markdown_walk(
+                        child,
+                        inner,
+                        preserve_whitespace=False,
+                        list_depth=list_depth,
+                        in_link=in_link,
+                    )
+            text = inner.finish()
+            if text:
+                lines = text.split("\n")
+                for i, line in enumerate(lines):
+                    if i:
+                        builder.newline(1)
+                    builder.raw("> ")
+                    builder.raw(line)
+            builder.ensure_newlines(2)
+        else:
+            if node.children:
+                for child in node.children:
+                    _to_markdown_walk(
+                        child,
+                        builder,
+                        preserve_whitespace=False,
+                        list_depth=list_depth,
+                        in_link=in_link,
+                    )
         return
     # Lists.
     if tag in {"ul", "ol"}:
-        builder.ensure_newlines(2 if builder._buf else 0)
-        ordered = tag == "ol"
-        idx = 1
-        for child in node.children or []:
-            if child.name.lower() != "li":
-                continue
-            if idx > 1:
-                builder.newline(1)
-            indent = "  " * list_depth
-            marker = f"{idx}. " if ordered else "- "
-            builder.raw(indent)
-            builder.raw(marker)
-            # Render list item content inline-ish.
-            for li_child in child.children or []:
-                _to_markdown_walk(li_child, builder, preserve_whitespace=False, list_depth=list_depth + 1)
-            idx += 1
-        builder.ensure_newlines(2)
+        if not in_link:
+            builder.ensure_newlines(2 if builder._buf else 0)
+            ordered = tag == "ol"
+            idx = 1
+            for child in node.children or []:
+                if child.name.lower() != "li":
+                    continue
+                if idx > 1:
+                    builder.newline(1)
+                indent = "  " * list_depth
+                marker = f"{idx}. " if ordered else "- "
+                builder.raw(indent)
+                builder.raw(marker)
+                # Render list item content inline-ish.
+                for li_child in child.children or []:
+                    _to_markdown_walk(
+                        li_child,
+                        builder,
+                        preserve_whitespace=False,
+                        list_depth=list_depth + 1,
+                        in_link=in_link,
+                    )
+                idx += 1
+            builder.ensure_newlines(2)
+        else:
+            # Flatten list inside link
+            for child in node.children or []:
+                if child.name.lower() != "li":
+                    continue
+                builder.raw(" ")
+                for li_child in child.children or []:
+                    _to_markdown_walk(
+                        li_child,
+                        builder,
+                        preserve_whitespace=False,
+                        list_depth=list_depth + 1,
+                        in_link=in_link,
+                    )
         return
     # Emphasis/strong.
     if tag in {"em", "i"}:
         builder.raw("*")
         for child in node.children or []:
-            _to_markdown_walk(child, builder, preserve_whitespace=False, list_depth=list_depth)
+            _to_markdown_walk(
+                child,
+                builder,
+                preserve_whitespace=False,
+                list_depth=list_depth,
+                in_link=in_link,
+            )
         builder.raw("*")
         return
     if tag in {"strong", "b"}:
         builder.raw("**")
         for child in node.children or []:
-            _to_markdown_walk(child, builder, preserve_whitespace=False, list_depth=list_depth)
+            _to_markdown_walk(
+                child,
+                builder,
+                preserve_whitespace=False,
+                list_depth=list_depth,
+                in_link=in_link,
+            )
         builder.raw("**")
         return
@@ -608,13 +870,24 @@ def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace:
         if node.attrs and "href" in node.attrs and node.attrs["href"] is not None:
             href = str(node.attrs["href"])
-        builder.raw("[")
+        # Capture inner text to strip whitespace.
+        inner_builder = _MarkdownBuilder()
         for child in node.children or []:
-            _to_markdown_walk(child, builder, preserve_whitespace=False, list_depth=list_depth)
+            _to_markdown_walk(
+                child,
+                inner_builder,
+                preserve_whitespace=False,
+                list_depth=list_depth,
+                in_link=True,
+            )
+        link_text = inner_builder.finish()
+        builder.raw("[")
+        builder.raw(link_text)
         builder.raw("]")
         if href:
             builder.raw("(")
-            builder.raw(href)
+            builder.raw(_markdown_link_destination(href))
             builder.raw(")")
         return
@@ -622,11 +895,26 @@ def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace:
     next_preserve = preserve_whitespace or (tag in {"textarea", "script", "style"})
     if node.children:
         for child in node.children:
-            _to_markdown_walk(child, builder, next_preserve, list_depth)
+            _to_markdown_walk(
+                child,
+                builder,
+                next_preserve,
+                list_depth,
+                in_link=in_link,
+            )
     if isinstance(node, ElementNode) and node.template_content:
-        _to_markdown_walk(node.template_content, builder, next_preserve, list_depth)
+        _to_markdown_walk(
+            node.template_content,
+            builder,
+            next_preserve,
+            list_depth,
+            in_link=in_link,
+        )
     # Add spacing after block containers to keep output readable.
     if tag in _MARKDOWN_BLOCK_ELEMENTS:
-        builder.ensure_newlines(2)
+        if not in_link:
+            builder.ensure_newlines(2)
+        else:
+            builder.text(" ", preserve_whitespace=False)

justhtml 0.12.0__py3-none-any.whl → 0.38.0__py3-none-any.whl

Potentially problematic release.

justhtml 0.12.0__py3-none-any.whl → 0.38.0__py3-none-any.whl