PyPI - justhtml - Versions diffs - 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl - Mend

justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (28)

justhtml/__init__.py +28 -0
justhtml/__main__.py +161 -13
justhtml/constants.py +17 -1
justhtml/context.py +7 -1
justhtml/encoding.py +405 -0
justhtml/entities.py +57 -17
justhtml/errors.py +20 -4
justhtml/linkify.py +438 -0
justhtml/node.py +738 -41
justhtml/parser.py +188 -21
justhtml/py.typed +0 -0
justhtml/sanitize.py +1141 -0
justhtml/selector.py +240 -104
justhtml/serialize.py +418 -57
justhtml/stream.py +34 -10
justhtml/tokenizer.py +433 -289
justhtml/tokens.py +91 -23
justhtml/transforms.py +690 -0
justhtml/treebuilder.py +196 -111
justhtml/treebuilder_modes.py +191 -117
justhtml/treebuilder_utils.py +11 -4
justhtml-0.33.0.dist-info/METADATA +196 -0
justhtml-0.33.0.dist-info/RECORD +26 -0
justhtml-0.33.0.dist-info/entry_points.txt +2 -0
{justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/licenses/LICENSE +4 -1
justhtml-0.6.0.dist-info/METADATA +0 -126
justhtml-0.6.0.dist-info/RECORD +0 -20
{justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/WHEEL +0 -0

justhtml/parser.py CHANGED Viewed

@@ -1,8 +1,23 @@
 """Minimal JustHTML parser entry point."""
+from __future__ import annotations
+from typing import TYPE_CHECKING, Any
+from .context import FragmentContext
+from .encoding import decode_html
 from .tokenizer import Tokenizer, TokenizerOpts
+from .transforms import apply_compiled_transforms, compile_transforms
 from .treebuilder import TreeBuilder
+if TYPE_CHECKING:
+    from collections.abc import Callable
+    from .node import SimpleDomNode
+    from .sanitize import SanitizationPolicy
+    from .tokens import ParseError
+    from .transforms import Transform
 class StrictModeError(SyntaxError):
     """Raised when strict mode encounters a parse error.
@@ -11,7 +26,9 @@ class StrictModeError(SyntaxError):
     with source location highlighting.
     """
-    def __init__(self, error):
+    error: ParseError
+    def __init__(self, error: ParseError) -> None:
         self.error = error
         # Use the ParseError's as_exception() to get enhanced display
         exc = error.as_exception()
@@ -26,24 +43,58 @@ class StrictModeError(SyntaxError):
 class JustHTML:
-    __slots__ = ("debug", "errors", "fragment_context", "root", "tokenizer", "tree_builder")
+    __slots__ = ("debug", "encoding", "errors", "fragment_context", "root", "tokenizer", "tree_builder")
+    debug: bool
+    encoding: str | None
+    errors: list[ParseError]
+    fragment_context: FragmentContext | None
+    root: SimpleDomNode
+    tokenizer: Tokenizer
+    tree_builder: TreeBuilder
     def __init__(
         self,
-        html,
+        html: str | bytes | bytearray | memoryview | None,
         *,
-        collect_errors=False,
-        debug=False,
-        fragment_context=None,
-        iframe_srcdoc=False,
-        strict=False,
-        tokenizer_opts=None,
-        tree_builder=None,
-    ):
+        collect_errors: bool = False,
+        track_node_locations: bool = False,
+        debug: bool = False,
+        encoding: str | None = None,
+        fragment: bool = False,
+        fragment_context: FragmentContext | None = None,
+        iframe_srcdoc: bool = False,
+        strict: bool = False,
+        tokenizer_opts: TokenizerOpts | None = None,
+        tree_builder: TreeBuilder | None = None,
+        transforms: list[Transform] | None = None,
+    ) -> None:
+        if fragment_context is not None:
+            fragment = True
+        if fragment and fragment_context is None:
+            fragment_context = FragmentContext("div")
+        # Compile transforms early so invalid selectors fail fast.
+        compiled_transforms = None
+        if transforms:
+            compiled_transforms = compile_transforms(tuple(transforms))
         self.debug = bool(debug)
         self.fragment_context = fragment_context
-        # Enable error collection if strict mode is on
+        self.encoding = None
+        html_str: str
+        if isinstance(html, (bytes, bytearray, memoryview)):
+            html_str, chosen = decode_html(bytes(html), transport_encoding=encoding)
+            self.encoding = chosen
+        elif html is not None:
+            html_str = str(html)
+        else:
+            html_str = ""
+        # Enable error collection if strict mode is on.
+        # Node location tracking is opt-in to avoid slowing down the common case.
         should_collect = collect_errors or strict
         self.tree_builder = tree_builder or TreeBuilder(
@@ -63,24 +114,140 @@ class JustHTML:
             elif tag_name in ("plaintext", "script"):
                 opts.initial_state = Tokenizer.PLAINTEXT
-        self.tokenizer = Tokenizer(self.tree_builder, opts, collect_errors=should_collect)
+        self.tokenizer = Tokenizer(
+            self.tree_builder,
+            opts,
+            collect_errors=should_collect,
+            track_node_locations=bool(track_node_locations),
+        )
         # Link tokenizer to tree_builder for position info
         self.tree_builder.tokenizer = self.tokenizer
-        self.tokenizer.run(html or "")
+        self.tokenizer.run(html_str)
         self.root = self.tree_builder.finish()
-        # Merge errors from both tokenizer and tree builder
-        self.errors = self.tokenizer.errors + self.tree_builder.errors
+        if compiled_transforms is not None:
+            apply_compiled_transforms(self.root, compiled_transforms)
+        if should_collect:
+            # Merge errors from both tokenizer and tree builder.
+            # Public API: users expect errors to be ordered by input position.
+            merged_errors = self.tokenizer.errors + self.tree_builder.errors
+            self.errors = self._sorted_errors(merged_errors)
+        else:
+            self.errors = []
         # In strict mode, raise on first error
         if strict and self.errors:
             raise StrictModeError(self.errors[0])
-    def query(self, selector):
+    def query(self, selector: str) -> list[Any]:
         """Query the document using a CSS selector. Delegates to root.query()."""
         return self.root.query(selector)
-    def to_html(self, pretty=True, indent_size=2):
-        """Serialize the document to HTML. Delegates to root.to_html()."""
-        return self.root.to_html(indent=0, indent_size=indent_size, pretty=pretty)
+    @staticmethod
+    def _sorted_errors(errors: list[ParseError]) -> list[ParseError]:
+        indexed_errors = enumerate(errors)
+        return [
+            e
+            for _, e in sorted(
+                indexed_errors,
+                key=lambda t: (
+                    t[1].line if t[1].line is not None else 1_000_000_000,
+                    t[1].column if t[1].column is not None else 1_000_000_000,
+                    t[0],
+                ),
+            )
+        ]
+    def _set_security_errors(self, errors: list[ParseError]) -> None:
+        if not self.errors and not errors:
+            return
+        base = [e for e in self.errors if e.category != "security"]
+        self.errors = self._sorted_errors(base + errors)
+    def _with_security_error_collection(
+        self,
+        policy: SanitizationPolicy | None,
+        serialize: Callable[[], str],
+    ) -> str:
+        if policy is not None and policy.unsafe_handling == "collect":
+            policy.reset_collected_security_errors()
+            out = serialize()
+            self._set_security_errors(policy.collected_security_errors())
+            return out
+        # Avoid stale security errors if a previous serialization used collect.
+        self._set_security_errors([])
+        return serialize()
+    def to_html(
+        self,
+        pretty: bool = True,
+        indent_size: int = 2,
+        *,
+        safe: bool = True,
+        policy: SanitizationPolicy | None = None,
+    ) -> str:
+        """Serialize the document to HTML.
+        - `safe=True` sanitizes untrusted content before serialization.
+        - `policy` overrides the default sanitization policy.
+        """
+        if not safe:
+            return self.root.to_html(
+                indent=0,
+                indent_size=indent_size,
+                pretty=pretty,
+                safe=False,
+                policy=policy,
+            )
+        return self._with_security_error_collection(
+            policy,
+            lambda: self.root.to_html(
+                indent=0,
+                indent_size=indent_size,
+                pretty=pretty,
+                safe=True,
+                policy=policy,
+            ),
+        )
+    def to_text(
+        self,
+        separator: str = " ",
+        strip: bool = True,
+        *,
+        safe: bool = True,
+        policy: SanitizationPolicy | None = None,
+    ) -> str:
+        """Return the document's concatenated text.
+        - `safe=True` sanitizes untrusted content before text extraction.
+        - `policy` overrides the default sanitization policy.
+        Delegates to `root.to_text(...)`.
+        """
+        if not safe:
+            return self.root.to_text(separator=separator, strip=strip, safe=False, policy=policy)
+        return self._with_security_error_collection(
+            policy,
+            lambda: self.root.to_text(separator=separator, strip=strip, safe=True, policy=policy),
+        )
+    def to_markdown(self, *, safe: bool = True, policy: SanitizationPolicy | None = None) -> str:
+        """Return a GitHub Flavored Markdown representation.
+        - `safe=True` sanitizes untrusted content before conversion.
+        - `policy` overrides the default sanitization policy.
+        """
+        if not safe:
+            return self.root.to_markdown(safe=False, policy=policy)
+        return self._with_security_error_collection(
+            policy,
+            lambda: self.root.to_markdown(safe=True, policy=policy),
+        )

justhtml/py.typed ADDED Viewed

File without changes

justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl

justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl