PyPI - justhtml - Versions diffs - 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl - Mend

justhtml 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of justhtml might be problematic. Click here for more details.

Files changed (20)

justhtml/__init__.py +6 -0
justhtml/__main__.py +49 -16
justhtml/entities.py +45 -7
justhtml/errors.py +9 -0
justhtml/node.py +358 -89
justhtml/parser.py +70 -14
justhtml/sanitize.py +763 -0
justhtml/selector.py +114 -18
justhtml/serialize.py +332 -28
justhtml/tokenizer.py +249 -179
justhtml/tokens.py +8 -3
justhtml/treebuilder.py +50 -14
justhtml/treebuilder_modes.py +100 -36
justhtml-0.24.0.dist-info/METADATA +192 -0
justhtml-0.24.0.dist-info/RECORD +24 -0
{justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/licenses/LICENSE +4 -1
justhtml-0.12.0.dist-info/METADATA +0 -164
justhtml-0.12.0.dist-info/RECORD +0 -23
{justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/WHEEL +0 -0
{justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/entry_points.txt +0 -0

justhtml/parser.py CHANGED Viewed

@@ -4,13 +4,14 @@ from __future__ import annotations
 from typing import TYPE_CHECKING, Any
+from .context import FragmentContext
 from .encoding import decode_html
 from .tokenizer import Tokenizer, TokenizerOpts
 from .treebuilder import TreeBuilder
 if TYPE_CHECKING:
-    from .context import FragmentContext
     from .node import SimpleDomNode
+    from .sanitize import SanitizationPolicy
     from .tokens import ParseError
@@ -53,14 +54,22 @@ class JustHTML:
         html: str | bytes | bytearray | memoryview | None,
         *,
         collect_errors: bool = False,
+        track_node_locations: bool = False,
         debug: bool = False,
         encoding: str | None = None,
+        fragment: bool = False,
         fragment_context: FragmentContext | None = None,
         iframe_srcdoc: bool = False,
         strict: bool = False,
         tokenizer_opts: TokenizerOpts | None = None,
         tree_builder: TreeBuilder | None = None,
     ) -> None:
+        if fragment_context is not None:
+            fragment = True
+        if fragment and fragment_context is None:
+            fragment_context = FragmentContext("div")
         self.debug = bool(debug)
         self.fragment_context = fragment_context
         self.encoding = None
@@ -74,7 +83,8 @@ class JustHTML:
         else:
             html_str = ""
-        # Enable error collection if strict mode is on
+        # Enable error collection if strict mode is on.
+        # Node location tracking is opt-in to avoid slowing down the common case.
         should_collect = collect_errors or strict
         self.tree_builder = tree_builder or TreeBuilder(
@@ -94,15 +104,33 @@ class JustHTML:
             elif tag_name in ("plaintext", "script"):
                 opts.initial_state = Tokenizer.PLAINTEXT
-        self.tokenizer = Tokenizer(self.tree_builder, opts, collect_errors=should_collect)
+        self.tokenizer = Tokenizer(
+            self.tree_builder,
+            opts,
+            collect_errors=should_collect,
+            track_node_locations=bool(track_node_locations),
+        )
         # Link tokenizer to tree_builder for position info
         self.tree_builder.tokenizer = self.tokenizer
         self.tokenizer.run(html_str)
         self.root = self.tree_builder.finish()
-        # Merge errors from both tokenizer and tree builder
-        self.errors = self.tokenizer.errors + self.tree_builder.errors
+        # Merge errors from both tokenizer and tree builder.
+        # Public API: users expect errors to be ordered by input position.
+        merged_errors = self.tokenizer.errors + self.tree_builder.errors
+        indexed_errors = enumerate(merged_errors)
+        self.errors = [
+            e
+            for _, e in sorted(
+                indexed_errors,
+                key=lambda t: (
+                    t[1].line if t[1].line is not None else 1_000_000_000,
+                    t[1].column if t[1].column is not None else 1_000_000_000,
+                    t[0],
+                ),
+            )
+        ]
         # In strict mode, raise on first error
         if strict and self.errors:
@@ -112,20 +140,48 @@ class JustHTML:
         """Query the document using a CSS selector. Delegates to root.query()."""
         return self.root.query(selector)
-    def to_html(self, pretty: bool = True, indent_size: int = 2) -> str:
-        """Serialize the document to HTML. Delegates to root.to_html()."""
-        return self.root.to_html(indent=0, indent_size=indent_size, pretty=pretty)
+    def to_html(
+        self,
+        pretty: bool = True,
+        indent_size: int = 2,
+        *,
+        safe: bool = True,
+        policy: SanitizationPolicy | None = None,
+    ) -> str:
+        """Serialize the document to HTML.
+        - `safe=True` sanitizes untrusted content before serialization.
+        - `policy` overrides the default sanitization policy.
+        """
+        return self.root.to_html(
+            indent=0,
+            indent_size=indent_size,
+            pretty=pretty,
+            safe=safe,
+            policy=policy,
+        )
-    def to_text(self, separator: str = " ", strip: bool = True) -> str:
+    def to_text(
+        self,
+        separator: str = " ",
+        strip: bool = True,
+        *,
+        safe: bool = True,
+        policy: SanitizationPolicy | None = None,
+    ) -> str:
         """Return the document's concatenated text.
-        Delegates to `root.to_text(separator=..., strip=...)`.
+        - `safe=True` sanitizes untrusted content before text extraction.
+        - `policy` overrides the default sanitization policy.
+        Delegates to `root.to_text(...)`.
         """
-        return self.root.to_text(separator=separator, strip=strip)
+        return self.root.to_text(separator=separator, strip=strip, safe=safe, policy=policy)
-    def to_markdown(self) -> str:
+    def to_markdown(self, *, safe: bool = True, policy: SanitizationPolicy | None = None) -> str:
         """Return a GitHub Flavored Markdown representation.
-        Delegates to `root.to_markdown()`.
+        - `safe=True` sanitizes untrusted content before conversion.
+        - `policy` overrides the default sanitization policy.
         """
-        return self.root.to_markdown()
+        return self.root.to_markdown(safe=safe, policy=policy)

justhtml 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

Potentially problematic release.

justhtml 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl