PyPI - justhtml - Versions diffs - 0.12.0__py3-none-any.whl → 0.38.0__py3-none-any.whl - Mend

justhtml 0.12.0__py3-none-any.whl → 0.38.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of justhtml might be problematic. Click here for more details.

Files changed (23)

justhtml/__init__.py +48 -0
justhtml/__main__.py +86 -17
justhtml/constants.py +12 -0
justhtml/entities.py +45 -7
justhtml/errors.py +17 -3
justhtml/linkify.py +438 -0
justhtml/node.py +385 -97
justhtml/parser.py +139 -16
justhtml/sanitize.py +992 -0
justhtml/selector.py +117 -19
justhtml/serialize.py +671 -41
justhtml/tokenizer.py +364 -194
justhtml/tokens.py +28 -5
justhtml/transforms.py +2568 -0
justhtml/treebuilder.py +297 -204
justhtml/treebuilder_modes.py +208 -138
justhtml-0.38.0.dist-info/METADATA +213 -0
justhtml-0.38.0.dist-info/RECORD +26 -0
{justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/licenses/LICENSE +4 -1
justhtml-0.12.0.dist-info/METADATA +0 -164
justhtml-0.12.0.dist-info/RECORD +0 -23
{justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/WHEEL +0 -0
{justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/entry_points.txt +0 -0

justhtml/__init__.py CHANGED Viewed

@@ -1,14 +1,62 @@
 from .parser import JustHTML, StrictModeError
+from .sanitize import (
+    CSS_PRESET_TEXT,
+    DEFAULT_DOCUMENT_POLICY,
+    DEFAULT_POLICY,
+    SanitizationPolicy,
+    UnsafeHtmlError,
+    UrlPolicy,
+    UrlProxy,
+    UrlRule,
+)
 from .selector import SelectorError, matches, query
 from .serialize import to_html, to_test_format
 from .stream import stream
 from .tokens import ParseError
+from .transforms import (
+    CollapseWhitespace,
+    Decide,
+    Drop,
+    Edit,
+    EditAttrs,
+    EditDocument,
+    Empty,
+    Linkify,
+    PruneEmpty,
+    RewriteAttrs,
+    Sanitize,
+    SetAttrs,
+    Stage,
+    Unwrap,
+)
 __all__ = [
+    "CSS_PRESET_TEXT",
+    "DEFAULT_DOCUMENT_POLICY",
+    "DEFAULT_POLICY",
+    "CollapseWhitespace",
+    "Decide",
+    "Drop",
+    "Edit",
+    "EditAttrs",
+    "EditDocument",
+    "Empty",
     "JustHTML",
+    "Linkify",
     "ParseError",
+    "PruneEmpty",
+    "RewriteAttrs",
+    "SanitizationPolicy",
+    "Sanitize",
     "SelectorError",
+    "SetAttrs",
+    "Stage",
     "StrictModeError",
+    "UnsafeHtmlError",
+    "Unwrap",
+    "UrlPolicy",
+    "UrlProxy",
+    "UrlRule",
     "matches",
     "query",
     "stream",

justhtml/__main__.py CHANGED Viewed

@@ -8,9 +8,10 @@ import io
 import sys
 from importlib.metadata import PackageNotFoundError, version
 from pathlib import Path
-from typing import cast
+from typing import TextIO, cast
 from . import JustHTML
+from .context import FragmentContext
 from .selector import SelectorError
@@ -31,7 +32,7 @@ def _parse_args(argv: list[str]) -> argparse.Namespace:
             "  curl -s https://example.com | justhtml -\n"
             "  justhtml page.html --selector 'main p' --format text\n"
             "  justhtml page.html --selector 'a' --format html\n"
-            "  justhtml page.html --selector 'article' --format markdown\n"
+            "  justhtml page.html --selector 'article' --allow-tags article --format markdown\n"
             "\n"
             "If you don't have the 'justhtml' command available, use:\n"
             "  python -m justhtml ...\n"
@@ -44,6 +45,7 @@ def _parse_args(argv: list[str]) -> argparse.Namespace:
         nargs="?",
         help="HTML file to parse, or '-' to read from stdin",
     )
+    parser.add_argument("--output", help="File to write output to")
     parser.add_argument(
         "--selector",
         help="CSS selector for choosing nodes (defaults to the document root)",
@@ -54,12 +56,32 @@ def _parse_args(argv: list[str]) -> argparse.Namespace:
         default="html",
         help="Output format (default: html)",
     )
+    parser.add_argument(
+        "--unsafe",
+        action="store_true",
+        help="Disable sanitization (trusted input only)",
+    )
+    parser.add_argument(
+        "--allow-tags",
+        help=(
+            "Safe mode: allow these additional tags during sanitization (comma-separated). "
+            "Example: --allow-tags article,section"
+        ),
+    )
     parser.add_argument(
         "--first",
         action="store_true",
         help="Only output the first matching node",
     )
+    parser.add_argument(
+        "--fragment",
+        action="store_true",
+        help="Parse input as an HTML fragment (context: <div>)",
+    )
     parser.add_argument(
         "--separator",
         default=" ",
@@ -108,7 +130,37 @@ def _read_html(path: str) -> str | bytes:
 def main() -> None:
     args = _parse_args(sys.argv[1:])
     html = _read_html(args.path)
-    doc = JustHTML(html)
+    fragment_context = FragmentContext("div") if args.fragment else None
+    safe = not args.unsafe
+    policy = None
+    if safe and args.allow_tags:
+        from .sanitize import DEFAULT_DOCUMENT_POLICY, DEFAULT_POLICY, SanitizationPolicy  # noqa: PLC0415
+        extra_tags: set[str] = set()
+        for part in str(args.allow_tags).replace(" ", ",").split(","):
+            tag = part.strip().lower()
+            if tag:
+                extra_tags.add(tag)
+        base = DEFAULT_POLICY if fragment_context is not None else DEFAULT_DOCUMENT_POLICY
+        allowed = set(base.allowed_tags)
+        allowed.update(extra_tags)
+        policy = SanitizationPolicy(
+            allowed_tags=allowed,
+            allowed_attributes=base.allowed_attributes,
+            url_policy=base.url_policy,
+            drop_comments=base.drop_comments,
+            drop_doctype=base.drop_doctype,
+            drop_foreign_namespaces=base.drop_foreign_namespaces,
+            drop_content_tags=base.drop_content_tags,
+            allowed_css_properties=base.allowed_css_properties,
+            force_link_rel=base.force_link_rel,
+            unsafe_handling=base.unsafe_handling,
+            disallowed_tag_handling=base.disallowed_tag_handling,
+        )
+    doc = JustHTML(html, fragment_context=fragment_context, safe=safe, policy=policy)
     try:
         nodes = doc.query(args.selector) if args.selector else [doc.root]
@@ -122,22 +174,39 @@ def main() -> None:
     if args.first:
         nodes = [nodes[0]]
-    if args.format == "html":
-        outputs = [node.to_html() for node in nodes]
-        sys.stdout.write("\n".join(outputs))
-        sys.stdout.write("\n")
-        return
-    if args.format == "text":
-        outputs = [node.to_text(separator=args.separator, strip=args.strip) for node in nodes]
-        sys.stdout.write("\n".join(outputs))
-        sys.stdout.write("\n")
+    def write_output(out: TextIO) -> None:
+        if args.format == "html":
+            outputs = [node.to_html() for node in nodes]
+            out.write("\n".join(outputs))
+            out.write("\n")
+            return
+        if args.format == "text":
+            # Keep these branches explicit so coverage will highlight untested CLI options.
+            if args.separator == " ":
+                if args.strip:
+                    outputs = [node.to_text(strip=True) for node in nodes]
+                else:
+                    outputs = [node.to_text(strip=False) for node in nodes]
+            else:
+                if args.strip:
+                    outputs = [node.to_text(separator=args.separator, strip=True) for node in nodes]
+                else:
+                    outputs = [node.to_text(separator=args.separator, strip=False) for node in nodes]
+            out.write("\n".join(outputs))
+            out.write("\n")
+            return
+        outputs = [node.to_markdown() for node in nodes]
+        out.write("\n\n".join(outputs))
+        out.write("\n")
+    if args.output:
+        with Path(args.output).open(mode="w", encoding="utf-8") as outfile:
+            write_output(outfile)
         return
-    outputs = [node.to_markdown() for node in nodes]
-    sys.stdout.write("\n\n".join(outputs))
-    sys.stdout.write("\n")
-    return
+    write_output(sys.stdout)
 if __name__ == "__main__":

justhtml/constants.py CHANGED Viewed

@@ -184,6 +184,18 @@ HTML4_PUBLIC_PREFIXES = (
 HEADING_ELEMENTS = {"h1", "h2", "h3", "h4", "h5", "h6"}
+# Elements where pretty-printing and whitespace-collapsing transforms should
+# preserve text node whitespace.
+WHITESPACE_PRESERVING_ELEMENTS: Final[frozenset[str]] = frozenset(
+    {
+        "code",
+        "pre",
+        "script",
+        "style",
+        "textarea",
+    }
+)
 FORMATTING_ELEMENTS = {
     "a",
     "b",

justhtml/entities.py CHANGED Viewed

@@ -7,6 +7,10 @@ Supports both named entities (&amp;, &nbsp;) and numeric references (&#60;, &#x3
 from __future__ import annotations
 import html.entities
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from collections.abc import Callable
 # Use Python's complete HTML5 entity list (2231 entities)
 # Keys include the trailing semicolon (e.g., "amp;", "lang;")
@@ -168,7 +172,23 @@ NUMERIC_REPLACEMENTS: dict[int, str] = {
 }
-def decode_numeric_entity(text: str, is_hex: bool = False) -> str:
+def _is_control_character(codepoint: int) -> bool:
+    # C0 controls and C1 controls
+    return (0x00 <= codepoint <= 0x1F) or (0x7F <= codepoint <= 0x9F)
+def _is_noncharacter(codepoint: int) -> bool:
+    if 0xFDD0 <= codepoint <= 0xFDEF:
+        return True
+    last = codepoint & 0xFFFF
+    return last == 0xFFFE or last == 0xFFFF
+def decode_numeric_entity(
+    text: str,
+    is_hex: bool = False,
+    report_error: Callable[[str], None] | None = None,
+) -> str:
     """Decode a numeric character reference like &#60; or &#x3C;.
     Args:
@@ -181,20 +201,30 @@ def decode_numeric_entity(text: str, is_hex: bool = False) -> str:
     base = 16 if is_hex else 10
     codepoint = int(text, base)
-    # Apply HTML5 replacements for certain ranges
-    if codepoint in NUMERIC_REPLACEMENTS:
-        return NUMERIC_REPLACEMENTS[codepoint]
     # Invalid ranges per HTML5 spec
     if codepoint > 0x10FFFF:
         return "\ufffd"  # REPLACEMENT CHARACTER
     if 0xD800 <= codepoint <= 0xDFFF:  # Surrogate range
         return "\ufffd"
+    if report_error is not None:
+        if _is_control_character(codepoint):
+            report_error("control-character-reference")
+        if _is_noncharacter(codepoint):
+            report_error("noncharacter-character-reference")
+    # Apply HTML5 replacements for certain ranges
+    if codepoint in NUMERIC_REPLACEMENTS:
+        return NUMERIC_REPLACEMENTS[codepoint]
     return chr(codepoint)
-def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
+def decode_entities_in_text(
+    text: str,
+    in_attribute: bool = False,
+    report_error: Callable[[str], None] | None = None,
+) -> str:
     """Decode all HTML entities in text.
     This is a simple implementation that handles:
@@ -247,7 +277,9 @@ def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
             digit_text = text[digit_start:j]
             if digit_text:
-                result.append(decode_numeric_entity(digit_text, is_hex=is_hex))
+                if report_error is not None and not has_semicolon:
+                    report_error("missing-semicolon-after-character-reference")
+                result.append(decode_numeric_entity(digit_text, is_hex=is_hex, report_error=report_error))
                 i = j + 1 if has_semicolon else j
                 continue
@@ -285,6 +317,8 @@ def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
                     best_match_len = k
                     break
             if best_match:
+                if report_error is not None:
+                    report_error("missing-semicolon-after-character-reference")
                 result.append(best_match)
                 i = i + 1 + best_match_len
                 continue
@@ -302,6 +336,8 @@ def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
                 continue
             # Decode legacy entity
+            if report_error is not None and not has_semicolon:
+                report_error("missing-semicolon-after-character-reference")
             result.append(NAMED_ENTITIES[entity_name])
             i = j
             continue
@@ -329,6 +365,8 @@ def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
                 i += 1
                 continue
+            if report_error is not None:
+                report_error("missing-semicolon-after-character-reference")
             result.append(best_match)
             i = i + 1 + best_match_len
             continue

justhtml/errors.py CHANGED Viewed

@@ -1,7 +1,8 @@
-"""Centralized error message definitions and helpers for HTML parsing errors.
+"""Centralized error message definitions and helpers for JustHTML errors.
-This module provides human-readable error messages for all parse error codes
-emitted by both the tokenizer and tree builder during HTML parsing.
+This module provides human-readable error messages for parse error codes
+emitted by the tokenizer and tree builder during HTML parsing, plus selected
+security findings emitted by the sanitizer.
 """
 from __future__ import annotations
@@ -75,6 +76,8 @@ def generate_error_message(code: str, tag_name: str | None = None) -> str:
         "illegal-codepoint-for-numeric-entity": "Invalid codepoint in numeric character reference",
         "missing-semicolon-after-character-reference": "Missing semicolon after character reference",
         "named-entity-without-semicolon": "Named entity used without semicolon",
+        "noncharacter-character-reference": "Noncharacter in character reference",
+        "noncharacter-in-input-stream": "Noncharacter in input stream",
         # ================================================================
         # TREE BUILDER ERRORS
         # ================================================================
@@ -107,8 +110,11 @@ def generate_error_message(code: str, tag_name: str | None = None) -> str:
         # Foster parenting / table errors
         "foster-parenting-character": "Text content in table requires foster parenting",
         "foster-parenting-start-tag": "Start tag in table requires foster parenting",
+        "unexpected-character-implies-table-voodoo": "Unexpected character in table triggers foster parenting",
         "unexpected-start-tag-implies-table-voodoo": f"<{tag_name}> start tag in table triggers foster parenting",
         "unexpected-end-tag-implies-table-voodoo": f"</{tag_name}> end tag in table triggers foster parenting",
+        "unexpected-implied-end-tag-in-table-view": "Unexpected implied end tag while closing table",
+        "eof-in-table": "Unexpected end of file in table",
         "unexpected-cell-in-table-body": "Unexpected table cell outside of table row",
         "unexpected-form-in-table": "Form element not allowed in table context",
         "unexpected-hidden-input-in-table": "Hidden input in table triggers foster parenting",
@@ -134,6 +140,14 @@ def generate_error_message(code: str, tag_name: str | None = None) -> str:
         "adoption-agency-1.3": "Misnested tags require adoption agency algorithm",
         "non-void-html-element-start-tag-with-trailing-solidus": f"<{tag_name}/> self-closing syntax on non-void element",
         "image-start-tag": f"Deprecated <{tag_name}> tag (use <img> instead)",
+        # Select insertion mode (context-specific taxonomy)
+        "unexpected-start-tag-in-select": f"Unexpected <{tag_name}> start tag in <select>",
+        "unexpected-end-tag-in-select": f"Unexpected </{tag_name}> end tag in <select>",
+        "unexpected-select-in-select": "Unexpected nested <select> in <select>",
+        # ================================================================
+        # SECURITY ERRORS
+        # ================================================================
+        "unsafe-html": "Unsafe HTML detected by sanitization policy",
     }
     # Return message or fall back to the code itself if not found

justhtml 0.12.0__py3-none-any.whl → 0.38.0__py3-none-any.whl

Potentially problematic release.

justhtml 0.12.0__py3-none-any.whl → 0.38.0__py3-none-any.whl