justhtml 2.1.0__tar.gz → 2.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146) hide show
  1. {justhtml-2.1.0 → justhtml-2.2.0}/CHANGELOG.md +18 -0
  2. {justhtml-2.1.0 → justhtml-2.2.0}/PKG-INFO +3 -1
  3. {justhtml-2.1.0 → justhtml-2.2.0}/README.md +2 -0
  4. justhtml-2.2.0/assets/justhtml-readme-explainer.png +0 -0
  5. {justhtml-2.1.0 → justhtml-2.2.0}/pyproject.toml +1 -1
  6. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/parser/__init__.py +9 -6
  7. justhtml-2.2.0/src/justhtml/parser/stream.py +206 -0
  8. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/sanitizer/url/runtime.py +2 -0
  9. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/serializer/html.py +3 -1
  10. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/tokenizer/html.py +48 -4
  11. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/treebuilder/core.py +87 -55
  12. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/treebuilder/modes.py +103 -58
  13. {justhtml-2.1.0 → justhtml-2.2.0}/tests/justhtml-tests/branch_coverage.dat +3 -3
  14. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_sanitize.py +86 -0
  15. justhtml-2.2.0/tests/test_stream.py +273 -0
  16. justhtml-2.2.0/tests/test_treebuilder.py +640 -0
  17. justhtml-2.1.0/src/justhtml/parser/stream.py +0 -110
  18. justhtml-2.1.0/tests/test_stream.py +0 -65
  19. justhtml-2.1.0/tests/test_treebuilder.py +0 -249
  20. {justhtml-2.1.0 → justhtml-2.2.0}/.github/copilot-instructions.md +0 -0
  21. {justhtml-2.1.0 → justhtml-2.2.0}/.github/workflows/ci.yml +0 -0
  22. {justhtml-2.1.0 → justhtml-2.2.0}/.github/workflows/publish.yml +0 -0
  23. {justhtml-2.1.0 → justhtml-2.2.0}/.gitignore +0 -0
  24. {justhtml-2.1.0 → justhtml-2.2.0}/.pre-commit-config.yaml +0 -0
  25. {justhtml-2.1.0 → justhtml-2.2.0}/CODE_OF_CONDUCT.md +0 -0
  26. {justhtml-2.1.0 → justhtml-2.2.0}/CONTRIBUTING.md +0 -0
  27. {justhtml-2.1.0 → justhtml-2.2.0}/LICENSE +0 -0
  28. {justhtml-2.1.0 → justhtml-2.2.0}/Makefile +0 -0
  29. {justhtml-2.1.0 → justhtml-2.2.0}/SECURITY.md +0 -0
  30. {justhtml-2.1.0 → justhtml-2.2.0}/benchmarks/correctness.py +0 -0
  31. {justhtml-2.1.0 → justhtml-2.2.0}/benchmarks/fuzz.py +0 -0
  32. {justhtml-2.1.0 → justhtml-2.2.0}/benchmarks/performance.py +0 -0
  33. {justhtml-2.1.0 → justhtml-2.2.0}/benchmarks/profile.py +0 -0
  34. {justhtml-2.1.0 → justhtml-2.2.0}/docs/_config.yml +0 -0
  35. {justhtml-2.1.0 → justhtml-2.2.0}/docs/_layouts/default.html +0 -0
  36. {justhtml-2.1.0 → justhtml-2.2.0}/docs/api.md +0 -0
  37. {justhtml-2.1.0 → justhtml-2.2.0}/docs/assets/search.js +0 -0
  38. {justhtml-2.1.0 → justhtml-2.2.0}/docs/bleach-migration.md +0 -0
  39. {justhtml-2.1.0 → justhtml-2.2.0}/docs/building.md +0 -0
  40. {justhtml-2.1.0 → justhtml-2.2.0}/docs/cli.md +0 -0
  41. {justhtml-2.1.0 → justhtml-2.2.0}/docs/comparison.md +0 -0
  42. {justhtml-2.1.0 → justhtml-2.2.0}/docs/correctness.md +0 -0
  43. {justhtml-2.1.0 → justhtml-2.2.0}/docs/encoding.md +0 -0
  44. {justhtml-2.1.0 → justhtml-2.2.0}/docs/errors.md +0 -0
  45. {justhtml-2.1.0 → justhtml-2.2.0}/docs/fragments.md +0 -0
  46. {justhtml-2.1.0 → justhtml-2.2.0}/docs/html-cleaning.md +0 -0
  47. {justhtml-2.1.0 → justhtml-2.2.0}/docs/index.md +0 -0
  48. {justhtml-2.1.0 → justhtml-2.2.0}/docs/linkify.md +0 -0
  49. {justhtml-2.1.0 → justhtml-2.2.0}/docs/migration-examples.md +0 -0
  50. {justhtml-2.1.0 → justhtml-2.2.0}/docs/playground/app.js +0 -0
  51. {justhtml-2.1.0 → justhtml-2.2.0}/docs/playground/index.html +0 -0
  52. {justhtml-2.1.0 → justhtml-2.2.0}/docs/playground/py/__init__.py +0 -0
  53. {justhtml-2.1.0 → justhtml-2.2.0}/docs/playground/py/install_latest_justhtml.py +0 -0
  54. {justhtml-2.1.0 → justhtml-2.2.0}/docs/playground/py/render.py +0 -0
  55. {justhtml-2.1.0 → justhtml-2.2.0}/docs/playground/py/use_local_repo.py +0 -0
  56. {justhtml-2.1.0 → justhtml-2.2.0}/docs/quickstart.md +0 -0
  57. {justhtml-2.1.0 → justhtml-2.2.0}/docs/sanitization.md +0 -0
  58. {justhtml-2.1.0 → justhtml-2.2.0}/docs/selectors.md +0 -0
  59. {justhtml-2.1.0 → justhtml-2.2.0}/docs/streaming.md +0 -0
  60. {justhtml-2.1.0 → justhtml-2.2.0}/docs/text.md +0 -0
  61. {justhtml-2.1.0 → justhtml-2.2.0}/docs/transforms.md +0 -0
  62. {justhtml-2.1.0 → justhtml-2.2.0}/docs/unsafe-handling.md +0 -0
  63. {justhtml-2.1.0 → justhtml-2.2.0}/docs/url-cleaning.md +0 -0
  64. {justhtml-2.1.0 → justhtml-2.2.0}/llms.txt +0 -0
  65. {justhtml-2.1.0 → justhtml-2.2.0}/run_tests.py +0 -0
  66. {justhtml-2.1.0 → justhtml-2.2.0}/scripts/release.py +0 -0
  67. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/__init__.py +0 -0
  68. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/__main__.py +0 -0
  69. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/core/__init__.py +0 -0
  70. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/core/constants.py +0 -0
  71. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/core/entities.py +0 -0
  72. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/core/errors.py +0 -0
  73. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/core/rawtext.py +0 -0
  74. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/dom/__init__.py +0 -0
  75. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/dom/builder.py +0 -0
  76. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/parser/context.py +0 -0
  77. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/parser/encoding.py +0 -0
  78. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/py.typed +0 -0
  79. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/sanitizer/__init__.py +0 -0
  80. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/sanitizer/css.py +0 -0
  81. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/sanitizer/dom.py +0 -0
  82. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/sanitizer/policy.py +0 -0
  83. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/sanitizer/policy_defaults.py +0 -0
  84. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/sanitizer/rawtext.py +0 -0
  85. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/sanitizer/url/__init__.py +0 -0
  86. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/sanitizer/url/policy.py +0 -0
  87. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/sanitizer/url/spec.py +0 -0
  88. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/selector/__init__.py +0 -0
  89. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/selector/core.py +0 -0
  90. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/serializer/__init__.py +0 -0
  91. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/serializer/markdown.py +0 -0
  92. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/tokenizer/__init__.py +0 -0
  93. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/tokenizer/tokens.py +0 -0
  94. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/transforms/__init__.py +0 -0
  95. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/transforms/compile.py +0 -0
  96. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/transforms/linkify.py +0 -0
  97. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/transforms/linkify_core.py +0 -0
  98. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/transforms/runtime.py +0 -0
  99. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/transforms/spec.py +0 -0
  100. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/treebuilder/__init__.py +0 -0
  101. {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/treebuilder/utils.py +0 -0
  102. {justhtml-2.1.0 → justhtml-2.2.0}/test-summary.txt +0 -0
  103. {justhtml-2.1.0 → justhtml-2.2.0}/tests/README.md +0 -0
  104. {justhtml-2.1.0 → justhtml-2.2.0}/tests/__init__.py +0 -0
  105. {justhtml-2.1.0 → justhtml-2.2.0}/tests/data/wikipedia.html +0 -0
  106. {justhtml-2.1.0 → justhtml-2.2.0}/tests/harness/__init__.py +0 -0
  107. {justhtml-2.1.0 → justhtml-2.2.0}/tests/harness/encoding.py +0 -0
  108. {justhtml-2.1.0 → justhtml-2.2.0}/tests/harness/regressions.py +0 -0
  109. {justhtml-2.1.0 → justhtml-2.2.0}/tests/harness/reporter.py +0 -0
  110. {justhtml-2.1.0 → justhtml-2.2.0}/tests/harness/serializer.py +0 -0
  111. {justhtml-2.1.0 → justhtml-2.2.0}/tests/harness/tokenizer.py +0 -0
  112. {justhtml-2.1.0 → justhtml-2.2.0}/tests/harness/tree.py +0 -0
  113. {justhtml-2.1.0 → justhtml-2.2.0}/tests/justhtml-sanitize-tests/cases.json +0 -0
  114. {justhtml-2.1.0 → justhtml-2.2.0}/tests/justhtml-tests/coverage_gaps.test +0 -0
  115. {justhtml-2.1.0 → justhtml-2.2.0}/tests/justhtml-tests/empty_stack_edge_cases.dat +0 -0
  116. {justhtml-2.1.0 → justhtml-2.2.0}/tests/justhtml-tests/entities.test +0 -0
  117. {justhtml-2.1.0 → justhtml-2.2.0}/tests/justhtml-tests/iframe_srcdoc.dat +0 -0
  118. {justhtml-2.1.0 → justhtml-2.2.0}/tests/justhtml-tests/tokenizer_edge_cases.test +0 -0
  119. {justhtml-2.1.0 → justhtml-2.2.0}/tests/justhtml-tests/treebuilder_coverage.dat +0 -0
  120. {justhtml-2.1.0 → justhtml-2.2.0}/tests/justhtml-tests/xml_coercion.dat +0 -0
  121. {justhtml-2.1.0 → justhtml-2.2.0}/tests/justhtml-tests/xml_coercion_coverage.test +0 -0
  122. {justhtml-2.1.0 → justhtml-2.2.0}/tests/linkify-it/LICENSE.txt +0 -0
  123. {justhtml-2.1.0 → justhtml-2.2.0}/tests/linkify-it/README.md +0 -0
  124. {justhtml-2.1.0 → justhtml-2.2.0}/tests/linkify-it/fixtures/links.txt +0 -0
  125. {justhtml-2.1.0 → justhtml-2.2.0}/tests/linkify-it/fixtures/not_links.txt +0 -0
  126. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_builder.py +0 -0
  127. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_cli.py +0 -0
  128. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_docs_examples.py +0 -0
  129. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_encoding.py +0 -0
  130. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_errors.py +0 -0
  131. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_linkify_internals.py +0 -0
  132. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_linkify_it.py +0 -0
  133. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_linkify_transform.py +0 -0
  134. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_node.py +0 -0
  135. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_playground_local_repo_file_list.py +0 -0
  136. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_precommit_coverage.py +0 -0
  137. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_sanitize_integration.py +0 -0
  138. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_sanitize_transform.py +0 -0
  139. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_selector.py +0 -0
  140. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_serialize.py +0 -0
  141. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_tokenizer.py +0 -0
  142. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_transforms.py +0 -0
  143. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_transforms_compiler.py +0 -0
  144. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_transforms_edge_cases.py +0 -0
  145. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_transforms_sanitize_integration.py +0 -0
  146. {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_wikipedia.py +0 -0
@@ -7,6 +7,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [2.2.0] - 2026-06-07
11
+
12
+ ### Fixed
13
+ - Handle `<select><selectedcontent></selectedcontent></select>` without crashing when no `<option>` is present, replace selectedcontent fallback content during parser finalization, and avoid repeated selectedcontent subtree scans.
14
+ - Preserve source order and tag text when escape-mode sanitization handles disallowed rawtext/RCDATA elements with attributed or self-closing end tags.
15
+ - Make `stream()` use namespace-aware tokenizer context for SVG/MathML CDATA, rawtext decisions, self-closing foreign tags, and foreign end-tag stack updates.
16
+ - Use the correct initial tokenizer states for HTML fragment contexts such as `<title>`, `<textarea>`, `<script>`, `<style>`, and scripting-disabled `<noscript>`.
17
+ - Use HTML rawtext/RCDATA tokenizer states for text-like elements inside SVG/MathML HTML integration points and MathML text integration points.
18
+ - Generate implied end tags before removing `<form>` on `</form>` so following controls do not remain inside still-open descendants.
19
+ - Keep `<form>` elements inside `<template>` from claiming the global form pointer, including table-template form insertion.
20
+ - Close open `<p>` elements correctly around `<option>`, `<optgroup>`, `<hr>`, `<p>`, and `<div>` starts in `<select>` parsing.
21
+ - Close `<template>` correctly when `</template>` is seen while parsing inside `<select>`.
22
+ - Keep `</p>` and `</br>` foreign-content breakouts inside MathML text integration points such as `<mi>` and `<mtext>`.
23
+ - Align customizable `<select>` parsing with Chromium for phantom `</p>` handling and generic custom child elements.
24
+
25
+ ### Security
26
+ - (Severity: Low) Strip invisible Unicode during URL sink validation even when general invisible-Unicode stripping is disabled. Previously, custom policies using `strip_invisible_unicode=False` could preserve scheme-obfuscated values such as `javascript\u200b:` in otherwise URL-validated attributes.
27
+
10
28
  ## [2.1.0] - 2026-06-06
11
29
 
12
30
  ### Performance
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: justhtml
3
- Version: 2.1.0
3
+ Version: 2.2.0
4
4
  Summary: A pure Python HTML5 parser that just works.
5
5
  Project-URL: Homepage, https://github.com/emilstenstrom/justhtml
6
6
  Project-URL: Issues, https://github.com/emilstenstrom/justhtml/issues
@@ -72,6 +72,8 @@ Requires Python 3.10 or later.
72
72
 
73
73
  [Documentation](https://emilstenstrom.github.io/justhtml/) | [Comparison](docs/comparison.md) | [Playground](https://emilstenstrom.github.io/justhtml/playground/) | [Security policy](SECURITY.md)
74
74
 
75
+ ![JustHTML turns messy unsafe HTML into a sanitized, queryable DOM, then serializes it to text, Markdown, or HTML.](assets/justhtml-readme-explainer.png)
76
+
75
77
  ## Why Use It?
76
78
 
77
79
  Most Python HTML libraries optimize for one part of the problem.
@@ -18,6 +18,8 @@ Requires Python 3.10 or later.
18
18
 
19
19
  [Documentation](https://emilstenstrom.github.io/justhtml/) | [Comparison](docs/comparison.md) | [Playground](https://emilstenstrom.github.io/justhtml/playground/) | [Security policy](SECURITY.md)
20
20
 
21
+ ![JustHTML turns messy unsafe HTML into a sanitized, queryable DOM, then serializes it to text, Markdown, or HTML.](assets/justhtml-readme-explainer.png)
22
+
21
23
  ## Why Use It?
22
24
 
23
25
  Most Python HTML libraries optimize for one part of the problem.
@@ -1,7 +1,7 @@
1
1
  [project]
2
2
  name = "justhtml"
3
3
  authors = [{ name = "Emil Stenström", email = "emil@emilstenstrom.se" }]
4
- version = "2.1.0"
4
+ version = "2.2.0"
5
5
  description = "A pure Python HTML5 parser that just works."
6
6
  readme = "README.md"
7
7
  license = { file = "LICENSE" }
@@ -163,14 +163,17 @@ class JustHTML:
163
163
  if needs_escape_incomplete_tags:
164
164
  opts.emit_bogus_markup_as_text = True
165
165
 
166
- # For RAWTEXT fragment contexts, set initial tokenizer state and rawtext tag
167
- if fragment_context and not fragment_context.namespace:
168
- rawtext_elements = {"textarea", "title", "style"}
166
+ # For text-like HTML fragment contexts, set the initial tokenizer state
167
+ # to match the context element.
168
+ if fragment_context and fragment_context.namespace in {None, "html"}:
169
169
  tag_name = fragment_context.tag_name.lower()
170
- if tag_name in rawtext_elements:
170
+ if tag_name in {"textarea", "title"}:
171
+ opts.initial_state = Tokenizer.RCDATA
172
+ elif tag_name in {"iframe", "noembed", "noframes", "script", "style", "xmp"} or (
173
+ tag_name == "noscript" and opts.scripting_enabled
174
+ ):
171
175
  opts.initial_state = Tokenizer.RAWTEXT
172
- opts.initial_rawtext_tag = tag_name
173
- elif tag_name in ("plaintext", "script"):
176
+ elif tag_name == "plaintext":
174
177
  opts.initial_state = Tokenizer.PLAINTEXT
175
178
 
176
179
  self.tokenizer = Tokenizer(
@@ -0,0 +1,206 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any, Literal, TypeAlias, cast
4
+
5
+ if TYPE_CHECKING:
6
+ from collections.abc import Generator
7
+
8
+ from justhtml.core.constants import (
9
+ FOREIGN_BREAKOUT_ELEMENTS,
10
+ HTML_INTEGRATION_POINT_SET,
11
+ MATHML_TEXT_INTEGRATION_POINT_SET,
12
+ SVG_TAG_NAME_ADJUSTMENTS,
13
+ )
14
+ from justhtml.tokenizer import Tokenizer
15
+ from justhtml.tokenizer.tokens import CommentToken, DoctypeToken, Tag
16
+
17
+ from .encoding import decode_html
18
+
19
+ StartEvent: TypeAlias = tuple[Literal["start"], tuple[str, dict[str, str | None]]]
20
+ EndEvent: TypeAlias = tuple[Literal["end"], str]
21
+ TextEvent: TypeAlias = tuple[Literal["text"], str]
22
+ CommentEvent: TypeAlias = tuple[Literal["comment"], str]
23
+ DoctypeEvent: TypeAlias = tuple[Literal["doctype"], tuple[str | None, str | None, str | None]]
24
+ StreamEvent: TypeAlias = StartEvent | EndEvent | TextEvent | CommentEvent | DoctypeEvent
25
+
26
+
27
+ class _DummyNode:
28
+ __slots__ = ("attrs", "name", "namespace")
29
+
30
+ attrs: dict[str, str | None]
31
+ name: str
32
+ namespace: str
33
+
34
+ def __init__(self, name: str, namespace: str, attrs: dict[str, str | None] | None = None) -> None:
35
+ self.attrs = attrs or {}
36
+ self.name = name
37
+ self.namespace = namespace
38
+
39
+
40
+ class StreamSink:
41
+ """A sink that buffers tokens for the stream API."""
42
+
43
+ tokens: list[StreamEvent]
44
+ open_elements: list[_DummyNode]
45
+
46
+ def __init__(self) -> None:
47
+ self.tokens = []
48
+ self.open_elements = [] # Required by tokenizer for rawtext checks
49
+
50
+ def _font_breaks_out_of_foreign_content(self, attrs: dict[str, str | None]) -> bool:
51
+ for name in attrs:
52
+ if name.lower() in {"color", "face", "size"}:
53
+ return True
54
+ return False
55
+
56
+ def _node_attribute_value(self, node: _DummyNode, name: str) -> str | None:
57
+ target = name.lower()
58
+ for attr_name, attr_value in node.attrs.items():
59
+ if attr_name.lower() == target:
60
+ return attr_value or ""
61
+ return None
62
+
63
+ def _is_html_integration_point(self, node: _DummyNode) -> bool:
64
+ if node.namespace == "math" and node.name == "annotation-xml":
65
+ encoding = self._node_attribute_value(node, "encoding")
66
+ return encoding is not None and encoding.lower() in {"application/xhtml+xml", "text/html"}
67
+ return (node.namespace, node.name) in HTML_INTEGRATION_POINT_SET
68
+
69
+ def _is_mathml_text_integration_point(self, node: _DummyNode) -> bool:
70
+ return (node.namespace, node.name) in MATHML_TEXT_INTEGRATION_POINT_SET
71
+
72
+ def _adjusted_name_for_namespace(self, name: str, namespace: str) -> str:
73
+ if namespace == "svg":
74
+ return SVG_TAG_NAME_ADJUSTMENTS.get(name, name)
75
+ return name
76
+
77
+ def _namespace_from_html_context(self, name: str) -> str:
78
+ if name == "svg":
79
+ return "svg"
80
+ if name == "math":
81
+ return "math"
82
+ return "html"
83
+
84
+ def _namespace_for_start_tag(self, token: Tag) -> str:
85
+ name = token.name
86
+ parent = self.open_elements[-1] if self.open_elements else None
87
+ parent_namespace = parent.namespace if parent is not None else "html"
88
+
89
+ if parent is not None:
90
+ if self._is_html_integration_point(parent):
91
+ return self._namespace_from_html_context(name)
92
+ if self._is_mathml_text_integration_point(parent) and name not in {"mglyph", "malignmark"}:
93
+ return self._namespace_from_html_context(name)
94
+ if parent_namespace == "math" and parent.name == "annotation-xml" and name == "svg":
95
+ return "svg"
96
+
97
+ if parent_namespace not in {None, "html"}:
98
+ breaks_out = name in FOREIGN_BREAKOUT_ELEMENTS or (
99
+ name == "font" and self._font_breaks_out_of_foreign_content(token.attrs)
100
+ )
101
+ if breaks_out:
102
+ while self.open_elements and self.open_elements[-1].namespace not in {None, "html"}:
103
+ self.open_elements.pop()
104
+ parent_namespace = self.open_elements[-1].namespace if self.open_elements else "html"
105
+ else:
106
+ return parent_namespace
107
+
108
+ return self._namespace_from_html_context(name)
109
+
110
+ def _pop_foreign_context(self) -> None:
111
+ while self.open_elements and self.open_elements[-1].namespace not in {None, "html"}:
112
+ self.open_elements.pop()
113
+
114
+ def _pop_for_end_tag(self, name: str) -> None:
115
+ if not self.open_elements:
116
+ return
117
+
118
+ name_lower = name.lower()
119
+ current = self.open_elements[-1]
120
+ if current.namespace not in {None, "html"} and name_lower in {"br", "p"}:
121
+ self._pop_foreign_context()
122
+ return
123
+
124
+ for index in range(len(self.open_elements) - 1, -1, -1):
125
+ node = self.open_elements[index]
126
+ if node.name.lower() == name_lower:
127
+ del self.open_elements[index:]
128
+ return
129
+ if node.namespace in {None, "html"}:
130
+ break
131
+
132
+ self.open_elements.pop()
133
+
134
+ def process_token(self, token: Tag | CommentToken | DoctypeToken | Any) -> int:
135
+ # Tokenizer reuses token objects, so we must copy data
136
+ if isinstance(token, Tag):
137
+ # Copy tag data
138
+ if token.kind == Tag.START:
139
+ self.tokens.append(("start", (token.name, token.attrs.copy())))
140
+ else:
141
+ self.tokens.append(("end", token.name))
142
+ # Maintain open_elements stack for tokenizer rawtext/CDATA checks.
143
+ if token.kind == Tag.START:
144
+ namespace = self._namespace_for_start_tag(token)
145
+ if not (token.self_closing and namespace not in {None, "html"}):
146
+ name = self._adjusted_name_for_namespace(token.name, namespace)
147
+ self.open_elements.append(_DummyNode(name, namespace, token.attrs.copy()))
148
+ else: # Tag.END
149
+ self._pop_for_end_tag(token.name)
150
+
151
+ elif isinstance(token, CommentToken):
152
+ self.tokens.append(("comment", token.data))
153
+
154
+ elif isinstance(token, DoctypeToken):
155
+ dt = token.doctype
156
+ self.tokens.append(("doctype", (dt.name, dt.public_id, dt.system_id)))
157
+
158
+ return 0 # TokenSinkResult.Continue
159
+
160
+ def process_characters(self, data: str) -> None:
161
+ """Handle character data from tokenizer."""
162
+ self.tokens.append(("text", data))
163
+
164
+
165
+ def stream(
166
+ html: str | bytes | bytearray | memoryview,
167
+ *,
168
+ encoding: str | None = None,
169
+ ) -> Generator[StreamEvent, None, None]:
170
+ """
171
+ Stream HTML events from the given HTML string.
172
+ Yields tuples of (event_type, data).
173
+ """
174
+ html_str: str
175
+ if isinstance(html, (bytes, bytearray, memoryview)):
176
+ html_str, _ = decode_html(bytes(html), transport_encoding=encoding)
177
+ else:
178
+ html_str = html
179
+ sink = StreamSink()
180
+ tokenizer = Tokenizer(sink)
181
+ tokenizer.initialize(html_str)
182
+
183
+ while True:
184
+ # Run one step of the tokenizer
185
+ is_eof = tokenizer.step()
186
+
187
+ # Yield any tokens produced by this step
188
+ if sink.tokens:
189
+ # Coalesce text tokens
190
+ text_buffer: list[str] = []
191
+ for event, data in sink.tokens:
192
+ if event == "text":
193
+ text_buffer.append(cast("str", data))
194
+ else:
195
+ if text_buffer:
196
+ yield ("text", "".join(text_buffer))
197
+ text_buffer = []
198
+ yield cast("StartEvent | EndEvent | CommentEvent | DoctypeEvent", (event, data))
199
+
200
+ if text_buffer:
201
+ yield ("text", "".join(text_buffer))
202
+
203
+ sink.tokens.clear()
204
+
205
+ if is_eof:
206
+ break
@@ -220,6 +220,8 @@ def _sanitize_url_value_with_rule(
220
220
  if rewritten is None:
221
221
  return None
222
222
  v = _strip_invisible_unicode(rewritten)
223
+ else:
224
+ v = _strip_invisible_unicode(v)
223
225
 
224
226
  stripped = v.strip()
225
227
  if _URL_CONTROL_CHAR_REGEX.search(stripped):
@@ -26,7 +26,7 @@ if TYPE_CHECKING:
26
26
  # Note: This matches the logic of the previous loop-based implementation.
27
27
  # It checks for space characters, quotes, equals sign, and greater-than.
28
28
  _UNQUOTED_ATTR_VALUE_INVALID = re.compile(r'[ \t\n\f\r"\'=>]')
29
- _LITERAL_TEXT_SERIALIZATION_ELEMENTS = frozenset({"script", "style"})
29
+ _LITERAL_TEXT_SERIALIZATION_ELEMENTS = frozenset({"plaintext", "script", "style"})
30
30
  _SERIALIZABLE_TAG_NAME_RE = re.compile(r"^[A-Za-z][A-Za-z0-9:_-]*$")
31
31
  _SERIALIZABLE_ATTR_NAME_RE = re.compile(r"^[A-Za-z_:][A-Za-z0-9:._-]*$")
32
32
 
@@ -101,6 +101,8 @@ def _serialize_text_for_parent(text: str | None, parent_name: str | None) -> str
101
101
  if parent_name is not None:
102
102
  normalized_parent_name = parent_name if parent_name.islower() else parent_name.lower()
103
103
  if normalized_parent_name in _LITERAL_TEXT_SERIALIZATION_ELEMENTS:
104
+ if normalized_parent_name == "plaintext":
105
+ return text
104
106
  return _neutralize_rawtext_end_tag_sequences(text, normalized_parent_name)
105
107
  return _escape_text(text)
106
108
 
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any
7
7
  if TYPE_CHECKING:
8
8
  from collections.abc import Callable
9
9
 
10
+ from justhtml.core.constants import HTML_INTEGRATION_POINT_SET, MATHML_TEXT_INTEGRATION_POINT_SET
10
11
  from justhtml.core.entities import decode_entities_in_text
11
12
  from justhtml.core.errors import generate_error_message
12
13
 
@@ -35,6 +36,7 @@ _ATTR_VALUE_UNQUOTED_FAST_BAD_PATTERN = re.compile(r"""[\x00"'<=`]""")
35
36
  _TAG_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />\0]+")
36
37
  _ATTR_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />=\0\"'<]+")
37
38
  _COMMENT_RUN_PATTERN = re.compile(r"[^-\0]+")
39
+ _HTML_INTEGRATION_POINT_ENCODINGS = {"application/xhtml+xml", "text/html"}
38
40
 
39
41
  # XML Coercion Regex
40
42
  _xml_invalid_single_chars = []
@@ -2230,10 +2232,7 @@ class Tokenizer:
2230
2232
  or (name == "noscript" and self.opts.scripting_enabled)
2231
2233
  )
2232
2234
  if needs_rawtext_check:
2233
- stack = self.sink.open_elements
2234
- current_node = stack[-1] if stack else None
2235
- namespace = current_node.namespace if current_node else None
2236
- if namespace is None or namespace == "html":
2235
+ if self._current_node_uses_html_text_parsing():
2237
2236
  if name in _RCDATA_ELEMENTS:
2238
2237
  self.state = self.RCDATA
2239
2238
  self.rawtext_tag_name = name
@@ -2260,6 +2259,37 @@ class Tokenizer:
2260
2259
  self.current_tag_kind = Tag.START
2261
2260
  return switched_to_rawtext
2262
2261
 
2262
+ def _current_node_uses_html_text_parsing(self) -> bool:
2263
+ stack = self.sink.open_elements
2264
+ current_node = stack[-1] if stack else None
2265
+ if current_node is None:
2266
+ return True
2267
+
2268
+ namespace = current_node.namespace
2269
+ if namespace is None or namespace == "html":
2270
+ return True
2271
+
2272
+ node_name = current_node.name
2273
+ if (namespace, node_name) in MATHML_TEXT_INTEGRATION_POINT_SET:
2274
+ return True
2275
+
2276
+ if namespace == "math" and node_name == "annotation-xml":
2277
+ encoding = self._node_attribute_value(current_node, "encoding")
2278
+ return encoding is not None and encoding.lower() in _HTML_INTEGRATION_POINT_ENCODINGS
2279
+
2280
+ return (namespace, node_name) in HTML_INTEGRATION_POINT_SET
2281
+
2282
+ def _node_attribute_value(self, node: Any, name: str) -> str | None:
2283
+ attrs = node.attrs
2284
+ if not attrs:
2285
+ return None
2286
+
2287
+ target = name.lower()
2288
+ for attr_name, attr_value in attrs.items():
2289
+ if attr_name.lower() == target:
2290
+ return attr_value or ""
2291
+ return None
2292
+
2263
2293
  def _emit_incomplete_tag_as_text(self) -> None:
2264
2294
  if not self.opts.emit_bogus_markup_as_text:
2265
2295
  return
@@ -2528,6 +2558,7 @@ class Tokenizer:
2528
2558
  else:
2529
2559
  # lt_index == pos - the only remaining possibility
2530
2560
  # Less-than sign - might be start of end tag
2561
+ self.current_token_start_pos = pos
2531
2562
  pos += 1
2532
2563
  self.pos = pos
2533
2564
  self.state = self.RCDATA_LESS_THAN_SIGN
@@ -2570,6 +2601,9 @@ class Tokenizer:
2570
2601
  if c == ">":
2571
2602
  attrs: dict[str, str | None] = {}
2572
2603
  tag = Tag(Tag.END, tag_name, attrs, False)
2604
+ if self.track_tag_positions:
2605
+ tag.start_pos = self.current_token_start_pos
2606
+ tag.end_pos = self.pos
2573
2607
  self._flush_text()
2574
2608
  self._emit_token(tag)
2575
2609
  self.state = self.DATA
@@ -2578,6 +2612,7 @@ class Tokenizer:
2578
2612
  return False
2579
2613
  if c in (" ", "\t", "\n", "\r", "\f"):
2580
2614
  # Whitespace after tag name - switch to BEFORE_ATTRIBUTE_NAME
2615
+ self._flush_text()
2581
2616
  self.current_tag_kind = Tag.END
2582
2617
  self.current_tag_attrs = {}
2583
2618
  self.state = self.BEFORE_ATTRIBUTE_NAME
@@ -2647,6 +2682,7 @@ class Tokenizer:
2647
2682
  if lt_index > pos:
2648
2683
  chunk = buffer[pos:lt_index]
2649
2684
  self._append_text(chunk)
2685
+ self.current_token_start_pos = lt_index
2650
2686
  pos = lt_index + 1
2651
2687
  self.pos = pos
2652
2688
  # Handle script escaped transition before treating '<' as markup boundary
@@ -2701,6 +2737,9 @@ class Tokenizer:
2701
2737
  if c == ">":
2702
2738
  attrs: dict[str, str | None] = {}
2703
2739
  tag = Tag(Tag.END, tag_name, attrs, False)
2740
+ if self.track_tag_positions:
2741
+ tag.start_pos = self.current_token_start_pos
2742
+ tag.end_pos = self.pos
2704
2743
  self._flush_text()
2705
2744
  self._emit_token(tag)
2706
2745
  self.state = self.DATA
@@ -2709,6 +2748,7 @@ class Tokenizer:
2709
2748
  return False
2710
2749
  if c in (" ", "\t", "\n", "\r", "\f"):
2711
2750
  # Whitespace after tag name - switch to BEFORE_ATTRIBUTE_NAME
2751
+ self._flush_text()
2712
2752
  self.current_tag_kind = Tag.END
2713
2753
  self.current_tag_attrs = {}
2714
2754
  self.state = self.BEFORE_ATTRIBUTE_NAME
@@ -2866,6 +2906,7 @@ class Tokenizer:
2866
2906
 
2867
2907
  if is_appropriate:
2868
2908
  if c in (" ", "\t", "\n", "\r", "\f"):
2909
+ self._flush_text()
2869
2910
  self.current_tag_kind = Tag.END
2870
2911
  self.current_tag_attrs = {}
2871
2912
  self.state = self.BEFORE_ATTRIBUTE_NAME
@@ -2880,6 +2921,9 @@ class Tokenizer:
2880
2921
  self._flush_text()
2881
2922
  attrs: dict[str, str | None] = {}
2882
2923
  tag = Tag(Tag.END, tag_name, attrs, False)
2924
+ if self.track_tag_positions:
2925
+ tag.start_pos = self.current_token_start_pos
2926
+ tag.end_pos = self.pos
2883
2927
  self._emit_token(tag)
2884
2928
  self.state = self.DATA
2885
2929
  self.rawtext_tag_name = None
@@ -2,7 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from typing import TYPE_CHECKING, Any, cast
5
+ from typing import TYPE_CHECKING, Any, NamedTuple, cast
6
6
 
7
7
  from justhtml.core.constants import (
8
8
  BUTTON_SCOPE_TERMINATORS,
@@ -48,6 +48,12 @@ if TYPE_CHECKING:
48
48
  from collections.abc import Callable
49
49
 
50
50
 
51
+ class _SelectedContentWalkItem(NamedTuple):
52
+ node: Any
53
+ in_disabled_optgroup: bool
54
+ in_datalist: bool
55
+
56
+
51
57
  class TreeBuilder(TreeBuilderModesMixin):
52
58
  __slots__ = (
53
59
  "_body_end_handlers",
@@ -787,14 +793,11 @@ class TreeBuilder(TreeBuilderModesMixin):
787
793
  if name not in existing:
788
794
  existing[name] = value
789
795
 
790
- def _remove_from_open_elements(self, node: Any) -> bool:
791
- for index, current in enumerate(self.open_elements):
792
- if current is node:
793
- self._maybe_mark_end_tag(current)
794
- self._note_open_element_removed(current)
795
- del self.open_elements[index]
796
- return True
797
- return False
796
+ def _remove_from_open_elements(self, node: Any) -> None:
797
+ index = self.open_elements.index(node)
798
+ self._maybe_mark_end_tag(node)
799
+ self._note_open_element_removed(node)
800
+ del self.open_elements[index]
798
801
 
799
802
  def _is_special_element(self, node: Any) -> bool:
800
803
  if node.namespace not in {None, "html"}:
@@ -852,6 +855,15 @@ class TreeBuilder(TreeBuilderModesMixin):
852
855
  return True
853
856
  return False
854
857
 
858
+ def _has_detached_active_formatting_a(self) -> bool:
859
+ for index in range(len(self.active_formatting) - 1, -1, -1):
860
+ entry = self.active_formatting[index]
861
+ if entry is FORMAT_MARKER:
862
+ break
863
+ if entry["name"] == "a":
864
+ return entry["node"] not in self.open_elements
865
+ return False
866
+
855
867
  def _remove_last_active_formatting_by_name(self, name: str) -> None:
856
868
  for index in range(len(self.active_formatting) - 1, -1, -1):
857
869
  entry = self.active_formatting[index]
@@ -988,7 +1000,7 @@ class TreeBuilder(TreeBuilderModesMixin):
988
1000
  self._append_text(data)
989
1001
  return
990
1002
 
991
- if self.pending_table_text_should_error:
1003
+ if self.pending_table_text_should_error and self.collect_errors:
992
1004
  # html5lib reports one foster-parenting error per non-whitespace character.
993
1005
  for ch in data:
994
1006
  if ch not in HTML_SPACE_CHARACTERS:
@@ -1169,7 +1181,7 @@ class TreeBuilder(TreeBuilderModesMixin):
1169
1181
  node = self.open_elements[-1]
1170
1182
  if node.namespace in {None, "html"}:
1171
1183
  return
1172
- if self._is_html_integration_point(node):
1184
+ if self._is_html_integration_point(node) or self._is_mathml_text_integration_point(node):
1173
1185
  return
1174
1186
  if self.fragment_context_element is not None and node is self.fragment_context_element:
1175
1187
  return
@@ -1310,59 +1322,78 @@ class TreeBuilder(TreeBuilderModesMixin):
1310
1322
  Per HTML5 spec: selectedcontent mirrors the content of the selected option,
1311
1323
  or the first option if none is selected.
1312
1324
  """
1313
- # Find all select elements
1314
- selects: list[Any] = []
1315
- self._find_elements(root, "select", selects)
1316
-
1317
- for select in selects:
1318
- # Find selectedcontent element in this select
1319
- selectedcontent = self._find_element(select, "selectedcontent")
1320
- if not selectedcontent:
1321
- continue
1322
-
1323
- # Find all option elements
1324
- options: list[Any] = []
1325
- self._find_elements(select, "option", options)
1326
-
1327
- # Find selected option or use first one
1328
- selected_option = None
1329
- for opt in options:
1330
- if opt.attrs:
1331
- for attr_name in opt.attrs.keys():
1332
- if attr_name == "selected":
1333
- selected_option = opt
1334
- break
1335
- if selected_option:
1336
- break
1337
-
1338
- if not selected_option:
1339
- selected_option = options[0]
1340
-
1341
- # Clone content from selected option to selectedcontent
1342
- self._clone_children(selected_option, selectedcontent)
1343
-
1344
- def _find_elements(self, node: Any, name: str, result: list[Any]) -> None:
1345
- """Find all elements with given name using iterative preorder traversal."""
1346
- stack: list[Any] = [node]
1325
+ stack: list[Any] = [root]
1347
1326
  while stack:
1348
1327
  current = stack.pop()
1349
- if current.name == name:
1350
- result.append(current)
1328
+ if current.name == "select":
1329
+ self._populate_selectedcontent_for_select(current)
1351
1330
 
1352
1331
  if current.has_child_nodes():
1353
1332
  stack.extend(reversed(current.children))
1354
1333
 
1355
- def _find_element(self, node: Any, name: str) -> Any | None:
1356
- """Find first element with given name using iterative preorder traversal."""
1357
- stack: list[Any] = [node]
1334
+ def _populate_selectedcontent_for_select(self, select: Any) -> None:
1335
+ selectedcontents: list[Any] = []
1336
+ first_option = None
1337
+ selected_option = None
1338
+ is_multiple = select.attrs is not None and "multiple" in select.attrs
1339
+
1340
+ stack = [_SelectedContentWalkItem(select, in_disabled_optgroup=False, in_datalist=False)]
1358
1341
  while stack:
1359
- current = stack.pop()
1360
- if current.name == name:
1361
- return current
1342
+ item = stack.pop()
1343
+ current = item.node
1344
+ attrs = getattr(current, "attrs", None)
1345
+ name = current.name
1346
+ if current is not select:
1347
+ if name == "selectedcontent":
1348
+ selectedcontents.append(current)
1349
+ if name == "option" and not item.in_datalist:
1350
+ if first_option is None and self._is_selectedcontent_fallback_option(
1351
+ attrs, item.in_disabled_optgroup
1352
+ ):
1353
+ first_option = current
1354
+ if attrs is not None and "selected" in attrs:
1355
+ if is_multiple:
1356
+ if selected_option is None:
1357
+ selected_option = current
1358
+ else:
1359
+ selected_option = current
1362
1360
 
1363
1361
  if current.has_child_nodes():
1364
- stack.extend(reversed(current.children))
1365
- return None
1362
+ child_disabled_optgroup = item.in_disabled_optgroup or (
1363
+ name == "optgroup" and attrs is not None and "disabled" in attrs
1364
+ )
1365
+ child_in_datalist = item.in_datalist or name == "datalist"
1366
+ stack.extend(
1367
+ _SelectedContentWalkItem(child, child_disabled_optgroup, child_in_datalist)
1368
+ for child in reversed(current.children)
1369
+ )
1370
+
1371
+ if not selectedcontents:
1372
+ return
1373
+
1374
+ source_option = selected_option or first_option
1375
+
1376
+ for selectedcontent in selectedcontents:
1377
+ if source_option is not None and self._is_descendant_of(selectedcontent, source_option):
1378
+ continue
1379
+ children = selectedcontent.children
1380
+ if children:
1381
+ for child in children:
1382
+ child.parent = None
1383
+ children.clear()
1384
+ if source_option is not None:
1385
+ self._clone_children(source_option, selectedcontent)
1386
+
1387
+ def _is_selectedcontent_fallback_option(self, attrs: Any, in_disabled_optgroup: bool) -> bool:
1388
+ return not in_disabled_optgroup and (attrs is None or "disabled" not in attrs)
1389
+
1390
+ def _is_descendant_of(self, node: Any, ancestor: Any) -> bool:
1391
+ parent = node.parent
1392
+ while parent is not None:
1393
+ if parent is ancestor:
1394
+ return True
1395
+ parent = parent.parent
1396
+ return False
1366
1397
 
1367
1398
  def _clone_children(self, source: Any, target: Any) -> None:
1368
1399
  """Deep clone all children from source to target."""
@@ -1404,6 +1435,7 @@ class TreeBuilder(TreeBuilderModesMixin):
1404
1435
  if not data:
1405
1436
  return TokenSinkResult.Continue
1406
1437
  if "\x00" in data:
1438
+ self._parse_error("invalid-codepoint")
1407
1439
  data = data.replace("\x00", "")
1408
1440
  if not data:
1409
1441
  return TokenSinkResult.Continue