justhtml 2.1.0__tar.gz → 2.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {justhtml-2.1.0 → justhtml-2.2.0}/CHANGELOG.md +18 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/PKG-INFO +3 -1
- {justhtml-2.1.0 → justhtml-2.2.0}/README.md +2 -0
- justhtml-2.2.0/assets/justhtml-readme-explainer.png +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/pyproject.toml +1 -1
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/parser/__init__.py +9 -6
- justhtml-2.2.0/src/justhtml/parser/stream.py +206 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/sanitizer/url/runtime.py +2 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/serializer/html.py +3 -1
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/tokenizer/html.py +48 -4
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/treebuilder/core.py +87 -55
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/treebuilder/modes.py +103 -58
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/justhtml-tests/branch_coverage.dat +3 -3
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_sanitize.py +86 -0
- justhtml-2.2.0/tests/test_stream.py +273 -0
- justhtml-2.2.0/tests/test_treebuilder.py +640 -0
- justhtml-2.1.0/src/justhtml/parser/stream.py +0 -110
- justhtml-2.1.0/tests/test_stream.py +0 -65
- justhtml-2.1.0/tests/test_treebuilder.py +0 -249
- {justhtml-2.1.0 → justhtml-2.2.0}/.github/copilot-instructions.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/.github/workflows/ci.yml +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/.github/workflows/publish.yml +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/.gitignore +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/.pre-commit-config.yaml +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/CODE_OF_CONDUCT.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/CONTRIBUTING.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/LICENSE +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/Makefile +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/SECURITY.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/benchmarks/correctness.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/benchmarks/fuzz.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/benchmarks/performance.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/benchmarks/profile.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/_config.yml +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/_layouts/default.html +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/api.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/assets/search.js +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/bleach-migration.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/building.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/cli.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/comparison.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/correctness.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/encoding.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/errors.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/fragments.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/html-cleaning.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/index.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/linkify.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/migration-examples.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/playground/app.js +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/playground/index.html +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/playground/py/__init__.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/playground/py/install_latest_justhtml.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/playground/py/render.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/playground/py/use_local_repo.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/quickstart.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/sanitization.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/selectors.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/streaming.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/text.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/transforms.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/unsafe-handling.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/docs/url-cleaning.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/llms.txt +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/run_tests.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/scripts/release.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/__init__.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/__main__.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/core/__init__.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/core/constants.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/core/entities.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/core/errors.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/core/rawtext.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/dom/__init__.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/dom/builder.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/parser/context.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/parser/encoding.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/py.typed +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/sanitizer/__init__.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/sanitizer/css.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/sanitizer/dom.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/sanitizer/policy.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/sanitizer/policy_defaults.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/sanitizer/rawtext.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/sanitizer/url/__init__.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/sanitizer/url/policy.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/sanitizer/url/spec.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/selector/__init__.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/selector/core.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/serializer/__init__.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/serializer/markdown.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/tokenizer/__init__.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/tokenizer/tokens.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/transforms/__init__.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/transforms/compile.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/transforms/linkify.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/transforms/linkify_core.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/transforms/runtime.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/transforms/spec.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/treebuilder/__init__.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/src/justhtml/treebuilder/utils.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/test-summary.txt +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/README.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/__init__.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/data/wikipedia.html +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/harness/__init__.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/harness/encoding.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/harness/regressions.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/harness/reporter.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/harness/serializer.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/harness/tokenizer.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/harness/tree.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/justhtml-sanitize-tests/cases.json +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/justhtml-tests/coverage_gaps.test +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/justhtml-tests/empty_stack_edge_cases.dat +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/justhtml-tests/entities.test +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/justhtml-tests/iframe_srcdoc.dat +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/justhtml-tests/tokenizer_edge_cases.test +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/justhtml-tests/treebuilder_coverage.dat +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/justhtml-tests/xml_coercion.dat +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/justhtml-tests/xml_coercion_coverage.test +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/linkify-it/LICENSE.txt +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/linkify-it/README.md +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/linkify-it/fixtures/links.txt +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/linkify-it/fixtures/not_links.txt +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_builder.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_cli.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_docs_examples.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_encoding.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_errors.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_linkify_internals.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_linkify_it.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_linkify_transform.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_node.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_playground_local_repo_file_list.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_precommit_coverage.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_sanitize_integration.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_sanitize_transform.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_selector.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_serialize.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_tokenizer.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_transforms.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_transforms_compiler.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_transforms_edge_cases.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_transforms_sanitize_integration.py +0 -0
- {justhtml-2.1.0 → justhtml-2.2.0}/tests/test_wikipedia.py +0 -0
|
@@ -7,6 +7,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [2.2.0] - 2026-06-07
|
|
11
|
+
|
|
12
|
+
### Fixed
|
|
13
|
+
- Handle `<select><selectedcontent></selectedcontent></select>` without crashing when no `<option>` is present, replace selectedcontent fallback content during parser finalization, and avoid repeated selectedcontent subtree scans.
|
|
14
|
+
- Preserve source order and tag text when escape-mode sanitization handles disallowed rawtext/RCDATA elements with attributed or self-closing end tags.
|
|
15
|
+
- Make `stream()` use namespace-aware tokenizer context for SVG/MathML CDATA, rawtext decisions, self-closing foreign tags, and foreign end-tag stack updates.
|
|
16
|
+
- Use the correct initial tokenizer states for HTML fragment contexts such as `<title>`, `<textarea>`, `<script>`, `<style>`, and scripting-disabled `<noscript>`.
|
|
17
|
+
- Use HTML rawtext/RCDATA tokenizer states for text-like elements inside SVG/MathML HTML integration points and MathML text integration points.
|
|
18
|
+
- Generate implied end tags before removing `<form>` on `</form>` so following controls do not remain inside still-open descendants.
|
|
19
|
+
- Keep `<form>` elements inside `<template>` from claiming the global form pointer, including table-template form insertion.
|
|
20
|
+
- Close open `<p>` elements correctly around `<option>`, `<optgroup>`, `<hr>`, `<p>`, and `<div>` starts in `<select>` parsing.
|
|
21
|
+
- Close `<template>` correctly when `</template>` is seen while parsing inside `<select>`.
|
|
22
|
+
- Keep `</p>` and `</br>` foreign-content breakouts inside MathML text integration points such as `<mi>` and `<mtext>`.
|
|
23
|
+
- Align customizable `<select>` parsing with Chromium for phantom `</p>` handling and generic custom child elements.
|
|
24
|
+
|
|
25
|
+
### Security
|
|
26
|
+
- (Severity: Low) Strip invisible Unicode during URL sink validation even when general invisible-Unicode stripping is disabled. Previously, custom policies using `strip_invisible_unicode=False` could preserve scheme-obfuscated values such as `javascript\u200b:` in otherwise URL-validated attributes.
|
|
27
|
+
|
|
10
28
|
## [2.1.0] - 2026-06-06
|
|
11
29
|
|
|
12
30
|
### Performance
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: justhtml
|
|
3
|
-
Version: 2.1.0
|
|
3
|
+
Version: 2.2.0
|
|
4
4
|
Summary: A pure Python HTML5 parser that just works.
|
|
5
5
|
Project-URL: Homepage, https://github.com/emilstenstrom/justhtml
|
|
6
6
|
Project-URL: Issues, https://github.com/emilstenstrom/justhtml/issues
|
|
@@ -72,6 +72,8 @@ Requires Python 3.10 or later.
|
|
|
72
72
|
|
|
73
73
|
[Documentation](https://emilstenstrom.github.io/justhtml/) | [Comparison](docs/comparison.md) | [Playground](https://emilstenstrom.github.io/justhtml/playground/) | [Security policy](SECURITY.md)
|
|
74
74
|
|
|
75
|
+

|
|
76
|
+
|
|
75
77
|
## Why Use It?
|
|
76
78
|
|
|
77
79
|
Most Python HTML libraries optimize for one part of the problem.
|
|
@@ -18,6 +18,8 @@ Requires Python 3.10 or later.
|
|
|
18
18
|
|
|
19
19
|
[Documentation](https://emilstenstrom.github.io/justhtml/) | [Comparison](docs/comparison.md) | [Playground](https://emilstenstrom.github.io/justhtml/playground/) | [Security policy](SECURITY.md)
|
|
20
20
|
|
|
21
|
+

|
|
22
|
+
|
|
21
23
|
## Why Use It?
|
|
22
24
|
|
|
23
25
|
Most Python HTML libraries optimize for one part of the problem.
|
|
Binary file
|
|
@@ -163,14 +163,17 @@ class JustHTML:
|
|
|
163
163
|
if needs_escape_incomplete_tags:
|
|
164
164
|
opts.emit_bogus_markup_as_text = True
|
|
165
165
|
|
|
166
|
-
# For
|
|
167
|
-
|
|
168
|
-
|
|
166
|
+
# For text-like HTML fragment contexts, set the initial tokenizer state
|
|
167
|
+
# to match the context element.
|
|
168
|
+
if fragment_context and fragment_context.namespace in {None, "html"}:
|
|
169
169
|
tag_name = fragment_context.tag_name.lower()
|
|
170
|
-
if tag_name in
|
|
170
|
+
if tag_name in {"textarea", "title"}:
|
|
171
|
+
opts.initial_state = Tokenizer.RCDATA
|
|
172
|
+
elif tag_name in {"iframe", "noembed", "noframes", "script", "style", "xmp"} or (
|
|
173
|
+
tag_name == "noscript" and opts.scripting_enabled
|
|
174
|
+
):
|
|
171
175
|
opts.initial_state = Tokenizer.RAWTEXT
|
|
172
|
-
|
|
173
|
-
elif tag_name in ("plaintext", "script"):
|
|
176
|
+
elif tag_name == "plaintext":
|
|
174
177
|
opts.initial_state = Tokenizer.PLAINTEXT
|
|
175
178
|
|
|
176
179
|
self.tokenizer = Tokenizer(
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Literal, TypeAlias, cast
|
|
4
|
+
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from collections.abc import Generator
|
|
7
|
+
|
|
8
|
+
from justhtml.core.constants import (
|
|
9
|
+
FOREIGN_BREAKOUT_ELEMENTS,
|
|
10
|
+
HTML_INTEGRATION_POINT_SET,
|
|
11
|
+
MATHML_TEXT_INTEGRATION_POINT_SET,
|
|
12
|
+
SVG_TAG_NAME_ADJUSTMENTS,
|
|
13
|
+
)
|
|
14
|
+
from justhtml.tokenizer import Tokenizer
|
|
15
|
+
from justhtml.tokenizer.tokens import CommentToken, DoctypeToken, Tag
|
|
16
|
+
|
|
17
|
+
from .encoding import decode_html
|
|
18
|
+
|
|
19
|
+
StartEvent: TypeAlias = tuple[Literal["start"], tuple[str, dict[str, str | None]]]
|
|
20
|
+
EndEvent: TypeAlias = tuple[Literal["end"], str]
|
|
21
|
+
TextEvent: TypeAlias = tuple[Literal["text"], str]
|
|
22
|
+
CommentEvent: TypeAlias = tuple[Literal["comment"], str]
|
|
23
|
+
DoctypeEvent: TypeAlias = tuple[Literal["doctype"], tuple[str | None, str | None, str | None]]
|
|
24
|
+
StreamEvent: TypeAlias = StartEvent | EndEvent | TextEvent | CommentEvent | DoctypeEvent
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class _DummyNode:
|
|
28
|
+
__slots__ = ("attrs", "name", "namespace")
|
|
29
|
+
|
|
30
|
+
attrs: dict[str, str | None]
|
|
31
|
+
name: str
|
|
32
|
+
namespace: str
|
|
33
|
+
|
|
34
|
+
def __init__(self, name: str, namespace: str, attrs: dict[str, str | None] | None = None) -> None:
|
|
35
|
+
self.attrs = attrs or {}
|
|
36
|
+
self.name = name
|
|
37
|
+
self.namespace = namespace
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class StreamSink:
|
|
41
|
+
"""A sink that buffers tokens for the stream API."""
|
|
42
|
+
|
|
43
|
+
tokens: list[StreamEvent]
|
|
44
|
+
open_elements: list[_DummyNode]
|
|
45
|
+
|
|
46
|
+
def __init__(self) -> None:
|
|
47
|
+
self.tokens = []
|
|
48
|
+
self.open_elements = [] # Required by tokenizer for rawtext checks
|
|
49
|
+
|
|
50
|
+
def _font_breaks_out_of_foreign_content(self, attrs: dict[str, str | None]) -> bool:
|
|
51
|
+
for name in attrs:
|
|
52
|
+
if name.lower() in {"color", "face", "size"}:
|
|
53
|
+
return True
|
|
54
|
+
return False
|
|
55
|
+
|
|
56
|
+
def _node_attribute_value(self, node: _DummyNode, name: str) -> str | None:
|
|
57
|
+
target = name.lower()
|
|
58
|
+
for attr_name, attr_value in node.attrs.items():
|
|
59
|
+
if attr_name.lower() == target:
|
|
60
|
+
return attr_value or ""
|
|
61
|
+
return None
|
|
62
|
+
|
|
63
|
+
def _is_html_integration_point(self, node: _DummyNode) -> bool:
|
|
64
|
+
if node.namespace == "math" and node.name == "annotation-xml":
|
|
65
|
+
encoding = self._node_attribute_value(node, "encoding")
|
|
66
|
+
return encoding is not None and encoding.lower() in {"application/xhtml+xml", "text/html"}
|
|
67
|
+
return (node.namespace, node.name) in HTML_INTEGRATION_POINT_SET
|
|
68
|
+
|
|
69
|
+
def _is_mathml_text_integration_point(self, node: _DummyNode) -> bool:
|
|
70
|
+
return (node.namespace, node.name) in MATHML_TEXT_INTEGRATION_POINT_SET
|
|
71
|
+
|
|
72
|
+
def _adjusted_name_for_namespace(self, name: str, namespace: str) -> str:
|
|
73
|
+
if namespace == "svg":
|
|
74
|
+
return SVG_TAG_NAME_ADJUSTMENTS.get(name, name)
|
|
75
|
+
return name
|
|
76
|
+
|
|
77
|
+
def _namespace_from_html_context(self, name: str) -> str:
|
|
78
|
+
if name == "svg":
|
|
79
|
+
return "svg"
|
|
80
|
+
if name == "math":
|
|
81
|
+
return "math"
|
|
82
|
+
return "html"
|
|
83
|
+
|
|
84
|
+
def _namespace_for_start_tag(self, token: Tag) -> str:
|
|
85
|
+
name = token.name
|
|
86
|
+
parent = self.open_elements[-1] if self.open_elements else None
|
|
87
|
+
parent_namespace = parent.namespace if parent is not None else "html"
|
|
88
|
+
|
|
89
|
+
if parent is not None:
|
|
90
|
+
if self._is_html_integration_point(parent):
|
|
91
|
+
return self._namespace_from_html_context(name)
|
|
92
|
+
if self._is_mathml_text_integration_point(parent) and name not in {"mglyph", "malignmark"}:
|
|
93
|
+
return self._namespace_from_html_context(name)
|
|
94
|
+
if parent_namespace == "math" and parent.name == "annotation-xml" and name == "svg":
|
|
95
|
+
return "svg"
|
|
96
|
+
|
|
97
|
+
if parent_namespace not in {None, "html"}:
|
|
98
|
+
breaks_out = name in FOREIGN_BREAKOUT_ELEMENTS or (
|
|
99
|
+
name == "font" and self._font_breaks_out_of_foreign_content(token.attrs)
|
|
100
|
+
)
|
|
101
|
+
if breaks_out:
|
|
102
|
+
while self.open_elements and self.open_elements[-1].namespace not in {None, "html"}:
|
|
103
|
+
self.open_elements.pop()
|
|
104
|
+
parent_namespace = self.open_elements[-1].namespace if self.open_elements else "html"
|
|
105
|
+
else:
|
|
106
|
+
return parent_namespace
|
|
107
|
+
|
|
108
|
+
return self._namespace_from_html_context(name)
|
|
109
|
+
|
|
110
|
+
def _pop_foreign_context(self) -> None:
|
|
111
|
+
while self.open_elements and self.open_elements[-1].namespace not in {None, "html"}:
|
|
112
|
+
self.open_elements.pop()
|
|
113
|
+
|
|
114
|
+
def _pop_for_end_tag(self, name: str) -> None:
|
|
115
|
+
if not self.open_elements:
|
|
116
|
+
return
|
|
117
|
+
|
|
118
|
+
name_lower = name.lower()
|
|
119
|
+
current = self.open_elements[-1]
|
|
120
|
+
if current.namespace not in {None, "html"} and name_lower in {"br", "p"}:
|
|
121
|
+
self._pop_foreign_context()
|
|
122
|
+
return
|
|
123
|
+
|
|
124
|
+
for index in range(len(self.open_elements) - 1, -1, -1):
|
|
125
|
+
node = self.open_elements[index]
|
|
126
|
+
if node.name.lower() == name_lower:
|
|
127
|
+
del self.open_elements[index:]
|
|
128
|
+
return
|
|
129
|
+
if node.namespace in {None, "html"}:
|
|
130
|
+
break
|
|
131
|
+
|
|
132
|
+
self.open_elements.pop()
|
|
133
|
+
|
|
134
|
+
def process_token(self, token: Tag | CommentToken | DoctypeToken | Any) -> int:
|
|
135
|
+
# Tokenizer reuses token objects, so we must copy data
|
|
136
|
+
if isinstance(token, Tag):
|
|
137
|
+
# Copy tag data
|
|
138
|
+
if token.kind == Tag.START:
|
|
139
|
+
self.tokens.append(("start", (token.name, token.attrs.copy())))
|
|
140
|
+
else:
|
|
141
|
+
self.tokens.append(("end", token.name))
|
|
142
|
+
# Maintain open_elements stack for tokenizer rawtext/CDATA checks.
|
|
143
|
+
if token.kind == Tag.START:
|
|
144
|
+
namespace = self._namespace_for_start_tag(token)
|
|
145
|
+
if not (token.self_closing and namespace not in {None, "html"}):
|
|
146
|
+
name = self._adjusted_name_for_namespace(token.name, namespace)
|
|
147
|
+
self.open_elements.append(_DummyNode(name, namespace, token.attrs.copy()))
|
|
148
|
+
else: # Tag.END
|
|
149
|
+
self._pop_for_end_tag(token.name)
|
|
150
|
+
|
|
151
|
+
elif isinstance(token, CommentToken):
|
|
152
|
+
self.tokens.append(("comment", token.data))
|
|
153
|
+
|
|
154
|
+
elif isinstance(token, DoctypeToken):
|
|
155
|
+
dt = token.doctype
|
|
156
|
+
self.tokens.append(("doctype", (dt.name, dt.public_id, dt.system_id)))
|
|
157
|
+
|
|
158
|
+
return 0 # TokenSinkResult.Continue
|
|
159
|
+
|
|
160
|
+
def process_characters(self, data: str) -> None:
|
|
161
|
+
"""Handle character data from tokenizer."""
|
|
162
|
+
self.tokens.append(("text", data))
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def stream(
|
|
166
|
+
html: str | bytes | bytearray | memoryview,
|
|
167
|
+
*,
|
|
168
|
+
encoding: str | None = None,
|
|
169
|
+
) -> Generator[StreamEvent, None, None]:
|
|
170
|
+
"""
|
|
171
|
+
Stream HTML events from the given HTML string.
|
|
172
|
+
Yields tuples of (event_type, data).
|
|
173
|
+
"""
|
|
174
|
+
html_str: str
|
|
175
|
+
if isinstance(html, (bytes, bytearray, memoryview)):
|
|
176
|
+
html_str, _ = decode_html(bytes(html), transport_encoding=encoding)
|
|
177
|
+
else:
|
|
178
|
+
html_str = html
|
|
179
|
+
sink = StreamSink()
|
|
180
|
+
tokenizer = Tokenizer(sink)
|
|
181
|
+
tokenizer.initialize(html_str)
|
|
182
|
+
|
|
183
|
+
while True:
|
|
184
|
+
# Run one step of the tokenizer
|
|
185
|
+
is_eof = tokenizer.step()
|
|
186
|
+
|
|
187
|
+
# Yield any tokens produced by this step
|
|
188
|
+
if sink.tokens:
|
|
189
|
+
# Coalesce text tokens
|
|
190
|
+
text_buffer: list[str] = []
|
|
191
|
+
for event, data in sink.tokens:
|
|
192
|
+
if event == "text":
|
|
193
|
+
text_buffer.append(cast("str", data))
|
|
194
|
+
else:
|
|
195
|
+
if text_buffer:
|
|
196
|
+
yield ("text", "".join(text_buffer))
|
|
197
|
+
text_buffer = []
|
|
198
|
+
yield cast("StartEvent | EndEvent | CommentEvent | DoctypeEvent", (event, data))
|
|
199
|
+
|
|
200
|
+
if text_buffer:
|
|
201
|
+
yield ("text", "".join(text_buffer))
|
|
202
|
+
|
|
203
|
+
sink.tokens.clear()
|
|
204
|
+
|
|
205
|
+
if is_eof:
|
|
206
|
+
break
|
|
@@ -26,7 +26,7 @@ if TYPE_CHECKING:
|
|
|
26
26
|
# Note: This matches the logic of the previous loop-based implementation.
|
|
27
27
|
# It checks for space characters, quotes, equals sign, and greater-than.
|
|
28
28
|
_UNQUOTED_ATTR_VALUE_INVALID = re.compile(r'[ \t\n\f\r"\'=>]')
|
|
29
|
-
_LITERAL_TEXT_SERIALIZATION_ELEMENTS = frozenset({"script", "style"})
|
|
29
|
+
_LITERAL_TEXT_SERIALIZATION_ELEMENTS = frozenset({"plaintext", "script", "style"})
|
|
30
30
|
_SERIALIZABLE_TAG_NAME_RE = re.compile(r"^[A-Za-z][A-Za-z0-9:_-]*$")
|
|
31
31
|
_SERIALIZABLE_ATTR_NAME_RE = re.compile(r"^[A-Za-z_:][A-Za-z0-9:._-]*$")
|
|
32
32
|
|
|
@@ -101,6 +101,8 @@ def _serialize_text_for_parent(text: str | None, parent_name: str | None) -> str
|
|
|
101
101
|
if parent_name is not None:
|
|
102
102
|
normalized_parent_name = parent_name if parent_name.islower() else parent_name.lower()
|
|
103
103
|
if normalized_parent_name in _LITERAL_TEXT_SERIALIZATION_ELEMENTS:
|
|
104
|
+
if normalized_parent_name == "plaintext":
|
|
105
|
+
return text
|
|
104
106
|
return _neutralize_rawtext_end_tag_sequences(text, normalized_parent_name)
|
|
105
107
|
return _escape_text(text)
|
|
106
108
|
|
|
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any
|
|
|
7
7
|
if TYPE_CHECKING:
|
|
8
8
|
from collections.abc import Callable
|
|
9
9
|
|
|
10
|
+
from justhtml.core.constants import HTML_INTEGRATION_POINT_SET, MATHML_TEXT_INTEGRATION_POINT_SET
|
|
10
11
|
from justhtml.core.entities import decode_entities_in_text
|
|
11
12
|
from justhtml.core.errors import generate_error_message
|
|
12
13
|
|
|
@@ -35,6 +36,7 @@ _ATTR_VALUE_UNQUOTED_FAST_BAD_PATTERN = re.compile(r"""[\x00"'<=`]""")
|
|
|
35
36
|
_TAG_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />\0]+")
|
|
36
37
|
_ATTR_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />=\0\"'<]+")
|
|
37
38
|
_COMMENT_RUN_PATTERN = re.compile(r"[^-\0]+")
|
|
39
|
+
_HTML_INTEGRATION_POINT_ENCODINGS = {"application/xhtml+xml", "text/html"}
|
|
38
40
|
|
|
39
41
|
# XML Coercion Regex
|
|
40
42
|
_xml_invalid_single_chars = []
|
|
@@ -2230,10 +2232,7 @@ class Tokenizer:
|
|
|
2230
2232
|
or (name == "noscript" and self.opts.scripting_enabled)
|
|
2231
2233
|
)
|
|
2232
2234
|
if needs_rawtext_check:
|
|
2233
|
-
|
|
2234
|
-
current_node = stack[-1] if stack else None
|
|
2235
|
-
namespace = current_node.namespace if current_node else None
|
|
2236
|
-
if namespace is None or namespace == "html":
|
|
2235
|
+
if self._current_node_uses_html_text_parsing():
|
|
2237
2236
|
if name in _RCDATA_ELEMENTS:
|
|
2238
2237
|
self.state = self.RCDATA
|
|
2239
2238
|
self.rawtext_tag_name = name
|
|
@@ -2260,6 +2259,37 @@ class Tokenizer:
|
|
|
2260
2259
|
self.current_tag_kind = Tag.START
|
|
2261
2260
|
return switched_to_rawtext
|
|
2262
2261
|
|
|
2262
|
+
def _current_node_uses_html_text_parsing(self) -> bool:
|
|
2263
|
+
stack = self.sink.open_elements
|
|
2264
|
+
current_node = stack[-1] if stack else None
|
|
2265
|
+
if current_node is None:
|
|
2266
|
+
return True
|
|
2267
|
+
|
|
2268
|
+
namespace = current_node.namespace
|
|
2269
|
+
if namespace is None or namespace == "html":
|
|
2270
|
+
return True
|
|
2271
|
+
|
|
2272
|
+
node_name = current_node.name
|
|
2273
|
+
if (namespace, node_name) in MATHML_TEXT_INTEGRATION_POINT_SET:
|
|
2274
|
+
return True
|
|
2275
|
+
|
|
2276
|
+
if namespace == "math" and node_name == "annotation-xml":
|
|
2277
|
+
encoding = self._node_attribute_value(current_node, "encoding")
|
|
2278
|
+
return encoding is not None and encoding.lower() in _HTML_INTEGRATION_POINT_ENCODINGS
|
|
2279
|
+
|
|
2280
|
+
return (namespace, node_name) in HTML_INTEGRATION_POINT_SET
|
|
2281
|
+
|
|
2282
|
+
def _node_attribute_value(self, node: Any, name: str) -> str | None:
|
|
2283
|
+
attrs = node.attrs
|
|
2284
|
+
if not attrs:
|
|
2285
|
+
return None
|
|
2286
|
+
|
|
2287
|
+
target = name.lower()
|
|
2288
|
+
for attr_name, attr_value in attrs.items():
|
|
2289
|
+
if attr_name.lower() == target:
|
|
2290
|
+
return attr_value or ""
|
|
2291
|
+
return None
|
|
2292
|
+
|
|
2263
2293
|
def _emit_incomplete_tag_as_text(self) -> None:
|
|
2264
2294
|
if not self.opts.emit_bogus_markup_as_text:
|
|
2265
2295
|
return
|
|
@@ -2528,6 +2558,7 @@ class Tokenizer:
|
|
|
2528
2558
|
else:
|
|
2529
2559
|
# lt_index == pos - the only remaining possibility
|
|
2530
2560
|
# Less-than sign - might be start of end tag
|
|
2561
|
+
self.current_token_start_pos = pos
|
|
2531
2562
|
pos += 1
|
|
2532
2563
|
self.pos = pos
|
|
2533
2564
|
self.state = self.RCDATA_LESS_THAN_SIGN
|
|
@@ -2570,6 +2601,9 @@ class Tokenizer:
|
|
|
2570
2601
|
if c == ">":
|
|
2571
2602
|
attrs: dict[str, str | None] = {}
|
|
2572
2603
|
tag = Tag(Tag.END, tag_name, attrs, False)
|
|
2604
|
+
if self.track_tag_positions:
|
|
2605
|
+
tag.start_pos = self.current_token_start_pos
|
|
2606
|
+
tag.end_pos = self.pos
|
|
2573
2607
|
self._flush_text()
|
|
2574
2608
|
self._emit_token(tag)
|
|
2575
2609
|
self.state = self.DATA
|
|
@@ -2578,6 +2612,7 @@ class Tokenizer:
|
|
|
2578
2612
|
return False
|
|
2579
2613
|
if c in (" ", "\t", "\n", "\r", "\f"):
|
|
2580
2614
|
# Whitespace after tag name - switch to BEFORE_ATTRIBUTE_NAME
|
|
2615
|
+
self._flush_text()
|
|
2581
2616
|
self.current_tag_kind = Tag.END
|
|
2582
2617
|
self.current_tag_attrs = {}
|
|
2583
2618
|
self.state = self.BEFORE_ATTRIBUTE_NAME
|
|
@@ -2647,6 +2682,7 @@ class Tokenizer:
|
|
|
2647
2682
|
if lt_index > pos:
|
|
2648
2683
|
chunk = buffer[pos:lt_index]
|
|
2649
2684
|
self._append_text(chunk)
|
|
2685
|
+
self.current_token_start_pos = lt_index
|
|
2650
2686
|
pos = lt_index + 1
|
|
2651
2687
|
self.pos = pos
|
|
2652
2688
|
# Handle script escaped transition before treating '<' as markup boundary
|
|
@@ -2701,6 +2737,9 @@ class Tokenizer:
|
|
|
2701
2737
|
if c == ">":
|
|
2702
2738
|
attrs: dict[str, str | None] = {}
|
|
2703
2739
|
tag = Tag(Tag.END, tag_name, attrs, False)
|
|
2740
|
+
if self.track_tag_positions:
|
|
2741
|
+
tag.start_pos = self.current_token_start_pos
|
|
2742
|
+
tag.end_pos = self.pos
|
|
2704
2743
|
self._flush_text()
|
|
2705
2744
|
self._emit_token(tag)
|
|
2706
2745
|
self.state = self.DATA
|
|
@@ -2709,6 +2748,7 @@ class Tokenizer:
|
|
|
2709
2748
|
return False
|
|
2710
2749
|
if c in (" ", "\t", "\n", "\r", "\f"):
|
|
2711
2750
|
# Whitespace after tag name - switch to BEFORE_ATTRIBUTE_NAME
|
|
2751
|
+
self._flush_text()
|
|
2712
2752
|
self.current_tag_kind = Tag.END
|
|
2713
2753
|
self.current_tag_attrs = {}
|
|
2714
2754
|
self.state = self.BEFORE_ATTRIBUTE_NAME
|
|
@@ -2866,6 +2906,7 @@ class Tokenizer:
|
|
|
2866
2906
|
|
|
2867
2907
|
if is_appropriate:
|
|
2868
2908
|
if c in (" ", "\t", "\n", "\r", "\f"):
|
|
2909
|
+
self._flush_text()
|
|
2869
2910
|
self.current_tag_kind = Tag.END
|
|
2870
2911
|
self.current_tag_attrs = {}
|
|
2871
2912
|
self.state = self.BEFORE_ATTRIBUTE_NAME
|
|
@@ -2880,6 +2921,9 @@ class Tokenizer:
|
|
|
2880
2921
|
self._flush_text()
|
|
2881
2922
|
attrs: dict[str, str | None] = {}
|
|
2882
2923
|
tag = Tag(Tag.END, tag_name, attrs, False)
|
|
2924
|
+
if self.track_tag_positions:
|
|
2925
|
+
tag.start_pos = self.current_token_start_pos
|
|
2926
|
+
tag.end_pos = self.pos
|
|
2883
2927
|
self._emit_token(tag)
|
|
2884
2928
|
self.state = self.DATA
|
|
2885
2929
|
self.rawtext_tag_name = None
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
from typing import TYPE_CHECKING, Any, cast
|
|
5
|
+
from typing import TYPE_CHECKING, Any, NamedTuple, cast
|
|
6
6
|
|
|
7
7
|
from justhtml.core.constants import (
|
|
8
8
|
BUTTON_SCOPE_TERMINATORS,
|
|
@@ -48,6 +48,12 @@ if TYPE_CHECKING:
|
|
|
48
48
|
from collections.abc import Callable
|
|
49
49
|
|
|
50
50
|
|
|
51
|
+
class _SelectedContentWalkItem(NamedTuple):
|
|
52
|
+
node: Any
|
|
53
|
+
in_disabled_optgroup: bool
|
|
54
|
+
in_datalist: bool
|
|
55
|
+
|
|
56
|
+
|
|
51
57
|
class TreeBuilder(TreeBuilderModesMixin):
|
|
52
58
|
__slots__ = (
|
|
53
59
|
"_body_end_handlers",
|
|
@@ -787,14 +793,11 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
787
793
|
if name not in existing:
|
|
788
794
|
existing[name] = value
|
|
789
795
|
|
|
790
|
-
def _remove_from_open_elements(self, node: Any) -> bool:
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
del self.open_elements[index]
|
|
796
|
-
return True
|
|
797
|
-
return False
|
|
796
|
+
def _remove_from_open_elements(self, node: Any) -> None:
|
|
797
|
+
index = self.open_elements.index(node)
|
|
798
|
+
self._maybe_mark_end_tag(node)
|
|
799
|
+
self._note_open_element_removed(node)
|
|
800
|
+
del self.open_elements[index]
|
|
798
801
|
|
|
799
802
|
def _is_special_element(self, node: Any) -> bool:
|
|
800
803
|
if node.namespace not in {None, "html"}:
|
|
@@ -852,6 +855,15 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
852
855
|
return True
|
|
853
856
|
return False
|
|
854
857
|
|
|
858
|
+
def _has_detached_active_formatting_a(self) -> bool:
|
|
859
|
+
for index in range(len(self.active_formatting) - 1, -1, -1):
|
|
860
|
+
entry = self.active_formatting[index]
|
|
861
|
+
if entry is FORMAT_MARKER:
|
|
862
|
+
break
|
|
863
|
+
if entry["name"] == "a":
|
|
864
|
+
return entry["node"] not in self.open_elements
|
|
865
|
+
return False
|
|
866
|
+
|
|
855
867
|
def _remove_last_active_formatting_by_name(self, name: str) -> None:
|
|
856
868
|
for index in range(len(self.active_formatting) - 1, -1, -1):
|
|
857
869
|
entry = self.active_formatting[index]
|
|
@@ -988,7 +1000,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
988
1000
|
self._append_text(data)
|
|
989
1001
|
return
|
|
990
1002
|
|
|
991
|
-
if self.pending_table_text_should_error:
|
|
1003
|
+
if self.pending_table_text_should_error and self.collect_errors:
|
|
992
1004
|
# html5lib reports one foster-parenting error per non-whitespace character.
|
|
993
1005
|
for ch in data:
|
|
994
1006
|
if ch not in HTML_SPACE_CHARACTERS:
|
|
@@ -1169,7 +1181,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
1169
1181
|
node = self.open_elements[-1]
|
|
1170
1182
|
if node.namespace in {None, "html"}:
|
|
1171
1183
|
return
|
|
1172
|
-
if self._is_html_integration_point(node):
|
|
1184
|
+
if self._is_html_integration_point(node) or self._is_mathml_text_integration_point(node):
|
|
1173
1185
|
return
|
|
1174
1186
|
if self.fragment_context_element is not None and node is self.fragment_context_element:
|
|
1175
1187
|
return
|
|
@@ -1310,59 +1322,78 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
1310
1322
|
Per HTML5 spec: selectedcontent mirrors the content of the selected option,
|
|
1311
1323
|
or the first option if none is selected.
|
|
1312
1324
|
"""
|
|
1313
|
-
|
|
1314
|
-
selects: list[Any] = []
|
|
1315
|
-
self._find_elements(root, "select", selects)
|
|
1316
|
-
|
|
1317
|
-
for select in selects:
|
|
1318
|
-
# Find selectedcontent element in this select
|
|
1319
|
-
selectedcontent = self._find_element(select, "selectedcontent")
|
|
1320
|
-
if not selectedcontent:
|
|
1321
|
-
continue
|
|
1322
|
-
|
|
1323
|
-
# Find all option elements
|
|
1324
|
-
options: list[Any] = []
|
|
1325
|
-
self._find_elements(select, "option", options)
|
|
1326
|
-
|
|
1327
|
-
# Find selected option or use first one
|
|
1328
|
-
selected_option = None
|
|
1329
|
-
for opt in options:
|
|
1330
|
-
if opt.attrs:
|
|
1331
|
-
for attr_name in opt.attrs.keys():
|
|
1332
|
-
if attr_name == "selected":
|
|
1333
|
-
selected_option = opt
|
|
1334
|
-
break
|
|
1335
|
-
if selected_option:
|
|
1336
|
-
break
|
|
1337
|
-
|
|
1338
|
-
if not selected_option:
|
|
1339
|
-
selected_option = options[0]
|
|
1340
|
-
|
|
1341
|
-
# Clone content from selected option to selectedcontent
|
|
1342
|
-
self._clone_children(selected_option, selectedcontent)
|
|
1343
|
-
|
|
1344
|
-
def _find_elements(self, node: Any, name: str, result: list[Any]) -> None:
|
|
1345
|
-
"""Find all elements with given name using iterative preorder traversal."""
|
|
1346
|
-
stack: list[Any] = [node]
|
|
1325
|
+
stack: list[Any] = [root]
|
|
1347
1326
|
while stack:
|
|
1348
1327
|
current = stack.pop()
|
|
1349
|
-
if current.name ==
|
|
1350
|
-
|
|
1328
|
+
if current.name == "select":
|
|
1329
|
+
self._populate_selectedcontent_for_select(current)
|
|
1351
1330
|
|
|
1352
1331
|
if current.has_child_nodes():
|
|
1353
1332
|
stack.extend(reversed(current.children))
|
|
1354
1333
|
|
|
1355
|
-
def
|
|
1356
|
-
|
|
1357
|
-
|
|
1334
|
+
def _populate_selectedcontent_for_select(self, select: Any) -> None:
|
|
1335
|
+
selectedcontents: list[Any] = []
|
|
1336
|
+
first_option = None
|
|
1337
|
+
selected_option = None
|
|
1338
|
+
is_multiple = select.attrs is not None and "multiple" in select.attrs
|
|
1339
|
+
|
|
1340
|
+
stack = [_SelectedContentWalkItem(select, in_disabled_optgroup=False, in_datalist=False)]
|
|
1358
1341
|
while stack:
|
|
1359
|
-
|
|
1360
|
-
|
|
1361
|
-
|
|
1342
|
+
item = stack.pop()
|
|
1343
|
+
current = item.node
|
|
1344
|
+
attrs = getattr(current, "attrs", None)
|
|
1345
|
+
name = current.name
|
|
1346
|
+
if current is not select:
|
|
1347
|
+
if name == "selectedcontent":
|
|
1348
|
+
selectedcontents.append(current)
|
|
1349
|
+
if name == "option" and not item.in_datalist:
|
|
1350
|
+
if first_option is None and self._is_selectedcontent_fallback_option(
|
|
1351
|
+
attrs, item.in_disabled_optgroup
|
|
1352
|
+
):
|
|
1353
|
+
first_option = current
|
|
1354
|
+
if attrs is not None and "selected" in attrs:
|
|
1355
|
+
if is_multiple:
|
|
1356
|
+
if selected_option is None:
|
|
1357
|
+
selected_option = current
|
|
1358
|
+
else:
|
|
1359
|
+
selected_option = current
|
|
1362
1360
|
|
|
1363
1361
|
if current.has_child_nodes():
|
|
1364
|
-
|
|
1365
|
-
|
|
1362
|
+
child_disabled_optgroup = item.in_disabled_optgroup or (
|
|
1363
|
+
name == "optgroup" and attrs is not None and "disabled" in attrs
|
|
1364
|
+
)
|
|
1365
|
+
child_in_datalist = item.in_datalist or name == "datalist"
|
|
1366
|
+
stack.extend(
|
|
1367
|
+
_SelectedContentWalkItem(child, child_disabled_optgroup, child_in_datalist)
|
|
1368
|
+
for child in reversed(current.children)
|
|
1369
|
+
)
|
|
1370
|
+
|
|
1371
|
+
if not selectedcontents:
|
|
1372
|
+
return
|
|
1373
|
+
|
|
1374
|
+
source_option = selected_option or first_option
|
|
1375
|
+
|
|
1376
|
+
for selectedcontent in selectedcontents:
|
|
1377
|
+
if source_option is not None and self._is_descendant_of(selectedcontent, source_option):
|
|
1378
|
+
continue
|
|
1379
|
+
children = selectedcontent.children
|
|
1380
|
+
if children:
|
|
1381
|
+
for child in children:
|
|
1382
|
+
child.parent = None
|
|
1383
|
+
children.clear()
|
|
1384
|
+
if source_option is not None:
|
|
1385
|
+
self._clone_children(source_option, selectedcontent)
|
|
1386
|
+
|
|
1387
|
+
def _is_selectedcontent_fallback_option(self, attrs: Any, in_disabled_optgroup: bool) -> bool:
|
|
1388
|
+
return not in_disabled_optgroup and (attrs is None or "disabled" not in attrs)
|
|
1389
|
+
|
|
1390
|
+
def _is_descendant_of(self, node: Any, ancestor: Any) -> bool:
|
|
1391
|
+
parent = node.parent
|
|
1392
|
+
while parent is not None:
|
|
1393
|
+
if parent is ancestor:
|
|
1394
|
+
return True
|
|
1395
|
+
parent = parent.parent
|
|
1396
|
+
return False
|
|
1366
1397
|
|
|
1367
1398
|
def _clone_children(self, source: Any, target: Any) -> None:
|
|
1368
1399
|
"""Deep clone all children from source to target."""
|
|
@@ -1404,6 +1435,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
1404
1435
|
if not data:
|
|
1405
1436
|
return TokenSinkResult.Continue
|
|
1406
1437
|
if "\x00" in data:
|
|
1438
|
+
self._parse_error("invalid-codepoint")
|
|
1407
1439
|
data = data.replace("\x00", "")
|
|
1408
1440
|
if not data:
|
|
1409
1441
|
return TokenSinkResult.Continue
|