justhtml-0.12.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +17 -0
- justhtml/__main__.py +144 -0
- justhtml/constants.py +445 -0
- justhtml/context.py +12 -0
- justhtml/encoding.py +405 -0
- justhtml/entities.py +344 -0
- justhtml/errors.py +140 -0
- justhtml/node.py +632 -0
- justhtml/parser.py +131 -0
- justhtml/py.typed +0 -0
- justhtml/selector.py +965 -0
- justhtml/serialize.py +258 -0
- justhtml/stream.py +107 -0
- justhtml/tokenizer.py +2647 -0
- justhtml/tokens.py +223 -0
- justhtml/treebuilder.py +1279 -0
- justhtml/treebuilder_modes.py +2016 -0
- justhtml/treebuilder_utils.py +93 -0
- justhtml-0.12.0.dist-info/METADATA +164 -0
- justhtml-0.12.0.dist-info/RECORD +23 -0
- justhtml-0.12.0.dist-info/WHEEL +4 -0
- justhtml-0.12.0.dist-info/entry_points.txt +2 -0
- justhtml-0.12.0.dist-info/licenses/LICENSE +21 -0
justhtml/serialize.py
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
"""HTML serialization utilities for JustHTML DOM nodes."""
|
|
2
|
+
|
|
3
|
+
# ruff: noqa: PERF401
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from .constants import FOREIGN_ATTRIBUTE_ADJUSTMENTS, VOID_ELEMENTS
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _escape_text(text: str | None) -> str:
|
|
13
|
+
if not text:
|
|
14
|
+
return ""
|
|
15
|
+
# Minimal, but matches html5lib serializer expectations in core cases.
|
|
16
|
+
return str(text).replace("&", "&").replace("<", "<").replace(">", ">")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _choose_attr_quote(value: str | None) -> str:
|
|
20
|
+
if value is None:
|
|
21
|
+
return '"'
|
|
22
|
+
value = str(value)
|
|
23
|
+
if '"' in value and "'" not in value:
|
|
24
|
+
return "'"
|
|
25
|
+
return '"'
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _escape_attr_value(value: str | None, quote_char: str) -> str:
|
|
29
|
+
if value is None:
|
|
30
|
+
return ""
|
|
31
|
+
value = str(value)
|
|
32
|
+
value = value.replace("&", "&")
|
|
33
|
+
# Note: html5lib's default serializer does not escape '>' in attrs.
|
|
34
|
+
if quote_char == '"':
|
|
35
|
+
return value.replace('"', """)
|
|
36
|
+
return value.replace("'", "'")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _can_unquote_attr_value(value: str | None) -> bool:
|
|
40
|
+
if value is None:
|
|
41
|
+
return False
|
|
42
|
+
value = str(value)
|
|
43
|
+
# html5lib's serializer unquotes aggressively; match fixture expectations.
|
|
44
|
+
# Disallow whitespace and characters that would terminate/ambiguate the value.
|
|
45
|
+
for ch in value:
|
|
46
|
+
if ch == ">":
|
|
47
|
+
return False
|
|
48
|
+
if ch in {'"', "'", "="}:
|
|
49
|
+
return False
|
|
50
|
+
if ch in {" ", "\t", "\n", "\f", "\r"}:
|
|
51
|
+
return False
|
|
52
|
+
return True
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def serialize_start_tag(name: str, attrs: dict[str, str | None] | None) -> str:
    """Serialize an element's start tag, including its attributes.

    Attributes are written in the mapping's iteration order:

    * a value of ``None`` or ``""`` produces a bare attribute name;
    * a value accepted by ``_can_unquote_attr_value`` is written unquoted,
      with ``&`` escaped as ``&amp;``;
    * any other value is wrapped in the quote chosen by
      ``_choose_attr_quote`` and escaped by ``_escape_attr_value``.

    *attrs* may be ``None`` or empty, in which case only ``<name>`` is
    returned.
    """
    parts: list[str] = ["<", name]
    for key, value in (attrs or {}).items():
        if value is None or value == "":
            parts.extend([" ", key])
        elif _can_unquote_attr_value(value):
            # An unquotable value contains no quote characters, so the
            # ampersand is the only character that still needs escaping.
            escaped = str(value).replace("&", "&amp;")
            parts.extend([" ", key, "=", escaped])
        else:
            quote = _choose_attr_quote(value)
            escaped = _escape_attr_value(value, quote)
            parts.extend([" ", key, "=", quote, escaped, quote])
    parts.append(">")
    return "".join(parts)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def serialize_end_tag(name: str) -> str:
    """Return the end-tag markup for the element called *name*."""
    return "</{}>".format(name)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def to_html(node: Any, indent: int = 0, indent_size: int = 2, *, pretty: bool = True) -> str:
    """Serialize *node* to an HTML string.

    A ``#document`` node has no markup of its own: each child is rendered
    with ``_node_to_html`` and the results are joined with a newline when
    *pretty* is true, or concatenated directly otherwise. Any other node is
    handed straight to ``_node_to_html``.

    *indent* is the starting nesting depth and *indent_size* the number of
    spaces per depth level; both are forwarded unchanged.
    """
    if node.name != "#document":
        return _node_to_html(node, indent, indent_size, pretty)

    # Document root - just render children
    rendered = [_node_to_html(child, indent, indent_size, pretty) for child in node.children or []]
    separator = "\n" if pretty else ""
    return separator.join(rendered)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool = True) -> str:
|
|
90
|
+
"""Helper to convert a node to HTML."""
|
|
91
|
+
prefix = " " * (indent * indent_size) if pretty else ""
|
|
92
|
+
newline = "\n" if pretty else ""
|
|
93
|
+
name: str = node.name
|
|
94
|
+
|
|
95
|
+
# Text node
|
|
96
|
+
if name == "#text":
|
|
97
|
+
text: str | None = node.data
|
|
98
|
+
if pretty:
|
|
99
|
+
text = text.strip() if text else ""
|
|
100
|
+
if text:
|
|
101
|
+
return f"{prefix}{_escape_text(text)}"
|
|
102
|
+
return ""
|
|
103
|
+
return _escape_text(text) if text else ""
|
|
104
|
+
|
|
105
|
+
# Comment node
|
|
106
|
+
if name == "#comment":
|
|
107
|
+
return f"{prefix}<!--{node.data or ''}-->"
|
|
108
|
+
|
|
109
|
+
# Doctype
|
|
110
|
+
if name == "!doctype":
|
|
111
|
+
return f"{prefix}<!DOCTYPE html>"
|
|
112
|
+
|
|
113
|
+
# Document fragment
|
|
114
|
+
if name == "#document-fragment":
|
|
115
|
+
parts: list[str] = []
|
|
116
|
+
for child in node.children or []:
|
|
117
|
+
child_html = _node_to_html(child, indent, indent_size, pretty)
|
|
118
|
+
if child_html:
|
|
119
|
+
parts.append(child_html)
|
|
120
|
+
return newline.join(parts) if pretty else "".join(parts)
|
|
121
|
+
|
|
122
|
+
# Element node
|
|
123
|
+
attrs: dict[str, str | None] = node.attrs or {}
|
|
124
|
+
|
|
125
|
+
# Build opening tag
|
|
126
|
+
open_tag = serialize_start_tag(name, attrs)
|
|
127
|
+
|
|
128
|
+
# Void elements
|
|
129
|
+
if name in VOID_ELEMENTS:
|
|
130
|
+
return f"{prefix}{open_tag}"
|
|
131
|
+
|
|
132
|
+
# Elements with children
|
|
133
|
+
children: list[Any] = node.children or []
|
|
134
|
+
if not children:
|
|
135
|
+
return f"{prefix}{open_tag}{serialize_end_tag(name)}"
|
|
136
|
+
|
|
137
|
+
# Check if all children are text-only (inline rendering)
|
|
138
|
+
all_text = all(c.name == "#text" for c in children)
|
|
139
|
+
|
|
140
|
+
if all_text and pretty:
|
|
141
|
+
return f"{prefix}{open_tag}{_escape_text(node.to_text(separator='', strip=False))}{serialize_end_tag(name)}"
|
|
142
|
+
|
|
143
|
+
# Render with child indentation
|
|
144
|
+
parts = [f"{prefix}{open_tag}"]
|
|
145
|
+
for child in children:
|
|
146
|
+
child_html = _node_to_html(child, indent + 1, indent_size, pretty)
|
|
147
|
+
if child_html:
|
|
148
|
+
parts.append(child_html)
|
|
149
|
+
parts.append(f"{prefix}{serialize_end_tag(name)}")
|
|
150
|
+
return newline.join(parts) if pretty else "".join(parts)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def to_test_format(node: Any, indent: int = 0) -> str:
    """Convert node to html5lib test format string.

    This format is used by html5lib-tests for validating parser output.
    Uses '| ' prefixes and specific indentation rules.

    ``#document`` and ``#document-fragment`` nodes emit no line of their own:
    their children are rendered at depth 0 (*indent* is ignored) and joined
    with newlines. A container whose ``children`` is ``None`` or empty yields
    an empty string. Any other node is rendered at depth *indent*.
    """
    if node.name in {"#document", "#document-fragment"}:
        # Tolerate a missing child list, as to_html does.
        return "\n".join(_node_to_test_format(child, 0) for child in node.children or [])
    return _node_to_test_format(node, indent)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _node_to_test_format(node: Any, indent: int) -> str:
|
|
166
|
+
"""Helper to convert a node to test format."""
|
|
167
|
+
if node.name == "#comment":
|
|
168
|
+
comment: str = node.data or ""
|
|
169
|
+
return f"| {' ' * indent}<!-- {comment} -->"
|
|
170
|
+
|
|
171
|
+
if node.name == "!doctype":
|
|
172
|
+
return _doctype_to_test_format(node)
|
|
173
|
+
|
|
174
|
+
if node.name == "#text":
|
|
175
|
+
text: str = node.data or ""
|
|
176
|
+
return f'| {" " * indent}"{text}"'
|
|
177
|
+
|
|
178
|
+
# Regular element
|
|
179
|
+
line = f"| {' ' * indent}<{_qualified_name(node)}>"
|
|
180
|
+
attribute_lines = _attrs_to_test_format(node, indent)
|
|
181
|
+
|
|
182
|
+
# Template special handling (only HTML namespace templates have template_content)
|
|
183
|
+
if node.name == "template" and node.namespace in {None, "html"} and node.template_content:
|
|
184
|
+
sections: list[str] = [line]
|
|
185
|
+
if attribute_lines:
|
|
186
|
+
sections.extend(attribute_lines)
|
|
187
|
+
content_line = f"| {' ' * (indent + 2)}content"
|
|
188
|
+
sections.append(content_line)
|
|
189
|
+
sections.extend(_node_to_test_format(child, indent + 4) for child in node.template_content.children)
|
|
190
|
+
return "\n".join(sections)
|
|
191
|
+
|
|
192
|
+
# Regular element with children
|
|
193
|
+
child_lines = [_node_to_test_format(child, indent + 2) for child in node.children] if node.children else []
|
|
194
|
+
|
|
195
|
+
sections = [line]
|
|
196
|
+
if attribute_lines:
|
|
197
|
+
sections.extend(attribute_lines)
|
|
198
|
+
sections.extend(child_lines)
|
|
199
|
+
return "\n".join(sections)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _qualified_name(node: Any) -> str:
|
|
203
|
+
"""Get the qualified name of a node (with namespace prefix if needed)."""
|
|
204
|
+
if node.namespace and node.namespace not in {"html", None}:
|
|
205
|
+
return f"{node.namespace} {node.name}"
|
|
206
|
+
return str(node.name)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _attrs_to_test_format(node: Any, indent: int) -> list[str]:
|
|
210
|
+
"""Format element attributes for test output."""
|
|
211
|
+
if not node.attrs:
|
|
212
|
+
return []
|
|
213
|
+
|
|
214
|
+
formatted: list[str] = []
|
|
215
|
+
padding = " " * (indent + 2)
|
|
216
|
+
|
|
217
|
+
# Prepare display names for sorting
|
|
218
|
+
display_attrs: list[tuple[str, str]] = []
|
|
219
|
+
namespace: str | None = node.namespace
|
|
220
|
+
for attr_name, attr_value in node.attrs.items():
|
|
221
|
+
value = attr_value or ""
|
|
222
|
+
display_name = attr_name
|
|
223
|
+
if namespace and namespace not in {None, "html"}:
|
|
224
|
+
lower_name = attr_name.lower()
|
|
225
|
+
if lower_name in FOREIGN_ATTRIBUTE_ADJUSTMENTS:
|
|
226
|
+
display_name = attr_name.replace(":", " ")
|
|
227
|
+
display_attrs.append((display_name, value))
|
|
228
|
+
|
|
229
|
+
# Sort by display name for canonical test output
|
|
230
|
+
display_attrs.sort(key=lambda x: x[0])
|
|
231
|
+
|
|
232
|
+
for display_name, value in display_attrs:
|
|
233
|
+
formatted.append(f'| {padding}{display_name}="{value}"')
|
|
234
|
+
return formatted
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _doctype_to_test_format(node: Any) -> str:
|
|
238
|
+
"""Format DOCTYPE node for test output."""
|
|
239
|
+
doctype = node.data
|
|
240
|
+
|
|
241
|
+
name: str = doctype.name or ""
|
|
242
|
+
public_id: str | None = doctype.public_id
|
|
243
|
+
system_id: str | None = doctype.system_id
|
|
244
|
+
|
|
245
|
+
parts: list[str] = ["| <!DOCTYPE"]
|
|
246
|
+
if name:
|
|
247
|
+
parts.append(f" {name}")
|
|
248
|
+
else:
|
|
249
|
+
parts.append(" ")
|
|
250
|
+
|
|
251
|
+
if public_id is not None or system_id is not None:
|
|
252
|
+
pub = public_id if public_id is not None else ""
|
|
253
|
+
sys = system_id if system_id is not None else ""
|
|
254
|
+
parts.append(f' "{pub}"')
|
|
255
|
+
parts.append(f' "{sys}"')
|
|
256
|
+
|
|
257
|
+
parts.append(">")
|
|
258
|
+
return "".join(parts)
|
justhtml/stream.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from collections.abc import Generator
|
|
7
|
+
|
|
8
|
+
from .encoding import decode_html
|
|
9
|
+
from .tokenizer import Tokenizer
|
|
10
|
+
from .tokens import CommentToken, DoctypeToken, Tag
|
|
11
|
+
|
|
12
|
+
# Stream event: an (event_type, payload) pair as produced by the stream API.
StreamEvent = tuple[str, Any]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class _DummyNode:
|
|
17
|
+
namespace: str = "html"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class StreamSink:
    """Token sink that buffers tokenizer output as stream events."""

    # Buffered (event_type, payload) pairs, in arrival order.
    tokens: list[StreamEvent]
    # Placeholder stack: pushed on start tags, popped on end tags.
    open_elements: list[_DummyNode]

    def __init__(self) -> None:
        self.tokens = []
        self.open_elements = []  # Required by tokenizer for rawtext checks

    def process_token(self, token: Tag | CommentToken | DoctypeToken | Any) -> int:
        """Buffer *token* as a stream event and return 0 (continue).

        Start tags become ``("start", (name, attrs_copy))``, end tags
        ``("end", name)``, comments ``("comment", data)`` and doctypes
        ``("doctype", (name, public_id, system_id))``. Tokens of any other
        type are ignored.
        """
        # Tokenizer reuses token objects, so we must copy data
        if isinstance(token, Tag):
            if token.kind == Tag.START:
                self.tokens.append(("start", (token.name, token.attrs.copy())))
                # The tokenizer inspects stack[-1].namespace, so a simple
                # placeholder object with a namespace is all it needs.
                self.open_elements.append(_DummyNode())
            else:  # Tag.END
                self.tokens.append(("end", token.name))
                # With an empty stack this is an unmatched end tag at the
                # root level; there is nothing to pop for rawtext tracking.
                if self.open_elements:
                    self.open_elements.pop()
        elif isinstance(token, CommentToken):
            self.tokens.append(("comment", token.data))
        elif isinstance(token, DoctypeToken):
            doctype = token.doctype
            self.tokens.append(("doctype", (doctype.name, doctype.public_id, doctype.system_id)))

        return 0  # TokenSinkResult.Continue

    def process_characters(self, data: str) -> None:
        """Buffer character data from the tokenizer as a ``"text"`` event."""
        self.tokens.append(("text", data))
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def stream(
    html: str | bytes | bytearray | memoryview,
    *,
    encoding: str | None = None,
) -> Generator[StreamEvent, None, None]:
    """
    Stream HTML events from the given HTML input.
    Yields tuples of (event_type, data).

    Bytes-like input is first decoded with ``decode_html``, passing
    *encoding* as the transport encoding; ``str`` input is used as-is.
    The tokenizer is advanced one step at a time and the events buffered
    during each step are yielded before the next step runs. Consecutive
    text events produced within a single step are merged into one.
    """
    html_str: str
    if isinstance(html, (bytes, bytearray, memoryview)):
        html_str, _ = decode_html(bytes(html), transport_encoding=encoding)
    else:
        html_str = html

    sink = StreamSink()
    tokenizer = Tokenizer(sink)
    tokenizer.initialize(html_str)

    reached_eof = False
    while not reached_eof:
        # Run one step of the tokenizer
        reached_eof = tokenizer.step()

        if not sink.tokens:
            continue

        # Yield what this step produced, coalescing runs of text.
        pending_text: list[str] = []
        for event, data in sink.tokens:
            if event == "text":
                pending_text.append(data)
                continue
            if pending_text:
                yield ("text", "".join(pending_text))
                pending_text = []
            yield (event, data)

        if pending_text:
            yield ("text", "".join(pending_text))

        sink.tokens.clear()
|