justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- justhtml/__init__.py +28 -0
- justhtml/__main__.py +161 -13
- justhtml/constants.py +17 -1
- justhtml/context.py +7 -1
- justhtml/encoding.py +405 -0
- justhtml/entities.py +57 -17
- justhtml/errors.py +20 -4
- justhtml/linkify.py +438 -0
- justhtml/node.py +738 -41
- justhtml/parser.py +188 -21
- justhtml/py.typed +0 -0
- justhtml/sanitize.py +1141 -0
- justhtml/selector.py +240 -104
- justhtml/serialize.py +418 -57
- justhtml/stream.py +34 -10
- justhtml/tokenizer.py +433 -289
- justhtml/tokens.py +91 -23
- justhtml/transforms.py +690 -0
- justhtml/treebuilder.py +196 -111
- justhtml/treebuilder_modes.py +191 -117
- justhtml/treebuilder_utils.py +11 -4
- justhtml-0.33.0.dist-info/METADATA +196 -0
- justhtml-0.33.0.dist-info/RECORD +26 -0
- justhtml-0.33.0.dist-info/entry_points.txt +2 -0
- {justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.6.0.dist-info/METADATA +0 -126
- justhtml-0.6.0.dist-info/RECORD +0 -20
- {justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/WHEEL +0 -0
justhtml/serialize.py
CHANGED
|
@@ -2,35 +2,288 @@
|
|
|
2
2
|
|
|
3
3
|
# ruff: noqa: PERF401
|
|
4
4
|
|
|
5
|
-
from
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from .constants import FOREIGN_ATTRIBUTE_ADJUSTMENTS, SPECIAL_ELEMENTS, VOID_ELEMENTS, WHITESPACE_PRESERVING_ELEMENTS
|
|
11
|
+
from .sanitize import DEFAULT_DOCUMENT_POLICY, DEFAULT_POLICY, SanitizationPolicy, _sanitize
|
|
12
|
+
|
|
13
|
+
# Matches characters that prevent an attribute value from being unquoted.
|
|
14
|
+
# Note: This matches the logic of the previous loop-based implementation.
|
|
15
|
+
# It checks for space characters, quotes, equals sign, and greater-than.
|
|
16
|
+
_UNQUOTED_ATTR_VALUE_INVALID = re.compile(r'[ \t\n\f\r"\'=>]')
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _escape_text(text: str | None) -> str:
|
|
20
|
+
if not text:
|
|
21
|
+
return ""
|
|
22
|
+
# Minimal, but matches html5lib serializer expectations in core cases.
|
|
23
|
+
return text.replace("&", "&").replace("<", "<").replace(">", ">")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _choose_attr_quote(value: str | None, forced_quote_char: str | None = None) -> str:
|
|
27
|
+
if forced_quote_char in {'"', "'"}:
|
|
28
|
+
return forced_quote_char
|
|
29
|
+
if value is None:
|
|
30
|
+
return '"'
|
|
31
|
+
# value is assumed to be a string
|
|
32
|
+
if '"' in value and "'" not in value:
|
|
33
|
+
return "'"
|
|
34
|
+
return '"'
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _escape_attr_value(value: str | None, quote_char: str, *, escape_lt_in_attrs: bool = False) -> str:
|
|
38
|
+
if value is None:
|
|
39
|
+
return ""
|
|
40
|
+
# value is assumed to be a string
|
|
41
|
+
value = value.replace("&", "&")
|
|
42
|
+
if escape_lt_in_attrs:
|
|
43
|
+
value = value.replace("<", "<")
|
|
44
|
+
# Note: html5lib's default serializer does not escape '>' in attrs.
|
|
45
|
+
if quote_char == '"':
|
|
46
|
+
return value.replace('"', """)
|
|
47
|
+
return value.replace("'", "'")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _can_unquote_attr_value(value: str | None) -> bool:
    """Report whether ``value`` may be serialized without surrounding quotes.

    ``None`` is never unquotable. Otherwise the value qualifies when the
    module-level ``_UNQUOTED_ATTR_VALUE_INVALID`` pattern finds no match in it.
    """
    # A single regex scan replaces a per-character loop.
    return value is not None and _UNQUOTED_ATTR_VALUE_INVALID.search(value) is None
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _serializer_minimize_attr_value(name: str, value: str | None, minimize_boolean_attributes: bool) -> bool:
|
|
58
|
+
if not minimize_boolean_attributes:
|
|
59
|
+
return False
|
|
60
|
+
if value is None or value == "":
|
|
61
|
+
return True
|
|
62
|
+
if value == name:
|
|
63
|
+
return True
|
|
64
|
+
return value.lower() == name
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def serialize_start_tag(
    name: str,
    attrs: dict[str, str | None] | None,
    *,
    quote_attr_values: bool = True,
    minimize_boolean_attributes: bool = True,
    quote_char: str | None = None,
    escape_lt_in_attrs: bool = False,
    use_trailing_solidus: bool = False,
    is_void: bool = False,
) -> str:
    """Serialize an element's start tag, including its attributes.

    Args:
        name: Tag name, written verbatim after ``<``.
        attrs: Attribute mapping in output order; ``None`` or empty means no attributes.
        quote_attr_values: When false, values that ``_can_unquote_attr_value``
            accepts are written without quotes.
        minimize_boolean_attributes: When true, attributes accepted by
            ``_serializer_minimize_attr_value`` are written as a bare name.
        quote_char: Forces the quote character when it is ``'"'`` or ``"'"``.
        escape_lt_in_attrs: When true, ``<`` inside values is written as ``&lt;``.
        use_trailing_solidus: Together with ``is_void``, ends the tag with ``" />"``.
        is_void: Whether the element is a void element.

    Returns:
        The start tag as a string.
    """
    parts: list[str] = ["<", name]
    for key, value in (attrs or {}).items():
        if _serializer_minimize_attr_value(key, value, minimize_boolean_attributes):
            parts.extend([" ", key])
            continue

        # Missing/empty values only reach this point when minimization is off;
        # they are written with an explicit empty value.
        if value is None or value == "":
            parts.extend([" ", key, '=""'])
            continue

        if not quote_attr_values and _can_unquote_attr_value(value):
            # Unquoted values cannot contain quotes, so only "&" (and
            # optionally "<") needs escaping; "&" first to avoid double-escaping.
            escaped = value.replace("&", "&amp;")
            if escape_lt_in_attrs:
                escaped = escaped.replace("<", "&lt;")
            parts.extend([" ", key, "=", escaped])
        else:
            quote = _choose_attr_quote(value, quote_char)
            escaped = _escape_attr_value(value, quote, escape_lt_in_attrs=escape_lt_in_attrs)
            parts.extend([" ", key, "=", quote, escaped, quote])

    parts.append(" />" if use_trailing_solidus and is_void else ">")
    return "".join(parts)
|
|
6
111
|
|
|
7
112
|
|
|
8
|
-
def
|
|
113
|
+
def serialize_end_tag(name: str) -> str:
    """Return the closing tag for ``name``, i.e. ``</name>``."""
    return "</{}>".format(name)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def to_html(
|
|
118
|
+
node: Any,
|
|
119
|
+
indent: int = 0,
|
|
120
|
+
indent_size: int = 2,
|
|
121
|
+
*,
|
|
122
|
+
pretty: bool = True,
|
|
123
|
+
safe: bool = True,
|
|
124
|
+
policy: SanitizationPolicy | None = None,
|
|
125
|
+
) -> str:
|
|
9
126
|
"""Convert node to HTML string."""
|
|
127
|
+
if safe:
|
|
128
|
+
if policy is None and node.name == "#document":
|
|
129
|
+
node = _sanitize(node, policy=DEFAULT_DOCUMENT_POLICY)
|
|
130
|
+
else:
|
|
131
|
+
node = _sanitize(node, policy=policy or DEFAULT_POLICY)
|
|
10
132
|
if node.name == "#document":
|
|
11
133
|
# Document root - just render children
|
|
12
|
-
parts = []
|
|
134
|
+
parts: list[str] = []
|
|
13
135
|
for child in node.children or []:
|
|
14
|
-
parts.append(_node_to_html(child, indent, indent_size, pretty))
|
|
136
|
+
parts.append(_node_to_html(child, indent, indent_size, pretty, in_pre=False))
|
|
15
137
|
return "\n".join(parts) if pretty else "".join(parts)
|
|
16
|
-
return _node_to_html(node, indent, indent_size, pretty)
|
|
138
|
+
return _node_to_html(node, indent, indent_size, pretty, in_pre=False)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _collapse_html_whitespace(text: str) -> str:
|
|
142
|
+
"""Collapse HTML whitespace runs to a single space and trim edges.
|
|
143
|
+
|
|
144
|
+
This matches how HTML rendering treats most whitespace in text nodes, and is
|
|
145
|
+
used only for pretty-printing in non-preformatted contexts.
|
|
146
|
+
"""
|
|
147
|
+
if not text:
|
|
148
|
+
return ""
|
|
149
|
+
|
|
150
|
+
# Optimization: split() handles whitespace collapsing efficiently.
|
|
151
|
+
# Note: split() treats \v as whitespace, which is not HTML whitespace.
|
|
152
|
+
# But \v is extremely rare in HTML.
|
|
153
|
+
if "\v" in text:
|
|
154
|
+
parts: list[str] = []
|
|
155
|
+
in_whitespace = False
|
|
156
|
+
for ch in text:
|
|
157
|
+
if ch in {" ", "\t", "\n", "\f", "\r"}:
|
|
158
|
+
if not in_whitespace:
|
|
159
|
+
parts.append(" ")
|
|
160
|
+
in_whitespace = True
|
|
161
|
+
continue
|
|
17
162
|
|
|
163
|
+
parts.append(ch)
|
|
164
|
+
in_whitespace = False
|
|
18
165
|
|
|
19
|
-
|
|
166
|
+
collapsed = "".join(parts)
|
|
167
|
+
return collapsed.strip(" ")
|
|
168
|
+
|
|
169
|
+
return " ".join(text.split())
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _normalize_formatting_whitespace(text: str) -> str:
|
|
173
|
+
"""Normalize formatting whitespace within a text node.
|
|
174
|
+
|
|
175
|
+
Converts newlines/tabs/CR/FF to regular spaces and collapses runs that
|
|
176
|
+
include such formatting whitespace to a single space.
|
|
177
|
+
|
|
178
|
+
Pure space runs are preserved as-is (so existing double-spaces remain).
|
|
179
|
+
"""
|
|
180
|
+
if not text:
|
|
181
|
+
return ""
|
|
182
|
+
|
|
183
|
+
if "\n" not in text and "\r" not in text and "\t" not in text and "\f" not in text:
|
|
184
|
+
return text
|
|
185
|
+
|
|
186
|
+
starts_with_formatting = text[0] in {"\n", "\r", "\t", "\f"}
|
|
187
|
+
ends_with_formatting = text[-1] in {"\n", "\r", "\t", "\f"}
|
|
188
|
+
|
|
189
|
+
out: list[str] = []
|
|
190
|
+
in_ws = False
|
|
191
|
+
saw_formatting_ws = False
|
|
192
|
+
|
|
193
|
+
for ch in text:
|
|
194
|
+
if ch == " ":
|
|
195
|
+
if in_ws:
|
|
196
|
+
# Only collapse if this whitespace run included formatting whitespace.
|
|
197
|
+
if saw_formatting_ws:
|
|
198
|
+
continue
|
|
199
|
+
out.append(" ")
|
|
200
|
+
continue
|
|
201
|
+
in_ws = True
|
|
202
|
+
saw_formatting_ws = False
|
|
203
|
+
out.append(" ")
|
|
204
|
+
continue
|
|
205
|
+
|
|
206
|
+
if ch in {"\n", "\r", "\t", "\f"}:
|
|
207
|
+
if in_ws:
|
|
208
|
+
saw_formatting_ws = True
|
|
209
|
+
continue
|
|
210
|
+
in_ws = True
|
|
211
|
+
saw_formatting_ws = True
|
|
212
|
+
out.append(" ")
|
|
213
|
+
continue
|
|
214
|
+
|
|
215
|
+
in_ws = False
|
|
216
|
+
saw_formatting_ws = False
|
|
217
|
+
out.append(ch)
|
|
218
|
+
|
|
219
|
+
normalized = "".join(out)
|
|
220
|
+
if starts_with_formatting and normalized.startswith(" "):
|
|
221
|
+
normalized = normalized[1:]
|
|
222
|
+
if ends_with_formatting and normalized.endswith(" "):
|
|
223
|
+
normalized = normalized[:-1]
|
|
224
|
+
return normalized
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _is_whitespace_text_node(node: Any) -> bool:
|
|
228
|
+
return node.name == "#text" and (node.data or "").strip() == ""
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _should_pretty_indent_children(children: list[Any]) -> bool:
    """Decide whether each child may be placed on its own indented line.

    Returns ``False`` as soon as a comment or a text node with non-whitespace
    content is found. Otherwise the decision depends on the element children
    (``None`` entries are ignored):

    * none: ``True``;
    * exactly one: ``True`` if it is in ``SPECIAL_ELEMENTS``, or if it is an
      ``a`` element with at least one child in ``SPECIAL_ELEMENTS``;
    * several: ``True`` unless two neighbouring elements are both outside
      ``SPECIAL_ELEMENTS``.
    """
    elements: list[Any] = []
    for child in children:
        if child is None:
            continue
        kind = child.name
        if kind == "#comment":
            return False
        if kind == "#text":
            if (child.data or "").strip():
                return False
            continue
        elements.append(child)

    if not elements:
        return True

    if len(elements) == 1:
        sole = elements[0]
        if sole.name in SPECIAL_ELEMENTS:
            return True
        if sole.name == "a":
            # If an anchor wraps block-ish content (valid HTML5), treat it as
            # block-ish so the parent can indent it on its own line.
            return any(
                grandchild is not None and grandchild.name in SPECIAL_ELEMENTS
                for grandchild in sole.children or []
            )
        return False

    # Safe indentation rule: inter-element whitespace must never be inserted
    # between two adjacent inline/phrasing elements.
    special_flags = [element.name in SPECIAL_ELEMENTS for element in elements]
    return all(left or right for left, right in zip(special_flags, special_flags[1:]))
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool = True, *, in_pre: bool) -> str:
|
|
20
272
|
"""Helper to convert a node to HTML."""
|
|
21
|
-
prefix = " " * (indent * indent_size) if pretty else ""
|
|
22
|
-
|
|
23
|
-
|
|
273
|
+
prefix = " " * (indent * indent_size) if pretty and not in_pre else ""
|
|
274
|
+
name: str = node.name
|
|
275
|
+
content_pre = in_pre or name in WHITESPACE_PRESERVING_ELEMENTS
|
|
276
|
+
newline = "\n" if pretty and not content_pre else ""
|
|
24
277
|
|
|
25
278
|
# Text node
|
|
26
279
|
if name == "#text":
|
|
27
|
-
text = node.data
|
|
28
|
-
if pretty:
|
|
280
|
+
text: str | None = node.data
|
|
281
|
+
if pretty and not in_pre:
|
|
29
282
|
text = text.strip() if text else ""
|
|
30
283
|
if text:
|
|
31
|
-
return f"{prefix}{text}"
|
|
284
|
+
return f"{prefix}{_escape_text(text)}"
|
|
32
285
|
return ""
|
|
33
|
-
return text
|
|
286
|
+
return _escape_text(text) if text else ""
|
|
34
287
|
|
|
35
288
|
# Comment node
|
|
36
289
|
if name == "#comment":
|
|
@@ -42,58 +295,166 @@ def _node_to_html(node, indent=0, indent_size=2, pretty=True):
|
|
|
42
295
|
|
|
43
296
|
# Document fragment
|
|
44
297
|
if name == "#document-fragment":
|
|
45
|
-
parts = []
|
|
298
|
+
parts: list[str] = []
|
|
46
299
|
for child in node.children or []:
|
|
47
|
-
child_html = _node_to_html(child, indent, indent_size, pretty)
|
|
300
|
+
child_html = _node_to_html(child, indent, indent_size, pretty, in_pre=in_pre)
|
|
48
301
|
if child_html:
|
|
49
302
|
parts.append(child_html)
|
|
50
303
|
return newline.join(parts) if pretty else "".join(parts)
|
|
51
304
|
|
|
52
305
|
# Element node
|
|
53
|
-
attrs = node.attrs or {}
|
|
306
|
+
attrs: dict[str, str | None] = node.attrs or {}
|
|
54
307
|
|
|
55
308
|
# Build opening tag
|
|
56
|
-
|
|
57
|
-
if attrs:
|
|
58
|
-
attr_parts = []
|
|
59
|
-
for key, value in attrs.items():
|
|
60
|
-
if value is None:
|
|
61
|
-
attr_parts.append(key)
|
|
62
|
-
elif value == "":
|
|
63
|
-
attr_parts.append(key)
|
|
64
|
-
else:
|
|
65
|
-
# Escape quotes in attribute values
|
|
66
|
-
escaped = str(value).replace("&", "&").replace('"', """)
|
|
67
|
-
attr_parts.append(f'{key}="{escaped}"')
|
|
68
|
-
if attr_parts: # pragma: no branch
|
|
69
|
-
attr_str = " " + " ".join(attr_parts)
|
|
309
|
+
open_tag = serialize_start_tag(name, attrs)
|
|
70
310
|
|
|
71
311
|
# Void elements
|
|
72
312
|
if name in VOID_ELEMENTS:
|
|
73
|
-
return f"{prefix}
|
|
313
|
+
return f"{prefix}{open_tag}"
|
|
74
314
|
|
|
75
315
|
# Elements with children
|
|
76
|
-
|
|
316
|
+
# Template special handling: HTML templates store contents in `template_content`.
|
|
317
|
+
if name == "template" and node.namespace in {None, "html"} and node.template_content is not None:
|
|
318
|
+
children: list[Any] = node.template_content.children or []
|
|
319
|
+
else:
|
|
320
|
+
children = node.children or []
|
|
77
321
|
if not children:
|
|
78
|
-
return f"{prefix}
|
|
322
|
+
return f"{prefix}{open_tag}{serialize_end_tag(name)}"
|
|
79
323
|
|
|
80
324
|
# Check if all children are text-only (inline rendering)
|
|
81
|
-
all_text = all(
|
|
325
|
+
all_text = all(c.name == "#text" for c in children)
|
|
326
|
+
|
|
327
|
+
if all_text and pretty and not content_pre:
|
|
328
|
+
# Serializer controls sanitization at the to_html() entry point; avoid
|
|
329
|
+
# implicit re-sanitization during rendering.
|
|
330
|
+
text_content = node.to_text(separator="", strip=False, safe=False)
|
|
331
|
+
text_content = _collapse_html_whitespace(text_content)
|
|
332
|
+
return f"{prefix}{open_tag}{_escape_text(text_content)}{serialize_end_tag(name)}"
|
|
333
|
+
|
|
334
|
+
if pretty and content_pre:
|
|
335
|
+
inner = "".join(
|
|
336
|
+
_node_to_html(child, indent + 1, indent_size, pretty, in_pre=True)
|
|
337
|
+
for child in children
|
|
338
|
+
if child is not None
|
|
339
|
+
)
|
|
340
|
+
return f"{prefix}{open_tag}{inner}{serialize_end_tag(name)}"
|
|
341
|
+
|
|
342
|
+
if pretty and not content_pre and not _should_pretty_indent_children(children):
|
|
343
|
+
# For block-ish elements that contain only element children and whitespace-only
|
|
344
|
+
# text nodes, we can still format each child on its own line (only when there
|
|
345
|
+
# is already whitespace separating element siblings).
|
|
346
|
+
if name in SPECIAL_ELEMENTS:
|
|
347
|
+
has_comment = False
|
|
348
|
+
has_element = False
|
|
349
|
+
has_whitespace_between_elements = False
|
|
350
|
+
|
|
351
|
+
first_element_index: int | None = None
|
|
352
|
+
last_element_index: int | None = None
|
|
353
|
+
|
|
354
|
+
previous_was_element = False
|
|
355
|
+
saw_whitespace_since_last_element = False
|
|
356
|
+
for i, child in enumerate(children):
|
|
357
|
+
if child is None:
|
|
358
|
+
continue
|
|
359
|
+
if child.name == "#comment":
|
|
360
|
+
has_comment = True
|
|
361
|
+
break
|
|
362
|
+
if child.name == "#text":
|
|
363
|
+
# Track whether there is already whitespace between element siblings.
|
|
364
|
+
if previous_was_element and not (child.data or "").strip():
|
|
365
|
+
saw_whitespace_since_last_element = True
|
|
366
|
+
continue
|
|
367
|
+
|
|
368
|
+
has_element = True
|
|
369
|
+
if first_element_index is None:
|
|
370
|
+
first_element_index = i
|
|
371
|
+
last_element_index = i
|
|
372
|
+
if previous_was_element and saw_whitespace_since_last_element:
|
|
373
|
+
has_whitespace_between_elements = True
|
|
374
|
+
previous_was_element = True
|
|
375
|
+
saw_whitespace_since_last_element = False
|
|
376
|
+
|
|
377
|
+
can_indent_non_whitespace_text = True
|
|
378
|
+
if has_element and first_element_index is not None and last_element_index is not None:
|
|
379
|
+
for i, child in enumerate(children):
|
|
380
|
+
if child is None or child.name != "#text":
|
|
381
|
+
continue
|
|
382
|
+
if not (child.data or "").strip():
|
|
383
|
+
continue
|
|
384
|
+
# Only allow non-whitespace text *after* the last element.
|
|
385
|
+
# Leading text or text between elements could gain new spaces
|
|
386
|
+
# due to indentation/newlines.
|
|
387
|
+
if i < first_element_index or first_element_index < i < last_element_index:
|
|
388
|
+
can_indent_non_whitespace_text = False
|
|
389
|
+
break
|
|
390
|
+
|
|
391
|
+
if has_element and has_whitespace_between_elements and not has_comment and can_indent_non_whitespace_text:
|
|
392
|
+
inner_lines: list[str] = []
|
|
393
|
+
for child in children:
|
|
394
|
+
if child is None:
|
|
395
|
+
continue
|
|
396
|
+
if child.name == "#text":
|
|
397
|
+
text = _collapse_html_whitespace(child.data or "")
|
|
398
|
+
if text:
|
|
399
|
+
inner_lines.append(f"{' ' * ((indent + 1) * indent_size)}{_escape_text(text)}")
|
|
400
|
+
continue
|
|
401
|
+
child_html = _node_to_html(child, indent + 1, indent_size, pretty=True, in_pre=content_pre)
|
|
402
|
+
if child_html:
|
|
403
|
+
inner_lines.append(child_html)
|
|
404
|
+
if inner_lines:
|
|
405
|
+
inner = "\n".join(inner_lines)
|
|
406
|
+
return f"{prefix}{open_tag}\n{inner}\n{prefix}{serialize_end_tag(name)}"
|
|
407
|
+
|
|
408
|
+
inner_parts: list[str] = []
|
|
409
|
+
|
|
410
|
+
first_non_none_index: int | None = None
|
|
411
|
+
last_non_none_index: int | None = None
|
|
412
|
+
for i, child in enumerate(children):
|
|
413
|
+
if child is None:
|
|
414
|
+
continue
|
|
415
|
+
if first_non_none_index is None:
|
|
416
|
+
first_non_none_index = i
|
|
417
|
+
last_non_none_index = i
|
|
418
|
+
|
|
419
|
+
for i, child in enumerate(children):
|
|
420
|
+
if child is None:
|
|
421
|
+
continue
|
|
422
|
+
|
|
423
|
+
if child.name == "#text":
|
|
424
|
+
data = child.data or ""
|
|
425
|
+
if not data.strip():
|
|
426
|
+
# Drop leading/trailing formatting whitespace in compact mode.
|
|
427
|
+
if i == first_non_none_index or i == last_non_none_index:
|
|
428
|
+
continue
|
|
429
|
+
# Preserve intentional small spacing, but collapse large formatting gaps.
|
|
430
|
+
if "\n" in data or "\r" in data or "\t" in data or len(data) > 2:
|
|
431
|
+
inner_parts.append(" ")
|
|
432
|
+
continue
|
|
433
|
+
|
|
434
|
+
data = _normalize_formatting_whitespace(data)
|
|
435
|
+
child_html = _escape_text(data) if data else ""
|
|
436
|
+
else:
|
|
437
|
+
# Even when we can't safely insert whitespace *between* siblings, we can
|
|
438
|
+
# still pretty-print each element subtree to improve readability.
|
|
439
|
+
child_html = _node_to_html(child, 0, indent_size, pretty=True, in_pre=content_pre)
|
|
440
|
+
if child_html:
|
|
441
|
+
inner_parts.append(child_html)
|
|
82
442
|
|
|
83
|
-
|
|
84
|
-
return f"{prefix}<{name}{attr_str}>{node.text}</{name}>"
|
|
443
|
+
return f"{prefix}{open_tag}{''.join(inner_parts)}{serialize_end_tag(name)}"
|
|
85
444
|
|
|
86
445
|
# Render with child indentation
|
|
87
|
-
parts = [f"{prefix}
|
|
446
|
+
parts = [f"{prefix}{open_tag}"]
|
|
88
447
|
for child in children:
|
|
89
|
-
|
|
448
|
+
if pretty and not content_pre and _is_whitespace_text_node(child):
|
|
449
|
+
continue
|
|
450
|
+
child_html = _node_to_html(child, indent + 1, indent_size, pretty, in_pre=content_pre)
|
|
90
451
|
if child_html:
|
|
91
452
|
parts.append(child_html)
|
|
92
|
-
parts.append(f"{prefix}
|
|
453
|
+
parts.append(f"{prefix}{serialize_end_tag(name)}")
|
|
93
454
|
return newline.join(parts) if pretty else "".join(parts)
|
|
94
455
|
|
|
95
456
|
|
|
96
|
-
def to_test_format(node, indent=0):
|
|
457
|
+
def to_test_format(node: Any, indent: int = 0) -> str:
|
|
97
458
|
"""Convert node to html5lib test format string.
|
|
98
459
|
|
|
99
460
|
This format is used by html5lib-tests for validating parser output.
|
|
@@ -105,26 +466,26 @@ def to_test_format(node, indent=0):
|
|
|
105
466
|
return _node_to_test_format(node, indent)
|
|
106
467
|
|
|
107
468
|
|
|
108
|
-
def _node_to_test_format(node, indent):
|
|
469
|
+
def _node_to_test_format(node: Any, indent: int) -> str:
|
|
109
470
|
"""Helper to convert a node to test format."""
|
|
110
471
|
if node.name == "#comment":
|
|
111
|
-
comment = node.data or ""
|
|
472
|
+
comment: str = node.data or ""
|
|
112
473
|
return f"| {' ' * indent}<!-- {comment} -->"
|
|
113
474
|
|
|
114
475
|
if node.name == "!doctype":
|
|
115
476
|
return _doctype_to_test_format(node)
|
|
116
477
|
|
|
117
478
|
if node.name == "#text":
|
|
118
|
-
text = node.data or ""
|
|
479
|
+
text: str = node.data or ""
|
|
119
480
|
return f'| {" " * indent}"{text}"'
|
|
120
481
|
|
|
121
482
|
# Regular element
|
|
122
483
|
line = f"| {' ' * indent}<{_qualified_name(node)}>"
|
|
123
484
|
attribute_lines = _attrs_to_test_format(node, indent)
|
|
124
485
|
|
|
125
|
-
# Template special handling
|
|
126
|
-
if node.name == "template" and
|
|
127
|
-
sections = [line]
|
|
486
|
+
# Template special handling (only HTML namespace templates have template_content)
|
|
487
|
+
if node.name == "template" and node.namespace in {None, "html"} and node.template_content is not None:
|
|
488
|
+
sections: list[str] = [line]
|
|
128
489
|
if attribute_lines:
|
|
129
490
|
sections.extend(attribute_lines)
|
|
130
491
|
content_line = f"| {' ' * (indent + 2)}content"
|
|
@@ -142,24 +503,24 @@ def _node_to_test_format(node, indent):
|
|
|
142
503
|
return "\n".join(sections)
|
|
143
504
|
|
|
144
505
|
|
|
145
|
-
def _qualified_name(node):
|
|
506
|
+
def _qualified_name(node: Any) -> str:
|
|
146
507
|
"""Get the qualified name of a node (with namespace prefix if needed)."""
|
|
147
508
|
if node.namespace and node.namespace not in {"html", None}:
|
|
148
509
|
return f"{node.namespace} {node.name}"
|
|
149
|
-
return node.name
|
|
510
|
+
return str(node.name)
|
|
150
511
|
|
|
151
512
|
|
|
152
|
-
def _attrs_to_test_format(node, indent):
|
|
513
|
+
def _attrs_to_test_format(node: Any, indent: int) -> list[str]:
|
|
153
514
|
"""Format element attributes for test output."""
|
|
154
515
|
if not node.attrs:
|
|
155
516
|
return []
|
|
156
517
|
|
|
157
|
-
formatted = []
|
|
518
|
+
formatted: list[str] = []
|
|
158
519
|
padding = " " * (indent + 2)
|
|
159
520
|
|
|
160
521
|
# Prepare display names for sorting
|
|
161
|
-
display_attrs = []
|
|
162
|
-
namespace = node.namespace
|
|
522
|
+
display_attrs: list[tuple[str, str]] = []
|
|
523
|
+
namespace: str | None = node.namespace
|
|
163
524
|
for attr_name, attr_value in node.attrs.items():
|
|
164
525
|
value = attr_value or ""
|
|
165
526
|
display_name = attr_name
|
|
@@ -177,15 +538,15 @@ def _attrs_to_test_format(node, indent):
|
|
|
177
538
|
return formatted
|
|
178
539
|
|
|
179
540
|
|
|
180
|
-
def _doctype_to_test_format(node):
|
|
541
|
+
def _doctype_to_test_format(node: Any) -> str:
|
|
181
542
|
"""Format DOCTYPE node for test output."""
|
|
182
543
|
doctype = node.data
|
|
183
544
|
|
|
184
|
-
name = doctype.name or ""
|
|
185
|
-
public_id = doctype.public_id
|
|
186
|
-
system_id = doctype.system_id
|
|
545
|
+
name: str = doctype.name or ""
|
|
546
|
+
public_id: str | None = doctype.public_id
|
|
547
|
+
system_id: str | None = doctype.system_id
|
|
187
548
|
|
|
188
|
-
parts = ["| <!DOCTYPE"]
|
|
549
|
+
parts: list[str] = ["| <!DOCTYPE"]
|
|
189
550
|
if name:
|
|
190
551
|
parts.append(f" {name}")
|
|
191
552
|
else:
|
justhtml/stream.py
CHANGED
|
@@ -1,15 +1,33 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from collections.abc import Generator
|
|
7
|
+
|
|
8
|
+
from .encoding import decode_html
|
|
1
9
|
from .tokenizer import Tokenizer
|
|
2
10
|
from .tokens import CommentToken, DoctypeToken, Tag
|
|
3
11
|
|
|
12
|
+
# Type alias for stream events
|
|
13
|
+
StreamEvent = tuple[str, Any]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class _DummyNode:
    """Placeholder pushed onto ``StreamSink.open_elements`` for each start tag.

    The tokenizer only inspects ``stack[-1].namespace`` on that stack, so a
    class-level ``namespace`` attribute is all this object has to provide.
    """

    # Every streamed element is reported as belonging to the HTML namespace.
    namespace: str = "html"
|
|
18
|
+
|
|
4
19
|
|
|
5
20
|
class StreamSink:
|
|
6
21
|
"""A sink that buffers tokens for the stream API."""
|
|
7
22
|
|
|
8
|
-
|
|
23
|
+
tokens: list[StreamEvent]
|
|
24
|
+
open_elements: list[_DummyNode]
|
|
25
|
+
|
|
26
|
+
def __init__(self) -> None:
|
|
9
27
|
self.tokens = []
|
|
10
28
|
self.open_elements = [] # Required by tokenizer for rawtext checks
|
|
11
29
|
|
|
12
|
-
def process_token(self, token):
|
|
30
|
+
def process_token(self, token: Tag | CommentToken | DoctypeToken | Any) -> int:
|
|
13
31
|
# Tokenizer reuses token objects, so we must copy data
|
|
14
32
|
if isinstance(token, Tag):
|
|
15
33
|
# Copy tag data
|
|
@@ -24,10 +42,7 @@ class StreamSink:
|
|
|
24
42
|
# We need a dummy object with namespace for tokenizer checks
|
|
25
43
|
# Tokenizer checks: stack[-1].namespace
|
|
26
44
|
# We can just use a simple object
|
|
27
|
-
|
|
28
|
-
namespace = "html"
|
|
29
|
-
|
|
30
|
-
self.open_elements.append(DummyNode())
|
|
45
|
+
self.open_elements.append(_DummyNode())
|
|
31
46
|
else: # Tag.END
|
|
32
47
|
if self.open_elements:
|
|
33
48
|
self.open_elements.pop()
|
|
@@ -43,19 +58,28 @@ class StreamSink:
|
|
|
43
58
|
|
|
44
59
|
return 0 # TokenSinkResult.Continue
|
|
45
60
|
|
|
46
|
-
def process_characters(self, data):
|
|
61
|
+
def process_characters(self, data: str) -> None:
|
|
47
62
|
"""Handle character data from tokenizer."""
|
|
48
63
|
self.tokens.append(("text", data))
|
|
49
64
|
|
|
50
65
|
|
|
51
|
-
def stream(
|
|
66
|
+
def stream(
|
|
67
|
+
html: str | bytes | bytearray | memoryview,
|
|
68
|
+
*,
|
|
69
|
+
encoding: str | None = None,
|
|
70
|
+
) -> Generator[StreamEvent, None, None]:
|
|
52
71
|
"""
|
|
53
72
|
Stream HTML events from the given HTML string.
|
|
54
73
|
Yields tuples of (event_type, data).
|
|
55
74
|
"""
|
|
75
|
+
html_str: str
|
|
76
|
+
if isinstance(html, (bytes, bytearray, memoryview)):
|
|
77
|
+
html_str, _ = decode_html(bytes(html), transport_encoding=encoding)
|
|
78
|
+
else:
|
|
79
|
+
html_str = html
|
|
56
80
|
sink = StreamSink()
|
|
57
81
|
tokenizer = Tokenizer(sink)
|
|
58
|
-
tokenizer.initialize(
|
|
82
|
+
tokenizer.initialize(html_str)
|
|
59
83
|
|
|
60
84
|
while True:
|
|
61
85
|
# Run one step of the tokenizer
|
|
@@ -64,7 +88,7 @@ def stream(html):
|
|
|
64
88
|
# Yield any tokens produced by this step
|
|
65
89
|
if sink.tokens:
|
|
66
90
|
# Coalesce text tokens
|
|
67
|
-
text_buffer = []
|
|
91
|
+
text_buffer: list[str] = []
|
|
68
92
|
for event, data in sink.tokens:
|
|
69
93
|
if event == "text":
|
|
70
94
|
text_buffer.append(data)
|