justhtml 0.12.0__py3-none-any.whl → 0.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +48 -0
- justhtml/__main__.py +86 -17
- justhtml/constants.py +12 -0
- justhtml/entities.py +45 -7
- justhtml/errors.py +17 -3
- justhtml/linkify.py +438 -0
- justhtml/node.py +385 -97
- justhtml/parser.py +139 -16
- justhtml/sanitize.py +992 -0
- justhtml/selector.py +117 -19
- justhtml/serialize.py +671 -41
- justhtml/tokenizer.py +364 -194
- justhtml/tokens.py +28 -5
- justhtml/transforms.py +2568 -0
- justhtml/treebuilder.py +297 -204
- justhtml/treebuilder_modes.py +208 -138
- justhtml-0.38.0.dist-info/METADATA +213 -0
- justhtml-0.38.0.dist-info/RECORD +26 -0
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.12.0.dist-info/METADATA +0 -164
- justhtml-0.12.0.dist-info/RECORD +0 -23
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/WHEEL +0 -0
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/entry_points.txt +0 -0
justhtml/serialize.py
CHANGED
|
@@ -4,32 +4,42 @@
|
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
|
+
import re
|
|
7
8
|
from typing import Any
|
|
8
9
|
|
|
9
|
-
from .constants import FOREIGN_ATTRIBUTE_ADJUSTMENTS, VOID_ELEMENTS
|
|
10
|
+
from .constants import FOREIGN_ATTRIBUTE_ADJUSTMENTS, SPECIAL_ELEMENTS, VOID_ELEMENTS, WHITESPACE_PRESERVING_ELEMENTS
|
|
11
|
+
|
|
12
|
+
# Matches characters that prevent an attribute value from being unquoted.
|
|
13
|
+
# Note: This matches the logic of the previous loop-based implementation.
|
|
14
|
+
# It checks for space characters, quotes, equals sign, and greater-than.
|
|
15
|
+
_UNQUOTED_ATTR_VALUE_INVALID = re.compile(r'[ \t\n\f\r"\'=>]')
|
|
10
16
|
|
|
11
17
|
|
|
12
18
|
def _escape_text(text: str | None) -> str:
|
|
13
19
|
if not text:
|
|
14
20
|
return ""
|
|
15
21
|
# Minimal, but matches html5lib serializer expectations in core cases.
|
|
16
|
-
return
|
|
22
|
+
return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
|
|
17
23
|
|
|
18
24
|
|
|
19
|
-
def _choose_attr_quote(value: str | None) -> str:
|
|
25
|
+
def _choose_attr_quote(value: str | None, forced_quote_char: str | None = None) -> str:
|
|
26
|
+
if forced_quote_char in {'"', "'"}:
|
|
27
|
+
return forced_quote_char
|
|
20
28
|
if value is None:
|
|
21
29
|
return '"'
|
|
22
|
-
value
|
|
30
|
+
# value is assumed to be a string
|
|
23
31
|
if '"' in value and "'" not in value:
|
|
24
32
|
return "'"
|
|
25
33
|
return '"'
|
|
26
34
|
|
|
27
35
|
|
|
28
|
-
def _escape_attr_value(value: str | None, quote_char: str) -> str:
|
|
36
|
+
def _escape_attr_value(value: str | None, quote_char: str, *, escape_lt_in_attrs: bool = False) -> str:
|
|
29
37
|
if value is None:
|
|
30
38
|
return ""
|
|
31
|
-
value
|
|
39
|
+
# value is assumed to be a string
|
|
32
40
|
value = value.replace("&", "&amp;")
|
|
41
|
+
if escape_lt_in_attrs:
|
|
42
|
+
value = value.replace("<", "&lt;")
|
|
33
43
|
# Note: html5lib's default serializer does not escape '>' in attrs.
|
|
34
44
|
if quote_char == '"':
|
|
35
45
|
return value.replace('"', "&quot;")
|
|
@@ -39,35 +49,63 @@ def _escape_attr_value(value: str | None, quote_char: str) -> str:
|
|
|
39
49
|
def _can_unquote_attr_value(value: str | None) -> bool:
|
|
40
50
|
if value is None:
|
|
41
51
|
return False
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
# Disallow whitespace and characters that would terminate/ambiguate the value.
|
|
45
|
-
for ch in value:
|
|
46
|
-
if ch == ">":
|
|
47
|
-
return False
|
|
48
|
-
if ch in {'"', "'", "="}:
|
|
49
|
-
return False
|
|
50
|
-
if ch in {" ", "\t", "\n", "\f", "\r"}:
|
|
51
|
-
return False
|
|
52
|
-
return True
|
|
52
|
+
# Optimization: use regex instead of loop
|
|
53
|
+
return not _UNQUOTED_ATTR_VALUE_INVALID.search(value)
|
|
53
54
|
|
|
54
55
|
|
|
55
|
-
def
|
|
56
|
+
def _serializer_minimize_attr_value(name: str, value: str | None, minimize_boolean_attributes: bool) -> bool:
|
|
57
|
+
if not minimize_boolean_attributes:
|
|
58
|
+
return False
|
|
59
|
+
if value is None or value == "":
|
|
60
|
+
return True
|
|
61
|
+
if value == name:
|
|
62
|
+
return True
|
|
63
|
+
return value.lower() == name
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def serialize_start_tag(
|
|
67
|
+
name: str,
|
|
68
|
+
attrs: dict[str, str | None] | None,
|
|
69
|
+
*,
|
|
70
|
+
quote_attr_values: bool = True,
|
|
71
|
+
minimize_boolean_attributes: bool = True,
|
|
72
|
+
quote_char: str | None = None,
|
|
73
|
+
escape_lt_in_attrs: bool = False,
|
|
74
|
+
use_trailing_solidus: bool = False,
|
|
75
|
+
is_void: bool = False,
|
|
76
|
+
) -> str:
|
|
56
77
|
attrs = attrs or {}
|
|
57
78
|
parts: list[str] = ["<", name]
|
|
58
79
|
if attrs:
|
|
59
80
|
for key, value in attrs.items():
|
|
60
|
-
if
|
|
81
|
+
if _serializer_minimize_attr_value(key, value, minimize_boolean_attributes):
|
|
61
82
|
parts.extend([" ", key])
|
|
83
|
+
continue
|
|
84
|
+
|
|
85
|
+
if value is None:
|
|
86
|
+
parts.extend([" ", key, '=""'])
|
|
87
|
+
continue
|
|
88
|
+
|
|
89
|
+
# value is guaranteed to be a string here because attrs is dict[str, str | None]
|
|
90
|
+
value_str = value
|
|
91
|
+
if value_str == "":
|
|
92
|
+
parts.extend([" ", key, '=""'])
|
|
93
|
+
continue
|
|
94
|
+
|
|
95
|
+
if not quote_attr_values and _can_unquote_attr_value(value_str):
|
|
96
|
+
escaped = value_str.replace("&", "&amp;")
|
|
97
|
+
if escape_lt_in_attrs:
|
|
98
|
+
escaped = escaped.replace("<", "&lt;")
|
|
99
|
+
parts.extend([" ", key, "=", escaped])
|
|
62
100
|
else:
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
101
|
+
quote = _choose_attr_quote(value_str, quote_char)
|
|
102
|
+
escaped = _escape_attr_value(value_str, quote, escape_lt_in_attrs=escape_lt_in_attrs)
|
|
103
|
+
parts.extend([" ", key, "=", quote, escaped, quote])
|
|
104
|
+
|
|
105
|
+
if use_trailing_solidus and is_void:
|
|
106
|
+
parts.append(" />")
|
|
107
|
+
else:
|
|
108
|
+
parts.append(">")
|
|
71
109
|
return "".join(parts)
|
|
72
110
|
|
|
73
111
|
|
|
@@ -75,27 +113,299 @@ def serialize_end_tag(name: str) -> str:
|
|
|
75
113
|
return f"</{name}>"
|
|
76
114
|
|
|
77
115
|
|
|
78
|
-
def to_html(
|
|
116
|
+
def to_html(
|
|
117
|
+
node: Any,
|
|
118
|
+
indent: int = 0,
|
|
119
|
+
indent_size: int = 2,
|
|
120
|
+
*,
|
|
121
|
+
pretty: bool = True,
|
|
122
|
+
) -> str:
|
|
79
123
|
"""Convert node to HTML string."""
|
|
80
124
|
if node.name == "#document":
|
|
81
125
|
# Document root - just render children
|
|
82
126
|
parts: list[str] = []
|
|
83
127
|
for child in node.children or []:
|
|
84
|
-
parts.append(_node_to_html(child, indent, indent_size, pretty))
|
|
128
|
+
parts.append(_node_to_html(child, indent, indent_size, pretty, in_pre=False))
|
|
85
129
|
return "\n".join(parts) if pretty else "".join(parts)
|
|
86
|
-
return _node_to_html(node, indent, indent_size, pretty)
|
|
130
|
+
return _node_to_html(node, indent, indent_size, pretty, in_pre=False)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _collapse_html_whitespace(text: str) -> str:
|
|
134
|
+
"""Collapse HTML whitespace runs to a single space and trim edges.
|
|
135
|
+
|
|
136
|
+
This matches how HTML rendering treats most whitespace in text nodes, and is
|
|
137
|
+
used only for pretty-printing in non-preformatted contexts.
|
|
138
|
+
"""
|
|
139
|
+
if not text:
|
|
140
|
+
return ""
|
|
87
141
|
|
|
142
|
+
# Optimization: split() handles whitespace collapsing efficiently.
|
|
143
|
+
# Note: split() treats \v as whitespace, which is not HTML whitespace.
|
|
144
|
+
# But \v is extremely rare in HTML.
|
|
145
|
+
if "\v" in text:
|
|
146
|
+
parts: list[str] = []
|
|
147
|
+
in_whitespace = False
|
|
148
|
+
for ch in text:
|
|
149
|
+
if ch in {" ", "\t", "\n", "\f", "\r"}:
|
|
150
|
+
if not in_whitespace:
|
|
151
|
+
parts.append(" ")
|
|
152
|
+
in_whitespace = True
|
|
153
|
+
continue
|
|
154
|
+
|
|
155
|
+
parts.append(ch)
|
|
156
|
+
in_whitespace = False
|
|
157
|
+
|
|
158
|
+
collapsed = "".join(parts)
|
|
159
|
+
return collapsed.strip(" ")
|
|
160
|
+
|
|
161
|
+
return " ".join(text.split())
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _normalize_formatting_whitespace(text: str) -> str:
|
|
165
|
+
"""Normalize formatting whitespace within a text node.
|
|
166
|
+
|
|
167
|
+
Converts newlines/tabs/CR/FF to regular spaces and collapses runs that
|
|
168
|
+
include such formatting whitespace to a single space.
|
|
169
|
+
|
|
170
|
+
Pure space runs are preserved as-is (so existing double-spaces remain).
|
|
171
|
+
"""
|
|
172
|
+
if not text:
|
|
173
|
+
return ""
|
|
174
|
+
|
|
175
|
+
if "\n" not in text and "\r" not in text and "\t" not in text and "\f" not in text:
|
|
176
|
+
return text
|
|
177
|
+
|
|
178
|
+
starts_with_formatting = text[0] in {"\n", "\r", "\t", "\f"}
|
|
179
|
+
ends_with_formatting = text[-1] in {"\n", "\r", "\t", "\f"}
|
|
180
|
+
|
|
181
|
+
out: list[str] = []
|
|
182
|
+
in_ws = False
|
|
183
|
+
saw_formatting_ws = False
|
|
184
|
+
|
|
185
|
+
for ch in text:
|
|
186
|
+
if ch == " ":
|
|
187
|
+
if in_ws:
|
|
188
|
+
# Only collapse if this whitespace run included formatting whitespace.
|
|
189
|
+
if saw_formatting_ws:
|
|
190
|
+
continue
|
|
191
|
+
out.append(" ")
|
|
192
|
+
continue
|
|
193
|
+
in_ws = True
|
|
194
|
+
saw_formatting_ws = False
|
|
195
|
+
out.append(" ")
|
|
196
|
+
continue
|
|
197
|
+
|
|
198
|
+
if ch in {"\n", "\r", "\t", "\f"}:
|
|
199
|
+
if in_ws:
|
|
200
|
+
saw_formatting_ws = True
|
|
201
|
+
continue
|
|
202
|
+
in_ws = True
|
|
203
|
+
saw_formatting_ws = True
|
|
204
|
+
out.append(" ")
|
|
205
|
+
continue
|
|
206
|
+
|
|
207
|
+
in_ws = False
|
|
208
|
+
saw_formatting_ws = False
|
|
209
|
+
out.append(ch)
|
|
210
|
+
|
|
211
|
+
normalized = "".join(out)
|
|
212
|
+
if starts_with_formatting and normalized.startswith(" "):
|
|
213
|
+
normalized = normalized[1:]
|
|
214
|
+
if ends_with_formatting and normalized.endswith(" "):
|
|
215
|
+
normalized = normalized[:-1]
|
|
216
|
+
return normalized
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _is_whitespace_text_node(node: Any) -> bool:
|
|
220
|
+
return node.name == "#text" and (node.data or "").strip() == ""
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _is_blocky_element(node: Any) -> bool:
|
|
224
|
+
# Treat elements as block-ish if they are block-level *or* contain any block-level
|
|
225
|
+
# descendants. This keeps pretty-printing readable for constructs like <a><div>...</div></a>.
|
|
226
|
+
try:
|
|
227
|
+
name = node.name
|
|
228
|
+
except AttributeError:
|
|
229
|
+
return False
|
|
230
|
+
if name in {"#text", "#comment", "!doctype"}:
|
|
231
|
+
return False
|
|
232
|
+
if name in SPECIAL_ELEMENTS:
|
|
233
|
+
return True
|
|
88
234
|
|
|
89
|
-
|
|
235
|
+
try:
|
|
236
|
+
children = node.children or []
|
|
237
|
+
except AttributeError:
|
|
238
|
+
return False
|
|
239
|
+
if not children:
|
|
240
|
+
return False
|
|
241
|
+
|
|
242
|
+
stack: list[Any] = list(children)
|
|
243
|
+
while stack:
|
|
244
|
+
child = stack.pop()
|
|
245
|
+
if child is None:
|
|
246
|
+
continue
|
|
247
|
+
child_name = child.name
|
|
248
|
+
if child_name in SPECIAL_ELEMENTS:
|
|
249
|
+
return True
|
|
250
|
+
if child_name in {"#text", "#comment", "!doctype"}:
|
|
251
|
+
continue
|
|
252
|
+
grand_children = child.children
|
|
253
|
+
if grand_children:
|
|
254
|
+
stack.extend(grand_children)
|
|
255
|
+
|
|
256
|
+
return False
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
_LAYOUT_BLOCK_ELEMENTS = {
|
|
260
|
+
"address",
|
|
261
|
+
"article",
|
|
262
|
+
"aside",
|
|
263
|
+
"blockquote",
|
|
264
|
+
"body",
|
|
265
|
+
"caption",
|
|
266
|
+
"center",
|
|
267
|
+
"dd",
|
|
268
|
+
"details",
|
|
269
|
+
"dialog",
|
|
270
|
+
"dir",
|
|
271
|
+
"div",
|
|
272
|
+
"dl",
|
|
273
|
+
"dt",
|
|
274
|
+
"fieldset",
|
|
275
|
+
"figcaption",
|
|
276
|
+
"figure",
|
|
277
|
+
"footer",
|
|
278
|
+
"form",
|
|
279
|
+
"h1",
|
|
280
|
+
"h2",
|
|
281
|
+
"h3",
|
|
282
|
+
"h4",
|
|
283
|
+
"h5",
|
|
284
|
+
"h6",
|
|
285
|
+
"header",
|
|
286
|
+
"hgroup",
|
|
287
|
+
"hr",
|
|
288
|
+
"html",
|
|
289
|
+
"iframe",
|
|
290
|
+
"li",
|
|
291
|
+
"listing",
|
|
292
|
+
"main",
|
|
293
|
+
"marquee",
|
|
294
|
+
"menu",
|
|
295
|
+
"nav",
|
|
296
|
+
"noframes",
|
|
297
|
+
"noscript",
|
|
298
|
+
"ol",
|
|
299
|
+
"p",
|
|
300
|
+
"plaintext",
|
|
301
|
+
"pre",
|
|
302
|
+
"search",
|
|
303
|
+
"section",
|
|
304
|
+
"summary",
|
|
305
|
+
"table",
|
|
306
|
+
"tbody",
|
|
307
|
+
"td",
|
|
308
|
+
"tfoot",
|
|
309
|
+
"th",
|
|
310
|
+
"thead",
|
|
311
|
+
"tr",
|
|
312
|
+
"ul",
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
_FORMAT_SEP = object()
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def _is_layout_blocky_element(node: Any) -> bool:
|
|
320
|
+
# Similar to _is_blocky_element(), but limited to actual layout blocks.
|
|
321
|
+
# This avoids turning inline-ish "special" elements like <script> into
|
|
322
|
+
# multiline pretty-print breaks in contexts like <p>.
|
|
323
|
+
try:
|
|
324
|
+
name = node.name
|
|
325
|
+
except AttributeError:
|
|
326
|
+
return False
|
|
327
|
+
if name in {"#text", "#comment", "!doctype"}:
|
|
328
|
+
return False
|
|
329
|
+
if name in _LAYOUT_BLOCK_ELEMENTS:
|
|
330
|
+
return True
|
|
331
|
+
|
|
332
|
+
try:
|
|
333
|
+
children = node.children or []
|
|
334
|
+
except AttributeError:
|
|
335
|
+
return False
|
|
336
|
+
if not children:
|
|
337
|
+
return False
|
|
338
|
+
|
|
339
|
+
stack: list[Any] = list(children)
|
|
340
|
+
while stack:
|
|
341
|
+
child = stack.pop()
|
|
342
|
+
if child is None:
|
|
343
|
+
continue
|
|
344
|
+
child_name = child.name
|
|
345
|
+
if child_name in _LAYOUT_BLOCK_ELEMENTS:
|
|
346
|
+
return True
|
|
347
|
+
if child_name in {"#text", "#comment", "!doctype"}:
|
|
348
|
+
continue
|
|
349
|
+
grand_children = child.children
|
|
350
|
+
if grand_children:
|
|
351
|
+
stack.extend(grand_children)
|
|
352
|
+
|
|
353
|
+
return False
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def _is_formatting_whitespace_text(data: str) -> bool:
|
|
357
|
+
# Formatting whitespace is something users typically don't intend to preserve
|
|
358
|
+
# exactly (e.g. newlines/indentation, or large runs of spaces).
|
|
359
|
+
if not data:
|
|
360
|
+
return False
|
|
361
|
+
if "\n" in data or "\r" in data or "\t" in data or "\f" in data:
|
|
362
|
+
return True
|
|
363
|
+
return len(data) > 2
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
def _should_pretty_indent_children(children: list[Any]) -> bool:
|
|
367
|
+
for child in children:
|
|
368
|
+
if child is None:
|
|
369
|
+
continue
|
|
370
|
+
name = child.name
|
|
371
|
+
if name == "#comment":
|
|
372
|
+
return False
|
|
373
|
+
if name == "#text" and (child.data or "").strip():
|
|
374
|
+
return False
|
|
375
|
+
|
|
376
|
+
element_children: list[Any] = [
|
|
377
|
+
child for child in children if child is not None and child.name not in {"#text", "#comment"}
|
|
378
|
+
]
|
|
379
|
+
if not element_children:
|
|
380
|
+
return True
|
|
381
|
+
if len(element_children) == 1:
|
|
382
|
+
only_child = element_children[0]
|
|
383
|
+
if _is_blocky_element(only_child):
|
|
384
|
+
return True
|
|
385
|
+
return False
|
|
386
|
+
|
|
387
|
+
# Safe indentation rule: only insert inter-element whitespace when we won't
|
|
388
|
+
# be placing it between two adjacent inline/phrasing elements.
|
|
389
|
+
prev_is_blocky = _is_blocky_element(element_children[0])
|
|
390
|
+
for child in element_children[1:]:
|
|
391
|
+
current_is_blocky = _is_blocky_element(child)
|
|
392
|
+
if not prev_is_blocky and not current_is_blocky:
|
|
393
|
+
return False
|
|
394
|
+
prev_is_blocky = current_is_blocky
|
|
395
|
+
return True
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool = True, *, in_pre: bool) -> str:
|
|
90
399
|
"""Helper to convert a node to HTML."""
|
|
91
|
-
prefix = " " * (indent * indent_size) if pretty else ""
|
|
92
|
-
newline = "\n" if pretty else ""
|
|
400
|
+
prefix = " " * (indent * indent_size) if pretty and not in_pre else ""
|
|
93
401
|
name: str = node.name
|
|
402
|
+
content_pre = in_pre or name in WHITESPACE_PRESERVING_ELEMENTS
|
|
403
|
+
newline = "\n" if pretty and not content_pre else ""
|
|
94
404
|
|
|
95
405
|
# Text node
|
|
96
406
|
if name == "#text":
|
|
97
407
|
text: str | None = node.data
|
|
98
|
-
if pretty:
|
|
408
|
+
if pretty and not in_pre:
|
|
99
409
|
text = text.strip() if text else ""
|
|
100
410
|
if text:
|
|
101
411
|
return f"{prefix}{_escape_text(text)}"
|
|
@@ -114,7 +424,7 @@ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool
|
|
|
114
424
|
if name == "#document-fragment":
|
|
115
425
|
parts: list[str] = []
|
|
116
426
|
for child in node.children or []:
|
|
117
|
-
child_html = _node_to_html(child, indent, indent_size, pretty)
|
|
427
|
+
child_html = _node_to_html(child, indent, indent_size, pretty, in_pre=in_pre)
|
|
118
428
|
if child_html:
|
|
119
429
|
parts.append(child_html)
|
|
120
430
|
return newline.join(parts) if pretty else "".join(parts)
|
|
@@ -130,20 +440,340 @@ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool
|
|
|
130
440
|
return f"{prefix}{open_tag}"
|
|
131
441
|
|
|
132
442
|
# Elements with children
|
|
133
|
-
|
|
443
|
+
# Template special handling: HTML templates store contents in `template_content`.
|
|
444
|
+
if name == "template" and node.namespace in {None, "html"} and node.template_content is not None:
|
|
445
|
+
children: list[Any] = node.template_content.children or []
|
|
446
|
+
else:
|
|
447
|
+
children = node.children or []
|
|
134
448
|
if not children:
|
|
135
449
|
return f"{prefix}{open_tag}{serialize_end_tag(name)}"
|
|
136
450
|
|
|
137
451
|
# Check if all children are text-only (inline rendering)
|
|
138
|
-
all_text =
|
|
452
|
+
all_text = True
|
|
453
|
+
for child in children:
|
|
454
|
+
if child is None:
|
|
455
|
+
continue
|
|
456
|
+
if child.name != "#text":
|
|
457
|
+
all_text = False
|
|
458
|
+
break
|
|
459
|
+
|
|
460
|
+
if all_text and pretty and not content_pre:
|
|
461
|
+
# Serializer controls sanitization at the to_html() entry point; avoid
|
|
462
|
+
# implicit re-sanitization during rendering.
|
|
463
|
+
text_content = node.to_text(separator="", strip=False)
|
|
464
|
+
text_content = _collapse_html_whitespace(text_content)
|
|
465
|
+
return f"{prefix}{open_tag}{_escape_text(text_content)}{serialize_end_tag(name)}"
|
|
466
|
+
|
|
467
|
+
if pretty and content_pre:
|
|
468
|
+
inner = "".join(
|
|
469
|
+
_node_to_html(child, indent + 1, indent_size, pretty, in_pre=True)
|
|
470
|
+
for child in children
|
|
471
|
+
if child is not None
|
|
472
|
+
)
|
|
473
|
+
return f"{prefix}{open_tag}{inner}{serialize_end_tag(name)}"
|
|
474
|
+
|
|
475
|
+
if pretty and not content_pre and name in SPECIAL_ELEMENTS:
|
|
476
|
+
# For block-ish containers that only have element children (and/or
|
|
477
|
+
# whitespace-only text nodes), prefer a multiline layout for readability
|
|
478
|
+
# even when children are inline elements.
|
|
479
|
+
can_indent = True
|
|
480
|
+
for child in children:
|
|
481
|
+
if child is None:
|
|
482
|
+
continue
|
|
483
|
+
if child.name == "#comment":
|
|
484
|
+
can_indent = False
|
|
485
|
+
break
|
|
486
|
+
if child.name == "#text" and (child.data or "").strip():
|
|
487
|
+
can_indent = False
|
|
488
|
+
break
|
|
489
|
+
|
|
490
|
+
if can_indent:
|
|
491
|
+
inner_lines: list[str] = []
|
|
492
|
+
for child in children:
|
|
493
|
+
if child is None:
|
|
494
|
+
continue
|
|
495
|
+
if _is_whitespace_text_node(child):
|
|
496
|
+
continue
|
|
497
|
+
child_html = _node_to_html(child, indent + 1, indent_size, pretty, in_pre=content_pre)
|
|
498
|
+
if child_html:
|
|
499
|
+
inner_lines.append(child_html)
|
|
500
|
+
|
|
501
|
+
if inner_lines:
|
|
502
|
+
parts = [f"{prefix}{open_tag}"]
|
|
503
|
+
parts.extend(inner_lines)
|
|
504
|
+
parts.append(f"{prefix}{serialize_end_tag(name)}")
|
|
505
|
+
return "\n".join(parts)
|
|
506
|
+
|
|
507
|
+
# Smart pretty-printing: if the author already inserted formatting whitespace
|
|
508
|
+
# between siblings, we can split into "inline runs" and put each run on its
|
|
509
|
+
# own line without introducing new inter-token whitespace.
|
|
510
|
+
has_comment = any(child is not None and child.name == "#comment" for child in children)
|
|
511
|
+
if not has_comment:
|
|
512
|
+
non_none_children: list[Any] = [child for child in children if child is not None]
|
|
513
|
+
|
|
514
|
+
# Only enable this mode if there is at least one formatting whitespace text node
|
|
515
|
+
# between non-whitespace siblings.
|
|
516
|
+
has_separator = False
|
|
517
|
+
for child in non_none_children[1:-1]:
|
|
518
|
+
if child.name != "#text":
|
|
519
|
+
continue
|
|
520
|
+
data = child.data or ""
|
|
521
|
+
if data.strip() != "":
|
|
522
|
+
continue
|
|
523
|
+
if _is_formatting_whitespace_text(data):
|
|
524
|
+
has_separator = True
|
|
525
|
+
break
|
|
526
|
+
|
|
527
|
+
if has_separator:
|
|
528
|
+
# Build runs by splitting on formatting whitespace text nodes.
|
|
529
|
+
# Keep small spacing nodes (one or two plain spaces) inside runs.
|
|
530
|
+
items: list[Any] = []
|
|
531
|
+
last_was_sep = False
|
|
532
|
+
for child in non_none_children:
|
|
533
|
+
if child.name == "#text":
|
|
534
|
+
data = child.data or ""
|
|
535
|
+
if data.strip() == "" and _is_formatting_whitespace_text(data):
|
|
536
|
+
if not last_was_sep:
|
|
537
|
+
items.append(_FORMAT_SEP)
|
|
538
|
+
last_was_sep = True
|
|
539
|
+
continue
|
|
540
|
+
items.append(child)
|
|
541
|
+
last_was_sep = False
|
|
542
|
+
|
|
543
|
+
while items and items[0] is _FORMAT_SEP:
|
|
544
|
+
items.pop(0)
|
|
545
|
+
while items and items[-1] is _FORMAT_SEP:
|
|
546
|
+
items.pop()
|
|
547
|
+
|
|
548
|
+
runs: list[list[Any]] = []
|
|
549
|
+
current_run: list[Any] = []
|
|
550
|
+
for item in items:
|
|
551
|
+
if item is _FORMAT_SEP:
|
|
552
|
+
runs.append(current_run)
|
|
553
|
+
current_run = []
|
|
554
|
+
continue
|
|
555
|
+
current_run.append(item)
|
|
556
|
+
runs.append(current_run)
|
|
557
|
+
runs = [run for run in runs if run]
|
|
558
|
+
|
|
559
|
+
# Only apply if we can render each run either as a single blocky element
|
|
560
|
+
# (possibly multiline) or as a single-line inline run.
|
|
561
|
+
smart_lines: list[str] = []
|
|
562
|
+
can_apply = True
|
|
563
|
+
for run in runs:
|
|
564
|
+
blocky_elements = [c for c in run if c.name not in {"#text", "#comment"} and _is_blocky_element(c)]
|
|
565
|
+
if blocky_elements and len(run) != 1:
|
|
566
|
+
can_apply = False
|
|
567
|
+
break
|
|
568
|
+
|
|
569
|
+
if len(run) == 1 and run[0].name != "#text":
|
|
570
|
+
child_html = _node_to_html(run[0], indent + 1, indent_size, pretty=True, in_pre=content_pre)
|
|
571
|
+
smart_lines.append(child_html)
|
|
572
|
+
continue
|
|
573
|
+
|
|
574
|
+
# Inline run: render on one line.
|
|
575
|
+
run_parts: list[str] = []
|
|
576
|
+
for c in run:
|
|
577
|
+
if c.name == "#text":
|
|
578
|
+
data = c.data or ""
|
|
579
|
+
if not data.strip():
|
|
580
|
+
# Formatting whitespace never appears inside runs (it is used as a separator).
|
|
581
|
+
# Preserve intentional tiny spacing.
|
|
582
|
+
run_parts.append(data)
|
|
583
|
+
continue
|
|
584
|
+
|
|
585
|
+
run_parts.append(_escape_text(_normalize_formatting_whitespace(data)))
|
|
586
|
+
continue
|
|
587
|
+
|
|
588
|
+
# Render inline elements without their own leading indentation.
|
|
589
|
+
child_html = _node_to_html(c, 0, indent_size, pretty=True, in_pre=content_pre)
|
|
590
|
+
run_parts.append(child_html)
|
|
591
|
+
|
|
592
|
+
smart_lines.append(f"{' ' * ((indent + 1) * indent_size)}{''.join(run_parts)}")
|
|
593
|
+
|
|
594
|
+
if can_apply and smart_lines:
|
|
595
|
+
return f"{prefix}{open_tag}\n" + "\n".join(smart_lines) + f"\n{prefix}{serialize_end_tag(name)}"
|
|
596
|
+
|
|
597
|
+
if pretty and not content_pre and not _should_pretty_indent_children(children):
|
|
598
|
+
# For block-ish elements that contain only element children and whitespace-only
|
|
599
|
+
# text nodes, we can still format each child on its own line (only when there
|
|
600
|
+
# is already whitespace separating element siblings).
|
|
601
|
+
if name in SPECIAL_ELEMENTS:
|
|
602
|
+
# Mixed content in block-ish containers: if we encounter a blocky child
|
|
603
|
+
# (e.g. <ul>) adjacent to inline text, printing everything on one line
|
|
604
|
+
# both hurts readability and can lose indentation inside the block subtree.
|
|
605
|
+
# In that case, put inline runs and blocky children on their own lines.
|
|
606
|
+
has_comment = any(child is not None and child.name == "#comment" for child in children)
|
|
607
|
+
if not has_comment:
|
|
608
|
+
has_blocky_child = any(
|
|
609
|
+
child is not None and child.name not in {"#text", "#comment"} and _is_layout_blocky_element(child)
|
|
610
|
+
for child in children
|
|
611
|
+
)
|
|
612
|
+
has_non_whitespace_text = any(
|
|
613
|
+
child is not None and child.name == "#text" and (child.data or "").strip() for child in children
|
|
614
|
+
)
|
|
615
|
+
|
|
616
|
+
if has_blocky_child and has_non_whitespace_text:
|
|
617
|
+
mixed_multiline_lines: list[str] = []
|
|
618
|
+
inline_parts: list[str] = []
|
|
619
|
+
|
|
620
|
+
mixed_first_non_none_index: int | None = None
|
|
621
|
+
mixed_last_non_none_index: int | None = None
|
|
622
|
+
for i, child in enumerate(children):
|
|
623
|
+
if child is None:
|
|
624
|
+
continue
|
|
625
|
+
if mixed_first_non_none_index is None:
|
|
626
|
+
mixed_first_non_none_index = i
|
|
627
|
+
mixed_last_non_none_index = i
|
|
628
|
+
|
|
629
|
+
def flush_inline() -> None:
|
|
630
|
+
if not inline_parts:
|
|
631
|
+
return
|
|
632
|
+
line = "".join(inline_parts).strip(" ")
|
|
633
|
+
inline_parts.clear()
|
|
634
|
+
if line:
|
|
635
|
+
mixed_multiline_lines.append(f"{' ' * ((indent + 1) * indent_size)}{line}")
|
|
636
|
+
|
|
637
|
+
for i, child in enumerate(children):
|
|
638
|
+
if child is None:
|
|
639
|
+
continue
|
|
640
|
+
|
|
641
|
+
if child.name == "#text":
|
|
642
|
+
data = child.data or ""
|
|
643
|
+
if not data.strip():
|
|
644
|
+
# Drop leading/trailing formatting whitespace.
|
|
645
|
+
if i == mixed_first_non_none_index or i == mixed_last_non_none_index:
|
|
646
|
+
continue
|
|
647
|
+
# Preserve intentional small spacing, but treat formatting whitespace
|
|
648
|
+
# as a separator between inline runs (new line).
|
|
649
|
+
if "\n" in data or "\r" in data or "\t" in data or len(data) > 2:
|
|
650
|
+
flush_inline()
|
|
651
|
+
else:
|
|
652
|
+
inline_parts.append(data)
|
|
653
|
+
continue
|
|
654
|
+
|
|
655
|
+
data = _normalize_formatting_whitespace(data)
|
|
656
|
+
inline_parts.append(_escape_text(data))
|
|
657
|
+
continue
|
|
658
|
+
|
|
659
|
+
if _is_layout_blocky_element(child):
|
|
660
|
+
flush_inline()
|
|
661
|
+
mixed_multiline_lines.append(
|
|
662
|
+
_node_to_html(child, indent + 1, indent_size, pretty=True, in_pre=content_pre)
|
|
663
|
+
)
|
|
664
|
+
continue
|
|
665
|
+
|
|
666
|
+
# Inline element: keep it in the current line without leading indentation.
|
|
667
|
+
inline_parts.append(_node_to_html(child, 0, indent_size, pretty=True, in_pre=content_pre))
|
|
668
|
+
|
|
669
|
+
flush_inline()
|
|
670
|
+
inner = "\n".join(line for line in mixed_multiline_lines if line)
|
|
671
|
+
return f"{prefix}{open_tag}\n{inner}\n{prefix}{serialize_end_tag(name)}"
|
|
672
|
+
|
|
673
|
+
has_comment = False
|
|
674
|
+
has_element = False
|
|
675
|
+
has_whitespace_between_elements = False
|
|
676
|
+
|
|
677
|
+
first_element_index: int | None = None
|
|
678
|
+
last_element_index: int | None = None
|
|
679
|
+
|
|
680
|
+
previous_was_element = False
|
|
681
|
+
saw_whitespace_since_last_element = False
|
|
682
|
+
for i, child in enumerate(children):
|
|
683
|
+
if child is None:
|
|
684
|
+
continue
|
|
685
|
+
if child.name == "#comment":
|
|
686
|
+
has_comment = True
|
|
687
|
+
break
|
|
688
|
+
if child.name == "#text":
|
|
689
|
+
# Track whether there is already whitespace between element siblings.
|
|
690
|
+
if previous_was_element and not (child.data or "").strip():
|
|
691
|
+
saw_whitespace_since_last_element = True
|
|
692
|
+
continue
|
|
693
|
+
|
|
694
|
+
has_element = True
|
|
695
|
+
if first_element_index is None:
|
|
696
|
+
first_element_index = i
|
|
697
|
+
last_element_index = i
|
|
698
|
+
if previous_was_element and saw_whitespace_since_last_element:
|
|
699
|
+
has_whitespace_between_elements = True
|
|
700
|
+
previous_was_element = True
|
|
701
|
+
saw_whitespace_since_last_element = False
|
|
702
|
+
|
|
703
|
+
can_indent_non_whitespace_text = True
|
|
704
|
+
if has_element and first_element_index is not None and last_element_index is not None:
|
|
705
|
+
for i, child in enumerate(children):
|
|
706
|
+
if child is None or child.name != "#text":
|
|
707
|
+
continue
|
|
708
|
+
if not (child.data or "").strip():
|
|
709
|
+
continue
|
|
710
|
+
# Only allow non-whitespace text *after* the last element.
|
|
711
|
+
# Leading text or text between elements could gain new spaces
|
|
712
|
+
# due to indentation/newlines.
|
|
713
|
+
if i < first_element_index or first_element_index < i < last_element_index:
|
|
714
|
+
can_indent_non_whitespace_text = False
|
|
715
|
+
break
|
|
716
|
+
|
|
717
|
+
if has_element and has_whitespace_between_elements and not has_comment and can_indent_non_whitespace_text:
|
|
718
|
+
element_multiline_lines: list[str] = []
|
|
719
|
+
for child in children:
|
|
720
|
+
if child is None:
|
|
721
|
+
continue
|
|
722
|
+
if child.name == "#text":
|
|
723
|
+
text = _collapse_html_whitespace(child.data or "")
|
|
724
|
+
if text:
|
|
725
|
+
element_multiline_lines.append(f"{' ' * ((indent + 1) * indent_size)}{_escape_text(text)}")
|
|
726
|
+
continue
|
|
727
|
+
child_html = _node_to_html(child, indent + 1, indent_size, pretty=True, in_pre=content_pre)
|
|
728
|
+
if child_html:
|
|
729
|
+
element_multiline_lines.append(child_html)
|
|
730
|
+
if element_multiline_lines:
|
|
731
|
+
inner = "\n".join(element_multiline_lines)
|
|
732
|
+
return f"{prefix}{open_tag}\n{inner}\n{prefix}{serialize_end_tag(name)}"
|
|
733
|
+
|
|
734
|
+
inner_parts: list[str] = []
|
|
735
|
+
|
|
736
|
+
compact_first_non_none_index: int | None = None
|
|
737
|
+
compact_last_non_none_index: int | None = None
|
|
738
|
+
for i, child in enumerate(children):
|
|
739
|
+
if child is None:
|
|
740
|
+
continue
|
|
741
|
+
if compact_first_non_none_index is None:
|
|
742
|
+
compact_first_non_none_index = i
|
|
743
|
+
compact_last_non_none_index = i
|
|
744
|
+
|
|
745
|
+
for i, child in enumerate(children):
|
|
746
|
+
if child is None:
|
|
747
|
+
continue
|
|
748
|
+
|
|
749
|
+
if child.name == "#text":
|
|
750
|
+
data = child.data or ""
|
|
751
|
+
if not data.strip():
|
|
752
|
+
# Drop leading/trailing formatting whitespace in compact mode.
|
|
753
|
+
if i == compact_first_non_none_index or i == compact_last_non_none_index:
|
|
754
|
+
continue
|
|
755
|
+
# Preserve intentional small spacing, but collapse large formatting gaps.
|
|
756
|
+
if "\n" in data or "\r" in data or "\t" in data or len(data) > 2:
|
|
757
|
+
inner_parts.append(" ")
|
|
758
|
+
continue
|
|
759
|
+
|
|
760
|
+
data = _normalize_formatting_whitespace(data)
|
|
761
|
+
child_html = _escape_text(data) if data else ""
|
|
762
|
+
else:
|
|
763
|
+
# Even when we can't safely insert whitespace *between* siblings, we can
|
|
764
|
+
# still pretty-print each element subtree to improve readability.
|
|
765
|
+
child_html = _node_to_html(child, 0, indent_size, pretty=True, in_pre=content_pre)
|
|
766
|
+
if child_html:
|
|
767
|
+
inner_parts.append(child_html)
|
|
139
768
|
|
|
140
|
-
|
|
141
|
-
return f"{prefix}{open_tag}{_escape_text(node.to_text(separator='', strip=False))}{serialize_end_tag(name)}"
|
|
769
|
+
return f"{prefix}{open_tag}{''.join(inner_parts)}{serialize_end_tag(name)}"
|
|
142
770
|
|
|
143
771
|
# Render with child indentation
|
|
144
772
|
parts = [f"{prefix}{open_tag}"]
|
|
145
773
|
for child in children:
|
|
146
|
-
|
|
774
|
+
if pretty and not content_pre and _is_whitespace_text_node(child):
|
|
775
|
+
continue
|
|
776
|
+
child_html = _node_to_html(child, indent + 1, indent_size, pretty, in_pre=content_pre)
|
|
147
777
|
if child_html:
|
|
148
778
|
parts.append(child_html)
|
|
149
779
|
parts.append(f"{prefix}{serialize_end_tag(name)}")
|
|
@@ -180,7 +810,7 @@ def _node_to_test_format(node: Any, indent: int) -> str:
|
|
|
180
810
|
attribute_lines = _attrs_to_test_format(node, indent)
|
|
181
811
|
|
|
182
812
|
# Template special handling (only HTML namespace templates have template_content)
|
|
183
|
-
if node.name == "template" and node.namespace in {None, "html"} and node.template_content:
|
|
813
|
+
if node.name == "template" and node.namespace in {None, "html"} and node.template_content is not None:
|
|
184
814
|
sections: list[str] = [line]
|
|
185
815
|
if attribute_lines:
|
|
186
816
|
sections.extend(attribute_lines)
|