justhtml 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +44 -2
- justhtml/__main__.py +45 -9
- justhtml/constants.py +12 -0
- justhtml/errors.py +8 -3
- justhtml/linkify.py +438 -0
- justhtml/node.py +54 -35
- justhtml/parser.py +105 -38
- justhtml/sanitize.py +511 -282
- justhtml/selector.py +3 -1
- justhtml/serialize.py +398 -72
- justhtml/tokenizer.py +121 -21
- justhtml/tokens.py +21 -3
- justhtml/transforms.py +2568 -0
- justhtml/treebuilder.py +247 -190
- justhtml/treebuilder_modes.py +108 -102
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/METADATA +28 -7
- justhtml-0.38.0.dist-info/RECORD +26 -0
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/licenses/LICENSE +1 -1
- justhtml-0.24.0.dist-info/RECORD +0 -24
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/WHEEL +0 -0
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/entry_points.txt +0 -0
justhtml/selector.py
CHANGED
|
@@ -651,7 +651,9 @@ class SelectorMatcher:
|
|
|
651
651
|
attr_value: str | None = None
|
|
652
652
|
for name, value in attrs.items():
|
|
653
653
|
if name.lower() == attr_name:
|
|
654
|
-
|
|
654
|
+
# Attributes can be boolean (represented as None in JustHTML).
|
|
655
|
+
# For selector matching, presence should still count.
|
|
656
|
+
attr_value = "" if value is None else str(value)
|
|
655
657
|
break
|
|
656
658
|
|
|
657
659
|
if attr_value is None:
|
justhtml/serialize.py
CHANGED
|
@@ -4,17 +4,22 @@
|
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
|
+
import re
|
|
7
8
|
from typing import Any
|
|
8
9
|
|
|
9
|
-
from .constants import FOREIGN_ATTRIBUTE_ADJUSTMENTS, SPECIAL_ELEMENTS, VOID_ELEMENTS
|
|
10
|
-
|
|
10
|
+
from .constants import FOREIGN_ATTRIBUTE_ADJUSTMENTS, SPECIAL_ELEMENTS, VOID_ELEMENTS, WHITESPACE_PRESERVING_ELEMENTS
|
|
11
|
+
|
|
12
|
+
# Matches characters that prevent an attribute value from being unquoted.
|
|
13
|
+
# Note: This matches the logic of the previous loop-based implementation.
|
|
14
|
+
# It checks for space characters, quotes, equals sign, and greater-than.
|
|
15
|
+
_UNQUOTED_ATTR_VALUE_INVALID = re.compile(r'[ \t\n\f\r"\'=>]')
|
|
11
16
|
|
|
12
17
|
|
|
13
18
|
def _escape_text(text: str | None) -> str:
|
|
14
19
|
if not text:
|
|
15
20
|
return ""
|
|
16
21
|
# Minimal, but matches html5lib serializer expectations in core cases.
|
|
17
|
-
return
|
|
22
|
+
return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
|
|
18
23
|
|
|
19
24
|
|
|
20
25
|
def _choose_attr_quote(value: str | None, forced_quote_char: str | None = None) -> str:
|
|
@@ -22,7 +27,7 @@ def _choose_attr_quote(value: str | None, forced_quote_char: str | None = None)
|
|
|
22
27
|
return forced_quote_char
|
|
23
28
|
if value is None:
|
|
24
29
|
return '"'
|
|
25
|
-
value
|
|
30
|
+
# value is assumed to be a string
|
|
26
31
|
if '"' in value and "'" not in value:
|
|
27
32
|
return "'"
|
|
28
33
|
return '"'
|
|
@@ -31,7 +36,7 @@ def _choose_attr_quote(value: str | None, forced_quote_char: str | None = None)
|
|
|
31
36
|
def _escape_attr_value(value: str | None, quote_char: str, *, escape_lt_in_attrs: bool = False) -> str:
|
|
32
37
|
if value is None:
|
|
33
38
|
return ""
|
|
34
|
-
value
|
|
39
|
+
# value is assumed to be a string
|
|
35
40
|
value = value.replace("&", "&amp;")
|
|
36
41
|
if escape_lt_in_attrs:
|
|
37
42
|
value = value.replace("<", "&lt;")
|
|
@@ -44,15 +49,8 @@ def _escape_attr_value(value: str | None, quote_char: str, *, escape_lt_in_attrs
|
|
|
44
49
|
def _can_unquote_attr_value(value: str | None) -> bool:
|
|
45
50
|
if value is None:
|
|
46
51
|
return False
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
if ch == ">":
|
|
50
|
-
return False
|
|
51
|
-
if ch in {'"', "'", "="}:
|
|
52
|
-
return False
|
|
53
|
-
if ch in {" ", "\t", "\n", "\f", "\r"}:
|
|
54
|
-
return False
|
|
55
|
-
return True
|
|
52
|
+
# Optimization: use regex instead of loop
|
|
53
|
+
return not _UNQUOTED_ATTR_VALUE_INVALID.search(value)
|
|
56
54
|
|
|
57
55
|
|
|
58
56
|
def _serializer_minimize_attr_value(name: str, value: str | None, minimize_boolean_attributes: bool) -> bool:
|
|
@@ -60,7 +58,9 @@ def _serializer_minimize_attr_value(name: str, value: str | None, minimize_boole
|
|
|
60
58
|
return False
|
|
61
59
|
if value is None or value == "":
|
|
62
60
|
return True
|
|
63
|
-
|
|
61
|
+
if value == name:
|
|
62
|
+
return True
|
|
63
|
+
return value.lower() == name
|
|
64
64
|
|
|
65
65
|
|
|
66
66
|
def serialize_start_tag(
|
|
@@ -86,7 +86,8 @@ def serialize_start_tag(
|
|
|
86
86
|
parts.extend([" ", key, '=""'])
|
|
87
87
|
continue
|
|
88
88
|
|
|
89
|
-
|
|
89
|
+
# value is guaranteed to be a string here because attrs is dict[str, str | None]
|
|
90
|
+
value_str = value
|
|
90
91
|
if value_str == "":
|
|
91
92
|
parts.extend([" ", key, '=""'])
|
|
92
93
|
continue
|
|
@@ -118,15 +119,8 @@ def to_html(
|
|
|
118
119
|
indent_size: int = 2,
|
|
119
120
|
*,
|
|
120
121
|
pretty: bool = True,
|
|
121
|
-
safe: bool = True,
|
|
122
|
-
policy: SanitizationPolicy | None = None,
|
|
123
122
|
) -> str:
|
|
124
123
|
"""Convert node to HTML string."""
|
|
125
|
-
if safe:
|
|
126
|
-
if policy is None and node.name == "#document":
|
|
127
|
-
node = sanitize(node, policy=DEFAULT_DOCUMENT_POLICY)
|
|
128
|
-
else:
|
|
129
|
-
node = sanitize(node, policy=policy or DEFAULT_POLICY)
|
|
130
124
|
if node.name == "#document":
|
|
131
125
|
# Document root - just render children
|
|
132
126
|
parts: list[str] = []
|
|
@@ -136,12 +130,6 @@ def to_html(
|
|
|
136
130
|
return _node_to_html(node, indent, indent_size, pretty, in_pre=False)
|
|
137
131
|
|
|
138
132
|
|
|
139
|
-
_PREFORMATTED_ELEMENTS: set[str] = {"pre", "textarea", "code"}
|
|
140
|
-
|
|
141
|
-
# Elements whose text content must not be normalized (e.g. scripts/styles).
|
|
142
|
-
_RAWTEXT_ELEMENTS: set[str] = {"script", "style"}
|
|
143
|
-
|
|
144
|
-
|
|
145
133
|
def _collapse_html_whitespace(text: str) -> str:
|
|
146
134
|
"""Collapse HTML whitespace runs to a single space and trim edges.
|
|
147
135
|
|
|
@@ -151,20 +139,26 @@ def _collapse_html_whitespace(text: str) -> str:
|
|
|
151
139
|
if not text:
|
|
152
140
|
return ""
|
|
153
141
|
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
parts.append(" ")
|
|
160
|
-
in_whitespace = True
|
|
161
|
-
continue
|
|
162
|
-
|
|
163
|
-
parts.append(ch)
|
|
142
|
+
# Optimization: split() handles whitespace collapsing efficiently.
|
|
143
|
+
# Note: split() treats \v as whitespace, which is not HTML whitespace.
|
|
144
|
+
# But \v is extremely rare in HTML.
|
|
145
|
+
if "\v" in text:
|
|
146
|
+
parts: list[str] = []
|
|
164
147
|
in_whitespace = False
|
|
148
|
+
for ch in text:
|
|
149
|
+
if ch in {" ", "\t", "\n", "\f", "\r"}:
|
|
150
|
+
if not in_whitespace:
|
|
151
|
+
parts.append(" ")
|
|
152
|
+
in_whitespace = True
|
|
153
|
+
continue
|
|
154
|
+
|
|
155
|
+
parts.append(ch)
|
|
156
|
+
in_whitespace = False
|
|
165
157
|
|
|
166
|
-
|
|
167
|
-
|
|
158
|
+
collapsed = "".join(parts)
|
|
159
|
+
return collapsed.strip(" ")
|
|
160
|
+
|
|
161
|
+
return " ".join(text.split())
|
|
168
162
|
|
|
169
163
|
|
|
170
164
|
def _normalize_formatting_whitespace(text: str) -> str:
|
|
@@ -226,6 +220,149 @@ def _is_whitespace_text_node(node: Any) -> bool:
|
|
|
226
220
|
return node.name == "#text" and (node.data or "").strip() == ""
|
|
227
221
|
|
|
228
222
|
|
|
223
|
+
def _is_blocky_element(node: Any) -> bool:
|
|
224
|
+
# Treat elements as block-ish if they are block-level *or* contain any block-level
|
|
225
|
+
# descendants. This keeps pretty-printing readable for constructs like <a><div>...</div></a>.
|
|
226
|
+
try:
|
|
227
|
+
name = node.name
|
|
228
|
+
except AttributeError:
|
|
229
|
+
return False
|
|
230
|
+
if name in {"#text", "#comment", "!doctype"}:
|
|
231
|
+
return False
|
|
232
|
+
if name in SPECIAL_ELEMENTS:
|
|
233
|
+
return True
|
|
234
|
+
|
|
235
|
+
try:
|
|
236
|
+
children = node.children or []
|
|
237
|
+
except AttributeError:
|
|
238
|
+
return False
|
|
239
|
+
if not children:
|
|
240
|
+
return False
|
|
241
|
+
|
|
242
|
+
stack: list[Any] = list(children)
|
|
243
|
+
while stack:
|
|
244
|
+
child = stack.pop()
|
|
245
|
+
if child is None:
|
|
246
|
+
continue
|
|
247
|
+
child_name = child.name
|
|
248
|
+
if child_name in SPECIAL_ELEMENTS:
|
|
249
|
+
return True
|
|
250
|
+
if child_name in {"#text", "#comment", "!doctype"}:
|
|
251
|
+
continue
|
|
252
|
+
grand_children = child.children
|
|
253
|
+
if grand_children:
|
|
254
|
+
stack.extend(grand_children)
|
|
255
|
+
|
|
256
|
+
return False
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
_LAYOUT_BLOCK_ELEMENTS = {
|
|
260
|
+
"address",
|
|
261
|
+
"article",
|
|
262
|
+
"aside",
|
|
263
|
+
"blockquote",
|
|
264
|
+
"body",
|
|
265
|
+
"caption",
|
|
266
|
+
"center",
|
|
267
|
+
"dd",
|
|
268
|
+
"details",
|
|
269
|
+
"dialog",
|
|
270
|
+
"dir",
|
|
271
|
+
"div",
|
|
272
|
+
"dl",
|
|
273
|
+
"dt",
|
|
274
|
+
"fieldset",
|
|
275
|
+
"figcaption",
|
|
276
|
+
"figure",
|
|
277
|
+
"footer",
|
|
278
|
+
"form",
|
|
279
|
+
"h1",
|
|
280
|
+
"h2",
|
|
281
|
+
"h3",
|
|
282
|
+
"h4",
|
|
283
|
+
"h5",
|
|
284
|
+
"h6",
|
|
285
|
+
"header",
|
|
286
|
+
"hgroup",
|
|
287
|
+
"hr",
|
|
288
|
+
"html",
|
|
289
|
+
"iframe",
|
|
290
|
+
"li",
|
|
291
|
+
"listing",
|
|
292
|
+
"main",
|
|
293
|
+
"marquee",
|
|
294
|
+
"menu",
|
|
295
|
+
"nav",
|
|
296
|
+
"noframes",
|
|
297
|
+
"noscript",
|
|
298
|
+
"ol",
|
|
299
|
+
"p",
|
|
300
|
+
"plaintext",
|
|
301
|
+
"pre",
|
|
302
|
+
"search",
|
|
303
|
+
"section",
|
|
304
|
+
"summary",
|
|
305
|
+
"table",
|
|
306
|
+
"tbody",
|
|
307
|
+
"td",
|
|
308
|
+
"tfoot",
|
|
309
|
+
"th",
|
|
310
|
+
"thead",
|
|
311
|
+
"tr",
|
|
312
|
+
"ul",
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
_FORMAT_SEP = object()
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def _is_layout_blocky_element(node: Any) -> bool:
|
|
320
|
+
# Similar to _is_blocky_element(), but limited to actual layout blocks.
|
|
321
|
+
# This avoids turning inline-ish "special" elements like <script> into
|
|
322
|
+
# multiline pretty-print breaks in contexts like <p>.
|
|
323
|
+
try:
|
|
324
|
+
name = node.name
|
|
325
|
+
except AttributeError:
|
|
326
|
+
return False
|
|
327
|
+
if name in {"#text", "#comment", "!doctype"}:
|
|
328
|
+
return False
|
|
329
|
+
if name in _LAYOUT_BLOCK_ELEMENTS:
|
|
330
|
+
return True
|
|
331
|
+
|
|
332
|
+
try:
|
|
333
|
+
children = node.children or []
|
|
334
|
+
except AttributeError:
|
|
335
|
+
return False
|
|
336
|
+
if not children:
|
|
337
|
+
return False
|
|
338
|
+
|
|
339
|
+
stack: list[Any] = list(children)
|
|
340
|
+
while stack:
|
|
341
|
+
child = stack.pop()
|
|
342
|
+
if child is None:
|
|
343
|
+
continue
|
|
344
|
+
child_name = child.name
|
|
345
|
+
if child_name in _LAYOUT_BLOCK_ELEMENTS:
|
|
346
|
+
return True
|
|
347
|
+
if child_name in {"#text", "#comment", "!doctype"}:
|
|
348
|
+
continue
|
|
349
|
+
grand_children = child.children
|
|
350
|
+
if grand_children:
|
|
351
|
+
stack.extend(grand_children)
|
|
352
|
+
|
|
353
|
+
return False
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def _is_formatting_whitespace_text(data: str) -> bool:
|
|
357
|
+
# Formatting whitespace is something users typically don't intend to preserve
|
|
358
|
+
# exactly (e.g. newlines/indentation, or large runs of spaces).
|
|
359
|
+
if not data:
|
|
360
|
+
return False
|
|
361
|
+
if "\n" in data or "\r" in data or "\t" in data or "\f" in data:
|
|
362
|
+
return True
|
|
363
|
+
return len(data) > 2
|
|
364
|
+
|
|
365
|
+
|
|
229
366
|
def _should_pretty_indent_children(children: list[Any]) -> bool:
|
|
230
367
|
for child in children:
|
|
231
368
|
if child is None:
|
|
@@ -243,26 +380,18 @@ def _should_pretty_indent_children(children: list[Any]) -> bool:
|
|
|
243
380
|
return True
|
|
244
381
|
if len(element_children) == 1:
|
|
245
382
|
only_child = element_children[0]
|
|
246
|
-
if only_child
|
|
383
|
+
if _is_blocky_element(only_child):
|
|
247
384
|
return True
|
|
248
|
-
if only_child.name == "a":
|
|
249
|
-
# If an anchor wraps block-ish content (valid HTML5), treat it as block-ish
|
|
250
|
-
# for pretty-printing so the parent can indent it on its own line.
|
|
251
|
-
for grandchild in only_child.children or []:
|
|
252
|
-
if grandchild is None:
|
|
253
|
-
continue
|
|
254
|
-
if grandchild.name in SPECIAL_ELEMENTS:
|
|
255
|
-
return True
|
|
256
385
|
return False
|
|
257
386
|
|
|
258
387
|
# Safe indentation rule: only insert inter-element whitespace when we won't
|
|
259
388
|
# be placing it between two adjacent inline/phrasing elements.
|
|
260
|
-
|
|
389
|
+
prev_is_blocky = _is_blocky_element(element_children[0])
|
|
261
390
|
for child in element_children[1:]:
|
|
262
|
-
|
|
263
|
-
if not
|
|
391
|
+
current_is_blocky = _is_blocky_element(child)
|
|
392
|
+
if not prev_is_blocky and not current_is_blocky:
|
|
264
393
|
return False
|
|
265
|
-
|
|
394
|
+
prev_is_blocky = current_is_blocky
|
|
266
395
|
return True
|
|
267
396
|
|
|
268
397
|
|
|
@@ -270,7 +399,7 @@ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool
|
|
|
270
399
|
"""Helper to convert a node to HTML."""
|
|
271
400
|
prefix = " " * (indent * indent_size) if pretty and not in_pre else ""
|
|
272
401
|
name: str = node.name
|
|
273
|
-
content_pre = in_pre or name in
|
|
402
|
+
content_pre = in_pre or name in WHITESPACE_PRESERVING_ELEMENTS
|
|
274
403
|
newline = "\n" if pretty and not content_pre else ""
|
|
275
404
|
|
|
276
405
|
# Text node
|
|
@@ -320,14 +449,19 @@ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool
|
|
|
320
449
|
return f"{prefix}{open_tag}{serialize_end_tag(name)}"
|
|
321
450
|
|
|
322
451
|
# Check if all children are text-only (inline rendering)
|
|
323
|
-
all_text =
|
|
452
|
+
all_text = True
|
|
453
|
+
for child in children:
|
|
454
|
+
if child is None:
|
|
455
|
+
continue
|
|
456
|
+
if child.name != "#text":
|
|
457
|
+
all_text = False
|
|
458
|
+
break
|
|
324
459
|
|
|
325
460
|
if all_text and pretty and not content_pre:
|
|
326
461
|
# Serializer controls sanitization at the to_html() entry point; avoid
|
|
327
462
|
# implicit re-sanitization during rendering.
|
|
328
|
-
text_content = node.to_text(separator="", strip=False
|
|
329
|
-
|
|
330
|
-
text_content = _collapse_html_whitespace(text_content)
|
|
463
|
+
text_content = node.to_text(separator="", strip=False)
|
|
464
|
+
text_content = _collapse_html_whitespace(text_content)
|
|
331
465
|
return f"{prefix}{open_tag}{_escape_text(text_content)}{serialize_end_tag(name)}"
|
|
332
466
|
|
|
333
467
|
if pretty and content_pre:
|
|
@@ -338,11 +472,204 @@ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool
|
|
|
338
472
|
)
|
|
339
473
|
return f"{prefix}{open_tag}{inner}{serialize_end_tag(name)}"
|
|
340
474
|
|
|
475
|
+
if pretty and not content_pre and name in SPECIAL_ELEMENTS:
|
|
476
|
+
# For block-ish containers that only have element children (and/or
|
|
477
|
+
# whitespace-only text nodes), prefer a multiline layout for readability
|
|
478
|
+
# even when children are inline elements.
|
|
479
|
+
can_indent = True
|
|
480
|
+
for child in children:
|
|
481
|
+
if child is None:
|
|
482
|
+
continue
|
|
483
|
+
if child.name == "#comment":
|
|
484
|
+
can_indent = False
|
|
485
|
+
break
|
|
486
|
+
if child.name == "#text" and (child.data or "").strip():
|
|
487
|
+
can_indent = False
|
|
488
|
+
break
|
|
489
|
+
|
|
490
|
+
if can_indent:
|
|
491
|
+
inner_lines: list[str] = []
|
|
492
|
+
for child in children:
|
|
493
|
+
if child is None:
|
|
494
|
+
continue
|
|
495
|
+
if _is_whitespace_text_node(child):
|
|
496
|
+
continue
|
|
497
|
+
child_html = _node_to_html(child, indent + 1, indent_size, pretty, in_pre=content_pre)
|
|
498
|
+
if child_html:
|
|
499
|
+
inner_lines.append(child_html)
|
|
500
|
+
|
|
501
|
+
if inner_lines:
|
|
502
|
+
parts = [f"{prefix}{open_tag}"]
|
|
503
|
+
parts.extend(inner_lines)
|
|
504
|
+
parts.append(f"{prefix}{serialize_end_tag(name)}")
|
|
505
|
+
return "\n".join(parts)
|
|
506
|
+
|
|
507
|
+
# Smart pretty-printing: if the author already inserted formatting whitespace
|
|
508
|
+
# between siblings, we can split into "inline runs" and put each run on its
|
|
509
|
+
# own line without introducing new inter-token whitespace.
|
|
510
|
+
has_comment = any(child is not None and child.name == "#comment" for child in children)
|
|
511
|
+
if not has_comment:
|
|
512
|
+
non_none_children: list[Any] = [child for child in children if child is not None]
|
|
513
|
+
|
|
514
|
+
# Only enable this mode if there is at least one formatting whitespace text node
|
|
515
|
+
# between non-whitespace siblings.
|
|
516
|
+
has_separator = False
|
|
517
|
+
for child in non_none_children[1:-1]:
|
|
518
|
+
if child.name != "#text":
|
|
519
|
+
continue
|
|
520
|
+
data = child.data or ""
|
|
521
|
+
if data.strip() != "":
|
|
522
|
+
continue
|
|
523
|
+
if _is_formatting_whitespace_text(data):
|
|
524
|
+
has_separator = True
|
|
525
|
+
break
|
|
526
|
+
|
|
527
|
+
if has_separator:
|
|
528
|
+
# Build runs by splitting on formatting whitespace text nodes.
|
|
529
|
+
# Keep small spacing nodes (" " or "  ") inside runs.
|
|
530
|
+
items: list[Any] = []
|
|
531
|
+
last_was_sep = False
|
|
532
|
+
for child in non_none_children:
|
|
533
|
+
if child.name == "#text":
|
|
534
|
+
data = child.data or ""
|
|
535
|
+
if data.strip() == "" and _is_formatting_whitespace_text(data):
|
|
536
|
+
if not last_was_sep:
|
|
537
|
+
items.append(_FORMAT_SEP)
|
|
538
|
+
last_was_sep = True
|
|
539
|
+
continue
|
|
540
|
+
items.append(child)
|
|
541
|
+
last_was_sep = False
|
|
542
|
+
|
|
543
|
+
while items and items[0] is _FORMAT_SEP:
|
|
544
|
+
items.pop(0)
|
|
545
|
+
while items and items[-1] is _FORMAT_SEP:
|
|
546
|
+
items.pop()
|
|
547
|
+
|
|
548
|
+
runs: list[list[Any]] = []
|
|
549
|
+
current_run: list[Any] = []
|
|
550
|
+
for item in items:
|
|
551
|
+
if item is _FORMAT_SEP:
|
|
552
|
+
runs.append(current_run)
|
|
553
|
+
current_run = []
|
|
554
|
+
continue
|
|
555
|
+
current_run.append(item)
|
|
556
|
+
runs.append(current_run)
|
|
557
|
+
runs = [run for run in runs if run]
|
|
558
|
+
|
|
559
|
+
# Only apply if we can render each run either as a single blocky element
|
|
560
|
+
# (possibly multiline) or as a single-line inline run.
|
|
561
|
+
smart_lines: list[str] = []
|
|
562
|
+
can_apply = True
|
|
563
|
+
for run in runs:
|
|
564
|
+
blocky_elements = [c for c in run if c.name not in {"#text", "#comment"} and _is_blocky_element(c)]
|
|
565
|
+
if blocky_elements and len(run) != 1:
|
|
566
|
+
can_apply = False
|
|
567
|
+
break
|
|
568
|
+
|
|
569
|
+
if len(run) == 1 and run[0].name != "#text":
|
|
570
|
+
child_html = _node_to_html(run[0], indent + 1, indent_size, pretty=True, in_pre=content_pre)
|
|
571
|
+
smart_lines.append(child_html)
|
|
572
|
+
continue
|
|
573
|
+
|
|
574
|
+
# Inline run: render on one line.
|
|
575
|
+
run_parts: list[str] = []
|
|
576
|
+
for c in run:
|
|
577
|
+
if c.name == "#text":
|
|
578
|
+
data = c.data or ""
|
|
579
|
+
if not data.strip():
|
|
580
|
+
# Formatting whitespace never appears inside runs (it is used as a separator).
|
|
581
|
+
# Preserve intentional tiny spacing.
|
|
582
|
+
run_parts.append(data)
|
|
583
|
+
continue
|
|
584
|
+
|
|
585
|
+
run_parts.append(_escape_text(_normalize_formatting_whitespace(data)))
|
|
586
|
+
continue
|
|
587
|
+
|
|
588
|
+
# Render inline elements without their own leading indentation.
|
|
589
|
+
child_html = _node_to_html(c, 0, indent_size, pretty=True, in_pre=content_pre)
|
|
590
|
+
run_parts.append(child_html)
|
|
591
|
+
|
|
592
|
+
smart_lines.append(f"{' ' * ((indent + 1) * indent_size)}{''.join(run_parts)}")
|
|
593
|
+
|
|
594
|
+
if can_apply and smart_lines:
|
|
595
|
+
return f"{prefix}{open_tag}\n" + "\n".join(smart_lines) + f"\n{prefix}{serialize_end_tag(name)}"
|
|
596
|
+
|
|
341
597
|
if pretty and not content_pre and not _should_pretty_indent_children(children):
|
|
342
598
|
# For block-ish elements that contain only element children and whitespace-only
|
|
343
599
|
# text nodes, we can still format each child on its own line (only when there
|
|
344
600
|
# is already whitespace separating element siblings).
|
|
345
601
|
if name in SPECIAL_ELEMENTS:
|
|
602
|
+
# Mixed content in block-ish containers: if we encounter a blocky child
|
|
603
|
+
# (e.g. <ul>) adjacent to inline text, printing everything on one line
|
|
604
|
+
# both hurts readability and can lose indentation inside the block subtree.
|
|
605
|
+
# In that case, put inline runs and blocky children on their own lines.
|
|
606
|
+
has_comment = any(child is not None and child.name == "#comment" for child in children)
|
|
607
|
+
if not has_comment:
|
|
608
|
+
has_blocky_child = any(
|
|
609
|
+
child is not None and child.name not in {"#text", "#comment"} and _is_layout_blocky_element(child)
|
|
610
|
+
for child in children
|
|
611
|
+
)
|
|
612
|
+
has_non_whitespace_text = any(
|
|
613
|
+
child is not None and child.name == "#text" and (child.data or "").strip() for child in children
|
|
614
|
+
)
|
|
615
|
+
|
|
616
|
+
if has_blocky_child and has_non_whitespace_text:
|
|
617
|
+
mixed_multiline_lines: list[str] = []
|
|
618
|
+
inline_parts: list[str] = []
|
|
619
|
+
|
|
620
|
+
mixed_first_non_none_index: int | None = None
|
|
621
|
+
mixed_last_non_none_index: int | None = None
|
|
622
|
+
for i, child in enumerate(children):
|
|
623
|
+
if child is None:
|
|
624
|
+
continue
|
|
625
|
+
if mixed_first_non_none_index is None:
|
|
626
|
+
mixed_first_non_none_index = i
|
|
627
|
+
mixed_last_non_none_index = i
|
|
628
|
+
|
|
629
|
+
def flush_inline() -> None:
|
|
630
|
+
if not inline_parts:
|
|
631
|
+
return
|
|
632
|
+
line = "".join(inline_parts).strip(" ")
|
|
633
|
+
inline_parts.clear()
|
|
634
|
+
if line:
|
|
635
|
+
mixed_multiline_lines.append(f"{' ' * ((indent + 1) * indent_size)}{line}")
|
|
636
|
+
|
|
637
|
+
for i, child in enumerate(children):
|
|
638
|
+
if child is None:
|
|
639
|
+
continue
|
|
640
|
+
|
|
641
|
+
if child.name == "#text":
|
|
642
|
+
data = child.data or ""
|
|
643
|
+
if not data.strip():
|
|
644
|
+
# Drop leading/trailing formatting whitespace.
|
|
645
|
+
if i == mixed_first_non_none_index or i == mixed_last_non_none_index:
|
|
646
|
+
continue
|
|
647
|
+
# Preserve intentional small spacing, but treat formatting whitespace
|
|
648
|
+
# as a separator between inline runs (new line).
|
|
649
|
+
if "\n" in data or "\r" in data or "\t" in data or len(data) > 2:
|
|
650
|
+
flush_inline()
|
|
651
|
+
else:
|
|
652
|
+
inline_parts.append(data)
|
|
653
|
+
continue
|
|
654
|
+
|
|
655
|
+
data = _normalize_formatting_whitespace(data)
|
|
656
|
+
inline_parts.append(_escape_text(data))
|
|
657
|
+
continue
|
|
658
|
+
|
|
659
|
+
if _is_layout_blocky_element(child):
|
|
660
|
+
flush_inline()
|
|
661
|
+
mixed_multiline_lines.append(
|
|
662
|
+
_node_to_html(child, indent + 1, indent_size, pretty=True, in_pre=content_pre)
|
|
663
|
+
)
|
|
664
|
+
continue
|
|
665
|
+
|
|
666
|
+
# Inline element: keep it in the current line without leading indentation.
|
|
667
|
+
inline_parts.append(_node_to_html(child, 0, indent_size, pretty=True, in_pre=content_pre))
|
|
668
|
+
|
|
669
|
+
flush_inline()
|
|
670
|
+
inner = "\n".join(line for line in mixed_multiline_lines if line)
|
|
671
|
+
return f"{prefix}{open_tag}\n{inner}\n{prefix}{serialize_end_tag(name)}"
|
|
672
|
+
|
|
346
673
|
has_comment = False
|
|
347
674
|
has_element = False
|
|
348
675
|
has_whitespace_between_elements = False
|
|
@@ -388,32 +715,32 @@ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool
|
|
|
388
715
|
break
|
|
389
716
|
|
|
390
717
|
if has_element and has_whitespace_between_elements and not has_comment and can_indent_non_whitespace_text:
|
|
391
|
-
|
|
718
|
+
element_multiline_lines: list[str] = []
|
|
392
719
|
for child in children:
|
|
393
720
|
if child is None:
|
|
394
721
|
continue
|
|
395
722
|
if child.name == "#text":
|
|
396
723
|
text = _collapse_html_whitespace(child.data or "")
|
|
397
724
|
if text:
|
|
398
|
-
|
|
725
|
+
element_multiline_lines.append(f"{' ' * ((indent + 1) * indent_size)}{_escape_text(text)}")
|
|
399
726
|
continue
|
|
400
727
|
child_html = _node_to_html(child, indent + 1, indent_size, pretty=True, in_pre=content_pre)
|
|
401
728
|
if child_html:
|
|
402
|
-
|
|
403
|
-
if
|
|
404
|
-
inner = "\n".join(
|
|
729
|
+
element_multiline_lines.append(child_html)
|
|
730
|
+
if element_multiline_lines:
|
|
731
|
+
inner = "\n".join(element_multiline_lines)
|
|
405
732
|
return f"{prefix}{open_tag}\n{inner}\n{prefix}{serialize_end_tag(name)}"
|
|
406
733
|
|
|
407
734
|
inner_parts: list[str] = []
|
|
408
735
|
|
|
409
|
-
|
|
410
|
-
|
|
736
|
+
compact_first_non_none_index: int | None = None
|
|
737
|
+
compact_last_non_none_index: int | None = None
|
|
411
738
|
for i, child in enumerate(children):
|
|
412
739
|
if child is None:
|
|
413
740
|
continue
|
|
414
|
-
if
|
|
415
|
-
|
|
416
|
-
|
|
741
|
+
if compact_first_non_none_index is None:
|
|
742
|
+
compact_first_non_none_index = i
|
|
743
|
+
compact_last_non_none_index = i
|
|
417
744
|
|
|
418
745
|
for i, child in enumerate(children):
|
|
419
746
|
if child is None:
|
|
@@ -423,15 +750,14 @@ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool
|
|
|
423
750
|
data = child.data or ""
|
|
424
751
|
if not data.strip():
|
|
425
752
|
# Drop leading/trailing formatting whitespace in compact mode.
|
|
426
|
-
if i ==
|
|
753
|
+
if i == compact_first_non_none_index or i == compact_last_non_none_index:
|
|
427
754
|
continue
|
|
428
755
|
# Preserve intentional small spacing, but collapse large formatting gaps.
|
|
429
756
|
if "\n" in data or "\r" in data or "\t" in data or len(data) > 2:
|
|
430
757
|
inner_parts.append(" ")
|
|
431
758
|
continue
|
|
432
759
|
|
|
433
|
-
|
|
434
|
-
data = _normalize_formatting_whitespace(data)
|
|
760
|
+
data = _normalize_formatting_whitespace(data)
|
|
435
761
|
child_html = _escape_text(data) if data else ""
|
|
436
762
|
else:
|
|
437
763
|
# Even when we can't safely insert whitespace *between* siblings, we can
|