justhtml 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +6 -0
- justhtml/__main__.py +49 -16
- justhtml/entities.py +45 -7
- justhtml/errors.py +9 -0
- justhtml/node.py +358 -89
- justhtml/parser.py +70 -14
- justhtml/sanitize.py +763 -0
- justhtml/selector.py +114 -18
- justhtml/serialize.py +332 -28
- justhtml/tokenizer.py +249 -179
- justhtml/tokens.py +8 -3
- justhtml/treebuilder.py +50 -14
- justhtml/treebuilder_modes.py +100 -36
- justhtml-0.24.0.dist-info/METADATA +192 -0
- justhtml-0.24.0.dist-info/RECORD +24 -0
- {justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.12.0.dist-info/METADATA +0 -164
- justhtml-0.12.0.dist-info/RECORD +0 -23
- {justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/WHEEL +0 -0
- {justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/entry_points.txt +0 -0
justhtml/selector.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
|
+
from functools import lru_cache
|
|
6
7
|
from typing import Any
|
|
7
8
|
|
|
8
9
|
|
|
@@ -529,6 +530,14 @@ class SelectorMatcher:
|
|
|
529
530
|
|
|
530
531
|
__slots__ = ()
|
|
531
532
|
|
|
533
|
+
def _unquote_pseudo_arg(self, arg: str) -> str:
|
|
534
|
+
arg = arg.strip()
|
|
535
|
+
if len(arg) >= 2 and arg[0] == arg[-1] and arg[0] in ('"', "'"):
|
|
536
|
+
quote = arg[0]
|
|
537
|
+
# Minimal unescaping for common cases like :contains("click me")
|
|
538
|
+
return arg[1:-1].replace("\\" + quote, quote).replace("\\\\", "\\")
|
|
539
|
+
return arg
|
|
540
|
+
|
|
532
541
|
def matches(self, node: Any, selector: ParsedSelector | CompoundSelector | SimpleSelector) -> bool:
|
|
533
542
|
"""Check if a node matches a parsed selector."""
|
|
534
543
|
if isinstance(selector, SelectorList):
|
|
@@ -724,6 +733,17 @@ class SelectorMatcher:
|
|
|
724
733
|
return parent.name in ("#document", "#document-fragment")
|
|
725
734
|
return False
|
|
726
735
|
|
|
736
|
+
if name == "contains":
|
|
737
|
+
if selector.arg is None:
|
|
738
|
+
raise SelectorError(":contains() requires a string argument")
|
|
739
|
+
needle = self._unquote_pseudo_arg(selector.arg)
|
|
740
|
+
if needle == "":
|
|
741
|
+
return True
|
|
742
|
+
# Non-standard (jQuery-style) pseudo-class: match elements whose descendant
|
|
743
|
+
# text contains the substring. We use `to_text()` to approximate textContent.
|
|
744
|
+
haystack: str = node.to_text(separator=" ", strip=True)
|
|
745
|
+
return needle in haystack
|
|
746
|
+
|
|
727
747
|
if name == "first-of-type":
|
|
728
748
|
return self._is_first_of_type(node)
|
|
729
749
|
|
|
@@ -743,7 +763,7 @@ class SelectorMatcher:
|
|
|
743
763
|
"""Get only element children (exclude text, comments, etc.)."""
|
|
744
764
|
if not parent or not parent.has_child_nodes():
|
|
745
765
|
return []
|
|
746
|
-
return [c for c in parent.children if
|
|
766
|
+
return [c for c in parent.children if not c.name.startswith("#")]
|
|
747
767
|
|
|
748
768
|
def _get_previous_sibling(self, node: Any) -> Any | None:
|
|
749
769
|
"""Get the previous element sibling. Returns None if node is first or not found."""
|
|
@@ -755,7 +775,7 @@ class SelectorMatcher:
|
|
|
755
775
|
for child in parent.children:
|
|
756
776
|
if child is node:
|
|
757
777
|
return prev
|
|
758
|
-
if
|
|
778
|
+
if not child.name.startswith("#"):
|
|
759
779
|
prev = child
|
|
760
780
|
return None # node not in parent.children (detached)
|
|
761
781
|
|
|
@@ -903,7 +923,12 @@ def parse_selector(selector_string: str) -> ParsedSelector:
|
|
|
903
923
|
if not selector_string or not selector_string.strip():
|
|
904
924
|
raise SelectorError("Empty selector")
|
|
905
925
|
|
|
906
|
-
|
|
926
|
+
return _parse_selector_cached(selector_string.strip())
|
|
927
|
+
|
|
928
|
+
|
|
929
|
+
@lru_cache(maxsize=512)
|
|
930
|
+
def _parse_selector_cached(selector_string: str) -> ParsedSelector:
|
|
931
|
+
tokenizer = SelectorTokenizer(selector_string)
|
|
907
932
|
tokens = tokenizer.tokenize()
|
|
908
933
|
parser = SelectorParser(tokens)
|
|
909
934
|
return parser.parse()
|
|
@@ -913,6 +938,51 @@ def parse_selector(selector_string: str) -> ParsedSelector:
|
|
|
913
938
|
_matcher: SelectorMatcher = SelectorMatcher()
|
|
914
939
|
|
|
915
940
|
|
|
941
|
+
def _is_simple_tag_selector(selector: str) -> bool:
|
|
942
|
+
if not selector:
|
|
943
|
+
return False
|
|
944
|
+
ch0 = selector[0]
|
|
945
|
+
if not (ch0.isalpha() or ch0 == "_" or ch0 == "-" or ord(ch0) > 127):
|
|
946
|
+
return False
|
|
947
|
+
for ch in selector[1:]:
|
|
948
|
+
if ch.isalnum() or ch == "_" or ch == "-" or ord(ch) > 127:
|
|
949
|
+
continue
|
|
950
|
+
return False
|
|
951
|
+
return True
|
|
952
|
+
|
|
953
|
+
|
|
954
|
+
def _query_descendants_tag(node: Any, tag_lower: str, results: list[Any]) -> None:
|
|
955
|
+
results_append = results.append
|
|
956
|
+
|
|
957
|
+
stack: list[Any] = []
|
|
958
|
+
|
|
959
|
+
root_children = node.children
|
|
960
|
+
if root_children:
|
|
961
|
+
stack.extend(reversed(root_children))
|
|
962
|
+
|
|
963
|
+
if node.name == "template" and node.namespace == "html":
|
|
964
|
+
template_content = node.template_content
|
|
965
|
+
if template_content:
|
|
966
|
+
stack.append(template_content)
|
|
967
|
+
|
|
968
|
+
while stack:
|
|
969
|
+
current = stack.pop()
|
|
970
|
+
|
|
971
|
+
name = current.name
|
|
972
|
+
if not name.startswith("#"):
|
|
973
|
+
if name == tag_lower or name.lower() == tag_lower:
|
|
974
|
+
results_append(current)
|
|
975
|
+
|
|
976
|
+
children = current.children
|
|
977
|
+
if children:
|
|
978
|
+
stack.extend(reversed(children))
|
|
979
|
+
|
|
980
|
+
if name == "template" and current.namespace == "html":
|
|
981
|
+
template_content = current.template_content
|
|
982
|
+
if template_content:
|
|
983
|
+
stack.append(template_content)
|
|
984
|
+
|
|
985
|
+
|
|
916
986
|
def query(root: Any, selector_string: str) -> list[Any]:
|
|
917
987
|
"""
|
|
918
988
|
Query the DOM tree starting from root, returning all matching elements.
|
|
@@ -927,27 +997,53 @@ def query(root: Any, selector_string: str) -> list[Any]:
|
|
|
927
997
|
Returns:
|
|
928
998
|
A list of matching nodes
|
|
929
999
|
"""
|
|
930
|
-
|
|
1000
|
+
selector_string = selector_string.strip()
|
|
1001
|
+
if not selector_string:
|
|
1002
|
+
raise SelectorError("Empty selector")
|
|
1003
|
+
|
|
931
1004
|
results: list[Any] = []
|
|
1005
|
+
|
|
1006
|
+
if _is_simple_tag_selector(selector_string):
|
|
1007
|
+
_query_descendants_tag(root, selector_string.lower(), results)
|
|
1008
|
+
return results
|
|
1009
|
+
|
|
1010
|
+
selector = _parse_selector_cached(selector_string)
|
|
932
1011
|
_query_descendants(root, selector, results)
|
|
933
1012
|
return results
|
|
934
1013
|
|
|
935
1014
|
|
|
936
1015
|
def _query_descendants(node: Any, selector: ParsedSelector, results: list[Any]) -> None:
|
|
937
|
-
"""
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
1016
|
+
"""Search for matching nodes in descendants."""
|
|
1017
|
+
matcher_matches = _matcher.matches
|
|
1018
|
+
results_append = results.append
|
|
1019
|
+
|
|
1020
|
+
# querySelectorAll searches descendants of root, not including root itself.
|
|
1021
|
+
stack: list[Any] = []
|
|
1022
|
+
|
|
1023
|
+
root_children = node.children
|
|
1024
|
+
if root_children:
|
|
1025
|
+
stack.extend(reversed(root_children))
|
|
1026
|
+
|
|
1027
|
+
if node.name == "template" and node.namespace == "html":
|
|
1028
|
+
template_content = node.template_content
|
|
1029
|
+
if template_content:
|
|
1030
|
+
stack.append(template_content)
|
|
1031
|
+
|
|
1032
|
+
while stack:
|
|
1033
|
+
current = stack.pop()
|
|
1034
|
+
|
|
1035
|
+
name = current.name
|
|
1036
|
+
if not name.startswith("#") and matcher_matches(current, selector):
|
|
1037
|
+
results_append(current)
|
|
1038
|
+
|
|
1039
|
+
children = current.children
|
|
1040
|
+
if children:
|
|
1041
|
+
stack.extend(reversed(children))
|
|
1042
|
+
|
|
1043
|
+
if name == "template" and current.namespace == "html":
|
|
1044
|
+
template_content = current.template_content
|
|
1045
|
+
if template_content:
|
|
1046
|
+
stack.append(template_content)
|
|
951
1047
|
|
|
952
1048
|
|
|
953
1049
|
def matches(node: Any, selector_string: str) -> bool:
|
justhtml/serialize.py
CHANGED
|
@@ -6,7 +6,8 @@ from __future__ import annotations
|
|
|
6
6
|
|
|
7
7
|
from typing import Any
|
|
8
8
|
|
|
9
|
-
from .constants import FOREIGN_ATTRIBUTE_ADJUSTMENTS, VOID_ELEMENTS
|
|
9
|
+
from .constants import FOREIGN_ATTRIBUTE_ADJUSTMENTS, SPECIAL_ELEMENTS, VOID_ELEMENTS
|
|
10
|
+
from .sanitize import DEFAULT_DOCUMENT_POLICY, DEFAULT_POLICY, SanitizationPolicy, sanitize
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
def _escape_text(text: str | None) -> str:
|
|
@@ -16,7 +17,9 @@ def _escape_text(text: str | None) -> str:
|
|
|
16
17
|
return str(text).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
|
|
17
18
|
|
|
18
19
|
|
|
19
|
-
def _choose_attr_quote(value: str | None) -> str:
|
|
20
|
+
def _choose_attr_quote(value: str | None, forced_quote_char: str | None = None) -> str:
|
|
21
|
+
if forced_quote_char in {'"', "'"}:
|
|
22
|
+
return forced_quote_char
|
|
20
23
|
if value is None:
|
|
21
24
|
return '"'
|
|
22
25
|
value = str(value)
|
|
@@ -25,11 +28,13 @@ def _choose_attr_quote(value: str | None) -> str:
|
|
|
25
28
|
return '"'
|
|
26
29
|
|
|
27
30
|
|
|
28
|
-
def _escape_attr_value(value: str | None, quote_char: str) -> str:
|
|
31
|
+
def _escape_attr_value(value: str | None, quote_char: str, *, escape_lt_in_attrs: bool = False) -> str:
|
|
29
32
|
if value is None:
|
|
30
33
|
return ""
|
|
31
34
|
value = str(value)
|
|
32
35
|
value = value.replace("&", "&amp;")
|
|
36
|
+
if escape_lt_in_attrs:
|
|
37
|
+
value = value.replace("<", "&lt;")
|
|
33
38
|
# Note: html5lib's default serializer does not escape '>' in attrs.
|
|
34
39
|
if quote_char == '"':
|
|
35
40
|
return value.replace('"', "&quot;")
|
|
@@ -40,8 +45,6 @@ def _can_unquote_attr_value(value: str | None) -> bool:
|
|
|
40
45
|
if value is None:
|
|
41
46
|
return False
|
|
42
47
|
value = str(value)
|
|
43
|
-
# html5lib's serializer unquotes aggressively; match fixture expectations.
|
|
44
|
-
# Disallow whitespace and characters that would terminate/ambiguate the value.
|
|
45
48
|
for ch in value:
|
|
46
49
|
if ch == ">":
|
|
47
50
|
return False
|
|
@@ -52,22 +55,56 @@ def _can_unquote_attr_value(value: str | None) -> bool:
|
|
|
52
55
|
return True
|
|
53
56
|
|
|
54
57
|
|
|
55
|
-
def
|
|
58
|
+
def _serializer_minimize_attr_value(name: str, value: str | None, minimize_boolean_attributes: bool) -> bool:
|
|
59
|
+
if not minimize_boolean_attributes:
|
|
60
|
+
return False
|
|
61
|
+
if value is None or value == "":
|
|
62
|
+
return True
|
|
63
|
+
return str(value).lower() == str(name).lower()
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def serialize_start_tag(
|
|
67
|
+
name: str,
|
|
68
|
+
attrs: dict[str, str | None] | None,
|
|
69
|
+
*,
|
|
70
|
+
quote_attr_values: bool = True,
|
|
71
|
+
minimize_boolean_attributes: bool = True,
|
|
72
|
+
quote_char: str | None = None,
|
|
73
|
+
escape_lt_in_attrs: bool = False,
|
|
74
|
+
use_trailing_solidus: bool = False,
|
|
75
|
+
is_void: bool = False,
|
|
76
|
+
) -> str:
|
|
56
77
|
attrs = attrs or {}
|
|
57
78
|
parts: list[str] = ["<", name]
|
|
58
79
|
if attrs:
|
|
59
80
|
for key, value in attrs.items():
|
|
60
|
-
if
|
|
81
|
+
if _serializer_minimize_attr_value(key, value, minimize_boolean_attributes):
|
|
61
82
|
parts.extend([" ", key])
|
|
83
|
+
continue
|
|
84
|
+
|
|
85
|
+
if value is None:
|
|
86
|
+
parts.extend([" ", key, '=""'])
|
|
87
|
+
continue
|
|
88
|
+
|
|
89
|
+
value_str = str(value)
|
|
90
|
+
if value_str == "":
|
|
91
|
+
parts.extend([" ", key, '=""'])
|
|
92
|
+
continue
|
|
93
|
+
|
|
94
|
+
if not quote_attr_values and _can_unquote_attr_value(value_str):
|
|
95
|
+
escaped = value_str.replace("&", "&amp;")
|
|
96
|
+
if escape_lt_in_attrs:
|
|
97
|
+
escaped = escaped.replace("<", "&lt;")
|
|
98
|
+
parts.extend([" ", key, "=", escaped])
|
|
62
99
|
else:
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
100
|
+
quote = _choose_attr_quote(value_str, quote_char)
|
|
101
|
+
escaped = _escape_attr_value(value_str, quote, escape_lt_in_attrs=escape_lt_in_attrs)
|
|
102
|
+
parts.extend([" ", key, "=", quote, escaped, quote])
|
|
103
|
+
|
|
104
|
+
if use_trailing_solidus and is_void:
|
|
105
|
+
parts.append(" />")
|
|
106
|
+
else:
|
|
107
|
+
parts.append(">")
|
|
71
108
|
return "".join(parts)
|
|
72
109
|
|
|
73
110
|
|
|
@@ -75,27 +112,171 @@ def serialize_end_tag(name: str) -> str:
|
|
|
75
112
|
return f"</{name}>"
|
|
76
113
|
|
|
77
114
|
|
|
78
|
-
def to_html(
|
|
115
|
+
def to_html(
|
|
116
|
+
node: Any,
|
|
117
|
+
indent: int = 0,
|
|
118
|
+
indent_size: int = 2,
|
|
119
|
+
*,
|
|
120
|
+
pretty: bool = True,
|
|
121
|
+
safe: bool = True,
|
|
122
|
+
policy: SanitizationPolicy | None = None,
|
|
123
|
+
) -> str:
|
|
79
124
|
"""Convert node to HTML string."""
|
|
125
|
+
if safe:
|
|
126
|
+
if policy is None and node.name == "#document":
|
|
127
|
+
node = sanitize(node, policy=DEFAULT_DOCUMENT_POLICY)
|
|
128
|
+
else:
|
|
129
|
+
node = sanitize(node, policy=policy or DEFAULT_POLICY)
|
|
80
130
|
if node.name == "#document":
|
|
81
131
|
# Document root - just render children
|
|
82
132
|
parts: list[str] = []
|
|
83
133
|
for child in node.children or []:
|
|
84
|
-
parts.append(_node_to_html(child, indent, indent_size, pretty))
|
|
134
|
+
parts.append(_node_to_html(child, indent, indent_size, pretty, in_pre=False))
|
|
85
135
|
return "\n".join(parts) if pretty else "".join(parts)
|
|
86
|
-
return _node_to_html(node, indent, indent_size, pretty)
|
|
136
|
+
return _node_to_html(node, indent, indent_size, pretty, in_pre=False)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
_PREFORMATTED_ELEMENTS: set[str] = {"pre", "textarea", "code"}
|
|
140
|
+
|
|
141
|
+
# Elements whose text content must not be normalized (e.g. scripts/styles).
|
|
142
|
+
_RAWTEXT_ELEMENTS: set[str] = {"script", "style"}
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _collapse_html_whitespace(text: str) -> str:
|
|
146
|
+
"""Collapse HTML whitespace runs to a single space and trim edges.
|
|
147
|
+
|
|
148
|
+
This matches how HTML rendering treats most whitespace in text nodes, and is
|
|
149
|
+
used only for pretty-printing in non-preformatted contexts.
|
|
150
|
+
"""
|
|
151
|
+
if not text:
|
|
152
|
+
return ""
|
|
153
|
+
|
|
154
|
+
parts: list[str] = []
|
|
155
|
+
in_whitespace = False
|
|
156
|
+
for ch in text:
|
|
157
|
+
if ch in {" ", "\t", "\n", "\f", "\r"}:
|
|
158
|
+
if not in_whitespace:
|
|
159
|
+
parts.append(" ")
|
|
160
|
+
in_whitespace = True
|
|
161
|
+
continue
|
|
162
|
+
|
|
163
|
+
parts.append(ch)
|
|
164
|
+
in_whitespace = False
|
|
165
|
+
|
|
166
|
+
collapsed = "".join(parts)
|
|
167
|
+
return collapsed.strip(" ")
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def _normalize_formatting_whitespace(text: str) -> str:
|
|
171
|
+
"""Normalize formatting whitespace within a text node.
|
|
87
172
|
|
|
173
|
+
Converts newlines/tabs/CR/FF to regular spaces and collapses runs that
|
|
174
|
+
include such formatting whitespace to a single space.
|
|
88
175
|
|
|
89
|
-
|
|
176
|
+
Pure space runs are preserved as-is (so existing double-spaces remain).
|
|
177
|
+
"""
|
|
178
|
+
if not text:
|
|
179
|
+
return ""
|
|
180
|
+
|
|
181
|
+
if "\n" not in text and "\r" not in text and "\t" not in text and "\f" not in text:
|
|
182
|
+
return text
|
|
183
|
+
|
|
184
|
+
starts_with_formatting = text[0] in {"\n", "\r", "\t", "\f"}
|
|
185
|
+
ends_with_formatting = text[-1] in {"\n", "\r", "\t", "\f"}
|
|
186
|
+
|
|
187
|
+
out: list[str] = []
|
|
188
|
+
in_ws = False
|
|
189
|
+
saw_formatting_ws = False
|
|
190
|
+
|
|
191
|
+
for ch in text:
|
|
192
|
+
if ch == " ":
|
|
193
|
+
if in_ws:
|
|
194
|
+
# Only collapse if this whitespace run included formatting whitespace.
|
|
195
|
+
if saw_formatting_ws:
|
|
196
|
+
continue
|
|
197
|
+
out.append(" ")
|
|
198
|
+
continue
|
|
199
|
+
in_ws = True
|
|
200
|
+
saw_formatting_ws = False
|
|
201
|
+
out.append(" ")
|
|
202
|
+
continue
|
|
203
|
+
|
|
204
|
+
if ch in {"\n", "\r", "\t", "\f"}:
|
|
205
|
+
if in_ws:
|
|
206
|
+
saw_formatting_ws = True
|
|
207
|
+
continue
|
|
208
|
+
in_ws = True
|
|
209
|
+
saw_formatting_ws = True
|
|
210
|
+
out.append(" ")
|
|
211
|
+
continue
|
|
212
|
+
|
|
213
|
+
in_ws = False
|
|
214
|
+
saw_formatting_ws = False
|
|
215
|
+
out.append(ch)
|
|
216
|
+
|
|
217
|
+
normalized = "".join(out)
|
|
218
|
+
if starts_with_formatting and normalized.startswith(" "):
|
|
219
|
+
normalized = normalized[1:]
|
|
220
|
+
if ends_with_formatting and normalized.endswith(" "):
|
|
221
|
+
normalized = normalized[:-1]
|
|
222
|
+
return normalized
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _is_whitespace_text_node(node: Any) -> bool:
|
|
226
|
+
return node.name == "#text" and (node.data or "").strip() == ""
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _should_pretty_indent_children(children: list[Any]) -> bool:
|
|
230
|
+
for child in children:
|
|
231
|
+
if child is None:
|
|
232
|
+
continue
|
|
233
|
+
name = child.name
|
|
234
|
+
if name == "#comment":
|
|
235
|
+
return False
|
|
236
|
+
if name == "#text" and (child.data or "").strip():
|
|
237
|
+
return False
|
|
238
|
+
|
|
239
|
+
element_children: list[Any] = [
|
|
240
|
+
child for child in children if child is not None and child.name not in {"#text", "#comment"}
|
|
241
|
+
]
|
|
242
|
+
if not element_children:
|
|
243
|
+
return True
|
|
244
|
+
if len(element_children) == 1:
|
|
245
|
+
only_child = element_children[0]
|
|
246
|
+
if only_child.name in SPECIAL_ELEMENTS:
|
|
247
|
+
return True
|
|
248
|
+
if only_child.name == "a":
|
|
249
|
+
# If an anchor wraps block-ish content (valid HTML5), treat it as block-ish
|
|
250
|
+
# for pretty-printing so the parent can indent it on its own line.
|
|
251
|
+
for grandchild in only_child.children or []:
|
|
252
|
+
if grandchild is None:
|
|
253
|
+
continue
|
|
254
|
+
if grandchild.name in SPECIAL_ELEMENTS:
|
|
255
|
+
return True
|
|
256
|
+
return False
|
|
257
|
+
|
|
258
|
+
# Safe indentation rule: only insert inter-element whitespace when we won't
|
|
259
|
+
# be placing it between two adjacent inline/phrasing elements.
|
|
260
|
+
prev_is_special = element_children[0].name in SPECIAL_ELEMENTS
|
|
261
|
+
for child in element_children[1:]:
|
|
262
|
+
current_is_special = child.name in SPECIAL_ELEMENTS
|
|
263
|
+
if not prev_is_special and not current_is_special:
|
|
264
|
+
return False
|
|
265
|
+
prev_is_special = current_is_special
|
|
266
|
+
return True
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool = True, *, in_pre: bool) -> str:
|
|
90
270
|
"""Helper to convert a node to HTML."""
|
|
91
|
-
prefix = " " * (indent * indent_size) if pretty else ""
|
|
92
|
-
newline = "\n" if pretty else ""
|
|
271
|
+
prefix = " " * (indent * indent_size) if pretty and not in_pre else ""
|
|
93
272
|
name: str = node.name
|
|
273
|
+
content_pre = in_pre or name in _PREFORMATTED_ELEMENTS
|
|
274
|
+
newline = "\n" if pretty and not content_pre else ""
|
|
94
275
|
|
|
95
276
|
# Text node
|
|
96
277
|
if name == "#text":
|
|
97
278
|
text: str | None = node.data
|
|
98
|
-
if pretty:
|
|
279
|
+
if pretty and not in_pre:
|
|
99
280
|
text = text.strip() if text else ""
|
|
100
281
|
if text:
|
|
101
282
|
return f"{prefix}{_escape_text(text)}"
|
|
@@ -114,7 +295,7 @@ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool
|
|
|
114
295
|
if name == "#document-fragment":
|
|
115
296
|
parts: list[str] = []
|
|
116
297
|
for child in node.children or []:
|
|
117
|
-
child_html = _node_to_html(child, indent, indent_size, pretty)
|
|
298
|
+
child_html = _node_to_html(child, indent, indent_size, pretty, in_pre=in_pre)
|
|
118
299
|
if child_html:
|
|
119
300
|
parts.append(child_html)
|
|
120
301
|
return newline.join(parts) if pretty else "".join(parts)
|
|
@@ -130,20 +311,143 @@ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool
|
|
|
130
311
|
return f"{prefix}{open_tag}"
|
|
131
312
|
|
|
132
313
|
# Elements with children
|
|
133
|
-
|
|
314
|
+
# Template special handling: HTML templates store contents in `template_content`.
|
|
315
|
+
if name == "template" and node.namespace in {None, "html"} and node.template_content is not None:
|
|
316
|
+
children: list[Any] = node.template_content.children or []
|
|
317
|
+
else:
|
|
318
|
+
children = node.children or []
|
|
134
319
|
if not children:
|
|
135
320
|
return f"{prefix}{open_tag}{serialize_end_tag(name)}"
|
|
136
321
|
|
|
137
322
|
# Check if all children are text-only (inline rendering)
|
|
138
323
|
all_text = all(c.name == "#text" for c in children)
|
|
139
324
|
|
|
140
|
-
if all_text and pretty:
|
|
141
|
-
|
|
325
|
+
if all_text and pretty and not content_pre:
|
|
326
|
+
# Serializer controls sanitization at the to_html() entry point; avoid
|
|
327
|
+
# implicit re-sanitization during rendering.
|
|
328
|
+
text_content = node.to_text(separator="", strip=False, safe=False)
|
|
329
|
+
if name not in _RAWTEXT_ELEMENTS:
|
|
330
|
+
text_content = _collapse_html_whitespace(text_content)
|
|
331
|
+
return f"{prefix}{open_tag}{_escape_text(text_content)}{serialize_end_tag(name)}"
|
|
332
|
+
|
|
333
|
+
if pretty and content_pre:
|
|
334
|
+
inner = "".join(
|
|
335
|
+
_node_to_html(child, indent + 1, indent_size, pretty, in_pre=True)
|
|
336
|
+
for child in children
|
|
337
|
+
if child is not None
|
|
338
|
+
)
|
|
339
|
+
return f"{prefix}{open_tag}{inner}{serialize_end_tag(name)}"
|
|
340
|
+
|
|
341
|
+
if pretty and not content_pre and not _should_pretty_indent_children(children):
|
|
342
|
+
# For block-ish elements that contain only element children and whitespace-only
|
|
343
|
+
# text nodes, we can still format each child on its own line (only when there
|
|
344
|
+
# is already whitespace separating element siblings).
|
|
345
|
+
if name in SPECIAL_ELEMENTS:
|
|
346
|
+
has_comment = False
|
|
347
|
+
has_element = False
|
|
348
|
+
has_whitespace_between_elements = False
|
|
349
|
+
|
|
350
|
+
first_element_index: int | None = None
|
|
351
|
+
last_element_index: int | None = None
|
|
352
|
+
|
|
353
|
+
previous_was_element = False
|
|
354
|
+
saw_whitespace_since_last_element = False
|
|
355
|
+
for i, child in enumerate(children):
|
|
356
|
+
if child is None:
|
|
357
|
+
continue
|
|
358
|
+
if child.name == "#comment":
|
|
359
|
+
has_comment = True
|
|
360
|
+
break
|
|
361
|
+
if child.name == "#text":
|
|
362
|
+
# Track whether there is already whitespace between element siblings.
|
|
363
|
+
if previous_was_element and not (child.data or "").strip():
|
|
364
|
+
saw_whitespace_since_last_element = True
|
|
365
|
+
continue
|
|
366
|
+
|
|
367
|
+
has_element = True
|
|
368
|
+
if first_element_index is None:
|
|
369
|
+
first_element_index = i
|
|
370
|
+
last_element_index = i
|
|
371
|
+
if previous_was_element and saw_whitespace_since_last_element:
|
|
372
|
+
has_whitespace_between_elements = True
|
|
373
|
+
previous_was_element = True
|
|
374
|
+
saw_whitespace_since_last_element = False
|
|
375
|
+
|
|
376
|
+
can_indent_non_whitespace_text = True
|
|
377
|
+
if has_element and first_element_index is not None and last_element_index is not None:
|
|
378
|
+
for i, child in enumerate(children):
|
|
379
|
+
if child is None or child.name != "#text":
|
|
380
|
+
continue
|
|
381
|
+
if not (child.data or "").strip():
|
|
382
|
+
continue
|
|
383
|
+
# Only allow non-whitespace text *after* the last element.
|
|
384
|
+
# Leading text or text between elements could gain new spaces
|
|
385
|
+
# due to indentation/newlines.
|
|
386
|
+
if i < first_element_index or first_element_index < i < last_element_index:
|
|
387
|
+
can_indent_non_whitespace_text = False
|
|
388
|
+
break
|
|
389
|
+
|
|
390
|
+
if has_element and has_whitespace_between_elements and not has_comment and can_indent_non_whitespace_text:
|
|
391
|
+
inner_lines: list[str] = []
|
|
392
|
+
for child in children:
|
|
393
|
+
if child is None:
|
|
394
|
+
continue
|
|
395
|
+
if child.name == "#text":
|
|
396
|
+
text = _collapse_html_whitespace(child.data or "")
|
|
397
|
+
if text:
|
|
398
|
+
inner_lines.append(f"{' ' * ((indent + 1) * indent_size)}{_escape_text(text)}")
|
|
399
|
+
continue
|
|
400
|
+
child_html = _node_to_html(child, indent + 1, indent_size, pretty=True, in_pre=content_pre)
|
|
401
|
+
if child_html:
|
|
402
|
+
inner_lines.append(child_html)
|
|
403
|
+
if inner_lines:
|
|
404
|
+
inner = "\n".join(inner_lines)
|
|
405
|
+
return f"{prefix}{open_tag}\n{inner}\n{prefix}{serialize_end_tag(name)}"
|
|
406
|
+
|
|
407
|
+
inner_parts: list[str] = []
|
|
408
|
+
|
|
409
|
+
first_non_none_index: int | None = None
|
|
410
|
+
last_non_none_index: int | None = None
|
|
411
|
+
for i, child in enumerate(children):
|
|
412
|
+
if child is None:
|
|
413
|
+
continue
|
|
414
|
+
if first_non_none_index is None:
|
|
415
|
+
first_non_none_index = i
|
|
416
|
+
last_non_none_index = i
|
|
417
|
+
|
|
418
|
+
for i, child in enumerate(children):
|
|
419
|
+
if child is None:
|
|
420
|
+
continue
|
|
421
|
+
|
|
422
|
+
if child.name == "#text":
|
|
423
|
+
data = child.data or ""
|
|
424
|
+
if not data.strip():
|
|
425
|
+
# Drop leading/trailing formatting whitespace in compact mode.
|
|
426
|
+
if i == first_non_none_index or i == last_non_none_index:
|
|
427
|
+
continue
|
|
428
|
+
# Preserve intentional small spacing, but collapse large formatting gaps.
|
|
429
|
+
if "\n" in data or "\r" in data or "\t" in data or len(data) > 2:
|
|
430
|
+
inner_parts.append(" ")
|
|
431
|
+
continue
|
|
432
|
+
|
|
433
|
+
if not content_pre and name not in _RAWTEXT_ELEMENTS:
|
|
434
|
+
data = _normalize_formatting_whitespace(data)
|
|
435
|
+
child_html = _escape_text(data) if data else ""
|
|
436
|
+
else:
|
|
437
|
+
# Even when we can't safely insert whitespace *between* siblings, we can
|
|
438
|
+
# still pretty-print each element subtree to improve readability.
|
|
439
|
+
child_html = _node_to_html(child, 0, indent_size, pretty=True, in_pre=content_pre)
|
|
440
|
+
if child_html:
|
|
441
|
+
inner_parts.append(child_html)
|
|
442
|
+
|
|
443
|
+
return f"{prefix}{open_tag}{''.join(inner_parts)}{serialize_end_tag(name)}"
|
|
142
444
|
|
|
143
445
|
# Render with child indentation
|
|
144
446
|
parts = [f"{prefix}{open_tag}"]
|
|
145
447
|
for child in children:
|
|
146
|
-
|
|
448
|
+
if pretty and not content_pre and _is_whitespace_text_node(child):
|
|
449
|
+
continue
|
|
450
|
+
child_html = _node_to_html(child, indent + 1, indent_size, pretty, in_pre=content_pre)
|
|
147
451
|
if child_html:
|
|
148
452
|
parts.append(child_html)
|
|
149
453
|
parts.append(f"{prefix}{serialize_end_tag(name)}")
|
|
@@ -180,7 +484,7 @@ def _node_to_test_format(node: Any, indent: int) -> str:
|
|
|
180
484
|
attribute_lines = _attrs_to_test_format(node, indent)
|
|
181
485
|
|
|
182
486
|
# Template special handling (only HTML namespace templates have template_content)
|
|
183
|
-
if node.name == "template" and node.namespace in {None, "html"} and node.template_content:
|
|
487
|
+
if node.name == "template" and node.namespace in {None, "html"} and node.template_content is not None:
|
|
184
488
|
sections: list[str] = [line]
|
|
185
489
|
if attribute_lines:
|
|
186
490
|
sections.extend(attribute_lines)
|