justhtml 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic. Click here for more details.

justhtml/selector.py CHANGED
@@ -3,6 +3,7 @@
3
3
 
4
4
  from __future__ import annotations
5
5
 
6
+ from functools import lru_cache
6
7
  from typing import Any
7
8
 
8
9
 
@@ -529,6 +530,14 @@ class SelectorMatcher:
529
530
 
530
531
  __slots__ = ()
531
532
 
533
+ def _unquote_pseudo_arg(self, arg: str) -> str:
534
+ arg = arg.strip()
535
+ if len(arg) >= 2 and arg[0] == arg[-1] and arg[0] in ('"', "'"):
536
+ quote = arg[0]
537
+ # Minimal unescaping for common cases like :contains("click me")
538
+ return arg[1:-1].replace("\\" + quote, quote).replace("\\\\", "\\")
539
+ return arg
540
+
532
541
  def matches(self, node: Any, selector: ParsedSelector | CompoundSelector | SimpleSelector) -> bool:
533
542
  """Check if a node matches a parsed selector."""
534
543
  if isinstance(selector, SelectorList):
@@ -724,6 +733,17 @@ class SelectorMatcher:
724
733
  return parent.name in ("#document", "#document-fragment")
725
734
  return False
726
735
 
736
+ if name == "contains":
737
+ if selector.arg is None:
738
+ raise SelectorError(":contains() requires a string argument")
739
+ needle = self._unquote_pseudo_arg(selector.arg)
740
+ if needle == "":
741
+ return True
742
+ # Non-standard (jQuery-style) pseudo-class: match elements whose descendant
743
+ # text contains the substring. We use `to_text()` to approximate textContent.
744
+ haystack: str = node.to_text(separator=" ", strip=True)
745
+ return needle in haystack
746
+
727
747
  if name == "first-of-type":
728
748
  return self._is_first_of_type(node)
729
749
 
@@ -743,7 +763,7 @@ class SelectorMatcher:
743
763
  """Get only element children (exclude text, comments, etc.)."""
744
764
  if not parent or not parent.has_child_nodes():
745
765
  return []
746
- return [c for c in parent.children if hasattr(c, "name") and not c.name.startswith("#")]
766
+ return [c for c in parent.children if not c.name.startswith("#")]
747
767
 
748
768
  def _get_previous_sibling(self, node: Any) -> Any | None:
749
769
  """Get the previous element sibling. Returns None if node is first or not found."""
@@ -755,7 +775,7 @@ class SelectorMatcher:
755
775
  for child in parent.children:
756
776
  if child is node:
757
777
  return prev
758
- if hasattr(child, "name") and not child.name.startswith("#"):
778
+ if not child.name.startswith("#"):
759
779
  prev = child
760
780
  return None # node not in parent.children (detached)
761
781
 
@@ -903,7 +923,12 @@ def parse_selector(selector_string: str) -> ParsedSelector:
903
923
  if not selector_string or not selector_string.strip():
904
924
  raise SelectorError("Empty selector")
905
925
 
906
- tokenizer = SelectorTokenizer(selector_string.strip())
926
+ return _parse_selector_cached(selector_string.strip())
927
+
928
+
@lru_cache(maxsize=512)
def _parse_selector_cached(selector_string: str) -> ParsedSelector:
    """Tokenize and parse a selector string, memoizing the result.

    The LRU cache keeps up to 512 distinct selector strings. The string is
    used as-is, so callers are expected to strip it beforehand so that
    equivalent selectors share one cache entry.
    """
    token_stream = SelectorTokenizer(selector_string).tokenize()
    return SelectorParser(token_stream).parse()
@@ -913,6 +938,51 @@ def parse_selector(selector_string: str) -> ParsedSelector:
913
938
  _matcher: SelectorMatcher = SelectorMatcher()
914
939
 
915
940
 
941
+ def _is_simple_tag_selector(selector: str) -> bool:
942
+ if not selector:
943
+ return False
944
+ ch0 = selector[0]
945
+ if not (ch0.isalpha() or ch0 == "_" or ch0 == "-" or ord(ch0) > 127):
946
+ return False
947
+ for ch in selector[1:]:
948
+ if ch.isalnum() or ch == "_" or ch == "-" or ord(ch) > 127:
949
+ continue
950
+ return False
951
+ return True
952
+
953
+
954
+ def _query_descendants_tag(node: Any, tag_lower: str, results: list[Any]) -> None:
955
+ results_append = results.append
956
+
957
+ stack: list[Any] = []
958
+
959
+ root_children = node.children
960
+ if root_children:
961
+ stack.extend(reversed(root_children))
962
+
963
+ if node.name == "template" and node.namespace == "html":
964
+ template_content = node.template_content
965
+ if template_content:
966
+ stack.append(template_content)
967
+
968
+ while stack:
969
+ current = stack.pop()
970
+
971
+ name = current.name
972
+ if not name.startswith("#"):
973
+ if name == tag_lower or name.lower() == tag_lower:
974
+ results_append(current)
975
+
976
+ children = current.children
977
+ if children:
978
+ stack.extend(reversed(children))
979
+
980
+ if name == "template" and current.namespace == "html":
981
+ template_content = current.template_content
982
+ if template_content:
983
+ stack.append(template_content)
984
+
985
+
916
986
  def query(root: Any, selector_string: str) -> list[Any]:
917
987
  """
918
988
  Query the DOM tree starting from root, returning all matching elements.
@@ -927,27 +997,53 @@ def query(root: Any, selector_string: str) -> list[Any]:
927
997
  Returns:
928
998
  A list of matching nodes
929
999
  """
930
- selector = parse_selector(selector_string)
1000
+ selector_string = selector_string.strip()
1001
+ if not selector_string:
1002
+ raise SelectorError("Empty selector")
1003
+
931
1004
  results: list[Any] = []
1005
+
1006
+ if _is_simple_tag_selector(selector_string):
1007
+ _query_descendants_tag(root, selector_string.lower(), results)
1008
+ return results
1009
+
1010
+ selector = _parse_selector_cached(selector_string)
932
1011
  _query_descendants(root, selector, results)
933
1012
  return results
934
1013
 
935
1014
 
936
1015
  def _query_descendants(node: Any, selector: ParsedSelector, results: list[Any]) -> None:
937
- """Recursively search for matching nodes in descendants."""
938
- # Only recurse into children (not the node itself)
939
- if node.has_child_nodes():
940
- for child in node.children:
941
- # Check if this child matches
942
- if hasattr(child, "name") and not child.name.startswith("#"):
943
- if _matcher.matches(child, selector):
944
- results.append(child)
945
- # Recurse into child's descendants
946
- _query_descendants(child, selector, results)
947
-
948
- # Also check template content if present
949
- if hasattr(node, "template_content") and node.template_content:
950
- _query_descendants(node.template_content, selector, results)
1016
+ """Search for matching nodes in descendants."""
1017
+ matcher_matches = _matcher.matches
1018
+ results_append = results.append
1019
+
1020
+ # querySelectorAll searches descendants of root, not including root itself.
1021
+ stack: list[Any] = []
1022
+
1023
+ root_children = node.children
1024
+ if root_children:
1025
+ stack.extend(reversed(root_children))
1026
+
1027
+ if node.name == "template" and node.namespace == "html":
1028
+ template_content = node.template_content
1029
+ if template_content:
1030
+ stack.append(template_content)
1031
+
1032
+ while stack:
1033
+ current = stack.pop()
1034
+
1035
+ name = current.name
1036
+ if not name.startswith("#") and matcher_matches(current, selector):
1037
+ results_append(current)
1038
+
1039
+ children = current.children
1040
+ if children:
1041
+ stack.extend(reversed(children))
1042
+
1043
+ if name == "template" and current.namespace == "html":
1044
+ template_content = current.template_content
1045
+ if template_content:
1046
+ stack.append(template_content)
951
1047
 
952
1048
 
953
1049
  def matches(node: Any, selector_string: str) -> bool:
justhtml/serialize.py CHANGED
@@ -6,7 +6,8 @@ from __future__ import annotations
6
6
 
7
7
  from typing import Any
8
8
 
9
- from .constants import FOREIGN_ATTRIBUTE_ADJUSTMENTS, VOID_ELEMENTS
9
+ from .constants import FOREIGN_ATTRIBUTE_ADJUSTMENTS, SPECIAL_ELEMENTS, VOID_ELEMENTS
10
+ from .sanitize import DEFAULT_DOCUMENT_POLICY, DEFAULT_POLICY, SanitizationPolicy, sanitize
10
11
 
11
12
 
12
13
  def _escape_text(text: str | None) -> str:
@@ -16,7 +17,9 @@ def _escape_text(text: str | None) -> str:
16
17
  return str(text).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
17
18
 
18
19
 
19
- def _choose_attr_quote(value: str | None) -> str:
20
+ def _choose_attr_quote(value: str | None, forced_quote_char: str | None = None) -> str:
21
+ if forced_quote_char in {'"', "'"}:
22
+ return forced_quote_char
20
23
  if value is None:
21
24
  return '"'
22
25
  value = str(value)
@@ -25,11 +28,13 @@ def _choose_attr_quote(value: str | None) -> str:
25
28
  return '"'
26
29
 
27
30
 
28
- def _escape_attr_value(value: str | None, quote_char: str) -> str:
31
+ def _escape_attr_value(value: str | None, quote_char: str, *, escape_lt_in_attrs: bool = False) -> str:
29
32
  if value is None:
30
33
  return ""
31
34
  value = str(value)
32
35
  value = value.replace("&", "&amp;")
36
+ if escape_lt_in_attrs:
37
+ value = value.replace("<", "&lt;")
33
38
  # Note: html5lib's default serializer does not escape '>' in attrs.
34
39
  if quote_char == '"':
35
40
  return value.replace('"', "&quot;")
@@ -40,8 +45,6 @@ def _can_unquote_attr_value(value: str | None) -> bool:
40
45
  if value is None:
41
46
  return False
42
47
  value = str(value)
43
- # html5lib's serializer unquotes aggressively; match fixture expectations.
44
- # Disallow whitespace and characters that would terminate/ambiguate the value.
45
48
  for ch in value:
46
49
  if ch == ">":
47
50
  return False
@@ -52,22 +55,56 @@ def _can_unquote_attr_value(value: str | None) -> bool:
52
55
  return True
53
56
 
54
57
 
55
- def serialize_start_tag(name: str, attrs: dict[str, str | None] | None) -> str:
58
+ def _serializer_minimize_attr_value(name: str, value: str | None, minimize_boolean_attributes: bool) -> bool:
59
+ if not minimize_boolean_attributes:
60
+ return False
61
+ if value is None or value == "":
62
+ return True
63
+ return str(value).lower() == str(name).lower()
64
+
65
+
66
+ def serialize_start_tag(
67
+ name: str,
68
+ attrs: dict[str, str | None] | None,
69
+ *,
70
+ quote_attr_values: bool = True,
71
+ minimize_boolean_attributes: bool = True,
72
+ quote_char: str | None = None,
73
+ escape_lt_in_attrs: bool = False,
74
+ use_trailing_solidus: bool = False,
75
+ is_void: bool = False,
76
+ ) -> str:
56
77
  attrs = attrs or {}
57
78
  parts: list[str] = ["<", name]
58
79
  if attrs:
59
80
  for key, value in attrs.items():
60
- if value is None or value == "":
81
+ if _serializer_minimize_attr_value(key, value, minimize_boolean_attributes):
61
82
  parts.extend([" ", key])
83
+ continue
84
+
85
+ if value is None:
86
+ parts.extend([" ", key, '=""'])
87
+ continue
88
+
89
+ value_str = str(value)
90
+ if value_str == "":
91
+ parts.extend([" ", key, '=""'])
92
+ continue
93
+
94
+ if not quote_attr_values and _can_unquote_attr_value(value_str):
95
+ escaped = value_str.replace("&", "&amp;")
96
+ if escape_lt_in_attrs:
97
+ escaped = escaped.replace("<", "&lt;")
98
+ parts.extend([" ", key, "=", escaped])
62
99
  else:
63
- if _can_unquote_attr_value(value):
64
- escaped = str(value).replace("&", "&amp;")
65
- parts.extend([" ", key, "=", escaped])
66
- else:
67
- quote = _choose_attr_quote(value)
68
- escaped = _escape_attr_value(value, quote)
69
- parts.extend([" ", key, "=", quote, escaped, quote])
70
- parts.append(">")
100
+ quote = _choose_attr_quote(value_str, quote_char)
101
+ escaped = _escape_attr_value(value_str, quote, escape_lt_in_attrs=escape_lt_in_attrs)
102
+ parts.extend([" ", key, "=", quote, escaped, quote])
103
+
104
+ if use_trailing_solidus and is_void:
105
+ parts.append(" />")
106
+ else:
107
+ parts.append(">")
71
108
  return "".join(parts)
72
109
 
73
110
 
@@ -75,27 +112,171 @@ def serialize_end_tag(name: str) -> str:
75
112
  return f"</{name}>"
76
113
 
77
114
 
78
- def to_html(node: Any, indent: int = 0, indent_size: int = 2, *, pretty: bool = True) -> str:
115
+ def to_html(
116
+ node: Any,
117
+ indent: int = 0,
118
+ indent_size: int = 2,
119
+ *,
120
+ pretty: bool = True,
121
+ safe: bool = True,
122
+ policy: SanitizationPolicy | None = None,
123
+ ) -> str:
79
124
  """Convert node to HTML string."""
125
+ if safe:
126
+ if policy is None and node.name == "#document":
127
+ node = sanitize(node, policy=DEFAULT_DOCUMENT_POLICY)
128
+ else:
129
+ node = sanitize(node, policy=policy or DEFAULT_POLICY)
80
130
  if node.name == "#document":
81
131
  # Document root - just render children
82
132
  parts: list[str] = []
83
133
  for child in node.children or []:
84
- parts.append(_node_to_html(child, indent, indent_size, pretty))
134
+ parts.append(_node_to_html(child, indent, indent_size, pretty, in_pre=False))
85
135
  return "\n".join(parts) if pretty else "".join(parts)
86
- return _node_to_html(node, indent, indent_size, pretty)
136
+ return _node_to_html(node, indent, indent_size, pretty, in_pre=False)
137
+
138
+
139
+ _PREFORMATTED_ELEMENTS: set[str] = {"pre", "textarea", "code"}
140
+
141
+ # Elements whose text content must not be normalized (e.g. scripts/styles).
142
+ _RAWTEXT_ELEMENTS: set[str] = {"script", "style"}
143
+
144
+
145
+ def _collapse_html_whitespace(text: str) -> str:
146
+ """Collapse HTML whitespace runs to a single space and trim edges.
147
+
148
+ This matches how HTML rendering treats most whitespace in text nodes, and is
149
+ used only for pretty-printing in non-preformatted contexts.
150
+ """
151
+ if not text:
152
+ return ""
153
+
154
+ parts: list[str] = []
155
+ in_whitespace = False
156
+ for ch in text:
157
+ if ch in {" ", "\t", "\n", "\f", "\r"}:
158
+ if not in_whitespace:
159
+ parts.append(" ")
160
+ in_whitespace = True
161
+ continue
162
+
163
+ parts.append(ch)
164
+ in_whitespace = False
165
+
166
+ collapsed = "".join(parts)
167
+ return collapsed.strip(" ")
168
+
169
+
170
+ def _normalize_formatting_whitespace(text: str) -> str:
171
+ """Normalize formatting whitespace within a text node.
87
172
 
173
+ Converts newlines/tabs/CR/FF to regular spaces and collapses runs that
174
+ include such formatting whitespace to a single space.
88
175
 
89
- def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool = True) -> str:
176
+ Pure space runs are preserved as-is (so existing double-spaces remain).
177
+ """
178
+ if not text:
179
+ return ""
180
+
181
+ if "\n" not in text and "\r" not in text and "\t" not in text and "\f" not in text:
182
+ return text
183
+
184
+ starts_with_formatting = text[0] in {"\n", "\r", "\t", "\f"}
185
+ ends_with_formatting = text[-1] in {"\n", "\r", "\t", "\f"}
186
+
187
+ out: list[str] = []
188
+ in_ws = False
189
+ saw_formatting_ws = False
190
+
191
+ for ch in text:
192
+ if ch == " ":
193
+ if in_ws:
194
+ # Only collapse if this whitespace run included formatting whitespace.
195
+ if saw_formatting_ws:
196
+ continue
197
+ out.append(" ")
198
+ continue
199
+ in_ws = True
200
+ saw_formatting_ws = False
201
+ out.append(" ")
202
+ continue
203
+
204
+ if ch in {"\n", "\r", "\t", "\f"}:
205
+ if in_ws:
206
+ saw_formatting_ws = True
207
+ continue
208
+ in_ws = True
209
+ saw_formatting_ws = True
210
+ out.append(" ")
211
+ continue
212
+
213
+ in_ws = False
214
+ saw_formatting_ws = False
215
+ out.append(ch)
216
+
217
+ normalized = "".join(out)
218
+ if starts_with_formatting and normalized.startswith(" "):
219
+ normalized = normalized[1:]
220
+ if ends_with_formatting and normalized.endswith(" "):
221
+ normalized = normalized[:-1]
222
+ return normalized
223
+
224
+
225
+ def _is_whitespace_text_node(node: Any) -> bool:
226
+ return node.name == "#text" and (node.data or "").strip() == ""
227
+
228
+
229
+ def _should_pretty_indent_children(children: list[Any]) -> bool:
230
+ for child in children:
231
+ if child is None:
232
+ continue
233
+ name = child.name
234
+ if name == "#comment":
235
+ return False
236
+ if name == "#text" and (child.data or "").strip():
237
+ return False
238
+
239
+ element_children: list[Any] = [
240
+ child for child in children if child is not None and child.name not in {"#text", "#comment"}
241
+ ]
242
+ if not element_children:
243
+ return True
244
+ if len(element_children) == 1:
245
+ only_child = element_children[0]
246
+ if only_child.name in SPECIAL_ELEMENTS:
247
+ return True
248
+ if only_child.name == "a":
249
+ # If an anchor wraps block-ish content (valid HTML5), treat it as block-ish
250
+ # for pretty-printing so the parent can indent it on its own line.
251
+ for grandchild in only_child.children or []:
252
+ if grandchild is None:
253
+ continue
254
+ if grandchild.name in SPECIAL_ELEMENTS:
255
+ return True
256
+ return False
257
+
258
+ # Safe indentation rule: only insert inter-element whitespace when we won't
259
+ # be placing it between two adjacent inline/phrasing elements.
260
+ prev_is_special = element_children[0].name in SPECIAL_ELEMENTS
261
+ for child in element_children[1:]:
262
+ current_is_special = child.name in SPECIAL_ELEMENTS
263
+ if not prev_is_special and not current_is_special:
264
+ return False
265
+ prev_is_special = current_is_special
266
+ return True
267
+
268
+
269
+ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool = True, *, in_pre: bool) -> str:
90
270
  """Helper to convert a node to HTML."""
91
- prefix = " " * (indent * indent_size) if pretty else ""
92
- newline = "\n" if pretty else ""
271
+ prefix = " " * (indent * indent_size) if pretty and not in_pre else ""
93
272
  name: str = node.name
273
+ content_pre = in_pre or name in _PREFORMATTED_ELEMENTS
274
+ newline = "\n" if pretty and not content_pre else ""
94
275
 
95
276
  # Text node
96
277
  if name == "#text":
97
278
  text: str | None = node.data
98
- if pretty:
279
+ if pretty and not in_pre:
99
280
  text = text.strip() if text else ""
100
281
  if text:
101
282
  return f"{prefix}{_escape_text(text)}"
@@ -114,7 +295,7 @@ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool
114
295
  if name == "#document-fragment":
115
296
  parts: list[str] = []
116
297
  for child in node.children or []:
117
- child_html = _node_to_html(child, indent, indent_size, pretty)
298
+ child_html = _node_to_html(child, indent, indent_size, pretty, in_pre=in_pre)
118
299
  if child_html:
119
300
  parts.append(child_html)
120
301
  return newline.join(parts) if pretty else "".join(parts)
@@ -130,20 +311,143 @@ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool
130
311
  return f"{prefix}{open_tag}"
131
312
 
132
313
  # Elements with children
133
- children: list[Any] = node.children or []
314
+ # Template special handling: HTML templates store contents in `template_content`.
315
+ if name == "template" and node.namespace in {None, "html"} and node.template_content is not None:
316
+ children: list[Any] = node.template_content.children or []
317
+ else:
318
+ children = node.children or []
134
319
  if not children:
135
320
  return f"{prefix}{open_tag}{serialize_end_tag(name)}"
136
321
 
137
322
  # Check if all children are text-only (inline rendering)
138
323
  all_text = all(c.name == "#text" for c in children)
139
324
 
140
- if all_text and pretty:
141
- return f"{prefix}{open_tag}{_escape_text(node.to_text(separator='', strip=False))}{serialize_end_tag(name)}"
325
+ if all_text and pretty and not content_pre:
326
+ # Serializer controls sanitization at the to_html() entry point; avoid
327
+ # implicit re-sanitization during rendering.
328
+ text_content = node.to_text(separator="", strip=False, safe=False)
329
+ if name not in _RAWTEXT_ELEMENTS:
330
+ text_content = _collapse_html_whitespace(text_content)
331
+ return f"{prefix}{open_tag}{_escape_text(text_content)}{serialize_end_tag(name)}"
332
+
333
+ if pretty and content_pre:
334
+ inner = "".join(
335
+ _node_to_html(child, indent + 1, indent_size, pretty, in_pre=True)
336
+ for child in children
337
+ if child is not None
338
+ )
339
+ return f"{prefix}{open_tag}{inner}{serialize_end_tag(name)}"
340
+
341
+ if pretty and not content_pre and not _should_pretty_indent_children(children):
342
+ # For block-ish elements that contain only element children and whitespace-only
343
+ # text nodes, we can still format each child on its own line (only when there
344
+ # is already whitespace separating element siblings).
345
+ if name in SPECIAL_ELEMENTS:
346
+ has_comment = False
347
+ has_element = False
348
+ has_whitespace_between_elements = False
349
+
350
+ first_element_index: int | None = None
351
+ last_element_index: int | None = None
352
+
353
+ previous_was_element = False
354
+ saw_whitespace_since_last_element = False
355
+ for i, child in enumerate(children):
356
+ if child is None:
357
+ continue
358
+ if child.name == "#comment":
359
+ has_comment = True
360
+ break
361
+ if child.name == "#text":
362
+ # Track whether there is already whitespace between element siblings.
363
+ if previous_was_element and not (child.data or "").strip():
364
+ saw_whitespace_since_last_element = True
365
+ continue
366
+
367
+ has_element = True
368
+ if first_element_index is None:
369
+ first_element_index = i
370
+ last_element_index = i
371
+ if previous_was_element and saw_whitespace_since_last_element:
372
+ has_whitespace_between_elements = True
373
+ previous_was_element = True
374
+ saw_whitespace_since_last_element = False
375
+
376
+ can_indent_non_whitespace_text = True
377
+ if has_element and first_element_index is not None and last_element_index is not None:
378
+ for i, child in enumerate(children):
379
+ if child is None or child.name != "#text":
380
+ continue
381
+ if not (child.data or "").strip():
382
+ continue
383
+ # Only allow non-whitespace text *after* the last element.
384
+ # Leading text or text between elements could gain new spaces
385
+ # due to indentation/newlines.
386
+ if i < first_element_index or first_element_index < i < last_element_index:
387
+ can_indent_non_whitespace_text = False
388
+ break
389
+
390
+ if has_element and has_whitespace_between_elements and not has_comment and can_indent_non_whitespace_text:
391
+ inner_lines: list[str] = []
392
+ for child in children:
393
+ if child is None:
394
+ continue
395
+ if child.name == "#text":
396
+ text = _collapse_html_whitespace(child.data or "")
397
+ if text:
398
+ inner_lines.append(f"{' ' * ((indent + 1) * indent_size)}{_escape_text(text)}")
399
+ continue
400
+ child_html = _node_to_html(child, indent + 1, indent_size, pretty=True, in_pre=content_pre)
401
+ if child_html:
402
+ inner_lines.append(child_html)
403
+ if inner_lines:
404
+ inner = "\n".join(inner_lines)
405
+ return f"{prefix}{open_tag}\n{inner}\n{prefix}{serialize_end_tag(name)}"
406
+
407
+ inner_parts: list[str] = []
408
+
409
+ first_non_none_index: int | None = None
410
+ last_non_none_index: int | None = None
411
+ for i, child in enumerate(children):
412
+ if child is None:
413
+ continue
414
+ if first_non_none_index is None:
415
+ first_non_none_index = i
416
+ last_non_none_index = i
417
+
418
+ for i, child in enumerate(children):
419
+ if child is None:
420
+ continue
421
+
422
+ if child.name == "#text":
423
+ data = child.data or ""
424
+ if not data.strip():
425
+ # Drop leading/trailing formatting whitespace in compact mode.
426
+ if i == first_non_none_index or i == last_non_none_index:
427
+ continue
428
+ # Preserve intentional small spacing, but collapse large formatting gaps.
429
+ if "\n" in data or "\r" in data or "\t" in data or len(data) > 2:
430
+ inner_parts.append(" ")
431
+ continue
432
+
433
+ if not content_pre and name not in _RAWTEXT_ELEMENTS:
434
+ data = _normalize_formatting_whitespace(data)
435
+ child_html = _escape_text(data) if data else ""
436
+ else:
437
+ # Even when we can't safely insert whitespace *between* siblings, we can
438
+ # still pretty-print each element subtree to improve readability.
439
+ child_html = _node_to_html(child, 0, indent_size, pretty=True, in_pre=content_pre)
440
+ if child_html:
441
+ inner_parts.append(child_html)
442
+
443
+ return f"{prefix}{open_tag}{''.join(inner_parts)}{serialize_end_tag(name)}"
142
444
 
143
445
  # Render with child indentation
144
446
  parts = [f"{prefix}{open_tag}"]
145
447
  for child in children:
146
- child_html = _node_to_html(child, indent + 1, indent_size, pretty)
448
+ if pretty and not content_pre and _is_whitespace_text_node(child):
449
+ continue
450
+ child_html = _node_to_html(child, indent + 1, indent_size, pretty, in_pre=content_pre)
147
451
  if child_html:
148
452
  parts.append(child_html)
149
453
  parts.append(f"{prefix}{serialize_end_tag(name)}")
@@ -180,7 +484,7 @@ def _node_to_test_format(node: Any, indent: int) -> str:
180
484
  attribute_lines = _attrs_to_test_format(node, indent)
181
485
 
182
486
  # Template special handling (only HTML namespace templates have template_content)
183
- if node.name == "template" and node.namespace in {None, "html"} and node.template_content:
487
+ if node.name == "template" and node.namespace in {None, "html"} and node.template_content is not None:
184
488
  sections: list[str] = [line]
185
489
  if attribute_lines:
186
490
  sections.extend(attribute_lines)