justhtml 0.12.0__py3-none-any.whl → 0.38.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic. Click here for more details.

justhtml/serialize.py CHANGED
@@ -4,32 +4,42 @@
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
+ import re
7
8
  from typing import Any
8
9
 
9
- from .constants import FOREIGN_ATTRIBUTE_ADJUSTMENTS, VOID_ELEMENTS
10
+ from .constants import FOREIGN_ATTRIBUTE_ADJUSTMENTS, SPECIAL_ELEMENTS, VOID_ELEMENTS, WHITESPACE_PRESERVING_ELEMENTS
11
+
12
# Matches any character that prevents an attribute value from being written
# without quotes: space, tab, LF, FF, CR, double quote, single quote, equals
# sign, and greater-than.
# Note: this is the same character set the previous loop-based check rejected.
_UNQUOTED_ATTR_VALUE_INVALID = re.compile(r'[ \t\n\f\r"\'=>]')
10
16
 
11
17
 
12
18
  def _escape_text(text: str | None) -> str:
13
19
  if not text:
14
20
  return ""
15
21
  # Minimal, but matches html5lib serializer expectations in core cases.
16
- return str(text).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
22
+ return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
17
23
 
18
24
 
19
- def _choose_attr_quote(value: str | None) -> str:
25
+ def _choose_attr_quote(value: str | None, forced_quote_char: str | None = None) -> str:
26
+ if forced_quote_char in {'"', "'"}:
27
+ return forced_quote_char
20
28
  if value is None:
21
29
  return '"'
22
- value = str(value)
30
+ # value is assumed to be a string
23
31
  if '"' in value and "'" not in value:
24
32
  return "'"
25
33
  return '"'
26
34
 
27
35
 
28
- def _escape_attr_value(value: str | None, quote_char: str) -> str:
36
+ def _escape_attr_value(value: str | None, quote_char: str, *, escape_lt_in_attrs: bool = False) -> str:
29
37
  if value is None:
30
38
  return ""
31
- value = str(value)
39
+ # value is assumed to be a string
32
40
  value = value.replace("&", "&amp;")
41
+ if escape_lt_in_attrs:
42
+ value = value.replace("<", "&lt;")
33
43
  # Note: html5lib's default serializer does not escape '>' in attrs.
34
44
  if quote_char == '"':
35
45
  return value.replace('"', "&quot;")
@@ -39,35 +49,63 @@ def _escape_attr_value(value: str | None, quote_char: str) -> str:
39
49
  def _can_unquote_attr_value(value: str | None) -> bool:
40
50
  if value is None:
41
51
  return False
42
- value = str(value)
43
- # html5lib's serializer unquotes aggressively; match fixture expectations.
44
- # Disallow whitespace and characters that would terminate/ambiguate the value.
45
- for ch in value:
46
- if ch == ">":
47
- return False
48
- if ch in {'"', "'", "="}:
49
- return False
50
- if ch in {" ", "\t", "\n", "\f", "\r"}:
51
- return False
52
- return True
52
+ # Optimization: use regex instead of loop
53
+ return not _UNQUOTED_ATTR_VALUE_INVALID.search(value)
53
54
 
54
55
 
55
- def serialize_start_tag(name: str, attrs: dict[str, str | None] | None) -> str:
56
+ def _serializer_minimize_attr_value(name: str, value: str | None, minimize_boolean_attributes: bool) -> bool:
57
+ if not minimize_boolean_attributes:
58
+ return False
59
+ if value is None or value == "":
60
+ return True
61
+ if value == name:
62
+ return True
63
+ return value.lower() == name
64
+
65
+
66
def serialize_start_tag(
    name: str,
    attrs: dict[str, str | None] | None,
    *,
    quote_attr_values: bool = True,
    minimize_boolean_attributes: bool = True,
    quote_char: str | None = None,
    escape_lt_in_attrs: bool = False,
    use_trailing_solidus: bool = False,
    is_void: bool = False,
) -> str:
    """Build the text of a start tag for *name* with the given attributes.

    Args:
        name: Tag name, emitted verbatim after ``<``.
        attrs: Attribute mapping; ``None`` is treated as no attributes.
        quote_attr_values: When false, values accepted by
            ``_can_unquote_attr_value`` are written without quotes.
        minimize_boolean_attributes: Passed to
            ``_serializer_minimize_attr_value`` to decide whether an
            attribute is written as a bare name.
        quote_char: Optional forced quote character, passed to
            ``_choose_attr_quote``.
        escape_lt_in_attrs: When true, ``<`` inside values becomes ``&lt;``.
        use_trailing_solidus: With *is_void*, close the tag with `` />``.
        is_void: Whether the element is a void element.

    Returns:
        The serialized start tag.
    """
    pieces: list[str] = ["<", name]

    for attr_name, attr_value in (attrs or {}).items():
        if _serializer_minimize_attr_value(attr_name, attr_value, minimize_boolean_attributes):
            pieces += [" ", attr_name]
        elif attr_value is None or attr_value == "":
            # Not minimized, so an absent/empty value is spelled out explicitly.
            pieces += [" ", attr_name, '=""']
        elif not quote_attr_values and _can_unquote_attr_value(attr_value):
            bare = attr_value.replace("&", "&amp;")
            if escape_lt_in_attrs:
                bare = bare.replace("<", "&lt;")
            pieces += [" ", attr_name, "=", bare]
        else:
            quote = _choose_attr_quote(attr_value, quote_char)
            quoted = _escape_attr_value(attr_value, quote, escape_lt_in_attrs=escape_lt_in_attrs)
            pieces += [" ", attr_name, "=", quote, quoted, quote]

    # The trailing solidus is only emitted for void elements, and only on request.
    pieces.append(" />" if use_trailing_solidus and is_void else ">")
    return "".join(pieces)
72
110
 
73
111
 
@@ -75,27 +113,299 @@ def serialize_end_tag(name: str) -> str:
75
113
  return f"</{name}>"
76
114
 
77
115
 
78
def to_html(
    node: Any,
    indent: int = 0,
    indent_size: int = 2,
    *,
    pretty: bool = True,
) -> str:
    """Serialize *node* to an HTML string.

    A ``#document`` node renders as its children only, joined by newlines
    when *pretty* is true and concatenated otherwise. Any other node is
    rendered directly. Rendering always starts outside preformatted context.
    """
    if node.name != "#document":
        return _node_to_html(node, indent, indent_size, pretty, in_pre=False)

    # Document root: there is no tag of its own, only the children are emitted.
    rendered = [
        _node_to_html(child, indent, indent_size, pretty, in_pre=False)
        for child in node.children or []
    ]
    separator = "\n" if pretty else ""
    return separator.join(rendered)
131
+
132
+
133
+ def _collapse_html_whitespace(text: str) -> str:
134
+ """Collapse HTML whitespace runs to a single space and trim edges.
135
+
136
+ This matches how HTML rendering treats most whitespace in text nodes, and is
137
+ used only for pretty-printing in non-preformatted contexts.
138
+ """
139
+ if not text:
140
+ return ""
87
141
 
142
+ # Optimization: split() handles whitespace collapsing efficiently.
143
+ # Note: split() treats \v as whitespace, which is not HTML whitespace.
144
+ # But \v is extremely rare in HTML.
145
+ if "\v" in text:
146
+ parts: list[str] = []
147
+ in_whitespace = False
148
+ for ch in text:
149
+ if ch in {" ", "\t", "\n", "\f", "\r"}:
150
+ if not in_whitespace:
151
+ parts.append(" ")
152
+ in_whitespace = True
153
+ continue
154
+
155
+ parts.append(ch)
156
+ in_whitespace = False
157
+
158
+ collapsed = "".join(parts)
159
+ return collapsed.strip(" ")
160
+
161
+ return " ".join(text.split())
162
+
163
+
164
+ def _normalize_formatting_whitespace(text: str) -> str:
165
+ """Normalize formatting whitespace within a text node.
166
+
167
+ Converts newlines/tabs/CR/FF to regular spaces and collapses runs that
168
+ include such formatting whitespace to a single space.
169
+
170
+ Pure space runs are preserved as-is (so existing double-spaces remain).
171
+ """
172
+ if not text:
173
+ return ""
174
+
175
+ if "\n" not in text and "\r" not in text and "\t" not in text and "\f" not in text:
176
+ return text
177
+
178
+ starts_with_formatting = text[0] in {"\n", "\r", "\t", "\f"}
179
+ ends_with_formatting = text[-1] in {"\n", "\r", "\t", "\f"}
180
+
181
+ out: list[str] = []
182
+ in_ws = False
183
+ saw_formatting_ws = False
184
+
185
+ for ch in text:
186
+ if ch == " ":
187
+ if in_ws:
188
+ # Only collapse if this whitespace run included formatting whitespace.
189
+ if saw_formatting_ws:
190
+ continue
191
+ out.append(" ")
192
+ continue
193
+ in_ws = True
194
+ saw_formatting_ws = False
195
+ out.append(" ")
196
+ continue
197
+
198
+ if ch in {"\n", "\r", "\t", "\f"}:
199
+ if in_ws:
200
+ saw_formatting_ws = True
201
+ continue
202
+ in_ws = True
203
+ saw_formatting_ws = True
204
+ out.append(" ")
205
+ continue
206
+
207
+ in_ws = False
208
+ saw_formatting_ws = False
209
+ out.append(ch)
210
+
211
+ normalized = "".join(out)
212
+ if starts_with_formatting and normalized.startswith(" "):
213
+ normalized = normalized[1:]
214
+ if ends_with_formatting and normalized.endswith(" "):
215
+ normalized = normalized[:-1]
216
+ return normalized
217
+
218
+
219
+ def _is_whitespace_text_node(node: Any) -> bool:
220
+ return node.name == "#text" and (node.data or "").strip() == ""
221
+
222
+
223
+ def _is_blocky_element(node: Any) -> bool:
224
+ # Treat elements as block-ish if they are block-level *or* contain any block-level
225
+ # descendants. This keeps pretty-printing readable for constructs like <a><div>...</div></a>.
226
+ try:
227
+ name = node.name
228
+ except AttributeError:
229
+ return False
230
+ if name in {"#text", "#comment", "!doctype"}:
231
+ return False
232
+ if name in SPECIAL_ELEMENTS:
233
+ return True
88
234
 
89
- def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool = True) -> str:
235
+ try:
236
+ children = node.children or []
237
+ except AttributeError:
238
+ return False
239
+ if not children:
240
+ return False
241
+
242
+ stack: list[Any] = list(children)
243
+ while stack:
244
+ child = stack.pop()
245
+ if child is None:
246
+ continue
247
+ child_name = child.name
248
+ if child_name in SPECIAL_ELEMENTS:
249
+ return True
250
+ if child_name in {"#text", "#comment", "!doctype"}:
251
+ continue
252
+ grand_children = child.children
253
+ if grand_children:
254
+ stack.extend(grand_children)
255
+
256
+ return False
257
+
258
+
259
+ _LAYOUT_BLOCK_ELEMENTS = {
260
+ "address",
261
+ "article",
262
+ "aside",
263
+ "blockquote",
264
+ "body",
265
+ "caption",
266
+ "center",
267
+ "dd",
268
+ "details",
269
+ "dialog",
270
+ "dir",
271
+ "div",
272
+ "dl",
273
+ "dt",
274
+ "fieldset",
275
+ "figcaption",
276
+ "figure",
277
+ "footer",
278
+ "form",
279
+ "h1",
280
+ "h2",
281
+ "h3",
282
+ "h4",
283
+ "h5",
284
+ "h6",
285
+ "header",
286
+ "hgroup",
287
+ "hr",
288
+ "html",
289
+ "iframe",
290
+ "li",
291
+ "listing",
292
+ "main",
293
+ "marquee",
294
+ "menu",
295
+ "nav",
296
+ "noframes",
297
+ "noscript",
298
+ "ol",
299
+ "p",
300
+ "plaintext",
301
+ "pre",
302
+ "search",
303
+ "section",
304
+ "summary",
305
+ "table",
306
+ "tbody",
307
+ "td",
308
+ "tfoot",
309
+ "th",
310
+ "thead",
311
+ "tr",
312
+ "ul",
313
+ }
314
+
315
+
316
+ _FORMAT_SEP = object()
317
+
318
+
319
+ def _is_layout_blocky_element(node: Any) -> bool:
320
+ # Similar to _is_blocky_element(), but limited to actual layout blocks.
321
+ # This avoids turning inline-ish "special" elements like <script> into
322
+ # multiline pretty-print breaks in contexts like <p>.
323
+ try:
324
+ name = node.name
325
+ except AttributeError:
326
+ return False
327
+ if name in {"#text", "#comment", "!doctype"}:
328
+ return False
329
+ if name in _LAYOUT_BLOCK_ELEMENTS:
330
+ return True
331
+
332
+ try:
333
+ children = node.children or []
334
+ except AttributeError:
335
+ return False
336
+ if not children:
337
+ return False
338
+
339
+ stack: list[Any] = list(children)
340
+ while stack:
341
+ child = stack.pop()
342
+ if child is None:
343
+ continue
344
+ child_name = child.name
345
+ if child_name in _LAYOUT_BLOCK_ELEMENTS:
346
+ return True
347
+ if child_name in {"#text", "#comment", "!doctype"}:
348
+ continue
349
+ grand_children = child.children
350
+ if grand_children:
351
+ stack.extend(grand_children)
352
+
353
+ return False
354
+
355
+
356
+ def _is_formatting_whitespace_text(data: str) -> bool:
357
+ # Formatting whitespace is something users typically don't intend to preserve
358
+ # exactly (e.g. newlines/indentation, or large runs of spaces).
359
+ if not data:
360
+ return False
361
+ if "\n" in data or "\r" in data or "\t" in data or "\f" in data:
362
+ return True
363
+ return len(data) > 2
364
+
365
+
366
+ def _should_pretty_indent_children(children: list[Any]) -> bool:
367
+ for child in children:
368
+ if child is None:
369
+ continue
370
+ name = child.name
371
+ if name == "#comment":
372
+ return False
373
+ if name == "#text" and (child.data or "").strip():
374
+ return False
375
+
376
+ element_children: list[Any] = [
377
+ child for child in children if child is not None and child.name not in {"#text", "#comment"}
378
+ ]
379
+ if not element_children:
380
+ return True
381
+ if len(element_children) == 1:
382
+ only_child = element_children[0]
383
+ if _is_blocky_element(only_child):
384
+ return True
385
+ return False
386
+
387
+ # Safe indentation rule: only insert inter-element whitespace when we won't
388
+ # be placing it between two adjacent inline/phrasing elements.
389
+ prev_is_blocky = _is_blocky_element(element_children[0])
390
+ for child in element_children[1:]:
391
+ current_is_blocky = _is_blocky_element(child)
392
+ if not prev_is_blocky and not current_is_blocky:
393
+ return False
394
+ prev_is_blocky = current_is_blocky
395
+ return True
396
+
397
+
398
+ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool = True, *, in_pre: bool) -> str:
90
399
  """Helper to convert a node to HTML."""
91
- prefix = " " * (indent * indent_size) if pretty else ""
92
- newline = "\n" if pretty else ""
400
+ prefix = " " * (indent * indent_size) if pretty and not in_pre else ""
93
401
  name: str = node.name
402
+ content_pre = in_pre or name in WHITESPACE_PRESERVING_ELEMENTS
403
+ newline = "\n" if pretty and not content_pre else ""
94
404
 
95
405
  # Text node
96
406
  if name == "#text":
97
407
  text: str | None = node.data
98
- if pretty:
408
+ if pretty and not in_pre:
99
409
  text = text.strip() if text else ""
100
410
  if text:
101
411
  return f"{prefix}{_escape_text(text)}"
@@ -114,7 +424,7 @@ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool
114
424
  if name == "#document-fragment":
115
425
  parts: list[str] = []
116
426
  for child in node.children or []:
117
- child_html = _node_to_html(child, indent, indent_size, pretty)
427
+ child_html = _node_to_html(child, indent, indent_size, pretty, in_pre=in_pre)
118
428
  if child_html:
119
429
  parts.append(child_html)
120
430
  return newline.join(parts) if pretty else "".join(parts)
@@ -130,20 +440,340 @@ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool
130
440
  return f"{prefix}{open_tag}"
131
441
 
132
442
  # Elements with children
133
- children: list[Any] = node.children or []
443
+ # Template special handling: HTML templates store contents in `template_content`.
444
+ if name == "template" and node.namespace in {None, "html"} and node.template_content is not None:
445
+ children: list[Any] = node.template_content.children or []
446
+ else:
447
+ children = node.children or []
134
448
  if not children:
135
449
  return f"{prefix}{open_tag}{serialize_end_tag(name)}"
136
450
 
137
451
  # Check if all children are text-only (inline rendering)
138
- all_text = all(c.name == "#text" for c in children)
452
+ all_text = True
453
+ for child in children:
454
+ if child is None:
455
+ continue
456
+ if child.name != "#text":
457
+ all_text = False
458
+ break
459
+
460
+ if all_text and pretty and not content_pre:
461
+ # Serializer controls sanitization at the to_html() entry point; avoid
462
+ # implicit re-sanitization during rendering.
463
+ text_content = node.to_text(separator="", strip=False)
464
+ text_content = _collapse_html_whitespace(text_content)
465
+ return f"{prefix}{open_tag}{_escape_text(text_content)}{serialize_end_tag(name)}"
466
+
467
+ if pretty and content_pre:
468
+ inner = "".join(
469
+ _node_to_html(child, indent + 1, indent_size, pretty, in_pre=True)
470
+ for child in children
471
+ if child is not None
472
+ )
473
+ return f"{prefix}{open_tag}{inner}{serialize_end_tag(name)}"
474
+
475
+ if pretty and not content_pre and name in SPECIAL_ELEMENTS:
476
+ # For block-ish containers that only have element children (and/or
477
+ # whitespace-only text nodes), prefer a multiline layout for readability
478
+ # even when children are inline elements.
479
+ can_indent = True
480
+ for child in children:
481
+ if child is None:
482
+ continue
483
+ if child.name == "#comment":
484
+ can_indent = False
485
+ break
486
+ if child.name == "#text" and (child.data or "").strip():
487
+ can_indent = False
488
+ break
489
+
490
+ if can_indent:
491
+ inner_lines: list[str] = []
492
+ for child in children:
493
+ if child is None:
494
+ continue
495
+ if _is_whitespace_text_node(child):
496
+ continue
497
+ child_html = _node_to_html(child, indent + 1, indent_size, pretty, in_pre=content_pre)
498
+ if child_html:
499
+ inner_lines.append(child_html)
500
+
501
+ if inner_lines:
502
+ parts = [f"{prefix}{open_tag}"]
503
+ parts.extend(inner_lines)
504
+ parts.append(f"{prefix}{serialize_end_tag(name)}")
505
+ return "\n".join(parts)
506
+
507
+ # Smart pretty-printing: if the author already inserted formatting whitespace
508
+ # between siblings, we can split into "inline runs" and put each run on its
509
+ # own line without introducing new inter-token whitespace.
510
+ has_comment = any(child is not None and child.name == "#comment" for child in children)
511
+ if not has_comment:
512
+ non_none_children: list[Any] = [child for child in children if child is not None]
513
+
514
+ # Only enable this mode if there is at least one formatting whitespace text node
515
+ # between non-whitespace siblings.
516
+ has_separator = False
517
+ for child in non_none_children[1:-1]:
518
+ if child.name != "#text":
519
+ continue
520
+ data = child.data or ""
521
+ if data.strip() != "":
522
+ continue
523
+ if _is_formatting_whitespace_text(data):
524
+ has_separator = True
525
+ break
526
+
527
+ if has_separator:
528
+ # Build runs by splitting on formatting whitespace text nodes.
529
+ # Keep small spacing nodes (" " or " ") inside runs.
530
+ items: list[Any] = []
531
+ last_was_sep = False
532
+ for child in non_none_children:
533
+ if child.name == "#text":
534
+ data = child.data or ""
535
+ if data.strip() == "" and _is_formatting_whitespace_text(data):
536
+ if not last_was_sep:
537
+ items.append(_FORMAT_SEP)
538
+ last_was_sep = True
539
+ continue
540
+ items.append(child)
541
+ last_was_sep = False
542
+
543
+ while items and items[0] is _FORMAT_SEP:
544
+ items.pop(0)
545
+ while items and items[-1] is _FORMAT_SEP:
546
+ items.pop()
547
+
548
+ runs: list[list[Any]] = []
549
+ current_run: list[Any] = []
550
+ for item in items:
551
+ if item is _FORMAT_SEP:
552
+ runs.append(current_run)
553
+ current_run = []
554
+ continue
555
+ current_run.append(item)
556
+ runs.append(current_run)
557
+ runs = [run for run in runs if run]
558
+
559
+ # Only apply if we can render each run either as a single blocky element
560
+ # (possibly multiline) or as a single-line inline run.
561
+ smart_lines: list[str] = []
562
+ can_apply = True
563
+ for run in runs:
564
+ blocky_elements = [c for c in run if c.name not in {"#text", "#comment"} and _is_blocky_element(c)]
565
+ if blocky_elements and len(run) != 1:
566
+ can_apply = False
567
+ break
568
+
569
+ if len(run) == 1 and run[0].name != "#text":
570
+ child_html = _node_to_html(run[0], indent + 1, indent_size, pretty=True, in_pre=content_pre)
571
+ smart_lines.append(child_html)
572
+ continue
573
+
574
+ # Inline run: render on one line.
575
+ run_parts: list[str] = []
576
+ for c in run:
577
+ if c.name == "#text":
578
+ data = c.data or ""
579
+ if not data.strip():
580
+ # Formatting whitespace never appears inside runs (it is used as a separator).
581
+ # Preserve intentional tiny spacing.
582
+ run_parts.append(data)
583
+ continue
584
+
585
+ run_parts.append(_escape_text(_normalize_formatting_whitespace(data)))
586
+ continue
587
+
588
+ # Render inline elements without their own leading indentation.
589
+ child_html = _node_to_html(c, 0, indent_size, pretty=True, in_pre=content_pre)
590
+ run_parts.append(child_html)
591
+
592
+ smart_lines.append(f"{' ' * ((indent + 1) * indent_size)}{''.join(run_parts)}")
593
+
594
+ if can_apply and smart_lines:
595
+ return f"{prefix}{open_tag}\n" + "\n".join(smart_lines) + f"\n{prefix}{serialize_end_tag(name)}"
596
+
597
+ if pretty and not content_pre and not _should_pretty_indent_children(children):
598
+ # For block-ish elements that contain only element children and whitespace-only
599
+ # text nodes, we can still format each child on its own line (only when there
600
+ # is already whitespace separating element siblings).
601
+ if name in SPECIAL_ELEMENTS:
602
+ # Mixed content in block-ish containers: if we encounter a blocky child
603
+ # (e.g. <ul>) adjacent to inline text, printing everything on one line
604
+ # both hurts readability and can lose indentation inside the block subtree.
605
+ # In that case, put inline runs and blocky children on their own lines.
606
+ has_comment = any(child is not None and child.name == "#comment" for child in children)
607
+ if not has_comment:
608
+ has_blocky_child = any(
609
+ child is not None and child.name not in {"#text", "#comment"} and _is_layout_blocky_element(child)
610
+ for child in children
611
+ )
612
+ has_non_whitespace_text = any(
613
+ child is not None and child.name == "#text" and (child.data or "").strip() for child in children
614
+ )
615
+
616
+ if has_blocky_child and has_non_whitespace_text:
617
+ mixed_multiline_lines: list[str] = []
618
+ inline_parts: list[str] = []
619
+
620
+ mixed_first_non_none_index: int | None = None
621
+ mixed_last_non_none_index: int | None = None
622
+ for i, child in enumerate(children):
623
+ if child is None:
624
+ continue
625
+ if mixed_first_non_none_index is None:
626
+ mixed_first_non_none_index = i
627
+ mixed_last_non_none_index = i
628
+
629
+ def flush_inline() -> None:
630
+ if not inline_parts:
631
+ return
632
+ line = "".join(inline_parts).strip(" ")
633
+ inline_parts.clear()
634
+ if line:
635
+ mixed_multiline_lines.append(f"{' ' * ((indent + 1) * indent_size)}{line}")
636
+
637
+ for i, child in enumerate(children):
638
+ if child is None:
639
+ continue
640
+
641
+ if child.name == "#text":
642
+ data = child.data or ""
643
+ if not data.strip():
644
+ # Drop leading/trailing formatting whitespace.
645
+ if i == mixed_first_non_none_index or i == mixed_last_non_none_index:
646
+ continue
647
+ # Preserve intentional small spacing, but treat formatting whitespace
648
+ # as a separator between inline runs (new line).
649
+ if "\n" in data or "\r" in data or "\t" in data or len(data) > 2:
650
+ flush_inline()
651
+ else:
652
+ inline_parts.append(data)
653
+ continue
654
+
655
+ data = _normalize_formatting_whitespace(data)
656
+ inline_parts.append(_escape_text(data))
657
+ continue
658
+
659
+ if _is_layout_blocky_element(child):
660
+ flush_inline()
661
+ mixed_multiline_lines.append(
662
+ _node_to_html(child, indent + 1, indent_size, pretty=True, in_pre=content_pre)
663
+ )
664
+ continue
665
+
666
+ # Inline element: keep it in the current line without leading indentation.
667
+ inline_parts.append(_node_to_html(child, 0, indent_size, pretty=True, in_pre=content_pre))
668
+
669
+ flush_inline()
670
+ inner = "\n".join(line for line in mixed_multiline_lines if line)
671
+ return f"{prefix}{open_tag}\n{inner}\n{prefix}{serialize_end_tag(name)}"
672
+
673
+ has_comment = False
674
+ has_element = False
675
+ has_whitespace_between_elements = False
676
+
677
+ first_element_index: int | None = None
678
+ last_element_index: int | None = None
679
+
680
+ previous_was_element = False
681
+ saw_whitespace_since_last_element = False
682
+ for i, child in enumerate(children):
683
+ if child is None:
684
+ continue
685
+ if child.name == "#comment":
686
+ has_comment = True
687
+ break
688
+ if child.name == "#text":
689
+ # Track whether there is already whitespace between element siblings.
690
+ if previous_was_element and not (child.data or "").strip():
691
+ saw_whitespace_since_last_element = True
692
+ continue
693
+
694
+ has_element = True
695
+ if first_element_index is None:
696
+ first_element_index = i
697
+ last_element_index = i
698
+ if previous_was_element and saw_whitespace_since_last_element:
699
+ has_whitespace_between_elements = True
700
+ previous_was_element = True
701
+ saw_whitespace_since_last_element = False
702
+
703
+ can_indent_non_whitespace_text = True
704
+ if has_element and first_element_index is not None and last_element_index is not None:
705
+ for i, child in enumerate(children):
706
+ if child is None or child.name != "#text":
707
+ continue
708
+ if not (child.data or "").strip():
709
+ continue
710
+ # Only allow non-whitespace text *after* the last element.
711
+ # Leading text or text between elements could gain new spaces
712
+ # due to indentation/newlines.
713
+ if i < first_element_index or first_element_index < i < last_element_index:
714
+ can_indent_non_whitespace_text = False
715
+ break
716
+
717
+ if has_element and has_whitespace_between_elements and not has_comment and can_indent_non_whitespace_text:
718
+ element_multiline_lines: list[str] = []
719
+ for child in children:
720
+ if child is None:
721
+ continue
722
+ if child.name == "#text":
723
+ text = _collapse_html_whitespace(child.data or "")
724
+ if text:
725
+ element_multiline_lines.append(f"{' ' * ((indent + 1) * indent_size)}{_escape_text(text)}")
726
+ continue
727
+ child_html = _node_to_html(child, indent + 1, indent_size, pretty=True, in_pre=content_pre)
728
+ if child_html:
729
+ element_multiline_lines.append(child_html)
730
+ if element_multiline_lines:
731
+ inner = "\n".join(element_multiline_lines)
732
+ return f"{prefix}{open_tag}\n{inner}\n{prefix}{serialize_end_tag(name)}"
733
+
734
+ inner_parts: list[str] = []
735
+
736
+ compact_first_non_none_index: int | None = None
737
+ compact_last_non_none_index: int | None = None
738
+ for i, child in enumerate(children):
739
+ if child is None:
740
+ continue
741
+ if compact_first_non_none_index is None:
742
+ compact_first_non_none_index = i
743
+ compact_last_non_none_index = i
744
+
745
+ for i, child in enumerate(children):
746
+ if child is None:
747
+ continue
748
+
749
+ if child.name == "#text":
750
+ data = child.data or ""
751
+ if not data.strip():
752
+ # Drop leading/trailing formatting whitespace in compact mode.
753
+ if i == compact_first_non_none_index or i == compact_last_non_none_index:
754
+ continue
755
+ # Preserve intentional small spacing, but collapse large formatting gaps.
756
+ if "\n" in data or "\r" in data or "\t" in data or len(data) > 2:
757
+ inner_parts.append(" ")
758
+ continue
759
+
760
+ data = _normalize_formatting_whitespace(data)
761
+ child_html = _escape_text(data) if data else ""
762
+ else:
763
+ # Even when we can't safely insert whitespace *between* siblings, we can
764
+ # still pretty-print each element subtree to improve readability.
765
+ child_html = _node_to_html(child, 0, indent_size, pretty=True, in_pre=content_pre)
766
+ if child_html:
767
+ inner_parts.append(child_html)
139
768
 
140
- if all_text and pretty:
141
- return f"{prefix}{open_tag}{_escape_text(node.to_text(separator='', strip=False))}{serialize_end_tag(name)}"
769
+ return f"{prefix}{open_tag}{''.join(inner_parts)}{serialize_end_tag(name)}"
142
770
 
143
771
  # Render with child indentation
144
772
  parts = [f"{prefix}{open_tag}"]
145
773
  for child in children:
146
- child_html = _node_to_html(child, indent + 1, indent_size, pretty)
774
+ if pretty and not content_pre and _is_whitespace_text_node(child):
775
+ continue
776
+ child_html = _node_to_html(child, indent + 1, indent_size, pretty, in_pre=content_pre)
147
777
  if child_html:
148
778
  parts.append(child_html)
149
779
  parts.append(f"{prefix}{serialize_end_tag(name)}")
@@ -180,7 +810,7 @@ def _node_to_test_format(node: Any, indent: int) -> str:
180
810
  attribute_lines = _attrs_to_test_format(node, indent)
181
811
 
182
812
  # Template special handling (only HTML namespace templates have template_content)
183
- if node.name == "template" and node.namespace in {None, "html"} and node.template_content:
813
+ if node.name == "template" and node.namespace in {None, "html"} and node.template_content is not None:
184
814
  sections: list[str] = [line]
185
815
  if attribute_lines:
186
816
  sections.extend(attribute_lines)