justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
justhtml/node.py CHANGED
@@ -1,14 +1,228 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+ from urllib.parse import quote
5
+
6
+ from .sanitize import _sanitize
1
7
  from .selector import query
2
8
  from .serialize import to_html
3
9
 
10
+ if TYPE_CHECKING:
11
+ from .sanitize import SanitizationPolicy
12
+ from .tokens import Doctype
13
+
14
+
15
+ def _markdown_escape_text(s: str) -> str:
16
+ if not s:
17
+ return ""
18
+ # Pragmatic: escape the few characters that commonly change Markdown meaning.
19
+ # Keep this minimal to preserve readability.
20
+ out: list[str] = []
21
+ for ch in s:
22
+ if ch in "\\`*_[]":
23
+ out.append("\\")
24
+ out.append(ch)
25
+ return "".join(out)
26
+
27
+
28
+ def _markdown_code_span(s: str | None) -> str:
29
+ if s is None:
30
+ s = ""
31
+ # Use a backtick fence longer than any run of backticks inside.
32
+ longest = 0
33
+ run = 0
34
+ for ch in s:
35
+ if ch == "`":
36
+ run += 1
37
+ if run > longest:
38
+ longest = run
39
+ else:
40
+ run = 0
41
+ fence = "`" * (longest + 1)
42
+ # CommonMark requires a space if the content starts/ends with backticks.
43
+ needs_space = s.startswith("`") or s.endswith("`")
44
+ if needs_space:
45
+ return f"{fence} {s} {fence}"
46
+ return f"{fence}{s}{fence}"
47
+
4
48
 
5
- class SimpleDomNode:
6
- __slots__ = ("attrs", "children", "data", "name", "namespace", "parent")
49
+ def _markdown_link_destination(url: str) -> str:
50
+ """Return a Markdown-safe link destination.
51
+
52
+ We primarily care about avoiding Markdown formatting injection and broken
53
+ parsing for URLs that contain whitespace or parentheses.
54
+
55
+ CommonMark supports destinations wrapped in angle brackets:
56
+ `[text](<https://example.com/a(b)c>)`
57
+ """
58
+
59
+ u = (url or "").strip()
60
+ if not u:
61
+ return ""
62
+
63
+ # If the destination contains characters that can terminate or confuse
64
+ # the Markdown destination parser, wrap in <...> and percent-encode
65
+ # whitespace and angle brackets.
66
+ if any(ch in u for ch in (" ", "\t", "\n", "\r", "(", ")", "<", ">")):
67
+ u = quote(u, safe=":/?#[]@!$&'*+,;=%-._~()")
68
+ return f"<{u}>"
69
+
70
+ return u
71
+
72
+
73
+ class _MarkdownBuilder:
74
+ __slots__ = ("_buf", "_newline_count", "_pending_space")
75
+
76
+ _buf: list[str]
77
+ _newline_count: int
78
+ _pending_space: bool
79
+
80
+ def __init__(self) -> None:
81
+ self._buf = []
82
+ self._newline_count = 0
83
+ self._pending_space = False
84
+
85
+ def _rstrip_last_segment(self) -> None:
86
+ if not self._buf:
87
+ return
88
+ last = self._buf[-1]
89
+ stripped = last.rstrip(" \t")
90
+ if stripped != last:
91
+ self._buf[-1] = stripped
92
+
93
+ def newline(self, count: int = 1) -> None:
94
+ for _ in range(count):
95
+ self._pending_space = False
96
+ self._rstrip_last_segment()
97
+ self._buf.append("\n")
98
+ # Track newlines to make it easy to insert blank lines.
99
+ if self._newline_count < 2:
100
+ self._newline_count += 1
101
+
102
+ def ensure_newlines(self, count: int) -> None:
103
+ while self._newline_count < count:
104
+ self.newline(1)
105
+
106
+ def raw(self, s: str) -> None:
107
+ if not s:
108
+ return
109
+
110
+ # If we've collapsed whitespace and the next output is raw (e.g. "**"),
111
+ # we still need to emit a single separating space.
112
+ if self._pending_space:
113
+ first = s[0]
114
+ if first not in " \t\n\r\f" and self._buf and self._newline_count == 0:
115
+ self._buf.append(" ")
116
+ self._pending_space = False
117
+
118
+ self._buf.append(s)
119
+ if "\n" in s:
120
+ # Count trailing newlines (cap at 2 for blank-line semantics).
121
+ trailing = 0
122
+ i = len(s) - 1
123
+ while i >= 0 and s[i] == "\n":
124
+ trailing += 1
125
+ i -= 1
126
+ self._newline_count = min(2, trailing)
127
+ if trailing:
128
+ self._pending_space = False
129
+ else:
130
+ self._newline_count = 0
131
+
132
+ def text(self, s: str, preserve_whitespace: bool = False) -> None:
133
+ if not s:
134
+ return
7
135
 
8
- def __init__(self, name, attrs=None, data=None, namespace=None):
136
+ if preserve_whitespace:
137
+ self.raw(s)
138
+ return
139
+
140
+ for ch in s:
141
+ if ch in " \t\n\r\f":
142
+ self._pending_space = True
143
+ continue
144
+
145
+ if self._pending_space:
146
+ if self._buf and self._newline_count == 0:
147
+ self._buf.append(" ")
148
+ self._pending_space = False
149
+
150
+ self._buf.append(ch)
151
+ self._newline_count = 0
152
+
153
+ def finish(self) -> str:
154
+ out = "".join(self._buf)
155
+ return out.strip(" \t\n")
156
+
157
+
158
# Type alias for any node type.
# The value is a plain string spelling out the union of the node classes, not
# an evaluated type object, so it can be written before those classes exist.
NodeType = "SimpleDomNode | ElementNode | TemplateNode | TextNode"
160
+
161
+
162
+ def _to_text_collect(node: Any, parts: list[str], strip: bool) -> None:
163
+ # Iterative traversal avoids recursion overhead on large documents.
164
+ stack: list[Any] = [node]
165
+ while stack:
166
+ current = stack.pop()
167
+ name: str = current.name
168
+
169
+ if name == "#text":
170
+ data: str | None = current.data
171
+ if not data:
172
+ continue
173
+ if strip:
174
+ data = data.strip()
175
+ if not data:
176
+ continue
177
+ parts.append(data)
178
+ continue
179
+
180
+ # Preserve the same traversal order as the recursive implementation:
181
+ # children first, then template content.
182
+ if type(current) is TemplateNode and current.template_content:
183
+ stack.append(current.template_content)
184
+
185
+ children = current.children
186
+ if children:
187
+ stack.extend(reversed(children))
188
+
189
+
190
+ class SimpleDomNode:
191
+ __slots__ = (
192
+ "_origin_col",
193
+ "_origin_line",
194
+ "_origin_pos",
195
+ "attrs",
196
+ "children",
197
+ "data",
198
+ "name",
199
+ "namespace",
200
+ "parent",
201
+ )
202
+
203
+ name: str
204
+ parent: SimpleDomNode | ElementNode | TemplateNode | None
205
+ attrs: dict[str, str | None] | None
206
+ children: list[Any] | None
207
+ data: str | Doctype | None
208
+ namespace: str | None
209
+ _origin_pos: int | None
210
+ _origin_line: int | None
211
+ _origin_col: int | None
212
+
213
+ def __init__(
214
+ self,
215
+ name: str,
216
+ attrs: dict[str, str | None] | None = None,
217
+ data: str | Doctype | None = None,
218
+ namespace: str | None = None,
219
+ ) -> None:
9
220
  self.name = name
10
221
  self.parent = None
11
222
  self.data = data
223
+ self._origin_pos = None
224
+ self._origin_line = None
225
+ self._origin_col = None
12
226
 
13
227
  if name.startswith("#") or name == "!doctype":
14
228
  self.namespace = namespace
@@ -23,19 +237,48 @@ class SimpleDomNode:
23
237
  self.children = []
24
238
  self.attrs = attrs if attrs is not None else {}
25
239
 
26
- def append_child(self, node):
27
- self.children.append(node)
28
- node.parent = self
240
+ def append_child(self, node: Any) -> None:
241
+ if self.children is not None:
242
+ self.children.append(node)
243
+ node.parent = self
244
+
245
@property
def origin_offset(self) -> int | None:
    """Best-effort origin offset (0-indexed) in the source HTML, if known.

    Read-only view of ``_origin_pos``; ``None`` when no position was recorded.
    """
    return self._origin_pos
29
249
 
30
- def remove_child(self, node):
31
- self.children.remove(node)
32
- node.parent = None
250
@property
def origin_line(self) -> int | None:
    """Return the recorded ``_origin_line`` value, or ``None`` if none was set.

    NOTE(review): presumably the source line this node came from; the
    indexing base is not visible here - confirm against the code that sets it.
    """
    return self._origin_line
253
+
254
@property
def origin_col(self) -> int | None:
    """Return the recorded ``_origin_col`` value, or ``None`` if none was set.

    NOTE(review): presumably the source column this node came from; the
    indexing base is not visible here - confirm against the code that sets it.
    """
    return self._origin_col
33
257
 
34
- def to_html(self, indent=0, indent_size=2, pretty=True):
258
+ @property
259
+ def origin_location(self) -> tuple[int, int] | None:
260
+ if self._origin_line is None or self._origin_col is None:
261
+ return None
262
+ return (self._origin_line, self._origin_col)
263
+
264
+ def remove_child(self, node: Any) -> None:
265
+ if self.children is not None:
266
+ self.children.remove(node)
267
+ node.parent = None
268
+
269
def to_html(
    self,
    indent: int = 0,
    indent_size: int = 2,
    pretty: bool = True,
    *,
    safe: bool = True,
    policy: SanitizationPolicy | None = None,
) -> str:
    """Convert node to HTML string.

    Thin wrapper: every argument is forwarded unchanged to the module-level
    ``to_html`` imported from ``.serialize``, with this node as the root.

    NOTE(review): ``safe``/``policy`` presumably control sanitization of the
    output, as they do in ``to_text``; the serializer is not visible here.
    """
    return to_html(self, indent, indent_size, pretty=pretty, safe=safe, policy=policy)
37
280
 
38
- def query(self, selector):
281
+ def query(self, selector: str) -> list[Any]:
39
282
  """
40
283
  Query this subtree using a CSS selector.
41
284
 
@@ -48,18 +291,65 @@ class SimpleDomNode:
48
291
  Raises:
49
292
  ValueError: If the selector is invalid
50
293
  """
51
- return query(self, selector)
294
+ result: list[Any] = query(self, selector)
295
+ return result
52
296
 
53
297
  @property
54
- def text(self):
55
- """Return the text content of this node and its descendants."""
298
+ def text(self) -> str:
299
+ """Return the node's own text value.
300
+
301
+ For text nodes this is the node data. For other nodes this is an empty
302
+ string. Use `to_text()` to get textContent semantics.
303
+ """
56
304
  if self.name == "#text":
57
- return self.data or ""
58
- if not self.children:
305
+ data = self.data
306
+ if isinstance(data, str):
307
+ return data
59
308
  return ""
60
- return "".join(child.text for child in self.children)
309
+ return ""
310
+
311
def to_text(
    self,
    separator: str = " ",
    strip: bool = True,
    *,
    safe: bool = True,
    policy: SanitizationPolicy | None = None,
) -> str:
    """Return the joined text of all text nodes in this subtree.

    Args:
        separator: String placed between consecutive text segments
            (default: a single space).
        strip: When true, each text node is stripped and empty segments
            are dropped.
        safe: When true, the subtree is passed through ``_sanitize`` before
            text is extracted.
        policy: Sanitization policy forwarded to ``_sanitize``.

    Returns:
        The joined text, or ``""`` when no text was collected. Template
        element contents are included via ``template_content``.
    """
    root: Any = self
    if safe:
        root = _sanitize(self, policy=policy)
    collected: list[str] = []
    _to_text_collect(root, collected, strip=strip)
    return separator.join(collected) if collected else ""
334
+
335
def to_markdown(self, *, safe: bool = True, policy: SanitizationPolicy | None = None) -> str:
    """Return a GitHub Flavored Markdown representation of this subtree.

    This is a pragmatic HTML->Markdown converter intended for readability.
    - Tables and images are preserved as raw HTML.
    - Unknown elements fall back to rendering their children.

    Args:
        safe: When true, the subtree is passed through ``_sanitize`` first.
        policy: Sanitization policy forwarded to ``_sanitize``.
    """
    root: Any = _sanitize(self, policy=policy) if safe else self
    out = _MarkdownBuilder()
    _to_markdown_walk(root, out, preserve_whitespace=False, list_depth=0)
    return out.finish()
351
+
352
+ def insert_before(self, node: Any, reference_node: Any | None) -> None:
63
353
  """
64
354
  Insert a node before a reference node.
65
355
 
@@ -84,7 +374,7 @@ class SimpleDomNode:
84
374
  except ValueError:
85
375
  raise ValueError("Reference node is not a child of this node") from None
86
376
 
87
- def replace_child(self, new_node, old_node):
377
+ def replace_child(self, new_node: Any, old_node: Any) -> Any:
88
378
  """
89
379
  Replace a child node with a new node.
90
380
 
@@ -111,26 +401,31 @@ class SimpleDomNode:
111
401
  old_node.parent = None
112
402
  return old_node
113
403
 
114
- def has_child_nodes(self):
404
def has_child_nodes(self) -> bool:
    """Return True if this node has children.

    Both an empty child list and ``children is None`` count as "no children".
    """
    return bool(self.children)
117
407
 
118
- def clone_node(self, deep=False):
408
+ def clone_node(self, deep: bool = False, override_attrs: dict[str, str | None] | None = None) -> SimpleDomNode:
119
409
  """
120
410
  Clone this node.
121
411
 
122
412
  Args:
123
413
  deep: If True, recursively clone children.
414
+ override_attrs: Optional dictionary to use as attributes for the clone.
124
415
 
125
416
  Returns:
126
417
  A new node that is a copy of this node.
127
418
  """
419
+ attrs = override_attrs if override_attrs is not None else (self.attrs.copy() if self.attrs else None)
128
420
  clone = SimpleDomNode(
129
421
  self.name,
130
- self.attrs.copy() if self.attrs else None,
422
+ attrs,
131
423
  self.data,
132
424
  self.namespace,
133
425
  )
426
+ clone._origin_pos = self._origin_pos
427
+ clone._origin_line = self._origin_line
428
+ clone._origin_col = self._origin_col
134
429
  if deep and self.children:
135
430
  for child in self.children:
136
431
  clone.append_child(child.clone_node(deep=True))
@@ -138,18 +433,30 @@ class SimpleDomNode:
138
433
 
139
434
 
140
435
  class ElementNode(SimpleDomNode):
141
- __slots__ = ()
436
+ __slots__ = ("template_content",)
142
437
 
143
- def __init__(self, name, attrs, namespace):
438
+ template_content: SimpleDomNode | None
439
+ children: list[Any]
440
+ attrs: dict[str, str | None]
441
+
442
def __init__(self, name: str, attrs: dict[str, str | None] | None, namespace: str | None) -> None:
    """Create an element node with no parent, no children and no data.

    Args:
        name: Tag name.
        attrs: Attribute mapping; ``None`` is replaced by a fresh empty dict.
            A supplied dict is stored as-is (not copied).
        namespace: Namespace of the element.
    """
    # Every slot is assigned directly here; SimpleDomNode.__init__ is not called.
    self.name = name
    self.parent = None
    self.data = None
    self.namespace = namespace
    self.children = []
    self.attrs = attrs if attrs is not None else {}
    self.template_content = None
    self._origin_pos = None
    self._origin_line = None
    self._origin_col = None
453
+
454
+ def clone_node(self, deep: bool = False, override_attrs: dict[str, str | None] | None = None) -> ElementNode:
455
+ attrs = override_attrs if override_attrs is not None else (self.attrs.copy() if self.attrs else {})
456
+ clone = ElementNode(self.name, attrs, self.namespace)
457
+ clone._origin_pos = self._origin_pos
458
+ clone._origin_line = self._origin_line
459
+ clone._origin_col = self._origin_col
153
460
  if deep:
154
461
  for child in self.children:
155
462
  clone.append_child(child.clone_node(deep=True))
@@ -157,22 +464,32 @@ class ElementNode(SimpleDomNode):
157
464
 
158
465
 
159
466
  class TemplateNode(ElementNode):
160
- __slots__ = ("template_content",)
467
+ __slots__ = ()
161
468
 
162
- def __init__(self, name, attrs=None, data=None, namespace=None):
469
def __init__(
    self,
    name: str,
    attrs: dict[str, str | None] | None = None,
    data: str | None = None,
    namespace: str | None = None,
) -> None:
    """Create a template element.

    Args:
        name: Tag name.
        attrs: Attribute mapping, forwarded to ``ElementNode.__init__``.
        data: Accepted but not used by this constructor.
        namespace: Namespace of the element.
    """
    super().__init__(name, attrs, namespace)
    # Only templates in the "html" namespace get a content fragment.
    if self.namespace == "html":
        self.template_content = SimpleDomNode("#document-fragment")
    else:
        self.template_content = None
481
 
169
- def clone_node(self, deep=False):
482
+ def clone_node(self, deep: bool = False, override_attrs: dict[str, str | None] | None = None) -> TemplateNode:
483
+ attrs = override_attrs if override_attrs is not None else (self.attrs.copy() if self.attrs else {})
170
484
  clone = TemplateNode(
171
485
  self.name,
172
- self.attrs.copy() if self.attrs else {},
173
- self.data,
486
+ attrs,
487
+ None,
174
488
  self.namespace,
175
489
  )
490
+ clone._origin_pos = self._origin_pos
491
+ clone._origin_line = self._origin_line
492
+ clone._origin_col = self._origin_col
176
493
  if deep:
177
494
  if self.template_content:
178
495
  clone.template_content = self.template_content.clone_node(deep=True)
@@ -182,27 +499,407 @@ class TemplateNode(ElementNode):
182
499
 
183
500
 
184
501
class TextNode:
    """Leaf DOM node holding character data.

    ``name`` is always ``"#text"`` and ``namespace`` is always ``None``. The
    node has no child list of its own; ``children`` is a read-only property
    returning an empty list.
    """

    __slots__ = ("_origin_col", "_origin_line", "_origin_pos", "data", "name", "namespace", "parent")

    data: str | None
    name: str
    namespace: None
    parent: SimpleDomNode | ElementNode | TemplateNode | None
    _origin_pos: int | None
    _origin_line: int | None
    _origin_col: int | None

    def __init__(self, data: str | None) -> None:
        self.data = data
        self.parent = None
        self.name = "#text"
        self.namespace = None
        self._origin_pos = None
        self._origin_line = None
        self._origin_col = None

    @property
    def origin_offset(self) -> int | None:
        """Best-effort origin offset (0-indexed) in the source HTML, if known."""
        return self._origin_pos

    @property
    def origin_line(self) -> int | None:
        """Return the recorded ``_origin_line`` value, or ``None`` if unset."""
        return self._origin_line

    @property
    def origin_col(self) -> int | None:
        """Return the recorded ``_origin_col`` value, or ``None`` if unset."""
        return self._origin_col

    @property
    def origin_location(self) -> tuple[int, int] | None:
        """Return ``(origin_line, origin_col)``, or ``None`` if either is unknown."""
        if self._origin_line is None or self._origin_col is None:
            return None
        return (self._origin_line, self._origin_col)

    @property
    def text(self) -> str:
        """Return the text content of this node."""
        return self.data or ""

    def to_text(
        self,
        separator: str = " ",
        strip: bool = True,
        *,
        safe: bool = True,
        policy: SanitizationPolicy | None = None,
    ) -> str:
        """Return this node's text, stripped when *strip* is true.

        ``separator``, ``safe`` and ``policy`` are accepted for API
        consistency with the other node classes; they do not affect leaf
        nodes.
        """
        _ = separator
        _ = safe
        _ = policy

        if self.data is None:
            return ""
        if strip:
            return self.data.strip()
        return self.data

    def to_markdown(self, *, safe: bool = True, policy: SanitizationPolicy | None = None) -> str:
        """Return this node's text as Markdown (escaped, whitespace collapsed).

        ``safe`` and ``policy`` are accepted so that ``to_markdown`` can be
        called with the same keywords on every node type; as in ``to_text``
        they have no effect on a text leaf.
        """
        _ = safe
        _ = policy

        builder = _MarkdownBuilder()
        builder.text(_markdown_escape_text(self.data or ""), preserve_whitespace=False)
        return builder.finish()

    @property
    def children(self) -> list[Any]:
        """Return empty list for TextNode (leaf node)."""
        return []

    def has_child_nodes(self) -> bool:
        """Return False for TextNode."""
        return False

    def clone_node(self, deep: bool = False) -> TextNode:
        """Return a detached copy carrying the same data and origin fields.

        ``deep`` is accepted for signature compatibility; a leaf has nothing
        further to copy.
        """
        clone = TextNode(self.data)
        clone._origin_pos = self._origin_pos
        clone._origin_line = self._origin_line
        clone._origin_col = self._origin_col
        return clone
584
+
585
+
586
# Tag names the Markdown walker treats as block-level. `_to_markdown_walk`
# checks this set at the end of its generic container path to decide whether
# to follow the element with a blank line (or a single space inside a link).
# Several members (e.g. "p", "pre", "ul", headings) also have dedicated
# branches in the walker that return before that check is reached.
_MARKDOWN_BLOCK_ELEMENTS: frozenset[str] = frozenset(
    {
        "p",
        "div",
        "section",
        "article",
        "header",
        "footer",
        "main",
        "nav",
        "aside",
        "blockquote",
        "pre",
        "ul",
        "ol",
        "li",
        "hr",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "table",
    }
)
612
+
613
+
614
def _markdown_walk_children(
    node: Any,
    builder: _MarkdownBuilder,
    preserve_whitespace: bool,
    list_depth: int,
    in_link: bool,
) -> None:
    """Render each child of *node* into *builder*, in document order."""
    for child in node.children or []:
        _to_markdown_walk(child, builder, preserve_whitespace, list_depth, in_link=in_link)


def _to_markdown_walk(
    node: Any,
    builder: _MarkdownBuilder,
    preserve_whitespace: bool,
    list_depth: int,
    in_link: bool = False,
) -> None:
    """Render *node* and its subtree as Markdown into *builder*.

    Args:
        node: Node to render.
        builder: Output buffer that receives the Markdown.
        preserve_whitespace: When true, text nodes are emitted verbatim
            instead of being escaped and whitespace-collapsed.
        list_depth: Current list nesting level, used to indent list markers.
        in_link: True while rendering the text of an ``<a>``; block
            constructs are then flattened to inline output so the link text
            stays on one line.
    """
    name: str = node.name

    if name == "#text":
        content = node.data or ""
        if preserve_whitespace:
            builder.raw(content)
        else:
            builder.text(_markdown_escape_text(content), preserve_whitespace=False)
        return

    if name == "br":
        if in_link:
            builder.text(" ", preserve_whitespace=False)
        else:
            builder.newline(1)
        return

    # Comments/doctype don't contribute.
    if name in ("#comment", "!doctype"):
        return

    # Document containers contribute via descendants.
    if name.startswith("#"):
        _markdown_walk_children(node, builder, preserve_whitespace, list_depth, in_link)
        return

    tag = name.lower()

    # Metadata containers don't contribute to body text.
    if tag in ("head", "title"):
        return

    # <img> and <table> are preserved as HTML.
    if tag == "img":
        builder.raw(node.to_html(indent=0, indent_size=2, pretty=False))
        return

    if tag == "table":
        if not in_link:
            builder.ensure_newlines(2 if builder._buf else 0)
        builder.raw(node.to_html(indent=0, indent_size=2, pretty=False))
        if not in_link:
            builder.ensure_newlines(2)
        return

    # Headings: the "#" prefix is only written outside links.
    if tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
        if not in_link:
            builder.ensure_newlines(2 if builder._buf else 0)
            builder.raw("#" * int(tag[1]))
            builder.raw(" ")
        _markdown_walk_children(node, builder, False, list_depth, in_link)
        if not in_link:
            builder.ensure_newlines(2)
        return

    # Horizontal rule (nothing is emitted inside a link).
    if tag == "hr":
        if not in_link:
            builder.ensure_newlines(2 if builder._buf else 0)
            builder.raw("---")
            builder.ensure_newlines(2)
        return

    # Code blocks: fenced outside links, inline code span inside them.
    if tag == "pre":
        code = node.to_text(separator="", strip=False)
        if in_link:
            builder.raw(_markdown_code_span(code))
            return
        builder.ensure_newlines(2 if builder._buf else 0)
        builder.raw("```")
        builder.newline(1)
        if code:
            builder.raw(code.rstrip("\n"))
            builder.newline(1)
        builder.raw("```")
        builder.ensure_newlines(2)
        return

    # Inline code.
    if tag == "code" and not preserve_whitespace:
        builder.raw(_markdown_code_span(node.to_text(separator="", strip=False)))
        return

    # Paragraphs.
    if tag == "p":
        if not in_link:
            builder.ensure_newlines(2 if builder._buf else 0)
        _markdown_walk_children(node, builder, False, list_depth, in_link)
        if in_link:
            builder.text(" ", preserve_whitespace=False)
        else:
            builder.ensure_newlines(2)
        return

    # Blockquotes: render the content separately, then prefix each line.
    if tag == "blockquote":
        if in_link:
            _markdown_walk_children(node, builder, False, list_depth, in_link)
            return
        builder.ensure_newlines(2 if builder._buf else 0)
        inner = _MarkdownBuilder()
        _markdown_walk_children(node, inner, False, list_depth, in_link)
        quoted = inner.finish()
        if quoted:
            for i, line in enumerate(quoted.split("\n")):
                if i:
                    builder.newline(1)
                builder.raw("> ")
                builder.raw(line)
        builder.ensure_newlines(2)
        return

    # Lists: only direct <li> children become items.
    if tag in ("ul", "ol"):
        items = [child for child in node.children or [] if child.name.lower() == "li"]
        if in_link:
            # Flatten list inside link.
            for item in items:
                builder.raw(" ")
                _markdown_walk_children(item, builder, False, list_depth + 1, in_link)
            return
        builder.ensure_newlines(2 if builder._buf else 0)
        for idx, item in enumerate(items, start=1):
            if idx > 1:
                builder.newline(1)
            builder.raw(" " * list_depth)
            builder.raw(f"{idx}. " if tag == "ol" else "- ")
            # Render list item content inline-ish.
            _markdown_walk_children(item, builder, False, list_depth + 1, in_link)
        builder.ensure_newlines(2)
        return

    # Emphasis/strong.
    if tag in ("em", "i"):
        builder.raw("*")
        _markdown_walk_children(node, builder, False, list_depth, in_link)
        builder.raw("*")
        return

    if tag in ("strong", "b"):
        builder.raw("**")
        _markdown_walk_children(node, builder, False, list_depth, in_link)
        builder.raw("**")
        return

    # Links.
    if tag == "a":
        href = ""
        if node.attrs and "href" in node.attrs and node.attrs["href"] is not None:
            href = str(node.attrs["href"])

        # The link text goes through its own builder so that surrounding
        # whitespace is stripped by finish().
        label = _MarkdownBuilder()
        _markdown_walk_children(node, label, False, list_depth, True)

        builder.raw("[")
        builder.raw(label.finish())
        builder.raw("]")
        if href:
            builder.raw("(")
            builder.raw(_markdown_link_destination(href))
            builder.raw(")")
        return

    # Containers / unknown tags: recurse into children.
    next_preserve = preserve_whitespace or (tag in {"textarea", "script", "style"})
    _markdown_walk_children(node, builder, next_preserve, list_depth, in_link)

    if isinstance(node, ElementNode) and node.template_content:
        _to_markdown_walk(
            node.template_content,
            builder,
            next_preserve,
            list_depth,
            in_link=in_link,
        )

    # Add spacing after block containers to keep output readable.
    if tag in _MARKDOWN_BLOCK_ELEMENTS:
        if in_link:
            builder.text(" ", preserve_whitespace=False)
        else:
            builder.ensure_newlines(2)