justhtml 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +6 -0
- justhtml/__main__.py +49 -16
- justhtml/entities.py +45 -7
- justhtml/errors.py +9 -0
- justhtml/node.py +358 -89
- justhtml/parser.py +70 -14
- justhtml/sanitize.py +763 -0
- justhtml/selector.py +114 -18
- justhtml/serialize.py +332 -28
- justhtml/tokenizer.py +249 -179
- justhtml/tokens.py +8 -3
- justhtml/treebuilder.py +50 -14
- justhtml/treebuilder_modes.py +100 -36
- justhtml-0.24.0.dist-info/METADATA +192 -0
- justhtml-0.24.0.dist-info/RECORD +24 -0
- {justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.12.0.dist-info/METADATA +0 -164
- justhtml-0.12.0.dist-info/RECORD +0 -23
- {justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/WHEEL +0 -0
- {justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/entry_points.txt +0 -0
justhtml/node.py
CHANGED
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from typing import TYPE_CHECKING, Any
|
|
4
|
+
from urllib.parse import quote
|
|
4
5
|
|
|
6
|
+
from .sanitize import sanitize
|
|
5
7
|
from .selector import query
|
|
6
8
|
from .serialize import to_html
|
|
7
9
|
|
|
8
10
|
if TYPE_CHECKING:
|
|
11
|
+
from .sanitize import SanitizationPolicy
|
|
9
12
|
from .tokens import Doctype
|
|
10
13
|
|
|
11
14
|
|
|
@@ -43,6 +46,30 @@ def _markdown_code_span(s: str | None) -> str:
|
|
|
43
46
|
return f"{fence}{s}{fence}"
|
|
44
47
|
|
|
45
48
|
|
|
49
|
+
def _markdown_link_destination(url: str) -> str:
|
|
50
|
+
"""Return a Markdown-safe link destination.
|
|
51
|
+
|
|
52
|
+
We primarily care about avoiding Markdown formatting injection and broken
|
|
53
|
+
parsing for URLs that contain whitespace or parentheses.
|
|
54
|
+
|
|
55
|
+
CommonMark supports destinations wrapped in angle brackets:
|
|
56
|
+
`[text](<https://example.com/a(b)c>)`
|
|
57
|
+
"""
|
|
58
|
+
|
|
59
|
+
u = (url or "").strip()
|
|
60
|
+
if not u:
|
|
61
|
+
return ""
|
|
62
|
+
|
|
63
|
+
# If the destination contains characters that can terminate or confuse
|
|
64
|
+
# the Markdown destination parser, wrap in <...> and percent-encode
|
|
65
|
+
# whitespace and angle brackets.
|
|
66
|
+
if any(ch in u for ch in (" ", "\t", "\n", "\r", "(", ")", "<", ">")):
|
|
67
|
+
u = quote(u, safe=":/?#[]@!$&'*+,;=%-._~()")
|
|
68
|
+
return f"<{u}>"
|
|
69
|
+
|
|
70
|
+
return u
|
|
71
|
+
|
|
72
|
+
|
|
46
73
|
class _MarkdownBuilder:
|
|
47
74
|
__slots__ = ("_buf", "_newline_count", "_pending_space")
|
|
48
75
|
|
|
@@ -133,29 +160,45 @@ NodeType = "SimpleDomNode | ElementNode | TemplateNode | TextNode"
|
|
|
133
160
|
|
|
134
161
|
|
|
135
162
|
def _to_text_collect(node: Any, parts: list[str], strip: bool) -> None:
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
if
|
|
143
|
-
data = data
|
|
163
|
+
# Iterative traversal avoids recursion overhead on large documents.
|
|
164
|
+
stack: list[Any] = [node]
|
|
165
|
+
while stack:
|
|
166
|
+
current = stack.pop()
|
|
167
|
+
name: str = current.name
|
|
168
|
+
|
|
169
|
+
if name == "#text":
|
|
170
|
+
data: str | None = current.data
|
|
144
171
|
if not data:
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
172
|
+
continue
|
|
173
|
+
if strip:
|
|
174
|
+
data = data.strip()
|
|
175
|
+
if not data:
|
|
176
|
+
continue
|
|
177
|
+
parts.append(data)
|
|
178
|
+
continue
|
|
148
179
|
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
180
|
+
# Preserve the same traversal order as the recursive implementation:
|
|
181
|
+
# children first, then template content.
|
|
182
|
+
if type(current) is TemplateNode and current.template_content:
|
|
183
|
+
stack.append(current.template_content)
|
|
152
184
|
|
|
153
|
-
|
|
154
|
-
|
|
185
|
+
children = current.children
|
|
186
|
+
if children:
|
|
187
|
+
stack.extend(reversed(children))
|
|
155
188
|
|
|
156
189
|
|
|
157
190
|
class SimpleDomNode:
|
|
158
|
-
__slots__ = (
|
|
191
|
+
__slots__ = (
|
|
192
|
+
"_origin_col",
|
|
193
|
+
"_origin_line",
|
|
194
|
+
"_origin_pos",
|
|
195
|
+
"attrs",
|
|
196
|
+
"children",
|
|
197
|
+
"data",
|
|
198
|
+
"name",
|
|
199
|
+
"namespace",
|
|
200
|
+
"parent",
|
|
201
|
+
)
|
|
159
202
|
|
|
160
203
|
name: str
|
|
161
204
|
parent: SimpleDomNode | ElementNode | TemplateNode | None
|
|
@@ -163,6 +206,9 @@ class SimpleDomNode:
|
|
|
163
206
|
children: list[Any] | None
|
|
164
207
|
data: str | Doctype | None
|
|
165
208
|
namespace: str | None
|
|
209
|
+
_origin_pos: int | None
|
|
210
|
+
_origin_line: int | None
|
|
211
|
+
_origin_col: int | None
|
|
166
212
|
|
|
167
213
|
def __init__(
|
|
168
214
|
self,
|
|
@@ -174,6 +220,9 @@ class SimpleDomNode:
|
|
|
174
220
|
self.name = name
|
|
175
221
|
self.parent = None
|
|
176
222
|
self.data = data
|
|
223
|
+
self._origin_pos = None
|
|
224
|
+
self._origin_line = None
|
|
225
|
+
self._origin_col = None
|
|
177
226
|
|
|
178
227
|
if name.startswith("#") or name == "!doctype":
|
|
179
228
|
self.namespace = namespace
|
|
@@ -193,14 +242,41 @@ class SimpleDomNode:
|
|
|
193
242
|
self.children.append(node)
|
|
194
243
|
node.parent = self
|
|
195
244
|
|
|
245
|
+
@property
def origin_offset(self) -> int | None:
    """Best-effort origin offset (0-indexed) in the source HTML.

    Returns the value stored in ``_origin_pos``; ``None`` means the
    offset is not known.
    """
    offset = self._origin_pos
    return offset
|
|
249
|
+
|
|
250
|
+
@property
def origin_line(self) -> int | None:
    """Source line recorded for this node, or ``None`` when not known."""
    line = self._origin_line
    return line
|
|
253
|
+
|
|
254
|
+
@property
def origin_col(self) -> int | None:
    """Source column recorded for this node, or ``None`` when not known."""
    col = self._origin_col
    return col
|
|
257
|
+
|
|
258
|
+
@property
def origin_location(self) -> tuple[int, int] | None:
    """Return ``(line, col)`` for this node, or ``None``.

    ``None`` is returned unless both the recorded line and the recorded
    column are set.
    """
    line = self._origin_line
    if line is None:
        return None
    col = self._origin_col
    return None if col is None else (line, col)
|
|
263
|
+
|
|
196
264
|
def remove_child(self, node: Any) -> None:
|
|
197
265
|
if self.children is not None:
|
|
198
266
|
self.children.remove(node)
|
|
199
267
|
node.parent = None
|
|
200
268
|
|
|
201
|
-
def to_html(
    self,
    indent: int = 0,
    indent_size: int = 2,
    pretty: bool = True,
    *,
    safe: bool = True,
    policy: SanitizationPolicy | None = None,
) -> str:
    """Convert node to HTML string.

    Thin wrapper: every argument is forwarded unchanged to the
    module-level ``to_html`` serializer, with this node as the root.
    ``safe`` and ``policy`` are keyword-only.
    NOTE(review): presumably ``safe``/``policy`` select sanitization in
    the serializer, as they do in ``to_text`` -- confirm in serialize.py.
    """
    rendered: str = to_html(
        self,
        indent,
        indent_size,
        pretty=pretty,
        safe=safe,
        policy=policy,
    )
    return rendered
|
|
204
280
|
|
|
205
281
|
def query(self, selector: str) -> list[Any]:
|
|
206
282
|
"""
|
|
@@ -232,27 +308,43 @@ class SimpleDomNode:
|
|
|
232
308
|
return ""
|
|
233
309
|
return ""
|
|
234
310
|
|
|
235
|
-
def to_text(
    self,
    separator: str = " ",
    strip: bool = True,
    *,
    safe: bool = True,
    policy: SanitizationPolicy | None = None,
) -> str:
    """Return the concatenated text of this node's descendants.

    - `separator` controls how text nodes are joined (default: a single space).
    - `strip=True` strips each text node and drops empty segments.
    - `safe=True` sanitizes untrusted HTML before extracting text.
    - `policy` overrides the default sanitization policy.

    Template element contents are included via `template_content`.
    """
    # Read from the sanitized result when requested, otherwise from this
    # node directly.
    if safe:
        root: Any = sanitize(self, policy=policy)
    else:
        root = self

    collected: list[str] = []
    _to_text_collect(root, collected, strip=strip)
    if collected:
        return separator.join(collected)
    return ""
|
|
248
334
|
|
|
249
|
-
def to_markdown(self, *, safe: bool = True, policy: SanitizationPolicy | None = None) -> str:
    """Return a GitHub Flavored Markdown representation of this subtree.

    This is a pragmatic HTML->Markdown converter intended for readability.
    - Tables and images are preserved as raw HTML.
    - Unknown elements fall back to rendering their children.

    Args:
        safe: When true, the subtree is passed through ``sanitize`` first
            and the result is rendered instead of this node.
        policy: Forwarded to ``sanitize``; only consulted when ``safe`` is
            true.

    Returns:
        The Markdown text produced by the builder.
    """
    # Choose the root once (same shape as to_text) so the render sequence
    # is not duplicated across a safe and an unsafe branch.
    root: Any = sanitize(self, policy=policy) if safe else self

    builder = _MarkdownBuilder()
    _to_markdown_walk(root, builder, preserve_whitespace=False, list_depth=0)
    return builder.finish()
|
|
@@ -329,6 +421,9 @@ class SimpleDomNode:
|
|
|
329
421
|
self.data,
|
|
330
422
|
self.namespace,
|
|
331
423
|
)
|
|
424
|
+
clone._origin_pos = self._origin_pos
|
|
425
|
+
clone._origin_line = self._origin_line
|
|
426
|
+
clone._origin_col = self._origin_col
|
|
332
427
|
if deep and self.children:
|
|
333
428
|
for child in self.children:
|
|
334
429
|
clone.append_child(child.clone_node(deep=True))
|
|
@@ -350,9 +445,15 @@ class ElementNode(SimpleDomNode):
|
|
|
350
445
|
self.children = []
|
|
351
446
|
self.attrs = attrs if attrs is not None else {}
|
|
352
447
|
self.template_content = None
|
|
448
|
+
self._origin_pos = None
|
|
449
|
+
self._origin_line = None
|
|
450
|
+
self._origin_col = None
|
|
353
451
|
|
|
354
452
|
def clone_node(self, deep: bool = False) -> ElementNode:
|
|
355
453
|
clone = ElementNode(self.name, self.attrs.copy() if self.attrs else {}, self.namespace)
|
|
454
|
+
clone._origin_pos = self._origin_pos
|
|
455
|
+
clone._origin_line = self._origin_line
|
|
456
|
+
clone._origin_col = self._origin_col
|
|
356
457
|
if deep:
|
|
357
458
|
for child in self.children:
|
|
358
459
|
clone.append_child(child.clone_node(deep=True))
|
|
@@ -382,6 +483,9 @@ class TemplateNode(ElementNode):
|
|
|
382
483
|
None,
|
|
383
484
|
self.namespace,
|
|
384
485
|
)
|
|
486
|
+
clone._origin_pos = self._origin_pos
|
|
487
|
+
clone._origin_line = self._origin_line
|
|
488
|
+
clone._origin_col = self._origin_col
|
|
385
489
|
if deep:
|
|
386
490
|
if self.template_content:
|
|
387
491
|
clone.template_content = self.template_content.clone_node(deep=True)
|
|
@@ -391,26 +495,62 @@ class TemplateNode(ElementNode):
|
|
|
391
495
|
|
|
392
496
|
|
|
393
497
|
class TextNode:
|
|
394
|
-
__slots__ = ("data", "name", "namespace", "parent")
|
|
498
|
+
__slots__ = ("_origin_col", "_origin_line", "_origin_pos", "data", "name", "namespace", "parent")
|
|
395
499
|
|
|
396
500
|
data: str | None
|
|
397
501
|
name: str
|
|
398
502
|
namespace: None
|
|
399
503
|
parent: SimpleDomNode | ElementNode | TemplateNode | None
|
|
504
|
+
_origin_pos: int | None
|
|
505
|
+
_origin_line: int | None
|
|
506
|
+
_origin_col: int | None
|
|
400
507
|
|
|
401
508
|
def __init__(self, data: str | None) -> None:
|
|
402
509
|
self.data = data
|
|
403
510
|
self.parent = None
|
|
404
511
|
self.name = "#text"
|
|
405
512
|
self.namespace = None
|
|
513
|
+
self._origin_pos = None
|
|
514
|
+
self._origin_line = None
|
|
515
|
+
self._origin_col = None
|
|
516
|
+
|
|
517
|
+
@property
def origin_offset(self) -> int | None:
    """Best-effort origin offset (0-indexed) in the source HTML.

    Returns the value stored in ``_origin_pos``; ``None`` means the
    offset is not known.
    """
    offset = self._origin_pos
    return offset
|
|
521
|
+
|
|
522
|
+
@property
def origin_line(self) -> int | None:
    """Source line recorded for this text node, or ``None`` when not known."""
    line = self._origin_line
    return line
|
|
525
|
+
|
|
526
|
+
@property
def origin_col(self) -> int | None:
    """Source column recorded for this text node, or ``None`` when not known."""
    col = self._origin_col
    return col
|
|
529
|
+
|
|
530
|
+
@property
def origin_location(self) -> tuple[int, int] | None:
    """Return ``(line, col)`` for this text node, or ``None``.

    ``None`` is returned unless both the recorded line and the recorded
    column are set.
    """
    line = self._origin_line
    if line is None:
        return None
    col = self._origin_col
    return None if col is None else (line, col)
|
|
406
535
|
|
|
407
536
|
@property
def text(self) -> str:
    """Return the text content of this node.

    A falsy ``data`` value (``None`` or an empty string) yields ``""``.
    """
    content = self.data
    return content if content else ""
|
|
411
540
|
|
|
412
|
-
def to_text(
|
|
541
|
+
def to_text(
|
|
542
|
+
self,
|
|
543
|
+
separator: str = " ",
|
|
544
|
+
strip: bool = True,
|
|
545
|
+
*,
|
|
546
|
+
safe: bool = True,
|
|
547
|
+
policy: SanitizationPolicy | None = None,
|
|
548
|
+
) -> str:
|
|
413
549
|
# Parameters are accepted for API consistency; they don't affect leaf nodes.
|
|
550
|
+
_ = separator
|
|
551
|
+
_ = safe
|
|
552
|
+
_ = policy
|
|
553
|
+
|
|
414
554
|
if self.data is None:
|
|
415
555
|
return ""
|
|
416
556
|
if strip:
|
|
@@ -432,7 +572,11 @@ class TextNode:
|
|
|
432
572
|
return False
|
|
433
573
|
|
|
434
574
|
def clone_node(self, deep: bool = False) -> TextNode:
|
|
435
|
-
|
|
575
|
+
clone = TextNode(self.data)
|
|
576
|
+
clone._origin_pos = self._origin_pos
|
|
577
|
+
clone._origin_line = self._origin_line
|
|
578
|
+
clone._origin_col = self._origin_col
|
|
579
|
+
return clone
|
|
436
580
|
|
|
437
581
|
|
|
438
582
|
_MARKDOWN_BLOCK_ELEMENTS: frozenset[str] = frozenset(
|
|
@@ -463,7 +607,13 @@ _MARKDOWN_BLOCK_ELEMENTS: frozenset[str] = frozenset(
|
|
|
463
607
|
)
|
|
464
608
|
|
|
465
609
|
|
|
466
|
-
def _to_markdown_walk(
|
|
610
|
+
def _to_markdown_walk(
|
|
611
|
+
node: Any,
|
|
612
|
+
builder: _MarkdownBuilder,
|
|
613
|
+
preserve_whitespace: bool,
|
|
614
|
+
list_depth: int,
|
|
615
|
+
in_link: bool = False,
|
|
616
|
+
) -> None:
|
|
467
617
|
name: str = node.name
|
|
468
618
|
|
|
469
619
|
if name == "#text":
|
|
@@ -474,7 +624,10 @@ def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace:
|
|
|
474
624
|
return
|
|
475
625
|
|
|
476
626
|
if name == "br":
|
|
477
|
-
|
|
627
|
+
if in_link:
|
|
628
|
+
builder.text(" ", preserve_whitespace=False)
|
|
629
|
+
else:
|
|
630
|
+
builder.newline(1)
|
|
478
631
|
return
|
|
479
632
|
|
|
480
633
|
# Comments/doctype don't contribute.
|
|
@@ -485,52 +638,80 @@ def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace:
|
|
|
485
638
|
if name.startswith("#"):
|
|
486
639
|
if node.children:
|
|
487
640
|
for child in node.children:
|
|
488
|
-
_to_markdown_walk(
|
|
641
|
+
_to_markdown_walk(
|
|
642
|
+
child,
|
|
643
|
+
builder,
|
|
644
|
+
preserve_whitespace,
|
|
645
|
+
list_depth,
|
|
646
|
+
in_link=in_link,
|
|
647
|
+
)
|
|
489
648
|
return
|
|
490
649
|
|
|
491
650
|
tag = name.lower()
|
|
492
651
|
|
|
652
|
+
# Metadata containers don't contribute to body text.
|
|
653
|
+
if tag == "head" or tag == "title":
|
|
654
|
+
return
|
|
655
|
+
|
|
493
656
|
# Preserve <img> and <table> as HTML.
|
|
494
657
|
if tag == "img":
|
|
495
658
|
builder.raw(node.to_html(indent=0, indent_size=2, pretty=False))
|
|
496
659
|
return
|
|
497
660
|
|
|
498
661
|
if tag == "table":
|
|
499
|
-
|
|
662
|
+
if not in_link:
|
|
663
|
+
builder.ensure_newlines(2 if builder._buf else 0)
|
|
500
664
|
builder.raw(node.to_html(indent=0, indent_size=2, pretty=False))
|
|
501
|
-
|
|
665
|
+
if not in_link:
|
|
666
|
+
builder.ensure_newlines(2)
|
|
502
667
|
return
|
|
503
668
|
|
|
504
669
|
# Headings.
|
|
505
670
|
if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}:
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
671
|
+
if not in_link:
|
|
672
|
+
builder.ensure_newlines(2 if builder._buf else 0)
|
|
673
|
+
level = int(tag[1])
|
|
674
|
+
builder.raw("#" * level)
|
|
675
|
+
builder.raw(" ")
|
|
676
|
+
|
|
510
677
|
if node.children:
|
|
511
678
|
for child in node.children:
|
|
512
|
-
_to_markdown_walk(
|
|
513
|
-
|
|
679
|
+
_to_markdown_walk(
|
|
680
|
+
child,
|
|
681
|
+
builder,
|
|
682
|
+
preserve_whitespace=False,
|
|
683
|
+
list_depth=list_depth,
|
|
684
|
+
in_link=in_link,
|
|
685
|
+
)
|
|
686
|
+
|
|
687
|
+
if not in_link:
|
|
688
|
+
builder.ensure_newlines(2)
|
|
514
689
|
return
|
|
515
690
|
|
|
516
691
|
# Horizontal rule.
|
|
517
692
|
if tag == "hr":
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
693
|
+
if not in_link:
|
|
694
|
+
builder.ensure_newlines(2 if builder._buf else 0)
|
|
695
|
+
builder.raw("---")
|
|
696
|
+
builder.ensure_newlines(2)
|
|
521
697
|
return
|
|
522
698
|
|
|
523
699
|
# Code blocks.
|
|
524
700
|
if tag == "pre":
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
if code:
|
|
530
|
-
builder.raw(code.rstrip("\n"))
|
|
701
|
+
if not in_link:
|
|
702
|
+
builder.ensure_newlines(2 if builder._buf else 0)
|
|
703
|
+
code = node.to_text(separator="", strip=False)
|
|
704
|
+
builder.raw("```")
|
|
531
705
|
builder.newline(1)
|
|
532
|
-
|
|
533
|
-
|
|
706
|
+
if code:
|
|
707
|
+
builder.raw(code.rstrip("\n"))
|
|
708
|
+
builder.newline(1)
|
|
709
|
+
builder.raw("```")
|
|
710
|
+
builder.ensure_newlines(2)
|
|
711
|
+
else:
|
|
712
|
+
# Inside link, render as inline code or text
|
|
713
|
+
code = node.to_text(separator="", strip=False)
|
|
714
|
+
builder.raw(_markdown_code_span(code))
|
|
534
715
|
return
|
|
535
716
|
|
|
536
717
|
# Inline code.
|
|
@@ -541,64 +722,126 @@ def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace:
|
|
|
541
722
|
|
|
542
723
|
# Paragraph-like blocks.
|
|
543
724
|
if tag == "p":
|
|
544
|
-
|
|
725
|
+
if not in_link:
|
|
726
|
+
builder.ensure_newlines(2 if builder._buf else 0)
|
|
727
|
+
|
|
545
728
|
if node.children:
|
|
546
729
|
for child in node.children:
|
|
547
|
-
_to_markdown_walk(
|
|
548
|
-
|
|
730
|
+
_to_markdown_walk(
|
|
731
|
+
child,
|
|
732
|
+
builder,
|
|
733
|
+
preserve_whitespace=False,
|
|
734
|
+
list_depth=list_depth,
|
|
735
|
+
in_link=in_link,
|
|
736
|
+
)
|
|
737
|
+
|
|
738
|
+
if not in_link:
|
|
739
|
+
builder.ensure_newlines(2)
|
|
740
|
+
else:
|
|
741
|
+
builder.text(" ", preserve_whitespace=False)
|
|
549
742
|
return
|
|
550
743
|
|
|
551
744
|
# Blockquotes.
|
|
552
745
|
if tag == "blockquote":
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
746
|
+
if not in_link:
|
|
747
|
+
builder.ensure_newlines(2 if builder._buf else 0)
|
|
748
|
+
inner = _MarkdownBuilder()
|
|
749
|
+
if node.children:
|
|
750
|
+
for child in node.children:
|
|
751
|
+
_to_markdown_walk(
|
|
752
|
+
child,
|
|
753
|
+
inner,
|
|
754
|
+
preserve_whitespace=False,
|
|
755
|
+
list_depth=list_depth,
|
|
756
|
+
in_link=in_link,
|
|
757
|
+
)
|
|
758
|
+
text = inner.finish()
|
|
759
|
+
if text:
|
|
760
|
+
lines = text.split("\n")
|
|
761
|
+
for i, line in enumerate(lines):
|
|
762
|
+
if i:
|
|
763
|
+
builder.newline(1)
|
|
764
|
+
builder.raw("> ")
|
|
765
|
+
builder.raw(line)
|
|
766
|
+
builder.ensure_newlines(2)
|
|
767
|
+
else:
|
|
768
|
+
if node.children:
|
|
769
|
+
for child in node.children:
|
|
770
|
+
_to_markdown_walk(
|
|
771
|
+
child,
|
|
772
|
+
builder,
|
|
773
|
+
preserve_whitespace=False,
|
|
774
|
+
list_depth=list_depth,
|
|
775
|
+
in_link=in_link,
|
|
776
|
+
)
|
|
567
777
|
return
|
|
568
778
|
|
|
569
779
|
# Lists.
|
|
570
780
|
if tag in {"ul", "ol"}:
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
781
|
+
if not in_link:
|
|
782
|
+
builder.ensure_newlines(2 if builder._buf else 0)
|
|
783
|
+
ordered = tag == "ol"
|
|
784
|
+
idx = 1
|
|
785
|
+
for child in node.children or []:
|
|
786
|
+
if child.name.lower() != "li":
|
|
787
|
+
continue
|
|
788
|
+
if idx > 1:
|
|
789
|
+
builder.newline(1)
|
|
790
|
+
indent = " " * list_depth
|
|
791
|
+
marker = f"{idx}. " if ordered else "- "
|
|
792
|
+
builder.raw(indent)
|
|
793
|
+
builder.raw(marker)
|
|
794
|
+
# Render list item content inline-ish.
|
|
795
|
+
for li_child in child.children or []:
|
|
796
|
+
_to_markdown_walk(
|
|
797
|
+
li_child,
|
|
798
|
+
builder,
|
|
799
|
+
preserve_whitespace=False,
|
|
800
|
+
list_depth=list_depth + 1,
|
|
801
|
+
in_link=in_link,
|
|
802
|
+
)
|
|
803
|
+
idx += 1
|
|
804
|
+
builder.ensure_newlines(2)
|
|
805
|
+
else:
|
|
806
|
+
# Flatten list inside link
|
|
807
|
+
for child in node.children or []:
|
|
808
|
+
if child.name.lower() != "li":
|
|
809
|
+
continue
|
|
810
|
+
builder.raw(" ")
|
|
811
|
+
for li_child in child.children or []:
|
|
812
|
+
_to_markdown_walk(
|
|
813
|
+
li_child,
|
|
814
|
+
builder,
|
|
815
|
+
preserve_whitespace=False,
|
|
816
|
+
list_depth=list_depth + 1,
|
|
817
|
+
in_link=in_link,
|
|
818
|
+
)
|
|
588
819
|
return
|
|
589
820
|
|
|
590
821
|
# Emphasis/strong.
|
|
591
822
|
if tag in {"em", "i"}:
|
|
592
823
|
builder.raw("*")
|
|
593
824
|
for child in node.children or []:
|
|
594
|
-
_to_markdown_walk(
|
|
825
|
+
_to_markdown_walk(
|
|
826
|
+
child,
|
|
827
|
+
builder,
|
|
828
|
+
preserve_whitespace=False,
|
|
829
|
+
list_depth=list_depth,
|
|
830
|
+
in_link=in_link,
|
|
831
|
+
)
|
|
595
832
|
builder.raw("*")
|
|
596
833
|
return
|
|
597
834
|
|
|
598
835
|
if tag in {"strong", "b"}:
|
|
599
836
|
builder.raw("**")
|
|
600
837
|
for child in node.children or []:
|
|
601
|
-
_to_markdown_walk(
|
|
838
|
+
_to_markdown_walk(
|
|
839
|
+
child,
|
|
840
|
+
builder,
|
|
841
|
+
preserve_whitespace=False,
|
|
842
|
+
list_depth=list_depth,
|
|
843
|
+
in_link=in_link,
|
|
844
|
+
)
|
|
602
845
|
builder.raw("**")
|
|
603
846
|
return
|
|
604
847
|
|
|
@@ -608,13 +851,24 @@ def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace:
|
|
|
608
851
|
if node.attrs and "href" in node.attrs and node.attrs["href"] is not None:
|
|
609
852
|
href = str(node.attrs["href"])
|
|
610
853
|
|
|
611
|
-
|
|
854
|
+
# Capture inner text to strip whitespace.
|
|
855
|
+
inner_builder = _MarkdownBuilder()
|
|
612
856
|
for child in node.children or []:
|
|
613
|
-
_to_markdown_walk(
|
|
857
|
+
_to_markdown_walk(
|
|
858
|
+
child,
|
|
859
|
+
inner_builder,
|
|
860
|
+
preserve_whitespace=False,
|
|
861
|
+
list_depth=list_depth,
|
|
862
|
+
in_link=True,
|
|
863
|
+
)
|
|
864
|
+
link_text = inner_builder.finish()
|
|
865
|
+
|
|
866
|
+
builder.raw("[")
|
|
867
|
+
builder.raw(link_text)
|
|
614
868
|
builder.raw("]")
|
|
615
869
|
if href:
|
|
616
870
|
builder.raw("(")
|
|
617
|
-
builder.raw(href)
|
|
871
|
+
builder.raw(_markdown_link_destination(href))
|
|
618
872
|
builder.raw(")")
|
|
619
873
|
return
|
|
620
874
|
|
|
@@ -622,11 +876,26 @@ def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace:
|
|
|
622
876
|
next_preserve = preserve_whitespace or (tag in {"textarea", "script", "style"})
|
|
623
877
|
if node.children:
|
|
624
878
|
for child in node.children:
|
|
625
|
-
_to_markdown_walk(
|
|
879
|
+
_to_markdown_walk(
|
|
880
|
+
child,
|
|
881
|
+
builder,
|
|
882
|
+
next_preserve,
|
|
883
|
+
list_depth,
|
|
884
|
+
in_link=in_link,
|
|
885
|
+
)
|
|
626
886
|
|
|
627
887
|
if isinstance(node, ElementNode) and node.template_content:
|
|
628
|
-
_to_markdown_walk(
|
|
888
|
+
_to_markdown_walk(
|
|
889
|
+
node.template_content,
|
|
890
|
+
builder,
|
|
891
|
+
next_preserve,
|
|
892
|
+
list_depth,
|
|
893
|
+
in_link=in_link,
|
|
894
|
+
)
|
|
629
895
|
|
|
630
896
|
# Add spacing after block containers to keep output readable.
|
|
631
897
|
if tag in _MARKDOWN_BLOCK_ELEMENTS:
|
|
632
|
-
|
|
898
|
+
if not in_link:
|
|
899
|
+
builder.ensure_newlines(2)
|
|
900
|
+
else:
|
|
901
|
+
builder.text(" ", preserve_whitespace=False)
|