justhtml 0.12.0__py3-none-any.whl → 0.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +48 -0
- justhtml/__main__.py +86 -17
- justhtml/constants.py +12 -0
- justhtml/entities.py +45 -7
- justhtml/errors.py +17 -3
- justhtml/linkify.py +438 -0
- justhtml/node.py +385 -97
- justhtml/parser.py +139 -16
- justhtml/sanitize.py +992 -0
- justhtml/selector.py +117 -19
- justhtml/serialize.py +671 -41
- justhtml/tokenizer.py +364 -194
- justhtml/tokens.py +28 -5
- justhtml/transforms.py +2568 -0
- justhtml/treebuilder.py +297 -204
- justhtml/treebuilder_modes.py +208 -138
- justhtml-0.38.0.dist-info/METADATA +213 -0
- justhtml-0.38.0.dist-info/RECORD +26 -0
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.12.0.dist-info/METADATA +0 -164
- justhtml-0.12.0.dist-info/RECORD +0 -23
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/WHEEL +0 -0
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/entry_points.txt +0 -0
justhtml/node.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from typing import TYPE_CHECKING, Any
|
|
4
|
+
from urllib.parse import quote
|
|
4
5
|
|
|
5
6
|
from .selector import query
|
|
6
7
|
from .serialize import to_html
|
|
@@ -43,6 +44,30 @@ def _markdown_code_span(s: str | None) -> str:
|
|
|
43
44
|
return f"{fence}{s}{fence}"
|
|
44
45
|
|
|
45
46
|
|
|
47
|
+
def _markdown_link_destination(url: str) -> str:
|
|
48
|
+
"""Return a Markdown-safe link destination.
|
|
49
|
+
|
|
50
|
+
We primarily care about avoiding Markdown formatting injection and broken
|
|
51
|
+
parsing for URLs that contain whitespace or parentheses.
|
|
52
|
+
|
|
53
|
+
CommonMark supports destinations wrapped in angle brackets:
|
|
54
|
+
`[text](<https://example.com/a(b)c>)`
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
u = (url or "").strip()
|
|
58
|
+
if not u:
|
|
59
|
+
return ""
|
|
60
|
+
|
|
61
|
+
# If the destination contains characters that can terminate or confuse
|
|
62
|
+
# the Markdown destination parser, wrap in <...> and percent-encode
|
|
63
|
+
# whitespace and angle brackets.
|
|
64
|
+
if any(ch in u for ch in (" ", "\t", "\n", "\r", "(", ")", "<", ">")):
|
|
65
|
+
u = quote(u, safe=":/?#[]@!$&'*+,;=%-._~()")
|
|
66
|
+
return f"<{u}>"
|
|
67
|
+
|
|
68
|
+
return u
|
|
69
|
+
|
|
70
|
+
|
|
46
71
|
class _MarkdownBuilder:
|
|
47
72
|
__slots__ = ("_buf", "_newline_count", "_pending_space")
|
|
48
73
|
|
|
@@ -133,29 +158,46 @@ NodeType = "SimpleDomNode | ElementNode | TemplateNode | TextNode"
|
|
|
133
158
|
|
|
134
159
|
|
|
135
160
|
def _to_text_collect(node: Any, parts: list[str], strip: bool) -> None:
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
if
|
|
143
|
-
data = data
|
|
161
|
+
# Iterative traversal avoids recursion overhead on large documents.
|
|
162
|
+
stack: list[Any] = [node]
|
|
163
|
+
while stack:
|
|
164
|
+
current = stack.pop()
|
|
165
|
+
name: str = current.name
|
|
166
|
+
|
|
167
|
+
if name == "#text":
|
|
168
|
+
data: str | None = current.data
|
|
144
169
|
if not data:
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
170
|
+
continue
|
|
171
|
+
if strip:
|
|
172
|
+
data = data.strip()
|
|
173
|
+
if not data:
|
|
174
|
+
continue
|
|
175
|
+
parts.append(data)
|
|
176
|
+
continue
|
|
148
177
|
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
178
|
+
# Preserve the same traversal order as the recursive implementation:
|
|
179
|
+
# children first, then template content.
|
|
180
|
+
if type(current) is TemplateNode and current.template_content:
|
|
181
|
+
stack.append(current.template_content)
|
|
152
182
|
|
|
153
|
-
|
|
154
|
-
|
|
183
|
+
children = current.children
|
|
184
|
+
if children:
|
|
185
|
+
stack.extend(reversed(children))
|
|
155
186
|
|
|
156
187
|
|
|
157
188
|
class SimpleDomNode:
|
|
158
|
-
__slots__ = (
|
|
189
|
+
__slots__ = (
|
|
190
|
+
"_origin_col",
|
|
191
|
+
"_origin_line",
|
|
192
|
+
"_origin_pos",
|
|
193
|
+
"_source_html",
|
|
194
|
+
"attrs",
|
|
195
|
+
"children",
|
|
196
|
+
"data",
|
|
197
|
+
"name",
|
|
198
|
+
"namespace",
|
|
199
|
+
"parent",
|
|
200
|
+
)
|
|
159
201
|
|
|
160
202
|
name: str
|
|
161
203
|
parent: SimpleDomNode | ElementNode | TemplateNode | None
|
|
@@ -163,6 +205,10 @@ class SimpleDomNode:
|
|
|
163
205
|
children: list[Any] | None
|
|
164
206
|
data: str | Doctype | None
|
|
165
207
|
namespace: str | None
|
|
208
|
+
_origin_pos: int | None
|
|
209
|
+
_origin_line: int | None
|
|
210
|
+
_origin_col: int | None
|
|
211
|
+
_source_html: str | None
|
|
166
212
|
|
|
167
213
|
def __init__(
|
|
168
214
|
self,
|
|
@@ -174,6 +220,10 @@ class SimpleDomNode:
|
|
|
174
220
|
self.name = name
|
|
175
221
|
self.parent = None
|
|
176
222
|
self.data = data
|
|
223
|
+
self._source_html = None
|
|
224
|
+
self._origin_pos = None
|
|
225
|
+
self._origin_line = None
|
|
226
|
+
self._origin_col = None
|
|
177
227
|
|
|
178
228
|
if name.startswith("#") or name == "!doctype":
|
|
179
229
|
self.namespace = namespace
|
|
@@ -193,12 +243,36 @@ class SimpleDomNode:
|
|
|
193
243
|
self.children.append(node)
|
|
194
244
|
node.parent = self
|
|
195
245
|
|
|
246
|
+
@property
|
|
247
|
+
def origin_offset(self) -> int | None:
|
|
248
|
+
"""Best-effort origin offset (0-indexed) in the source HTML, if known."""
|
|
249
|
+
return self._origin_pos
|
|
250
|
+
|
|
251
|
+
@property
|
|
252
|
+
def origin_line(self) -> int | None:
|
|
253
|
+
return self._origin_line
|
|
254
|
+
|
|
255
|
+
@property
|
|
256
|
+
def origin_col(self) -> int | None:
|
|
257
|
+
return self._origin_col
|
|
258
|
+
|
|
259
|
+
@property
|
|
260
|
+
def origin_location(self) -> tuple[int, int] | None:
|
|
261
|
+
if self._origin_line is None or self._origin_col is None:
|
|
262
|
+
return None
|
|
263
|
+
return (self._origin_line, self._origin_col)
|
|
264
|
+
|
|
196
265
|
def remove_child(self, node: Any) -> None:
|
|
197
266
|
if self.children is not None:
|
|
198
267
|
self.children.remove(node)
|
|
199
268
|
node.parent = None
|
|
200
269
|
|
|
201
|
-
def to_html(
|
|
270
|
+
def to_html(
|
|
271
|
+
self,
|
|
272
|
+
indent: int = 0,
|
|
273
|
+
indent_size: int = 2,
|
|
274
|
+
pretty: bool = True,
|
|
275
|
+
) -> str:
|
|
202
276
|
"""Convert node to HTML string."""
|
|
203
277
|
return to_html(self, indent, indent_size, pretty=pretty)
|
|
204
278
|
|
|
@@ -232,16 +306,20 @@ class SimpleDomNode:
|
|
|
232
306
|
return ""
|
|
233
307
|
return ""
|
|
234
308
|
|
|
235
|
-
def to_text(
|
|
309
|
+
def to_text(
    self,
    separator: str = " ",
    strip: bool = True,
) -> str:
    """Return the joined text found beneath this node.

    - `separator` is placed between consecutive text segments (default: a single space).
    - `strip=True` strips every segment and discards the ones left empty.
    Text inside a template element's `template_content` is included as well.
    """
    collected: list[str] = []
    root: Any = self
    _to_text_collect(root, collected, strip=strip)
    return separator.join(collected) if collected else ""
|
|
@@ -313,22 +391,28 @@ class SimpleDomNode:
|
|
|
313
391
|
"""Return True if this node has children."""
|
|
314
392
|
return bool(self.children)
|
|
315
393
|
|
|
316
|
-
def clone_node(self, deep: bool = False) -> SimpleDomNode:
|
|
394
|
+
def clone_node(self, deep: bool = False, override_attrs: dict[str, str | None] | None = None) -> SimpleDomNode:
|
|
317
395
|
"""
|
|
318
396
|
Clone this node.
|
|
319
397
|
|
|
320
398
|
Args:
|
|
321
399
|
deep: If True, recursively clone children.
|
|
400
|
+
override_attrs: Optional dictionary to use as attributes for the clone.
|
|
322
401
|
|
|
323
402
|
Returns:
|
|
324
403
|
A new node that is a copy of this node.
|
|
325
404
|
"""
|
|
405
|
+
attrs = override_attrs if override_attrs is not None else (self.attrs.copy() if self.attrs else None)
|
|
326
406
|
clone = SimpleDomNode(
|
|
327
407
|
self.name,
|
|
328
|
-
|
|
408
|
+
attrs,
|
|
329
409
|
self.data,
|
|
330
410
|
self.namespace,
|
|
331
411
|
)
|
|
412
|
+
clone._source_html = self._source_html
|
|
413
|
+
clone._origin_pos = self._origin_pos
|
|
414
|
+
clone._origin_line = self._origin_line
|
|
415
|
+
clone._origin_col = self._origin_col
|
|
332
416
|
if deep and self.children:
|
|
333
417
|
for child in self.children:
|
|
334
418
|
clone.append_child(child.clone_node(deep=True))
|
|
@@ -336,11 +420,25 @@ class SimpleDomNode:
|
|
|
336
420
|
|
|
337
421
|
|
|
338
422
|
class ElementNode(SimpleDomNode):
|
|
339
|
-
__slots__ = (
|
|
423
|
+
__slots__ = (
|
|
424
|
+
"_end_tag_end",
|
|
425
|
+
"_end_tag_present",
|
|
426
|
+
"_end_tag_start",
|
|
427
|
+
"_self_closing",
|
|
428
|
+
"_start_tag_end",
|
|
429
|
+
"_start_tag_start",
|
|
430
|
+
"template_content",
|
|
431
|
+
)
|
|
340
432
|
|
|
341
433
|
template_content: SimpleDomNode | None
|
|
342
434
|
children: list[Any]
|
|
343
435
|
attrs: dict[str, str | None]
|
|
436
|
+
_start_tag_start: int | None
|
|
437
|
+
_start_tag_end: int | None
|
|
438
|
+
_end_tag_start: int | None
|
|
439
|
+
_end_tag_end: int | None
|
|
440
|
+
_end_tag_present: bool
|
|
441
|
+
_self_closing: bool
|
|
344
442
|
|
|
345
443
|
def __init__(self, name: str, attrs: dict[str, str | None] | None, namespace: str | None) -> None:
|
|
346
444
|
self.name = name
|
|
@@ -350,9 +448,30 @@ class ElementNode(SimpleDomNode):
|
|
|
350
448
|
self.children = []
|
|
351
449
|
self.attrs = attrs if attrs is not None else {}
|
|
352
450
|
self.template_content = None
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
451
|
+
self._source_html = None
|
|
452
|
+
self._origin_pos = None
|
|
453
|
+
self._origin_line = None
|
|
454
|
+
self._origin_col = None
|
|
455
|
+
self._start_tag_start = None
|
|
456
|
+
self._start_tag_end = None
|
|
457
|
+
self._end_tag_start = None
|
|
458
|
+
self._end_tag_end = None
|
|
459
|
+
self._end_tag_present = False
|
|
460
|
+
self._self_closing = False
|
|
461
|
+
|
|
462
|
+
def clone_node(self, deep: bool = False, override_attrs: dict[str, str | None] | None = None) -> ElementNode:
|
|
463
|
+
attrs = override_attrs if override_attrs is not None else (self.attrs.copy() if self.attrs else {})
|
|
464
|
+
clone = ElementNode(self.name, attrs, self.namespace)
|
|
465
|
+
clone._source_html = self._source_html
|
|
466
|
+
clone._origin_pos = self._origin_pos
|
|
467
|
+
clone._origin_line = self._origin_line
|
|
468
|
+
clone._origin_col = self._origin_col
|
|
469
|
+
clone._start_tag_start = self._start_tag_start
|
|
470
|
+
clone._start_tag_end = self._start_tag_end
|
|
471
|
+
clone._end_tag_start = self._end_tag_start
|
|
472
|
+
clone._end_tag_end = self._end_tag_end
|
|
473
|
+
clone._end_tag_present = self._end_tag_present
|
|
474
|
+
clone._self_closing = self._self_closing
|
|
356
475
|
if deep:
|
|
357
476
|
for child in self.children:
|
|
358
477
|
clone.append_child(child.clone_node(deep=True))
|
|
@@ -375,13 +494,24 @@ class TemplateNode(ElementNode):
|
|
|
375
494
|
else:
|
|
376
495
|
self.template_content = None
|
|
377
496
|
|
|
378
|
-
def clone_node(self, deep: bool = False) -> TemplateNode:
|
|
497
|
+
def clone_node(self, deep: bool = False, override_attrs: dict[str, str | None] | None = None) -> TemplateNode:
|
|
498
|
+
attrs = override_attrs if override_attrs is not None else (self.attrs.copy() if self.attrs else {})
|
|
379
499
|
clone = TemplateNode(
|
|
380
500
|
self.name,
|
|
381
|
-
|
|
501
|
+
attrs,
|
|
382
502
|
None,
|
|
383
503
|
self.namespace,
|
|
384
504
|
)
|
|
505
|
+
clone._source_html = self._source_html
|
|
506
|
+
clone._origin_pos = self._origin_pos
|
|
507
|
+
clone._origin_line = self._origin_line
|
|
508
|
+
clone._origin_col = self._origin_col
|
|
509
|
+
clone._start_tag_start = self._start_tag_start
|
|
510
|
+
clone._start_tag_end = self._start_tag_end
|
|
511
|
+
clone._end_tag_start = self._end_tag_start
|
|
512
|
+
clone._end_tag_end = self._end_tag_end
|
|
513
|
+
clone._end_tag_present = self._end_tag_present
|
|
514
|
+
clone._self_closing = self._self_closing
|
|
385
515
|
if deep:
|
|
386
516
|
if self.template_content:
|
|
387
517
|
clone.template_content = self.template_content.clone_node(deep=True)
|
|
@@ -391,26 +521,55 @@ class TemplateNode(ElementNode):
|
|
|
391
521
|
|
|
392
522
|
|
|
393
523
|
class TextNode:
|
|
394
|
-
__slots__ = ("data", "name", "namespace", "parent")
|
|
524
|
+
__slots__ = ("_origin_col", "_origin_line", "_origin_pos", "data", "name", "namespace", "parent")
|
|
395
525
|
|
|
396
526
|
data: str | None
|
|
397
527
|
name: str
|
|
398
528
|
namespace: None
|
|
399
529
|
parent: SimpleDomNode | ElementNode | TemplateNode | None
|
|
530
|
+
_origin_pos: int | None
|
|
531
|
+
_origin_line: int | None
|
|
532
|
+
_origin_col: int | None
|
|
400
533
|
|
|
401
534
|
def __init__(self, data: str | None) -> None:
|
|
402
535
|
self.data = data
|
|
403
536
|
self.parent = None
|
|
404
537
|
self.name = "#text"
|
|
405
538
|
self.namespace = None
|
|
539
|
+
self._origin_pos = None
|
|
540
|
+
self._origin_line = None
|
|
541
|
+
self._origin_col = None
|
|
542
|
+
|
|
543
|
+
@property
|
|
544
|
+
def origin_offset(self) -> int | None:
|
|
545
|
+
"""Best-effort origin offset (0-indexed) in the source HTML, if known."""
|
|
546
|
+
return self._origin_pos
|
|
547
|
+
|
|
548
|
+
@property
|
|
549
|
+
def origin_line(self) -> int | None:
|
|
550
|
+
return self._origin_line
|
|
551
|
+
|
|
552
|
+
@property
|
|
553
|
+
def origin_col(self) -> int | None:
|
|
554
|
+
return self._origin_col
|
|
555
|
+
|
|
556
|
+
@property
|
|
557
|
+
def origin_location(self) -> tuple[int, int] | None:
|
|
558
|
+
if self._origin_line is None or self._origin_col is None:
|
|
559
|
+
return None
|
|
560
|
+
return (self._origin_line, self._origin_col)
|
|
406
561
|
|
|
407
562
|
@property
|
|
408
563
|
def text(self) -> str:
|
|
409
564
|
"""Return the text content of this node."""
|
|
410
565
|
return self.data or ""
|
|
411
566
|
|
|
412
|
-
def to_text(
|
|
413
|
-
|
|
567
|
+
def to_text(
|
|
568
|
+
self,
|
|
569
|
+
separator: str = " ",
|
|
570
|
+
strip: bool = True,
|
|
571
|
+
) -> str:
|
|
572
|
+
_ = separator
|
|
414
573
|
if self.data is None:
|
|
415
574
|
return ""
|
|
416
575
|
if strip:
|
|
@@ -432,7 +591,11 @@ class TextNode:
|
|
|
432
591
|
return False
|
|
433
592
|
|
|
434
593
|
def clone_node(self, deep: bool = False) -> TextNode:
|
|
435
|
-
|
|
594
|
+
clone = TextNode(self.data)
|
|
595
|
+
clone._origin_pos = self._origin_pos
|
|
596
|
+
clone._origin_line = self._origin_line
|
|
597
|
+
clone._origin_col = self._origin_col
|
|
598
|
+
return clone
|
|
436
599
|
|
|
437
600
|
|
|
438
601
|
_MARKDOWN_BLOCK_ELEMENTS: frozenset[str] = frozenset(
|
|
@@ -463,7 +626,13 @@ _MARKDOWN_BLOCK_ELEMENTS: frozenset[str] = frozenset(
|
|
|
463
626
|
)
|
|
464
627
|
|
|
465
628
|
|
|
466
|
-
def _to_markdown_walk(
|
|
629
|
+
def _to_markdown_walk(
|
|
630
|
+
node: Any,
|
|
631
|
+
builder: _MarkdownBuilder,
|
|
632
|
+
preserve_whitespace: bool,
|
|
633
|
+
list_depth: int,
|
|
634
|
+
in_link: bool = False,
|
|
635
|
+
) -> None:
|
|
467
636
|
name: str = node.name
|
|
468
637
|
|
|
469
638
|
if name == "#text":
|
|
@@ -474,7 +643,10 @@ def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace:
|
|
|
474
643
|
return
|
|
475
644
|
|
|
476
645
|
if name == "br":
|
|
477
|
-
|
|
646
|
+
if in_link:
|
|
647
|
+
builder.text(" ", preserve_whitespace=False)
|
|
648
|
+
else:
|
|
649
|
+
builder.newline(1)
|
|
478
650
|
return
|
|
479
651
|
|
|
480
652
|
# Comments/doctype don't contribute.
|
|
@@ -485,52 +657,80 @@ def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace:
|
|
|
485
657
|
if name.startswith("#"):
|
|
486
658
|
if node.children:
|
|
487
659
|
for child in node.children:
|
|
488
|
-
_to_markdown_walk(
|
|
660
|
+
_to_markdown_walk(
|
|
661
|
+
child,
|
|
662
|
+
builder,
|
|
663
|
+
preserve_whitespace,
|
|
664
|
+
list_depth,
|
|
665
|
+
in_link=in_link,
|
|
666
|
+
)
|
|
489
667
|
return
|
|
490
668
|
|
|
491
669
|
tag = name.lower()
|
|
492
670
|
|
|
671
|
+
# Metadata containers don't contribute to body text.
|
|
672
|
+
if tag == "head" or tag == "title":
|
|
673
|
+
return
|
|
674
|
+
|
|
493
675
|
# Preserve <img> and <table> as HTML.
|
|
494
676
|
if tag == "img":
|
|
495
677
|
builder.raw(node.to_html(indent=0, indent_size=2, pretty=False))
|
|
496
678
|
return
|
|
497
679
|
|
|
498
680
|
if tag == "table":
|
|
499
|
-
|
|
681
|
+
if not in_link:
|
|
682
|
+
builder.ensure_newlines(2 if builder._buf else 0)
|
|
500
683
|
builder.raw(node.to_html(indent=0, indent_size=2, pretty=False))
|
|
501
|
-
|
|
684
|
+
if not in_link:
|
|
685
|
+
builder.ensure_newlines(2)
|
|
502
686
|
return
|
|
503
687
|
|
|
504
688
|
# Headings.
|
|
505
689
|
if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}:
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
690
|
+
if not in_link:
|
|
691
|
+
builder.ensure_newlines(2 if builder._buf else 0)
|
|
692
|
+
level = int(tag[1])
|
|
693
|
+
builder.raw("#" * level)
|
|
694
|
+
builder.raw(" ")
|
|
695
|
+
|
|
510
696
|
if node.children:
|
|
511
697
|
for child in node.children:
|
|
512
|
-
_to_markdown_walk(
|
|
513
|
-
|
|
698
|
+
_to_markdown_walk(
|
|
699
|
+
child,
|
|
700
|
+
builder,
|
|
701
|
+
preserve_whitespace=False,
|
|
702
|
+
list_depth=list_depth,
|
|
703
|
+
in_link=in_link,
|
|
704
|
+
)
|
|
705
|
+
|
|
706
|
+
if not in_link:
|
|
707
|
+
builder.ensure_newlines(2)
|
|
514
708
|
return
|
|
515
709
|
|
|
516
710
|
# Horizontal rule.
|
|
517
711
|
if tag == "hr":
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
712
|
+
if not in_link:
|
|
713
|
+
builder.ensure_newlines(2 if builder._buf else 0)
|
|
714
|
+
builder.raw("---")
|
|
715
|
+
builder.ensure_newlines(2)
|
|
521
716
|
return
|
|
522
717
|
|
|
523
718
|
# Code blocks.
|
|
524
719
|
if tag == "pre":
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
if code:
|
|
530
|
-
builder.raw(code.rstrip("\n"))
|
|
720
|
+
if not in_link:
|
|
721
|
+
builder.ensure_newlines(2 if builder._buf else 0)
|
|
722
|
+
code = node.to_text(separator="", strip=False)
|
|
723
|
+
builder.raw("```")
|
|
531
724
|
builder.newline(1)
|
|
532
|
-
|
|
533
|
-
|
|
725
|
+
if code:
|
|
726
|
+
builder.raw(code.rstrip("\n"))
|
|
727
|
+
builder.newline(1)
|
|
728
|
+
builder.raw("```")
|
|
729
|
+
builder.ensure_newlines(2)
|
|
730
|
+
else:
|
|
731
|
+
# Inside link, render as inline code or text
|
|
732
|
+
code = node.to_text(separator="", strip=False)
|
|
733
|
+
builder.raw(_markdown_code_span(code))
|
|
534
734
|
return
|
|
535
735
|
|
|
536
736
|
# Inline code.
|
|
@@ -541,64 +741,126 @@ def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace:
|
|
|
541
741
|
|
|
542
742
|
# Paragraph-like blocks.
|
|
543
743
|
if tag == "p":
|
|
544
|
-
|
|
744
|
+
if not in_link:
|
|
745
|
+
builder.ensure_newlines(2 if builder._buf else 0)
|
|
746
|
+
|
|
545
747
|
if node.children:
|
|
546
748
|
for child in node.children:
|
|
547
|
-
_to_markdown_walk(
|
|
548
|
-
|
|
749
|
+
_to_markdown_walk(
|
|
750
|
+
child,
|
|
751
|
+
builder,
|
|
752
|
+
preserve_whitespace=False,
|
|
753
|
+
list_depth=list_depth,
|
|
754
|
+
in_link=in_link,
|
|
755
|
+
)
|
|
756
|
+
|
|
757
|
+
if not in_link:
|
|
758
|
+
builder.ensure_newlines(2)
|
|
759
|
+
else:
|
|
760
|
+
builder.text(" ", preserve_whitespace=False)
|
|
549
761
|
return
|
|
550
762
|
|
|
551
763
|
# Blockquotes.
|
|
552
764
|
if tag == "blockquote":
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
765
|
+
if not in_link:
|
|
766
|
+
builder.ensure_newlines(2 if builder._buf else 0)
|
|
767
|
+
inner = _MarkdownBuilder()
|
|
768
|
+
if node.children:
|
|
769
|
+
for child in node.children:
|
|
770
|
+
_to_markdown_walk(
|
|
771
|
+
child,
|
|
772
|
+
inner,
|
|
773
|
+
preserve_whitespace=False,
|
|
774
|
+
list_depth=list_depth,
|
|
775
|
+
in_link=in_link,
|
|
776
|
+
)
|
|
777
|
+
text = inner.finish()
|
|
778
|
+
if text:
|
|
779
|
+
lines = text.split("\n")
|
|
780
|
+
for i, line in enumerate(lines):
|
|
781
|
+
if i:
|
|
782
|
+
builder.newline(1)
|
|
783
|
+
builder.raw("> ")
|
|
784
|
+
builder.raw(line)
|
|
785
|
+
builder.ensure_newlines(2)
|
|
786
|
+
else:
|
|
787
|
+
if node.children:
|
|
788
|
+
for child in node.children:
|
|
789
|
+
_to_markdown_walk(
|
|
790
|
+
child,
|
|
791
|
+
builder,
|
|
792
|
+
preserve_whitespace=False,
|
|
793
|
+
list_depth=list_depth,
|
|
794
|
+
in_link=in_link,
|
|
795
|
+
)
|
|
567
796
|
return
|
|
568
797
|
|
|
569
798
|
# Lists.
|
|
570
799
|
if tag in {"ul", "ol"}:
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
800
|
+
if not in_link:
|
|
801
|
+
builder.ensure_newlines(2 if builder._buf else 0)
|
|
802
|
+
ordered = tag == "ol"
|
|
803
|
+
idx = 1
|
|
804
|
+
for child in node.children or []:
|
|
805
|
+
if child.name.lower() != "li":
|
|
806
|
+
continue
|
|
807
|
+
if idx > 1:
|
|
808
|
+
builder.newline(1)
|
|
809
|
+
indent = " " * list_depth
|
|
810
|
+
marker = f"{idx}. " if ordered else "- "
|
|
811
|
+
builder.raw(indent)
|
|
812
|
+
builder.raw(marker)
|
|
813
|
+
# Render list item content inline-ish.
|
|
814
|
+
for li_child in child.children or []:
|
|
815
|
+
_to_markdown_walk(
|
|
816
|
+
li_child,
|
|
817
|
+
builder,
|
|
818
|
+
preserve_whitespace=False,
|
|
819
|
+
list_depth=list_depth + 1,
|
|
820
|
+
in_link=in_link,
|
|
821
|
+
)
|
|
822
|
+
idx += 1
|
|
823
|
+
builder.ensure_newlines(2)
|
|
824
|
+
else:
|
|
825
|
+
# Flatten list inside link
|
|
826
|
+
for child in node.children or []:
|
|
827
|
+
if child.name.lower() != "li":
|
|
828
|
+
continue
|
|
829
|
+
builder.raw(" ")
|
|
830
|
+
for li_child in child.children or []:
|
|
831
|
+
_to_markdown_walk(
|
|
832
|
+
li_child,
|
|
833
|
+
builder,
|
|
834
|
+
preserve_whitespace=False,
|
|
835
|
+
list_depth=list_depth + 1,
|
|
836
|
+
in_link=in_link,
|
|
837
|
+
)
|
|
588
838
|
return
|
|
589
839
|
|
|
590
840
|
# Emphasis/strong.
|
|
591
841
|
if tag in {"em", "i"}:
|
|
592
842
|
builder.raw("*")
|
|
593
843
|
for child in node.children or []:
|
|
594
|
-
_to_markdown_walk(
|
|
844
|
+
_to_markdown_walk(
|
|
845
|
+
child,
|
|
846
|
+
builder,
|
|
847
|
+
preserve_whitespace=False,
|
|
848
|
+
list_depth=list_depth,
|
|
849
|
+
in_link=in_link,
|
|
850
|
+
)
|
|
595
851
|
builder.raw("*")
|
|
596
852
|
return
|
|
597
853
|
|
|
598
854
|
if tag in {"strong", "b"}:
|
|
599
855
|
builder.raw("**")
|
|
600
856
|
for child in node.children or []:
|
|
601
|
-
_to_markdown_walk(
|
|
857
|
+
_to_markdown_walk(
|
|
858
|
+
child,
|
|
859
|
+
builder,
|
|
860
|
+
preserve_whitespace=False,
|
|
861
|
+
list_depth=list_depth,
|
|
862
|
+
in_link=in_link,
|
|
863
|
+
)
|
|
602
864
|
builder.raw("**")
|
|
603
865
|
return
|
|
604
866
|
|
|
@@ -608,13 +870,24 @@ def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace:
|
|
|
608
870
|
if node.attrs and "href" in node.attrs and node.attrs["href"] is not None:
|
|
609
871
|
href = str(node.attrs["href"])
|
|
610
872
|
|
|
611
|
-
|
|
873
|
+
# Capture inner text to strip whitespace.
|
|
874
|
+
inner_builder = _MarkdownBuilder()
|
|
612
875
|
for child in node.children or []:
|
|
613
|
-
_to_markdown_walk(
|
|
876
|
+
_to_markdown_walk(
|
|
877
|
+
child,
|
|
878
|
+
inner_builder,
|
|
879
|
+
preserve_whitespace=False,
|
|
880
|
+
list_depth=list_depth,
|
|
881
|
+
in_link=True,
|
|
882
|
+
)
|
|
883
|
+
link_text = inner_builder.finish()
|
|
884
|
+
|
|
885
|
+
builder.raw("[")
|
|
886
|
+
builder.raw(link_text)
|
|
614
887
|
builder.raw("]")
|
|
615
888
|
if href:
|
|
616
889
|
builder.raw("(")
|
|
617
|
-
builder.raw(href)
|
|
890
|
+
builder.raw(_markdown_link_destination(href))
|
|
618
891
|
builder.raw(")")
|
|
619
892
|
return
|
|
620
893
|
|
|
@@ -622,11 +895,26 @@ def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace:
|
|
|
622
895
|
next_preserve = preserve_whitespace or (tag in {"textarea", "script", "style"})
|
|
623
896
|
if node.children:
|
|
624
897
|
for child in node.children:
|
|
625
|
-
_to_markdown_walk(
|
|
898
|
+
_to_markdown_walk(
|
|
899
|
+
child,
|
|
900
|
+
builder,
|
|
901
|
+
next_preserve,
|
|
902
|
+
list_depth,
|
|
903
|
+
in_link=in_link,
|
|
904
|
+
)
|
|
626
905
|
|
|
627
906
|
if isinstance(node, ElementNode) and node.template_content:
|
|
628
|
-
_to_markdown_walk(
|
|
907
|
+
_to_markdown_walk(
|
|
908
|
+
node.template_content,
|
|
909
|
+
builder,
|
|
910
|
+
next_preserve,
|
|
911
|
+
list_depth,
|
|
912
|
+
in_link=in_link,
|
|
913
|
+
)
|
|
629
914
|
|
|
630
915
|
# Add spacing after block containers to keep output readable.
|
|
631
916
|
if tag in _MARKDOWN_BLOCK_ELEMENTS:
|
|
632
|
-
|
|
917
|
+
if not in_link:
|
|
918
|
+
builder.ensure_newlines(2)
|
|
919
|
+
else:
|
|
920
|
+
builder.text(" ", preserve_whitespace=False)
|