justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- justhtml/__init__.py +28 -0
- justhtml/__main__.py +161 -13
- justhtml/constants.py +17 -1
- justhtml/context.py +7 -1
- justhtml/encoding.py +405 -0
- justhtml/entities.py +57 -17
- justhtml/errors.py +20 -4
- justhtml/linkify.py +438 -0
- justhtml/node.py +738 -41
- justhtml/parser.py +188 -21
- justhtml/py.typed +0 -0
- justhtml/sanitize.py +1141 -0
- justhtml/selector.py +240 -104
- justhtml/serialize.py +418 -57
- justhtml/stream.py +34 -10
- justhtml/tokenizer.py +433 -289
- justhtml/tokens.py +91 -23
- justhtml/transforms.py +690 -0
- justhtml/treebuilder.py +196 -111
- justhtml/treebuilder_modes.py +191 -117
- justhtml/treebuilder_utils.py +11 -4
- justhtml-0.33.0.dist-info/METADATA +196 -0
- justhtml-0.33.0.dist-info/RECORD +26 -0
- justhtml-0.33.0.dist-info/entry_points.txt +2 -0
- {justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.6.0.dist-info/METADATA +0 -126
- justhtml-0.6.0.dist-info/RECORD +0 -20
- {justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/WHEEL +0 -0
justhtml/node.py
CHANGED
|
@@ -1,14 +1,228 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
from urllib.parse import quote
|
|
5
|
+
|
|
6
|
+
from .sanitize import _sanitize
|
|
1
7
|
from .selector import query
|
|
2
8
|
from .serialize import to_html
|
|
3
9
|
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from .sanitize import SanitizationPolicy
|
|
12
|
+
from .tokens import Doctype
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _markdown_escape_text(s: str) -> str:
|
|
16
|
+
if not s:
|
|
17
|
+
return ""
|
|
18
|
+
# Pragmatic: escape the few characters that commonly change Markdown meaning.
|
|
19
|
+
# Keep this minimal to preserve readability.
|
|
20
|
+
out: list[str] = []
|
|
21
|
+
for ch in s:
|
|
22
|
+
if ch in "\\`*_[]":
|
|
23
|
+
out.append("\\")
|
|
24
|
+
out.append(ch)
|
|
25
|
+
return "".join(out)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _markdown_code_span(s: str | None) -> str:
|
|
29
|
+
if s is None:
|
|
30
|
+
s = ""
|
|
31
|
+
# Use a backtick fence longer than any run of backticks inside.
|
|
32
|
+
longest = 0
|
|
33
|
+
run = 0
|
|
34
|
+
for ch in s:
|
|
35
|
+
if ch == "`":
|
|
36
|
+
run += 1
|
|
37
|
+
if run > longest:
|
|
38
|
+
longest = run
|
|
39
|
+
else:
|
|
40
|
+
run = 0
|
|
41
|
+
fence = "`" * (longest + 1)
|
|
42
|
+
# CommonMark requires a space if the content starts/ends with backticks.
|
|
43
|
+
needs_space = s.startswith("`") or s.endswith("`")
|
|
44
|
+
if needs_space:
|
|
45
|
+
return f"{fence} {s} {fence}"
|
|
46
|
+
return f"{fence}{s}{fence}"
|
|
47
|
+
|
|
4
48
|
|
|
5
|
-
|
|
6
|
-
|
|
49
|
+
def _markdown_link_destination(url: str) -> str:
|
|
50
|
+
"""Return a Markdown-safe link destination.
|
|
51
|
+
|
|
52
|
+
We primarily care about avoiding Markdown formatting injection and broken
|
|
53
|
+
parsing for URLs that contain whitespace or parentheses.
|
|
54
|
+
|
|
55
|
+
CommonMark supports destinations wrapped in angle brackets:
|
|
56
|
+
`[text](<https://example.com/a(b)c>)`
|
|
57
|
+
"""
|
|
58
|
+
|
|
59
|
+
u = (url or "").strip()
|
|
60
|
+
if not u:
|
|
61
|
+
return ""
|
|
62
|
+
|
|
63
|
+
# If the destination contains characters that can terminate or confuse
|
|
64
|
+
# the Markdown destination parser, wrap in <...> and percent-encode
|
|
65
|
+
# whitespace and angle brackets.
|
|
66
|
+
if any(ch in u for ch in (" ", "\t", "\n", "\r", "(", ")", "<", ">")):
|
|
67
|
+
u = quote(u, safe=":/?#[]@!$&'*+,;=%-._~()")
|
|
68
|
+
return f"<{u}>"
|
|
69
|
+
|
|
70
|
+
return u
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class _MarkdownBuilder:
|
|
74
|
+
__slots__ = ("_buf", "_newline_count", "_pending_space")
|
|
75
|
+
|
|
76
|
+
_buf: list[str]
|
|
77
|
+
_newline_count: int
|
|
78
|
+
_pending_space: bool
|
|
79
|
+
|
|
80
|
+
def __init__(self) -> None:
|
|
81
|
+
self._buf = []
|
|
82
|
+
self._newline_count = 0
|
|
83
|
+
self._pending_space = False
|
|
84
|
+
|
|
85
|
+
def _rstrip_last_segment(self) -> None:
|
|
86
|
+
if not self._buf:
|
|
87
|
+
return
|
|
88
|
+
last = self._buf[-1]
|
|
89
|
+
stripped = last.rstrip(" \t")
|
|
90
|
+
if stripped != last:
|
|
91
|
+
self._buf[-1] = stripped
|
|
92
|
+
|
|
93
|
+
def newline(self, count: int = 1) -> None:
|
|
94
|
+
for _ in range(count):
|
|
95
|
+
self._pending_space = False
|
|
96
|
+
self._rstrip_last_segment()
|
|
97
|
+
self._buf.append("\n")
|
|
98
|
+
# Track newlines to make it easy to insert blank lines.
|
|
99
|
+
if self._newline_count < 2:
|
|
100
|
+
self._newline_count += 1
|
|
101
|
+
|
|
102
|
+
def ensure_newlines(self, count: int) -> None:
|
|
103
|
+
while self._newline_count < count:
|
|
104
|
+
self.newline(1)
|
|
105
|
+
|
|
106
|
+
def raw(self, s: str) -> None:
|
|
107
|
+
if not s:
|
|
108
|
+
return
|
|
109
|
+
|
|
110
|
+
# If we've collapsed whitespace and the next output is raw (e.g. "**"),
|
|
111
|
+
# we still need to emit a single separating space.
|
|
112
|
+
if self._pending_space:
|
|
113
|
+
first = s[0]
|
|
114
|
+
if first not in " \t\n\r\f" and self._buf and self._newline_count == 0:
|
|
115
|
+
self._buf.append(" ")
|
|
116
|
+
self._pending_space = False
|
|
117
|
+
|
|
118
|
+
self._buf.append(s)
|
|
119
|
+
if "\n" in s:
|
|
120
|
+
# Count trailing newlines (cap at 2 for blank-line semantics).
|
|
121
|
+
trailing = 0
|
|
122
|
+
i = len(s) - 1
|
|
123
|
+
while i >= 0 and s[i] == "\n":
|
|
124
|
+
trailing += 1
|
|
125
|
+
i -= 1
|
|
126
|
+
self._newline_count = min(2, trailing)
|
|
127
|
+
if trailing:
|
|
128
|
+
self._pending_space = False
|
|
129
|
+
else:
|
|
130
|
+
self._newline_count = 0
|
|
131
|
+
|
|
132
|
+
def text(self, s: str, preserve_whitespace: bool = False) -> None:
|
|
133
|
+
if not s:
|
|
134
|
+
return
|
|
7
135
|
|
|
8
|
-
|
|
136
|
+
if preserve_whitespace:
|
|
137
|
+
self.raw(s)
|
|
138
|
+
return
|
|
139
|
+
|
|
140
|
+
for ch in s:
|
|
141
|
+
if ch in " \t\n\r\f":
|
|
142
|
+
self._pending_space = True
|
|
143
|
+
continue
|
|
144
|
+
|
|
145
|
+
if self._pending_space:
|
|
146
|
+
if self._buf and self._newline_count == 0:
|
|
147
|
+
self._buf.append(" ")
|
|
148
|
+
self._pending_space = False
|
|
149
|
+
|
|
150
|
+
self._buf.append(ch)
|
|
151
|
+
self._newline_count = 0
|
|
152
|
+
|
|
153
|
+
def finish(self) -> str:
|
|
154
|
+
out = "".join(self._buf)
|
|
155
|
+
return out.strip(" \t\n")
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
# Type alias for any node type
|
|
159
|
+
# Kept as a plain string naming the four node classes of this module.
NodeType = "SimpleDomNode | ElementNode | TemplateNode | TextNode"
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _to_text_collect(node: Any, parts: list[str], strip: bool) -> None:
|
|
163
|
+
# Iterative traversal avoids recursion overhead on large documents.
|
|
164
|
+
stack: list[Any] = [node]
|
|
165
|
+
while stack:
|
|
166
|
+
current = stack.pop()
|
|
167
|
+
name: str = current.name
|
|
168
|
+
|
|
169
|
+
if name == "#text":
|
|
170
|
+
data: str | None = current.data
|
|
171
|
+
if not data:
|
|
172
|
+
continue
|
|
173
|
+
if strip:
|
|
174
|
+
data = data.strip()
|
|
175
|
+
if not data:
|
|
176
|
+
continue
|
|
177
|
+
parts.append(data)
|
|
178
|
+
continue
|
|
179
|
+
|
|
180
|
+
# Preserve the same traversal order as the recursive implementation:
|
|
181
|
+
# children first, then template content.
|
|
182
|
+
if type(current) is TemplateNode and current.template_content:
|
|
183
|
+
stack.append(current.template_content)
|
|
184
|
+
|
|
185
|
+
children = current.children
|
|
186
|
+
if children:
|
|
187
|
+
stack.extend(reversed(children))
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
class SimpleDomNode:
|
|
191
|
+
__slots__ = (
|
|
192
|
+
"_origin_col",
|
|
193
|
+
"_origin_line",
|
|
194
|
+
"_origin_pos",
|
|
195
|
+
"attrs",
|
|
196
|
+
"children",
|
|
197
|
+
"data",
|
|
198
|
+
"name",
|
|
199
|
+
"namespace",
|
|
200
|
+
"parent",
|
|
201
|
+
)
|
|
202
|
+
|
|
203
|
+
name: str
|
|
204
|
+
parent: SimpleDomNode | ElementNode | TemplateNode | None
|
|
205
|
+
attrs: dict[str, str | None] | None
|
|
206
|
+
children: list[Any] | None
|
|
207
|
+
data: str | Doctype | None
|
|
208
|
+
namespace: str | None
|
|
209
|
+
_origin_pos: int | None
|
|
210
|
+
_origin_line: int | None
|
|
211
|
+
_origin_col: int | None
|
|
212
|
+
|
|
213
|
+
def __init__(
|
|
214
|
+
self,
|
|
215
|
+
name: str,
|
|
216
|
+
attrs: dict[str, str | None] | None = None,
|
|
217
|
+
data: str | Doctype | None = None,
|
|
218
|
+
namespace: str | None = None,
|
|
219
|
+
) -> None:
|
|
9
220
|
self.name = name
|
|
10
221
|
self.parent = None
|
|
11
222
|
self.data = data
|
|
223
|
+
self._origin_pos = None
|
|
224
|
+
self._origin_line = None
|
|
225
|
+
self._origin_col = None
|
|
12
226
|
|
|
13
227
|
if name.startswith("#") or name == "!doctype":
|
|
14
228
|
self.namespace = namespace
|
|
@@ -23,19 +237,48 @@ class SimpleDomNode:
|
|
|
23
237
|
self.children = []
|
|
24
238
|
self.attrs = attrs if attrs is not None else {}
|
|
25
239
|
|
|
26
|
-
def append_child(self, node):
|
|
27
|
-
self.children
|
|
28
|
-
|
|
240
|
+
def append_child(self, node: Any) -> None:
|
|
241
|
+
if self.children is not None:
|
|
242
|
+
self.children.append(node)
|
|
243
|
+
node.parent = self
|
|
244
|
+
|
|
245
|
+
@property
|
|
246
|
+
def origin_offset(self) -> int | None:
|
|
247
|
+
"""Best-effort origin offset (0-indexed) in the source HTML, if known."""
|
|
248
|
+
return self._origin_pos
|
|
29
249
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
250
|
+
@property
|
|
251
|
+
def origin_line(self) -> int | None:
|
|
252
|
+
return self._origin_line
|
|
253
|
+
|
|
254
|
+
@property
|
|
255
|
+
def origin_col(self) -> int | None:
|
|
256
|
+
return self._origin_col
|
|
33
257
|
|
|
34
|
-
|
|
258
|
+
@property
|
|
259
|
+
def origin_location(self) -> tuple[int, int] | None:
|
|
260
|
+
if self._origin_line is None or self._origin_col is None:
|
|
261
|
+
return None
|
|
262
|
+
return (self._origin_line, self._origin_col)
|
|
263
|
+
|
|
264
|
+
def remove_child(self, node: Any) -> None:
|
|
265
|
+
if self.children is not None:
|
|
266
|
+
self.children.remove(node)
|
|
267
|
+
node.parent = None
|
|
268
|
+
|
|
269
|
+
def to_html(
    self,
    indent: int = 0,
    indent_size: int = 2,
    pretty: bool = True,
    *,
    safe: bool = True,
    policy: SanitizationPolicy | None = None,
) -> str:
    """Serialize this node to an HTML string.

    All arguments are forwarded unchanged to the module-level ``to_html``
    serializer.
    """
    html: str = to_html(self, indent, indent_size, pretty=pretty, safe=safe, policy=policy)
    return html
|
|
37
280
|
|
|
38
|
-
def query(self, selector):
|
|
281
|
+
def query(self, selector: str) -> list[Any]:
|
|
39
282
|
"""
|
|
40
283
|
Query this subtree using a CSS selector.
|
|
41
284
|
|
|
@@ -48,18 +291,65 @@ class SimpleDomNode:
|
|
|
48
291
|
Raises:
|
|
49
292
|
ValueError: If the selector is invalid
|
|
50
293
|
"""
|
|
51
|
-
|
|
294
|
+
result: list[Any] = query(self, selector)
|
|
295
|
+
return result
|
|
52
296
|
|
|
53
297
|
@property
|
|
54
|
-
def text(self):
|
|
55
|
-
"""Return the
|
|
298
|
+
def text(self) -> str:
|
|
299
|
+
"""Return the node's own text value.
|
|
300
|
+
|
|
301
|
+
For text nodes this is the node data. For other nodes this is an empty
|
|
302
|
+
string. Use `to_text()` to get textContent semantics.
|
|
303
|
+
"""
|
|
56
304
|
if self.name == "#text":
|
|
57
|
-
|
|
58
|
-
|
|
305
|
+
data = self.data
|
|
306
|
+
if isinstance(data, str):
|
|
307
|
+
return data
|
|
59
308
|
return ""
|
|
60
|
-
return ""
|
|
309
|
+
return ""
|
|
310
|
+
|
|
311
|
+
def to_text(
    self,
    separator: str = " ",
    strip: bool = True,
    *,
    safe: bool = True,
    policy: SanitizationPolicy | None = None,
) -> str:
    """Return the text of this subtree joined with ``separator``.

    - `separator` is placed between the collected text nodes.
    - `strip=True` strips each text node and drops empty segments.
    - `safe=True` runs the subtree through `_sanitize` first.
    - `policy` is handed to `_sanitize` when sanitizing.

    Template element contents are included via `template_content`.
    """
    if safe:
        root: Any = _sanitize(self, policy=policy)
    else:
        root = self
    collected: list[str] = []
    _to_text_collect(root, collected, strip=strip)
    # Joining an empty list already yields "".
    return separator.join(collected)
|
|
334
|
+
|
|
335
|
+
def to_markdown(self, *, safe: bool = True, policy: SanitizationPolicy | None = None) -> str:
    """Return a GitHub Flavored Markdown rendering of this subtree.

    A pragmatic HTML->Markdown conversion aimed at readability:
    - Tables and images are kept as raw HTML.
    - Unknown elements are rendered through their children.

    With ``safe=True`` the subtree is passed through `_sanitize` (using
    ``policy``) before rendering; otherwise the node is rendered as-is.
    """
    root: Any = _sanitize(self, policy=policy) if safe else self
    md = _MarkdownBuilder()
    _to_markdown_walk(root, md, preserve_whitespace=False, list_depth=0)
    return md.finish()
|
|
351
|
+
|
|
352
|
+
def insert_before(self, node: Any, reference_node: Any | None) -> None:
|
|
63
353
|
"""
|
|
64
354
|
Insert a node before a reference node.
|
|
65
355
|
|
|
@@ -84,7 +374,7 @@ class SimpleDomNode:
|
|
|
84
374
|
except ValueError:
|
|
85
375
|
raise ValueError("Reference node is not a child of this node") from None
|
|
86
376
|
|
|
87
|
-
def replace_child(self, new_node, old_node):
|
|
377
|
+
def replace_child(self, new_node: Any, old_node: Any) -> Any:
|
|
88
378
|
"""
|
|
89
379
|
Replace a child node with a new node.
|
|
90
380
|
|
|
@@ -111,26 +401,31 @@ class SimpleDomNode:
|
|
|
111
401
|
old_node.parent = None
|
|
112
402
|
return old_node
|
|
113
403
|
|
|
114
|
-
def has_child_nodes(self):
|
|
404
|
+
def has_child_nodes(self) -> bool:
|
|
115
405
|
"""Return True if this node has children."""
|
|
116
406
|
return bool(self.children)
|
|
117
407
|
|
|
118
|
-
def clone_node(self, deep=False):
|
|
408
|
+
def clone_node(self, deep: bool = False, override_attrs: dict[str, str | None] | None = None) -> SimpleDomNode:
|
|
119
409
|
"""
|
|
120
410
|
Clone this node.
|
|
121
411
|
|
|
122
412
|
Args:
|
|
123
413
|
deep: If True, recursively clone children.
|
|
414
|
+
override_attrs: Optional dictionary to use as attributes for the clone.
|
|
124
415
|
|
|
125
416
|
Returns:
|
|
126
417
|
A new node that is a copy of this node.
|
|
127
418
|
"""
|
|
419
|
+
attrs = override_attrs if override_attrs is not None else (self.attrs.copy() if self.attrs else None)
|
|
128
420
|
clone = SimpleDomNode(
|
|
129
421
|
self.name,
|
|
130
|
-
|
|
422
|
+
attrs,
|
|
131
423
|
self.data,
|
|
132
424
|
self.namespace,
|
|
133
425
|
)
|
|
426
|
+
clone._origin_pos = self._origin_pos
|
|
427
|
+
clone._origin_line = self._origin_line
|
|
428
|
+
clone._origin_col = self._origin_col
|
|
134
429
|
if deep and self.children:
|
|
135
430
|
for child in self.children:
|
|
136
431
|
clone.append_child(child.clone_node(deep=True))
|
|
@@ -138,18 +433,30 @@ class SimpleDomNode:
|
|
|
138
433
|
|
|
139
434
|
|
|
140
435
|
class ElementNode(SimpleDomNode):
|
|
141
|
-
__slots__ = ()
|
|
436
|
+
__slots__ = ("template_content",)
|
|
142
437
|
|
|
143
|
-
|
|
438
|
+
template_content: SimpleDomNode | None
|
|
439
|
+
children: list[Any]
|
|
440
|
+
attrs: dict[str, str | None]
|
|
441
|
+
|
|
442
|
+
def __init__(self, name: str, attrs: dict[str, str | None] | None, namespace: str | None) -> None:
    """Create an element with the given tag name, attributes and namespace.

    ``attrs=None`` is replaced by a fresh empty dict. Every slot is assigned
    directly here; the base-class initializer is not called.
    """
    if attrs is None:
        attrs = {}

    # Identity.
    self.name = name
    self.namespace = namespace
    self.attrs = attrs

    # Tree links and payload.
    self.parent = None
    self.children = []
    self.data = None
    self.template_content = None

    # Source position is unknown until something records it.
    self._origin_pos = None
    self._origin_line = None
    self._origin_col = None
|
|
453
|
+
|
|
454
|
+
def clone_node(self, deep: bool = False, override_attrs: dict[str, str | None] | None = None) -> ElementNode:
|
|
455
|
+
attrs = override_attrs if override_attrs is not None else (self.attrs.copy() if self.attrs else {})
|
|
456
|
+
clone = ElementNode(self.name, attrs, self.namespace)
|
|
457
|
+
clone._origin_pos = self._origin_pos
|
|
458
|
+
clone._origin_line = self._origin_line
|
|
459
|
+
clone._origin_col = self._origin_col
|
|
153
460
|
if deep:
|
|
154
461
|
for child in self.children:
|
|
155
462
|
clone.append_child(child.clone_node(deep=True))
|
|
@@ -157,22 +464,32 @@ class ElementNode(SimpleDomNode):
|
|
|
157
464
|
|
|
158
465
|
|
|
159
466
|
class TemplateNode(ElementNode):
|
|
160
|
-
__slots__ = (
|
|
467
|
+
__slots__ = ()
|
|
161
468
|
|
|
162
|
-
def __init__(
|
|
469
|
+
def __init__(
    self,
    name: str,
    attrs: dict[str, str | None] | None = None,
    data: str | None = None,
    namespace: str | None = None,
) -> None:
    """Create a template element.

    In the ``"html"`` namespace the node gets a ``#document-fragment`` as its
    ``template_content``; in any other namespace it is ``None``. ``data`` is
    accepted but not used by this initializer.
    """
    super().__init__(name, attrs, namespace)
    in_html = self.namespace == "html"
    self.template_content = SimpleDomNode("#document-fragment") if in_html else None
|
|
168
481
|
|
|
169
|
-
def clone_node(self, deep=False):
|
|
482
|
+
def clone_node(self, deep: bool = False, override_attrs: dict[str, str | None] | None = None) -> TemplateNode:
|
|
483
|
+
attrs = override_attrs if override_attrs is not None else (self.attrs.copy() if self.attrs else {})
|
|
170
484
|
clone = TemplateNode(
|
|
171
485
|
self.name,
|
|
172
|
-
|
|
173
|
-
|
|
486
|
+
attrs,
|
|
487
|
+
None,
|
|
174
488
|
self.namespace,
|
|
175
489
|
)
|
|
490
|
+
clone._origin_pos = self._origin_pos
|
|
491
|
+
clone._origin_line = self._origin_line
|
|
492
|
+
clone._origin_col = self._origin_col
|
|
176
493
|
if deep:
|
|
177
494
|
if self.template_content:
|
|
178
495
|
clone.template_content = self.template_content.clone_node(deep=True)
|
|
@@ -182,27 +499,407 @@ class TemplateNode(ElementNode):
|
|
|
182
499
|
|
|
183
500
|
|
|
184
501
|
class TextNode:
    """Leaf node holding character data; its ``name`` is always ``"#text"``."""

    __slots__ = ("_origin_col", "_origin_line", "_origin_pos", "data", "name", "namespace", "parent")

    data: str | None
    name: str
    namespace: None
    parent: SimpleDomNode | ElementNode | TemplateNode | None
    _origin_pos: int | None
    _origin_line: int | None
    _origin_col: int | None

    def __init__(self, data: str | None) -> None:
        self.data = data
        self.parent = None
        self.name = "#text"
        self.namespace = None
        self._origin_pos = None
        self._origin_line = None
        self._origin_col = None

    @property
    def origin_offset(self) -> int | None:
        """Best-effort origin offset (0-indexed) in the source HTML, if known."""
        return self._origin_pos

    @property
    def origin_line(self) -> int | None:
        """Recorded source line, or None when it is not known."""
        return self._origin_line

    @property
    def origin_col(self) -> int | None:
        """Recorded source column, or None when it is not known."""
        return self._origin_col

    @property
    def origin_location(self) -> tuple[int, int] | None:
        """Return ``(line, col)`` when both are recorded, otherwise None."""
        if self._origin_line is None or self._origin_col is None:
            return None
        return (self._origin_line, self._origin_col)

    @property
    def text(self) -> str:
        """Return the text content of this node."""
        return self.data or ""

    def to_text(
        self,
        separator: str = " ",
        strip: bool = True,
        *,
        safe: bool = True,
        policy: SanitizationPolicy | None = None,
    ) -> str:
        """Return this node's data, stripped when ``strip`` is true.

        ``separator``, ``safe`` and ``policy`` are accepted for API
        consistency with the other node types; they don't affect leaf nodes.
        """
        _ = separator
        _ = safe
        _ = policy

        if self.data is None:
            return ""
        if strip:
            return self.data.strip()
        return self.data

    def to_markdown(self, *, safe: bool = True, policy: SanitizationPolicy | None = None) -> str:
        """Return this node's data as escaped, whitespace-collapsed Markdown.

        ``safe`` and ``policy`` mirror ``SimpleDomNode.to_markdown`` so callers
        can pass the same keywords to any node type; they are not used here.
        """
        _ = safe
        _ = policy

        builder = _MarkdownBuilder()
        builder.text(_markdown_escape_text(self.data or ""), preserve_whitespace=False)
        return builder.finish()

    @property
    def children(self) -> list[Any]:
        """Return empty list for TextNode (leaf node)."""
        return []

    def has_child_nodes(self) -> bool:
        """Return False for TextNode."""
        return False

    def clone_node(self, deep: bool = False) -> TextNode:
        """Return a copy with the same data and origin info.

        ``deep`` is not used by this method. The clone's ``parent`` is left
        as ``None``.
        """
        clone = TextNode(self.data)
        clone._origin_pos = self._origin_pos
        clone._origin_line = self._origin_line
        clone._origin_col = self._origin_col
        return clone
|
|
584
|
+
|
|
585
|
+
|
|
586
|
+
_MARKDOWN_BLOCK_ELEMENTS: frozenset[str] = frozenset(
|
|
587
|
+
{
|
|
588
|
+
"p",
|
|
589
|
+
"div",
|
|
590
|
+
"section",
|
|
591
|
+
"article",
|
|
592
|
+
"header",
|
|
593
|
+
"footer",
|
|
594
|
+
"main",
|
|
595
|
+
"nav",
|
|
596
|
+
"aside",
|
|
597
|
+
"blockquote",
|
|
598
|
+
"pre",
|
|
599
|
+
"ul",
|
|
600
|
+
"ol",
|
|
601
|
+
"li",
|
|
602
|
+
"hr",
|
|
603
|
+
"h1",
|
|
604
|
+
"h2",
|
|
605
|
+
"h3",
|
|
606
|
+
"h4",
|
|
607
|
+
"h5",
|
|
608
|
+
"h6",
|
|
609
|
+
"table",
|
|
610
|
+
}
|
|
611
|
+
)
|
|
612
|
+
|
|
613
|
+
|
|
614
|
+
def _markdown_walk_children(
    node: Any,
    builder: _MarkdownBuilder,
    preserve_whitespace: bool,
    list_depth: int,
    in_link: bool,
) -> None:
    """Render every child of ``node`` into ``builder`` in document order."""
    for child in node.children or []:
        _to_markdown_walk(child, builder, preserve_whitespace, list_depth, in_link=in_link)


def _markdown_code_fence(code: str) -> str:
    """Return a backtick fence (at least 3 long) that ``code`` cannot close."""
    longest = 0
    run = 0
    for ch in code:
        if ch == "`":
            run += 1
            if run > longest:
                longest = run
        else:
            run = 0
    return "`" * max(3, longest + 1)


def _to_markdown_walk(
    node: Any,
    builder: _MarkdownBuilder,
    preserve_whitespace: bool,
    list_depth: int,
    in_link: bool = False,
) -> None:
    """Render ``node`` and its descendants as Markdown into ``builder``.

    Args:
        node: The node to render; dispatch is on ``node.name``.
        builder: Output buffer that receives the Markdown.
        preserve_whitespace: Emit text nodes verbatim instead of escaping
            them and collapsing whitespace.
        list_depth: Current list nesting level, used for list indentation.
        in_link: True while rendering link text. Block-level spacing is then
            suppressed (blocks are flattened) so the text stays inside the
            ``[...]`` of the surrounding link.
    """
    name: str = node.name

    if name == "#text":
        if preserve_whitespace:
            builder.raw(node.data or "")
        else:
            builder.text(_markdown_escape_text(node.data or ""), preserve_whitespace=False)
        return

    if name == "br":
        if in_link:
            builder.text(" ", preserve_whitespace=False)
        else:
            builder.newline(1)
        return

    # Comments/doctype don't contribute.
    if name == "#comment" or name == "!doctype":
        return

    # Document containers contribute via descendants.
    if name.startswith("#"):
        _markdown_walk_children(node, builder, preserve_whitespace, list_depth, in_link)
        return

    tag = name.lower()

    # Metadata containers don't contribute to body text.
    if tag == "head" or tag == "title":
        return

    # Preserve <img> and <table> as HTML.
    if tag == "img":
        builder.raw(node.to_html(indent=0, indent_size=2, pretty=False))
        return

    if tag == "table":
        if not in_link:
            builder.ensure_newlines(2 if builder._buf else 0)
        builder.raw(node.to_html(indent=0, indent_size=2, pretty=False))
        if not in_link:
            builder.ensure_newlines(2)
        return

    # Headings.
    if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}:
        # NOTE(review): the "#" marker is assumed to be emitted only outside
        # links, like the surrounding block spacing - confirm against upstream.
        if not in_link:
            builder.ensure_newlines(2 if builder._buf else 0)
            level = int(tag[1])
            builder.raw("#" * level)
            builder.raw(" ")

        _markdown_walk_children(node, builder, False, list_depth, in_link)

        if not in_link:
            builder.ensure_newlines(2)
        return

    # Horizontal rule.
    if tag == "hr":
        # NOTE(review): assumed to be dropped entirely inside links - confirm
        # against upstream.
        if not in_link:
            builder.ensure_newlines(2 if builder._buf else 0)
            builder.raw("---")
            builder.ensure_newlines(2)
        return

    # Code blocks.
    if tag == "pre":
        code = node.to_text(separator="", strip=False)
        if not in_link:
            builder.ensure_newlines(2 if builder._buf else 0)
            # The fence must be longer than any backtick run in the content,
            # otherwise a line of backticks inside the <pre> would close the
            # block early.
            fence = _markdown_code_fence(code)
            builder.raw(fence)
            builder.newline(1)
            if code:
                builder.raw(code.rstrip("\n"))
                builder.newline(1)
            builder.raw(fence)
            builder.ensure_newlines(2)
        else:
            # Inside link, render as inline code or text
            builder.raw(_markdown_code_span(code))
        return

    # Inline code.
    if tag == "code" and not preserve_whitespace:
        code = node.to_text(separator="", strip=False)
        builder.raw(_markdown_code_span(code))
        return

    # Paragraph-like blocks.
    if tag == "p":
        if not in_link:
            builder.ensure_newlines(2 if builder._buf else 0)

        _markdown_walk_children(node, builder, False, list_depth, in_link)

        if not in_link:
            builder.ensure_newlines(2)
        else:
            builder.text(" ", preserve_whitespace=False)
        return

    # Blockquotes.
    if tag == "blockquote":
        if not in_link:
            builder.ensure_newlines(2 if builder._buf else 0)
            # Render the quote body separately so every resulting line can be
            # prefixed with "> ".
            inner = _MarkdownBuilder()
            _markdown_walk_children(node, inner, False, list_depth, in_link)
            text = inner.finish()
            if text:
                for i, line in enumerate(text.split("\n")):
                    if i:
                        builder.newline(1)
                    builder.raw("> ")
                    builder.raw(line)
            builder.ensure_newlines(2)
        else:
            _markdown_walk_children(node, builder, False, list_depth, in_link)
        return

    # Lists.
    if tag in {"ul", "ol"}:
        if not in_link:
            builder.ensure_newlines(2 if builder._buf else 0)
            ordered = tag == "ol"
            idx = 1
            for child in node.children or []:
                # Only <li> children become list items.
                if child.name.lower() != "li":
                    continue
                if idx > 1:
                    builder.newline(1)
                indent = " " * list_depth
                marker = f"{idx}. " if ordered else "- "
                builder.raw(indent)
                builder.raw(marker)
                # Render list item content inline-ish.
                _markdown_walk_children(child, builder, False, list_depth + 1, in_link)
                idx += 1
            builder.ensure_newlines(2)
        else:
            # Flatten list inside link
            for child in node.children or []:
                if child.name.lower() != "li":
                    continue
                builder.raw(" ")
                _markdown_walk_children(child, builder, False, list_depth + 1, in_link)
        return

    # Emphasis/strong.
    if tag in {"em", "i"}:
        builder.raw("*")
        _markdown_walk_children(node, builder, False, list_depth, in_link)
        builder.raw("*")
        return

    if tag in {"strong", "b"}:
        builder.raw("**")
        _markdown_walk_children(node, builder, False, list_depth, in_link)
        builder.raw("**")
        return

    # Links.
    if tag == "a":
        href = ""
        if node.attrs and "href" in node.attrs and node.attrs["href"] is not None:
            href = str(node.attrs["href"])

        # Capture inner text to strip whitespace.
        inner_builder = _MarkdownBuilder()
        _markdown_walk_children(node, inner_builder, False, list_depth, True)
        link_text = inner_builder.finish()

        builder.raw("[")
        builder.raw(link_text)
        builder.raw("]")
        if href:
            builder.raw("(")
            builder.raw(_markdown_link_destination(href))
            builder.raw(")")
        return

    # Containers / unknown tags: recurse into children.
    next_preserve = preserve_whitespace or (tag in {"textarea", "script", "style"})
    _markdown_walk_children(node, builder, next_preserve, list_depth, in_link)

    if isinstance(node, ElementNode) and node.template_content:
        _to_markdown_walk(
            node.template_content,
            builder,
            next_preserve,
            list_depth,
            in_link=in_link,
        )

    # Add spacing after block containers to keep output readable.
    if tag in _MARKDOWN_BLOCK_ELEMENTS:
        if not in_link:
            builder.ensure_newlines(2)
        else:
            builder.text(" ", preserve_whitespace=False)
|