justhtml 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic. Click here for more details.

justhtml/node.py ADDED
@@ -0,0 +1,632 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ from .selector import query
6
+ from .serialize import to_html
7
+
8
+ if TYPE_CHECKING:
9
+ from .tokens import Doctype
10
+
11
+
12
+ def _markdown_escape_text(s: str) -> str:
13
+ if not s:
14
+ return ""
15
+ # Pragmatic: escape the few characters that commonly change Markdown meaning.
16
+ # Keep this minimal to preserve readability.
17
+ out: list[str] = []
18
+ for ch in s:
19
+ if ch in "\\`*_[]":
20
+ out.append("\\")
21
+ out.append(ch)
22
+ return "".join(out)
23
+
24
+
25
+ def _markdown_code_span(s: str | None) -> str:
26
+ if s is None:
27
+ s = ""
28
+ # Use a backtick fence longer than any run of backticks inside.
29
+ longest = 0
30
+ run = 0
31
+ for ch in s:
32
+ if ch == "`":
33
+ run += 1
34
+ if run > longest:
35
+ longest = run
36
+ else:
37
+ run = 0
38
+ fence = "`" * (longest + 1)
39
+ # CommonMark requires a space if the content starts/ends with backticks.
40
+ needs_space = s.startswith("`") or s.endswith("`")
41
+ if needs_space:
42
+ return f"{fence} {s} {fence}"
43
+ return f"{fence}{s}{fence}"
44
+
45
+
46
+ class _MarkdownBuilder:
47
+ __slots__ = ("_buf", "_newline_count", "_pending_space")
48
+
49
+ _buf: list[str]
50
+ _newline_count: int
51
+ _pending_space: bool
52
+
53
+ def __init__(self) -> None:
54
+ self._buf = []
55
+ self._newline_count = 0
56
+ self._pending_space = False
57
+
58
+ def _rstrip_last_segment(self) -> None:
59
+ if not self._buf:
60
+ return
61
+ last = self._buf[-1]
62
+ stripped = last.rstrip(" \t")
63
+ if stripped != last:
64
+ self._buf[-1] = stripped
65
+
66
+ def newline(self, count: int = 1) -> None:
67
+ for _ in range(count):
68
+ self._pending_space = False
69
+ self._rstrip_last_segment()
70
+ self._buf.append("\n")
71
+ # Track newlines to make it easy to insert blank lines.
72
+ if self._newline_count < 2:
73
+ self._newline_count += 1
74
+
75
+ def ensure_newlines(self, count: int) -> None:
76
+ while self._newline_count < count:
77
+ self.newline(1)
78
+
79
+ def raw(self, s: str) -> None:
80
+ if not s:
81
+ return
82
+
83
+ # If we've collapsed whitespace and the next output is raw (e.g. "**"),
84
+ # we still need to emit a single separating space.
85
+ if self._pending_space:
86
+ first = s[0]
87
+ if first not in " \t\n\r\f" and self._buf and self._newline_count == 0:
88
+ self._buf.append(" ")
89
+ self._pending_space = False
90
+
91
+ self._buf.append(s)
92
+ if "\n" in s:
93
+ # Count trailing newlines (cap at 2 for blank-line semantics).
94
+ trailing = 0
95
+ i = len(s) - 1
96
+ while i >= 0 and s[i] == "\n":
97
+ trailing += 1
98
+ i -= 1
99
+ self._newline_count = min(2, trailing)
100
+ if trailing:
101
+ self._pending_space = False
102
+ else:
103
+ self._newline_count = 0
104
+
105
+ def text(self, s: str, preserve_whitespace: bool = False) -> None:
106
+ if not s:
107
+ return
108
+
109
+ if preserve_whitespace:
110
+ self.raw(s)
111
+ return
112
+
113
+ for ch in s:
114
+ if ch in " \t\n\r\f":
115
+ self._pending_space = True
116
+ continue
117
+
118
+ if self._pending_space:
119
+ if self._buf and self._newline_count == 0:
120
+ self._buf.append(" ")
121
+ self._pending_space = False
122
+
123
+ self._buf.append(ch)
124
+ self._newline_count = 0
125
+
126
+ def finish(self) -> str:
127
+ out = "".join(self._buf)
128
+ return out.strip(" \t\n")
129
+
130
+
131
# Type alias for any node type. It is written as a plain string because the
# node classes it names are defined further down in this module.
NodeType = "SimpleDomNode | ElementNode | TemplateNode | TextNode"
133
+
134
+
135
+ def _to_text_collect(node: Any, parts: list[str], strip: bool) -> None:
136
+ name: str = node.name
137
+
138
+ if name == "#text":
139
+ data: str | None = node.data
140
+ if not data:
141
+ return
142
+ if strip:
143
+ data = data.strip()
144
+ if not data:
145
+ return
146
+ parts.append(data)
147
+ return
148
+
149
+ if node.children:
150
+ for child in node.children:
151
+ _to_text_collect(child, parts, strip=strip)
152
+
153
+ if isinstance(node, ElementNode) and node.template_content:
154
+ _to_text_collect(node.template_content, parts, strip=strip)
155
+
156
+
157
class SimpleDomNode:
    """Generic tree node: element, document container, comment or doctype.

    Comment (``#comment``) and doctype (``!doctype``) nodes get
    ``children = None`` and ``attrs = None``; every other node starts with an
    empty child list and an attribute dict. Names that do not start with ``#``
    and are not ``!doctype`` default to the ``"html"`` namespace.
    """

    __slots__ = ("attrs", "children", "data", "name", "namespace", "parent")

    name: str
    parent: SimpleDomNode | ElementNode | TemplateNode | None
    attrs: dict[str, str | None] | None
    children: list[Any] | None
    data: str | Doctype | None
    namespace: str | None

    def __init__(
        self,
        name: str,
        attrs: dict[str, str | None] | None = None,
        data: str | Doctype | None = None,
        namespace: str | None = None,
    ) -> None:
        self.name = name
        self.parent = None
        self.data = data

        if name.startswith("#") or name == "!doctype":
            self.namespace = namespace
            if name == "#comment" or name == "!doctype":
                # Leaf-only node kinds: no child list, no attributes.
                self.children = None
                self.attrs = None
            else:
                self.children = []
                self.attrs = attrs if attrs is not None else {}
        else:
            self.namespace = namespace or "html"
            self.children = []
            self.attrs = attrs if attrs is not None else {}

    def append_child(self, node: Any) -> None:
        """Append *node* as the last child; a no-op on nodes without a child list."""
        if self.children is not None:
            self.children.append(node)
            node.parent = self

    def remove_child(self, node: Any) -> None:
        """Remove *node* from the children; a no-op on nodes without a child list.

        Raises:
            ValueError: If this node has children but *node* is not one of them.
        """
        if self.children is not None:
            self.children.remove(node)
            node.parent = None

    def to_html(self, indent: int = 0, indent_size: int = 2, pretty: bool = True) -> str:
        """Convert node to HTML string."""
        return to_html(self, indent, indent_size, pretty=pretty)

    def query(self, selector: str) -> list[Any]:
        """
        Query this subtree using a CSS selector.

        Args:
            selector: A CSS selector string

        Returns:
            A list of matching nodes

        Raises:
            ValueError: If the selector is invalid
        """
        result: list[Any] = query(self, selector)
        return result

    @property
    def text(self) -> str:
        """Return the node's own text value.

        For text nodes this is the node data. For other nodes this is an empty
        string. Use `to_text()` to get textContent semantics.
        """
        if self.name == "#text":
            data = self.data
            if isinstance(data, str):
                return data
        return ""

    def to_text(self, separator: str = " ", strip: bool = True) -> str:
        """Return the concatenated text of this node's descendants.

        - `separator` controls how text nodes are joined (default: a single space).
        - `strip=True` strips each text node and drops empty segments.

        Template element contents are included via `template_content`.
        """
        parts: list[str] = []
        _to_text_collect(self, parts, strip=strip)
        # Joining an empty list already yields "".
        return separator.join(parts)

    def to_markdown(self) -> str:
        """Return a GitHub Flavored Markdown representation of this subtree.

        This is a pragmatic HTML->Markdown converter intended for readability.
        - Tables and images are preserved as raw HTML.
        - Unknown elements fall back to rendering their children.
        """
        builder = _MarkdownBuilder()
        _to_markdown_walk(self, builder, preserve_whitespace=False, list_depth=0)
        return builder.finish()

    def insert_before(self, node: Any, reference_node: Any | None) -> None:
        """
        Insert a node before a reference node.

        Args:
            node: The node to insert
            reference_node: The node to insert before. If None, append to end.

        Raises:
            ValueError: If this node cannot have children, or if
                reference_node is not a child of this node
        """
        if self.children is None:
            raise ValueError(f"Node {self.name} cannot have children")

        if reference_node is None:
            self.append_child(node)
            return

        # Only the lookup is guarded, so unrelated errors are not relabelled.
        try:
            index = self.children.index(reference_node)
        except ValueError:
            raise ValueError("Reference node is not a child of this node") from None

        self.children.insert(index, node)
        node.parent = self

    def replace_child(self, new_node: Any, old_node: Any) -> Any:
        """
        Replace a child node with a new node.

        Args:
            new_node: The new node to insert
            old_node: The child node to replace

        Returns:
            The replaced node (old_node)

        Raises:
            ValueError: If this node cannot have children, or if old_node is
                not a child of this node
        """
        if self.children is None:
            raise ValueError(f"Node {self.name} cannot have children")

        try:
            index = self.children.index(old_node)
        except ValueError:
            raise ValueError("The node to be replaced is not a child of this node") from None

        self.children[index] = new_node
        # Detach the old node first so that replacing a node with itself
        # leaves it attached to this parent instead of orphaned.
        old_node.parent = None
        new_node.parent = self
        return old_node

    def has_child_nodes(self) -> bool:
        """Return True if this node has children."""
        return bool(self.children)

    def clone_node(self, deep: bool = False) -> SimpleDomNode:
        """
        Clone this node.

        Args:
            deep: If True, recursively clone children.

        Returns:
            A new node that is a copy of this node.
        """
        clone = SimpleDomNode(
            self.name,
            self.attrs.copy() if self.attrs else None,
            self.data,
            self.namespace,
        )
        if deep and self.children:
            for child in self.children:
                clone.append_child(child.clone_node(deep=True))
        return clone
336
+
337
+
338
class ElementNode(SimpleDomNode):
    """Element node: always has a child list and an attribute dict.

    Adds a ``template_content`` slot, which this class initialises to ``None``.
    """

    __slots__ = ("template_content",)

    template_content: SimpleDomNode | None
    children: list[Any]
    attrs: dict[str, str | None]

    def __init__(self, name: str, attrs: dict[str, str | None] | None, namespace: str | None) -> None:
        # All slots are assigned here directly; SimpleDomNode.__init__ is not called.
        self.name = name
        self.namespace = namespace
        self.attrs = {} if attrs is None else attrs
        self.children = []
        self.parent = None
        self.data = None
        self.template_content = None

    def clone_node(self, deep: bool = False) -> ElementNode:
        """Return a copy of this element; with ``deep=True`` children are cloned too."""
        attrs_copy = self.attrs.copy() if self.attrs else {}
        duplicate = ElementNode(self.name, attrs_copy, self.namespace)
        if not deep:
            return duplicate
        for child in self.children:
            duplicate.append_child(child.clone_node(deep=True))
        return duplicate
360
+
361
+
362
class TemplateNode(ElementNode):
    """Element whose ``template_content`` is a ``#document-fragment`` node.

    The fragment is only created when the element's namespace is ``"html"``;
    otherwise ``template_content`` is ``None``.
    """

    __slots__ = ()

    def __init__(
        self,
        name: str,
        attrs: dict[str, str | None] | None = None,
        data: str | None = None,
        namespace: str | None = None,
    ) -> None:
        # `data` is accepted but not used by this constructor.
        super().__init__(name, attrs, namespace)
        is_html = self.namespace == "html"
        self.template_content = SimpleDomNode("#document-fragment") if is_html else None

    def clone_node(self, deep: bool = False) -> TemplateNode:
        """Return a copy of this template element.

        With ``deep=True`` both the template content and the regular children
        are cloned recursively.
        """
        attrs_copy = self.attrs.copy() if self.attrs else {}
        duplicate = TemplateNode(self.name, attrs_copy, None, self.namespace)
        if not deep:
            return duplicate

        if self.template_content:
            duplicate.template_content = self.template_content.clone_node(deep=True)
        for child in self.children:
            duplicate.append_child(child.clone_node(deep=True))
        return duplicate
391
+
392
+
393
class TextNode:
    """Leaf node holding character data; its name is always ``"#text"``."""

    __slots__ = ("data", "name", "namespace", "parent")

    data: str | None
    name: str
    namespace: None
    parent: SimpleDomNode | ElementNode | TemplateNode | None

    def __init__(self, data: str | None) -> None:
        self.name = "#text"
        self.namespace = None
        self.parent = None
        self.data = data

    @property
    def text(self) -> str:
        """Return the text content of this node."""
        return self.data if self.data else ""

    def to_text(self, separator: str = " ", strip: bool = True) -> str:
        """Return this node's data, stripped when *strip* is true.

        `separator` is accepted for API consistency and is not used; a leaf
        has nothing to join.
        """
        value = self.data
        if value is None:
            return ""
        return value.strip() if strip else value

    def to_markdown(self) -> str:
        """Return the data as Markdown text: escaped and whitespace-collapsed."""
        escaped = _markdown_escape_text(self.data or "")
        builder = _MarkdownBuilder()
        builder.text(escaped, preserve_whitespace=False)
        return builder.finish()

    @property
    def children(self) -> list[Any]:
        """Return empty list for TextNode (leaf node)."""
        return []

    def has_child_nodes(self) -> bool:
        """Return False for TextNode."""
        return False

    def clone_node(self, deep: bool = False) -> TextNode:
        """Return a new TextNode with the same data; *deep* has no effect."""
        return TextNode(self.data)
436
+
437
+
438
+ _MARKDOWN_BLOCK_ELEMENTS: frozenset[str] = frozenset(
439
+ {
440
+ "p",
441
+ "div",
442
+ "section",
443
+ "article",
444
+ "header",
445
+ "footer",
446
+ "main",
447
+ "nav",
448
+ "aside",
449
+ "blockquote",
450
+ "pre",
451
+ "ul",
452
+ "ol",
453
+ "li",
454
+ "hr",
455
+ "h1",
456
+ "h2",
457
+ "h3",
458
+ "h4",
459
+ "h5",
460
+ "h6",
461
+ "table",
462
+ }
463
+ )
464
+
465
+
466
def _to_markdown_walk(node: Any, builder: _MarkdownBuilder, preserve_whitespace: bool, list_depth: int) -> None:
    """Render *node* and its descendants as Markdown into *builder*.

    Args:
        node: The node to render; dispatch is on ``node.name``.
        builder: Output buffer that receives the Markdown.
        preserve_whitespace: When true, text nodes are emitted verbatim
            (no escaping, no whitespace collapsing).
        list_depth: Current list nesting level; controls list-item indentation.

    Handling by node kind:
        - text: escaped and whitespace-collapsed unless *preserve_whitespace*.
        - ``br``: a single newline.
        - comments / doctype: ignored.
        - other ``#``-named containers: children only.
        - ``img`` / ``table``: emitted as raw HTML.
        - headings, ``hr``, ``pre``, ``code``, ``p``, ``blockquote``, lists,
          emphasis, strong and links: their Markdown equivalents.
        - anything else: children (and template content), followed by a blank
          line for tags listed in ``_MARKDOWN_BLOCK_ELEMENTS``.
    """
    name: str = node.name

    if name == "#text":
        if preserve_whitespace:
            builder.raw(node.data or "")
        else:
            builder.text(_markdown_escape_text(node.data or ""), preserve_whitespace=False)
        return

    if name == "br":
        builder.newline(1)
        return

    # Comments/doctype don't contribute.
    if name == "#comment" or name == "!doctype":
        return

    # Document containers contribute via descendants.
    if name.startswith("#"):
        if node.children:
            for child in node.children:
                _to_markdown_walk(child, builder, preserve_whitespace, list_depth)
        return

    tag = name.lower()

    # Preserve <img> and <table> as HTML.
    if tag == "img":
        builder.raw(node.to_html(indent=0, indent_size=2, pretty=False))
        return

    if tag == "table":
        builder.ensure_newlines(2 if builder._buf else 0)
        builder.raw(node.to_html(indent=0, indent_size=2, pretty=False))
        builder.ensure_newlines(2)
        return

    # Headings.
    if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}:
        builder.ensure_newlines(2 if builder._buf else 0)
        level = int(tag[1])
        builder.raw("#" * level)
        builder.raw(" ")
        if node.children:
            for child in node.children:
                _to_markdown_walk(child, builder, preserve_whitespace=False, list_depth=list_depth)
        builder.ensure_newlines(2)
        return

    # Horizontal rule.
    if tag == "hr":
        builder.ensure_newlines(2 if builder._buf else 0)
        builder.raw("---")
        builder.ensure_newlines(2)
        return

    # Code blocks.
    if tag == "pre":
        builder.ensure_newlines(2 if builder._buf else 0)
        code = node.to_text(separator="", strip=False)
        # Lengthen the fence until it does not occur in the code, so that
        # backtick runs inside the code cannot terminate the block early.
        fence = "```"
        while fence in code:
            fence += "`"
        builder.raw(fence)
        builder.newline(1)
        if code:
            builder.raw(code.rstrip("\n"))
            builder.newline(1)
        builder.raw(fence)
        builder.ensure_newlines(2)
        return

    # Inline code.
    if tag == "code" and not preserve_whitespace:
        code = node.to_text(separator="", strip=False)
        builder.raw(_markdown_code_span(code))
        return

    # Paragraph-like blocks.
    if tag == "p":
        builder.ensure_newlines(2 if builder._buf else 0)
        if node.children:
            for child in node.children:
                _to_markdown_walk(child, builder, preserve_whitespace=False, list_depth=list_depth)
        builder.ensure_newlines(2)
        return

    # Blockquotes.
    if tag == "blockquote":
        builder.ensure_newlines(2 if builder._buf else 0)
        # Render the quoted content separately so every line can be prefixed.
        inner = _MarkdownBuilder()
        if node.children:
            for child in node.children:
                _to_markdown_walk(child, inner, preserve_whitespace=False, list_depth=list_depth)
        text = inner.finish()
        if text:
            lines = text.split("\n")
            for i, line in enumerate(lines):
                if i:
                    builder.newline(1)
                builder.raw("> ")
                builder.raw(line)
        builder.ensure_newlines(2)
        return

    # Lists.
    if tag in {"ul", "ol"}:
        builder.ensure_newlines(2 if builder._buf else 0)
        ordered = tag == "ol"
        idx = 1
        for child in node.children or []:
            # Only <li> children produce items; anything else is skipped.
            if child.name.lower() != "li":
                continue
            if idx > 1:
                builder.newline(1)
            indent = " " * list_depth
            marker = f"{idx}. " if ordered else "- "
            builder.raw(indent)
            builder.raw(marker)
            # Render list item content inline-ish.
            for li_child in child.children or []:
                _to_markdown_walk(li_child, builder, preserve_whitespace=False, list_depth=list_depth + 1)
            idx += 1
        builder.ensure_newlines(2)
        return

    # Emphasis/strong.
    if tag in {"em", "i"}:
        builder.raw("*")
        for child in node.children or []:
            _to_markdown_walk(child, builder, preserve_whitespace=False, list_depth=list_depth)
        builder.raw("*")
        return

    if tag in {"strong", "b"}:
        builder.raw("**")
        for child in node.children or []:
            _to_markdown_walk(child, builder, preserve_whitespace=False, list_depth=list_depth)
        builder.raw("**")
        return

    # Links.
    if tag == "a":
        href = ""
        if node.attrs and "href" in node.attrs and node.attrs["href"] is not None:
            href = str(node.attrs["href"])

        builder.raw("[")
        for child in node.children or []:
            _to_markdown_walk(child, builder, preserve_whitespace=False, list_depth=list_depth)
        builder.raw("]")
        # Without an href only the bracketed link text is emitted.
        if href:
            builder.raw("(")
            builder.raw(href)
            builder.raw(")")
        return

    # Containers / unknown tags: recurse into children.
    next_preserve = preserve_whitespace or (tag in {"textarea", "script", "style"})
    if node.children:
        for child in node.children:
            _to_markdown_walk(child, builder, next_preserve, list_depth)

    if isinstance(node, ElementNode) and node.template_content:
        _to_markdown_walk(node.template_content, builder, next_preserve, list_depth)

    # Add spacing after block containers to keep output readable.
    if tag in _MARKDOWN_BLOCK_ELEMENTS:
        builder.ensure_newlines(2)