justhtml 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
justhtml/serialize.py ADDED
@@ -0,0 +1,201 @@
1
+ """HTML serialization utilities for JustHTML DOM nodes."""
2
+
3
+ # ruff: noqa: PERF401
4
+
5
+ from justhtml.constants import FOREIGN_ATTRIBUTE_ADJUSTMENTS, VOID_ELEMENTS
6
+
7
+
8
def to_html(node, indent=0, indent_size=2, pretty=True):
    """Serialize ``node`` and its descendants to an HTML string.

    Args:
        node: DOM node to serialize. A ``#document`` node is rendered as the
            sequence of its children; any other node is rendered directly.
        indent: Starting indentation level (only applied when ``pretty``).
        indent_size: Number of spaces per indentation level.
        pretty: When true, children of the document are joined with
            newlines; when false, they are concatenated as-is.

    Returns:
        The HTML markup as a string (empty for a document without children).
    """
    if node.name != "#document":
        return _node_to_html(node, indent, indent_size, pretty)

    # Document root - just render children. Children that render to an empty
    # string (e.g. whitespace-only text in pretty mode) are dropped so they
    # do not leave blank lines in the output, matching how
    # ``#document-fragment`` children are joined.
    parts = []
    for child in node.children or []:
        child_html = _node_to_html(child, indent, indent_size, pretty)
        if child_html:
            parts.append(child_html)
    return "\n".join(parts) if pretty else "".join(parts)
17
+
18
+
19
# Elements whose text children are written out verbatim instead of being
# escaped (per the HTML fragment serialization algorithm).
_RAW_TEXT_ELEMENTS = frozenset(
    {"style", "script", "xmp", "iframe", "noembed", "noframes", "plaintext"}
)


def _escape_text(text):
    """Escape ``&``, ``<`` and ``>`` so ``text`` is safe as HTML character data."""
    # "&" must be replaced first so the entities produced below survive.
    return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")


def _node_to_html(node, indent=0, indent_size=2, pretty=True, *, _raw_text=False):
    """Serialize a single node and its subtree to HTML.

    Text content is escaped (``&``, ``<``, ``>``) so that serialized text
    cannot be re-parsed as markup. Text inside raw-text elements such as
    ``<script>`` and ``<style>`` is emitted verbatim.

    Args:
        node: Node to serialize. Dispatch is on ``node.name``: ``#text``,
            ``#comment``, ``!doctype``, ``#document-fragment``; anything
            else is treated as an element.
        indent: Current indentation level (only applied when ``pretty``).
        indent_size: Number of spaces per indentation level.
        pretty: When true, each node is placed on its own indented line and
            text is stripped of surrounding whitespace.
        _raw_text: Internal flag, set while rendering the children of a
            raw-text element; disables text escaping. Callers outside this
            function should leave it at its default.

    Returns:
        The markup for the node. A text node that is empty (or
        whitespace-only in pretty mode) yields an empty string.
    """
    prefix = " " * (indent * indent_size) if pretty else ""
    newline = "\n" if pretty else ""
    name = node.name

    # Text node
    if name == "#text":
        text = node.data or ""
        if pretty:
            text = text.strip()
        if not text:
            return ""
        if not _raw_text:
            text = _escape_text(text)
        return f"{prefix}{text}"

    # Comment node
    if name == "#comment":
        return f"{prefix}<!--{node.data or ''}-->"

    # Doctype
    if name == "!doctype":
        # NOTE(review): always emits the HTML5 doctype; the node's own
        # name/public/system identifiers are not consulted here.
        return f"{prefix}<!DOCTYPE html>"

    # Document fragment: children are rendered at the same indent level and
    # empty renderings are skipped.
    if name == "#document-fragment":
        parts = []
        for child in node.children or []:
            child_html = _node_to_html(child, indent, indent_size, pretty)
            if child_html:
                parts.append(child_html)
        return newline.join(parts) if pretty else "".join(parts)

    # Element node: build the attribute string for the opening tag.
    attr_parts = []
    for key, value in (node.attrs or {}).items():
        if value is None or value == "":
            # Valueless attributes are written as a bare name.
            attr_parts.append(key)
        else:
            # Escape ampersands and quotes in attribute values
            escaped = str(value).replace("&", "&amp;").replace('"', "&quot;")
            attr_parts.append(f'{key}="{escaped}"')
    attr_str = (" " + " ".join(attr_parts)) if attr_parts else ""

    # Void elements have no end tag and no children.
    if name in VOID_ELEMENTS:
        return f"{prefix}<{name}{attr_str}>"

    children = node.children or []
    if not children:
        return f"{prefix}<{name}{attr_str}></{name}>"

    # Raw-text handling only applies to HTML elements; a node without a
    # ``namespace`` attribute is treated as HTML.
    raw_children = name in _RAW_TEXT_ELEMENTS and getattr(node, "namespace", None) in (None, "html")

    # Text-only elements are rendered inline on a single line in pretty mode.
    all_text = all(hasattr(c, "name") and c.name == "#text" for c in children)
    if all_text and pretty:
        # presumably ``node.text`` is the combined text of the children;
        # verify against the node class.
        inner = node.text or ""
        if not raw_children:
            inner = _escape_text(inner)
        return f"{prefix}<{name}{attr_str}>{inner}</{name}>"

    # Render with child indentation
    parts = [f"{prefix}<{name}{attr_str}>"]
    for child in children:
        child_html = _node_to_html(child, indent + 1, indent_size, pretty, _raw_text=raw_children)
        if child_html:
            parts.append(child_html)
    parts.append(f"{prefix}</{name}>")
    return newline.join(parts) if pretty else "".join(parts)
94
+
95
+
96
def to_test_format(node, indent=0):
    """Convert node to html5lib test format string.

    This format is used by html5lib-tests for validating parser output.
    Uses '| ' prefixes and specific indentation rules.

    Args:
        node: Node to render. For ``#document`` and ``#document-fragment``
            the children are rendered at indent 0 and joined with newlines;
            the container itself produces no line.
        indent: Indentation (in spaces) for a non-container ``node``.

    Returns:
        The test-format text (empty for a container without children).
    """
    if node.name in {"#document", "#document-fragment"}:
        # Guard against ``children`` being None, as the HTML serializer in
        # this module does, instead of raising TypeError.
        return "\n".join(_node_to_test_format(child, 0) for child in node.children or [])
    return _node_to_test_format(node, indent)
106
+
107
+
108
def _node_to_test_format(node, indent):
    """Render ``node`` and its subtree as html5lib test-format lines.

    Comments, doctypes and text nodes produce a single line. Elements
    produce their tag line, then their attribute lines, then their children
    indented by two more spaces. A ``template`` element with a truthy
    ``template_content`` instead emits a ``content`` line followed by the
    content's children indented by four more spaces.
    """
    node_name = node.name

    if node_name == "!doctype":
        return _doctype_to_test_format(node)

    pad = " " * indent
    if node_name == "#comment":
        return f"| {pad}<!-- {node.data or ''} -->"
    if node_name == "#text":
        return f'| {pad}"{node.data or ""}"'

    # Element: tag line first, attribute lines directly below it.
    lines = [f"| {pad}<{_qualified_name(node)}>"]
    lines.extend(_attrs_to_test_format(node, indent))

    # Only templates are checked for separate template contents.
    content = getattr(node, "template_content", None) if node_name == "template" else None
    if content:
        lines.append(f"| {' ' * (indent + 2)}content")
        for child in content.children:
            lines.append(_node_to_test_format(child, indent + 4))
    elif node.children:
        for child in node.children:
            lines.append(_node_to_test_format(child, indent + 2))

    return "\n".join(lines)
143
+
144
+
145
def _qualified_name(node):
    """Return the node's name, prefixed with its namespace unless that is HTML.

    A falsy namespace (``None`` or empty) and the ``"html"`` namespace both
    yield the bare name; any other namespace yields ``"<namespace> <name>"``.
    """
    ns = node.namespace
    if not ns or ns == "html":
        return node.name
    return f"{ns} {node.name}"
150
+
151
+
152
def _attrs_to_test_format(node, indent):
    """Return the test-format lines for an element's attributes.

    Each attribute becomes ``| <padding>name="value"`` with the padding two
    spaces deeper than ``indent``. Falsy values are shown as an empty
    string. On elements outside the HTML namespace, attribute names listed
    in ``FOREIGN_ATTRIBUTE_ADJUSTMENTS`` (matched case-insensitively) are
    displayed with ``:`` replaced by a space. Lines are sorted by the
    displayed name; an element without attributes yields an empty list.
    """
    attrs = node.attrs
    if not attrs:
        return []

    lead = " " * (indent + 2)
    ns = node.namespace
    is_foreign = bool(ns) and ns not in {None, "html"}

    def shown_name(raw_name):
        # Only foreign elements get the adjusted (space-separated) display name.
        if is_foreign and raw_name.lower() in FOREIGN_ATTRIBUTE_ADJUSTMENTS:
            return raw_name.replace(":", " ")
        return raw_name

    # Sort by display name for canonical test output
    pairs = sorted(
        ((shown_name(raw_name), raw_value or "") for raw_name, raw_value in attrs.items()),
        key=lambda pair: pair[0],
    )
    return [f'| {lead}{shown}="{text}"' for shown, text in pairs]
178
+
179
+
180
def _doctype_to_test_format(node):
    """Return the test-format line for a DOCTYPE node.

    Reads ``name``, ``public_id`` and ``system_id`` from ``node.data``. The
    line is ``| <!DOCTYPE <name>>``; when either identifier is not ``None``
    both are appended as quoted strings, with a missing one shown as ``""``.
    """
    info = node.data
    doctype_name = info.name or ""
    public_id = info.public_id
    system_id = info.system_id

    # A nameless doctype still gets the separating space after the keyword.
    line = f"| <!DOCTYPE {doctype_name}"

    if not (public_id is None and system_id is None):
        public_text = "" if public_id is None else public_id
        system_text = "" if system_id is None else system_id
        line += f' "{public_text}" "{system_text}"'

    return line + ">"
justhtml/stream.py ADDED
@@ -0,0 +1,83 @@
1
+ from .tokenizer import Tokenizer
2
+ from .tokens import CommentToken, DoctypeToken, Tag
3
+
4
+
5
class StreamSink:
    """A sink that buffers tokens for the stream API.

    Tokens are stored in ``tokens`` as ``(event, data)`` tuples:

    * ``("start", (name, attrs))`` for start tags,
    * ``("end", name)`` for end tags,
    * ``("comment", data)`` for comments,
    * ``("doctype", (name, public_id, system_id))`` for doctypes,
    * ``("text", data)`` for character data.
    """

    class _OpenElement:
        """Stand-in stack entry exposing only the ``namespace`` attribute."""

        __slots__ = ()
        namespace = "html"

    # The placeholder carries no per-element state, so one shared instance
    # is pushed for every start tag instead of building a new class each time.
    _OPEN_ELEMENT = _OpenElement()

    def __init__(self):
        self.tokens = []
        self.open_elements = []  # Required by tokenizer for rawtext checks

    def process_token(self, token):
        """Record ``token`` as an event tuple and return the continue code."""
        # Tokenizer reuses token objects, so we must copy data
        if isinstance(token, Tag):
            if token.kind == Tag.START:
                self.tokens.append(("start", (token.name, token.attrs.copy())))
                # Maintain open_elements stack for tokenizer's rawtext checks.
                # Tokenizer checks: stack[-1].namespace
                self.open_elements.append(self._OPEN_ELEMENT)
            else:  # Tag.END
                self.tokens.append(("end", token.name))
                # An end tag arriving with an empty stack is an unmatched end
                # tag at the root level; it is ignored for rawtext tracking.
                if self.open_elements:
                    self.open_elements.pop()

        elif isinstance(token, CommentToken):
            self.tokens.append(("comment", token.data))

        elif isinstance(token, DoctypeToken):
            dt = token.doctype
            self.tokens.append(("doctype", (dt.name, dt.public_id, dt.system_id)))

        return 0  # TokenSinkResult.Continue

    def process_characters(self, data):
        """Handle character data from tokenizer."""
        self.tokens.append(("text", data))
49
+
50
+
51
def stream(html):
    """
    Stream HTML events from the given HTML string.

    Yields tuples of (event_type, data). The tokenizer is advanced one step
    at a time; consecutive text events buffered during a single step are
    merged into one ``("text", ...)`` event before being yielded.
    """
    sink = StreamSink()
    tokenizer = Tokenizer(sink)
    tokenizer.initialize(html)

    finished = False
    while not finished:
        # Advance the tokenizer by one step; a truthy result means end of input.
        finished = tokenizer.step()

        pending = sink.tokens
        if not pending:
            continue

        # Emit this step's events, merging adjacent text into a single event.
        text_run = []
        for kind, payload in pending:
            if kind == "text":
                text_run.append(payload)
                continue
            if text_run:
                yield ("text", "".join(text_run))
                text_run = []
            yield (kind, payload)

        if text_run:
            yield ("text", "".join(text_run))

        pending.clear()