justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
justhtml/serialize.py CHANGED
@@ -2,35 +2,288 @@
2
2
 
3
3
  # ruff: noqa: PERF401
4
4
 
5
- from justhtml.constants import FOREIGN_ATTRIBUTE_ADJUSTMENTS, VOID_ELEMENTS
5
+ from __future__ import annotations
6
+
7
+ import re
8
+ from typing import Any
9
+
10
+ from .constants import FOREIGN_ATTRIBUTE_ADJUSTMENTS, SPECIAL_ELEMENTS, VOID_ELEMENTS, WHITESPACE_PRESERVING_ELEMENTS
11
+ from .sanitize import DEFAULT_DOCUMENT_POLICY, DEFAULT_POLICY, SanitizationPolicy, _sanitize
12
+
13
+ # Matches characters that prevent an attribute value from being unquoted.
14
+ # Note: This matches the logic of the previous loop-based implementation.
15
+ # It checks for space characters, quotes, equals sign, and greater-than.
16
+ _UNQUOTED_ATTR_VALUE_INVALID = re.compile(r'[ \t\n\f\r"\'=>]')
17
+
18
+
19
+ def _escape_text(text: str | None) -> str:
20
+ if not text:
21
+ return ""
22
+ # Minimal, but matches html5lib serializer expectations in core cases.
23
+ return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
24
+
25
+
26
+ def _choose_attr_quote(value: str | None, forced_quote_char: str | None = None) -> str:
27
+ if forced_quote_char in {'"', "'"}:
28
+ return forced_quote_char
29
+ if value is None:
30
+ return '"'
31
+ # value is assumed to be a string
32
+ if '"' in value and "'" not in value:
33
+ return "'"
34
+ return '"'
35
+
36
+
37
+ def _escape_attr_value(value: str | None, quote_char: str, *, escape_lt_in_attrs: bool = False) -> str:
38
+ if value is None:
39
+ return ""
40
+ # value is assumed to be a string
41
+ value = value.replace("&", "&amp;")
42
+ if escape_lt_in_attrs:
43
+ value = value.replace("<", "&lt;")
44
+ # Note: html5lib's default serializer does not escape '>' in attrs.
45
+ if quote_char == '"':
46
+ return value.replace('"', "&quot;")
47
+ return value.replace("'", "&#39;")
48
+
49
+
50
+ def _can_unquote_attr_value(value: str | None) -> bool:
51
+ if value is None:
52
+ return False
53
+ # Optimization: use regex instead of loop
54
+ return not _UNQUOTED_ATTR_VALUE_INVALID.search(value)
55
+
56
+
57
+ def _serializer_minimize_attr_value(name: str, value: str | None, minimize_boolean_attributes: bool) -> bool:
58
+ if not minimize_boolean_attributes:
59
+ return False
60
+ if value is None or value == "":
61
+ return True
62
+ if value == name:
63
+ return True
64
+ return value.lower() == name
65
+
66
+
67
+ def serialize_start_tag(
68
+ name: str,
69
+ attrs: dict[str, str | None] | None,
70
+ *,
71
+ quote_attr_values: bool = True,
72
+ minimize_boolean_attributes: bool = True,
73
+ quote_char: str | None = None,
74
+ escape_lt_in_attrs: bool = False,
75
+ use_trailing_solidus: bool = False,
76
+ is_void: bool = False,
77
+ ) -> str:
78
+ attrs = attrs or {}
79
+ parts: list[str] = ["<", name]
80
+ if attrs:
81
+ for key, value in attrs.items():
82
+ if _serializer_minimize_attr_value(key, value, minimize_boolean_attributes):
83
+ parts.extend([" ", key])
84
+ continue
85
+
86
+ if value is None:
87
+ parts.extend([" ", key, '=""'])
88
+ continue
89
+
90
+ # value is guaranteed to be a string here because attrs is dict[str, str | None]
91
+ value_str = value
92
+ if value_str == "":
93
+ parts.extend([" ", key, '=""'])
94
+ continue
95
+
96
+ if not quote_attr_values and _can_unquote_attr_value(value_str):
97
+ escaped = value_str.replace("&", "&amp;")
98
+ if escape_lt_in_attrs:
99
+ escaped = escaped.replace("<", "&lt;")
100
+ parts.extend([" ", key, "=", escaped])
101
+ else:
102
+ quote = _choose_attr_quote(value_str, quote_char)
103
+ escaped = _escape_attr_value(value_str, quote, escape_lt_in_attrs=escape_lt_in_attrs)
104
+ parts.extend([" ", key, "=", quote, escaped, quote])
105
+
106
+ if use_trailing_solidus and is_void:
107
+ parts.append(" />")
108
+ else:
109
+ parts.append(">")
110
+ return "".join(parts)
6
111
 
7
112
 
8
- def to_html(node, indent=0, indent_size=2, pretty=True):
113
+ def serialize_end_tag(name: str) -> str:
114
+ return f"</{name}>"
115
+
116
+
117
+ def to_html(
118
+ node: Any,
119
+ indent: int = 0,
120
+ indent_size: int = 2,
121
+ *,
122
+ pretty: bool = True,
123
+ safe: bool = True,
124
+ policy: SanitizationPolicy | None = None,
125
+ ) -> str:
9
126
  """Convert node to HTML string."""
127
+ if safe:
128
+ if policy is None and node.name == "#document":
129
+ node = _sanitize(node, policy=DEFAULT_DOCUMENT_POLICY)
130
+ else:
131
+ node = _sanitize(node, policy=policy or DEFAULT_POLICY)
10
132
  if node.name == "#document":
11
133
  # Document root - just render children
12
- parts = []
134
+ parts: list[str] = []
13
135
  for child in node.children or []:
14
- parts.append(_node_to_html(child, indent, indent_size, pretty))
136
+ parts.append(_node_to_html(child, indent, indent_size, pretty, in_pre=False))
15
137
  return "\n".join(parts) if pretty else "".join(parts)
16
- return _node_to_html(node, indent, indent_size, pretty)
138
+ return _node_to_html(node, indent, indent_size, pretty, in_pre=False)
139
+
140
+
141
+ def _collapse_html_whitespace(text: str) -> str:
142
+ """Collapse HTML whitespace runs to a single space and trim edges.
143
+
144
+ This matches how HTML rendering treats most whitespace in text nodes, and is
145
+ used only for pretty-printing in non-preformatted contexts.
146
+ """
147
+ if not text:
148
+ return ""
149
+
150
+ # Optimization: split() handles whitespace collapsing efficiently.
151
+ # Note: split() treats \v as whitespace, which is not HTML whitespace.
152
+ # But \v is extremely rare in HTML.
153
+ if "\v" in text:
154
+ parts: list[str] = []
155
+ in_whitespace = False
156
+ for ch in text:
157
+ if ch in {" ", "\t", "\n", "\f", "\r"}:
158
+ if not in_whitespace:
159
+ parts.append(" ")
160
+ in_whitespace = True
161
+ continue
17
162
 
163
+ parts.append(ch)
164
+ in_whitespace = False
18
165
 
19
- def _node_to_html(node, indent=0, indent_size=2, pretty=True):
166
+ collapsed = "".join(parts)
167
+ return collapsed.strip(" ")
168
+
169
+ return " ".join(text.split())
170
+
171
+
172
+ def _normalize_formatting_whitespace(text: str) -> str:
173
+ """Normalize formatting whitespace within a text node.
174
+
175
+ Converts newlines/tabs/CR/FF to regular spaces and collapses runs that
176
+ include such formatting whitespace to a single space.
177
+
178
+ Pure space runs are preserved as-is (so existing double-spaces remain).
179
+ """
180
+ if not text:
181
+ return ""
182
+
183
+ if "\n" not in text and "\r" not in text and "\t" not in text and "\f" not in text:
184
+ return text
185
+
186
+ starts_with_formatting = text[0] in {"\n", "\r", "\t", "\f"}
187
+ ends_with_formatting = text[-1] in {"\n", "\r", "\t", "\f"}
188
+
189
+ out: list[str] = []
190
+ in_ws = False
191
+ saw_formatting_ws = False
192
+
193
+ for ch in text:
194
+ if ch == " ":
195
+ if in_ws:
196
+ # Only collapse if this whitespace run included formatting whitespace.
197
+ if saw_formatting_ws:
198
+ continue
199
+ out.append(" ")
200
+ continue
201
+ in_ws = True
202
+ saw_formatting_ws = False
203
+ out.append(" ")
204
+ continue
205
+
206
+ if ch in {"\n", "\r", "\t", "\f"}:
207
+ if in_ws:
208
+ saw_formatting_ws = True
209
+ continue
210
+ in_ws = True
211
+ saw_formatting_ws = True
212
+ out.append(" ")
213
+ continue
214
+
215
+ in_ws = False
216
+ saw_formatting_ws = False
217
+ out.append(ch)
218
+
219
+ normalized = "".join(out)
220
+ if starts_with_formatting and normalized.startswith(" "):
221
+ normalized = normalized[1:]
222
+ if ends_with_formatting and normalized.endswith(" "):
223
+ normalized = normalized[:-1]
224
+ return normalized
225
+
226
+
227
+ def _is_whitespace_text_node(node: Any) -> bool:
228
+ return node.name == "#text" and (node.data or "").strip() == ""
229
+
230
+
231
+ def _should_pretty_indent_children(children: list[Any]) -> bool:
232
+ for child in children:
233
+ if child is None:
234
+ continue
235
+ name = child.name
236
+ if name == "#comment":
237
+ return False
238
+ if name == "#text" and (child.data or "").strip():
239
+ return False
240
+
241
+ element_children: list[Any] = [
242
+ child for child in children if child is not None and child.name not in {"#text", "#comment"}
243
+ ]
244
+ if not element_children:
245
+ return True
246
+ if len(element_children) == 1:
247
+ only_child = element_children[0]
248
+ if only_child.name in SPECIAL_ELEMENTS:
249
+ return True
250
+ if only_child.name == "a":
251
+ # If an anchor wraps block-ish content (valid HTML5), treat it as block-ish
252
+ # for pretty-printing so the parent can indent it on its own line.
253
+ for grandchild in only_child.children or []:
254
+ if grandchild is None:
255
+ continue
256
+ if grandchild.name in SPECIAL_ELEMENTS:
257
+ return True
258
+ return False
259
+
260
+ # Safe indentation rule: only insert inter-element whitespace when we won't
261
+ # be placing it between two adjacent inline/phrasing elements.
262
+ prev_is_special = element_children[0].name in SPECIAL_ELEMENTS
263
+ for child in element_children[1:]:
264
+ current_is_special = child.name in SPECIAL_ELEMENTS
265
+ if not prev_is_special and not current_is_special:
266
+ return False
267
+ prev_is_special = current_is_special
268
+ return True
269
+
270
+
271
+ def _node_to_html(node: Any, indent: int = 0, indent_size: int = 2, pretty: bool = True, *, in_pre: bool) -> str:
20
272
  """Helper to convert a node to HTML."""
21
- prefix = " " * (indent * indent_size) if pretty else ""
22
- newline = "\n" if pretty else ""
23
- name = node.name
273
+ prefix = " " * (indent * indent_size) if pretty and not in_pre else ""
274
+ name: str = node.name
275
+ content_pre = in_pre or name in WHITESPACE_PRESERVING_ELEMENTS
276
+ newline = "\n" if pretty and not content_pre else ""
24
277
 
25
278
  # Text node
26
279
  if name == "#text":
27
- text = node.data
28
- if pretty:
280
+ text: str | None = node.data
281
+ if pretty and not in_pre:
29
282
  text = text.strip() if text else ""
30
283
  if text:
31
- return f"{prefix}{text}"
284
+ return f"{prefix}{_escape_text(text)}"
32
285
  return ""
33
- return text or ""
286
+ return _escape_text(text) if text else ""
34
287
 
35
288
  # Comment node
36
289
  if name == "#comment":
@@ -42,58 +295,166 @@ def _node_to_html(node, indent=0, indent_size=2, pretty=True):
42
295
 
43
296
  # Document fragment
44
297
  if name == "#document-fragment":
45
- parts = []
298
+ parts: list[str] = []
46
299
  for child in node.children or []:
47
- child_html = _node_to_html(child, indent, indent_size, pretty)
300
+ child_html = _node_to_html(child, indent, indent_size, pretty, in_pre=in_pre)
48
301
  if child_html:
49
302
  parts.append(child_html)
50
303
  return newline.join(parts) if pretty else "".join(parts)
51
304
 
52
305
  # Element node
53
- attrs = node.attrs or {}
306
+ attrs: dict[str, str | None] = node.attrs or {}
54
307
 
55
308
  # Build opening tag
56
- attr_str = ""
57
- if attrs:
58
- attr_parts = []
59
- for key, value in attrs.items():
60
- if value is None:
61
- attr_parts.append(key)
62
- elif value == "":
63
- attr_parts.append(key)
64
- else:
65
- # Escape quotes in attribute values
66
- escaped = str(value).replace("&", "&amp;").replace('"', "&quot;")
67
- attr_parts.append(f'{key}="{escaped}"')
68
- if attr_parts: # pragma: no branch
69
- attr_str = " " + " ".join(attr_parts)
309
+ open_tag = serialize_start_tag(name, attrs)
70
310
 
71
311
  # Void elements
72
312
  if name in VOID_ELEMENTS:
73
- return f"{prefix}<{name}{attr_str}>"
313
+ return f"{prefix}{open_tag}"
74
314
 
75
315
  # Elements with children
76
- children = node.children or []
316
+ # Template special handling: HTML templates store contents in `template_content`.
317
+ if name == "template" and node.namespace in {None, "html"} and node.template_content is not None:
318
+ children: list[Any] = node.template_content.children or []
319
+ else:
320
+ children = node.children or []
77
321
  if not children:
78
- return f"{prefix}<{name}{attr_str}></{name}>"
322
+ return f"{prefix}{open_tag}{serialize_end_tag(name)}"
79
323
 
80
324
  # Check if all children are text-only (inline rendering)
81
- all_text = all(hasattr(c, "name") and c.name == "#text" for c in children)
325
+ all_text = all(c.name == "#text" for c in children)
326
+
327
+ if all_text and pretty and not content_pre:
328
+ # Serializer controls sanitization at the to_html() entry point; avoid
329
+ # implicit re-sanitization during rendering.
330
+ text_content = node.to_text(separator="", strip=False, safe=False)
331
+ text_content = _collapse_html_whitespace(text_content)
332
+ return f"{prefix}{open_tag}{_escape_text(text_content)}{serialize_end_tag(name)}"
333
+
334
+ if pretty and content_pre:
335
+ inner = "".join(
336
+ _node_to_html(child, indent + 1, indent_size, pretty, in_pre=True)
337
+ for child in children
338
+ if child is not None
339
+ )
340
+ return f"{prefix}{open_tag}{inner}{serialize_end_tag(name)}"
341
+
342
+ if pretty and not content_pre and not _should_pretty_indent_children(children):
343
+ # For block-ish elements that contain only element children and whitespace-only
344
+ # text nodes, we can still format each child on its own line (only when there
345
+ # is already whitespace separating element siblings).
346
+ if name in SPECIAL_ELEMENTS:
347
+ has_comment = False
348
+ has_element = False
349
+ has_whitespace_between_elements = False
350
+
351
+ first_element_index: int | None = None
352
+ last_element_index: int | None = None
353
+
354
+ previous_was_element = False
355
+ saw_whitespace_since_last_element = False
356
+ for i, child in enumerate(children):
357
+ if child is None:
358
+ continue
359
+ if child.name == "#comment":
360
+ has_comment = True
361
+ break
362
+ if child.name == "#text":
363
+ # Track whether there is already whitespace between element siblings.
364
+ if previous_was_element and not (child.data or "").strip():
365
+ saw_whitespace_since_last_element = True
366
+ continue
367
+
368
+ has_element = True
369
+ if first_element_index is None:
370
+ first_element_index = i
371
+ last_element_index = i
372
+ if previous_was_element and saw_whitespace_since_last_element:
373
+ has_whitespace_between_elements = True
374
+ previous_was_element = True
375
+ saw_whitespace_since_last_element = False
376
+
377
+ can_indent_non_whitespace_text = True
378
+ if has_element and first_element_index is not None and last_element_index is not None:
379
+ for i, child in enumerate(children):
380
+ if child is None or child.name != "#text":
381
+ continue
382
+ if not (child.data or "").strip():
383
+ continue
384
+ # Only allow non-whitespace text *after* the last element.
385
+ # Leading text or text between elements could gain new spaces
386
+ # due to indentation/newlines.
387
+ if i < first_element_index or first_element_index < i < last_element_index:
388
+ can_indent_non_whitespace_text = False
389
+ break
390
+
391
+ if has_element and has_whitespace_between_elements and not has_comment and can_indent_non_whitespace_text:
392
+ inner_lines: list[str] = []
393
+ for child in children:
394
+ if child is None:
395
+ continue
396
+ if child.name == "#text":
397
+ text = _collapse_html_whitespace(child.data or "")
398
+ if text:
399
+ inner_lines.append(f"{' ' * ((indent + 1) * indent_size)}{_escape_text(text)}")
400
+ continue
401
+ child_html = _node_to_html(child, indent + 1, indent_size, pretty=True, in_pre=content_pre)
402
+ if child_html:
403
+ inner_lines.append(child_html)
404
+ if inner_lines:
405
+ inner = "\n".join(inner_lines)
406
+ return f"{prefix}{open_tag}\n{inner}\n{prefix}{serialize_end_tag(name)}"
407
+
408
+ inner_parts: list[str] = []
409
+
410
+ first_non_none_index: int | None = None
411
+ last_non_none_index: int | None = None
412
+ for i, child in enumerate(children):
413
+ if child is None:
414
+ continue
415
+ if first_non_none_index is None:
416
+ first_non_none_index = i
417
+ last_non_none_index = i
418
+
419
+ for i, child in enumerate(children):
420
+ if child is None:
421
+ continue
422
+
423
+ if child.name == "#text":
424
+ data = child.data or ""
425
+ if not data.strip():
426
+ # Drop leading/trailing formatting whitespace in compact mode.
427
+ if i == first_non_none_index or i == last_non_none_index:
428
+ continue
429
+ # Preserve intentional small spacing, but collapse large formatting gaps.
430
+ if "\n" in data or "\r" in data or "\t" in data or len(data) > 2:
431
+ inner_parts.append(" ")
432
+ continue
433
+
434
+ data = _normalize_formatting_whitespace(data)
435
+ child_html = _escape_text(data) if data else ""
436
+ else:
437
+ # Even when we can't safely insert whitespace *between* siblings, we can
438
+ # still pretty-print each element subtree to improve readability.
439
+ child_html = _node_to_html(child, 0, indent_size, pretty=True, in_pre=content_pre)
440
+ if child_html:
441
+ inner_parts.append(child_html)
82
442
 
83
- if all_text and pretty:
84
- return f"{prefix}<{name}{attr_str}>{node.text}</{name}>"
443
+ return f"{prefix}{open_tag}{''.join(inner_parts)}{serialize_end_tag(name)}"
85
444
 
86
445
  # Render with child indentation
87
- parts = [f"{prefix}<{name}{attr_str}>"]
446
+ parts = [f"{prefix}{open_tag}"]
88
447
  for child in children:
89
- child_html = _node_to_html(child, indent + 1, indent_size, pretty)
448
+ if pretty and not content_pre and _is_whitespace_text_node(child):
449
+ continue
450
+ child_html = _node_to_html(child, indent + 1, indent_size, pretty, in_pre=content_pre)
90
451
  if child_html:
91
452
  parts.append(child_html)
92
- parts.append(f"{prefix}</{name}>")
453
+ parts.append(f"{prefix}{serialize_end_tag(name)}")
93
454
  return newline.join(parts) if pretty else "".join(parts)
94
455
 
95
456
 
96
- def to_test_format(node, indent=0):
457
+ def to_test_format(node: Any, indent: int = 0) -> str:
97
458
  """Convert node to html5lib test format string.
98
459
 
99
460
  This format is used by html5lib-tests for validating parser output.
@@ -105,26 +466,26 @@ def to_test_format(node, indent=0):
105
466
  return _node_to_test_format(node, indent)
106
467
 
107
468
 
108
- def _node_to_test_format(node, indent):
469
+ def _node_to_test_format(node: Any, indent: int) -> str:
109
470
  """Helper to convert a node to test format."""
110
471
  if node.name == "#comment":
111
- comment = node.data or ""
472
+ comment: str = node.data or ""
112
473
  return f"| {' ' * indent}<!-- {comment} -->"
113
474
 
114
475
  if node.name == "!doctype":
115
476
  return _doctype_to_test_format(node)
116
477
 
117
478
  if node.name == "#text":
118
- text = node.data or ""
479
+ text: str = node.data or ""
119
480
  return f'| {" " * indent}"{text}"'
120
481
 
121
482
  # Regular element
122
483
  line = f"| {' ' * indent}<{_qualified_name(node)}>"
123
484
  attribute_lines = _attrs_to_test_format(node, indent)
124
485
 
125
- # Template special handling
126
- if node.name == "template" and hasattr(node, "template_content") and node.template_content:
127
- sections = [line]
486
+ # Template special handling (only HTML namespace templates have template_content)
487
+ if node.name == "template" and node.namespace in {None, "html"} and node.template_content is not None:
488
+ sections: list[str] = [line]
128
489
  if attribute_lines:
129
490
  sections.extend(attribute_lines)
130
491
  content_line = f"| {' ' * (indent + 2)}content"
@@ -142,24 +503,24 @@ def _node_to_test_format(node, indent):
142
503
  return "\n".join(sections)
143
504
 
144
505
 
145
- def _qualified_name(node):
506
+ def _qualified_name(node: Any) -> str:
146
507
  """Get the qualified name of a node (with namespace prefix if needed)."""
147
508
  if node.namespace and node.namespace not in {"html", None}:
148
509
  return f"{node.namespace} {node.name}"
149
- return node.name
510
+ return str(node.name)
150
511
 
151
512
 
152
- def _attrs_to_test_format(node, indent):
513
+ def _attrs_to_test_format(node: Any, indent: int) -> list[str]:
153
514
  """Format element attributes for test output."""
154
515
  if not node.attrs:
155
516
  return []
156
517
 
157
- formatted = []
518
+ formatted: list[str] = []
158
519
  padding = " " * (indent + 2)
159
520
 
160
521
  # Prepare display names for sorting
161
- display_attrs = []
162
- namespace = node.namespace
522
+ display_attrs: list[tuple[str, str]] = []
523
+ namespace: str | None = node.namespace
163
524
  for attr_name, attr_value in node.attrs.items():
164
525
  value = attr_value or ""
165
526
  display_name = attr_name
@@ -177,15 +538,15 @@ def _attrs_to_test_format(node, indent):
177
538
  return formatted
178
539
 
179
540
 
180
- def _doctype_to_test_format(node):
541
+ def _doctype_to_test_format(node: Any) -> str:
181
542
  """Format DOCTYPE node for test output."""
182
543
  doctype = node.data
183
544
 
184
- name = doctype.name or ""
185
- public_id = doctype.public_id
186
- system_id = doctype.system_id
545
+ name: str = doctype.name or ""
546
+ public_id: str | None = doctype.public_id
547
+ system_id: str | None = doctype.system_id
187
548
 
188
- parts = ["| <!DOCTYPE"]
549
+ parts: list[str] = ["| <!DOCTYPE"]
189
550
  if name:
190
551
  parts.append(f" {name}")
191
552
  else:
justhtml/stream.py CHANGED
@@ -1,15 +1,33 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ if TYPE_CHECKING:
6
+ from collections.abc import Generator
7
+
8
+ from .encoding import decode_html
1
9
  from .tokenizer import Tokenizer
2
10
  from .tokens import CommentToken, DoctypeToken, Tag
3
11
 
12
+ # Type alias for stream events
13
+ StreamEvent = tuple[str, Any]
14
+
15
+
16
+ class _DummyNode:
17
+ namespace: str = "html"
18
+
4
19
 
5
20
  class StreamSink:
6
21
  """A sink that buffers tokens for the stream API."""
7
22
 
8
- def __init__(self):
23
+ tokens: list[StreamEvent]
24
+ open_elements: list[_DummyNode]
25
+
26
+ def __init__(self) -> None:
9
27
  self.tokens = []
10
28
  self.open_elements = [] # Required by tokenizer for rawtext checks
11
29
 
12
- def process_token(self, token):
30
+ def process_token(self, token: Tag | CommentToken | DoctypeToken | Any) -> int:
13
31
  # Tokenizer reuses token objects, so we must copy data
14
32
  if isinstance(token, Tag):
15
33
  # Copy tag data
@@ -24,10 +42,7 @@ class StreamSink:
24
42
  # We need a dummy object with namespace for tokenizer checks
25
43
  # Tokenizer checks: stack[-1].namespace
26
44
  # We can just use a simple object
27
- class DummyNode:
28
- namespace = "html"
29
-
30
- self.open_elements.append(DummyNode())
45
+ self.open_elements.append(_DummyNode())
31
46
  else: # Tag.END
32
47
  if self.open_elements:
33
48
  self.open_elements.pop()
@@ -43,19 +58,28 @@ class StreamSink:
43
58
 
44
59
  return 0 # TokenSinkResult.Continue
45
60
 
46
- def process_characters(self, data):
61
+ def process_characters(self, data: str) -> None:
47
62
  """Handle character data from tokenizer."""
48
63
  self.tokens.append(("text", data))
49
64
 
50
65
 
51
- def stream(html):
66
+ def stream(
67
+ html: str | bytes | bytearray | memoryview,
68
+ *,
69
+ encoding: str | None = None,
70
+ ) -> Generator[StreamEvent, None, None]:
52
71
  """
53
72
  Stream HTML events from the given HTML string.
54
73
  Yields tuples of (event_type, data).
55
74
  """
75
+ html_str: str
76
+ if isinstance(html, (bytes, bytearray, memoryview)):
77
+ html_str, _ = decode_html(bytes(html), transport_encoding=encoding)
78
+ else:
79
+ html_str = html
56
80
  sink = StreamSink()
57
81
  tokenizer = Tokenizer(sink)
58
- tokenizer.initialize(html)
82
+ tokenizer.initialize(html_str)
59
83
 
60
84
  while True:
61
85
  # Run one step of the tokenizer
@@ -64,7 +88,7 @@ def stream(html):
64
88
  # Yield any tokens produced by this step
65
89
  if sink.tokens:
66
90
  # Coalesce text tokens
67
- text_buffer = []
91
+ text_buffer: list[str] = []
68
92
  for event, data in sink.tokens:
69
93
  if event == "text":
70
94
  text_buffer.append(data)