justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
justhtml/tokenizer.py CHANGED
@@ -1,11 +1,17 @@
1
+ from __future__ import annotations
2
+
1
3
  import re
2
4
  from bisect import bisect_right
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ if TYPE_CHECKING:
8
+ from collections.abc import Callable
3
9
 
4
10
  from .entities import decode_entities_in_text
5
11
  from .errors import generate_error_message
6
- from .tokens import CommentToken, Doctype, DoctypeToken, EOFToken, ParseError, Tag
12
+ from .tokens import AnyToken, CommentToken, Doctype, DoctypeToken, EOFToken, ParseError, Tag
7
13
 
8
- _ATTR_VALUE_UNQUOTED_TERMINATORS = "\t\n\f >&\"'<=`\r\0"
14
+ _ATTR_VALUE_UNQUOTED_TERMINATORS = "\t\n\f >&\"'<=`\0"
9
15
  _ASCII_LOWER_TABLE = str.maketrans({chr(code): chr(code + 32) for code in range(65, 91)})
10
16
  _RCDATA_ELEMENTS = {"title", "textarea"}
11
17
  _RAWTEXT_SWITCH_TAGS = {
@@ -23,8 +29,8 @@ _ATTR_VALUE_DOUBLE_PATTERN = re.compile(r'["&\0]')
23
29
  _ATTR_VALUE_SINGLE_PATTERN = re.compile(r"['&\0]")
24
30
  _ATTR_VALUE_UNQUOTED_PATTERN = re.compile(f"[{re.escape(_ATTR_VALUE_UNQUOTED_TERMINATORS)}]")
25
31
 
26
- _TAG_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />\0\r]+")
27
- _ATTR_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />=\0\"'<\r]+")
32
+ _TAG_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />\0]+")
33
+ _ATTR_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />=\0\"'<]+")
28
34
  _COMMENT_RUN_PATTERN = re.compile(r"[^-\0]+")
29
35
  _WHITESPACE_PATTERN = re.compile(r"[ \t\n\f]+")
30
36
 
@@ -38,13 +44,20 @@ for _plane in range(17):
38
44
  _XML_COERCION_PATTERN = re.compile(r"[\f\uFDD0-\uFDEF" + "".join(_xml_invalid_single_chars) + "]")
39
45
 
40
46
 
41
- def _xml_coercion_callback(match):
47
+ def _is_noncharacter_codepoint(codepoint: int) -> bool:
48
+ if 0xFDD0 <= codepoint <= 0xFDEF:
49
+ return True
50
+ last = codepoint & 0xFFFF
51
+ return last == 0xFFFE or last == 0xFFFF
52
+
53
+
54
+ def _xml_coercion_callback(match: re.Match[str]) -> str:
42
55
  if match.group(0) == "\f":
43
56
  return " "
44
57
  return "\ufffd"
45
58
 
46
59
 
47
- def _coerce_text_for_xml(text):
60
+ def _coerce_text_for_xml(text: str) -> str:
48
61
  """Apply XML coercion to text content."""
49
62
  # Fast path for ASCII
50
63
  if text.isascii():
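
The new _is_noncharacter_codepoint helper mirrors the Unicode definition of noncharacters: U+FDD0–U+FDEF, plus the last two code points (..FFFE/..FFFF) of every plane. A few illustrative checks of the expected behaviour (not part of the package code):

    assert _is_noncharacter_codepoint(0xFDD0)       # inside the U+FDD0..U+FDEF block
    assert _is_noncharacter_codepoint(0x1FFFE)      # plane 1, ends in FFFE
    assert _is_noncharacter_codepoint(0x10FFFF)     # last code point of plane 16, ends in FFFF
    assert not _is_noncharacter_codepoint(0xFFFD)   # U+FFFD REPLACEMENT CHARACTER is ordinary
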
@@ -57,7 +70,7 @@ def _coerce_text_for_xml(text):
57
70
  return _XML_COERCION_PATTERN.sub(_xml_coercion_callback, text)
58
71
 
59
72
 
60
- def _coerce_comment_for_xml(text):
73
+ def _coerce_comment_for_xml(text: str) -> str:
61
74
  """Apply XML coercion to comment content - handle double hyphens."""
62
75
  # Replace -- with - - (with space)
63
76
  if "--" in text:
@@ -68,14 +81,20 @@ def _coerce_comment_for_xml(text):
68
81
  class TokenizerOpts:
69
82
  __slots__ = ("discard_bom", "exact_errors", "initial_rawtext_tag", "initial_state", "xml_coercion")
70
83
 
84
+ discard_bom: bool
85
+ exact_errors: bool
86
+ initial_rawtext_tag: str | None
87
+ initial_state: int | None
88
+ xml_coercion: bool
89
+
71
90
  def __init__(
72
91
  self,
73
- exact_errors=False,
74
- discard_bom=True,
75
- initial_state=None,
76
- initial_rawtext_tag=None,
77
- xml_coercion=False,
78
- ):
92
+ exact_errors: bool = False,
93
+ discard_bom: bool = True,
94
+ initial_state: int | None = None,
95
+ initial_rawtext_tag: str | None = None,
96
+ xml_coercion: bool = False,
97
+ ) -> None:
79
98
  self.exact_errors = bool(exact_errors)
80
99
  self.discard_bom = bool(discard_bom)
81
100
  self.initial_state = initial_state
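
The bare class-level annotations added to TokenizerOpts coexist with __slots__ because an annotation without an assignment only records the name in __annotations__; it does not create a class attribute that would shadow the slot descriptor. A minimal sketch of the same pattern (illustrative, not from the package):

    class Opts:
        __slots__ = ("exact_errors",)

        exact_errors: bool  # annotation only; no class attribute is created

        def __init__(self, exact_errors: bool = False) -> None:
            self.exact_errors = bool(exact_errors)

    opts = Opts(True)
    opts.exact_errors = False   # fine: "exact_errors" is a declared slot
    # opts.other = 1            # would raise AttributeError: not in __slots__
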
@@ -166,11 +185,12 @@ class Tokenizer:
166
185
  "current_tag_kind",
167
186
  "current_tag_name",
168
187
  "current_tag_self_closing",
188
+ "current_token_start_pos",
169
189
  "errors",
170
- "ignore_lf",
171
190
  "last_start_tag_name",
172
191
  "last_token_column",
173
192
  "last_token_line",
193
+ "last_token_start_pos",
174
194
  "length",
175
195
  "opts",
176
196
  "original_tag_name",
@@ -182,14 +202,61 @@ class Tokenizer:
182
202
  "temp_buffer",
183
203
  "text_buffer",
184
204
  "text_start_pos",
205
+ "track_node_locations",
185
206
  )
186
207
 
208
+ _comment_token: CommentToken
209
+ _newline_positions: list[int] | None
210
+ _state_handlers: list[Callable[[Tokenizer], bool]]
211
+ _tag_token: Tag
212
+ buffer: str
213
+ collect_errors: bool
214
+ track_node_locations: bool
215
+ current_attr_name: list[str]
216
+ current_attr_value: list[str]
217
+ current_attr_value_has_amp: bool
218
+ current_char: str | None
219
+ current_comment: list[str]
220
+ current_doctype_force_quirks: bool
221
+ current_doctype_name: list[str]
222
+ current_doctype_public: list[str] | None
223
+ current_doctype_system: list[str] | None
224
+ current_tag_attrs: dict[str, str | None]
225
+ current_tag_kind: int
226
+ current_tag_name: list[str]
227
+ current_tag_self_closing: bool
228
+ current_token_start_pos: int
229
+ errors: list[ParseError]
230
+ last_start_tag_name: str | None
231
+ last_token_column: int
232
+ last_token_line: int
233
+ last_token_start_pos: int | None
234
+ length: int
235
+ opts: TokenizerOpts
236
+ original_tag_name: list[str]
237
+ pos: int
238
+ rawtext_tag_name: str | None
239
+ reconsume: bool
240
+ sink: Any
241
+ state: int
242
+ temp_buffer: list[str]
243
+ text_buffer: list[str]
244
+ text_start_pos: int
245
+
187
246
  # _STATE_HANDLERS is defined at the end of the file
188
247
 
189
- def __init__(self, sink, opts=None, collect_errors=False):
248
+ def __init__(
249
+ self,
250
+ sink: Any,
251
+ opts: TokenizerOpts | None = None,
252
+ *,
253
+ collect_errors: bool = False,
254
+ track_node_locations: bool = False,
255
+ ) -> None:
190
256
  self.sink = sink
191
257
  self.opts = opts or TokenizerOpts()
192
258
  self.collect_errors = collect_errors
259
+ self.track_node_locations = bool(track_node_locations)
193
260
  self.errors = []
194
261
 
195
262
  self.state = self.DATA
@@ -198,9 +265,10 @@ class Tokenizer:
198
265
  self.pos = 0
199
266
  self.reconsume = False
200
267
  self.current_char = ""
201
- self.ignore_lf = False
202
268
  self.last_token_line = 1
203
269
  self.last_token_column = 0
270
+ self.current_token_start_pos = 0
271
+ self.last_token_start_pos = None
204
272
 
205
273
  # Reusable buffers to avoid per-token allocations.
206
274
  self.text_buffer = []
@@ -224,18 +292,24 @@ class Tokenizer:
224
292
  self._tag_token = Tag(Tag.START, "", {}, False)
225
293
  self._comment_token = CommentToken("")
226
294
 
227
- def initialize(self, html):
295
+ def initialize(self, html: str | None) -> None:
228
296
  if html and html[0] == "\ufeff" and self.opts.discard_bom:
229
297
  html = html[1:]
230
298
 
299
+ # Normalize newlines per §13.2.2.5
300
+ if html:
301
+ if "\r" in html:
302
+ html = html.replace("\r\n", "\n").replace("\r", "\n")
303
+
231
304
  self.buffer = html or ""
232
305
  self.length = len(self.buffer)
233
306
  self.pos = 0
234
307
  self.reconsume = False
235
308
  self.current_char = ""
236
- self.ignore_lf = False
237
309
  self.last_token_line = 1
238
310
  self.last_token_column = 0
311
+ self.current_token_start_pos = 0
312
+ self.last_token_start_pos = None
239
313
  self.errors = []
240
314
  self.text_buffer.clear()
241
315
  self.text_start_pos = 0
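
With carriage-return handling moved into initialize(), the rest of the state machine can assume the buffer contains only "\n" line endings. A minimal usage sketch; it assumes a sink only needs the process_token/process_characters calls the tokenizer makes for this input, and the CollectingSink stub below is illustrative rather than part of the package:

    from justhtml.tokenizer import Tokenizer, TokenizerOpts

    class CollectingSink:
        # Illustrative stub: records everything the tokenizer emits.
        def __init__(self):
            self.tokens = []
            self.text = []

        def process_token(self, token):
            self.tokens.append(token)
            return None  # no Plaintext/RawData state switch requested

        def process_characters(self, data):
            self.text.append(data)

    sink = CollectingSink()
    tokenizer = Tokenizer(sink, TokenizerOpts(), collect_errors=True)
    tokenizer.run("<p id=a>line one\r\nline two</p>")
    # tokenizer.buffer now holds only "\n" newlines; any parse errors are in tokenizer.errors
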
@@ -265,8 +339,9 @@ class Tokenizer:
265
339
  else:
266
340
  self.state = self.DATA
267
341
 
268
- # Pre-compute newline positions for O(log n) line lookups
269
- if self.collect_errors:
342
+ # Pre-compute newline positions for O(log n) line lookups.
343
+ # Only do this when errors are collected or when node locations are requested.
344
+ if self.collect_errors or self.track_node_locations:
270
345
  self._newline_positions = []
271
346
  pos = -1
272
347
  buffer = self.buffer
@@ -278,42 +353,73 @@ class Tokenizer:
278
353
  else:
279
354
  self._newline_positions = None
280
355
 
281
- def _get_line_at_pos(self, pos):
356
+ def _get_line_at_pos(self, pos: int) -> int:
282
357
  """Get line number (1-indexed) for a position using binary search."""
283
358
  # Line number = count of newlines before pos + 1
284
- return bisect_right(self._newline_positions, pos - 1) + 1
359
+ newline_positions = self._newline_positions
360
+ if newline_positions is None: # pragma: no cover
361
+ return 1
362
+ return bisect_right(newline_positions, pos - 1) + 1
363
+
364
+ def location_at_pos(self, pos: int) -> tuple[int, int]:
365
+ """Return (line, column) for a 0-indexed offset in the current buffer.
366
+
367
+ Column is 1-indexed. Newline positions are computed lazily when needed.
368
+ """
369
+ newline_positions = self._newline_positions
370
+ if newline_positions is None:
371
+ newline_positions = []
372
+ scan = -1
373
+ buffer = self.buffer
374
+ while True:
375
+ scan = buffer.find("\n", scan + 1)
376
+ if scan == -1:
377
+ break
378
+ newline_positions.append(scan)
379
+ self._newline_positions = newline_positions
380
+
381
+ line_index = bisect_right(newline_positions, pos - 1)
382
+ line = line_index + 1
285
383
 
286
- def step(self):
384
+ # Compute column using newline index rather than rfind() to avoid O(n) scans.
385
+ if line_index == 0:
386
+ last_newline = -1
387
+ else:
388
+ last_newline = newline_positions[line_index - 1]
389
+ column = pos - last_newline
390
+ return line, column
391
+
392
+ def step(self) -> bool:
287
393
  """Run one step of the tokenizer state machine. Returns True if EOF reached."""
288
- handler = self._STATE_HANDLERS[self.state]
289
- return handler(self)
394
+ handler = self._STATE_HANDLERS[self.state] # type: ignore[attr-defined]
395
+ return handler(self) # type: ignore[no-any-return]
290
396
 
291
- def run(self, html):
397
+ def run(self, html: str | None) -> None:
292
398
  self.initialize(html)
399
+ handlers = self._STATE_HANDLERS # type: ignore[attr-defined]
293
400
  while True:
294
- if self.step():
401
+ if handlers[self.state](self): # type: ignore[no-any-return]
295
402
  break
296
403
 
297
404
  # ---------------------
298
405
  # Helper methods
299
406
  # ---------------------
300
407
 
301
- def _peek_char(self, offset):
408
+ def _peek_char(self, offset: int) -> str | None:
302
409
  """Peek ahead at character at current position + offset without consuming"""
303
410
  peek_pos = self.pos + offset
304
411
  if peek_pos < self.length:
305
412
  return self.buffer[peek_pos]
306
413
  return None
307
414
 
308
- def _append_text_chunk(self, chunk, *, ends_with_cr=False):
415
+ def _append_text_chunk(self, chunk: str) -> None:
309
416
  self._append_text(chunk)
310
- self.ignore_lf = ends_with_cr
311
417
 
312
418
  # ---------------------
313
419
  # State handlers
314
420
  # ---------------------
315
421
 
316
- def _state_data(self):
422
+ def _state_data(self) -> bool:
317
423
  buffer = self.buffer
318
424
  length = self.length
319
425
  pos = self.pos
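
location_at_pos turns a 0-indexed buffer offset into a 1-indexed (line, column) pair by binary-searching a precomputed list of newline offsets instead of rescanning the buffer. A standalone sketch of the same arithmetic (illustrative only; line_and_column is not a package function):

    from bisect import bisect_right

    def line_and_column(buffer: str, pos: int) -> tuple[int, int]:
        newline_positions = [i for i, ch in enumerate(buffer) if ch == "\n"]
        line_index = bisect_right(newline_positions, pos - 1)  # newlines strictly before pos
        last_newline = newline_positions[line_index - 1] if line_index else -1
        return line_index + 1, pos - last_newline              # both 1-indexed

    assert line_and_column("ab\ncd", 0) == (1, 1)  # 'a'
    assert line_and_column("ab\ncd", 3) == (2, 1)  # 'c', first column of line 2
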
@@ -341,12 +447,12 @@ class Tokenizer:
341
447
 
342
448
  if end > pos:
343
449
  chunk = buffer[pos:end]
344
-
345
- if "\r" in chunk:
346
- chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
347
-
450
+ if self.collect_errors and not chunk.isascii():
451
+ base_pos = pos
452
+ for offset, ch in enumerate(chunk):
453
+ if _is_noncharacter_codepoint(ord(ch)):
454
+ self._emit_error_at_pos("noncharacter-in-input-stream", base_pos + offset)
348
455
  self._append_text(chunk)
349
- self.ignore_lf = chunk.endswith("\r")
350
456
 
351
457
  pos = end
352
458
  self.pos = pos
@@ -359,8 +465,8 @@ class Tokenizer:
359
465
  pos += 1
360
466
  self.pos = pos
361
467
  self.current_char = c
362
- self.ignore_lf = False
363
468
  # c is always '<' here due to find() optimization above
469
+ self.current_token_start_pos = pos - 1
364
470
  # Optimization: Peek ahead for common tag starts
365
471
  if pos < length:
366
472
  nc = buffer[pos]
@@ -415,7 +521,7 @@ class Tokenizer:
415
521
  self.state = self.TAG_OPEN
416
522
  return self._state_tag_open()
417
523
 
418
- def _state_tag_open(self):
524
+ def _state_tag_open(self) -> bool:
419
525
  c = self._get_char()
420
526
  if c is None:
421
527
  self._emit_error("eof-before-tag-name")
@@ -442,7 +548,7 @@ class Tokenizer:
442
548
  self.state = self.DATA
443
549
  return False
444
550
 
445
- def _state_end_tag_open(self):
551
+ def _state_end_tag_open(self) -> bool:
446
552
  c = self._get_char()
447
553
  if c is None:
448
554
  self._emit_error("eof-before-tag-name")
@@ -462,20 +568,20 @@ class Tokenizer:
462
568
  self.state = self.BOGUS_COMMENT
463
569
  return False
464
570
 
465
- def _state_tag_name(self):
571
+ def _state_tag_name(self) -> bool:
466
572
  replacement = "\ufffd"
467
573
  append_tag_char = self.current_tag_name.append
468
574
  buffer = self.buffer
469
575
  length = self.length
576
+ pos = self.pos
470
577
 
471
578
  while True:
472
579
  # Inline _consume_tag_name_run
473
- # Note: reconsume and ignore_lf are never True when entering TAG_NAME
474
- pos = self.pos
580
+ # Note: reconsume is never True when entering TAG_NAME
475
581
  if pos < length:
476
582
  # Optimization: Check for common terminators before regex
477
583
  match = None
478
- if buffer[pos] not in "\t\n\f />\0\r":
584
+ if buffer[pos] not in "\t\n\f />\0":
479
585
  match = _TAG_NAME_RUN_PATTERN.match(buffer, pos)
480
586
 
481
587
  if match:
@@ -483,56 +589,69 @@ class Tokenizer:
483
589
  if not chunk.islower():
484
590
  chunk = chunk.translate(_ASCII_LOWER_TABLE)
485
591
  append_tag_char(chunk)
486
- self.pos = match.end()
487
-
488
- if self.pos < length:
489
- c = buffer[self.pos]
490
- if c in (" ", "\t", "\n", "\f", "\r"):
491
- self.pos += 1
492
- if c == "\r":
493
- self.ignore_lf = True
592
+ pos = match.end()
593
+
594
+ if pos < length:
595
+ next_char = buffer[pos]
596
+ if next_char in (" ", "\t", "\n", "\f"):
597
+ pos += 1
598
+ self.pos = pos
494
599
  self.state = self.BEFORE_ATTRIBUTE_NAME
495
600
  return self._state_before_attribute_name()
496
- if c == ">":
497
- self.pos += 1
601
+ if next_char == ">":
602
+ pos += 1
603
+ self.pos = pos
498
604
  if not self._emit_current_tag():
499
605
  self.state = self.DATA
500
606
  return False
501
- if c == "/":
502
- self.pos += 1
607
+ if next_char == "/":
608
+ pos += 1
609
+ self.pos = pos
503
610
  self.state = self.SELF_CLOSING_START_TAG
504
611
  return self._state_self_closing_start_tag()
505
612
 
506
- c = self._get_char()
613
+ # Inline _get_char
614
+ # Note: reconsume is never True in this state.
615
+ if pos >= length:
616
+ c: str | None = None
617
+ else:
618
+ c = buffer[pos]
619
+ pos += 1
620
+ self.current_char = c
507
621
  if c is None:
622
+ self.pos = pos
508
623
  self._emit_error("eof-in-tag")
509
624
  # Per HTML5 spec: EOF in tag name is a parse error, emit EOF token only
510
625
  # The incomplete tag is discarded (not emitted as text)
511
626
  self._emit_token(EOFToken())
512
627
  return True
513
628
  if c in ("\t", "\n", "\f", " "):
629
+ self.pos = pos
514
630
  self.state = self.BEFORE_ATTRIBUTE_NAME
515
631
  return self._state_before_attribute_name()
516
632
  if c == "/":
633
+ self.pos = pos
517
634
  self.state = self.SELF_CLOSING_START_TAG
518
635
  return self._state_self_closing_start_tag()
519
636
  if c == ">":
520
637
  # In slow path, tag name is only first char (from DATA),
521
638
  # so no rawtext elements possible - always set DATA state
639
+ self.pos = pos
522
640
  self._emit_current_tag()
523
641
  self.state = self.DATA
524
642
  return False
525
643
  # c == "\0" - the only remaining possibility after fast-path
644
+ self.pos = pos
526
645
  self._emit_error("unexpected-null-character")
527
646
  append_tag_char(replacement)
528
647
 
529
- def _state_before_attribute_name(self):
648
+ def _state_before_attribute_name(self) -> bool:
530
649
  buffer = self.buffer
531
650
  length = self.length
532
651
 
533
652
  while True:
534
653
  # Optimization: Skip whitespace
535
- if not self.reconsume and not self.ignore_lf:
654
+ if not self.reconsume:
536
655
  if self.pos < length:
537
656
  # Check if current char is whitespace before running regex
538
657
  if buffer[self.pos] in " \t\n\f":
@@ -552,21 +671,7 @@ class Tokenizer:
552
671
 
553
672
  self.current_char = c
554
673
 
555
- if c == " ":
556
- self.ignore_lf = False
557
- continue
558
- if c == "\n":
559
- if self.ignore_lf:
560
- self.ignore_lf = False
561
- # Line tracking now computed on-demand via _get_line_at_pos()
562
- continue
563
- if c == "\t" or c == "\f":
564
- self.ignore_lf = False
565
- continue
566
- if c == "\r":
567
- self.ignore_lf = False
568
- if self.pos < length and buffer[self.pos] == "\n":
569
- self.pos += 1
674
+ if c in (" ", "\n", "\t", "\f"):
570
675
  continue
571
676
 
572
677
  if c is None:
@@ -605,55 +710,64 @@ class Tokenizer:
605
710
  self.state = self.ATTRIBUTE_NAME
606
711
  return False # Let main loop dispatch to avoid recursion
607
712
 
608
- def _state_attribute_name(self):
713
+ def _state_attribute_name(self) -> bool:
609
714
  replacement = "\ufffd"
610
715
  append_attr_char = self.current_attr_name.append
611
716
  buffer = self.buffer
612
717
  length = self.length
718
+ pos = self.pos
613
719
 
614
720
  while True:
615
721
  # Inline _consume_attribute_name_run
616
- if not self.reconsume and not self.ignore_lf:
617
- pos = self.pos
618
- if pos < length:
619
- # Optimization: Check for common terminators before regex
620
- match = None
621
- if buffer[pos] not in "\t\n\f />=\0\"'<\r":
622
- match = _ATTR_NAME_RUN_PATTERN.match(buffer, pos)
623
-
624
- if match:
625
- chunk = match.group(0)
626
- if not chunk.islower():
627
- chunk = chunk.translate(_ASCII_LOWER_TABLE)
628
- append_attr_char(chunk)
629
- self.pos = match.end()
630
-
631
- if self.pos < length:
632
- c = buffer[self.pos]
633
- if c == "=":
634
- self.pos += 1
635
- self.state = self.BEFORE_ATTRIBUTE_VALUE
636
- return self._state_before_attribute_value()
637
- if c in (" ", "\t", "\n", "\f", "\r"):
638
- self.pos += 1
639
- if c == "\r":
640
- self.ignore_lf = True
641
- self._finish_attribute()
642
- self.state = self.AFTER_ATTRIBUTE_NAME
643
- return False # Let main loop dispatch to avoid recursion
644
- if c == ">":
645
- self.pos += 1
646
- self._finish_attribute()
647
- if not self._emit_current_tag():
648
- self.state = self.DATA
649
- return False
650
- if c == "/":
651
- self.pos += 1
652
- self._finish_attribute()
653
- self.state = self.SELF_CLOSING_START_TAG
654
- return self._state_self_closing_start_tag()
722
+ # Note: reconsume is never True in this state.
723
+ if pos < length:
724
+ # Optimization: Check for common terminators before regex
725
+ match = None
726
+ if buffer[pos] not in "\t\n\f />=\0\"'<":
727
+ match = _ATTR_NAME_RUN_PATTERN.match(buffer, pos)
655
728
 
656
- c = self._get_char()
729
+ if match:
730
+ chunk = match.group(0)
731
+ if not chunk.islower():
732
+ chunk = chunk.translate(_ASCII_LOWER_TABLE)
733
+ append_attr_char(chunk)
734
+ pos = match.end()
735
+
736
+ if pos < length:
737
+ next_char = buffer[pos]
738
+ if next_char == "=":
739
+ pos += 1
740
+ self.pos = pos
741
+ self.state = self.BEFORE_ATTRIBUTE_VALUE
742
+ return self._state_before_attribute_value()
743
+ if next_char in (" ", "\t", "\n", "\f"):
744
+ pos += 1
745
+ self.pos = pos
746
+ self._finish_attribute()
747
+ self.state = self.AFTER_ATTRIBUTE_NAME
748
+ return False # Let main loop dispatch to avoid recursion
749
+ if next_char == ">":
750
+ pos += 1
751
+ self.pos = pos
752
+ self._finish_attribute()
753
+ if not self._emit_current_tag():
754
+ self.state = self.DATA
755
+ return False
756
+ if next_char == "/":
757
+ pos += 1
758
+ self.pos = pos
759
+ self._finish_attribute()
760
+ self.state = self.SELF_CLOSING_START_TAG
761
+ return self._state_self_closing_start_tag()
762
+
763
+ # Inline _get_char (reconsume is never True in this state)
764
+ if pos >= length:
765
+ c: str | None = None
766
+ else:
767
+ c = buffer[pos]
768
+ pos += 1
769
+ self.current_char = c
770
+ self.pos = pos
657
771
  if c is None:
658
772
  self._emit_error("eof-in-tag")
659
773
  self._flush_text()
@@ -679,21 +793,19 @@ class Tokenizer:
679
793
  self._emit_error("unexpected-null-character")
680
794
  append_attr_char(replacement)
681
795
  continue
682
- if c in ('"', "'", "<"):
683
- self._emit_error("unexpected-character-in-attribute-name")
796
+ self._emit_error("unexpected-character-in-attribute-name")
684
797
  append_attr_char(c)
685
798
 
686
- def _state_after_attribute_name(self):
799
+ def _state_after_attribute_name(self) -> bool:
687
800
  buffer = self.buffer
688
801
  length = self.length
689
802
 
690
803
  while True:
691
804
  # Optimization: Skip whitespace
692
- if not self.reconsume and not self.ignore_lf:
805
+ if not self.reconsume:
693
806
  if self.pos < length:
694
- match = _WHITESPACE_PATTERN.match(buffer, self.pos)
695
- if match:
696
- self.pos = match.end()
807
+ if buffer[self.pos] in " \t\n\f":
808
+ self.pos = _WHITESPACE_PATTERN.match(buffer, self.pos).end() # type: ignore[union-attr]
697
809
 
698
810
  # Inline _get_char
699
811
  if self.pos >= length:
@@ -704,23 +816,9 @@ class Tokenizer:
704
816
 
705
817
  self.current_char = c
706
818
 
707
- if c == " ":
708
- self.ignore_lf = False
709
- continue
710
- if c == "\n":
711
- # Note: Only reachable when ignore_lf=True (CR-LF handling)
712
- # Standalone \n is caught by whitespace optimization
713
- self.ignore_lf = False
714
- continue
715
- if c == "\r":
716
- self.ignore_lf = True
717
- continue
718
- if c == "\t" or c == "\f":
719
- self.ignore_lf = False
819
+ if c in (" ", "\n", "\t", "\f"):
720
820
  continue
721
821
 
722
- self.ignore_lf = False
723
-
724
822
  if c is None:
725
823
  self._emit_error("eof-in-tag")
726
824
  self._flush_text()
@@ -751,9 +849,16 @@ class Tokenizer:
751
849
  self.state = self.ATTRIBUTE_NAME
752
850
  return False # Let main loop dispatch to avoid recursion
753
851
 
754
- def _state_before_attribute_value(self):
852
+ def _state_before_attribute_value(self) -> bool:
755
853
  while True:
756
- c = self._get_char()
854
+ # Inline _get_char (reconsume is never True in this state)
855
+ pos = self.pos
856
+ if pos >= self.length:
857
+ c: str | None = None
858
+ else:
859
+ c = self.buffer[pos]
860
+ self.pos = pos + 1
861
+ self.current_char = c
757
862
  if c is None:
758
863
  self._emit_error("eof-in-tag")
759
864
  self._flush_text()
@@ -777,7 +882,7 @@ class Tokenizer:
777
882
  self.state = self.ATTRIBUTE_VALUE_UNQUOTED
778
883
  return self._state_attribute_value_unquoted()
779
884
 
780
- def _state_attribute_value_double(self):
885
+ def _state_attribute_value_double(self) -> bool:
781
886
  replacement = "\ufffd"
782
887
  stop_pattern = _ATTR_VALUE_DOUBLE_PATTERN
783
888
  buffer = self.buffer
@@ -797,8 +902,7 @@ class Tokenizer:
797
902
  if "&" in chunk or "\0" in chunk:
798
903
  # Fallback to regex if complex chars present
799
904
  match = stop_pattern.search(buffer, pos)
800
- # Note: match is always found because we checked for & or \0 above
801
- end = match.start()
905
+ end = length if match is None else match.start()
802
906
  else:
803
907
  end = next_quote
804
908
 
@@ -807,10 +911,6 @@ class Tokenizer:
807
911
  if end != next_quote:
808
912
  chunk = buffer[pos:end]
809
913
 
810
- # Normalize chunk for value if needed
811
- if "\r" in chunk:
812
- chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
813
-
814
914
  self.current_attr_value.append(chunk)
815
915
  self.pos = end
816
916
 
@@ -837,7 +937,7 @@ class Tokenizer:
837
937
  self._emit_error("unexpected-null-character")
838
938
  self._append_attr_value_char(replacement)
839
939
 
840
- def _state_attribute_value_single(self):
940
+ def _state_attribute_value_single(self) -> bool:
841
941
  replacement = "\ufffd"
842
942
  stop_pattern = _ATTR_VALUE_SINGLE_PATTERN
843
943
  buffer = self.buffer
@@ -857,8 +957,7 @@ class Tokenizer:
857
957
  if "&" in chunk or "\0" in chunk:
858
958
  # Fallback to regex if complex chars present
859
959
  match = stop_pattern.search(buffer, pos)
860
- # Note: match is always found because we checked for & or \0 above
861
- end = match.start()
960
+ end = length if match is None else match.start()
862
961
  else:
863
962
  end = next_quote
864
963
 
@@ -867,10 +966,6 @@ class Tokenizer:
867
966
  if end != next_quote:
868
967
  chunk = buffer[pos:end]
869
968
 
870
- # Normalize chunk for value if needed
871
- if "\r" in chunk:
872
- chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
873
-
874
969
  self.current_attr_value.append(chunk)
875
970
  self.pos = end
876
971
 
@@ -897,7 +992,7 @@ class Tokenizer:
897
992
  self._emit_error("unexpected-null-character")
898
993
  self._append_attr_value_char(replacement)
899
994
 
900
- def _state_attribute_value_unquoted(self):
995
+ def _state_attribute_value_unquoted(self) -> bool:
901
996
  replacement = "\ufffd"
902
997
  stop_pattern = _ATTR_VALUE_UNQUOTED_PATTERN
903
998
  buffer = self.buffer
@@ -916,7 +1011,17 @@ class Tokenizer:
916
1011
  self.current_attr_value.append(buffer[pos:end])
917
1012
  self.pos = end
918
1013
 
919
- c = self._get_char()
1014
+ # Inline _get_char
1015
+ if self.reconsume:
1016
+ self.reconsume = False
1017
+ c = self.current_char
1018
+ elif self.pos >= length:
1019
+ c = None
1020
+ else:
1021
+ c = buffer[self.pos]
1022
+ self.pos += 1
1023
+ self.current_char = c
1024
+
920
1025
  if c is None:
921
1026
  # Per HTML5 spec: EOF in attribute value is a parse error
922
1027
  # The incomplete tag is discarded (not emitted)
@@ -944,9 +1049,16 @@ class Tokenizer:
944
1049
  continue
945
1050
  self._append_attr_value_char(c)
946
1051
 
947
- def _state_after_attribute_value_quoted(self):
1052
+ def _state_after_attribute_value_quoted(self) -> bool:
948
1053
  """After attribute value (quoted) state per HTML5 spec §13.2.5.42"""
949
- c = self._get_char()
1054
+ # Inline _get_char
1055
+ if self.pos >= self.length:
1056
+ c: str | None = None
1057
+ else:
1058
+ c = self.buffer[self.pos]
1059
+ self.pos += 1
1060
+ self.current_char = c
1061
+
950
1062
  if c is None:
951
1063
  self._emit_error("eof-in-tag")
952
1064
  self._flush_text()
@@ -972,7 +1084,7 @@ class Tokenizer:
972
1084
  self.state = self.BEFORE_ATTRIBUTE_NAME
973
1085
  return False
974
1086
 
975
- def _state_self_closing_start_tag(self):
1087
+ def _state_self_closing_start_tag(self) -> bool:
976
1088
  c = self._get_char()
977
1089
  if c is None:
978
1090
  self._emit_error("eof-in-tag")
@@ -989,7 +1101,7 @@ class Tokenizer:
989
1101
  self.state = self.BEFORE_ATTRIBUTE_NAME
990
1102
  return False
991
1103
 
992
- def _state_markup_declaration_open(self):
1104
+ def _state_markup_declaration_open(self) -> bool:
993
1105
  # Note: Comment handling (<!--) is optimized in DATA state fast-path
994
1106
  # This code only handles DOCTYPE and CDATA, or malformed markup
995
1107
  if self._consume_case_insensitive("DOCTYPE"):
@@ -1023,7 +1135,7 @@ class Tokenizer:
1023
1135
  self.state = self.BOGUS_COMMENT
1024
1136
  return False
1025
1137
 
1026
- def _state_comment_start(self):
1138
+ def _state_comment_start(self) -> bool:
1027
1139
  replacement = "\ufffd"
1028
1140
  c = self._get_char()
1029
1141
  if c is None:
@@ -1047,7 +1159,7 @@ class Tokenizer:
1047
1159
  self.state = self.COMMENT
1048
1160
  return False
1049
1161
 
1050
- def _state_comment_start_dash(self):
1162
+ def _state_comment_start_dash(self) -> bool:
1051
1163
  replacement = "\ufffd"
1052
1164
  c = self._get_char()
1053
1165
  if c is None:
@@ -1071,12 +1183,19 @@ class Tokenizer:
1071
1183
  self.state = self.COMMENT
1072
1184
  return False
1073
1185
 
1074
- def _state_comment(self):
1186
+ def _state_comment(self) -> bool:
1075
1187
  replacement = "\ufffd"
1076
1188
  while True:
1077
1189
  if self._consume_comment_run():
1078
1190
  continue
1079
- c = self._get_char()
1191
+ # Inline _get_char
1192
+ if self.pos >= self.length:
1193
+ c: str | None = None
1194
+ else:
1195
+ c = self.buffer[self.pos]
1196
+ self.pos += 1
1197
+ self.current_char = c
1198
+
1080
1199
  if c is None:
1081
1200
  self._emit_error("eof-in-comment")
1082
1201
  self._emit_comment()
@@ -1089,7 +1208,7 @@ class Tokenizer:
1089
1208
  self._emit_error("unexpected-null-character")
1090
1209
  self.current_comment.append(replacement)
1091
1210
 
1092
- def _state_comment_end_dash(self):
1211
+ def _state_comment_end_dash(self) -> bool:
1093
1212
  replacement = "\ufffd"
1094
1213
  c = self._get_char()
1095
1214
  if c is None:
@@ -1110,7 +1229,7 @@ class Tokenizer:
1110
1229
  self.state = self.COMMENT
1111
1230
  return False
1112
1231
 
1113
- def _state_comment_end(self):
1232
+ def _state_comment_end(self) -> bool:
1114
1233
  replacement = "\ufffd"
1115
1234
  c = self._get_char()
1116
1235
  if c is None:
@@ -1138,7 +1257,7 @@ class Tokenizer:
1138
1257
  self.state = self.COMMENT
1139
1258
  return False
1140
1259
 
1141
- def _state_comment_end_bang(self):
1260
+ def _state_comment_end_bang(self) -> bool:
1142
1261
  replacement = "\ufffd"
1143
1262
  c = self._get_char()
1144
1263
  if c is None:
@@ -1172,7 +1291,7 @@ class Tokenizer:
1172
1291
  self.state = self.COMMENT
1173
1292
  return False
1174
1293
 
1175
- def _state_bogus_comment(self):
1294
+ def _state_bogus_comment(self) -> bool:
1176
1295
  replacement = "\ufffd"
1177
1296
  while True:
1178
1297
  c = self._get_char()
@@ -1189,7 +1308,7 @@ class Tokenizer:
1189
1308
  else:
1190
1309
  self.current_comment.append(c)
1191
1310
 
1192
- def _state_doctype(self):
1311
+ def _state_doctype(self) -> bool:
1193
1312
  c = self._get_char()
1194
1313
  if c is None:
1195
1314
  self._emit_error("eof-in-doctype")
@@ -1211,11 +1330,11 @@ class Tokenizer:
1211
1330
  self.state = self.BEFORE_DOCTYPE_NAME
1212
1331
  return False
1213
1332
 
1214
- def _state_before_doctype_name(self):
1333
+ def _state_before_doctype_name(self) -> bool:
1215
1334
  while True:
1216
1335
  c = self._get_char()
1217
1336
  if c is None:
1218
- self._emit_error("eof-in-doctype-name")
1337
+ self._emit_error("eof-in-doctype")
1219
1338
  self.current_doctype_force_quirks = True
1220
1339
  self._emit_doctype()
1221
1340
  self._emit_token(EOFToken())
@@ -1238,11 +1357,11 @@ class Tokenizer:
1238
1357
  self.state = self.DOCTYPE_NAME
1239
1358
  return False
1240
1359
 
1241
- def _state_doctype_name(self):
1360
+ def _state_doctype_name(self) -> bool:
1242
1361
  while True:
1243
1362
  c = self._get_char()
1244
1363
  if c is None:
1245
- self._emit_error("eof-in-doctype-name")
1364
+ self._emit_error("eof-in-doctype")
1246
1365
  self.current_doctype_force_quirks = True
1247
1366
  self._emit_doctype()
1248
1367
  self._emit_token(EOFToken())
@@ -1263,7 +1382,7 @@ class Tokenizer:
1263
1382
  continue
1264
1383
  self.current_doctype_name.append(c)
1265
1384
 
1266
- def _state_after_doctype_name(self):
1385
+ def _state_after_doctype_name(self) -> bool:
1267
1386
  if self._consume_case_insensitive("PUBLIC"):
1268
1387
  self.state = self.AFTER_DOCTYPE_PUBLIC_KEYWORD
1269
1388
  return False
@@ -1290,7 +1409,7 @@ class Tokenizer:
1290
1409
  self.state = self.BOGUS_DOCTYPE
1291
1410
  return False
1292
1411
 
1293
- def _state_after_doctype_public_keyword(self):
1412
+ def _state_after_doctype_public_keyword(self) -> bool:
1294
1413
  while True:
1295
1414
  c = self._get_char()
1296
1415
  if c is None:
@@ -1324,7 +1443,7 @@ class Tokenizer:
1324
1443
  self.state = self.BOGUS_DOCTYPE
1325
1444
  return False
1326
1445
 
1327
- def _state_after_doctype_system_keyword(self):
1446
+ def _state_after_doctype_system_keyword(self) -> bool:
1328
1447
  while True:
1329
1448
  c = self._get_char()
1330
1449
  if c is None:
@@ -1358,7 +1477,7 @@ class Tokenizer:
1358
1477
  self.state = self.BOGUS_DOCTYPE
1359
1478
  return False
1360
1479
 
1361
- def _state_before_doctype_public_identifier(self):
1480
+ def _state_before_doctype_public_identifier(self) -> bool:
1362
1481
  while True:
1363
1482
  c = self._get_char()
1364
1483
  if c is None:
@@ -1389,7 +1508,9 @@ class Tokenizer:
1389
1508
  self.state = self.BOGUS_DOCTYPE
1390
1509
  return False
1391
1510
 
1392
- def _state_doctype_public_identifier_double_quoted(self):
1511
+ def _state_doctype_public_identifier_double_quoted(self) -> bool:
1512
+ if self.current_doctype_public is None: # pragma: no cover
1513
+ self.current_doctype_public = []
1393
1514
  while True:
1394
1515
  c = self._get_char()
1395
1516
  if c is None:
@@ -1413,7 +1534,9 @@ class Tokenizer:
1413
1534
  return False
1414
1535
  self.current_doctype_public.append(c)
1415
1536
 
1416
- def _state_doctype_public_identifier_single_quoted(self):
1537
+ def _state_doctype_public_identifier_single_quoted(self) -> bool:
1538
+ if self.current_doctype_public is None: # pragma: no cover
1539
+ self.current_doctype_public = []
1417
1540
  while True:
1418
1541
  c = self._get_char()
1419
1542
  if c is None:
@@ -1437,7 +1560,7 @@ class Tokenizer:
1437
1560
  return False
1438
1561
  self.current_doctype_public.append(c)
1439
1562
 
1440
- def _state_after_doctype_public_identifier(self):
1563
+ def _state_after_doctype_public_identifier(self) -> bool:
1441
1564
  while True:
1442
1565
  c = self._get_char()
1443
1566
  if c is None:
@@ -1469,7 +1592,7 @@ class Tokenizer:
1469
1592
  self.state = self.BOGUS_DOCTYPE
1470
1593
  return False
1471
1594
 
1472
- def _state_between_doctype_public_and_system_identifiers(self):
1595
+ def _state_between_doctype_public_and_system_identifiers(self) -> bool:
1473
1596
  while True:
1474
1597
  c = self._get_char()
1475
1598
  if c is None:
@@ -1498,7 +1621,7 @@ class Tokenizer:
1498
1621
  self.state = self.BOGUS_DOCTYPE
1499
1622
  return False
1500
1623
 
1501
- def _state_before_doctype_system_identifier(self):
1624
+ def _state_before_doctype_system_identifier(self) -> bool:
1502
1625
  while True:
1503
1626
  c = self._get_char()
1504
1627
  if c is None:
@@ -1529,7 +1652,9 @@ class Tokenizer:
1529
1652
  self.state = self.BOGUS_DOCTYPE
1530
1653
  return False
1531
1654
 
1532
- def _state_doctype_system_identifier_double_quoted(self):
1655
+ def _state_doctype_system_identifier_double_quoted(self) -> bool:
1656
+ if self.current_doctype_system is None: # pragma: no cover
1657
+ self.current_doctype_system = []
1533
1658
  while True:
1534
1659
  c = self._get_char()
1535
1660
  if c is None:
@@ -1553,7 +1678,9 @@ class Tokenizer:
1553
1678
  return False
1554
1679
  self.current_doctype_system.append(c)
1555
1680
 
1556
- def _state_doctype_system_identifier_single_quoted(self):
1681
+ def _state_doctype_system_identifier_single_quoted(self) -> bool:
1682
+ if self.current_doctype_system is None: # pragma: no cover
1683
+ self.current_doctype_system = []
1557
1684
  while True:
1558
1685
  c = self._get_char()
1559
1686
  if c is None:
@@ -1577,7 +1704,7 @@ class Tokenizer:
1577
1704
  return False
1578
1705
  self.current_doctype_system.append(c)
1579
1706
 
1580
- def _state_after_doctype_system_identifier(self):
1707
+ def _state_after_doctype_system_identifier(self) -> bool:
1581
1708
  while True:
1582
1709
  c = self._get_char()
1583
1710
  if c is None:
@@ -1597,7 +1724,7 @@ class Tokenizer:
1597
1724
  self.state = self.BOGUS_DOCTYPE
1598
1725
  return False
1599
1726
 
1600
- def _state_bogus_doctype(self):
1727
+ def _state_bogus_doctype(self) -> bool:
1601
1728
  while True:
1602
1729
  c = self._get_char()
1603
1730
  if c is None:
@@ -1613,53 +1740,36 @@ class Tokenizer:
1613
1740
  # Low-level helpers
1614
1741
  # ---------------------
1615
1742
 
1616
- def _get_char(self):
1743
+ def _get_char(self) -> str | None:
1617
1744
  if self.reconsume:
1618
1745
  self.reconsume = False
1619
1746
  return self.current_char
1620
1747
 
1621
- buffer = self.buffer
1622
1748
  pos = self.pos
1623
- length = self.length
1624
- while True:
1625
- if pos >= length:
1626
- self.pos = pos
1627
- self.current_char = None
1628
- return None
1629
-
1630
- c = buffer[pos]
1631
- pos += 1
1749
+ if pos >= self.length:
1750
+ self.current_char = None
1751
+ return None
1632
1752
 
1633
- if c == "\r":
1634
- self.ignore_lf = True
1635
- self.current_char = "\n"
1636
- self.pos = pos
1637
- return "\n"
1638
-
1639
- if c == "\n":
1640
- if self.ignore_lf:
1641
- self.ignore_lf = False
1642
- continue
1643
- # Line tracking now computed on-demand via _get_line_at_pos()
1644
-
1645
- else:
1646
- self.ignore_lf = False
1647
-
1648
- self.current_char = c
1649
- self.pos = pos
1650
- return c
1753
+ c = self.buffer[pos]
1754
+ self.pos = pos + 1
1755
+ self.current_char = c
1756
+ if c == "<":
1757
+ self.current_token_start_pos = pos
1758
+ if self.collect_errors and not c.isascii() and _is_noncharacter_codepoint(ord(c)):
1759
+ self._emit_error_at_pos("noncharacter-in-input-stream", pos)
1760
+ return c
1651
1761
 
1652
- def _reconsume_current(self):
1762
+ def _reconsume_current(self) -> None:
1653
1763
  self.reconsume = True
1654
1764
 
1655
- def _append_text(self, text):
1765
+ def _append_text(self, text: str) -> None:
1656
1766
  """Append text to buffer, recording start position if this is the first chunk."""
1657
1767
  if not self.text_buffer:
1658
1768
  # Record where text started (current position before this chunk)
1659
1769
  self.text_start_pos = self.pos
1660
1770
  self.text_buffer.append(text)
1661
1771
 
1662
- def _flush_text(self):
1772
+ def _flush_text(self) -> None:
1663
1773
  if not self.text_buffer:
1664
1774
  return
1665
1775
 
@@ -1674,10 +1784,38 @@ class Tokenizer:
1674
1784
  raw_len = len(data)
1675
1785
 
1676
1786
  self.text_buffer.clear()
1677
- if self.state == self.DATA and "\0" in data:
1678
- count = data.count("\0")
1679
- for _ in range(count):
1680
- self._emit_error("unexpected-null-character")
1787
+ # U+0000 NULL is a parse error in text.
1788
+ # Emit one error per NULL at the *actual* character position.
1789
+ if "\0" in data:
1790
+ base_pos = self.text_start_pos
1791
+ search_from = 0
1792
+ while True:
1793
+ idx = data.find("\0", search_from)
1794
+ if idx == -1:
1795
+ break
1796
+ error_pos = base_pos + idx
1797
+
1798
+ # Compute column at error_pos (1-indexed).
1799
+ last_newline = self.buffer.rfind("\n", 0, error_pos + 1)
1800
+ if last_newline == -1:
1801
+ column = error_pos + 1
1802
+ else:
1803
+ column = error_pos - last_newline
1804
+ line = self._get_line_at_pos(error_pos)
1805
+
1806
+ message = generate_error_message("unexpected-null-character")
1807
+ self.errors.append(
1808
+ ParseError(
1809
+ "unexpected-null-character",
1810
+ line=line,
1811
+ column=column,
1812
+ category="tokenizer",
1813
+ message=message,
1814
+ source_html=self.buffer,
1815
+ )
1816
+ )
1817
+
1818
+ search_from = idx + 1
1681
1819
 
1682
1820
  # Per HTML5 spec:
1683
1821
  # - RCDATA state (title, textarea): decode character references
@@ -1690,21 +1828,24 @@ class Tokenizer:
1690
1828
  pass
1691
1829
  else:
1692
1830
  if "&" in data:
1693
- data = decode_entities_in_text(data)
1831
+ report_error = self._emit_error if self.collect_errors else None
1832
+ data = decode_entities_in_text(data, report_error=report_error)
1694
1833
  # Apply XML coercion if enabled
1695
1834
  if self.opts.xml_coercion:
1696
1835
  data = _coerce_text_for_xml(data)
1697
1836
 
1698
1837
  # Record position at END of raw text (1-indexed column = raw_len)
1699
- self._record_text_end_position(raw_len)
1838
+ if self.collect_errors:
1839
+ self._record_text_end_position(raw_len)
1840
+ self.last_token_start_pos = self.text_start_pos
1700
1841
  self.sink.process_characters(data)
1701
1842
  # Note: process_characters never returns Plaintext or RawData
1702
1843
  # State switches happen via _emit_current_tag instead
1703
1844
 
1704
- def _append_attr_value_char(self, c):
1845
+ def _append_attr_value_char(self, c: str) -> None:
1705
1846
  self.current_attr_value.append(c)
1706
1847
 
1707
- def _finish_attribute(self):
1848
+ def _finish_attribute(self) -> None:
1708
1849
  attr_name_buffer = self.current_attr_name
1709
1850
  if not attr_name_buffer:
1710
1851
  return
@@ -1728,12 +1869,13 @@ class Tokenizer:
1728
1869
  else:
1729
1870
  value = "".join(attr_value_buffer)
1730
1871
  if self.current_attr_value_has_amp:
1731
- value = decode_entities_in_text(value, in_attribute=True)
1872
+ report_error = self._emit_error if self.collect_errors else None
1873
+ value = decode_entities_in_text(value, in_attribute=True, report_error=report_error)
1732
1874
  attrs[name] = value
1733
1875
  attr_value_buffer.clear()
1734
1876
  self.current_attr_value_has_amp = False
1735
1877
 
1736
- def _emit_current_tag(self):
1878
+ def _emit_current_tag(self) -> bool:
1737
1879
  name_parts = self.current_tag_name
1738
1880
  part_count = len(name_parts)
1739
1881
  # Note: part_count is always >= 1 because fast-path appends before entering TAG_NAME
@@ -1749,6 +1891,8 @@ class Tokenizer:
1749
1891
  tag.name = name
1750
1892
  tag.attrs = attrs
1751
1893
  tag.self_closing = self.current_tag_self_closing
1894
+ tag.start_pos = self.current_token_start_pos
1895
+ self.last_token_start_pos = tag.start_pos
1752
1896
 
1753
1897
  switched_to_rawtext = False
1754
1898
  if self.current_tag_kind == Tag.START:
@@ -1774,7 +1918,8 @@ class Tokenizer:
1774
1918
  # Remember current state before emitting
1775
1919
 
1776
1920
  # Emit token to sink
1777
- self._record_token_position()
1921
+ if self.collect_errors:
1922
+ self._record_token_position()
1778
1923
  result = self.sink.process_token(tag)
1779
1924
  if result == 1: # TokenSinkResult.Plaintext
1780
1925
  self.state = self.PLAINTEXT
@@ -1787,16 +1932,18 @@ class Tokenizer:
1787
1932
  self.current_tag_kind = Tag.START
1788
1933
  return switched_to_rawtext
1789
1934
 
1790
- def _emit_comment(self):
1935
+ def _emit_comment(self) -> None:
1791
1936
  data = "".join(self.current_comment)
1792
1937
  self.current_comment.clear()
1793
1938
  # Apply XML coercion if enabled
1794
1939
  if self.opts.xml_coercion:
1795
1940
  data = _coerce_comment_for_xml(data)
1796
1941
  self._comment_token.data = data
1942
+ self._comment_token.start_pos = self.current_token_start_pos
1943
+ self.last_token_start_pos = self._comment_token.start_pos
1797
1944
  self._emit_token(self._comment_token)
1798
1945
 
1799
- def _emit_doctype(self):
1946
+ def _emit_doctype(self) -> None:
1800
1947
  name = "".join(self.current_doctype_name) if self.current_doctype_name else None
1801
1948
  # If public_id/system_id is a list (even empty), join it; if None, keep None
1802
1949
  public_id = "".join(self.current_doctype_public) if self.current_doctype_public is not None else None
@@ -1813,19 +1960,18 @@ class Tokenizer:
1813
1960
  self.current_doctype_force_quirks = False
1814
1961
  self._emit_token(DoctypeToken(doctype))
1815
1962
 
1816
- def _emit_token(self, token):
1817
- self._record_token_position()
1963
+ def _emit_token(self, token: AnyToken) -> None:
1964
+ if self.collect_errors:
1965
+ self._record_token_position()
1818
1966
  self.sink.process_token(token)
1819
1967
  # Note: process_token never returns Plaintext or RawData for state switches
1820
1968
  # State switches happen via _emit_current_tag checking sink response
1821
1969
 
1822
- def _record_token_position(self):
1970
+ def _record_token_position(self) -> None:
1823
1971
  """Record current position as 0-indexed column for the last emitted token.
1824
1972
 
1825
1973
  Per the spec, the position should be at the end of the token (after the last char).
1826
1974
  """
1827
- if not self.collect_errors:
1828
- return
1829
1975
  # pos points after the last consumed character, which is exactly what we want
1830
1976
  pos = self.pos
1831
1977
  last_newline = self.buffer.rfind("\n", 0, pos)
@@ -1836,14 +1982,12 @@ class Tokenizer:
1836
1982
  self.last_token_line = self._get_line_at_pos(pos)
1837
1983
  self.last_token_column = column
1838
1984
 
1839
- def _record_text_end_position(self, raw_len):
1985
+ def _record_text_end_position(self, raw_len: int) -> None:
1840
1986
  """Record position at end of text token (after last character).
1841
1987
 
1842
1988
  Uses text_start_pos + raw_len to compute where text ends, matching html5lib's
1843
1989
  behavior of reporting the column of the last character (1-indexed).
1844
1990
  """
1845
- if not self.collect_errors:
1846
- return
1847
1991
  # Position of last character of text (0-indexed)
1848
1992
  end_pos = self.text_start_pos + raw_len
1849
1993
  last_newline = self.buffer.rfind("\n", 0, end_pos)
@@ -1854,7 +1998,7 @@ class Tokenizer:
1854
1998
  self.last_token_line = self._get_line_at_pos(end_pos)
1855
1999
  self.last_token_column = column
1856
2000
 
1857
- def _emit_error(self, code):
2001
+ def _emit_error(self, code: str) -> None:
1858
2002
  if not self.collect_errors:
1859
2003
  return
1860
2004
  # Compute column on-demand: scan backwards to find last newline
@@ -1867,9 +2011,24 @@ class Tokenizer:
1867
2011
 
1868
2012
  message = generate_error_message(code)
1869
2013
  line = self._get_line_at_pos(self.pos)
1870
- self.errors.append(ParseError(code, line=line, column=column, message=message, source_html=self.buffer))
2014
+ self.errors.append(
2015
+ ParseError(code, line=line, column=column, category="tokenizer", message=message, source_html=self.buffer)
2016
+ )
1871
2017
 
1872
- def _consume_if(self, literal):
2018
+ def _emit_error_at_pos(self, code: str, pos: int) -> None:
2019
+ last_newline = self.buffer.rfind("\n", 0, pos + 1)
2020
+ if last_newline == -1:
2021
+ column = pos + 1
2022
+ else:
2023
+ column = pos - last_newline
2024
+
2025
+ message = generate_error_message(code)
2026
+ line = self._get_line_at_pos(pos)
2027
+ self.errors.append(
2028
+ ParseError(code, line=line, column=column, category="tokenizer", message=message, source_html=self.buffer)
2029
+ )
2030
+
2031
+ def _consume_if(self, literal: str) -> bool:
1873
2032
  end = self.pos + len(literal)
1874
2033
  if end > self.length:
1875
2034
  return False
@@ -1879,7 +2038,7 @@ class Tokenizer:
1879
2038
  self.pos = end
1880
2039
  return True
1881
2040
 
1882
- def _consume_case_insensitive(self, literal):
2041
+ def _consume_case_insensitive(self, literal: str) -> bool:
1883
2042
  end = self.pos + len(literal)
1884
2043
  if end > self.length:
1885
2044
  return False
@@ -1889,34 +2048,22 @@ class Tokenizer:
1889
2048
  self.pos = end
1890
2049
  return True
1891
2050
 
1892
- def _consume_comment_run(self):
2051
+ def _consume_comment_run(self) -> bool:
1893
2052
  # Note: Comments are never reconsumed
1894
2053
  pos = self.pos
1895
2054
  length = self.length
1896
2055
  if pos >= length:
1897
2056
  return False
1898
2057
 
1899
- # Handle ignore_lf for CRLF sequences
1900
- if self.ignore_lf and pos < length and self.buffer[pos] == "\n":
1901
- self.ignore_lf = False
1902
- pos += 1
1903
- self.pos = pos
1904
- if pos >= length:
1905
- return False
1906
-
1907
2058
  match = _COMMENT_RUN_PATTERN.match(self.buffer, pos)
1908
2059
  if match:
1909
2060
  chunk = match.group(0)
1910
- # Handle CRLF normalization for comments
1911
- if "\r" in chunk:
1912
- chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
1913
- self.ignore_lf = chunk.endswith("\r")
1914
2061
  self.current_comment.append(chunk)
1915
2062
  self.pos = match.end()
1916
2063
  return True
1917
2064
  return False
1918
2065
 
1919
- def _state_cdata_section(self):
2066
+ def _state_cdata_section(self) -> bool:
1920
2067
  # CDATA section state - consume characters until we see ']'
1921
2068
  while True:
1922
2069
  c = self._get_char()
@@ -1930,7 +2077,7 @@ class Tokenizer:
1930
2077
  return False
1931
2078
  self._append_text(c)
1932
2079
 
1933
- def _state_cdata_section_bracket(self):
2080
+ def _state_cdata_section_bracket(self) -> bool:
1934
2081
  # Seen one ']', check for second ']'
1935
2082
  c = self._get_char()
1936
2083
  if c == "]":
@@ -1947,7 +2094,7 @@ class Tokenizer:
1947
2094
  self.state = self.CDATA_SECTION
1948
2095
  return False
1949
2096
 
1950
- def _state_cdata_section_end(self):
2097
+ def _state_cdata_section_end(self) -> bool:
1951
2098
  # Seen ']]', check for '>'
1952
2099
  c = self._get_char()
1953
2100
  if c == ">":
@@ -1973,7 +2120,7 @@ class Tokenizer:
1973
2120
  self.state = self.CDATA_SECTION
1974
2121
  return False
1975
2122
 
1976
- def _state_rcdata(self):
2123
+ def _state_rcdata(self) -> bool:
1977
2124
  buffer = self.buffer
1978
2125
  length = self.length
1979
2126
  pos = self.pos
@@ -2004,7 +2151,7 @@ class Tokenizer:
2004
2151
  # Consume everything up to the special character
2005
2152
  if next_special > pos:
2006
2153
  chunk = buffer[pos:next_special]
2007
- self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
2154
+ self._append_text_chunk(chunk)
2008
2155
  pos = next_special
2009
2156
  self.pos = pos
2010
2157
 
@@ -2016,7 +2163,6 @@ class Tokenizer:
2016
2163
 
2017
2164
  # Handle special characters - we're at one of them after find()
2018
2165
  if null_index == pos:
2019
- self.ignore_lf = False
2020
2166
  self._emit_error("unexpected-null-character")
2021
2167
  self._append_text("\ufffd")
2022
2168
  pos += 1
@@ -2034,7 +2180,7 @@ class Tokenizer:
2034
2180
  self.state = self.RCDATA_LESS_THAN_SIGN
2035
2181
  return False
2036
2182
 
2037
- def _state_rcdata_less_than_sign(self):
2183
+ def _state_rcdata_less_than_sign(self) -> bool:
2038
2184
  c = self._get_char()
2039
2185
  if c == "/":
2040
2186
  self.current_tag_name.clear()
@@ -2045,7 +2191,7 @@ class Tokenizer:
2045
2191
  self.state = self.RCDATA
2046
2192
  return False
2047
2193
 
2048
- def _state_rcdata_end_tag_open(self):
2194
+ def _state_rcdata_end_tag_open(self) -> bool:
2049
2195
  c = self._get_char()
2050
2196
  if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2051
2197
  self.current_tag_name.append(c.lower())
@@ -2057,7 +2203,7 @@ class Tokenizer:
2057
2203
  self.state = self.RCDATA
2058
2204
  return False
2059
2205
 
2060
- def _state_rcdata_end_tag_name(self):
2206
+ def _state_rcdata_end_tag_name(self) -> bool:
2061
2207
  # Check if this matches the opening tag name
2062
2208
  while True:
2063
2209
  c = self._get_char()
@@ -2069,7 +2215,7 @@ class Tokenizer:
2069
2215
  tag_name = "".join(self.current_tag_name)
2070
2216
  if tag_name == self.rawtext_tag_name:
2071
2217
  if c == ">":
2072
- attrs = []
2218
+ attrs: dict[str, str | None] = {}
2073
2219
  tag = Tag(Tag.END, tag_name, attrs, False)
2074
2220
  self._flush_text()
2075
2221
  self._emit_token(tag)
@@ -2110,7 +2256,7 @@ class Tokenizer:
2110
2256
  self.state = self.RCDATA
2111
2257
  return False
2112
2258
 
2113
- def _state_rawtext(self):
2259
+ def _state_rawtext(self) -> bool:
2114
2260
  buffer = self.buffer
2115
2261
  length = self.length
2116
2262
  pos = self.pos
@@ -2131,9 +2277,7 @@ class Tokenizer:
2131
2277
  if null_index != -1 and null_index < next_special:
2132
2278
  if null_index > pos:
2133
2279
  chunk = buffer[pos:null_index]
2134
- self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
2135
- else:
2136
- self.ignore_lf = False
2280
+ self._append_text_chunk(chunk)
2137
2281
  self._emit_error("unexpected-null-character")
2138
2282
  self._append_text("\ufffd")
2139
2283
  pos = null_index + 1
@@ -2142,14 +2286,14 @@ class Tokenizer:
2142
2286
  if lt_index == -1:
2143
2287
  if pos < length:
2144
2288
  chunk = buffer[pos:length]
2145
- self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
2289
+ self._append_text_chunk(chunk)
2146
2290
  self.pos = length
2147
2291
  self._flush_text()
2148
2292
  self._emit_token(EOFToken())
2149
2293
  return True
2150
2294
  if lt_index > pos:
2151
2295
  chunk = buffer[pos:lt_index]
2152
- self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
2296
+ self._append_text_chunk(chunk)
2153
2297
  pos = lt_index + 1
2154
2298
  self.pos = pos
2155
2299
  # Handle script escaped transition before treating '<' as markup boundary
@@ -2167,7 +2311,7 @@ class Tokenizer:
2167
2311
  self.state = self.RAWTEXT_LESS_THAN_SIGN
2168
2312
  return False
2169
2313
 
2170
- def _state_rawtext_less_than_sign(self):
2314
+ def _state_rawtext_less_than_sign(self) -> bool:
2171
2315
  c = self._get_char()
2172
2316
  if c == "/":
2173
2317
  self.current_tag_name.clear()
@@ -2178,7 +2322,7 @@ class Tokenizer:
2178
2322
  self.state = self.RAWTEXT
2179
2323
  return False
2180
2324
 
2181
- def _state_rawtext_end_tag_open(self):
2325
+ def _state_rawtext_end_tag_open(self) -> bool:
2182
2326
  c = self._get_char()
2183
2327
  if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2184
2328
  self.current_tag_name.append(c.lower())
@@ -2190,7 +2334,7 @@ class Tokenizer:
2190
2334
  self.state = self.RAWTEXT
2191
2335
  return False
2192
2336
 
2193
- def _state_rawtext_end_tag_name(self):
2337
+ def _state_rawtext_end_tag_name(self) -> bool:
2194
2338
  # Check if this matches the opening tag name
2195
2339
  while True:
2196
2340
  c = self._get_char()
@@ -2202,7 +2346,7 @@ class Tokenizer:
2202
2346
  tag_name = "".join(self.current_tag_name)
2203
2347
  if tag_name == self.rawtext_tag_name:
2204
2348
  if c == ">":
2205
- attrs = []
2349
+ attrs: dict[str, str | None] = {}
2206
2350
  tag = Tag(Tag.END, tag_name, attrs, False)
2207
2351
  self._flush_text()
2208
2352
  self._emit_token(tag)
@@ -2243,7 +2387,7 @@ class Tokenizer:
2243
2387
  self.state = self.RAWTEXT
2244
2388
  return False
2245
2389
 
2246
- def _state_plaintext(self):
2390
+ def _state_plaintext(self) -> bool:
2247
2391
  # PLAINTEXT state - consume everything as text, no end tag
2248
2392
  if self.pos < self.length:
2249
2393
  remaining = self.buffer[self.pos :]
@@ -2257,7 +2401,7 @@ class Tokenizer:
2257
2401
  self._emit_token(EOFToken())
2258
2402
  return True
2259
2403
 
2260
- def _state_script_data_escaped(self):
2404
+ def _state_script_data_escaped(self) -> bool:
2261
2405
  c = self._get_char()
2262
2406
  if c is None:
2263
2407
  self._flush_text()
@@ -2277,7 +2421,7 @@ class Tokenizer:
2277
2421
  self._append_text(c)
2278
2422
  return False
2279
2423
 
2280
- def _state_script_data_escaped_dash(self):
2424
+ def _state_script_data_escaped_dash(self) -> bool:
2281
2425
  c = self._get_char()
2282
2426
  if c is None:
2283
2427
  self._flush_text()
@@ -2299,7 +2443,7 @@ class Tokenizer:
2299
2443
  self.state = self.SCRIPT_DATA_ESCAPED
2300
2444
  return False
2301
2445
 
2302
- def _state_script_data_escaped_dash_dash(self):
2446
+ def _state_script_data_escaped_dash_dash(self) -> bool:
2303
2447
  c = self._get_char()
2304
2448
  if c is None:
2305
2449
  self._flush_text()
@@ -2325,7 +2469,7 @@ class Tokenizer:
2325
2469
  self.state = self.SCRIPT_DATA_ESCAPED
2326
2470
  return False
2327
2471
 
2328
- def _state_script_data_escaped_less_than_sign(self):
2472
+ def _state_script_data_escaped_less_than_sign(self) -> bool:
2329
2473
  c = self._get_char()
2330
2474
  if c == "/":
2331
2475
  self.temp_buffer.clear()
@@ -2343,7 +2487,7 @@ class Tokenizer:
2343
2487
 
2344
2488
  return False
2345
2489
 
2346
- def _state_script_data_escaped_end_tag_open(self):
2490
+ def _state_script_data_escaped_end_tag_open(self) -> bool:
2347
2491
  c = self._get_char()
2348
2492
  if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2349
2493
  self.current_tag_name.clear()
@@ -2356,7 +2500,7 @@ class Tokenizer:
2356
2500
  self.state = self.SCRIPT_DATA_ESCAPED
2357
2501
  return False
2358
2502
 
2359
- def _state_script_data_escaped_end_tag_name(self):
2503
+ def _state_script_data_escaped_end_tag_name(self) -> bool:
2360
2504
  c = self._get_char()
2361
2505
  if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2362
2506
  self.current_tag_name.append(c.lower())
@@ -2381,7 +2525,7 @@ class Tokenizer:
2381
2525
  return False
2382
2526
  if c == ">":
2383
2527
  self._flush_text()
2384
- attrs = []
2528
+ attrs: dict[str, str | None] = {}
2385
2529
  tag = Tag(Tag.END, tag_name, attrs, False)
2386
2530
  self._emit_token(tag)
2387
2531
  self.state = self.DATA
@@ -2397,7 +2541,7 @@ class Tokenizer:
2397
2541
  self.state = self.SCRIPT_DATA_ESCAPED
2398
2542
  return False
2399
2543
 
2400
- def _state_script_data_double_escape_start(self):
2544
+ def _state_script_data_double_escape_start(self) -> bool:
2401
2545
  c = self._get_char()
2402
2546
  if c in (" ", "\t", "\n", "\r", "\f", "/", ">"):
2403
2547
  # Check if temp_buffer contains "script"
@@ -2416,7 +2560,7 @@ class Tokenizer:
2416
2560
  self.state = self.SCRIPT_DATA_ESCAPED
2417
2561
  return False
2418
2562
 
2419
- def _state_script_data_double_escaped(self):
2563
+ def _state_script_data_double_escaped(self) -> bool:
2420
2564
  c = self._get_char()
2421
2565
  if c is None:
2422
2566
  self._flush_text()
@@ -2437,7 +2581,7 @@ class Tokenizer:
2437
2581
  self._append_text(c)
2438
2582
  return False
2439
2583
 
2440
- def _state_script_data_double_escaped_dash(self):
2584
+ def _state_script_data_double_escaped_dash(self) -> bool:
2441
2585
  c = self._get_char()
2442
2586
  if c is None:
2443
2587
  self._flush_text()
@@ -2460,7 +2604,7 @@ class Tokenizer:
2460
2604
  self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
2461
2605
  return False
2462
2606
 
2463
- def _state_script_data_double_escaped_dash_dash(self):
2607
+ def _state_script_data_double_escaped_dash_dash(self) -> bool:
2464
2608
  c = self._get_char()
2465
2609
  if c is None:
2466
2610
  self._flush_text()
@@ -2488,7 +2632,7 @@ class Tokenizer:
2488
2632
  self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
2489
2633
  return False
2490
2634
 
2491
- def _state_script_data_double_escaped_less_than_sign(self):
2635
+ def _state_script_data_double_escaped_less_than_sign(self) -> bool:
2492
2636
  c = self._get_char()
2493
2637
  if c == "/":
2494
2638
  self.temp_buffer.clear()
@@ -2504,7 +2648,7 @@ class Tokenizer:
2504
2648
  self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
2505
2649
  return False
2506
2650
 
2507
- def _state_script_data_double_escape_end(self):
2651
+ def _state_script_data_double_escape_end(self) -> bool:
2508
2652
  c = self._get_char()
2509
2653
  if c in (" ", "\t", "\n", "\r", "\f", "/", ">"):
2510
2654
  # Check if temp_buffer contains "script"
@@ -2525,7 +2669,7 @@ class Tokenizer:
2525
2669
  return False
2526
2670
 
2527
2671
 
2528
- Tokenizer._STATE_HANDLERS = [
2672
+ Tokenizer._STATE_HANDLERS = [ # type: ignore[attr-defined]
2529
2673
  Tokenizer._state_data,
2530
2674
  Tokenizer._state_tag_open,
2531
2675
  Tokenizer._state_end_tag_open,