justhtml 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic. Click here for more details.

justhtml/tokenizer.py ADDED
@@ -0,0 +1,2647 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from bisect import bisect_right
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ if TYPE_CHECKING:
8
+ from collections.abc import Callable
9
+
10
+ from .entities import decode_entities_in_text
11
+ from .errors import generate_error_message
12
+ from .tokens import CommentToken, Doctype, DoctypeToken, EOFToken, ParseError, Tag
13
+
14
+ _ATTR_VALUE_UNQUOTED_TERMINATORS = "\t\n\f >&\"'<=`\r\0"
15
+ _ASCII_LOWER_TABLE = str.maketrans({chr(code): chr(code + 32) for code in range(65, 91)})
16
+ _RCDATA_ELEMENTS = {"title", "textarea"}
17
+ _RAWTEXT_SWITCH_TAGS = {
18
+ "script",
19
+ "style",
20
+ "xmp",
21
+ "iframe",
22
+ "noembed",
23
+ "noframes",
24
+ "textarea",
25
+ "title",
26
+ }
27
+
28
+ _ATTR_VALUE_DOUBLE_PATTERN = re.compile(r'["&\0]')
29
+ _ATTR_VALUE_SINGLE_PATTERN = re.compile(r"['&\0]")
30
+ _ATTR_VALUE_UNQUOTED_PATTERN = re.compile(f"[{re.escape(_ATTR_VALUE_UNQUOTED_TERMINATORS)}]")
31
+
32
+ _TAG_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />\0\r]+")
33
+ _ATTR_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />=\0\"'<\r]+")
34
+ _COMMENT_RUN_PATTERN = re.compile(r"[^-\0]+")
35
+ _WHITESPACE_PATTERN = re.compile(r"[ \t\n\f]+")
36
+
37
+ # XML Coercion Regex
38
+ _xml_invalid_single_chars = []
39
+ for _plane in range(17):
40
+ _base = _plane * 0x10000
41
+ _xml_invalid_single_chars.append(chr(_base + 0xFFFE))
42
+ _xml_invalid_single_chars.append(chr(_base + 0xFFFF))
43
+
44
+ _XML_COERCION_PATTERN = re.compile(r"[\f\uFDD0-\uFDEF" + "".join(_xml_invalid_single_chars) + "]")
45
+
46
+
47
+ def _xml_coercion_callback(match: re.Match[str]) -> str:
48
+ if match.group(0) == "\f":
49
+ return " "
50
+ return "\ufffd"
51
+
52
+
53
+ def _coerce_text_for_xml(text: str) -> str:
54
+ """Apply XML coercion to text content."""
55
+ # Fast path for ASCII
56
+ if text.isascii():
57
+ if "\f" in text:
58
+ return text.replace("\f", " ")
59
+ return text
60
+
61
+ if not _XML_COERCION_PATTERN.search(text):
62
+ return text
63
+ return _XML_COERCION_PATTERN.sub(_xml_coercion_callback, text)
64
+
65
+
66
+ def _coerce_comment_for_xml(text: str) -> str:
67
+ """Apply XML coercion to comment content - handle double hyphens."""
68
+ # Replace -- with - - (with space)
69
+ if "--" in text:
70
+ return text.replace("--", "- -")
71
+ return text
72
+
73
+
74
class TokenizerOpts:
    """Plain option holder for the tokenizer.

    The three flag options are normalised with ``bool()``; ``initial_state``
    and ``initial_rawtext_tag`` are stored exactly as given.
    """

    __slots__ = ("discard_bom", "exact_errors", "initial_rawtext_tag", "initial_state", "xml_coercion")

    discard_bom: bool
    exact_errors: bool
    initial_rawtext_tag: str | None
    initial_state: int | None
    xml_coercion: bool

    def __init__(
        self,
        exact_errors: bool = False,
        discard_bom: bool = True,
        initial_state: int | None = None,
        initial_rawtext_tag: str | None = None,
        xml_coercion: bool = False,
    ) -> None:
        # Flags first (coerced to real booleans), in declaration order.
        self.exact_errors = bool(exact_errors)
        self.discard_bom = bool(discard_bom)
        self.xml_coercion = bool(xml_coercion)
        # Pass-through values.
        self.initial_state = initial_state
        self.initial_rawtext_tag = initial_rawtext_tag
96
+
97
+
98
+ class Tokenizer:
99
+ DATA = 0
100
+ TAG_OPEN = 1
101
+ END_TAG_OPEN = 2
102
+ TAG_NAME = 3
103
+ BEFORE_ATTRIBUTE_NAME = 4
104
+ ATTRIBUTE_NAME = 5
105
+ AFTER_ATTRIBUTE_NAME = 6
106
+ BEFORE_ATTRIBUTE_VALUE = 7
107
+ ATTRIBUTE_VALUE_DOUBLE = 8
108
+ ATTRIBUTE_VALUE_SINGLE = 9
109
+ ATTRIBUTE_VALUE_UNQUOTED = 10
110
+ AFTER_ATTRIBUTE_VALUE_QUOTED = 11
111
+ SELF_CLOSING_START_TAG = 12
112
+ MARKUP_DECLARATION_OPEN = 13
113
+ COMMENT_START = 14
114
+ COMMENT_START_DASH = 15
115
+ COMMENT = 16
116
+ COMMENT_END_DASH = 17
117
+ COMMENT_END = 18
118
+ COMMENT_END_BANG = 19
119
+ BOGUS_COMMENT = 20
120
+ DOCTYPE = 21
121
+ BEFORE_DOCTYPE_NAME = 22
122
+ DOCTYPE_NAME = 23
123
+ AFTER_DOCTYPE_NAME = 24
124
+ BOGUS_DOCTYPE = 25
125
+ AFTER_DOCTYPE_PUBLIC_KEYWORD = 26
126
+ AFTER_DOCTYPE_SYSTEM_KEYWORD = 27
127
+ BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 28
128
+ DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 29
129
+ DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 30
130
+ AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 31
131
+ BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 32
132
+ BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 33
133
+ DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 34
134
+ DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 35
135
+ AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 36
136
+ CDATA_SECTION = 37
137
+ CDATA_SECTION_BRACKET = 38
138
+ CDATA_SECTION_END = 39
139
+ RCDATA = 40
140
+ RCDATA_LESS_THAN_SIGN = 41
141
+ RCDATA_END_TAG_OPEN = 42
142
+ RCDATA_END_TAG_NAME = 43
143
+ RAWTEXT = 44
144
+ RAWTEXT_LESS_THAN_SIGN = 45
145
+ RAWTEXT_END_TAG_OPEN = 46
146
+ RAWTEXT_END_TAG_NAME = 47
147
+ PLAINTEXT = 48
148
+ SCRIPT_DATA_ESCAPED = 49
149
+ SCRIPT_DATA_ESCAPED_DASH = 50
150
+ SCRIPT_DATA_ESCAPED_DASH_DASH = 51
151
+ SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 52
152
+ SCRIPT_DATA_ESCAPED_END_TAG_OPEN = 53
153
+ SCRIPT_DATA_ESCAPED_END_TAG_NAME = 54
154
+ SCRIPT_DATA_DOUBLE_ESCAPE_START = 55
155
+ SCRIPT_DATA_DOUBLE_ESCAPED = 56
156
+ SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 57
157
+ SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 58
158
+ SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 59
159
+ SCRIPT_DATA_DOUBLE_ESCAPE_END = 60
160
+
161
+ __slots__ = (
162
+ "_comment_token",
163
+ "_newline_positions",
164
+ "_state_handlers",
165
+ "_tag_token",
166
+ "buffer",
167
+ "collect_errors",
168
+ "current_attr_name",
169
+ "current_attr_value",
170
+ "current_attr_value_has_amp",
171
+ "current_char",
172
+ "current_comment",
173
+ "current_doctype_force_quirks",
174
+ "current_doctype_name",
175
+ "current_doctype_public",
176
+ "current_doctype_system",
177
+ "current_tag_attrs",
178
+ "current_tag_kind",
179
+ "current_tag_name",
180
+ "current_tag_self_closing",
181
+ "errors",
182
+ "ignore_lf",
183
+ "last_start_tag_name",
184
+ "last_token_column",
185
+ "last_token_line",
186
+ "length",
187
+ "opts",
188
+ "original_tag_name",
189
+ "pos",
190
+ "rawtext_tag_name",
191
+ "reconsume",
192
+ "sink",
193
+ "state",
194
+ "temp_buffer",
195
+ "text_buffer",
196
+ "text_start_pos",
197
+ )
198
+
199
+ _comment_token: CommentToken
200
+ _newline_positions: list[int] | None
201
+ _state_handlers: list[Callable[[Tokenizer], bool]]
202
+ _tag_token: Tag
203
+ buffer: str
204
+ collect_errors: bool
205
+ current_attr_name: list[str]
206
+ current_attr_value: list[str]
207
+ current_attr_value_has_amp: bool
208
+ current_char: str | None
209
+ current_comment: list[str]
210
+ current_doctype_force_quirks: bool
211
+ current_doctype_name: list[str]
212
+ current_doctype_public: list[str] | None
213
+ current_doctype_system: list[str] | None
214
+ current_tag_attrs: dict[str, str | None]
215
+ current_tag_kind: int
216
+ current_tag_name: list[str]
217
+ current_tag_self_closing: bool
218
+ errors: list[ParseError]
219
+ ignore_lf: bool
220
+ last_start_tag_name: str | None
221
+ last_token_column: int
222
+ last_token_line: int
223
+ length: int
224
+ opts: TokenizerOpts
225
+ original_tag_name: list[str]
226
+ pos: int
227
+ rawtext_tag_name: str | None
228
+ reconsume: bool
229
+ sink: Any
230
+ state: int
231
+ temp_buffer: list[str]
232
+ text_buffer: list[str]
233
+ text_start_pos: int
234
+
235
+ # _STATE_HANDLERS is defined at the end of the file
236
+
237
def __init__(self, sink: Any, opts: TokenizerOpts | None = None, collect_errors: bool = False) -> None:
    """Create a tokenizer that reports tokens to *sink*.

    Args:
        sink: Object that receives the tokens.
        opts: Tokenizer options; a default ``TokenizerOpts()`` is used when
            omitted (or falsy).
        collect_errors: Stored on the instance; ``initialize()`` uses it to
            decide whether newline positions are pre-computed.
    """
    self.sink = sink
    self.opts = opts or TokenizerOpts()
    self.collect_errors = collect_errors
    self.errors = []

    # Input cursor and state-machine flags.
    self.state = self.DATA
    self.buffer = ""
    self.length = 0
    self.pos = 0
    self.reconsume = False
    self.current_char = ""
    self.ignore_lf = False
    self.last_token_line = 1
    self.last_token_column = 0
    # Slot must exist before initialize() runs, otherwise _get_line_at_pos()
    # raises AttributeError instead of taking its "no line data" fallback.
    self._newline_positions = None

    # Reusable buffers to avoid per-token allocations.
    self.text_buffer = []
    self.text_start_pos = 0
    self.current_tag_name = []
    self.current_tag_attrs = {}
    self.current_attr_name = []
    self.current_attr_value = []
    self.current_attr_value_has_amp = False
    self.current_tag_self_closing = False
    self.current_tag_kind = Tag.START
    self.current_comment = []
    self.current_doctype_name = []
    self.current_doctype_public = None  # None = not set, [] = empty string
    self.current_doctype_system = None  # None = not set, [] = empty string
    self.current_doctype_force_quirks = False
    self.last_start_tag_name = None
    self.rawtext_tag_name = None
    self.original_tag_name = []
    self.temp_buffer = []
    # Token objects that are mutated and re-used rather than re-allocated.
    self._tag_token = Tag(Tag.START, "", {}, False)
    self._comment_token = CommentToken("")
274
+
275
def initialize(self, html: str | None) -> None:
    """Load *html* into the tokenizer and reset all per-run state.

    A leading U+FEFF is dropped when ``opts.discard_bom`` is set. ``None`` is
    treated as an empty document. The start state comes from
    ``opts.initial_state`` when that is an int, otherwise it is ``DATA``.
    When ``collect_errors`` is true, the offsets of every ``"\\n"`` are
    recorded for ``_get_line_at_pos``.
    """
    if html and html[0] == "\ufeff" and self.opts.discard_bom:
        html = html[1:]

    self.buffer = html or ""
    self.length = len(self.buffer)
    self.pos = 0
    self.reconsume = False
    self.current_char = ""
    self.ignore_lf = False
    self.last_token_line = 1
    self.last_token_column = 0
    self.errors = []

    # Reset the reusable buffers in place.
    self.text_buffer.clear()
    self.text_start_pos = 0
    self.current_tag_name.clear()
    self.current_tag_attrs = {}
    self.current_attr_name.clear()
    self.current_attr_value.clear()
    self.current_attr_value_has_amp = False
    self.current_comment.clear()
    self.current_doctype_name.clear()
    self.current_doctype_public = None
    self.current_doctype_system = None
    self.current_doctype_force_quirks = False
    self.current_tag_self_closing = False
    self.current_tag_kind = Tag.START
    self.rawtext_tag_name = self.opts.initial_rawtext_tag
    self.temp_buffer.clear()
    # Previously the only buffer not reset here; clear it so nothing from an
    # earlier (possibly aborted) run leaks into this one.
    self.original_tag_name.clear()
    self.last_start_tag_name = None
    self._tag_token.kind = Tag.START
    self._tag_token.name = ""
    self._tag_token.attrs = {}
    self._tag_token.self_closing = False

    initial_state = self.opts.initial_state
    if isinstance(initial_state, int):
        self.state = initial_state
    else:
        self.state = self.DATA

    # Pre-compute newline positions for O(log n) line lookups.
    if self.collect_errors:
        self._newline_positions = []
        pos = -1
        buffer = self.buffer
        while True:
            pos = buffer.find("\n", pos + 1)
            if pos == -1:
                break
            self._newline_positions.append(pos)
    else:
        self._newline_positions = None
328
+
329
+ def _get_line_at_pos(self, pos: int) -> int:
330
+ """Get line number (1-indexed) for a position using binary search."""
331
+ # Line number = count of newlines before pos + 1
332
+ newline_positions = self._newline_positions
333
+ if newline_positions is None: # pragma: no cover
334
+ return 1
335
+ return bisect_right(newline_positions, pos - 1) + 1
336
+
337
def step(self) -> bool:
    """Dispatch once to the handler of the current state.

    Returns the handler's result; handlers return True once EOF is reached.
    """
    return self._STATE_HANDLERS[self.state](self)  # type: ignore[attr-defined,no-any-return]
341
+
342
def run(self, html: str | None) -> None:
    """Tokenize *html* from start to finish.

    Calls ``initialize(html)`` and then keeps calling ``step()`` until it
    returns True.
    """
    self.initialize(html)
    while not self.step():
        pass
347
+
348
+ # ---------------------
349
+ # Helper methods
350
+ # ---------------------
351
+
352
+ def _peek_char(self, offset: int) -> str | None:
353
+ """Peek ahead at character at current position + offset without consuming"""
354
+ peek_pos = self.pos + offset
355
+ if peek_pos < self.length:
356
+ return self.buffer[peek_pos]
357
+ return None
358
+
359
+ def _append_text_chunk(self, chunk: str, *, ends_with_cr: bool = False) -> None:
360
+ self._append_text(chunk)
361
+ self.ignore_lf = ends_with_cr
362
+
363
+ # ---------------------
364
+ # State handlers
365
+ # ---------------------
366
+
367
+ def _state_data(self) -> bool:
368
+ buffer = self.buffer
369
+ length = self.length
370
+ pos = self.pos
371
+ while True:
372
+ if self.reconsume:
373
+ # Note: reconsume is never True at EOF in DATA state
374
+ self.reconsume = False
375
+ self.pos -= 1
376
+ pos = self.pos
377
+
378
+ if pos >= length:
379
+ self.pos = length
380
+ self.current_char = None
381
+ self._flush_text()
382
+ self._emit_token(EOFToken())
383
+ return True
384
+
385
+ # Optimized loop using find
386
+ next_lt = buffer.find("<", pos)
387
+
388
+ if next_lt == -1:
389
+ next_lt = length
390
+
391
+ end = next_lt
392
+
393
+ if end > pos:
394
+ chunk = buffer[pos:end]
395
+
396
+ if "\r" in chunk:
397
+ chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
398
+
399
+ self._append_text(chunk)
400
+ self.ignore_lf = chunk.endswith("\r")
401
+
402
+ pos = end
403
+ self.pos = pos
404
+ if pos >= length:
405
+ continue
406
+
407
+ # After find("<"), we're always at '<' unless reconsume is True
408
+ # But reconsume only happens after TAG_OPEN which reconsumed '<'
409
+ c = buffer[pos]
410
+ pos += 1
411
+ self.pos = pos
412
+ self.current_char = c
413
+ self.ignore_lf = False
414
+ # c is always '<' here due to find() optimization above
415
+ # Optimization: Peek ahead for common tag starts
416
+ if pos < length:
417
+ nc = buffer[pos]
418
+ if ("a" <= nc <= "z") or ("A" <= nc <= "Z"):
419
+ self._flush_text()
420
+ # Inline _start_tag(Tag.START)
421
+ self.current_tag_kind = Tag.START
422
+ self.current_tag_name.clear()
423
+ self.current_attr_name.clear()
424
+ self.current_attr_value.clear()
425
+ self.current_attr_value_has_amp = False
426
+ self.current_tag_self_closing = False
427
+
428
+ if "A" <= nc <= "Z":
429
+ nc = chr(ord(nc) + 32)
430
+ self.current_tag_name.append(nc)
431
+ self.pos += 1
432
+ self.state = self.TAG_NAME
433
+ return self._state_tag_name()
434
+
435
+ if nc == "!":
436
+ # Optimization: Peek ahead for comments
437
+ if pos + 2 < length and buffer[pos + 1] == "-" and buffer[pos + 2] == "-":
438
+ self._flush_text()
439
+ self.pos += 3 # Consume !--
440
+ self.current_comment.clear()
441
+ self.state = self.COMMENT_START
442
+ return self._state_comment_start()
443
+
444
+ if nc == "/":
445
+ # Check next char for end tag
446
+ if pos + 1 < length:
447
+ nnc = buffer[pos + 1]
448
+ if ("a" <= nnc <= "z") or ("A" <= nnc <= "Z"):
449
+ self._flush_text()
450
+ # Inline _start_tag(Tag.END)
451
+ self.current_tag_kind = Tag.END
452
+ self.current_tag_name.clear()
453
+ self.current_attr_name.clear()
454
+ self.current_attr_value.clear()
455
+ self.current_attr_value_has_amp = False
456
+ self.current_tag_self_closing = False
457
+
458
+ if "A" <= nnc <= "Z":
459
+ nnc = chr(ord(nnc) + 32)
460
+ self.current_tag_name.append(nnc)
461
+ self.pos += 2 # Consume / and nnc
462
+ self.state = self.TAG_NAME
463
+ return self._state_tag_name()
464
+
465
+ self._flush_text()
466
+ self.state = self.TAG_OPEN
467
+ return self._state_tag_open()
468
+
469
+ def _state_tag_open(self) -> bool:
470
+ c = self._get_char()
471
+ if c is None:
472
+ self._emit_error("eof-before-tag-name")
473
+ self._append_text("<")
474
+ self._flush_text()
475
+ self._emit_token(EOFToken())
476
+ return True
477
+ if c == "!":
478
+ self.state = self.MARKUP_DECLARATION_OPEN
479
+ return False
480
+ if c == "/":
481
+ self.state = self.END_TAG_OPEN
482
+ return False
483
+ if c == "?":
484
+ self._emit_error("unexpected-question-mark-instead-of-tag-name")
485
+ self.current_comment.clear()
486
+ self._reconsume_current()
487
+ self.state = self.BOGUS_COMMENT
488
+ return False
489
+
490
+ self._emit_error("invalid-first-character-of-tag-name")
491
+ self._append_text("<")
492
+ self._reconsume_current()
493
+ self.state = self.DATA
494
+ return False
495
+
496
+ def _state_end_tag_open(self) -> bool:
497
+ c = self._get_char()
498
+ if c is None:
499
+ self._emit_error("eof-before-tag-name")
500
+ self._append_text("<")
501
+ self._append_text("/")
502
+ self._flush_text()
503
+ self._emit_token(EOFToken())
504
+ return True
505
+ if c == ">":
506
+ self._emit_error("empty-end-tag")
507
+ self.state = self.DATA
508
+ return False
509
+
510
+ self._emit_error("invalid-first-character-of-tag-name")
511
+ self.current_comment.clear()
512
+ self._reconsume_current()
513
+ self.state = self.BOGUS_COMMENT
514
+ return False
515
+
516
def _state_tag_name(self) -> bool:
    """Tag name state: accumulate the ASCII-lowercased tag name up to a terminator.

    A regex grabs a whole run of name characters at once; the character that
    ends the run is then handled directly (fast path). When no run was
    matched, a single character is fetched through ``_get_char`` (slow path).

    Returns:
        True when EOF is hit (after emitting the EOF token), False after a
        tag has been emitted, otherwise the result of the next state handler.
    """
    replacement = "\ufffd"
    append_tag_char = self.current_tag_name.append
    buffer = self.buffer
    length = self.length

    while True:
        # Inline _consume_tag_name_run
        # Note: reconsume and ignore_lf are never True when entering TAG_NAME
        pos = self.pos
        if pos < length:
            # Optimization: Check for common terminators before regex
            match = None
            if buffer[pos] not in "\t\n\f />\0\r":
                match = _TAG_NAME_RUN_PATTERN.match(buffer, pos)

            if match:
                chunk = match.group(0)
                # Only pay for the translate() when the run is not already lowercase.
                if not chunk.islower():
                    chunk = chunk.translate(_ASCII_LOWER_TABLE)
                append_tag_char(chunk)
                self.pos = match.end()

                # Fast path: handle the terminator that ended the run.
                if self.pos < length:
                    c = buffer[self.pos]
                    if c in (" ", "\t", "\n", "\f", "\r"):
                        self.pos += 1
                        if c == "\r":
                            # Flag the CR so an LF right after it can be skipped.
                            self.ignore_lf = True
                        self.state = self.BEFORE_ATTRIBUTE_NAME
                        return self._state_before_attribute_name()
                    if c == ">":
                        self.pos += 1
                        # Fall back to DATA only when emitting the tag returned a falsy result.
                        if not self._emit_current_tag():
                            self.state = self.DATA
                        return False
                    if c == "/":
                        self.pos += 1
                        self.state = self.SELF_CLOSING_START_TAG
                        return self._state_self_closing_start_tag()

        # Slow path: one character at a time.
        c = self._get_char()  # type: ignore[assignment]
        if c is None:
            self._emit_error("eof-in-tag")
            # Per HTML5 spec: EOF in tag name is a parse error, emit EOF token only
            # The incomplete tag is discarded (not emitted as text)
            self._emit_token(EOFToken())
            return True
        if c in ("\t", "\n", "\f", " "):
            self.state = self.BEFORE_ATTRIBUTE_NAME
            return self._state_before_attribute_name()
        if c == "/":
            self.state = self.SELF_CLOSING_START_TAG
            return self._state_self_closing_start_tag()
        if c == ">":
            # In slow path, tag name is only first char (from DATA),
            # so no rawtext elements possible - always set DATA state
            # NOTE(review): unlike the fast path, the result of
            # _emit_current_tag() is ignored here - confirm that is intended.
            self._emit_current_tag()
            self.state = self.DATA
            return False
        # c == "\0" - the only remaining possibility after fast-path
        self._emit_error("unexpected-null-character")
        append_tag_char(replacement)
579
+
580
def _state_before_attribute_name(self) -> bool:
    """Before attribute name state: skip whitespace, then end the tag or start an attribute.

    Returns:
        True when EOF is hit (after emitting the EOF token); False in every
        other case, leaving ``self.state`` set for the main loop.
    """
    buffer = self.buffer
    length = self.length

    while True:
        # Optimization: Skip whitespace
        # (not attempted while a reconsume or a pending CR/LF pair is outstanding)
        if not self.reconsume and not self.ignore_lf:
            if self.pos < length:
                # Check if current char is whitespace before running regex
                if buffer[self.pos] in " \t\n\f":
                    match = _WHITESPACE_PATTERN.match(buffer, self.pos)
                    if match:
                        self.pos = match.end()

        # Inline _get_char
        if self.reconsume:  # pragma: no cover
            self.reconsume = False
            c = self.current_char
        elif self.pos >= length:
            c = None
        else:
            c = buffer[self.pos]
            self.pos += 1

        self.current_char = c

        # Whitespace is skipped; each branch also clears the pending-LF flag.
        if c == " ":
            self.ignore_lf = False
            continue
        if c == "\n":
            if self.ignore_lf:
                self.ignore_lf = False
            # Line tracking now computed on-demand via _get_line_at_pos()
            continue
        if c == "\t" or c == "\f":
            self.ignore_lf = False
            continue
        if c == "\r":
            self.ignore_lf = False
            # Swallow the LF of a CRLF pair right here.
            if self.pos < length and buffer[self.pos] == "\n":
                self.pos += 1
            continue

        # NOTE(review): ignore_lf is not cleared on the non-whitespace paths
        # below - confirm later states cannot see a stale True value.
        if c is None:
            self._emit_error("eof-in-tag")
            self._flush_text()
            self._emit_token(EOFToken())
            return True

        if c == "/":
            self.state = self.SELF_CLOSING_START_TAG
            return False
        if c == ">":
            self._finish_attribute()
            # Fall back to DATA only when emitting the tag returned a falsy result.
            if not self._emit_current_tag():
                self.state = self.DATA
            return False
        if c == "=":
            # A leading '=' is reported, then used as the first character of the name.
            self._emit_error("unexpected-equals-sign-before-attribute-name")
            self.current_attr_name.clear()
            self.current_attr_value.clear()
            self.current_attr_value_has_amp = False
            self.current_attr_name.append("=")
            self.state = self.ATTRIBUTE_NAME
            return False  # Let main loop dispatch to avoid recursion

        # Any other character starts a fresh attribute.
        self.current_attr_name.clear()
        self.current_attr_value.clear()
        self.current_attr_value_has_amp = False
        if c == "\0":
            self._emit_error("unexpected-null-character")
            c = "\ufffd"
        elif "A" <= c <= "Z":
            # ASCII-lowercase the first character of the name.
            c = chr(ord(c) + 32)

        self.current_attr_name.append(c)
        self.state = self.ATTRIBUTE_NAME
        return False  # Let main loop dispatch to avoid recursion
658
+
659
def _state_attribute_name(self) -> bool:
    """Attribute name state: accumulate the ASCII-lowercased attribute name.

    A regex grabs a whole run of name characters; the terminator after the
    run is handled directly (fast path). Otherwise one character is fetched
    through ``_get_char`` (slow path).

    Returns:
        True when EOF is hit (after emitting the EOF token), False when the
        main loop should dispatch on the new ``self.state``, otherwise the
        result of the next state handler.
    """
    replacement = "\ufffd"
    append_attr_char = self.current_attr_name.append
    buffer = self.buffer
    length = self.length

    while True:
        # Inline _consume_attribute_name_run
        if not self.reconsume and not self.ignore_lf:
            pos = self.pos
            if pos < length:
                # Optimization: Check for common terminators before regex
                match = None
                if buffer[pos] not in "\t\n\f />=\0\"'<\r":
                    match = _ATTR_NAME_RUN_PATTERN.match(buffer, pos)

                if match:
                    chunk = match.group(0)
                    # Only pay for the translate() when the run is not already lowercase.
                    if not chunk.islower():
                        chunk = chunk.translate(_ASCII_LOWER_TABLE)
                    append_attr_char(chunk)
                    self.pos = match.end()

                    # Fast path: handle the terminator that ended the run.
                    if self.pos < length:
                        c = buffer[self.pos]
                        if c == "=":
                            self.pos += 1
                            self.state = self.BEFORE_ATTRIBUTE_VALUE
                            return self._state_before_attribute_value()
                        if c in (" ", "\t", "\n", "\f", "\r"):
                            self.pos += 1
                            if c == "\r":
                                # Flag the CR so an LF right after it can be skipped.
                                self.ignore_lf = True
                            self._finish_attribute()
                            self.state = self.AFTER_ATTRIBUTE_NAME
                            return False  # Let main loop dispatch to avoid recursion
                        if c == ">":
                            self.pos += 1
                            self._finish_attribute()
                            if not self._emit_current_tag():
                                self.state = self.DATA
                            return False
                        if c == "/":
                            self.pos += 1
                            self._finish_attribute()
                            self.state = self.SELF_CLOSING_START_TAG
                            return self._state_self_closing_start_tag()

        # Slow path: one character at a time.
        c = self._get_char()  # type: ignore[assignment]
        if c is None:
            self._emit_error("eof-in-tag")
            self._flush_text()
            self._emit_token(EOFToken())
            return True
        if c in ("\t", "\n", "\f", " "):
            self._finish_attribute()
            self.state = self.AFTER_ATTRIBUTE_NAME
            return False  # Let main loop dispatch to avoid recursion
        if c == "/":
            self._finish_attribute()
            self.state = self.SELF_CLOSING_START_TAG
            return self._state_self_closing_start_tag()
        if c == "=":
            self.state = self.BEFORE_ATTRIBUTE_VALUE
            return self._state_before_attribute_value()
        if c == ">":
            self._finish_attribute()
            if not self._emit_current_tag():
                self.state = self.DATA
            return False
        if c == "\0":
            self._emit_error("unexpected-null-character")
            append_attr_char(replacement)
            continue
        # Quotes and '<' are reported but still become part of the name.
        if c in ('"', "'", "<"):
            self._emit_error("unexpected-character-in-attribute-name")
        append_attr_char(c)
736
+
737
def _state_after_attribute_name(self) -> bool:
    """After attribute name state: skip whitespace, then dispatch on the next character.

    ``/``, ``=`` and ``>`` move to self-closing, before-attribute-value and
    tag emission respectively; any other character starts a new attribute.

    Returns:
        True when EOF is hit (after emitting the EOF token); False in every
        other case, leaving ``self.state`` set for the main loop.
    """
    buffer = self.buffer
    length = self.length

    while True:
        # Optimization: Skip whitespace
        if not self.reconsume and not self.ignore_lf:
            if self.pos < length:
                match = _WHITESPACE_PATTERN.match(buffer, self.pos)
                if match:
                    self.pos = match.end()

        # Inline _get_char
        # (this inlined copy reads straight from the buffer; it does not look at self.reconsume)
        if self.pos >= length:
            c = None
        else:
            c = buffer[self.pos]
            self.pos += 1

        self.current_char = c

        if c == " ":
            self.ignore_lf = False
            continue
        if c == "\n":
            # Note: Only reachable when ignore_lf=True (CR-LF handling)
            # Standalone \n is caught by whitespace optimization
            self.ignore_lf = False
            continue
        if c == "\r":
            # Flag the CR so an LF right after it is consumed by the branch above.
            self.ignore_lf = True
            continue
        if c == "\t" or c == "\f":
            self.ignore_lf = False
            continue

        # Non-whitespace from here on: no LF is pending any more.
        self.ignore_lf = False

        if c is None:
            self._emit_error("eof-in-tag")
            self._flush_text()
            self._emit_token(EOFToken())
            return True
        if c == "/":
            self._finish_attribute()
            self.state = self.SELF_CLOSING_START_TAG
            return False
        if c == "=":
            self.state = self.BEFORE_ATTRIBUTE_VALUE
            return False
        if c == ">":
            self._finish_attribute()
            if not self._emit_current_tag():
                self.state = self.DATA
            return False
        # Any other character: close the previous attribute and start a new one.
        self._finish_attribute()
        self.current_attr_name.clear()
        self.current_attr_value.clear()
        self.current_attr_value_has_amp = False
        if c == "\0":
            self._emit_error("unexpected-null-character")
            c = "\ufffd"
        elif "A" <= c <= "Z":
            # ASCII-lowercase the first character of the name.
            c = chr(ord(c) + 32)
        self.current_attr_name.append(c)
        self.state = self.ATTRIBUTE_NAME
        return False  # Let main loop dispatch to avoid recursion
804
+
805
+ def _state_before_attribute_value(self) -> bool:
806
+ while True:
807
+ c = self._get_char()
808
+ if c is None:
809
+ self._emit_error("eof-in-tag")
810
+ self._flush_text()
811
+ self._emit_token(EOFToken())
812
+ return True
813
+ if c in ("\t", "\n", "\f", " "):
814
+ continue
815
+ if c == '"':
816
+ self.state = self.ATTRIBUTE_VALUE_DOUBLE
817
+ return self._state_attribute_value_double()
818
+ if c == "'":
819
+ self.state = self.ATTRIBUTE_VALUE_SINGLE
820
+ return self._state_attribute_value_single()
821
+ if c == ">":
822
+ self._emit_error("missing-attribute-value")
823
+ self._finish_attribute()
824
+ if not self._emit_current_tag():
825
+ self.state = self.DATA
826
+ return False
827
+ self._reconsume_current()
828
+ self.state = self.ATTRIBUTE_VALUE_UNQUOTED
829
+ return self._state_attribute_value_unquoted()
830
+
831
def _state_attribute_value_double(self) -> bool:
    """Attribute value (double-quoted) state: collect the value up to the closing ``"``.

    Plain text is copied in chunks with CR and CRLF normalised to LF. ``&``
    is kept literally and flags the value via ``current_attr_value_has_amp``;
    NUL is reported and replaced with U+FFFD.

    Returns:
        True when EOF is hit (after emitting the EOF token), otherwise the
        result of the after-attribute-value (quoted) handler.
    """
    replacement = "\ufffd"
    stop_pattern = _ATTR_VALUE_DOUBLE_PATTERN
    buffer = self.buffer
    length = self.length

    while True:
        # Inline _consume_attribute_value_run
        pos = self.pos
        if pos < length:
            # Optimization: Optimistically look for quote
            next_quote = buffer.find('"', pos)
            if next_quote == -1:
                next_quote = length

            # Check if we skipped other terminators
            chunk = buffer[pos:next_quote]
            if "&" in chunk or "\0" in chunk:
                # Fallback to regex if complex chars present
                match = stop_pattern.search(buffer, pos)
                end = length if match is None else match.start()
            else:
                end = next_quote

            if end > pos:
                # chunk is already valid if we took the fast path
                if end != next_quote:
                    chunk = buffer[pos:end]

                # Normalize chunk for value if needed
                if "\r" in chunk:
                    chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")

                self.current_attr_value.append(chunk)
                self.pos = end

        # Inlined _get_char logic
        if self.pos >= length:
            self.current_char = None
            self._emit_error("eof-in-tag")
            self._emit_token(EOFToken())
            return True

        c = buffer[self.pos]
        self.pos += 1

        self.current_char = c

        if c == '"':
            self.state = self.AFTER_ATTRIBUTE_VALUE_QUOTED
            return self._state_after_attribute_value_quoted()
        if c == "&":
            # Keep the '&' and record that the value contains one.
            self._append_attr_value_char("&")
            self.current_attr_value_has_amp = True
        else:
            # c == "\0" - the only remaining possibility after fast-path
            self._emit_error("unexpected-null-character")
            self._append_attr_value_char(replacement)
889
+
890
def _state_attribute_value_single(self) -> bool:
    """Attribute value (single-quoted) state: collect the value up to the closing ``'``.

    Plain text is copied in chunks with CR and CRLF normalised to LF. ``&``
    is kept literally and flags the value via ``current_attr_value_has_amp``;
    NUL is reported and replaced with U+FFFD.

    Returns:
        True when EOF is hit (after emitting the EOF token), otherwise the
        result of the after-attribute-value (quoted) handler.
    """
    replacement = "\ufffd"
    stop_pattern = _ATTR_VALUE_SINGLE_PATTERN
    buffer = self.buffer
    length = self.length

    while True:
        # Inline _consume_attribute_value_run
        pos = self.pos
        if pos < length:
            # Optimization: Optimistically look for quote
            next_quote = buffer.find("'", pos)
            if next_quote == -1:
                next_quote = length

            # Check if we skipped other terminators
            chunk = buffer[pos:next_quote]
            if "&" in chunk or "\0" in chunk:
                # Fallback to regex if complex chars present
                match = stop_pattern.search(buffer, pos)
                end = length if match is None else match.start()
            else:
                end = next_quote

            if end > pos:
                # chunk is already valid if we took the fast path
                if end != next_quote:
                    chunk = buffer[pos:end]

                # Normalize chunk for value if needed
                if "\r" in chunk:
                    chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")

                self.current_attr_value.append(chunk)
                self.pos = end

        # Inlined _get_char logic
        if self.pos >= length:
            self.current_char = None
            self._emit_error("eof-in-tag")
            self._emit_token(EOFToken())
            return True

        c = buffer[self.pos]
        self.pos += 1

        self.current_char = c

        if c == "'":
            self.state = self.AFTER_ATTRIBUTE_VALUE_QUOTED
            return self._state_after_attribute_value_quoted()
        if c == "&":
            # Keep the '&' and record that the value contains one.
            self._append_attr_value_char("&")
            self.current_attr_value_has_amp = True
        else:
            # c == "\0" - the only remaining possibility after fast-path
            self._emit_error("unexpected-null-character")
            self._append_attr_value_char(replacement)
948
+
949
def _state_attribute_value_unquoted(self) -> bool:
    """Attribute value (unquoted) state: collect the value up to whitespace or ``>``.

    ``&`` is kept literally and flags the value via
    ``current_attr_value_has_amp``; quotes, ``<``, ``=`` and a backtick are
    reported but still appended; NUL is reported and replaced with U+FFFD.

    Returns:
        True when EOF is hit (after emitting the EOF token); False otherwise,
        leaving ``self.state`` set for the main loop.
    """
    replacement = "\ufffd"
    stop_pattern = _ATTR_VALUE_UNQUOTED_PATTERN
    buffer = self.buffer
    length = self.length

    while True:
        # Inline _consume_attribute_value_run
        # (skipped while a reconsumed character is still pending)
        if not self.reconsume:
            pos = self.pos
            if pos < length:
                match = stop_pattern.search(buffer, pos)
                # Note: the pattern only matches terminator characters; when none
                # is left in the buffer, search() returns None and the run extends to EOF
                end = match.start() if match else length

                if end > pos:
                    self.current_attr_value.append(buffer[pos:end])
                    self.pos = end

        c = self._get_char()
        if c is None:
            # Per HTML5 spec: EOF in attribute value is a parse error
            # The incomplete tag is discarded (not emitted)
            self._emit_error("eof-in-tag")
            self._emit_token(EOFToken())
            return True
        if c in ("\t", "\n", "\f", " "):
            self._finish_attribute()
            self.state = self.BEFORE_ATTRIBUTE_NAME
            return False
        if c == ">":
            self._finish_attribute()
            if not self._emit_current_tag():
                self.state = self.DATA
            return False
        if c == "&":
            # Keep the '&' and record that the value contains one.
            self._append_attr_value_char("&")
            self.current_attr_value_has_amp = True
            continue
        # These characters are reported but still become part of the value.
        if c in ('"', "'", "<", "=", "`"):
            self._emit_error("unexpected-character-in-unquoted-attribute-value")
        if c == "\0":
            self._emit_error("unexpected-null-character")
            self._append_attr_value_char(replacement)
            continue
        self._append_attr_value_char(c)
995
+
996
+ def _state_after_attribute_value_quoted(self) -> bool:
997
+ """After attribute value (quoted) state per HTML5 spec §13.2.5.42"""
998
+ c = self._get_char()
999
+ if c is None:
1000
+ self._emit_error("eof-in-tag")
1001
+ self._flush_text()
1002
+ self._emit_token(EOFToken())
1003
+ return True
1004
+ if c in ("\t", "\n", "\f", " "):
1005
+ self._finish_attribute()
1006
+ self.state = self.BEFORE_ATTRIBUTE_NAME
1007
+ return False
1008
+ if c == "/":
1009
+ self._finish_attribute()
1010
+ self.state = self.SELF_CLOSING_START_TAG
1011
+ return False
1012
+ if c == ">":
1013
+ self._finish_attribute()
1014
+ if not self._emit_current_tag():
1015
+ self.state = self.DATA
1016
+ return False
1017
+ # Anything else: parse error, reconsume in before attribute name state
1018
+ self._emit_error("missing-whitespace-between-attributes")
1019
+ self._finish_attribute()
1020
+ self._reconsume_current()
1021
+ self.state = self.BEFORE_ATTRIBUTE_NAME
1022
+ return False
1023
+
1024
+ def _state_self_closing_start_tag(self) -> bool:
1025
+ c = self._get_char()
1026
+ if c is None:
1027
+ self._emit_error("eof-in-tag")
1028
+ self._flush_text()
1029
+ self._emit_token(EOFToken())
1030
+ return True
1031
+ if c == ">":
1032
+ self.current_tag_self_closing = True
1033
+ self._emit_current_tag()
1034
+ self.state = self.DATA
1035
+ return False
1036
+ self._emit_error("unexpected-character-after-solidus-in-tag")
1037
+ self._reconsume_current()
1038
+ self.state = self.BEFORE_ATTRIBUTE_NAME
1039
+ return False
1040
+
1041
+ def _state_markup_declaration_open(self) -> bool:
1042
+ # Note: Comment handling (<!--) is optimized in DATA state fast-path
1043
+ # This code only handles DOCTYPE and CDATA, or malformed markup
1044
+ if self._consume_case_insensitive("DOCTYPE"):
1045
+ self.current_doctype_name.clear()
1046
+ self.current_doctype_public = None
1047
+ self.current_doctype_system = None
1048
+ self.current_doctype_force_quirks = False
1049
+ self.state = self.DOCTYPE
1050
+ return False
1051
+ if self._consume_if("[CDATA["):
1052
+ # CDATA sections are only valid in foreign content (SVG/MathML)
1053
+ # Check if the adjusted current node is in a foreign namespace
1054
+ stack = self.sink.open_elements
1055
+ if stack:
1056
+ current = stack[-1]
1057
+ if current and current.namespace not in {None, "html"}:
1058
+ # Proper CDATA section in foreign content
1059
+ self.state = self.CDATA_SECTION
1060
+ return False
1061
+ # Treat as bogus comment in HTML context, preserving "[CDATA[" prefix
1062
+ self._emit_error("cdata-in-html-content")
1063
+ self.current_comment.clear()
1064
+ # Add the consumed "[CDATA[" text to the comment
1065
+ for ch in "[CDATA[":
1066
+ self.current_comment.append(ch)
1067
+ self.state = self.BOGUS_COMMENT
1068
+ return False
1069
+ self._emit_error("incorrectly-opened-comment")
1070
+ self.current_comment.clear()
1071
+ # Don't reconsume - bogus comment starts from current position
1072
+ self.state = self.BOGUS_COMMENT
1073
+ return False
1074
+
1075
+ def _state_comment_start(self) -> bool:
1076
+ replacement = "\ufffd"
1077
+ c = self._get_char()
1078
+ if c is None:
1079
+ self._emit_error("eof-in-comment")
1080
+ self._emit_comment()
1081
+ self._emit_token(EOFToken())
1082
+ return True
1083
+ if c == "-":
1084
+ self.state = self.COMMENT_START_DASH
1085
+ return False
1086
+ if c == ">":
1087
+ self._emit_error("abrupt-closing-of-empty-comment")
1088
+ self._emit_comment()
1089
+ self.state = self.DATA
1090
+ return False
1091
+ if c == "\0":
1092
+ self._emit_error("unexpected-null-character")
1093
+ self.current_comment.append(replacement)
1094
+ else:
1095
+ self.current_comment.append(c)
1096
+ self.state = self.COMMENT
1097
+ return False
1098
+
1099
+ def _state_comment_start_dash(self) -> bool:
1100
+ replacement = "\ufffd"
1101
+ c = self._get_char()
1102
+ if c is None:
1103
+ self._emit_error("eof-in-comment")
1104
+ self._emit_comment()
1105
+ self._emit_token(EOFToken())
1106
+ return True
1107
+ if c == "-":
1108
+ self.state = self.COMMENT_END
1109
+ return False
1110
+ if c == ">":
1111
+ self._emit_error("abrupt-closing-of-empty-comment")
1112
+ self._emit_comment()
1113
+ self.state = self.DATA
1114
+ return False
1115
+ if c == "\0":
1116
+ self._emit_error("unexpected-null-character")
1117
+ self.current_comment.extend(("-", replacement))
1118
+ else:
1119
+ self.current_comment.extend(("-", c))
1120
+ self.state = self.COMMENT
1121
+ return False
1122
+
1123
+ def _state_comment(self) -> bool:
1124
+ replacement = "\ufffd"
1125
+ while True:
1126
+ if self._consume_comment_run():
1127
+ continue
1128
+ c = self._get_char()
1129
+ if c is None:
1130
+ self._emit_error("eof-in-comment")
1131
+ self._emit_comment()
1132
+ self._emit_token(EOFToken())
1133
+ return True
1134
+ if c == "-":
1135
+ self.state = self.COMMENT_END_DASH
1136
+ return False
1137
+ # c == "\0" - the only remaining possibility after _consume_comment_run
1138
+ self._emit_error("unexpected-null-character")
1139
+ self.current_comment.append(replacement)
1140
+
1141
+ def _state_comment_end_dash(self) -> bool:
1142
+ replacement = "\ufffd"
1143
+ c = self._get_char()
1144
+ if c is None:
1145
+ self._emit_error("eof-in-comment")
1146
+ self._emit_comment()
1147
+ self._emit_token(EOFToken())
1148
+ return True
1149
+ if c == "-":
1150
+ self.state = self.COMMENT_END
1151
+ return False
1152
+ if c == "\0":
1153
+ self._emit_error("unexpected-null-character")
1154
+ self.current_comment.extend(("-", replacement))
1155
+ self.state = self.COMMENT
1156
+ return False
1157
+ # Per spec: append "-" and current char, switch to COMMENT state
1158
+ self.current_comment.extend(("-", c))
1159
+ self.state = self.COMMENT
1160
+ return False
1161
+
1162
+ def _state_comment_end(self) -> bool:
1163
+ replacement = "\ufffd"
1164
+ c = self._get_char()
1165
+ if c is None:
1166
+ self._emit_error("eof-in-comment")
1167
+ self._emit_comment()
1168
+ self._emit_token(EOFToken())
1169
+ return True
1170
+ if c == ">":
1171
+ self._emit_comment()
1172
+ self.state = self.DATA
1173
+ return False
1174
+ if c == "!":
1175
+ self.state = self.COMMENT_END_BANG
1176
+ return False
1177
+ if c == "-":
1178
+ self.current_comment.append("-")
1179
+ return False
1180
+ if c == "\0":
1181
+ self._emit_error("unexpected-null-character")
1182
+ self.current_comment.extend(("--", replacement))
1183
+ self.state = self.COMMENT
1184
+ return False
1185
+ self._emit_error("incorrectly-closed-comment")
1186
+ self.current_comment.extend(("--", c))
1187
+ self.state = self.COMMENT
1188
+ return False
1189
+
1190
+ def _state_comment_end_bang(self) -> bool:
1191
+ replacement = "\ufffd"
1192
+ c = self._get_char()
1193
+ if c is None:
1194
+ self._emit_error("eof-in-comment")
1195
+ self._emit_comment()
1196
+ self._emit_token(EOFToken())
1197
+ return True
1198
+ if c == "-":
1199
+ self.current_comment.append("-")
1200
+ self.current_comment.append("-")
1201
+ self.current_comment.append("!")
1202
+ self.state = self.COMMENT_END_DASH
1203
+ return False
1204
+ if c == ">":
1205
+ self._emit_error("incorrectly-closed-comment")
1206
+ self._emit_comment()
1207
+ self.state = self.DATA
1208
+ return False
1209
+ if c == "\0":
1210
+ self._emit_error("unexpected-null-character")
1211
+ self.current_comment.append("-")
1212
+ self.current_comment.append("-")
1213
+ self.current_comment.append("!")
1214
+ self.current_comment.append(replacement)
1215
+ self.state = self.COMMENT
1216
+ return False
1217
+ self.current_comment.append("-")
1218
+ self.current_comment.append("-")
1219
+ self.current_comment.append("!")
1220
+ self.current_comment.append(c)
1221
+ self.state = self.COMMENT
1222
+ return False
1223
+
1224
+ def _state_bogus_comment(self) -> bool:
1225
+ replacement = "\ufffd"
1226
+ while True:
1227
+ c = self._get_char()
1228
+ if c is None:
1229
+ self._emit_comment()
1230
+ self._emit_token(EOFToken())
1231
+ return True
1232
+ if c == ">":
1233
+ self._emit_comment()
1234
+ self.state = self.DATA
1235
+ return False
1236
+ if c == "\0":
1237
+ self.current_comment.append(replacement)
1238
+ else:
1239
+ self.current_comment.append(c)
1240
+
1241
+ def _state_doctype(self) -> bool:
1242
+ c = self._get_char()
1243
+ if c is None:
1244
+ self._emit_error("eof-in-doctype")
1245
+ self.current_doctype_force_quirks = True
1246
+ self._emit_doctype()
1247
+ self._emit_token(EOFToken())
1248
+ return True
1249
+ if c in ("\t", "\n", "\f", " "):
1250
+ self.state = self.BEFORE_DOCTYPE_NAME
1251
+ return False
1252
+ if c == ">":
1253
+ self._emit_error("expected-doctype-name-but-got-right-bracket")
1254
+ self.current_doctype_force_quirks = True
1255
+ self._emit_doctype()
1256
+ self.state = self.DATA
1257
+ return False
1258
+ self._emit_error("missing-whitespace-before-doctype-name")
1259
+ self._reconsume_current()
1260
+ self.state = self.BEFORE_DOCTYPE_NAME
1261
+ return False
1262
+
1263
+ def _state_before_doctype_name(self) -> bool:
1264
+ while True:
1265
+ c = self._get_char()
1266
+ if c is None:
1267
+ self._emit_error("eof-in-doctype-name")
1268
+ self.current_doctype_force_quirks = True
1269
+ self._emit_doctype()
1270
+ self._emit_token(EOFToken())
1271
+ return True
1272
+ if c in ("\t", "\n", "\f", " "):
1273
+ return False
1274
+ if c == ">":
1275
+ self._emit_error("expected-doctype-name-but-got-right-bracket")
1276
+ self.current_doctype_force_quirks = True
1277
+ self._emit_doctype()
1278
+ self.state = self.DATA
1279
+ return False
1280
+ if "A" <= c <= "Z":
1281
+ self.current_doctype_name.append(chr(ord(c) + 32))
1282
+ elif c == "\0":
1283
+ self._emit_error("unexpected-null-character")
1284
+ self.current_doctype_name.append("\ufffd")
1285
+ else:
1286
+ self.current_doctype_name.append(c)
1287
+ self.state = self.DOCTYPE_NAME
1288
+ return False
1289
+
1290
+ def _state_doctype_name(self) -> bool:
1291
+ while True:
1292
+ c = self._get_char()
1293
+ if c is None:
1294
+ self._emit_error("eof-in-doctype-name")
1295
+ self.current_doctype_force_quirks = True
1296
+ self._emit_doctype()
1297
+ self._emit_token(EOFToken())
1298
+ return True
1299
+ if c in ("\t", "\n", "\f", " "):
1300
+ self.state = self.AFTER_DOCTYPE_NAME
1301
+ return False
1302
+ if c == ">":
1303
+ self._emit_doctype()
1304
+ self.state = self.DATA
1305
+ return False
1306
+ if "A" <= c <= "Z":
1307
+ self.current_doctype_name.append(chr(ord(c) + 32))
1308
+ continue
1309
+ if c == "\0":
1310
+ self._emit_error("unexpected-null-character")
1311
+ self.current_doctype_name.append("\ufffd")
1312
+ continue
1313
+ self.current_doctype_name.append(c)
1314
+
1315
+ def _state_after_doctype_name(self) -> bool:
1316
+ if self._consume_case_insensitive("PUBLIC"):
1317
+ self.state = self.AFTER_DOCTYPE_PUBLIC_KEYWORD
1318
+ return False
1319
+ if self._consume_case_insensitive("SYSTEM"):
1320
+ self.state = self.AFTER_DOCTYPE_SYSTEM_KEYWORD
1321
+ return False
1322
+ while True:
1323
+ c = self._get_char()
1324
+ if c is None:
1325
+ self._emit_error("eof-in-doctype")
1326
+ self.current_doctype_force_quirks = True
1327
+ self._emit_doctype()
1328
+ self._emit_token(EOFToken())
1329
+ return True
1330
+ if c in ("\t", "\n", "\f", " "):
1331
+ continue
1332
+ if c == ">":
1333
+ self._emit_doctype()
1334
+ self.state = self.DATA
1335
+ return False
1336
+ self._emit_error("missing-whitespace-after-doctype-name")
1337
+ self.current_doctype_force_quirks = True
1338
+ self._reconsume_current()
1339
+ self.state = self.BOGUS_DOCTYPE
1340
+ return False
1341
+
1342
+ def _state_after_doctype_public_keyword(self) -> bool:
1343
+ while True:
1344
+ c = self._get_char()
1345
+ if c is None:
1346
+ self._emit_error("missing-quote-before-doctype-public-identifier")
1347
+ self.current_doctype_force_quirks = True
1348
+ self._emit_doctype()
1349
+ self._emit_token(EOFToken())
1350
+ return True
1351
+ if c in ("\t", "\n", "\f", " "):
1352
+ self.state = self.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER
1353
+ return False
1354
+ if c == '"':
1355
+ self._emit_error("missing-whitespace-before-doctype-public-identifier")
1356
+ self.current_doctype_public = []
1357
+ self.state = self.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED
1358
+ return False
1359
+ if c == "'":
1360
+ self._emit_error("missing-whitespace-before-doctype-public-identifier")
1361
+ self.current_doctype_public = []
1362
+ self.state = self.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED
1363
+ return False
1364
+ if c == ">":
1365
+ self._emit_error("missing-doctype-public-identifier")
1366
+ self.current_doctype_force_quirks = True
1367
+ self._emit_doctype()
1368
+ self.state = self.DATA
1369
+ return False
1370
+ self._emit_error("unexpected-character-after-doctype-public-keyword")
1371
+ self.current_doctype_force_quirks = True
1372
+ self._reconsume_current()
1373
+ self.state = self.BOGUS_DOCTYPE
1374
+ return False
1375
+
1376
+ def _state_after_doctype_system_keyword(self) -> bool:
1377
+ while True:
1378
+ c = self._get_char()
1379
+ if c is None:
1380
+ self._emit_error("missing-quote-before-doctype-system-identifier")
1381
+ self.current_doctype_force_quirks = True
1382
+ self._emit_doctype()
1383
+ self._emit_token(EOFToken())
1384
+ return True
1385
+ if c in ("\t", "\n", "\f", " "):
1386
+ self.state = self.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER
1387
+ return False
1388
+ if c == '"':
1389
+ self._emit_error("missing-whitespace-after-doctype-public-identifier")
1390
+ self.current_doctype_system = []
1391
+ self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
1392
+ return False
1393
+ if c == "'":
1394
+ self._emit_error("missing-whitespace-after-doctype-public-identifier")
1395
+ self.current_doctype_system = []
1396
+ self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
1397
+ return False
1398
+ if c == ">":
1399
+ self._emit_error("missing-doctype-system-identifier")
1400
+ self.current_doctype_force_quirks = True
1401
+ self._emit_doctype()
1402
+ self.state = self.DATA
1403
+ return False
1404
+ self._emit_error("unexpected-character-after-doctype-system-keyword")
1405
+ self.current_doctype_force_quirks = True
1406
+ self._reconsume_current()
1407
+ self.state = self.BOGUS_DOCTYPE
1408
+ return False
1409
+
1410
+ def _state_before_doctype_public_identifier(self) -> bool:
1411
+ while True:
1412
+ c = self._get_char()
1413
+ if c is None:
1414
+ self._emit_error("missing-doctype-public-identifier")
1415
+ self.current_doctype_force_quirks = True
1416
+ self._emit_doctype()
1417
+ self._emit_token(EOFToken())
1418
+ return True
1419
+ if c in ("\t", "\n", "\f", " "):
1420
+ continue
1421
+ if c == '"':
1422
+ self.current_doctype_public = []
1423
+ self.state = self.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED
1424
+ return False
1425
+ if c == "'":
1426
+ self.current_doctype_public = []
1427
+ self.state = self.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED
1428
+ return False
1429
+ if c == ">":
1430
+ self._emit_error("missing-doctype-public-identifier")
1431
+ self.current_doctype_force_quirks = True
1432
+ self._emit_doctype()
1433
+ self.state = self.DATA
1434
+ return False
1435
+ self._emit_error("missing-quote-before-doctype-public-identifier")
1436
+ self.current_doctype_force_quirks = True
1437
+ self._reconsume_current()
1438
+ self.state = self.BOGUS_DOCTYPE
1439
+ return False
1440
+
1441
+ def _state_doctype_public_identifier_double_quoted(self) -> bool:
1442
+ if self.current_doctype_public is None: # pragma: no cover
1443
+ self.current_doctype_public = []
1444
+ while True:
1445
+ c = self._get_char()
1446
+ if c is None:
1447
+ self._emit_error("eof-in-doctype-public-identifier")
1448
+ self.current_doctype_force_quirks = True
1449
+ self._emit_doctype()
1450
+ self._emit_token(EOFToken())
1451
+ return True
1452
+ if c == '"':
1453
+ self.state = self.AFTER_DOCTYPE_PUBLIC_IDENTIFIER
1454
+ return False
1455
+ if c == "\0":
1456
+ self._emit_error("unexpected-null-character")
1457
+ self.current_doctype_public.append("\ufffd")
1458
+ continue
1459
+ if c == ">":
1460
+ self._emit_error("abrupt-doctype-public-identifier")
1461
+ self.current_doctype_force_quirks = True
1462
+ self._emit_doctype()
1463
+ self.state = self.DATA
1464
+ return False
1465
+ self.current_doctype_public.append(c)
1466
+
1467
+ def _state_doctype_public_identifier_single_quoted(self) -> bool:
1468
+ if self.current_doctype_public is None: # pragma: no cover
1469
+ self.current_doctype_public = []
1470
+ while True:
1471
+ c = self._get_char()
1472
+ if c is None:
1473
+ self._emit_error("eof-in-doctype-public-identifier")
1474
+ self.current_doctype_force_quirks = True
1475
+ self._emit_doctype()
1476
+ self._emit_token(EOFToken())
1477
+ return True
1478
+ if c == "'":
1479
+ self.state = self.AFTER_DOCTYPE_PUBLIC_IDENTIFIER
1480
+ return False
1481
+ if c == "\0":
1482
+ self._emit_error("unexpected-null-character")
1483
+ self.current_doctype_public.append("\ufffd")
1484
+ continue
1485
+ if c == ">":
1486
+ self._emit_error("abrupt-doctype-public-identifier")
1487
+ self.current_doctype_force_quirks = True
1488
+ self._emit_doctype()
1489
+ self.state = self.DATA
1490
+ return False
1491
+ self.current_doctype_public.append(c)
1492
+
1493
+ def _state_after_doctype_public_identifier(self) -> bool:
1494
+ while True:
1495
+ c = self._get_char()
1496
+ if c is None:
1497
+ self._emit_error("missing-whitespace-between-doctype-public-and-system-identifiers")
1498
+ self.current_doctype_force_quirks = True
1499
+ self._emit_doctype()
1500
+ self._emit_token(EOFToken())
1501
+ return True
1502
+ if c in ("\t", "\n", "\f", " "):
1503
+ self.state = self.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
1504
+ return False
1505
+ if c == ">":
1506
+ self._emit_doctype()
1507
+ self.state = self.DATA
1508
+ return False
1509
+ if c == '"':
1510
+ self._emit_error("missing-whitespace-between-doctype-public-and-system-identifiers")
1511
+ self.current_doctype_system = []
1512
+ self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
1513
+ return False
1514
+ if c == "'":
1515
+ self._emit_error("missing-whitespace-between-doctype-public-and-system-identifiers")
1516
+ self.current_doctype_system = []
1517
+ self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
1518
+ return False
1519
+ self._emit_error("unexpected-character-after-doctype-public-identifier")
1520
+ self.current_doctype_force_quirks = True
1521
+ self._reconsume_current()
1522
+ self.state = self.BOGUS_DOCTYPE
1523
+ return False
1524
+
1525
+ def _state_between_doctype_public_and_system_identifiers(self) -> bool:
1526
+ while True:
1527
+ c = self._get_char()
1528
+ if c is None:
1529
+ self._emit_error("missing-quote-before-doctype-system-identifier")
1530
+ self.current_doctype_force_quirks = True
1531
+ self._emit_doctype()
1532
+ self._emit_token(EOFToken())
1533
+ return True
1534
+ if c in ("\t", "\n", "\f", " "):
1535
+ continue
1536
+ if c == ">":
1537
+ self._emit_doctype()
1538
+ self.state = self.DATA
1539
+ return False
1540
+ if c == '"':
1541
+ self.current_doctype_system = []
1542
+ self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
1543
+ return False
1544
+ if c == "'":
1545
+ self.current_doctype_system = []
1546
+ self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
1547
+ return False
1548
+ self._emit_error("missing-quote-before-doctype-system-identifier")
1549
+ self.current_doctype_force_quirks = True
1550
+ self._reconsume_current()
1551
+ self.state = self.BOGUS_DOCTYPE
1552
+ return False
1553
+
1554
+ def _state_before_doctype_system_identifier(self) -> bool:
1555
+ while True:
1556
+ c = self._get_char()
1557
+ if c is None:
1558
+ self._emit_error("missing-doctype-system-identifier")
1559
+ self.current_doctype_force_quirks = True
1560
+ self._emit_doctype()
1561
+ self._emit_token(EOFToken())
1562
+ return True
1563
+ if c in ("\t", "\n", "\f", " "):
1564
+ continue
1565
+ if c == '"':
1566
+ self.current_doctype_system = []
1567
+ self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
1568
+ return False
1569
+ if c == "'":
1570
+ self.current_doctype_system = []
1571
+ self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
1572
+ return False
1573
+ if c == ">":
1574
+ self._emit_error("missing-doctype-system-identifier")
1575
+ self.current_doctype_force_quirks = True
1576
+ self._emit_doctype()
1577
+ self.state = self.DATA
1578
+ return False
1579
+ self._emit_error("missing-quote-before-doctype-system-identifier")
1580
+ self.current_doctype_force_quirks = True
1581
+ self._reconsume_current()
1582
+ self.state = self.BOGUS_DOCTYPE
1583
+ return False
1584
+
1585
+ def _state_doctype_system_identifier_double_quoted(self) -> bool:
1586
+ if self.current_doctype_system is None: # pragma: no cover
1587
+ self.current_doctype_system = []
1588
+ while True:
1589
+ c = self._get_char()
1590
+ if c is None:
1591
+ self._emit_error("eof-in-doctype-system-identifier")
1592
+ self.current_doctype_force_quirks = True
1593
+ self._emit_doctype()
1594
+ self._emit_token(EOFToken())
1595
+ return True
1596
+ if c == '"':
1597
+ self.state = self.AFTER_DOCTYPE_SYSTEM_IDENTIFIER
1598
+ return False
1599
+ if c == "\0":
1600
+ self._emit_error("unexpected-null-character")
1601
+ self.current_doctype_system.append("\ufffd")
1602
+ continue
1603
+ if c == ">":
1604
+ self._emit_error("abrupt-doctype-system-identifier")
1605
+ self.current_doctype_force_quirks = True
1606
+ self._emit_doctype()
1607
+ self.state = self.DATA
1608
+ return False
1609
+ self.current_doctype_system.append(c)
1610
+
1611
+ def _state_doctype_system_identifier_single_quoted(self) -> bool:
1612
+ if self.current_doctype_system is None: # pragma: no cover
1613
+ self.current_doctype_system = []
1614
+ while True:
1615
+ c = self._get_char()
1616
+ if c is None:
1617
+ self._emit_error("eof-in-doctype-system-identifier")
1618
+ self.current_doctype_force_quirks = True
1619
+ self._emit_doctype()
1620
+ self._emit_token(EOFToken())
1621
+ return True
1622
+ if c == "'":
1623
+ self.state = self.AFTER_DOCTYPE_SYSTEM_IDENTIFIER
1624
+ return False
1625
+ if c == "\0":
1626
+ self._emit_error("unexpected-null-character")
1627
+ self.current_doctype_system.append("\ufffd")
1628
+ continue
1629
+ if c == ">":
1630
+ self._emit_error("abrupt-doctype-system-identifier")
1631
+ self.current_doctype_force_quirks = True
1632
+ self._emit_doctype()
1633
+ self.state = self.DATA
1634
+ return False
1635
+ self.current_doctype_system.append(c)
1636
+
1637
+ def _state_after_doctype_system_identifier(self) -> bool:
1638
+ while True:
1639
+ c = self._get_char()
1640
+ if c is None:
1641
+ self._emit_error("eof-in-doctype")
1642
+ self.current_doctype_force_quirks = True
1643
+ self._emit_doctype()
1644
+ self._emit_token(EOFToken())
1645
+ return True
1646
+ if c in ("\t", "\n", "\f", " "):
1647
+ continue
1648
+ if c == ">":
1649
+ self._emit_doctype()
1650
+ self.state = self.DATA
1651
+ return False
1652
+ self._emit_error("unexpected-character-after-doctype-system-identifier")
1653
+ self._reconsume_current()
1654
+ self.state = self.BOGUS_DOCTYPE
1655
+ return False
1656
+
1657
+ def _state_bogus_doctype(self) -> bool:
1658
+ while True:
1659
+ c = self._get_char()
1660
+ if c is None:
1661
+ self._emit_doctype()
1662
+ self._emit_token(EOFToken())
1663
+ return True
1664
+ if c == ">":
1665
+ self._emit_doctype()
1666
+ self.state = self.DATA
1667
+ return False
1668
+
1669
+ # ---------------------
1670
+ # Low-level helpers
1671
+ # ---------------------
1672
+
1673
+ def _get_char(self) -> str | None:
1674
+ if self.reconsume:
1675
+ self.reconsume = False
1676
+ return self.current_char
1677
+
1678
+ buffer = self.buffer
1679
+ pos = self.pos
1680
+ length = self.length
1681
+ while True:
1682
+ if pos >= length:
1683
+ self.pos = pos
1684
+ self.current_char = None
1685
+ return None
1686
+
1687
+ c = buffer[pos]
1688
+ pos += 1
1689
+
1690
+ if c == "\r":
1691
+ self.ignore_lf = True
1692
+ self.current_char = "\n"
1693
+ self.pos = pos
1694
+ return "\n"
1695
+
1696
+ if c == "\n":
1697
+ if self.ignore_lf:
1698
+ self.ignore_lf = False
1699
+ continue
1700
+ # Line tracking now computed on-demand via _get_line_at_pos()
1701
+
1702
+ else:
1703
+ self.ignore_lf = False
1704
+
1705
+ self.current_char = c
1706
+ self.pos = pos
1707
+ return c
1708
+
1709
+ def _reconsume_current(self) -> None:
1710
+ self.reconsume = True
1711
+
1712
+ def _append_text(self, text: str) -> None:
1713
+ """Append text to buffer, recording start position if this is the first chunk."""
1714
+ if not self.text_buffer:
1715
+ # Record where text started (current position before this chunk)
1716
+ self.text_start_pos = self.pos
1717
+ self.text_buffer.append(text)
1718
+
1719
+ def _flush_text(self) -> None:
1720
+ if not self.text_buffer:
1721
+ return
1722
+
1723
+ # Optimization: Avoid join for single chunk
1724
+ # text_buffer is never populated with empty strings
1725
+ if len(self.text_buffer) == 1:
1726
+ data = self.text_buffer[0]
1727
+ else:
1728
+ data = "".join(self.text_buffer)
1729
+
1730
+ # Calculate raw text length before any processing for position tracking
1731
+ raw_len = len(data)
1732
+
1733
+ self.text_buffer.clear()
1734
+ if self.state == self.DATA and "\0" in data:
1735
+ count = data.count("\0")
1736
+ for _ in range(count):
1737
+ self._emit_error("unexpected-null-character")
1738
+
1739
+ # Per HTML5 spec:
1740
+ # - RCDATA state (title, textarea): decode character references
1741
+ # - RAWTEXT state (style, script, etc): do NOT decode
1742
+ # - PLAINTEXT state: do NOT decode
1743
+ # - CDATA sections: do NOT decode
1744
+ if self.state >= self.PLAINTEXT or self.CDATA_SECTION <= self.state <= self.CDATA_SECTION_END:
1745
+ pass
1746
+ elif self.state >= self.RAWTEXT:
1747
+ pass
1748
+ else:
1749
+ if "&" in data:
1750
+ data = decode_entities_in_text(data)
1751
+ # Apply XML coercion if enabled
1752
+ if self.opts.xml_coercion:
1753
+ data = _coerce_text_for_xml(data)
1754
+
1755
+ # Record position at END of raw text (1-indexed column = raw_len)
1756
+ self._record_text_end_position(raw_len)
1757
+ self.sink.process_characters(data)
1758
+ # Note: process_characters never returns Plaintext or RawData
1759
+ # State switches happen via _emit_current_tag instead
1760
+
1761
+ def _append_attr_value_char(self, c: str) -> None:
1762
+ self.current_attr_value.append(c)
1763
+
1764
+ def _finish_attribute(self) -> None:
1765
+ attr_name_buffer = self.current_attr_name
1766
+ if not attr_name_buffer:
1767
+ return
1768
+ if len(attr_name_buffer) == 1:
1769
+ name = attr_name_buffer[0]
1770
+ else:
1771
+ name = "".join(attr_name_buffer)
1772
+ attrs = self.current_tag_attrs
1773
+ is_duplicate = name in attrs
1774
+ attr_name_buffer.clear()
1775
+ attr_value_buffer = self.current_attr_value
1776
+ if is_duplicate:
1777
+ self._emit_error("duplicate-attribute")
1778
+ attr_value_buffer.clear()
1779
+ self.current_attr_value_has_amp = False
1780
+ return
1781
+ if not attr_value_buffer:
1782
+ value = ""
1783
+ elif len(attr_value_buffer) == 1:
1784
+ value = attr_value_buffer[0]
1785
+ else:
1786
+ value = "".join(attr_value_buffer)
1787
+ if self.current_attr_value_has_amp:
1788
+ value = decode_entities_in_text(value, in_attribute=True)
1789
+ attrs[name] = value
1790
+ attr_value_buffer.clear()
1791
+ self.current_attr_value_has_amp = False
1792
+
1793
def _emit_current_tag(self) -> bool:
    """Emit the tag being built and pick the tokenizer state for its content.

    The shared ``_tag_token`` object is filled in and passed to the sink.
    For start tags of the special text elements the tokenizer moves to
    RCDATA, RAWTEXT or PLAINTEXT before the sink sees the token, but only
    when the innermost open element is in the HTML namespace (or there is
    none).

    Returns:
        True when this call switched the tokenizer to one of those text
        states (callers must then leave ``self.state`` alone), else False.
    """
    parts = self.current_tag_name
    # Per the original note, at least one part is always present.
    name = parts[0] if len(parts) == 1 else "".join(parts)

    # Hand the attribute dict to the token and start a fresh one.
    attrs, self.current_tag_attrs = self.current_tag_attrs, {}

    token = self._tag_token
    token.kind = self.current_tag_kind
    token.name = name
    token.attrs = attrs
    token.self_closing = self.current_tag_self_closing

    switched = False
    if self.current_tag_kind == Tag.START:
        self.last_start_tag_name = name
        if name == "plaintext" or name in _RAWTEXT_SWITCH_TAGS:
            open_elements = self.sink.open_elements
            innermost = open_elements[-1] if open_elements else None
            namespace = innermost.namespace if innermost else None
            if namespace is None or namespace == "html":
                if name in _RCDATA_ELEMENTS:
                    self.state = self.RCDATA
                    self.rawtext_tag_name = name
                elif name in _RAWTEXT_SWITCH_TAGS:
                    self.state = self.RAWTEXT
                    self.rawtext_tag_name = name
                else:
                    # Only "plaintext" can reach this branch.
                    self.state = self.PLAINTEXT
                switched = True

    # Emit token to sink
    self._record_token_position()
    if self.sink.process_token(token) == 1:  # TokenSinkResult.Plaintext
        self.state = self.PLAINTEXT
        switched = True

    # Reset the per-tag scratch state for the next tag.
    self.current_tag_name.clear()
    self.current_attr_name.clear()
    self.current_attr_value.clear()
    self.current_tag_self_closing = False
    self.current_tag_kind = Tag.START
    return switched
1846
+
1847
+ def _emit_comment(self) -> None:
1848
+ data = "".join(self.current_comment)
1849
+ self.current_comment.clear()
1850
+ # Apply XML coercion if enabled
1851
+ if self.opts.xml_coercion:
1852
+ data = _coerce_comment_for_xml(data)
1853
+ self._comment_token.data = data
1854
+ self._emit_token(self._comment_token)
1855
+
1856
def _emit_doctype(self) -> None:
    """Build a Doctype from the collected parts, reset them and emit it.

    An empty name buffer yields ``name=None``. The public and system
    identifiers are None when never started and a (possibly empty) string
    otherwise.
    """
    name_parts = self.current_doctype_name
    public_parts = self.current_doctype_public
    system_parts = self.current_doctype_system

    doctype = Doctype(
        name="".join(name_parts) if name_parts else None,
        public_id=None if public_parts is None else "".join(public_parts),
        system_id=None if system_parts is None else "".join(system_parts),
        force_quirks=self.current_doctype_force_quirks,
    )

    # Reset the scratch state before handing the token over.
    name_parts.clear()
    self.current_doctype_public = None
    self.current_doctype_system = None
    self.current_doctype_force_quirks = False
    self._emit_token(DoctypeToken(doctype))
1872
+
1873
+ def _emit_token(self, token: Any) -> None:
1874
+ self._record_token_position()
1875
+ self.sink.process_token(token)
1876
+ # Note: process_token never returns Plaintext or RawData for state switches
1877
+ # State switches happen via _emit_current_tag checking sink response
1878
+
1879
+ def _record_token_position(self) -> None:
1880
+ """Record current position as 0-indexed column for the last emitted token.
1881
+
1882
+ Per the spec, the position should be at the end of the token (after the last char).
1883
+ """
1884
+ if not self.collect_errors:
1885
+ return
1886
+ # pos points after the last consumed character, which is exactly what we want
1887
+ pos = self.pos
1888
+ last_newline = self.buffer.rfind("\n", 0, pos)
1889
+ if last_newline == -1:
1890
+ column = pos # 0-indexed from start
1891
+ else:
1892
+ column = pos - last_newline - 1 # 0-indexed from after newline
1893
+ self.last_token_line = self._get_line_at_pos(pos)
1894
+ self.last_token_column = column
1895
+
1896
+ def _record_text_end_position(self, raw_len: int) -> None:
1897
+ """Record position at end of text token (after last character).
1898
+
1899
+ Uses text_start_pos + raw_len to compute where text ends, matching html5lib's
1900
+ behavior of reporting the column of the last character (1-indexed).
1901
+ """
1902
+ if not self.collect_errors:
1903
+ return
1904
+ # Position of last character of text (0-indexed)
1905
+ end_pos = self.text_start_pos + raw_len
1906
+ last_newline = self.buffer.rfind("\n", 0, end_pos)
1907
+ if last_newline == -1:
1908
+ column = end_pos # 1-indexed column = end_pos (position after last char)
1909
+ else:
1910
+ column = end_pos - last_newline - 1
1911
+ self.last_token_line = self._get_line_at_pos(end_pos)
1912
+ self.last_token_column = column
1913
+
1914
def _emit_error(self, code: str) -> None:
    """Append a ``ParseError`` for *code* to ``self.errors``.

    Does nothing unless ``self.collect_errors`` is set. The column is the
    1-indexed column of the character just before ``self.pos`` (clamped to
    the first character); the line is looked up for ``self.pos`` itself.

    Args:
        code: Parse error code; also used to build the human-readable message.
    """
    if not self.collect_errors:
        return

    char_index = max(0, self.pos - 1)
    # Include ``char_index`` itself in the search window. When no newline is
    # found rfind() returns -1, which turns the subtraction into
    # ``char_index + 1`` (1-indexed from the start of the input).
    newline_at = self.buffer.rfind("\n", 0, char_index + 1)
    column = char_index - newline_at

    message = generate_error_message(code)
    line = self._get_line_at_pos(self.pos)
    error = ParseError(code, line=line, column=column, message=message, source_html=self.buffer)
    self.errors.append(error)
1928
+
1929
+ def _consume_if(self, literal: str) -> bool:
1930
+ end = self.pos + len(literal)
1931
+ if end > self.length:
1932
+ return False
1933
+ segment = self.buffer[self.pos : end]
1934
+ if segment != literal:
1935
+ return False
1936
+ self.pos = end
1937
+ return True
1938
+
1939
+ def _consume_case_insensitive(self, literal: str) -> bool:
1940
+ end = self.pos + len(literal)
1941
+ if end > self.length:
1942
+ return False
1943
+ segment = self.buffer[self.pos : end]
1944
+ if segment.lower() != literal.lower():
1945
+ return False
1946
+ self.pos = end
1947
+ return True
1948
+
1949
def _consume_comment_run(self) -> bool:
    """Consume a run of ordinary comment characters in one step.

    A run is whatever ``_COMMENT_RUN_PATTERN`` matches at the current
    position. The run is appended to ``self.current_comment`` with CRLF and
    lone CR normalised to LF.

    Returns:
        True if at least one character was appended; False when the input is
        exhausted or the next character is not part of a run (``self.pos``
        may still have moved past a pending LF).
    """
    # Note: Comments are never reconsumed
    pos = self.pos
    length = self.length
    if pos >= length:
        return False

    # A CR seen earlier asked for the LF that follows it to be dropped.
    if self.ignore_lf and self.buffer[pos] == "\n":
        self.ignore_lf = False
        pos += 1
        self.pos = pos
        if pos >= length:
            return False

    match = _COMMENT_RUN_PATTERN.match(self.buffer, pos)
    if match is None:
        return False

    chunk = match.group(0)
    if "\r" in chunk:
        chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
        # The original computed ``chunk.endswith("\r")`` here, which is
        # always False once every CR has been replaced. False is also the
        # right value: "\n" belongs to the run, so a run can never stop
        # between a CR and its LF and no LF is left pending.
        self.ignore_lf = False
    self.current_comment.append(chunk)
    self.pos = match.end()
    return True
1975
+
1976
def _state_cdata_section(self) -> bool:
    """CDATA section state: copy characters to text until a ``]`` shows up.

    Returns:
        False after switching to ``CDATA_SECTION_BRACKET`` on ``]``; True
        after reporting ``eof-in-cdata`` and emitting the EOF token.
    """
    while True:
        ch = self._get_char()
        if ch == "]":
            self.state = self.CDATA_SECTION_BRACKET
            return False
        if ch is not None:
            self._append_text(ch)
            continue
        # Input ended inside the CDATA section.
        self._emit_error("eof-in-cdata")
        self._flush_text()
        self._emit_token(EOFToken())
        return True
1989
+
1990
def _state_cdata_section_bracket(self) -> bool:
    """CDATA state after a single ``]``: look for the second one.

    Returns:
        False after switching to ``CDATA_SECTION_END`` (second ``]``) or back
        to ``CDATA_SECTION`` (any other character, which is reconsumed);
        True after reporting ``eof-in-cdata`` and emitting the EOF token.
    """
    following = self._get_char()
    if following == "]":
        self.state = self.CDATA_SECTION_END
        return False

    # The bracket seen earlier was ordinary text after all.
    self._append_text("]")
    if following is not None:
        self._reconsume_current()
        self.state = self.CDATA_SECTION
        return False

    self._emit_error("eof-in-cdata")
    self._flush_text()
    self._emit_token(EOFToken())
    return True
2006
+
2007
def _state_cdata_section_end(self) -> bool:
    """CDATA state after ``]]``: a ``>`` closes the section.

    Returns:
        False after closing the section (state ``DATA``), after staying in
        this state on another ``]``, or after falling back to
        ``CDATA_SECTION``; True after reporting ``eof-in-cdata`` and emitting
        the EOF token.
    """
    following = self._get_char()
    if following == ">":
        self._flush_text()
        self.state = self.DATA
        return False

    # Not a terminator: the older of the two pending brackets becomes text.
    self._append_text("]")
    if following == "]":
        # Two brackets are pending again, so stay in this state.
        return False

    # EOF or an ordinary character: the second bracket is text as well.
    self._append_text("]")
    if following is None:
        self._emit_error("eof-in-cdata")
        self._flush_text()
        self._emit_token(EOFToken())
        return True

    self._reconsume_current()
    self.state = self.CDATA_SECTION
    return False
2032
+
2033
def _state_rcdata(self) -> bool:
    """RCDATA state: bulk-copy text up to the next ``<``, ``&`` or NUL.

    Plain runs are handed to ``_append_text_chunk``. A NUL is reported and
    replaced by U+FFFD, an ampersand is appended literally (the original
    note says it is decoded later by ``_flush_text``), and ``<`` switches to
    ``RCDATA_LESS_THAN_SIGN``.

    Returns:
        False after consuming ``<``; True after emitting the EOF token.
    """
    source = self.buffer
    limit = self.length
    cursor = self.pos
    while True:
        if self.reconsume:
            self.reconsume = False
            if self.current_char is None:
                self._flush_text()
                self._emit_token(EOFToken())
                return True
            self.pos -= 1
            cursor = self.pos

        lt_at = source.find("<", cursor)
        amp_at = source.find("&", cursor)
        nul_at = source.find("\0", cursor)

        # Nearest special character, or the end of input when none is left.
        stop = limit if lt_at == -1 else lt_at
        for candidate in (amp_at, nul_at):
            if -1 < candidate < stop:
                stop = candidate

        if cursor < stop:
            run = source[cursor:stop]
            self._append_text_chunk(run, ends_with_cr=run.endswith("\r"))
        cursor = stop
        self.pos = cursor

        if cursor >= limit:
            self._flush_text()
            self._emit_token(EOFToken())
            return True

        if nul_at != cursor and amp_at != cursor:
            # Only '<' can be left: it might open an end tag.
            cursor += 1
            self.pos = cursor
            self.state = self.RCDATA_LESS_THAN_SIGN
            return False

        if nul_at == cursor:
            self.ignore_lf = False
            self._emit_error("unexpected-null-character")
            self._append_text("\ufffd")
        else:
            self._append_text("&")
        cursor += 1
        self.pos = cursor
2093
+
2094
+ def _state_rcdata_less_than_sign(self) -> bool:
2095
+ c = self._get_char()
2096
+ if c == "/":
2097
+ self.current_tag_name.clear()
2098
+ self.state = self.RCDATA_END_TAG_OPEN
2099
+ return False
2100
+ self._append_text("<")
2101
+ self._reconsume_current()
2102
+ self.state = self.RCDATA
2103
+ return False
2104
+
2105
+ def _state_rcdata_end_tag_open(self) -> bool:
2106
+ c = self._get_char()
2107
+ if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2108
+ self.current_tag_name.append(c.lower())
2109
+ self.original_tag_name.append(c)
2110
+ self.state = self.RCDATA_END_TAG_NAME
2111
+ return False
2112
+ self.text_buffer.extend(("<", "/"))
2113
+ self._reconsume_current()
2114
+ self.state = self.RCDATA
2115
+ return False
2116
+
2117
def _state_rcdata_end_tag_name(self) -> bool:
    """RCDATA end tag name state: collect the name and test whether it closes.

    Letters are gathered until a non-letter (or EOF). If the lower-cased name
    equals ``self.rawtext_tag_name`` and is followed by ``>``, whitespace or
    ``/``, it is treated as a real end tag. Otherwise ``</`` plus the name
    as originally typed is emitted as text.

    Returns:
        True only after emitting the EOF token; False otherwise.
    """
    while True:
        ch = self._get_char()
        if ch is None or not ("A" <= ch <= "Z" or "a" <= ch <= "z"):
            break
        self.current_tag_name.append(ch.lower())
        self.original_tag_name.append(ch)

    name = "".join(self.current_tag_name)
    if name == self.rawtext_tag_name:
        if ch == ">":
            attrs: dict[str, str | None] = {}
            closing = Tag(Tag.END, name, attrs, False)
            self._flush_text()
            self._emit_token(closing)
            self.state = self.DATA
            self.rawtext_tag_name = None
            self.original_tag_name.clear()
            return False
        if ch == "/":
            self._flush_text()
            self.current_tag_kind = Tag.END
            self.current_tag_attrs = {}
            self.state = self.SELF_CLOSING_START_TAG
            return False
        if ch in (" ", "\t", "\n", "\r", "\f"):
            # Whitespace after the name: continue as an end tag with attributes.
            self.current_tag_kind = Tag.END
            self.current_tag_attrs = {}
            self.state = self.BEFORE_ATTRIBUTE_NAME
            return False

    # EOF or not a matching end tag: replay "</name" as text, original case.
    self.text_buffer.extend(("<", "/"))
    for letter in self.original_tag_name:
        self._append_text(letter)
    self.current_tag_name.clear()
    self.original_tag_name.clear()

    if ch is None:
        self._flush_text()
        self._emit_token(EOFToken())
        return True

    self._reconsume_current()
    self.state = self.RCDATA
    return False
2169
+
2170
+ def _state_rawtext(self) -> bool:
2171
+ buffer = self.buffer
2172
+ length = self.length
2173
+ pos = self.pos
2174
+ while True:
2175
+ if self.reconsume:
2176
+ self.reconsume = False
2177
+ if self.current_char is None:
2178
+ self._flush_text()
2179
+ self._emit_token(EOFToken())
2180
+ return True
2181
+ self.pos -= 1
2182
+ pos = self.pos
2183
+
2184
+ # Optimized loop using find
2185
+ lt_index = buffer.find("<", pos)
2186
+ null_index = buffer.find("\0", pos)
2187
+ next_special = lt_index if lt_index != -1 else length
2188
+ if null_index != -1 and null_index < next_special:
2189
+ if null_index > pos:
2190
+ chunk = buffer[pos:null_index]
2191
+ self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
2192
+ else:
2193
+ self.ignore_lf = False
2194
+ self._emit_error("unexpected-null-character")
2195
+ self._append_text("\ufffd")
2196
+ pos = null_index + 1
2197
+ self.pos = pos
2198
+ continue
2199
+ if lt_index == -1:
2200
+ if pos < length:
2201
+ chunk = buffer[pos:length]
2202
+ self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
2203
+ self.pos = length
2204
+ self._flush_text()
2205
+ self._emit_token(EOFToken())
2206
+ return True
2207
+ if lt_index > pos:
2208
+ chunk = buffer[pos:lt_index]
2209
+ self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
2210
+ pos = lt_index + 1
2211
+ self.pos = pos
2212
+ # Handle script escaped transition before treating '<' as markup boundary
2213
+ if self.rawtext_tag_name == "script":
2214
+ next1 = self._peek_char(0)
2215
+ next2 = self._peek_char(1)
2216
+ next3 = self._peek_char(2)
2217
+ if next1 == "!" and next2 == "-" and next3 == "-":
2218
+ self.text_buffer.extend(["<", "!", "-", "-"])
2219
+ self._get_char()
2220
+ self._get_char()
2221
+ self._get_char()
2222
+ self.state = self.SCRIPT_DATA_ESCAPED
2223
+ return False
2224
+ self.state = self.RAWTEXT_LESS_THAN_SIGN
2225
+ return False
2226
+
2227
+ def _state_rawtext_less_than_sign(self) -> bool:
2228
+ c = self._get_char()
2229
+ if c == "/":
2230
+ self.current_tag_name.clear()
2231
+ self.state = self.RAWTEXT_END_TAG_OPEN
2232
+ return False
2233
+ self._append_text("<")
2234
+ self._reconsume_current()
2235
+ self.state = self.RAWTEXT
2236
+ return False
2237
+
2238
+ def _state_rawtext_end_tag_open(self) -> bool:
2239
+ c = self._get_char()
2240
+ if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2241
+ self.current_tag_name.append(c.lower())
2242
+ self.original_tag_name.append(c)
2243
+ self.state = self.RAWTEXT_END_TAG_NAME
2244
+ return False
2245
+ self.text_buffer.extend(("<", "/"))
2246
+ self._reconsume_current()
2247
+ self.state = self.RAWTEXT
2248
+ return False
2249
+
2250
def _state_rawtext_end_tag_name(self) -> bool:
    """RAWTEXT end tag name state: collect the name and test whether it closes.

    Letters are gathered until a non-letter (or EOF). If the lower-cased name
    equals ``self.rawtext_tag_name`` and is followed by ``>``, whitespace or
    ``/``, it is treated as a real end tag. Otherwise ``</`` plus the name
    as originally typed is emitted as text.

    Returns:
        True only after emitting the EOF token; False otherwise.
    """
    while True:
        ch = self._get_char()
        if ch is None or not ("A" <= ch <= "Z" or "a" <= ch <= "z"):
            break
        self.current_tag_name.append(ch.lower())
        self.original_tag_name.append(ch)

    name = "".join(self.current_tag_name)
    if name == self.rawtext_tag_name:
        if ch == ">":
            attrs: dict[str, str | None] = {}
            closing = Tag(Tag.END, name, attrs, False)
            self._flush_text()
            self._emit_token(closing)
            self.state = self.DATA
            self.rawtext_tag_name = None
            self.original_tag_name.clear()
            return False
        if ch == "/":
            self._flush_text()
            self.current_tag_kind = Tag.END
            self.current_tag_attrs = {}
            self.state = self.SELF_CLOSING_START_TAG
            return False
        if ch in (" ", "\t", "\n", "\r", "\f"):
            # Whitespace after the name: continue as an end tag with attributes.
            self.current_tag_kind = Tag.END
            self.current_tag_attrs = {}
            self.state = self.BEFORE_ATTRIBUTE_NAME
            return False

    # EOF or not a matching end tag: replay "</name" as text, original case.
    self.text_buffer.extend(("<", "/"))
    for letter in self.original_tag_name:
        self._append_text(letter)
    self.current_tag_name.clear()
    self.original_tag_name.clear()

    if ch is None:
        self._flush_text()
        self._emit_token(EOFToken())
        return True

    self._reconsume_current()
    self.state = self.RAWTEXT
    return False
2302
+
2303
def _state_plaintext(self) -> bool:
    """PLAINTEXT state: everything left in the buffer is text.

    NUL characters are replaced by U+FFFD; one ``unexpected-null-character``
    error is reported if any were present.

    Returns:
        Always True, after flushing text and emitting the EOF token.
    """
    start = self.pos
    if start < self.length:
        tail = self.buffer[start:]
        if "\0" in tail:
            tail = tail.replace("\0", "\ufffd")
            self._emit_error("unexpected-null-character")
        self._append_text(tail)
        self.pos = self.length

    self._flush_text()
    self._emit_token(EOFToken())
    return True
2316
+
2317
def _state_script_data_escaped(self) -> bool:
    """Script data escaped state: handle one character inside ``<!-- ...``.

    Returns:
        True only after emitting the EOF token; False otherwise.
    """
    ch = self._get_char()
    if ch is None:
        self._flush_text()
        self._emit_token(EOFToken())
        return True

    if ch == "<":
        # Emitted (or not) by the less-than-sign state.
        self.state = self.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN
    elif ch == "\0":
        self._emit_error("unexpected-null-character")
        self._append_text("\ufffd")
    else:
        self._append_text(ch)
        if ch == "-":
            self.state = self.SCRIPT_DATA_ESCAPED_DASH
    return False
2336
+
2337
def _state_script_data_escaped_dash(self) -> bool:
    """Script data escaped state after a single ``-``.

    Returns:
        True only after emitting the EOF token; False otherwise.
    """
    ch = self._get_char()
    if ch is None:
        self._flush_text()
        self._emit_token(EOFToken())
        return True

    if ch == "<":
        self.state = self.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN
        return False

    if ch == "\0":
        self._emit_error("unexpected-null-character")
        self._append_text("\ufffd")
    else:
        self._append_text(ch)

    # A second dash moves on; anything else drops back to the plain
    # escaped state.
    if ch == "-":
        self.state = self.SCRIPT_DATA_ESCAPED_DASH_DASH
    else:
        self.state = self.SCRIPT_DATA_ESCAPED
    return False
2358
+
2359
+ def _state_script_data_escaped_dash_dash(self) -> bool:
2360
+ c = self._get_char()
2361
+ if c is None:
2362
+ self._flush_text()
2363
+ self._emit_token(EOFToken())
2364
+ return True
2365
+ if c == "-":
2366
+ self._append_text("-")
2367
+ return False
2368
+ if c == "<":
2369
+ self._append_text("<")
2370
+ self.state = self.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN
2371
+ return False
2372
+ if c == ">":
2373
+ self._append_text(">")
2374
+ self.state = self.RAWTEXT
2375
+ return False
2376
+ if c == "\0":
2377
+ self._emit_error("unexpected-null-character")
2378
+ self._append_text("\ufffd")
2379
+ self.state = self.SCRIPT_DATA_ESCAPED
2380
+ return False
2381
+ self._append_text(c)
2382
+ self.state = self.SCRIPT_DATA_ESCAPED
2383
+ return False
2384
+
2385
+ def _state_script_data_escaped_less_than_sign(self) -> bool:
2386
+ c = self._get_char()
2387
+ if c == "/":
2388
+ self.temp_buffer.clear()
2389
+ self.state = self.SCRIPT_DATA_ESCAPED_END_TAG_OPEN
2390
+ return False
2391
+ if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2392
+ self.temp_buffer.clear()
2393
+ self._append_text("<")
2394
+ self._reconsume_current()
2395
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPE_START
2396
+ return False
2397
+ self._append_text("<")
2398
+ self._reconsume_current()
2399
+ self.state = self.SCRIPT_DATA_ESCAPED
2400
+
2401
+ return False
2402
+
2403
+ def _state_script_data_escaped_end_tag_open(self) -> bool:
2404
+ c = self._get_char()
2405
+ if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2406
+ self.current_tag_name.clear()
2407
+ self.original_tag_name.clear()
2408
+ self._reconsume_current()
2409
+ self.state = self.SCRIPT_DATA_ESCAPED_END_TAG_NAME
2410
+ return False
2411
+ self.text_buffer.extend(("<", "/"))
2412
+ self._reconsume_current()
2413
+ self.state = self.SCRIPT_DATA_ESCAPED
2414
+ return False
2415
+
2416
def _state_script_data_escaped_end_tag_name(self) -> bool:
    """Script data escaped end tag name state.

    Collects one letter per call. On the first non-letter, the collected
    name is compared with ``self.rawtext_tag_name``. A matching name followed
    by ``>``, ``/`` or whitespace is handled as an end tag. Otherwise ``</``
    plus the letters held in ``temp_buffer`` become text and the character
    is reconsumed in ``SCRIPT_DATA_ESCAPED``.

    Returns:
        Always False.
    """
    ch = self._get_char()
    if ch is not None and ("A" <= ch <= "Z" or "a" <= ch <= "z"):
        self.current_tag_name.append(ch.lower())
        self.original_tag_name.append(ch)
        self.temp_buffer.append(ch)
        return False

    name = "".join(self.current_tag_name)
    if name == self.rawtext_tag_name:
        if ch == ">":
            self._flush_text()
            attrs: dict[str, str | None] = {}
            closing = Tag(Tag.END, name, attrs, False)
            self._emit_token(closing)
            self.state = self.DATA
            self.rawtext_tag_name = None
            self.current_tag_name.clear()
            self.original_tag_name.clear()
            return False
        if ch == "/":
            self._flush_text()
            self.current_tag_kind = Tag.END
            self.current_tag_attrs = {}
            self.state = self.SELF_CLOSING_START_TAG
            return False
        if ch in (" ", "\t", "\n", "\r", "\f"):
            self.current_tag_kind = Tag.END
            self.current_tag_attrs = {}
            self.state = self.BEFORE_ATTRIBUTE_NAME
            return False

    # Not an appropriate end tag: give the consumed characters back as text.
    self.text_buffer.extend(("<", "/"))
    for piece in self.temp_buffer:
        self._append_text(piece)
    self._reconsume_current()
    self.state = self.SCRIPT_DATA_ESCAPED
    return False
2456
+
2457
+ def _state_script_data_double_escape_start(self) -> bool:
2458
+ c = self._get_char()
2459
+ if c in (" ", "\t", "\n", "\r", "\f", "/", ">"):
2460
+ # Check if temp_buffer contains "script"
2461
+ temp = "".join(self.temp_buffer).lower()
2462
+ if temp == "script":
2463
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
2464
+ else:
2465
+ self.state = self.SCRIPT_DATA_ESCAPED
2466
+ self._append_text(c)
2467
+ return False
2468
+ if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2469
+ self.temp_buffer.append(c)
2470
+ self._append_text(c)
2471
+ return False
2472
+ self._reconsume_current()
2473
+ self.state = self.SCRIPT_DATA_ESCAPED
2474
+ return False
2475
+
2476
def _state_script_data_double_escaped(self) -> bool:
    """Script data double escaped state: every character is text.

    ``-`` and ``<`` are emitted as text and additionally move to the
    double-escaped dash / less-than-sign states. NUL is reported and
    replaced by U+FFFD.

    Returns:
        True only after emitting the EOF token; False otherwise.
    """
    ch = self._get_char()
    if ch is None:
        self._flush_text()
        self._emit_token(EOFToken())
        return True

    if ch == "\0":
        self._emit_error("unexpected-null-character")
        self._append_text("\ufffd")
        return False

    self._append_text(ch)
    if ch == "-":
        self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED_DASH
    elif ch == "<":
        self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN
    return False
2496
+
2497
def _state_script_data_double_escaped_dash(self) -> bool:
    """Script data double escaped state after a single ``-``.

    The character is emitted as text (NUL as U+FFFD, with an error). The
    next state is the dash-dash state for ``-``, the less-than-sign state
    for ``<``, and ``SCRIPT_DATA_DOUBLE_ESCAPED`` for everything else.

    Returns:
        True only after emitting the EOF token; False otherwise.
    """
    ch = self._get_char()
    if ch is None:
        self._flush_text()
        self._emit_token(EOFToken())
        return True

    if ch == "\0":
        self._emit_error("unexpected-null-character")
        self._append_text("\ufffd")
    else:
        self._append_text(ch)

    if ch == "-":
        self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH
    elif ch == "<":
        self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN
    else:
        self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
    return False
2519
+
2520
def _state_script_data_double_escaped_dash_dash(self) -> bool:
    """Script data double escaped state after ``--``.

    The character is emitted as text (NUL as U+FFFD, with an error).
    ``-`` keeps the state, ``<`` moves to the less-than-sign state, ``>``
    returns to ``RAWTEXT``, and anything else goes back to
    ``SCRIPT_DATA_DOUBLE_ESCAPED``.

    Returns:
        True only after emitting the EOF token; False otherwise.
    """
    ch = self._get_char()
    if ch is None:
        self._flush_text()
        self._emit_token(EOFToken())
        return True

    if ch == "\0":
        self._emit_error("unexpected-null-character")
        self._append_text("\ufffd")
    else:
        self._append_text(ch)

    if ch == "-":
        # Still inside a run of dashes; the state is unchanged.
        return False
    if ch == "<":
        self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN
    elif ch == ">":
        self.state = self.RAWTEXT
    else:
        self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
    return False
2547
+
2548
+ def _state_script_data_double_escaped_less_than_sign(self) -> bool:
2549
+ c = self._get_char()
2550
+ if c == "/":
2551
+ self.temp_buffer.clear()
2552
+ self._append_text("/")
2553
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPE_END
2554
+ return False
2555
+ if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2556
+ self.temp_buffer.clear()
2557
+ self._reconsume_current()
2558
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPE_START
2559
+ return False
2560
+ self._reconsume_current()
2561
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
2562
+ return False
2563
+
2564
+ def _state_script_data_double_escape_end(self) -> bool:
2565
+ c = self._get_char()
2566
+ if c in (" ", "\t", "\n", "\r", "\f", "/", ">"):
2567
+ # Check if temp_buffer contains "script"
2568
+ temp = "".join(self.temp_buffer).lower()
2569
+
2570
+ if temp == "script":
2571
+ self.state = self.SCRIPT_DATA_ESCAPED
2572
+ else:
2573
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
2574
+ self._append_text(c)
2575
+ return False
2576
+ if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2577
+ self.temp_buffer.append(c)
2578
+ self._append_text(c)
2579
+ return False
2580
+ self._reconsume_current()
2581
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
2582
+ return False
2583
+
2584
+
2585
+ # Table of the tokenizer's state-handler functions, attached to the class after its definition.
+ # NOTE(review): presumably indexed by the integer state constants (self.DATA, self.RCDATA, ...),
+ # so the order of entries must stay in step with those constants - confirm against the class body.
+ Tokenizer._STATE_HANDLERS = [ # type: ignore[attr-defined]
2586
+ Tokenizer._state_data,
2587
+ Tokenizer._state_tag_open,
2588
+ Tokenizer._state_end_tag_open,
2589
+ Tokenizer._state_tag_name,
2590
+ Tokenizer._state_before_attribute_name,
2591
+ Tokenizer._state_attribute_name,
2592
+ Tokenizer._state_after_attribute_name,
2593
+ Tokenizer._state_before_attribute_value,
2594
+ Tokenizer._state_attribute_value_double,
2595
+ Tokenizer._state_attribute_value_single,
2596
+ Tokenizer._state_attribute_value_unquoted,
2597
+ Tokenizer._state_after_attribute_value_quoted,
2598
+ Tokenizer._state_self_closing_start_tag,
2599
+ Tokenizer._state_markup_declaration_open,
2600
+ Tokenizer._state_comment_start,
2601
+ Tokenizer._state_comment_start_dash,
2602
+ Tokenizer._state_comment,
2603
+ Tokenizer._state_comment_end_dash,
2604
+ Tokenizer._state_comment_end,
2605
+ Tokenizer._state_comment_end_bang,
2606
+ Tokenizer._state_bogus_comment,
2607
+ Tokenizer._state_doctype,
2608
+ Tokenizer._state_before_doctype_name,
2609
+ Tokenizer._state_doctype_name,
2610
+ Tokenizer._state_after_doctype_name,
2611
+ Tokenizer._state_bogus_doctype,
2612
+ Tokenizer._state_after_doctype_public_keyword,
2613
+ Tokenizer._state_after_doctype_system_keyword,
2614
+ Tokenizer._state_before_doctype_public_identifier,
2615
+ Tokenizer._state_doctype_public_identifier_double_quoted,
2616
+ Tokenizer._state_doctype_public_identifier_single_quoted,
2617
+ Tokenizer._state_after_doctype_public_identifier,
2618
+ Tokenizer._state_between_doctype_public_and_system_identifiers,
2619
+ Tokenizer._state_before_doctype_system_identifier,
2620
+ Tokenizer._state_doctype_system_identifier_double_quoted,
2621
+ Tokenizer._state_doctype_system_identifier_single_quoted,
2622
+ Tokenizer._state_after_doctype_system_identifier,
2623
+ Tokenizer._state_cdata_section,
2624
+ Tokenizer._state_cdata_section_bracket,
2625
+ Tokenizer._state_cdata_section_end,
2626
+ Tokenizer._state_rcdata,
2627
+ Tokenizer._state_rcdata_less_than_sign,
2628
+ Tokenizer._state_rcdata_end_tag_open,
2629
+ Tokenizer._state_rcdata_end_tag_name,
2630
+ Tokenizer._state_rawtext,
2631
+ Tokenizer._state_rawtext_less_than_sign,
2632
+ Tokenizer._state_rawtext_end_tag_open,
2633
+ Tokenizer._state_rawtext_end_tag_name,
2634
+ Tokenizer._state_plaintext,
2635
+ Tokenizer._state_script_data_escaped,
2636
+ Tokenizer._state_script_data_escaped_dash,
2637
+ Tokenizer._state_script_data_escaped_dash_dash,
2638
+ Tokenizer._state_script_data_escaped_less_than_sign,
2639
+ Tokenizer._state_script_data_escaped_end_tag_open,
2640
+ Tokenizer._state_script_data_escaped_end_tag_name,
2641
+ Tokenizer._state_script_data_double_escape_start,
2642
+ Tokenizer._state_script_data_double_escaped,
2643
+ Tokenizer._state_script_data_double_escaped_dash,
2644
+ Tokenizer._state_script_data_double_escaped_dash_dash,
2645
+ Tokenizer._state_script_data_double_escaped_less_than_sign,
2646
+ Tokenizer._state_script_data_double_escape_end,
2647
+ ]