justhtml 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
justhtml/tokenizer.py ADDED
@@ -0,0 +1,2590 @@
1
+ import re
2
+ from bisect import bisect_right
3
+
4
+ from .entities import decode_entities_in_text
5
+ from .errors import generate_error_message
6
+ from .tokens import CommentToken, Doctype, DoctypeToken, EOFToken, ParseError, Tag
7
+
8
+ _ATTR_VALUE_UNQUOTED_TERMINATORS = "\t\n\f >&\"'<=`\r\0"
9
+ _ASCII_LOWER_TABLE = str.maketrans({chr(code): chr(code + 32) for code in range(65, 91)})
10
+ _RCDATA_ELEMENTS = {"title", "textarea"}
11
+ _RAWTEXT_SWITCH_TAGS = {
12
+ "script",
13
+ "style",
14
+ "xmp",
15
+ "iframe",
16
+ "noembed",
17
+ "noframes",
18
+ "textarea",
19
+ "title",
20
+ }
21
+
22
+ _ATTR_VALUE_DOUBLE_PATTERN = re.compile(r'["&\0]')
23
+ _ATTR_VALUE_SINGLE_PATTERN = re.compile(r"['&\0]")
24
+ _ATTR_VALUE_UNQUOTED_PATTERN = re.compile(f"[{re.escape(_ATTR_VALUE_UNQUOTED_TERMINATORS)}]")
25
+
26
+ _TAG_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />\0\r]+")
27
+ _ATTR_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />=\0\"'<\r]+")
28
+ _COMMENT_RUN_PATTERN = re.compile(r"[^-\0]+")
29
+ _WHITESPACE_PATTERN = re.compile(r"[ \t\n\f]+")
30
+
31
+ # XML Coercion Regex
32
+ _xml_invalid_single_chars = []
33
+ for _plane in range(17):
34
+ _base = _plane * 0x10000
35
+ _xml_invalid_single_chars.append(chr(_base + 0xFFFE))
36
+ _xml_invalid_single_chars.append(chr(_base + 0xFFFF))
37
+
38
+ _XML_COERCION_PATTERN = re.compile(r"[\f\uFDD0-\uFDEF" + "".join(_xml_invalid_single_chars) + "]")
39
+
40
+
41
+ def _xml_coercion_callback(match):
42
+ if match.group(0) == "\f":
43
+ return " "
44
+ return "\ufffd"
45
+
46
+
47
+ def _coerce_text_for_xml(text):
48
+ """Apply XML coercion to text content."""
49
+ # Fast path for ASCII
50
+ if text.isascii():
51
+ if "\f" in text:
52
+ return text.replace("\f", " ")
53
+ return text
54
+
55
+ if not _XML_COERCION_PATTERN.search(text):
56
+ return text
57
+ return _XML_COERCION_PATTERN.sub(_xml_coercion_callback, text)
58
+
59
+
60
+ def _coerce_comment_for_xml(text):
61
+ """Apply XML coercion to comment content - handle double hyphens."""
62
+ # Replace -- with - - (with space)
63
+ if "--" in text:
64
+ return text.replace("--", "- -")
65
+ return text
66
+
67
+
68
class TokenizerOpts:
    """Plain option holder for the tokenizer.

    The three flag options are normalised to real ``bool`` values; the two
    ``initial_*`` options are stored exactly as given.
    """

    __slots__ = ("discard_bom", "exact_errors", "initial_rawtext_tag", "initial_state", "xml_coercion")

    def __init__(
        self,
        exact_errors=False,
        discard_bom=True,
        initial_state=None,
        initial_rawtext_tag=None,
        xml_coercion=False,
    ):
        # Flags: coerced so later truth tests see genuine booleans.
        self.exact_errors, self.discard_bom, self.xml_coercion = (
            bool(exact_errors),
            bool(discard_bom),
            bool(xml_coercion),
        )
        # Pass-through values: kept as supplied (None means "not set").
        self.initial_state, self.initial_rawtext_tag = initial_state, initial_rawtext_tag
84
+
85
+
86
class Tokenizer:
    """HTML tokenizer implemented as an integer-indexed state machine.

    Each constant below names one tokenizer state. ``self.state`` holds the
    current state and ``step`` uses it to index ``_STATE_HANDLERS`` to find
    the handler to run.
    """

    # --- Data, tag and attribute states ---
    DATA = 0
    TAG_OPEN = 1
    END_TAG_OPEN = 2
    TAG_NAME = 3
    BEFORE_ATTRIBUTE_NAME = 4
    ATTRIBUTE_NAME = 5
    AFTER_ATTRIBUTE_NAME = 6
    BEFORE_ATTRIBUTE_VALUE = 7
    ATTRIBUTE_VALUE_DOUBLE = 8
    ATTRIBUTE_VALUE_SINGLE = 9
    ATTRIBUTE_VALUE_UNQUOTED = 10
    AFTER_ATTRIBUTE_VALUE_QUOTED = 11
    SELF_CLOSING_START_TAG = 12
    # --- Markup declarations and comments ---
    MARKUP_DECLARATION_OPEN = 13
    COMMENT_START = 14
    COMMENT_START_DASH = 15
    COMMENT = 16
    COMMENT_END_DASH = 17
    COMMENT_END = 18
    COMMENT_END_BANG = 19
    BOGUS_COMMENT = 20
    # --- DOCTYPE ---
    DOCTYPE = 21
    BEFORE_DOCTYPE_NAME = 22
    DOCTYPE_NAME = 23
    AFTER_DOCTYPE_NAME = 24
    BOGUS_DOCTYPE = 25
    AFTER_DOCTYPE_PUBLIC_KEYWORD = 26
    AFTER_DOCTYPE_SYSTEM_KEYWORD = 27
    BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 28
    DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 29
    DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 30
    AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 31
    BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 32
    BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 33
    DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 34
    DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 35
    AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 36
    # --- CDATA ---
    CDATA_SECTION = 37
    CDATA_SECTION_BRACKET = 38
    CDATA_SECTION_END = 39
    # --- RCDATA / RAWTEXT / PLAINTEXT ---
    RCDATA = 40
    RCDATA_LESS_THAN_SIGN = 41
    RCDATA_END_TAG_OPEN = 42
    RCDATA_END_TAG_NAME = 43
    RAWTEXT = 44
    RAWTEXT_LESS_THAN_SIGN = 45
    RAWTEXT_END_TAG_OPEN = 46
    RAWTEXT_END_TAG_NAME = 47
    PLAINTEXT = 48
    # --- Script data (escaped / double-escaped) ---
    SCRIPT_DATA_ESCAPED = 49
    SCRIPT_DATA_ESCAPED_DASH = 50
    SCRIPT_DATA_ESCAPED_DASH_DASH = 51
    SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 52
    SCRIPT_DATA_ESCAPED_END_TAG_OPEN = 53
    SCRIPT_DATA_ESCAPED_END_TAG_NAME = 54
    SCRIPT_DATA_DOUBLE_ESCAPE_START = 55
    SCRIPT_DATA_DOUBLE_ESCAPED = 56
    SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 57
    SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 58
    SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 59
    SCRIPT_DATA_DOUBLE_ESCAPE_END = 60

    # Every instance attribute must be declared here; the names are kept in
    # alphabetical order.
    __slots__ = (
        "_comment_token",
        "_newline_positions",
        "_state_handlers",
        "_tag_token",
        "buffer",
        "collect_errors",
        "current_attr_name",
        "current_attr_value",
        "current_attr_value_has_amp",
        "current_char",
        "current_comment",
        "current_doctype_force_quirks",
        "current_doctype_name",
        "current_doctype_public",
        "current_doctype_system",
        "current_tag_attrs",
        "current_tag_kind",
        "current_tag_name",
        "current_tag_self_closing",
        "errors",
        "ignore_lf",
        "last_start_tag_name",
        "last_token_column",
        "last_token_line",
        "length",
        "opts",
        "original_tag_name",
        "pos",
        "rawtext_tag_name",
        "reconsume",
        "sink",
        "state",
        "temp_buffer",
        "text_buffer",
        "text_start_pos",
    )
186
+
187
+ # _STATE_HANDLERS is defined at the end of the file
188
+
189
def __init__(self, sink, opts=None, collect_errors=False):
    """Create a tokenizer that reports to ``sink``.

    ``opts`` falls back to a default ``TokenizerOpts()`` when it is falsy.
    No input is loaded here; every field just receives its idle value.
    """
    # Collaborators and configuration.
    self.sink = sink
    self.opts = opts if opts else TokenizerOpts()
    self.collect_errors = collect_errors
    self.errors = []

    # Input buffer and cursor.
    self.buffer, self.length, self.pos = "", 0, 0
    self.current_char = ""
    self.reconsume = False
    self.ignore_lf = False
    self.state = self.DATA
    self.last_token_line, self.last_token_column = 1, 0

    # Reusable buffers to avoid per-token allocations.
    self.text_buffer, self.text_start_pos = [], 0
    self.temp_buffer = []
    self.original_tag_name = []

    # Tag under construction.
    self.current_tag_kind = Tag.START
    self.current_tag_name = []
    self.current_tag_attrs = {}
    self.current_tag_self_closing = False
    self.current_attr_name = []
    self.current_attr_value = []
    self.current_attr_value_has_amp = False
    self.last_start_tag_name = None
    self.rawtext_tag_name = None

    # Comment and DOCTYPE under construction.
    self.current_comment = []
    self.current_doctype_name = []
    self.current_doctype_public = None  # None = not set, [] = empty string
    self.current_doctype_system = None  # None = not set, [] = empty string
    self.current_doctype_force_quirks = False

    # Token objects created once up front.
    self._tag_token = Tag(Tag.START, "", {}, False)
    self._comment_token = CommentToken("")
226
+
227
def initialize(self, html):
    """Load ``html`` as the input buffer and reset all per-run state.

    A leading U+FEFF is dropped when ``opts.discard_bom`` is set. The start
    state comes from ``opts.initial_state`` when that is an ``int``, and is
    ``DATA`` otherwise. When ``collect_errors`` is true, the offsets of all
    ``"\\n"`` characters are recorded for later line lookups; otherwise the
    index is set to ``None``.
    """
    options = self.opts
    if html and html[0] == "\ufeff" and options.discard_bom:
        html = html[1:]
    source = html or ""

    # Input buffer and cursor.
    self.buffer = source
    self.length = len(source)
    self.pos = 0
    self.current_char = ""
    self.reconsume = False
    self.ignore_lf = False
    self.last_token_line = 1
    self.last_token_column = 0
    self.errors = []
    self.text_start_pos = 0

    # Reused list buffers are emptied in place instead of being reallocated.
    for scratch in (
        self.text_buffer,
        self.current_tag_name,
        self.current_attr_name,
        self.current_attr_value,
        self.current_comment,
        self.current_doctype_name,
        self.temp_buffer,
    ):
        scratch.clear()

    # Tag / DOCTYPE construction state.
    self.current_tag_attrs = {}
    self.current_attr_value_has_amp = False
    self.current_tag_self_closing = False
    self.current_tag_kind = Tag.START
    self.current_doctype_public = None
    self.current_doctype_system = None
    self.current_doctype_force_quirks = False
    self.rawtext_tag_name = options.initial_rawtext_tag
    self.last_start_tag_name = None

    # Reset the shared tag token object.
    shared_tag = self._tag_token
    shared_tag.kind = Tag.START
    shared_tag.name = ""
    shared_tag.attrs = {}
    shared_tag.self_closing = False

    requested_state = options.initial_state
    self.state = requested_state if isinstance(requested_state, int) else self.DATA

    # Pre-compute newline positions for O(log n) line lookups
    if not self.collect_errors:
        self._newline_positions = None
        return
    newline_offsets = []
    found = source.find("\n")
    while found != -1:
        newline_offsets.append(found)
        found = source.find("\n", found + 1)
    self._newline_positions = newline_offsets
280
+
281
+ def _get_line_at_pos(self, pos):
282
+ """Get line number (1-indexed) for a position using binary search."""
283
+ # Line number = count of newlines before pos + 1
284
+ return bisect_right(self._newline_positions, pos - 1) + 1
285
+
286
def step(self):
    """Run one step of the tokenizer state machine. Returns True if EOF reached.

    The handler is looked up in ``_STATE_HANDLERS`` by the current state and
    called with this tokenizer; its return value is passed straight back.
    """
    return self._STATE_HANDLERS[self.state](self)
290
+
291
def run(self, html):
    """Tokenize ``html`` from start to finish.

    Calls ``initialize`` once, then keeps calling ``step`` until it returns
    a true value.
    """
    self.initialize(html)
    finished = self.step()
    while not finished:
        finished = self.step()
296
+
297
+ # ---------------------
298
+ # Helper methods
299
+ # ---------------------
300
+
301
+ def _peek_char(self, offset):
302
+ """Peek ahead at character at current position + offset without consuming"""
303
+ peek_pos = self.pos + offset
304
+ if peek_pos < self.length:
305
+ return self.buffer[peek_pos]
306
+ return None
307
+
308
def _append_text_chunk(self, chunk, *, ends_with_cr=False):
    """Append ``chunk`` via ``_append_text`` and then set ``ignore_lf``.

    ``ends_with_cr`` is keyword-only and is stored verbatim in
    ``self.ignore_lf`` after the text has been appended.
    """
    self._append_text(chunk)
    self.ignore_lf = ends_with_cr
311
+
312
+ # ---------------------
313
+ # State handlers
314
+ # ---------------------
315
+
316
def _state_data(self):
    """DATA state: collect text up to the next ``<`` and dispatch on what follows.

    Returns True after emitting the EOF token. Otherwise it switches state
    and returns whatever the next handler (TAG_NAME, COMMENT_START or
    TAG_OPEN) returns, since those are called directly from here.
    """
    buffer = self.buffer
    length = self.length
    pos = self.pos
    while True:
        if self.reconsume:
            # Note: reconsume is never True at EOF in DATA state
            # Step the cursor back so the reconsumed character is read again.
            self.reconsume = False
            self.pos -= 1
            pos = self.pos

        if pos >= length:
            # End of input: flush pending text, then emit EOF.
            self.pos = length
            self.current_char = None
            self._flush_text()
            self._emit_token(EOFToken())
            return True

        # Optimized loop using find
        next_lt = buffer.find("<", pos)

        if next_lt == -1:
            # No more tags: the rest of the buffer is text.
            next_lt = length

        end = next_lt

        if end > pos:
            chunk = buffer[pos:end]

            # Normalize CRLF and lone CR to LF.
            if "\r" in chunk:
                chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")

            self._append_text(chunk)
            # Always False at this point: any CR in the chunk was just
            # rewritten to LF above.
            self.ignore_lf = chunk.endswith("\r")

            pos = end
            self.pos = pos
            if pos >= length:
                # Loop back so the EOF branch above handles end of input.
                continue

        # After find("<"), we're always at '<' unless reconsume is True
        # But reconsume only happens after TAG_OPEN which reconsumed '<'
        c = buffer[pos]
        pos += 1
        self.pos = pos
        self.current_char = c
        self.ignore_lf = False
        # c is always '<' here due to find() optimization above
        # Optimization: Peek ahead for common tag starts
        if pos < length:
            nc = buffer[pos]
            if ("a" <= nc <= "z") or ("A" <= nc <= "Z"):
                # "<" + ASCII letter: start tag.
                self._flush_text()
                # Inline _start_tag(Tag.START)
                self.current_tag_kind = Tag.START
                self.current_tag_name.clear()
                self.current_attr_name.clear()
                self.current_attr_value.clear()
                self.current_attr_value_has_amp = False
                self.current_tag_self_closing = False

                if "A" <= nc <= "Z":
                    # ASCII lowercase (offset 32 between "A" and "a").
                    nc = chr(ord(nc) + 32)
                self.current_tag_name.append(nc)
                self.pos += 1
                self.state = self.TAG_NAME
                return self._state_tag_name()

            if nc == "!":
                # Optimization: Peek ahead for comments
                if pos + 2 < length and buffer[pos + 1] == "-" and buffer[pos + 2] == "-":
                    self._flush_text()
                    self.pos += 3  # Consume !--
                    self.current_comment.clear()
                    self.state = self.COMMENT_START
                    return self._state_comment_start()

            if nc == "/":
                # Check next char for end tag
                if pos + 1 < length:
                    nnc = buffer[pos + 1]
                    if ("a" <= nnc <= "z") or ("A" <= nnc <= "Z"):
                        # "</" + ASCII letter: end tag.
                        self._flush_text()
                        # Inline _start_tag(Tag.END)
                        self.current_tag_kind = Tag.END
                        self.current_tag_name.clear()
                        self.current_attr_name.clear()
                        self.current_attr_value.clear()
                        self.current_attr_value_has_amp = False
                        self.current_tag_self_closing = False

                        if "A" <= nnc <= "Z":
                            nnc = chr(ord(nnc) + 32)
                        self.current_tag_name.append(nnc)
                        self.pos += 2  # Consume / and nnc
                        self.state = self.TAG_NAME
                        return self._state_tag_name()

        # Nothing matched a fast path: let TAG_OPEN handle what follows "<".
        self._flush_text()
        self.state = self.TAG_OPEN
        return self._state_tag_open()
417
+
418
+ def _state_tag_open(self):
419
+ c = self._get_char()
420
+ if c is None:
421
+ self._emit_error("eof-before-tag-name")
422
+ self._append_text("<")
423
+ self._flush_text()
424
+ self._emit_token(EOFToken())
425
+ return True
426
+ if c == "!":
427
+ self.state = self.MARKUP_DECLARATION_OPEN
428
+ return False
429
+ if c == "/":
430
+ self.state = self.END_TAG_OPEN
431
+ return False
432
+ if c == "?":
433
+ self._emit_error("unexpected-question-mark-instead-of-tag-name")
434
+ self.current_comment.clear()
435
+ self._reconsume_current()
436
+ self.state = self.BOGUS_COMMENT
437
+ return False
438
+
439
+ self._emit_error("invalid-first-character-of-tag-name")
440
+ self._append_text("<")
441
+ self._reconsume_current()
442
+ self.state = self.DATA
443
+ return False
444
+
445
+ def _state_end_tag_open(self):
446
+ c = self._get_char()
447
+ if c is None:
448
+ self._emit_error("eof-before-tag-name")
449
+ self._append_text("<")
450
+ self._append_text("/")
451
+ self._flush_text()
452
+ self._emit_token(EOFToken())
453
+ return True
454
+ if c == ">":
455
+ self._emit_error("empty-end-tag")
456
+ self.state = self.DATA
457
+ return False
458
+
459
+ self._emit_error("invalid-first-character-of-tag-name")
460
+ self.current_comment.clear()
461
+ self._reconsume_current()
462
+ self.state = self.BOGUS_COMMENT
463
+ return False
464
+
465
def _state_tag_name(self):
    """TAG_NAME state: accumulate the tag name and dispatch on its terminator.

    A regex consumes whole runs of ordinary name characters (lowercasing
    ASCII letters); terminators are then handled either inline or through
    ``_get_char``. Returns True after emitting EOF, False after emitting the
    tag, or the result of the next handler when one is called directly.
    """
    replacement = "\ufffd"
    append_tag_char = self.current_tag_name.append
    buffer = self.buffer
    length = self.length

    while True:
        # Inline _consume_tag_name_run
        # Note: reconsume and ignore_lf are never True when entering TAG_NAME
        pos = self.pos
        if pos < length:
            # Optimization: Check for common terminators before regex
            match = None
            if buffer[pos] not in "\t\n\f />\0\r":
                match = _TAG_NAME_RUN_PATTERN.match(buffer, pos)

            if match:
                chunk = match.group(0)
                # Only translate when needed; the table touches ASCII A-Z only.
                if not chunk.islower():
                    chunk = chunk.translate(_ASCII_LOWER_TABLE)
                append_tag_char(chunk)
                self.pos = match.end()

                # Handle the common terminators right here, without _get_char.
                if self.pos < length:
                    c = buffer[self.pos]
                    if c in (" ", "\t", "\n", "\f", "\r"):
                        self.pos += 1
                        if c == "\r":
                            # Flag set so a following LF can be skipped.
                            self.ignore_lf = True
                        self.state = self.BEFORE_ATTRIBUTE_NAME
                        return self._state_before_attribute_name()
                    if c == ">":
                        self.pos += 1
                        # Only fall back to DATA when _emit_current_tag returns
                        # a false value; a true value leaves self.state alone.
                        if not self._emit_current_tag():
                            self.state = self.DATA
                        return False
                    if c == "/":
                        self.pos += 1
                        self.state = self.SELF_CLOSING_START_TAG
                        return self._state_self_closing_start_tag()

        c = self._get_char()
        if c is None:
            self._emit_error("eof-in-tag")
            # Per HTML5 spec: EOF in tag name is a parse error, emit EOF token only
            # The incomplete tag is discarded (not emitted as text)
            self._emit_token(EOFToken())
            return True
        if c in ("\t", "\n", "\f", " "):
            self.state = self.BEFORE_ATTRIBUTE_NAME
            return self._state_before_attribute_name()
        if c == "/":
            self.state = self.SELF_CLOSING_START_TAG
            return self._state_self_closing_start_tag()
        if c == ">":
            # In slow path, tag name is only first char (from DATA),
            # so no rawtext elements possible - always set DATA state
            self._emit_current_tag()
            self.state = self.DATA
            return False
        # c == "\0" - the only remaining possibility after fast-path
        self._emit_error("unexpected-null-character")
        append_tag_char(replacement)
528
+
529
def _state_before_attribute_name(self):
    """BEFORE_ATTRIBUTE_NAME state: skip whitespace, then start an attribute or end the tag.

    Returns True after emitting EOF. All other exits return False and leave
    the next state in ``self.state`` for the caller to dispatch.
    """
    buffer = self.buffer
    length = self.length

    while True:
        # Optimization: Skip whitespace
        if not self.reconsume and not self.ignore_lf:
            if self.pos < length:
                # Check if current char is whitespace before running regex
                if buffer[self.pos] in " \t\n\f":
                    match = _WHITESPACE_PATTERN.match(buffer, self.pos)
                    if match:
                        self.pos = match.end()

        # Inline _get_char
        if self.reconsume:  # pragma: no cover
            self.reconsume = False
            c = self.current_char
        elif self.pos >= length:
            c = None
        else:
            c = buffer[self.pos]
            self.pos += 1

        self.current_char = c

        if c == " ":
            self.ignore_lf = False
            continue
        if c == "\n":
            if self.ignore_lf:
                self.ignore_lf = False
            # Line tracking now computed on-demand via _get_line_at_pos()
            continue
        if c == "\t" or c == "\f":
            self.ignore_lf = False
            continue
        if c == "\r":
            self.ignore_lf = False
            # Swallow the LF of a CRLF pair directly.
            if self.pos < length and buffer[self.pos] == "\n":
                self.pos += 1
            continue

        if c is None:
            self._emit_error("eof-in-tag")
            self._flush_text()
            self._emit_token(EOFToken())
            return True

        if c == "/":
            self.state = self.SELF_CLOSING_START_TAG
            return False
        if c == ">":
            self._finish_attribute()
            # A true return from _emit_current_tag leaves self.state untouched.
            if not self._emit_current_tag():
                self.state = self.DATA
            return False
        if c == "=":
            # Error case: "=" becomes the first character of a new attribute name.
            self._emit_error("unexpected-equals-sign-before-attribute-name")
            self.current_attr_name.clear()
            self.current_attr_value.clear()
            self.current_attr_value_has_amp = False
            self.current_attr_name.append("=")
            self.state = self.ATTRIBUTE_NAME
            return False  # Let main loop dispatch to avoid recursion

        # Any other character starts a fresh attribute.
        self.current_attr_name.clear()
        self.current_attr_value.clear()
        self.current_attr_value_has_amp = False
        if c == "\0":
            self._emit_error("unexpected-null-character")
            c = "\ufffd"
        elif "A" <= c <= "Z":
            # ASCII lowercase.
            c = chr(ord(c) + 32)

        self.current_attr_name.append(c)
        self.state = self.ATTRIBUTE_NAME
        return False  # Let main loop dispatch to avoid recursion
607
+
608
+ def _state_attribute_name(self):
609
+ replacement = "\ufffd"
610
+ append_attr_char = self.current_attr_name.append
611
+ buffer = self.buffer
612
+ length = self.length
613
+
614
+ while True:
615
+ # Inline _consume_attribute_name_run
616
+ if not self.reconsume and not self.ignore_lf:
617
+ pos = self.pos
618
+ if pos < length:
619
+ # Optimization: Check for common terminators before regex
620
+ match = None
621
+ if buffer[pos] not in "\t\n\f />=\0\"'<\r":
622
+ match = _ATTR_NAME_RUN_PATTERN.match(buffer, pos)
623
+
624
+ if match:
625
+ chunk = match.group(0)
626
+ if not chunk.islower():
627
+ chunk = chunk.translate(_ASCII_LOWER_TABLE)
628
+ append_attr_char(chunk)
629
+ self.pos = match.end()
630
+
631
+ if self.pos < length:
632
+ c = buffer[self.pos]
633
+ if c == "=":
634
+ self.pos += 1
635
+ self.state = self.BEFORE_ATTRIBUTE_VALUE
636
+ return self._state_before_attribute_value()
637
+ if c in (" ", "\t", "\n", "\f", "\r"):
638
+ self.pos += 1
639
+ if c == "\r":
640
+ self.ignore_lf = True
641
+ self._finish_attribute()
642
+ self.state = self.AFTER_ATTRIBUTE_NAME
643
+ return False # Let main loop dispatch to avoid recursion
644
+ if c == ">":
645
+ self.pos += 1
646
+ self._finish_attribute()
647
+ if not self._emit_current_tag():
648
+ self.state = self.DATA
649
+ return False
650
+ if c == "/":
651
+ self.pos += 1
652
+ self._finish_attribute()
653
+ self.state = self.SELF_CLOSING_START_TAG
654
+ return self._state_self_closing_start_tag()
655
+
656
+ c = self._get_char()
657
+ if c is None:
658
+ self._emit_error("eof-in-tag")
659
+ self._flush_text()
660
+ self._emit_token(EOFToken())
661
+ return True
662
+ if c in ("\t", "\n", "\f", " "):
663
+ self._finish_attribute()
664
+ self.state = self.AFTER_ATTRIBUTE_NAME
665
+ return False # Let main loop dispatch to avoid recursion
666
+ if c == "/":
667
+ self._finish_attribute()
668
+ self.state = self.SELF_CLOSING_START_TAG
669
+ return self._state_self_closing_start_tag()
670
+ if c == "=":
671
+ self.state = self.BEFORE_ATTRIBUTE_VALUE
672
+ return self._state_before_attribute_value()
673
+ if c == ">":
674
+ self._finish_attribute()
675
+ if not self._emit_current_tag():
676
+ self.state = self.DATA
677
+ return False
678
+ if c == "\0":
679
+ self._emit_error("unexpected-null-character")
680
+ append_attr_char(replacement)
681
+ continue
682
+ if c in ('"', "'", "<"):
683
+ self._emit_error("unexpected-character-in-attribute-name")
684
+ append_attr_char(c)
685
+
686
def _state_after_attribute_name(self):
    """AFTER_ATTRIBUTE_NAME state: skip whitespace after a name, then dispatch.

    Returns True after emitting EOF. All other exits return False and leave
    the next state in ``self.state`` for the caller to dispatch.
    """
    buffer = self.buffer
    length = self.length

    while True:
        # Optimization: Skip whitespace
        if not self.reconsume and not self.ignore_lf:
            if self.pos < length:
                match = _WHITESPACE_PATTERN.match(buffer, self.pos)
                if match:
                    self.pos = match.end()

        # Inline _get_char
        # (this inlined copy reads straight from the buffer and does not
        # consult self.reconsume)
        if self.pos >= length:
            c = None
        else:
            c = buffer[self.pos]
            self.pos += 1

        self.current_char = c

        if c == " ":
            self.ignore_lf = False
            continue
        if c == "\n":
            # Note: Only reachable when ignore_lf=True (CR-LF handling)
            # Standalone \n is caught by whitespace optimization
            self.ignore_lf = False
            continue
        if c == "\r":
            self.ignore_lf = True
            continue
        if c == "\t" or c == "\f":
            self.ignore_lf = False
            continue

        self.ignore_lf = False

        if c is None:
            self._emit_error("eof-in-tag")
            self._flush_text()
            self._emit_token(EOFToken())
            return True
        if c == "/":
            self._finish_attribute()
            self.state = self.SELF_CLOSING_START_TAG
            return False
        if c == "=":
            # NOTE(review): _state_attribute_name calls _finish_attribute()
            # before entering this state; confirm _finish_attribute leaves
            # things so that the value read next still attaches to that name.
            self.state = self.BEFORE_ATTRIBUTE_VALUE
            return False
        if c == ">":
            self._finish_attribute()
            # A true return from _emit_current_tag leaves self.state untouched.
            if not self._emit_current_tag():
                self.state = self.DATA
            return False
        # Any other character starts a new attribute.
        self._finish_attribute()
        self.current_attr_name.clear()
        self.current_attr_value.clear()
        self.current_attr_value_has_amp = False
        if c == "\0":
            self._emit_error("unexpected-null-character")
            c = "\ufffd"
        elif "A" <= c <= "Z":
            # ASCII lowercase.
            c = chr(ord(c) + 32)
        self.current_attr_name.append(c)
        self.state = self.ATTRIBUTE_NAME
        return False  # Let main loop dispatch to avoid recursion
753
+
754
+ def _state_before_attribute_value(self):
755
+ while True:
756
+ c = self._get_char()
757
+ if c is None:
758
+ self._emit_error("eof-in-tag")
759
+ self._flush_text()
760
+ self._emit_token(EOFToken())
761
+ return True
762
+ if c in ("\t", "\n", "\f", " "):
763
+ continue
764
+ if c == '"':
765
+ self.state = self.ATTRIBUTE_VALUE_DOUBLE
766
+ return self._state_attribute_value_double()
767
+ if c == "'":
768
+ self.state = self.ATTRIBUTE_VALUE_SINGLE
769
+ return self._state_attribute_value_single()
770
+ if c == ">":
771
+ self._emit_error("missing-attribute-value")
772
+ self._finish_attribute()
773
+ if not self._emit_current_tag():
774
+ self.state = self.DATA
775
+ return False
776
+ self._reconsume_current()
777
+ self.state = self.ATTRIBUTE_VALUE_UNQUOTED
778
+ return self._state_attribute_value_unquoted()
779
+
780
def _state_attribute_value_double(self):
    """ATTRIBUTE_VALUE_DOUBLE state: read a double-quoted attribute value.

    Text is consumed in runs up to the next ``"``, ``&`` or NUL. Returns True
    after emitting EOF, otherwise the result of the
    AFTER_ATTRIBUTE_VALUE_QUOTED handler once the closing quote is reached.
    """
    replacement = "\ufffd"
    stop_pattern = _ATTR_VALUE_DOUBLE_PATTERN
    buffer = self.buffer
    length = self.length

    while True:
        # Inline _consume_attribute_value_run
        pos = self.pos
        if pos < length:
            # Optimization: Optimistically look for quote
            next_quote = buffer.find('"', pos)
            if next_quote == -1:
                next_quote = length

            # Check if we skipped other terminators
            chunk = buffer[pos:next_quote]
            if "&" in chunk or "\0" in chunk:
                # Fallback to regex if complex chars present
                match = stop_pattern.search(buffer, pos)
                # Note: match is always found because we checked for & or \0 above
                end = match.start()
            else:
                end = next_quote

            if end > pos:
                # chunk is already valid if we took the fast path
                if end != next_quote:
                    chunk = buffer[pos:end]

                # Normalize chunk for value if needed
                if "\r" in chunk:
                    chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")

                self.current_attr_value.append(chunk)
                self.pos = end

        # Inlined _get_char logic
        if self.pos >= length:
            self.current_char = None
            self._emit_error("eof-in-tag")
            self._emit_token(EOFToken())
            return True

        c = buffer[self.pos]
        self.pos += 1

        self.current_char = c

        if c == '"':
            self.state = self.AFTER_ATTRIBUTE_VALUE_QUOTED
            return self._state_after_attribute_value_quoted()
        if c == "&":
            # Keep the "&" verbatim and record that the value contains one.
            self._append_attr_value_char("&")
            self.current_attr_value_has_amp = True
        else:
            # c == "\0" - the only remaining possibility after fast-path
            self._emit_error("unexpected-null-character")
            self._append_attr_value_char(replacement)
839
+
840
def _state_attribute_value_single(self):
    """ATTRIBUTE_VALUE_SINGLE state: read a single-quoted attribute value.

    Text is consumed in runs up to the next ``'``, ``&`` or NUL. Returns True
    after emitting EOF, otherwise the result of the
    AFTER_ATTRIBUTE_VALUE_QUOTED handler once the closing quote is reached.
    """
    replacement = "\ufffd"
    stop_pattern = _ATTR_VALUE_SINGLE_PATTERN
    buffer = self.buffer
    length = self.length

    while True:
        # Inline _consume_attribute_value_run
        pos = self.pos
        if pos < length:
            # Optimization: Optimistically look for quote
            next_quote = buffer.find("'", pos)
            if next_quote == -1:
                next_quote = length

            # Check if we skipped other terminators
            chunk = buffer[pos:next_quote]
            if "&" in chunk or "\0" in chunk:
                # Fallback to regex if complex chars present
                match = stop_pattern.search(buffer, pos)
                # Note: match is always found because we checked for & or \0 above
                end = match.start()
            else:
                end = next_quote

            if end > pos:
                # chunk is already valid if we took the fast path
                if end != next_quote:
                    chunk = buffer[pos:end]

                # Normalize chunk for value if needed
                if "\r" in chunk:
                    chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")

                self.current_attr_value.append(chunk)
                self.pos = end

        # Inlined _get_char logic
        if self.pos >= length:
            self.current_char = None
            self._emit_error("eof-in-tag")
            self._emit_token(EOFToken())
            return True

        c = buffer[self.pos]
        self.pos += 1

        self.current_char = c

        if c == "'":
            self.state = self.AFTER_ATTRIBUTE_VALUE_QUOTED
            return self._state_after_attribute_value_quoted()
        if c == "&":
            # Keep the "&" verbatim and record that the value contains one.
            self._append_attr_value_char("&")
            self.current_attr_value_has_amp = True
        else:
            # c == "\0" - the only remaining possibility after fast-path
            self._emit_error("unexpected-null-character")
            self._append_attr_value_char(replacement)
899
+
900
def _state_attribute_value_unquoted(self):
    """ATTRIBUTE_VALUE_UNQUOTED state: read an attribute value with no quotes.

    Runs of ordinary characters are appended in bulk; each terminator from
    ``_ATTR_VALUE_UNQUOTED_PATTERN`` is then handled one at a time. Returns
    True after emitting EOF and False once the value has ended.
    """
    replacement = "\ufffd"
    stop_pattern = _ATTR_VALUE_UNQUOTED_PATTERN
    buffer = self.buffer
    length = self.length

    while True:
        # Inline _consume_attribute_value_run
        # (skipped while a reconsumed character is pending)
        if not self.reconsume:
            pos = self.pos
            if pos < length:
                match = stop_pattern.search(buffer, pos)
                # No match means no terminator remains, so the run extends
                # to the end of the buffer.
                end = match.start() if match else length

                if end > pos:
                    self.current_attr_value.append(buffer[pos:end])
                    self.pos = end

        c = self._get_char()
        if c is None:
            # Per HTML5 spec: EOF in attribute value is a parse error
            # The incomplete tag is discarded (not emitted)
            self._emit_error("eof-in-tag")
            self._emit_token(EOFToken())
            return True
        if c in ("\t", "\n", "\f", " "):
            self._finish_attribute()
            self.state = self.BEFORE_ATTRIBUTE_NAME
            return False
        if c == ">":
            self._finish_attribute()
            # A true return from _emit_current_tag leaves self.state untouched.
            if not self._emit_current_tag():
                self.state = self.DATA
            return False
        if c == "&":
            # Keep the "&" verbatim and record that the value contains one.
            self._append_attr_value_char("&")
            self.current_attr_value_has_amp = True
            continue
        if c in ('"', "'", "<", "=", "`"):
            # Reported as an error, but the character is still appended below.
            self._emit_error("unexpected-character-in-unquoted-attribute-value")
        if c == "\0":
            self._emit_error("unexpected-null-character")
            self._append_attr_value_char(replacement)
            continue
        self._append_attr_value_char(c)
946
+
947
+ def _state_after_attribute_value_quoted(self):
948
+ """After attribute value (quoted) state per HTML5 spec §13.2.5.42"""
949
+ c = self._get_char()
950
+ if c is None:
951
+ self._emit_error("eof-in-tag")
952
+ self._flush_text()
953
+ self._emit_token(EOFToken())
954
+ return True
955
+ if c in ("\t", "\n", "\f", " "):
956
+ self._finish_attribute()
957
+ self.state = self.BEFORE_ATTRIBUTE_NAME
958
+ return False
959
+ if c == "/":
960
+ self._finish_attribute()
961
+ self.state = self.SELF_CLOSING_START_TAG
962
+ return False
963
+ if c == ">":
964
+ self._finish_attribute()
965
+ if not self._emit_current_tag():
966
+ self.state = self.DATA
967
+ return False
968
+ # Anything else: parse error, reconsume in before attribute name state
969
+ self._emit_error("missing-whitespace-between-attributes")
970
+ self._finish_attribute()
971
+ self._reconsume_current()
972
+ self.state = self.BEFORE_ATTRIBUTE_NAME
973
+ return False
974
+
975
+ def _state_self_closing_start_tag(self):
976
+ c = self._get_char()
977
+ if c is None:
978
+ self._emit_error("eof-in-tag")
979
+ self._flush_text()
980
+ self._emit_token(EOFToken())
981
+ return True
982
+ if c == ">":
983
+ self.current_tag_self_closing = True
984
+ self._emit_current_tag()
985
+ self.state = self.DATA
986
+ return False
987
+ self._emit_error("unexpected-character-after-solidus-in-tag")
988
+ self._reconsume_current()
989
+ self.state = self.BEFORE_ATTRIBUTE_NAME
990
+ return False
991
+
992
+ def _state_markup_declaration_open(self):
993
+ # Note: Comment handling (<!--) is optimized in DATA state fast-path
994
+ # This code only handles DOCTYPE and CDATA, or malformed markup
995
+ if self._consume_case_insensitive("DOCTYPE"):
996
+ self.current_doctype_name.clear()
997
+ self.current_doctype_public = None
998
+ self.current_doctype_system = None
999
+ self.current_doctype_force_quirks = False
1000
+ self.state = self.DOCTYPE
1001
+ return False
1002
+ if self._consume_if("[CDATA["):
1003
+ # CDATA sections are only valid in foreign content (SVG/MathML)
1004
+ # Check if the adjusted current node is in a foreign namespace
1005
+ stack = self.sink.open_elements
1006
+ if stack:
1007
+ current = stack[-1]
1008
+ if current and current.namespace not in {None, "html"}:
1009
+ # Proper CDATA section in foreign content
1010
+ self.state = self.CDATA_SECTION
1011
+ return False
1012
+ # Treat as bogus comment in HTML context, preserving "[CDATA[" prefix
1013
+ self._emit_error("cdata-in-html-content")
1014
+ self.current_comment.clear()
1015
+ # Add the consumed "[CDATA[" text to the comment
1016
+ for ch in "[CDATA[":
1017
+ self.current_comment.append(ch)
1018
+ self.state = self.BOGUS_COMMENT
1019
+ return False
1020
+ self._emit_error("incorrectly-opened-comment")
1021
+ self.current_comment.clear()
1022
+ # Don't reconsume - bogus comment starts from current position
1023
+ self.state = self.BOGUS_COMMENT
1024
+ return False
1025
+
1026
+ def _state_comment_start(self):
1027
+ replacement = "\ufffd"
1028
+ c = self._get_char()
1029
+ if c is None:
1030
+ self._emit_error("eof-in-comment")
1031
+ self._emit_comment()
1032
+ self._emit_token(EOFToken())
1033
+ return True
1034
+ if c == "-":
1035
+ self.state = self.COMMENT_START_DASH
1036
+ return False
1037
+ if c == ">":
1038
+ self._emit_error("abrupt-closing-of-empty-comment")
1039
+ self._emit_comment()
1040
+ self.state = self.DATA
1041
+ return False
1042
+ if c == "\0":
1043
+ self._emit_error("unexpected-null-character")
1044
+ self.current_comment.append(replacement)
1045
+ else:
1046
+ self.current_comment.append(c)
1047
+ self.state = self.COMMENT
1048
+ return False
1049
+
1050
+ def _state_comment_start_dash(self):
1051
+ replacement = "\ufffd"
1052
+ c = self._get_char()
1053
+ if c is None:
1054
+ self._emit_error("eof-in-comment")
1055
+ self._emit_comment()
1056
+ self._emit_token(EOFToken())
1057
+ return True
1058
+ if c == "-":
1059
+ self.state = self.COMMENT_END
1060
+ return False
1061
+ if c == ">":
1062
+ self._emit_error("abrupt-closing-of-empty-comment")
1063
+ self._emit_comment()
1064
+ self.state = self.DATA
1065
+ return False
1066
+ if c == "\0":
1067
+ self._emit_error("unexpected-null-character")
1068
+ self.current_comment.extend(("-", replacement))
1069
+ else:
1070
+ self.current_comment.extend(("-", c))
1071
+ self.state = self.COMMENT
1072
+ return False
1073
+
1074
+ def _state_comment(self):
1075
+ replacement = "\ufffd"
1076
+ while True:
1077
+ if self._consume_comment_run():
1078
+ continue
1079
+ c = self._get_char()
1080
+ if c is None:
1081
+ self._emit_error("eof-in-comment")
1082
+ self._emit_comment()
1083
+ self._emit_token(EOFToken())
1084
+ return True
1085
+ if c == "-":
1086
+ self.state = self.COMMENT_END_DASH
1087
+ return False
1088
+ # c == "\0" - the only remaining possibility after _consume_comment_run
1089
+ self._emit_error("unexpected-null-character")
1090
+ self.current_comment.append(replacement)
1091
+
1092
+ def _state_comment_end_dash(self):
1093
+ replacement = "\ufffd"
1094
+ c = self._get_char()
1095
+ if c is None:
1096
+ self._emit_error("eof-in-comment")
1097
+ self._emit_comment()
1098
+ self._emit_token(EOFToken())
1099
+ return True
1100
+ if c == "-":
1101
+ self.state = self.COMMENT_END
1102
+ return False
1103
+ if c == "\0":
1104
+ self._emit_error("unexpected-null-character")
1105
+ self.current_comment.extend(("-", replacement))
1106
+ self.state = self.COMMENT
1107
+ return False
1108
+ # Per spec: append "-" and current char, switch to COMMENT state
1109
+ self.current_comment.extend(("-", c))
1110
+ self.state = self.COMMENT
1111
+ return False
1112
+
1113
+ def _state_comment_end(self):
1114
+ replacement = "\ufffd"
1115
+ c = self._get_char()
1116
+ if c is None:
1117
+ self._emit_error("eof-in-comment")
1118
+ self._emit_comment()
1119
+ self._emit_token(EOFToken())
1120
+ return True
1121
+ if c == ">":
1122
+ self._emit_comment()
1123
+ self.state = self.DATA
1124
+ return False
1125
+ if c == "!":
1126
+ self.state = self.COMMENT_END_BANG
1127
+ return False
1128
+ if c == "-":
1129
+ self.current_comment.append("-")
1130
+ return False
1131
+ if c == "\0":
1132
+ self._emit_error("unexpected-null-character")
1133
+ self.current_comment.extend(("--", replacement))
1134
+ self.state = self.COMMENT
1135
+ return False
1136
+ self._emit_error("incorrectly-closed-comment")
1137
+ self.current_comment.extend(("--", c))
1138
+ self.state = self.COMMENT
1139
+ return False
1140
+
1141
+ def _state_comment_end_bang(self):
1142
+ replacement = "\ufffd"
1143
+ c = self._get_char()
1144
+ if c is None:
1145
+ self._emit_error("eof-in-comment")
1146
+ self._emit_comment()
1147
+ self._emit_token(EOFToken())
1148
+ return True
1149
+ if c == "-":
1150
+ self.current_comment.append("-")
1151
+ self.current_comment.append("-")
1152
+ self.current_comment.append("!")
1153
+ self.state = self.COMMENT_END_DASH
1154
+ return False
1155
+ if c == ">":
1156
+ self._emit_error("incorrectly-closed-comment")
1157
+ self._emit_comment()
1158
+ self.state = self.DATA
1159
+ return False
1160
+ if c == "\0":
1161
+ self._emit_error("unexpected-null-character")
1162
+ self.current_comment.append("-")
1163
+ self.current_comment.append("-")
1164
+ self.current_comment.append("!")
1165
+ self.current_comment.append(replacement)
1166
+ self.state = self.COMMENT
1167
+ return False
1168
+ self.current_comment.append("-")
1169
+ self.current_comment.append("-")
1170
+ self.current_comment.append("!")
1171
+ self.current_comment.append(c)
1172
+ self.state = self.COMMENT
1173
+ return False
1174
+
1175
+ def _state_bogus_comment(self):
1176
+ replacement = "\ufffd"
1177
+ while True:
1178
+ c = self._get_char()
1179
+ if c is None:
1180
+ self._emit_comment()
1181
+ self._emit_token(EOFToken())
1182
+ return True
1183
+ if c == ">":
1184
+ self._emit_comment()
1185
+ self.state = self.DATA
1186
+ return False
1187
+ if c == "\0":
1188
+ self.current_comment.append(replacement)
1189
+ else:
1190
+ self.current_comment.append(c)
1191
+
1192
+ def _state_doctype(self):
1193
+ c = self._get_char()
1194
+ if c is None:
1195
+ self._emit_error("eof-in-doctype")
1196
+ self.current_doctype_force_quirks = True
1197
+ self._emit_doctype()
1198
+ self._emit_token(EOFToken())
1199
+ return True
1200
+ if c in ("\t", "\n", "\f", " "):
1201
+ self.state = self.BEFORE_DOCTYPE_NAME
1202
+ return False
1203
+ if c == ">":
1204
+ self._emit_error("expected-doctype-name-but-got-right-bracket")
1205
+ self.current_doctype_force_quirks = True
1206
+ self._emit_doctype()
1207
+ self.state = self.DATA
1208
+ return False
1209
+ self._emit_error("missing-whitespace-before-doctype-name")
1210
+ self._reconsume_current()
1211
+ self.state = self.BEFORE_DOCTYPE_NAME
1212
+ return False
1213
+
1214
+ def _state_before_doctype_name(self):
1215
+ while True:
1216
+ c = self._get_char()
1217
+ if c is None:
1218
+ self._emit_error("eof-in-doctype-name")
1219
+ self.current_doctype_force_quirks = True
1220
+ self._emit_doctype()
1221
+ self._emit_token(EOFToken())
1222
+ return True
1223
+ if c in ("\t", "\n", "\f", " "):
1224
+ return False
1225
+ if c == ">":
1226
+ self._emit_error("expected-doctype-name-but-got-right-bracket")
1227
+ self.current_doctype_force_quirks = True
1228
+ self._emit_doctype()
1229
+ self.state = self.DATA
1230
+ return False
1231
+ if "A" <= c <= "Z":
1232
+ self.current_doctype_name.append(chr(ord(c) + 32))
1233
+ elif c == "\0":
1234
+ self._emit_error("unexpected-null-character")
1235
+ self.current_doctype_name.append("\ufffd")
1236
+ else:
1237
+ self.current_doctype_name.append(c)
1238
+ self.state = self.DOCTYPE_NAME
1239
+ return False
1240
+
1241
+ def _state_doctype_name(self):
1242
+ while True:
1243
+ c = self._get_char()
1244
+ if c is None:
1245
+ self._emit_error("eof-in-doctype-name")
1246
+ self.current_doctype_force_quirks = True
1247
+ self._emit_doctype()
1248
+ self._emit_token(EOFToken())
1249
+ return True
1250
+ if c in ("\t", "\n", "\f", " "):
1251
+ self.state = self.AFTER_DOCTYPE_NAME
1252
+ return False
1253
+ if c == ">":
1254
+ self._emit_doctype()
1255
+ self.state = self.DATA
1256
+ return False
1257
+ if "A" <= c <= "Z":
1258
+ self.current_doctype_name.append(chr(ord(c) + 32))
1259
+ continue
1260
+ if c == "\0":
1261
+ self._emit_error("unexpected-null-character")
1262
+ self.current_doctype_name.append("\ufffd")
1263
+ continue
1264
+ self.current_doctype_name.append(c)
1265
+
1266
+ def _state_after_doctype_name(self):
1267
+ if self._consume_case_insensitive("PUBLIC"):
1268
+ self.state = self.AFTER_DOCTYPE_PUBLIC_KEYWORD
1269
+ return False
1270
+ if self._consume_case_insensitive("SYSTEM"):
1271
+ self.state = self.AFTER_DOCTYPE_SYSTEM_KEYWORD
1272
+ return False
1273
+ while True:
1274
+ c = self._get_char()
1275
+ if c is None:
1276
+ self._emit_error("eof-in-doctype")
1277
+ self.current_doctype_force_quirks = True
1278
+ self._emit_doctype()
1279
+ self._emit_token(EOFToken())
1280
+ return True
1281
+ if c in ("\t", "\n", "\f", " "):
1282
+ continue
1283
+ if c == ">":
1284
+ self._emit_doctype()
1285
+ self.state = self.DATA
1286
+ return False
1287
+ self._emit_error("missing-whitespace-after-doctype-name")
1288
+ self.current_doctype_force_quirks = True
1289
+ self._reconsume_current()
1290
+ self.state = self.BOGUS_DOCTYPE
1291
+ return False
1292
+
1293
+ def _state_after_doctype_public_keyword(self):
1294
+ while True:
1295
+ c = self._get_char()
1296
+ if c is None:
1297
+ self._emit_error("missing-quote-before-doctype-public-identifier")
1298
+ self.current_doctype_force_quirks = True
1299
+ self._emit_doctype()
1300
+ self._emit_token(EOFToken())
1301
+ return True
1302
+ if c in ("\t", "\n", "\f", " "):
1303
+ self.state = self.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER
1304
+ return False
1305
+ if c == '"':
1306
+ self._emit_error("missing-whitespace-before-doctype-public-identifier")
1307
+ self.current_doctype_public = []
1308
+ self.state = self.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED
1309
+ return False
1310
+ if c == "'":
1311
+ self._emit_error("missing-whitespace-before-doctype-public-identifier")
1312
+ self.current_doctype_public = []
1313
+ self.state = self.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED
1314
+ return False
1315
+ if c == ">":
1316
+ self._emit_error("missing-doctype-public-identifier")
1317
+ self.current_doctype_force_quirks = True
1318
+ self._emit_doctype()
1319
+ self.state = self.DATA
1320
+ return False
1321
+ self._emit_error("unexpected-character-after-doctype-public-keyword")
1322
+ self.current_doctype_force_quirks = True
1323
+ self._reconsume_current()
1324
+ self.state = self.BOGUS_DOCTYPE
1325
+ return False
1326
+
1327
+ def _state_after_doctype_system_keyword(self):
1328
+ while True:
1329
+ c = self._get_char()
1330
+ if c is None:
1331
+ self._emit_error("missing-quote-before-doctype-system-identifier")
1332
+ self.current_doctype_force_quirks = True
1333
+ self._emit_doctype()
1334
+ self._emit_token(EOFToken())
1335
+ return True
1336
+ if c in ("\t", "\n", "\f", " "):
1337
+ self.state = self.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER
1338
+ return False
1339
+ if c == '"':
1340
+ self._emit_error("missing-whitespace-after-doctype-public-identifier")
1341
+ self.current_doctype_system = []
1342
+ self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
1343
+ return False
1344
+ if c == "'":
1345
+ self._emit_error("missing-whitespace-after-doctype-public-identifier")
1346
+ self.current_doctype_system = []
1347
+ self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
1348
+ return False
1349
+ if c == ">":
1350
+ self._emit_error("missing-doctype-system-identifier")
1351
+ self.current_doctype_force_quirks = True
1352
+ self._emit_doctype()
1353
+ self.state = self.DATA
1354
+ return False
1355
+ self._emit_error("unexpected-character-after-doctype-system-keyword")
1356
+ self.current_doctype_force_quirks = True
1357
+ self._reconsume_current()
1358
+ self.state = self.BOGUS_DOCTYPE
1359
+ return False
1360
+
1361
+ def _state_before_doctype_public_identifier(self):
1362
+ while True:
1363
+ c = self._get_char()
1364
+ if c is None:
1365
+ self._emit_error("missing-doctype-public-identifier")
1366
+ self.current_doctype_force_quirks = True
1367
+ self._emit_doctype()
1368
+ self._emit_token(EOFToken())
1369
+ return True
1370
+ if c in ("\t", "\n", "\f", " "):
1371
+ continue
1372
+ if c == '"':
1373
+ self.current_doctype_public = []
1374
+ self.state = self.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED
1375
+ return False
1376
+ if c == "'":
1377
+ self.current_doctype_public = []
1378
+ self.state = self.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED
1379
+ return False
1380
+ if c == ">":
1381
+ self._emit_error("missing-doctype-public-identifier")
1382
+ self.current_doctype_force_quirks = True
1383
+ self._emit_doctype()
1384
+ self.state = self.DATA
1385
+ return False
1386
+ self._emit_error("missing-quote-before-doctype-public-identifier")
1387
+ self.current_doctype_force_quirks = True
1388
+ self._reconsume_current()
1389
+ self.state = self.BOGUS_DOCTYPE
1390
+ return False
1391
+
1392
+ def _state_doctype_public_identifier_double_quoted(self):
1393
+ while True:
1394
+ c = self._get_char()
1395
+ if c is None:
1396
+ self._emit_error("eof-in-doctype-public-identifier")
1397
+ self.current_doctype_force_quirks = True
1398
+ self._emit_doctype()
1399
+ self._emit_token(EOFToken())
1400
+ return True
1401
+ if c == '"':
1402
+ self.state = self.AFTER_DOCTYPE_PUBLIC_IDENTIFIER
1403
+ return False
1404
+ if c == "\0":
1405
+ self._emit_error("unexpected-null-character")
1406
+ self.current_doctype_public.append("\ufffd")
1407
+ continue
1408
+ if c == ">":
1409
+ self._emit_error("abrupt-doctype-public-identifier")
1410
+ self.current_doctype_force_quirks = True
1411
+ self._emit_doctype()
1412
+ self.state = self.DATA
1413
+ return False
1414
+ self.current_doctype_public.append(c)
1415
+
1416
+ def _state_doctype_public_identifier_single_quoted(self):
1417
+ while True:
1418
+ c = self._get_char()
1419
+ if c is None:
1420
+ self._emit_error("eof-in-doctype-public-identifier")
1421
+ self.current_doctype_force_quirks = True
1422
+ self._emit_doctype()
1423
+ self._emit_token(EOFToken())
1424
+ return True
1425
+ if c == "'":
1426
+ self.state = self.AFTER_DOCTYPE_PUBLIC_IDENTIFIER
1427
+ return False
1428
+ if c == "\0":
1429
+ self._emit_error("unexpected-null-character")
1430
+ self.current_doctype_public.append("\ufffd")
1431
+ continue
1432
+ if c == ">":
1433
+ self._emit_error("abrupt-doctype-public-identifier")
1434
+ self.current_doctype_force_quirks = True
1435
+ self._emit_doctype()
1436
+ self.state = self.DATA
1437
+ return False
1438
+ self.current_doctype_public.append(c)
1439
+
1440
+ def _state_after_doctype_public_identifier(self):
1441
+ while True:
1442
+ c = self._get_char()
1443
+ if c is None:
1444
+ self._emit_error("missing-whitespace-between-doctype-public-and-system-identifiers")
1445
+ self.current_doctype_force_quirks = True
1446
+ self._emit_doctype()
1447
+ self._emit_token(EOFToken())
1448
+ return True
1449
+ if c in ("\t", "\n", "\f", " "):
1450
+ self.state = self.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
1451
+ return False
1452
+ if c == ">":
1453
+ self._emit_doctype()
1454
+ self.state = self.DATA
1455
+ return False
1456
+ if c == '"':
1457
+ self._emit_error("missing-whitespace-between-doctype-public-and-system-identifiers")
1458
+ self.current_doctype_system = []
1459
+ self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
1460
+ return False
1461
+ if c == "'":
1462
+ self._emit_error("missing-whitespace-between-doctype-public-and-system-identifiers")
1463
+ self.current_doctype_system = []
1464
+ self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
1465
+ return False
1466
+ self._emit_error("unexpected-character-after-doctype-public-identifier")
1467
+ self.current_doctype_force_quirks = True
1468
+ self._reconsume_current()
1469
+ self.state = self.BOGUS_DOCTYPE
1470
+ return False
1471
+
1472
+ def _state_between_doctype_public_and_system_identifiers(self):
1473
+ while True:
1474
+ c = self._get_char()
1475
+ if c is None:
1476
+ self._emit_error("missing-quote-before-doctype-system-identifier")
1477
+ self.current_doctype_force_quirks = True
1478
+ self._emit_doctype()
1479
+ self._emit_token(EOFToken())
1480
+ return True
1481
+ if c in ("\t", "\n", "\f", " "):
1482
+ continue
1483
+ if c == ">":
1484
+ self._emit_doctype()
1485
+ self.state = self.DATA
1486
+ return False
1487
+ if c == '"':
1488
+ self.current_doctype_system = []
1489
+ self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
1490
+ return False
1491
+ if c == "'":
1492
+ self.current_doctype_system = []
1493
+ self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
1494
+ return False
1495
+ self._emit_error("missing-quote-before-doctype-system-identifier")
1496
+ self.current_doctype_force_quirks = True
1497
+ self._reconsume_current()
1498
+ self.state = self.BOGUS_DOCTYPE
1499
+ return False
1500
+
1501
+ def _state_before_doctype_system_identifier(self):
1502
+ while True:
1503
+ c = self._get_char()
1504
+ if c is None:
1505
+ self._emit_error("missing-doctype-system-identifier")
1506
+ self.current_doctype_force_quirks = True
1507
+ self._emit_doctype()
1508
+ self._emit_token(EOFToken())
1509
+ return True
1510
+ if c in ("\t", "\n", "\f", " "):
1511
+ continue
1512
+ if c == '"':
1513
+ self.current_doctype_system = []
1514
+ self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
1515
+ return False
1516
+ if c == "'":
1517
+ self.current_doctype_system = []
1518
+ self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
1519
+ return False
1520
+ if c == ">":
1521
+ self._emit_error("missing-doctype-system-identifier")
1522
+ self.current_doctype_force_quirks = True
1523
+ self._emit_doctype()
1524
+ self.state = self.DATA
1525
+ return False
1526
+ self._emit_error("missing-quote-before-doctype-system-identifier")
1527
+ self.current_doctype_force_quirks = True
1528
+ self._reconsume_current()
1529
+ self.state = self.BOGUS_DOCTYPE
1530
+ return False
1531
+
1532
+ def _state_doctype_system_identifier_double_quoted(self):
1533
+ while True:
1534
+ c = self._get_char()
1535
+ if c is None:
1536
+ self._emit_error("eof-in-doctype-system-identifier")
1537
+ self.current_doctype_force_quirks = True
1538
+ self._emit_doctype()
1539
+ self._emit_token(EOFToken())
1540
+ return True
1541
+ if c == '"':
1542
+ self.state = self.AFTER_DOCTYPE_SYSTEM_IDENTIFIER
1543
+ return False
1544
+ if c == "\0":
1545
+ self._emit_error("unexpected-null-character")
1546
+ self.current_doctype_system.append("\ufffd")
1547
+ continue
1548
+ if c == ">":
1549
+ self._emit_error("abrupt-doctype-system-identifier")
1550
+ self.current_doctype_force_quirks = True
1551
+ self._emit_doctype()
1552
+ self.state = self.DATA
1553
+ return False
1554
+ self.current_doctype_system.append(c)
1555
+
1556
+ def _state_doctype_system_identifier_single_quoted(self):
1557
+ while True:
1558
+ c = self._get_char()
1559
+ if c is None:
1560
+ self._emit_error("eof-in-doctype-system-identifier")
1561
+ self.current_doctype_force_quirks = True
1562
+ self._emit_doctype()
1563
+ self._emit_token(EOFToken())
1564
+ return True
1565
+ if c == "'":
1566
+ self.state = self.AFTER_DOCTYPE_SYSTEM_IDENTIFIER
1567
+ return False
1568
+ if c == "\0":
1569
+ self._emit_error("unexpected-null-character")
1570
+ self.current_doctype_system.append("\ufffd")
1571
+ continue
1572
+ if c == ">":
1573
+ self._emit_error("abrupt-doctype-system-identifier")
1574
+ self.current_doctype_force_quirks = True
1575
+ self._emit_doctype()
1576
+ self.state = self.DATA
1577
+ return False
1578
+ self.current_doctype_system.append(c)
1579
+
1580
+ def _state_after_doctype_system_identifier(self):
1581
+ while True:
1582
+ c = self._get_char()
1583
+ if c is None:
1584
+ self._emit_error("eof-in-doctype")
1585
+ self.current_doctype_force_quirks = True
1586
+ self._emit_doctype()
1587
+ self._emit_token(EOFToken())
1588
+ return True
1589
+ if c in ("\t", "\n", "\f", " "):
1590
+ continue
1591
+ if c == ">":
1592
+ self._emit_doctype()
1593
+ self.state = self.DATA
1594
+ return False
1595
+ self._emit_error("unexpected-character-after-doctype-system-identifier")
1596
+ self._reconsume_current()
1597
+ self.state = self.BOGUS_DOCTYPE
1598
+ return False
1599
+
1600
+ def _state_bogus_doctype(self):
1601
+ while True:
1602
+ c = self._get_char()
1603
+ if c is None:
1604
+ self._emit_doctype()
1605
+ self._emit_token(EOFToken())
1606
+ return True
1607
+ if c == ">":
1608
+ self._emit_doctype()
1609
+ self.state = self.DATA
1610
+ return False
1611
+
1612
+ # ---------------------
1613
+ # Low-level helpers
1614
+ # ---------------------
1615
+
1616
+ def _get_char(self):
1617
+ if self.reconsume:
1618
+ self.reconsume = False
1619
+ return self.current_char
1620
+
1621
+ buffer = self.buffer
1622
+ pos = self.pos
1623
+ length = self.length
1624
+ while True:
1625
+ if pos >= length:
1626
+ self.pos = pos
1627
+ self.current_char = None
1628
+ return None
1629
+
1630
+ c = buffer[pos]
1631
+ pos += 1
1632
+
1633
+ if c == "\r":
1634
+ self.ignore_lf = True
1635
+ self.current_char = "\n"
1636
+ self.pos = pos
1637
+ return "\n"
1638
+
1639
+ if c == "\n":
1640
+ if self.ignore_lf:
1641
+ self.ignore_lf = False
1642
+ continue
1643
+ # Line tracking now computed on-demand via _get_line_at_pos()
1644
+
1645
+ else:
1646
+ self.ignore_lf = False
1647
+
1648
+ self.current_char = c
1649
+ self.pos = pos
1650
+ return c
1651
+
1652
+ def _reconsume_current(self):
1653
+ self.reconsume = True
1654
+
1655
+ def _append_text(self, text):
1656
+ """Append text to buffer, recording start position if this is the first chunk."""
1657
+ if not self.text_buffer:
1658
+ # Record where text started (current position before this chunk)
1659
+ self.text_start_pos = self.pos
1660
+ self.text_buffer.append(text)
1661
+
1662
+ def _flush_text(self):
1663
+ if not self.text_buffer:
1664
+ return
1665
+
1666
+ # Optimization: Avoid join for single chunk
1667
+ # text_buffer is never populated with empty strings
1668
+ if len(self.text_buffer) == 1:
1669
+ data = self.text_buffer[0]
1670
+ else:
1671
+ data = "".join(self.text_buffer)
1672
+
1673
+ # Calculate raw text length before any processing for position tracking
1674
+ raw_len = len(data)
1675
+
1676
+ self.text_buffer.clear()
1677
+ if self.state == self.DATA and "\0" in data:
1678
+ count = data.count("\0")
1679
+ for _ in range(count):
1680
+ self._emit_error("unexpected-null-character")
1681
+
1682
+ # Per HTML5 spec:
1683
+ # - RCDATA state (title, textarea): decode character references
1684
+ # - RAWTEXT state (style, script, etc): do NOT decode
1685
+ # - PLAINTEXT state: do NOT decode
1686
+ # - CDATA sections: do NOT decode
1687
+ if self.state >= self.PLAINTEXT or self.CDATA_SECTION <= self.state <= self.CDATA_SECTION_END:
1688
+ pass
1689
+ elif self.state >= self.RAWTEXT:
1690
+ pass
1691
+ else:
1692
+ if "&" in data:
1693
+ data = decode_entities_in_text(data)
1694
+ # Apply XML coercion if enabled
1695
+ if self.opts.xml_coercion:
1696
+ data = _coerce_text_for_xml(data)
1697
+
1698
+ # Record position at END of raw text (1-indexed column = raw_len)
1699
+ self._record_text_end_position(raw_len)
1700
+ self.sink.process_characters(data)
1701
+ # Note: process_characters never returns Plaintext or RawData
1702
+ # State switches happen via _emit_current_tag instead
1703
+
1704
+ def _append_attr_value_char(self, c):
1705
+ self.current_attr_value.append(c)
1706
+
1707
+ def _finish_attribute(self):
1708
+ attr_name_buffer = self.current_attr_name
1709
+ if not attr_name_buffer:
1710
+ return
1711
+ if len(attr_name_buffer) == 1:
1712
+ name = attr_name_buffer[0]
1713
+ else:
1714
+ name = "".join(attr_name_buffer)
1715
+ attrs = self.current_tag_attrs
1716
+ is_duplicate = name in attrs
1717
+ attr_name_buffer.clear()
1718
+ attr_value_buffer = self.current_attr_value
1719
+ if is_duplicate:
1720
+ self._emit_error("duplicate-attribute")
1721
+ attr_value_buffer.clear()
1722
+ self.current_attr_value_has_amp = False
1723
+ return
1724
+ if not attr_value_buffer:
1725
+ value = ""
1726
+ elif len(attr_value_buffer) == 1:
1727
+ value = attr_value_buffer[0]
1728
+ else:
1729
+ value = "".join(attr_value_buffer)
1730
+ if self.current_attr_value_has_amp:
1731
+ value = decode_entities_in_text(value, in_attribute=True)
1732
+ attrs[name] = value
1733
+ attr_value_buffer.clear()
1734
+ self.current_attr_value_has_amp = False
1735
+
1736
def _emit_current_tag(self):
    """Emit the tag under construction and reset the tag-building buffers.

    For start tags in HTML content the tokenizer is switched to RCDATA,
    RAWTEXT or PLAINTEXT when the tag name asks for it.

    Returns True when the tokenizer state was switched to one of these text
    modes (callers must then not reset the state to DATA), otherwise False.
    """
    # The name buffer always holds at least one part when a tag is emitted.
    parts = self.current_tag_name
    name = parts[0] if len(parts) == 1 else "".join(parts)

    # Hand the attribute dict over to the token and start a fresh one.
    attrs = self.current_tag_attrs
    self.current_tag_attrs = {}

    token = self._tag_token
    token.kind = self.current_tag_kind
    token.name = name
    token.attrs = attrs
    token.self_closing = self.current_tag_self_closing

    switched = False
    if self.current_tag_kind == Tag.START:
        self.last_start_tag_name = name
        if name in _RAWTEXT_SWITCH_TAGS or name == "plaintext":
            # The switch only applies when the current node is not foreign.
            open_elements = self.sink.open_elements
            node = open_elements[-1] if open_elements else None
            namespace = node.namespace if node else None
            if namespace is None or namespace == "html":
                switched = True
                if name in _RCDATA_ELEMENTS:
                    self.state = self.RCDATA
                    self.rawtext_tag_name = name
                elif name in _RAWTEXT_SWITCH_TAGS:
                    self.state = self.RAWTEXT
                    self.rawtext_tag_name = name
                else:
                    # Only "plaintext" can reach this branch.
                    self.state = self.PLAINTEXT

    self._record_token_position()
    if self.sink.process_token(token) == 1:  # TokenSinkResult.Plaintext
        self.state = self.PLAINTEXT
        switched = True

    parts.clear()
    self.current_attr_name.clear()
    self.current_attr_value.clear()
    self.current_tag_self_closing = False
    self.current_tag_kind = Tag.START
    return switched
1789
+
1790
+ def _emit_comment(self):
1791
+ data = "".join(self.current_comment)
1792
+ self.current_comment.clear()
1793
+ # Apply XML coercion if enabled
1794
+ if self.opts.xml_coercion:
1795
+ data = _coerce_comment_for_xml(data)
1796
+ self._comment_token.data = data
1797
+ self._emit_token(self._comment_token)
1798
+
1799
def _emit_doctype(self):
    """Build a Doctype from the collected parts, reset them and emit the token."""
    name_parts = self.current_doctype_name
    public_parts = self.current_doctype_public
    system_parts = self.current_doctype_system

    # An empty name becomes None. For the identifiers, None means "absent"
    # while an empty list yields the empty string.
    doctype = Doctype(
        name="".join(name_parts) if name_parts else None,
        public_id=None if public_parts is None else "".join(public_parts),
        system_id=None if system_parts is None else "".join(system_parts),
        force_quirks=self.current_doctype_force_quirks,
    )

    name_parts.clear()
    self.current_doctype_public = None
    self.current_doctype_system = None
    self.current_doctype_force_quirks = False
    self._emit_token(DoctypeToken(doctype))
1815
+
1816
+ def _emit_token(self, token):
1817
+ self._record_token_position()
1818
+ self.sink.process_token(token)
1819
+ # Note: process_token never returns Plaintext or RawData for state switches
1820
+ # State switches happen via _emit_current_tag checking sink response
1821
+
1822
+ def _record_token_position(self):
1823
+ """Record current position as 0-indexed column for the last emitted token.
1824
+
1825
+ Per the spec, the position should be at the end of the token (after the last char).
1826
+ """
1827
+ if not self.collect_errors:
1828
+ return
1829
+ # pos points after the last consumed character, which is exactly what we want
1830
+ pos = self.pos
1831
+ last_newline = self.buffer.rfind("\n", 0, pos)
1832
+ if last_newline == -1:
1833
+ column = pos # 0-indexed from start
1834
+ else:
1835
+ column = pos - last_newline - 1 # 0-indexed from after newline
1836
+ self.last_token_line = self._get_line_at_pos(pos)
1837
+ self.last_token_column = column
1838
+
1839
+ def _record_text_end_position(self, raw_len):
1840
+ """Record position at end of text token (after last character).
1841
+
1842
+ Uses text_start_pos + raw_len to compute where text ends, matching html5lib's
1843
+ behavior of reporting the column of the last character (1-indexed).
1844
+ """
1845
+ if not self.collect_errors:
1846
+ return
1847
+ # Position of last character of text (0-indexed)
1848
+ end_pos = self.text_start_pos + raw_len
1849
+ last_newline = self.buffer.rfind("\n", 0, end_pos)
1850
+ if last_newline == -1:
1851
+ column = end_pos # 1-indexed column = end_pos (position after last char)
1852
+ else:
1853
+ column = end_pos - last_newline - 1
1854
+ self.last_token_line = self._get_line_at_pos(end_pos)
1855
+ self.last_token_column = column
1856
+
1857
+ def _emit_error(self, code):
1858
+ if not self.collect_errors:
1859
+ return
1860
+ # Compute column on-demand: scan backwards to find last newline
1861
+ pos = max(0, self.pos - 1) # Current position being processed
1862
+ last_newline = self.buffer.rfind("\n", 0, pos + 1)
1863
+ if last_newline == -1:
1864
+ column = pos + 1 # 1-indexed from start of input
1865
+ else:
1866
+ column = pos - last_newline # 1-indexed from after newline
1867
+
1868
+ message = generate_error_message(code)
1869
+ line = self._get_line_at_pos(self.pos)
1870
+ self.errors.append(ParseError(code, line=line, column=column, message=message, source_html=self.buffer))
1871
+
1872
+ def _consume_if(self, literal):
1873
+ end = self.pos + len(literal)
1874
+ if end > self.length:
1875
+ return False
1876
+ segment = self.buffer[self.pos : end]
1877
+ if segment != literal:
1878
+ return False
1879
+ self.pos = end
1880
+ return True
1881
+
1882
+ def _consume_case_insensitive(self, literal):
1883
+ end = self.pos + len(literal)
1884
+ if end > self.length:
1885
+ return False
1886
+ segment = self.buffer[self.pos : end]
1887
+ if segment.lower() != literal.lower():
1888
+ return False
1889
+ self.pos = end
1890
+ return True
1891
+
1892
def _consume_comment_run(self):
    """Bulk-append comment text up to the next "-" or NUL character.

    Carriage returns inside the run are normalised to "\n". Returns True
    when at least one character was appended to the comment, otherwise False
    (end of input, or the next character is "-" / NUL).
    """
    # Note: comments are never reconsumed, so the reconsume flag is ignored.
    pos = self.pos
    length = self.length
    if pos >= length:
        return False

    buffer = self.buffer
    # A "\n" directly after an already-reported "\r" belongs to that CRLF
    # pair and must not produce a second newline.
    if self.ignore_lf and buffer[pos] == "\n":
        self.ignore_lf = False
        pos += 1
        self.pos = pos
        if pos >= length:
            return False

    match = _COMMENT_RUN_PATTERN.match(buffer, pos)
    if match is None:
        return False

    chunk = match.group(0)
    # Decide on the raw text whether the run ended with "\r"; after the
    # normalisation below no "\r" is left to test for.
    ends_with_cr = chunk.endswith("\r")
    if "\r" in chunk:
        chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
    self.ignore_lf = ends_with_cr
    self.current_comment.append(chunk)
    self.pos = match.end()
    return True
1918
+
1919
+ def _state_cdata_section(self):
1920
+ # CDATA section state - consume characters until we see ']'
1921
+ while True:
1922
+ c = self._get_char()
1923
+ if c is None:
1924
+ self._emit_error("eof-in-cdata")
1925
+ self._flush_text()
1926
+ self._emit_token(EOFToken())
1927
+ return True
1928
+ if c == "]":
1929
+ self.state = self.CDATA_SECTION_BRACKET
1930
+ return False
1931
+ self._append_text(c)
1932
+
1933
+ def _state_cdata_section_bracket(self):
1934
+ # Seen one ']', check for second ']'
1935
+ c = self._get_char()
1936
+ if c == "]":
1937
+ self.state = self.CDATA_SECTION_END
1938
+ return False
1939
+ # False alarm, emit the ']' we saw and continue
1940
+ self._append_text("]")
1941
+ if c is None:
1942
+ self._emit_error("eof-in-cdata")
1943
+ self._flush_text()
1944
+ self._emit_token(EOFToken())
1945
+ return True
1946
+ self._reconsume_current()
1947
+ self.state = self.CDATA_SECTION
1948
+ return False
1949
+
1950
+ def _state_cdata_section_end(self):
1951
+ # Seen ']]', check for '>'
1952
+ c = self._get_char()
1953
+ if c == ">":
1954
+ # End of CDATA section
1955
+ self._flush_text()
1956
+ self.state = self.DATA
1957
+ return False
1958
+ # Not the end - we saw ']]' but not '>'. Emit one ']' and check if the next char is another ']'
1959
+ self._append_text("]")
1960
+ if c is None:
1961
+ # EOF after ']]' - emit the second ']' too
1962
+ self._append_text("]")
1963
+ self._emit_error("eof-in-cdata")
1964
+ self._flush_text()
1965
+ self._emit_token(EOFToken())
1966
+ return True
1967
+ if c == "]":
1968
+ # Still might be ']]>' sequence, stay in CDATA_SECTION_END
1969
+ return False
1970
+ # Not a bracket, so emit the second ']', reconsume current char and go back to CDATA_SECTION
1971
+ self._append_text("]")
1972
+ self._reconsume_current()
1973
+ self.state = self.CDATA_SECTION
1974
+ return False
1975
+
1976
+ def _state_rcdata(self):
1977
+ buffer = self.buffer
1978
+ length = self.length
1979
+ pos = self.pos
1980
+ while True:
1981
+ if self.reconsume:
1982
+ self.reconsume = False
1983
+ if self.current_char is None:
1984
+ self._flush_text()
1985
+ self._emit_token(EOFToken())
1986
+ return True
1987
+ self.pos -= 1
1988
+ pos = self.pos
1989
+
1990
+ # Optimized loop using find
1991
+ lt_index = buffer.find("<", pos)
1992
+ amp_index = buffer.find("&", pos)
1993
+ null_index = buffer.find("\0", pos)
1994
+
1995
+ # Find the nearest special character
1996
+ next_special = length
1997
+ if lt_index != -1:
1998
+ next_special = lt_index
1999
+ if amp_index != -1 and amp_index < next_special:
2000
+ next_special = amp_index
2001
+ if null_index != -1 and null_index < next_special:
2002
+ next_special = null_index
2003
+
2004
+ # Consume everything up to the special character
2005
+ if next_special > pos:
2006
+ chunk = buffer[pos:next_special]
2007
+ self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
2008
+ pos = next_special
2009
+ self.pos = pos
2010
+
2011
+ # Handle EOF
2012
+ if pos >= length:
2013
+ self._flush_text()
2014
+ self._emit_token(EOFToken())
2015
+ return True
2016
+
2017
+ # Handle special characters - we're at one of them after find()
2018
+ if null_index == pos:
2019
+ self.ignore_lf = False
2020
+ self._emit_error("unexpected-null-character")
2021
+ self._append_text("\ufffd")
2022
+ pos += 1
2023
+ self.pos = pos
2024
+ elif amp_index == pos:
2025
+ # Ampersand in RCDATA - will be decoded by _flush_text
2026
+ self._append_text("&")
2027
+ pos += 1
2028
+ self.pos = pos
2029
+ else:
2030
+ # lt_index == pos - the only remaining possibility
2031
+ # Less-than sign - might be start of end tag
2032
+ pos += 1
2033
+ self.pos = pos
2034
+ self.state = self.RCDATA_LESS_THAN_SIGN
2035
+ return False
2036
+
2037
+ def _state_rcdata_less_than_sign(self):
2038
+ c = self._get_char()
2039
+ if c == "/":
2040
+ self.current_tag_name.clear()
2041
+ self.state = self.RCDATA_END_TAG_OPEN
2042
+ return False
2043
+ self._append_text("<")
2044
+ self._reconsume_current()
2045
+ self.state = self.RCDATA
2046
+ return False
2047
+
2048
+ def _state_rcdata_end_tag_open(self):
2049
+ c = self._get_char()
2050
+ if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2051
+ self.current_tag_name.append(c.lower())
2052
+ self.original_tag_name.append(c)
2053
+ self.state = self.RCDATA_END_TAG_NAME
2054
+ return False
2055
+ self.text_buffer.extend(("<", "/"))
2056
+ self._reconsume_current()
2057
+ self.state = self.RCDATA
2058
+ return False
2059
+
2060
+ def _state_rcdata_end_tag_name(self):
2061
+ # Check if this matches the opening tag name
2062
+ while True:
2063
+ c = self._get_char()
2064
+ if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2065
+ self.current_tag_name.append(c.lower())
2066
+ self.original_tag_name.append(c)
2067
+ continue
2068
+ # End of tag name - check if it matches
2069
+ tag_name = "".join(self.current_tag_name)
2070
+ if tag_name == self.rawtext_tag_name:
2071
+ if c == ">":
2072
+ attrs = []
2073
+ tag = Tag(Tag.END, tag_name, attrs, False)
2074
+ self._flush_text()
2075
+ self._emit_token(tag)
2076
+ self.state = self.DATA
2077
+ self.rawtext_tag_name = None
2078
+ self.original_tag_name.clear()
2079
+ return False
2080
+ if c in (" ", "\t", "\n", "\r", "\f"):
2081
+ # Whitespace after tag name - switch to BEFORE_ATTRIBUTE_NAME
2082
+ self.current_tag_kind = Tag.END
2083
+ self.current_tag_attrs = {}
2084
+ self.state = self.BEFORE_ATTRIBUTE_NAME
2085
+ return False
2086
+ if c == "/":
2087
+ self._flush_text()
2088
+ self.current_tag_kind = Tag.END
2089
+ self.current_tag_attrs = {}
2090
+ self.state = self.SELF_CLOSING_START_TAG
2091
+ return False
2092
+ # If we hit EOF or tag doesn't match, emit as text
2093
+ if c is None:
2094
+ # EOF - emit incomplete tag as text (preserve original case) then EOF
2095
+ self.text_buffer.extend(("<", "/"))
2096
+ for ch in self.original_tag_name:
2097
+ self._append_text(ch)
2098
+ self.current_tag_name.clear()
2099
+ self.original_tag_name.clear()
2100
+ self._flush_text()
2101
+ self._emit_token(EOFToken())
2102
+ return True
2103
+ # Not a matching end tag - emit as text (preserve original case)
2104
+ self.text_buffer.extend(("<", "/"))
2105
+ for ch in self.original_tag_name:
2106
+ self._append_text(ch)
2107
+ self.current_tag_name.clear()
2108
+ self.original_tag_name.clear()
2109
+ self._reconsume_current()
2110
+ self.state = self.RCDATA
2111
+ return False
2112
+
2113
+ def _state_rawtext(self):
2114
+ buffer = self.buffer
2115
+ length = self.length
2116
+ pos = self.pos
2117
+ while True:
2118
+ if self.reconsume:
2119
+ self.reconsume = False
2120
+ if self.current_char is None:
2121
+ self._flush_text()
2122
+ self._emit_token(EOFToken())
2123
+ return True
2124
+ self.pos -= 1
2125
+ pos = self.pos
2126
+
2127
+ # Optimized loop using find
2128
+ lt_index = buffer.find("<", pos)
2129
+ null_index = buffer.find("\0", pos)
2130
+ next_special = lt_index if lt_index != -1 else length
2131
+ if null_index != -1 and null_index < next_special:
2132
+ if null_index > pos:
2133
+ chunk = buffer[pos:null_index]
2134
+ self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
2135
+ else:
2136
+ self.ignore_lf = False
2137
+ self._emit_error("unexpected-null-character")
2138
+ self._append_text("\ufffd")
2139
+ pos = null_index + 1
2140
+ self.pos = pos
2141
+ continue
2142
+ if lt_index == -1:
2143
+ if pos < length:
2144
+ chunk = buffer[pos:length]
2145
+ self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
2146
+ self.pos = length
2147
+ self._flush_text()
2148
+ self._emit_token(EOFToken())
2149
+ return True
2150
+ if lt_index > pos:
2151
+ chunk = buffer[pos:lt_index]
2152
+ self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
2153
+ pos = lt_index + 1
2154
+ self.pos = pos
2155
+ # Handle script escaped transition before treating '<' as markup boundary
2156
+ if self.rawtext_tag_name == "script":
2157
+ next1 = self._peek_char(0)
2158
+ next2 = self._peek_char(1)
2159
+ next3 = self._peek_char(2)
2160
+ if next1 == "!" and next2 == "-" and next3 == "-":
2161
+ self.text_buffer.extend(["<", "!", "-", "-"])
2162
+ self._get_char()
2163
+ self._get_char()
2164
+ self._get_char()
2165
+ self.state = self.SCRIPT_DATA_ESCAPED
2166
+ return False
2167
+ self.state = self.RAWTEXT_LESS_THAN_SIGN
2168
+ return False
2169
+
2170
+ def _state_rawtext_less_than_sign(self):
2171
+ c = self._get_char()
2172
+ if c == "/":
2173
+ self.current_tag_name.clear()
2174
+ self.state = self.RAWTEXT_END_TAG_OPEN
2175
+ return False
2176
+ self._append_text("<")
2177
+ self._reconsume_current()
2178
+ self.state = self.RAWTEXT
2179
+ return False
2180
+
2181
+ def _state_rawtext_end_tag_open(self):
2182
+ c = self._get_char()
2183
+ if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2184
+ self.current_tag_name.append(c.lower())
2185
+ self.original_tag_name.append(c)
2186
+ self.state = self.RAWTEXT_END_TAG_NAME
2187
+ return False
2188
+ self.text_buffer.extend(("<", "/"))
2189
+ self._reconsume_current()
2190
+ self.state = self.RAWTEXT
2191
+ return False
2192
+
2193
+ def _state_rawtext_end_tag_name(self):
2194
+ # Check if this matches the opening tag name
2195
+ while True:
2196
+ c = self._get_char()
2197
+ if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2198
+ self.current_tag_name.append(c.lower())
2199
+ self.original_tag_name.append(c)
2200
+ continue
2201
+ # End of tag name - check if it matches
2202
+ tag_name = "".join(self.current_tag_name)
2203
+ if tag_name == self.rawtext_tag_name:
2204
+ if c == ">":
2205
+ attrs = []
2206
+ tag = Tag(Tag.END, tag_name, attrs, False)
2207
+ self._flush_text()
2208
+ self._emit_token(tag)
2209
+ self.state = self.DATA
2210
+ self.rawtext_tag_name = None
2211
+ self.original_tag_name.clear()
2212
+ return False
2213
+ if c in (" ", "\t", "\n", "\r", "\f"):
2214
+ # Whitespace after tag name - switch to BEFORE_ATTRIBUTE_NAME
2215
+ self.current_tag_kind = Tag.END
2216
+ self.current_tag_attrs = {}
2217
+ self.state = self.BEFORE_ATTRIBUTE_NAME
2218
+ return False
2219
+ if c == "/":
2220
+ self._flush_text()
2221
+ self.current_tag_kind = Tag.END
2222
+ self.current_tag_attrs = {}
2223
+ self.state = self.SELF_CLOSING_START_TAG
2224
+ return False
2225
+ # If we hit EOF or tag doesn't match, emit as text
2226
+ if c is None:
2227
+ # EOF - emit incomplete tag as text (preserve original case) then EOF
2228
+ self.text_buffer.extend(("<", "/"))
2229
+ for ch in self.original_tag_name:
2230
+ self._append_text(ch)
2231
+ self.current_tag_name.clear()
2232
+ self.original_tag_name.clear()
2233
+ self._flush_text()
2234
+ self._emit_token(EOFToken())
2235
+ return True
2236
+ # Not a matching end tag - emit as text (preserve original case)
2237
+ self.text_buffer.extend(("<", "/"))
2238
+ for ch in self.original_tag_name:
2239
+ self._append_text(ch)
2240
+ self.current_tag_name.clear()
2241
+ self.original_tag_name.clear()
2242
+ self._reconsume_current()
2243
+ self.state = self.RAWTEXT
2244
+ return False
2245
+
2246
def _state_plaintext(self):
    """PLAINTEXT state: the whole remaining buffer is text; no end tag exists.

    NUL characters are replaced by U+FFFD, with a single
    "unexpected-null-character" error reported if any were present. The
    text is then flushed, an EOFToken is emitted and True is returned.
    """
    start = self.pos
    stop = self.length
    if start < stop:
        tail = self.buffer[start:]
        cleaned = tail.replace("\0", "\ufffd")
        if cleaned != tail:
            # At least one NUL was replaced.
            self._emit_error("unexpected-null-character")
        self._append_text(cleaned)
        self.pos = stop
    self._flush_text()
    self._emit_token(EOFToken())
    return True
2259
+
2260
+ def _state_script_data_escaped(self):
2261
+ c = self._get_char()
2262
+ if c is None:
2263
+ self._flush_text()
2264
+ self._emit_token(EOFToken())
2265
+ return True
2266
+ if c == "-":
2267
+ self._append_text("-")
2268
+ self.state = self.SCRIPT_DATA_ESCAPED_DASH
2269
+ return False
2270
+ if c == "<":
2271
+ self.state = self.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN
2272
+ return False
2273
+ if c == "\0":
2274
+ self._emit_error("unexpected-null-character")
2275
+ self._append_text("\ufffd")
2276
+ return False
2277
+ self._append_text(c)
2278
+ return False
2279
+
2280
+ def _state_script_data_escaped_dash(self):
2281
+ c = self._get_char()
2282
+ if c is None:
2283
+ self._flush_text()
2284
+ self._emit_token(EOFToken())
2285
+ return True
2286
+ if c == "-":
2287
+ self._append_text("-")
2288
+ self.state = self.SCRIPT_DATA_ESCAPED_DASH_DASH
2289
+ return False
2290
+ if c == "<":
2291
+ self.state = self.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN
2292
+ return False
2293
+ if c == "\0":
2294
+ self._emit_error("unexpected-null-character")
2295
+ self._append_text("\ufffd")
2296
+ self.state = self.SCRIPT_DATA_ESCAPED
2297
+ return False
2298
+ self._append_text(c)
2299
+ self.state = self.SCRIPT_DATA_ESCAPED
2300
+ return False
2301
+
2302
+ def _state_script_data_escaped_dash_dash(self):
2303
+ c = self._get_char()
2304
+ if c is None:
2305
+ self._flush_text()
2306
+ self._emit_token(EOFToken())
2307
+ return True
2308
+ if c == "-":
2309
+ self._append_text("-")
2310
+ return False
2311
+ if c == "<":
2312
+ self._append_text("<")
2313
+ self.state = self.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN
2314
+ return False
2315
+ if c == ">":
2316
+ self._append_text(">")
2317
+ self.state = self.RAWTEXT
2318
+ return False
2319
+ if c == "\0":
2320
+ self._emit_error("unexpected-null-character")
2321
+ self._append_text("\ufffd")
2322
+ self.state = self.SCRIPT_DATA_ESCAPED
2323
+ return False
2324
+ self._append_text(c)
2325
+ self.state = self.SCRIPT_DATA_ESCAPED
2326
+ return False
2327
+
2328
+ def _state_script_data_escaped_less_than_sign(self):
2329
+ c = self._get_char()
2330
+ if c == "/":
2331
+ self.temp_buffer.clear()
2332
+ self.state = self.SCRIPT_DATA_ESCAPED_END_TAG_OPEN
2333
+ return False
2334
+ if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2335
+ self.temp_buffer.clear()
2336
+ self._append_text("<")
2337
+ self._reconsume_current()
2338
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPE_START
2339
+ return False
2340
+ self._append_text("<")
2341
+ self._reconsume_current()
2342
+ self.state = self.SCRIPT_DATA_ESCAPED
2343
+
2344
+ return False
2345
+
2346
+ def _state_script_data_escaped_end_tag_open(self):
2347
+ c = self._get_char()
2348
+ if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2349
+ self.current_tag_name.clear()
2350
+ self.original_tag_name.clear()
2351
+ self._reconsume_current()
2352
+ self.state = self.SCRIPT_DATA_ESCAPED_END_TAG_NAME
2353
+ return False
2354
+ self.text_buffer.extend(("<", "/"))
2355
+ self._reconsume_current()
2356
+ self.state = self.SCRIPT_DATA_ESCAPED
2357
+ return False
2358
+
2359
+ def _state_script_data_escaped_end_tag_name(self):
2360
+ c = self._get_char()
2361
+ if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2362
+ self.current_tag_name.append(c.lower())
2363
+ self.original_tag_name.append(c)
2364
+ self.temp_buffer.append(c)
2365
+ return False
2366
+ # Check if this is an appropriate end tag
2367
+ tag_name = "".join(self.current_tag_name)
2368
+ is_appropriate = tag_name == self.rawtext_tag_name
2369
+
2370
+ if is_appropriate:
2371
+ if c in (" ", "\t", "\n", "\r", "\f"):
2372
+ self.current_tag_kind = Tag.END
2373
+ self.current_tag_attrs = {}
2374
+ self.state = self.BEFORE_ATTRIBUTE_NAME
2375
+ return False
2376
+ if c == "/":
2377
+ self._flush_text()
2378
+ self.current_tag_kind = Tag.END
2379
+ self.current_tag_attrs = {}
2380
+ self.state = self.SELF_CLOSING_START_TAG
2381
+ return False
2382
+ if c == ">":
2383
+ self._flush_text()
2384
+ attrs = []
2385
+ tag = Tag(Tag.END, tag_name, attrs, False)
2386
+ self._emit_token(tag)
2387
+ self.state = self.DATA
2388
+ self.rawtext_tag_name = None
2389
+ self.current_tag_name.clear()
2390
+ self.original_tag_name.clear()
2391
+ return False
2392
+ # Not an appropriate end tag
2393
+ self.text_buffer.extend(("<", "/"))
2394
+ for ch in self.temp_buffer:
2395
+ self._append_text(ch)
2396
+ self._reconsume_current()
2397
+ self.state = self.SCRIPT_DATA_ESCAPED
2398
+ return False
2399
+
2400
+ def _state_script_data_double_escape_start(self):
2401
+ c = self._get_char()
2402
+ if c in (" ", "\t", "\n", "\r", "\f", "/", ">"):
2403
+ # Check if temp_buffer contains "script"
2404
+ temp = "".join(self.temp_buffer).lower()
2405
+ if temp == "script":
2406
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
2407
+ else:
2408
+ self.state = self.SCRIPT_DATA_ESCAPED
2409
+ self._append_text(c)
2410
+ return False
2411
+ if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2412
+ self.temp_buffer.append(c)
2413
+ self._append_text(c)
2414
+ return False
2415
+ self._reconsume_current()
2416
+ self.state = self.SCRIPT_DATA_ESCAPED
2417
+ return False
2418
+
2419
+ def _state_script_data_double_escaped(self):
2420
+ c = self._get_char()
2421
+ if c is None:
2422
+ self._flush_text()
2423
+ self._emit_token(EOFToken())
2424
+ return True
2425
+ if c == "-":
2426
+ self._append_text("-")
2427
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED_DASH
2428
+ return False
2429
+ if c == "<":
2430
+ self._append_text("<")
2431
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN
2432
+ return False
2433
+ if c == "\0":
2434
+ self._emit_error("unexpected-null-character")
2435
+ self._append_text("\ufffd")
2436
+ return False
2437
+ self._append_text(c)
2438
+ return False
2439
+
2440
+ def _state_script_data_double_escaped_dash(self):
2441
+ c = self._get_char()
2442
+ if c is None:
2443
+ self._flush_text()
2444
+ self._emit_token(EOFToken())
2445
+ return True
2446
+ if c == "-":
2447
+ self._append_text("-")
2448
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH
2449
+ return False
2450
+ if c == "<":
2451
+ self._append_text("<")
2452
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN
2453
+ return False
2454
+ if c == "\0":
2455
+ self._emit_error("unexpected-null-character")
2456
+ self._append_text("\ufffd")
2457
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
2458
+ return False
2459
+ self._append_text(c)
2460
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
2461
+ return False
2462
+
2463
+ def _state_script_data_double_escaped_dash_dash(self):
2464
+ c = self._get_char()
2465
+ if c is None:
2466
+ self._flush_text()
2467
+ self._emit_token(EOFToken())
2468
+ return True
2469
+ if c == "-":
2470
+ self._append_text("-")
2471
+ return False
2472
+ if c == "<":
2473
+ self._append_text("<")
2474
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN
2475
+
2476
+ return False
2477
+ if c == ">":
2478
+ self._append_text(">")
2479
+ self.state = self.RAWTEXT
2480
+
2481
+ return False
2482
+ if c == "\0":
2483
+ self._emit_error("unexpected-null-character")
2484
+ self._append_text("\ufffd")
2485
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
2486
+ return False
2487
+ self._append_text(c)
2488
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
2489
+ return False
2490
+
2491
+ def _state_script_data_double_escaped_less_than_sign(self):
2492
+ c = self._get_char()
2493
+ if c == "/":
2494
+ self.temp_buffer.clear()
2495
+ self._append_text("/")
2496
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPE_END
2497
+ return False
2498
+ if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2499
+ self.temp_buffer.clear()
2500
+ self._reconsume_current()
2501
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPE_START
2502
+ return False
2503
+ self._reconsume_current()
2504
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
2505
+ return False
2506
+
2507
+ def _state_script_data_double_escape_end(self):
2508
+ c = self._get_char()
2509
+ if c in (" ", "\t", "\n", "\r", "\f", "/", ">"):
2510
+ # Check if temp_buffer contains "script"
2511
+ temp = "".join(self.temp_buffer).lower()
2512
+
2513
+ if temp == "script":
2514
+ self.state = self.SCRIPT_DATA_ESCAPED
2515
+ else:
2516
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
2517
+ self._append_text(c)
2518
+ return False
2519
+ if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2520
+ self.temp_buffer.append(c)
2521
+ self._append_text(c)
2522
+ return False
2523
+ self._reconsume_current()
2524
+ self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
2525
+ return False
2526
+
2527
+
2528
# Table of state handler functions, attached to the class after its body so
# that every method already exists when it is referenced.
# NOTE(review): presumably indexed by the integer state constants
# (Tokenizer.DATA, Tokenizer.RCDATA, ...), in which case the order here must
# match their numeric values exactly - confirm before reordering or inserting.
Tokenizer._STATE_HANDLERS = [
    Tokenizer._state_data,
    Tokenizer._state_tag_open,
    Tokenizer._state_end_tag_open,
    Tokenizer._state_tag_name,
    Tokenizer._state_before_attribute_name,
    Tokenizer._state_attribute_name,
    Tokenizer._state_after_attribute_name,
    Tokenizer._state_before_attribute_value,
    Tokenizer._state_attribute_value_double,
    Tokenizer._state_attribute_value_single,
    Tokenizer._state_attribute_value_unquoted,
    Tokenizer._state_after_attribute_value_quoted,
    Tokenizer._state_self_closing_start_tag,
    Tokenizer._state_markup_declaration_open,
    Tokenizer._state_comment_start,
    Tokenizer._state_comment_start_dash,
    Tokenizer._state_comment,
    Tokenizer._state_comment_end_dash,
    Tokenizer._state_comment_end,
    Tokenizer._state_comment_end_bang,
    Tokenizer._state_bogus_comment,
    Tokenizer._state_doctype,
    Tokenizer._state_before_doctype_name,
    Tokenizer._state_doctype_name,
    Tokenizer._state_after_doctype_name,
    Tokenizer._state_bogus_doctype,
    Tokenizer._state_after_doctype_public_keyword,
    Tokenizer._state_after_doctype_system_keyword,
    Tokenizer._state_before_doctype_public_identifier,
    Tokenizer._state_doctype_public_identifier_double_quoted,
    Tokenizer._state_doctype_public_identifier_single_quoted,
    Tokenizer._state_after_doctype_public_identifier,
    Tokenizer._state_between_doctype_public_and_system_identifiers,
    Tokenizer._state_before_doctype_system_identifier,
    Tokenizer._state_doctype_system_identifier_double_quoted,
    Tokenizer._state_doctype_system_identifier_single_quoted,
    Tokenizer._state_after_doctype_system_identifier,
    Tokenizer._state_cdata_section,
    Tokenizer._state_cdata_section_bracket,
    Tokenizer._state_cdata_section_end,
    Tokenizer._state_rcdata,
    Tokenizer._state_rcdata_less_than_sign,
    Tokenizer._state_rcdata_end_tag_open,
    Tokenizer._state_rcdata_end_tag_name,
    Tokenizer._state_rawtext,
    Tokenizer._state_rawtext_less_than_sign,
    Tokenizer._state_rawtext_end_tag_open,
    Tokenizer._state_rawtext_end_tag_name,
    Tokenizer._state_plaintext,
    Tokenizer._state_script_data_escaped,
    Tokenizer._state_script_data_escaped_dash,
    Tokenizer._state_script_data_escaped_dash_dash,
    Tokenizer._state_script_data_escaped_less_than_sign,
    Tokenizer._state_script_data_escaped_end_tag_open,
    Tokenizer._state_script_data_escaped_end_tag_name,
    Tokenizer._state_script_data_double_escape_start,
    Tokenizer._state_script_data_double_escaped,
    Tokenizer._state_script_data_double_escaped_dash,
    Tokenizer._state_script_data_double_escaped_dash_dash,
    Tokenizer._state_script_data_double_escaped_less_than_sign,
    Tokenizer._state_script_data_double_escape_end,
]