justhtml 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +44 -2
- justhtml/__main__.py +45 -9
- justhtml/constants.py +12 -0
- justhtml/errors.py +8 -3
- justhtml/linkify.py +438 -0
- justhtml/node.py +54 -35
- justhtml/parser.py +105 -38
- justhtml/sanitize.py +511 -282
- justhtml/selector.py +3 -1
- justhtml/serialize.py +398 -72
- justhtml/tokenizer.py +121 -21
- justhtml/tokens.py +21 -3
- justhtml/transforms.py +2568 -0
- justhtml/treebuilder.py +247 -190
- justhtml/treebuilder_modes.py +108 -102
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/METADATA +28 -7
- justhtml-0.38.0.dist-info/RECORD +26 -0
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/licenses/LICENSE +1 -1
- justhtml-0.24.0.dist-info/RECORD +0 -24
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/WHEEL +0 -0
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/entry_points.txt +0 -0
justhtml/tokenizer.py
CHANGED
|
@@ -9,7 +9,7 @@ if TYPE_CHECKING:
|
|
|
9
9
|
|
|
10
10
|
from .entities import decode_entities_in_text
|
|
11
11
|
from .errors import generate_error_message
|
|
12
|
-
from .tokens import CommentToken, Doctype, DoctypeToken, EOFToken, ParseError, Tag
|
|
12
|
+
from .tokens import AnyToken, CharacterTokens, CommentToken, Doctype, DoctypeToken, EOFToken, ParseError, Tag
|
|
13
13
|
|
|
14
14
|
_ATTR_VALUE_UNQUOTED_TERMINATORS = "\t\n\f >&\"'<=`\0"
|
|
15
15
|
_ASCII_LOWER_TABLE = str.maketrans({chr(code): chr(code + 32) for code in range(65, 91)})
|
|
@@ -79,7 +79,14 @@ def _coerce_comment_for_xml(text: str) -> str:
|
|
|
79
79
|
|
|
80
80
|
|
|
81
81
|
class TokenizerOpts:
|
|
82
|
-
__slots__ = (
|
|
82
|
+
__slots__ = (
|
|
83
|
+
"discard_bom",
|
|
84
|
+
"emit_bogus_markup_as_text",
|
|
85
|
+
"exact_errors",
|
|
86
|
+
"initial_rawtext_tag",
|
|
87
|
+
"initial_state",
|
|
88
|
+
"xml_coercion",
|
|
89
|
+
)
|
|
83
90
|
|
|
84
91
|
discard_bom: bool
|
|
85
92
|
exact_errors: bool
|
|
@@ -91,12 +98,14 @@ class TokenizerOpts:
|
|
|
91
98
|
self,
|
|
92
99
|
exact_errors: bool = False,
|
|
93
100
|
discard_bom: bool = True,
|
|
101
|
+
emit_bogus_markup_as_text: bool = False,
|
|
94
102
|
initial_state: int | None = None,
|
|
95
103
|
initial_rawtext_tag: str | None = None,
|
|
96
104
|
xml_coercion: bool = False,
|
|
97
105
|
) -> None:
|
|
98
106
|
self.exact_errors = bool(exact_errors)
|
|
99
107
|
self.discard_bom = bool(discard_bom)
|
|
108
|
+
self.emit_bogus_markup_as_text = bool(emit_bogus_markup_as_text)
|
|
100
109
|
self.initial_state = initial_state
|
|
101
110
|
self.initial_rawtext_tag = initial_rawtext_tag
|
|
102
111
|
self.xml_coercion = bool(xml_coercion)
|
|
@@ -203,6 +212,7 @@ class Tokenizer:
|
|
|
203
212
|
"text_buffer",
|
|
204
213
|
"text_start_pos",
|
|
205
214
|
"track_node_locations",
|
|
215
|
+
"track_tag_positions",
|
|
206
216
|
)
|
|
207
217
|
|
|
208
218
|
_comment_token: CommentToken
|
|
@@ -211,6 +221,7 @@ class Tokenizer:
|
|
|
211
221
|
_tag_token: Tag
|
|
212
222
|
buffer: str
|
|
213
223
|
collect_errors: bool
|
|
224
|
+
track_tag_positions: bool
|
|
214
225
|
track_node_locations: bool
|
|
215
226
|
current_attr_name: list[str]
|
|
216
227
|
current_attr_value: list[str]
|
|
@@ -252,11 +263,13 @@ class Tokenizer:
|
|
|
252
263
|
*,
|
|
253
264
|
collect_errors: bool = False,
|
|
254
265
|
track_node_locations: bool = False,
|
|
266
|
+
track_tag_positions: bool = False,
|
|
255
267
|
) -> None:
|
|
256
268
|
self.sink = sink
|
|
257
269
|
self.opts = opts or TokenizerOpts()
|
|
258
270
|
self.collect_errors = collect_errors
|
|
259
271
|
self.track_node_locations = bool(track_node_locations)
|
|
272
|
+
self.track_tag_positions = bool(track_tag_positions)
|
|
260
273
|
self.errors = []
|
|
261
274
|
|
|
262
275
|
self.state = self.DATA
|
|
@@ -396,8 +409,9 @@ class Tokenizer:
|
|
|
396
409
|
|
|
397
410
|
def run(self, html: str | None) -> None:
|
|
398
411
|
self.initialize(html)
|
|
412
|
+
handlers = self._STATE_HANDLERS # type: ignore[attr-defined]
|
|
399
413
|
while True:
|
|
400
|
-
if self.
|
|
414
|
+
if handlers[self.state](self): # type: ignore[no-any-return]
|
|
401
415
|
break
|
|
402
416
|
|
|
403
417
|
# ---------------------
|
|
@@ -486,7 +500,7 @@ class Tokenizer:
|
|
|
486
500
|
self.state = self.TAG_NAME
|
|
487
501
|
return self._state_tag_name()
|
|
488
502
|
|
|
489
|
-
if nc == "!":
|
|
503
|
+
if nc == "!" and not self.opts.emit_bogus_markup_as_text:
|
|
490
504
|
# Optimization: Peek ahead for comments
|
|
491
505
|
if pos + 2 < length and buffer[pos + 1] == "-" and buffer[pos + 2] == "-":
|
|
492
506
|
self._flush_text()
|
|
@@ -529,12 +543,20 @@ class Tokenizer:
|
|
|
529
543
|
self._emit_token(EOFToken())
|
|
530
544
|
return True
|
|
531
545
|
if c == "!":
|
|
546
|
+
if self.opts.emit_bogus_markup_as_text:
|
|
547
|
+
self._append_text("<!")
|
|
548
|
+
self.state = self.DATA
|
|
549
|
+
return False
|
|
532
550
|
self.state = self.MARKUP_DECLARATION_OPEN
|
|
533
551
|
return False
|
|
534
552
|
if c == "/":
|
|
535
553
|
self.state = self.END_TAG_OPEN
|
|
536
554
|
return False
|
|
537
555
|
if c == "?":
|
|
556
|
+
if self.opts.emit_bogus_markup_as_text:
|
|
557
|
+
self._append_text("<?")
|
|
558
|
+
self.state = self.DATA
|
|
559
|
+
return False
|
|
538
560
|
self._emit_error("unexpected-question-mark-instead-of-tag-name")
|
|
539
561
|
self.current_comment.clear()
|
|
540
562
|
self._reconsume_current()
|
|
@@ -551,6 +573,11 @@ class Tokenizer:
|
|
|
551
573
|
c = self._get_char()
|
|
552
574
|
if c is None:
|
|
553
575
|
self._emit_error("eof-before-tag-name")
|
|
576
|
+
if self.opts.emit_bogus_markup_as_text:
|
|
577
|
+
self._append_text("</")
|
|
578
|
+
self._flush_text()
|
|
579
|
+
self._emit_token(EOFToken())
|
|
580
|
+
return True
|
|
554
581
|
self._append_text("<")
|
|
555
582
|
self._append_text("/")
|
|
556
583
|
self._flush_text()
|
|
@@ -558,6 +585,16 @@ class Tokenizer:
|
|
|
558
585
|
return True
|
|
559
586
|
if c == ">":
|
|
560
587
|
self._emit_error("empty-end-tag")
|
|
588
|
+
if self.opts.emit_bogus_markup_as_text:
|
|
589
|
+
self._append_text("</>")
|
|
590
|
+
self.state = self.DATA
|
|
591
|
+
return False
|
|
592
|
+
self.state = self.DATA
|
|
593
|
+
return False
|
|
594
|
+
|
|
595
|
+
if self.opts.emit_bogus_markup_as_text:
|
|
596
|
+
self._append_text("</")
|
|
597
|
+
self._append_text(c)
|
|
561
598
|
self.state = self.DATA
|
|
562
599
|
return False
|
|
563
600
|
|
|
@@ -593,6 +630,8 @@ class Tokenizer:
|
|
|
593
630
|
if pos < length:
|
|
594
631
|
next_char = buffer[pos]
|
|
595
632
|
if next_char in (" ", "\t", "\n", "\f"):
|
|
633
|
+
if self.current_tag_kind == Tag.END and self.opts.emit_bogus_markup_as_text:
|
|
634
|
+
return self._emit_raw_end_tag_as_text(pos)
|
|
596
635
|
pos += 1
|
|
597
636
|
self.pos = pos
|
|
598
637
|
self.state = self.BEFORE_ATTRIBUTE_NAME
|
|
@@ -604,6 +643,8 @@ class Tokenizer:
|
|
|
604
643
|
self.state = self.DATA
|
|
605
644
|
return False
|
|
606
645
|
if next_char == "/":
|
|
646
|
+
if self.current_tag_kind == Tag.END and self.opts.emit_bogus_markup_as_text:
|
|
647
|
+
return self._emit_raw_end_tag_as_text(pos)
|
|
607
648
|
pos += 1
|
|
608
649
|
self.pos = pos
|
|
609
650
|
self.state = self.SELF_CLOSING_START_TAG
|
|
@@ -620,15 +661,20 @@ class Tokenizer:
|
|
|
620
661
|
if c is None:
|
|
621
662
|
self.pos = pos
|
|
622
663
|
self._emit_error("eof-in-tag")
|
|
623
|
-
|
|
624
|
-
# The incomplete tag is discarded (not emitted as text)
|
|
664
|
+
self._emit_incomplete_tag_as_text()
|
|
625
665
|
self._emit_token(EOFToken())
|
|
626
666
|
return True
|
|
627
667
|
if c in ("\t", "\n", "\f", " "):
|
|
668
|
+
if self.current_tag_kind == Tag.END and self.opts.emit_bogus_markup_as_text:
|
|
669
|
+
self.pos = pos
|
|
670
|
+
return self._emit_raw_end_tag_as_text(pos)
|
|
628
671
|
self.pos = pos
|
|
629
672
|
self.state = self.BEFORE_ATTRIBUTE_NAME
|
|
630
673
|
return self._state_before_attribute_name()
|
|
631
674
|
if c == "/":
|
|
675
|
+
if self.current_tag_kind == Tag.END and self.opts.emit_bogus_markup_as_text:
|
|
676
|
+
self.pos = pos
|
|
677
|
+
return self._emit_raw_end_tag_as_text(pos)
|
|
632
678
|
self.pos = pos
|
|
633
679
|
self.state = self.SELF_CLOSING_START_TAG
|
|
634
680
|
return self._state_self_closing_start_tag()
|
|
@@ -675,6 +721,7 @@ class Tokenizer:
|
|
|
675
721
|
|
|
676
722
|
if c is None:
|
|
677
723
|
self._emit_error("eof-in-tag")
|
|
724
|
+
self._emit_incomplete_tag_as_text()
|
|
678
725
|
self._flush_text()
|
|
679
726
|
self._emit_token(EOFToken())
|
|
680
727
|
return True
|
|
@@ -733,37 +780,43 @@ class Tokenizer:
|
|
|
733
780
|
pos = match.end()
|
|
734
781
|
|
|
735
782
|
if pos < length:
|
|
736
|
-
|
|
737
|
-
if
|
|
783
|
+
next_char = buffer[pos]
|
|
784
|
+
if next_char == "=":
|
|
738
785
|
pos += 1
|
|
739
786
|
self.pos = pos
|
|
740
787
|
self.state = self.BEFORE_ATTRIBUTE_VALUE
|
|
741
788
|
return self._state_before_attribute_value()
|
|
742
|
-
if
|
|
789
|
+
if next_char in (" ", "\t", "\n", "\f"):
|
|
743
790
|
pos += 1
|
|
744
791
|
self.pos = pos
|
|
745
792
|
self._finish_attribute()
|
|
746
793
|
self.state = self.AFTER_ATTRIBUTE_NAME
|
|
747
794
|
return False # Let main loop dispatch to avoid recursion
|
|
748
|
-
if
|
|
795
|
+
if next_char == ">":
|
|
749
796
|
pos += 1
|
|
750
797
|
self.pos = pos
|
|
751
798
|
self._finish_attribute()
|
|
752
799
|
if not self._emit_current_tag():
|
|
753
800
|
self.state = self.DATA
|
|
754
801
|
return False
|
|
755
|
-
if
|
|
802
|
+
if next_char == "/":
|
|
756
803
|
pos += 1
|
|
757
804
|
self.pos = pos
|
|
758
805
|
self._finish_attribute()
|
|
759
806
|
self.state = self.SELF_CLOSING_START_TAG
|
|
760
807
|
return self._state_self_closing_start_tag()
|
|
761
808
|
|
|
809
|
+
# Inline _get_char (reconsume is never True in this state)
|
|
810
|
+
if pos >= length:
|
|
811
|
+
c: str | None = None
|
|
812
|
+
else:
|
|
813
|
+
c = buffer[pos]
|
|
814
|
+
pos += 1
|
|
815
|
+
self.current_char = c
|
|
762
816
|
self.pos = pos
|
|
763
|
-
c = self._get_char() # type: ignore[assignment]
|
|
764
|
-
pos = self.pos
|
|
765
817
|
if c is None:
|
|
766
818
|
self._emit_error("eof-in-tag")
|
|
819
|
+
self._emit_incomplete_tag_as_text()
|
|
767
820
|
self._flush_text()
|
|
768
821
|
self._emit_token(EOFToken())
|
|
769
822
|
return True
|
|
@@ -798,9 +851,8 @@ class Tokenizer:
|
|
|
798
851
|
# Optimization: Skip whitespace
|
|
799
852
|
if not self.reconsume:
|
|
800
853
|
if self.pos < length:
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
self.pos = match.end()
|
|
854
|
+
if buffer[self.pos] in " \t\n\f":
|
|
855
|
+
self.pos = _WHITESPACE_PATTERN.match(buffer, self.pos).end() # type: ignore[union-attr]
|
|
804
856
|
|
|
805
857
|
# Inline _get_char
|
|
806
858
|
if self.pos >= length:
|
|
@@ -816,6 +868,7 @@ class Tokenizer:
|
|
|
816
868
|
|
|
817
869
|
if c is None:
|
|
818
870
|
self._emit_error("eof-in-tag")
|
|
871
|
+
self._emit_incomplete_tag_as_text()
|
|
819
872
|
self._flush_text()
|
|
820
873
|
self._emit_token(EOFToken())
|
|
821
874
|
return True
|
|
@@ -846,9 +899,17 @@ class Tokenizer:
|
|
|
846
899
|
|
|
847
900
|
def _state_before_attribute_value(self) -> bool:
|
|
848
901
|
while True:
|
|
849
|
-
|
|
902
|
+
# Inline _get_char (reconsume is never True in this state)
|
|
903
|
+
pos = self.pos
|
|
904
|
+
if pos >= self.length:
|
|
905
|
+
c: str | None = None
|
|
906
|
+
else:
|
|
907
|
+
c = self.buffer[pos]
|
|
908
|
+
self.pos = pos + 1
|
|
909
|
+
self.current_char = c
|
|
850
910
|
if c is None:
|
|
851
911
|
self._emit_error("eof-in-tag")
|
|
912
|
+
self._emit_incomplete_tag_as_text()
|
|
852
913
|
self._flush_text()
|
|
853
914
|
self._emit_token(EOFToken())
|
|
854
915
|
return True
|
|
@@ -906,6 +967,7 @@ class Tokenizer:
|
|
|
906
967
|
if self.pos >= length:
|
|
907
968
|
self.current_char = None
|
|
908
969
|
self._emit_error("eof-in-tag")
|
|
970
|
+
self._emit_incomplete_tag_as_text()
|
|
909
971
|
self._emit_token(EOFToken())
|
|
910
972
|
return True
|
|
911
973
|
|
|
@@ -961,6 +1023,7 @@ class Tokenizer:
|
|
|
961
1023
|
if self.pos >= length:
|
|
962
1024
|
self.current_char = None
|
|
963
1025
|
self._emit_error("eof-in-tag")
|
|
1026
|
+
self._emit_incomplete_tag_as_text()
|
|
964
1027
|
self._emit_token(EOFToken())
|
|
965
1028
|
return True
|
|
966
1029
|
|
|
@@ -1014,6 +1077,7 @@ class Tokenizer:
|
|
|
1014
1077
|
# Per HTML5 spec: EOF in attribute value is a parse error
|
|
1015
1078
|
# The incomplete tag is discarded (not emitted)
|
|
1016
1079
|
self._emit_error("eof-in-tag")
|
|
1080
|
+
self._emit_incomplete_tag_as_text()
|
|
1017
1081
|
self._emit_token(EOFToken())
|
|
1018
1082
|
return True
|
|
1019
1083
|
if c in ("\t", "\n", "\f", " "):
|
|
@@ -1049,6 +1113,7 @@ class Tokenizer:
|
|
|
1049
1113
|
|
|
1050
1114
|
if c is None:
|
|
1051
1115
|
self._emit_error("eof-in-tag")
|
|
1116
|
+
self._emit_incomplete_tag_as_text()
|
|
1052
1117
|
self._flush_text()
|
|
1053
1118
|
self._emit_token(EOFToken())
|
|
1054
1119
|
return True
|
|
@@ -1076,6 +1141,7 @@ class Tokenizer:
|
|
|
1076
1141
|
c = self._get_char()
|
|
1077
1142
|
if c is None:
|
|
1078
1143
|
self._emit_error("eof-in-tag")
|
|
1144
|
+
self._emit_incomplete_tag_as_text()
|
|
1079
1145
|
self._flush_text()
|
|
1080
1146
|
self._emit_token(EOFToken())
|
|
1081
1147
|
return True
|
|
@@ -1797,6 +1863,7 @@ class Tokenizer:
|
|
|
1797
1863
|
"unexpected-null-character",
|
|
1798
1864
|
line=line,
|
|
1799
1865
|
column=column,
|
|
1866
|
+
category="tokenizer",
|
|
1800
1867
|
message=message,
|
|
1801
1868
|
source_html=self.buffer,
|
|
1802
1869
|
)
|
|
@@ -1878,7 +1945,12 @@ class Tokenizer:
|
|
|
1878
1945
|
tag.name = name
|
|
1879
1946
|
tag.attrs = attrs
|
|
1880
1947
|
tag.self_closing = self.current_tag_self_closing
|
|
1881
|
-
|
|
1948
|
+
if self.track_tag_positions:
|
|
1949
|
+
tag.start_pos = self.current_token_start_pos
|
|
1950
|
+
tag.end_pos = self.pos
|
|
1951
|
+
else:
|
|
1952
|
+
tag.start_pos = None
|
|
1953
|
+
tag.end_pos = None
|
|
1882
1954
|
self.last_token_start_pos = tag.start_pos
|
|
1883
1955
|
|
|
1884
1956
|
switched_to_rawtext = False
|
|
@@ -1919,6 +1991,30 @@ class Tokenizer:
|
|
|
1919
1991
|
self.current_tag_kind = Tag.START
|
|
1920
1992
|
return switched_to_rawtext
|
|
1921
1993
|
|
|
1994
|
+
def _emit_incomplete_tag_as_text(self) -> None:
|
|
1995
|
+
if not self.opts.emit_bogus_markup_as_text:
|
|
1996
|
+
return
|
|
1997
|
+
start = self.current_token_start_pos
|
|
1998
|
+
if start is None: # pragma: no cover
|
|
1999
|
+
return
|
|
2000
|
+
raw = self.buffer[start : self.pos]
|
|
2001
|
+
if raw: # pragma: no branch
|
|
2002
|
+
self._emit_token(CharacterTokens(raw))
|
|
2003
|
+
|
|
2004
|
+
def _emit_raw_end_tag_as_text(self, pos: int) -> bool:
|
|
2005
|
+
end = self.buffer.find(">", pos)
|
|
2006
|
+
if end == -1:
|
|
2007
|
+
self.pos = self.length
|
|
2008
|
+
self._emit_incomplete_tag_as_text()
|
|
2009
|
+
self._emit_token(EOFToken())
|
|
2010
|
+
return True
|
|
2011
|
+
self.pos = end + 1
|
|
2012
|
+
raw = self.buffer[self.current_token_start_pos : self.pos]
|
|
2013
|
+
if raw: # pragma: no branch
|
|
2014
|
+
self._emit_token(CharacterTokens(raw))
|
|
2015
|
+
self.state = self.DATA
|
|
2016
|
+
return False
|
|
2017
|
+
|
|
1922
2018
|
def _emit_comment(self) -> None:
|
|
1923
2019
|
data = "".join(self.current_comment)
|
|
1924
2020
|
self.current_comment.clear()
|
|
@@ -1947,7 +2043,7 @@ class Tokenizer:
|
|
|
1947
2043
|
self.current_doctype_force_quirks = False
|
|
1948
2044
|
self._emit_token(DoctypeToken(doctype))
|
|
1949
2045
|
|
|
1950
|
-
def _emit_token(self, token:
|
|
2046
|
+
def _emit_token(self, token: AnyToken) -> None:
|
|
1951
2047
|
if self.collect_errors:
|
|
1952
2048
|
self._record_token_position()
|
|
1953
2049
|
self.sink.process_token(token)
|
|
@@ -1998,7 +2094,9 @@ class Tokenizer:
|
|
|
1998
2094
|
|
|
1999
2095
|
message = generate_error_message(code)
|
|
2000
2096
|
line = self._get_line_at_pos(self.pos)
|
|
2001
|
-
self.errors.append(
|
|
2097
|
+
self.errors.append(
|
|
2098
|
+
ParseError(code, line=line, column=column, category="tokenizer", message=message, source_html=self.buffer)
|
|
2099
|
+
)
|
|
2002
2100
|
|
|
2003
2101
|
def _emit_error_at_pos(self, code: str, pos: int) -> None:
|
|
2004
2102
|
last_newline = self.buffer.rfind("\n", 0, pos + 1)
|
|
@@ -2009,7 +2107,9 @@ class Tokenizer:
|
|
|
2009
2107
|
|
|
2010
2108
|
message = generate_error_message(code)
|
|
2011
2109
|
line = self._get_line_at_pos(pos)
|
|
2012
|
-
self.errors.append(
|
|
2110
|
+
self.errors.append(
|
|
2111
|
+
ParseError(code, line=line, column=column, category="tokenizer", message=message, source_html=self.buffer)
|
|
2112
|
+
)
|
|
2013
2113
|
|
|
2014
2114
|
def _consume_if(self, literal: str) -> bool:
|
|
2015
2115
|
end = self.pos + len(literal)
|
justhtml/tokens.py
CHANGED
|
@@ -4,7 +4,7 @@ from typing import Literal
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
class Tag:
|
|
7
|
-
__slots__ = ("attrs", "kind", "name", "self_closing", "start_pos")
|
|
7
|
+
__slots__ = ("attrs", "end_pos", "kind", "name", "self_closing", "start_pos")
|
|
8
8
|
|
|
9
9
|
START: Literal[0] = 0
|
|
10
10
|
END: Literal[1] = 1
|
|
@@ -12,6 +12,7 @@ class Tag:
|
|
|
12
12
|
kind: int
|
|
13
13
|
name: str
|
|
14
14
|
attrs: dict[str, str | None]
|
|
15
|
+
end_pos: int | None
|
|
15
16
|
self_closing: bool
|
|
16
17
|
start_pos: int | None
|
|
17
18
|
|
|
@@ -22,12 +23,14 @@ class Tag:
|
|
|
22
23
|
attrs: dict[str, str | None] | None,
|
|
23
24
|
self_closing: bool = False,
|
|
24
25
|
start_pos: int | None = None,
|
|
26
|
+
end_pos: int | None = None,
|
|
25
27
|
) -> None:
|
|
26
28
|
self.kind = kind
|
|
27
29
|
self.name = name
|
|
28
30
|
self.attrs = attrs if attrs is not None else {}
|
|
29
31
|
self.self_closing = bool(self_closing)
|
|
30
32
|
self.start_pos = start_pos
|
|
33
|
+
self.end_pos = end_pos
|
|
31
34
|
|
|
32
35
|
|
|
33
36
|
class CharacterTokens:
|
|
@@ -84,6 +87,9 @@ class EOFToken:
|
|
|
84
87
|
__slots__ = ()
|
|
85
88
|
|
|
86
89
|
|
|
90
|
+
AnyToken = Tag | CharacterTokens | CommentToken | DoctypeToken | EOFToken
|
|
91
|
+
|
|
92
|
+
|
|
87
93
|
class TokenSinkResult:
|
|
88
94
|
__slots__ = ()
|
|
89
95
|
|
|
@@ -94,8 +100,9 @@ class TokenSinkResult:
|
|
|
94
100
|
class ParseError:
|
|
95
101
|
"""Represents a parse error with location information."""
|
|
96
102
|
|
|
97
|
-
__slots__ = ("_end_column", "_source_html", "code", "column", "line", "message")
|
|
103
|
+
__slots__ = ("_end_column", "_source_html", "category", "code", "column", "line", "message")
|
|
98
104
|
|
|
105
|
+
category: str
|
|
99
106
|
code: str
|
|
100
107
|
line: int | None
|
|
101
108
|
column: int | None
|
|
@@ -110,10 +117,12 @@ class ParseError:
|
|
|
110
117
|
code: str,
|
|
111
118
|
line: int | None = None,
|
|
112
119
|
column: int | None = None,
|
|
120
|
+
category: str = "parse",
|
|
113
121
|
message: str | None = None,
|
|
114
122
|
source_html: str | None = None,
|
|
115
123
|
end_column: int | None = None,
|
|
116
124
|
) -> None:
|
|
125
|
+
self.category = category
|
|
117
126
|
self.code = code
|
|
118
127
|
self.line = line
|
|
119
128
|
self.column = column
|
|
@@ -123,7 +132,11 @@ class ParseError:
|
|
|
123
132
|
|
|
124
133
|
def __repr__(self) -> str:
|
|
125
134
|
if self.line is not None and self.column is not None:
|
|
135
|
+
if self.category != "parse":
|
|
136
|
+
return f"ParseError({self.code!r}, line={self.line}, column={self.column}, category={self.category!r})"
|
|
126
137
|
return f"ParseError({self.code!r}, line={self.line}, column={self.column})"
|
|
138
|
+
if self.category != "parse":
|
|
139
|
+
return f"ParseError({self.code!r}, category={self.category!r})"
|
|
127
140
|
return f"ParseError({self.code!r})"
|
|
128
141
|
|
|
129
142
|
def __str__(self) -> str:
|
|
@@ -138,7 +151,12 @@ class ParseError:
|
|
|
138
151
|
def __eq__(self, other: object) -> bool:
|
|
139
152
|
if not isinstance(other, ParseError):
|
|
140
153
|
return NotImplemented
|
|
141
|
-
return
|
|
154
|
+
return (
|
|
155
|
+
self.category == other.category
|
|
156
|
+
and self.code == other.code
|
|
157
|
+
and self.line == other.line
|
|
158
|
+
and self.column == other.column
|
|
159
|
+
)
|
|
142
160
|
|
|
143
161
|
def as_exception(self, end_column: int | None = None) -> SyntaxError:
|
|
144
162
|
"""Convert to a SyntaxError-like exception with source highlighting.
|