justhtml 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic. Click here for more details.

justhtml/tokenizer.py CHANGED
@@ -9,7 +9,7 @@ if TYPE_CHECKING:
9
9
 
10
10
  from .entities import decode_entities_in_text
11
11
  from .errors import generate_error_message
12
- from .tokens import CommentToken, Doctype, DoctypeToken, EOFToken, ParseError, Tag
12
+ from .tokens import AnyToken, CharacterTokens, CommentToken, Doctype, DoctypeToken, EOFToken, ParseError, Tag
13
13
 
14
14
  _ATTR_VALUE_UNQUOTED_TERMINATORS = "\t\n\f >&\"'<=`\0"
15
15
  _ASCII_LOWER_TABLE = str.maketrans({chr(code): chr(code + 32) for code in range(65, 91)})
@@ -79,7 +79,14 @@ def _coerce_comment_for_xml(text: str) -> str:
79
79
 
80
80
 
81
81
  class TokenizerOpts:
82
- __slots__ = ("discard_bom", "exact_errors", "initial_rawtext_tag", "initial_state", "xml_coercion")
82
+ __slots__ = (
83
+ "discard_bom",
84
+ "emit_bogus_markup_as_text",
85
+ "exact_errors",
86
+ "initial_rawtext_tag",
87
+ "initial_state",
88
+ "xml_coercion",
89
+ )
83
90
 
84
91
  discard_bom: bool
85
92
  exact_errors: bool
@@ -91,12 +98,14 @@ class TokenizerOpts:
91
98
  self,
92
99
  exact_errors: bool = False,
93
100
  discard_bom: bool = True,
101
+ emit_bogus_markup_as_text: bool = False,
94
102
  initial_state: int | None = None,
95
103
  initial_rawtext_tag: str | None = None,
96
104
  xml_coercion: bool = False,
97
105
  ) -> None:
98
106
  self.exact_errors = bool(exact_errors)
99
107
  self.discard_bom = bool(discard_bom)
108
+ self.emit_bogus_markup_as_text = bool(emit_bogus_markup_as_text)
100
109
  self.initial_state = initial_state
101
110
  self.initial_rawtext_tag = initial_rawtext_tag
102
111
  self.xml_coercion = bool(xml_coercion)
@@ -203,6 +212,7 @@ class Tokenizer:
203
212
  "text_buffer",
204
213
  "text_start_pos",
205
214
  "track_node_locations",
215
+ "track_tag_positions",
206
216
  )
207
217
 
208
218
  _comment_token: CommentToken
@@ -211,6 +221,7 @@ class Tokenizer:
211
221
  _tag_token: Tag
212
222
  buffer: str
213
223
  collect_errors: bool
224
+ track_tag_positions: bool
214
225
  track_node_locations: bool
215
226
  current_attr_name: list[str]
216
227
  current_attr_value: list[str]
@@ -252,11 +263,13 @@ class Tokenizer:
252
263
  *,
253
264
  collect_errors: bool = False,
254
265
  track_node_locations: bool = False,
266
+ track_tag_positions: bool = False,
255
267
  ) -> None:
256
268
  self.sink = sink
257
269
  self.opts = opts or TokenizerOpts()
258
270
  self.collect_errors = collect_errors
259
271
  self.track_node_locations = bool(track_node_locations)
272
+ self.track_tag_positions = bool(track_tag_positions)
260
273
  self.errors = []
261
274
 
262
275
  self.state = self.DATA
@@ -396,8 +409,9 @@ class Tokenizer:
396
409
 
397
410
  def run(self, html: str | None) -> None:
398
411
  self.initialize(html)
412
+ handlers = self._STATE_HANDLERS # type: ignore[attr-defined]
399
413
  while True:
400
- if self.step():
414
+ if handlers[self.state](self): # type: ignore[no-any-return]
401
415
  break
402
416
 
403
417
  # ---------------------
@@ -486,7 +500,7 @@ class Tokenizer:
486
500
  self.state = self.TAG_NAME
487
501
  return self._state_tag_name()
488
502
 
489
- if nc == "!":
503
+ if nc == "!" and not self.opts.emit_bogus_markup_as_text:
490
504
  # Optimization: Peek ahead for comments
491
505
  if pos + 2 < length and buffer[pos + 1] == "-" and buffer[pos + 2] == "-":
492
506
  self._flush_text()
@@ -529,12 +543,20 @@ class Tokenizer:
529
543
  self._emit_token(EOFToken())
530
544
  return True
531
545
  if c == "!":
546
+ if self.opts.emit_bogus_markup_as_text:
547
+ self._append_text("<!")
548
+ self.state = self.DATA
549
+ return False
532
550
  self.state = self.MARKUP_DECLARATION_OPEN
533
551
  return False
534
552
  if c == "/":
535
553
  self.state = self.END_TAG_OPEN
536
554
  return False
537
555
  if c == "?":
556
+ if self.opts.emit_bogus_markup_as_text:
557
+ self._append_text("<?")
558
+ self.state = self.DATA
559
+ return False
538
560
  self._emit_error("unexpected-question-mark-instead-of-tag-name")
539
561
  self.current_comment.clear()
540
562
  self._reconsume_current()
@@ -551,6 +573,11 @@ class Tokenizer:
551
573
  c = self._get_char()
552
574
  if c is None:
553
575
  self._emit_error("eof-before-tag-name")
576
+ if self.opts.emit_bogus_markup_as_text:
577
+ self._append_text("</")
578
+ self._flush_text()
579
+ self._emit_token(EOFToken())
580
+ return True
554
581
  self._append_text("<")
555
582
  self._append_text("/")
556
583
  self._flush_text()
@@ -558,6 +585,16 @@ class Tokenizer:
558
585
  return True
559
586
  if c == ">":
560
587
  self._emit_error("empty-end-tag")
588
+ if self.opts.emit_bogus_markup_as_text:
589
+ self._append_text("</>")
590
+ self.state = self.DATA
591
+ return False
592
+ self.state = self.DATA
593
+ return False
594
+
595
+ if self.opts.emit_bogus_markup_as_text:
596
+ self._append_text("</")
597
+ self._append_text(c)
561
598
  self.state = self.DATA
562
599
  return False
563
600
 
@@ -593,6 +630,8 @@ class Tokenizer:
593
630
  if pos < length:
594
631
  next_char = buffer[pos]
595
632
  if next_char in (" ", "\t", "\n", "\f"):
633
+ if self.current_tag_kind == Tag.END and self.opts.emit_bogus_markup_as_text:
634
+ return self._emit_raw_end_tag_as_text(pos)
596
635
  pos += 1
597
636
  self.pos = pos
598
637
  self.state = self.BEFORE_ATTRIBUTE_NAME
@@ -604,6 +643,8 @@ class Tokenizer:
604
643
  self.state = self.DATA
605
644
  return False
606
645
  if next_char == "/":
646
+ if self.current_tag_kind == Tag.END and self.opts.emit_bogus_markup_as_text:
647
+ return self._emit_raw_end_tag_as_text(pos)
607
648
  pos += 1
608
649
  self.pos = pos
609
650
  self.state = self.SELF_CLOSING_START_TAG
@@ -620,15 +661,20 @@ class Tokenizer:
620
661
  if c is None:
621
662
  self.pos = pos
622
663
  self._emit_error("eof-in-tag")
623
- # Per HTML5 spec: EOF in tag name is a parse error, emit EOF token only
624
- # The incomplete tag is discarded (not emitted as text)
664
+ self._emit_incomplete_tag_as_text()
625
665
  self._emit_token(EOFToken())
626
666
  return True
627
667
  if c in ("\t", "\n", "\f", " "):
668
+ if self.current_tag_kind == Tag.END and self.opts.emit_bogus_markup_as_text:
669
+ self.pos = pos
670
+ return self._emit_raw_end_tag_as_text(pos)
628
671
  self.pos = pos
629
672
  self.state = self.BEFORE_ATTRIBUTE_NAME
630
673
  return self._state_before_attribute_name()
631
674
  if c == "/":
675
+ if self.current_tag_kind == Tag.END and self.opts.emit_bogus_markup_as_text:
676
+ self.pos = pos
677
+ return self._emit_raw_end_tag_as_text(pos)
632
678
  self.pos = pos
633
679
  self.state = self.SELF_CLOSING_START_TAG
634
680
  return self._state_self_closing_start_tag()
@@ -675,6 +721,7 @@ class Tokenizer:
675
721
 
676
722
  if c is None:
677
723
  self._emit_error("eof-in-tag")
724
+ self._emit_incomplete_tag_as_text()
678
725
  self._flush_text()
679
726
  self._emit_token(EOFToken())
680
727
  return True
@@ -733,37 +780,43 @@ class Tokenizer:
733
780
  pos = match.end()
734
781
 
735
782
  if pos < length:
736
- c = buffer[pos]
737
- if c == "=":
783
+ next_char = buffer[pos]
784
+ if next_char == "=":
738
785
  pos += 1
739
786
  self.pos = pos
740
787
  self.state = self.BEFORE_ATTRIBUTE_VALUE
741
788
  return self._state_before_attribute_value()
742
- if c in (" ", "\t", "\n", "\f"):
789
+ if next_char in (" ", "\t", "\n", "\f"):
743
790
  pos += 1
744
791
  self.pos = pos
745
792
  self._finish_attribute()
746
793
  self.state = self.AFTER_ATTRIBUTE_NAME
747
794
  return False # Let main loop dispatch to avoid recursion
748
- if c == ">":
795
+ if next_char == ">":
749
796
  pos += 1
750
797
  self.pos = pos
751
798
  self._finish_attribute()
752
799
  if not self._emit_current_tag():
753
800
  self.state = self.DATA
754
801
  return False
755
- if c == "/":
802
+ if next_char == "/":
756
803
  pos += 1
757
804
  self.pos = pos
758
805
  self._finish_attribute()
759
806
  self.state = self.SELF_CLOSING_START_TAG
760
807
  return self._state_self_closing_start_tag()
761
808
 
809
+ # Inline _get_char (reconsume is never True in this state)
810
+ if pos >= length:
811
+ c: str | None = None
812
+ else:
813
+ c = buffer[pos]
814
+ pos += 1
815
+ self.current_char = c
762
816
  self.pos = pos
763
- c = self._get_char() # type: ignore[assignment]
764
- pos = self.pos
765
817
  if c is None:
766
818
  self._emit_error("eof-in-tag")
819
+ self._emit_incomplete_tag_as_text()
767
820
  self._flush_text()
768
821
  self._emit_token(EOFToken())
769
822
  return True
@@ -798,9 +851,8 @@ class Tokenizer:
798
851
  # Optimization: Skip whitespace
799
852
  if not self.reconsume:
800
853
  if self.pos < length:
801
- match = _WHITESPACE_PATTERN.match(buffer, self.pos)
802
- if match:
803
- self.pos = match.end()
854
+ if buffer[self.pos] in " \t\n\f":
855
+ self.pos = _WHITESPACE_PATTERN.match(buffer, self.pos).end() # type: ignore[union-attr]
804
856
 
805
857
  # Inline _get_char
806
858
  if self.pos >= length:
@@ -816,6 +868,7 @@ class Tokenizer:
816
868
 
817
869
  if c is None:
818
870
  self._emit_error("eof-in-tag")
871
+ self._emit_incomplete_tag_as_text()
819
872
  self._flush_text()
820
873
  self._emit_token(EOFToken())
821
874
  return True
@@ -846,9 +899,17 @@ class Tokenizer:
846
899
 
847
900
  def _state_before_attribute_value(self) -> bool:
848
901
  while True:
849
- c = self._get_char()
902
+ # Inline _get_char (reconsume is never True in this state)
903
+ pos = self.pos
904
+ if pos >= self.length:
905
+ c: str | None = None
906
+ else:
907
+ c = self.buffer[pos]
908
+ self.pos = pos + 1
909
+ self.current_char = c
850
910
  if c is None:
851
911
  self._emit_error("eof-in-tag")
912
+ self._emit_incomplete_tag_as_text()
852
913
  self._flush_text()
853
914
  self._emit_token(EOFToken())
854
915
  return True
@@ -906,6 +967,7 @@ class Tokenizer:
906
967
  if self.pos >= length:
907
968
  self.current_char = None
908
969
  self._emit_error("eof-in-tag")
970
+ self._emit_incomplete_tag_as_text()
909
971
  self._emit_token(EOFToken())
910
972
  return True
911
973
 
@@ -961,6 +1023,7 @@ class Tokenizer:
961
1023
  if self.pos >= length:
962
1024
  self.current_char = None
963
1025
  self._emit_error("eof-in-tag")
1026
+ self._emit_incomplete_tag_as_text()
964
1027
  self._emit_token(EOFToken())
965
1028
  return True
966
1029
 
@@ -1014,6 +1077,7 @@ class Tokenizer:
1014
1077
  # Per HTML5 spec: EOF in attribute value is a parse error
1015
1078
  # The incomplete tag is discarded, or emitted as raw text when emit_bogus_markup_as_text is set
1016
1079
  self._emit_error("eof-in-tag")
1080
+ self._emit_incomplete_tag_as_text()
1017
1081
  self._emit_token(EOFToken())
1018
1082
  return True
1019
1083
  if c in ("\t", "\n", "\f", " "):
@@ -1049,6 +1113,7 @@ class Tokenizer:
1049
1113
 
1050
1114
  if c is None:
1051
1115
  self._emit_error("eof-in-tag")
1116
+ self._emit_incomplete_tag_as_text()
1052
1117
  self._flush_text()
1053
1118
  self._emit_token(EOFToken())
1054
1119
  return True
@@ -1076,6 +1141,7 @@ class Tokenizer:
1076
1141
  c = self._get_char()
1077
1142
  if c is None:
1078
1143
  self._emit_error("eof-in-tag")
1144
+ self._emit_incomplete_tag_as_text()
1079
1145
  self._flush_text()
1080
1146
  self._emit_token(EOFToken())
1081
1147
  return True
@@ -1797,6 +1863,7 @@ class Tokenizer:
1797
1863
  "unexpected-null-character",
1798
1864
  line=line,
1799
1865
  column=column,
1866
+ category="tokenizer",
1800
1867
  message=message,
1801
1868
  source_html=self.buffer,
1802
1869
  )
@@ -1878,7 +1945,12 @@ class Tokenizer:
1878
1945
  tag.name = name
1879
1946
  tag.attrs = attrs
1880
1947
  tag.self_closing = self.current_tag_self_closing
1881
- tag.start_pos = self.current_token_start_pos
1948
+ if self.track_tag_positions:
1949
+ tag.start_pos = self.current_token_start_pos
1950
+ tag.end_pos = self.pos
1951
+ else:
1952
+ tag.start_pos = None
1953
+ tag.end_pos = None
1882
1954
  self.last_token_start_pos = tag.start_pos
1883
1955
 
1884
1956
  switched_to_rawtext = False
@@ -1919,6 +1991,30 @@ class Tokenizer:
1919
1991
  self.current_tag_kind = Tag.START
1920
1992
  return switched_to_rawtext
1921
1993
 
1994
+ def _emit_incomplete_tag_as_text(self) -> None:
1995
+ if not self.opts.emit_bogus_markup_as_text:
1996
+ return
1997
+ start = self.current_token_start_pos
1998
+ if start is None: # pragma: no cover
1999
+ return
2000
+ raw = self.buffer[start : self.pos]
2001
+ if raw: # pragma: no branch
2002
+ self._emit_token(CharacterTokens(raw))
2003
+
2004
+ def _emit_raw_end_tag_as_text(self, pos: int) -> bool:
2005
+ end = self.buffer.find(">", pos)
2006
+ if end == -1:
2007
+ self.pos = self.length
2008
+ self._emit_incomplete_tag_as_text()
2009
+ self._emit_token(EOFToken())
2010
+ return True
2011
+ self.pos = end + 1
2012
+ raw = self.buffer[self.current_token_start_pos : self.pos]
2013
+ if raw: # pragma: no branch
2014
+ self._emit_token(CharacterTokens(raw))
2015
+ self.state = self.DATA
2016
+ return False
2017
+
1922
2018
  def _emit_comment(self) -> None:
1923
2019
  data = "".join(self.current_comment)
1924
2020
  self.current_comment.clear()
@@ -1947,7 +2043,7 @@ class Tokenizer:
1947
2043
  self.current_doctype_force_quirks = False
1948
2044
  self._emit_token(DoctypeToken(doctype))
1949
2045
 
1950
- def _emit_token(self, token: Any) -> None:
2046
+ def _emit_token(self, token: AnyToken) -> None:
1951
2047
  if self.collect_errors:
1952
2048
  self._record_token_position()
1953
2049
  self.sink.process_token(token)
@@ -1998,7 +2094,9 @@ class Tokenizer:
1998
2094
 
1999
2095
  message = generate_error_message(code)
2000
2096
  line = self._get_line_at_pos(self.pos)
2001
- self.errors.append(ParseError(code, line=line, column=column, message=message, source_html=self.buffer))
2097
+ self.errors.append(
2098
+ ParseError(code, line=line, column=column, category="tokenizer", message=message, source_html=self.buffer)
2099
+ )
2002
2100
 
2003
2101
  def _emit_error_at_pos(self, code: str, pos: int) -> None:
2004
2102
  last_newline = self.buffer.rfind("\n", 0, pos + 1)
@@ -2009,7 +2107,9 @@ class Tokenizer:
2009
2107
 
2010
2108
  message = generate_error_message(code)
2011
2109
  line = self._get_line_at_pos(pos)
2012
- self.errors.append(ParseError(code, line=line, column=column, message=message, source_html=self.buffer))
2110
+ self.errors.append(
2111
+ ParseError(code, line=line, column=column, category="tokenizer", message=message, source_html=self.buffer)
2112
+ )
2013
2113
 
2014
2114
  def _consume_if(self, literal: str) -> bool:
2015
2115
  end = self.pos + len(literal)
justhtml/tokens.py CHANGED
@@ -4,7 +4,7 @@ from typing import Literal
4
4
 
5
5
 
6
6
  class Tag:
7
- __slots__ = ("attrs", "kind", "name", "self_closing", "start_pos")
7
+ __slots__ = ("attrs", "end_pos", "kind", "name", "self_closing", "start_pos")
8
8
 
9
9
  START: Literal[0] = 0
10
10
  END: Literal[1] = 1
@@ -12,6 +12,7 @@ class Tag:
12
12
  kind: int
13
13
  name: str
14
14
  attrs: dict[str, str | None]
15
+ end_pos: int | None
15
16
  self_closing: bool
16
17
  start_pos: int | None
17
18
 
@@ -22,12 +23,14 @@ class Tag:
22
23
  attrs: dict[str, str | None] | None,
23
24
  self_closing: bool = False,
24
25
  start_pos: int | None = None,
26
+ end_pos: int | None = None,
25
27
  ) -> None:
26
28
  self.kind = kind
27
29
  self.name = name
28
30
  self.attrs = attrs if attrs is not None else {}
29
31
  self.self_closing = bool(self_closing)
30
32
  self.start_pos = start_pos
33
+ self.end_pos = end_pos
31
34
 
32
35
 
33
36
  class CharacterTokens:
@@ -84,6 +87,9 @@ class EOFToken:
84
87
  __slots__ = ()
85
88
 
86
89
 
90
+ AnyToken = Tag | CharacterTokens | CommentToken | DoctypeToken | EOFToken
91
+
92
+
87
93
  class TokenSinkResult:
88
94
  __slots__ = ()
89
95
 
@@ -94,8 +100,9 @@ class TokenSinkResult:
94
100
  class ParseError:
95
101
  """Represents a parse error with location information."""
96
102
 
97
- __slots__ = ("_end_column", "_source_html", "code", "column", "line", "message")
103
+ __slots__ = ("_end_column", "_source_html", "category", "code", "column", "line", "message")
98
104
 
105
+ category: str
99
106
  code: str
100
107
  line: int | None
101
108
  column: int | None
@@ -110,10 +117,12 @@ class ParseError:
110
117
  code: str,
111
118
  line: int | None = None,
112
119
  column: int | None = None,
120
+ category: str = "parse",
113
121
  message: str | None = None,
114
122
  source_html: str | None = None,
115
123
  end_column: int | None = None,
116
124
  ) -> None:
125
+ self.category = category
117
126
  self.code = code
118
127
  self.line = line
119
128
  self.column = column
@@ -123,7 +132,11 @@ class ParseError:
123
132
 
124
133
  def __repr__(self) -> str:
125
134
  if self.line is not None and self.column is not None:
135
+ if self.category != "parse":
136
+ return f"ParseError({self.code!r}, line={self.line}, column={self.column}, category={self.category!r})"
126
137
  return f"ParseError({self.code!r}, line={self.line}, column={self.column})"
138
+ if self.category != "parse":
139
+ return f"ParseError({self.code!r}, category={self.category!r})"
127
140
  return f"ParseError({self.code!r})"
128
141
 
129
142
  def __str__(self) -> str:
@@ -138,7 +151,12 @@ class ParseError:
138
151
  def __eq__(self, other: object) -> bool:
139
152
  if not isinstance(other, ParseError):
140
153
  return NotImplemented
141
- return self.code == other.code and self.line == other.line and self.column == other.column
154
+ return (
155
+ self.category == other.category
156
+ and self.code == other.code
157
+ and self.line == other.line
158
+ and self.column == other.column
159
+ )
142
160
 
143
161
  def as_exception(self, end_column: int | None = None) -> SyntaxError:
144
162
  """Convert to a SyntaxError-like exception with source highlighting.