justhtml 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- justhtml/__init__.py +17 -0
- justhtml/__main__.py +29 -0
- justhtml/constants.py +441 -0
- justhtml/context.py +6 -0
- justhtml/entities.py +342 -0
- justhtml/errors.py +138 -0
- justhtml/node.py +208 -0
- justhtml/parser.py +86 -0
- justhtml/selector.py +925 -0
- justhtml/serialize.py +201 -0
- justhtml/stream.py +83 -0
- justhtml/tokenizer.py +2590 -0
- justhtml/tokens.py +175 -0
- justhtml/treebuilder.py +1231 -0
- justhtml/treebuilder_modes.py +2012 -0
- justhtml/treebuilder_utils.py +86 -0
- justhtml-0.6.0.dist-info/METADATA +126 -0
- justhtml-0.6.0.dist-info/RECORD +20 -0
- justhtml-0.6.0.dist-info/WHEEL +4 -0
- justhtml-0.6.0.dist-info/licenses/LICENSE +21 -0
justhtml/tokens.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
class Tag:
    """Token describing a tag, with its name, attributes and kind.

    ``kind`` is stored as given; the class-level constants ``START`` and
    ``END`` are the values provided for it (not enforced here).
    """

    __slots__ = ("attrs", "kind", "name", "self_closing")

    # Values offered for the ``kind`` attribute.
    START, END = 0, 1

    def __init__(self, kind, name, attrs, self_closing=False):
        # A missing attribute mapping becomes a fresh dict per instance;
        # any other object (including an empty dict) is stored as-is.
        if attrs is None:
            attrs = {}
        self.kind, self.name, self.attrs = kind, name, attrs
        # Normalise any truthy/falsy flag to a real bool.
        self.self_closing = True if self_closing else False
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class CharacterTokens:
    """Token carrying character data in its single ``data`` attribute."""

    __slots__ = ("data",)

    def __init__(self, data):
        # Stored untouched; no copying or validation happens here.
        self.data = data
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class CommentToken:
    """Token for a comment; the comment payload lives in ``data``."""

    __slots__ = ("data",)

    def __init__(self, data):
        # Stored untouched; no copying or validation happens here.
        self.data = data
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class Doctype:
    """Holds the fields of a doctype.

    Every field is optional: ``name``, ``public_id`` and ``system_id``
    default to ``None`` and ``force_quirks`` defaults to ``False``.
    """

    __slots__ = ("force_quirks", "name", "public_id", "system_id")

    def __init__(self, name=None, public_id=None, system_id=None, force_quirks=False):
        self.name, self.public_id, self.system_id = name, public_id, system_id
        # Normalise any truthy/falsy flag to a real bool.
        self.force_quirks = True if force_quirks else False
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class DoctypeToken:
    """Token wrapping a doctype object in its ``doctype`` attribute."""

    __slots__ = ("doctype",)

    def __init__(self, doctype):
        # The given object is kept by reference, not copied.
        self.doctype = doctype
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class EOFToken:
    """Marker token without any data; instances carry no attributes."""

    # Empty slots: no per-instance ``__dict__`` and no fields.
    __slots__ = ()
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class TokenSinkResult:
    """Namespace for the integer result codes ``Continue`` and ``Plaintext``."""

    __slots__ = ()

    # Plain integer constants, read as class attributes.
    Continue, Plaintext = 0, 1
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class ParseError:
    """Represents a parse error with location information.

    Attributes:
        code: Identifier of the error.
        line: 1-indexed line of the error, or None when unknown.
        column: 1-indexed column of the error, or None when unknown.
        message: Human-readable text; falls back to ``code`` when no
            (or an empty) message is given.
    """

    __slots__ = ("_end_column", "_source_html", "code", "column", "line", "message")

    def __init__(self, code, line=None, column=None, message=None, source_html=None, end_column=None):
        self.code = code
        self.line = line
        self.column = column
        self.message = message or code
        # Only used by as_exception() to build the highlighted source line.
        self._source_html = source_html
        self._end_column = end_column

    def __repr__(self):
        if self.line is not None and self.column is not None:
            return f"ParseError({self.code!r}, line={self.line}, column={self.column})"
        return f"ParseError({self.code!r})"

    def __str__(self):
        # The message is appended only when it adds something beyond the code.
        detail = self.code if self.message == self.code else f"{self.code} - {self.message}"
        if self.line is not None and self.column is not None:
            return f"({self.line},{self.column}): {detail}"
        return detail

    def __eq__(self, other):
        # Equality deliberately ignores message, source and end column.
        if not isinstance(other, ParseError):
            return NotImplemented
        return self.code == other.code and self.line == other.line and self.column == other.column

    __hash__ = None  # Unhashable since we define __eq__

    @staticmethod
    def _tag_span(line, col_idx):
        """Return 0-indexed ``(start, end)`` of the tag around ``col_idx``.

        ``start`` is ``col_idx`` itself when it sits on a ``<``; otherwise
        the nearest ``<`` within the 11 characters before it, or ``col_idx``
        when there is none. ``end`` is one past the first ``>`` at or after
        ``col_idx``, or the line length when there is none.
        """
        # Clamp so a column outside the line (e.g. an error reported at or
        # past end-of-line, or a column of 0) cannot index out of range.
        col_idx = max(0, min(col_idx, len(line)))

        if line.startswith("<", col_idx):
            start_idx = col_idx
        else:
            # Bounded look-back: do not attribute the error to a distant tag.
            lt_idx = line.rfind("<", max(0, col_idx - 11), col_idx)
            start_idx = lt_idx if lt_idx != -1 else col_idx

        gt_idx = line.find(">", col_idx)
        end_idx = gt_idx + 1 if gt_idx != -1 else len(line)
        return start_idx, end_idx

    def as_exception(self, end_column=None):
        """Convert to a SyntaxError-like exception with source highlighting.

        The returned exception carries the offending source line and a
        column range, which Python versions with enhanced error display
        use to underline the exact location in the HTML source.

        Args:
            end_column: Optional end column for highlighting a range. It is
                used only when no end column was stored at construction.
                If neither is available, attempts to highlight the full tag
                at the error position.

        Returns:
            A SyntaxError instance configured to display the error location.
            When line, column or source are missing, or the line number is
            outside the source, a plain SyntaxError with only the message
            is returned. The exception is returned, never raised.
        """
        exc = SyntaxError(self.message)
        exc.msg = self.message

        # Fall back to the plain exception if we don't have location info.
        if self.line is None or self.column is None or not self._source_html:
            return exc

        lines = self._source_html.split("\n")
        if not 1 <= self.line <= len(lines):
            # Invalid line number
            return exc

        # 1-indexed line -> 0-indexed list position.
        error_line = lines[self.line - 1]

        exc.filename = "<html>"
        exc.lineno = self.line
        exc.offset = self.column
        exc.text = error_line
        exc.end_lineno = self.line

        # Precedence: stored end column, then the parameter, then auto-detect.
        if self._end_column is not None:
            exc.end_offset = self._end_column
        elif end_column is not None:
            exc.end_offset = end_column
        else:
            start_idx, end_idx = self._tag_span(error_line, self.column - 1)
            # Convert back to 1-indexed offsets.
            exc.offset = start_idx + 1
            exc.end_offset = end_idx + 1

        return exc
|