justhtml 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- justhtml/__init__.py +17 -0
- justhtml/__main__.py +29 -0
- justhtml/constants.py +441 -0
- justhtml/context.py +6 -0
- justhtml/entities.py +342 -0
- justhtml/errors.py +138 -0
- justhtml/node.py +208 -0
- justhtml/parser.py +86 -0
- justhtml/selector.py +925 -0
- justhtml/serialize.py +201 -0
- justhtml/stream.py +83 -0
- justhtml/tokenizer.py +2590 -0
- justhtml/tokens.py +175 -0
- justhtml/treebuilder.py +1231 -0
- justhtml/treebuilder_modes.py +2012 -0
- justhtml/treebuilder_utils.py +86 -0
- justhtml-0.6.0.dist-info/METADATA +126 -0
- justhtml-0.6.0.dist-info/RECORD +20 -0
- justhtml-0.6.0.dist-info/WHEEL +4 -0
- justhtml-0.6.0.dist-info/licenses/LICENSE +21 -0
justhtml/selector.py
ADDED
|
@@ -0,0 +1,925 @@
|
|
|
1
|
+
# CSS Selector implementation for JustHTML
|
|
2
|
+
# Supports a subset of CSS selectors for querying the DOM
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class SelectorError(ValueError):
    """Error signalling that a CSS selector is invalid or unsupported.

    Subclasses ``ValueError`` so callers may catch either type.
    """
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Token types for the CSS selector lexer
|
|
10
|
+
class TokenType:
    """Namespace of string constants naming every token kind the lexer emits."""

    # --- simple-selector heads -------------------------------------------
    TAG = "TAG"  # element name (div, span, ...); also carries attribute and pseudo-class names
    ID = "ID"  # #foo
    CLASS = "CLASS"  # .bar
    UNIVERSAL = "UNIVERSAL"  # *

    # --- attribute selectors ---------------------------------------------
    ATTR_START = "ATTR_START"  # [
    ATTR_END = "ATTR_END"  # ]
    ATTR_OP = "ATTR_OP"  # one of =, ~=, |=, ^=, $=, *=
    STRING = "STRING"  # quoted or unquoted value; also a functional pseudo-class argument

    # --- structure ---------------------------------------------------------
    COMBINATOR = "COMBINATOR"  # >, +, ~, or " " for the descendant combinator
    COMMA = "COMMA"  # ,
    COLON = "COLON"  # :
    PAREN_OPEN = "PAREN_OPEN"  # (
    PAREN_CLOSE = "PAREN_CLOSE"  # )

    # --- end-of-input marker ---------------------------------------------
    EOF = "EOF"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class Token:
    """A single lexer token: a ``TokenType`` constant plus an optional payload.

    Attributes:
        type: One of the ``TokenType`` string constants.
        value: Text carried by the token (name, operator, string), or ``None``.
    """

    __slots__ = ("type", "value")

    def __init__(self, token_type, value=None):
        self.type, self.value = token_type, value

    def __repr__(self):
        # The type is shown bare, the payload with repr() so None and "" differ.
        return "Token({}, {!r})".format(self.type, self.value)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class SelectorTokenizer:
    """Tokenizes a CSS selector string into a flat list of ``Token`` objects.

    The scanner keeps a cursor into the selector text and advances it as
    tokens are recognised.

    Attributes:
        selector: The selector text being scanned.
        pos: Index of the next unread character.
        length: Cached ``len(selector)``.
    """

    __slots__ = ("length", "pos", "selector")

    # Characters treated as whitespace inside a selector.
    _WHITESPACE = " \t\n\r\f"

    def __init__(self, selector):
        self.selector = selector
        self.pos = 0
        self.length = len(selector)

    def _peek(self, offset=0):
        """Return the character ``offset`` places ahead of the cursor, or "" past the end."""
        pos = self.pos + offset
        if pos < self.length:
            return self.selector[pos]
        return ""

    def _advance(self):
        """Return the current character (or "") and move the cursor forward by one."""
        ch = self._peek()
        self.pos += 1
        return ch

    def _skip_whitespace(self):
        """Move the cursor past any run of whitespace characters."""
        while self.pos < self.length and self.selector[self.pos] in self._WHITESPACE:
            self.pos += 1

    def _is_name_start(self, ch):
        """Return True if ``ch`` may begin a name.

        Accepts letters, underscore, hyphen, and any non-ASCII character.
        An empty string (end of input) is never a name start.
        """
        if not ch:
            # ord("") would raise TypeError; end of input is simply "no".
            return False
        return ch.isalpha() or ch == "_" or ch == "-" or ord(ch) > 127

    def _is_name_char(self, ch):
        """Return True if ``ch`` may continue a name (a name start or a digit)."""
        return self._is_name_start(ch) or ch.isdigit()

    def _read_name(self):
        """Consume and return the longest run of name characters ("" if none)."""
        start = self.pos
        while self.pos < self.length and self._is_name_char(self.selector[self.pos]):
            self.pos += 1
        return self.selector[start : self.pos]

    def _read_string(self, quote):
        """Consume a quoted string whose opening quote is at the cursor.

        A backslash makes the following character literal. Returns the
        unescaped contents without the surrounding quotes.

        Raises:
            SelectorError: If the closing quote is never found.
        """
        self.pos += 1  # skip opening quote
        start = self.pos
        parts = []

        while self.pos < self.length:
            ch = self.selector[self.pos]
            if ch == quote:
                # Flush the literal run that precedes the closing quote.
                if self.pos > start:
                    parts.append(self.selector[start : self.pos])
                self.pos += 1
                return "".join(parts)
            if ch == "\\":
                # Flush the literal run before the backslash, then take the
                # escaped character (if any) verbatim.
                if self.pos > start:
                    parts.append(self.selector[start : self.pos])
                self.pos += 1
                if self.pos < self.length:
                    parts.append(self.selector[self.pos])
                    self.pos += 1
                start = self.pos
            else:
                self.pos += 1

        raise SelectorError(f"Unterminated string in selector: {self.selector!r}")

    def _read_unquoted_attr_value(self):
        """Consume an unquoted attribute value, stopping at whitespace or ``]``."""
        start = self.pos
        while self.pos < self.length:
            ch = self.selector[self.pos]
            if ch == "]" or ch in self._WHITESPACE:
                break
            self.pos += 1
        return self.selector[start : self.pos]

    def _read_pseudo_argument(self):
        """Consume the raw argument of a functional pseudo-class.

        The cursor must be just inside the opening parenthesis. Scanning
        stops at the matching ``)`` (left unconsumed) or at end of input.
        Nested parentheses are balanced, and parentheses inside quoted
        strings are ignored so that ``:not([title=")"])`` is read whole.

        Returns:
            The argument text with surrounding whitespace stripped.
        """
        depth = 1
        quote = ""
        start = self.pos
        while self.pos < self.length:
            c = self.selector[self.pos]
            if quote:
                if c == "\\" and self.pos + 1 < self.length:
                    self.pos += 1  # skip the escaped character as well
                elif c == quote:
                    quote = ""
            elif c == '"' or c == "'":
                quote = c
            elif c == "(":
                depth += 1
            elif c == ")":
                depth -= 1
                if depth == 0:
                    break
            self.pos += 1
        return self.selector[start : self.pos].strip()

    def _tokenize_attribute(self, tokens):
        """Tokenize one ``[...]`` attribute selector starting at the cursor.

        Appends ATTR_START, a TAG token holding the attribute name, then
        optionally ATTR_OP and STRING, and finally ATTR_END.

        Raises:
            SelectorError: On a missing name, bad operator, or missing ``]``.
        """
        self.pos += 1  # consume "["
        tokens.append(Token(TokenType.ATTR_START))
        self._skip_whitespace()

        attr_name = self._read_name()
        if not attr_name:
            raise SelectorError(f"Expected attribute name at position {self.pos}")
        tokens.append(Token(TokenType.TAG, attr_name))  # Reuse TAG for attr name
        self._skip_whitespace()

        ch = self._peek()
        if ch == "]":
            # Presence-only selector: [attr]
            self.pos += 1
            tokens.append(Token(TokenType.ATTR_END))
            return

        if ch == "":
            # End of input: "" is a substring of every string, so it must be
            # rejected before the operator membership test below.
            raise SelectorError(f"Expected ] at position {self.pos}")

        if ch == "=":
            self.pos += 1
            tokens.append(Token(TokenType.ATTR_OP, "="))
        elif ch in "~|^$*":
            self.pos += 1
            if self._peek() != "=":
                raise SelectorError(f"Expected = after {ch} at position {self.pos}")
            self.pos += 1
            tokens.append(Token(TokenType.ATTR_OP, ch + "="))
        else:
            raise SelectorError(f"Unexpected character in attribute selector: {ch!r}")

        self._skip_whitespace()

        opener = self._peek()
        if opener == '"' or opener == "'":
            value = self._read_string(opener)
        else:
            value = self._read_unquoted_attr_value()
        tokens.append(Token(TokenType.STRING, value))

        self._skip_whitespace()
        if self._peek() != "]":
            raise SelectorError(f"Expected ] at position {self.pos}")
        self.pos += 1
        tokens.append(Token(TokenType.ATTR_END))

    def _tokenize_pseudo(self, tokens):
        """Tokenize one ``:name`` or ``:name(argument)`` pseudo-class.

        Appends COLON and a TAG token holding the name. For the functional
        form it also appends PAREN_OPEN, an optional STRING holding the raw
        argument text, and PAREN_CLOSE.

        Raises:
            SelectorError: On a missing name or a missing ``)``.
        """
        self.pos += 1  # consume ":"
        tokens.append(Token(TokenType.COLON))

        name = self._read_name()
        if not name:
            raise SelectorError(f"Expected pseudo-class name after : at position {self.pos}")
        tokens.append(Token(TokenType.TAG, name))

        if self._peek() != "(":
            return

        self.pos += 1
        tokens.append(Token(TokenType.PAREN_OPEN))
        self._skip_whitespace()

        # The argument is kept as raw text; the matcher interprets it later
        # (as a nested selector for :not(), as An+B for :nth-*()).
        arg = self._read_pseudo_argument()
        if arg:
            tokens.append(Token(TokenType.STRING, arg))

        if self._peek() != ")":
            raise SelectorError(f"Expected ) at position {self.pos}")
        self.pos += 1
        tokens.append(Token(TokenType.PAREN_CLOSE))

    def tokenize(self):
        """Scan the whole selector and return its tokens.

        Returns:
            A list of ``Token`` objects that always ends with an EOF token.

        Raises:
            SelectorError: If the selector contains a character or construct
                the lexer does not recognise.
        """
        tokens = []
        pending_whitespace = False

        while self.pos < self.length:
            ch = self.selector[self.pos]

            # Skip whitespace but remember it for combinator detection.
            if ch in self._WHITESPACE:
                pending_whitespace = True
                self._skip_whitespace()
                continue

            # Explicit combinators swallow the whitespace on both sides.
            if ch in ">+~":
                pending_whitespace = False
                self.pos += 1
                self._skip_whitespace()
                tokens.append(Token(TokenType.COMBINATOR, ch))
                continue

            # Whitespace followed by anything but a comma is a descendant
            # combinator. The flag is cleared unconditionally so that leading
            # whitespace, or whitespace before a comma, never leaks into a
            # later iteration as a spurious combinator.
            if pending_whitespace and tokens and ch not in ",":
                tokens.append(Token(TokenType.COMBINATOR, " "))
            pending_whitespace = False

            if ch == "*":
                self.pos += 1
                tokens.append(Token(TokenType.UNIVERSAL))
                continue

            if ch == "#":
                self.pos += 1
                name = self._read_name()
                if not name:
                    raise SelectorError(f"Expected identifier after # at position {self.pos}")
                tokens.append(Token(TokenType.ID, name))
                continue

            if ch == ".":
                self.pos += 1
                name = self._read_name()
                if not name:
                    raise SelectorError(f"Expected identifier after . at position {self.pos}")
                tokens.append(Token(TokenType.CLASS, name))
                continue

            if ch == "[":
                self._tokenize_attribute(tokens)
                continue

            if ch == ",":
                self.pos += 1
                self._skip_whitespace()
                tokens.append(Token(TokenType.COMMA))
                continue

            if ch == ":":
                self._tokenize_pseudo(tokens)
                continue

            if self._is_name_start(ch):
                name = self._read_name()
                tokens.append(Token(TokenType.TAG, name.lower()))  # Tags are case-insensitive
                continue

            raise SelectorError(f"Unexpected character {ch!r} at position {self.pos}")

        tokens.append(Token(TokenType.EOF))
        return tokens
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
# AST Node types for parsed selectors
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
class SimpleSelector:
    """One atomic selector test: tag, id, class, universal, attribute, or pseudo-class.

    Attributes:
        type: One of the ``TYPE_*`` constants below.
        name: Tag, id, class, attribute, or pseudo-class name (``None`` for universal).
        operator: Attribute comparison operator, or ``None`` for a presence test.
        value: Attribute value to compare against.
        arg: Raw argument text of a functional pseudo-class.
    """

    __slots__ = ("arg", "name", "operator", "type", "value")

    TYPE_TAG = "tag"
    TYPE_ID = "id"
    TYPE_CLASS = "class"
    TYPE_UNIVERSAL = "universal"
    TYPE_ATTR = "attr"
    TYPE_PSEUDO = "pseudo"

    def __init__(self, selector_type, name=None, operator=None, value=None, arg=None):
        self.type = selector_type
        self.name = name
        self.operator = operator
        self.value = value
        self.arg = arg  # For :not() and :nth-child()

    def __repr__(self):
        # name/op are shown when truthy; value/arg whenever they are not None,
        # so an empty-string value is still visible.
        optional = (
            ("name", self.name, bool(self.name)),
            ("op", self.operator, bool(self.operator)),
            ("value", self.value, self.value is not None),
            ("arg", self.arg, self.arg is not None),
        )
        shown = "".join(f", {label}={val!r}" for label, val, present in optional if present)
        return f"SimpleSelector({self.type!r}{shown})"
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
class CompoundSelector:
    """A run of simple selectors that must all hold for one node (e.g. ``div.foo#bar``).

    Attributes:
        selectors: List of ``SimpleSelector`` objects.
    """

    __slots__ = ("selectors",)

    def __init__(self, selectors=None):
        # A falsy argument (None or an empty sequence) yields a fresh list.
        self.selectors = selectors if selectors else []

    def __repr__(self):
        return "CompoundSelector({!r})".format(self.selectors)
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
class ComplexSelector:
    """Compound selectors joined left to right by combinators.

    Attributes:
        parts: List of ``(combinator, compound_selector)`` tuples in source
            order. The combinator is the one preceding that compound, so the
            first entry carries ``None``.
    """

    __slots__ = ("parts",)

    def __init__(self):
        self.parts = []

    def __repr__(self):
        return "ComplexSelector({!r})".format(self.parts)
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
class SelectorList:
    """A comma-separated group of selectors.

    Attributes:
        selectors: List of the member selectors.
    """

    __slots__ = ("selectors",)

    def __init__(self, selectors=None):
        # A falsy argument (None or an empty sequence) yields a fresh list.
        self.selectors = selectors if selectors else []

    def __repr__(self):
        return "SelectorList({!r})".format(self.selectors)
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
class SelectorParser:
    """Parses a list of tokens into a selector AST.

    Attributes:
        tokens: The token list produced by the tokenizer.
        pos: Index of the next unread token.
    """

    __slots__ = ("pos", "tokens")

    def __init__(self, tokens):
        self.tokens = tokens
        self.pos = 0

    def _peek(self):
        """Return the current token, or a synthetic EOF token past the end."""
        if self.pos < len(self.tokens):
            return self.tokens[self.pos]
        return Token(TokenType.EOF)

    def _advance(self):
        """Return the current token and move to the next one."""
        token = self._peek()
        self.pos += 1
        return token

    def _expect(self, token_type):
        """Consume and return the current token, requiring it to be ``token_type``.

        Raises:
            SelectorError: If the current token has a different type.
        """
        token = self._peek()
        if token.type != token_type:
            raise SelectorError(f"Expected {token_type}, got {token.type}")
        return self._advance()

    def parse(self):
        """Parse a complete selector (possibly a comma-separated list).

        Empty entries around commas (as in ``"a,"`` or ``",a"``) are skipped.

        Returns:
            The single ``ComplexSelector`` when exactly one selector was
            parsed, otherwise a ``SelectorList`` of them.

        Raises:
            SelectorError: If tokens remain after the last selector, or if no
                selector was parsed at all (for example the input ``","``).
        """
        selectors = []
        while True:
            selector = self._parse_complex_selector()
            # Every position is checked, including the first: a leading comma
            # must not put None into the result.
            if selector is not None:
                selectors.append(selector)
            if self._peek().type != TokenType.COMMA:
                break
            self._advance()  # consume comma

        if self._peek().type != TokenType.EOF:
            raise SelectorError(f"Unexpected token: {self._peek()}")

        if not selectors:
            raise SelectorError("Empty selector")

        if len(selectors) == 1:
            return selectors[0]
        return SelectorList(selectors)

    def _parse_complex_selector(self):
        """Parse compound selectors joined by combinators.

        Returns:
            A ``ComplexSelector``, or ``None`` if no compound selector starts
            at the current token.

        Raises:
            SelectorError: If a combinator is not followed by a compound selector.
        """
        compound = self._parse_compound_selector()
        if not compound:
            return None

        complex_sel = ComplexSelector()
        # The first compound has no preceding combinator.
        complex_sel.parts.append((None, compound))

        while self._peek().type == TokenType.COMBINATOR:
            combinator = self._advance().value
            compound = self._parse_compound_selector()
            if not compound:
                raise SelectorError("Expected selector after combinator")
            complex_sel.parts.append((combinator, compound))

        return complex_sel

    def _parse_compound_selector(self):
        """Parse a run of adjacent simple selectors.

        Returns:
            A ``CompoundSelector``, or ``None`` if the current token does not
            start a simple selector.
        """
        simple_selectors = []

        while True:
            token = self._peek()

            if token.type == TokenType.TAG:
                self._advance()
                simple_selectors.append(SimpleSelector(SimpleSelector.TYPE_TAG, name=token.value))
            elif token.type == TokenType.UNIVERSAL:
                self._advance()
                simple_selectors.append(SimpleSelector(SimpleSelector.TYPE_UNIVERSAL))
            elif token.type == TokenType.ID:
                self._advance()
                simple_selectors.append(SimpleSelector(SimpleSelector.TYPE_ID, name=token.value))
            elif token.type == TokenType.CLASS:
                self._advance()
                simple_selectors.append(SimpleSelector(SimpleSelector.TYPE_CLASS, name=token.value))
            elif token.type == TokenType.ATTR_START:
                simple_selectors.append(self._parse_attribute_selector())
            elif token.type == TokenType.COLON:
                simple_selectors.append(self._parse_pseudo_selector())
            else:
                break

        if not simple_selectors:
            return None
        return CompoundSelector(simple_selectors)

    def _parse_attribute_selector(self):
        """Parse an attribute selector: ``[attr]`` or ``[attr<op>value]``.

        Raises:
            SelectorError: If the token sequence is not a well-formed
                attribute selector.
        """
        self._expect(TokenType.ATTR_START)

        # The tokenizer emits the attribute name as a TAG token.
        attr_name = self._expect(TokenType.TAG).value

        if self._peek().type == TokenType.ATTR_END:
            self._advance()
            return SimpleSelector(SimpleSelector.TYPE_ATTR, name=attr_name)

        operator = self._expect(TokenType.ATTR_OP).value
        value = self._expect(TokenType.STRING).value
        self._expect(TokenType.ATTR_END)

        return SimpleSelector(SimpleSelector.TYPE_ATTR, name=attr_name, operator=operator, value=value)

    def _parse_pseudo_selector(self):
        """Parse a pseudo-class such as ``:first-child`` or ``:not(selector)``.

        For the functional form the raw argument text, if any, is stored in
        the selector's ``arg``.

        Raises:
            SelectorError: If the name or the closing parenthesis is missing.
        """
        self._expect(TokenType.COLON)
        name = self._expect(TokenType.TAG).value

        if self._peek().type != TokenType.PAREN_OPEN:
            return SimpleSelector(SimpleSelector.TYPE_PSEUDO, name=name)

        self._advance()  # consume "("
        arg = None
        if self._peek().type == TokenType.STRING:
            arg = self._advance().value
        self._expect(TokenType.PAREN_CLOSE)
        return SimpleSelector(SimpleSelector.TYPE_PSEUDO, name=name, arg=arg)
|
|
485
|
+
|
|
486
|
+
|
|
487
|
+
class SelectorMatcher:
    """Matches parsed selectors against DOM nodes.

    Nodes are duck-typed. The matcher reads ``name``, ``attrs``, ``parent``
    and ``children``, calls ``has_child_nodes()``, and reads ``data`` on
    ``#text`` nodes. Nodes whose name starts with ``#`` are not elements.
    The class holds no state.
    """

    __slots__ = ()

    def matches(self, node, selector):
        """Return True if ``node`` matches the parsed ``selector``.

        ``selector`` may be a ``SelectorList``, ``ComplexSelector``,
        ``CompoundSelector`` or ``SimpleSelector``; anything else is False.
        """
        if isinstance(selector, SelectorList):
            return any(self.matches(node, sel) for sel in selector.selectors)
        if isinstance(selector, ComplexSelector):
            return self._matches_complex(node, selector)
        if isinstance(selector, CompoundSelector):
            return self._matches_compound(node, selector)
        if isinstance(selector, SimpleSelector):
            return self._matches_simple(node, selector)
        return False

    def _matches_complex(self, node, selector):
        """Match a complex selector (compound selectors joined by combinators).

        Matching runs right to left: ``node`` must satisfy the rightmost
        compound, and each combinator then constrains where the compound to
        its left may be found.
        """
        parts = selector.parts
        if not parts:
            return False
        return self._matches_from(node, parts, len(parts) - 1)

    def _matches_from(self, node, parts, index):
        """Return True if ``node`` matches ``parts[index]`` and all parts to its left.

        For the descendant and general-sibling combinators every candidate is
        tried, not only the nearest: the nearest ancestor matching the left
        compound may fail a constraint further left that a more distant one
        satisfies (``.a > .b .c`` with nested ``.b`` elements).
        """
        combinator, compound = parts[index]
        if not self._matches_compound(node, compound):
            return False
        if index == 0:
            return True

        left = index - 1

        if combinator == " ":  # Descendant: any ancestor
            ancestor = node.parent
            while ancestor:
                if self._matches_from(ancestor, parts, left):
                    return True
                ancestor = ancestor.parent
            return False

        if combinator == ">":  # Child: the parent only
            parent = node.parent
            if not parent:
                return False
            return self._matches_from(parent, parts, left)

        if combinator == "+":  # Adjacent sibling: the previous element only
            sibling = self._get_previous_sibling(node)
            if not sibling:
                return False
            return self._matches_from(sibling, parts, left)

        # combinator == "~" - General sibling: any earlier element sibling
        sibling = self._get_previous_sibling(node)
        while sibling:
            if self._matches_from(sibling, parts, left):
                return True
            sibling = self._get_previous_sibling(sibling)
        return False

    def _matches_compound(self, node, compound):
        """Match a compound selector (all simple selectors must match)."""
        return all(self._matches_simple(node, simple) for simple in compound.selectors)

    def _matches_simple(self, node, selector):
        """Match one simple selector against a node."""
        # Text nodes and other non-element nodes don't match element selectors
        if not hasattr(node, "name") or node.name.startswith("#"):
            return False

        sel_type = selector.type

        if sel_type == SimpleSelector.TYPE_UNIVERSAL:
            return True

        if sel_type == SimpleSelector.TYPE_TAG:
            # HTML tag names are case-insensitive
            return node.name.lower() == selector.name.lower()

        if sel_type == SimpleSelector.TYPE_ID:
            node_id = node.attrs.get("id", "") if node.attrs else ""
            return node_id == selector.name

        if sel_type == SimpleSelector.TYPE_CLASS:
            class_attr = node.attrs.get("class", "") if node.attrs else ""
            classes = class_attr.split() if class_attr else []
            return selector.name in classes

        if sel_type == SimpleSelector.TYPE_ATTR:
            return self._matches_attribute(node, selector)

        if sel_type == SimpleSelector.TYPE_PSEUDO:
            return self._matches_pseudo(node, selector)

        return False

    def _matches_attribute(self, node, selector):
        """Match an attribute selector.

        Attribute names are compared case-insensitively; values are compared
        case-sensitively.
        """
        attrs = node.attrs or {}
        attr_name = selector.name.lower()  # Attribute names are case-insensitive in HTML

        for name, value in attrs.items():
            if name.lower() == attr_name:
                attr_value = value
                break
        else:
            return False  # attribute not present at all

        # Presence check only
        if selector.operator is None:
            return True

        # NOTE(review): if the DOM ever stores a valueless attribute as None,
        # it is treated as present with an empty value; confirm against the
        # node implementation.
        if attr_value is None:
            attr_value = ""

        value = selector.value
        op = selector.operator

        if op == "=":
            return attr_value == value

        if op == "~=":
            # Whitespace-separated word match
            return value in attr_value.split()

        if op == "|=":
            # Exact value or hyphen-separated prefix,
            # e.g. [lang|="en"] matches lang="en" and lang="en-US"
            return attr_value == value or attr_value.startswith(value + "-")

        # The substring operators never match an empty value.
        if op == "^=":
            return bool(value) and attr_value.startswith(value)

        if op == "$=":
            return bool(value) and attr_value.endswith(value)

        if op == "*=":
            return bool(value) and value in attr_value

        return False

    def _matches_pseudo(self, node, selector):
        """Match a pseudo-class selector.

        Raises:
            SelectorError: If the pseudo-class is not supported.
        """
        name = selector.name.lower()

        if name == "first-child":
            return self._is_first_child(node)

        if name == "last-child":
            return self._is_last_child(node)

        if name == "nth-child":
            return self._matches_nth_child(node, selector.arg)

        if name == "not":
            if not selector.arg:
                return True
            # The argument is stored as raw text, so it is parsed here.
            inner = parse_selector(selector.arg)
            return not self.matches(node, inner)

        if name == "only-child":
            return self._is_first_child(node) and self._is_last_child(node)

        if name == "empty":
            if not node.has_child_nodes():
                return True
            # Whitespace-only text and other "#..." nodes do not count as content.
            for child in node.children:
                if hasattr(child, "name"):
                    if child.name == "#text":
                        if child.data and child.data.strip():
                            return False
                    elif not child.name.startswith("#"):
                        return False
            return True

        if name == "root":
            # An element whose parent is the document or a document fragment
            parent = node.parent
            if parent and hasattr(parent, "name"):
                return parent.name in ("#document", "#document-fragment")
            return False

        if name == "first-of-type":
            return self._is_first_of_type(node)

        if name == "last-of-type":
            return self._is_last_of_type(node)

        if name == "nth-of-type":
            return self._matches_nth_of_type(node, selector.arg)

        if name == "only-of-type":
            return self._is_first_of_type(node) and self._is_last_of_type(node)

        raise SelectorError(f"Unsupported pseudo-class: :{name}")

    def _get_element_children(self, parent):
        """Get only element children (exclude text, comments, etc.)."""
        if not parent or not parent.has_child_nodes():
            return []
        return [c for c in parent.children if hasattr(c, "name") and not c.name.startswith("#")]

    def _get_previous_sibling(self, node):
        """Get the previous element sibling. Returns None if node is first or not found."""
        parent = node.parent
        if not parent:
            return None

        prev = None
        for child in parent.children:
            if child is node:
                return prev
            if hasattr(child, "name") and not child.name.startswith("#"):
                prev = child
        return None  # node not in parent.children (detached)

    def _is_first_child(self, node):
        """Return True if node is the first element child of its parent."""
        parent = node.parent
        if not parent:
            return False
        elements = self._get_element_children(parent)
        # bool() keeps the result a real boolean when there are no elements.
        return bool(elements) and elements[0] is node

    def _is_last_child(self, node):
        """Return True if node is the last element child of its parent."""
        parent = node.parent
        if not parent:
            return False
        elements = self._get_element_children(parent)
        return bool(elements) and elements[-1] is node

    def _is_first_of_type(self, node):
        """Return True if node is the first element sibling with its tag name."""
        parent = node.parent
        if not parent:
            return False
        node_name = node.name.lower()
        for child in self._get_element_children(parent):
            if child.name.lower() == node_name:
                return child is node
        return False

    def _is_last_of_type(self, node):
        """Return True if node is the last element sibling with its tag name."""
        parent = node.parent
        if not parent:
            return False
        node_name = node.name.lower()
        last_of_type = None
        for child in self._get_element_children(parent):
            if child.name.lower() == node_name:
                last_of_type = child
        return last_of_type is node

    def _parse_nth_expression(self, expr):
        """Parse an nth expression like '2n+1', 'odd', 'even', '3'.

        Returns:
            An ``(a, b)`` tuple for ``An+B``, or ``None`` if ``expr`` is empty
            or malformed. After ``n`` the offset must carry an explicit sign,
            so ``2n1`` and ``nn`` are rejected.
        """
        if not expr:
            return None

        expr = expr.strip().lower()

        if expr == "odd":
            return (2, 1)  # 2n+1
        if expr == "even":
            return (2, 0)  # 2n

        # Spaces are allowed around the sign, e.g. "2n + 1".
        expr = expr.replace(" ", "")

        if "n" not in expr:
            # Just a number
            try:
                return (0, int(expr))
            except ValueError:
                return None

        a_part, _, b_part = expr.partition("n")

        if a_part == "" or a_part == "+":
            a = 1
        elif a_part == "-":
            a = -1
        else:
            try:
                a = int(a_part)
            except ValueError:
                return None

        b = 0
        if b_part:
            if b_part[0] not in "+-":
                return None
            try:
                b = int(b_part)
            except ValueError:
                return None

        return (a, b)

    def _matches_nth(self, index, a, b):
        """Return True if the 1-based ``index`` equals ``a*n + b`` for some integer n >= 0."""
        if a == 0:
            return index == b
        # n = (index - b) / a must be a non-negative integer, so diff must
        # share a's sign (or be zero) and be divisible by a.
        diff = index - b
        if a > 0:
            return diff >= 0 and diff % a == 0
        return diff <= 0 and diff % a == 0

    def _matches_nth_child(self, node, arg):
        """Match :nth-child(An+B); an unparsable argument never matches."""
        parent = node.parent
        if not parent:
            return False

        parsed = self._parse_nth_expression(arg)
        if parsed is None:
            return False
        a, b = parsed

        for position, child in enumerate(self._get_element_children(parent), start=1):
            if child is node:
                return self._matches_nth(position, a, b)
        return False

    def _matches_nth_of_type(self, node, arg):
        """Match :nth-of-type(An+B); an unparsable argument never matches."""
        parent = node.parent
        if not parent:
            return False

        parsed = self._parse_nth_expression(arg)
        if parsed is None:
            return False
        a, b = parsed

        node_name = node.name.lower()
        type_index = 0
        for child in self._get_element_children(parent):
            if child.name.lower() == node_name:
                type_index += 1
                if child is node:
                    return self._matches_nth(type_index, a, b)
        return False
|
|
859
|
+
|
|
860
|
+
|
|
861
|
+
def parse_selector(selector_string):
    """Turn a CSS selector string into a selector AST.

    The string is stripped of surrounding whitespace, tokenized with
    ``SelectorTokenizer`` and parsed with ``SelectorParser``.

    Args:
        selector_string: The CSS selector text.

    Returns:
        Whatever ``SelectorParser.parse()`` produces for the token stream.

    Raises:
        SelectorError: If the string is empty or whitespace-only, or if
            tokenizing or parsing fails.
    """
    stripped = selector_string.strip() if selector_string else ""
    if not stripped:
        raise SelectorError("Empty selector")

    return SelectorParser(SelectorTokenizer(stripped).tokenize()).parse()
|
|
870
|
+
|
|
871
|
+
|
|
872
|
+
# Global matcher instance, shared by the module-level query helpers below.
# SelectorMatcher declares empty __slots__, so the instance carries no
# per-query state.
_matcher = SelectorMatcher()
|
|
874
|
+
|
|
875
|
+
|
|
876
|
+
def query(root, selector_string):
    """
    Collect every element below ``root`` that matches a CSS selector.

    Only descendants are searched; ``root`` itself is never part of the
    result (the same convention as querySelectorAll in browsers).

    Args:
        root: The node whose descendants are searched
        selector_string: A CSS selector string

    Returns:
        A list of matching nodes, in the order they were visited
    """
    found = []
    _query_descendants(root, parse_selector(selector_string), found)
    return found
|
|
894
|
+
|
|
895
|
+
|
|
896
|
+
def _query_descendants(node, selector, results):
    """Append every matching descendant of ``node`` to ``results``.

    ``node`` itself is not tested. Children are visited in pre-order; a
    node's ``template_content`` (when present and truthy) is searched after
    that node's own children.

    The walk uses an explicit stack rather than recursion, so a very deeply
    nested tree cannot exceed Python's recursion limit.

    Args:
        node: The node whose descendants are searched.
        selector: A parsed selector accepted by ``SelectorMatcher.matches``.
        results: List that matching nodes are appended to, in visit order.
    """

    def child_iter(parent):
        # has_child_nodes() is consulted first, on every node that is entered.
        return iter(parent.children) if parent.has_child_nodes() else iter(())

    # Each frame is (node, iterator over that node's remaining children).
    stack = [(node, child_iter(node))]
    while stack:
        current, remaining = stack[-1]
        for child in remaining:
            # Only element nodes are candidates for a match.
            if hasattr(child, "name") and not child.name.startswith("#"):
                if _matcher.matches(child, selector):
                    results.append(child)
            # Descend into this child before looking at its later siblings.
            stack.append((child, child_iter(child)))
            break
        else:
            # All children done: leave this node, then search its template
            # content (if any) before returning to the node's siblings.
            stack.pop()
            content = getattr(current, "template_content", None)
            if content:
                stack.append((content, child_iter(content)))
|
|
911
|
+
|
|
912
|
+
|
|
913
|
+
def matches(node, selector_string):
    """
    Test a single node against a CSS selector.

    The selector string is parsed on every call and the result is checked
    with the shared module-level matcher.

    Args:
        node: The node to check
        selector_string: A CSS selector string

    Returns:
        True if the node matches, False otherwise
    """
    return _matcher.matches(node, parse_selector(selector_string))
|