justhtml 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic. Click here for more details.

justhtml/selector.py ADDED
@@ -0,0 +1,965 @@
1
+ # CSS Selector implementation for JustHTML
2
+ # Supports a subset of CSS selectors for querying the DOM
3
+
4
+ from __future__ import annotations
5
+
6
+ from typing import Any
7
+
8
+
9
class SelectorError(ValueError):
    """Error type for CSS selectors that are malformed or unsupported.

    Subclasses ``ValueError`` so callers that already catch ``ValueError``
    for bad input keep working.
    """
11
+
12
+
13
+ # Token types for the CSS selector lexer
14
class TokenType:
    """Namespace of string constants identifying each kind of lexer token."""

    # --- simple selectors -------------------------------------------------
    TAG: str = "TAG"  # element name such as div or span
    ID: str = "ID"  # "#foo"
    CLASS: str = "CLASS"  # ".bar"
    UNIVERSAL: str = "UNIVERSAL"  # "*"

    # --- attribute selectors ----------------------------------------------
    ATTR_START: str = "ATTR_START"  # "["
    ATTR_END: str = "ATTR_END"  # "]"
    ATTR_OP: str = "ATTR_OP"  # one of =, ~=, |=, ^=, $=, *=
    STRING: str = "STRING"  # quoted ("value" / 'value') or unquoted value

    # --- structure ----------------------------------------------------------
    COMBINATOR: str = "COMBINATOR"  # ">", "+", "~", or whitespace (descendant)
    COMMA: str = "COMMA"  # ","
    COLON: str = "COLON"  # ":"
    PAREN_OPEN: str = "PAREN_OPEN"  # "("
    PAREN_CLOSE: str = "PAREN_CLOSE"  # ")"
    EOF: str = "EOF"  # end-of-input marker
29
+
30
+
31
class Token:
    """One lexer token: a kind tag plus an optional text payload."""

    __slots__ = ("type", "value")

    type: str  # one of the TokenType constants
    value: str | None  # associated text, or None for purely structural tokens

    def __init__(self, token_type: str, value: str | None = None) -> None:
        """Store the token kind and its (optional) text."""
        self.type, self.value = token_type, value

    def __repr__(self) -> str:
        """Return a debugging form such as ``Token(TAG, 'div')``."""
        return "Token({}, {!r})".format(self.type, self.value)
43
+
44
+
45
class SelectorTokenizer:
    """Tokenizes a CSS selector string into tokens.

    Construct with the selector text and call ``tokenize()`` once. The
    instance keeps a cursor (``pos``) into ``selector``; every private
    ``_read_*`` / ``_tokenize_*`` helper consumes input from that cursor.
    """

    __slots__ = ("length", "pos", "selector")

    selector: str  # full selector text being scanned
    pos: int  # index of the next unread character
    length: int  # cached len(selector)

    # Characters treated as whitespace between selector parts.
    _WHITESPACE = " \t\n\r\f"

    def __init__(self, selector: str) -> None:
        self.selector = selector
        self.pos = 0
        self.length = len(selector)

    def _peek(self, offset: int = 0) -> str:
        """Return the character ``offset`` positions ahead, or "" past the end."""
        pos = self.pos + offset
        if pos < self.length:
            return self.selector[pos]
        return ""

    def _advance(self) -> str:
        """Return the current character ("" at end of input) and move forward one."""
        ch = self._peek()
        self.pos += 1
        return ch

    def _skip_whitespace(self) -> None:
        """Move the cursor past any run of whitespace."""
        while self.pos < self.length and self.selector[self.pos] in self._WHITESPACE:
            self.pos += 1

    def _is_name_start(self, ch: str) -> bool:
        """Return True if ``ch`` may begin a name.

        Accepts letters, underscore, hyphen, and any non-ASCII character.
        An empty string (end of input) is never a name start; without the
        guard ``ord("")`` would raise ``TypeError``.
        """
        if not ch:
            return False
        return ch.isalpha() or ch == "_" or ch == "-" or ord(ch) > 127

    def _is_name_char(self, ch: str) -> bool:
        """Return True if ``ch`` may continue a name (name-start or digit)."""
        return self._is_name_start(ch) or ch.isdigit()

    def _read_name(self) -> str:
        """Consume and return the longest run of name characters (may be "")."""
        start = self.pos
        while self.pos < self.length and self._is_name_char(self.selector[self.pos]):
            self.pos += 1
        return self.selector[start : self.pos]

    def _read_string(self, quote: str) -> str:
        """Consume a quoted string and return its contents.

        The cursor must be on the opening ``quote``. A backslash makes the
        following character literal (the backslash itself is dropped).

        Raises:
            SelectorError: if the closing quote is missing.
        """
        # Skip opening quote
        self.pos += 1
        start = self.pos
        parts: list[str] = []

        while self.pos < self.length:
            ch = self.selector[self.pos]
            if ch == quote:
                # Flush the literal run before the closing quote
                if self.pos > start:
                    parts.append(self.selector[start : self.pos])
                self.pos += 1
                return "".join(parts)
            if ch == "\\":
                # Flush the literal run before the backslash
                if self.pos > start:
                    parts.append(self.selector[start : self.pos])
                self.pos += 1
                if self.pos < self.length:
                    # Keep the escaped character verbatim
                    parts.append(self.selector[self.pos])
                    self.pos += 1
                start = self.pos
            else:
                self.pos += 1

        raise SelectorError(f"Unterminated string in selector: {self.selector!r}")

    def _read_unquoted_attr_value(self) -> str:
        """Consume an unquoted attribute value up to whitespace or ``]``."""
        start = self.pos
        while self.pos < self.length and self.selector[self.pos] not in " \t\n\r\f]":
            self.pos += 1
        return self.selector[start : self.pos]

    def _skip_quoted(self, quote: str) -> None:
        """Move the cursor past a quoted string without decoding it.

        The cursor must be on the opening ``quote``. If the string is
        unterminated the cursor ends at end of input.
        """
        self.pos += 1
        while self.pos < self.length:
            ch = self.selector[self.pos]
            if ch == "\\":
                # Skip the backslash and whatever it escapes
                self.pos += 2
                continue
            self.pos += 1
            if ch == quote:
                return
        # A trailing backslash may have stepped past the end; clamp.
        self.pos = self.length

    def _read_function_argument(self) -> str:
        """Return the raw argument text of a functional pseudo-class.

        The cursor must be just after the opening parenthesis. Nested
        parentheses are balanced, and quoted strings are skipped as a unit so
        that a parenthesis inside quotes (``:not([title=")"])``) does not end
        the argument early. On return the cursor sits on the matching ``)``,
        or at end of input if there is none. The result is whitespace-stripped.
        """
        depth = 1
        start = self.pos
        while self.pos < self.length:
            ch = self.selector[self.pos]
            if ch == '"' or ch == "'":
                self._skip_quoted(ch)
                continue
            if ch == "(":
                depth += 1
            elif ch == ")":
                depth -= 1
                if depth == 0:
                    break
            self.pos += 1
        return self.selector[start : self.pos].strip()

    def _tokenize_attribute(self, tokens: list[Token]) -> None:
        """Consume ``[name]`` or ``[name op value]`` and append its tokens.

        Raises:
            SelectorError: on a missing name, bad operator, or missing ``]``.
        """
        self.pos += 1  # consume "["
        tokens.append(Token(TokenType.ATTR_START))
        self._skip_whitespace()

        attr_name = self._read_name()
        if not attr_name:
            raise SelectorError(f"Expected attribute name at position {self.pos}")
        tokens.append(Token(TokenType.TAG, attr_name))  # Reuse TAG for attr name
        self._skip_whitespace()

        ch = self._peek()
        if ch == "]":
            # Presence-only selector
            self.pos += 1
            tokens.append(Token(TokenType.ATTR_END))
            return

        if ch == "=":
            self.pos += 1
            tokens.append(Token(TokenType.ATTR_OP, "="))
        elif ch and ch in "~|^$*":
            # ``ch and`` matters: "" (end of input) is "in" every string.
            self.pos += 1
            if self._peek() != "=":
                raise SelectorError(f"Expected = after {ch} at position {self.pos}")
            self.pos += 1
            tokens.append(Token(TokenType.ATTR_OP, ch + "="))
        else:
            raise SelectorError(f"Unexpected character in attribute selector: {ch!r}")

        self._skip_whitespace()

        quote = self._peek()
        if quote == '"' or quote == "'":
            value = self._read_string(quote)
        else:
            value = self._read_unquoted_attr_value()
        tokens.append(Token(TokenType.STRING, value))

        self._skip_whitespace()
        if self._peek() != "]":
            raise SelectorError(f"Expected ] at position {self.pos}")
        self.pos += 1
        tokens.append(Token(TokenType.ATTR_END))

    def _tokenize_pseudo(self, tokens: list[Token]) -> None:
        """Consume ``:name`` or ``:name(argument)`` and append its tokens.

        The argument of a functional pseudo-class is kept as one raw STRING
        token; it is interpreted later by whoever evaluates the pseudo-class.

        Raises:
            SelectorError: on a missing name or missing ``)``.
        """
        self.pos += 1  # consume ":"
        tokens.append(Token(TokenType.COLON))

        name = self._read_name()
        if not name:
            raise SelectorError(f"Expected pseudo-class name after : at position {self.pos}")
        tokens.append(Token(TokenType.TAG, name))

        if self._peek() != "(":
            return

        self.pos += 1
        tokens.append(Token(TokenType.PAREN_OPEN))
        self._skip_whitespace()

        arg = self._read_function_argument()
        if arg:
            tokens.append(Token(TokenType.STRING, arg))

        if self._peek() != ")":
            raise SelectorError(f"Expected ) at position {self.pos}")
        self.pos += 1
        tokens.append(Token(TokenType.PAREN_CLOSE))

    def tokenize(self) -> list[Token]:
        """Scan the whole selector and return its tokens, ending with EOF.

        Raises:
            SelectorError: on any character or construct the lexer rejects.
        """
        tokens: list[Token] = []
        pending_whitespace = False

        while self.pos < self.length:
            ch = self.selector[self.pos]

            # Skip whitespace but remember it for combinator detection
            if ch in self._WHITESPACE:
                pending_whitespace = True
                self._skip_whitespace()
                continue

            # Explicit combinators: >, +, ~ (they swallow surrounding whitespace)
            if ch in ">+~":
                pending_whitespace = False
                self.pos += 1
                self._skip_whitespace()
                tokens.append(Token(TokenType.COMBINATOR, ch))
                continue

            # Whitespace between two selector parts is a descendant combinator,
            # unless what follows is a comma.
            if pending_whitespace and tokens and ch != ",":
                tokens.append(Token(TokenType.COMBINATOR, " "))
            # Reset unconditionally: whitespace seen before a comma must not
            # leak past it and become a stray combinator ("a , b").
            pending_whitespace = False

            # Universal selector
            if ch == "*":
                self.pos += 1
                tokens.append(Token(TokenType.UNIVERSAL))
                continue

            # ID selector
            if ch == "#":
                self.pos += 1
                name = self._read_name()
                if not name:
                    raise SelectorError(f"Expected identifier after # at position {self.pos}")
                tokens.append(Token(TokenType.ID, name))
                continue

            # Class selector
            if ch == ".":
                self.pos += 1
                name = self._read_name()
                if not name:
                    raise SelectorError(f"Expected identifier after . at position {self.pos}")
                tokens.append(Token(TokenType.CLASS, name))
                continue

            # Attribute selector
            if ch == "[":
                self._tokenize_attribute(tokens)
                continue

            # Comma (selector grouping); swallows trailing whitespace
            if ch == ",":
                self.pos += 1
                self._skip_whitespace()
                tokens.append(Token(TokenType.COMMA))
                continue

            # Pseudo-class
            if ch == ":":
                self._tokenize_pseudo(tokens)
                continue

            # Tag name
            if self._is_name_start(ch):
                name = self._read_name()
                tokens.append(Token(TokenType.TAG, name.lower()))  # Tags are case-insensitive
                continue

            raise SelectorError(f"Unexpected character {ch!r} at position {self.pos}")

        tokens.append(Token(TokenType.EOF))
        return tokens
288
+
289
+
290
+ # AST Node types for parsed selectors
291
+
292
+
293
class SimpleSelector:
    """One simple selector: tag, id, class, universal, attribute, or pseudo-class."""

    __slots__ = ("arg", "name", "operator", "type", "value")

    # Allowed values for ``type``
    TYPE_TAG: str = "tag"
    TYPE_ID: str = "id"
    TYPE_CLASS: str = "class"
    TYPE_UNIVERSAL: str = "universal"
    TYPE_ATTR: str = "attr"
    TYPE_PSEUDO: str = "pseudo"

    type: str  # one of the TYPE_* constants
    name: str | None  # tag / id / class / attribute / pseudo-class name
    operator: str | None  # attribute operator, when one was given
    value: str | None  # attribute comparison value
    arg: str | None  # raw argument of a functional pseudo-class

    def __init__(
        self,
        selector_type: str,
        name: str | None = None,
        operator: str | None = None,
        value: str | None = None,
        arg: str | None = None,
    ) -> None:
        """Record the selector kind and whichever details apply to it."""
        self.type = selector_type
        self.name, self.operator = name, operator
        self.value, self.arg = value, arg  # arg: for :not() and :nth-child()

    def __repr__(self) -> str:
        """Return a debugging form listing only the fields that are set.

        ``name`` and ``operator`` are shown when truthy; ``value`` and ``arg``
        are shown whenever they are not None (so an empty string is shown).
        """
        text = f"SimpleSelector({self.type!r}"
        if self.name:
            text += f", name={self.name!r}"
        if self.operator:
            text += f", op={self.operator!r}"
        if self.value is not None:
            text += f", value={self.value!r}"
        if self.arg is not None:
            text += f", arg={self.arg!r}"
        return text + ")"
337
+
338
+
339
class CompoundSelector:
    """Simple selectors that must all match the same element (e.g. div.foo#bar)."""

    __slots__ = ("selectors",)

    selectors: list[SimpleSelector]

    def __init__(self, selectors: list[SimpleSelector] | None = None) -> None:
        """Store ``selectors``; None or an empty list yields a fresh empty list."""
        self.selectors = selectors if selectors else []

    def __repr__(self) -> str:
        """Return a debugging form showing the contained simple selectors."""
        return "CompoundSelector({!r})".format(self.selectors)
351
+
352
+
353
class ComplexSelector:
    """Compound selectors linked left-to-right by combinators."""

    __slots__ = ("parts",)

    # Each entry is (combinator, compound). The combinator joins the entry to
    # the one before it; the first entry therefore carries None.
    parts: list[tuple[str | None, CompoundSelector]]

    def __init__(self) -> None:
        """Start with an empty chain; callers append to ``parts``."""
        self.parts = list()

    def __repr__(self) -> str:
        """Return a debugging form showing the (combinator, compound) pairs."""
        return "ComplexSelector({!r})".format(self.parts)
367
+
368
+
369
class SelectorList:
    """Alternatives separated by commas; holds one complex selector per alternative."""

    __slots__ = ("selectors",)

    selectors: list[ComplexSelector]

    def __init__(self, selectors: list[ComplexSelector] | None = None) -> None:
        """Store ``selectors``; None or an empty list yields a fresh empty list."""
        self.selectors = selectors if selectors else []

    def __repr__(self) -> str:
        """Return a debugging form showing the contained complex selectors."""
        return "SelectorList({!r})".format(self.selectors)
381
+
382
+
383
# Type alias for parsed selectors: either a single complex selector or a
# comma-separated list of them.
# NOTE: this union is built at import time (it is not an annotation), so
# applying ``|`` to two classes needs a Python version that supports it (3.10+).
ParsedSelector = ComplexSelector | SelectorList
385
+
386
+
387
class SelectorParser:
    """Parses a list of tokens into a selector AST.

    Construct with the token list (ending in an EOF token) and call
    ``parse()``. ``pos`` is the index of the next unread token.
    """

    __slots__ = ("pos", "tokens")

    tokens: list[Token]
    pos: int

    def __init__(self, tokens: list[Token]) -> None:
        self.tokens = tokens
        self.pos = 0

    def _peek(self) -> Token:
        """Return the next token without consuming it (a fresh EOF past the end)."""
        if self.pos < len(self.tokens):
            return self.tokens[self.pos]
        return Token(TokenType.EOF)

    def _advance(self) -> Token:
        """Consume and return the next token."""
        token = self._peek()
        self.pos += 1
        return token

    def _expect(self, token_type: str) -> Token:
        """Consume the next token, requiring it to be of ``token_type``.

        Raises:
            SelectorError: if the next token has a different type.
        """
        token = self._peek()
        if token.type != token_type:
            raise SelectorError(f"Expected {token_type}, got {token.type}")
        return self._advance()

    def parse(self) -> ParsedSelector:
        """Parse a complete selector (possibly a comma-separated list).

        Returns:
            The single ``ComplexSelector`` when there is no comma, otherwise a
            ``SelectorList`` of all alternatives.

        Raises:
            SelectorError: if any alternative is empty (leading, trailing, or
                doubled comma) or if tokens remain after the last alternative.
        """
        first = self._parse_complex_selector()
        if first is None:
            # Reached when the input starts with a comma or combinator.
            raise SelectorError("Empty selector")
        selectors: list[ComplexSelector] = [first]

        while self._peek().type == TokenType.COMMA:
            self._advance()  # consume comma
            selector = self._parse_complex_selector()
            if selector is None:
                # "div," and "div,,p" are invalid; silently dropping the empty
                # alternative would hide the mistake from the caller.
                raise SelectorError("Expected selector after comma")
            selectors.append(selector)

        if self._peek().type != TokenType.EOF:
            raise SelectorError(f"Unexpected token: {self._peek()}")

        if len(selectors) == 1:
            return selectors[0]
        return SelectorList(selectors)

    def _parse_complex_selector(self) -> ComplexSelector | None:
        """Parse compound selectors joined by combinators.

        Returns:
            The parsed chain, or None if no simple selector is present at the
            current position.

        Raises:
            SelectorError: if a combinator is not followed by a selector.
        """
        complex_sel = ComplexSelector()

        # First compound selector (no combinator)
        compound = self._parse_compound_selector()
        if not compound:
            return None
        complex_sel.parts.append((None, compound))

        # Parse combinator + compound selector pairs
        while self._peek().type == TokenType.COMBINATOR:
            combinator = self._advance().value
            compound = self._parse_compound_selector()
            if not compound:
                raise SelectorError("Expected selector after combinator")
            complex_sel.parts.append((combinator, compound))

        return complex_sel

    def _parse_compound_selector(self) -> CompoundSelector | None:
        """Parse a run of adjacent simple selectors.

        Returns:
            The compound selector, or None if the next token does not start a
            simple selector.
        """
        simple_selectors: list[SimpleSelector] = []

        while True:
            token = self._peek()

            if token.type == TokenType.TAG:
                self._advance()
                simple_selectors.append(SimpleSelector(SimpleSelector.TYPE_TAG, name=token.value))

            elif token.type == TokenType.UNIVERSAL:
                self._advance()
                simple_selectors.append(SimpleSelector(SimpleSelector.TYPE_UNIVERSAL))

            elif token.type == TokenType.ID:
                self._advance()
                simple_selectors.append(SimpleSelector(SimpleSelector.TYPE_ID, name=token.value))

            elif token.type == TokenType.CLASS:
                self._advance()
                simple_selectors.append(SimpleSelector(SimpleSelector.TYPE_CLASS, name=token.value))

            elif token.type == TokenType.ATTR_START:
                simple_selectors.append(self._parse_attribute_selector())

            elif token.type == TokenType.COLON:
                simple_selectors.append(self._parse_pseudo_selector())

            else:
                break

        if not simple_selectors:
            return None
        return CompoundSelector(simple_selectors)

    def _parse_attribute_selector(self) -> SimpleSelector:
        """Parse an attribute selector: ``[attr]`` or ``[attr op value]``.

        Raises:
            SelectorError: if the token sequence is not one of those shapes.
        """
        self._expect(TokenType.ATTR_START)

        # The tokenizer emits the attribute name as a TAG token.
        attr_name = self._expect(TokenType.TAG).value

        token = self._peek()
        if token.type == TokenType.ATTR_END:
            self._advance()
            return SimpleSelector(SimpleSelector.TYPE_ATTR, name=attr_name)

        operator = self._expect(TokenType.ATTR_OP).value
        value = self._expect(TokenType.STRING).value
        self._expect(TokenType.ATTR_END)

        return SimpleSelector(SimpleSelector.TYPE_ATTR, name=attr_name, operator=operator, value=value)

    def _parse_pseudo_selector(self) -> SimpleSelector:
        """Parse a pseudo-class such as ``:first-child`` or ``:not(selector)``.

        For the functional form the raw argument text is stored in ``arg``
        (None when the parentheses are empty).

        Raises:
            SelectorError: if the name or the closing parenthesis is missing.
        """
        self._expect(TokenType.COLON)
        name = self._expect(TokenType.TAG).value

        # Functional pseudo-class
        if self._peek().type == TokenType.PAREN_OPEN:
            self._advance()
            arg: str | None = None
            if self._peek().type == TokenType.STRING:
                arg = self._advance().value
            self._expect(TokenType.PAREN_CLOSE)
            return SimpleSelector(SimpleSelector.TYPE_PSEUDO, name=name, arg=arg)

        return SimpleSelector(SimpleSelector.TYPE_PSEUDO, name=name)
525
+
526
+
527
class SelectorMatcher:
    """Matches selectors against DOM nodes.

    Stateless. Nodes are duck-typed; this class reads ``name``, ``attrs``,
    ``parent``, ``children``, ``data`` (text nodes) and calls
    ``has_child_nodes()``. A node whose ``name`` starts with ``#`` is treated
    as a non-element.
    """

    __slots__ = ()

    def matches(self, node: Any, selector: ParsedSelector | CompoundSelector | SimpleSelector) -> bool:
        """Return True if ``node`` matches ``selector`` (any selector AST type).

        An object that is none of the known selector classes never matches.
        """
        if isinstance(selector, SelectorList):
            return any(self.matches(node, sel) for sel in selector.selectors)
        if isinstance(selector, ComplexSelector):
            return self._matches_complex(node, selector)
        if isinstance(selector, CompoundSelector):
            return self._matches_compound(node, selector)
        if isinstance(selector, SimpleSelector):
            return self._matches_simple(node, selector)
        return False

    def _matches_complex(self, node: Any, selector: ComplexSelector) -> bool:
        """Match a complex selector (with combinators), right to left.

        ``node`` must match the rightmost compound; each combinator is then
        followed outward to a parent, ancestor, or preceding sibling.
        """
        parts = selector.parts
        if not parts:
            return False
        return self._matches_chain(node, parts, len(parts) - 1)

    def _matches_chain(self, node: Any, parts: list[Any], index: int) -> bool:
        """Return True if ``node`` matches ``parts[index]`` and the chain to its left.

        ``parts[index]`` is ``(combinator, compound)`` where the combinator
        links ``parts[index - 1]`` to this compound. For the descendant and
        general-sibling combinators every candidate is tried in turn, so a
        nearer candidate that cannot satisfy the rest of the chain does not
        hide a farther one that can. The price is extra work on selectors with
        several such combinators.
        """
        combinator, compound = parts[index]
        if not self._matches_compound(node, compound):
            return False
        if index == 0:
            return True
        left = index - 1

        if combinator == " ":  # Descendant: any ancestor may anchor the rest
            ancestor = node.parent
            while ancestor:
                if self._matches_chain(ancestor, parts, left):
                    return True
                ancestor = ancestor.parent
            return False

        if combinator == ">":  # Child: only the direct parent qualifies
            parent = node.parent
            return bool(parent) and self._matches_chain(parent, parts, left)

        if combinator == "+":  # Adjacent sibling: only the previous element
            sibling = self._get_previous_sibling(node)
            return bool(sibling) and self._matches_chain(sibling, parts, left)

        # combinator == "~" - General sibling: any earlier element sibling
        sibling = self._get_previous_sibling(node)
        while sibling:
            if self._matches_chain(sibling, parts, left):
                return True
            sibling = self._get_previous_sibling(sibling)
        return False

    def _matches_compound(self, node: Any, compound: CompoundSelector) -> bool:
        """Match a compound selector (all simple selectors must match)."""
        return all(self._matches_simple(node, simple) for simple in compound.selectors)

    def _matches_simple(self, node: Any, selector: SimpleSelector) -> bool:
        """Match one simple selector against ``node``.

        Raises:
            SelectorError: for a pseudo-class that is not supported.
        """
        # Text nodes and other non-element nodes don't match element selectors
        if not hasattr(node, "name") or node.name.startswith("#"):
            return False

        sel_type = selector.type

        if sel_type == SimpleSelector.TYPE_UNIVERSAL:
            return True

        if sel_type == SimpleSelector.TYPE_TAG:
            # HTML tag names are case-insensitive
            return bool(node.name.lower() == (selector.name.lower() if selector.name else ""))

        if sel_type == SimpleSelector.TYPE_ID:
            node_id = node.attrs.get("id", "") if node.attrs else ""
            return node_id == selector.name

        if sel_type == SimpleSelector.TYPE_CLASS:
            class_attr = node.attrs.get("class", "") if node.attrs else ""
            classes = class_attr.split() if class_attr else []
            return selector.name in classes

        if sel_type == SimpleSelector.TYPE_ATTR:
            return self._matches_attribute(node, selector)

        if sel_type == SimpleSelector.TYPE_PSEUDO:
            return self._matches_pseudo(node, selector)

        return False

    def _matches_attribute(self, node: Any, selector: SimpleSelector) -> bool:
        """Match an attribute selector.

        Attribute names compare case-insensitively; values compare exactly.
        An attribute stored with value None is treated as present with an
        empty value, so ``[disabled]`` matches it.
        NOTE(review): whether the parser ever stores None for a valueless
        attribute is not visible here; the handling is defensive.
        """
        attrs = node.attrs or {}
        attr_name = (selector.name or "").lower()

        # Look the attribute up ignoring case; track presence separately from
        # the value so a None value is not mistaken for "missing".
        found = False
        attr_value = ""
        for name, value in attrs.items():
            if name.lower() == attr_name:
                found = True
                attr_value = "" if value is None else value
                break

        if not found:
            return False

        # Presence check only
        if selector.operator is None:
            return True

        value = selector.value or ""
        op = selector.operator

        if op == "=":
            return attr_value == value

        if op == "~=":
            # Whitespace-separated word match
            words = attr_value.split() if attr_value else []
            return value in words

        if op == "|=":
            # Exact match, or value followed by a hyphen ([lang|="en"] matches "en-US")
            return attr_value == value or attr_value.startswith(value + "-")

        # The substring operators never match an empty comparison value.
        if op == "^=":
            return attr_value.startswith(value) if value else False

        if op == "$=":
            return attr_value.endswith(value) if value else False

        if op == "*=":
            return value in attr_value if value else False

        return False

    def _matches_pseudo(self, node: Any, selector: SimpleSelector) -> bool:
        """Match a pseudo-class selector.

        Raises:
            SelectorError: if the pseudo-class name is not one handled below,
                or if the argument of ``:not()`` is not a valid selector.
        """
        name = (selector.name or "").lower()

        if name == "first-child":
            return self._is_first_child(node)

        if name == "last-child":
            return self._is_last_child(node)

        if name == "nth-child":
            return self._matches_nth_child(node, selector.arg)

        if name == "not":
            # An empty :not() excludes nothing.
            if not selector.arg:
                return True
            # The argument was kept as raw text; parse it now.
            inner = parse_selector(selector.arg)
            return not self.matches(node, inner)

        if name == "only-child":
            return self._is_first_child(node) and self._is_last_child(node)

        if name == "empty":
            if not node.has_child_nodes():
                return True
            # Still empty if it holds only whitespace text and other
            # non-element nodes (names starting with "#").
            for child in node.children:
                if hasattr(child, "name"):
                    if child.name == "#text":
                        if child.data and child.data.strip():
                            return False
                    elif not child.name.startswith("#"):
                        return False
            return True

        if name == "root":
            # An element whose parent is the document or a document fragment
            parent = node.parent
            if parent and hasattr(parent, "name"):
                return parent.name in ("#document", "#document-fragment")
            return False

        if name == "first-of-type":
            return self._is_first_of_type(node)

        if name == "last-of-type":
            return self._is_last_of_type(node)

        if name == "nth-of-type":
            return self._matches_nth_of_type(node, selector.arg)

        if name == "only-of-type":
            return self._is_first_of_type(node) and self._is_last_of_type(node)

        # Unknown pseudo-class: report it rather than silently not matching
        raise SelectorError(f"Unsupported pseudo-class: :{name}")

    def _get_element_children(self, parent: Any) -> list[Any]:
        """Get only element children (exclude text, comments, etc.)."""
        if not parent or not parent.has_child_nodes():
            return []
        return [c for c in parent.children if hasattr(c, "name") and not c.name.startswith("#")]

    def _get_previous_sibling(self, node: Any) -> Any | None:
        """Return the nearest preceding element sibling.

        Returns None if ``node`` has no parent, is the first element, or is
        not found among its parent's children.
        """
        parent = node.parent
        if not parent:
            return None

        prev: Any | None = None
        for child in parent.children:
            if child is node:
                return prev
            if hasattr(child, "name") and not child.name.startswith("#"):
                prev = child
        return None  # node not in parent.children (detached)

    def _is_first_child(self, node: Any) -> bool:
        """Check if node is the first element child of its parent."""
        parent = node.parent
        if not parent:
            return False
        elements = self._get_element_children(parent)
        return bool(elements) and elements[0] is node

    def _is_last_child(self, node: Any) -> bool:
        """Check if node is the last element child of its parent."""
        parent = node.parent
        if not parent:
            return False
        elements = self._get_element_children(parent)
        return bool(elements) and elements[-1] is node

    def _is_first_of_type(self, node: Any) -> bool:
        """Check if node is the first sibling with its (case-insensitive) name."""
        parent = node.parent
        if not parent:
            return False
        node_name = node.name.lower()
        for child in self._get_element_children(parent):
            if child.name.lower() == node_name:
                return child is node
        return False

    def _is_last_of_type(self, node: Any) -> bool:
        """Check if node is the last sibling with its (case-insensitive) name."""
        parent = node.parent
        if not parent:
            return False
        node_name = node.name.lower()
        last_of_type: Any | None = None
        for child in self._get_element_children(parent):
            if child.name.lower() == node_name:
                last_of_type = child
        return last_of_type is node

    def _parse_nth_expression(self, expr: str | None) -> tuple[int, int] | None:
        """Parse an nth expression like '2n+1', '-n+3', 'odd', 'even', '3'.

        Returns:
            ``(a, b)`` for the formula ``a*n + b``, or None if ``expr`` is
            empty or malformed. Text after the offset ('2n+1n') and an offset
            without a sign ('2n1') are malformed.
        """
        if not expr:
            return None

        # Whitespace carries no meaning here; keywords are case-insensitive.
        text = "".join(expr.split()).lower()

        if text == "odd":
            return (2, 1)  # 2n+1
        if text == "even":
            return (2, 0)  # 2n

        if "n" not in text:
            # Just a number
            try:
                return (0, int(text))
            except ValueError:
                return None

        # Split at the first "n" only, so anything after the offset stays in
        # b_part and is rejected by int() below.
        a_part, _, b_part = text.partition("n")

        if a_part in ("", "+"):
            a = 1
        elif a_part == "-":
            a = -1
        else:
            try:
                a = int(a_part)
            except ValueError:
                return None

        if not b_part:
            return (a, 0)
        if b_part[0] not in "+-":
            return None
        try:
            return (a, int(b_part))
        except ValueError:
            return None

    def _matches_nth(self, index: int, a: int, b: int) -> bool:
        """Check if 1-based ``index`` equals ``a*n + b`` for some integer n >= 0."""
        if a == 0:
            return index == b
        # n = (index - b) / a must be a non-negative integer.
        diff = index - b
        if a > 0:
            return diff >= 0 and diff % a == 0
        # a < 0: a*n is never positive, so diff must be <= 0 and divisible by a
        return diff <= 0 and diff % a == 0

    def _matches_nth_child(self, node: Any, arg: str | None) -> bool:
        """Match :nth-child(An+B), counting element siblings from 1."""
        parent = node.parent
        if not parent:
            return False

        parsed = self._parse_nth_expression(arg)
        if parsed is None:
            return False
        a, b = parsed

        elements = self._get_element_children(parent)
        for i, child in enumerate(elements):
            if child is node:
                return self._matches_nth(i + 1, a, b)
        return False

    def _matches_nth_of_type(self, node: Any, arg: str | None) -> bool:
        """Match :nth-of-type(An+B), counting same-named element siblings from 1."""
        parent = node.parent
        if not parent:
            return False

        parsed = self._parse_nth_expression(arg)
        if parsed is None:
            return False
        a, b = parsed

        node_name = node.name.lower()
        elements = self._get_element_children(parent)
        type_index = 0
        for child in elements:
            if child.name.lower() == node_name:
                type_index += 1
                if child is node:
                    return self._matches_nth(type_index, a, b)
        return False
899
+
900
+
901
def parse_selector(selector_string: str) -> ParsedSelector:
    """Turn CSS selector text into its AST.

    Surrounding whitespace is ignored. Raises ``SelectorError`` when the text
    is empty or whitespace-only, or when tokenizing or parsing rejects it.
    """
    stripped = selector_string.strip() if selector_string else ""
    if not stripped:
        raise SelectorError("Empty selector")

    token_stream = SelectorTokenizer(stripped).tokenize()
    return SelectorParser(token_stream).parse()
910
+
911
+
912
# Global matcher instance, shared by the module-level query helpers.
# SelectorMatcher declares empty __slots__ and so holds no per-instance state.
_matcher: SelectorMatcher = SelectorMatcher()
914
+
915
+
916
def query(root: Any, selector_string: str) -> list[Any]:
    """
    Collect every element below ``root`` that matches a CSS selector.

    Only descendants are considered; ``root`` itself is never returned
    (the same convention as querySelectorAll in browsers).

    Args:
        root: The node whose descendants are searched
        selector_string: A CSS selector string

    Returns:
        The matching nodes, in the order the traversal visits them
    """
    matched: list[Any] = []
    _query_descendants(root, parse_selector(selector_string), matched)
    return matched
934
+
935
+
936
def _query_descendants(node: Any, selector: ParsedSelector, results: list[Any]) -> None:
    """Append to ``results`` every matching element found beneath ``node``.

    ``node`` itself is not tested. Each child is tested (when it is an
    element, i.e. its name does not start with "#") and then searched in
    turn. After the children, a truthy ``template_content`` attribute on
    ``node`` is searched the same way.
    """
    if node.has_child_nodes():
        for child in node.children:
            is_element = hasattr(child, "name") and not child.name.startswith("#")
            if is_element and _matcher.matches(child, selector):
                results.append(child)
            # Descend regardless of whether this child itself was an element
            _query_descendants(child, selector, results)

    # Template contents live outside the regular child list
    if hasattr(node, "template_content") and node.template_content:
        _query_descendants(node.template_content, selector, results)
951
+
952
+
953
def matches(node: Any, selector_string: str) -> bool:
    """
    Test a single node against a CSS selector.

    Args:
        node: The node to test
        selector_string: A CSS selector string

    Returns:
        True when the node satisfies the selector, otherwise False
    """
    return _matcher.matches(node, parse_selector(selector_string))