justhtml 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
justhtml/selector.py ADDED
@@ -0,0 +1,925 @@
1
+ # CSS Selector implementation for JustHTML
2
+ # Supports a subset of CSS selectors for querying the DOM
3
+
4
+
5
class SelectorError(ValueError):
    """Signals an invalid CSS selector.

    Derives from ValueError, so callers that already handle ValueError
    will also catch selector problems.
    """
7
+
8
+
9
+ # Token types for the CSS selector lexer
10
class TokenType:
    """String constants naming each kind of token the selector lexer emits.

    Every constant's value is identical to its attribute name.
    """

    # Simple selectors
    TAG = "TAG"  # element name: div, span, ...
    ID = "ID"  # "#foo"
    CLASS = "CLASS"  # ".bar"
    UNIVERSAL = "UNIVERSAL"  # "*"

    # Attribute selectors
    ATTR_START = "ATTR_START"  # "["
    ATTR_END = "ATTR_END"  # "]"
    ATTR_OP = "ATTR_OP"  # one of =, ~=, |=, ^=, $=, *=
    STRING = "STRING"  # quoted ("value" / 'value') or unquoted value

    # Structure
    COMBINATOR = "COMBINATOR"  # ">", "+", "~", or whitespace (descendant)
    COMMA = "COMMA"  # ","

    # Pseudo-classes
    COLON = "COLON"  # ":"
    PAREN_OPEN = "PAREN_OPEN"  # "("
    PAREN_CLOSE = "PAREN_CLOSE"  # ")"

    # End of input marker
    EOF = "EOF"
25
+
26
+
27
class Token:
    """One lexer token: a token kind plus an optional payload.

    Attributes:
        type: The token kind (one of the TokenType constants).
        value: Text carried by the token, or None when the kind needs none.
    """

    __slots__ = ("type", "value")

    def __init__(self, token_type, value=None):
        self.type, self.value = token_type, value

    def __repr__(self):
        # The kind is shown verbatim, the payload via repr().
        return "Token({}, {!r})".format(self.type, self.value)
36
+
37
+
38
class SelectorTokenizer:
    """Tokenizes a CSS selector string into a list of Token objects.

    Attributes:
        selector: The selector text being scanned.
        pos: Index of the next unread character in ``selector``.
        length: Cached ``len(selector)``.
    """

    __slots__ = ("length", "pos", "selector")

    # Characters treated as whitespace between selector parts.
    _WHITESPACE = " \t\n\r\f"

    def __init__(self, selector):
        self.selector = selector
        self.pos = 0
        self.length = len(selector)

    def _peek(self, offset=0):
        """Return the character at ``pos + offset``, or "" past the end of input."""
        pos = self.pos + offset
        if pos < self.length:
            return self.selector[pos]
        return ""

    def _advance(self):
        """Return the current character ("" at end of input) and move one position forward."""
        ch = self._peek()
        self.pos += 1
        return ch

    def _skip_whitespace(self):
        """Advance ``pos`` past any run of whitespace characters."""
        while self.pos < self.length and self.selector[self.pos] in self._WHITESPACE:
            self.pos += 1

    def _is_name_start(self, ch):
        """Return True if ``ch`` may start a name: letter, underscore, hyphen, or non-ASCII."""
        return ch.isalpha() or ch == "_" or ch == "-" or ord(ch) > 127

    def _is_name_char(self, ch):
        """Return True if ``ch`` may continue a name: any name-start character or a digit."""
        return self._is_name_start(ch) or ch.isdigit()

    def _read_name(self):
        """Consume and return the longest run of name characters at ``pos`` (possibly "")."""
        start = self.pos
        while self.pos < self.length and self._is_name_char(self.selector[self.pos]):
            self.pos += 1
        return self.selector[start : self.pos]

    def _read_string(self, quote):
        """Consume a quoted string whose opening quote is at ``pos``.

        A backslash makes the following character literal (the backslash
        itself is dropped).

        Args:
            quote: The quote character that opened the string.

        Returns:
            The string contents without the surrounding quotes.

        Raises:
            SelectorError: If the input ends before the closing quote.
        """
        self.pos += 1  # skip the opening quote
        start = self.pos
        parts = []

        while self.pos < self.length:
            ch = self.selector[self.pos]
            if ch == quote:
                # Flush the literal run that precedes the closing quote.
                if self.pos > start:
                    parts.append(self.selector[start : self.pos])
                self.pos += 1
                return "".join(parts)
            if ch == "\\":
                # Flush the literal run that precedes the backslash.
                if self.pos > start:
                    parts.append(self.selector[start : self.pos])
                self.pos += 1
                if self.pos < self.length:
                    parts.append(self.selector[self.pos])  # the escaped character
                    self.pos += 1
                start = self.pos
            else:
                self.pos += 1

        raise SelectorError(f"Unterminated string in selector: {self.selector!r}")

    def _read_unquoted_attr_value(self):
        """Consume an unquoted attribute value: everything up to whitespace or "]"."""
        start = self.pos
        while self.pos < self.length and self.selector[self.pos] not in " \t\n\r\f]":
            self.pos += 1
        return self.selector[start : self.pos]

    def _read_function_argument(self):
        """Consume the raw argument of a functional pseudo-class.

        Expects ``pos`` to be just after the opening "(" and leaves it on the
        matching ")". Nested parentheses are balanced, and quoted strings are
        skipped as a unit so a ")" inside quotes does not end the argument.

        Returns:
            The argument text with surrounding whitespace stripped.

        Raises:
            SelectorError: If no matching ")" exists or a quoted string is
                unterminated.
        """
        start = self.pos
        depth = 1
        while self.pos < self.length:
            c = self.selector[self.pos]
            if c == '"' or c == "'":
                self._read_string(c)  # advances past the closing quote
                continue
            if c == "(":
                depth += 1
            elif c == ")":
                depth -= 1
                if depth == 0:
                    return self.selector[start : self.pos].strip()
            self.pos += 1
        raise SelectorError(f"Expected ) at position {self.pos}")

    def _tokenize_attribute(self, tokens):
        """Lex one attribute selector starting at "[" and append its tokens to ``tokens``."""
        self.pos += 1  # consume "["
        tokens.append(Token(TokenType.ATTR_START))
        self._skip_whitespace()

        attr_name = self._read_name()
        if not attr_name:
            raise SelectorError(f"Expected attribute name at position {self.pos}")
        tokens.append(Token(TokenType.TAG, attr_name))  # Reuse TAG for attr name
        self._skip_whitespace()

        next_ch = self._peek()
        if next_ch == "]":
            # Presence-only selector: [attr]
            self.pos += 1
            tokens.append(Token(TokenType.ATTR_END))
            return

        if next_ch == "=":
            self.pos += 1
            tokens.append(Token(TokenType.ATTR_OP, "="))
        elif next_ch and next_ch in "~|^$*":
            # The explicit truthiness test matters: "" (end of input) is
            # "in" every string and must not be taken for an operator prefix.
            self.pos += 1
            if self._peek() != "=":
                raise SelectorError(f"Expected = after {next_ch} at position {self.pos}")
            self.pos += 1
            tokens.append(Token(TokenType.ATTR_OP, next_ch + "="))
        else:
            raise SelectorError(f"Unexpected character in attribute selector: {next_ch!r}")

        self._skip_whitespace()

        quote = self._peek()
        if quote == '"' or quote == "'":
            value = self._read_string(quote)
        else:
            value = self._read_unquoted_attr_value()
        tokens.append(Token(TokenType.STRING, value))

        self._skip_whitespace()
        if self._peek() != "]":
            raise SelectorError(f"Expected ] at position {self.pos}")
        self.pos += 1
        tokens.append(Token(TokenType.ATTR_END))

    def _tokenize_pseudo(self, tokens):
        """Lex one pseudo-class starting at ":" and append its tokens to ``tokens``."""
        self.pos += 1  # consume ":"
        tokens.append(Token(TokenType.COLON))

        name = self._read_name()
        if not name:
            raise SelectorError(f"Expected pseudo-class name after : at position {self.pos}")
        tokens.append(Token(TokenType.TAG, name))  # Reuse TAG for the pseudo-class name

        if self._peek() != "(":
            return

        # Functional pseudo-class: the argument is kept as raw text in a
        # single STRING token (omitted entirely when the argument is empty).
        self.pos += 1
        tokens.append(Token(TokenType.PAREN_OPEN))
        self._skip_whitespace()
        arg = self._read_function_argument()
        if arg:
            tokens.append(Token(TokenType.STRING, arg))
        self.pos += 1  # consume the ")" that _read_function_argument stopped on
        tokens.append(Token(TokenType.PAREN_CLOSE))

    def tokenize(self):
        """Scan the whole selector and return its tokens.

        Returns:
            A list of Token objects ending with an EOF token.

        Raises:
            SelectorError: On an unexpected character, a missing name, an
                unterminated string, or an unclosed "[" / "(".
        """
        tokens = []
        pending_whitespace = False

        while self.pos < self.length:
            ch = self.selector[self.pos]

            # Whitespace is skipped but remembered: it may turn out to be a
            # descendant combinator.
            if ch in self._WHITESPACE:
                pending_whitespace = True
                self._skip_whitespace()
                continue

            # Explicit combinators swallow the whitespace on both sides.
            if ch in ">+~":
                pending_whitespace = False
                self.pos += 1
                self._skip_whitespace()
                tokens.append(Token(TokenType.COMBINATOR, ch))
                continue

            # Whitespace between two selector parts is a descendant combinator,
            # unless what follows is a comma. The flag is cleared in every
            # case so whitespace *before* a comma ("a , b") cannot leak into a
            # combinator after it.
            if pending_whitespace and tokens and ch != ",":
                tokens.append(Token(TokenType.COMBINATOR, " "))
            pending_whitespace = False

            if ch == "*":
                self.pos += 1
                tokens.append(Token(TokenType.UNIVERSAL))
            elif ch == "#":
                self.pos += 1
                name = self._read_name()
                if not name:
                    raise SelectorError(f"Expected identifier after # at position {self.pos}")
                tokens.append(Token(TokenType.ID, name))
            elif ch == ".":
                self.pos += 1
                name = self._read_name()
                if not name:
                    raise SelectorError(f"Expected identifier after . at position {self.pos}")
                tokens.append(Token(TokenType.CLASS, name))
            elif ch == "[":
                self._tokenize_attribute(tokens)
            elif ch == ",":
                # Selector grouping; trailing whitespace is consumed here.
                self.pos += 1
                self._skip_whitespace()
                tokens.append(Token(TokenType.COMMA))
            elif ch == ":":
                self._tokenize_pseudo(tokens)
            elif self._is_name_start(ch):
                # Tag names are lower-cased here because they are case-insensitive.
                tokens.append(Token(TokenType.TAG, self._read_name().lower()))
            else:
                raise SelectorError(f"Unexpected character {ch!r} at position {self.pos}")

        tokens.append(Token(TokenType.EOF))
        return tokens
277
+
278
+
279
+ # AST Node types for parsed selectors
280
+
281
+
282
class SimpleSelector:
    """One simple selector: tag, id, class, universal, attribute, or pseudo-class.

    Attributes:
        type: One of the TYPE_* constants below.
        name: Tag / id / class / attribute / pseudo-class name, if any.
        operator: Attribute comparison operator, or None.
        value: Attribute comparison value, or None.
        arg: Raw argument text of a functional pseudo-class, or None.
    """

    __slots__ = ("arg", "name", "operator", "type", "value")

    TYPE_TAG = "tag"
    TYPE_ID = "id"
    TYPE_CLASS = "class"
    TYPE_UNIVERSAL = "universal"
    TYPE_ATTR = "attr"
    TYPE_PSEUDO = "pseudo"

    def __init__(self, selector_type, name=None, operator=None, value=None, arg=None):
        self.type = selector_type
        self.name = name
        self.operator = operator
        self.value = value
        self.arg = arg  # For :not() and :nth-child()

    def __repr__(self):
        # name/operator are shown only when truthy; value/arg whenever set
        # (an empty-string value is still displayed).
        fields = [repr(self.type)]
        if self.name:
            fields.append(f"name={self.name!r}")
        if self.operator:
            fields.append(f"op={self.operator!r}")
        if self.value is not None:
            fields.append(f"value={self.value!r}")
        if self.arg is not None:
            fields.append(f"arg={self.arg!r}")
        return "SimpleSelector(" + ", ".join(fields) + ")"
313
+
314
+
315
class CompoundSelector:
    """Several simple selectors that apply to the same element (e.g., div.foo#bar)."""

    __slots__ = ("selectors",)

    def __init__(self, selectors=None):
        # A missing or empty argument yields a fresh list.
        self.selectors = selectors if selectors else []

    def __repr__(self):
        return "CompoundSelector({!r})".format(self.selectors)
325
+
326
+
327
class ComplexSelector:
    """Compound selectors joined by combinators.

    Attributes:
        parts: List of ``(combinator, compound_selector)`` tuples in source
            order; the first tuple's combinator is None.
    """

    __slots__ = ("parts",)

    def __init__(self):
        # Starts empty; entries are appended by whoever builds the selector.
        self.parts = []

    def __repr__(self):
        return "ComplexSelector({!r})".format(self.parts)
339
+
340
+
341
class SelectorList:
    """The alternatives of a comma-separated selector group."""

    __slots__ = ("selectors",)

    def __init__(self, selectors=None):
        # A missing or empty argument yields a fresh list.
        self.selectors = selectors if selectors else []

    def __repr__(self):
        return "SelectorList({!r})".format(self.selectors)
351
+
352
+
353
class SelectorParser:
    """Parses a list of tokens into a selector AST.

    Attributes:
        tokens: The token list being parsed.
        pos: Index of the next unread token.
    """

    __slots__ = ("pos", "tokens")

    def __init__(self, tokens):
        self.tokens = tokens
        self.pos = 0

    def _peek(self):
        """Return the current token without consuming it (a fresh EOF token past the end)."""
        if self.pos < len(self.tokens):
            return self.tokens[self.pos]
        return Token(TokenType.EOF)

    def _advance(self):
        """Return the current token and move to the next one."""
        token = self._peek()
        self.pos += 1
        return token

    def _expect(self, token_type):
        """Consume and return the current token, which must be of ``token_type``.

        Raises:
            SelectorError: If the current token has a different type.
        """
        token = self._peek()
        if token.type != token_type:
            raise SelectorError(f"Expected {token_type}, got {token.type}")
        return self._advance()

    def parse(self):
        """Parse a complete selector (possibly a comma-separated list).

        Empty alternatives between or around commas are skipped, but at least
        one real selector must be present.

        Returns:
            A ComplexSelector when exactly one selector was parsed, otherwise
            a SelectorList of ComplexSelector objects.

        Raises:
            SelectorError: If tokens remain after the selector list, or if
                the input contains no selector at all (e.g. only commas).
        """
        selectors = []

        first = self._parse_complex_selector()
        if first is not None:
            selectors.append(first)

        while self._peek().type == TokenType.COMMA:
            self._advance()  # consume comma
            selector = self._parse_complex_selector()
            if selector is not None:
                selectors.append(selector)

        if self._peek().type != TokenType.EOF:
            raise SelectorError(f"Unexpected token: {self._peek()}")

        # Never hand back None (or a list containing None): input that held
        # no selector at all is invalid.
        if not selectors:
            raise SelectorError("Expected selector")

        if len(selectors) == 1:
            return selectors[0]
        return SelectorList(selectors)

    def _parse_complex_selector(self):
        """Parse compound selectors joined by combinators.

        Returns:
            A ComplexSelector, or None if no compound selector starts at the
            current token.

        Raises:
            SelectorError: If a combinator is not followed by a compound selector.
        """
        compound = self._parse_compound_selector()
        if not compound:
            return None

        complex_sel = ComplexSelector()
        complex_sel.parts.append((None, compound))  # first part has no combinator

        while self._peek().type == TokenType.COMBINATOR:
            combinator = self._advance().value
            compound = self._parse_compound_selector()
            if not compound:
                raise SelectorError("Expected selector after combinator")
            complex_sel.parts.append((combinator, compound))

        return complex_sel

    def _parse_compound_selector(self):
        """Parse a run of adjacent simple selectors.

        Returns:
            A CompoundSelector, or None if the current token does not start a
            simple selector.
        """
        # Token kinds that map directly onto a named simple selector.
        named = {
            TokenType.TAG: SimpleSelector.TYPE_TAG,
            TokenType.ID: SimpleSelector.TYPE_ID,
            TokenType.CLASS: SimpleSelector.TYPE_CLASS,
        }
        simple_selectors = []

        while True:
            token = self._peek()
            kind = token.type

            if kind in named:
                self._advance()
                simple_selectors.append(SimpleSelector(named[kind], name=token.value))
            elif kind == TokenType.UNIVERSAL:
                self._advance()
                simple_selectors.append(SimpleSelector(SimpleSelector.TYPE_UNIVERSAL))
            elif kind == TokenType.ATTR_START:
                simple_selectors.append(self._parse_attribute_selector())
            elif kind == TokenType.COLON:
                simple_selectors.append(self._parse_pseudo_selector())
            else:
                break

        if not simple_selectors:
            return None
        return CompoundSelector(simple_selectors)

    def _parse_attribute_selector(self):
        """Parse an attribute selector: [attr] or [attr<op>value].

        Raises:
            SelectorError: If the token sequence is not a well-formed
                attribute selector.
        """
        self._expect(TokenType.ATTR_START)
        attr_name = self._expect(TokenType.TAG).value

        if self._peek().type == TokenType.ATTR_END:
            # Presence-only form: no operator or value.
            self._advance()
            return SimpleSelector(SimpleSelector.TYPE_ATTR, name=attr_name)

        operator = self._expect(TokenType.ATTR_OP).value
        value = self._expect(TokenType.STRING).value
        self._expect(TokenType.ATTR_END)

        return SimpleSelector(SimpleSelector.TYPE_ATTR, name=attr_name, operator=operator, value=value)

    def _parse_pseudo_selector(self):
        """Parse a pseudo-class such as :first-child or :not(selector).

        For the functional form, ``arg`` is the raw argument text, or None
        when the parentheses are empty.

        Raises:
            SelectorError: If the name is missing or the ")" is missing.
        """
        self._expect(TokenType.COLON)
        name = self._expect(TokenType.TAG).value

        if self._peek().type != TokenType.PAREN_OPEN:
            return SimpleSelector(SimpleSelector.TYPE_PSEUDO, name=name)

        # Functional pseudo-class
        self._advance()
        arg = None
        if self._peek().type == TokenType.STRING:
            arg = self._advance().value
        self._expect(TokenType.PAREN_CLOSE)
        return SimpleSelector(SimpleSelector.TYPE_PSEUDO, name=name, arg=arg)
485
+
486
+
487
class SelectorMatcher:
    """Matches parsed selectors against DOM nodes.

    The matcher keeps no state of its own (empty ``__slots__``). Nodes are
    accessed through ``name``, ``parent``, ``children``, ``attrs``,
    ``has_child_nodes()`` and, for text nodes, ``data``.
    """

    __slots__ = ()

    def matches(self, node, selector):
        """Check if a node matches a parsed selector.

        Args:
            node: The DOM node to test.
            selector: A SelectorList, ComplexSelector, CompoundSelector or
                SimpleSelector. Any other object never matches.

        Returns:
            True if the node matches, False otherwise.
        """
        if isinstance(selector, SelectorList):
            return any(self.matches(node, sel) for sel in selector.selectors)
        if isinstance(selector, ComplexSelector):
            return self._matches_complex(node, selector)
        if isinstance(selector, CompoundSelector):
            return self._matches_compound(node, selector)
        if isinstance(selector, SimpleSelector):
            return self._matches_simple(node, selector)
        return False

    def _matches_complex(self, node, selector):
        """Match a complex selector (compound selectors joined by combinators).

        The rightmost compound must match ``node``; the remaining parts are
        then resolved right-to-left with backtracking, so a nearer ancestor
        or sibling that matches but leads to a dead end does not hide a
        farther one that works.
        """
        parts = selector.parts
        if not parts:
            return False

        last = len(parts) - 1
        if not self._matches_compound(node, parts[last][1]):
            return False
        return self._matches_chain(node, parts, last)

    def _matches_chain(self, node, parts, index):
        """Return True if ``parts[:index]`` can be satisfied relative to ``node``.

        ``node`` is assumed to already match ``parts[index]``.
        """
        if index == 0:
            return True

        combinator = parts[index][0]
        compound = parts[index - 1][1]
        for candidate in self._iter_candidates(node, combinator):
            if self._matches_compound(candidate, compound) and self._matches_chain(candidate, parts, index - 1):
                return True
        return False

    def _iter_candidates(self, node, combinator):
        """Yield the nodes that ``combinator`` relates to ``node``, nearest first."""
        if combinator == " ":  # Descendant: every ancestor
            ancestor = node.parent
            while ancestor:
                yield ancestor
                ancestor = ancestor.parent
        elif combinator == ">":  # Child: the parent only
            parent = node.parent
            if parent:
                yield parent
        elif combinator == "+":  # Adjacent sibling: the previous element only
            sibling = self._get_previous_sibling(node)
            if sibling:
                yield sibling
        else:  # combinator == "~" - General sibling: every previous element
            yield from self._get_previous_siblings(node)

    def _matches_compound(self, node, compound):
        """Match a compound selector (all simple selectors must match)."""
        return all(self._matches_simple(node, simple) for simple in compound.selectors)

    def _matches_simple(self, node, selector):
        """Match a simple selector against a node."""
        # Nodes without a name, or whose name starts with "#" (such as
        # "#text"), are not elements and never match.
        if not hasattr(node, "name") or node.name.startswith("#"):
            return False

        sel_type = selector.type

        if sel_type == SimpleSelector.TYPE_UNIVERSAL:
            return True

        if sel_type == SimpleSelector.TYPE_TAG:
            # HTML tag names are case-insensitive
            return node.name.lower() == selector.name.lower()

        if sel_type == SimpleSelector.TYPE_ID:
            node_id = node.attrs.get("id", "") if node.attrs else ""
            return node_id == selector.name

        if sel_type == SimpleSelector.TYPE_CLASS:
            class_attr = node.attrs.get("class", "") if node.attrs else ""
            classes = class_attr.split() if class_attr else []
            return selector.name in classes

        if sel_type == SimpleSelector.TYPE_ATTR:
            return self._matches_attribute(node, selector)

        if sel_type == SimpleSelector.TYPE_PSEUDO:
            return self._matches_pseudo(node, selector)

        return False

    def _matches_attribute(self, node, selector):
        """Match an attribute selector.

        Attribute names compare case-insensitively; values compare exactly.
        An attribute whose stored value is None is treated as absent.
        """
        attrs = node.attrs or {}
        attr_name = selector.name.lower()  # Attribute names are case-insensitive in HTML

        # First attribute whose lower-cased name matches.
        attr_value = None
        for name, value in attrs.items():
            if name.lower() == attr_name:
                attr_value = value
                break

        if attr_value is None:
            return False

        # Presence check only
        if selector.operator is None:
            return True

        value = selector.value
        op = selector.operator

        if op == "=":
            return attr_value == value

        if op == "~=":
            # Space-separated word match
            words = attr_value.split() if attr_value else []
            return value in words

        if op == "|=":
            # Exact match or hyphen-separated prefix match
            # (e.g., [lang|="en"] matches lang="en" and lang="en-US")
            return attr_value == value or attr_value.startswith(value + "-")

        # The substring operators never match an empty selector value.
        if op == "^=":
            return attr_value.startswith(value) if value else False

        if op == "$=":
            return attr_value.endswith(value) if value else False

        if op == "*=":
            return value in attr_value if value else False

        return False

    def _matches_pseudo(self, node, selector):
        """Match a pseudo-class selector.

        Raises:
            SelectorError: If the pseudo-class name is not supported.
        """
        name = selector.name.lower()

        if name == "first-child":
            return self._is_first_child(node)

        if name == "last-child":
            return self._is_last_child(node)

        if name == "nth-child":
            return self._matches_nth_child(node, selector.arg)

        if name == "not":
            # :not() with no argument excludes nothing.
            if not selector.arg:
                return True
            # The argument is stored as raw text and parsed on each call.
            inner = parse_selector(selector.arg)
            return not self.matches(node, inner)

        if name == "only-child":
            return self._is_first_child(node) and self._is_last_child(node)

        if name == "empty":
            if not node.has_child_nodes():
                return True
            # Still empty if it holds only whitespace text and other
            # non-element, non-text "#..." nodes.
            for child in node.children:
                if hasattr(child, "name"):
                    if child.name == "#text":
                        if child.data and child.data.strip():
                            return False
                    elif not child.name.startswith("#"):
                        return False
            return True

        if name == "root":
            # An element whose parent is the document or a document fragment.
            parent = node.parent
            if parent and hasattr(parent, "name"):
                return parent.name in ("#document", "#document-fragment")
            return False

        if name == "first-of-type":
            return self._is_first_of_type(node)

        if name == "last-of-type":
            return self._is_last_of_type(node)

        if name == "nth-of-type":
            return self._matches_nth_of_type(node, selector.arg)

        if name == "only-of-type":
            return self._is_first_of_type(node) and self._is_last_of_type(node)

        # Unknown pseudo-class: fail loudly instead of silently not matching.
        raise SelectorError(f"Unsupported pseudo-class: :{name}")

    def _get_element_children(self, parent):
        """Get only element children (exclude text, comments, etc.)."""
        if not parent or not parent.has_child_nodes():
            return []
        return [c for c in parent.children if hasattr(c, "name") and not c.name.startswith("#")]

    def _get_previous_sibling(self, node):
        """Get the previous element sibling. Returns None if node is first or not found."""
        parent = node.parent
        if not parent:
            return None

        prev = None
        for child in parent.children:
            if child is node:
                return prev
            if hasattr(child, "name") and not child.name.startswith("#"):
                prev = child
        return None  # node not in parent.children (detached)

    def _get_previous_siblings(self, node):
        """Get all element siblings before node, nearest first.

        Returns an empty list if node has no parent or is not among its
        parent's children.
        """
        parent = node.parent
        if not parent:
            return []

        preceding = []
        for child in parent.children:
            if child is node:
                preceding.reverse()
                return preceding
            if hasattr(child, "name") and not child.name.startswith("#"):
                preceding.append(child)
        return []  # node not in parent.children (detached)

    def _is_first_child(self, node):
        """Check if node is the first element child of its parent."""
        parent = node.parent
        if not parent:
            return False
        elements = self._get_element_children(parent)
        # bool() keeps the result a real boolean when there are no elements.
        return bool(elements) and elements[0] is node

    def _is_last_child(self, node):
        """Check if node is the last element child of its parent."""
        parent = node.parent
        if not parent:
            return False
        elements = self._get_element_children(parent)
        return bool(elements) and elements[-1] is node

    def _is_first_of_type(self, node):
        """Check if node is the first sibling of its type (case-insensitive name)."""
        parent = node.parent
        if not parent:
            return False
        node_name = node.name.lower()
        for child in self._get_element_children(parent):
            if child.name.lower() == node_name:
                return child is node
        return False

    def _is_last_of_type(self, node):
        """Check if node is the last sibling of its type (case-insensitive name)."""
        parent = node.parent
        if not parent:
            return False
        node_name = node.name.lower()
        last_of_type = None
        for child in self._get_element_children(parent):
            if child.name.lower() == node_name:
                last_of_type = child
        return last_of_type is node

    def _parse_nth_expression(self, expr):
        """Parse an nth-child expression like '2n+1', 'odd', 'even', '3'.

        Returns:
            An ``(a, b)`` tuple for the formula a*n + b, or None when the
            expression is empty or cannot be parsed.
        """
        if not expr:
            return None

        expr = expr.strip().lower()

        if expr == "odd":
            return (2, 1)  # 2n+1
        if expr == "even":
            return (2, 0)  # 2n

        # Parse An+B syntax
        # Handle formats: n, 2n, 2n+1, -n+2, 3, etc.
        a = 0
        b = 0

        # Remove all spaces
        expr = expr.replace(" ", "")

        if "n" in expr:
            parts = expr.split("n")
            a_part = parts[0]
            b_part = parts[1] if len(parts) > 1 else ""

            if a_part == "" or a_part == "+":
                a = 1
            elif a_part == "-":
                a = -1
            else:
                try:
                    a = int(a_part)
                except ValueError:
                    return None

            if b_part:
                try:
                    b = int(b_part)
                except ValueError:
                    return None
        else:
            # Just a number
            try:
                b = int(expr)
            except ValueError:
                return None

        return (a, b)

    def _matches_nth(self, index, a, b):
        """Check if 1-based index matches An+B formula."""
        if a == 0:
            return index == b
        # Solve: index = a*n + b for non-negative integer n
        # n = (index - b) / a
        diff = index - b
        if a > 0:
            return diff >= 0 and diff % a == 0
        # a < 0: need diff <= 0 and diff divisible by abs(a)
        return diff <= 0 and diff % a == 0

    def _matches_nth_child(self, node, arg):
        """Match :nth-child(An+B); an unparseable argument never matches."""
        parent = node.parent
        if not parent:
            return False

        parsed = self._parse_nth_expression(arg)
        if parsed is None:
            return False
        a, b = parsed

        for position, child in enumerate(self._get_element_children(parent), start=1):
            if child is node:
                return self._matches_nth(position, a, b)
        return False

    def _matches_nth_of_type(self, node, arg):
        """Match :nth-of-type(An+B); an unparseable argument never matches."""
        parent = node.parent
        if not parent:
            return False

        parsed = self._parse_nth_expression(arg)
        if parsed is None:
            return False
        a, b = parsed

        node_name = node.name.lower()
        type_index = 0
        for child in self._get_element_children(parent):
            if child.name.lower() == node_name:
                type_index += 1
                if child is node:
                    return self._matches_nth(type_index, a, b)
        return False
859
+
860
+
861
def parse_selector(selector_string):
    """Turn a CSS selector string into its parsed AST.

    Surrounding whitespace is stripped before tokenizing.

    Raises:
        SelectorError: If the string is empty or contains only whitespace
            ("Empty selector"), or if tokenizing/parsing rejects it.
    """
    stripped = selector_string.strip() if selector_string else ""
    if not stripped:
        raise SelectorError("Empty selector")

    token_stream = SelectorTokenizer(stripped).tokenize()
    return SelectorParser(token_stream).parse()
870
+
871
+
872
+ # Module-level SelectorMatcher instance used by _query_descendants() and matches()
873
+ _matcher = SelectorMatcher()
874
+
875
+
876
def query(root, selector_string):
    """
    Collect every element below root that matches a CSS selector.

    Only descendants of root are examined; root itself is never part of the
    result (the same behavior as querySelectorAll in browsers).

    Args:
        root: The node whose descendants are searched
        selector_string: A CSS selector string

    Returns:
        A list of matching nodes
    """
    found = []
    _query_descendants(root, parse_selector(selector_string), found)
    return found
894
+
895
+
896
+ def _query_descendants(node, selector, results):
897
+ """Recursively search for matching nodes in descendants."""
898
+ # Only recurse into children (not the node itself)
899
+ if node.has_child_nodes():
900
+ for child in node.children:
901
+ # Check if this child matches
902
+ if hasattr(child, "name") and not child.name.startswith("#"):
903
+ if _matcher.matches(child, selector):
904
+ results.append(child)
905
+ # Recurse into child's descendants
906
+ _query_descendants(child, selector, results)
907
+
908
+ # Also check template content if present
909
+ if hasattr(node, "template_content") and node.template_content:
910
+ _query_descendants(node.template_content, selector, results)
911
+
912
+
913
def matches(node, selector_string):
    """
    Test a single node against a CSS selector.

    Args:
        node: The node to check
        selector_string: A CSS selector string

    Returns:
        True if the node matches, False otherwise
    """
    return _matcher.matches(node, parse_selector(selector_string))