justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- justhtml/__init__.py +28 -0
- justhtml/__main__.py +161 -13
- justhtml/constants.py +17 -1
- justhtml/context.py +7 -1
- justhtml/encoding.py +405 -0
- justhtml/entities.py +57 -17
- justhtml/errors.py +20 -4
- justhtml/linkify.py +438 -0
- justhtml/node.py +738 -41
- justhtml/parser.py +188 -21
- justhtml/py.typed +0 -0
- justhtml/sanitize.py +1141 -0
- justhtml/selector.py +240 -104
- justhtml/serialize.py +418 -57
- justhtml/stream.py +34 -10
- justhtml/tokenizer.py +433 -289
- justhtml/tokens.py +91 -23
- justhtml/transforms.py +690 -0
- justhtml/treebuilder.py +196 -111
- justhtml/treebuilder_modes.py +191 -117
- justhtml/treebuilder_utils.py +11 -4
- justhtml-0.33.0.dist-info/METADATA +196 -0
- justhtml-0.33.0.dist-info/RECORD +26 -0
- justhtml-0.33.0.dist-info/entry_points.txt +2 -0
- {justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.6.0.dist-info/METADATA +0 -126
- justhtml-0.6.0.dist-info/RECORD +0 -20
- {justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/WHEEL +0 -0
justhtml/selector.py
CHANGED
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
# CSS Selector implementation for JustHTML
|
|
2
2
|
# Supports a subset of CSS selectors for querying the DOM
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
from functools import lru_cache
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
4
9
|
|
|
5
10
|
class SelectorError(ValueError):
|
|
6
11
|
"""Raised when a CSS selector is invalid."""
|
|
@@ -8,30 +13,33 @@ class SelectorError(ValueError):
|
|
|
8
13
|
|
|
9
14
|
# Token types for the CSS selector lexer
|
|
10
15
|
class TokenType:
|
|
11
|
-
TAG = "TAG" # div, span, etc.
|
|
12
|
-
ID = "ID" # #foo
|
|
13
|
-
CLASS = "CLASS" # .bar
|
|
14
|
-
UNIVERSAL = "UNIVERSAL" # *
|
|
15
|
-
ATTR_START = "ATTR_START" # [
|
|
16
|
-
ATTR_END = "ATTR_END" # ]
|
|
17
|
-
ATTR_OP = "ATTR_OP" # =, ~=, |=, ^=, $=, *=
|
|
18
|
-
STRING = "STRING" # "value" or 'value' or unquoted
|
|
19
|
-
COMBINATOR = "COMBINATOR" # >, +, ~, or whitespace (descendant)
|
|
20
|
-
COMMA = "COMMA" # ,
|
|
21
|
-
COLON = "COLON" # :
|
|
22
|
-
PAREN_OPEN = "PAREN_OPEN" # (
|
|
23
|
-
PAREN_CLOSE = "PAREN_CLOSE" # )
|
|
24
|
-
EOF = "EOF"
|
|
16
|
+
TAG: str = "TAG" # div, span, etc.
|
|
17
|
+
ID: str = "ID" # #foo
|
|
18
|
+
CLASS: str = "CLASS" # .bar
|
|
19
|
+
UNIVERSAL: str = "UNIVERSAL" # *
|
|
20
|
+
ATTR_START: str = "ATTR_START" # [
|
|
21
|
+
ATTR_END: str = "ATTR_END" # ]
|
|
22
|
+
ATTR_OP: str = "ATTR_OP" # =, ~=, |=, ^=, $=, *=
|
|
23
|
+
STRING: str = "STRING" # "value" or 'value' or unquoted
|
|
24
|
+
COMBINATOR: str = "COMBINATOR" # >, +, ~, or whitespace (descendant)
|
|
25
|
+
COMMA: str = "COMMA" # ,
|
|
26
|
+
COLON: str = "COLON" # :
|
|
27
|
+
PAREN_OPEN: str = "PAREN_OPEN" # (
|
|
28
|
+
PAREN_CLOSE: str = "PAREN_CLOSE" # )
|
|
29
|
+
EOF: str = "EOF"
|
|
25
30
|
|
|
26
31
|
|
|
27
32
|
class Token:
|
|
28
33
|
__slots__ = ("type", "value")
|
|
29
34
|
|
|
30
|
-
|
|
35
|
+
type: str
|
|
36
|
+
value: str | None
|
|
37
|
+
|
|
38
|
+
def __init__(self, token_type: str, value: str | None = None) -> None:
|
|
31
39
|
self.type = token_type
|
|
32
40
|
self.value = value
|
|
33
41
|
|
|
34
|
-
def __repr__(self):
|
|
42
|
+
def __repr__(self) -> str:
|
|
35
43
|
return f"Token({self.type}, {self.value!r})"
|
|
36
44
|
|
|
37
45
|
|
|
@@ -40,45 +48,49 @@ class SelectorTokenizer:
|
|
|
40
48
|
|
|
41
49
|
__slots__ = ("length", "pos", "selector")
|
|
42
50
|
|
|
43
|
-
|
|
51
|
+
selector: str
|
|
52
|
+
pos: int
|
|
53
|
+
length: int
|
|
54
|
+
|
|
55
|
+
def __init__(self, selector: str) -> None:
|
|
44
56
|
self.selector = selector
|
|
45
57
|
self.pos = 0
|
|
46
58
|
self.length = len(selector)
|
|
47
59
|
|
|
48
|
-
def _peek(self, offset=0):
|
|
60
|
+
def _peek(self, offset: int = 0) -> str:
|
|
49
61
|
pos = self.pos + offset
|
|
50
62
|
if pos < self.length:
|
|
51
63
|
return self.selector[pos]
|
|
52
64
|
return ""
|
|
53
65
|
|
|
54
|
-
def _advance(self):
|
|
66
|
+
def _advance(self) -> str:
|
|
55
67
|
ch = self._peek()
|
|
56
68
|
self.pos += 1
|
|
57
69
|
return ch
|
|
58
70
|
|
|
59
|
-
def _skip_whitespace(self):
|
|
71
|
+
def _skip_whitespace(self) -> None:
|
|
60
72
|
while self.pos < self.length and self.selector[self.pos] in " \t\n\r\f":
|
|
61
73
|
self.pos += 1
|
|
62
74
|
|
|
63
|
-
def _is_name_start(self, ch):
|
|
75
|
+
def _is_name_start(self, ch: str) -> bool:
|
|
64
76
|
# CSS identifier start: letter, underscore, or non-ASCII
|
|
65
77
|
return ch.isalpha() or ch == "_" or ch == "-" or ord(ch) > 127
|
|
66
78
|
|
|
67
|
-
def _is_name_char(self, ch):
|
|
79
|
+
def _is_name_char(self, ch: str) -> bool:
|
|
68
80
|
# CSS identifier continuation: name-start or digit
|
|
69
81
|
return self._is_name_start(ch) or ch.isdigit()
|
|
70
82
|
|
|
71
|
-
def _read_name(self):
|
|
83
|
+
def _read_name(self) -> str:
|
|
72
84
|
start = self.pos
|
|
73
85
|
while self.pos < self.length and self._is_name_char(self.selector[self.pos]):
|
|
74
86
|
self.pos += 1
|
|
75
87
|
return self.selector[start : self.pos]
|
|
76
88
|
|
|
77
|
-
def _read_string(self, quote):
|
|
89
|
+
def _read_string(self, quote: str) -> str:
|
|
78
90
|
# Skip opening quote
|
|
79
91
|
self.pos += 1
|
|
80
92
|
start = self.pos
|
|
81
|
-
parts = []
|
|
93
|
+
parts: list[str] = []
|
|
82
94
|
|
|
83
95
|
while self.pos < self.length:
|
|
84
96
|
ch = self.selector[self.pos]
|
|
@@ -105,7 +117,7 @@ class SelectorTokenizer:
|
|
|
105
117
|
|
|
106
118
|
raise SelectorError(f"Unterminated string in selector: {self.selector!r}")
|
|
107
119
|
|
|
108
|
-
def _read_unquoted_attr_value(self):
|
|
120
|
+
def _read_unquoted_attr_value(self) -> str:
|
|
109
121
|
# Read an unquoted attribute value (CSS identifier)
|
|
110
122
|
start = self.pos
|
|
111
123
|
while self.pos < self.length:
|
|
@@ -115,8 +127,8 @@ class SelectorTokenizer:
|
|
|
115
127
|
self.pos += 1
|
|
116
128
|
return self.selector[start : self.pos]
|
|
117
129
|
|
|
118
|
-
def tokenize(self):
|
|
119
|
-
tokens = []
|
|
130
|
+
def tokenize(self) -> list[Token]:
|
|
131
|
+
tokens: list[Token] = []
|
|
120
132
|
pending_whitespace = False
|
|
121
133
|
|
|
122
134
|
while self.pos < self.length:
|
|
@@ -284,21 +296,34 @@ class SimpleSelector:
|
|
|
284
296
|
|
|
285
297
|
__slots__ = ("arg", "name", "operator", "type", "value")
|
|
286
298
|
|
|
287
|
-
TYPE_TAG = "tag"
|
|
288
|
-
TYPE_ID = "id"
|
|
289
|
-
TYPE_CLASS = "class"
|
|
290
|
-
TYPE_UNIVERSAL = "universal"
|
|
291
|
-
TYPE_ATTR = "attr"
|
|
292
|
-
TYPE_PSEUDO = "pseudo"
|
|
293
|
-
|
|
294
|
-
|
|
299
|
+
TYPE_TAG: str = "tag"
|
|
300
|
+
TYPE_ID: str = "id"
|
|
301
|
+
TYPE_CLASS: str = "class"
|
|
302
|
+
TYPE_UNIVERSAL: str = "universal"
|
|
303
|
+
TYPE_ATTR: str = "attr"
|
|
304
|
+
TYPE_PSEUDO: str = "pseudo"
|
|
305
|
+
|
|
306
|
+
type: str
|
|
307
|
+
name: str | None
|
|
308
|
+
operator: str | None
|
|
309
|
+
value: str | None
|
|
310
|
+
arg: str | None
|
|
311
|
+
|
|
312
|
+
def __init__(
|
|
313
|
+
self,
|
|
314
|
+
selector_type: str,
|
|
315
|
+
name: str | None = None,
|
|
316
|
+
operator: str | None = None,
|
|
317
|
+
value: str | None = None,
|
|
318
|
+
arg: str | None = None,
|
|
319
|
+
) -> None:
|
|
295
320
|
self.type = selector_type
|
|
296
321
|
self.name = name
|
|
297
322
|
self.operator = operator
|
|
298
323
|
self.value = value
|
|
299
324
|
self.arg = arg # For :not() and :nth-child()
|
|
300
325
|
|
|
301
|
-
def __repr__(self):
|
|
326
|
+
def __repr__(self) -> str:
|
|
302
327
|
parts = [f"SimpleSelector({self.type!r}"]
|
|
303
328
|
if self.name:
|
|
304
329
|
parts.append(f", name={self.name!r}")
|
|
@@ -317,10 +342,12 @@ class CompoundSelector:
|
|
|
317
342
|
|
|
318
343
|
__slots__ = ("selectors",)
|
|
319
344
|
|
|
320
|
-
|
|
345
|
+
selectors: list[SimpleSelector]
|
|
346
|
+
|
|
347
|
+
def __init__(self, selectors: list[SimpleSelector] | None = None) -> None:
|
|
321
348
|
self.selectors = selectors or []
|
|
322
349
|
|
|
323
|
-
def __repr__(self):
|
|
350
|
+
def __repr__(self) -> str:
|
|
324
351
|
return f"CompoundSelector({self.selectors!r})"
|
|
325
352
|
|
|
326
353
|
|
|
@@ -329,12 +356,14 @@ class ComplexSelector:
|
|
|
329
356
|
|
|
330
357
|
__slots__ = ("parts",)
|
|
331
358
|
|
|
332
|
-
|
|
359
|
+
parts: list[tuple[str | None, CompoundSelector]]
|
|
360
|
+
|
|
361
|
+
def __init__(self) -> None:
|
|
333
362
|
# List of (combinator, compound_selector) tuples
|
|
334
363
|
# First item has combinator=None
|
|
335
364
|
self.parts = []
|
|
336
365
|
|
|
337
|
-
def __repr__(self):
|
|
366
|
+
def __repr__(self) -> str:
|
|
338
367
|
return f"ComplexSelector({self.parts!r})"
|
|
339
368
|
|
|
340
369
|
|
|
@@ -343,43 +372,55 @@ class SelectorList:
|
|
|
343
372
|
|
|
344
373
|
__slots__ = ("selectors",)
|
|
345
374
|
|
|
346
|
-
|
|
375
|
+
selectors: list[ComplexSelector]
|
|
376
|
+
|
|
377
|
+
def __init__(self, selectors: list[ComplexSelector] | None = None) -> None:
|
|
347
378
|
self.selectors = selectors or []
|
|
348
379
|
|
|
349
|
-
def __repr__(self):
|
|
380
|
+
def __repr__(self) -> str:
|
|
350
381
|
return f"SelectorList({self.selectors!r})"
|
|
351
382
|
|
|
352
383
|
|
|
384
|
+
# Type alias for parsed selectors
|
|
385
|
+
ParsedSelector = ComplexSelector | SelectorList
|
|
386
|
+
|
|
387
|
+
|
|
353
388
|
class SelectorParser:
|
|
354
389
|
"""Parses a list of tokens into a selector AST."""
|
|
355
390
|
|
|
356
391
|
__slots__ = ("pos", "tokens")
|
|
357
392
|
|
|
358
|
-
|
|
393
|
+
tokens: list[Token]
|
|
394
|
+
pos: int
|
|
395
|
+
|
|
396
|
+
def __init__(self, tokens: list[Token]) -> None:
|
|
359
397
|
self.tokens = tokens
|
|
360
398
|
self.pos = 0
|
|
361
399
|
|
|
362
|
-
def _peek(self):
|
|
400
|
+
def _peek(self) -> Token:
|
|
363
401
|
if self.pos < len(self.tokens):
|
|
364
402
|
return self.tokens[self.pos]
|
|
365
403
|
return Token(TokenType.EOF)
|
|
366
404
|
|
|
367
|
-
def _advance(self):
|
|
405
|
+
def _advance(self) -> Token:
|
|
368
406
|
token = self._peek()
|
|
369
407
|
self.pos += 1
|
|
370
408
|
return token
|
|
371
409
|
|
|
372
|
-
def _expect(self, token_type):
|
|
410
|
+
def _expect(self, token_type: str) -> Token:
|
|
373
411
|
token = self._peek()
|
|
374
412
|
if token.type != token_type:
|
|
375
413
|
raise SelectorError(f"Expected {token_type}, got {token.type}")
|
|
376
414
|
return self._advance()
|
|
377
415
|
|
|
378
|
-
def parse(self):
|
|
416
|
+
def parse(self) -> ParsedSelector:
|
|
379
417
|
"""Parse a complete selector (possibly comma-separated list)."""
|
|
380
|
-
selectors = []
|
|
418
|
+
selectors: list[ComplexSelector] = []
|
|
381
419
|
# parse_selector() validates non-empty input, so first selector always exists
|
|
382
|
-
|
|
420
|
+
first = self._parse_complex_selector()
|
|
421
|
+
if first is None: # pragma: no cover
|
|
422
|
+
raise SelectorError("Empty selector")
|
|
423
|
+
selectors.append(first)
|
|
383
424
|
|
|
384
425
|
while self._peek().type == TokenType.COMMA:
|
|
385
426
|
self._advance() # consume comma
|
|
@@ -394,7 +435,7 @@ class SelectorParser:
|
|
|
394
435
|
return selectors[0]
|
|
395
436
|
return SelectorList(selectors)
|
|
396
437
|
|
|
397
|
-
def _parse_complex_selector(self):
|
|
438
|
+
def _parse_complex_selector(self) -> ComplexSelector | None:
|
|
398
439
|
"""Parse a complex selector (compound selectors with combinators)."""
|
|
399
440
|
complex_sel = ComplexSelector()
|
|
400
441
|
|
|
@@ -414,9 +455,9 @@ class SelectorParser:
|
|
|
414
455
|
|
|
415
456
|
return complex_sel
|
|
416
457
|
|
|
417
|
-
def _parse_compound_selector(self):
|
|
458
|
+
def _parse_compound_selector(self) -> CompoundSelector | None:
|
|
418
459
|
"""Parse a compound selector (sequence of simple selectors)."""
|
|
419
|
-
simple_selectors = []
|
|
460
|
+
simple_selectors: list[SimpleSelector] = []
|
|
420
461
|
|
|
421
462
|
while True:
|
|
422
463
|
token = self._peek()
|
|
@@ -450,7 +491,7 @@ class SelectorParser:
|
|
|
450
491
|
return None
|
|
451
492
|
return CompoundSelector(simple_selectors)
|
|
452
493
|
|
|
453
|
-
def _parse_attribute_selector(self):
|
|
494
|
+
def _parse_attribute_selector(self) -> SimpleSelector:
|
|
454
495
|
"""Parse an attribute selector [attr], [attr=value], etc."""
|
|
455
496
|
self._expect(TokenType.ATTR_START)
|
|
456
497
|
|
|
@@ -467,7 +508,7 @@ class SelectorParser:
|
|
|
467
508
|
|
|
468
509
|
return SimpleSelector(SimpleSelector.TYPE_ATTR, name=attr_name, operator=operator, value=value)
|
|
469
510
|
|
|
470
|
-
def _parse_pseudo_selector(self):
|
|
511
|
+
def _parse_pseudo_selector(self) -> SimpleSelector:
|
|
471
512
|
"""Parse a pseudo-class selector like :first-child or :not(selector)."""
|
|
472
513
|
self._expect(TokenType.COLON)
|
|
473
514
|
name = self._expect(TokenType.TAG).value
|
|
@@ -475,7 +516,7 @@ class SelectorParser:
|
|
|
475
516
|
# Functional pseudo-class
|
|
476
517
|
if self._peek().type == TokenType.PAREN_OPEN:
|
|
477
518
|
self._advance()
|
|
478
|
-
arg = None
|
|
519
|
+
arg: str | None = None
|
|
479
520
|
if self._peek().type == TokenType.STRING:
|
|
480
521
|
arg = self._advance().value
|
|
481
522
|
self._expect(TokenType.PAREN_CLOSE)
|
|
@@ -489,7 +530,15 @@ class SelectorMatcher:
|
|
|
489
530
|
|
|
490
531
|
__slots__ = ()
|
|
491
532
|
|
|
492
|
-
def matches(self, node, selector):
|
|
533
|
+
def _unquote_pseudo_arg(self, arg: str) -> str:
|
|
534
|
+
arg = arg.strip()
|
|
535
|
+
if len(arg) >= 2 and arg[0] == arg[-1] and arg[0] in ('"', "'"):
|
|
536
|
+
quote = arg[0]
|
|
537
|
+
# Minimal unescaping for common cases like :contains("click me")
|
|
538
|
+
return arg[1:-1].replace("\\" + quote, quote).replace("\\\\", "\\")
|
|
539
|
+
return arg
|
|
540
|
+
|
|
541
|
+
def matches(self, node: Any, selector: ParsedSelector | CompoundSelector | SimpleSelector) -> bool:
|
|
493
542
|
"""Check if a node matches a parsed selector."""
|
|
494
543
|
if isinstance(selector, SelectorList):
|
|
495
544
|
return any(self.matches(node, sel) for sel in selector.selectors)
|
|
@@ -501,7 +550,7 @@ class SelectorMatcher:
|
|
|
501
550
|
return self._matches_simple(node, selector)
|
|
502
551
|
return False
|
|
503
552
|
|
|
504
|
-
def _matches_complex(self, node, selector):
|
|
553
|
+
def _matches_complex(self, node: Any, selector: ComplexSelector) -> bool:
|
|
505
554
|
"""Match a complex selector (with combinators)."""
|
|
506
555
|
# Work backwards from the rightmost compound selector
|
|
507
556
|
parts = selector.parts
|
|
@@ -557,11 +606,11 @@ class SelectorMatcher:
|
|
|
557
606
|
|
|
558
607
|
return True
|
|
559
608
|
|
|
560
|
-
def _matches_compound(self, node, compound):
|
|
609
|
+
def _matches_compound(self, node: Any, compound: CompoundSelector) -> bool:
|
|
561
610
|
"""Match a compound selector (all simple selectors must match)."""
|
|
562
611
|
return all(self._matches_simple(node, simple) for simple in compound.selectors)
|
|
563
612
|
|
|
564
|
-
def _matches_simple(self, node, selector):
|
|
613
|
+
def _matches_simple(self, node: Any, selector: SimpleSelector) -> bool:
|
|
565
614
|
"""Match a simple selector against a node."""
|
|
566
615
|
# Text nodes and other non-element nodes don't match element selectors
|
|
567
616
|
if not hasattr(node, "name") or node.name.startswith("#"):
|
|
@@ -574,7 +623,7 @@ class SelectorMatcher:
|
|
|
574
623
|
|
|
575
624
|
if sel_type == SimpleSelector.TYPE_TAG:
|
|
576
625
|
# HTML tag names are case-insensitive
|
|
577
|
-
return node.name.lower() == selector.name.lower()
|
|
626
|
+
return bool(node.name.lower() == (selector.name.lower() if selector.name else ""))
|
|
578
627
|
|
|
579
628
|
if sel_type == SimpleSelector.TYPE_ID:
|
|
580
629
|
node_id = node.attrs.get("id", "") if node.attrs else ""
|
|
@@ -593,13 +642,13 @@ class SelectorMatcher:
|
|
|
593
642
|
|
|
594
643
|
return False
|
|
595
644
|
|
|
596
|
-
def _matches_attribute(self, node, selector):
|
|
645
|
+
def _matches_attribute(self, node: Any, selector: SimpleSelector) -> bool:
|
|
597
646
|
"""Match an attribute selector."""
|
|
598
647
|
attrs = node.attrs or {}
|
|
599
|
-
attr_name = selector.name.lower() # Attribute names are case-insensitive in HTML
|
|
648
|
+
attr_name = (selector.name or "").lower() # Attribute names are case-insensitive in HTML
|
|
600
649
|
|
|
601
650
|
# Check if attribute exists (for any case)
|
|
602
|
-
attr_value = None
|
|
651
|
+
attr_value: str | None = None
|
|
603
652
|
for name, value in attrs.items():
|
|
604
653
|
if name.lower() == attr_name:
|
|
605
654
|
attr_value = value
|
|
@@ -612,7 +661,7 @@ class SelectorMatcher:
|
|
|
612
661
|
if selector.operator is None:
|
|
613
662
|
return True
|
|
614
663
|
|
|
615
|
-
value = selector.value
|
|
664
|
+
value = selector.value or ""
|
|
616
665
|
op = selector.operator
|
|
617
666
|
|
|
618
667
|
if op == "=":
|
|
@@ -641,9 +690,9 @@ class SelectorMatcher:
|
|
|
641
690
|
|
|
642
691
|
return False
|
|
643
692
|
|
|
644
|
-
def _matches_pseudo(self, node, selector):
|
|
693
|
+
def _matches_pseudo(self, node: Any, selector: SimpleSelector) -> bool:
|
|
645
694
|
"""Match a pseudo-class selector."""
|
|
646
|
-
name = selector.name.lower()
|
|
695
|
+
name = (selector.name or "").lower()
|
|
647
696
|
|
|
648
697
|
if name == "first-child":
|
|
649
698
|
return self._is_first_child(node)
|
|
@@ -684,6 +733,17 @@ class SelectorMatcher:
|
|
|
684
733
|
return parent.name in ("#document", "#document-fragment")
|
|
685
734
|
return False
|
|
686
735
|
|
|
736
|
+
if name == "contains":
|
|
737
|
+
if selector.arg is None:
|
|
738
|
+
raise SelectorError(":contains() requires a string argument")
|
|
739
|
+
needle = self._unquote_pseudo_arg(selector.arg)
|
|
740
|
+
if needle == "":
|
|
741
|
+
return True
|
|
742
|
+
# Non-standard (jQuery-style) pseudo-class: match elements whose descendant
|
|
743
|
+
# text contains the substring. We use `to_text()` to approximate textContent.
|
|
744
|
+
haystack: str = node.to_text(separator=" ", strip=True)
|
|
745
|
+
return needle in haystack
|
|
746
|
+
|
|
687
747
|
if name == "first-of-type":
|
|
688
748
|
return self._is_first_of_type(node)
|
|
689
749
|
|
|
@@ -699,43 +759,43 @@ class SelectorMatcher:
|
|
|
699
759
|
# Unknown pseudo-class - don't match
|
|
700
760
|
raise SelectorError(f"Unsupported pseudo-class: :{name}")
|
|
701
761
|
|
|
702
|
-
def _get_element_children(self, parent):
|
|
762
|
+
def _get_element_children(self, parent: Any) -> list[Any]:
|
|
703
763
|
"""Get only element children (exclude text, comments, etc.)."""
|
|
704
764
|
if not parent or not parent.has_child_nodes():
|
|
705
765
|
return []
|
|
706
|
-
return [c for c in parent.children if
|
|
766
|
+
return [c for c in parent.children if not c.name.startswith("#")]
|
|
707
767
|
|
|
708
|
-
def _get_previous_sibling(self, node):
|
|
768
|
+
def _get_previous_sibling(self, node: Any) -> Any | None:
|
|
709
769
|
"""Get the previous element sibling. Returns None if node is first or not found."""
|
|
710
770
|
parent = node.parent
|
|
711
771
|
if not parent:
|
|
712
772
|
return None
|
|
713
773
|
|
|
714
|
-
prev = None
|
|
774
|
+
prev: Any | None = None
|
|
715
775
|
for child in parent.children:
|
|
716
776
|
if child is node:
|
|
717
777
|
return prev
|
|
718
|
-
if
|
|
778
|
+
if not child.name.startswith("#"):
|
|
719
779
|
prev = child
|
|
720
780
|
return None # node not in parent.children (detached)
|
|
721
781
|
|
|
722
|
-
def _is_first_child(self, node):
|
|
782
|
+
def _is_first_child(self, node: Any) -> bool:
|
|
723
783
|
"""Check if node is the first element child of its parent."""
|
|
724
784
|
parent = node.parent
|
|
725
785
|
if not parent:
|
|
726
786
|
return False
|
|
727
787
|
elements = self._get_element_children(parent)
|
|
728
|
-
return elements and elements[0] is node
|
|
788
|
+
return bool(elements) and elements[0] is node
|
|
729
789
|
|
|
730
|
-
def _is_last_child(self, node):
|
|
790
|
+
def _is_last_child(self, node: Any) -> bool:
|
|
731
791
|
"""Check if node is the last element child of its parent."""
|
|
732
792
|
parent = node.parent
|
|
733
793
|
if not parent:
|
|
734
794
|
return False
|
|
735
795
|
elements = self._get_element_children(parent)
|
|
736
|
-
return elements and elements[-1] is node
|
|
796
|
+
return bool(elements) and elements[-1] is node
|
|
737
797
|
|
|
738
|
-
def _is_first_of_type(self, node):
|
|
798
|
+
def _is_first_of_type(self, node: Any) -> bool:
|
|
739
799
|
"""Check if node is the first sibling of its type."""
|
|
740
800
|
parent = node.parent
|
|
741
801
|
if not parent:
|
|
@@ -746,19 +806,19 @@ class SelectorMatcher:
|
|
|
746
806
|
return child is node
|
|
747
807
|
return False
|
|
748
808
|
|
|
749
|
-
def _is_last_of_type(self, node):
|
|
809
|
+
def _is_last_of_type(self, node: Any) -> bool:
|
|
750
810
|
"""Check if node is the last sibling of its type."""
|
|
751
811
|
parent = node.parent
|
|
752
812
|
if not parent:
|
|
753
813
|
return False
|
|
754
814
|
node_name = node.name.lower()
|
|
755
|
-
last_of_type = None
|
|
815
|
+
last_of_type: Any | None = None
|
|
756
816
|
for child in self._get_element_children(parent):
|
|
757
817
|
if child.name.lower() == node_name:
|
|
758
818
|
last_of_type = child
|
|
759
819
|
return last_of_type is node
|
|
760
820
|
|
|
761
|
-
def _parse_nth_expression(self, expr):
|
|
821
|
+
def _parse_nth_expression(self, expr: str | None) -> tuple[int, int] | None:
|
|
762
822
|
"""Parse an nth-child expression like '2n+1', 'odd', 'even', '3'."""
|
|
763
823
|
if not expr:
|
|
764
824
|
return None
|
|
@@ -807,7 +867,7 @@ class SelectorMatcher:
|
|
|
807
867
|
|
|
808
868
|
return (a, b)
|
|
809
869
|
|
|
810
|
-
def _matches_nth(self, index, a, b):
|
|
870
|
+
def _matches_nth(self, index: int, a: int, b: int) -> bool:
|
|
811
871
|
"""Check if 1-based index matches An+B formula."""
|
|
812
872
|
if a == 0:
|
|
813
873
|
return index == b
|
|
@@ -819,7 +879,7 @@ class SelectorMatcher:
|
|
|
819
879
|
# a < 0: need diff <= 0 and diff divisible by abs(a)
|
|
820
880
|
return diff <= 0 and diff % a == 0
|
|
821
881
|
|
|
822
|
-
def _matches_nth_child(self, node, arg):
|
|
882
|
+
def _matches_nth_child(self, node: Any, arg: str | None) -> bool:
|
|
823
883
|
"""Match :nth-child(An+B)."""
|
|
824
884
|
parent = node.parent
|
|
825
885
|
if not parent:
|
|
@@ -836,7 +896,7 @@ class SelectorMatcher:
|
|
|
836
896
|
return self._matches_nth(i + 1, a, b)
|
|
837
897
|
return False
|
|
838
898
|
|
|
839
|
-
def _matches_nth_of_type(self, node, arg):
|
|
899
|
+
def _matches_nth_of_type(self, node: Any, arg: str | None) -> bool:
|
|
840
900
|
"""Match :nth-of-type(An+B)."""
|
|
841
901
|
parent = node.parent
|
|
842
902
|
if not parent:
|
|
@@ -858,22 +918,72 @@ class SelectorMatcher:
|
|
|
858
918
|
return False
|
|
859
919
|
|
|
860
920
|
|
|
861
|
-
def parse_selector(selector_string):
|
|
921
|
+
def parse_selector(selector_string: str) -> ParsedSelector:
|
|
862
922
|
"""Parse a CSS selector string into an AST."""
|
|
863
923
|
if not selector_string or not selector_string.strip():
|
|
864
924
|
raise SelectorError("Empty selector")
|
|
865
925
|
|
|
866
|
-
|
|
926
|
+
return _parse_selector_cached(selector_string.strip())
|
|
927
|
+
|
|
928
|
+
|
|
929
|
+
@lru_cache(maxsize=512)
|
|
930
|
+
def _parse_selector_cached(selector_string: str) -> ParsedSelector:
|
|
931
|
+
tokenizer = SelectorTokenizer(selector_string)
|
|
867
932
|
tokens = tokenizer.tokenize()
|
|
868
933
|
parser = SelectorParser(tokens)
|
|
869
934
|
return parser.parse()
|
|
870
935
|
|
|
871
936
|
|
|
872
937
|
# Global matcher instance
|
|
873
|
-
_matcher = SelectorMatcher()
|
|
938
|
+
_matcher: SelectorMatcher = SelectorMatcher()
|
|
939
|
+
|
|
940
|
+
|
|
941
|
+
def _is_simple_tag_selector(selector: str) -> bool:
|
|
942
|
+
if not selector:
|
|
943
|
+
return False
|
|
944
|
+
ch0 = selector[0]
|
|
945
|
+
if not (ch0.isalpha() or ch0 == "_" or ch0 == "-" or ord(ch0) > 127):
|
|
946
|
+
return False
|
|
947
|
+
for ch in selector[1:]:
|
|
948
|
+
if ch.isalnum() or ch == "_" or ch == "-" or ord(ch) > 127:
|
|
949
|
+
continue
|
|
950
|
+
return False
|
|
951
|
+
return True
|
|
874
952
|
|
|
875
953
|
|
|
876
|
-
def query(root, selector_string):
|
|
954
|
+
def _query_descendants_tag(node: Any, tag_lower: str, results: list[Any]) -> None:
|
|
955
|
+
results_append = results.append
|
|
956
|
+
|
|
957
|
+
stack: list[Any] = []
|
|
958
|
+
|
|
959
|
+
root_children = node.children
|
|
960
|
+
if root_children:
|
|
961
|
+
stack.extend(reversed(root_children))
|
|
962
|
+
|
|
963
|
+
if node.name == "template" and node.namespace == "html":
|
|
964
|
+
template_content = node.template_content
|
|
965
|
+
if template_content:
|
|
966
|
+
stack.append(template_content)
|
|
967
|
+
|
|
968
|
+
while stack:
|
|
969
|
+
current = stack.pop()
|
|
970
|
+
|
|
971
|
+
name = current.name
|
|
972
|
+
if not name.startswith("#"):
|
|
973
|
+
if name == tag_lower or name.lower() == tag_lower:
|
|
974
|
+
results_append(current)
|
|
975
|
+
|
|
976
|
+
children = current.children
|
|
977
|
+
if children:
|
|
978
|
+
stack.extend(reversed(children))
|
|
979
|
+
|
|
980
|
+
if name == "template" and current.namespace == "html":
|
|
981
|
+
template_content = current.template_content
|
|
982
|
+
if template_content:
|
|
983
|
+
stack.append(template_content)
|
|
984
|
+
|
|
985
|
+
|
|
986
|
+
def query(root: Any, selector_string: str) -> list[Any]:
|
|
877
987
|
"""
|
|
878
988
|
Query the DOM tree starting from root, returning all matching elements.
|
|
879
989
|
|
|
@@ -887,30 +997,56 @@ def query(root, selector_string):
|
|
|
887
997
|
Returns:
|
|
888
998
|
A list of matching nodes
|
|
889
999
|
"""
|
|
890
|
-
|
|
891
|
-
|
|
1000
|
+
selector_string = selector_string.strip()
|
|
1001
|
+
if not selector_string:
|
|
1002
|
+
raise SelectorError("Empty selector")
|
|
1003
|
+
|
|
1004
|
+
results: list[Any] = []
|
|
1005
|
+
|
|
1006
|
+
if _is_simple_tag_selector(selector_string):
|
|
1007
|
+
_query_descendants_tag(root, selector_string.lower(), results)
|
|
1008
|
+
return results
|
|
1009
|
+
|
|
1010
|
+
selector = _parse_selector_cached(selector_string)
|
|
892
1011
|
_query_descendants(root, selector, results)
|
|
893
1012
|
return results
|
|
894
1013
|
|
|
895
1014
|
|
|
896
|
-
def _query_descendants(node, selector, results):
|
|
897
|
-
"""
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
1015
|
+
def _query_descendants(node: Any, selector: ParsedSelector, results: list[Any]) -> None:
|
|
1016
|
+
"""Search for matching nodes in descendants."""
|
|
1017
|
+
matcher_matches = _matcher.matches
|
|
1018
|
+
results_append = results.append
|
|
1019
|
+
|
|
1020
|
+
# querySelectorAll searches descendants of root, not including root itself.
|
|
1021
|
+
stack: list[Any] = []
|
|
1022
|
+
|
|
1023
|
+
root_children = node.children
|
|
1024
|
+
if root_children:
|
|
1025
|
+
stack.extend(reversed(root_children))
|
|
1026
|
+
|
|
1027
|
+
if node.name == "template" and node.namespace == "html":
|
|
1028
|
+
template_content = node.template_content
|
|
1029
|
+
if template_content:
|
|
1030
|
+
stack.append(template_content)
|
|
1031
|
+
|
|
1032
|
+
while stack:
|
|
1033
|
+
current = stack.pop()
|
|
1034
|
+
|
|
1035
|
+
name = current.name
|
|
1036
|
+
if not name.startswith("#") and matcher_matches(current, selector):
|
|
1037
|
+
results_append(current)
|
|
1038
|
+
|
|
1039
|
+
children = current.children
|
|
1040
|
+
if children:
|
|
1041
|
+
stack.extend(reversed(children))
|
|
907
1042
|
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
1043
|
+
if name == "template" and current.namespace == "html":
|
|
1044
|
+
template_content = current.template_content
|
|
1045
|
+
if template_content:
|
|
1046
|
+
stack.append(template_content)
|
|
911
1047
|
|
|
912
1048
|
|
|
913
|
-
def matches(node, selector_string):
|
|
1049
|
+
def matches(node: Any, selector_string: str) -> bool:
|
|
914
1050
|
"""
|
|
915
1051
|
Check if a node matches a CSS selector.
|
|
916
1052
|
|