justhtml-0.12.0-py3-none-any.whl → justhtml-0.38.0-py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic. Click here for more details.

justhtml/selector.py CHANGED
@@ -3,6 +3,7 @@
3
3
 
4
4
  from __future__ import annotations
5
5
 
6
+ from functools import lru_cache
6
7
  from typing import Any
7
8
 
8
9
 
@@ -529,6 +530,14 @@ class SelectorMatcher:
529
530
 
530
531
  __slots__ = ()
531
532
 
533
+ def _unquote_pseudo_arg(self, arg: str) -> str:
534
+ arg = arg.strip()
535
+ if len(arg) >= 2 and arg[0] == arg[-1] and arg[0] in ('"', "'"):
536
+ quote = arg[0]
537
+ # Minimal unescaping for common cases like :contains("click me")
538
+ return arg[1:-1].replace("\\" + quote, quote).replace("\\\\", "\\")
539
+ return arg
540
+
532
541
  def matches(self, node: Any, selector: ParsedSelector | CompoundSelector | SimpleSelector) -> bool:
533
542
  """Check if a node matches a parsed selector."""
534
543
  if isinstance(selector, SelectorList):
@@ -642,7 +651,9 @@ class SelectorMatcher:
642
651
  attr_value: str | None = None
643
652
  for name, value in attrs.items():
644
653
  if name.lower() == attr_name:
645
- attr_value = value
654
+ # Attributes can be boolean (represented as None in JustHTML).
655
+ # For selector matching, presence should still count.
656
+ attr_value = "" if value is None else str(value)
646
657
  break
647
658
 
648
659
  if attr_value is None:
@@ -724,6 +735,17 @@ class SelectorMatcher:
724
735
  return parent.name in ("#document", "#document-fragment")
725
736
  return False
726
737
 
738
+ if name == "contains":
739
+ if selector.arg is None:
740
+ raise SelectorError(":contains() requires a string argument")
741
+ needle = self._unquote_pseudo_arg(selector.arg)
742
+ if needle == "":
743
+ return True
744
+ # Non-standard (jQuery-style) pseudo-class: match elements whose descendant
745
+ # text contains the substring. We use `to_text()` to approximate textContent.
746
+ haystack: str = node.to_text(separator=" ", strip=True)
747
+ return needle in haystack
748
+
727
749
  if name == "first-of-type":
728
750
  return self._is_first_of_type(node)
729
751
 
@@ -743,7 +765,7 @@ class SelectorMatcher:
743
765
  """Get only element children (exclude text, comments, etc.)."""
744
766
  if not parent or not parent.has_child_nodes():
745
767
  return []
746
- return [c for c in parent.children if hasattr(c, "name") and not c.name.startswith("#")]
768
+ return [c for c in parent.children if not c.name.startswith("#")]
747
769
 
748
770
  def _get_previous_sibling(self, node: Any) -> Any | None:
749
771
  """Get the previous element sibling. Returns None if node is first or not found."""
@@ -755,7 +777,7 @@ class SelectorMatcher:
755
777
  for child in parent.children:
756
778
  if child is node:
757
779
  return prev
758
- if hasattr(child, "name") and not child.name.startswith("#"):
780
+ if not child.name.startswith("#"):
759
781
  prev = child
760
782
  return None # node not in parent.children (detached)
761
783
 
@@ -903,7 +925,12 @@ def parse_selector(selector_string: str) -> ParsedSelector:
903
925
  if not selector_string or not selector_string.strip():
904
926
  raise SelectorError("Empty selector")
905
927
 
906
- tokenizer = SelectorTokenizer(selector_string.strip())
928
+ return _parse_selector_cached(selector_string.strip())
929
+
930
+
931
+ @lru_cache(maxsize=512)
932
+ def _parse_selector_cached(selector_string: str) -> ParsedSelector:
933
+ tokenizer = SelectorTokenizer(selector_string)
907
934
  tokens = tokenizer.tokenize()
908
935
  parser = SelectorParser(tokens)
909
936
  return parser.parse()
@@ -913,6 +940,51 @@ def parse_selector(selector_string: str) -> ParsedSelector:
913
940
  _matcher: SelectorMatcher = SelectorMatcher()
914
941
 
915
942
 
943
+ def _is_simple_tag_selector(selector: str) -> bool:
944
+ if not selector:
945
+ return False
946
+ ch0 = selector[0]
947
+ if not (ch0.isalpha() or ch0 == "_" or ch0 == "-" or ord(ch0) > 127):
948
+ return False
949
+ for ch in selector[1:]:
950
+ if ch.isalnum() or ch == "_" or ch == "-" or ord(ch) > 127:
951
+ continue
952
+ return False
953
+ return True
954
+
955
+
956
+ def _query_descendants_tag(node: Any, tag_lower: str, results: list[Any]) -> None:
957
+ results_append = results.append
958
+
959
+ stack: list[Any] = []
960
+
961
+ root_children = node.children
962
+ if root_children:
963
+ stack.extend(reversed(root_children))
964
+
965
+ if node.name == "template" and node.namespace == "html":
966
+ template_content = node.template_content
967
+ if template_content:
968
+ stack.append(template_content)
969
+
970
+ while stack:
971
+ current = stack.pop()
972
+
973
+ name = current.name
974
+ if not name.startswith("#"):
975
+ if name == tag_lower or name.lower() == tag_lower:
976
+ results_append(current)
977
+
978
+ children = current.children
979
+ if children:
980
+ stack.extend(reversed(children))
981
+
982
+ if name == "template" and current.namespace == "html":
983
+ template_content = current.template_content
984
+ if template_content:
985
+ stack.append(template_content)
986
+
987
+
916
988
  def query(root: Any, selector_string: str) -> list[Any]:
917
989
  """
918
990
  Query the DOM tree starting from root, returning all matching elements.
@@ -927,27 +999,53 @@ def query(root: Any, selector_string: str) -> list[Any]:
927
999
  Returns:
928
1000
  A list of matching nodes
929
1001
  """
930
- selector = parse_selector(selector_string)
1002
+ selector_string = selector_string.strip()
1003
+ if not selector_string:
1004
+ raise SelectorError("Empty selector")
1005
+
931
1006
  results: list[Any] = []
1007
+
1008
+ if _is_simple_tag_selector(selector_string):
1009
+ _query_descendants_tag(root, selector_string.lower(), results)
1010
+ return results
1011
+
1012
+ selector = _parse_selector_cached(selector_string)
932
1013
  _query_descendants(root, selector, results)
933
1014
  return results
934
1015
 
935
1016
 
936
1017
  def _query_descendants(node: Any, selector: ParsedSelector, results: list[Any]) -> None:
937
- """Recursively search for matching nodes in descendants."""
938
- # Only recurse into children (not the node itself)
939
- if node.has_child_nodes():
940
- for child in node.children:
941
- # Check if this child matches
942
- if hasattr(child, "name") and not child.name.startswith("#"):
943
- if _matcher.matches(child, selector):
944
- results.append(child)
945
- # Recurse into child's descendants
946
- _query_descendants(child, selector, results)
947
-
948
- # Also check template content if present
949
- if hasattr(node, "template_content") and node.template_content:
950
- _query_descendants(node.template_content, selector, results)
1018
+ """Search for matching nodes in descendants."""
1019
+ matcher_matches = _matcher.matches
1020
+ results_append = results.append
1021
+
1022
+ # querySelectorAll searches descendants of root, not including root itself.
1023
+ stack: list[Any] = []
1024
+
1025
+ root_children = node.children
1026
+ if root_children:
1027
+ stack.extend(reversed(root_children))
1028
+
1029
+ if node.name == "template" and node.namespace == "html":
1030
+ template_content = node.template_content
1031
+ if template_content:
1032
+ stack.append(template_content)
1033
+
1034
+ while stack:
1035
+ current = stack.pop()
1036
+
1037
+ name = current.name
1038
+ if not name.startswith("#") and matcher_matches(current, selector):
1039
+ results_append(current)
1040
+
1041
+ children = current.children
1042
+ if children:
1043
+ stack.extend(reversed(children))
1044
+
1045
+ if name == "template" and current.namespace == "html":
1046
+ template_content = current.template_content
1047
+ if template_content:
1048
+ stack.append(template_content)
951
1049
 
952
1050
 
953
1051
  def matches(node: Any, selector_string: str) -> bool: