justhtml-0.12.0-py3-none-any.whl → justhtml-0.38.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +48 -0
- justhtml/__main__.py +86 -17
- justhtml/constants.py +12 -0
- justhtml/entities.py +45 -7
- justhtml/errors.py +17 -3
- justhtml/linkify.py +438 -0
- justhtml/node.py +385 -97
- justhtml/parser.py +139 -16
- justhtml/sanitize.py +992 -0
- justhtml/selector.py +117 -19
- justhtml/serialize.py +671 -41
- justhtml/tokenizer.py +364 -194
- justhtml/tokens.py +28 -5
- justhtml/transforms.py +2568 -0
- justhtml/treebuilder.py +297 -204
- justhtml/treebuilder_modes.py +208 -138
- justhtml-0.38.0.dist-info/METADATA +213 -0
- justhtml-0.38.0.dist-info/RECORD +26 -0
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.12.0.dist-info/METADATA +0 -164
- justhtml-0.12.0.dist-info/RECORD +0 -23
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/WHEEL +0 -0
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/entry_points.txt +0 -0
justhtml/selector.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
|
+
from functools import lru_cache
|
|
6
7
|
from typing import Any
|
|
7
8
|
|
|
8
9
|
|
|
@@ -529,6 +530,14 @@ class SelectorMatcher:
|
|
|
529
530
|
|
|
530
531
|
__slots__ = ()
|
|
531
532
|
|
|
533
|
+
def _unquote_pseudo_arg(self, arg: str) -> str:
|
|
534
|
+
arg = arg.strip()
|
|
535
|
+
if len(arg) >= 2 and arg[0] == arg[-1] and arg[0] in ('"', "'"):
|
|
536
|
+
quote = arg[0]
|
|
537
|
+
# Minimal unescaping for common cases like :contains("click me")
|
|
538
|
+
return arg[1:-1].replace("\\" + quote, quote).replace("\\\\", "\\")
|
|
539
|
+
return arg
|
|
540
|
+
|
|
532
541
|
def matches(self, node: Any, selector: ParsedSelector | CompoundSelector | SimpleSelector) -> bool:
|
|
533
542
|
"""Check if a node matches a parsed selector."""
|
|
534
543
|
if isinstance(selector, SelectorList):
|
|
@@ -642,7 +651,9 @@ class SelectorMatcher:
|
|
|
642
651
|
attr_value: str | None = None
|
|
643
652
|
for name, value in attrs.items():
|
|
644
653
|
if name.lower() == attr_name:
|
|
645
|
-
|
|
654
|
+
# Attributes can be boolean (represented as None in JustHTML).
|
|
655
|
+
# For selector matching, presence should still count.
|
|
656
|
+
attr_value = "" if value is None else str(value)
|
|
646
657
|
break
|
|
647
658
|
|
|
648
659
|
if attr_value is None:
|
|
@@ -724,6 +735,17 @@ class SelectorMatcher:
|
|
|
724
735
|
return parent.name in ("#document", "#document-fragment")
|
|
725
736
|
return False
|
|
726
737
|
|
|
738
|
+
if name == "contains":
|
|
739
|
+
if selector.arg is None:
|
|
740
|
+
raise SelectorError(":contains() requires a string argument")
|
|
741
|
+
needle = self._unquote_pseudo_arg(selector.arg)
|
|
742
|
+
if needle == "":
|
|
743
|
+
return True
|
|
744
|
+
# Non-standard (jQuery-style) pseudo-class: match elements whose descendant
|
|
745
|
+
# text contains the substring. We use `to_text()` to approximate textContent.
|
|
746
|
+
haystack: str = node.to_text(separator=" ", strip=True)
|
|
747
|
+
return needle in haystack
|
|
748
|
+
|
|
727
749
|
if name == "first-of-type":
|
|
728
750
|
return self._is_first_of_type(node)
|
|
729
751
|
|
|
@@ -743,7 +765,7 @@ class SelectorMatcher:
|
|
|
743
765
|
"""Get only element children (exclude text, comments, etc.)."""
|
|
744
766
|
if not parent or not parent.has_child_nodes():
|
|
745
767
|
return []
|
|
746
|
-
return [c for c in parent.children if
|
|
768
|
+
return [c for c in parent.children if not c.name.startswith("#")]
|
|
747
769
|
|
|
748
770
|
def _get_previous_sibling(self, node: Any) -> Any | None:
|
|
749
771
|
"""Get the previous element sibling. Returns None if node is first or not found."""
|
|
@@ -755,7 +777,7 @@ class SelectorMatcher:
|
|
|
755
777
|
for child in parent.children:
|
|
756
778
|
if child is node:
|
|
757
779
|
return prev
|
|
758
|
-
if
|
|
780
|
+
if not child.name.startswith("#"):
|
|
759
781
|
prev = child
|
|
760
782
|
return None # node not in parent.children (detached)
|
|
761
783
|
|
|
@@ -903,7 +925,12 @@ def parse_selector(selector_string: str) -> ParsedSelector:
|
|
|
903
925
|
if not selector_string or not selector_string.strip():
|
|
904
926
|
raise SelectorError("Empty selector")
|
|
905
927
|
|
|
906
|
-
|
|
928
|
+
return _parse_selector_cached(selector_string.strip())
|
|
929
|
+
|
|
930
|
+
|
|
931
|
+
@lru_cache(maxsize=512)
def _parse_selector_cached(selector_string: str) -> ParsedSelector:
    """Tokenize and parse ``selector_string``, memoizing the parsed result.

    Results are kept in an LRU cache of up to 512 entries keyed on the exact
    selector string, so repeated calls with the same text skip tokenizing
    and parsing.
    """
    token_stream = SelectorTokenizer(selector_string).tokenize()
    return SelectorParser(token_stream).parse()
|
|
@@ -913,6 +940,51 @@ def parse_selector(selector_string: str) -> ParsedSelector:
|
|
|
913
940
|
_matcher: SelectorMatcher = SelectorMatcher()
|
|
914
941
|
|
|
915
942
|
|
|
943
|
+
def _is_simple_tag_selector(selector: str) -> bool:
|
|
944
|
+
if not selector:
|
|
945
|
+
return False
|
|
946
|
+
ch0 = selector[0]
|
|
947
|
+
if not (ch0.isalpha() or ch0 == "_" or ch0 == "-" or ord(ch0) > 127):
|
|
948
|
+
return False
|
|
949
|
+
for ch in selector[1:]:
|
|
950
|
+
if ch.isalnum() or ch == "_" or ch == "-" or ord(ch) > 127:
|
|
951
|
+
continue
|
|
952
|
+
return False
|
|
953
|
+
return True
|
|
954
|
+
|
|
955
|
+
|
|
956
|
+
def _query_descendants_tag(node: Any, tag_lower: str, results: list[Any]) -> None:
|
|
957
|
+
results_append = results.append
|
|
958
|
+
|
|
959
|
+
stack: list[Any] = []
|
|
960
|
+
|
|
961
|
+
root_children = node.children
|
|
962
|
+
if root_children:
|
|
963
|
+
stack.extend(reversed(root_children))
|
|
964
|
+
|
|
965
|
+
if node.name == "template" and node.namespace == "html":
|
|
966
|
+
template_content = node.template_content
|
|
967
|
+
if template_content:
|
|
968
|
+
stack.append(template_content)
|
|
969
|
+
|
|
970
|
+
while stack:
|
|
971
|
+
current = stack.pop()
|
|
972
|
+
|
|
973
|
+
name = current.name
|
|
974
|
+
if not name.startswith("#"):
|
|
975
|
+
if name == tag_lower or name.lower() == tag_lower:
|
|
976
|
+
results_append(current)
|
|
977
|
+
|
|
978
|
+
children = current.children
|
|
979
|
+
if children:
|
|
980
|
+
stack.extend(reversed(children))
|
|
981
|
+
|
|
982
|
+
if name == "template" and current.namespace == "html":
|
|
983
|
+
template_content = current.template_content
|
|
984
|
+
if template_content:
|
|
985
|
+
stack.append(template_content)
|
|
986
|
+
|
|
987
|
+
|
|
916
988
|
def query(root: Any, selector_string: str) -> list[Any]:
|
|
917
989
|
"""
|
|
918
990
|
Query the DOM tree starting from root, returning all matching elements.
|
|
@@ -927,27 +999,53 @@ def query(root: Any, selector_string: str) -> list[Any]:
|
|
|
927
999
|
Returns:
|
|
928
1000
|
A list of matching nodes
|
|
929
1001
|
"""
|
|
930
|
-
|
|
1002
|
+
selector_string = selector_string.strip()
|
|
1003
|
+
if not selector_string:
|
|
1004
|
+
raise SelectorError("Empty selector")
|
|
1005
|
+
|
|
931
1006
|
results: list[Any] = []
|
|
1007
|
+
|
|
1008
|
+
if _is_simple_tag_selector(selector_string):
|
|
1009
|
+
_query_descendants_tag(root, selector_string.lower(), results)
|
|
1010
|
+
return results
|
|
1011
|
+
|
|
1012
|
+
selector = _parse_selector_cached(selector_string)
|
|
932
1013
|
_query_descendants(root, selector, results)
|
|
933
1014
|
return results
|
|
934
1015
|
|
|
935
1016
|
|
|
936
1017
|
def _query_descendants(node: Any, selector: ParsedSelector, results: list[Any]) -> None:
    """Append every descendant of ``node`` that matches ``selector``.

    The walk is iterative and uses an explicit stack. Children are pushed in
    reverse so they are popped in their stored order. For an HTML
    ``template`` node, its ``template_content`` is pushed after its children
    and is therefore visited before them. Nodes whose name starts with
    ``#`` are traversed but never tested against the selector.
    """
    is_match = _matcher.matches
    collect = results.append

    # querySelectorAll searches descendants of root, not including root itself:
    # the root is seeded on the stack but only expanded, never matched.
    pending: list[Any] = [node]
    at_root = True

    while pending:
        candidate = pending.pop()
        tag = candidate.name

        if at_root:
            at_root = False
        elif not tag.startswith("#") and is_match(candidate, selector):
            collect(candidate)

        kids = candidate.children
        if kids:
            pending.extend(reversed(kids))

        if tag == "template" and candidate.namespace == "html":
            fragment = candidate.template_content
            if fragment:
                pending.append(fragment)
|
|
951
1049
|
|
|
952
1050
|
|
|
953
1051
|
def matches(node: Any, selector_string: str) -> bool:
|