selectolax 0.4.4__cp310-cp310-macosx_10_9_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- selectolax/__init__.py +8 -0
- selectolax/base.pxi +4 -0
- selectolax/lexbor/attrs.pxi +120 -0
- selectolax/lexbor/node.pxi +1112 -0
- selectolax/lexbor/node_remove.pxi +29 -0
- selectolax/lexbor/selection.pxi +215 -0
- selectolax/lexbor/util.pxi +20 -0
- selectolax/lexbor.c +53768 -0
- selectolax/lexbor.cpython-310-darwin.so +0 -0
- selectolax/lexbor.pxd +599 -0
- selectolax/lexbor.pyi +1248 -0
- selectolax/lexbor.pyx +677 -0
- selectolax/modest/node.pxi +991 -0
- selectolax/modest/selection.pxi +195 -0
- selectolax/modest/util.pxi +20 -0
- selectolax/parser.c +47848 -0
- selectolax/parser.cpython-310-darwin.so +0 -0
- selectolax/parser.pxd +578 -0
- selectolax/parser.pyi +770 -0
- selectolax/parser.pyx +443 -0
- selectolax/py.typed +0 -0
- selectolax/utils.pxi +117 -0
- selectolax-0.4.4.dist-info/METADATA +222 -0
- selectolax-0.4.4.dist-info/RECORD +27 -0
- selectolax-0.4.4.dist-info/WHEEL +6 -0
- selectolax-0.4.4.dist-info/licenses/LICENSE +10 -0
- selectolax-0.4.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
|
|
2
|
+
cdef lxb_dom_node_t * node_remove_deep(lxb_dom_node_t* root):
    # Unlink `root` and every node below it, one node at a time, without
    # recursion: the walk descends to a childless node, unlinks it, and then
    # moves on to its next sibling or back up to its parent.
    # Always returns NULL (also when `root` itself is NULL).
    cdef lxb_dom_node_t *upcoming
    cdef lxb_dom_node_t *current = root

    while current != NULL:
        if current.first_child != NULL:
            # Keep descending until a node without children is reached.
            current = current.first_child
            continue

        # No following sibling: unlink the node and climb to its parent,
        # repeating until a node with a following sibling (or `root`) is hit.
        while current != root and current.next == NULL:
            upcoming = current.parent
            lxb_dom_node_remove(current)
            current = upcoming

        if current == root:
            # The walk is back at the starting node; unlink it and finish.
            lxb_dom_node_remove(current)
            break

        # The sibling pointer is saved before unlinking; presumably
        # lxb_dom_node_remove() resets the node's links.
        upcoming = current.next
        lxb_dom_node_remove(current)
        current = upcoming

    return NULL
|
|
24
|
+
|
|
25
|
+
cdef bint node_is_removed(lxb_dom_node_t* node):
    # A node is reported as removed when it has neither a parent nor a
    # sibling on either side.
    return node.parent == NULL and node.next == NULL and node.prev == NULL
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
cimport cython
|
|
2
|
+
from cpython.exc cimport PyErr_SetObject
|
|
3
|
+
from cpython.list cimport PyList_GET_SIZE
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@cython.final
cdef class LexborCSSSelector:
    """Wrapper around the lexbor CSS parser and selector engine.

    The three lexbor objects (CSS parser, CSS selectors, selectors engine) are
    created once in ``__init__`` and released in ``__dealloc__``.
    ``results`` is a scratch list filled by the module-level callbacks while a
    query runs; ``current_node`` holds the node being searched so the callbacks
    can reach its parser.
    """

    def __init__(self):
        self._create_css_parser()
        self.results = []
        self.current_node = None

    cdef int _create_css_parser(self) except -1:
        """Create and initialise the lexbor objects used for CSS matching.

        Returns 0 on success. On failure a ``SelectolaxError`` is set and -1 is
        returned.
        """
        cdef lxb_status_t status

        self.parser = lxb_css_parser_create()
        status = lxb_css_parser_init(self.parser, NULL)

        if status != LXB_STATUS_OK:
            PyErr_SetObject(SelectolaxError, "Can't initialize CSS parser.")
            return -1

        self.css_selectors = lxb_css_selectors_create()
        status = lxb_css_selectors_init(self.css_selectors)

        if status != LXB_STATUS_OK:
            PyErr_SetObject(SelectolaxError, "Can't initialize CSS selector.")
            return -1

        lxb_css_parser_selectors_set(self.parser, self.css_selectors)

        self.selectors = lxb_selectors_create()
        status = lxb_selectors_init(self.selectors)

        if status != LXB_STATUS_OK:
            PyErr_SetObject(SelectolaxError, "Can't initialize CSS selector.")
            return -1

        # Options are only applied once initialisation is known to have
        # succeeded, so a failed (possibly NULL) object is never written to.
        lxb_selectors_opt_set(self.selectors, LXB_SELECTORS_OPT_MATCH_ROOT)
        return 0

    cpdef list find(self, str query, LexborNode node):
        """Return a list with every node matching `query`, searching from `node`."""
        return self._find(query, node, 0)

    cpdef list find_first(self, str query, LexborNode node):
        """Return the matches for `query`, asking lexbor to stop after the first one.

        The result is still a list (empty when nothing matched).
        """
        return self._find(query, node, 1)

    cpdef list _find(self, str query, LexborNode node, bint only_first):
        """Parse `query` and collect matching nodes, searching from `node`.

        Parameters
        ----------
        query : str
            CSS selector text.
        node : LexborNode
            Node the search starts from.
        only_first : bint
            When true, the callback that stops after its first match is used.

        Raises
        ------
        TypeError
            If `query` is not a string (e.g. ``None``).
        SelectolaxError
            If the selector can't be parsed.
        """
        cdef lxb_css_selector_list_t * selectors_list
        cdef lxb_status_t status

        if not isinstance(query, str):
            raise TypeError("Query must be a string.")

        bytes_query = query.encode(_ENCODING)
        selectors_list = lxb_css_selectors_parse(self.parser, <lxb_char_t *> bytes_query, <size_t>len(bytes_query))

        if selectors_list == NULL:
            raise SelectolaxError("Can't parse CSS selector.")

        # The callbacks read `current_node` and append to `results`.
        self.current_node = node
        self.results = []
        if only_first:
            status = lxb_selectors_find(self.selectors, node.node, selectors_list,
                                        <lxb_selectors_cb_f>css_finder_callback_first, <void*>self)
        else:
            status = lxb_selectors_find(self.selectors, node.node, selectors_list,
                                        <lxb_selectors_cb_f>css_finder_callback, <void*>self)
        # Hand the caller its own list and drop our references before
        # releasing the parsed selector list.
        results = list(self.results)
        self.results = []
        self.current_node = None
        lxb_css_selector_list_destroy_memory(selectors_list)
        return results

    cpdef int any_matches(self, str query, LexborNode node) except -1:
        """Return 1 if `query` matches at least once when searching from `node`, else 0.

        Raises
        ------
        TypeError
            If `query` is not a string (e.g. ``None``).
        SelectolaxError
            If the selector can't be parsed or the search reports an error.
        """
        cdef lxb_css_selector_list_t * selectors_list
        cdef lxb_status_t status
        cdef int result

        if not isinstance(query, str):
            raise TypeError("Query must be a string.")

        bytes_query = query.encode(_ENCODING)
        # The length must be the size of the encoded buffer, not the number of
        # characters in `query`: they differ as soon as the selector contains
        # non-ASCII text, which would truncate the selector.
        selectors_list = lxb_css_selectors_parse(self.parser, <lxb_char_t *> bytes_query, <size_t> len(bytes_query))

        if selectors_list == NULL:
            PyErr_SetObject(SelectolaxError, "Can't parse CSS selector.")
            return -1

        self.results = []
        status = lxb_selectors_find(self.selectors, node.node, selectors_list,
                                    <lxb_selectors_cb_f> css_matcher_callback, <void *> self)
        if status != LXB_STATUS_OK:
            lxb_css_selector_list_destroy_memory(selectors_list)
            PyErr_SetObject(SelectolaxError, "Can't parse CSS selector.")
            return -1

        # The matcher callback appends one marker per match it reports.
        result = PyList_GET_SIZE(self.results) > 0
        self.results = []
        lxb_css_selector_list_destroy_memory(selectors_list)
        return result

    def __dealloc__(self):
        # Each object is released only if it was actually created.
        if self.selectors != NULL:
            lxb_selectors_destroy(self.selectors, True)
        if self.parser != NULL:
            lxb_css_parser_destroy(self.parser, True)
        if self.css_selectors != NULL:
            lxb_css_selectors_destroy(self.css_selectors, True)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
cdef class LexborSelector:
    """An advanced CSS selector that supports additional operations.

    Think of it as a toolkit that mimics some of the features of XPath.

    The filtering methods (`text_contains`, `attribute_longer_than`) narrow the
    current list of matches in place and return the selector itself, so calls
    can be chained. The `any_*` variants only test and leave the matches as is.

    Please note, this is an experimental feature that can change in the future.
    """
    cdef LexborNode node
    cdef list nodes

    def __init__(self, LexborNode node, query):
        self.node = node
        # An empty/None query keeps the starting node itself as the only match.
        self.nodes = self.node.parser.selector.find(query, self.node) if query else [node, ]

    cpdef css(self, str query):
        """Evaluate CSS selector against current scope."""
        raise NotImplementedError("This features is not supported by the lexbor backend. Please use Modest backend.")

    @property
    def matches(self) -> list:
        """Returns all possible matches"""
        return self.nodes

    @property
    def any_matches(self) -> bool:
        """Returns True if there are any matches"""
        return bool(self.nodes)

    def text_contains(self, str text, bool deep=True, str separator='', bool strip=False) -> LexborSelector:
        """Filter all current matches given text.

        Only nodes whose text (as produced by ``node.text(deep=..., separator=...,
        strip=...)``) contains `text` are kept.
        """
        cdef list nodes = []
        for node in self.nodes:
            node_text = node.text(deep=deep, separator=separator, strip=strip)
            if node_text and text in node_text:
                nodes.append(node)
        self.nodes = nodes
        return self

    def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False) -> bool:
        """Returns True if any node in the current search scope contains specified text"""
        cdef LexborNode node
        for node in self.nodes:
            node_text = node.text(deep=deep, separator=separator, strip=strip)
            if node_text and text in node_text:
                return True
        return False

    def attribute_longer_than(self, str attribute, int length, str start = None) -> LexborSelector:
        """Filter all current matches by attribute length.

        When `start` is given and occurs in the attribute value, only the part
        after its first occurrence is measured. Nodes for which the attribute
        lookup yields ``None`` are dropped.

        Similar to `string-length` in XPath.
        """
        cdef list nodes = []
        for node in self.nodes:
            attr = node.attributes.get(attribute)
            if attr is None:
                # No value to measure; calling len() on it would raise TypeError.
                continue
            if start and start in attr:
                attr = attr[attr.find(start) + len(start):]
            if len(attr) > length:
                nodes.append(node)
        self.nodes = nodes
        return self

    def any_attribute_longer_than(self, str attribute, int length, str start = None) -> bool:
        """Returns True if any match has the given attribute longer than a specified length.

        When `start` is given and occurs in the attribute value, only the part
        after its first occurrence is measured. Nodes for which the attribute
        lookup yields ``None`` are ignored.

        Similar to `string-length` in XPath.
        """
        cdef LexborNode node
        for node in self.nodes:
            attr = node.attributes.get(attribute)
            if attr is None:
                # No value to measure; calling len() on it would raise TypeError.
                continue
            if start and start in attr:
                attr = attr[attr.find(start) + len(start):]
            if len(attr) > length:
                return True
        return False

    def __bool__(self):
        return bool(self.nodes)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
cdef lxb_status_t css_finder_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
    # Match callback used when collecting all matches: `ctx` is the
    # LexborCSSSelector running the query. The matched node is wrapped with
    # the parser of the node being searched and appended to its result list.
    cdef LexborCSSSelector owner = <LexborCSSSelector> ctx
    cdef LexborNode wrapped = LexborNode.new(<lxb_dom_node_t *> node, owner.current_node.parser)
    owner.results.append(wrapped)
    # LXB_STATUS_OK is returned so the search carries on.
    return LXB_STATUS_OK
|
|
200
|
+
|
|
201
|
+
cdef lxb_status_t css_finder_callback_first(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
    # Match callback used when only the first match is wanted: `ctx` is the
    # LexborCSSSelector running the query. The matched node is wrapped and
    # stored exactly like in the collect-all callback.
    cdef LexborCSSSelector owner = <LexborCSSSelector> ctx
    cdef LexborNode wrapped = LexborNode.new(<lxb_dom_node_t *> node, owner.current_node.parser)
    owner.results.append(wrapped)
    # LXB_STATUS_STOP is returned instead of LXB_STATUS_OK to end the search.
    return LXB_STATUS_STOP
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
cdef lxb_status_t css_matcher_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
    # Match callback used for existence checks: `ctx` is the
    # LexborCSSSelector running the query. No node wrapper is built; a plain
    # True marker is appended so the caller can test whether anything matched.
    cdef LexborCSSSelector owner = <LexborCSSSelector> ctx
    owner.results.append(True)
    # LXB_STATUS_STOP is returned because one match is all the caller needs.
    return LXB_STATUS_STOP
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
include "../utils.pxi"
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def create_tag(tag: str):
    """
    Create a single empty node for the given HTML tag name.

    For example, the tag name `"div"` gives the node `"<div></div>"`.
    The node is built with the lexbor-backed parser class.
    """
    new_node = do_create_tag(tag, LexborHTMLParser)
    return new_node
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def parse_fragment(html: str):
    """
    Parse an HTML fragment into a list of nodes that correspond to the
    given HTML.

    Unlike HTMLParser, which inserts `<html>`, `<head>`, and `<body>` tags
    when they are missing, this function does not add those tags.
    """
    fragment_nodes = do_parse_fragment(html, LexborHTMLParser)
    return fragment_nodes
|