selectolax 0.4.4__cp310-cp310-macosx_10_9_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- selectolax/__init__.py +8 -0
- selectolax/base.pxi +4 -0
- selectolax/lexbor/attrs.pxi +120 -0
- selectolax/lexbor/node.pxi +1112 -0
- selectolax/lexbor/node_remove.pxi +29 -0
- selectolax/lexbor/selection.pxi +215 -0
- selectolax/lexbor/util.pxi +20 -0
- selectolax/lexbor.c +53768 -0
- selectolax/lexbor.cpython-310-darwin.so +0 -0
- selectolax/lexbor.pxd +599 -0
- selectolax/lexbor.pyi +1248 -0
- selectolax/lexbor.pyx +677 -0
- selectolax/modest/node.pxi +991 -0
- selectolax/modest/selection.pxi +195 -0
- selectolax/modest/util.pxi +20 -0
- selectolax/parser.c +47848 -0
- selectolax/parser.cpython-310-darwin.so +0 -0
- selectolax/parser.pxd +578 -0
- selectolax/parser.pyi +770 -0
- selectolax/parser.pyx +443 -0
- selectolax/py.typed +0 -0
- selectolax/utils.pxi +117 -0
- selectolax-0.4.4.dist-info/METADATA +222 -0
- selectolax-0.4.4.dist-info/RECORD +27 -0
- selectolax-0.4.4.dist-info/WHEEL +6 -0
- selectolax-0.4.4.dist-info/licenses/LICENSE +10 -0
- selectolax-0.4.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
cimport cython
|
|
2
|
+
from cpython.exc cimport PyErr_SetObject
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@cython.final
cdef class CSSSelector:
    """A CSS selector parsed with MyCSS and evaluated with the Modest finder.

    The selector string is parsed once in ``__init__``; ``find`` then runs the
    parsed selector list against a node scope.
    """

    cdef char *c_selector
    cdef mycss_entry_t *css_entry
    cdef modest_finder_t *finder
    cdef mycss_selectors_list_t *selectors_list
    # Owns the UTF-8 buffer that ``c_selector`` points into, so the pointer
    # does not dangle once ``__init__`` returns.
    cdef bytes _selector_bytes

    def __init__(self, str selector):
        """Parse ``selector`` and create the finder used by ``find``.

        Raises:
            RuntimeError: if the MyCSS object or its entry fails to initialise.
            ValueError: if ``selector`` is rejected by the CSS selector parser.
        """
        cdef bytes selector_pybyte = selector.encode('UTF-8')
        self._selector_bytes = selector_pybyte
        self.c_selector = selector_pybyte

        # In order to propagate errors these methods should return no value
        self._create_css_parser()
        self._prepare_selector(self.css_entry, self.c_selector, len(self.c_selector))
        self.finder = modest_finder_create_simple()

    cdef myhtml_collection_t* find(self, myhtml_tree_node_t* scope):
        """Find all possible matches.

        Returns the collection filled in by ``modest_finder_by_selectors_list``
        (NULL if the finder left it untouched). Callers in this file release a
        non-NULL result with ``myhtml_collection_destroy``.
        """
        cdef myhtml_collection_t *collection

        collection = NULL
        modest_finder_by_selectors_list(self.finder, scope, self.selectors_list, &collection)

        return collection

    cdef int _create_css_parser(self) except -1:
        """Create and initialise the MyCSS object and its entry.

        Stores the entry in ``self.css_entry``. Returns 0 on success; on
        failure sets ``RuntimeError`` and returns -1.
        """
        cdef mystatus_t status

        cdef mycss_t *mycss = mycss_create()
        status = mycss_init(mycss)

        if status != 0:
            PyErr_SetObject(RuntimeError, "Can't init MyCSS object.")
            return -1

        self.css_entry = mycss_entry_create()
        status = mycss_entry_init(mycss, self.css_entry)

        if status != 0:
            PyErr_SetObject(RuntimeError, "Can't init MyCSS Entry object.")
            return -1
        return 0

    cdef int _prepare_selector(self, mycss_entry_t *css_entry, const char *selector, size_t selector_size) except -1:
        """Parse ``selector`` into ``self.selectors_list``.

        Returns 0 on success; sets ``ValueError`` and returns -1 when parsing
        yields no list or a list flagged as bad.
        """
        cdef mystatus_t out_status
        self.selectors_list = mycss_selectors_parse(mycss_entry_selectors(css_entry), myencoding_t.MyENCODING_UTF_8,
                                                    selector, selector_size, &out_status)

        # ``flags`` is a bit field, so the BAD bit must be tested with a
        # bitwise AND; a logical ``and`` would treat any non-zero flag as bad.
        if (self.selectors_list == NULL) or (self.selectors_list.flags & MyCSS_SELECTORS_FLAGS_SELECTOR_BAD):
            PyErr_SetObject(ValueError, "Bad CSS Selectors: %s" % self.c_selector.decode('utf-8'))
            return -1
        return 0

    def __dealloc__(self):
        """Release the native objects that ``__init__`` managed to create."""
        cdef mycss_t *mycss

        # ``__init__`` can fail part-way through (encoding error, failed init,
        # bad selector), so any of these pointers may still be NULL here.
        if self.css_entry != NULL and self.selectors_list != NULL:
            mycss_selectors_list_destroy(mycss_entry_selectors(self.css_entry), self.selectors_list, 1)

        if self.finder != NULL:
            modest_finder_destroy(self.finder, 1)

        if self.css_entry != NULL:
            # Read the owning MyCSS object before the entry is destroyed.
            mycss = self.css_entry.mycss
            mycss_entry_destroy(self.css_entry, 1)
            mycss_destroy(mycss, 1)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
cdef class Selector:
    """An advanced CSS selector that supports additional operations.

    Think of it as a toolkit that mimics some of the features of XPath.

    Please note, this is an experimental feature that can change in the future.
    """
    cdef Node node
    cdef list nodes

    def __init__(self, Node node, str query):
        """custom init, because __cinit__ doesn't accept C types"""
        self.node = node
        # An empty query keeps the starting node itself as the only match.
        self.nodes = find_nodes(node.parser, node.node, query) if query else [node, ]

    cpdef css(self, str query):
        """Evaluate CSS selector against current scope.

        Replaces the current matches with every node found by ``query`` under
        each of them, and returns ``self`` so calls can be chained.
        """
        cdef Node current_node
        nodes = list()
        for node in self.nodes:
            current_node = node
            nodes.extend(find_nodes(self.node.parser, current_node.node, query))
        self.nodes = nodes
        return self

    @property
    def matches(self):
        """Returns all possible matches"""
        return self.nodes

    @property
    def any_matches(self):
        """Returns True if there are any matches"""
        return bool(self.nodes)

    def text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
        """Filter all current matches given text.

        Keeps only the nodes whose text contains ``text``. ``deep``,
        ``separator`` and ``strip`` are forwarded to ``Node.text``.
        Returns ``self``.
        """
        nodes = []
        cdef Node node
        for node in self.nodes:
            node_text = node.text(deep=deep, separator=separator, strip=strip)
            if node_text and text in node_text:
                nodes.append(node)
        self.nodes = nodes
        return self

    def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
        """Returns True if any node in the current search scope contains specified text

        ``deep``, ``separator`` and ``strip`` are forwarded to ``Node.text``.
        The current matches are left unchanged.
        """
        cdef Node node
        for node in self.nodes:
            node_text = node.text(deep=deep, separator=separator, strip=strip)
            if node_text and text in node_text:
                return True
        return False

    def attribute_longer_than(self, str attribute, int length, str start = None):
        """Filter all current matches by attribute length.

        Similar to `string-length` in XPath.

        Keeps only nodes whose ``attribute`` value is longer than ``length``.
        If ``start`` is given and occurs in the value, only the part after its
        first occurrence is measured. Nodes for which the attribute lookup
        yields ``None`` are dropped. Returns ``self``.
        """
        nodes = []
        for node in self.nodes:
            attr = node.attributes.get(attribute)
            if attr is None:
                # Nothing to measure; ``len(None)`` would raise TypeError.
                continue
            if start and start in attr:
                attr = attr[attr.find(start) + len(start):]
            if len(attr) > length:
                nodes.append(node)
        self.nodes = nodes
        return self

    def any_attribute_longer_than(self, str attribute, int length, str start = None):
        """Returns True if any match has the attribute longer than a specified length.

        Similar to `string-length` in XPath.

        If ``start`` is given and occurs in the value, only the part after its
        first occurrence is measured. Nodes for which the attribute lookup
        yields ``None`` are ignored. The current matches are left unchanged.
        """
        cdef Node node
        for node in self.nodes:
            attr = node.attributes.get(attribute)
            if attr is None:
                # Nothing to measure; ``len(None)`` would raise TypeError.
                continue
            if start and start in attr:
                attr = attr[attr.find(start) + len(start):]
            if len(attr) > length:
                return True
        return False

    def __bool__(self):
        """A selector is truthy when it currently has at least one match."""
        return bool(self.nodes)
|
|
158
|
+
|
|
159
|
+
cdef find_nodes(HTMLParser parser, myhtml_tree_node_t *node, str query):
    """Return a list of ``Node`` objects found by CSS ``query`` within ``node``.

    A fresh ``CSSSelector`` is built for ``query`` (its constructor raises on
    an invalid selector). Returns an empty list when the finder produces no
    collection.
    """
    cdef myhtml_collection_t *collection
    cdef CSSSelector selector = CSSSelector(query)
    cdef Node n
    cdef list result = []
    collection = selector.find(node)

    if collection == NULL:
        return result

    # Release the native collection even if wrapping or appending a node
    # raises, so it is not leaked on the error path.
    try:
        for i in range(collection.length):
            n = Node.new(collection.list[i], parser)
            result.append(n)
    finally:
        myhtml_collection_destroy(collection)
    return result
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
cdef bool find_matches(HTMLParser parser, myhtml_tree_node_t *node, tuple selectors):
    """Tell whether any selector in ``selectors`` matches something within ``node``.

    Selectors are tried in order and the search stops at the first one that
    yields a non-empty collection. Each collection is destroyed before the
    result is inspected further.
    """
    cdef myhtml_collection_t *found
    cdef CSSSelector css_selector
    cdef int match_count
    cdef str query

    for query in selectors:
        css_selector = CSSSelector(query)
        found = css_selector.find(node)

        if found != NULL:
            # Copy the length out first: the collection is freed right after.
            match_count = found.length
            myhtml_collection_destroy(found)
            if match_count > 0:
                return True
    return False
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
include "../utils.pxi"
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def create_tag(tag: str):
    """
    Create a single empty node for an HTML tag name.

    For example, the tag name `"div"` produces the node `"<div></div>"`.
    """
    new_node = do_create_tag(tag, HTMLParser)
    return new_node
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def parse_fragment(html: str):
    """
    Parse HTML into a list of Nodes that correspond to the given HTML.

    Unlike HTMLParser, which adds `<html>`, `<head>`, and `<body>` tags
    when they are missing, this function does not add these tags.
    """
    fragment_nodes = do_parse_fragment(html, HTMLParser)
    return fragment_nodes
|