selectolax 0.3.31__cp311-cp311-win_amd64.whl → 0.3.33__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +3 -5
- selectolax/lexbor/attrs.pxi +0 -1
- selectolax/lexbor/node.pxi +93 -41
- selectolax/lexbor/selection.pxi +27 -25
- selectolax/lexbor/util.pxi +1 -0
- selectolax/lexbor.c +6367 -6672
- selectolax/lexbor.cp311-win_amd64.pyd +0 -0
- selectolax/lexbor.pxd +32 -35
- selectolax/lexbor.pyi +111 -5
- selectolax/lexbor.pyx +43 -17
- selectolax/modest/node.pxi +37 -36
- selectolax/modest/selection.pxi +24 -22
- selectolax/modest/util.pxi +1 -0
- selectolax/parser.c +4484 -5266
- selectolax/parser.cp311-win_amd64.pyd +0 -0
- selectolax/parser.pxd +17 -20
- selectolax/parser.pyi +2 -2
- selectolax/parser.pyx +28 -31
- selectolax/utils.pxi +13 -3
- {selectolax-0.3.31.dist-info → selectolax-0.3.33.dist-info}/METADATA +3 -3
- selectolax-0.3.33.dist-info/RECORD +26 -0
- selectolax-0.3.31.dist-info/RECORD +0 -26
- {selectolax-0.3.31.dist-info → selectolax-0.3.33.dist-info}/WHEEL +0 -0
- {selectolax-0.3.31.dist-info → selectolax-0.3.33.dist-info}/licenses/LICENSE +0 -0
- {selectolax-0.3.31.dist-info → selectolax-0.3.33.dist-info}/top_level.txt +0 -0
selectolax/modest/node.pxi
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
cimport cython
|
|
2
|
+
from cpython.exc cimport PyErr_NoMemory
|
|
2
3
|
|
|
3
4
|
from libc.stdlib cimport free
|
|
4
5
|
from libc.stdlib cimport malloc
|
|
@@ -8,6 +9,7 @@ from libc.string cimport memcpy
|
|
|
8
9
|
DEF _STACK_SIZE = 100
|
|
9
10
|
DEF _ENCODING = 'UTF-8'
|
|
10
11
|
|
|
12
|
+
|
|
11
13
|
@cython.final
|
|
12
14
|
cdef class Stack:
|
|
13
15
|
def __cinit__(self, size_t capacity=25):
|
|
@@ -23,9 +25,10 @@ cdef class Stack:
|
|
|
23
25
|
cdef bint is_empty(self):
|
|
24
26
|
return self.top <= 0
|
|
25
27
|
|
|
26
|
-
cdef push(self, myhtml_tree_node_t* res):
|
|
28
|
+
cdef int push(self, myhtml_tree_node_t* res) except -1:
|
|
27
29
|
if self.top >= self.capacity:
|
|
28
|
-
self.resize()
|
|
30
|
+
if self.resize() < 0:
|
|
31
|
+
return -1
|
|
29
32
|
self._stack[self.top] = res
|
|
30
33
|
self.top += 1
|
|
31
34
|
|
|
@@ -33,10 +36,13 @@ cdef class Stack:
|
|
|
33
36
|
self.top = self.top - 1
|
|
34
37
|
return self._stack[self.top]
|
|
35
38
|
|
|
36
|
-
cdef resize(self):
|
|
39
|
+
cdef int resize(self) except -1:
|
|
37
40
|
self.capacity *= 2
|
|
38
41
|
self._stack = <myhtml_tree_node_t**> realloc(<void*> self._stack, self.capacity * sizeof(myhtml_tree_node_t))
|
|
39
|
-
|
|
42
|
+
if self._stack == NULL:
|
|
43
|
+
PyErr_NoMemory()
|
|
44
|
+
return -1
|
|
45
|
+
return 0
|
|
40
46
|
|
|
41
47
|
cdef class _Attributes:
|
|
42
48
|
"""A dict-like object that represents attributes."""
|
|
@@ -130,25 +136,24 @@ cdef class _Attributes:
|
|
|
130
136
|
tag_name = c_text.decode(_ENCODING, 'ignore') if c_text != NULL else 'unknown'
|
|
131
137
|
return "<%s attributes, %s items>" % (tag_name, len(self))
|
|
132
138
|
|
|
133
|
-
|
|
134
|
-
|
|
135
139
|
ctypedef fused str_or_Node:
|
|
136
140
|
str
|
|
137
141
|
bytes
|
|
138
142
|
Node
|
|
139
143
|
|
|
140
|
-
|
|
141
144
|
cdef class Node:
|
|
142
145
|
"""A class that represents HTML node (element)."""
|
|
143
146
|
cdef myhtml_tree_node_t *node
|
|
144
147
|
cdef public HTMLParser parser
|
|
145
148
|
|
|
146
|
-
|
|
147
|
-
cdef
|
|
148
|
-
# custom
|
|
149
|
-
|
|
149
|
+
@staticmethod
|
|
150
|
+
cdef Node new(myhtml_tree_node_t *node, HTMLParser parser):
|
|
151
|
+
# custom __init__ for C, because __cinit__ doesn't accept C types
|
|
152
|
+
cdef Node cls = Node.__new__(Node)
|
|
153
|
+
cls.node = node
|
|
150
154
|
# Keep reference to the selector object, so myhtml structures will not be garbage collected prematurely
|
|
151
|
-
|
|
155
|
+
cls.parser = parser
|
|
156
|
+
return cls
|
|
152
157
|
|
|
153
158
|
@property
|
|
154
159
|
def attributes(self):
|
|
@@ -288,7 +293,7 @@ cdef class Node:
|
|
|
288
293
|
cdef inline _text_deep(self, myhtml_tree_node_t *node, separator='', strip=False):
|
|
289
294
|
text = ""
|
|
290
295
|
cdef Stack stack = Stack(_STACK_SIZE)
|
|
291
|
-
cdef myhtml_tree_node_t* current_node = NULL
|
|
296
|
+
cdef myhtml_tree_node_t* current_node = NULL
|
|
292
297
|
|
|
293
298
|
if node.tag_id == MyHTML_TAG__TEXT:
|
|
294
299
|
c_text = myhtml_node_text(node, NULL)
|
|
@@ -341,12 +346,10 @@ cdef class Node:
|
|
|
341
346
|
node = node.next
|
|
342
347
|
continue
|
|
343
348
|
|
|
344
|
-
next_node = Node()
|
|
345
|
-
next_node._init(node, self.parser)
|
|
349
|
+
next_node = Node.new(node, self.parser)
|
|
346
350
|
yield next_node
|
|
347
351
|
node = node.next
|
|
348
352
|
|
|
349
|
-
|
|
350
353
|
def traverse(self, include_text=False):
|
|
351
354
|
"""Iterate over all child and next nodes starting from the current level.
|
|
352
355
|
|
|
@@ -360,16 +363,15 @@ cdef class Node:
|
|
|
360
363
|
node
|
|
361
364
|
"""
|
|
362
365
|
cdef Stack stack = Stack(_STACK_SIZE)
|
|
363
|
-
cdef myhtml_tree_node_t* current_node = NULL
|
|
364
|
-
cdef Node next_node
|
|
366
|
+
cdef myhtml_tree_node_t* current_node = NULL
|
|
367
|
+
cdef Node next_node
|
|
365
368
|
|
|
366
369
|
stack.push(self.node)
|
|
367
370
|
|
|
368
371
|
while not stack.is_empty():
|
|
369
372
|
current_node = stack.pop()
|
|
370
373
|
if current_node != NULL and not (current_node.tag_id == MyHTML_TAG__TEXT and not include_text):
|
|
371
|
-
next_node = Node()
|
|
372
|
-
next_node._init(current_node, self.parser)
|
|
374
|
+
next_node = Node.new(current_node, self.parser)
|
|
373
375
|
yield next_node
|
|
374
376
|
|
|
375
377
|
if current_node.next is not NULL:
|
|
@@ -398,8 +400,7 @@ cdef class Node:
|
|
|
398
400
|
"""Return the child node."""
|
|
399
401
|
cdef Node node
|
|
400
402
|
if self.node.child:
|
|
401
|
-
node = Node()
|
|
402
|
-
node._init(self.node.child, self.parser)
|
|
403
|
+
node = Node.new(self.node.child, self.parser)
|
|
403
404
|
return node
|
|
404
405
|
return None
|
|
405
406
|
|
|
@@ -408,8 +409,7 @@ cdef class Node:
|
|
|
408
409
|
"""Return the parent node."""
|
|
409
410
|
cdef Node node
|
|
410
411
|
if self.node.parent:
|
|
411
|
-
node = Node()
|
|
412
|
-
node._init(self.node.parent, self.parser)
|
|
412
|
+
node = Node.new(self.node.parent, self.parser)
|
|
413
413
|
return node
|
|
414
414
|
return None
|
|
415
415
|
|
|
@@ -418,8 +418,7 @@ cdef class Node:
|
|
|
418
418
|
"""Return next node."""
|
|
419
419
|
cdef Node node
|
|
420
420
|
if self.node.next:
|
|
421
|
-
node = Node()
|
|
422
|
-
node._init(self.node.next, self.parser)
|
|
421
|
+
node = Node.new(self.node.next, self.parser)
|
|
423
422
|
return node
|
|
424
423
|
return None
|
|
425
424
|
|
|
@@ -428,8 +427,7 @@ cdef class Node:
|
|
|
428
427
|
"""Return previous node."""
|
|
429
428
|
cdef Node node
|
|
430
429
|
if self.node.prev:
|
|
431
|
-
node = Node()
|
|
432
|
-
node._init(self.node.prev, self.parser)
|
|
430
|
+
node = Node.new(self.node.prev, self.parser)
|
|
433
431
|
return node
|
|
434
432
|
return None
|
|
435
433
|
|
|
@@ -438,8 +436,7 @@ cdef class Node:
|
|
|
438
436
|
"""Return last child node."""
|
|
439
437
|
cdef Node node
|
|
440
438
|
if self.node.last_child:
|
|
441
|
-
node = Node()
|
|
442
|
-
node._init(self.node.last_child, self.parser)
|
|
439
|
+
node = Node.new(self.node.last_child, self.parser)
|
|
443
440
|
return node
|
|
444
441
|
return None
|
|
445
442
|
|
|
@@ -539,8 +536,8 @@ cdef class Node:
|
|
|
539
536
|
if delete_empty:
|
|
540
537
|
myhtml_node_delete(self.node)
|
|
541
538
|
return
|
|
542
|
-
cdef myhtml_tree_node_t* next_node
|
|
543
|
-
cdef myhtml_tree_node_t* current_node
|
|
539
|
+
cdef myhtml_tree_node_t* next_node
|
|
540
|
+
cdef myhtml_tree_node_t* current_node
|
|
544
541
|
|
|
545
542
|
if self.node.child.next != NULL:
|
|
546
543
|
current_node = self.node.child
|
|
@@ -574,6 +571,8 @@ cdef class Node:
|
|
|
574
571
|
'<html><body><div>Hello world!</div></body></html>'
|
|
575
572
|
|
|
576
573
|
"""
|
|
574
|
+
# ensure cython can recast element to a Node so that decompose will be called sooner.
|
|
575
|
+
cdef Node element
|
|
577
576
|
for tag in tags:
|
|
578
577
|
for element in self.css(tag):
|
|
579
578
|
element.decompose(recursive=recursive)
|
|
@@ -600,7 +599,7 @@ cdef class Node:
|
|
|
600
599
|
|
|
601
600
|
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
|
|
602
601
|
"""
|
|
603
|
-
|
|
602
|
+
cdef Node element
|
|
604
603
|
for tag in tags:
|
|
605
604
|
for element in self.css(tag):
|
|
606
605
|
element.unwrap(delete_empty)
|
|
@@ -788,7 +787,7 @@ cdef class Node:
|
|
|
788
787
|
|
|
789
788
|
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
|
|
790
789
|
"""
|
|
791
|
-
|
|
790
|
+
cdef Node element
|
|
792
791
|
for tag in tags:
|
|
793
792
|
for element in self.css(tag):
|
|
794
793
|
element.unwrap(delete_empty)
|
|
@@ -847,6 +846,7 @@ cdef class Node:
|
|
|
847
846
|
The query to check.
|
|
848
847
|
|
|
849
848
|
"""
|
|
849
|
+
cdef Node node
|
|
850
850
|
if self.parser.cached_script_texts is None:
|
|
851
851
|
nodes = find_nodes(self.parser, self.node, 'script')
|
|
852
852
|
text_nodes = []
|
|
@@ -895,6 +895,7 @@ cdef class Node:
|
|
|
895
895
|
if not isinstance(other, Node):
|
|
896
896
|
return False
|
|
897
897
|
return self.html == other.html
|
|
898
|
+
|
|
898
899
|
@property
|
|
899
900
|
def text_content(self):
|
|
900
901
|
"""Returns the text of the node if it is a text node.
|
|
@@ -948,8 +949,8 @@ cdef class Node:
|
|
|
948
949
|
while not stack.is_empty():
|
|
949
950
|
current_node = stack.pop()
|
|
950
951
|
|
|
951
|
-
if current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and
|
|
952
|
-
|
|
952
|
+
if (current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and
|
|
953
|
+
current_node.prev.tag_id == MyHTML_TAG__TEXT):
|
|
953
954
|
left_text = myhtml_node_text(current_node.prev, &left_length)
|
|
954
955
|
right_text = myhtml_node_text(current_node, &right_length)
|
|
955
956
|
if left_text and right_text:
|
selectolax/modest/selection.pxi
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
cimport cython
|
|
2
|
+
from cpython.exc cimport PyErr_SetObject
|
|
3
|
+
|
|
2
4
|
|
|
3
5
|
@cython.final
|
|
4
6
|
cdef class CSSSelector:
|
|
@@ -28,35 +30,33 @@ cdef class CSSSelector:
|
|
|
28
30
|
|
|
29
31
|
return collection
|
|
30
32
|
|
|
31
|
-
|
|
32
|
-
cdef _create_css_parser(self):
|
|
33
|
+
cdef int _create_css_parser(self) except -1:
|
|
33
34
|
cdef mystatus_t status
|
|
34
35
|
|
|
35
36
|
cdef mycss_t *mycss = mycss_create()
|
|
36
37
|
status = mycss_init(mycss)
|
|
37
38
|
|
|
38
39
|
if status != 0:
|
|
39
|
-
|
|
40
|
-
|
|
40
|
+
PyErr_SetObject(RuntimeError, "Can't init MyCSS object.")
|
|
41
|
+
return -1
|
|
41
42
|
|
|
42
43
|
self.css_entry = mycss_entry_create()
|
|
43
44
|
status = mycss_entry_init(mycss, self.css_entry)
|
|
44
45
|
|
|
45
46
|
if status != 0:
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
47
|
+
PyErr_SetObject(RuntimeError, "Can't init MyCSS Entry object.")
|
|
48
|
+
return -1
|
|
49
|
+
return 0
|
|
49
50
|
|
|
50
|
-
cdef _prepare_selector(self, mycss_entry_t *css_entry,
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
myencoding_t.MyENCODING_UTF_8,
|
|
55
|
-
selector, selector_size,
|
|
56
|
-
&out_status)
|
|
51
|
+
cdef int _prepare_selector(self, mycss_entry_t *css_entry, const char *selector, size_t selector_size) except -1:
|
|
52
|
+
cdef mystatus_t out_status
|
|
53
|
+
self.selectors_list = mycss_selectors_parse(mycss_entry_selectors(css_entry), myencoding_t.MyENCODING_UTF_8,
|
|
54
|
+
selector, selector_size, &out_status)
|
|
57
55
|
|
|
58
56
|
if (self.selectors_list == NULL) or (self.selectors_list.flags and MyCSS_SELECTORS_FLAGS_SELECTOR_BAD):
|
|
59
|
-
|
|
57
|
+
PyErr_SetObject(ValueError, "Bad CSS Selectors: %s" % self.c_selector.decode('utf-8'))
|
|
58
|
+
return -1
|
|
59
|
+
return 0
|
|
60
60
|
|
|
61
61
|
def __dealloc__(self):
|
|
62
62
|
mycss_selectors_list_destroy(mycss_entry_selectors(self.css_entry), self.selectors_list, 1)
|
|
@@ -77,12 +77,11 @@ cdef class Selector:
|
|
|
77
77
|
cdef Node node
|
|
78
78
|
cdef list nodes
|
|
79
79
|
|
|
80
|
-
def __init__(self, Node node, query):
|
|
80
|
+
def __init__(self, Node node, str query):
|
|
81
81
|
"""custom init, because __cinit__ doesn't accept C types"""
|
|
82
82
|
self.node = node
|
|
83
83
|
self.nodes = find_nodes(node.parser, node.node, query) if query else [node, ]
|
|
84
84
|
|
|
85
|
-
|
|
86
85
|
cpdef css(self, str query):
|
|
87
86
|
"""Evaluate CSS selector against current scope."""
|
|
88
87
|
cdef Node current_node
|
|
@@ -106,6 +105,7 @@ cdef class Selector:
|
|
|
106
105
|
def text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
|
|
107
106
|
"""Filter all current matches given text."""
|
|
108
107
|
nodes = []
|
|
108
|
+
cdef Node node
|
|
109
109
|
for node in self.nodes:
|
|
110
110
|
node_text = node.text(deep=deep, separator=separator, strip=strip)
|
|
111
111
|
if node_text and text in node_text:
|
|
@@ -116,6 +116,7 @@ cdef class Selector:
|
|
|
116
116
|
def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
|
|
117
117
|
"""Returns True if any node in the current search scope contains specified text"""
|
|
118
118
|
nodes = []
|
|
119
|
+
cdef Node node
|
|
119
120
|
for node in self.nodes:
|
|
120
121
|
node_text = node.text(deep=deep, separator=separator, strip=strip)
|
|
121
122
|
if node_text and text in node_text:
|
|
@@ -142,7 +143,8 @@ cdef class Selector:
|
|
|
142
143
|
|
|
143
144
|
Similar to `string-length` in XPath.
|
|
144
145
|
"""
|
|
145
|
-
nodes = []
|
|
146
|
+
cdef list nodes = []
|
|
147
|
+
cdef Node node
|
|
146
148
|
for node in self.nodes:
|
|
147
149
|
attr = node.attributes.get(attribute)
|
|
148
150
|
if attr and start and start in attr:
|
|
@@ -157,16 +159,15 @@ cdef class Selector:
|
|
|
157
159
|
cdef find_nodes(HTMLParser parser, myhtml_tree_node_t *node, str query):
|
|
158
160
|
cdef myhtml_collection_t *collection
|
|
159
161
|
cdef CSSSelector selector = CSSSelector(query)
|
|
160
|
-
|
|
161
|
-
result =
|
|
162
|
+
cdef Node n
|
|
163
|
+
cdef list result = []
|
|
162
164
|
collection = selector.find(node)
|
|
163
165
|
|
|
164
166
|
if collection == NULL:
|
|
165
167
|
return result
|
|
166
168
|
|
|
167
169
|
for i in range(collection.length):
|
|
168
|
-
n = Node()
|
|
169
|
-
n._init(collection.list[i], parser)
|
|
170
|
+
n = Node.new(collection.list[i], parser)
|
|
170
171
|
result.append(n)
|
|
171
172
|
myhtml_collection_destroy(collection)
|
|
172
173
|
return result
|
|
@@ -176,6 +177,7 @@ cdef bool find_matches(HTMLParser parser, myhtml_tree_node_t *node, tuple select
|
|
|
176
177
|
cdef myhtml_collection_t *collection
|
|
177
178
|
cdef CSSSelector selector
|
|
178
179
|
cdef int collection_size
|
|
180
|
+
cdef str query
|
|
179
181
|
|
|
180
182
|
for query in selectors:
|
|
181
183
|
selector = CSSSelector(query)
|