selectolax 0.3.29__cp312-cp312-win32.whl → 0.3.34__cp312-cp312-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +3 -5
- selectolax/lexbor/attrs.pxi +26 -9
- selectolax/lexbor/node.pxi +108 -47
- selectolax/lexbor/selection.pxi +34 -25
- selectolax/lexbor/util.pxi +1 -0
- selectolax/lexbor.c +52987 -55311
- selectolax/lexbor.cp312-win32.pyd +0 -0
- selectolax/lexbor.pxd +36 -40
- selectolax/lexbor.pyi +770 -65
- selectolax/lexbor.pyx +54 -17
- selectolax/modest/node.pxi +45 -42
- selectolax/modest/selection.pxi +24 -22
- selectolax/modest/util.pxi +1 -0
- selectolax/parser.c +50190 -52325
- selectolax/parser.cp312-win32.pyd +0 -0
- selectolax/parser.pxd +17 -20
- selectolax/parser.pyi +489 -45
- selectolax/parser.pyx +39 -31
- selectolax/utils.pxi +13 -3
- selectolax-0.3.34.dist-info/METADATA +32 -0
- selectolax-0.3.34.dist-info/RECORD +26 -0
- {selectolax-0.3.29.dist-info → selectolax-0.3.34.dist-info}/WHEEL +1 -1
- selectolax-0.3.29.dist-info/METADATA +0 -194
- selectolax-0.3.29.dist-info/RECORD +0 -26
- {selectolax-0.3.29.dist-info → selectolax-0.3.34.dist-info}/licenses/LICENSE +0 -0
- {selectolax-0.3.29.dist-info → selectolax-0.3.34.dist-info}/top_level.txt +0 -0
selectolax/__init__.py
CHANGED
|
@@ -2,9 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
__author__ = """Artem Golubin"""
|
|
5
|
-
__email__ =
|
|
6
|
-
__version__ =
|
|
5
|
+
__email__ = "me@rushter.com"
|
|
6
|
+
__version__ = "0.3.34"
|
|
7
7
|
|
|
8
|
-
from . import parser
|
|
9
|
-
from . import lexbor
|
|
10
|
-
from . import modest
|
|
8
|
+
from . import lexbor, modest, parser
|
selectolax/lexbor/attrs.pxi
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
cimport cython
|
|
2
2
|
|
|
3
|
+
|
|
3
4
|
@cython.final
|
|
4
5
|
cdef class LexborAttributes:
|
|
5
6
|
"""A dict-like object that represents attributes."""
|
|
@@ -23,16 +24,32 @@ cdef class LexborAttributes:
|
|
|
23
24
|
yield key.decode(_ENCODING)
|
|
24
25
|
attr = attr.next
|
|
25
26
|
|
|
26
|
-
def __setitem__(self, str key, value):
|
|
27
|
-
value =
|
|
27
|
+
def __setitem__(self, str key, object value):
|
|
28
|
+
value = value
|
|
28
29
|
bytes_key = key.encode(_ENCODING)
|
|
29
|
-
bytes_value = value.encode(_ENCODING)
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
30
|
+
bytes_value = value.encode(_ENCODING) if value else b""
|
|
31
|
+
cdef lxb_dom_attr_t *attr
|
|
32
|
+
cdef lxb_dom_document_t *doc
|
|
33
|
+
|
|
34
|
+
if value is None:
|
|
35
|
+
# N.B. This is suboptimal, but there is not API to set empty attributes
|
|
36
|
+
attr = lxb_dom_element_set_attribute(
|
|
37
|
+
<lxb_dom_element_t *> self.node,
|
|
38
|
+
<lxb_char_t *> bytes_key, len(bytes_key),
|
|
39
|
+
NULL, 0
|
|
40
|
+
)
|
|
41
|
+
doc = (<lxb_dom_node_t*>attr).owner_document
|
|
42
|
+
lexbor_str_destroy(attr.value, doc.text, 0)
|
|
43
|
+
attr.value = NULL
|
|
44
|
+
|
|
45
|
+
elif isinstance(value, str) or isinstance(value, unicode) :
|
|
46
|
+
lxb_dom_element_set_attribute(
|
|
47
|
+
<lxb_dom_element_t *> self.node,
|
|
48
|
+
<lxb_char_t *> bytes_key, len(bytes_key),
|
|
49
|
+
<lxb_char_t *> bytes_value, len(bytes_value),
|
|
50
|
+
)
|
|
51
|
+
else:
|
|
52
|
+
raise TypeError("Expected str or unicode, got %s" % type(value))
|
|
36
53
|
|
|
37
54
|
def __delitem__(self, key):
|
|
38
55
|
try:
|
selectolax/lexbor/node.pxi
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
cimport cython
|
|
2
|
+
from cpython.exc cimport PyErr_SetNone
|
|
2
3
|
|
|
3
4
|
_TAG_TO_NAME = {
|
|
4
5
|
0x0005: "- doctype",
|
|
@@ -6,26 +7,29 @@ _TAG_TO_NAME = {
|
|
|
6
7
|
0x0004: "-comment",
|
|
7
8
|
}
|
|
8
9
|
ctypedef fused str_or_LexborNode:
|
|
9
|
-
|
|
10
|
+
str
|
|
10
11
|
bytes
|
|
11
12
|
LexborNode
|
|
12
13
|
|
|
13
14
|
cdef inline bytes to_bytes(str_or_LexborNode value):
|
|
14
15
|
cdef bytes bytes_val
|
|
15
|
-
if isinstance(value,
|
|
16
|
-
bytes_val = value.encode(
|
|
16
|
+
if isinstance(value, unicode):
|
|
17
|
+
bytes_val = <bytes>value.encode("utf-8")
|
|
17
18
|
elif isinstance(value, bytes):
|
|
18
|
-
bytes_val =
|
|
19
|
+
bytes_val = <bytes>value
|
|
19
20
|
return bytes_val
|
|
20
21
|
|
|
22
|
+
|
|
21
23
|
@cython.final
|
|
22
24
|
cdef class LexborNode:
|
|
23
25
|
"""A class that represents HTML node (element)."""
|
|
24
26
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
27
|
+
@staticmethod
|
|
28
|
+
cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser):
|
|
29
|
+
cdef LexborNode lxbnode = LexborNode.__new__(LexborNode)
|
|
30
|
+
lxbnode.node = node
|
|
31
|
+
lxbnode.parser = parser
|
|
32
|
+
return lxbnode
|
|
29
33
|
|
|
30
34
|
@property
|
|
31
35
|
def mem_id(self):
|
|
@@ -41,8 +45,7 @@ cdef class LexborNode:
|
|
|
41
45
|
"""Return the first child node."""
|
|
42
46
|
cdef LexborNode node
|
|
43
47
|
if self.node.first_child:
|
|
44
|
-
node = LexborNode()
|
|
45
|
-
node._cinit(<lxb_dom_node_t *> self.node.first_child, self.parser)
|
|
48
|
+
node = LexborNode.new(<lxb_dom_node_t *> self.node.first_child, self.parser)
|
|
46
49
|
return node
|
|
47
50
|
return None
|
|
48
51
|
|
|
@@ -50,9 +53,8 @@ cdef class LexborNode:
|
|
|
50
53
|
def parent(self):
|
|
51
54
|
"""Return the parent node."""
|
|
52
55
|
cdef LexborNode node
|
|
53
|
-
if self.node.parent:
|
|
54
|
-
node = LexborNode()
|
|
55
|
-
node._cinit(<lxb_dom_node_t *> self.node.parent, self.parser)
|
|
56
|
+
if self.node.parent != NULL:
|
|
57
|
+
node = LexborNode.new(<lxb_dom_node_t *> self.node.parent, self.parser)
|
|
56
58
|
return node
|
|
57
59
|
return None
|
|
58
60
|
|
|
@@ -60,9 +62,8 @@ cdef class LexborNode:
|
|
|
60
62
|
def next(self):
|
|
61
63
|
"""Return next node."""
|
|
62
64
|
cdef LexborNode node
|
|
63
|
-
if self.node.next:
|
|
64
|
-
node = LexborNode()
|
|
65
|
-
node._cinit(<lxb_dom_node_t *> self.node.next, self.parser)
|
|
65
|
+
if self.node.next != NULL:
|
|
66
|
+
node = LexborNode.new(<lxb_dom_node_t *> self.node.next, self.parser)
|
|
66
67
|
return node
|
|
67
68
|
return None
|
|
68
69
|
|
|
@@ -70,9 +71,8 @@ cdef class LexborNode:
|
|
|
70
71
|
def prev(self):
|
|
71
72
|
"""Return previous node."""
|
|
72
73
|
cdef LexborNode node
|
|
73
|
-
if self.node.prev:
|
|
74
|
-
node = LexborNode()
|
|
75
|
-
node._cinit(<lxb_dom_node_t *> self.node.prev, self.parser)
|
|
74
|
+
if self.node.prev != NULL:
|
|
75
|
+
node = LexborNode.new(<lxb_dom_node_t *> self.node.prev, self.parser)
|
|
76
76
|
return node
|
|
77
77
|
return None
|
|
78
78
|
|
|
@@ -80,9 +80,8 @@ cdef class LexborNode:
|
|
|
80
80
|
def last_child(self):
|
|
81
81
|
"""Return last child node."""
|
|
82
82
|
cdef LexborNode node
|
|
83
|
-
if self.node.last_child:
|
|
84
|
-
node = LexborNode()
|
|
85
|
-
node._cinit(<lxb_dom_node_t *> self.node.last_child, self.parser)
|
|
83
|
+
if self.node.last_child != NULL:
|
|
84
|
+
node = LexborNode.new(<lxb_dom_node_t *> self.node.last_child, self.parser)
|
|
86
85
|
return node
|
|
87
86
|
return None
|
|
88
87
|
|
|
@@ -181,6 +180,12 @@ cdef class LexborNode:
|
|
|
181
180
|
Matches pattern `query` against HTML tree.
|
|
182
181
|
`CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
|
|
183
182
|
|
|
183
|
+
Special selectors:
|
|
184
|
+
|
|
185
|
+
- parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
|
|
186
|
+
- parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
|
|
187
|
+
|
|
188
|
+
|
|
184
189
|
Parameters
|
|
185
190
|
----------
|
|
186
191
|
query : str
|
|
@@ -256,7 +261,6 @@ cdef class LexborNode:
|
|
|
256
261
|
text = c_text.decode(_ENCODING)
|
|
257
262
|
return text
|
|
258
263
|
|
|
259
|
-
|
|
260
264
|
def decompose(self, bool recursive=True):
|
|
261
265
|
"""Remove the current node from the tree.
|
|
262
266
|
|
|
@@ -273,6 +277,9 @@ cdef class LexborNode:
|
|
|
273
277
|
>>> tag.decompose()
|
|
274
278
|
|
|
275
279
|
"""
|
|
280
|
+
if self.node == <lxb_dom_node_t *> lxb_dom_document_root(&self.parser.document.dom_document):
|
|
281
|
+
raise SelectolaxError("Decomposing the root node is not allowed.")
|
|
282
|
+
|
|
276
283
|
if recursive:
|
|
277
284
|
lxb_dom_node_destroy_deep(<lxb_dom_node_t *> self.node)
|
|
278
285
|
else:
|
|
@@ -298,11 +305,11 @@ cdef class LexborNode:
|
|
|
298
305
|
'<html><body><div>Hello world!</div></body></html>'
|
|
299
306
|
|
|
300
307
|
"""
|
|
308
|
+
cdef LexborNode element
|
|
301
309
|
for tag in tags:
|
|
302
310
|
for element in self.css(tag):
|
|
303
311
|
element.decompose(recursive=recursive)
|
|
304
312
|
|
|
305
|
-
|
|
306
313
|
@property
|
|
307
314
|
def attributes(self):
|
|
308
315
|
"""Get all attributes that belong to the current node.
|
|
@@ -410,13 +417,11 @@ cdef class LexborNode:
|
|
|
410
417
|
node = node.next
|
|
411
418
|
continue
|
|
412
419
|
|
|
413
|
-
next_node = LexborNode()
|
|
414
|
-
next_node._cinit(<lxb_dom_node_t *> node, self.parser)
|
|
420
|
+
next_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
|
|
415
421
|
yield next_node
|
|
416
422
|
node = node.next
|
|
417
423
|
|
|
418
|
-
|
|
419
|
-
def unwrap(self, delete_empty=False):
|
|
424
|
+
def unwrap(self, bint delete_empty=False):
|
|
420
425
|
"""Replace node with whatever is inside this node.
|
|
421
426
|
|
|
422
427
|
Parameters
|
|
@@ -431,15 +436,15 @@ cdef class LexborNode:
|
|
|
431
436
|
>>> tree.css_first('i').unwrap()
|
|
432
437
|
>>> tree.html
|
|
433
438
|
'<html><head></head><body><div>Hello world!</div></body></html>'
|
|
434
|
-
|
|
439
|
+
|
|
435
440
|
Note: by default, empty tags are ignored, use "delete_empty" to change this.
|
|
436
441
|
"""
|
|
437
442
|
if self.node.first_child == NULL:
|
|
438
443
|
if delete_empty:
|
|
439
444
|
lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
|
|
440
445
|
return
|
|
441
|
-
cdef lxb_dom_node_t* next_node
|
|
442
|
-
cdef lxb_dom_node_t* current_node
|
|
446
|
+
cdef lxb_dom_node_t* next_node
|
|
447
|
+
cdef lxb_dom_node_t* current_node
|
|
443
448
|
|
|
444
449
|
if self.node.first_child.next != NULL:
|
|
445
450
|
current_node = self.node.first_child
|
|
@@ -453,7 +458,7 @@ cdef class LexborNode:
|
|
|
453
458
|
lxb_dom_node_insert_before(self.node, self.node.first_child)
|
|
454
459
|
lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
|
|
455
460
|
|
|
456
|
-
def unwrap_tags(self, list tags, delete_empty = False):
|
|
461
|
+
def unwrap_tags(self, list tags, bint delete_empty = False):
|
|
457
462
|
"""Unwraps specified tags from the HTML tree.
|
|
458
463
|
|
|
459
464
|
Works the same as the ``unwrap`` method, but applied to a list of tags.
|
|
@@ -472,14 +477,50 @@ cdef class LexborNode:
|
|
|
472
477
|
>>> tree.body.unwrap_tags(['i','a'])
|
|
473
478
|
>>> tree.body.html
|
|
474
479
|
'<body><div>Hello world!</div></body>'
|
|
475
|
-
|
|
480
|
+
|
|
476
481
|
Note: by default, empty tags are ignored, use "delete_empty" to change this.
|
|
477
482
|
"""
|
|
478
|
-
|
|
483
|
+
cdef LexborNode element
|
|
479
484
|
for tag in tags:
|
|
480
485
|
for element in self.css(tag):
|
|
481
486
|
element.unwrap(delete_empty)
|
|
482
487
|
|
|
488
|
+
def merge_text_nodes(self):
|
|
489
|
+
"""Iterates over all text nodes and merges all text nodes that are close to each other.
|
|
490
|
+
|
|
491
|
+
This is useful for text extraction.
|
|
492
|
+
Use it when you need to strip HTML tags and merge "dangling" text.
|
|
493
|
+
|
|
494
|
+
Examples
|
|
495
|
+
--------
|
|
496
|
+
|
|
497
|
+
>>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
|
|
498
|
+
>>> node = tree.css_first('div')
|
|
499
|
+
>>> tree.unwrap_tags(["strong"])
|
|
500
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
501
|
+
"J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
|
|
502
|
+
>>> node.merge_text_nodes()
|
|
503
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
504
|
+
"John Doe"
|
|
505
|
+
"""
|
|
506
|
+
cdef lxb_dom_node_t *node = self.node.first_child
|
|
507
|
+
cdef lxb_dom_node_t *next_node
|
|
508
|
+
cdef lxb_char_t *left_text
|
|
509
|
+
cdef lxb_char_t *right_text
|
|
510
|
+
cdef size_t left_length, right_length
|
|
511
|
+
|
|
512
|
+
while node != NULL:
|
|
513
|
+
next_node = node.next
|
|
514
|
+
if node.type == LXB_DOM_NODE_TYPE_TEXT and node.prev and node.prev.type == LXB_DOM_NODE_TYPE_TEXT:
|
|
515
|
+
left_text = lxb_dom_node_text_content(node.prev, &left_length)
|
|
516
|
+
right_text = lxb_dom_node_text_content(node, &right_length)
|
|
517
|
+
if left_text and right_text:
|
|
518
|
+
combined = (<bytes>left_text[:left_length]) + (<bytes>right_text[:right_length])
|
|
519
|
+
lxb_dom_node_text_content_set(node, combined, len(combined))
|
|
520
|
+
lxb_dom_node_remove(node.prev)
|
|
521
|
+
if node.first_child:
|
|
522
|
+
LexborNode.new(node, self.parser).merge_text_nodes()
|
|
523
|
+
node = next_node
|
|
483
524
|
|
|
484
525
|
def traverse(self, include_text=False):
|
|
485
526
|
"""Iterate over all child and next nodes starting from the current level.
|
|
@@ -499,8 +540,7 @@ cdef class LexborNode:
|
|
|
499
540
|
|
|
500
541
|
while node != NULL:
|
|
501
542
|
if not (not include_text and node.type == LXB_DOM_NODE_TYPE_TEXT):
|
|
502
|
-
lxb_node = LexborNode()
|
|
503
|
-
lxb_node._cinit(<lxb_dom_node_t *> node, self.parser)
|
|
543
|
+
lxb_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
|
|
504
544
|
yield lxb_node
|
|
505
545
|
|
|
506
546
|
if node.first_child != NULL:
|
|
@@ -564,7 +604,6 @@ cdef class LexborNode:
|
|
|
564
604
|
else:
|
|
565
605
|
raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
|
|
566
606
|
|
|
567
|
-
|
|
568
607
|
def insert_before(self, str_or_LexborNode value):
|
|
569
608
|
"""
|
|
570
609
|
Insert a node before the current Node.
|
|
@@ -739,7 +778,7 @@ cdef class LexborNode:
|
|
|
739
778
|
>>> selector.child.raw_value
|
|
740
779
|
b'<test>'
|
|
741
780
|
"""
|
|
742
|
-
raise
|
|
781
|
+
raise NotImplementedError("This features is not supported by the lexbor backend. Please use Modest backend.")
|
|
743
782
|
|
|
744
783
|
def scripts_contain(self, str query):
|
|
745
784
|
"""Returns True if any of the script tags contain specified text.
|
|
@@ -752,6 +791,7 @@ cdef class LexborNode:
|
|
|
752
791
|
The query to check.
|
|
753
792
|
|
|
754
793
|
"""
|
|
794
|
+
cdef LexborNode node
|
|
755
795
|
if self.parser.cached_script_texts is None:
|
|
756
796
|
nodes = self.parser.selector.find('script', self)
|
|
757
797
|
text_nodes = []
|
|
@@ -776,6 +816,7 @@ cdef class LexborNode:
|
|
|
776
816
|
queries : tuple of str
|
|
777
817
|
|
|
778
818
|
"""
|
|
819
|
+
cdef LexborNode node
|
|
779
820
|
if self.parser.cached_script_srcs is None:
|
|
780
821
|
nodes = self.parser.selector.find('script', self)
|
|
781
822
|
src_nodes = []
|
|
@@ -831,31 +872,44 @@ cdef class LexborNode:
|
|
|
831
872
|
"""
|
|
832
873
|
cdef unsigned char * text
|
|
833
874
|
cdef lxb_dom_node_t* node = <lxb_dom_node_t*> self.node.first_child
|
|
834
|
-
|
|
835
|
-
container = TextContainer()
|
|
875
|
+
cdef TextContainer container
|
|
836
876
|
if self.node == NULL or self.node.type != LXB_DOM_NODE_TYPE_TEXT:
|
|
837
877
|
return None
|
|
878
|
+
|
|
838
879
|
text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> self.node).data)
|
|
839
880
|
if text != NULL:
|
|
881
|
+
container = TextContainer.new_with_defaults()
|
|
840
882
|
py_text = text.decode(_ENCODING)
|
|
841
883
|
container.append(py_text)
|
|
842
884
|
return container.text
|
|
885
|
+
|
|
886
|
+
|
|
887
|
+
@cython.internal
|
|
843
888
|
@cython.final
|
|
844
889
|
cdef class TextContainer:
|
|
845
890
|
cdef str _text
|
|
846
|
-
cdef
|
|
847
|
-
cdef
|
|
891
|
+
cdef str separator
|
|
892
|
+
cdef bint strip
|
|
893
|
+
|
|
894
|
+
@staticmethod
|
|
895
|
+
cdef TextContainer new_with_defaults():
|
|
896
|
+
cdef TextContainer cls = TextContainer.__new__(TextContainer)
|
|
897
|
+
cls._text = ''
|
|
898
|
+
cls.separator = ''
|
|
899
|
+
cls.strip = False
|
|
900
|
+
return cls
|
|
848
901
|
|
|
849
902
|
def __init__(self, str separator = '', bool strip = False):
|
|
850
903
|
self._text = ""
|
|
851
904
|
self.separator = separator
|
|
852
905
|
self.strip = strip
|
|
853
906
|
|
|
854
|
-
def append(self, node_text):
|
|
907
|
+
def append(self, str node_text):
|
|
855
908
|
if self.strip:
|
|
856
909
|
self._text += node_text.strip() + self.separator
|
|
857
910
|
else:
|
|
858
911
|
self._text += node_text + self.separator
|
|
912
|
+
|
|
859
913
|
@property
|
|
860
914
|
def text(self):
|
|
861
915
|
if self.separator and self._text and self._text.endswith(self.separator):
|
|
@@ -864,7 +918,7 @@ cdef class TextContainer:
|
|
|
864
918
|
|
|
865
919
|
|
|
866
920
|
cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
|
|
867
|
-
cdef unsigned char *text
|
|
921
|
+
cdef unsigned char *text
|
|
868
922
|
cdef lxb_tag_id_t tag_id = lxb_dom_node_tag_id_noi(node)
|
|
869
923
|
if tag_id != LXB_TAG__TEXT:
|
|
870
924
|
return LEXBOR_ACTION_OK
|
|
@@ -872,8 +926,15 @@ cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
|
|
|
872
926
|
text = <unsigned char*> lexbor_str_data_noi(&(<lxb_dom_text_t *> node).char_data.data)
|
|
873
927
|
if not text:
|
|
874
928
|
return LEXBOR_ACTION_OK
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
929
|
+
|
|
930
|
+
try:
|
|
931
|
+
py_str = text.decode(_ENCODING)
|
|
932
|
+
|
|
933
|
+
except Exception as e:
|
|
934
|
+
PyErr_SetNone(e)
|
|
935
|
+
return LEXBOR_ACTION_STOP
|
|
936
|
+
|
|
937
|
+
cdef TextContainer cls
|
|
938
|
+
cls = <TextContainer> ctx
|
|
878
939
|
cls.append(py_str)
|
|
879
940
|
return LEXBOR_ACTION_OK
|
selectolax/lexbor/selection.pxi
CHANGED
|
@@ -1,4 +1,7 @@
|
|
|
1
1
|
cimport cython
|
|
2
|
+
from cpython.exc cimport PyErr_SetObject
|
|
3
|
+
from cpython.list cimport PyList_GET_SIZE
|
|
4
|
+
|
|
2
5
|
|
|
3
6
|
@cython.final
|
|
4
7
|
cdef class LexborCSSSelector:
|
|
@@ -8,21 +11,22 @@ cdef class LexborCSSSelector:
|
|
|
8
11
|
self.results = []
|
|
9
12
|
self.current_node = None
|
|
10
13
|
|
|
11
|
-
cdef _create_css_parser(self):
|
|
14
|
+
cdef int _create_css_parser(self) except -1:
|
|
12
15
|
cdef lxb_status_t status
|
|
13
16
|
|
|
14
|
-
|
|
15
17
|
self.parser = lxb_css_parser_create()
|
|
16
18
|
status = lxb_css_parser_init(self.parser, NULL)
|
|
17
19
|
|
|
18
20
|
if status != LXB_STATUS_OK:
|
|
19
|
-
|
|
21
|
+
PyErr_SetObject(SelectolaxError, "Can't initialize CSS parser.")
|
|
22
|
+
return -1
|
|
20
23
|
|
|
21
24
|
self.css_selectors = lxb_css_selectors_create()
|
|
22
25
|
status = lxb_css_selectors_init(self.css_selectors)
|
|
23
26
|
|
|
24
27
|
if status != LXB_STATUS_OK:
|
|
25
|
-
|
|
28
|
+
PyErr_SetObject(SelectolaxError, "Can't initialize CSS selector.")
|
|
29
|
+
return -1
|
|
26
30
|
|
|
27
31
|
lxb_css_parser_selectors_set(self.parser, self.css_selectors)
|
|
28
32
|
|
|
@@ -30,14 +34,18 @@ cdef class LexborCSSSelector:
|
|
|
30
34
|
status = lxb_selectors_init(self.selectors)
|
|
31
35
|
lxb_selectors_opt_set(self.selectors, LXB_SELECTORS_OPT_MATCH_ROOT)
|
|
32
36
|
if status != LXB_STATUS_OK:
|
|
33
|
-
|
|
34
|
-
|
|
37
|
+
PyErr_SetObject(SelectolaxError, "Can't initialize CSS selector.")
|
|
38
|
+
return -1
|
|
39
|
+
return 0
|
|
35
40
|
|
|
36
|
-
cpdef find(self, str query, LexborNode node):
|
|
41
|
+
cpdef list find(self, str query, LexborNode node):
|
|
37
42
|
cdef lxb_css_selector_list_t* selectors
|
|
38
43
|
cdef lxb_char_t* c_selector
|
|
39
44
|
cdef lxb_css_selector_list_t * selectors_list
|
|
40
45
|
|
|
46
|
+
if not isinstance(query, str):
|
|
47
|
+
raise TypeError("Query must be a string.")
|
|
48
|
+
|
|
41
49
|
bytes_query = query.encode(_ENCODING)
|
|
42
50
|
selectors_list = lxb_css_selectors_parse(self.parser, <lxb_char_t *> bytes_query, <size_t>len(query))
|
|
43
51
|
|
|
@@ -54,28 +62,32 @@ cdef class LexborCSSSelector:
|
|
|
54
62
|
lxb_css_selector_list_destroy_memory(selectors_list)
|
|
55
63
|
return results
|
|
56
64
|
|
|
57
|
-
cpdef any_matches(self, str query, LexborNode node):
|
|
65
|
+
cpdef int any_matches(self, str query, LexborNode node) except -1:
|
|
58
66
|
cdef lxb_css_selector_list_t * selectors
|
|
59
67
|
cdef lxb_char_t * c_selector
|
|
60
68
|
cdef lxb_css_selector_list_t * selectors_list
|
|
69
|
+
cdef int result
|
|
70
|
+
|
|
71
|
+
if not isinstance(query, str):
|
|
72
|
+
raise TypeError("Query must be a string.")
|
|
61
73
|
|
|
62
74
|
bytes_query = query.encode(_ENCODING)
|
|
63
75
|
selectors_list = lxb_css_selectors_parse(self.parser, <lxb_char_t *> bytes_query, <size_t> len(query))
|
|
64
76
|
|
|
65
77
|
if selectors_list == NULL:
|
|
66
|
-
|
|
78
|
+
PyErr_SetObject(SelectolaxError, "Can't parse CSS selector.")
|
|
67
79
|
|
|
68
80
|
self.results = []
|
|
69
81
|
status = lxb_selectors_find(self.selectors, node.node, selectors_list,
|
|
70
82
|
<lxb_selectors_cb_f> css_matcher_callback, <void *> self)
|
|
71
83
|
if status != LXB_STATUS_OK:
|
|
72
|
-
|
|
73
|
-
|
|
84
|
+
lxb_css_selector_list_destroy_memory(selectors_list)
|
|
85
|
+
PyErr_SetObject(SelectolaxError, "Can't parse CSS selector.")
|
|
86
|
+
result = PyList_GET_SIZE(self.results) > 0
|
|
74
87
|
self.results = []
|
|
75
88
|
lxb_css_selector_list_destroy_memory(selectors_list)
|
|
76
89
|
return result
|
|
77
90
|
|
|
78
|
-
|
|
79
91
|
def __dealloc__(self):
|
|
80
92
|
if self.selectors != NULL:
|
|
81
93
|
lxb_selectors_destroy(self.selectors, True)
|
|
@@ -85,7 +97,6 @@ cdef class LexborCSSSelector:
|
|
|
85
97
|
lxb_css_selectors_destroy(self.css_selectors, True)
|
|
86
98
|
|
|
87
99
|
|
|
88
|
-
|
|
89
100
|
cdef class LexborSelector:
|
|
90
101
|
"""An advanced CSS selector that supports additional operations.
|
|
91
102
|
|
|
@@ -100,10 +111,9 @@ cdef class LexborSelector:
|
|
|
100
111
|
self.node = node
|
|
101
112
|
self.nodes = self.node.parser.selector.find(query, self.node) if query else [node, ]
|
|
102
113
|
|
|
103
|
-
|
|
104
114
|
cpdef css(self, str query):
|
|
105
115
|
"""Evaluate CSS selector against current scope."""
|
|
106
|
-
raise
|
|
116
|
+
raise NotImplementedError("This features is not supported by the lexbor backend. Please use Modest backend.")
|
|
107
117
|
|
|
108
118
|
@property
|
|
109
119
|
def matches(self) -> list:
|
|
@@ -117,7 +127,7 @@ cdef class LexborSelector:
|
|
|
117
127
|
|
|
118
128
|
def text_contains(self, str text, bool deep=True, str separator='', bool strip=False) -> LexborSelector:
|
|
119
129
|
"""Filter all current matches given text."""
|
|
120
|
-
nodes = []
|
|
130
|
+
cdef list nodes = []
|
|
121
131
|
for node in self.nodes:
|
|
122
132
|
node_text = node.text(deep=deep, separator=separator, strip=strip)
|
|
123
133
|
if node_text and text in node_text:
|
|
@@ -127,7 +137,7 @@ cdef class LexborSelector:
|
|
|
127
137
|
|
|
128
138
|
def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False) -> bool:
|
|
129
139
|
"""Returns True if any node in the current search scope contains specified text"""
|
|
130
|
-
|
|
140
|
+
cdef LexborNode node
|
|
131
141
|
for node in self.nodes:
|
|
132
142
|
node_text = node.text(deep=deep, separator=separator, strip=strip)
|
|
133
143
|
if node_text and text in node_text:
|
|
@@ -139,7 +149,7 @@ cdef class LexborSelector:
|
|
|
139
149
|
|
|
140
150
|
Similar to `string-length` in XPath.
|
|
141
151
|
"""
|
|
142
|
-
nodes = []
|
|
152
|
+
cdef list nodes = []
|
|
143
153
|
for node in self.nodes:
|
|
144
154
|
attr = node.attributes.get(attribute)
|
|
145
155
|
if attr and start and start in attr:
|
|
@@ -154,7 +164,7 @@ cdef class LexborSelector:
|
|
|
154
164
|
|
|
155
165
|
Similar to `string-length` in XPath.
|
|
156
166
|
"""
|
|
157
|
-
|
|
167
|
+
cdef LexborNode node
|
|
158
168
|
for node in self.nodes:
|
|
159
169
|
attr = node.attributes.get(attribute)
|
|
160
170
|
if attr and start and start in attr:
|
|
@@ -169,16 +179,15 @@ cdef class LexborSelector:
|
|
|
169
179
|
|
|
170
180
|
cdef lxb_status_t css_finder_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
|
|
171
181
|
cdef LexborNode lxb_node
|
|
172
|
-
cdef
|
|
173
|
-
cls = <
|
|
174
|
-
lxb_node = LexborNode()
|
|
175
|
-
lxb_node._cinit(<lxb_dom_node_t *> node, cls.current_node.parser)
|
|
182
|
+
cdef LexborCSSSelector cls
|
|
183
|
+
cls = <LexborCSSSelector> ctx
|
|
184
|
+
lxb_node = LexborNode.new(<lxb_dom_node_t *> node, cls.current_node.parser)
|
|
176
185
|
cls.results.append(lxb_node)
|
|
177
186
|
return LXB_STATUS_OK
|
|
178
187
|
|
|
179
188
|
cdef lxb_status_t css_matcher_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
|
|
180
189
|
cdef LexborNode lxb_node
|
|
181
|
-
cdef
|
|
182
|
-
cls = <
|
|
190
|
+
cdef LexborCSSSelector cls
|
|
191
|
+
cls = <LexborCSSSelector> ctx
|
|
183
192
|
cls.results.append(True)
|
|
184
193
|
return LXB_STATUS_STOP
|