selectolax 0.3.33__cp39-cp39-win_amd64.whl → 0.4.0__cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +1 -1
- selectolax/lexbor/node.pxi +114 -14
- selectolax/lexbor/node_remove.pxi +29 -0
- selectolax/lexbor/selection.pxi +24 -2
- selectolax/lexbor.c +6126 -4206
- selectolax/lexbor.cp39-win_amd64.pyd +0 -0
- selectolax/lexbor.pxd +8 -0
- selectolax/lexbor.pyi +96 -7
- selectolax/lexbor.pyx +45 -4
- selectolax/modest/node.pxi +4 -1
- selectolax/parser.c +1559 -1461
- selectolax/parser.cp39-win_amd64.pyd +0 -0
- selectolax/parser.pyi +5 -2
- selectolax/parser.pyx +2 -2
- selectolax-0.4.0.dist-info/METADATA +32 -0
- selectolax-0.4.0.dist-info/RECORD +27 -0
- selectolax-0.3.33.dist-info/METADATA +0 -187
- selectolax-0.3.33.dist-info/RECORD +0 -26
- {selectolax-0.3.33.dist-info → selectolax-0.4.0.dist-info}/WHEEL +0 -0
- {selectolax-0.3.33.dist-info → selectolax-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {selectolax-0.3.33.dist-info → selectolax-0.4.0.dist-info}/top_level.txt +0 -0
selectolax/__init__.py
CHANGED
selectolax/lexbor/node.pxi
CHANGED
|
@@ -1,8 +1,12 @@
|
|
|
1
1
|
cimport cython
|
|
2
2
|
from cpython.exc cimport PyErr_SetNone
|
|
3
3
|
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger("selectolax")
|
|
7
|
+
|
|
4
8
|
_TAG_TO_NAME = {
|
|
5
|
-
0x0005: "-
|
|
9
|
+
0x0005: "-doctype",
|
|
6
10
|
0x0002: "-text",
|
|
7
11
|
0x0004: "-comment",
|
|
8
12
|
}
|
|
@@ -11,6 +15,10 @@ ctypedef fused str_or_LexborNode:
|
|
|
11
15
|
bytes
|
|
12
16
|
LexborNode
|
|
13
17
|
|
|
18
|
+
ctypedef fused str_or_bytes:
|
|
19
|
+
str
|
|
20
|
+
bytes
|
|
21
|
+
|
|
14
22
|
cdef inline bytes to_bytes(str_or_LexborNode value):
|
|
15
23
|
cdef bytes bytes_val
|
|
16
24
|
if isinstance(value, unicode):
|
|
@@ -37,7 +45,10 @@ cdef class LexborNode:
|
|
|
37
45
|
|
|
38
46
|
@property
|
|
39
47
|
def child(self):
|
|
40
|
-
"""Alias for the `first_child` property.
|
|
48
|
+
"""Alias for the `first_child` property.
|
|
49
|
+
|
|
50
|
+
**Deprecated**. Please use `first_child` instead.
|
|
51
|
+
"""
|
|
41
52
|
return self.first_child
|
|
42
53
|
|
|
43
54
|
@property
|
|
@@ -180,6 +191,12 @@ cdef class LexborNode:
|
|
|
180
191
|
Matches pattern `query` against HTML tree.
|
|
181
192
|
`CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
|
|
182
193
|
|
|
194
|
+
Special selectors:
|
|
195
|
+
|
|
196
|
+
- parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
|
|
197
|
+
- parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
|
|
198
|
+
|
|
199
|
+
|
|
183
200
|
Parameters
|
|
184
201
|
----------
|
|
185
202
|
query : str
|
|
@@ -194,13 +211,15 @@ cdef class LexborNode:
|
|
|
194
211
|
def css_first(self, str query, default=None, bool strict=False):
|
|
195
212
|
"""Same as `css` but returns only the first match.
|
|
196
213
|
|
|
214
|
+
When `strict=False`, the search stops at the first match, which is faster.
|
|
215
|
+
|
|
197
216
|
Parameters
|
|
198
217
|
----------
|
|
199
218
|
|
|
200
219
|
query : str
|
|
201
|
-
default :
|
|
220
|
+
default : Any, default None
|
|
202
221
|
Default value to return if there is no match.
|
|
203
|
-
strict: bool, default
|
|
222
|
+
strict: bool, default False
|
|
204
223
|
Set to True if you want to check if there is strictly only one match in the document.
|
|
205
224
|
|
|
206
225
|
|
|
@@ -208,8 +227,10 @@ cdef class LexborNode:
|
|
|
208
227
|
-------
|
|
209
228
|
selector : `LexborNode` object
|
|
210
229
|
"""
|
|
211
|
-
|
|
212
|
-
|
|
230
|
+
if strict:
|
|
231
|
+
results = self.parser.selector.find(query, self)
|
|
232
|
+
else:
|
|
233
|
+
results = self.parser.selector.find_first(query, self)
|
|
213
234
|
n_results = len(results)
|
|
214
235
|
if n_results > 0:
|
|
215
236
|
if strict and n_results > 1:
|
|
@@ -226,7 +247,7 @@ cdef class LexborNode:
|
|
|
226
247
|
|
|
227
248
|
def css_matches(self, str selector):
|
|
228
249
|
"""Returns True if CSS selector matches a node."""
|
|
229
|
-
return self.parser.selector.any_matches(selector, self)
|
|
250
|
+
return bool(self.parser.selector.any_matches(selector, self))
|
|
230
251
|
|
|
231
252
|
def __repr__(self):
|
|
232
253
|
return '<LexborNode %s>' % self.tag
|
|
@@ -240,6 +261,14 @@ cdef class LexborNode:
|
|
|
240
261
|
def tag(self):
|
|
241
262
|
"""Return the name of the current tag (e.g. div, p, img).
|
|
242
263
|
|
|
264
|
+
For non-tag nodes, returns the following names:
|
|
265
|
+
|
|
266
|
+
* `-text` - text node
|
|
267
|
+
* `-document` - document node
|
|
268
|
+
* `-comment` - comment node
|
|
269
|
+
|
|
270
|
+
This
|
|
271
|
+
|
|
243
272
|
Returns
|
|
244
273
|
-------
|
|
245
274
|
text : str
|
|
@@ -275,9 +304,9 @@ cdef class LexborNode:
|
|
|
275
304
|
raise SelectolaxError("Decomposing the root node is not allowed.")
|
|
276
305
|
|
|
277
306
|
if recursive:
|
|
278
|
-
|
|
307
|
+
node_remove_deep(<lxb_dom_node_t *> self.node)
|
|
279
308
|
else:
|
|
280
|
-
|
|
309
|
+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
|
|
281
310
|
|
|
282
311
|
def strip_tags(self, list tags, bool recursive = False):
|
|
283
312
|
"""Remove specified tags from the HTML tree.
|
|
@@ -326,6 +355,9 @@ cdef class LexborNode:
|
|
|
326
355
|
cdef size_t str_len = 0
|
|
327
356
|
attributes = dict()
|
|
328
357
|
|
|
358
|
+
if self.node.type != LXB_DOM_NODE_TYPE_ELEMENT:
|
|
359
|
+
return attributes
|
|
360
|
+
|
|
329
361
|
while attr != NULL:
|
|
330
362
|
key = lxb_dom_attr_local_name_noi(attr, &str_len)
|
|
331
363
|
value = lxb_dom_attr_value_noi(attr, &str_len)
|
|
@@ -418,6 +450,8 @@ cdef class LexborNode:
|
|
|
418
450
|
def unwrap(self, bint delete_empty=False):
|
|
419
451
|
"""Replace node with whatever is inside this node.
|
|
420
452
|
|
|
453
|
+
Does nothing if you unwrap the same node a second time.
|
|
454
|
+
|
|
421
455
|
Parameters
|
|
422
456
|
----------
|
|
423
457
|
delete_empty : bool, default False
|
|
@@ -433,9 +467,14 @@ cdef class LexborNode:
|
|
|
433
467
|
|
|
434
468
|
Note: by default, empty tags are ignored, use "delete_empty" to change this.
|
|
435
469
|
"""
|
|
470
|
+
|
|
471
|
+
if node_is_removed(<lxb_dom_node_t *> self.node) == 1:
|
|
472
|
+
logger.error("Attempt to unwrap removed node. Does nothing.")
|
|
473
|
+
return
|
|
474
|
+
|
|
436
475
|
if self.node.first_child == NULL:
|
|
437
476
|
if delete_empty:
|
|
438
|
-
|
|
477
|
+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
|
|
439
478
|
return
|
|
440
479
|
cdef lxb_dom_node_t* next_node
|
|
441
480
|
cdef lxb_dom_node_t* current_node
|
|
@@ -450,7 +489,7 @@ cdef class LexborNode:
|
|
|
450
489
|
current_node = next_node
|
|
451
490
|
else:
|
|
452
491
|
lxb_dom_node_insert_before(self.node, self.node.first_child)
|
|
453
|
-
|
|
492
|
+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
|
|
454
493
|
|
|
455
494
|
def unwrap_tags(self, list tags, bint delete_empty = False):
|
|
456
495
|
"""Unwraps specified tags from the HTML tree.
|
|
@@ -512,6 +551,12 @@ cdef class LexborNode:
|
|
|
512
551
|
combined = (<bytes>left_text[:left_length]) + (<bytes>right_text[:right_length])
|
|
513
552
|
lxb_dom_node_text_content_set(node, combined, len(combined))
|
|
514
553
|
lxb_dom_node_remove(node.prev)
|
|
554
|
+
|
|
555
|
+
if left_text is not NULL:
|
|
556
|
+
lxb_dom_document_destroy_text_noi(self.node.owner_document, left_text)
|
|
557
|
+
if right_text is not NULL:
|
|
558
|
+
lxb_dom_document_destroy_text_noi(self.node.owner_document, right_text)
|
|
559
|
+
|
|
515
560
|
if node.first_child:
|
|
516
561
|
LexborNode.new(node, self.parser).merge_text_nodes()
|
|
517
562
|
node = next_node
|
|
@@ -584,7 +629,7 @@ cdef class LexborNode:
|
|
|
584
629
|
if new_node == NULL:
|
|
585
630
|
raise SelectolaxError("Can't create a new node")
|
|
586
631
|
lxb_dom_node_insert_before(self.node, new_node)
|
|
587
|
-
|
|
632
|
+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
|
|
588
633
|
elif isinstance(value, LexborNode):
|
|
589
634
|
new_node = lxb_dom_document_import_node(
|
|
590
635
|
&self.parser.document.dom_document,
|
|
@@ -594,7 +639,7 @@ cdef class LexborNode:
|
|
|
594
639
|
if new_node == NULL:
|
|
595
640
|
raise SelectolaxError("Can't create a new node")
|
|
596
641
|
lxb_dom_node_insert_before(self.node, <lxb_dom_node_t *> new_node)
|
|
597
|
-
|
|
642
|
+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
|
|
598
643
|
else:
|
|
599
644
|
raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
|
|
600
645
|
|
|
@@ -877,6 +922,61 @@ cdef class LexborNode:
|
|
|
877
922
|
container.append(py_text)
|
|
878
923
|
return container.text
|
|
879
924
|
|
|
925
|
+
@property
|
|
926
|
+
def inner_html(self) -> str:
|
|
927
|
+
"""Return HTML representation of the child nodes.
|
|
928
|
+
|
|
929
|
+
Works similarly to innerHTML in JavaScript.
|
|
930
|
+
Unlike the `.html` property, does not include the current node.
|
|
931
|
+
Can be used to set HTML as well. See the setter docstring.
|
|
932
|
+
|
|
933
|
+
Returns
|
|
934
|
+
-------
|
|
935
|
+
text : str | None
|
|
936
|
+
"""
|
|
937
|
+
|
|
938
|
+
cdef lexbor_str_t *lxb_str
|
|
939
|
+
cdef lxb_status_t status
|
|
940
|
+
|
|
941
|
+
lxb_str = lexbor_str_create()
|
|
942
|
+
status = lxb_html_serialize_deep_str(self.node, lxb_str)
|
|
943
|
+
if status == 0 and lxb_str.data:
|
|
944
|
+
html = lxb_str.data.decode(_ENCODING).replace('<-undef>', '')
|
|
945
|
+
lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)
|
|
946
|
+
return html
|
|
947
|
+
return None
|
|
948
|
+
|
|
949
|
+
@inner_html.setter
|
|
950
|
+
def inner_html(self, str html):
|
|
951
|
+
"""Set inner HTML to the specified HTML.
|
|
952
|
+
|
|
953
|
+
Replaces existing data inside the node.
|
|
954
|
+
Works similarly to innerHTML in JavaScript.
|
|
955
|
+
|
|
956
|
+
Parameters
|
|
957
|
+
----------
|
|
958
|
+
html : str | None
|
|
959
|
+
|
|
960
|
+
"""
|
|
961
|
+
cdef bytes bytes_val
|
|
962
|
+
bytes_val = <bytes>html.encode("utf-8")
|
|
963
|
+
lxb_html_element_inner_html_set(
|
|
964
|
+
<lxb_html_element_t *>self.node,
|
|
965
|
+
<lxb_char_t *> bytes_val, len(bytes_val)
|
|
966
|
+
)
|
|
967
|
+
|
|
968
|
+
def clone(self) -> LexborNode:
|
|
969
|
+
"""Clone the current node.
|
|
970
|
+
|
|
971
|
+
You can use to do temporary modifications without affecting the original HTML tree.
|
|
972
|
+
|
|
973
|
+
It is tied to the current parser instance.
|
|
974
|
+
Gets destroyed when parser instance is destroyed.
|
|
975
|
+
"""
|
|
976
|
+
cdef lxb_dom_node_t* node
|
|
977
|
+
node = lxb_dom_node_clone(<lxb_dom_node_t *> self.node, 1)
|
|
978
|
+
return LexborNode.new(node, self.parser)
|
|
979
|
+
|
|
880
980
|
|
|
881
981
|
@cython.internal
|
|
882
982
|
@cython.final
|
|
@@ -922,7 +1022,7 @@ cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
|
|
|
922
1022
|
return LEXBOR_ACTION_OK
|
|
923
1023
|
|
|
924
1024
|
try:
|
|
925
|
-
py_str = text.decode(_ENCODING)
|
|
1025
|
+
py_str = text.decode(_ENCODING, "replace")
|
|
926
1026
|
|
|
927
1027
|
except Exception as e:
|
|
928
1028
|
PyErr_SetNone(e)
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
|
|
2
|
+
cdef lxb_dom_node_t * node_remove_deep(lxb_dom_node_t* root):
|
|
3
|
+
cdef lxb_dom_node_t *tmp
|
|
4
|
+
cdef lxb_dom_node_t *node = root
|
|
5
|
+
|
|
6
|
+
while node != NULL:
|
|
7
|
+
if node.first_child != NULL:
|
|
8
|
+
node = node.first_child
|
|
9
|
+
else:
|
|
10
|
+
while node != root and node.next == NULL:
|
|
11
|
+
tmp = node.parent
|
|
12
|
+
lxb_dom_node_remove(node)
|
|
13
|
+
node = tmp
|
|
14
|
+
|
|
15
|
+
if node == root:
|
|
16
|
+
lxb_dom_node_remove(node)
|
|
17
|
+
break
|
|
18
|
+
|
|
19
|
+
tmp = node.next
|
|
20
|
+
lxb_dom_node_remove(node)
|
|
21
|
+
node = tmp
|
|
22
|
+
|
|
23
|
+
return NULL
|
|
24
|
+
|
|
25
|
+
cdef bint node_is_removed(lxb_dom_node_t* node):
|
|
26
|
+
if node.parent == NULL and node.next == NULL \
|
|
27
|
+
and node.prev == NULL:
|
|
28
|
+
return 1
|
|
29
|
+
return 0
|
selectolax/lexbor/selection.pxi
CHANGED
|
@@ -39,6 +39,12 @@ cdef class LexborCSSSelector:
|
|
|
39
39
|
return 0
|
|
40
40
|
|
|
41
41
|
cpdef list find(self, str query, LexborNode node):
|
|
42
|
+
return self._find(query, node, 0)
|
|
43
|
+
|
|
44
|
+
cpdef list find_first(self, str query, LexborNode node):
|
|
45
|
+
return self._find(query, node, 1)
|
|
46
|
+
|
|
47
|
+
cpdef list _find(self, str query, LexborNode node, bint only_first):
|
|
42
48
|
cdef lxb_css_selector_list_t* selectors
|
|
43
49
|
cdef lxb_char_t* c_selector
|
|
44
50
|
cdef lxb_css_selector_list_t * selectors_list
|
|
@@ -54,8 +60,12 @@ cdef class LexborCSSSelector:
|
|
|
54
60
|
|
|
55
61
|
self.current_node = node
|
|
56
62
|
self.results = []
|
|
57
|
-
|
|
58
|
-
|
|
63
|
+
if only_first:
|
|
64
|
+
status = lxb_selectors_find(self.selectors, node.node, selectors_list,
|
|
65
|
+
<lxb_selectors_cb_f>css_finder_callback_first, <void*>self)
|
|
66
|
+
else:
|
|
67
|
+
status = lxb_selectors_find(self.selectors, node.node, selectors_list,
|
|
68
|
+
<lxb_selectors_cb_f>css_finder_callback, <void*>self)
|
|
59
69
|
results = list(self.results)
|
|
60
70
|
self.results = []
|
|
61
71
|
self.current_node = None
|
|
@@ -76,6 +86,7 @@ cdef class LexborCSSSelector:
|
|
|
76
86
|
|
|
77
87
|
if selectors_list == NULL:
|
|
78
88
|
PyErr_SetObject(SelectolaxError, "Can't parse CSS selector.")
|
|
89
|
+
return -1
|
|
79
90
|
|
|
80
91
|
self.results = []
|
|
81
92
|
status = lxb_selectors_find(self.selectors, node.node, selectors_list,
|
|
@@ -83,6 +94,8 @@ cdef class LexborCSSSelector:
|
|
|
83
94
|
if status != LXB_STATUS_OK:
|
|
84
95
|
lxb_css_selector_list_destroy_memory(selectors_list)
|
|
85
96
|
PyErr_SetObject(SelectolaxError, "Can't parse CSS selector.")
|
|
97
|
+
return -1
|
|
98
|
+
|
|
86
99
|
result = PyList_GET_SIZE(self.results) > 0
|
|
87
100
|
self.results = []
|
|
88
101
|
lxb_css_selector_list_destroy_memory(selectors_list)
|
|
@@ -185,6 +198,15 @@ cdef lxb_status_t css_finder_callback(lxb_dom_node_t *node, lxb_css_selector_spe
|
|
|
185
198
|
cls.results.append(lxb_node)
|
|
186
199
|
return LXB_STATUS_OK
|
|
187
200
|
|
|
201
|
+
cdef lxb_status_t css_finder_callback_first(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
|
|
202
|
+
cdef LexborNode lxb_node
|
|
203
|
+
cdef LexborCSSSelector cls
|
|
204
|
+
cls = <LexborCSSSelector> ctx
|
|
205
|
+
lxb_node = LexborNode.new(<lxb_dom_node_t *> node, cls.current_node.parser)
|
|
206
|
+
cls.results.append(lxb_node)
|
|
207
|
+
return LXB_STATUS_STOP
|
|
208
|
+
|
|
209
|
+
|
|
188
210
|
cdef lxb_status_t css_matcher_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
|
|
189
211
|
cdef LexborNode lxb_node
|
|
190
212
|
cdef LexborCSSSelector cls
|