selectolax 0.3.34__cp39-cp39-win32.whl → 0.4.0__cp39-cp39-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +1 -1
- selectolax/lexbor/node.pxi +108 -14
- selectolax/lexbor/node_remove.pxi +29 -0
- selectolax/lexbor/selection.pxi +24 -2
- selectolax/lexbor.c +6111 -4205
- selectolax/lexbor.cp39-win32.pyd +0 -0
- selectolax/lexbor.pxd +8 -0
- selectolax/lexbor.pyi +84 -7
- selectolax/lexbor.pyx +40 -4
- selectolax/modest/node.pxi +4 -1
- selectolax/parser.c +1550 -1466
- selectolax/parser.cp39-win32.pyd +0 -0
- selectolax/parser.pyi +5 -2
- selectolax/parser.pyx +2 -2
- {selectolax-0.3.34.dist-info → selectolax-0.4.0.dist-info}/METADATA +2 -2
- selectolax-0.4.0.dist-info/RECORD +27 -0
- selectolax-0.3.34.dist-info/RECORD +0 -26
- {selectolax-0.3.34.dist-info → selectolax-0.4.0.dist-info}/WHEEL +0 -0
- {selectolax-0.3.34.dist-info → selectolax-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {selectolax-0.3.34.dist-info → selectolax-0.4.0.dist-info}/top_level.txt +0 -0
selectolax/__init__.py
CHANGED
selectolax/lexbor/node.pxi
CHANGED
|
@@ -1,8 +1,12 @@
|
|
|
1
1
|
cimport cython
|
|
2
2
|
from cpython.exc cimport PyErr_SetNone
|
|
3
3
|
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger("selectolax")
|
|
7
|
+
|
|
4
8
|
_TAG_TO_NAME = {
|
|
5
|
-
0x0005: "-
|
|
9
|
+
0x0005: "-doctype",
|
|
6
10
|
0x0002: "-text",
|
|
7
11
|
0x0004: "-comment",
|
|
8
12
|
}
|
|
@@ -11,6 +15,10 @@ ctypedef fused str_or_LexborNode:
|
|
|
11
15
|
bytes
|
|
12
16
|
LexborNode
|
|
13
17
|
|
|
18
|
+
ctypedef fused str_or_bytes:
|
|
19
|
+
str
|
|
20
|
+
bytes
|
|
21
|
+
|
|
14
22
|
cdef inline bytes to_bytes(str_or_LexborNode value):
|
|
15
23
|
cdef bytes bytes_val
|
|
16
24
|
if isinstance(value, unicode):
|
|
@@ -37,7 +45,10 @@ cdef class LexborNode:
|
|
|
37
45
|
|
|
38
46
|
@property
|
|
39
47
|
def child(self):
|
|
40
|
-
"""Alias for the `first_child` property.
|
|
48
|
+
"""Alias for the `first_child` property.
|
|
49
|
+
|
|
50
|
+
**Deprecated**. Please use `first_child` instead.
|
|
51
|
+
"""
|
|
41
52
|
return self.first_child
|
|
42
53
|
|
|
43
54
|
@property
|
|
@@ -200,13 +211,15 @@ cdef class LexborNode:
|
|
|
200
211
|
def css_first(self, str query, default=None, bool strict=False):
|
|
201
212
|
"""Same as `css` but returns only the first match.
|
|
202
213
|
|
|
214
|
+
When `strict=False` stops at the first match. Works faster.
|
|
215
|
+
|
|
203
216
|
Parameters
|
|
204
217
|
----------
|
|
205
218
|
|
|
206
219
|
query : str
|
|
207
|
-
default :
|
|
220
|
+
default : Any, default None
|
|
208
221
|
Default value to return if there is no match.
|
|
209
|
-
strict: bool, default
|
|
222
|
+
strict: bool, default False
|
|
210
223
|
Set to True if you want to check if there is strictly only one match in the document.
|
|
211
224
|
|
|
212
225
|
|
|
@@ -214,8 +227,10 @@ cdef class LexborNode:
|
|
|
214
227
|
-------
|
|
215
228
|
selector : `LexborNode` object
|
|
216
229
|
"""
|
|
217
|
-
|
|
218
|
-
|
|
230
|
+
if strict:
|
|
231
|
+
results = self.parser.selector.find(query, self)
|
|
232
|
+
else:
|
|
233
|
+
results = self.parser.selector.find_first(query, self)
|
|
219
234
|
n_results = len(results)
|
|
220
235
|
if n_results > 0:
|
|
221
236
|
if strict and n_results > 1:
|
|
@@ -232,7 +247,7 @@ cdef class LexborNode:
|
|
|
232
247
|
|
|
233
248
|
def css_matches(self, str selector):
|
|
234
249
|
"""Returns True if CSS selector matches a node."""
|
|
235
|
-
return self.parser.selector.any_matches(selector, self)
|
|
250
|
+
return bool(self.parser.selector.any_matches(selector, self))
|
|
236
251
|
|
|
237
252
|
def __repr__(self):
|
|
238
253
|
return '<LexborNode %s>' % self.tag
|
|
@@ -246,6 +261,14 @@ cdef class LexborNode:
|
|
|
246
261
|
def tag(self):
|
|
247
262
|
"""Return the name of the current tag (e.g. div, p, img).
|
|
248
263
|
|
|
264
|
+
For non-tag nodes, returns the following names:
|
|
265
|
+
|
|
266
|
+
* `-text` - text node
|
|
267
|
+
* `-document` - document node
|
|
268
|
+
* `-comment` - comment node
|
|
269
|
+
|
|
270
|
+
This
|
|
271
|
+
|
|
249
272
|
Returns
|
|
250
273
|
-------
|
|
251
274
|
text : str
|
|
@@ -281,9 +304,9 @@ cdef class LexborNode:
|
|
|
281
304
|
raise SelectolaxError("Decomposing the root node is not allowed.")
|
|
282
305
|
|
|
283
306
|
if recursive:
|
|
284
|
-
|
|
307
|
+
node_remove_deep(<lxb_dom_node_t *> self.node)
|
|
285
308
|
else:
|
|
286
|
-
|
|
309
|
+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
|
|
287
310
|
|
|
288
311
|
def strip_tags(self, list tags, bool recursive = False):
|
|
289
312
|
"""Remove specified tags from the HTML tree.
|
|
@@ -332,6 +355,9 @@ cdef class LexborNode:
|
|
|
332
355
|
cdef size_t str_len = 0
|
|
333
356
|
attributes = dict()
|
|
334
357
|
|
|
358
|
+
if self.node.type != LXB_DOM_NODE_TYPE_ELEMENT:
|
|
359
|
+
return attributes
|
|
360
|
+
|
|
335
361
|
while attr != NULL:
|
|
336
362
|
key = lxb_dom_attr_local_name_noi(attr, &str_len)
|
|
337
363
|
value = lxb_dom_attr_value_noi(attr, &str_len)
|
|
@@ -424,6 +450,8 @@ cdef class LexborNode:
|
|
|
424
450
|
def unwrap(self, bint delete_empty=False):
|
|
425
451
|
"""Replace node with whatever is inside this node.
|
|
426
452
|
|
|
453
|
+
Does nothing if you unwrap the same node a second time.
|
|
454
|
+
|
|
427
455
|
Parameters
|
|
428
456
|
----------
|
|
429
457
|
delete_empty : bool, default False
|
|
@@ -439,9 +467,14 @@ cdef class LexborNode:
|
|
|
439
467
|
|
|
440
468
|
Note: by default, empty tags are ignored, use "delete_empty" to change this.
|
|
441
469
|
"""
|
|
470
|
+
|
|
471
|
+
if node_is_removed(<lxb_dom_node_t *> self.node) == 1:
|
|
472
|
+
logger.error("Attempt to unwrap removed node. Does nothing.")
|
|
473
|
+
return
|
|
474
|
+
|
|
442
475
|
if self.node.first_child == NULL:
|
|
443
476
|
if delete_empty:
|
|
444
|
-
|
|
477
|
+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
|
|
445
478
|
return
|
|
446
479
|
cdef lxb_dom_node_t* next_node
|
|
447
480
|
cdef lxb_dom_node_t* current_node
|
|
@@ -456,7 +489,7 @@ cdef class LexborNode:
|
|
|
456
489
|
current_node = next_node
|
|
457
490
|
else:
|
|
458
491
|
lxb_dom_node_insert_before(self.node, self.node.first_child)
|
|
459
|
-
|
|
492
|
+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
|
|
460
493
|
|
|
461
494
|
def unwrap_tags(self, list tags, bint delete_empty = False):
|
|
462
495
|
"""Unwraps specified tags from the HTML tree.
|
|
@@ -518,6 +551,12 @@ cdef class LexborNode:
|
|
|
518
551
|
combined = (<bytes>left_text[:left_length]) + (<bytes>right_text[:right_length])
|
|
519
552
|
lxb_dom_node_text_content_set(node, combined, len(combined))
|
|
520
553
|
lxb_dom_node_remove(node.prev)
|
|
554
|
+
|
|
555
|
+
if left_text is not NULL:
|
|
556
|
+
lxb_dom_document_destroy_text_noi(self.node.owner_document, left_text)
|
|
557
|
+
if right_text is not NULL:
|
|
558
|
+
lxb_dom_document_destroy_text_noi(self.node.owner_document, right_text)
|
|
559
|
+
|
|
521
560
|
if node.first_child:
|
|
522
561
|
LexborNode.new(node, self.parser).merge_text_nodes()
|
|
523
562
|
node = next_node
|
|
@@ -590,7 +629,7 @@ cdef class LexborNode:
|
|
|
590
629
|
if new_node == NULL:
|
|
591
630
|
raise SelectolaxError("Can't create a new node")
|
|
592
631
|
lxb_dom_node_insert_before(self.node, new_node)
|
|
593
|
-
|
|
632
|
+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
|
|
594
633
|
elif isinstance(value, LexborNode):
|
|
595
634
|
new_node = lxb_dom_document_import_node(
|
|
596
635
|
&self.parser.document.dom_document,
|
|
@@ -600,7 +639,7 @@ cdef class LexborNode:
|
|
|
600
639
|
if new_node == NULL:
|
|
601
640
|
raise SelectolaxError("Can't create a new node")
|
|
602
641
|
lxb_dom_node_insert_before(self.node, <lxb_dom_node_t *> new_node)
|
|
603
|
-
|
|
642
|
+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
|
|
604
643
|
else:
|
|
605
644
|
raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
|
|
606
645
|
|
|
@@ -883,6 +922,61 @@ cdef class LexborNode:
|
|
|
883
922
|
container.append(py_text)
|
|
884
923
|
return container.text
|
|
885
924
|
|
|
925
|
+
@property
|
|
926
|
+
def inner_html(self) -> str:
|
|
927
|
+
"""Return HTML representation of the child nodes.
|
|
928
|
+
|
|
929
|
+
Works similar to innerHTML in JavaScript.
|
|
930
|
+
Unlike the `.html` property, does not include the current node.
|
|
931
|
+
Can be used to set HTML as well. See the setter docstring.
|
|
932
|
+
|
|
933
|
+
Returns
|
|
934
|
+
-------
|
|
935
|
+
text : str | None
|
|
936
|
+
"""
|
|
937
|
+
|
|
938
|
+
cdef lexbor_str_t *lxb_str
|
|
939
|
+
cdef lxb_status_t status
|
|
940
|
+
|
|
941
|
+
lxb_str = lexbor_str_create()
|
|
942
|
+
status = lxb_html_serialize_deep_str(self.node, lxb_str)
|
|
943
|
+
if status == 0 and lxb_str.data:
|
|
944
|
+
html = lxb_str.data.decode(_ENCODING).replace('<-undef>', '')
|
|
945
|
+
lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)
|
|
946
|
+
return html
|
|
947
|
+
return None
|
|
948
|
+
|
|
949
|
+
@inner_html.setter
|
|
950
|
+
def inner_html(self, str html):
|
|
951
|
+
"""Set inner HTML to the specified HTML.
|
|
952
|
+
|
|
953
|
+
Replaces existing data inside the node.
|
|
954
|
+
Works similar to innerHTML in JavaScript.
|
|
955
|
+
|
|
956
|
+
Parameters
|
|
957
|
+
----------
|
|
958
|
+
html : str | None
|
|
959
|
+
|
|
960
|
+
"""
|
|
961
|
+
cdef bytes bytes_val
|
|
962
|
+
bytes_val = <bytes>html.encode("utf-8")
|
|
963
|
+
lxb_html_element_inner_html_set(
|
|
964
|
+
<lxb_html_element_t *>self.node,
|
|
965
|
+
<lxb_char_t *> bytes_val, len(bytes_val)
|
|
966
|
+
)
|
|
967
|
+
|
|
968
|
+
def clone(self) -> LexborNode:
|
|
969
|
+
"""Clone the current node.
|
|
970
|
+
|
|
971
|
+
You can use it to make temporary modifications without affecting the original HTML tree.
|
|
972
|
+
|
|
973
|
+
It is tied to the current parser instance.
|
|
974
|
+
Gets destroyed when parser instance is destroyed.
|
|
975
|
+
"""
|
|
976
|
+
cdef lxb_dom_node_t* node
|
|
977
|
+
node = lxb_dom_node_clone(<lxb_dom_node_t *> self.node, 1)
|
|
978
|
+
return LexborNode.new(node, self.parser)
|
|
979
|
+
|
|
886
980
|
|
|
887
981
|
@cython.internal
|
|
888
982
|
@cython.final
|
|
@@ -928,7 +1022,7 @@ cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
|
|
|
928
1022
|
return LEXBOR_ACTION_OK
|
|
929
1023
|
|
|
930
1024
|
try:
|
|
931
|
-
py_str = text.decode(_ENCODING)
|
|
1025
|
+
py_str = text.decode(_ENCODING, "replace")
|
|
932
1026
|
|
|
933
1027
|
except Exception as e:
|
|
934
1028
|
PyErr_SetNone(e)
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
|
|
2
|
+
cdef lxb_dom_node_t * node_remove_deep(lxb_dom_node_t* root):
|
|
3
|
+
cdef lxb_dom_node_t *tmp
|
|
4
|
+
cdef lxb_dom_node_t *node = root
|
|
5
|
+
|
|
6
|
+
while node != NULL:
|
|
7
|
+
if node.first_child != NULL:
|
|
8
|
+
node = node.first_child
|
|
9
|
+
else:
|
|
10
|
+
while node != root and node.next == NULL:
|
|
11
|
+
tmp = node.parent
|
|
12
|
+
lxb_dom_node_remove(node)
|
|
13
|
+
node = tmp
|
|
14
|
+
|
|
15
|
+
if node == root:
|
|
16
|
+
lxb_dom_node_remove(node)
|
|
17
|
+
break
|
|
18
|
+
|
|
19
|
+
tmp = node.next
|
|
20
|
+
lxb_dom_node_remove(node)
|
|
21
|
+
node = tmp
|
|
22
|
+
|
|
23
|
+
return NULL
|
|
24
|
+
|
|
25
|
+
cdef bint node_is_removed(lxb_dom_node_t* node):
|
|
26
|
+
if node.parent == NULL and node.next == NULL \
|
|
27
|
+
and node.prev == NULL:
|
|
28
|
+
return 1
|
|
29
|
+
return 0
|
selectolax/lexbor/selection.pxi
CHANGED
|
@@ -39,6 +39,12 @@ cdef class LexborCSSSelector:
|
|
|
39
39
|
return 0
|
|
40
40
|
|
|
41
41
|
cpdef list find(self, str query, LexborNode node):
|
|
42
|
+
return self._find(query, node, 0)
|
|
43
|
+
|
|
44
|
+
cpdef list find_first(self, str query, LexborNode node):
|
|
45
|
+
return self._find(query, node, 1)
|
|
46
|
+
|
|
47
|
+
cpdef list _find(self, str query, LexborNode node, bint only_first):
|
|
42
48
|
cdef lxb_css_selector_list_t* selectors
|
|
43
49
|
cdef lxb_char_t* c_selector
|
|
44
50
|
cdef lxb_css_selector_list_t * selectors_list
|
|
@@ -54,8 +60,12 @@ cdef class LexborCSSSelector:
|
|
|
54
60
|
|
|
55
61
|
self.current_node = node
|
|
56
62
|
self.results = []
|
|
57
|
-
|
|
58
|
-
|
|
63
|
+
if only_first:
|
|
64
|
+
status = lxb_selectors_find(self.selectors, node.node, selectors_list,
|
|
65
|
+
<lxb_selectors_cb_f>css_finder_callback_first, <void*>self)
|
|
66
|
+
else:
|
|
67
|
+
status = lxb_selectors_find(self.selectors, node.node, selectors_list,
|
|
68
|
+
<lxb_selectors_cb_f>css_finder_callback, <void*>self)
|
|
59
69
|
results = list(self.results)
|
|
60
70
|
self.results = []
|
|
61
71
|
self.current_node = None
|
|
@@ -76,6 +86,7 @@ cdef class LexborCSSSelector:
|
|
|
76
86
|
|
|
77
87
|
if selectors_list == NULL:
|
|
78
88
|
PyErr_SetObject(SelectolaxError, "Can't parse CSS selector.")
|
|
89
|
+
return -1
|
|
79
90
|
|
|
80
91
|
self.results = []
|
|
81
92
|
status = lxb_selectors_find(self.selectors, node.node, selectors_list,
|
|
@@ -83,6 +94,8 @@ cdef class LexborCSSSelector:
|
|
|
83
94
|
if status != LXB_STATUS_OK:
|
|
84
95
|
lxb_css_selector_list_destroy_memory(selectors_list)
|
|
85
96
|
PyErr_SetObject(SelectolaxError, "Can't parse CSS selector.")
|
|
97
|
+
return -1
|
|
98
|
+
|
|
86
99
|
result = PyList_GET_SIZE(self.results) > 0
|
|
87
100
|
self.results = []
|
|
88
101
|
lxb_css_selector_list_destroy_memory(selectors_list)
|
|
@@ -185,6 +198,15 @@ cdef lxb_status_t css_finder_callback(lxb_dom_node_t *node, lxb_css_selector_spe
|
|
|
185
198
|
cls.results.append(lxb_node)
|
|
186
199
|
return LXB_STATUS_OK
|
|
187
200
|
|
|
201
|
+
cdef lxb_status_t css_finder_callback_first(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
|
|
202
|
+
cdef LexborNode lxb_node
|
|
203
|
+
cdef LexborCSSSelector cls
|
|
204
|
+
cls = <LexborCSSSelector> ctx
|
|
205
|
+
lxb_node = LexborNode.new(<lxb_dom_node_t *> node, cls.current_node.parser)
|
|
206
|
+
cls.results.append(lxb_node)
|
|
207
|
+
return LXB_STATUS_STOP
|
|
208
|
+
|
|
209
|
+
|
|
188
210
|
cdef lxb_status_t css_matcher_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
|
|
189
211
|
cdef LexborNode lxb_node
|
|
190
212
|
cdef LexborCSSSelector cls
|