selectolax 0.3.16__cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl → 0.3.29__cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +1 -1
- selectolax/lexbor/attrs.pxi +2 -1
- selectolax/lexbor/node.pxi +68 -5
- selectolax/lexbor/selection.pxi +14 -11
- selectolax/lexbor/util.pxi +19 -0
- selectolax/lexbor.c +39208 -18768
- selectolax/lexbor.cpython-38-aarch64-linux-gnu.so +0 -0
- selectolax/lexbor.pxd +7 -1
- selectolax/lexbor.pyi +89 -39
- selectolax/lexbor.pyx +10 -5
- selectolax/modest/node.pxi +61 -5
- selectolax/modest/selection.pxi +1 -1
- selectolax/modest/util.pxi +19 -0
- selectolax/parser.c +33796 -14836
- selectolax/parser.cpython-38-aarch64-linux-gnu.so +0 -0
- selectolax/parser.pyi +86 -41
- selectolax/parser.pyx +5 -4
- selectolax/utils.pxi +95 -1
- {selectolax-0.3.16.dist-info → selectolax-0.3.29.dist-info}/LICENSE +1 -1
- {selectolax-0.3.16.dist-info → selectolax-0.3.29.dist-info}/METADATA +17 -4
- selectolax-0.3.29.dist-info/RECORD +26 -0
- {selectolax-0.3.16.dist-info → selectolax-0.3.29.dist-info}/WHEEL +1 -1
- selectolax-0.3.16.dist-info/RECORD +0 -24
- {selectolax-0.3.16.dist-info → selectolax-0.3.29.dist-info}/top_level.txt +0 -0
selectolax/__init__.py
CHANGED
selectolax/lexbor/attrs.pxi
CHANGED
|
@@ -19,8 +19,9 @@ cdef class LexborAttributes:
|
|
|
19
19
|
|
|
20
20
|
while attr != NULL:
|
|
21
21
|
key = lxb_dom_attr_local_name_noi(attr, &str_len)
|
|
22
|
+
if key is not NULL:
|
|
23
|
+
yield key.decode(_ENCODING)
|
|
22
24
|
attr = attr.next
|
|
23
|
-
yield key.decode(_ENCODING)
|
|
24
25
|
|
|
25
26
|
def __setitem__(self, str key, value):
|
|
26
27
|
value = str(value)
|
selectolax/lexbor/node.pxi
CHANGED
|
@@ -95,7 +95,7 @@ cdef class LexborNode:
|
|
|
95
95
|
text : str
|
|
96
96
|
"""
|
|
97
97
|
cdef lexbor_str_t *lxb_str
|
|
98
|
-
cdef lxb_status_t
|
|
98
|
+
cdef lxb_status_t status
|
|
99
99
|
|
|
100
100
|
lxb_str = lexbor_str_create()
|
|
101
101
|
status = lxb_html_serialize_tree_str(self.node, lxb_str)
|
|
@@ -416,9 +416,14 @@ cdef class LexborNode:
|
|
|
416
416
|
node = node.next
|
|
417
417
|
|
|
418
418
|
|
|
419
|
-
def unwrap(self):
|
|
419
|
+
def unwrap(self, delete_empty=False):
|
|
420
420
|
"""Replace node with whatever is inside this node.
|
|
421
421
|
|
|
422
|
+
Parameters
|
|
423
|
+
----------
|
|
424
|
+
delete_empty : bool, default False
|
|
425
|
+
If True, removes empty tags.
|
|
426
|
+
|
|
422
427
|
Examples
|
|
423
428
|
--------
|
|
424
429
|
|
|
@@ -426,9 +431,12 @@ cdef class LexborNode:
|
|
|
426
431
|
>>> tree.css_first('i').unwrap()
|
|
427
432
|
>>> tree.html
|
|
428
433
|
'<html><head></head><body><div>Hello world!</div></body></html>'
|
|
429
|
-
|
|
434
|
+
|
|
435
|
+
Note: by default, empty tags are ignored; use "delete_empty" to change this.
|
|
430
436
|
"""
|
|
431
437
|
if self.node.first_child == NULL:
|
|
438
|
+
if delete_empty:
|
|
439
|
+
lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
|
|
432
440
|
return
|
|
433
441
|
cdef lxb_dom_node_t* next_node;
|
|
434
442
|
cdef lxb_dom_node_t* current_node;
|
|
@@ -445,7 +453,7 @@ cdef class LexborNode:
|
|
|
445
453
|
lxb_dom_node_insert_before(self.node, self.node.first_child)
|
|
446
454
|
lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
|
|
447
455
|
|
|
448
|
-
def unwrap_tags(self, list tags):
|
|
456
|
+
def unwrap_tags(self, list tags, delete_empty = False):
|
|
449
457
|
"""Unwraps specified tags from the HTML tree.
|
|
450
458
|
|
|
451
459
|
Works the same as the ``unwrap`` method, but applied to a list of tags.
|
|
@@ -454,6 +462,8 @@ cdef class LexborNode:
|
|
|
454
462
|
----------
|
|
455
463
|
tags : list
|
|
456
464
|
List of tags to remove.
|
|
465
|
+
delete_empty : bool, default False
|
|
466
|
+
If True, removes empty tags.
|
|
457
467
|
|
|
458
468
|
Examples
|
|
459
469
|
--------
|
|
@@ -462,11 +472,13 @@ cdef class LexborNode:
|
|
|
462
472
|
>>> tree.body.unwrap_tags(['i','a'])
|
|
463
473
|
>>> tree.body.html
|
|
464
474
|
'<body><div>Hello world!</div></body>'
|
|
475
|
+
|
|
476
|
+
Note: by default, empty tags are ignored; use "delete_empty" to change this.
|
|
465
477
|
"""
|
|
466
478
|
|
|
467
479
|
for tag in tags:
|
|
468
480
|
for element in self.css(tag):
|
|
469
|
-
element.unwrap()
|
|
481
|
+
element.unwrap(delete_empty)
|
|
470
482
|
|
|
471
483
|
|
|
472
484
|
def traverse(self, include_text=False):
|
|
@@ -655,6 +667,57 @@ cdef class LexborNode:
|
|
|
655
667
|
else:
|
|
656
668
|
raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
|
|
657
669
|
|
|
670
|
+
def insert_child(self, str_or_LexborNode value):
|
|
671
|
+
"""
|
|
672
|
+
Insert a node inside (at the end of) the current Node.
|
|
673
|
+
|
|
674
|
+
Parameters
|
|
675
|
+
----------
|
|
676
|
+
value : str, bytes or Node
|
|
677
|
+
The text or Node instance to insert inside the Node.
|
|
678
|
+
When a text string is passed, it's treated as text. All HTML tags will be escaped.
|
|
679
|
+
Convert and pass the ``Node`` object when you want to work with HTML.
|
|
680
|
+
Does not clone the ``Node`` object.
|
|
681
|
+
All future changes to the passed ``Node`` object will also be taken into account.
|
|
682
|
+
|
|
683
|
+
Examples
|
|
684
|
+
--------
|
|
685
|
+
|
|
686
|
+
>>> tree = LexborHTMLParser('<div>Get <img src=""></div>')
|
|
687
|
+
>>> div = tree.css_first('div')
|
|
688
|
+
>>> div.insert_child('Laptop')
|
|
689
|
+
>>> tree.body.child.html
|
|
690
|
+
'<div>Get <img src="">Laptop</div>'
|
|
691
|
+
|
|
692
|
+
>>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"> <div>Laptop</div> </span></div>')
|
|
693
|
+
>>> html_parser2 = LexborHTMLParser('<div>Test</div>')
|
|
694
|
+
>>> span_node = html_parser.css_first('span')
|
|
695
|
+
>>> span_node.insert_child(html_parser2.body.child)
|
|
696
|
+
'<div>Get <span alt="Laptop"> <div>Laptop</div> <div>Test</div> </span></div>'
|
|
697
|
+
"""
|
|
698
|
+
cdef lxb_dom_node_t * new_node
|
|
699
|
+
|
|
700
|
+
if isinstance(value, (str, bytes, unicode)):
|
|
701
|
+
bytes_val = to_bytes(value)
|
|
702
|
+
new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
|
|
703
|
+
&self.parser.document.dom_document,
|
|
704
|
+
<lxb_char_t *> bytes_val, len(bytes_val)
|
|
705
|
+
)
|
|
706
|
+
if new_node == NULL:
|
|
707
|
+
raise SelectolaxError("Can't create a new node")
|
|
708
|
+
lxb_dom_node_insert_child(self.node, new_node)
|
|
709
|
+
elif isinstance(value, LexborNode):
|
|
710
|
+
new_node = lxb_dom_document_import_node(
|
|
711
|
+
&self.parser.document.dom_document,
|
|
712
|
+
<lxb_dom_node_t *> value.node,
|
|
713
|
+
<bint> True
|
|
714
|
+
)
|
|
715
|
+
if new_node == NULL:
|
|
716
|
+
raise SelectolaxError("Can't create a new node")
|
|
717
|
+
lxb_dom_node_insert_child(self.node, <lxb_dom_node_t *> new_node)
|
|
718
|
+
else:
|
|
719
|
+
raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
|
|
720
|
+
|
|
658
721
|
@property
|
|
659
722
|
def raw_value(self):
|
|
660
723
|
"""Return the raw (unparsed, original) value of a node.
|
selectolax/lexbor/selection.pxi
CHANGED
|
@@ -28,7 +28,7 @@ cdef class LexborCSSSelector:
|
|
|
28
28
|
|
|
29
29
|
self.selectors = lxb_selectors_create()
|
|
30
30
|
status = lxb_selectors_init(self.selectors)
|
|
31
|
-
|
|
31
|
+
lxb_selectors_opt_set(self.selectors, LXB_SELECTORS_OPT_MATCH_ROOT)
|
|
32
32
|
if status != LXB_STATUS_OK:
|
|
33
33
|
raise SelectolaxError("Can't initialize CSS selector.")
|
|
34
34
|
|
|
@@ -72,14 +72,17 @@ cdef class LexborCSSSelector:
|
|
|
72
72
|
raise SelectolaxError("Can't parse CSS selector.")
|
|
73
73
|
result = bool(self.results)
|
|
74
74
|
self.results = []
|
|
75
|
+
lxb_css_selector_list_destroy_memory(selectors_list)
|
|
75
76
|
return result
|
|
76
77
|
|
|
77
78
|
|
|
78
79
|
def __dealloc__(self):
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
80
|
+
if self.selectors != NULL:
|
|
81
|
+
lxb_selectors_destroy(self.selectors, True)
|
|
82
|
+
if self.parser != NULL:
|
|
83
|
+
lxb_css_parser_destroy(self.parser, True)
|
|
84
|
+
if self.css_selectors != NULL:
|
|
85
|
+
lxb_css_selectors_destroy(self.css_selectors, True)
|
|
83
86
|
|
|
84
87
|
|
|
85
88
|
|
|
@@ -103,16 +106,16 @@ cdef class LexborSelector:
|
|
|
103
106
|
raise SelectolaxError("This features is not supported by the lexbor backend. Please use Modest backend.")
|
|
104
107
|
|
|
105
108
|
@property
|
|
106
|
-
def matches(self):
|
|
109
|
+
def matches(self) -> list:
|
|
107
110
|
"""Returns all possible matches"""
|
|
108
111
|
return self.nodes
|
|
109
112
|
|
|
110
113
|
@property
|
|
111
|
-
def any_matches(self):
|
|
114
|
+
def any_matches(self) -> bool:
|
|
112
115
|
"""Returns True if there are any matches"""
|
|
113
116
|
return bool(self.nodes)
|
|
114
117
|
|
|
115
|
-
def text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
|
|
118
|
+
def text_contains(self, str text, bool deep=True, str separator='', bool strip=False) -> LexborSelector:
|
|
116
119
|
"""Filter all current matches given text."""
|
|
117
120
|
nodes = []
|
|
118
121
|
for node in self.nodes:
|
|
@@ -122,7 +125,7 @@ cdef class LexborSelector:
|
|
|
122
125
|
self.nodes = nodes
|
|
123
126
|
return self
|
|
124
127
|
|
|
125
|
-
def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
|
|
128
|
+
def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False) -> bool:
|
|
126
129
|
"""Returns True if any node in the current search scope contains specified text"""
|
|
127
130
|
nodes = []
|
|
128
131
|
for node in self.nodes:
|
|
@@ -131,7 +134,7 @@ cdef class LexborSelector:
|
|
|
131
134
|
return True
|
|
132
135
|
return False
|
|
133
136
|
|
|
134
|
-
def attribute_longer_than(self, str attribute, int length, str start = None):
|
|
137
|
+
def attribute_longer_than(self, str attribute, int length, str start = None) -> LexborSelector:
|
|
135
138
|
"""Filter all current matches by attribute length.
|
|
136
139
|
|
|
137
140
|
Similar to `string-length` in XPath.
|
|
@@ -146,7 +149,7 @@ cdef class LexborSelector:
|
|
|
146
149
|
self.nodes = nodes
|
|
147
150
|
return self
|
|
148
151
|
|
|
149
|
-
def any_attribute_longer_than(self, str attribute, int length, str start = None):
|
|
152
|
+
def any_attribute_longer_than(self, str attribute, int length, str start = None) -> bool:
|
|
150
153
|
"""Returns True if any href attribute is longer than a specified length.
|
|
151
154
|
|
|
152
155
|
Similar to `string-length` in XPath.
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
include "../utils.pxi"
|
|
2
|
+
|
|
3
|
+
def create_tag(tag: str):
|
|
4
|
+
"""
|
|
5
|
+
Given an HTML tag name, e.g. `"div"`, create a single empty node for that tag,
|
|
6
|
+
e.g. `"<div></div>"`.
|
|
7
|
+
"""
|
|
8
|
+
return do_create_tag(tag, LexborHTMLParser)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def parse_fragment(html: str):
|
|
12
|
+
"""
|
|
13
|
+
Given HTML, parse it into a list of Nodes, such that the nodes
|
|
14
|
+
correspond to the given HTML.
|
|
15
|
+
|
|
16
|
+
In contrast, HTMLParser adds `<html>`, `<head>`, and `<body>` tags
|
|
17
|
+
if they are missing. This function does not add these tags.
|
|
18
|
+
"""
|
|
19
|
+
return do_parse_fragment(html, LexborHTMLParser)
|