selectolax 0.3.33__cp313-cp313-win_amd64.whl → 0.4.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +1 -1
- selectolax/lexbor/node.pxi +114 -14
- selectolax/lexbor/node_remove.pxi +29 -0
- selectolax/lexbor/selection.pxi +24 -2
- selectolax/lexbor.c +6126 -4206
- selectolax/lexbor.cp313-win_amd64.pyd +0 -0
- selectolax/lexbor.pxd +8 -0
- selectolax/lexbor.pyi +96 -7
- selectolax/lexbor.pyx +45 -4
- selectolax/modest/node.pxi +4 -1
- selectolax/parser.c +1559 -1461
- selectolax/parser.cp313-win_amd64.pyd +0 -0
- selectolax/parser.pyi +5 -2
- selectolax/parser.pyx +2 -2
- selectolax-0.4.0.dist-info/METADATA +32 -0
- selectolax-0.4.0.dist-info/RECORD +27 -0
- selectolax-0.3.33.dist-info/METADATA +0 -187
- selectolax-0.3.33.dist-info/RECORD +0 -26
- {selectolax-0.3.33.dist-info → selectolax-0.4.0.dist-info}/WHEEL +0 -0
- {selectolax-0.3.33.dist-info → selectolax-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {selectolax-0.3.33.dist-info → selectolax-0.4.0.dist-info}/top_level.txt +0 -0
|
Binary file
|
selectolax/lexbor.pxd
CHANGED
|
@@ -215,6 +215,8 @@ cdef extern from "lexbor/html/html.h" nogil:
|
|
|
215
215
|
|
|
216
216
|
size_t ref_count
|
|
217
217
|
|
|
218
|
+
ctypedef struct lxb_html_element_t
|
|
219
|
+
|
|
218
220
|
# Functions
|
|
219
221
|
lxb_html_document_t * lxb_html_document_create()
|
|
220
222
|
lxb_status_t lxb_html_document_parse(lxb_html_document_t *document, const lxb_char_t *html, size_t size)
|
|
@@ -223,6 +225,9 @@ cdef extern from "lexbor/html/html.h" nogil:
|
|
|
223
225
|
lxb_dom_element_t * lxb_dom_document_element(lxb_dom_document_t *document)
|
|
224
226
|
|
|
225
227
|
lxb_status_t lxb_html_serialize_tree_str(lxb_dom_node_t *node, lexbor_str_t *str)
|
|
228
|
+
lxb_status_t lxb_html_serialize_deep_str(lxb_dom_node_t *node, lexbor_str_t *str)
|
|
229
|
+
lxb_html_element_t* lxb_html_element_inner_html_set(lxb_html_element_t *element,
|
|
230
|
+
const lxb_char_t *html, size_t size)
|
|
226
231
|
|
|
227
232
|
cdef class LexborNode:
|
|
228
233
|
cdef:
|
|
@@ -241,6 +246,8 @@ cdef class LexborCSSSelector:
|
|
|
241
246
|
cdef public LexborNode current_node
|
|
242
247
|
cdef int _create_css_parser(self) except -1
|
|
243
248
|
cpdef list find(self, str query, LexborNode node)
|
|
249
|
+
cpdef list find_first(self, str query, LexborNode node)
|
|
250
|
+
cpdef list _find(self, str query, LexborNode node, bint only_first)
|
|
244
251
|
cpdef int any_matches(self, str query, LexborNode node) except -1
|
|
245
252
|
|
|
246
253
|
cdef class LexborHTMLParser:
|
|
@@ -318,6 +325,7 @@ cdef extern from "lexbor/dom/dom.h" nogil:
|
|
|
318
325
|
void lxb_dom_node_insert_after(lxb_dom_node_t *to, lxb_dom_node_t *node)
|
|
319
326
|
lxb_dom_text_t * lxb_dom_document_create_text_node(lxb_dom_document_t *document, const lxb_char_t *data, size_t len)
|
|
320
327
|
void lxb_dom_node_simple_walk(lxb_dom_node_t *root, lxb_dom_node_simple_walker_f walker_cb, void *ctx)
|
|
328
|
+
lxb_dom_node_t* lxb_dom_node_clone(lxb_dom_node_t *node, bint deep)
|
|
321
329
|
|
|
322
330
|
|
|
323
331
|
cdef extern from "lexbor/dom/interfaces/element.h" nogil:
|
selectolax/lexbor.pyi
CHANGED
|
@@ -71,6 +71,34 @@ class LexborSelector:
|
|
|
71
71
|
"""
|
|
72
72
|
...
|
|
73
73
|
|
|
74
|
+
@property
|
|
75
|
+
def inner_html(self) -> str | None:
|
|
76
|
+
"""Return HTML representation of the child nodes.
|
|
77
|
+
|
|
78
|
+
Works similar to innerHTML in JavaScript.
|
|
79
|
+
Unlike the `.html` property, does not include the current node.
|
|
80
|
+
Can be used to set HTML as well. See the setter docstring.
|
|
81
|
+
|
|
82
|
+
Returns
|
|
83
|
+
-------
|
|
84
|
+
text : str or None
|
|
85
|
+
"""
|
|
86
|
+
...
|
|
87
|
+
|
|
88
|
+
@inner_html.setter
|
|
89
|
+
def inner_html(self, html: str):
|
|
90
|
+
"""Set inner HTML to the specified HTML.
|
|
91
|
+
|
|
92
|
+
Replaces existing data inside the node.
|
|
93
|
+
Works similar to innerHTML in JavaScript.
|
|
94
|
+
|
|
95
|
+
Parameters
|
|
96
|
+
----------
|
|
97
|
+
html : str
|
|
98
|
+
|
|
99
|
+
"""
|
|
100
|
+
...
|
|
101
|
+
|
|
74
102
|
class LexborCSSSelector:
|
|
75
103
|
def __init__(self): ...
|
|
76
104
|
def find(self, query: str, node: LexborNode) -> list[LexborNode]: ...
|
|
@@ -84,7 +112,10 @@ class LexborNode:
|
|
|
84
112
|
def mem_id(self) -> int: ...
|
|
85
113
|
@property
|
|
86
114
|
def child(self) -> LexborNode | None:
|
|
87
|
-
"""Alias for the first_child property.
|
|
115
|
+
"""Alias for the `first_child` property.
|
|
116
|
+
|
|
117
|
+
**Deprecated**. Please use `first_child` instead.
|
|
118
|
+
"""
|
|
88
119
|
...
|
|
89
120
|
@property
|
|
90
121
|
def first_child(self) -> LexborNode | None:
|
|
@@ -145,6 +176,12 @@ class LexborNode:
|
|
|
145
176
|
Matches pattern `query` against HTML tree.
|
|
146
177
|
`CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
|
|
147
178
|
|
|
179
|
+
Special selectors:
|
|
180
|
+
|
|
181
|
+
- parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
|
|
182
|
+
- parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
|
|
183
|
+
|
|
184
|
+
|
|
148
185
|
Parameters
|
|
149
186
|
----------
|
|
150
187
|
query : str
|
|
@@ -167,7 +204,7 @@ class LexborNode:
|
|
|
167
204
|
query : str
|
|
168
205
|
default : bool, default None
|
|
169
206
|
Default value to return if there is no match.
|
|
170
|
-
strict: bool, default
|
|
207
|
+
strict: bool, default False
|
|
171
208
|
Set to True if you want to check if there is strictly only one match in the document.
|
|
172
209
|
|
|
173
210
|
|
|
@@ -188,7 +225,7 @@ class LexborNode:
|
|
|
188
225
|
query : str
|
|
189
226
|
default : bool, default None
|
|
190
227
|
Default value to return if there is no match.
|
|
191
|
-
strict: bool, default
|
|
228
|
+
strict: bool, default False
|
|
192
229
|
Set to True if you want to check if there is strictly only one match in the document.
|
|
193
230
|
|
|
194
231
|
|
|
@@ -209,7 +246,7 @@ class LexborNode:
|
|
|
209
246
|
query : str
|
|
210
247
|
default : bool, default None
|
|
211
248
|
Default value to return if there is no match.
|
|
212
|
-
strict: bool, default
|
|
249
|
+
strict: bool, default False
|
|
213
250
|
Set to True if you want to check if there is strictly only one match in the document.
|
|
214
251
|
|
|
215
252
|
|
|
@@ -230,6 +267,12 @@ class LexborNode:
|
|
|
230
267
|
def tag(self) -> str | None:
|
|
231
268
|
"""Return the name of the current tag (e.g. div, p, img).
|
|
232
269
|
|
|
270
|
+
For non-tag nodes, returns the following names:
|
|
271
|
+
|
|
272
|
+
* `-text` - text node
|
|
273
|
+
* `-document` - document node
|
|
274
|
+
* `-comment` - comment node
|
|
275
|
+
|
|
233
276
|
Returns
|
|
234
277
|
-------
|
|
235
278
|
text : str
|
|
@@ -345,6 +388,8 @@ class LexborNode:
|
|
|
345
388
|
def unwrap(self, delete_empty: bool = False) -> None:
|
|
346
389
|
"""Replace node with whatever is inside this node.
|
|
347
390
|
|
|
391
|
+
Does nothing if you unwrap the same node a second time.
|
|
392
|
+
|
|
348
393
|
Parameters
|
|
349
394
|
----------
|
|
350
395
|
delete_empty : bool, default False
|
|
@@ -602,6 +647,44 @@ class LexborNode:
|
|
|
602
647
|
"""
|
|
603
648
|
...
|
|
604
649
|
|
|
650
|
+
@property
|
|
651
|
+
def inner_html(self) -> str | None:
|
|
652
|
+
"""Return HTML representation of the child nodes.
|
|
653
|
+
|
|
654
|
+
Works similar to innerHTML in JavaScript.
|
|
655
|
+
Unlike the `.html` property, does not include the current node.
|
|
656
|
+
Can be used to set HTML as well. See the setter docstring.
|
|
657
|
+
|
|
658
|
+
Returns
|
|
659
|
+
-------
|
|
660
|
+
text : str or None
|
|
661
|
+
"""
|
|
662
|
+
...
|
|
663
|
+
|
|
664
|
+
@inner_html.setter
|
|
665
|
+
def inner_html(self, html: str):
|
|
666
|
+
"""Set inner HTML to the specified HTML.
|
|
667
|
+
|
|
668
|
+
Replaces existing data inside the node.
|
|
669
|
+
Works similar to innerHTML in JavaScript.
|
|
670
|
+
|
|
671
|
+
Parameters
|
|
672
|
+
----------
|
|
673
|
+
html : str
|
|
674
|
+
|
|
675
|
+
"""
|
|
676
|
+
...
|
|
677
|
+
|
|
678
|
+
def clone(self) -> LexborNode:
|
|
679
|
+
"""Clone the current node.
|
|
680
|
+
|
|
681
|
+
You can use it to make temporary modifications without affecting the original HTML tree.
|
|
682
|
+
|
|
683
|
+
It is tied to the current parser instance.
|
|
684
|
+
Gets destroyed when parser instance is destroyed.
|
|
685
|
+
"""
|
|
686
|
+
...
|
|
687
|
+
|
|
605
688
|
class LexborHTMLParser:
|
|
606
689
|
"""The lexbor HTML parser.
|
|
607
690
|
|
|
@@ -665,6 +748,12 @@ class LexborHTMLParser:
|
|
|
665
748
|
Matches pattern `query` against HTML tree.
|
|
666
749
|
`CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
|
|
667
750
|
|
|
751
|
+
Special selectors:
|
|
752
|
+
|
|
753
|
+
- parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
|
|
754
|
+
- parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
|
|
755
|
+
|
|
756
|
+
|
|
668
757
|
Parameters
|
|
669
758
|
----------
|
|
670
759
|
query : str
|
|
@@ -687,7 +776,7 @@ class LexborHTMLParser:
|
|
|
687
776
|
query : str
|
|
688
777
|
default : bool, default None
|
|
689
778
|
Default value to return if there is no match.
|
|
690
|
-
strict: bool, default
|
|
779
|
+
strict: bool, default False
|
|
691
780
|
Set to True if you want to check if there is strictly only one match in the document.
|
|
692
781
|
|
|
693
782
|
|
|
@@ -708,7 +797,7 @@ class LexborHTMLParser:
|
|
|
708
797
|
query : str
|
|
709
798
|
default : bool, default None
|
|
710
799
|
Default value to return if there is no match.
|
|
711
|
-
strict: bool, default
|
|
800
|
+
strict: bool, default False
|
|
712
801
|
Set to True if you want to check if there is strictly only one match in the document.
|
|
713
802
|
|
|
714
803
|
|
|
@@ -729,7 +818,7 @@ class LexborHTMLParser:
|
|
|
729
818
|
query : str
|
|
730
819
|
default : bool, default None
|
|
731
820
|
Default value to return if there is no match.
|
|
732
|
-
strict: bool, default
|
|
821
|
+
strict: bool, default False
|
|
733
822
|
Set to True if you want to check if there is strictly only one match in the document.
|
|
734
823
|
|
|
735
824
|
|
selectolax/lexbor.pyx
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from cpython.bool cimport bool
|
|
2
2
|
from cpython.exc cimport PyErr_SetObject
|
|
3
3
|
|
|
4
|
+
|
|
4
5
|
_ENCODING = 'UTF-8'
|
|
5
6
|
|
|
6
7
|
include "base.pxi"
|
|
@@ -9,6 +10,7 @@ include "lexbor/attrs.pxi"
|
|
|
9
10
|
include "lexbor/node.pxi"
|
|
10
11
|
include "lexbor/selection.pxi"
|
|
11
12
|
include "lexbor/util.pxi"
|
|
13
|
+
include "lexbor/node_remove.pxi"
|
|
12
14
|
|
|
13
15
|
# We don't inherit from HTMLParser here, because it also includes all the C code from Modest.
|
|
14
16
|
|
|
@@ -169,6 +171,11 @@ cdef class LexborHTMLParser:
|
|
|
169
171
|
Matches pattern `query` against HTML tree.
|
|
170
172
|
`CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
|
|
171
173
|
|
|
174
|
+
Special selectors:
|
|
175
|
+
|
|
176
|
+
- parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
|
|
177
|
+
- parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
|
|
178
|
+
|
|
172
179
|
Parameters
|
|
173
180
|
----------
|
|
174
181
|
query : str
|
|
@@ -187,9 +194,9 @@ cdef class LexborHTMLParser:
|
|
|
187
194
|
----------
|
|
188
195
|
|
|
189
196
|
query : str
|
|
190
|
-
default :
|
|
197
|
+
default : Any, default None
|
|
191
198
|
Default value to return if there is no match.
|
|
192
|
-
strict: bool, default
|
|
199
|
+
strict: bool, default False
|
|
193
200
|
Set to True if you want to check if there is strictly only one match in the document.
|
|
194
201
|
|
|
195
202
|
|
|
@@ -206,7 +213,7 @@ cdef class LexborHTMLParser:
|
|
|
206
213
|
----------
|
|
207
214
|
tags : list of str
|
|
208
215
|
List of tags to remove.
|
|
209
|
-
recursive : bool, default
|
|
216
|
+
recursive : bool, default False
|
|
210
217
|
Whether to delete all its child nodes
|
|
211
218
|
|
|
212
219
|
Examples
|
|
@@ -329,7 +336,13 @@ cdef class LexborHTMLParser:
|
|
|
329
336
|
return obj
|
|
330
337
|
|
|
331
338
|
def clone(self):
|
|
332
|
-
"""Clone the current
|
|
339
|
+
"""Clone the current node.
|
|
340
|
+
|
|
341
|
+
You can use it to make temporary modifications without affecting the original HTML tree.
|
|
342
|
+
|
|
343
|
+
It is tied to the current parser instance.
|
|
344
|
+
Gets destroyed when parser instance is destroyed.
|
|
345
|
+
"""
|
|
333
346
|
cdef lxb_html_document_t* cloned_document
|
|
334
347
|
cdef lxb_dom_node_t* cloned_node
|
|
335
348
|
cdef LexborHTMLParser cls
|
|
@@ -381,3 +394,31 @@ cdef class LexborHTMLParser:
|
|
|
381
394
|
# faster to check if the document is empty which should determine if we have a root
|
|
382
395
|
if self.document != NULL:
|
|
383
396
|
self.root.unwrap_tags(tags, delete_empty=delete_empty)
|
|
397
|
+
|
|
398
|
+
@property
|
|
399
|
+
def inner_html(self) -> str:
|
|
400
|
+
"""Return HTML representation of the child nodes.
|
|
401
|
+
|
|
402
|
+
Works similar to innerHTML in JavaScript.
|
|
403
|
+
Unlike the `.html` property, does not include the current node.
|
|
404
|
+
Can be used to set HTML as well. See the setter docstring.
|
|
405
|
+
|
|
406
|
+
Returns
|
|
407
|
+
-------
|
|
408
|
+
text : str | None
|
|
409
|
+
"""
|
|
410
|
+
return self.root.inner_html
|
|
411
|
+
|
|
412
|
+
@inner_html.setter
|
|
413
|
+
def inner_html(self, str html):
|
|
414
|
+
"""Set inner HTML to the specified HTML.
|
|
415
|
+
|
|
416
|
+
Replaces existing data inside the node.
|
|
417
|
+
Works similar to innerHTML in JavaScript.
|
|
418
|
+
|
|
419
|
+
Parameters
|
|
420
|
+
----------
|
|
421
|
+
html : str
|
|
422
|
+
|
|
423
|
+
"""
|
|
424
|
+
self.root.inner_html = html
|
selectolax/modest/node.pxi
CHANGED
|
@@ -397,7 +397,10 @@ cdef class Node:
|
|
|
397
397
|
|
|
398
398
|
@property
|
|
399
399
|
def child(self):
|
|
400
|
-
"""
|
|
400
|
+
"""Alias for the `first_child` property.
|
|
401
|
+
|
|
402
|
+
**Deprecated**. Please use `first_child` instead.
|
|
403
|
+
"""
|
|
401
404
|
cdef Node node
|
|
402
405
|
if self.node.child:
|
|
403
406
|
node = Node.new(self.node.child, self.parser)
|