selectolax 0.3.34__cp314-cp314t-win_arm64.whl → 0.4.0__cp314-cp314t-win_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +1 -1
- selectolax/lexbor/node.pxi +108 -14
- selectolax/lexbor/node_remove.pxi +29 -0
- selectolax/lexbor/selection.pxi +24 -2
- selectolax/lexbor.c +6101 -4196
- selectolax/lexbor.cp314t-win_arm64.pyd +0 -0
- selectolax/lexbor.pxd +8 -0
- selectolax/lexbor.pyi +84 -7
- selectolax/lexbor.pyx +40 -4
- selectolax/modest/node.pxi +4 -1
- selectolax/parser.c +1540 -1457
- selectolax/parser.cp314t-win_arm64.pyd +0 -0
- selectolax/parser.pyi +5 -2
- selectolax/parser.pyx +2 -2
- {selectolax-0.3.34.dist-info → selectolax-0.4.0.dist-info}/METADATA +2 -2
- selectolax-0.4.0.dist-info/RECORD +27 -0
- selectolax-0.3.34.dist-info/RECORD +0 -26
- {selectolax-0.3.34.dist-info → selectolax-0.4.0.dist-info}/WHEEL +0 -0
- {selectolax-0.3.34.dist-info → selectolax-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {selectolax-0.3.34.dist-info → selectolax-0.4.0.dist-info}/top_level.txt +0 -0
|
Binary file
|
selectolax/lexbor.pxd
CHANGED
|
@@ -215,6 +215,8 @@ cdef extern from "lexbor/html/html.h" nogil:
|
|
|
215
215
|
|
|
216
216
|
size_t ref_count
|
|
217
217
|
|
|
218
|
+
ctypedef struct lxb_html_element_t
|
|
219
|
+
|
|
218
220
|
# Functions
|
|
219
221
|
lxb_html_document_t * lxb_html_document_create()
|
|
220
222
|
lxb_status_t lxb_html_document_parse(lxb_html_document_t *document, const lxb_char_t *html, size_t size)
|
|
@@ -223,6 +225,9 @@ cdef extern from "lexbor/html/html.h" nogil:
|
|
|
223
225
|
lxb_dom_element_t * lxb_dom_document_element(lxb_dom_document_t *document)
|
|
224
226
|
|
|
225
227
|
lxb_status_t lxb_html_serialize_tree_str(lxb_dom_node_t *node, lexbor_str_t *str)
|
|
228
|
+
lxb_status_t lxb_html_serialize_deep_str(lxb_dom_node_t *node, lexbor_str_t *str)
|
|
229
|
+
lxb_html_element_t* lxb_html_element_inner_html_set(lxb_html_element_t *element,
|
|
230
|
+
const lxb_char_t *html, size_t size)
|
|
226
231
|
|
|
227
232
|
cdef class LexborNode:
|
|
228
233
|
cdef:
|
|
@@ -241,6 +246,8 @@ cdef class LexborCSSSelector:
|
|
|
241
246
|
cdef public LexborNode current_node
|
|
242
247
|
cdef int _create_css_parser(self) except -1
|
|
243
248
|
cpdef list find(self, str query, LexborNode node)
|
|
249
|
+
cpdef list find_first(self, str query, LexborNode node)
|
|
250
|
+
cpdef list _find(self, str query, LexborNode node, bint only_first)
|
|
244
251
|
cpdef int any_matches(self, str query, LexborNode node) except -1
|
|
245
252
|
|
|
246
253
|
cdef class LexborHTMLParser:
|
|
@@ -318,6 +325,7 @@ cdef extern from "lexbor/dom/dom.h" nogil:
|
|
|
318
325
|
void lxb_dom_node_insert_after(lxb_dom_node_t *to, lxb_dom_node_t *node)
|
|
319
326
|
lxb_dom_text_t * lxb_dom_document_create_text_node(lxb_dom_document_t *document, const lxb_char_t *data, size_t len)
|
|
320
327
|
void lxb_dom_node_simple_walk(lxb_dom_node_t *root, lxb_dom_node_simple_walker_f walker_cb, void *ctx)
|
|
328
|
+
lxb_dom_node_t* lxb_dom_node_clone(lxb_dom_node_t *node, bint deep)
|
|
321
329
|
|
|
322
330
|
|
|
323
331
|
cdef extern from "lexbor/dom/interfaces/element.h" nogil:
|
selectolax/lexbor.pyi
CHANGED
|
@@ -71,6 +71,34 @@ class LexborSelector:
|
|
|
71
71
|
"""
|
|
72
72
|
...
|
|
73
73
|
|
|
74
|
+
@property
|
|
75
|
+
def inner_html(self) -> str | None:
|
|
76
|
+
"""Return HTML representation of the child nodes.
|
|
77
|
+
|
|
78
|
+
Works similar to innerHTML in JavaScript.
|
|
79
|
+
Unlike the `.html` property, does not include the current node.
|
|
80
|
+
Can be used to set HTML as well. See the setter docstring.
|
|
81
|
+
|
|
82
|
+
Returns
|
|
83
|
+
-------
|
|
84
|
+
text : str or None
|
|
85
|
+
"""
|
|
86
|
+
...
|
|
87
|
+
|
|
88
|
+
@inner_html.setter
|
|
89
|
+
def inner_html(self, html: str):
|
|
90
|
+
"""Set inner HTML to the specified HTML.
|
|
91
|
+
|
|
92
|
+
Replaces existing data inside the node.
|
|
93
|
+
Works similar to innerHTML in JavaScript.
|
|
94
|
+
|
|
95
|
+
Parameters
|
|
96
|
+
----------
|
|
97
|
+
html : str
|
|
98
|
+
|
|
99
|
+
"""
|
|
100
|
+
...
|
|
101
|
+
|
|
74
102
|
class LexborCSSSelector:
|
|
75
103
|
def __init__(self): ...
|
|
76
104
|
def find(self, query: str, node: LexborNode) -> list[LexborNode]: ...
|
|
@@ -84,7 +112,10 @@ class LexborNode:
|
|
|
84
112
|
def mem_id(self) -> int: ...
|
|
85
113
|
@property
|
|
86
114
|
def child(self) -> LexborNode | None:
|
|
87
|
-
"""Alias for the first_child property.
|
|
115
|
+
"""Alias for the `first_child` property.
|
|
116
|
+
|
|
117
|
+
**Deprecated**. Please use `first_child` instead.
|
|
118
|
+
"""
|
|
88
119
|
...
|
|
89
120
|
@property
|
|
90
121
|
def first_child(self) -> LexborNode | None:
|
|
@@ -173,7 +204,7 @@ class LexborNode:
|
|
|
173
204
|
query : str
|
|
174
205
|
default : bool, default None
|
|
175
206
|
Default value to return if there is no match.
|
|
176
|
-
strict: bool, default
|
|
207
|
+
strict: bool, default False
|
|
177
208
|
Set to True if you want to check if there is strictly only one match in the document.
|
|
178
209
|
|
|
179
210
|
|
|
@@ -194,7 +225,7 @@ class LexborNode:
|
|
|
194
225
|
query : str
|
|
195
226
|
default : bool, default None
|
|
196
227
|
Default value to return if there is no match.
|
|
197
|
-
strict: bool, default
|
|
228
|
+
strict: bool, default False
|
|
198
229
|
Set to True if you want to check if there is strictly only one match in the document.
|
|
199
230
|
|
|
200
231
|
|
|
@@ -215,7 +246,7 @@ class LexborNode:
|
|
|
215
246
|
query : str
|
|
216
247
|
default : bool, default None
|
|
217
248
|
Default value to return if there is no match.
|
|
218
|
-
strict: bool, default
|
|
249
|
+
strict: bool, default False
|
|
219
250
|
Set to True if you want to check if there is strictly only one match in the document.
|
|
220
251
|
|
|
221
252
|
|
|
@@ -236,6 +267,12 @@ class LexborNode:
|
|
|
236
267
|
def tag(self) -> str | None:
|
|
237
268
|
"""Return the name of the current tag (e.g. div, p, img).
|
|
238
269
|
|
|
270
|
+
For non-tag nodes, returns the following names:
|
|
271
|
+
|
|
272
|
+
* `-text` - text node
|
|
273
|
+
* `-document` - document node
|
|
274
|
+
* `-comment` - comment node
|
|
275
|
+
|
|
239
276
|
Returns
|
|
240
277
|
-------
|
|
241
278
|
text : str
|
|
@@ -351,6 +388,8 @@ class LexborNode:
|
|
|
351
388
|
def unwrap(self, delete_empty: bool = False) -> None:
|
|
352
389
|
"""Replace node with whatever is inside this node.
|
|
353
390
|
|
|
391
|
+
Does nothing if you unwrap the same node a second time.
|
|
392
|
+
|
|
354
393
|
Parameters
|
|
355
394
|
----------
|
|
356
395
|
delete_empty : bool, default False
|
|
@@ -608,6 +647,44 @@ class LexborNode:
|
|
|
608
647
|
"""
|
|
609
648
|
...
|
|
610
649
|
|
|
650
|
+
@property
|
|
651
|
+
def inner_html(self) -> str | None:
|
|
652
|
+
"""Return HTML representation of the child nodes.
|
|
653
|
+
|
|
654
|
+
Works similar to innerHTML in JavaScript.
|
|
655
|
+
Unlike the `.html` property, does not include the current node.
|
|
656
|
+
Can be used to set HTML as well. See the setter docstring.
|
|
657
|
+
|
|
658
|
+
Returns
|
|
659
|
+
-------
|
|
660
|
+
text : str or None
|
|
661
|
+
"""
|
|
662
|
+
...
|
|
663
|
+
|
|
664
|
+
@inner_html.setter
|
|
665
|
+
def inner_html(self, html: str):
|
|
666
|
+
"""Set inner HTML to the specified HTML.
|
|
667
|
+
|
|
668
|
+
Replaces existing data inside the node.
|
|
669
|
+
Works similar to innerHTML in JavaScript.
|
|
670
|
+
|
|
671
|
+
Parameters
|
|
672
|
+
----------
|
|
673
|
+
html : str
|
|
674
|
+
|
|
675
|
+
"""
|
|
676
|
+
...
|
|
677
|
+
|
|
678
|
+
def clone(self) -> LexborNode:
|
|
679
|
+
"""Clone the current node.
|
|
680
|
+
|
|
681
|
+
You can use it to make temporary modifications without affecting the original HTML tree.
|
|
682
|
+
|
|
683
|
+
It is tied to the current parser instance.
|
|
684
|
+
Gets destroyed when parser instance is destroyed.
|
|
685
|
+
"""
|
|
686
|
+
...
|
|
687
|
+
|
|
611
688
|
class LexborHTMLParser:
|
|
612
689
|
"""The lexbor HTML parser.
|
|
613
690
|
|
|
@@ -699,7 +776,7 @@ class LexborHTMLParser:
|
|
|
699
776
|
query : str
|
|
700
777
|
default : bool, default None
|
|
701
778
|
Default value to return if there is no match.
|
|
702
|
-
strict: bool, default
|
|
779
|
+
strict: bool, default False
|
|
703
780
|
Set to True if you want to check if there is strictly only one match in the document.
|
|
704
781
|
|
|
705
782
|
|
|
@@ -720,7 +797,7 @@ class LexborHTMLParser:
|
|
|
720
797
|
query : str
|
|
721
798
|
default : bool, default None
|
|
722
799
|
Default value to return if there is no match.
|
|
723
|
-
strict: bool, default
|
|
800
|
+
strict: bool, default False
|
|
724
801
|
Set to True if you want to check if there is strictly only one match in the document.
|
|
725
802
|
|
|
726
803
|
|
|
@@ -741,7 +818,7 @@ class LexborHTMLParser:
|
|
|
741
818
|
query : str
|
|
742
819
|
default : bool, default None
|
|
743
820
|
Default value to return if there is no match.
|
|
744
|
-
strict: bool, default
|
|
821
|
+
strict: bool, default False
|
|
745
822
|
Set to True if you want to check if there is strictly only one match in the document.
|
|
746
823
|
|
|
747
824
|
|
selectolax/lexbor.pyx
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from cpython.bool cimport bool
|
|
2
2
|
from cpython.exc cimport PyErr_SetObject
|
|
3
3
|
|
|
4
|
+
|
|
4
5
|
_ENCODING = 'UTF-8'
|
|
5
6
|
|
|
6
7
|
include "base.pxi"
|
|
@@ -9,6 +10,7 @@ include "lexbor/attrs.pxi"
|
|
|
9
10
|
include "lexbor/node.pxi"
|
|
10
11
|
include "lexbor/selection.pxi"
|
|
11
12
|
include "lexbor/util.pxi"
|
|
13
|
+
include "lexbor/node_remove.pxi"
|
|
12
14
|
|
|
13
15
|
# We don't inherit from HTMLParser here, because it also includes all the C code from Modest.
|
|
14
16
|
|
|
@@ -192,9 +194,9 @@ cdef class LexborHTMLParser:
|
|
|
192
194
|
----------
|
|
193
195
|
|
|
194
196
|
query : str
|
|
195
|
-
default :
|
|
197
|
+
default : Any, default None
|
|
196
198
|
Default value to return if there is no match.
|
|
197
|
-
strict: bool, default
|
|
199
|
+
strict: bool, default False
|
|
198
200
|
Set to True if you want to check if there is strictly only one match in the document.
|
|
199
201
|
|
|
200
202
|
|
|
@@ -211,7 +213,7 @@ cdef class LexborHTMLParser:
|
|
|
211
213
|
----------
|
|
212
214
|
tags : list of str
|
|
213
215
|
List of tags to remove.
|
|
214
|
-
recursive : bool, default
|
|
216
|
+
recursive : bool, default False
|
|
215
217
|
Whether to delete all its child nodes
|
|
216
218
|
|
|
217
219
|
Examples
|
|
@@ -334,7 +336,13 @@ cdef class LexborHTMLParser:
|
|
|
334
336
|
return obj
|
|
335
337
|
|
|
336
338
|
def clone(self):
|
|
337
|
-
"""Clone the current
|
|
339
|
+
"""Clone the current node.
|
|
340
|
+
|
|
341
|
+
You can use it to make temporary modifications without affecting the original HTML tree.
|
|
342
|
+
|
|
343
|
+
It is tied to the current parser instance.
|
|
344
|
+
Gets destroyed when parser instance is destroyed.
|
|
345
|
+
"""
|
|
338
346
|
cdef lxb_html_document_t* cloned_document
|
|
339
347
|
cdef lxb_dom_node_t* cloned_node
|
|
340
348
|
cdef LexborHTMLParser cls
|
|
@@ -386,3 +394,31 @@ cdef class LexborHTMLParser:
|
|
|
386
394
|
# faster to check if the document is empty which should determine if we have a root
|
|
387
395
|
if self.document != NULL:
|
|
388
396
|
self.root.unwrap_tags(tags, delete_empty=delete_empty)
|
|
397
|
+
|
|
398
|
+
@property
|
|
399
|
+
def inner_html(self) -> str:
|
|
400
|
+
"""Return HTML representation of the child nodes.
|
|
401
|
+
|
|
402
|
+
Works similar to innerHTML in JavaScript.
|
|
403
|
+
Unlike the `.html` property, does not include the current node.
|
|
404
|
+
Can be used to set HTML as well. See the setter docstring.
|
|
405
|
+
|
|
406
|
+
Returns
|
|
407
|
+
-------
|
|
408
|
+
text : str | None
|
|
409
|
+
"""
|
|
410
|
+
return self.root.inner_html
|
|
411
|
+
|
|
412
|
+
@inner_html.setter
|
|
413
|
+
def inner_html(self, str html):
|
|
414
|
+
"""Set inner HTML to the specified HTML.
|
|
415
|
+
|
|
416
|
+
Replaces existing data inside the node.
|
|
417
|
+
Works similar to innerHTML in JavaScript.
|
|
418
|
+
|
|
419
|
+
Parameters
|
|
420
|
+
----------
|
|
421
|
+
html : str
|
|
422
|
+
|
|
423
|
+
"""
|
|
424
|
+
self.root.inner_html = html
|
selectolax/modest/node.pxi
CHANGED
|
@@ -397,7 +397,10 @@ cdef class Node:
|
|
|
397
397
|
|
|
398
398
|
@property
|
|
399
399
|
def child(self):
|
|
400
|
-
"""
|
|
400
|
+
"""Alias for the `first_child` property.
|
|
401
|
+
|
|
402
|
+
**Deprecated**. Please use `first_child` instead.
|
|
403
|
+
"""
|
|
401
404
|
cdef Node node
|
|
402
405
|
if self.node.child:
|
|
403
406
|
node = Node.new(self.node.child, self.parser)
|