selectolax 0.3.32__cp310-cp310-win_amd64.whl → 0.3.34__cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +3 -5
- selectolax/lexbor/attrs.pxi +0 -1
- selectolax/lexbor/node.pxi +99 -41
- selectolax/lexbor/selection.pxi +27 -25
- selectolax/lexbor/util.pxi +1 -0
- selectolax/lexbor.c +6412 -6702
- selectolax/lexbor.cp310-win_amd64.pyd +0 -0
- selectolax/lexbor.pxd +32 -35
- selectolax/lexbor.pyi +51 -1
- selectolax/lexbor.pyx +48 -17
- selectolax/modest/node.pxi +37 -36
- selectolax/modest/selection.pxi +24 -22
- selectolax/modest/util.pxi +1 -0
- selectolax/parser.c +4524 -5291
- selectolax/parser.cp310-win_amd64.pyd +0 -0
- selectolax/parser.pxd +17 -20
- selectolax/parser.pyi +2 -2
- selectolax/parser.pyx +28 -31
- selectolax/utils.pxi +13 -3
- selectolax-0.3.34.dist-info/METADATA +32 -0
- selectolax-0.3.34.dist-info/RECORD +26 -0
- selectolax-0.3.32.dist-info/METADATA +0 -187
- selectolax-0.3.32.dist-info/RECORD +0 -26
- {selectolax-0.3.32.dist-info → selectolax-0.3.34.dist-info}/WHEEL +0 -0
- {selectolax-0.3.32.dist-info → selectolax-0.3.34.dist-info}/licenses/LICENSE +0 -0
- {selectolax-0.3.32.dist-info → selectolax-0.3.34.dist-info}/top_level.txt +0 -0
|
Binary file
|
selectolax/lexbor.pxd
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from libc.stdint cimport
|
|
1
|
+
from libc.stdint cimport uint8_t, uint32_t, uintptr_t
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
cdef extern from "lexbor/core/core.h" nogil:
|
|
@@ -31,7 +31,6 @@ cdef extern from "lexbor/core/core.h" nogil:
|
|
|
31
31
|
lexbor_str_t* lexbor_str_create()
|
|
32
32
|
lxb_char_t * lexbor_str_data_noi(lexbor_str_t *str)
|
|
33
33
|
|
|
34
|
-
|
|
35
34
|
cdef extern from "lexbor/html/html.h" nogil:
|
|
36
35
|
ctypedef unsigned int lxb_html_document_opt_t
|
|
37
36
|
|
|
@@ -54,14 +53,12 @@ cdef extern from "lexbor/html/html.h" nogil:
|
|
|
54
53
|
void *events
|
|
55
54
|
|
|
56
55
|
ctypedef struct lexbor_str_t:
|
|
57
|
-
lxb_char_t *data
|
|
58
|
-
size_t length
|
|
59
|
-
|
|
56
|
+
lxb_char_t *data
|
|
57
|
+
size_t length
|
|
60
58
|
|
|
61
59
|
ctypedef struct lxb_dom_node_t:
|
|
62
60
|
lxb_dom_event_target_t event_target
|
|
63
61
|
|
|
64
|
-
|
|
65
62
|
uintptr_t local_name
|
|
66
63
|
uintptr_t prefix
|
|
67
64
|
uintptr_t ns
|
|
@@ -77,7 +74,6 @@ cdef extern from "lexbor/html/html.h" nogil:
|
|
|
77
74
|
|
|
78
75
|
lxb_dom_node_type_t type
|
|
79
76
|
|
|
80
|
-
|
|
81
77
|
ctypedef struct lxb_dom_document_t:
|
|
82
78
|
lxb_dom_node_t node
|
|
83
79
|
|
|
@@ -104,7 +100,6 @@ cdef extern from "lexbor/html/html.h" nogil:
|
|
|
104
100
|
|
|
105
101
|
bint scripting
|
|
106
102
|
|
|
107
|
-
|
|
108
103
|
ctypedef struct lxb_html_document_t:
|
|
109
104
|
lxb_dom_document_t dom_document
|
|
110
105
|
|
|
@@ -128,7 +123,6 @@ cdef extern from "lexbor/html/html.h" nogil:
|
|
|
128
123
|
LXB_HTML_PARSER_STATE_FRAGMENT_PROCESS = 0x03
|
|
129
124
|
LXB_HTML_PARSER_STATE_ERROR = 0x04
|
|
130
125
|
|
|
131
|
-
|
|
132
126
|
ctypedef enum lxb_dom_node_type_t:
|
|
133
127
|
LXB_DOM_NODE_TYPE_ELEMENT = 0x01
|
|
134
128
|
LXB_DOM_NODE_TYPE_ATTRIBUTE = 0x02
|
|
@@ -175,10 +169,9 @@ cdef extern from "lexbor/html/html.h" nogil:
|
|
|
175
169
|
size_t length
|
|
176
170
|
size_t struct_size
|
|
177
171
|
|
|
178
|
-
|
|
179
172
|
ctypedef struct lxb_html_tree_pending_table_t
|
|
180
|
-
ctypedef bint lxb_html_tree_insertion_mode_f
|
|
181
|
-
ctypedef lxb_status_t lxb_html_tree_append_attr_f
|
|
173
|
+
ctypedef bint lxb_html_tree_insertion_mode_f
|
|
174
|
+
ctypedef lxb_status_t lxb_html_tree_append_attr_f
|
|
182
175
|
|
|
183
176
|
ctypedef struct lxb_html_tree_t:
|
|
184
177
|
|
|
@@ -189,13 +182,13 @@ cdef extern from "lexbor/html/html.h" nogil:
|
|
|
189
182
|
|
|
190
183
|
lxb_html_form_element_t *form
|
|
191
184
|
|
|
192
|
-
lexbor_array_t *open_elements
|
|
193
|
-
lexbor_array_t *active_formatting
|
|
194
|
-
lexbor_array_obj_t *template_insertion_modes
|
|
185
|
+
lexbor_array_t *open_elements
|
|
186
|
+
lexbor_array_t *active_formatting
|
|
187
|
+
lexbor_array_obj_t *template_insertion_modes
|
|
195
188
|
|
|
196
|
-
lxb_html_tree_pending_table_t *pending_table
|
|
189
|
+
lxb_html_tree_pending_table_t *pending_table
|
|
197
190
|
|
|
198
|
-
lexbor_array_obj_t *parse_errors
|
|
191
|
+
lexbor_array_obj_t *parse_errors
|
|
199
192
|
|
|
200
193
|
bint foster_parenting
|
|
201
194
|
bint frameset_ok
|
|
@@ -232,9 +225,13 @@ cdef extern from "lexbor/html/html.h" nogil:
|
|
|
232
225
|
lxb_status_t lxb_html_serialize_tree_str(lxb_dom_node_t *node, lexbor_str_t *str)
|
|
233
226
|
|
|
234
227
|
cdef class LexborNode:
|
|
235
|
-
cdef
|
|
236
|
-
|
|
237
|
-
|
|
228
|
+
cdef:
|
|
229
|
+
lxb_dom_node_t *node
|
|
230
|
+
public LexborHTMLParser parser
|
|
231
|
+
|
|
232
|
+
@staticmethod
|
|
233
|
+
cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser)
|
|
234
|
+
|
|
238
235
|
|
|
239
236
|
cdef class LexborCSSSelector:
|
|
240
237
|
cdef lxb_css_parser_t* parser
|
|
@@ -242,15 +239,15 @@ cdef class LexborCSSSelector:
|
|
|
242
239
|
cdef lxb_css_selectors_t * css_selectors
|
|
243
240
|
cdef public list results
|
|
244
241
|
cdef public LexborNode current_node
|
|
245
|
-
cdef _create_css_parser(self)
|
|
246
|
-
cpdef find(self, str query, LexborNode node)
|
|
247
|
-
cpdef any_matches(self, str query, LexborNode node)
|
|
242
|
+
cdef int _create_css_parser(self) except -1
|
|
243
|
+
cpdef list find(self, str query, LexborNode node)
|
|
244
|
+
cpdef int any_matches(self, str query, LexborNode node) except -1
|
|
248
245
|
|
|
249
246
|
cdef class LexborHTMLParser:
|
|
250
247
|
cdef lxb_html_document_t *document
|
|
251
248
|
cdef public bytes raw_html
|
|
252
249
|
cdef LexborCSSSelector _selector
|
|
253
|
-
cdef _parse_html(self, char* html, size_t html_len)
|
|
250
|
+
cdef int _parse_html(self, char* html, size_t html_len) except -1
|
|
254
251
|
cdef object cached_script_texts
|
|
255
252
|
cdef object cached_script_srcs
|
|
256
253
|
|
|
@@ -267,8 +264,8 @@ cdef extern from "lexbor/dom/dom.h" nogil:
|
|
|
267
264
|
ctypedef lexbor_action_t (*lxb_dom_node_simple_walker_f)(lxb_dom_node_t *node, void *ctx)
|
|
268
265
|
|
|
269
266
|
ctypedef struct lxb_dom_character_data_t:
|
|
270
|
-
lxb_dom_node_t node
|
|
271
|
-
lexbor_str_t data
|
|
267
|
+
lxb_dom_node_t node
|
|
268
|
+
lexbor_str_t data
|
|
272
269
|
|
|
273
270
|
ctypedef struct lxb_dom_text_t:
|
|
274
271
|
lxb_dom_character_data_t char_data
|
|
@@ -289,19 +286,20 @@ cdef extern from "lexbor/dom/dom.h" nogil:
|
|
|
289
286
|
lxb_dom_element_t *owner
|
|
290
287
|
|
|
291
288
|
lxb_dom_attr_t *next
|
|
292
|
-
lxb_dom_attr_t *prev
|
|
293
|
-
|
|
289
|
+
lxb_dom_attr_t *prev
|
|
294
290
|
|
|
295
291
|
lxb_dom_collection_t * lxb_dom_collection_make(lxb_dom_document_t *document, size_t start_list_size)
|
|
296
292
|
lxb_char_t * lxb_dom_node_text_content(lxb_dom_node_t *node, size_t *len)
|
|
293
|
+
lxb_status_t lxb_dom_node_text_content_set(lxb_dom_node_t *node, const lxb_char_t *content, size_t len)
|
|
294
|
+
void lxb_dom_node_remove(lxb_dom_node_t *node)
|
|
297
295
|
void * lxb_dom_document_destroy_text_noi(lxb_dom_document_t *document, lxb_char_t *text)
|
|
298
|
-
lxb_dom_node_t *
|
|
296
|
+
lxb_dom_node_t * lxb_dom_document_root(lxb_dom_document_t *document)
|
|
299
297
|
lxb_char_t * lxb_dom_element_qualified_name(lxb_dom_element_t *element, size_t *len)
|
|
300
298
|
lxb_dom_node_t * lxb_dom_node_destroy(lxb_dom_node_t *node)
|
|
301
299
|
lxb_dom_node_t * lxb_dom_node_destroy_deep(lxb_dom_node_t *root)
|
|
302
300
|
lxb_dom_attr_t * lxb_dom_element_first_attribute_noi(lxb_dom_element_t *element)
|
|
303
301
|
|
|
304
|
-
const lxb_char_t * lxb_dom_attr_local_name_noi(lxb_dom_attr_t *attr, size_t *len)
|
|
302
|
+
const lxb_char_t * lxb_dom_attr_local_name_noi(lxb_dom_attr_t *attr, size_t *len)
|
|
305
303
|
const lxb_char_t * lxb_dom_attr_value_noi(lxb_dom_attr_t *attr, size_t *len)
|
|
306
304
|
|
|
307
305
|
lxb_dom_attr_t * lxb_dom_element_set_attribute(lxb_dom_element_t *element,
|
|
@@ -314,7 +312,7 @@ cdef extern from "lexbor/dom/dom.h" nogil:
|
|
|
314
312
|
lxb_tag_id_t lxb_dom_node_tag_id_noi(lxb_dom_node_t *node)
|
|
315
313
|
lxb_dom_node_t * lxb_dom_document_import_node(lxb_dom_document_t *doc, lxb_dom_node_t *node, bint deep)
|
|
316
314
|
void lxb_dom_node_insert_after(lxb_dom_node_t *to, lxb_dom_node_t *node)
|
|
317
|
-
lxb_status_t lxb_dom_node_replace_all(lxb_dom_node_t *parent, lxb_dom_node_t *node)
|
|
315
|
+
lxb_status_t lxb_dom_node_replace_all(lxb_dom_node_t *parent, lxb_dom_node_t *node)
|
|
318
316
|
void lxb_dom_node_insert_child(lxb_dom_node_t *to, lxb_dom_node_t *node)
|
|
319
317
|
void lxb_dom_node_insert_before(lxb_dom_node_t *to, lxb_dom_node_t *node)
|
|
320
318
|
void lxb_dom_node_insert_after(lxb_dom_node_t *to, lxb_dom_node_t *node)
|
|
@@ -347,7 +345,7 @@ cdef extern from "lexbor/css/css.h" nogil:
|
|
|
347
345
|
lxb_css_parser_t * lxb_css_parser_create()
|
|
348
346
|
lxb_status_t lxb_css_parser_init(lxb_css_parser_t *parser, lxb_css_syntax_tokenizer_t *tkz)
|
|
349
347
|
lxb_css_parser_t * lxb_css_parser_destroy(lxb_css_parser_t *parser, bint self_destroy)
|
|
350
|
-
lxb_css_memory_t * lxb_css_memory_destroy(lxb_css_memory_t *memory, bint self_destroy)
|
|
348
|
+
lxb_css_memory_t * lxb_css_memory_destroy(lxb_css_memory_t *memory, bint self_destroy)
|
|
351
349
|
void lxb_css_selector_list_destroy_memory(lxb_css_selector_list_t *list)
|
|
352
350
|
|
|
353
351
|
|
|
@@ -558,8 +556,7 @@ cdef extern from "lexbor/selectors/selectors.h" nogil:
|
|
|
558
556
|
ctypedef struct lxb_selectors_t
|
|
559
557
|
ctypedef struct lxb_css_selector_list_t
|
|
560
558
|
ctypedef struct lxb_css_selector_specificity_t
|
|
561
|
-
ctypedef lxb_status_t (*lxb_selectors_cb_f)(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec,
|
|
562
|
-
void *ctx)
|
|
559
|
+
ctypedef lxb_status_t (*lxb_selectors_cb_f)(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx)
|
|
563
560
|
ctypedef enum lxb_selectors_opt_t:
|
|
564
561
|
LXB_SELECTORS_OPT_DEFAULT = 0x00
|
|
565
562
|
LXB_SELECTORS_OPT_MATCH_ROOT = 1 << 1
|
|
@@ -576,4 +573,4 @@ cdef extern from "lexbor/selectors/selectors.h" nogil:
|
|
|
576
573
|
lxb_status_t lxb_selectors_init(lxb_selectors_t *selectors)
|
|
577
574
|
lxb_selectors_t * lxb_selectors_destroy(lxb_selectors_t *selectors, bint self_destroy)
|
|
578
575
|
lxb_status_t lxb_selectors_find(lxb_selectors_t *selectors, lxb_dom_node_t *root,
|
|
579
|
-
|
|
576
|
+
lxb_css_selector_list_t *list, lxb_selectors_cb_f cb, void *ctx)
|
selectolax/lexbor.pyi
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Any, Iterator, Literal,
|
|
1
|
+
from typing import Any, Iterator, Literal, NoReturn, Optional, TypeVar, overload
|
|
2
2
|
|
|
3
3
|
DefaultT = TypeVar("DefaultT")
|
|
4
4
|
|
|
@@ -145,6 +145,12 @@ class LexborNode:
|
|
|
145
145
|
Matches pattern `query` against HTML tree.
|
|
146
146
|
`CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
|
|
147
147
|
|
|
148
|
+
Special selectors:
|
|
149
|
+
|
|
150
|
+
- parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
|
|
151
|
+
- parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
|
|
152
|
+
|
|
153
|
+
|
|
148
154
|
Parameters
|
|
149
155
|
----------
|
|
150
156
|
query : str
|
|
@@ -384,6 +390,25 @@ class LexborNode:
|
|
|
384
390
|
Note: by default, empty tags are ignored, use "delete_empty" to change this.
|
|
385
391
|
"""
|
|
386
392
|
...
|
|
393
|
+
def merge_text_nodes(self) -> None:
|
|
394
|
+
"""Iterates over all text nodes and merges all text nodes that are close to each other.
|
|
395
|
+
|
|
396
|
+
This is useful for text extraction.
|
|
397
|
+
Use it when you need to strip HTML tags and merge "dangling" text.
|
|
398
|
+
|
|
399
|
+
Examples
|
|
400
|
+
--------
|
|
401
|
+
|
|
402
|
+
>>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
|
|
403
|
+
>>> node = tree.css_first('div')
|
|
404
|
+
>>> tree.unwrap_tags(["strong"])
|
|
405
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
406
|
+
"J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
|
|
407
|
+
>>> node.merge_text_nodes()
|
|
408
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
409
|
+
"John Doe"
|
|
410
|
+
"""
|
|
411
|
+
...
|
|
387
412
|
def traverse(self, include_text: bool = False) -> Iterator[LexborNode]:
|
|
388
413
|
"""Iterate over all child and next nodes starting from the current level.
|
|
389
414
|
|
|
@@ -646,6 +671,12 @@ class LexborHTMLParser:
|
|
|
646
671
|
Matches pattern `query` against HTML tree.
|
|
647
672
|
`CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
|
|
648
673
|
|
|
674
|
+
Special selectors:
|
|
675
|
+
|
|
676
|
+
- parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
|
|
677
|
+
- parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
|
|
678
|
+
|
|
679
|
+
|
|
649
680
|
Parameters
|
|
650
681
|
----------
|
|
651
682
|
query : str
|
|
@@ -779,6 +810,25 @@ class LexborHTMLParser:
|
|
|
779
810
|
"""
|
|
780
811
|
...
|
|
781
812
|
def css_matches(self, selector: str) -> bool: ...
|
|
813
|
+
def merge_text_nodes(self) -> None:
|
|
814
|
+
"""Iterates over all text nodes and merges all text nodes that are close to each other.
|
|
815
|
+
|
|
816
|
+
This is useful for text extraction.
|
|
817
|
+
Use it when you need to strip HTML tags and merge "dangling" text.
|
|
818
|
+
|
|
819
|
+
Examples
|
|
820
|
+
--------
|
|
821
|
+
|
|
822
|
+
>>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
|
|
823
|
+
>>> node = tree.css_first('div')
|
|
824
|
+
>>> tree.unwrap_tags(["strong"])
|
|
825
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
826
|
+
"J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
|
|
827
|
+
>>> node.merge_text_nodes()
|
|
828
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
829
|
+
"John Doe"
|
|
830
|
+
"""
|
|
831
|
+
...
|
|
782
832
|
def clone(self) -> LexborHTMLParser:
|
|
783
833
|
"""Clone the current tree."""
|
|
784
834
|
...
|
selectolax/lexbor.pyx
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
from cpython cimport bool
|
|
1
|
+
from cpython.bool cimport bool
|
|
2
|
+
from cpython.exc cimport PyErr_SetObject
|
|
2
3
|
|
|
3
4
|
_ENCODING = 'UTF-8'
|
|
4
5
|
|
|
@@ -24,10 +25,8 @@ cdef class LexborHTMLParser:
|
|
|
24
25
|
html : str (unicode) or bytes
|
|
25
26
|
"""
|
|
26
27
|
def __init__(self, html):
|
|
27
|
-
|
|
28
28
|
cdef size_t html_len
|
|
29
|
-
cdef
|
|
30
|
-
|
|
29
|
+
cdef object bytes_html
|
|
31
30
|
bytes_html, html_len = preprocess_input(html)
|
|
32
31
|
self._parse_html(bytes_html, html_len)
|
|
33
32
|
self.raw_html = bytes_html
|
|
@@ -39,22 +38,27 @@ cdef class LexborHTMLParser:
|
|
|
39
38
|
self._selector = LexborCSSSelector()
|
|
40
39
|
return self._selector
|
|
41
40
|
|
|
42
|
-
|
|
43
|
-
cdef _parse_html(self, char *html, size_t html_len):
|
|
41
|
+
cdef int _parse_html(self, char *html, size_t html_len) except -1:
|
|
44
42
|
cdef lxb_status_t status
|
|
45
43
|
|
|
46
44
|
with nogil:
|
|
47
45
|
self.document = lxb_html_document_create()
|
|
48
46
|
|
|
49
47
|
if self.document == NULL:
|
|
50
|
-
|
|
48
|
+
PyErr_SetObject(SelectolaxError, "Failed to initialize object for HTML Document.")
|
|
49
|
+
return -1
|
|
51
50
|
|
|
52
51
|
with nogil:
|
|
53
52
|
status = lxb_html_document_parse(self.document, <lxb_char_t *> html, html_len)
|
|
53
|
+
|
|
54
54
|
if status != 0x0000:
|
|
55
|
-
|
|
55
|
+
PyErr_SetObject(SelectolaxError, "Can't parse HTML.")
|
|
56
|
+
return -1
|
|
56
57
|
|
|
57
|
-
|
|
58
|
+
if self.document == NULL:
|
|
59
|
+
PyErr_SetObject(RuntimeError, "document is NULL even after html was parsed correctly")
|
|
60
|
+
return -1
|
|
61
|
+
return 0
|
|
58
62
|
|
|
59
63
|
def __dealloc__(self):
|
|
60
64
|
if self.document != NULL:
|
|
@@ -68,7 +72,7 @@ cdef class LexborHTMLParser:
|
|
|
68
72
|
"""Returns root node."""
|
|
69
73
|
if self.document == NULL:
|
|
70
74
|
return None
|
|
71
|
-
return LexborNode
|
|
75
|
+
return LexborNode.new(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
|
|
72
76
|
|
|
73
77
|
@property
|
|
74
78
|
def body(self):
|
|
@@ -77,7 +81,7 @@ cdef class LexborHTMLParser:
|
|
|
77
81
|
body = lxb_html_document_body_element_noi(self.document)
|
|
78
82
|
if body == NULL:
|
|
79
83
|
return None
|
|
80
|
-
return LexborNode
|
|
84
|
+
return LexborNode.new(<lxb_dom_node_t *> body, self)
|
|
81
85
|
|
|
82
86
|
@property
|
|
83
87
|
def head(self):
|
|
@@ -86,7 +90,7 @@ cdef class LexborHTMLParser:
|
|
|
86
90
|
head = lxb_html_document_head_element_noi(self.document)
|
|
87
91
|
if head == NULL:
|
|
88
92
|
return None
|
|
89
|
-
return LexborNode
|
|
93
|
+
return LexborNode.new(<lxb_dom_node_t *> head, self)
|
|
90
94
|
|
|
91
95
|
def tags(self, str name):
|
|
92
96
|
"""Returns a list of tags that match specified name.
|
|
@@ -122,7 +126,7 @@ cdef class LexborHTMLParser:
|
|
|
122
126
|
raise SelectolaxError("Can't locate elements.")
|
|
123
127
|
|
|
124
128
|
for i in range(lxb_dom_collection_length_noi(collection)):
|
|
125
|
-
node = LexborNode
|
|
129
|
+
node = LexborNode.new(
|
|
126
130
|
<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i),
|
|
127
131
|
self
|
|
128
132
|
)
|
|
@@ -156,7 +160,7 @@ cdef class LexborHTMLParser:
|
|
|
156
160
|
"""Return HTML representation of the page."""
|
|
157
161
|
if self.document == NULL:
|
|
158
162
|
return None
|
|
159
|
-
node = LexborNode
|
|
163
|
+
node = LexborNode.new(<lxb_dom_node_t *> &self.document.dom_document, self)
|
|
160
164
|
return node.html
|
|
161
165
|
|
|
162
166
|
def css(self, str query):
|
|
@@ -165,6 +169,11 @@ cdef class LexborHTMLParser:
|
|
|
165
169
|
Matches pattern `query` against HTML tree.
|
|
166
170
|
`CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
|
|
167
171
|
|
|
172
|
+
Special selectors:
|
|
173
|
+
|
|
174
|
+
- parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
|
|
175
|
+
- parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
|
|
176
|
+
|
|
168
177
|
Parameters
|
|
169
178
|
----------
|
|
170
179
|
query : str
|
|
@@ -238,7 +247,7 @@ cdef class LexborHTMLParser:
|
|
|
238
247
|
|
|
239
248
|
for i in range(lxb_dom_collection_length_noi(collection)):
|
|
240
249
|
if recursive:
|
|
241
|
-
lxb_dom_node_destroy_deep(
|
|
250
|
+
lxb_dom_node_destroy_deep(<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
|
|
242
251
|
else:
|
|
243
252
|
lxb_dom_node_destroy(<lxb_dom_node_t *> lxb_dom_collection_element_noi(collection, i))
|
|
244
253
|
lxb_dom_collection_destroy(collection, <bint> True)
|
|
@@ -279,7 +288,6 @@ cdef class LexborHTMLParser:
|
|
|
279
288
|
"""
|
|
280
289
|
return self.root.scripts_contain(query)
|
|
281
290
|
|
|
282
|
-
|
|
283
291
|
def script_srcs_contain(self, tuple queries):
|
|
284
292
|
"""Returns True if any of the script SRCs attributes contain on of the specified text.
|
|
285
293
|
|
|
@@ -295,6 +303,26 @@ cdef class LexborHTMLParser:
|
|
|
295
303
|
def css_matches(self, str selector):
|
|
296
304
|
return self.root.css_matches(selector)
|
|
297
305
|
|
|
306
|
+
def merge_text_nodes(self):
|
|
307
|
+
"""Iterates over all text nodes and merges all text nodes that are close to each other.
|
|
308
|
+
|
|
309
|
+
This is useful for text extraction.
|
|
310
|
+
Use it when you need to strip HTML tags and merge "dangling" text.
|
|
311
|
+
|
|
312
|
+
Examples
|
|
313
|
+
--------
|
|
314
|
+
|
|
315
|
+
>>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
|
|
316
|
+
>>> node = tree.css_first('div')
|
|
317
|
+
>>> tree.unwrap_tags(["strong"])
|
|
318
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
319
|
+
"J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
|
|
320
|
+
>>> node.merge_text_nodes()
|
|
321
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
322
|
+
"John Doe"
|
|
323
|
+
"""
|
|
324
|
+
return self.root.merge_text_nodes()
|
|
325
|
+
|
|
298
326
|
@staticmethod
|
|
299
327
|
cdef LexborHTMLParser from_document(lxb_html_document_t *document, bytes raw_html):
|
|
300
328
|
obj = <LexborHTMLParser> LexborHTMLParser.__new__(LexborHTMLParser)
|
|
@@ -309,6 +337,7 @@ cdef class LexborHTMLParser:
|
|
|
309
337
|
"""Clone the current tree."""
|
|
310
338
|
cdef lxb_html_document_t* cloned_document
|
|
311
339
|
cdef lxb_dom_node_t* cloned_node
|
|
340
|
+
cdef LexborHTMLParser cls
|
|
312
341
|
|
|
313
342
|
with nogil:
|
|
314
343
|
cloned_document = lxb_html_document_create()
|
|
@@ -333,6 +362,7 @@ cdef class LexborHTMLParser:
|
|
|
333
362
|
|
|
334
363
|
cls = LexborHTMLParser.from_document(cloned_document, self.raw_html)
|
|
335
364
|
return cls
|
|
365
|
+
|
|
336
366
|
def unwrap_tags(self, list tags, delete_empty = False):
|
|
337
367
|
"""Unwraps specified tags from the HTML tree.
|
|
338
368
|
|
|
@@ -353,5 +383,6 @@ cdef class LexborHTMLParser:
|
|
|
353
383
|
>>> tree.body.html
|
|
354
384
|
'<body><div>Hello world!</div></body>'
|
|
355
385
|
"""
|
|
356
|
-
if
|
|
386
|
+
# faster to check if the document is empty which should determine if we have a root
|
|
387
|
+
if self.document != NULL:
|
|
357
388
|
self.root.unwrap_tags(tags, delete_empty=delete_empty)
|
selectolax/modest/node.pxi
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
cimport cython
|
|
2
|
+
from cpython.exc cimport PyErr_NoMemory
|
|
2
3
|
|
|
3
4
|
from libc.stdlib cimport free
|
|
4
5
|
from libc.stdlib cimport malloc
|
|
@@ -8,6 +9,7 @@ from libc.string cimport memcpy
|
|
|
8
9
|
DEF _STACK_SIZE = 100
|
|
9
10
|
DEF _ENCODING = 'UTF-8'
|
|
10
11
|
|
|
12
|
+
|
|
11
13
|
@cython.final
|
|
12
14
|
cdef class Stack:
|
|
13
15
|
def __cinit__(self, size_t capacity=25):
|
|
@@ -23,9 +25,10 @@ cdef class Stack:
|
|
|
23
25
|
cdef bint is_empty(self):
|
|
24
26
|
return self.top <= 0
|
|
25
27
|
|
|
26
|
-
cdef push(self, myhtml_tree_node_t* res):
|
|
28
|
+
cdef int push(self, myhtml_tree_node_t* res) except -1:
|
|
27
29
|
if self.top >= self.capacity:
|
|
28
|
-
self.resize()
|
|
30
|
+
if self.resize() < 0:
|
|
31
|
+
return -1
|
|
29
32
|
self._stack[self.top] = res
|
|
30
33
|
self.top += 1
|
|
31
34
|
|
|
@@ -33,10 +36,13 @@ cdef class Stack:
|
|
|
33
36
|
self.top = self.top - 1
|
|
34
37
|
return self._stack[self.top]
|
|
35
38
|
|
|
36
|
-
cdef resize(self):
|
|
39
|
+
cdef int resize(self) except -1:
|
|
37
40
|
self.capacity *= 2
|
|
38
41
|
self._stack = <myhtml_tree_node_t**> realloc(<void*> self._stack, self.capacity * sizeof(myhtml_tree_node_t))
|
|
39
|
-
|
|
42
|
+
if self._stack == NULL:
|
|
43
|
+
PyErr_NoMemory()
|
|
44
|
+
return -1
|
|
45
|
+
return 0
|
|
40
46
|
|
|
41
47
|
cdef class _Attributes:
|
|
42
48
|
"""A dict-like object that represents attributes."""
|
|
@@ -130,25 +136,24 @@ cdef class _Attributes:
|
|
|
130
136
|
tag_name = c_text.decode(_ENCODING, 'ignore') if c_text != NULL else 'unknown'
|
|
131
137
|
return "<%s attributes, %s items>" % (tag_name, len(self))
|
|
132
138
|
|
|
133
|
-
|
|
134
|
-
|
|
135
139
|
ctypedef fused str_or_Node:
|
|
136
140
|
str
|
|
137
141
|
bytes
|
|
138
142
|
Node
|
|
139
143
|
|
|
140
|
-
|
|
141
144
|
cdef class Node:
|
|
142
145
|
"""A class that represents HTML node (element)."""
|
|
143
146
|
cdef myhtml_tree_node_t *node
|
|
144
147
|
cdef public HTMLParser parser
|
|
145
148
|
|
|
146
|
-
|
|
147
|
-
cdef
|
|
148
|
-
# custom
|
|
149
|
-
|
|
149
|
+
@staticmethod
|
|
150
|
+
cdef Node new(myhtml_tree_node_t *node, HTMLParser parser):
|
|
151
|
+
# custom __init__ for C, because __cinit__ doesn't accept C types
|
|
152
|
+
cdef Node cls = Node.__new__(Node)
|
|
153
|
+
cls.node = node
|
|
150
154
|
# Keep reference to the selector object, so myhtml structures will not be garbage collected prematurely
|
|
151
|
-
|
|
155
|
+
cls.parser = parser
|
|
156
|
+
return cls
|
|
152
157
|
|
|
153
158
|
@property
|
|
154
159
|
def attributes(self):
|
|
@@ -288,7 +293,7 @@ cdef class Node:
|
|
|
288
293
|
cdef inline _text_deep(self, myhtml_tree_node_t *node, separator='', strip=False):
|
|
289
294
|
text = ""
|
|
290
295
|
cdef Stack stack = Stack(_STACK_SIZE)
|
|
291
|
-
cdef myhtml_tree_node_t* current_node = NULL
|
|
296
|
+
cdef myhtml_tree_node_t* current_node = NULL
|
|
292
297
|
|
|
293
298
|
if node.tag_id == MyHTML_TAG__TEXT:
|
|
294
299
|
c_text = myhtml_node_text(node, NULL)
|
|
@@ -341,12 +346,10 @@ cdef class Node:
|
|
|
341
346
|
node = node.next
|
|
342
347
|
continue
|
|
343
348
|
|
|
344
|
-
next_node = Node()
|
|
345
|
-
next_node._init(node, self.parser)
|
|
349
|
+
next_node = Node.new(node, self.parser)
|
|
346
350
|
yield next_node
|
|
347
351
|
node = node.next
|
|
348
352
|
|
|
349
|
-
|
|
350
353
|
def traverse(self, include_text=False):
|
|
351
354
|
"""Iterate over all child and next nodes starting from the current level.
|
|
352
355
|
|
|
@@ -360,16 +363,15 @@ cdef class Node:
|
|
|
360
363
|
node
|
|
361
364
|
"""
|
|
362
365
|
cdef Stack stack = Stack(_STACK_SIZE)
|
|
363
|
-
cdef myhtml_tree_node_t* current_node = NULL
|
|
364
|
-
cdef Node next_node
|
|
366
|
+
cdef myhtml_tree_node_t* current_node = NULL
|
|
367
|
+
cdef Node next_node
|
|
365
368
|
|
|
366
369
|
stack.push(self.node)
|
|
367
370
|
|
|
368
371
|
while not stack.is_empty():
|
|
369
372
|
current_node = stack.pop()
|
|
370
373
|
if current_node != NULL and not (current_node.tag_id == MyHTML_TAG__TEXT and not include_text):
|
|
371
|
-
next_node = Node()
|
|
372
|
-
next_node._init(current_node, self.parser)
|
|
374
|
+
next_node = Node.new(current_node, self.parser)
|
|
373
375
|
yield next_node
|
|
374
376
|
|
|
375
377
|
if current_node.next is not NULL:
|
|
@@ -398,8 +400,7 @@ cdef class Node:
|
|
|
398
400
|
"""Return the child node."""
|
|
399
401
|
cdef Node node
|
|
400
402
|
if self.node.child:
|
|
401
|
-
node = Node()
|
|
402
|
-
node._init(self.node.child, self.parser)
|
|
403
|
+
node = Node.new(self.node.child, self.parser)
|
|
403
404
|
return node
|
|
404
405
|
return None
|
|
405
406
|
|
|
@@ -408,8 +409,7 @@ cdef class Node:
|
|
|
408
409
|
"""Return the parent node."""
|
|
409
410
|
cdef Node node
|
|
410
411
|
if self.node.parent:
|
|
411
|
-
node = Node()
|
|
412
|
-
node._init(self.node.parent, self.parser)
|
|
412
|
+
node = Node.new(self.node.parent, self.parser)
|
|
413
413
|
return node
|
|
414
414
|
return None
|
|
415
415
|
|
|
@@ -418,8 +418,7 @@ cdef class Node:
|
|
|
418
418
|
"""Return next node."""
|
|
419
419
|
cdef Node node
|
|
420
420
|
if self.node.next:
|
|
421
|
-
node = Node()
|
|
422
|
-
node._init(self.node.next, self.parser)
|
|
421
|
+
node = Node.new(self.node.next, self.parser)
|
|
423
422
|
return node
|
|
424
423
|
return None
|
|
425
424
|
|
|
@@ -428,8 +427,7 @@ cdef class Node:
|
|
|
428
427
|
"""Return previous node."""
|
|
429
428
|
cdef Node node
|
|
430
429
|
if self.node.prev:
|
|
431
|
-
node = Node()
|
|
432
|
-
node._init(self.node.prev, self.parser)
|
|
430
|
+
node = Node.new(self.node.prev, self.parser)
|
|
433
431
|
return node
|
|
434
432
|
return None
|
|
435
433
|
|
|
@@ -438,8 +436,7 @@ cdef class Node:
|
|
|
438
436
|
"""Return last child node."""
|
|
439
437
|
cdef Node node
|
|
440
438
|
if self.node.last_child:
|
|
441
|
-
node = Node()
|
|
442
|
-
node._init(self.node.last_child, self.parser)
|
|
439
|
+
node = Node.new(self.node.last_child, self.parser)
|
|
443
440
|
return node
|
|
444
441
|
return None
|
|
445
442
|
|
|
@@ -539,8 +536,8 @@ cdef class Node:
|
|
|
539
536
|
if delete_empty:
|
|
540
537
|
myhtml_node_delete(self.node)
|
|
541
538
|
return
|
|
542
|
-
cdef myhtml_tree_node_t* next_node
|
|
543
|
-
cdef myhtml_tree_node_t* current_node
|
|
539
|
+
cdef myhtml_tree_node_t* next_node
|
|
540
|
+
cdef myhtml_tree_node_t* current_node
|
|
544
541
|
|
|
545
542
|
if self.node.child.next != NULL:
|
|
546
543
|
current_node = self.node.child
|
|
@@ -574,6 +571,8 @@ cdef class Node:
|
|
|
574
571
|
'<html><body><div>Hello world!</div></body></html>'
|
|
575
572
|
|
|
576
573
|
"""
|
|
574
|
+
# ensure cython can recast element to a Node so that decompose will be called sooner.
|
|
575
|
+
cdef Node element
|
|
577
576
|
for tag in tags:
|
|
578
577
|
for element in self.css(tag):
|
|
579
578
|
element.decompose(recursive=recursive)
|
|
@@ -600,7 +599,7 @@ cdef class Node:
|
|
|
600
599
|
|
|
601
600
|
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
|
|
602
601
|
"""
|
|
603
|
-
|
|
602
|
+
cdef Node element
|
|
604
603
|
for tag in tags:
|
|
605
604
|
for element in self.css(tag):
|
|
606
605
|
element.unwrap(delete_empty)
|
|
@@ -788,7 +787,7 @@ cdef class Node:
|
|
|
788
787
|
|
|
789
788
|
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
|
|
790
789
|
"""
|
|
791
|
-
|
|
790
|
+
cdef Node element
|
|
792
791
|
for tag in tags:
|
|
793
792
|
for element in self.css(tag):
|
|
794
793
|
element.unwrap(delete_empty)
|
|
@@ -847,6 +846,7 @@ cdef class Node:
|
|
|
847
846
|
The query to check.
|
|
848
847
|
|
|
849
848
|
"""
|
|
849
|
+
cdef Node node
|
|
850
850
|
if self.parser.cached_script_texts is None:
|
|
851
851
|
nodes = find_nodes(self.parser, self.node, 'script')
|
|
852
852
|
text_nodes = []
|
|
@@ -895,6 +895,7 @@ cdef class Node:
|
|
|
895
895
|
if not isinstance(other, Node):
|
|
896
896
|
return False
|
|
897
897
|
return self.html == other.html
|
|
898
|
+
|
|
898
899
|
@property
|
|
899
900
|
def text_content(self):
|
|
900
901
|
"""Returns the text of the node if it is a text node.
|
|
@@ -948,8 +949,8 @@ cdef class Node:
|
|
|
948
949
|
while not stack.is_empty():
|
|
949
950
|
current_node = stack.pop()
|
|
950
951
|
|
|
951
|
-
if current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and
|
|
952
|
-
|
|
952
|
+
if (current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and
|
|
953
|
+
current_node.prev.tag_id == MyHTML_TAG__TEXT):
|
|
953
954
|
left_text = myhtml_node_text(current_node.prev, &left_length)
|
|
954
955
|
right_text = myhtml_node_text(current_node, &right_length)
|
|
955
956
|
if left_text and right_text:
|