selectolax 0.3.31__cp313-cp313-win_amd64.whl → 0.3.33__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +3 -5
- selectolax/lexbor/attrs.pxi +0 -1
- selectolax/lexbor/node.pxi +93 -41
- selectolax/lexbor/selection.pxi +27 -25
- selectolax/lexbor/util.pxi +1 -0
- selectolax/lexbor.c +6367 -6672
- selectolax/lexbor.cp313-win_amd64.pyd +0 -0
- selectolax/lexbor.pxd +32 -35
- selectolax/lexbor.pyi +111 -5
- selectolax/lexbor.pyx +43 -17
- selectolax/modest/node.pxi +37 -36
- selectolax/modest/selection.pxi +24 -22
- selectolax/modest/util.pxi +1 -0
- selectolax/parser.c +4484 -5266
- selectolax/parser.cp313-win_amd64.pyd +0 -0
- selectolax/parser.pxd +17 -20
- selectolax/parser.pyi +2 -2
- selectolax/parser.pyx +28 -31
- selectolax/utils.pxi +13 -3
- {selectolax-0.3.31.dist-info → selectolax-0.3.33.dist-info}/METADATA +3 -3
- selectolax-0.3.33.dist-info/RECORD +26 -0
- selectolax-0.3.31.dist-info/RECORD +0 -26
- {selectolax-0.3.31.dist-info → selectolax-0.3.33.dist-info}/WHEEL +0 -0
- {selectolax-0.3.31.dist-info → selectolax-0.3.33.dist-info}/licenses/LICENSE +0 -0
- {selectolax-0.3.31.dist-info → selectolax-0.3.33.dist-info}/top_level.txt +0 -0
|
Binary file
|
selectolax/lexbor.pxd
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from libc.stdint cimport
|
|
1
|
+
from libc.stdint cimport uint8_t, uint32_t, uintptr_t
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
cdef extern from "lexbor/core/core.h" nogil:
|
|
@@ -31,7 +31,6 @@ cdef extern from "lexbor/core/core.h" nogil:
|
|
|
31
31
|
lexbor_str_t* lexbor_str_create()
|
|
32
32
|
lxb_char_t * lexbor_str_data_noi(lexbor_str_t *str)
|
|
33
33
|
|
|
34
|
-
|
|
35
34
|
cdef extern from "lexbor/html/html.h" nogil:
|
|
36
35
|
ctypedef unsigned int lxb_html_document_opt_t
|
|
37
36
|
|
|
@@ -54,14 +53,12 @@ cdef extern from "lexbor/html/html.h" nogil:
|
|
|
54
53
|
void *events
|
|
55
54
|
|
|
56
55
|
ctypedef struct lexbor_str_t:
|
|
57
|
-
lxb_char_t *data
|
|
58
|
-
size_t length
|
|
59
|
-
|
|
56
|
+
lxb_char_t *data
|
|
57
|
+
size_t length
|
|
60
58
|
|
|
61
59
|
ctypedef struct lxb_dom_node_t:
|
|
62
60
|
lxb_dom_event_target_t event_target
|
|
63
61
|
|
|
64
|
-
|
|
65
62
|
uintptr_t local_name
|
|
66
63
|
uintptr_t prefix
|
|
67
64
|
uintptr_t ns
|
|
@@ -77,7 +74,6 @@ cdef extern from "lexbor/html/html.h" nogil:
|
|
|
77
74
|
|
|
78
75
|
lxb_dom_node_type_t type
|
|
79
76
|
|
|
80
|
-
|
|
81
77
|
ctypedef struct lxb_dom_document_t:
|
|
82
78
|
lxb_dom_node_t node
|
|
83
79
|
|
|
@@ -104,7 +100,6 @@ cdef extern from "lexbor/html/html.h" nogil:
|
|
|
104
100
|
|
|
105
101
|
bint scripting
|
|
106
102
|
|
|
107
|
-
|
|
108
103
|
ctypedef struct lxb_html_document_t:
|
|
109
104
|
lxb_dom_document_t dom_document
|
|
110
105
|
|
|
@@ -128,7 +123,6 @@ cdef extern from "lexbor/html/html.h" nogil:
|
|
|
128
123
|
LXB_HTML_PARSER_STATE_FRAGMENT_PROCESS = 0x03
|
|
129
124
|
LXB_HTML_PARSER_STATE_ERROR = 0x04
|
|
130
125
|
|
|
131
|
-
|
|
132
126
|
ctypedef enum lxb_dom_node_type_t:
|
|
133
127
|
LXB_DOM_NODE_TYPE_ELEMENT = 0x01
|
|
134
128
|
LXB_DOM_NODE_TYPE_ATTRIBUTE = 0x02
|
|
@@ -175,10 +169,9 @@ cdef extern from "lexbor/html/html.h" nogil:
|
|
|
175
169
|
size_t length
|
|
176
170
|
size_t struct_size
|
|
177
171
|
|
|
178
|
-
|
|
179
172
|
ctypedef struct lxb_html_tree_pending_table_t
|
|
180
|
-
ctypedef bint lxb_html_tree_insertion_mode_f
|
|
181
|
-
ctypedef lxb_status_t lxb_html_tree_append_attr_f
|
|
173
|
+
ctypedef bint lxb_html_tree_insertion_mode_f
|
|
174
|
+
ctypedef lxb_status_t lxb_html_tree_append_attr_f
|
|
182
175
|
|
|
183
176
|
ctypedef struct lxb_html_tree_t:
|
|
184
177
|
|
|
@@ -189,13 +182,13 @@ cdef extern from "lexbor/html/html.h" nogil:
|
|
|
189
182
|
|
|
190
183
|
lxb_html_form_element_t *form
|
|
191
184
|
|
|
192
|
-
lexbor_array_t *open_elements
|
|
193
|
-
lexbor_array_t *active_formatting
|
|
194
|
-
lexbor_array_obj_t *template_insertion_modes
|
|
185
|
+
lexbor_array_t *open_elements
|
|
186
|
+
lexbor_array_t *active_formatting
|
|
187
|
+
lexbor_array_obj_t *template_insertion_modes
|
|
195
188
|
|
|
196
|
-
lxb_html_tree_pending_table_t *pending_table
|
|
189
|
+
lxb_html_tree_pending_table_t *pending_table
|
|
197
190
|
|
|
198
|
-
lexbor_array_obj_t *parse_errors
|
|
191
|
+
lexbor_array_obj_t *parse_errors
|
|
199
192
|
|
|
200
193
|
bint foster_parenting
|
|
201
194
|
bint frameset_ok
|
|
@@ -232,9 +225,13 @@ cdef extern from "lexbor/html/html.h" nogil:
|
|
|
232
225
|
lxb_status_t lxb_html_serialize_tree_str(lxb_dom_node_t *node, lexbor_str_t *str)
|
|
233
226
|
|
|
234
227
|
cdef class LexborNode:
|
|
235
|
-
cdef
|
|
236
|
-
|
|
237
|
-
|
|
228
|
+
cdef:
|
|
229
|
+
lxb_dom_node_t *node
|
|
230
|
+
public LexborHTMLParser parser
|
|
231
|
+
|
|
232
|
+
@staticmethod
|
|
233
|
+
cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser)
|
|
234
|
+
|
|
238
235
|
|
|
239
236
|
cdef class LexborCSSSelector:
|
|
240
237
|
cdef lxb_css_parser_t* parser
|
|
@@ -242,15 +239,15 @@ cdef class LexborCSSSelector:
|
|
|
242
239
|
cdef lxb_css_selectors_t * css_selectors
|
|
243
240
|
cdef public list results
|
|
244
241
|
cdef public LexborNode current_node
|
|
245
|
-
cdef _create_css_parser(self)
|
|
246
|
-
cpdef find(self, str query, LexborNode node)
|
|
247
|
-
cpdef any_matches(self, str query, LexborNode node)
|
|
242
|
+
cdef int _create_css_parser(self) except -1
|
|
243
|
+
cpdef list find(self, str query, LexborNode node)
|
|
244
|
+
cpdef int any_matches(self, str query, LexborNode node) except -1
|
|
248
245
|
|
|
249
246
|
cdef class LexborHTMLParser:
|
|
250
247
|
cdef lxb_html_document_t *document
|
|
251
248
|
cdef public bytes raw_html
|
|
252
249
|
cdef LexborCSSSelector _selector
|
|
253
|
-
cdef _parse_html(self, char* html, size_t html_len)
|
|
250
|
+
cdef int _parse_html(self, char* html, size_t html_len) except -1
|
|
254
251
|
cdef object cached_script_texts
|
|
255
252
|
cdef object cached_script_srcs
|
|
256
253
|
|
|
@@ -267,8 +264,8 @@ cdef extern from "lexbor/dom/dom.h" nogil:
|
|
|
267
264
|
ctypedef lexbor_action_t (*lxb_dom_node_simple_walker_f)(lxb_dom_node_t *node, void *ctx)
|
|
268
265
|
|
|
269
266
|
ctypedef struct lxb_dom_character_data_t:
|
|
270
|
-
lxb_dom_node_t node
|
|
271
|
-
lexbor_str_t data
|
|
267
|
+
lxb_dom_node_t node
|
|
268
|
+
lexbor_str_t data
|
|
272
269
|
|
|
273
270
|
ctypedef struct lxb_dom_text_t:
|
|
274
271
|
lxb_dom_character_data_t char_data
|
|
@@ -289,19 +286,20 @@ cdef extern from "lexbor/dom/dom.h" nogil:
|
|
|
289
286
|
lxb_dom_element_t *owner
|
|
290
287
|
|
|
291
288
|
lxb_dom_attr_t *next
|
|
292
|
-
lxb_dom_attr_t *prev
|
|
293
|
-
|
|
289
|
+
lxb_dom_attr_t *prev
|
|
294
290
|
|
|
295
291
|
lxb_dom_collection_t * lxb_dom_collection_make(lxb_dom_document_t *document, size_t start_list_size)
|
|
296
292
|
lxb_char_t * lxb_dom_node_text_content(lxb_dom_node_t *node, size_t *len)
|
|
293
|
+
lxb_status_t lxb_dom_node_text_content_set(lxb_dom_node_t *node, const lxb_char_t *content, size_t len)
|
|
294
|
+
void lxb_dom_node_remove(lxb_dom_node_t *node)
|
|
297
295
|
void * lxb_dom_document_destroy_text_noi(lxb_dom_document_t *document, lxb_char_t *text)
|
|
298
|
-
lxb_dom_node_t *
|
|
296
|
+
lxb_dom_node_t * lxb_dom_document_root(lxb_dom_document_t *document)
|
|
299
297
|
lxb_char_t * lxb_dom_element_qualified_name(lxb_dom_element_t *element, size_t *len)
|
|
300
298
|
lxb_dom_node_t * lxb_dom_node_destroy(lxb_dom_node_t *node)
|
|
301
299
|
lxb_dom_node_t * lxb_dom_node_destroy_deep(lxb_dom_node_t *root)
|
|
302
300
|
lxb_dom_attr_t * lxb_dom_element_first_attribute_noi(lxb_dom_element_t *element)
|
|
303
301
|
|
|
304
|
-
const lxb_char_t * lxb_dom_attr_local_name_noi(lxb_dom_attr_t *attr, size_t *len)
|
|
302
|
+
const lxb_char_t * lxb_dom_attr_local_name_noi(lxb_dom_attr_t *attr, size_t *len)
|
|
305
303
|
const lxb_char_t * lxb_dom_attr_value_noi(lxb_dom_attr_t *attr, size_t *len)
|
|
306
304
|
|
|
307
305
|
lxb_dom_attr_t * lxb_dom_element_set_attribute(lxb_dom_element_t *element,
|
|
@@ -314,7 +312,7 @@ cdef extern from "lexbor/dom/dom.h" nogil:
|
|
|
314
312
|
lxb_tag_id_t lxb_dom_node_tag_id_noi(lxb_dom_node_t *node)
|
|
315
313
|
lxb_dom_node_t * lxb_dom_document_import_node(lxb_dom_document_t *doc, lxb_dom_node_t *node, bint deep)
|
|
316
314
|
void lxb_dom_node_insert_after(lxb_dom_node_t *to, lxb_dom_node_t *node)
|
|
317
|
-
lxb_status_t lxb_dom_node_replace_all(lxb_dom_node_t *parent, lxb_dom_node_t *node)
|
|
315
|
+
lxb_status_t lxb_dom_node_replace_all(lxb_dom_node_t *parent, lxb_dom_node_t *node)
|
|
318
316
|
void lxb_dom_node_insert_child(lxb_dom_node_t *to, lxb_dom_node_t *node)
|
|
319
317
|
void lxb_dom_node_insert_before(lxb_dom_node_t *to, lxb_dom_node_t *node)
|
|
320
318
|
void lxb_dom_node_insert_after(lxb_dom_node_t *to, lxb_dom_node_t *node)
|
|
@@ -347,7 +345,7 @@ cdef extern from "lexbor/css/css.h" nogil:
|
|
|
347
345
|
lxb_css_parser_t * lxb_css_parser_create()
|
|
348
346
|
lxb_status_t lxb_css_parser_init(lxb_css_parser_t *parser, lxb_css_syntax_tokenizer_t *tkz)
|
|
349
347
|
lxb_css_parser_t * lxb_css_parser_destroy(lxb_css_parser_t *parser, bint self_destroy)
|
|
350
|
-
lxb_css_memory_t * lxb_css_memory_destroy(lxb_css_memory_t *memory, bint self_destroy)
|
|
348
|
+
lxb_css_memory_t * lxb_css_memory_destroy(lxb_css_memory_t *memory, bint self_destroy)
|
|
351
349
|
void lxb_css_selector_list_destroy_memory(lxb_css_selector_list_t *list)
|
|
352
350
|
|
|
353
351
|
|
|
@@ -558,8 +556,7 @@ cdef extern from "lexbor/selectors/selectors.h" nogil:
|
|
|
558
556
|
ctypedef struct lxb_selectors_t
|
|
559
557
|
ctypedef struct lxb_css_selector_list_t
|
|
560
558
|
ctypedef struct lxb_css_selector_specificity_t
|
|
561
|
-
ctypedef lxb_status_t (*lxb_selectors_cb_f)(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec,
|
|
562
|
-
void *ctx)
|
|
559
|
+
ctypedef lxb_status_t (*lxb_selectors_cb_f)(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx)
|
|
563
560
|
ctypedef enum lxb_selectors_opt_t:
|
|
564
561
|
LXB_SELECTORS_OPT_DEFAULT = 0x00
|
|
565
562
|
LXB_SELECTORS_OPT_MATCH_ROOT = 1 << 1
|
|
@@ -576,4 +573,4 @@ cdef extern from "lexbor/selectors/selectors.h" nogil:
|
|
|
576
573
|
lxb_status_t lxb_selectors_init(lxb_selectors_t *selectors)
|
|
577
574
|
lxb_selectors_t * lxb_selectors_destroy(lxb_selectors_t *selectors, bint self_destroy)
|
|
578
575
|
lxb_status_t lxb_selectors_find(lxb_selectors_t *selectors, lxb_dom_node_t *root,
|
|
579
|
-
|
|
576
|
+
lxb_css_selector_list_t *list, lxb_selectors_cb_f cb, void *ctx)
|
selectolax/lexbor.pyi
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Any, Iterator, Literal,
|
|
1
|
+
from typing import Any, Iterator, Literal, NoReturn, Optional, TypeVar, overload
|
|
2
2
|
|
|
3
3
|
DefaultT = TypeVar("DefaultT")
|
|
4
4
|
|
|
@@ -158,11 +158,45 @@ class LexborNode:
|
|
|
158
158
|
@overload
|
|
159
159
|
def css_first(
|
|
160
160
|
self, query: str, default: Any = ..., strict: Literal[True] = ...
|
|
161
|
-
) -> LexborNode:
|
|
161
|
+
) -> LexborNode:
|
|
162
|
+
"""Same as `css` but returns only the first match.
|
|
163
|
+
|
|
164
|
+
Parameters
|
|
165
|
+
----------
|
|
166
|
+
|
|
167
|
+
query : str
|
|
168
|
+
default : bool, default None
|
|
169
|
+
Default value to return if there is no match.
|
|
170
|
+
strict: bool, default True
|
|
171
|
+
Set to True if you want to check if there is strictly only one match in the document.
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
Returns
|
|
175
|
+
-------
|
|
176
|
+
selector : `LexborNode` object
|
|
177
|
+
"""
|
|
178
|
+
...
|
|
162
179
|
@overload
|
|
163
180
|
def css_first(
|
|
164
181
|
self, query: str, default: DefaultT, strict: bool = False
|
|
165
|
-
) -> LexborNode | DefaultT:
|
|
182
|
+
) -> LexborNode | DefaultT:
|
|
183
|
+
"""Same as `css` but returns only the first match.
|
|
184
|
+
|
|
185
|
+
Parameters
|
|
186
|
+
----------
|
|
187
|
+
|
|
188
|
+
query : str
|
|
189
|
+
default : bool, default None
|
|
190
|
+
Default value to return if there is no match.
|
|
191
|
+
strict: bool, default True
|
|
192
|
+
Set to True if you want to check if there is strictly only one match in the document.
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
Returns
|
|
196
|
+
-------
|
|
197
|
+
selector : `LexborNode` object
|
|
198
|
+
"""
|
|
199
|
+
...
|
|
166
200
|
@overload
|
|
167
201
|
def css_first(
|
|
168
202
|
self, query: str, default: None = ..., strict: bool = False
|
|
@@ -350,6 +384,25 @@ class LexborNode:
|
|
|
350
384
|
Note: by default, empty tags are ignored, use "delete_empty" to change this.
|
|
351
385
|
"""
|
|
352
386
|
...
|
|
387
|
+
def merge_text_nodes(self) -> None:
|
|
388
|
+
"""Iterates over all text nodes and merges all text nodes that are close to each other.
|
|
389
|
+
|
|
390
|
+
This is useful for text extraction.
|
|
391
|
+
Use it when you need to strip HTML tags and merge "dangling" text.
|
|
392
|
+
|
|
393
|
+
Examples
|
|
394
|
+
--------
|
|
395
|
+
|
|
396
|
+
>>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
|
|
397
|
+
>>> node = tree.css_first('div')
|
|
398
|
+
>>> tree.unwrap_tags(["strong"])
|
|
399
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
400
|
+
"J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
|
|
401
|
+
>>> node.merge_text_nodes()
|
|
402
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
403
|
+
"John Doe"
|
|
404
|
+
"""
|
|
405
|
+
...
|
|
353
406
|
def traverse(self, include_text: bool = False) -> Iterator[LexborNode]:
|
|
354
407
|
"""Iterate over all child and next nodes starting from the current level.
|
|
355
408
|
|
|
@@ -625,11 +678,45 @@ class LexborHTMLParser:
|
|
|
625
678
|
@overload
|
|
626
679
|
def css_first(
|
|
627
680
|
self, query: str, default: Any = ..., strict: Literal[True] = ...
|
|
628
|
-
) -> LexborNode:
|
|
681
|
+
) -> LexborNode:
|
|
682
|
+
"""Same as `css` but returns only the first match.
|
|
683
|
+
|
|
684
|
+
Parameters
|
|
685
|
+
----------
|
|
686
|
+
|
|
687
|
+
query : str
|
|
688
|
+
default : bool, default None
|
|
689
|
+
Default value to return if there is no match.
|
|
690
|
+
strict: bool, default True
|
|
691
|
+
Set to True if you want to check if there is strictly only one match in the document.
|
|
692
|
+
|
|
693
|
+
|
|
694
|
+
Returns
|
|
695
|
+
-------
|
|
696
|
+
selector : `LexborNode` object
|
|
697
|
+
"""
|
|
698
|
+
...
|
|
629
699
|
@overload
|
|
630
700
|
def css_first(
|
|
631
701
|
self, query: str, default: DefaultT, strict: bool = False
|
|
632
|
-
) -> LexborNode | DefaultT:
|
|
702
|
+
) -> LexborNode | DefaultT:
|
|
703
|
+
"""Same as `css` but returns only the first match.
|
|
704
|
+
|
|
705
|
+
Parameters
|
|
706
|
+
----------
|
|
707
|
+
|
|
708
|
+
query : str
|
|
709
|
+
default : bool, default None
|
|
710
|
+
Default value to return if there is no match.
|
|
711
|
+
strict: bool, default True
|
|
712
|
+
Set to True if you want to check if there is strictly only one match in the document.
|
|
713
|
+
|
|
714
|
+
|
|
715
|
+
Returns
|
|
716
|
+
-------
|
|
717
|
+
selector : `LexborNode` object
|
|
718
|
+
"""
|
|
719
|
+
...
|
|
633
720
|
@overload
|
|
634
721
|
def css_first(
|
|
635
722
|
self, query: str, default: None = ..., strict: bool = False
|
|
@@ -711,6 +798,25 @@ class LexborHTMLParser:
|
|
|
711
798
|
"""
|
|
712
799
|
...
|
|
713
800
|
def css_matches(self, selector: str) -> bool: ...
|
|
801
|
+
def merge_text_nodes(self) -> None:
|
|
802
|
+
"""Iterates over all text nodes and merges all text nodes that are close to each other.
|
|
803
|
+
|
|
804
|
+
This is useful for text extraction.
|
|
805
|
+
Use it when you need to strip HTML tags and merge "dangling" text.
|
|
806
|
+
|
|
807
|
+
Examples
|
|
808
|
+
--------
|
|
809
|
+
|
|
810
|
+
>>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
|
|
811
|
+
>>> node = tree.css_first('div')
|
|
812
|
+
>>> tree.unwrap_tags(["strong"])
|
|
813
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
814
|
+
"J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
|
|
815
|
+
>>> node.merge_text_nodes()
|
|
816
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
817
|
+
"John Doe"
|
|
818
|
+
"""
|
|
819
|
+
...
|
|
714
820
|
def clone(self) -> LexborHTMLParser:
|
|
715
821
|
"""Clone the current tree."""
|
|
716
822
|
...
|
selectolax/lexbor.pyx
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
from cpython cimport bool
|
|
1
|
+
from cpython.bool cimport bool
|
|
2
|
+
from cpython.exc cimport PyErr_SetObject
|
|
2
3
|
|
|
3
4
|
_ENCODING = 'UTF-8'
|
|
4
5
|
|
|
@@ -24,10 +25,8 @@ cdef class LexborHTMLParser:
|
|
|
24
25
|
html : str (unicode) or bytes
|
|
25
26
|
"""
|
|
26
27
|
def __init__(self, html):
|
|
27
|
-
|
|
28
28
|
cdef size_t html_len
|
|
29
|
-
cdef
|
|
30
|
-
|
|
29
|
+
cdef object bytes_html
|
|
31
30
|
bytes_html, html_len = preprocess_input(html)
|
|
32
31
|
self._parse_html(bytes_html, html_len)
|
|
33
32
|
self.raw_html = bytes_html
|
|
@@ -39,22 +38,27 @@ cdef class LexborHTMLParser:
|
|
|
39
38
|
self._selector = LexborCSSSelector()
|
|
40
39
|
return self._selector
|
|
41
40
|
|
|
42
|
-
|
|
43
|
-
cdef _parse_html(self, char *html, size_t html_len):
|
|
41
|
+
cdef int _parse_html(self, char *html, size_t html_len) except -1:
|
|
44
42
|
cdef lxb_status_t status
|
|
45
43
|
|
|
46
44
|
with nogil:
|
|
47
45
|
self.document = lxb_html_document_create()
|
|
48
46
|
|
|
49
47
|
if self.document == NULL:
|
|
50
|
-
|
|
48
|
+
PyErr_SetObject(SelectolaxError, "Failed to initialize object for HTML Document.")
|
|
49
|
+
return -1
|
|
51
50
|
|
|
52
51
|
with nogil:
|
|
53
52
|
status = lxb_html_document_parse(self.document, <lxb_char_t *> html, html_len)
|
|
53
|
+
|
|
54
54
|
if status != 0x0000:
|
|
55
|
-
|
|
55
|
+
PyErr_SetObject(SelectolaxError, "Can't parse HTML.")
|
|
56
|
+
return -1
|
|
56
57
|
|
|
57
|
-
|
|
58
|
+
if self.document == NULL:
|
|
59
|
+
PyErr_SetObject(RuntimeError, "document is NULL even after html was parsed correctly")
|
|
60
|
+
return -1
|
|
61
|
+
return 0
|
|
58
62
|
|
|
59
63
|
def __dealloc__(self):
|
|
60
64
|
if self.document != NULL:
|
|
@@ -68,7 +72,7 @@ cdef class LexborHTMLParser:
|
|
|
68
72
|
"""Returns root node."""
|
|
69
73
|
if self.document == NULL:
|
|
70
74
|
return None
|
|
71
|
-
return LexborNode
|
|
75
|
+
return LexborNode.new(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
|
|
72
76
|
|
|
73
77
|
@property
|
|
74
78
|
def body(self):
|
|
@@ -77,7 +81,7 @@ cdef class LexborHTMLParser:
|
|
|
77
81
|
body = lxb_html_document_body_element_noi(self.document)
|
|
78
82
|
if body == NULL:
|
|
79
83
|
return None
|
|
80
|
-
return LexborNode
|
|
84
|
+
return LexborNode.new(<lxb_dom_node_t *> body, self)
|
|
81
85
|
|
|
82
86
|
@property
|
|
83
87
|
def head(self):
|
|
@@ -86,7 +90,7 @@ cdef class LexborHTMLParser:
|
|
|
86
90
|
head = lxb_html_document_head_element_noi(self.document)
|
|
87
91
|
if head == NULL:
|
|
88
92
|
return None
|
|
89
|
-
return LexborNode
|
|
93
|
+
return LexborNode.new(<lxb_dom_node_t *> head, self)
|
|
90
94
|
|
|
91
95
|
def tags(self, str name):
|
|
92
96
|
"""Returns a list of tags that match specified name.
|
|
@@ -122,7 +126,7 @@ cdef class LexborHTMLParser:
|
|
|
122
126
|
raise SelectolaxError("Can't locate elements.")
|
|
123
127
|
|
|
124
128
|
for i in range(lxb_dom_collection_length_noi(collection)):
|
|
125
|
-
node = LexborNode
|
|
129
|
+
node = LexborNode.new(
|
|
126
130
|
<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i),
|
|
127
131
|
self
|
|
128
132
|
)
|
|
@@ -156,7 +160,7 @@ cdef class LexborHTMLParser:
|
|
|
156
160
|
"""Return HTML representation of the page."""
|
|
157
161
|
if self.document == NULL:
|
|
158
162
|
return None
|
|
159
|
-
node = LexborNode
|
|
163
|
+
node = LexborNode.new(<lxb_dom_node_t *> &self.document.dom_document, self)
|
|
160
164
|
return node.html
|
|
161
165
|
|
|
162
166
|
def css(self, str query):
|
|
@@ -238,7 +242,7 @@ cdef class LexborHTMLParser:
|
|
|
238
242
|
|
|
239
243
|
for i in range(lxb_dom_collection_length_noi(collection)):
|
|
240
244
|
if recursive:
|
|
241
|
-
lxb_dom_node_destroy_deep(
|
|
245
|
+
lxb_dom_node_destroy_deep(<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
|
|
242
246
|
else:
|
|
243
247
|
lxb_dom_node_destroy(<lxb_dom_node_t *> lxb_dom_collection_element_noi(collection, i))
|
|
244
248
|
lxb_dom_collection_destroy(collection, <bint> True)
|
|
@@ -279,7 +283,6 @@ cdef class LexborHTMLParser:
|
|
|
279
283
|
"""
|
|
280
284
|
return self.root.scripts_contain(query)
|
|
281
285
|
|
|
282
|
-
|
|
283
286
|
def script_srcs_contain(self, tuple queries):
|
|
284
287
|
"""Returns True if any of the script SRCs attributes contain on of the specified text.
|
|
285
288
|
|
|
@@ -295,6 +298,26 @@ cdef class LexborHTMLParser:
|
|
|
295
298
|
def css_matches(self, str selector):
|
|
296
299
|
return self.root.css_matches(selector)
|
|
297
300
|
|
|
301
|
+
def merge_text_nodes(self):
|
|
302
|
+
"""Iterates over all text nodes and merges all text nodes that are close to each other.
|
|
303
|
+
|
|
304
|
+
This is useful for text extraction.
|
|
305
|
+
Use it when you need to strip HTML tags and merge "dangling" text.
|
|
306
|
+
|
|
307
|
+
Examples
|
|
308
|
+
--------
|
|
309
|
+
|
|
310
|
+
>>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
|
|
311
|
+
>>> node = tree.css_first('div')
|
|
312
|
+
>>> tree.unwrap_tags(["strong"])
|
|
313
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
314
|
+
"J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
|
|
315
|
+
>>> node.merge_text_nodes()
|
|
316
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
317
|
+
"John Doe"
|
|
318
|
+
"""
|
|
319
|
+
return self.root.merge_text_nodes()
|
|
320
|
+
|
|
298
321
|
@staticmethod
|
|
299
322
|
cdef LexborHTMLParser from_document(lxb_html_document_t *document, bytes raw_html):
|
|
300
323
|
obj = <LexborHTMLParser> LexborHTMLParser.__new__(LexborHTMLParser)
|
|
@@ -309,6 +332,7 @@ cdef class LexborHTMLParser:
|
|
|
309
332
|
"""Clone the current tree."""
|
|
310
333
|
cdef lxb_html_document_t* cloned_document
|
|
311
334
|
cdef lxb_dom_node_t* cloned_node
|
|
335
|
+
cdef LexborHTMLParser cls
|
|
312
336
|
|
|
313
337
|
with nogil:
|
|
314
338
|
cloned_document = lxb_html_document_create()
|
|
@@ -333,6 +357,7 @@ cdef class LexborHTMLParser:
|
|
|
333
357
|
|
|
334
358
|
cls = LexborHTMLParser.from_document(cloned_document, self.raw_html)
|
|
335
359
|
return cls
|
|
360
|
+
|
|
336
361
|
def unwrap_tags(self, list tags, delete_empty = False):
|
|
337
362
|
"""Unwraps specified tags from the HTML tree.
|
|
338
363
|
|
|
@@ -353,5 +378,6 @@ cdef class LexborHTMLParser:
|
|
|
353
378
|
>>> tree.body.html
|
|
354
379
|
'<body><div>Hello world!</div></body>'
|
|
355
380
|
"""
|
|
356
|
-
if
|
|
381
|
+
# faster to check if the document is empty which should determine if we have a root
|
|
382
|
+
if self.document != NULL:
|
|
357
383
|
self.root.unwrap_tags(tags, delete_empty=delete_empty)
|