selectolax 0.3.28__cp311-cp311-musllinux_1_2_aarch64.whl → 0.4.0__cp311-cp311-musllinux_1_2_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +3 -5
- selectolax/lexbor/attrs.pxi +26 -9
- selectolax/lexbor/node.pxi +225 -58
- selectolax/lexbor/node_remove.pxi +29 -0
- selectolax/lexbor/selection.pxi +57 -26
- selectolax/lexbor/util.pxi +1 -0
- selectolax/lexbor.c +21988 -22274
- selectolax/lexbor.cpython-311-aarch64-linux-musl.so +0 -0
- selectolax/lexbor.pxd +44 -40
- selectolax/lexbor.pyi +847 -65
- selectolax/lexbor.pyx +98 -23
- selectolax/modest/node.pxi +68 -46
- selectolax/modest/selection.pxi +24 -22
- selectolax/modest/util.pxi +1 -0
- selectolax/parser.c +18150 -20047
- selectolax/parser.cpython-311-aarch64-linux-musl.so +0 -0
- selectolax/parser.pxd +17 -20
- selectolax/parser.pyi +493 -53
- selectolax/parser.pyx +45 -35
- selectolax/utils.pxi +13 -3
- selectolax-0.4.0.dist-info/METADATA +32 -0
- selectolax-0.4.0.dist-info/RECORD +27 -0
- {selectolax-0.3.28.dist-info → selectolax-0.4.0.dist-info}/WHEEL +1 -1
- selectolax-0.3.28.dist-info/METADATA +0 -183
- selectolax-0.3.28.dist-info/RECORD +0 -26
- {selectolax-0.3.28.dist-info → selectolax-0.4.0.dist-info/licenses}/LICENSE +0 -0
- {selectolax-0.3.28.dist-info → selectolax-0.4.0.dist-info}/top_level.txt +0 -0
selectolax/lexbor.pyx
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
from cpython cimport bool
|
|
1
|
+
from cpython.bool cimport bool
|
|
2
|
+
from cpython.exc cimport PyErr_SetObject
|
|
3
|
+
|
|
2
4
|
|
|
3
5
|
_ENCODING = 'UTF-8'
|
|
4
6
|
|
|
@@ -8,6 +10,7 @@ include "lexbor/attrs.pxi"
|
|
|
8
10
|
include "lexbor/node.pxi"
|
|
9
11
|
include "lexbor/selection.pxi"
|
|
10
12
|
include "lexbor/util.pxi"
|
|
13
|
+
include "lexbor/node_remove.pxi"
|
|
11
14
|
|
|
12
15
|
# We don't inherit from HTMLParser here, because it also includes all the C code from Modest.
|
|
13
16
|
|
|
@@ -24,10 +27,8 @@ cdef class LexborHTMLParser:
|
|
|
24
27
|
html : str (unicode) or bytes
|
|
25
28
|
"""
|
|
26
29
|
def __init__(self, html):
|
|
27
|
-
|
|
28
30
|
cdef size_t html_len
|
|
29
|
-
cdef
|
|
30
|
-
|
|
31
|
+
cdef object bytes_html
|
|
31
32
|
bytes_html, html_len = preprocess_input(html)
|
|
32
33
|
self._parse_html(bytes_html, html_len)
|
|
33
34
|
self.raw_html = bytes_html
|
|
@@ -39,22 +40,27 @@ cdef class LexborHTMLParser:
|
|
|
39
40
|
self._selector = LexborCSSSelector()
|
|
40
41
|
return self._selector
|
|
41
42
|
|
|
42
|
-
|
|
43
|
-
cdef _parse_html(self, char *html, size_t html_len):
|
|
43
|
+
cdef int _parse_html(self, char *html, size_t html_len) except -1:
|
|
44
44
|
cdef lxb_status_t status
|
|
45
45
|
|
|
46
46
|
with nogil:
|
|
47
47
|
self.document = lxb_html_document_create()
|
|
48
48
|
|
|
49
49
|
if self.document == NULL:
|
|
50
|
-
|
|
50
|
+
PyErr_SetObject(SelectolaxError, "Failed to initialize object for HTML Document.")
|
|
51
|
+
return -1
|
|
51
52
|
|
|
52
53
|
with nogil:
|
|
53
54
|
status = lxb_html_document_parse(self.document, <lxb_char_t *> html, html_len)
|
|
55
|
+
|
|
54
56
|
if status != 0x0000:
|
|
55
|
-
|
|
57
|
+
PyErr_SetObject(SelectolaxError, "Can't parse HTML.")
|
|
58
|
+
return -1
|
|
56
59
|
|
|
57
|
-
|
|
60
|
+
if self.document == NULL:
|
|
61
|
+
PyErr_SetObject(RuntimeError, "document is NULL even after html was parsed correctly")
|
|
62
|
+
return -1
|
|
63
|
+
return 0
|
|
58
64
|
|
|
59
65
|
def __dealloc__(self):
|
|
60
66
|
if self.document != NULL:
|
|
@@ -68,7 +74,7 @@ cdef class LexborHTMLParser:
|
|
|
68
74
|
"""Returns root node."""
|
|
69
75
|
if self.document == NULL:
|
|
70
76
|
return None
|
|
71
|
-
return LexborNode
|
|
77
|
+
return LexborNode.new(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
|
|
72
78
|
|
|
73
79
|
@property
|
|
74
80
|
def body(self):
|
|
@@ -77,7 +83,7 @@ cdef class LexborHTMLParser:
|
|
|
77
83
|
body = lxb_html_document_body_element_noi(self.document)
|
|
78
84
|
if body == NULL:
|
|
79
85
|
return None
|
|
80
|
-
return LexborNode
|
|
86
|
+
return LexborNode.new(<lxb_dom_node_t *> body, self)
|
|
81
87
|
|
|
82
88
|
@property
|
|
83
89
|
def head(self):
|
|
@@ -86,7 +92,7 @@ cdef class LexborHTMLParser:
|
|
|
86
92
|
head = lxb_html_document_head_element_noi(self.document)
|
|
87
93
|
if head == NULL:
|
|
88
94
|
return None
|
|
89
|
-
return LexborNode
|
|
95
|
+
return LexborNode.new(<lxb_dom_node_t *> head, self)
|
|
90
96
|
|
|
91
97
|
def tags(self, str name):
|
|
92
98
|
"""Returns a list of tags that match specified name.
|
|
@@ -96,6 +102,12 @@ cdef class LexborHTMLParser:
|
|
|
96
102
|
name : str (e.g. div)
|
|
97
103
|
|
|
98
104
|
"""
|
|
105
|
+
|
|
106
|
+
if not name:
|
|
107
|
+
raise ValueError("Tag name cannot be empty")
|
|
108
|
+
if len(name) > 100:
|
|
109
|
+
raise ValueError("Tag name is too long")
|
|
110
|
+
|
|
99
111
|
cdef lxb_dom_collection_t* collection = NULL
|
|
100
112
|
cdef lxb_status_t status
|
|
101
113
|
pybyte_name = name.encode('UTF-8')
|
|
@@ -116,7 +128,7 @@ cdef class LexborHTMLParser:
|
|
|
116
128
|
raise SelectolaxError("Can't locate elements.")
|
|
117
129
|
|
|
118
130
|
for i in range(lxb_dom_collection_length_noi(collection)):
|
|
119
|
-
node = LexborNode
|
|
131
|
+
node = LexborNode.new(
|
|
120
132
|
<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i),
|
|
121
133
|
self
|
|
122
134
|
)
|
|
@@ -150,7 +162,7 @@ cdef class LexborHTMLParser:
|
|
|
150
162
|
"""Return HTML representation of the page."""
|
|
151
163
|
if self.document == NULL:
|
|
152
164
|
return None
|
|
153
|
-
node = LexborNode
|
|
165
|
+
node = LexborNode.new(<lxb_dom_node_t *> &self.document.dom_document, self)
|
|
154
166
|
return node.html
|
|
155
167
|
|
|
156
168
|
def css(self, str query):
|
|
@@ -159,6 +171,11 @@ cdef class LexborHTMLParser:
|
|
|
159
171
|
Matches pattern `query` against HTML tree.
|
|
160
172
|
`CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
|
|
161
173
|
|
|
174
|
+
Special selectors:
|
|
175
|
+
|
|
176
|
+
- parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
|
|
177
|
+
- parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
|
|
178
|
+
|
|
162
179
|
Parameters
|
|
163
180
|
----------
|
|
164
181
|
query : str
|
|
@@ -177,9 +194,9 @@ cdef class LexborHTMLParser:
|
|
|
177
194
|
----------
|
|
178
195
|
|
|
179
196
|
query : str
|
|
180
|
-
default :
|
|
197
|
+
default : Any, default None
|
|
181
198
|
Default value to return if there is no match.
|
|
182
|
-
strict: bool, default
|
|
199
|
+
strict: bool, default False
|
|
183
200
|
Set to True if you want to check if there is strictly only one match in the document.
|
|
184
201
|
|
|
185
202
|
|
|
@@ -196,7 +213,7 @@ cdef class LexborHTMLParser:
|
|
|
196
213
|
----------
|
|
197
214
|
tags : list of str
|
|
198
215
|
List of tags to remove.
|
|
199
|
-
recursive : bool, default
|
|
216
|
+
recursive : bool, default False
|
|
200
217
|
Whenever to delete all its child nodes
|
|
201
218
|
|
|
202
219
|
Examples
|
|
@@ -232,7 +249,7 @@ cdef class LexborHTMLParser:
|
|
|
232
249
|
|
|
233
250
|
for i in range(lxb_dom_collection_length_noi(collection)):
|
|
234
251
|
if recursive:
|
|
235
|
-
lxb_dom_node_destroy_deep(
|
|
252
|
+
lxb_dom_node_destroy_deep(<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
|
|
236
253
|
else:
|
|
237
254
|
lxb_dom_node_destroy(<lxb_dom_node_t *> lxb_dom_collection_element_noi(collection, i))
|
|
238
255
|
lxb_dom_collection_destroy(collection, <bint> True)
|
|
@@ -273,7 +290,6 @@ cdef class LexborHTMLParser:
|
|
|
273
290
|
"""
|
|
274
291
|
return self.root.scripts_contain(query)
|
|
275
292
|
|
|
276
|
-
|
|
277
293
|
def script_srcs_contain(self, tuple queries):
|
|
278
294
|
"""Returns True if any of the script SRCs attributes contain on of the specified text.
|
|
279
295
|
|
|
@@ -289,6 +305,26 @@ cdef class LexborHTMLParser:
|
|
|
289
305
|
def css_matches(self, str selector):
|
|
290
306
|
return self.root.css_matches(selector)
|
|
291
307
|
|
|
308
|
+
def merge_text_nodes(self):
|
|
309
|
+
"""Iterates over all text nodes and merges all text nodes that are close to each other.
|
|
310
|
+
|
|
311
|
+
This is useful for text extraction.
|
|
312
|
+
Use it when you need to strip HTML tags and merge "dangling" text.
|
|
313
|
+
|
|
314
|
+
Examples
|
|
315
|
+
--------
|
|
316
|
+
|
|
317
|
+
>>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
|
|
318
|
+
>>> node = tree.css_first('div')
|
|
319
|
+
>>> tree.unwrap_tags(["strong"])
|
|
320
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
321
|
+
"J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
|
|
322
|
+
>>> node.merge_text_nodes()
|
|
323
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
324
|
+
"John Doe"
|
|
325
|
+
"""
|
|
326
|
+
return self.root.merge_text_nodes()
|
|
327
|
+
|
|
292
328
|
@staticmethod
|
|
293
329
|
cdef LexborHTMLParser from_document(lxb_html_document_t *document, bytes raw_html):
|
|
294
330
|
obj = <LexborHTMLParser> LexborHTMLParser.__new__(LexborHTMLParser)
|
|
@@ -300,9 +336,16 @@ cdef class LexborHTMLParser:
|
|
|
300
336
|
return obj
|
|
301
337
|
|
|
302
338
|
def clone(self):
|
|
303
|
-
"""Clone the current
|
|
339
|
+
"""Clone the current node.
|
|
340
|
+
|
|
341
|
+
You can use to do temporary modifications without affecting the original HTML tree.
|
|
342
|
+
|
|
343
|
+
It is tied to the current parser instance.
|
|
344
|
+
Gets destroyed when parser instance is destroyed.
|
|
345
|
+
"""
|
|
304
346
|
cdef lxb_html_document_t* cloned_document
|
|
305
347
|
cdef lxb_dom_node_t* cloned_node
|
|
348
|
+
cdef LexborHTMLParser cls
|
|
306
349
|
|
|
307
350
|
with nogil:
|
|
308
351
|
cloned_document = lxb_html_document_create()
|
|
@@ -327,7 +370,8 @@ cdef class LexborHTMLParser:
|
|
|
327
370
|
|
|
328
371
|
cls = LexborHTMLParser.from_document(cloned_document, self.raw_html)
|
|
329
372
|
return cls
|
|
330
|
-
|
|
373
|
+
|
|
374
|
+
def unwrap_tags(self, list tags, delete_empty = False):
|
|
331
375
|
"""Unwraps specified tags from the HTML tree.
|
|
332
376
|
|
|
333
377
|
Works the same as the ``unwrap`` method, but applied to a list of tags.
|
|
@@ -336,6 +380,8 @@ cdef class LexborHTMLParser:
|
|
|
336
380
|
----------
|
|
337
381
|
tags : list
|
|
338
382
|
List of tags to remove.
|
|
383
|
+
delete_empty : bool
|
|
384
|
+
Whenever to delete empty tags.
|
|
339
385
|
|
|
340
386
|
Examples
|
|
341
387
|
--------
|
|
@@ -345,5 +391,34 @@ cdef class LexborHTMLParser:
|
|
|
345
391
|
>>> tree.body.html
|
|
346
392
|
'<body><div>Hello world!</div></body>'
|
|
347
393
|
"""
|
|
348
|
-
if
|
|
349
|
-
|
|
394
|
+
# faster to check if the document is empty which should determine if we have a root
|
|
395
|
+
if self.document != NULL:
|
|
396
|
+
self.root.unwrap_tags(tags, delete_empty=delete_empty)
|
|
397
|
+
|
|
398
|
+
@property
|
|
399
|
+
def inner_html(self) -> str:
|
|
400
|
+
"""Return HTML representation of the child nodes.
|
|
401
|
+
|
|
402
|
+
Works similar to innerHTML in JavaScript.
|
|
403
|
+
Unlike the `.html` property, does not include the current node.
|
|
404
|
+
Can be used to set HTML as well. See the setter docstring.
|
|
405
|
+
|
|
406
|
+
Returns
|
|
407
|
+
-------
|
|
408
|
+
text : str | None
|
|
409
|
+
"""
|
|
410
|
+
return self.root.inner_html
|
|
411
|
+
|
|
412
|
+
@inner_html.setter
|
|
413
|
+
def inner_html(self, str html):
|
|
414
|
+
"""Set inner HTML to the specified HTML.
|
|
415
|
+
|
|
416
|
+
Replaces existing data inside the node.
|
|
417
|
+
Works similar to innerHTML in JavaScript.
|
|
418
|
+
|
|
419
|
+
Parameters
|
|
420
|
+
----------
|
|
421
|
+
html : str
|
|
422
|
+
|
|
423
|
+
"""
|
|
424
|
+
self.root.inner_html = html
|
selectolax/modest/node.pxi
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
cimport cython
|
|
2
|
+
from cpython.exc cimport PyErr_NoMemory
|
|
2
3
|
|
|
3
4
|
from libc.stdlib cimport free
|
|
4
5
|
from libc.stdlib cimport malloc
|
|
@@ -8,12 +9,15 @@ from libc.string cimport memcpy
|
|
|
8
9
|
DEF _STACK_SIZE = 100
|
|
9
10
|
DEF _ENCODING = 'UTF-8'
|
|
10
11
|
|
|
12
|
+
|
|
11
13
|
@cython.final
|
|
12
14
|
cdef class Stack:
|
|
13
15
|
def __cinit__(self, size_t capacity=25):
|
|
14
16
|
self.capacity = capacity
|
|
15
17
|
self.top = 0
|
|
16
18
|
self._stack = <myhtml_tree_node_t**> malloc(capacity * sizeof(myhtml_tree_node_t))
|
|
19
|
+
if self._stack == NULL:
|
|
20
|
+
raise MemoryError("Failed to allocate memory for stack")
|
|
17
21
|
|
|
18
22
|
def __dealloc__(self):
|
|
19
23
|
free(self._stack)
|
|
@@ -21,9 +25,10 @@ cdef class Stack:
|
|
|
21
25
|
cdef bint is_empty(self):
|
|
22
26
|
return self.top <= 0
|
|
23
27
|
|
|
24
|
-
cdef push(self, myhtml_tree_node_t* res):
|
|
28
|
+
cdef int push(self, myhtml_tree_node_t* res) except -1:
|
|
25
29
|
if self.top >= self.capacity:
|
|
26
|
-
self.resize()
|
|
30
|
+
if self.resize() < 0:
|
|
31
|
+
return -1
|
|
27
32
|
self._stack[self.top] = res
|
|
28
33
|
self.top += 1
|
|
29
34
|
|
|
@@ -31,10 +36,13 @@ cdef class Stack:
|
|
|
31
36
|
self.top = self.top - 1
|
|
32
37
|
return self._stack[self.top]
|
|
33
38
|
|
|
34
|
-
cdef resize(self):
|
|
39
|
+
cdef int resize(self) except -1:
|
|
35
40
|
self.capacity *= 2
|
|
36
41
|
self._stack = <myhtml_tree_node_t**> realloc(<void*> self._stack, self.capacity * sizeof(myhtml_tree_node_t))
|
|
37
|
-
|
|
42
|
+
if self._stack == NULL:
|
|
43
|
+
PyErr_NoMemory()
|
|
44
|
+
return -1
|
|
45
|
+
return 0
|
|
38
46
|
|
|
39
47
|
cdef class _Attributes:
|
|
40
48
|
"""A dict-like object that represents attributes."""
|
|
@@ -128,25 +136,24 @@ cdef class _Attributes:
|
|
|
128
136
|
tag_name = c_text.decode(_ENCODING, 'ignore') if c_text != NULL else 'unknown'
|
|
129
137
|
return "<%s attributes, %s items>" % (tag_name, len(self))
|
|
130
138
|
|
|
131
|
-
|
|
132
|
-
|
|
133
139
|
ctypedef fused str_or_Node:
|
|
134
|
-
|
|
140
|
+
str
|
|
135
141
|
bytes
|
|
136
142
|
Node
|
|
137
143
|
|
|
138
|
-
|
|
139
144
|
cdef class Node:
|
|
140
145
|
"""A class that represents HTML node (element)."""
|
|
141
146
|
cdef myhtml_tree_node_t *node
|
|
142
147
|
cdef public HTMLParser parser
|
|
143
148
|
|
|
144
|
-
|
|
145
|
-
cdef
|
|
146
|
-
# custom
|
|
147
|
-
|
|
149
|
+
@staticmethod
|
|
150
|
+
cdef Node new(myhtml_tree_node_t *node, HTMLParser parser):
|
|
151
|
+
# custom __init__ for C, because __cinit__ doesn't accept C types
|
|
152
|
+
cdef Node cls = Node.__new__(Node)
|
|
153
|
+
cls.node = node
|
|
148
154
|
# Keep reference to the selector object, so myhtml structures will not be garbage collected prematurely
|
|
149
|
-
|
|
155
|
+
cls.parser = parser
|
|
156
|
+
return cls
|
|
150
157
|
|
|
151
158
|
@property
|
|
152
159
|
def attributes(self):
|
|
@@ -286,7 +293,7 @@ cdef class Node:
|
|
|
286
293
|
cdef inline _text_deep(self, myhtml_tree_node_t *node, separator='', strip=False):
|
|
287
294
|
text = ""
|
|
288
295
|
cdef Stack stack = Stack(_STACK_SIZE)
|
|
289
|
-
cdef myhtml_tree_node_t* current_node = NULL
|
|
296
|
+
cdef myhtml_tree_node_t* current_node = NULL
|
|
290
297
|
|
|
291
298
|
if node.tag_id == MyHTML_TAG__TEXT:
|
|
292
299
|
c_text = myhtml_node_text(node, NULL)
|
|
@@ -339,12 +346,10 @@ cdef class Node:
|
|
|
339
346
|
node = node.next
|
|
340
347
|
continue
|
|
341
348
|
|
|
342
|
-
next_node = Node()
|
|
343
|
-
next_node._init(node, self.parser)
|
|
349
|
+
next_node = Node.new(node, self.parser)
|
|
344
350
|
yield next_node
|
|
345
351
|
node = node.next
|
|
346
352
|
|
|
347
|
-
|
|
348
353
|
def traverse(self, include_text=False):
|
|
349
354
|
"""Iterate over all child and next nodes starting from the current level.
|
|
350
355
|
|
|
@@ -358,16 +363,15 @@ cdef class Node:
|
|
|
358
363
|
node
|
|
359
364
|
"""
|
|
360
365
|
cdef Stack stack = Stack(_STACK_SIZE)
|
|
361
|
-
cdef myhtml_tree_node_t* current_node = NULL
|
|
362
|
-
cdef Node next_node
|
|
366
|
+
cdef myhtml_tree_node_t* current_node = NULL
|
|
367
|
+
cdef Node next_node
|
|
363
368
|
|
|
364
369
|
stack.push(self.node)
|
|
365
370
|
|
|
366
371
|
while not stack.is_empty():
|
|
367
372
|
current_node = stack.pop()
|
|
368
373
|
if current_node != NULL and not (current_node.tag_id == MyHTML_TAG__TEXT and not include_text):
|
|
369
|
-
next_node = Node()
|
|
370
|
-
next_node._init(current_node, self.parser)
|
|
374
|
+
next_node = Node.new(current_node, self.parser)
|
|
371
375
|
yield next_node
|
|
372
376
|
|
|
373
377
|
if current_node.next is not NULL:
|
|
@@ -393,11 +397,13 @@ cdef class Node:
|
|
|
393
397
|
|
|
394
398
|
@property
|
|
395
399
|
def child(self):
|
|
396
|
-
"""
|
|
400
|
+
"""Alias for the `first_child` property.
|
|
401
|
+
|
|
402
|
+
**Deprecated**. Please use `first_child` instead.
|
|
403
|
+
"""
|
|
397
404
|
cdef Node node
|
|
398
405
|
if self.node.child:
|
|
399
|
-
node = Node()
|
|
400
|
-
node._init(self.node.child, self.parser)
|
|
406
|
+
node = Node.new(self.node.child, self.parser)
|
|
401
407
|
return node
|
|
402
408
|
return None
|
|
403
409
|
|
|
@@ -406,8 +412,7 @@ cdef class Node:
|
|
|
406
412
|
"""Return the parent node."""
|
|
407
413
|
cdef Node node
|
|
408
414
|
if self.node.parent:
|
|
409
|
-
node = Node()
|
|
410
|
-
node._init(self.node.parent, self.parser)
|
|
415
|
+
node = Node.new(self.node.parent, self.parser)
|
|
411
416
|
return node
|
|
412
417
|
return None
|
|
413
418
|
|
|
@@ -416,8 +421,7 @@ cdef class Node:
|
|
|
416
421
|
"""Return next node."""
|
|
417
422
|
cdef Node node
|
|
418
423
|
if self.node.next:
|
|
419
|
-
node = Node()
|
|
420
|
-
node._init(self.node.next, self.parser)
|
|
424
|
+
node = Node.new(self.node.next, self.parser)
|
|
421
425
|
return node
|
|
422
426
|
return None
|
|
423
427
|
|
|
@@ -426,8 +430,7 @@ cdef class Node:
|
|
|
426
430
|
"""Return previous node."""
|
|
427
431
|
cdef Node node
|
|
428
432
|
if self.node.prev:
|
|
429
|
-
node = Node()
|
|
430
|
-
node._init(self.node.prev, self.parser)
|
|
433
|
+
node = Node.new(self.node.prev, self.parser)
|
|
431
434
|
return node
|
|
432
435
|
return None
|
|
433
436
|
|
|
@@ -436,8 +439,7 @@ cdef class Node:
|
|
|
436
439
|
"""Return last child node."""
|
|
437
440
|
cdef Node node
|
|
438
441
|
if self.node.last_child:
|
|
439
|
-
node = Node()
|
|
440
|
-
node._init(self.node.last_child, self.parser)
|
|
442
|
+
node = Node.new(self.node.last_child, self.parser)
|
|
441
443
|
return node
|
|
442
444
|
return None
|
|
443
445
|
|
|
@@ -515,9 +517,14 @@ cdef class Node:
|
|
|
515
517
|
"""An alias for the decompose method."""
|
|
516
518
|
self.decompose(recursive)
|
|
517
519
|
|
|
518
|
-
def unwrap(self):
|
|
520
|
+
def unwrap(self, delete_empty = False):
|
|
519
521
|
"""Replace node with whatever is inside this node.
|
|
520
522
|
|
|
523
|
+
Parameters
|
|
524
|
+
----------
|
|
525
|
+
delete_empty : bool, default False
|
|
526
|
+
Whenever to delete empty tags.
|
|
527
|
+
|
|
521
528
|
Examples
|
|
522
529
|
--------
|
|
523
530
|
|
|
@@ -526,11 +533,14 @@ cdef class Node:
|
|
|
526
533
|
>>> tree.html
|
|
527
534
|
'<html><head></head><body><div>Hello world!</div></body></html>'
|
|
528
535
|
|
|
536
|
+
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
|
|
529
537
|
"""
|
|
530
538
|
if self.node.child == NULL:
|
|
539
|
+
if delete_empty:
|
|
540
|
+
myhtml_node_delete(self.node)
|
|
531
541
|
return
|
|
532
|
-
cdef myhtml_tree_node_t* next_node
|
|
533
|
-
cdef myhtml_tree_node_t* current_node
|
|
542
|
+
cdef myhtml_tree_node_t* next_node
|
|
543
|
+
cdef myhtml_tree_node_t* current_node
|
|
534
544
|
|
|
535
545
|
if self.node.child.next != NULL:
|
|
536
546
|
current_node = self.node.child
|
|
@@ -564,11 +574,13 @@ cdef class Node:
|
|
|
564
574
|
'<html><body><div>Hello world!</div></body></html>'
|
|
565
575
|
|
|
566
576
|
"""
|
|
577
|
+
# ensure cython can recast element to a Node so that decompose will be called sooner.
|
|
578
|
+
cdef Node element
|
|
567
579
|
for tag in tags:
|
|
568
580
|
for element in self.css(tag):
|
|
569
581
|
element.decompose(recursive=recursive)
|
|
570
582
|
|
|
571
|
-
def unwrap_tags(self, list tags):
|
|
583
|
+
def unwrap_tags(self, list tags, delete_empty = False):
|
|
572
584
|
"""Unwraps specified tags from the HTML tree.
|
|
573
585
|
|
|
574
586
|
Works the same as the ``unwrap`` method, but applied to a list of tags.
|
|
@@ -577,6 +589,8 @@ cdef class Node:
|
|
|
577
589
|
----------
|
|
578
590
|
tags : list
|
|
579
591
|
List of tags to remove.
|
|
592
|
+
delete_empty : bool, default False
|
|
593
|
+
Whenever to delete empty tags.
|
|
580
594
|
|
|
581
595
|
Examples
|
|
582
596
|
--------
|
|
@@ -585,11 +599,13 @@ cdef class Node:
|
|
|
585
599
|
>>> tree.body.unwrap_tags(['i','a'])
|
|
586
600
|
>>> tree.body.html
|
|
587
601
|
'<body><div>Hello world!</div></body>'
|
|
588
|
-
"""
|
|
589
602
|
|
|
603
|
+
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
|
|
604
|
+
"""
|
|
605
|
+
cdef Node element
|
|
590
606
|
for tag in tags:
|
|
591
607
|
for element in self.css(tag):
|
|
592
|
-
element.unwrap()
|
|
608
|
+
element.unwrap(delete_empty)
|
|
593
609
|
|
|
594
610
|
def replace_with(self, str_or_Node value):
|
|
595
611
|
"""Replace current Node with specified value.
|
|
@@ -752,7 +768,7 @@ cdef class Node:
|
|
|
752
768
|
else:
|
|
753
769
|
raise TypeError("Expected a string or Node instance, but %s found" % type(value).__name__)
|
|
754
770
|
|
|
755
|
-
def unwrap_tags(self, list tags):
|
|
771
|
+
def unwrap_tags(self, list tags, delete_empty = False):
|
|
756
772
|
"""Unwraps specified tags from the HTML tree.
|
|
757
773
|
|
|
758
774
|
Works the same as th ``unwrap`` method, but applied to a list of tags.
|
|
@@ -761,6 +777,8 @@ cdef class Node:
|
|
|
761
777
|
----------
|
|
762
778
|
tags : list
|
|
763
779
|
List of tags to remove.
|
|
780
|
+
delete_empty : bool, default False
|
|
781
|
+
Whenever to delete empty tags.
|
|
764
782
|
|
|
765
783
|
Examples
|
|
766
784
|
--------
|
|
@@ -769,11 +787,13 @@ cdef class Node:
|
|
|
769
787
|
>>> tree.body.unwrap_tags(['i','a'])
|
|
770
788
|
>>> tree.body.html
|
|
771
789
|
'<body><div>Hello world!</div></body>'
|
|
772
|
-
"""
|
|
773
790
|
|
|
791
|
+
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
|
|
792
|
+
"""
|
|
793
|
+
cdef Node element
|
|
774
794
|
for tag in tags:
|
|
775
795
|
for element in self.css(tag):
|
|
776
|
-
element.unwrap()
|
|
796
|
+
element.unwrap(delete_empty)
|
|
777
797
|
|
|
778
798
|
@property
|
|
779
799
|
def raw_value(self):
|
|
@@ -829,6 +849,7 @@ cdef class Node:
|
|
|
829
849
|
The query to check.
|
|
830
850
|
|
|
831
851
|
"""
|
|
852
|
+
cdef Node node
|
|
832
853
|
if self.parser.cached_script_texts is None:
|
|
833
854
|
nodes = find_nodes(self.parser, self.node, 'script')
|
|
834
855
|
text_nodes = []
|
|
@@ -877,6 +898,7 @@ cdef class Node:
|
|
|
877
898
|
if not isinstance(other, Node):
|
|
878
899
|
return False
|
|
879
900
|
return self.html == other.html
|
|
901
|
+
|
|
880
902
|
@property
|
|
881
903
|
def text_content(self):
|
|
882
904
|
"""Returns the text of the node if it is a text node.
|
|
@@ -930,8 +952,8 @@ cdef class Node:
|
|
|
930
952
|
while not stack.is_empty():
|
|
931
953
|
current_node = stack.pop()
|
|
932
954
|
|
|
933
|
-
if current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and
|
|
934
|
-
|
|
955
|
+
if (current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and
|
|
956
|
+
current_node.prev.tag_id == MyHTML_TAG__TEXT):
|
|
935
957
|
left_text = myhtml_node_text(current_node.prev, &left_length)
|
|
936
958
|
right_text = myhtml_node_text(current_node, &right_length)
|
|
937
959
|
if left_text and right_text:
|
|
@@ -962,8 +984,8 @@ cdef inline str append_text(str text, str node_text, str separator='', bint stri
|
|
|
962
984
|
|
|
963
985
|
cdef inline bytes to_bytes(str_or_Node value):
|
|
964
986
|
cdef bytes bytes_val
|
|
965
|
-
if isinstance(value,
|
|
966
|
-
bytes_val = value.encode(
|
|
987
|
+
if isinstance(value, unicode):
|
|
988
|
+
bytes_val = <bytes>value.encode("utf-8")
|
|
967
989
|
elif isinstance(value, bytes):
|
|
968
|
-
bytes_val =
|
|
990
|
+
bytes_val = <bytes>value
|
|
969
991
|
return bytes_val
|