selectolax 0.3.28__cp313-cp313-musllinux_1_2_x86_64.whl → 0.4.0__cp313-cp313-musllinux_1_2_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +3 -5
- selectolax/lexbor/attrs.pxi +26 -9
- selectolax/lexbor/node.pxi +225 -58
- selectolax/lexbor/node_remove.pxi +29 -0
- selectolax/lexbor/selection.pxi +57 -26
- selectolax/lexbor/util.pxi +1 -0
- selectolax/lexbor.c +22000 -22286
- selectolax/lexbor.cpython-313-x86_64-linux-musl.so +0 -0
- selectolax/lexbor.pxd +44 -40
- selectolax/lexbor.pyi +847 -65
- selectolax/lexbor.pyx +98 -23
- selectolax/modest/node.pxi +68 -46
- selectolax/modest/selection.pxi +24 -22
- selectolax/modest/util.pxi +1 -0
- selectolax/parser.c +18150 -20047
- selectolax/parser.cpython-313-x86_64-linux-musl.so +0 -0
- selectolax/parser.pxd +17 -20
- selectolax/parser.pyi +493 -53
- selectolax/parser.pyx +45 -35
- selectolax/utils.pxi +13 -3
- selectolax-0.4.0.dist-info/METADATA +32 -0
- selectolax-0.4.0.dist-info/RECORD +27 -0
- {selectolax-0.3.28.dist-info → selectolax-0.4.0.dist-info}/WHEEL +1 -1
- selectolax-0.3.28.dist-info/METADATA +0 -193
- selectolax-0.3.28.dist-info/RECORD +0 -26
- {selectolax-0.3.28.dist-info → selectolax-0.4.0.dist-info/licenses}/LICENSE +0 -0
- {selectolax-0.3.28.dist-info → selectolax-0.4.0.dist-info}/top_level.txt +0 -0
selectolax/__init__.py
CHANGED
|
@@ -2,9 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
__author__ = """Artem Golubin"""
|
|
5
|
-
__email__ =
|
|
6
|
-
__version__ =
|
|
5
|
+
__email__ = "me@rushter.com"
|
|
6
|
+
__version__ = "0.4.0"
|
|
7
7
|
|
|
8
|
-
from . import parser
|
|
9
|
-
from . import lexbor
|
|
10
|
-
from . import modest
|
|
8
|
+
from . import lexbor, modest, parser
|
selectolax/lexbor/attrs.pxi
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
cimport cython
|
|
2
2
|
|
|
3
|
+
|
|
3
4
|
@cython.final
|
|
4
5
|
cdef class LexborAttributes:
|
|
5
6
|
"""A dict-like object that represents attributes."""
|
|
@@ -23,16 +24,32 @@ cdef class LexborAttributes:
|
|
|
23
24
|
yield key.decode(_ENCODING)
|
|
24
25
|
attr = attr.next
|
|
25
26
|
|
|
26
|
-
def __setitem__(self, str key, value):
|
|
27
|
-
value =
|
|
27
|
+
def __setitem__(self, str key, object value):
|
|
28
|
+
value = value
|
|
28
29
|
bytes_key = key.encode(_ENCODING)
|
|
29
|
-
bytes_value = value.encode(_ENCODING)
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
30
|
+
bytes_value = value.encode(_ENCODING) if value else b""
|
|
31
|
+
cdef lxb_dom_attr_t *attr
|
|
32
|
+
cdef lxb_dom_document_t *doc
|
|
33
|
+
|
|
34
|
+
if value is None:
|
|
35
|
+
# N.B. This is suboptimal, but there is no API to set empty attributes
|
|
36
|
+
attr = lxb_dom_element_set_attribute(
|
|
37
|
+
<lxb_dom_element_t *> self.node,
|
|
38
|
+
<lxb_char_t *> bytes_key, len(bytes_key),
|
|
39
|
+
NULL, 0
|
|
40
|
+
)
|
|
41
|
+
doc = (<lxb_dom_node_t*>attr).owner_document
|
|
42
|
+
lexbor_str_destroy(attr.value, doc.text, 0)
|
|
43
|
+
attr.value = NULL
|
|
44
|
+
|
|
45
|
+
elif isinstance(value, str) or isinstance(value, unicode) :
|
|
46
|
+
lxb_dom_element_set_attribute(
|
|
47
|
+
<lxb_dom_element_t *> self.node,
|
|
48
|
+
<lxb_char_t *> bytes_key, len(bytes_key),
|
|
49
|
+
<lxb_char_t *> bytes_value, len(bytes_value),
|
|
50
|
+
)
|
|
51
|
+
else:
|
|
52
|
+
raise TypeError("Expected str or unicode, got %s" % type(value))
|
|
36
53
|
|
|
37
54
|
def __delitem__(self, key):
|
|
38
55
|
try:
|
selectolax/lexbor/node.pxi
CHANGED
|
@@ -1,31 +1,43 @@
|
|
|
1
1
|
cimport cython
|
|
2
|
+
from cpython.exc cimport PyErr_SetNone
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger("selectolax")
|
|
2
7
|
|
|
3
8
|
_TAG_TO_NAME = {
|
|
4
|
-
0x0005: "-
|
|
9
|
+
0x0005: "-doctype",
|
|
5
10
|
0x0002: "-text",
|
|
6
11
|
0x0004: "-comment",
|
|
7
12
|
}
|
|
8
13
|
ctypedef fused str_or_LexborNode:
|
|
9
|
-
|
|
14
|
+
str
|
|
10
15
|
bytes
|
|
11
16
|
LexborNode
|
|
12
17
|
|
|
18
|
+
ctypedef fused str_or_bytes:
|
|
19
|
+
str
|
|
20
|
+
bytes
|
|
21
|
+
|
|
13
22
|
cdef inline bytes to_bytes(str_or_LexborNode value):
|
|
14
23
|
cdef bytes bytes_val
|
|
15
|
-
if isinstance(value,
|
|
16
|
-
bytes_val = value.encode(
|
|
24
|
+
if isinstance(value, unicode):
|
|
25
|
+
bytes_val = <bytes>value.encode("utf-8")
|
|
17
26
|
elif isinstance(value, bytes):
|
|
18
|
-
bytes_val =
|
|
27
|
+
bytes_val = <bytes>value
|
|
19
28
|
return bytes_val
|
|
20
29
|
|
|
30
|
+
|
|
21
31
|
@cython.final
|
|
22
32
|
cdef class LexborNode:
|
|
23
33
|
"""A class that represents HTML node (element)."""
|
|
24
34
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
35
|
+
@staticmethod
|
|
36
|
+
cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser):
|
|
37
|
+
cdef LexborNode lxbnode = LexborNode.__new__(LexborNode)
|
|
38
|
+
lxbnode.node = node
|
|
39
|
+
lxbnode.parser = parser
|
|
40
|
+
return lxbnode
|
|
29
41
|
|
|
30
42
|
@property
|
|
31
43
|
def mem_id(self):
|
|
@@ -33,7 +45,10 @@ cdef class LexborNode:
|
|
|
33
45
|
|
|
34
46
|
@property
|
|
35
47
|
def child(self):
|
|
36
|
-
"""Alias for the `first_child` property.
|
|
48
|
+
"""Alias for the `first_child` property.
|
|
49
|
+
|
|
50
|
+
**Deprecated**. Please use `first_child` instead.
|
|
51
|
+
"""
|
|
37
52
|
return self.first_child
|
|
38
53
|
|
|
39
54
|
@property
|
|
@@ -41,8 +56,7 @@ cdef class LexborNode:
|
|
|
41
56
|
"""Return the first child node."""
|
|
42
57
|
cdef LexborNode node
|
|
43
58
|
if self.node.first_child:
|
|
44
|
-
node = LexborNode()
|
|
45
|
-
node._cinit(<lxb_dom_node_t *> self.node.first_child, self.parser)
|
|
59
|
+
node = LexborNode.new(<lxb_dom_node_t *> self.node.first_child, self.parser)
|
|
46
60
|
return node
|
|
47
61
|
return None
|
|
48
62
|
|
|
@@ -50,9 +64,8 @@ cdef class LexborNode:
|
|
|
50
64
|
def parent(self):
|
|
51
65
|
"""Return the parent node."""
|
|
52
66
|
cdef LexborNode node
|
|
53
|
-
if self.node.parent:
|
|
54
|
-
node = LexborNode()
|
|
55
|
-
node._cinit(<lxb_dom_node_t *> self.node.parent, self.parser)
|
|
67
|
+
if self.node.parent != NULL:
|
|
68
|
+
node = LexborNode.new(<lxb_dom_node_t *> self.node.parent, self.parser)
|
|
56
69
|
return node
|
|
57
70
|
return None
|
|
58
71
|
|
|
@@ -60,9 +73,8 @@ cdef class LexborNode:
|
|
|
60
73
|
def next(self):
|
|
61
74
|
"""Return next node."""
|
|
62
75
|
cdef LexborNode node
|
|
63
|
-
if self.node.next:
|
|
64
|
-
node = LexborNode()
|
|
65
|
-
node._cinit(<lxb_dom_node_t *> self.node.next, self.parser)
|
|
76
|
+
if self.node.next != NULL:
|
|
77
|
+
node = LexborNode.new(<lxb_dom_node_t *> self.node.next, self.parser)
|
|
66
78
|
return node
|
|
67
79
|
return None
|
|
68
80
|
|
|
@@ -70,9 +82,8 @@ cdef class LexborNode:
|
|
|
70
82
|
def prev(self):
|
|
71
83
|
"""Return previous node."""
|
|
72
84
|
cdef LexborNode node
|
|
73
|
-
if self.node.prev:
|
|
74
|
-
node = LexborNode()
|
|
75
|
-
node._cinit(<lxb_dom_node_t *> self.node.prev, self.parser)
|
|
85
|
+
if self.node.prev != NULL:
|
|
86
|
+
node = LexborNode.new(<lxb_dom_node_t *> self.node.prev, self.parser)
|
|
76
87
|
return node
|
|
77
88
|
return None
|
|
78
89
|
|
|
@@ -80,9 +91,8 @@ cdef class LexborNode:
|
|
|
80
91
|
def last_child(self):
|
|
81
92
|
"""Return last child node."""
|
|
82
93
|
cdef LexborNode node
|
|
83
|
-
if self.node.last_child:
|
|
84
|
-
node = LexborNode()
|
|
85
|
-
node._cinit(<lxb_dom_node_t *> self.node.last_child, self.parser)
|
|
94
|
+
if self.node.last_child != NULL:
|
|
95
|
+
node = LexborNode.new(<lxb_dom_node_t *> self.node.last_child, self.parser)
|
|
86
96
|
return node
|
|
87
97
|
return None
|
|
88
98
|
|
|
@@ -181,6 +191,12 @@ cdef class LexborNode:
|
|
|
181
191
|
Matches pattern `query` against HTML tree.
|
|
182
192
|
`CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
|
|
183
193
|
|
|
194
|
+
Special selectors:
|
|
195
|
+
|
|
196
|
+
- parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
|
|
197
|
+
- parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
|
|
198
|
+
|
|
199
|
+
|
|
184
200
|
Parameters
|
|
185
201
|
----------
|
|
186
202
|
query : str
|
|
@@ -195,13 +211,15 @@ cdef class LexborNode:
|
|
|
195
211
|
def css_first(self, str query, default=None, bool strict=False):
|
|
196
212
|
"""Same as `css` but returns only the first match.
|
|
197
213
|
|
|
214
|
+
When `strict=False`, the search stops at the first match, which is faster.
|
|
215
|
+
|
|
198
216
|
Parameters
|
|
199
217
|
----------
|
|
200
218
|
|
|
201
219
|
query : str
|
|
202
|
-
default :
|
|
220
|
+
default : Any, default None
|
|
203
221
|
Default value to return if there is no match.
|
|
204
|
-
strict: bool, default
|
|
222
|
+
strict: bool, default False
|
|
205
223
|
Set to True if you want to check if there is strictly only one match in the document.
|
|
206
224
|
|
|
207
225
|
|
|
@@ -209,8 +227,10 @@ cdef class LexborNode:
|
|
|
209
227
|
-------
|
|
210
228
|
selector : `LexborNode` object
|
|
211
229
|
"""
|
|
212
|
-
|
|
213
|
-
|
|
230
|
+
if strict:
|
|
231
|
+
results = self.parser.selector.find(query, self)
|
|
232
|
+
else:
|
|
233
|
+
results = self.parser.selector.find_first(query, self)
|
|
214
234
|
n_results = len(results)
|
|
215
235
|
if n_results > 0:
|
|
216
236
|
if strict and n_results > 1:
|
|
@@ -227,7 +247,7 @@ cdef class LexborNode:
|
|
|
227
247
|
|
|
228
248
|
def css_matches(self, str selector):
|
|
229
249
|
"""Returns True if CSS selector matches a node."""
|
|
230
|
-
return self.parser.selector.any_matches(selector, self)
|
|
250
|
+
return bool(self.parser.selector.any_matches(selector, self))
|
|
231
251
|
|
|
232
252
|
def __repr__(self):
|
|
233
253
|
return '<LexborNode %s>' % self.tag
|
|
@@ -241,6 +261,14 @@ cdef class LexborNode:
|
|
|
241
261
|
def tag(self):
|
|
242
262
|
"""Return the name of the current tag (e.g. div, p, img).
|
|
243
263
|
|
|
264
|
+
For non-tag nodes, returns the following names:
|
|
265
|
+
|
|
266
|
+
* `-text` - text node
|
|
267
|
+
* `-document` - document node
|
|
268
|
+
* `-comment` - comment node
|
|
269
|
+
|
|
270
|
+
This
|
|
271
|
+
|
|
244
272
|
Returns
|
|
245
273
|
-------
|
|
246
274
|
text : str
|
|
@@ -256,7 +284,6 @@ cdef class LexborNode:
|
|
|
256
284
|
text = c_text.decode(_ENCODING)
|
|
257
285
|
return text
|
|
258
286
|
|
|
259
|
-
|
|
260
287
|
def decompose(self, bool recursive=True):
|
|
261
288
|
"""Remove the current node from the tree.
|
|
262
289
|
|
|
@@ -273,10 +300,13 @@ cdef class LexborNode:
|
|
|
273
300
|
>>> tag.decompose()
|
|
274
301
|
|
|
275
302
|
"""
|
|
303
|
+
if self.node == <lxb_dom_node_t *> lxb_dom_document_root(&self.parser.document.dom_document):
|
|
304
|
+
raise SelectolaxError("Decomposing the root node is not allowed.")
|
|
305
|
+
|
|
276
306
|
if recursive:
|
|
277
|
-
|
|
307
|
+
node_remove_deep(<lxb_dom_node_t *> self.node)
|
|
278
308
|
else:
|
|
279
|
-
|
|
309
|
+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
|
|
280
310
|
|
|
281
311
|
def strip_tags(self, list tags, bool recursive = False):
|
|
282
312
|
"""Remove specified tags from the HTML tree.
|
|
@@ -298,11 +328,11 @@ cdef class LexborNode:
|
|
|
298
328
|
'<html><body><div>Hello world!</div></body></html>'
|
|
299
329
|
|
|
300
330
|
"""
|
|
331
|
+
cdef LexborNode element
|
|
301
332
|
for tag in tags:
|
|
302
333
|
for element in self.css(tag):
|
|
303
334
|
element.decompose(recursive=recursive)
|
|
304
335
|
|
|
305
|
-
|
|
306
336
|
@property
|
|
307
337
|
def attributes(self):
|
|
308
338
|
"""Get all attributes that belong to the current node.
|
|
@@ -325,6 +355,9 @@ cdef class LexborNode:
|
|
|
325
355
|
cdef size_t str_len = 0
|
|
326
356
|
attributes = dict()
|
|
327
357
|
|
|
358
|
+
if self.node.type != LXB_DOM_NODE_TYPE_ELEMENT:
|
|
359
|
+
return attributes
|
|
360
|
+
|
|
328
361
|
while attr != NULL:
|
|
329
362
|
key = lxb_dom_attr_local_name_noi(attr, &str_len)
|
|
330
363
|
value = lxb_dom_attr_value_noi(attr, &str_len)
|
|
@@ -410,15 +443,20 @@ cdef class LexborNode:
|
|
|
410
443
|
node = node.next
|
|
411
444
|
continue
|
|
412
445
|
|
|
413
|
-
next_node = LexborNode()
|
|
414
|
-
next_node._cinit(<lxb_dom_node_t *> node, self.parser)
|
|
446
|
+
next_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
|
|
415
447
|
yield next_node
|
|
416
448
|
node = node.next
|
|
417
449
|
|
|
418
|
-
|
|
419
|
-
def unwrap(self):
|
|
450
|
+
def unwrap(self, bint delete_empty=False):
|
|
420
451
|
"""Replace node with whatever is inside this node.
|
|
421
452
|
|
|
453
|
+
Does nothing if you unwrap the same node a second time.
|
|
454
|
+
|
|
455
|
+
Parameters
|
|
456
|
+
----------
|
|
457
|
+
delete_empty : bool, default False
|
|
458
|
+
If True, removes empty tags.
|
|
459
|
+
|
|
422
460
|
Examples
|
|
423
461
|
--------
|
|
424
462
|
|
|
@@ -427,11 +465,19 @@ cdef class LexborNode:
|
|
|
427
465
|
>>> tree.html
|
|
428
466
|
'<html><head></head><body><div>Hello world!</div></body></html>'
|
|
429
467
|
|
|
468
|
+
Note: by default, empty tags are ignored, use "delete_empty" to change this.
|
|
430
469
|
"""
|
|
470
|
+
|
|
471
|
+
if node_is_removed(<lxb_dom_node_t *> self.node) == 1:
|
|
472
|
+
logger.error("Attempt to unwrap removed node. Does nothing.")
|
|
473
|
+
return
|
|
474
|
+
|
|
431
475
|
if self.node.first_child == NULL:
|
|
476
|
+
if delete_empty:
|
|
477
|
+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
|
|
432
478
|
return
|
|
433
|
-
cdef lxb_dom_node_t* next_node
|
|
434
|
-
cdef lxb_dom_node_t* current_node
|
|
479
|
+
cdef lxb_dom_node_t* next_node
|
|
480
|
+
cdef lxb_dom_node_t* current_node
|
|
435
481
|
|
|
436
482
|
if self.node.first_child.next != NULL:
|
|
437
483
|
current_node = self.node.first_child
|
|
@@ -443,9 +489,9 @@ cdef class LexborNode:
|
|
|
443
489
|
current_node = next_node
|
|
444
490
|
else:
|
|
445
491
|
lxb_dom_node_insert_before(self.node, self.node.first_child)
|
|
446
|
-
|
|
492
|
+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
|
|
447
493
|
|
|
448
|
-
def unwrap_tags(self, list tags):
|
|
494
|
+
def unwrap_tags(self, list tags, bint delete_empty = False):
|
|
449
495
|
"""Unwraps specified tags from the HTML tree.
|
|
450
496
|
|
|
451
497
|
Works the same as the ``unwrap`` method, but applied to a list of tags.
|
|
@@ -454,6 +500,8 @@ cdef class LexborNode:
|
|
|
454
500
|
----------
|
|
455
501
|
tags : list
|
|
456
502
|
List of tags to remove.
|
|
503
|
+
delete_empty : bool, default False
|
|
504
|
+
If True, removes empty tags.
|
|
457
505
|
|
|
458
506
|
Examples
|
|
459
507
|
--------
|
|
@@ -462,12 +510,56 @@ cdef class LexborNode:
|
|
|
462
510
|
>>> tree.body.unwrap_tags(['i','a'])
|
|
463
511
|
>>> tree.body.html
|
|
464
512
|
'<body><div>Hello world!</div></body>'
|
|
465
|
-
"""
|
|
466
513
|
|
|
514
|
+
Note: by default, empty tags are ignored, use "delete_empty" to change this.
|
|
515
|
+
"""
|
|
516
|
+
cdef LexborNode element
|
|
467
517
|
for tag in tags:
|
|
468
518
|
for element in self.css(tag):
|
|
469
|
-
element.unwrap()
|
|
519
|
+
element.unwrap(delete_empty)
|
|
520
|
+
|
|
521
|
+
def merge_text_nodes(self):
|
|
522
|
+
"""Iterates over all text nodes and merges all text nodes that are close to each other.
|
|
470
523
|
|
|
524
|
+
This is useful for text extraction.
|
|
525
|
+
Use it when you need to strip HTML tags and merge "dangling" text.
|
|
526
|
+
|
|
527
|
+
Examples
|
|
528
|
+
--------
|
|
529
|
+
|
|
530
|
+
>>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
|
|
531
|
+
>>> node = tree.css_first('div')
|
|
532
|
+
>>> tree.unwrap_tags(["strong"])
|
|
533
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
534
|
+
"J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
|
|
535
|
+
>>> node.merge_text_nodes()
|
|
536
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
537
|
+
"John Doe"
|
|
538
|
+
"""
|
|
539
|
+
cdef lxb_dom_node_t *node = self.node.first_child
|
|
540
|
+
cdef lxb_dom_node_t *next_node
|
|
541
|
+
cdef lxb_char_t *left_text
|
|
542
|
+
cdef lxb_char_t *right_text
|
|
543
|
+
cdef size_t left_length, right_length
|
|
544
|
+
|
|
545
|
+
while node != NULL:
|
|
546
|
+
next_node = node.next
|
|
547
|
+
if node.type == LXB_DOM_NODE_TYPE_TEXT and node.prev and node.prev.type == LXB_DOM_NODE_TYPE_TEXT:
|
|
548
|
+
left_text = lxb_dom_node_text_content(node.prev, &left_length)
|
|
549
|
+
right_text = lxb_dom_node_text_content(node, &right_length)
|
|
550
|
+
if left_text and right_text:
|
|
551
|
+
combined = (<bytes>left_text[:left_length]) + (<bytes>right_text[:right_length])
|
|
552
|
+
lxb_dom_node_text_content_set(node, combined, len(combined))
|
|
553
|
+
lxb_dom_node_remove(node.prev)
|
|
554
|
+
|
|
555
|
+
if left_text is not NULL:
|
|
556
|
+
lxb_dom_document_destroy_text_noi(self.node.owner_document, left_text)
|
|
557
|
+
if right_text is not NULL:
|
|
558
|
+
lxb_dom_document_destroy_text_noi(self.node.owner_document, right_text)
|
|
559
|
+
|
|
560
|
+
if node.first_child:
|
|
561
|
+
LexborNode.new(node, self.parser).merge_text_nodes()
|
|
562
|
+
node = next_node
|
|
471
563
|
|
|
472
564
|
def traverse(self, include_text=False):
|
|
473
565
|
"""Iterate over all child and next nodes starting from the current level.
|
|
@@ -487,8 +579,7 @@ cdef class LexborNode:
|
|
|
487
579
|
|
|
488
580
|
while node != NULL:
|
|
489
581
|
if not (not include_text and node.type == LXB_DOM_NODE_TYPE_TEXT):
|
|
490
|
-
lxb_node = LexborNode()
|
|
491
|
-
lxb_node._cinit(<lxb_dom_node_t *> node, self.parser)
|
|
582
|
+
lxb_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
|
|
492
583
|
yield lxb_node
|
|
493
584
|
|
|
494
585
|
if node.first_child != NULL:
|
|
@@ -538,7 +629,7 @@ cdef class LexborNode:
|
|
|
538
629
|
if new_node == NULL:
|
|
539
630
|
raise SelectolaxError("Can't create a new node")
|
|
540
631
|
lxb_dom_node_insert_before(self.node, new_node)
|
|
541
|
-
|
|
632
|
+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
|
|
542
633
|
elif isinstance(value, LexborNode):
|
|
543
634
|
new_node = lxb_dom_document_import_node(
|
|
544
635
|
&self.parser.document.dom_document,
|
|
@@ -548,11 +639,10 @@ cdef class LexborNode:
|
|
|
548
639
|
if new_node == NULL:
|
|
549
640
|
raise SelectolaxError("Can't create a new node")
|
|
550
641
|
lxb_dom_node_insert_before(self.node, <lxb_dom_node_t *> new_node)
|
|
551
|
-
|
|
642
|
+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
|
|
552
643
|
else:
|
|
553
644
|
raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
|
|
554
645
|
|
|
555
|
-
|
|
556
646
|
def insert_before(self, str_or_LexborNode value):
|
|
557
647
|
"""
|
|
558
648
|
Insert a node before the current Node.
|
|
@@ -727,7 +817,7 @@ cdef class LexborNode:
|
|
|
727
817
|
>>> selector.child.raw_value
|
|
728
818
|
b'<test>'
|
|
729
819
|
"""
|
|
730
|
-
raise
|
|
820
|
+
raise NotImplementedError("This features is not supported by the lexbor backend. Please use Modest backend.")
|
|
731
821
|
|
|
732
822
|
def scripts_contain(self, str query):
|
|
733
823
|
"""Returns True if any of the script tags contain specified text.
|
|
@@ -740,6 +830,7 @@ cdef class LexborNode:
|
|
|
740
830
|
The query to check.
|
|
741
831
|
|
|
742
832
|
"""
|
|
833
|
+
cdef LexborNode node
|
|
743
834
|
if self.parser.cached_script_texts is None:
|
|
744
835
|
nodes = self.parser.selector.find('script', self)
|
|
745
836
|
text_nodes = []
|
|
@@ -764,6 +855,7 @@ cdef class LexborNode:
|
|
|
764
855
|
queries : tuple of str
|
|
765
856
|
|
|
766
857
|
"""
|
|
858
|
+
cdef LexborNode node
|
|
767
859
|
if self.parser.cached_script_srcs is None:
|
|
768
860
|
nodes = self.parser.selector.find('script', self)
|
|
769
861
|
src_nodes = []
|
|
@@ -819,31 +911,99 @@ cdef class LexborNode:
|
|
|
819
911
|
"""
|
|
820
912
|
cdef unsigned char * text
|
|
821
913
|
cdef lxb_dom_node_t* node = <lxb_dom_node_t*> self.node.first_child
|
|
822
|
-
|
|
823
|
-
container = TextContainer()
|
|
914
|
+
cdef TextContainer container
|
|
824
915
|
if self.node == NULL or self.node.type != LXB_DOM_NODE_TYPE_TEXT:
|
|
825
916
|
return None
|
|
917
|
+
|
|
826
918
|
text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> self.node).data)
|
|
827
919
|
if text != NULL:
|
|
920
|
+
container = TextContainer.new_with_defaults()
|
|
828
921
|
py_text = text.decode(_ENCODING)
|
|
829
922
|
container.append(py_text)
|
|
830
923
|
return container.text
|
|
924
|
+
|
|
925
|
+
@property
|
|
926
|
+
def inner_html(self) -> str:
|
|
927
|
+
"""Return HTML representation of the child nodes.
|
|
928
|
+
|
|
929
|
+
Works similar to innerHTML in JavaScript.
|
|
930
|
+
Unlike the `.html` property, does not include the current node.
|
|
931
|
+
Can be used to set HTML as well. See the setter docstring.
|
|
932
|
+
|
|
933
|
+
Returns
|
|
934
|
+
-------
|
|
935
|
+
text : str | None
|
|
936
|
+
"""
|
|
937
|
+
|
|
938
|
+
cdef lexbor_str_t *lxb_str
|
|
939
|
+
cdef lxb_status_t status
|
|
940
|
+
|
|
941
|
+
lxb_str = lexbor_str_create()
|
|
942
|
+
status = lxb_html_serialize_deep_str(self.node, lxb_str)
|
|
943
|
+
if status == 0 and lxb_str.data:
|
|
944
|
+
html = lxb_str.data.decode(_ENCODING).replace('<-undef>', '')
|
|
945
|
+
lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)
|
|
946
|
+
return html
|
|
947
|
+
return None
|
|
948
|
+
|
|
949
|
+
@inner_html.setter
|
|
950
|
+
def inner_html(self, str html):
|
|
951
|
+
"""Set inner HTML to the specified HTML.
|
|
952
|
+
|
|
953
|
+
Replaces existing data inside the node.
|
|
954
|
+
Works similar to innerHTML in JavaScript.
|
|
955
|
+
|
|
956
|
+
Parameters
|
|
957
|
+
----------
|
|
958
|
+
html : str | None
|
|
959
|
+
|
|
960
|
+
"""
|
|
961
|
+
cdef bytes bytes_val
|
|
962
|
+
bytes_val = <bytes>html.encode("utf-8")
|
|
963
|
+
lxb_html_element_inner_html_set(
|
|
964
|
+
<lxb_html_element_t *>self.node,
|
|
965
|
+
<lxb_char_t *> bytes_val, len(bytes_val)
|
|
966
|
+
)
|
|
967
|
+
|
|
968
|
+
def clone(self) -> LexborNode:
|
|
969
|
+
"""Clone the current node.
|
|
970
|
+
|
|
971
|
+
You can use it to make temporary modifications without affecting the original HTML tree.
|
|
972
|
+
|
|
973
|
+
It is tied to the current parser instance.
|
|
974
|
+
Gets destroyed when parser instance is destroyed.
|
|
975
|
+
"""
|
|
976
|
+
cdef lxb_dom_node_t* node
|
|
977
|
+
node = lxb_dom_node_clone(<lxb_dom_node_t *> self.node, 1)
|
|
978
|
+
return LexborNode.new(node, self.parser)
|
|
979
|
+
|
|
980
|
+
|
|
981
|
+
@cython.internal
|
|
831
982
|
@cython.final
|
|
832
983
|
cdef class TextContainer:
|
|
833
984
|
cdef str _text
|
|
834
|
-
cdef
|
|
835
|
-
cdef
|
|
985
|
+
cdef str separator
|
|
986
|
+
cdef bint strip
|
|
987
|
+
|
|
988
|
+
@staticmethod
|
|
989
|
+
cdef TextContainer new_with_defaults():
|
|
990
|
+
cdef TextContainer cls = TextContainer.__new__(TextContainer)
|
|
991
|
+
cls._text = ''
|
|
992
|
+
cls.separator = ''
|
|
993
|
+
cls.strip = False
|
|
994
|
+
return cls
|
|
836
995
|
|
|
837
996
|
def __init__(self, str separator = '', bool strip = False):
|
|
838
997
|
self._text = ""
|
|
839
998
|
self.separator = separator
|
|
840
999
|
self.strip = strip
|
|
841
1000
|
|
|
842
|
-
def append(self, node_text):
|
|
1001
|
+
def append(self, str node_text):
|
|
843
1002
|
if self.strip:
|
|
844
1003
|
self._text += node_text.strip() + self.separator
|
|
845
1004
|
else:
|
|
846
1005
|
self._text += node_text + self.separator
|
|
1006
|
+
|
|
847
1007
|
@property
|
|
848
1008
|
def text(self):
|
|
849
1009
|
if self.separator and self._text and self._text.endswith(self.separator):
|
|
@@ -852,7 +1012,7 @@ cdef class TextContainer:
|
|
|
852
1012
|
|
|
853
1013
|
|
|
854
1014
|
cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
|
|
855
|
-
cdef unsigned char *text
|
|
1015
|
+
cdef unsigned char *text
|
|
856
1016
|
cdef lxb_tag_id_t tag_id = lxb_dom_node_tag_id_noi(node)
|
|
857
1017
|
if tag_id != LXB_TAG__TEXT:
|
|
858
1018
|
return LEXBOR_ACTION_OK
|
|
@@ -860,8 +1020,15 @@ cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
|
|
|
860
1020
|
text = <unsigned char*> lexbor_str_data_noi(&(<lxb_dom_text_t *> node).char_data.data)
|
|
861
1021
|
if not text:
|
|
862
1022
|
return LEXBOR_ACTION_OK
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
1023
|
+
|
|
1024
|
+
try:
|
|
1025
|
+
py_str = text.decode(_ENCODING, "replace")
|
|
1026
|
+
|
|
1027
|
+
except Exception as e:
|
|
1028
|
+
PyErr_SetNone(e)
|
|
1029
|
+
return LEXBOR_ACTION_STOP
|
|
1030
|
+
|
|
1031
|
+
cdef TextContainer cls
|
|
1032
|
+
cls = <TextContainer> ctx
|
|
866
1033
|
cls.append(py_str)
|
|
867
1034
|
return LEXBOR_ACTION_OK
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
|
|
2
|
+
cdef lxb_dom_node_t * node_remove_deep(lxb_dom_node_t* root):
|
|
3
|
+
cdef lxb_dom_node_t *tmp
|
|
4
|
+
cdef lxb_dom_node_t *node = root
|
|
5
|
+
|
|
6
|
+
while node != NULL:
|
|
7
|
+
if node.first_child != NULL:
|
|
8
|
+
node = node.first_child
|
|
9
|
+
else:
|
|
10
|
+
while node != root and node.next == NULL:
|
|
11
|
+
tmp = node.parent
|
|
12
|
+
lxb_dom_node_remove(node)
|
|
13
|
+
node = tmp
|
|
14
|
+
|
|
15
|
+
if node == root:
|
|
16
|
+
lxb_dom_node_remove(node)
|
|
17
|
+
break
|
|
18
|
+
|
|
19
|
+
tmp = node.next
|
|
20
|
+
lxb_dom_node_remove(node)
|
|
21
|
+
node = tmp
|
|
22
|
+
|
|
23
|
+
return NULL
|
|
24
|
+
|
|
25
|
+
cdef bint node_is_removed(lxb_dom_node_t* node):
|
|
26
|
+
if node.parent == NULL and node.next == NULL \
|
|
27
|
+
and node.prev == NULL:
|
|
28
|
+
return 1
|
|
29
|
+
return 0
|