selectolax 0.3.29__cp311-cp311-musllinux_1_2_aarch64.whl → 0.4.0__cp311-cp311-musllinux_1_2_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +3 -5
- selectolax/lexbor/attrs.pxi +26 -9
- selectolax/lexbor/node.pxi +215 -60
- selectolax/lexbor/node_remove.pxi +29 -0
- selectolax/lexbor/selection.pxi +57 -26
- selectolax/lexbor/util.pxi +1 -0
- selectolax/lexbor.c +24654 -25072
- selectolax/lexbor.cpython-311-aarch64-linux-musl.so +0 -0
- selectolax/lexbor.pxd +44 -40
- selectolax/lexbor.pyi +847 -65
- selectolax/lexbor.pyx +94 -21
- selectolax/modest/node.pxi +49 -43
- selectolax/modest/selection.pxi +24 -22
- selectolax/modest/util.pxi +1 -0
- selectolax/parser.c +18015 -20066
- selectolax/parser.cpython-311-aarch64-linux-musl.so +0 -0
- selectolax/parser.pxd +17 -20
- selectolax/parser.pyi +493 -46
- selectolax/parser.pyx +41 -33
- selectolax/utils.pxi +13 -3
- selectolax-0.4.0.dist-info/METADATA +32 -0
- selectolax-0.4.0.dist-info/RECORD +27 -0
- {selectolax-0.3.29.dist-info → selectolax-0.4.0.dist-info}/WHEEL +1 -1
- selectolax-0.3.29.dist-info/METADATA +0 -183
- selectolax-0.3.29.dist-info/RECORD +0 -26
- {selectolax-0.3.29.dist-info → selectolax-0.4.0.dist-info/licenses}/LICENSE +0 -0
- {selectolax-0.3.29.dist-info → selectolax-0.4.0.dist-info}/top_level.txt +0 -0
selectolax/__init__.py
CHANGED
|
@@ -2,9 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
__author__ = """Artem Golubin"""
|
|
5
|
-
__email__ =
|
|
6
|
-
__version__ =
|
|
5
|
+
__email__ = "me@rushter.com"
|
|
6
|
+
__version__ = "0.4.0"
|
|
7
7
|
|
|
8
|
-
from . import parser
|
|
9
|
-
from . import lexbor
|
|
10
|
-
from . import modest
|
|
8
|
+
from . import lexbor, modest, parser
|
selectolax/lexbor/attrs.pxi
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
cimport cython
|
|
2
2
|
|
|
3
|
+
|
|
3
4
|
@cython.final
|
|
4
5
|
cdef class LexborAttributes:
|
|
5
6
|
"""A dict-like object that represents attributes."""
|
|
@@ -23,16 +24,32 @@ cdef class LexborAttributes:
|
|
|
23
24
|
yield key.decode(_ENCODING)
|
|
24
25
|
attr = attr.next
|
|
25
26
|
|
|
26
|
-
def __setitem__(self, str key, value):
|
|
27
|
-
value =
|
|
27
|
+
def __setitem__(self, str key, object value):
    # Set attribute `key` on the wrapped element.
    #
    # key   : attribute name, encoded with _ENCODING before it is handed to lexbor.
    # value : `str` sets the attribute value; `None` leaves the attribute
    #         present but with its value pointer cleared (attr.value = NULL).
    #
    # Raises TypeError for any other value type. The type is validated BEFORE
    # any encoding is attempted, so non-string values (ints, bytes, ...) get the
    # intended TypeError instead of an AttributeError from a missing `.encode`.
    cdef lxb_dom_attr_t *attr
    cdef lxb_dom_document_t *doc
    cdef bytes bytes_value

    bytes_key = key.encode(_ENCODING)

    if value is None:
        # N.B. This is suboptimal, but there is no API to set empty attributes:
        # create the attribute with an empty value, then release that value
        # string and clear the pointer.
        attr = lxb_dom_element_set_attribute(
            <lxb_dom_element_t *> self.node,
            <lxb_char_t *> bytes_key, len(bytes_key),
            NULL, 0
        )
        doc = (<lxb_dom_node_t*>attr).owner_document
        lexbor_str_destroy(attr.value, doc.text, 0)
        attr.value = NULL

    elif isinstance(value, str):
        bytes_value = value.encode(_ENCODING)
        lxb_dom_element_set_attribute(
            <lxb_dom_element_t *> self.node,
            <lxb_char_t *> bytes_key, len(bytes_key),
            <lxb_char_t *> bytes_value, len(bytes_value),
        )
    else:
        raise TypeError("Expected str or unicode, got %s" % type(value))
|
|
36
53
|
|
|
37
54
|
def __delitem__(self, key):
|
|
38
55
|
try:
|
selectolax/lexbor/node.pxi
CHANGED
|
@@ -1,31 +1,43 @@
|
|
|
1
1
|
cimport cython
|
|
2
|
+
from cpython.exc cimport PyErr_SetNone
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger("selectolax")
|
|
2
7
|
|
|
3
8
|
_TAG_TO_NAME = {
|
|
4
|
-
0x0005: "-
|
|
9
|
+
0x0005: "-doctype",
|
|
5
10
|
0x0002: "-text",
|
|
6
11
|
0x0004: "-comment",
|
|
7
12
|
}
|
|
8
13
|
ctypedef fused str_or_LexborNode:
|
|
9
|
-
|
|
14
|
+
str
|
|
10
15
|
bytes
|
|
11
16
|
LexborNode
|
|
12
17
|
|
|
18
|
+
ctypedef fused str_or_bytes:
|
|
19
|
+
str
|
|
20
|
+
bytes
|
|
21
|
+
|
|
13
22
|
cdef inline bytes to_bytes(str_or_LexborNode value):
|
|
14
23
|
cdef bytes bytes_val
|
|
15
|
-
if isinstance(value,
|
|
16
|
-
bytes_val = value.encode(
|
|
24
|
+
if isinstance(value, unicode):
|
|
25
|
+
bytes_val = <bytes>value.encode("utf-8")
|
|
17
26
|
elif isinstance(value, bytes):
|
|
18
|
-
bytes_val =
|
|
27
|
+
bytes_val = <bytes>value
|
|
19
28
|
return bytes_val
|
|
20
29
|
|
|
30
|
+
|
|
21
31
|
@cython.final
|
|
22
32
|
cdef class LexborNode:
|
|
23
33
|
"""A class that represents HTML node (element)."""
|
|
24
34
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
35
|
+
@staticmethod
cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser):
    # Factory used throughout this class: allocate a LexborNode through
    # __new__ and attach the raw lexbor node together with its owning parser.
    cdef LexborNode wrapper = LexborNode.__new__(LexborNode)
    wrapper.parser = parser
    wrapper.node = node
    return wrapper
|
|
29
41
|
|
|
30
42
|
@property
|
|
31
43
|
def mem_id(self):
|
|
@@ -33,7 +45,10 @@ cdef class LexborNode:
|
|
|
33
45
|
|
|
34
46
|
@property
|
|
35
47
|
def child(self):
|
|
36
|
-
"""Alias for the `first_child` property.
|
|
48
|
+
"""Alias for the `first_child` property.
|
|
49
|
+
|
|
50
|
+
**Deprecated**. Please use `first_child` instead.
|
|
51
|
+
"""
|
|
37
52
|
return self.first_child
|
|
38
53
|
|
|
39
54
|
@property
|
|
@@ -41,8 +56,7 @@ cdef class LexborNode:
|
|
|
41
56
|
"""Return the first child node."""
|
|
42
57
|
cdef LexborNode node
|
|
43
58
|
if self.node.first_child:
|
|
44
|
-
node = LexborNode()
|
|
45
|
-
node._cinit(<lxb_dom_node_t *> self.node.first_child, self.parser)
|
|
59
|
+
node = LexborNode.new(<lxb_dom_node_t *> self.node.first_child, self.parser)
|
|
46
60
|
return node
|
|
47
61
|
return None
|
|
48
62
|
|
|
@@ -50,9 +64,8 @@ cdef class LexborNode:
|
|
|
50
64
|
def parent(self):
|
|
51
65
|
"""Return the parent node."""
|
|
52
66
|
cdef LexborNode node
|
|
53
|
-
if self.node.parent:
|
|
54
|
-
node = LexborNode()
|
|
55
|
-
node._cinit(<lxb_dom_node_t *> self.node.parent, self.parser)
|
|
67
|
+
if self.node.parent != NULL:
|
|
68
|
+
node = LexborNode.new(<lxb_dom_node_t *> self.node.parent, self.parser)
|
|
56
69
|
return node
|
|
57
70
|
return None
|
|
58
71
|
|
|
@@ -60,9 +73,8 @@ cdef class LexborNode:
|
|
|
60
73
|
def next(self):
|
|
61
74
|
"""Return next node."""
|
|
62
75
|
cdef LexborNode node
|
|
63
|
-
if self.node.next:
|
|
64
|
-
node = LexborNode()
|
|
65
|
-
node._cinit(<lxb_dom_node_t *> self.node.next, self.parser)
|
|
76
|
+
if self.node.next != NULL:
|
|
77
|
+
node = LexborNode.new(<lxb_dom_node_t *> self.node.next, self.parser)
|
|
66
78
|
return node
|
|
67
79
|
return None
|
|
68
80
|
|
|
@@ -70,9 +82,8 @@ cdef class LexborNode:
|
|
|
70
82
|
def prev(self):
|
|
71
83
|
"""Return previous node."""
|
|
72
84
|
cdef LexborNode node
|
|
73
|
-
if self.node.prev:
|
|
74
|
-
node = LexborNode()
|
|
75
|
-
node._cinit(<lxb_dom_node_t *> self.node.prev, self.parser)
|
|
85
|
+
if self.node.prev != NULL:
|
|
86
|
+
node = LexborNode.new(<lxb_dom_node_t *> self.node.prev, self.parser)
|
|
76
87
|
return node
|
|
77
88
|
return None
|
|
78
89
|
|
|
@@ -80,9 +91,8 @@ cdef class LexborNode:
|
|
|
80
91
|
def last_child(self):
|
|
81
92
|
"""Return last child node."""
|
|
82
93
|
cdef LexborNode node
|
|
83
|
-
if self.node.last_child:
|
|
84
|
-
node = LexborNode()
|
|
85
|
-
node._cinit(<lxb_dom_node_t *> self.node.last_child, self.parser)
|
|
94
|
+
if self.node.last_child != NULL:
|
|
95
|
+
node = LexborNode.new(<lxb_dom_node_t *> self.node.last_child, self.parser)
|
|
86
96
|
return node
|
|
87
97
|
return None
|
|
88
98
|
|
|
@@ -181,6 +191,12 @@ cdef class LexborNode:
|
|
|
181
191
|
Matches pattern `query` against HTML tree.
|
|
182
192
|
`CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
|
|
183
193
|
|
|
194
|
+
Special selectors:
|
|
195
|
+
|
|
196
|
+
- parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
|
|
197
|
+
- parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
|
|
198
|
+
|
|
199
|
+
|
|
184
200
|
Parameters
|
|
185
201
|
----------
|
|
186
202
|
query : str
|
|
@@ -195,13 +211,15 @@ cdef class LexborNode:
|
|
|
195
211
|
def css_first(self, str query, default=None, bool strict=False):
|
|
196
212
|
"""Same as `css` but returns only the first match.
|
|
197
213
|
|
|
214
|
+
When `strict=False` stops at the first match. Works faster.
|
|
215
|
+
|
|
198
216
|
Parameters
|
|
199
217
|
----------
|
|
200
218
|
|
|
201
219
|
query : str
|
|
202
|
-
default :
|
|
220
|
+
default : Any, default None
|
|
203
221
|
Default value to return if there is no match.
|
|
204
|
-
strict: bool, default
|
|
222
|
+
strict: bool, default False
|
|
205
223
|
Set to True if you want to check if there is strictly only one match in the document.
|
|
206
224
|
|
|
207
225
|
|
|
@@ -209,8 +227,10 @@ cdef class LexborNode:
|
|
|
209
227
|
-------
|
|
210
228
|
selector : `LexborNode` object
|
|
211
229
|
"""
|
|
212
|
-
|
|
213
|
-
|
|
230
|
+
if strict:
|
|
231
|
+
results = self.parser.selector.find(query, self)
|
|
232
|
+
else:
|
|
233
|
+
results = self.parser.selector.find_first(query, self)
|
|
214
234
|
n_results = len(results)
|
|
215
235
|
if n_results > 0:
|
|
216
236
|
if strict and n_results > 1:
|
|
@@ -227,7 +247,7 @@ cdef class LexborNode:
|
|
|
227
247
|
|
|
228
248
|
def css_matches(self, str selector):
|
|
229
249
|
"""Returns True if CSS selector matches a node."""
|
|
230
|
-
return self.parser.selector.any_matches(selector, self)
|
|
250
|
+
return bool(self.parser.selector.any_matches(selector, self))
|
|
231
251
|
|
|
232
252
|
def __repr__(self):
|
|
233
253
|
return '<LexborNode %s>' % self.tag
|
|
@@ -241,6 +261,14 @@ cdef class LexborNode:
|
|
|
241
261
|
def tag(self):
|
|
242
262
|
"""Return the name of the current tag (e.g. div, p, img).
|
|
243
263
|
|
|
264
|
+
For non-tag nodes, returns the following names:
|
|
265
|
+
|
|
266
|
+
* `-text` - text node
|
|
267
|
+
* `-document` - document node
|
|
268
|
+
* `-comment` - comment node
|
|
269
|
+
|
|
270
|
+
This
|
|
271
|
+
|
|
244
272
|
Returns
|
|
245
273
|
-------
|
|
246
274
|
text : str
|
|
@@ -256,7 +284,6 @@ cdef class LexborNode:
|
|
|
256
284
|
text = c_text.decode(_ENCODING)
|
|
257
285
|
return text
|
|
258
286
|
|
|
259
|
-
|
|
260
287
|
def decompose(self, bool recursive=True):
|
|
261
288
|
"""Remove the current node from the tree.
|
|
262
289
|
|
|
@@ -273,10 +300,13 @@ cdef class LexborNode:
|
|
|
273
300
|
>>> tag.decompose()
|
|
274
301
|
|
|
275
302
|
"""
|
|
303
|
+
if self.node == <lxb_dom_node_t *> lxb_dom_document_root(&self.parser.document.dom_document):
|
|
304
|
+
raise SelectolaxError("Decomposing the root node is not allowed.")
|
|
305
|
+
|
|
276
306
|
if recursive:
|
|
277
|
-
|
|
307
|
+
node_remove_deep(<lxb_dom_node_t *> self.node)
|
|
278
308
|
else:
|
|
279
|
-
|
|
309
|
+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
|
|
280
310
|
|
|
281
311
|
def strip_tags(self, list tags, bool recursive = False):
|
|
282
312
|
"""Remove specified tags from the HTML tree.
|
|
@@ -298,11 +328,11 @@ cdef class LexborNode:
|
|
|
298
328
|
'<html><body><div>Hello world!</div></body></html>'
|
|
299
329
|
|
|
300
330
|
"""
|
|
331
|
+
cdef LexborNode element
|
|
301
332
|
for tag in tags:
|
|
302
333
|
for element in self.css(tag):
|
|
303
334
|
element.decompose(recursive=recursive)
|
|
304
335
|
|
|
305
|
-
|
|
306
336
|
@property
|
|
307
337
|
def attributes(self):
|
|
308
338
|
"""Get all attributes that belong to the current node.
|
|
@@ -325,6 +355,9 @@ cdef class LexborNode:
|
|
|
325
355
|
cdef size_t str_len = 0
|
|
326
356
|
attributes = dict()
|
|
327
357
|
|
|
358
|
+
if self.node.type != LXB_DOM_NODE_TYPE_ELEMENT:
|
|
359
|
+
return attributes
|
|
360
|
+
|
|
328
361
|
while attr != NULL:
|
|
329
362
|
key = lxb_dom_attr_local_name_noi(attr, &str_len)
|
|
330
363
|
value = lxb_dom_attr_value_noi(attr, &str_len)
|
|
@@ -410,15 +443,15 @@ cdef class LexborNode:
|
|
|
410
443
|
node = node.next
|
|
411
444
|
continue
|
|
412
445
|
|
|
413
|
-
next_node = LexborNode()
|
|
414
|
-
next_node._cinit(<lxb_dom_node_t *> node, self.parser)
|
|
446
|
+
next_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
|
|
415
447
|
yield next_node
|
|
416
448
|
node = node.next
|
|
417
449
|
|
|
418
|
-
|
|
419
|
-
def unwrap(self, delete_empty=False):
|
|
450
|
+
def unwrap(self, bint delete_empty=False):
|
|
420
451
|
"""Replace node with whatever is inside this node.
|
|
421
452
|
|
|
453
|
+
Does nothing if you unwrap the same node a second time.
|
|
454
|
+
|
|
422
455
|
Parameters
|
|
423
456
|
----------
|
|
424
457
|
delete_empty : bool, default False
|
|
@@ -431,15 +464,20 @@ cdef class LexborNode:
|
|
|
431
464
|
>>> tree.css_first('i').unwrap()
|
|
432
465
|
>>> tree.html
|
|
433
466
|
'<html><head></head><body><div>Hello world!</div></body></html>'
|
|
434
|
-
|
|
467
|
+
|
|
435
468
|
Note: by default, empty tags are ignored, use "delete_empty" to change this.
|
|
436
469
|
"""
|
|
470
|
+
|
|
471
|
+
if node_is_removed(<lxb_dom_node_t *> self.node) == 1:
|
|
472
|
+
logger.error("Attempt to unwrap removed node. Does nothing.")
|
|
473
|
+
return
|
|
474
|
+
|
|
437
475
|
if self.node.first_child == NULL:
|
|
438
476
|
if delete_empty:
|
|
439
|
-
|
|
477
|
+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
|
|
440
478
|
return
|
|
441
|
-
cdef lxb_dom_node_t* next_node
|
|
442
|
-
cdef lxb_dom_node_t* current_node
|
|
479
|
+
cdef lxb_dom_node_t* next_node
|
|
480
|
+
cdef lxb_dom_node_t* current_node
|
|
443
481
|
|
|
444
482
|
if self.node.first_child.next != NULL:
|
|
445
483
|
current_node = self.node.first_child
|
|
@@ -451,9 +489,9 @@ cdef class LexborNode:
|
|
|
451
489
|
current_node = next_node
|
|
452
490
|
else:
|
|
453
491
|
lxb_dom_node_insert_before(self.node, self.node.first_child)
|
|
454
|
-
|
|
492
|
+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
|
|
455
493
|
|
|
456
|
-
def unwrap_tags(self, list tags, delete_empty = False):
|
|
494
|
+
def unwrap_tags(self, list tags, bint delete_empty = False):
|
|
457
495
|
"""Unwraps specified tags from the HTML tree.
|
|
458
496
|
|
|
459
497
|
Works the same as the ``unwrap`` method, but applied to a list of tags.
|
|
@@ -472,14 +510,56 @@ cdef class LexborNode:
|
|
|
472
510
|
>>> tree.body.unwrap_tags(['i','a'])
|
|
473
511
|
>>> tree.body.html
|
|
474
512
|
'<body><div>Hello world!</div></body>'
|
|
475
|
-
|
|
513
|
+
|
|
476
514
|
Note: by default, empty tags are ignored, use "delete_empty" to change this.
|
|
477
515
|
"""
|
|
478
|
-
|
|
516
|
+
cdef LexborNode element
|
|
479
517
|
for tag in tags:
|
|
480
518
|
for element in self.css(tag):
|
|
481
519
|
element.unwrap(delete_empty)
|
|
482
520
|
|
|
521
|
+
def merge_text_nodes(self):
    """Merge neighbouring text nodes below this node into single text nodes.

    Walks the children of the current node; whenever a text node directly
    follows another text node, the two contents are concatenated into the
    later node and the earlier one is removed from the tree. Children that
    have children of their own are processed recursively.

    This is useful for text extraction.
    Use it when you need to strip HTML tags and merge "dangling" text.

    Examples
    --------

    >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
    >>> node = tree.css_first('div')
    >>> tree.unwrap_tags(["strong"])
    >>> tree.text(deep=True, separator=" ", strip=True)
    "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
    >>> node.merge_text_nodes()
    >>> tree.text(deep=True, separator=" ", strip=True)
    "John Doe"
    """
    cdef lxb_dom_node_t *current = self.node.first_child
    cdef lxb_dom_node_t *upcoming
    cdef lxb_char_t *prev_bytes
    cdef lxb_char_t *cur_bytes
    cdef size_t prev_len, cur_len

    while current != NULL:
        # Capture the next sibling before the tree is modified below.
        upcoming = current.next
        if (
            current.type == LXB_DOM_NODE_TYPE_TEXT
            and current.prev != NULL
            and current.prev.type == LXB_DOM_NODE_TYPE_TEXT
        ):
            prev_bytes = lxb_dom_node_text_content(current.prev, &prev_len)
            cur_bytes = lxb_dom_node_text_content(current, &cur_len)
            if prev_bytes != NULL and cur_bytes != NULL:
                merged = (<bytes>prev_bytes[:prev_len]) + (<bytes>cur_bytes[:cur_len])
                lxb_dom_node_text_content_set(current, merged, len(merged))
                lxb_dom_node_remove(current.prev)

            # Hand both text buffers back to the owning document, whether or
            # not a merge actually took place.
            if prev_bytes != NULL:
                lxb_dom_document_destroy_text_noi(self.node.owner_document, prev_bytes)
            if cur_bytes != NULL:
                lxb_dom_document_destroy_text_noi(self.node.owner_document, cur_bytes)

        if current.first_child != NULL:
            LexborNode.new(current, self.parser).merge_text_nodes()
        current = upcoming
|
|
483
563
|
|
|
484
564
|
def traverse(self, include_text=False):
|
|
485
565
|
"""Iterate over all child and next nodes starting from the current level.
|
|
@@ -499,8 +579,7 @@ cdef class LexborNode:
|
|
|
499
579
|
|
|
500
580
|
while node != NULL:
|
|
501
581
|
if not (not include_text and node.type == LXB_DOM_NODE_TYPE_TEXT):
|
|
502
|
-
lxb_node = LexborNode()
|
|
503
|
-
lxb_node._cinit(<lxb_dom_node_t *> node, self.parser)
|
|
582
|
+
lxb_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
|
|
504
583
|
yield lxb_node
|
|
505
584
|
|
|
506
585
|
if node.first_child != NULL:
|
|
@@ -550,7 +629,7 @@ cdef class LexborNode:
|
|
|
550
629
|
if new_node == NULL:
|
|
551
630
|
raise SelectolaxError("Can't create a new node")
|
|
552
631
|
lxb_dom_node_insert_before(self.node, new_node)
|
|
553
|
-
|
|
632
|
+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
|
|
554
633
|
elif isinstance(value, LexborNode):
|
|
555
634
|
new_node = lxb_dom_document_import_node(
|
|
556
635
|
&self.parser.document.dom_document,
|
|
@@ -560,11 +639,10 @@ cdef class LexborNode:
|
|
|
560
639
|
if new_node == NULL:
|
|
561
640
|
raise SelectolaxError("Can't create a new node")
|
|
562
641
|
lxb_dom_node_insert_before(self.node, <lxb_dom_node_t *> new_node)
|
|
563
|
-
|
|
642
|
+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
|
|
564
643
|
else:
|
|
565
644
|
raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
|
|
566
645
|
|
|
567
|
-
|
|
568
646
|
def insert_before(self, str_or_LexborNode value):
|
|
569
647
|
"""
|
|
570
648
|
Insert a node before the current Node.
|
|
@@ -739,7 +817,7 @@ cdef class LexborNode:
|
|
|
739
817
|
>>> selector.child.raw_value
|
|
740
818
|
b'<test>'
|
|
741
819
|
"""
|
|
742
|
-
raise
|
|
820
|
+
raise NotImplementedError("This features is not supported by the lexbor backend. Please use Modest backend.")
|
|
743
821
|
|
|
744
822
|
def scripts_contain(self, str query):
|
|
745
823
|
"""Returns True if any of the script tags contain specified text.
|
|
@@ -752,6 +830,7 @@ cdef class LexborNode:
|
|
|
752
830
|
The query to check.
|
|
753
831
|
|
|
754
832
|
"""
|
|
833
|
+
cdef LexborNode node
|
|
755
834
|
if self.parser.cached_script_texts is None:
|
|
756
835
|
nodes = self.parser.selector.find('script', self)
|
|
757
836
|
text_nodes = []
|
|
@@ -776,6 +855,7 @@ cdef class LexborNode:
|
|
|
776
855
|
queries : tuple of str
|
|
777
856
|
|
|
778
857
|
"""
|
|
858
|
+
cdef LexborNode node
|
|
779
859
|
if self.parser.cached_script_srcs is None:
|
|
780
860
|
nodes = self.parser.selector.find('script', self)
|
|
781
861
|
src_nodes = []
|
|
@@ -831,31 +911,99 @@ cdef class LexborNode:
|
|
|
831
911
|
"""
|
|
832
912
|
cdef unsigned char * text
|
|
833
913
|
cdef lxb_dom_node_t* node = <lxb_dom_node_t*> self.node.first_child
|
|
834
|
-
|
|
835
|
-
container = TextContainer()
|
|
914
|
+
cdef TextContainer container
|
|
836
915
|
if self.node == NULL or self.node.type != LXB_DOM_NODE_TYPE_TEXT:
|
|
837
916
|
return None
|
|
917
|
+
|
|
838
918
|
text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> self.node).data)
|
|
839
919
|
if text != NULL:
|
|
920
|
+
container = TextContainer.new_with_defaults()
|
|
840
921
|
py_text = text.decode(_ENCODING)
|
|
841
922
|
container.append(py_text)
|
|
842
923
|
return container.text
|
|
924
|
+
|
|
925
|
+
@property
def inner_html(self) -> str:
    """Return HTML representation of the child nodes.

    Works similar to innerHTML in JavaScript.
    Unlike the `.html` property, does not include the current node.
    Can be used to set HTML as well. See the setter docstring.

    Returns
    -------
    text : str | None
        Serialized markup with any '<-undef>' marker stripped, or None when
        serialization fails or yields no data.

    Raises
    ------
    MemoryError
        If the temporary lexbor string cannot be allocated.
    """

    cdef lexbor_str_t *lxb_str
    cdef lxb_status_t status

    lxb_str = lexbor_str_create()
    if lxb_str == NULL:
        # Without this guard the field access below would dereference NULL.
        raise MemoryError("Can't allocate a string for HTML serialization")

    try:
        status = lxb_html_serialize_deep_str(self.node, lxb_str)
        if status == 0 and lxb_str.data:
            return lxb_str.data.decode(_ENCODING).replace('<-undef>', '')
        return None
    finally:
        # Release the temporary string on every path (success, failed
        # serialization, decode error) so it is never leaked.
        lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)
|
|
948
|
+
|
|
949
|
+
@inner_html.setter
def inner_html(self, str html):
    """Replace the content of this node with the given HTML.

    Existing data inside the node is replaced.
    Works similar to innerHTML in JavaScript.

    Parameters
    ----------
    html : str
        Markup to place inside the node; it is encoded as UTF-8 before
        being passed to lexbor.

    """
    cdef bytes encoded = <bytes>html.encode("utf-8")
    lxb_html_element_inner_html_set(
        <lxb_html_element_t *>self.node,
        <lxb_char_t *> encoded, len(encoded)
    )
|
|
967
|
+
|
|
968
|
+
def clone(self) -> LexborNode:
    """Clone the current node.

    You can use the clone to make temporary modifications without
    affecting the original HTML tree.

    It is tied to the current parser instance.
    Gets destroyed when parser instance is destroyed.

    Returns
    -------
    node : LexborNode
        A new wrapper around the node returned by ``lxb_dom_node_clone``
        (called with its second argument set to 1).

    Raises
    ------
    SelectolaxError
        If lexbor fails to create the clone.
    """
    cdef lxb_dom_node_t* node
    node = lxb_dom_node_clone(<lxb_dom_node_t *> self.node, 1)
    if node == NULL:
        # Never hand out a wrapper around a NULL pointer: any later
        # attribute access on it would dereference NULL.
        raise SelectolaxError("Can't clone the node")
    return LexborNode.new(node, self.parser)
|
|
979
|
+
|
|
980
|
+
|
|
981
|
+
@cython.internal
|
|
843
982
|
@cython.final
|
|
844
983
|
cdef class TextContainer:
|
|
845
984
|
cdef str _text
|
|
846
|
-
cdef
|
|
847
|
-
cdef
|
|
985
|
+
cdef str separator
|
|
986
|
+
cdef bint strip
|
|
987
|
+
|
|
988
|
+
@staticmethod
cdef TextContainer new_with_defaults():
    # C-level constructor: allocate through __new__ and fill in the same
    # values __init__ uses as its defaults (empty separator, no stripping).
    cdef TextContainer container = TextContainer.__new__(TextContainer)
    container.strip = False
    container.separator = ''
    container._text = ''
    return container
|
|
848
995
|
|
|
849
996
|
def __init__(self, str separator = '', bool strip = False):
|
|
850
997
|
self._text = ""
|
|
851
998
|
self.separator = separator
|
|
852
999
|
self.strip = strip
|
|
853
1000
|
|
|
854
|
-
def append(self, node_text):
|
|
1001
|
+
def append(self, str node_text):
|
|
855
1002
|
if self.strip:
|
|
856
1003
|
self._text += node_text.strip() + self.separator
|
|
857
1004
|
else:
|
|
858
1005
|
self._text += node_text + self.separator
|
|
1006
|
+
|
|
859
1007
|
@property
|
|
860
1008
|
def text(self):
|
|
861
1009
|
if self.separator and self._text and self._text.endswith(self.separator):
|
|
@@ -864,7 +1012,7 @@ cdef class TextContainer:
|
|
|
864
1012
|
|
|
865
1013
|
|
|
866
1014
|
cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
|
|
867
|
-
cdef unsigned char *text
|
|
1015
|
+
cdef unsigned char *text
|
|
868
1016
|
cdef lxb_tag_id_t tag_id = lxb_dom_node_tag_id_noi(node)
|
|
869
1017
|
if tag_id != LXB_TAG__TEXT:
|
|
870
1018
|
return LEXBOR_ACTION_OK
|
|
@@ -872,8 +1020,15 @@ cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
|
|
|
872
1020
|
text = <unsigned char*> lexbor_str_data_noi(&(<lxb_dom_text_t *> node).char_data.data)
|
|
873
1021
|
if not text:
|
|
874
1022
|
return LEXBOR_ACTION_OK
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
1023
|
+
|
|
1024
|
+
try:
|
|
1025
|
+
py_str = text.decode(_ENCODING, "replace")
|
|
1026
|
+
|
|
1027
|
+
except Exception as e:
|
|
1028
|
+
PyErr_SetNone(e)
|
|
1029
|
+
return LEXBOR_ACTION_STOP
|
|
1030
|
+
|
|
1031
|
+
cdef TextContainer cls
|
|
1032
|
+
cls = <TextContainer> ctx
|
|
878
1033
|
cls.append(py_str)
|
|
879
1034
|
return LEXBOR_ACTION_OK
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
|
|
2
|
+
cdef lxb_dom_node_t * node_remove_deep(lxb_dom_node_t* root):
    # Detach `root` and every node beneath it from the tree, one node at a
    # time, without recursion. Leaves are removed first; `root` itself is
    # removed last. Always returns NULL.
    cdef lxb_dom_node_t *successor
    cdef lxb_dom_node_t *current = root

    while current != NULL:
        # Keep descending until a node without children is reached.
        if current.first_child != NULL:
            current = current.first_child
            continue

        # Climb upwards, removing each node that has no following sibling.
        # The parent is read BEFORE the removal call.
        while current != root and current.next == NULL:
            successor = current.parent
            lxb_dom_node_remove(current)
            current = successor

        if current == root:
            lxb_dom_node_remove(current)
            break

        # A following sibling exists: remember it, drop this node, move on.
        successor = current.next
        lxb_dom_node_remove(current)
        current = successor

    return NULL
|
|
24
|
+
|
|
25
|
+
cdef bint node_is_removed(lxb_dom_node_t* node):
    # Report 1 when the node has no parent and no siblings on either side,
    # which this module treats as "detached from the tree"; otherwise 0.
    if node.parent != NULL:
        return 0
    if node.next != NULL:
        return 0
    if node.prev != NULL:
        return 0
    return 1
|