selectolax 0.3.28__cp312-cp312-win32.whl → 0.3.34__cp312-cp312-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +3 -5
- selectolax/lexbor/attrs.pxi +26 -9
- selectolax/lexbor/node.pxi +119 -46
- selectolax/lexbor/selection.pxi +34 -25
- selectolax/lexbor/util.pxi +1 -0
- selectolax/lexbor.c +52987 -55179
- selectolax/lexbor.cp312-win32.pyd +0 -0
- selectolax/lexbor.pxd +36 -40
- selectolax/lexbor.pyi +770 -65
- selectolax/lexbor.pyx +58 -19
- selectolax/modest/node.pxi +64 -45
- selectolax/modest/selection.pxi +24 -22
- selectolax/modest/util.pxi +1 -0
- selectolax/parser.c +50190 -52171
- selectolax/parser.cp312-win32.pyd +0 -0
- selectolax/parser.pxd +17 -20
- selectolax/parser.pyi +489 -52
- selectolax/parser.pyx +43 -33
- selectolax/utils.pxi +13 -3
- selectolax-0.3.34.dist-info/METADATA +32 -0
- selectolax-0.3.34.dist-info/RECORD +26 -0
- {selectolax-0.3.28.dist-info → selectolax-0.3.34.dist-info}/WHEEL +1 -1
- selectolax-0.3.28.dist-info/METADATA +0 -193
- selectolax-0.3.28.dist-info/RECORD +0 -26
- {selectolax-0.3.28.dist-info → selectolax-0.3.34.dist-info/licenses}/LICENSE +0 -0
- {selectolax-0.3.28.dist-info → selectolax-0.3.34.dist-info}/top_level.txt +0 -0
selectolax/lexbor.pyx
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
from cpython cimport bool
|
|
1
|
+
from cpython.bool cimport bool
|
|
2
|
+
from cpython.exc cimport PyErr_SetObject
|
|
2
3
|
|
|
3
4
|
_ENCODING = 'UTF-8'
|
|
4
5
|
|
|
@@ -24,10 +25,8 @@ cdef class LexborHTMLParser:
|
|
|
24
25
|
html : str (unicode) or bytes
|
|
25
26
|
"""
|
|
26
27
|
def __init__(self, html):
|
|
27
|
-
|
|
28
28
|
cdef size_t html_len
|
|
29
|
-
cdef
|
|
30
|
-
|
|
29
|
+
cdef object bytes_html
|
|
31
30
|
bytes_html, html_len = preprocess_input(html)
|
|
32
31
|
self._parse_html(bytes_html, html_len)
|
|
33
32
|
self.raw_html = bytes_html
|
|
@@ -39,22 +38,27 @@ cdef class LexborHTMLParser:
|
|
|
39
38
|
self._selector = LexborCSSSelector()
|
|
40
39
|
return self._selector
|
|
41
40
|
|
|
42
|
-
|
|
43
|
-
cdef _parse_html(self, char *html, size_t html_len):
|
|
41
|
+
cdef int _parse_html(self, char *html, size_t html_len) except -1:
|
|
44
42
|
cdef lxb_status_t status
|
|
45
43
|
|
|
46
44
|
with nogil:
|
|
47
45
|
self.document = lxb_html_document_create()
|
|
48
46
|
|
|
49
47
|
if self.document == NULL:
|
|
50
|
-
|
|
48
|
+
PyErr_SetObject(SelectolaxError, "Failed to initialize object for HTML Document.")
|
|
49
|
+
return -1
|
|
51
50
|
|
|
52
51
|
with nogil:
|
|
53
52
|
status = lxb_html_document_parse(self.document, <lxb_char_t *> html, html_len)
|
|
53
|
+
|
|
54
54
|
if status != 0x0000:
|
|
55
|
-
|
|
55
|
+
PyErr_SetObject(SelectolaxError, "Can't parse HTML.")
|
|
56
|
+
return -1
|
|
56
57
|
|
|
57
|
-
|
|
58
|
+
if self.document == NULL:
|
|
59
|
+
PyErr_SetObject(RuntimeError, "document is NULL even after html was parsed correctly")
|
|
60
|
+
return -1
|
|
61
|
+
return 0
|
|
58
62
|
|
|
59
63
|
def __dealloc__(self):
|
|
60
64
|
if self.document != NULL:
|
|
@@ -68,7 +72,7 @@ cdef class LexborHTMLParser:
|
|
|
68
72
|
"""Returns root node."""
|
|
69
73
|
if self.document == NULL:
|
|
70
74
|
return None
|
|
71
|
-
return LexborNode
|
|
75
|
+
return LexborNode.new(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
|
|
72
76
|
|
|
73
77
|
@property
|
|
74
78
|
def body(self):
|
|
@@ -77,7 +81,7 @@ cdef class LexborHTMLParser:
|
|
|
77
81
|
body = lxb_html_document_body_element_noi(self.document)
|
|
78
82
|
if body == NULL:
|
|
79
83
|
return None
|
|
80
|
-
return LexborNode
|
|
84
|
+
return LexborNode.new(<lxb_dom_node_t *> body, self)
|
|
81
85
|
|
|
82
86
|
@property
|
|
83
87
|
def head(self):
|
|
@@ -86,7 +90,7 @@ cdef class LexborHTMLParser:
|
|
|
86
90
|
head = lxb_html_document_head_element_noi(self.document)
|
|
87
91
|
if head == NULL:
|
|
88
92
|
return None
|
|
89
|
-
return LexborNode
|
|
93
|
+
return LexborNode.new(<lxb_dom_node_t *> head, self)
|
|
90
94
|
|
|
91
95
|
def tags(self, str name):
|
|
92
96
|
"""Returns a list of tags that match specified name.
|
|
@@ -96,6 +100,12 @@ cdef class LexborHTMLParser:
|
|
|
96
100
|
name : str (e.g. div)
|
|
97
101
|
|
|
98
102
|
"""
|
|
103
|
+
|
|
104
|
+
if not name:
|
|
105
|
+
raise ValueError("Tag name cannot be empty")
|
|
106
|
+
if len(name) > 100:
|
|
107
|
+
raise ValueError("Tag name is too long")
|
|
108
|
+
|
|
99
109
|
cdef lxb_dom_collection_t* collection = NULL
|
|
100
110
|
cdef lxb_status_t status
|
|
101
111
|
pybyte_name = name.encode('UTF-8')
|
|
@@ -116,7 +126,7 @@ cdef class LexborHTMLParser:
|
|
|
116
126
|
raise SelectolaxError("Can't locate elements.")
|
|
117
127
|
|
|
118
128
|
for i in range(lxb_dom_collection_length_noi(collection)):
|
|
119
|
-
node = LexborNode
|
|
129
|
+
node = LexborNode.new(
|
|
120
130
|
<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i),
|
|
121
131
|
self
|
|
122
132
|
)
|
|
@@ -150,7 +160,7 @@ cdef class LexborHTMLParser:
|
|
|
150
160
|
"""Return HTML representation of the page."""
|
|
151
161
|
if self.document == NULL:
|
|
152
162
|
return None
|
|
153
|
-
node = LexborNode
|
|
163
|
+
node = LexborNode.new(<lxb_dom_node_t *> &self.document.dom_document, self)
|
|
154
164
|
return node.html
|
|
155
165
|
|
|
156
166
|
def css(self, str query):
|
|
@@ -159,6 +169,11 @@ cdef class LexborHTMLParser:
|
|
|
159
169
|
Matches pattern `query` against HTML tree.
|
|
160
170
|
`CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
|
|
161
171
|
|
|
172
|
+
Special selectors:
|
|
173
|
+
|
|
174
|
+
- parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
|
|
175
|
+
- parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
|
|
176
|
+
|
|
162
177
|
Parameters
|
|
163
178
|
----------
|
|
164
179
|
query : str
|
|
@@ -232,7 +247,7 @@ cdef class LexborHTMLParser:
|
|
|
232
247
|
|
|
233
248
|
for i in range(lxb_dom_collection_length_noi(collection)):
|
|
234
249
|
if recursive:
|
|
235
|
-
lxb_dom_node_destroy_deep(
|
|
250
|
+
lxb_dom_node_destroy_deep(<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
|
|
236
251
|
else:
|
|
237
252
|
lxb_dom_node_destroy(<lxb_dom_node_t *> lxb_dom_collection_element_noi(collection, i))
|
|
238
253
|
lxb_dom_collection_destroy(collection, <bint> True)
|
|
@@ -273,7 +288,6 @@ cdef class LexborHTMLParser:
|
|
|
273
288
|
"""
|
|
274
289
|
return self.root.scripts_contain(query)
|
|
275
290
|
|
|
276
|
-
|
|
277
291
|
def script_srcs_contain(self, tuple queries):
|
|
278
292
|
"""Returns True if any of the script SRCs attributes contain one of the specified text.
|
|
279
293
|
|
|
@@ -289,6 +303,26 @@ cdef class LexborHTMLParser:
|
|
|
289
303
|
def css_matches(self, str selector):
|
|
290
304
|
return self.root.css_matches(selector)
|
|
291
305
|
|
|
306
|
+
def merge_text_nodes(self):
|
|
307
|
+
"""Iterates over all text nodes and merges all text nodes that are close to each other.
|
|
308
|
+
|
|
309
|
+
This is useful for text extraction.
|
|
310
|
+
Use it when you need to strip HTML tags and merge "dangling" text.
|
|
311
|
+
|
|
312
|
+
Examples
|
|
313
|
+
--------
|
|
314
|
+
|
|
315
|
+
>>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
|
|
316
|
+
>>> node = tree.css_first('div')
|
|
317
|
+
>>> tree.unwrap_tags(["strong"])
|
|
318
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
319
|
+
"J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
|
|
320
|
+
>>> node.merge_text_nodes()
|
|
321
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
322
|
+
"John Doe"
|
|
323
|
+
"""
|
|
324
|
+
return self.root.merge_text_nodes()
|
|
325
|
+
|
|
292
326
|
@staticmethod
|
|
293
327
|
cdef LexborHTMLParser from_document(lxb_html_document_t *document, bytes raw_html):
|
|
294
328
|
obj = <LexborHTMLParser> LexborHTMLParser.__new__(LexborHTMLParser)
|
|
@@ -303,6 +337,7 @@ cdef class LexborHTMLParser:
|
|
|
303
337
|
"""Clone the current tree."""
|
|
304
338
|
cdef lxb_html_document_t* cloned_document
|
|
305
339
|
cdef lxb_dom_node_t* cloned_node
|
|
340
|
+
cdef LexborHTMLParser cls
|
|
306
341
|
|
|
307
342
|
with nogil:
|
|
308
343
|
cloned_document = lxb_html_document_create()
|
|
@@ -327,7 +362,8 @@ cdef class LexborHTMLParser:
|
|
|
327
362
|
|
|
328
363
|
cls = LexborHTMLParser.from_document(cloned_document, self.raw_html)
|
|
329
364
|
return cls
|
|
330
|
-
|
|
365
|
+
|
|
366
|
+
def unwrap_tags(self, list tags, delete_empty = False):
|
|
331
367
|
"""Unwraps specified tags from the HTML tree.
|
|
332
368
|
|
|
333
369
|
Works the same as the ``unwrap`` method, but applied to a list of tags.
|
|
@@ -336,6 +372,8 @@ cdef class LexborHTMLParser:
|
|
|
336
372
|
----------
|
|
337
373
|
tags : list
|
|
338
374
|
List of tags to remove.
|
|
375
|
+
delete_empty : bool
|
|
376
|
+
Whether to delete empty tags.
|
|
339
377
|
|
|
340
378
|
Examples
|
|
341
379
|
--------
|
|
@@ -345,5 +383,6 @@ cdef class LexborHTMLParser:
|
|
|
345
383
|
>>> tree.body.html
|
|
346
384
|
'<body><div>Hello world!</div></body>'
|
|
347
385
|
"""
|
|
348
|
-
if
|
|
349
|
-
|
|
386
|
+
# faster to check if the document is empty which should determine if we have a root
|
|
387
|
+
if self.document != NULL:
|
|
388
|
+
self.root.unwrap_tags(tags, delete_empty=delete_empty)
|
selectolax/modest/node.pxi
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
cimport cython
|
|
2
|
+
from cpython.exc cimport PyErr_NoMemory
|
|
2
3
|
|
|
3
4
|
from libc.stdlib cimport free
|
|
4
5
|
from libc.stdlib cimport malloc
|
|
@@ -8,12 +9,15 @@ from libc.string cimport memcpy
|
|
|
8
9
|
DEF _STACK_SIZE = 100
|
|
9
10
|
DEF _ENCODING = 'UTF-8'
|
|
10
11
|
|
|
12
|
+
|
|
11
13
|
@cython.final
|
|
12
14
|
cdef class Stack:
|
|
13
15
|
def __cinit__(self, size_t capacity=25):
|
|
14
16
|
self.capacity = capacity
|
|
15
17
|
self.top = 0
|
|
16
18
|
self._stack = <myhtml_tree_node_t**> malloc(capacity * sizeof(myhtml_tree_node_t))
|
|
19
|
+
if self._stack == NULL:
|
|
20
|
+
raise MemoryError("Failed to allocate memory for stack")
|
|
17
21
|
|
|
18
22
|
def __dealloc__(self):
|
|
19
23
|
free(self._stack)
|
|
@@ -21,9 +25,10 @@ cdef class Stack:
|
|
|
21
25
|
cdef bint is_empty(self):
|
|
22
26
|
return self.top <= 0
|
|
23
27
|
|
|
24
|
-
cdef push(self, myhtml_tree_node_t* res):
|
|
28
|
+
cdef int push(self, myhtml_tree_node_t* res) except -1:
|
|
25
29
|
if self.top >= self.capacity:
|
|
26
|
-
self.resize()
|
|
30
|
+
if self.resize() < 0:
|
|
31
|
+
return -1
|
|
27
32
|
self._stack[self.top] = res
|
|
28
33
|
self.top += 1
|
|
29
34
|
|
|
@@ -31,10 +36,13 @@ cdef class Stack:
|
|
|
31
36
|
self.top = self.top - 1
|
|
32
37
|
return self._stack[self.top]
|
|
33
38
|
|
|
34
|
-
cdef resize(self):
|
|
39
|
+
cdef int resize(self) except -1:
|
|
35
40
|
self.capacity *= 2
|
|
36
41
|
self._stack = <myhtml_tree_node_t**> realloc(<void*> self._stack, self.capacity * sizeof(myhtml_tree_node_t))
|
|
37
|
-
|
|
42
|
+
if self._stack == NULL:
|
|
43
|
+
PyErr_NoMemory()
|
|
44
|
+
return -1
|
|
45
|
+
return 0
|
|
38
46
|
|
|
39
47
|
cdef class _Attributes:
|
|
40
48
|
"""A dict-like object that represents attributes."""
|
|
@@ -128,25 +136,24 @@ cdef class _Attributes:
|
|
|
128
136
|
tag_name = c_text.decode(_ENCODING, 'ignore') if c_text != NULL else 'unknown'
|
|
129
137
|
return "<%s attributes, %s items>" % (tag_name, len(self))
|
|
130
138
|
|
|
131
|
-
|
|
132
|
-
|
|
133
139
|
ctypedef fused str_or_Node:
|
|
134
|
-
|
|
140
|
+
str
|
|
135
141
|
bytes
|
|
136
142
|
Node
|
|
137
143
|
|
|
138
|
-
|
|
139
144
|
cdef class Node:
|
|
140
145
|
"""A class that represents HTML node (element)."""
|
|
141
146
|
cdef myhtml_tree_node_t *node
|
|
142
147
|
cdef public HTMLParser parser
|
|
143
148
|
|
|
144
|
-
|
|
145
|
-
cdef
|
|
146
|
-
# custom
|
|
147
|
-
|
|
149
|
+
@staticmethod
|
|
150
|
+
cdef Node new(myhtml_tree_node_t *node, HTMLParser parser):
|
|
151
|
+
# custom __init__ for C, because __cinit__ doesn't accept C types
|
|
152
|
+
cdef Node cls = Node.__new__(Node)
|
|
153
|
+
cls.node = node
|
|
148
154
|
# Keep reference to the selector object, so myhtml structures will not be garbage collected prematurely
|
|
149
|
-
|
|
155
|
+
cls.parser = parser
|
|
156
|
+
return cls
|
|
150
157
|
|
|
151
158
|
@property
|
|
152
159
|
def attributes(self):
|
|
@@ -286,7 +293,7 @@ cdef class Node:
|
|
|
286
293
|
cdef inline _text_deep(self, myhtml_tree_node_t *node, separator='', strip=False):
|
|
287
294
|
text = ""
|
|
288
295
|
cdef Stack stack = Stack(_STACK_SIZE)
|
|
289
|
-
cdef myhtml_tree_node_t* current_node = NULL
|
|
296
|
+
cdef myhtml_tree_node_t* current_node = NULL
|
|
290
297
|
|
|
291
298
|
if node.tag_id == MyHTML_TAG__TEXT:
|
|
292
299
|
c_text = myhtml_node_text(node, NULL)
|
|
@@ -339,12 +346,10 @@ cdef class Node:
|
|
|
339
346
|
node = node.next
|
|
340
347
|
continue
|
|
341
348
|
|
|
342
|
-
next_node = Node()
|
|
343
|
-
next_node._init(node, self.parser)
|
|
349
|
+
next_node = Node.new(node, self.parser)
|
|
344
350
|
yield next_node
|
|
345
351
|
node = node.next
|
|
346
352
|
|
|
347
|
-
|
|
348
353
|
def traverse(self, include_text=False):
|
|
349
354
|
"""Iterate over all child and next nodes starting from the current level.
|
|
350
355
|
|
|
@@ -358,16 +363,15 @@ cdef class Node:
|
|
|
358
363
|
node
|
|
359
364
|
"""
|
|
360
365
|
cdef Stack stack = Stack(_STACK_SIZE)
|
|
361
|
-
cdef myhtml_tree_node_t* current_node = NULL
|
|
362
|
-
cdef Node next_node
|
|
366
|
+
cdef myhtml_tree_node_t* current_node = NULL
|
|
367
|
+
cdef Node next_node
|
|
363
368
|
|
|
364
369
|
stack.push(self.node)
|
|
365
370
|
|
|
366
371
|
while not stack.is_empty():
|
|
367
372
|
current_node = stack.pop()
|
|
368
373
|
if current_node != NULL and not (current_node.tag_id == MyHTML_TAG__TEXT and not include_text):
|
|
369
|
-
next_node = Node()
|
|
370
|
-
next_node._init(current_node, self.parser)
|
|
374
|
+
next_node = Node.new(current_node, self.parser)
|
|
371
375
|
yield next_node
|
|
372
376
|
|
|
373
377
|
if current_node.next is not NULL:
|
|
@@ -396,8 +400,7 @@ cdef class Node:
|
|
|
396
400
|
"""Return the child node."""
|
|
397
401
|
cdef Node node
|
|
398
402
|
if self.node.child:
|
|
399
|
-
node = Node()
|
|
400
|
-
node._init(self.node.child, self.parser)
|
|
403
|
+
node = Node.new(self.node.child, self.parser)
|
|
401
404
|
return node
|
|
402
405
|
return None
|
|
403
406
|
|
|
@@ -406,8 +409,7 @@ cdef class Node:
|
|
|
406
409
|
"""Return the parent node."""
|
|
407
410
|
cdef Node node
|
|
408
411
|
if self.node.parent:
|
|
409
|
-
node = Node()
|
|
410
|
-
node._init(self.node.parent, self.parser)
|
|
412
|
+
node = Node.new(self.node.parent, self.parser)
|
|
411
413
|
return node
|
|
412
414
|
return None
|
|
413
415
|
|
|
@@ -416,8 +418,7 @@ cdef class Node:
|
|
|
416
418
|
"""Return next node."""
|
|
417
419
|
cdef Node node
|
|
418
420
|
if self.node.next:
|
|
419
|
-
node = Node()
|
|
420
|
-
node._init(self.node.next, self.parser)
|
|
421
|
+
node = Node.new(self.node.next, self.parser)
|
|
421
422
|
return node
|
|
422
423
|
return None
|
|
423
424
|
|
|
@@ -426,8 +427,7 @@ cdef class Node:
|
|
|
426
427
|
"""Return previous node."""
|
|
427
428
|
cdef Node node
|
|
428
429
|
if self.node.prev:
|
|
429
|
-
node = Node()
|
|
430
|
-
node._init(self.node.prev, self.parser)
|
|
430
|
+
node = Node.new(self.node.prev, self.parser)
|
|
431
431
|
return node
|
|
432
432
|
return None
|
|
433
433
|
|
|
@@ -436,8 +436,7 @@ cdef class Node:
|
|
|
436
436
|
"""Return last child node."""
|
|
437
437
|
cdef Node node
|
|
438
438
|
if self.node.last_child:
|
|
439
|
-
node = Node()
|
|
440
|
-
node._init(self.node.last_child, self.parser)
|
|
439
|
+
node = Node.new(self.node.last_child, self.parser)
|
|
441
440
|
return node
|
|
442
441
|
return None
|
|
443
442
|
|
|
@@ -515,9 +514,14 @@ cdef class Node:
|
|
|
515
514
|
"""An alias for the decompose method."""
|
|
516
515
|
self.decompose(recursive)
|
|
517
516
|
|
|
518
|
-
def unwrap(self):
|
|
517
|
+
def unwrap(self, delete_empty = False):
|
|
519
518
|
"""Replace node with whatever is inside this node.
|
|
520
519
|
|
|
520
|
+
Parameters
|
|
521
|
+
----------
|
|
522
|
+
delete_empty : bool, default False
|
|
523
|
+
Whether to delete empty tags.
|
|
524
|
+
|
|
521
525
|
Examples
|
|
522
526
|
--------
|
|
523
527
|
|
|
@@ -526,11 +530,14 @@ cdef class Node:
|
|
|
526
530
|
>>> tree.html
|
|
527
531
|
'<html><head></head><body><div>Hello world!</div></body></html>'
|
|
528
532
|
|
|
533
|
+
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
|
|
529
534
|
"""
|
|
530
535
|
if self.node.child == NULL:
|
|
536
|
+
if delete_empty:
|
|
537
|
+
myhtml_node_delete(self.node)
|
|
531
538
|
return
|
|
532
|
-
cdef myhtml_tree_node_t* next_node
|
|
533
|
-
cdef myhtml_tree_node_t* current_node
|
|
539
|
+
cdef myhtml_tree_node_t* next_node
|
|
540
|
+
cdef myhtml_tree_node_t* current_node
|
|
534
541
|
|
|
535
542
|
if self.node.child.next != NULL:
|
|
536
543
|
current_node = self.node.child
|
|
@@ -564,11 +571,13 @@ cdef class Node:
|
|
|
564
571
|
'<html><body><div>Hello world!</div></body></html>'
|
|
565
572
|
|
|
566
573
|
"""
|
|
574
|
+
# ensure cython can recast element to a Node so that decompose will be called sooner.
|
|
575
|
+
cdef Node element
|
|
567
576
|
for tag in tags:
|
|
568
577
|
for element in self.css(tag):
|
|
569
578
|
element.decompose(recursive=recursive)
|
|
570
579
|
|
|
571
|
-
def unwrap_tags(self, list tags):
|
|
580
|
+
def unwrap_tags(self, list tags, delete_empty = False):
|
|
572
581
|
"""Unwraps specified tags from the HTML tree.
|
|
573
582
|
|
|
574
583
|
Works the same as the ``unwrap`` method, but applied to a list of tags.
|
|
@@ -577,6 +586,8 @@ cdef class Node:
|
|
|
577
586
|
----------
|
|
578
587
|
tags : list
|
|
579
588
|
List of tags to remove.
|
|
589
|
+
delete_empty : bool, default False
|
|
590
|
+
Whether to delete empty tags.
|
|
580
591
|
|
|
581
592
|
Examples
|
|
582
593
|
--------
|
|
@@ -585,11 +596,13 @@ cdef class Node:
|
|
|
585
596
|
>>> tree.body.unwrap_tags(['i','a'])
|
|
586
597
|
>>> tree.body.html
|
|
587
598
|
'<body><div>Hello world!</div></body>'
|
|
588
|
-
"""
|
|
589
599
|
|
|
600
|
+
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
|
|
601
|
+
"""
|
|
602
|
+
cdef Node element
|
|
590
603
|
for tag in tags:
|
|
591
604
|
for element in self.css(tag):
|
|
592
|
-
element.unwrap()
|
|
605
|
+
element.unwrap(delete_empty)
|
|
593
606
|
|
|
594
607
|
def replace_with(self, str_or_Node value):
|
|
595
608
|
"""Replace current Node with specified value.
|
|
@@ -752,7 +765,7 @@ cdef class Node:
|
|
|
752
765
|
else:
|
|
753
766
|
raise TypeError("Expected a string or Node instance, but %s found" % type(value).__name__)
|
|
754
767
|
|
|
755
|
-
def unwrap_tags(self, list tags):
|
|
768
|
+
def unwrap_tags(self, list tags, delete_empty = False):
|
|
756
769
|
"""Unwraps specified tags from the HTML tree.
|
|
757
770
|
|
|
758
771
|
Works the same as the ``unwrap`` method, but applied to a list of tags.
|
|
@@ -761,6 +774,8 @@ cdef class Node:
|
|
|
761
774
|
----------
|
|
762
775
|
tags : list
|
|
763
776
|
List of tags to remove.
|
|
777
|
+
delete_empty : bool, default False
|
|
778
|
+
Whether to delete empty tags.
|
|
764
779
|
|
|
765
780
|
Examples
|
|
766
781
|
--------
|
|
@@ -769,11 +784,13 @@ cdef class Node:
|
|
|
769
784
|
>>> tree.body.unwrap_tags(['i','a'])
|
|
770
785
|
>>> tree.body.html
|
|
771
786
|
'<body><div>Hello world!</div></body>'
|
|
772
|
-
"""
|
|
773
787
|
|
|
788
|
+
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
|
|
789
|
+
"""
|
|
790
|
+
cdef Node element
|
|
774
791
|
for tag in tags:
|
|
775
792
|
for element in self.css(tag):
|
|
776
|
-
element.unwrap()
|
|
793
|
+
element.unwrap(delete_empty)
|
|
777
794
|
|
|
778
795
|
@property
|
|
779
796
|
def raw_value(self):
|
|
@@ -829,6 +846,7 @@ cdef class Node:
|
|
|
829
846
|
The query to check.
|
|
830
847
|
|
|
831
848
|
"""
|
|
849
|
+
cdef Node node
|
|
832
850
|
if self.parser.cached_script_texts is None:
|
|
833
851
|
nodes = find_nodes(self.parser, self.node, 'script')
|
|
834
852
|
text_nodes = []
|
|
@@ -877,6 +895,7 @@ cdef class Node:
|
|
|
877
895
|
if not isinstance(other, Node):
|
|
878
896
|
return False
|
|
879
897
|
return self.html == other.html
|
|
898
|
+
|
|
880
899
|
@property
|
|
881
900
|
def text_content(self):
|
|
882
901
|
"""Returns the text of the node if it is a text node.
|
|
@@ -930,8 +949,8 @@ cdef class Node:
|
|
|
930
949
|
while not stack.is_empty():
|
|
931
950
|
current_node = stack.pop()
|
|
932
951
|
|
|
933
|
-
if current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and
|
|
934
|
-
|
|
952
|
+
if (current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and
|
|
953
|
+
current_node.prev.tag_id == MyHTML_TAG__TEXT):
|
|
935
954
|
left_text = myhtml_node_text(current_node.prev, &left_length)
|
|
936
955
|
right_text = myhtml_node_text(current_node, &right_length)
|
|
937
956
|
if left_text and right_text:
|
|
@@ -962,8 +981,8 @@ cdef inline str append_text(str text, str node_text, str separator='', bint stri
|
|
|
962
981
|
|
|
963
982
|
cdef inline bytes to_bytes(str_or_Node value):
|
|
964
983
|
cdef bytes bytes_val
|
|
965
|
-
if isinstance(value,
|
|
966
|
-
bytes_val = value.encode(
|
|
984
|
+
if isinstance(value, unicode):
|
|
985
|
+
bytes_val = <bytes>value.encode("utf-8")
|
|
967
986
|
elif isinstance(value, bytes):
|
|
968
|
-
bytes_val =
|
|
987
|
+
bytes_val = <bytes>value
|
|
969
988
|
return bytes_val
|
selectolax/modest/selection.pxi
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
cimport cython
|
|
2
|
+
from cpython.exc cimport PyErr_SetObject
|
|
3
|
+
|
|
2
4
|
|
|
3
5
|
@cython.final
|
|
4
6
|
cdef class CSSSelector:
|
|
@@ -28,35 +30,33 @@ cdef class CSSSelector:
|
|
|
28
30
|
|
|
29
31
|
return collection
|
|
30
32
|
|
|
31
|
-
|
|
32
|
-
cdef _create_css_parser(self):
|
|
33
|
+
cdef int _create_css_parser(self) except -1:
|
|
33
34
|
cdef mystatus_t status
|
|
34
35
|
|
|
35
36
|
cdef mycss_t *mycss = mycss_create()
|
|
36
37
|
status = mycss_init(mycss)
|
|
37
38
|
|
|
38
39
|
if status != 0:
|
|
39
|
-
|
|
40
|
-
|
|
40
|
+
PyErr_SetObject(RuntimeError, "Can't init MyCSS object.")
|
|
41
|
+
return -1
|
|
41
42
|
|
|
42
43
|
self.css_entry = mycss_entry_create()
|
|
43
44
|
status = mycss_entry_init(mycss, self.css_entry)
|
|
44
45
|
|
|
45
46
|
if status != 0:
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
47
|
+
PyErr_SetObject(RuntimeError, "Can't init MyCSS Entry object.")
|
|
48
|
+
return -1
|
|
49
|
+
return 0
|
|
49
50
|
|
|
50
|
-
cdef _prepare_selector(self, mycss_entry_t *css_entry,
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
myencoding_t.MyENCODING_UTF_8,
|
|
55
|
-
selector, selector_size,
|
|
56
|
-
&out_status)
|
|
51
|
+
cdef int _prepare_selector(self, mycss_entry_t *css_entry, const char *selector, size_t selector_size) except -1:
|
|
52
|
+
cdef mystatus_t out_status
|
|
53
|
+
self.selectors_list = mycss_selectors_parse(mycss_entry_selectors(css_entry), myencoding_t.MyENCODING_UTF_8,
|
|
54
|
+
selector, selector_size, &out_status)
|
|
57
55
|
|
|
58
56
|
if (self.selectors_list == NULL) or (self.selectors_list.flags and MyCSS_SELECTORS_FLAGS_SELECTOR_BAD):
|
|
59
|
-
|
|
57
|
+
PyErr_SetObject(ValueError, "Bad CSS Selectors: %s" % self.c_selector.decode('utf-8'))
|
|
58
|
+
return -1
|
|
59
|
+
return 0
|
|
60
60
|
|
|
61
61
|
def __dealloc__(self):
|
|
62
62
|
mycss_selectors_list_destroy(mycss_entry_selectors(self.css_entry), self.selectors_list, 1)
|
|
@@ -77,12 +77,11 @@ cdef class Selector:
|
|
|
77
77
|
cdef Node node
|
|
78
78
|
cdef list nodes
|
|
79
79
|
|
|
80
|
-
def __init__(self, Node node, query):
|
|
80
|
+
def __init__(self, Node node, str query):
|
|
81
81
|
"""custom init, because __cinit__ doesn't accept C types"""
|
|
82
82
|
self.node = node
|
|
83
83
|
self.nodes = find_nodes(node.parser, node.node, query) if query else [node, ]
|
|
84
84
|
|
|
85
|
-
|
|
86
85
|
cpdef css(self, str query):
|
|
87
86
|
"""Evaluate CSS selector against current scope."""
|
|
88
87
|
cdef Node current_node
|
|
@@ -106,6 +105,7 @@ cdef class Selector:
|
|
|
106
105
|
def text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
|
|
107
106
|
"""Filter all current matches given text."""
|
|
108
107
|
nodes = []
|
|
108
|
+
cdef Node node
|
|
109
109
|
for node in self.nodes:
|
|
110
110
|
node_text = node.text(deep=deep, separator=separator, strip=strip)
|
|
111
111
|
if node_text and text in node_text:
|
|
@@ -116,6 +116,7 @@ cdef class Selector:
|
|
|
116
116
|
def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
|
|
117
117
|
"""Returns True if any node in the current search scope contains specified text"""
|
|
118
118
|
nodes = []
|
|
119
|
+
cdef Node node
|
|
119
120
|
for node in self.nodes:
|
|
120
121
|
node_text = node.text(deep=deep, separator=separator, strip=strip)
|
|
121
122
|
if node_text and text in node_text:
|
|
@@ -142,7 +143,8 @@ cdef class Selector:
|
|
|
142
143
|
|
|
143
144
|
Similar to `string-length` in XPath.
|
|
144
145
|
"""
|
|
145
|
-
nodes = []
|
|
146
|
+
cdef list nodes = []
|
|
147
|
+
cdef Node node
|
|
146
148
|
for node in self.nodes:
|
|
147
149
|
attr = node.attributes.get(attribute)
|
|
148
150
|
if attr and start and start in attr:
|
|
@@ -157,16 +159,15 @@ cdef class Selector:
|
|
|
157
159
|
cdef find_nodes(HTMLParser parser, myhtml_tree_node_t *node, str query):
|
|
158
160
|
cdef myhtml_collection_t *collection
|
|
159
161
|
cdef CSSSelector selector = CSSSelector(query)
|
|
160
|
-
|
|
161
|
-
result =
|
|
162
|
+
cdef Node n
|
|
163
|
+
cdef list result = []
|
|
162
164
|
collection = selector.find(node)
|
|
163
165
|
|
|
164
166
|
if collection == NULL:
|
|
165
167
|
return result
|
|
166
168
|
|
|
167
169
|
for i in range(collection.length):
|
|
168
|
-
n = Node()
|
|
169
|
-
n._init(collection.list[i], parser)
|
|
170
|
+
n = Node.new(collection.list[i], parser)
|
|
170
171
|
result.append(n)
|
|
171
172
|
myhtml_collection_destroy(collection)
|
|
172
173
|
return result
|
|
@@ -176,6 +177,7 @@ cdef bool find_matches(HTMLParser parser, myhtml_tree_node_t *node, tuple select
|
|
|
176
177
|
cdef myhtml_collection_t *collection
|
|
177
178
|
cdef CSSSelector selector
|
|
178
179
|
cdef int collection_size
|
|
180
|
+
cdef str query
|
|
179
181
|
|
|
180
182
|
for query in selectors:
|
|
181
183
|
selector = CSSSelector(query)
|