selectolax 0.3.29__cp310-cp310-musllinux_1_2_aarch64.whl → 0.4.0__cp310-cp310-musllinux_1_2_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +3 -5
- selectolax/lexbor/attrs.pxi +26 -9
- selectolax/lexbor/node.pxi +215 -60
- selectolax/lexbor/node_remove.pxi +29 -0
- selectolax/lexbor/selection.pxi +57 -26
- selectolax/lexbor/util.pxi +1 -0
- selectolax/lexbor.c +24654 -25072
- selectolax/lexbor.cpython-310-aarch64-linux-gnu.so +0 -0
- selectolax/lexbor.pxd +44 -40
- selectolax/lexbor.pyi +847 -65
- selectolax/lexbor.pyx +94 -21
- selectolax/modest/node.pxi +49 -43
- selectolax/modest/selection.pxi +24 -22
- selectolax/modest/util.pxi +1 -0
- selectolax/parser.c +18015 -20066
- selectolax/parser.cpython-310-aarch64-linux-gnu.so +0 -0
- selectolax/parser.pxd +17 -20
- selectolax/parser.pyi +493 -46
- selectolax/parser.pyx +41 -33
- selectolax/utils.pxi +13 -3
- selectolax-0.4.0.dist-info/METADATA +32 -0
- selectolax-0.4.0.dist-info/RECORD +27 -0
- {selectolax-0.3.29.dist-info → selectolax-0.4.0.dist-info}/WHEEL +1 -1
- selectolax-0.3.29.dist-info/METADATA +0 -183
- selectolax-0.3.29.dist-info/RECORD +0 -26
- {selectolax-0.3.29.dist-info → selectolax-0.4.0.dist-info/licenses}/LICENSE +0 -0
- {selectolax-0.3.29.dist-info → selectolax-0.4.0.dist-info}/top_level.txt +0 -0
selectolax/lexbor.pyx
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
from cpython cimport bool
|
|
1
|
+
from cpython.bool cimport bool
|
|
2
|
+
from cpython.exc cimport PyErr_SetObject
|
|
3
|
+
|
|
2
4
|
|
|
3
5
|
_ENCODING = 'UTF-8'
|
|
4
6
|
|
|
@@ -8,6 +10,7 @@ include "lexbor/attrs.pxi"
|
|
|
8
10
|
include "lexbor/node.pxi"
|
|
9
11
|
include "lexbor/selection.pxi"
|
|
10
12
|
include "lexbor/util.pxi"
|
|
13
|
+
include "lexbor/node_remove.pxi"
|
|
11
14
|
|
|
12
15
|
# We don't inherit from HTMLParser here, because it also includes all the C code from Modest.
|
|
13
16
|
|
|
@@ -24,10 +27,8 @@ cdef class LexborHTMLParser:
|
|
|
24
27
|
html : str (unicode) or bytes
|
|
25
28
|
"""
|
|
26
29
|
def __init__(self, html):
|
|
27
|
-
|
|
28
30
|
cdef size_t html_len
|
|
29
|
-
cdef
|
|
30
|
-
|
|
31
|
+
cdef object bytes_html
|
|
31
32
|
bytes_html, html_len = preprocess_input(html)
|
|
32
33
|
self._parse_html(bytes_html, html_len)
|
|
33
34
|
self.raw_html = bytes_html
|
|
@@ -39,22 +40,27 @@ cdef class LexborHTMLParser:
|
|
|
39
40
|
self._selector = LexborCSSSelector()
|
|
40
41
|
return self._selector
|
|
41
42
|
|
|
42
|
-
|
|
43
|
-
cdef _parse_html(self, char *html, size_t html_len):
|
|
43
|
+
cdef int _parse_html(self, char *html, size_t html_len) except -1:
|
|
44
44
|
cdef lxb_status_t status
|
|
45
45
|
|
|
46
46
|
with nogil:
|
|
47
47
|
self.document = lxb_html_document_create()
|
|
48
48
|
|
|
49
49
|
if self.document == NULL:
|
|
50
|
-
|
|
50
|
+
PyErr_SetObject(SelectolaxError, "Failed to initialize object for HTML Document.")
|
|
51
|
+
return -1
|
|
51
52
|
|
|
52
53
|
with nogil:
|
|
53
54
|
status = lxb_html_document_parse(self.document, <lxb_char_t *> html, html_len)
|
|
55
|
+
|
|
54
56
|
if status != 0x0000:
|
|
55
|
-
|
|
57
|
+
PyErr_SetObject(SelectolaxError, "Can't parse HTML.")
|
|
58
|
+
return -1
|
|
56
59
|
|
|
57
|
-
|
|
60
|
+
if self.document == NULL:
|
|
61
|
+
PyErr_SetObject(RuntimeError, "document is NULL even after html was parsed correctly")
|
|
62
|
+
return -1
|
|
63
|
+
return 0
|
|
58
64
|
|
|
59
65
|
def __dealloc__(self):
|
|
60
66
|
if self.document != NULL:
|
|
@@ -68,7 +74,7 @@ cdef class LexborHTMLParser:
|
|
|
68
74
|
"""Returns root node."""
|
|
69
75
|
if self.document == NULL:
|
|
70
76
|
return None
|
|
71
|
-
return LexborNode
|
|
77
|
+
return LexborNode.new(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
|
|
72
78
|
|
|
73
79
|
@property
|
|
74
80
|
def body(self):
|
|
@@ -77,7 +83,7 @@ cdef class LexborHTMLParser:
|
|
|
77
83
|
body = lxb_html_document_body_element_noi(self.document)
|
|
78
84
|
if body == NULL:
|
|
79
85
|
return None
|
|
80
|
-
return LexborNode
|
|
86
|
+
return LexborNode.new(<lxb_dom_node_t *> body, self)
|
|
81
87
|
|
|
82
88
|
@property
|
|
83
89
|
def head(self):
|
|
@@ -86,7 +92,7 @@ cdef class LexborHTMLParser:
|
|
|
86
92
|
head = lxb_html_document_head_element_noi(self.document)
|
|
87
93
|
if head == NULL:
|
|
88
94
|
return None
|
|
89
|
-
return LexborNode
|
|
95
|
+
return LexborNode.new(<lxb_dom_node_t *> head, self)
|
|
90
96
|
|
|
91
97
|
def tags(self, str name):
|
|
92
98
|
"""Returns a list of tags that match specified name.
|
|
@@ -96,6 +102,12 @@ cdef class LexborHTMLParser:
|
|
|
96
102
|
name : str (e.g. div)
|
|
97
103
|
|
|
98
104
|
"""
|
|
105
|
+
|
|
106
|
+
if not name:
|
|
107
|
+
raise ValueError("Tag name cannot be empty")
|
|
108
|
+
if len(name) > 100:
|
|
109
|
+
raise ValueError("Tag name is too long")
|
|
110
|
+
|
|
99
111
|
cdef lxb_dom_collection_t* collection = NULL
|
|
100
112
|
cdef lxb_status_t status
|
|
101
113
|
pybyte_name = name.encode('UTF-8')
|
|
@@ -116,7 +128,7 @@ cdef class LexborHTMLParser:
|
|
|
116
128
|
raise SelectolaxError("Can't locate elements.")
|
|
117
129
|
|
|
118
130
|
for i in range(lxb_dom_collection_length_noi(collection)):
|
|
119
|
-
node = LexborNode
|
|
131
|
+
node = LexborNode.new(
|
|
120
132
|
<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i),
|
|
121
133
|
self
|
|
122
134
|
)
|
|
@@ -150,7 +162,7 @@ cdef class LexborHTMLParser:
|
|
|
150
162
|
"""Return HTML representation of the page."""
|
|
151
163
|
if self.document == NULL:
|
|
152
164
|
return None
|
|
153
|
-
node = LexborNode
|
|
165
|
+
node = LexborNode.new(<lxb_dom_node_t *> &self.document.dom_document, self)
|
|
154
166
|
return node.html
|
|
155
167
|
|
|
156
168
|
def css(self, str query):
|
|
@@ -159,6 +171,11 @@ cdef class LexborHTMLParser:
|
|
|
159
171
|
Matches pattern `query` against HTML tree.
|
|
160
172
|
`CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
|
|
161
173
|
|
|
174
|
+
Special selectors:
|
|
175
|
+
|
|
176
|
+
- parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
|
|
177
|
+
- parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
|
|
178
|
+
|
|
162
179
|
Parameters
|
|
163
180
|
----------
|
|
164
181
|
query : str
|
|
@@ -177,9 +194,9 @@ cdef class LexborHTMLParser:
|
|
|
177
194
|
----------
|
|
178
195
|
|
|
179
196
|
query : str
|
|
180
|
-
default :
|
|
197
|
+
default : Any, default None
|
|
181
198
|
Default value to return if there is no match.
|
|
182
|
-
strict: bool, default
|
|
199
|
+
strict: bool, default False
|
|
183
200
|
Set to True if you want to check if there is strictly only one match in the document.
|
|
184
201
|
|
|
185
202
|
|
|
@@ -196,7 +213,7 @@ cdef class LexborHTMLParser:
|
|
|
196
213
|
----------
|
|
197
214
|
tags : list of str
|
|
198
215
|
List of tags to remove.
|
|
199
|
-
recursive : bool, default
|
|
216
|
+
recursive : bool, default False
|
|
200
217
|
Whether to delete all its child nodes
|
|
201
218
|
|
|
202
219
|
Examples
|
|
@@ -232,7 +249,7 @@ cdef class LexborHTMLParser:
|
|
|
232
249
|
|
|
233
250
|
for i in range(lxb_dom_collection_length_noi(collection)):
|
|
234
251
|
if recursive:
|
|
235
|
-
lxb_dom_node_destroy_deep(
|
|
252
|
+
lxb_dom_node_destroy_deep(<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
|
|
236
253
|
else:
|
|
237
254
|
lxb_dom_node_destroy(<lxb_dom_node_t *> lxb_dom_collection_element_noi(collection, i))
|
|
238
255
|
lxb_dom_collection_destroy(collection, <bint> True)
|
|
@@ -273,7 +290,6 @@ cdef class LexborHTMLParser:
|
|
|
273
290
|
"""
|
|
274
291
|
return self.root.scripts_contain(query)
|
|
275
292
|
|
|
276
|
-
|
|
277
293
|
def script_srcs_contain(self, tuple queries):
|
|
278
294
|
"""Returns True if any of the script SRCs attributes contain on of the specified text.
|
|
279
295
|
|
|
@@ -289,6 +305,26 @@ cdef class LexborHTMLParser:
|
|
|
289
305
|
def css_matches(self, str selector):
|
|
290
306
|
return self.root.css_matches(selector)
|
|
291
307
|
|
|
308
|
+
def merge_text_nodes(self):
|
|
309
|
+
"""Iterates over all text nodes and merges all text nodes that are close to each other.
|
|
310
|
+
|
|
311
|
+
This is useful for text extraction.
|
|
312
|
+
Use it when you need to strip HTML tags and merge "dangling" text.
|
|
313
|
+
|
|
314
|
+
Examples
|
|
315
|
+
--------
|
|
316
|
+
|
|
317
|
+
>>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
|
|
318
|
+
>>> node = tree.css_first('div')
|
|
319
|
+
>>> tree.unwrap_tags(["strong"])
|
|
320
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
321
|
+
"J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
|
|
322
|
+
>>> node.merge_text_nodes()
|
|
323
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
324
|
+
"John Doe"
|
|
325
|
+
"""
|
|
326
|
+
return self.root.merge_text_nodes()
|
|
327
|
+
|
|
292
328
|
@staticmethod
|
|
293
329
|
cdef LexborHTMLParser from_document(lxb_html_document_t *document, bytes raw_html):
|
|
294
330
|
obj = <LexborHTMLParser> LexborHTMLParser.__new__(LexborHTMLParser)
|
|
@@ -300,9 +336,16 @@ cdef class LexborHTMLParser:
|
|
|
300
336
|
return obj
|
|
301
337
|
|
|
302
338
|
def clone(self):
|
|
303
|
-
"""Clone the current
|
|
339
|
+
"""Clone the current node.
|
|
340
|
+
|
|
341
|
+
You can use it to make temporary modifications without affecting the original HTML tree.
|
|
342
|
+
|
|
343
|
+
It is tied to the current parser instance.
|
|
344
|
+
Gets destroyed when parser instance is destroyed.
|
|
345
|
+
"""
|
|
304
346
|
cdef lxb_html_document_t* cloned_document
|
|
305
347
|
cdef lxb_dom_node_t* cloned_node
|
|
348
|
+
cdef LexborHTMLParser cls
|
|
306
349
|
|
|
307
350
|
with nogil:
|
|
308
351
|
cloned_document = lxb_html_document_create()
|
|
@@ -327,6 +370,7 @@ cdef class LexborHTMLParser:
|
|
|
327
370
|
|
|
328
371
|
cls = LexborHTMLParser.from_document(cloned_document, self.raw_html)
|
|
329
372
|
return cls
|
|
373
|
+
|
|
330
374
|
def unwrap_tags(self, list tags, delete_empty = False):
|
|
331
375
|
"""Unwraps specified tags from the HTML tree.
|
|
332
376
|
|
|
@@ -347,5 +391,34 @@ cdef class LexborHTMLParser:
|
|
|
347
391
|
>>> tree.body.html
|
|
348
392
|
'<body><div>Hello world!</div></body>'
|
|
349
393
|
"""
|
|
350
|
-
if
|
|
394
|
+
# faster to check if the document is empty which should determine if we have a root
|
|
395
|
+
if self.document != NULL:
|
|
351
396
|
self.root.unwrap_tags(tags, delete_empty=delete_empty)
|
|
397
|
+
|
|
398
|
+
@property
|
|
399
|
+
def inner_html(self) -> str:
|
|
400
|
+
"""Return HTML representation of the child nodes.
|
|
401
|
+
|
|
402
|
+
Works similar to innerHTML in JavaScript.
|
|
403
|
+
Unlike the `.html` property, does not include the current node.
|
|
404
|
+
Can be used to set HTML as well. See the setter docstring.
|
|
405
|
+
|
|
406
|
+
Returns
|
|
407
|
+
-------
|
|
408
|
+
text : str | None
|
|
409
|
+
"""
|
|
410
|
+
return self.root.inner_html
|
|
411
|
+
|
|
412
|
+
@inner_html.setter
|
|
413
|
+
def inner_html(self, str html):
|
|
414
|
+
"""Set inner HTML to the specified HTML.
|
|
415
|
+
|
|
416
|
+
Replaces existing data inside the node.
|
|
417
|
+
Works similar to innerHTML in JavaScript.
|
|
418
|
+
|
|
419
|
+
Parameters
|
|
420
|
+
----------
|
|
421
|
+
html : str
|
|
422
|
+
|
|
423
|
+
"""
|
|
424
|
+
self.root.inner_html = html
|
selectolax/modest/node.pxi
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
cimport cython
|
|
2
|
+
from cpython.exc cimport PyErr_NoMemory
|
|
2
3
|
|
|
3
4
|
from libc.stdlib cimport free
|
|
4
5
|
from libc.stdlib cimport malloc
|
|
@@ -8,12 +9,15 @@ from libc.string cimport memcpy
|
|
|
8
9
|
DEF _STACK_SIZE = 100
|
|
9
10
|
DEF _ENCODING = 'UTF-8'
|
|
10
11
|
|
|
12
|
+
|
|
11
13
|
@cython.final
|
|
12
14
|
cdef class Stack:
|
|
13
15
|
def __cinit__(self, size_t capacity=25):
|
|
14
16
|
self.capacity = capacity
|
|
15
17
|
self.top = 0
|
|
16
18
|
self._stack = <myhtml_tree_node_t**> malloc(capacity * sizeof(myhtml_tree_node_t))
|
|
19
|
+
if self._stack == NULL:
|
|
20
|
+
raise MemoryError("Failed to allocate memory for stack")
|
|
17
21
|
|
|
18
22
|
def __dealloc__(self):
|
|
19
23
|
free(self._stack)
|
|
@@ -21,9 +25,10 @@ cdef class Stack:
|
|
|
21
25
|
cdef bint is_empty(self):
|
|
22
26
|
return self.top <= 0
|
|
23
27
|
|
|
24
|
-
cdef push(self, myhtml_tree_node_t* res):
|
|
28
|
+
cdef int push(self, myhtml_tree_node_t* res) except -1:
|
|
25
29
|
if self.top >= self.capacity:
|
|
26
|
-
self.resize()
|
|
30
|
+
if self.resize() < 0:
|
|
31
|
+
return -1
|
|
27
32
|
self._stack[self.top] = res
|
|
28
33
|
self.top += 1
|
|
29
34
|
|
|
@@ -31,10 +36,13 @@ cdef class Stack:
|
|
|
31
36
|
self.top = self.top - 1
|
|
32
37
|
return self._stack[self.top]
|
|
33
38
|
|
|
34
|
-
cdef resize(self):
|
|
39
|
+
cdef int resize(self) except -1:
|
|
35
40
|
self.capacity *= 2
|
|
36
41
|
self._stack = <myhtml_tree_node_t**> realloc(<void*> self._stack, self.capacity * sizeof(myhtml_tree_node_t))
|
|
37
|
-
|
|
42
|
+
if self._stack == NULL:
|
|
43
|
+
PyErr_NoMemory()
|
|
44
|
+
return -1
|
|
45
|
+
return 0
|
|
38
46
|
|
|
39
47
|
cdef class _Attributes:
|
|
40
48
|
"""A dict-like object that represents attributes."""
|
|
@@ -128,25 +136,24 @@ cdef class _Attributes:
|
|
|
128
136
|
tag_name = c_text.decode(_ENCODING, 'ignore') if c_text != NULL else 'unknown'
|
|
129
137
|
return "<%s attributes, %s items>" % (tag_name, len(self))
|
|
130
138
|
|
|
131
|
-
|
|
132
|
-
|
|
133
139
|
ctypedef fused str_or_Node:
|
|
134
|
-
|
|
140
|
+
str
|
|
135
141
|
bytes
|
|
136
142
|
Node
|
|
137
143
|
|
|
138
|
-
|
|
139
144
|
cdef class Node:
|
|
140
145
|
"""A class that represents HTML node (element)."""
|
|
141
146
|
cdef myhtml_tree_node_t *node
|
|
142
147
|
cdef public HTMLParser parser
|
|
143
148
|
|
|
144
|
-
|
|
145
|
-
cdef
|
|
146
|
-
# custom
|
|
147
|
-
|
|
149
|
+
@staticmethod
|
|
150
|
+
cdef Node new(myhtml_tree_node_t *node, HTMLParser parser):
|
|
151
|
+
# custom __init__ for C, because __cinit__ doesn't accept C types
|
|
152
|
+
cdef Node cls = Node.__new__(Node)
|
|
153
|
+
cls.node = node
|
|
148
154
|
# Keep reference to the selector object, so myhtml structures will not be garbage collected prematurely
|
|
149
|
-
|
|
155
|
+
cls.parser = parser
|
|
156
|
+
return cls
|
|
150
157
|
|
|
151
158
|
@property
|
|
152
159
|
def attributes(self):
|
|
@@ -286,7 +293,7 @@ cdef class Node:
|
|
|
286
293
|
cdef inline _text_deep(self, myhtml_tree_node_t *node, separator='', strip=False):
|
|
287
294
|
text = ""
|
|
288
295
|
cdef Stack stack = Stack(_STACK_SIZE)
|
|
289
|
-
cdef myhtml_tree_node_t* current_node = NULL
|
|
296
|
+
cdef myhtml_tree_node_t* current_node = NULL
|
|
290
297
|
|
|
291
298
|
if node.tag_id == MyHTML_TAG__TEXT:
|
|
292
299
|
c_text = myhtml_node_text(node, NULL)
|
|
@@ -339,12 +346,10 @@ cdef class Node:
|
|
|
339
346
|
node = node.next
|
|
340
347
|
continue
|
|
341
348
|
|
|
342
|
-
next_node = Node()
|
|
343
|
-
next_node._init(node, self.parser)
|
|
349
|
+
next_node = Node.new(node, self.parser)
|
|
344
350
|
yield next_node
|
|
345
351
|
node = node.next
|
|
346
352
|
|
|
347
|
-
|
|
348
353
|
def traverse(self, include_text=False):
|
|
349
354
|
"""Iterate over all child and next nodes starting from the current level.
|
|
350
355
|
|
|
@@ -358,16 +363,15 @@ cdef class Node:
|
|
|
358
363
|
node
|
|
359
364
|
"""
|
|
360
365
|
cdef Stack stack = Stack(_STACK_SIZE)
|
|
361
|
-
cdef myhtml_tree_node_t* current_node = NULL
|
|
362
|
-
cdef Node next_node
|
|
366
|
+
cdef myhtml_tree_node_t* current_node = NULL
|
|
367
|
+
cdef Node next_node
|
|
363
368
|
|
|
364
369
|
stack.push(self.node)
|
|
365
370
|
|
|
366
371
|
while not stack.is_empty():
|
|
367
372
|
current_node = stack.pop()
|
|
368
373
|
if current_node != NULL and not (current_node.tag_id == MyHTML_TAG__TEXT and not include_text):
|
|
369
|
-
next_node = Node()
|
|
370
|
-
next_node._init(current_node, self.parser)
|
|
374
|
+
next_node = Node.new(current_node, self.parser)
|
|
371
375
|
yield next_node
|
|
372
376
|
|
|
373
377
|
if current_node.next is not NULL:
|
|
@@ -393,11 +397,13 @@ cdef class Node:
|
|
|
393
397
|
|
|
394
398
|
@property
|
|
395
399
|
def child(self):
|
|
396
|
-
"""
|
|
400
|
+
"""Alias for the `first_child` property.
|
|
401
|
+
|
|
402
|
+
**Deprecated**. Please use `first_child` instead.
|
|
403
|
+
"""
|
|
397
404
|
cdef Node node
|
|
398
405
|
if self.node.child:
|
|
399
|
-
node = Node()
|
|
400
|
-
node._init(self.node.child, self.parser)
|
|
406
|
+
node = Node.new(self.node.child, self.parser)
|
|
401
407
|
return node
|
|
402
408
|
return None
|
|
403
409
|
|
|
@@ -406,8 +412,7 @@ cdef class Node:
|
|
|
406
412
|
"""Return the parent node."""
|
|
407
413
|
cdef Node node
|
|
408
414
|
if self.node.parent:
|
|
409
|
-
node = Node()
|
|
410
|
-
node._init(self.node.parent, self.parser)
|
|
415
|
+
node = Node.new(self.node.parent, self.parser)
|
|
411
416
|
return node
|
|
412
417
|
return None
|
|
413
418
|
|
|
@@ -416,8 +421,7 @@ cdef class Node:
|
|
|
416
421
|
"""Return next node."""
|
|
417
422
|
cdef Node node
|
|
418
423
|
if self.node.next:
|
|
419
|
-
node = Node()
|
|
420
|
-
node._init(self.node.next, self.parser)
|
|
424
|
+
node = Node.new(self.node.next, self.parser)
|
|
421
425
|
return node
|
|
422
426
|
return None
|
|
423
427
|
|
|
@@ -426,8 +430,7 @@ cdef class Node:
|
|
|
426
430
|
"""Return previous node."""
|
|
427
431
|
cdef Node node
|
|
428
432
|
if self.node.prev:
|
|
429
|
-
node = Node()
|
|
430
|
-
node._init(self.node.prev, self.parser)
|
|
433
|
+
node = Node.new(self.node.prev, self.parser)
|
|
431
434
|
return node
|
|
432
435
|
return None
|
|
433
436
|
|
|
@@ -436,8 +439,7 @@ cdef class Node:
|
|
|
436
439
|
"""Return last child node."""
|
|
437
440
|
cdef Node node
|
|
438
441
|
if self.node.last_child:
|
|
439
|
-
node = Node()
|
|
440
|
-
node._init(self.node.last_child, self.parser)
|
|
442
|
+
node = Node.new(self.node.last_child, self.parser)
|
|
441
443
|
return node
|
|
442
444
|
return None
|
|
443
445
|
|
|
@@ -537,8 +539,8 @@ cdef class Node:
|
|
|
537
539
|
if delete_empty:
|
|
538
540
|
myhtml_node_delete(self.node)
|
|
539
541
|
return
|
|
540
|
-
cdef myhtml_tree_node_t* next_node
|
|
541
|
-
cdef myhtml_tree_node_t* current_node
|
|
542
|
+
cdef myhtml_tree_node_t* next_node
|
|
543
|
+
cdef myhtml_tree_node_t* current_node
|
|
542
544
|
|
|
543
545
|
if self.node.child.next != NULL:
|
|
544
546
|
current_node = self.node.child
|
|
@@ -572,6 +574,8 @@ cdef class Node:
|
|
|
572
574
|
'<html><body><div>Hello world!</div></body></html>'
|
|
573
575
|
|
|
574
576
|
"""
|
|
577
|
+
# ensure cython can recast element to a Node so that decompose will be called sooner.
|
|
578
|
+
cdef Node element
|
|
575
579
|
for tag in tags:
|
|
576
580
|
for element in self.css(tag):
|
|
577
581
|
element.decompose(recursive=recursive)
|
|
@@ -595,10 +599,10 @@ cdef class Node:
|
|
|
595
599
|
>>> tree.body.unwrap_tags(['i','a'])
|
|
596
600
|
>>> tree.body.html
|
|
597
601
|
'<body><div>Hello world!</div></body>'
|
|
598
|
-
|
|
602
|
+
|
|
599
603
|
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
|
|
600
604
|
"""
|
|
601
|
-
|
|
605
|
+
cdef Node element
|
|
602
606
|
for tag in tags:
|
|
603
607
|
for element in self.css(tag):
|
|
604
608
|
element.unwrap(delete_empty)
|
|
@@ -783,10 +787,10 @@ cdef class Node:
|
|
|
783
787
|
>>> tree.body.unwrap_tags(['i','a'])
|
|
784
788
|
>>> tree.body.html
|
|
785
789
|
'<body><div>Hello world!</div></body>'
|
|
786
|
-
|
|
790
|
+
|
|
787
791
|
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
|
|
788
792
|
"""
|
|
789
|
-
|
|
793
|
+
cdef Node element
|
|
790
794
|
for tag in tags:
|
|
791
795
|
for element in self.css(tag):
|
|
792
796
|
element.unwrap(delete_empty)
|
|
@@ -845,6 +849,7 @@ cdef class Node:
|
|
|
845
849
|
The query to check.
|
|
846
850
|
|
|
847
851
|
"""
|
|
852
|
+
cdef Node node
|
|
848
853
|
if self.parser.cached_script_texts is None:
|
|
849
854
|
nodes = find_nodes(self.parser, self.node, 'script')
|
|
850
855
|
text_nodes = []
|
|
@@ -893,6 +898,7 @@ cdef class Node:
|
|
|
893
898
|
if not isinstance(other, Node):
|
|
894
899
|
return False
|
|
895
900
|
return self.html == other.html
|
|
901
|
+
|
|
896
902
|
@property
|
|
897
903
|
def text_content(self):
|
|
898
904
|
"""Returns the text of the node if it is a text node.
|
|
@@ -946,8 +952,8 @@ cdef class Node:
|
|
|
946
952
|
while not stack.is_empty():
|
|
947
953
|
current_node = stack.pop()
|
|
948
954
|
|
|
949
|
-
if current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and
|
|
950
|
-
|
|
955
|
+
if (current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and
|
|
956
|
+
current_node.prev.tag_id == MyHTML_TAG__TEXT):
|
|
951
957
|
left_text = myhtml_node_text(current_node.prev, &left_length)
|
|
952
958
|
right_text = myhtml_node_text(current_node, &right_length)
|
|
953
959
|
if left_text and right_text:
|
|
@@ -978,8 +984,8 @@ cdef inline str append_text(str text, str node_text, str separator='', bint stri
|
|
|
978
984
|
|
|
979
985
|
cdef inline bytes to_bytes(str_or_Node value):
|
|
980
986
|
cdef bytes bytes_val
|
|
981
|
-
if isinstance(value,
|
|
982
|
-
bytes_val = value.encode(
|
|
987
|
+
if isinstance(value, unicode):
|
|
988
|
+
bytes_val = <bytes>value.encode("utf-8")
|
|
983
989
|
elif isinstance(value, bytes):
|
|
984
|
-
bytes_val =
|
|
990
|
+
bytes_val = <bytes>value
|
|
985
991
|
return bytes_val
|
selectolax/modest/selection.pxi
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
cimport cython
|
|
2
|
+
from cpython.exc cimport PyErr_SetObject
|
|
3
|
+
|
|
2
4
|
|
|
3
5
|
@cython.final
|
|
4
6
|
cdef class CSSSelector:
|
|
@@ -28,35 +30,33 @@ cdef class CSSSelector:
|
|
|
28
30
|
|
|
29
31
|
return collection
|
|
30
32
|
|
|
31
|
-
|
|
32
|
-
cdef _create_css_parser(self):
|
|
33
|
+
cdef int _create_css_parser(self) except -1:
|
|
33
34
|
cdef mystatus_t status
|
|
34
35
|
|
|
35
36
|
cdef mycss_t *mycss = mycss_create()
|
|
36
37
|
status = mycss_init(mycss)
|
|
37
38
|
|
|
38
39
|
if status != 0:
|
|
39
|
-
|
|
40
|
-
|
|
40
|
+
PyErr_SetObject(RuntimeError, "Can't init MyCSS object.")
|
|
41
|
+
return -1
|
|
41
42
|
|
|
42
43
|
self.css_entry = mycss_entry_create()
|
|
43
44
|
status = mycss_entry_init(mycss, self.css_entry)
|
|
44
45
|
|
|
45
46
|
if status != 0:
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
47
|
+
PyErr_SetObject(RuntimeError, "Can't init MyCSS Entry object.")
|
|
48
|
+
return -1
|
|
49
|
+
return 0
|
|
49
50
|
|
|
50
|
-
cdef _prepare_selector(self, mycss_entry_t *css_entry,
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
myencoding_t.MyENCODING_UTF_8,
|
|
55
|
-
selector, selector_size,
|
|
56
|
-
&out_status)
|
|
51
|
+
cdef int _prepare_selector(self, mycss_entry_t *css_entry, const char *selector, size_t selector_size) except -1:
|
|
52
|
+
cdef mystatus_t out_status
|
|
53
|
+
self.selectors_list = mycss_selectors_parse(mycss_entry_selectors(css_entry), myencoding_t.MyENCODING_UTF_8,
|
|
54
|
+
selector, selector_size, &out_status)
|
|
57
55
|
|
|
58
56
|
if (self.selectors_list == NULL) or (self.selectors_list.flags and MyCSS_SELECTORS_FLAGS_SELECTOR_BAD):
|
|
59
|
-
|
|
57
|
+
PyErr_SetObject(ValueError, "Bad CSS Selectors: %s" % self.c_selector.decode('utf-8'))
|
|
58
|
+
return -1
|
|
59
|
+
return 0
|
|
60
60
|
|
|
61
61
|
def __dealloc__(self):
|
|
62
62
|
mycss_selectors_list_destroy(mycss_entry_selectors(self.css_entry), self.selectors_list, 1)
|
|
@@ -77,12 +77,11 @@ cdef class Selector:
|
|
|
77
77
|
cdef Node node
|
|
78
78
|
cdef list nodes
|
|
79
79
|
|
|
80
|
-
def __init__(self, Node node, query):
|
|
80
|
+
def __init__(self, Node node, str query):
|
|
81
81
|
"""custom init, because __cinit__ doesn't accept C types"""
|
|
82
82
|
self.node = node
|
|
83
83
|
self.nodes = find_nodes(node.parser, node.node, query) if query else [node, ]
|
|
84
84
|
|
|
85
|
-
|
|
86
85
|
cpdef css(self, str query):
|
|
87
86
|
"""Evaluate CSS selector against current scope."""
|
|
88
87
|
cdef Node current_node
|
|
@@ -106,6 +105,7 @@ cdef class Selector:
|
|
|
106
105
|
def text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
|
|
107
106
|
"""Filter all current matches given text."""
|
|
108
107
|
nodes = []
|
|
108
|
+
cdef Node node
|
|
109
109
|
for node in self.nodes:
|
|
110
110
|
node_text = node.text(deep=deep, separator=separator, strip=strip)
|
|
111
111
|
if node_text and text in node_text:
|
|
@@ -116,6 +116,7 @@ cdef class Selector:
|
|
|
116
116
|
def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
|
|
117
117
|
"""Returns True if any node in the current search scope contains specified text"""
|
|
118
118
|
nodes = []
|
|
119
|
+
cdef Node node
|
|
119
120
|
for node in self.nodes:
|
|
120
121
|
node_text = node.text(deep=deep, separator=separator, strip=strip)
|
|
121
122
|
if node_text and text in node_text:
|
|
@@ -142,7 +143,8 @@ cdef class Selector:
|
|
|
142
143
|
|
|
143
144
|
Similar to `string-length` in XPath.
|
|
144
145
|
"""
|
|
145
|
-
nodes = []
|
|
146
|
+
cdef list nodes = []
|
|
147
|
+
cdef Node node
|
|
146
148
|
for node in self.nodes:
|
|
147
149
|
attr = node.attributes.get(attribute)
|
|
148
150
|
if attr and start and start in attr:
|
|
@@ -157,16 +159,15 @@ cdef class Selector:
|
|
|
157
159
|
cdef find_nodes(HTMLParser parser, myhtml_tree_node_t *node, str query):
|
|
158
160
|
cdef myhtml_collection_t *collection
|
|
159
161
|
cdef CSSSelector selector = CSSSelector(query)
|
|
160
|
-
|
|
161
|
-
result =
|
|
162
|
+
cdef Node n
|
|
163
|
+
cdef list result = []
|
|
162
164
|
collection = selector.find(node)
|
|
163
165
|
|
|
164
166
|
if collection == NULL:
|
|
165
167
|
return result
|
|
166
168
|
|
|
167
169
|
for i in range(collection.length):
|
|
168
|
-
n = Node()
|
|
169
|
-
n._init(collection.list[i], parser)
|
|
170
|
+
n = Node.new(collection.list[i], parser)
|
|
170
171
|
result.append(n)
|
|
171
172
|
myhtml_collection_destroy(collection)
|
|
172
173
|
return result
|
|
@@ -176,6 +177,7 @@ cdef bool find_matches(HTMLParser parser, myhtml_tree_node_t *node, tuple select
|
|
|
176
177
|
cdef myhtml_collection_t *collection
|
|
177
178
|
cdef CSSSelector selector
|
|
178
179
|
cdef int collection_size
|
|
180
|
+
cdef str query
|
|
179
181
|
|
|
180
182
|
for query in selectors:
|
|
181
183
|
selector = CSSSelector(query)
|