selectolax 0.3.29__cp312-cp312-win32.whl → 0.3.34__cp312-cp312-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +3 -5
- selectolax/lexbor/attrs.pxi +26 -9
- selectolax/lexbor/node.pxi +108 -47
- selectolax/lexbor/selection.pxi +34 -25
- selectolax/lexbor/util.pxi +1 -0
- selectolax/lexbor.c +52987 -55311
- selectolax/lexbor.cp312-win32.pyd +0 -0
- selectolax/lexbor.pxd +36 -40
- selectolax/lexbor.pyi +770 -65
- selectolax/lexbor.pyx +54 -17
- selectolax/modest/node.pxi +45 -42
- selectolax/modest/selection.pxi +24 -22
- selectolax/modest/util.pxi +1 -0
- selectolax/parser.c +50190 -52325
- selectolax/parser.cp312-win32.pyd +0 -0
- selectolax/parser.pxd +17 -20
- selectolax/parser.pyi +489 -45
- selectolax/parser.pyx +39 -31
- selectolax/utils.pxi +13 -3
- selectolax-0.3.34.dist-info/METADATA +32 -0
- selectolax-0.3.34.dist-info/RECORD +26 -0
- {selectolax-0.3.29.dist-info → selectolax-0.3.34.dist-info}/WHEEL +1 -1
- selectolax-0.3.29.dist-info/METADATA +0 -194
- selectolax-0.3.29.dist-info/RECORD +0 -26
- {selectolax-0.3.29.dist-info → selectolax-0.3.34.dist-info}/licenses/LICENSE +0 -0
- {selectolax-0.3.29.dist-info → selectolax-0.3.34.dist-info}/top_level.txt +0 -0
selectolax/lexbor.pyx
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
from cpython cimport bool
|
|
1
|
+
from cpython.bool cimport bool
|
|
2
|
+
from cpython.exc cimport PyErr_SetObject
|
|
2
3
|
|
|
3
4
|
_ENCODING = 'UTF-8'
|
|
4
5
|
|
|
@@ -24,10 +25,8 @@ cdef class LexborHTMLParser:
|
|
|
24
25
|
html : str (unicode) or bytes
|
|
25
26
|
"""
|
|
26
27
|
def __init__(self, html):
|
|
27
|
-
|
|
28
28
|
cdef size_t html_len
|
|
29
|
-
cdef
|
|
30
|
-
|
|
29
|
+
cdef object bytes_html
|
|
31
30
|
bytes_html, html_len = preprocess_input(html)
|
|
32
31
|
self._parse_html(bytes_html, html_len)
|
|
33
32
|
self.raw_html = bytes_html
|
|
@@ -39,22 +38,27 @@ cdef class LexborHTMLParser:
|
|
|
39
38
|
self._selector = LexborCSSSelector()
|
|
40
39
|
return self._selector
|
|
41
40
|
|
|
42
|
-
|
|
43
|
-
cdef _parse_html(self, char *html, size_t html_len):
|
|
41
|
+
cdef int _parse_html(self, char *html, size_t html_len) except -1:
|
|
44
42
|
cdef lxb_status_t status
|
|
45
43
|
|
|
46
44
|
with nogil:
|
|
47
45
|
self.document = lxb_html_document_create()
|
|
48
46
|
|
|
49
47
|
if self.document == NULL:
|
|
50
|
-
|
|
48
|
+
PyErr_SetObject(SelectolaxError, "Failed to initialize object for HTML Document.")
|
|
49
|
+
return -1
|
|
51
50
|
|
|
52
51
|
with nogil:
|
|
53
52
|
status = lxb_html_document_parse(self.document, <lxb_char_t *> html, html_len)
|
|
53
|
+
|
|
54
54
|
if status != 0x0000:
|
|
55
|
-
|
|
55
|
+
PyErr_SetObject(SelectolaxError, "Can't parse HTML.")
|
|
56
|
+
return -1
|
|
56
57
|
|
|
57
|
-
|
|
58
|
+
if self.document == NULL:
|
|
59
|
+
PyErr_SetObject(RuntimeError, "document is NULL even after html was parsed correctly")
|
|
60
|
+
return -1
|
|
61
|
+
return 0
|
|
58
62
|
|
|
59
63
|
def __dealloc__(self):
|
|
60
64
|
if self.document != NULL:
|
|
@@ -68,7 +72,7 @@ cdef class LexborHTMLParser:
|
|
|
68
72
|
"""Returns root node."""
|
|
69
73
|
if self.document == NULL:
|
|
70
74
|
return None
|
|
71
|
-
return LexborNode
|
|
75
|
+
return LexborNode.new(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
|
|
72
76
|
|
|
73
77
|
@property
|
|
74
78
|
def body(self):
|
|
@@ -77,7 +81,7 @@ cdef class LexborHTMLParser:
|
|
|
77
81
|
body = lxb_html_document_body_element_noi(self.document)
|
|
78
82
|
if body == NULL:
|
|
79
83
|
return None
|
|
80
|
-
return LexborNode
|
|
84
|
+
return LexborNode.new(<lxb_dom_node_t *> body, self)
|
|
81
85
|
|
|
82
86
|
@property
|
|
83
87
|
def head(self):
|
|
@@ -86,7 +90,7 @@ cdef class LexborHTMLParser:
|
|
|
86
90
|
head = lxb_html_document_head_element_noi(self.document)
|
|
87
91
|
if head == NULL:
|
|
88
92
|
return None
|
|
89
|
-
return LexborNode
|
|
93
|
+
return LexborNode.new(<lxb_dom_node_t *> head, self)
|
|
90
94
|
|
|
91
95
|
def tags(self, str name):
|
|
92
96
|
"""Returns a list of tags that match specified name.
|
|
@@ -96,6 +100,12 @@ cdef class LexborHTMLParser:
|
|
|
96
100
|
name : str (e.g. div)
|
|
97
101
|
|
|
98
102
|
"""
|
|
103
|
+
|
|
104
|
+
if not name:
|
|
105
|
+
raise ValueError("Tag name cannot be empty")
|
|
106
|
+
if len(name) > 100:
|
|
107
|
+
raise ValueError("Tag name is too long")
|
|
108
|
+
|
|
99
109
|
cdef lxb_dom_collection_t* collection = NULL
|
|
100
110
|
cdef lxb_status_t status
|
|
101
111
|
pybyte_name = name.encode('UTF-8')
|
|
@@ -116,7 +126,7 @@ cdef class LexborHTMLParser:
|
|
|
116
126
|
raise SelectolaxError("Can't locate elements.")
|
|
117
127
|
|
|
118
128
|
for i in range(lxb_dom_collection_length_noi(collection)):
|
|
119
|
-
node = LexborNode
|
|
129
|
+
node = LexborNode.new(
|
|
120
130
|
<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i),
|
|
121
131
|
self
|
|
122
132
|
)
|
|
@@ -150,7 +160,7 @@ cdef class LexborHTMLParser:
|
|
|
150
160
|
"""Return HTML representation of the page."""
|
|
151
161
|
if self.document == NULL:
|
|
152
162
|
return None
|
|
153
|
-
node = LexborNode
|
|
163
|
+
node = LexborNode.new(<lxb_dom_node_t *> &self.document.dom_document, self)
|
|
154
164
|
return node.html
|
|
155
165
|
|
|
156
166
|
def css(self, str query):
|
|
@@ -159,6 +169,11 @@ cdef class LexborHTMLParser:
|
|
|
159
169
|
Matches pattern `query` against HTML tree.
|
|
160
170
|
`CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
|
|
161
171
|
|
|
172
|
+
Special selectors:
|
|
173
|
+
|
|
174
|
+
- parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
|
|
175
|
+
- parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
|
|
176
|
+
|
|
162
177
|
Parameters
|
|
163
178
|
----------
|
|
164
179
|
query : str
|
|
@@ -232,7 +247,7 @@ cdef class LexborHTMLParser:
|
|
|
232
247
|
|
|
233
248
|
for i in range(lxb_dom_collection_length_noi(collection)):
|
|
234
249
|
if recursive:
|
|
235
|
-
lxb_dom_node_destroy_deep(
|
|
250
|
+
lxb_dom_node_destroy_deep(<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
|
|
236
251
|
else:
|
|
237
252
|
lxb_dom_node_destroy(<lxb_dom_node_t *> lxb_dom_collection_element_noi(collection, i))
|
|
238
253
|
lxb_dom_collection_destroy(collection, <bint> True)
|
|
@@ -273,7 +288,6 @@ cdef class LexborHTMLParser:
|
|
|
273
288
|
"""
|
|
274
289
|
return self.root.scripts_contain(query)
|
|
275
290
|
|
|
276
|
-
|
|
277
291
|
def script_srcs_contain(self, tuple queries):
|
|
278
292
|
"""Returns True if any of the script SRCs attributes contain on of the specified text.
|
|
279
293
|
|
|
@@ -289,6 +303,26 @@ cdef class LexborHTMLParser:
|
|
|
289
303
|
def css_matches(self, str selector):
|
|
290
304
|
return self.root.css_matches(selector)
|
|
291
305
|
|
|
306
|
+
def merge_text_nodes(self):
|
|
307
|
+
"""Iterates over all text nodes and merges all text nodes that are close to each other.
|
|
308
|
+
|
|
309
|
+
This is useful for text extraction.
|
|
310
|
+
Use it when you need to strip HTML tags and merge "dangling" text.
|
|
311
|
+
|
|
312
|
+
Examples
|
|
313
|
+
--------
|
|
314
|
+
|
|
315
|
+
>>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
|
|
316
|
+
>>> node = tree.css_first('div')
|
|
317
|
+
>>> tree.unwrap_tags(["strong"])
|
|
318
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
319
|
+
"J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
|
|
320
|
+
>>> node.merge_text_nodes()
|
|
321
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
322
|
+
"John Doe"
|
|
323
|
+
"""
|
|
324
|
+
return self.root.merge_text_nodes()
|
|
325
|
+
|
|
292
326
|
@staticmethod
|
|
293
327
|
cdef LexborHTMLParser from_document(lxb_html_document_t *document, bytes raw_html):
|
|
294
328
|
obj = <LexborHTMLParser> LexborHTMLParser.__new__(LexborHTMLParser)
|
|
@@ -303,6 +337,7 @@ cdef class LexborHTMLParser:
|
|
|
303
337
|
"""Clone the current tree."""
|
|
304
338
|
cdef lxb_html_document_t* cloned_document
|
|
305
339
|
cdef lxb_dom_node_t* cloned_node
|
|
340
|
+
cdef LexborHTMLParser cls
|
|
306
341
|
|
|
307
342
|
with nogil:
|
|
308
343
|
cloned_document = lxb_html_document_create()
|
|
@@ -327,6 +362,7 @@ cdef class LexborHTMLParser:
|
|
|
327
362
|
|
|
328
363
|
cls = LexborHTMLParser.from_document(cloned_document, self.raw_html)
|
|
329
364
|
return cls
|
|
365
|
+
|
|
330
366
|
def unwrap_tags(self, list tags, delete_empty = False):
|
|
331
367
|
"""Unwraps specified tags from the HTML tree.
|
|
332
368
|
|
|
@@ -347,5 +383,6 @@ cdef class LexborHTMLParser:
|
|
|
347
383
|
>>> tree.body.html
|
|
348
384
|
'<body><div>Hello world!</div></body>'
|
|
349
385
|
"""
|
|
350
|
-
if
|
|
386
|
+
# faster to check if the document is empty which should determine if we have a root
|
|
387
|
+
if self.document != NULL:
|
|
351
388
|
self.root.unwrap_tags(tags, delete_empty=delete_empty)
|
selectolax/modest/node.pxi
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
cimport cython
|
|
2
|
+
from cpython.exc cimport PyErr_NoMemory
|
|
2
3
|
|
|
3
4
|
from libc.stdlib cimport free
|
|
4
5
|
from libc.stdlib cimport malloc
|
|
@@ -8,12 +9,15 @@ from libc.string cimport memcpy
|
|
|
8
9
|
DEF _STACK_SIZE = 100
|
|
9
10
|
DEF _ENCODING = 'UTF-8'
|
|
10
11
|
|
|
12
|
+
|
|
11
13
|
@cython.final
|
|
12
14
|
cdef class Stack:
|
|
13
15
|
def __cinit__(self, size_t capacity=25):
|
|
14
16
|
self.capacity = capacity
|
|
15
17
|
self.top = 0
|
|
16
18
|
self._stack = <myhtml_tree_node_t**> malloc(capacity * sizeof(myhtml_tree_node_t))
|
|
19
|
+
if self._stack == NULL:
|
|
20
|
+
raise MemoryError("Failed to allocate memory for stack")
|
|
17
21
|
|
|
18
22
|
def __dealloc__(self):
|
|
19
23
|
free(self._stack)
|
|
@@ -21,9 +25,10 @@ cdef class Stack:
|
|
|
21
25
|
cdef bint is_empty(self):
|
|
22
26
|
return self.top <= 0
|
|
23
27
|
|
|
24
|
-
cdef push(self, myhtml_tree_node_t* res):
|
|
28
|
+
cdef int push(self, myhtml_tree_node_t* res) except -1:
|
|
25
29
|
if self.top >= self.capacity:
|
|
26
|
-
self.resize()
|
|
30
|
+
if self.resize() < 0:
|
|
31
|
+
return -1
|
|
27
32
|
self._stack[self.top] = res
|
|
28
33
|
self.top += 1
|
|
29
34
|
|
|
@@ -31,10 +36,13 @@ cdef class Stack:
|
|
|
31
36
|
self.top = self.top - 1
|
|
32
37
|
return self._stack[self.top]
|
|
33
38
|
|
|
34
|
-
cdef resize(self):
|
|
39
|
+
cdef int resize(self) except -1:
|
|
35
40
|
self.capacity *= 2
|
|
36
41
|
self._stack = <myhtml_tree_node_t**> realloc(<void*> self._stack, self.capacity * sizeof(myhtml_tree_node_t))
|
|
37
|
-
|
|
42
|
+
if self._stack == NULL:
|
|
43
|
+
PyErr_NoMemory()
|
|
44
|
+
return -1
|
|
45
|
+
return 0
|
|
38
46
|
|
|
39
47
|
cdef class _Attributes:
|
|
40
48
|
"""A dict-like object that represents attributes."""
|
|
@@ -128,25 +136,24 @@ cdef class _Attributes:
|
|
|
128
136
|
tag_name = c_text.decode(_ENCODING, 'ignore') if c_text != NULL else 'unknown'
|
|
129
137
|
return "<%s attributes, %s items>" % (tag_name, len(self))
|
|
130
138
|
|
|
131
|
-
|
|
132
|
-
|
|
133
139
|
ctypedef fused str_or_Node:
|
|
134
|
-
|
|
140
|
+
str
|
|
135
141
|
bytes
|
|
136
142
|
Node
|
|
137
143
|
|
|
138
|
-
|
|
139
144
|
cdef class Node:
|
|
140
145
|
"""A class that represents HTML node (element)."""
|
|
141
146
|
cdef myhtml_tree_node_t *node
|
|
142
147
|
cdef public HTMLParser parser
|
|
143
148
|
|
|
144
|
-
|
|
145
|
-
cdef
|
|
146
|
-
# custom
|
|
147
|
-
|
|
149
|
+
@staticmethod
|
|
150
|
+
cdef Node new(myhtml_tree_node_t *node, HTMLParser parser):
|
|
151
|
+
# custom __init__ for C, because __cinit__ doesn't accept C types
|
|
152
|
+
cdef Node cls = Node.__new__(Node)
|
|
153
|
+
cls.node = node
|
|
148
154
|
# Keep reference to the selector object, so myhtml structures will not be garbage collected prematurely
|
|
149
|
-
|
|
155
|
+
cls.parser = parser
|
|
156
|
+
return cls
|
|
150
157
|
|
|
151
158
|
@property
|
|
152
159
|
def attributes(self):
|
|
@@ -286,7 +293,7 @@ cdef class Node:
|
|
|
286
293
|
cdef inline _text_deep(self, myhtml_tree_node_t *node, separator='', strip=False):
|
|
287
294
|
text = ""
|
|
288
295
|
cdef Stack stack = Stack(_STACK_SIZE)
|
|
289
|
-
cdef myhtml_tree_node_t* current_node = NULL
|
|
296
|
+
cdef myhtml_tree_node_t* current_node = NULL
|
|
290
297
|
|
|
291
298
|
if node.tag_id == MyHTML_TAG__TEXT:
|
|
292
299
|
c_text = myhtml_node_text(node, NULL)
|
|
@@ -339,12 +346,10 @@ cdef class Node:
|
|
|
339
346
|
node = node.next
|
|
340
347
|
continue
|
|
341
348
|
|
|
342
|
-
next_node = Node()
|
|
343
|
-
next_node._init(node, self.parser)
|
|
349
|
+
next_node = Node.new(node, self.parser)
|
|
344
350
|
yield next_node
|
|
345
351
|
node = node.next
|
|
346
352
|
|
|
347
|
-
|
|
348
353
|
def traverse(self, include_text=False):
|
|
349
354
|
"""Iterate over all child and next nodes starting from the current level.
|
|
350
355
|
|
|
@@ -358,16 +363,15 @@ cdef class Node:
|
|
|
358
363
|
node
|
|
359
364
|
"""
|
|
360
365
|
cdef Stack stack = Stack(_STACK_SIZE)
|
|
361
|
-
cdef myhtml_tree_node_t* current_node = NULL
|
|
362
|
-
cdef Node next_node
|
|
366
|
+
cdef myhtml_tree_node_t* current_node = NULL
|
|
367
|
+
cdef Node next_node
|
|
363
368
|
|
|
364
369
|
stack.push(self.node)
|
|
365
370
|
|
|
366
371
|
while not stack.is_empty():
|
|
367
372
|
current_node = stack.pop()
|
|
368
373
|
if current_node != NULL and not (current_node.tag_id == MyHTML_TAG__TEXT and not include_text):
|
|
369
|
-
next_node = Node()
|
|
370
|
-
next_node._init(current_node, self.parser)
|
|
374
|
+
next_node = Node.new(current_node, self.parser)
|
|
371
375
|
yield next_node
|
|
372
376
|
|
|
373
377
|
if current_node.next is not NULL:
|
|
@@ -396,8 +400,7 @@ cdef class Node:
|
|
|
396
400
|
"""Return the child node."""
|
|
397
401
|
cdef Node node
|
|
398
402
|
if self.node.child:
|
|
399
|
-
node = Node()
|
|
400
|
-
node._init(self.node.child, self.parser)
|
|
403
|
+
node = Node.new(self.node.child, self.parser)
|
|
401
404
|
return node
|
|
402
405
|
return None
|
|
403
406
|
|
|
@@ -406,8 +409,7 @@ cdef class Node:
|
|
|
406
409
|
"""Return the parent node."""
|
|
407
410
|
cdef Node node
|
|
408
411
|
if self.node.parent:
|
|
409
|
-
node = Node()
|
|
410
|
-
node._init(self.node.parent, self.parser)
|
|
412
|
+
node = Node.new(self.node.parent, self.parser)
|
|
411
413
|
return node
|
|
412
414
|
return None
|
|
413
415
|
|
|
@@ -416,8 +418,7 @@ cdef class Node:
|
|
|
416
418
|
"""Return next node."""
|
|
417
419
|
cdef Node node
|
|
418
420
|
if self.node.next:
|
|
419
|
-
node = Node()
|
|
420
|
-
node._init(self.node.next, self.parser)
|
|
421
|
+
node = Node.new(self.node.next, self.parser)
|
|
421
422
|
return node
|
|
422
423
|
return None
|
|
423
424
|
|
|
@@ -426,8 +427,7 @@ cdef class Node:
|
|
|
426
427
|
"""Return previous node."""
|
|
427
428
|
cdef Node node
|
|
428
429
|
if self.node.prev:
|
|
429
|
-
node = Node()
|
|
430
|
-
node._init(self.node.prev, self.parser)
|
|
430
|
+
node = Node.new(self.node.prev, self.parser)
|
|
431
431
|
return node
|
|
432
432
|
return None
|
|
433
433
|
|
|
@@ -436,8 +436,7 @@ cdef class Node:
|
|
|
436
436
|
"""Return last child node."""
|
|
437
437
|
cdef Node node
|
|
438
438
|
if self.node.last_child:
|
|
439
|
-
node = Node()
|
|
440
|
-
node._init(self.node.last_child, self.parser)
|
|
439
|
+
node = Node.new(self.node.last_child, self.parser)
|
|
441
440
|
return node
|
|
442
441
|
return None
|
|
443
442
|
|
|
@@ -537,8 +536,8 @@ cdef class Node:
|
|
|
537
536
|
if delete_empty:
|
|
538
537
|
myhtml_node_delete(self.node)
|
|
539
538
|
return
|
|
540
|
-
cdef myhtml_tree_node_t* next_node
|
|
541
|
-
cdef myhtml_tree_node_t* current_node
|
|
539
|
+
cdef myhtml_tree_node_t* next_node
|
|
540
|
+
cdef myhtml_tree_node_t* current_node
|
|
542
541
|
|
|
543
542
|
if self.node.child.next != NULL:
|
|
544
543
|
current_node = self.node.child
|
|
@@ -572,6 +571,8 @@ cdef class Node:
|
|
|
572
571
|
'<html><body><div>Hello world!</div></body></html>'
|
|
573
572
|
|
|
574
573
|
"""
|
|
574
|
+
# ensure cython can recast element to a Node so that decompose will be called sooner.
|
|
575
|
+
cdef Node element
|
|
575
576
|
for tag in tags:
|
|
576
577
|
for element in self.css(tag):
|
|
577
578
|
element.decompose(recursive=recursive)
|
|
@@ -595,10 +596,10 @@ cdef class Node:
|
|
|
595
596
|
>>> tree.body.unwrap_tags(['i','a'])
|
|
596
597
|
>>> tree.body.html
|
|
597
598
|
'<body><div>Hello world!</div></body>'
|
|
598
|
-
|
|
599
|
+
|
|
599
600
|
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
|
|
600
601
|
"""
|
|
601
|
-
|
|
602
|
+
cdef Node element
|
|
602
603
|
for tag in tags:
|
|
603
604
|
for element in self.css(tag):
|
|
604
605
|
element.unwrap(delete_empty)
|
|
@@ -783,10 +784,10 @@ cdef class Node:
|
|
|
783
784
|
>>> tree.body.unwrap_tags(['i','a'])
|
|
784
785
|
>>> tree.body.html
|
|
785
786
|
'<body><div>Hello world!</div></body>'
|
|
786
|
-
|
|
787
|
+
|
|
787
788
|
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
|
|
788
789
|
"""
|
|
789
|
-
|
|
790
|
+
cdef Node element
|
|
790
791
|
for tag in tags:
|
|
791
792
|
for element in self.css(tag):
|
|
792
793
|
element.unwrap(delete_empty)
|
|
@@ -845,6 +846,7 @@ cdef class Node:
|
|
|
845
846
|
The query to check.
|
|
846
847
|
|
|
847
848
|
"""
|
|
849
|
+
cdef Node node
|
|
848
850
|
if self.parser.cached_script_texts is None:
|
|
849
851
|
nodes = find_nodes(self.parser, self.node, 'script')
|
|
850
852
|
text_nodes = []
|
|
@@ -893,6 +895,7 @@ cdef class Node:
|
|
|
893
895
|
if not isinstance(other, Node):
|
|
894
896
|
return False
|
|
895
897
|
return self.html == other.html
|
|
898
|
+
|
|
896
899
|
@property
|
|
897
900
|
def text_content(self):
|
|
898
901
|
"""Returns the text of the node if it is a text node.
|
|
@@ -946,8 +949,8 @@ cdef class Node:
|
|
|
946
949
|
while not stack.is_empty():
|
|
947
950
|
current_node = stack.pop()
|
|
948
951
|
|
|
949
|
-
if current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and
|
|
950
|
-
|
|
952
|
+
if (current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and
|
|
953
|
+
current_node.prev.tag_id == MyHTML_TAG__TEXT):
|
|
951
954
|
left_text = myhtml_node_text(current_node.prev, &left_length)
|
|
952
955
|
right_text = myhtml_node_text(current_node, &right_length)
|
|
953
956
|
if left_text and right_text:
|
|
@@ -978,8 +981,8 @@ cdef inline str append_text(str text, str node_text, str separator='', bint stri
|
|
|
978
981
|
|
|
979
982
|
cdef inline bytes to_bytes(str_or_Node value):
|
|
980
983
|
cdef bytes bytes_val
|
|
981
|
-
if isinstance(value,
|
|
982
|
-
bytes_val = value.encode(
|
|
984
|
+
if isinstance(value, unicode):
|
|
985
|
+
bytes_val = <bytes>value.encode("utf-8")
|
|
983
986
|
elif isinstance(value, bytes):
|
|
984
|
-
bytes_val =
|
|
987
|
+
bytes_val = <bytes>value
|
|
985
988
|
return bytes_val
|
selectolax/modest/selection.pxi
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
cimport cython
|
|
2
|
+
from cpython.exc cimport PyErr_SetObject
|
|
3
|
+
|
|
2
4
|
|
|
3
5
|
@cython.final
|
|
4
6
|
cdef class CSSSelector:
|
|
@@ -28,35 +30,33 @@ cdef class CSSSelector:
|
|
|
28
30
|
|
|
29
31
|
return collection
|
|
30
32
|
|
|
31
|
-
|
|
32
|
-
cdef _create_css_parser(self):
|
|
33
|
+
cdef int _create_css_parser(self) except -1:
|
|
33
34
|
cdef mystatus_t status
|
|
34
35
|
|
|
35
36
|
cdef mycss_t *mycss = mycss_create()
|
|
36
37
|
status = mycss_init(mycss)
|
|
37
38
|
|
|
38
39
|
if status != 0:
|
|
39
|
-
|
|
40
|
-
|
|
40
|
+
PyErr_SetObject(RuntimeError, "Can't init MyCSS object.")
|
|
41
|
+
return -1
|
|
41
42
|
|
|
42
43
|
self.css_entry = mycss_entry_create()
|
|
43
44
|
status = mycss_entry_init(mycss, self.css_entry)
|
|
44
45
|
|
|
45
46
|
if status != 0:
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
47
|
+
PyErr_SetObject(RuntimeError, "Can't init MyCSS Entry object.")
|
|
48
|
+
return -1
|
|
49
|
+
return 0
|
|
49
50
|
|
|
50
|
-
cdef _prepare_selector(self, mycss_entry_t *css_entry,
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
myencoding_t.MyENCODING_UTF_8,
|
|
55
|
-
selector, selector_size,
|
|
56
|
-
&out_status)
|
|
51
|
+
cdef int _prepare_selector(self, mycss_entry_t *css_entry, const char *selector, size_t selector_size) except -1:
|
|
52
|
+
cdef mystatus_t out_status
|
|
53
|
+
self.selectors_list = mycss_selectors_parse(mycss_entry_selectors(css_entry), myencoding_t.MyENCODING_UTF_8,
|
|
54
|
+
selector, selector_size, &out_status)
|
|
57
55
|
|
|
58
56
|
if (self.selectors_list == NULL) or (self.selectors_list.flags and MyCSS_SELECTORS_FLAGS_SELECTOR_BAD):
|
|
59
|
-
|
|
57
|
+
PyErr_SetObject(ValueError, "Bad CSS Selectors: %s" % self.c_selector.decode('utf-8'))
|
|
58
|
+
return -1
|
|
59
|
+
return 0
|
|
60
60
|
|
|
61
61
|
def __dealloc__(self):
|
|
62
62
|
mycss_selectors_list_destroy(mycss_entry_selectors(self.css_entry), self.selectors_list, 1)
|
|
@@ -77,12 +77,11 @@ cdef class Selector:
|
|
|
77
77
|
cdef Node node
|
|
78
78
|
cdef list nodes
|
|
79
79
|
|
|
80
|
-
def __init__(self, Node node, query):
|
|
80
|
+
def __init__(self, Node node, str query):
|
|
81
81
|
"""custom init, because __cinit__ doesn't accept C types"""
|
|
82
82
|
self.node = node
|
|
83
83
|
self.nodes = find_nodes(node.parser, node.node, query) if query else [node, ]
|
|
84
84
|
|
|
85
|
-
|
|
86
85
|
cpdef css(self, str query):
|
|
87
86
|
"""Evaluate CSS selector against current scope."""
|
|
88
87
|
cdef Node current_node
|
|
@@ -106,6 +105,7 @@ cdef class Selector:
|
|
|
106
105
|
def text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
|
|
107
106
|
"""Filter all current matches given text."""
|
|
108
107
|
nodes = []
|
|
108
|
+
cdef Node node
|
|
109
109
|
for node in self.nodes:
|
|
110
110
|
node_text = node.text(deep=deep, separator=separator, strip=strip)
|
|
111
111
|
if node_text and text in node_text:
|
|
@@ -116,6 +116,7 @@ cdef class Selector:
|
|
|
116
116
|
def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
|
|
117
117
|
"""Returns True if any node in the current search scope contains specified text"""
|
|
118
118
|
nodes = []
|
|
119
|
+
cdef Node node
|
|
119
120
|
for node in self.nodes:
|
|
120
121
|
node_text = node.text(deep=deep, separator=separator, strip=strip)
|
|
121
122
|
if node_text and text in node_text:
|
|
@@ -142,7 +143,8 @@ cdef class Selector:
|
|
|
142
143
|
|
|
143
144
|
Similar to `string-length` in XPath.
|
|
144
145
|
"""
|
|
145
|
-
nodes = []
|
|
146
|
+
cdef list nodes = []
|
|
147
|
+
cdef Node node
|
|
146
148
|
for node in self.nodes:
|
|
147
149
|
attr = node.attributes.get(attribute)
|
|
148
150
|
if attr and start and start in attr:
|
|
@@ -157,16 +159,15 @@ cdef class Selector:
|
|
|
157
159
|
cdef find_nodes(HTMLParser parser, myhtml_tree_node_t *node, str query):
|
|
158
160
|
cdef myhtml_collection_t *collection
|
|
159
161
|
cdef CSSSelector selector = CSSSelector(query)
|
|
160
|
-
|
|
161
|
-
result =
|
|
162
|
+
cdef Node n
|
|
163
|
+
cdef list result = []
|
|
162
164
|
collection = selector.find(node)
|
|
163
165
|
|
|
164
166
|
if collection == NULL:
|
|
165
167
|
return result
|
|
166
168
|
|
|
167
169
|
for i in range(collection.length):
|
|
168
|
-
n = Node()
|
|
169
|
-
n._init(collection.list[i], parser)
|
|
170
|
+
n = Node.new(collection.list[i], parser)
|
|
170
171
|
result.append(n)
|
|
171
172
|
myhtml_collection_destroy(collection)
|
|
172
173
|
return result
|
|
@@ -176,6 +177,7 @@ cdef bool find_matches(HTMLParser parser, myhtml_tree_node_t *node, tuple select
|
|
|
176
177
|
cdef myhtml_collection_t *collection
|
|
177
178
|
cdef CSSSelector selector
|
|
178
179
|
cdef int collection_size
|
|
180
|
+
cdef str query
|
|
179
181
|
|
|
180
182
|
for query in selectors:
|
|
181
183
|
selector = CSSSelector(query)
|