selectolax 0.3.29__cp310-cp310-musllinux_1_2_aarch64.whl → 0.4.0__cp310-cp310-musllinux_1_2_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of selectolax might be problematic. Click here for more details.

selectolax/lexbor.pyx CHANGED
@@ -1,4 +1,6 @@
1
- from cpython cimport bool
1
+ from cpython.bool cimport bool
2
+ from cpython.exc cimport PyErr_SetObject
3
+
2
4
 
3
5
  _ENCODING = 'UTF-8'
4
6
 
@@ -8,6 +10,7 @@ include "lexbor/attrs.pxi"
8
10
  include "lexbor/node.pxi"
9
11
  include "lexbor/selection.pxi"
10
12
  include "lexbor/util.pxi"
13
+ include "lexbor/node_remove.pxi"
11
14
 
12
15
  # We don't inherit from HTMLParser here, because it also includes all the C code from Modest.
13
16
 
@@ -24,10 +27,8 @@ cdef class LexborHTMLParser:
24
27
  html : str (unicode) or bytes
25
28
  """
26
29
  def __init__(self, html):
27
-
28
30
  cdef size_t html_len
29
- cdef char* html_chars
30
-
31
+ cdef object bytes_html
31
32
  bytes_html, html_len = preprocess_input(html)
32
33
  self._parse_html(bytes_html, html_len)
33
34
  self.raw_html = bytes_html
@@ -39,22 +40,27 @@ cdef class LexborHTMLParser:
39
40
  self._selector = LexborCSSSelector()
40
41
  return self._selector
41
42
 
42
-
43
- cdef _parse_html(self, char *html, size_t html_len):
43
+ cdef int _parse_html(self, char *html, size_t html_len) except -1:
44
44
  cdef lxb_status_t status
45
45
 
46
46
  with nogil:
47
47
  self.document = lxb_html_document_create()
48
48
 
49
49
  if self.document == NULL:
50
- raise SelectolaxError("Failed to initialize object for HTML Document.")
50
+ PyErr_SetObject(SelectolaxError, "Failed to initialize object for HTML Document.")
51
+ return -1
51
52
 
52
53
  with nogil:
53
54
  status = lxb_html_document_parse(self.document, <lxb_char_t *> html, html_len)
55
+
54
56
  if status != 0x0000:
55
- raise SelectolaxError("Can't parse HTML.")
57
+ PyErr_SetObject(SelectolaxError, "Can't parse HTML.")
58
+ return -1
56
59
 
57
- assert self.document != NULL
60
+ if self.document == NULL:
61
+ PyErr_SetObject(RuntimeError, "document is NULL even after html was parsed correctly")
62
+ return -1
63
+ return 0
58
64
 
59
65
  def __dealloc__(self):
60
66
  if self.document != NULL:
@@ -68,7 +74,7 @@ cdef class LexborHTMLParser:
68
74
  """Returns root node."""
69
75
  if self.document == NULL:
70
76
  return None
71
- return LexborNode()._cinit(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
77
+ return LexborNode.new(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
72
78
 
73
79
  @property
74
80
  def body(self):
@@ -77,7 +83,7 @@ cdef class LexborHTMLParser:
77
83
  body = lxb_html_document_body_element_noi(self.document)
78
84
  if body == NULL:
79
85
  return None
80
- return LexborNode()._cinit(<lxb_dom_node_t *> body, self)
86
+ return LexborNode.new(<lxb_dom_node_t *> body, self)
81
87
 
82
88
  @property
83
89
  def head(self):
@@ -86,7 +92,7 @@ cdef class LexborHTMLParser:
86
92
  head = lxb_html_document_head_element_noi(self.document)
87
93
  if head == NULL:
88
94
  return None
89
- return LexborNode()._cinit(<lxb_dom_node_t *> head, self)
95
+ return LexborNode.new(<lxb_dom_node_t *> head, self)
90
96
 
91
97
  def tags(self, str name):
92
98
  """Returns a list of tags that match specified name.
@@ -96,6 +102,12 @@ cdef class LexborHTMLParser:
96
102
  name : str (e.g. div)
97
103
 
98
104
  """
105
+
106
+ if not name:
107
+ raise ValueError("Tag name cannot be empty")
108
+ if len(name) > 100:
109
+ raise ValueError("Tag name is too long")
110
+
99
111
  cdef lxb_dom_collection_t* collection = NULL
100
112
  cdef lxb_status_t status
101
113
  pybyte_name = name.encode('UTF-8')
@@ -116,7 +128,7 @@ cdef class LexborHTMLParser:
116
128
  raise SelectolaxError("Can't locate elements.")
117
129
 
118
130
  for i in range(lxb_dom_collection_length_noi(collection)):
119
- node = LexborNode()._cinit(
131
+ node = LexborNode.new(
120
132
  <lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i),
121
133
  self
122
134
  )
@@ -150,7 +162,7 @@ cdef class LexborHTMLParser:
150
162
  """Return HTML representation of the page."""
151
163
  if self.document == NULL:
152
164
  return None
153
- node = LexborNode()._cinit(<lxb_dom_node_t *> &self.document.dom_document, self)
165
+ node = LexborNode.new(<lxb_dom_node_t *> &self.document.dom_document, self)
154
166
  return node.html
155
167
 
156
168
  def css(self, str query):
@@ -159,6 +171,11 @@ cdef class LexborHTMLParser:
159
171
  Matches pattern `query` against HTML tree.
160
172
  `CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
161
173
 
174
+ Special selectors:
175
+
176
+ - parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
177
+ - parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
178
+
162
179
  Parameters
163
180
  ----------
164
181
  query : str
@@ -177,9 +194,9 @@ cdef class LexborHTMLParser:
177
194
  ----------
178
195
 
179
196
  query : str
180
- default : bool, default None
197
+ default : Any, default None
181
198
  Default value to return if there is no match.
182
- strict: bool, default True
199
+ strict: bool, default False
183
200
  Set to True if you want to check if there is strictly only one match in the document.
184
201
 
185
202
 
@@ -196,7 +213,7 @@ cdef class LexborHTMLParser:
196
213
  ----------
197
214
  tags : list of str
198
215
  List of tags to remove.
199
- recursive : bool, default True
216
+ recursive : bool, default False
200
217
  Whether to delete all its child nodes
201
218
 
202
219
  Examples
@@ -232,7 +249,7 @@ cdef class LexborHTMLParser:
232
249
 
233
250
  for i in range(lxb_dom_collection_length_noi(collection)):
234
251
  if recursive:
235
- lxb_dom_node_destroy_deep( <lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
252
+ lxb_dom_node_destroy_deep(<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
236
253
  else:
237
254
  lxb_dom_node_destroy(<lxb_dom_node_t *> lxb_dom_collection_element_noi(collection, i))
238
255
  lxb_dom_collection_destroy(collection, <bint> True)
@@ -273,7 +290,6 @@ cdef class LexborHTMLParser:
273
290
  """
274
291
  return self.root.scripts_contain(query)
275
292
 
276
-
277
293
  def script_srcs_contain(self, tuple queries):
278
294
  """Returns True if any of the script SRCs attributes contain one of the specified text.
279
295
 
@@ -289,6 +305,26 @@ cdef class LexborHTMLParser:
289
305
  def css_matches(self, str selector):
290
306
  return self.root.css_matches(selector)
291
307
 
308
+ def merge_text_nodes(self):
309
+ """Iterates over all text nodes and merges all text nodes that are close to each other.
310
+
311
+ This is useful for text extraction.
312
+ Use it when you need to strip HTML tags and merge "dangling" text.
313
+
314
+ Examples
315
+ --------
316
+
317
+ >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
318
+ >>> node = tree.css_first('div')
319
+ >>> tree.unwrap_tags(["strong"])
320
+ >>> tree.text(deep=True, separator=" ", strip=True)
321
+ "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
322
+ >>> node.merge_text_nodes()
323
+ >>> tree.text(deep=True, separator=" ", strip=True)
324
+ "John Doe"
325
+ """
326
+ return self.root.merge_text_nodes()
327
+
292
328
  @staticmethod
293
329
  cdef LexborHTMLParser from_document(lxb_html_document_t *document, bytes raw_html):
294
330
  obj = <LexborHTMLParser> LexborHTMLParser.__new__(LexborHTMLParser)
@@ -300,9 +336,16 @@ cdef class LexborHTMLParser:
300
336
  return obj
301
337
 
302
338
  def clone(self):
303
- """Clone the current tree."""
339
+ """Clone the current node.
340
+
341
+ You can use it to make temporary modifications without affecting the original HTML tree.
342
+
343
+ It is tied to the current parser instance.
344
+ Gets destroyed when parser instance is destroyed.
345
+ """
304
346
  cdef lxb_html_document_t* cloned_document
305
347
  cdef lxb_dom_node_t* cloned_node
348
+ cdef LexborHTMLParser cls
306
349
 
307
350
  with nogil:
308
351
  cloned_document = lxb_html_document_create()
@@ -327,6 +370,7 @@ cdef class LexborHTMLParser:
327
370
 
328
371
  cls = LexborHTMLParser.from_document(cloned_document, self.raw_html)
329
372
  return cls
373
+
330
374
  def unwrap_tags(self, list tags, delete_empty = False):
331
375
  """Unwraps specified tags from the HTML tree.
332
376
 
@@ -347,5 +391,34 @@ cdef class LexborHTMLParser:
347
391
  >>> tree.body.html
348
392
  '<body><div>Hello world!</div></body>'
349
393
  """
350
- if self.root is not None:
394
+ # faster to check if the document is empty which should determine if we have a root
395
+ if self.document != NULL:
351
396
  self.root.unwrap_tags(tags, delete_empty=delete_empty)
397
+
398
+ @property
399
+ def inner_html(self) -> str:
400
+ """Return HTML representation of the child nodes.
401
+
402
+ Works similar to innerHTML in JavaScript.
403
+ Unlike the `.html` property, does not include the current node.
404
+ Can be used to set HTML as well. See the setter docstring.
405
+
406
+ Returns
407
+ -------
408
+ text : str | None
409
+ """
410
+ return self.root.inner_html
411
+
412
+ @inner_html.setter
413
+ def inner_html(self, str html):
414
+ """Set inner HTML to the specified HTML.
415
+
416
+ Replaces existing data inside the node.
417
+ Works similar to innerHTML in JavaScript.
418
+
419
+ Parameters
420
+ ----------
421
+ html : str
422
+
423
+ """
424
+ self.root.inner_html = html
@@ -1,4 +1,5 @@
1
1
  cimport cython
2
+ from cpython.exc cimport PyErr_NoMemory
2
3
 
3
4
  from libc.stdlib cimport free
4
5
  from libc.stdlib cimport malloc
@@ -8,12 +9,15 @@ from libc.string cimport memcpy
8
9
  DEF _STACK_SIZE = 100
9
10
  DEF _ENCODING = 'UTF-8'
10
11
 
12
+
11
13
  @cython.final
12
14
  cdef class Stack:
13
15
  def __cinit__(self, size_t capacity=25):
14
16
  self.capacity = capacity
15
17
  self.top = 0
16
18
  self._stack = <myhtml_tree_node_t**> malloc(capacity * sizeof(myhtml_tree_node_t))
19
+ if self._stack == NULL:
20
+ raise MemoryError("Failed to allocate memory for stack")
17
21
 
18
22
  def __dealloc__(self):
19
23
  free(self._stack)
@@ -21,9 +25,10 @@ cdef class Stack:
21
25
  cdef bint is_empty(self):
22
26
  return self.top <= 0
23
27
 
24
- cdef push(self, myhtml_tree_node_t* res):
28
+ cdef int push(self, myhtml_tree_node_t* res) except -1:
25
29
  if self.top >= self.capacity:
26
- self.resize()
30
+ if self.resize() < 0:
31
+ return -1
27
32
  self._stack[self.top] = res
28
33
  self.top += 1
29
34
 
@@ -31,10 +36,13 @@ cdef class Stack:
31
36
  self.top = self.top - 1
32
37
  return self._stack[self.top]
33
38
 
34
- cdef resize(self):
39
+ cdef int resize(self) except -1:
35
40
  self.capacity *= 2
36
41
  self._stack = <myhtml_tree_node_t**> realloc(<void*> self._stack, self.capacity * sizeof(myhtml_tree_node_t))
37
-
42
+ if self._stack == NULL:
43
+ PyErr_NoMemory()
44
+ return -1
45
+ return 0
38
46
 
39
47
  cdef class _Attributes:
40
48
  """A dict-like object that represents attributes."""
@@ -128,25 +136,24 @@ cdef class _Attributes:
128
136
  tag_name = c_text.decode(_ENCODING, 'ignore') if c_text != NULL else 'unknown'
129
137
  return "<%s attributes, %s items>" % (tag_name, len(self))
130
138
 
131
-
132
-
133
139
  ctypedef fused str_or_Node:
134
- basestring
140
+ str
135
141
  bytes
136
142
  Node
137
143
 
138
-
139
144
  cdef class Node:
140
145
  """A class that represents HTML node (element)."""
141
146
  cdef myhtml_tree_node_t *node
142
147
  cdef public HTMLParser parser
143
148
 
144
-
145
- cdef _init(self, myhtml_tree_node_t *node, HTMLParser parser):
146
- # custom init, because __cinit__ doesn't accept C types
147
- self.node = node
149
+ @staticmethod
150
+ cdef Node new(myhtml_tree_node_t *node, HTMLParser parser):
151
+ # custom __init__ for C, because __cinit__ doesn't accept C types
152
+ cdef Node cls = Node.__new__(Node)
153
+ cls.node = node
148
154
  # Keep reference to the selector object, so myhtml structures will not be garbage collected prematurely
149
- self.parser = parser
155
+ cls.parser = parser
156
+ return cls
150
157
 
151
158
  @property
152
159
  def attributes(self):
@@ -286,7 +293,7 @@ cdef class Node:
286
293
  cdef inline _text_deep(self, myhtml_tree_node_t *node, separator='', strip=False):
287
294
  text = ""
288
295
  cdef Stack stack = Stack(_STACK_SIZE)
289
- cdef myhtml_tree_node_t* current_node = NULL;
296
+ cdef myhtml_tree_node_t* current_node = NULL
290
297
 
291
298
  if node.tag_id == MyHTML_TAG__TEXT:
292
299
  c_text = myhtml_node_text(node, NULL)
@@ -339,12 +346,10 @@ cdef class Node:
339
346
  node = node.next
340
347
  continue
341
348
 
342
- next_node = Node()
343
- next_node._init(node, self.parser)
349
+ next_node = Node.new(node, self.parser)
344
350
  yield next_node
345
351
  node = node.next
346
352
 
347
-
348
353
  def traverse(self, include_text=False):
349
354
  """Iterate over all child and next nodes starting from the current level.
350
355
 
@@ -358,16 +363,15 @@ cdef class Node:
358
363
  node
359
364
  """
360
365
  cdef Stack stack = Stack(_STACK_SIZE)
361
- cdef myhtml_tree_node_t* current_node = NULL;
362
- cdef Node next_node;
366
+ cdef myhtml_tree_node_t* current_node = NULL
367
+ cdef Node next_node
363
368
 
364
369
  stack.push(self.node)
365
370
 
366
371
  while not stack.is_empty():
367
372
  current_node = stack.pop()
368
373
  if current_node != NULL and not (current_node.tag_id == MyHTML_TAG__TEXT and not include_text):
369
- next_node = Node()
370
- next_node._init(current_node, self.parser)
374
+ next_node = Node.new(current_node, self.parser)
371
375
  yield next_node
372
376
 
373
377
  if current_node.next is not NULL:
@@ -393,11 +397,13 @@ cdef class Node:
393
397
 
394
398
  @property
395
399
  def child(self):
396
- """Return the child node."""
400
+ """Alias for the `first_child` property.
401
+
402
+ **Deprecated**. Please use `first_child` instead.
403
+ """
397
404
  cdef Node node
398
405
  if self.node.child:
399
- node = Node()
400
- node._init(self.node.child, self.parser)
406
+ node = Node.new(self.node.child, self.parser)
401
407
  return node
402
408
  return None
403
409
 
@@ -406,8 +412,7 @@ cdef class Node:
406
412
  """Return the parent node."""
407
413
  cdef Node node
408
414
  if self.node.parent:
409
- node = Node()
410
- node._init(self.node.parent, self.parser)
415
+ node = Node.new(self.node.parent, self.parser)
411
416
  return node
412
417
  return None
413
418
 
@@ -416,8 +421,7 @@ cdef class Node:
416
421
  """Return next node."""
417
422
  cdef Node node
418
423
  if self.node.next:
419
- node = Node()
420
- node._init(self.node.next, self.parser)
424
+ node = Node.new(self.node.next, self.parser)
421
425
  return node
422
426
  return None
423
427
 
@@ -426,8 +430,7 @@ cdef class Node:
426
430
  """Return previous node."""
427
431
  cdef Node node
428
432
  if self.node.prev:
429
- node = Node()
430
- node._init(self.node.prev, self.parser)
433
+ node = Node.new(self.node.prev, self.parser)
431
434
  return node
432
435
  return None
433
436
 
@@ -436,8 +439,7 @@ cdef class Node:
436
439
  """Return last child node."""
437
440
  cdef Node node
438
441
  if self.node.last_child:
439
- node = Node()
440
- node._init(self.node.last_child, self.parser)
442
+ node = Node.new(self.node.last_child, self.parser)
441
443
  return node
442
444
  return None
443
445
 
@@ -537,8 +539,8 @@ cdef class Node:
537
539
  if delete_empty:
538
540
  myhtml_node_delete(self.node)
539
541
  return
540
- cdef myhtml_tree_node_t* next_node;
541
- cdef myhtml_tree_node_t* current_node;
542
+ cdef myhtml_tree_node_t* next_node
543
+ cdef myhtml_tree_node_t* current_node
542
544
 
543
545
  if self.node.child.next != NULL:
544
546
  current_node = self.node.child
@@ -572,6 +574,8 @@ cdef class Node:
572
574
  '<html><body><div>Hello world!</div></body></html>'
573
575
 
574
576
  """
577
+ # ensure cython can recast element to a Node so that decompose will be called sooner.
578
+ cdef Node element
575
579
  for tag in tags:
576
580
  for element in self.css(tag):
577
581
  element.decompose(recursive=recursive)
@@ -595,10 +599,10 @@ cdef class Node:
595
599
  >>> tree.body.unwrap_tags(['i','a'])
596
600
  >>> tree.body.html
597
601
  '<body><div>Hello world!</div></body>'
598
-
602
+
599
603
  Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
600
604
  """
601
-
605
+ cdef Node element
602
606
  for tag in tags:
603
607
  for element in self.css(tag):
604
608
  element.unwrap(delete_empty)
@@ -783,10 +787,10 @@ cdef class Node:
783
787
  >>> tree.body.unwrap_tags(['i','a'])
784
788
  >>> tree.body.html
785
789
  '<body><div>Hello world!</div></body>'
786
-
790
+
787
791
  Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
788
792
  """
789
-
793
+ cdef Node element
790
794
  for tag in tags:
791
795
  for element in self.css(tag):
792
796
  element.unwrap(delete_empty)
@@ -845,6 +849,7 @@ cdef class Node:
845
849
  The query to check.
846
850
 
847
851
  """
852
+ cdef Node node
848
853
  if self.parser.cached_script_texts is None:
849
854
  nodes = find_nodes(self.parser, self.node, 'script')
850
855
  text_nodes = []
@@ -893,6 +898,7 @@ cdef class Node:
893
898
  if not isinstance(other, Node):
894
899
  return False
895
900
  return self.html == other.html
901
+
896
902
  @property
897
903
  def text_content(self):
898
904
  """Returns the text of the node if it is a text node.
@@ -946,8 +952,8 @@ cdef class Node:
946
952
  while not stack.is_empty():
947
953
  current_node = stack.pop()
948
954
 
949
- if current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and \
950
- current_node.prev.tag_id == MyHTML_TAG__TEXT:
955
+ if (current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and
956
+ current_node.prev.tag_id == MyHTML_TAG__TEXT):
951
957
  left_text = myhtml_node_text(current_node.prev, &left_length)
952
958
  right_text = myhtml_node_text(current_node, &right_length)
953
959
  if left_text and right_text:
@@ -978,8 +984,8 @@ cdef inline str append_text(str text, str node_text, str separator='', bint stri
978
984
 
979
985
  cdef inline bytes to_bytes(str_or_Node value):
980
986
  cdef bytes bytes_val
981
- if isinstance(value, (str, unicode)):
982
- bytes_val = value.encode(_ENCODING)
987
+ if isinstance(value, unicode):
988
+ bytes_val = <bytes>value.encode("utf-8")
983
989
  elif isinstance(value, bytes):
984
- bytes_val = <char*> value
990
+ bytes_val = <bytes>value
985
991
  return bytes_val
@@ -1,4 +1,6 @@
1
1
  cimport cython
2
+ from cpython.exc cimport PyErr_SetObject
3
+
2
4
 
3
5
  @cython.final
4
6
  cdef class CSSSelector:
@@ -28,35 +30,33 @@ cdef class CSSSelector:
28
30
 
29
31
  return collection
30
32
 
31
-
32
- cdef _create_css_parser(self):
33
+ cdef int _create_css_parser(self) except -1:
33
34
  cdef mystatus_t status
34
35
 
35
36
  cdef mycss_t *mycss = mycss_create()
36
37
  status = mycss_init(mycss)
37
38
 
38
39
  if status != 0:
39
- raise RuntimeError("Can't init MyCSS object.")
40
- # return
40
+ PyErr_SetObject(RuntimeError, "Can't init MyCSS object.")
41
+ return -1
41
42
 
42
43
  self.css_entry = mycss_entry_create()
43
44
  status = mycss_entry_init(mycss, self.css_entry)
44
45
 
45
46
  if status != 0:
46
- raise RuntimeError("Can't init MyCSS Entry object.")
47
-
48
-
47
+ PyErr_SetObject(RuntimeError, "Can't init MyCSS Entry object.")
48
+ return -1
49
+ return 0
49
50
 
50
- cdef _prepare_selector(self, mycss_entry_t *css_entry,
51
- const char *selector, size_t selector_size):
52
- cdef mystatus_t out_status;
53
- self.selectors_list = mycss_selectors_parse(mycss_entry_selectors(css_entry),
54
- myencoding_t.MyENCODING_UTF_8,
55
- selector, selector_size,
56
- &out_status)
51
+ cdef int _prepare_selector(self, mycss_entry_t *css_entry, const char *selector, size_t selector_size) except -1:
52
+ cdef mystatus_t out_status
53
+ self.selectors_list = mycss_selectors_parse(mycss_entry_selectors(css_entry), myencoding_t.MyENCODING_UTF_8,
54
+ selector, selector_size, &out_status)
57
55
 
58
56
  if (self.selectors_list == NULL) or (self.selectors_list.flags and MyCSS_SELECTORS_FLAGS_SELECTOR_BAD):
59
- raise ValueError("Bad CSS Selectors: %s" % self.c_selector.decode('utf-8'))
57
+ PyErr_SetObject(ValueError, "Bad CSS Selectors: %s" % self.c_selector.decode('utf-8'))
58
+ return -1
59
+ return 0
60
60
 
61
61
  def __dealloc__(self):
62
62
  mycss_selectors_list_destroy(mycss_entry_selectors(self.css_entry), self.selectors_list, 1)
@@ -77,12 +77,11 @@ cdef class Selector:
77
77
  cdef Node node
78
78
  cdef list nodes
79
79
 
80
- def __init__(self, Node node, query):
80
+ def __init__(self, Node node, str query):
81
81
  """custom init, because __cinit__ doesn't accept C types"""
82
82
  self.node = node
83
83
  self.nodes = find_nodes(node.parser, node.node, query) if query else [node, ]
84
84
 
85
-
86
85
  cpdef css(self, str query):
87
86
  """Evaluate CSS selector against current scope."""
88
87
  cdef Node current_node
@@ -106,6 +105,7 @@ cdef class Selector:
106
105
  def text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
107
106
  """Filter all current matches given text."""
108
107
  nodes = []
108
+ cdef Node node
109
109
  for node in self.nodes:
110
110
  node_text = node.text(deep=deep, separator=separator, strip=strip)
111
111
  if node_text and text in node_text:
@@ -116,6 +116,7 @@ cdef class Selector:
116
116
  def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
117
117
  """Returns True if any node in the current search scope contains specified text"""
118
118
  nodes = []
119
+ cdef Node node
119
120
  for node in self.nodes:
120
121
  node_text = node.text(deep=deep, separator=separator, strip=strip)
121
122
  if node_text and text in node_text:
@@ -142,7 +143,8 @@ cdef class Selector:
142
143
 
143
144
  Similar to `string-length` in XPath.
144
145
  """
145
- nodes = []
146
+ cdef list nodes = []
147
+ cdef Node node
146
148
  for node in self.nodes:
147
149
  attr = node.attributes.get(attribute)
148
150
  if attr and start and start in attr:
@@ -157,16 +159,15 @@ cdef class Selector:
157
159
  cdef find_nodes(HTMLParser parser, myhtml_tree_node_t *node, str query):
158
160
  cdef myhtml_collection_t *collection
159
161
  cdef CSSSelector selector = CSSSelector(query)
160
-
161
- result = list()
162
+ cdef Node n
163
+ cdef list result = []
162
164
  collection = selector.find(node)
163
165
 
164
166
  if collection == NULL:
165
167
  return result
166
168
 
167
169
  for i in range(collection.length):
168
- n = Node()
169
- n._init(collection.list[i], parser)
170
+ n = Node.new(collection.list[i], parser)
170
171
  result.append(n)
171
172
  myhtml_collection_destroy(collection)
172
173
  return result
@@ -176,6 +177,7 @@ cdef bool find_matches(HTMLParser parser, myhtml_tree_node_t *node, tuple select
176
177
  cdef myhtml_collection_t *collection
177
178
  cdef CSSSelector selector
178
179
  cdef int collection_size
180
+ cdef str query
179
181
 
180
182
  for query in selectors:
181
183
  selector = CSSSelector(query)
@@ -1,5 +1,6 @@
1
1
  include "../utils.pxi"
2
2
 
3
+
3
4
  def create_tag(tag: str):
4
5
  """
5
6
  Given an HTML tag name, e.g. `"div"`, create a single empty node for that tag,