selectolax 0.3.28__cp312-cp312-win32.whl → 0.3.34__cp312-cp312-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of selectolax might be problematic. Click here for more details.

selectolax/lexbor.pyx CHANGED
@@ -1,4 +1,5 @@
1
- from cpython cimport bool
1
+ from cpython.bool cimport bool
2
+ from cpython.exc cimport PyErr_SetObject
2
3
 
3
4
  _ENCODING = 'UTF-8'
4
5
 
@@ -24,10 +25,8 @@ cdef class LexborHTMLParser:
24
25
  html : str (unicode) or bytes
25
26
  """
26
27
  def __init__(self, html):
27
-
28
28
  cdef size_t html_len
29
- cdef char* html_chars
30
-
29
+ cdef object bytes_html
31
30
  bytes_html, html_len = preprocess_input(html)
32
31
  self._parse_html(bytes_html, html_len)
33
32
  self.raw_html = bytes_html
@@ -39,22 +38,27 @@ cdef class LexborHTMLParser:
39
38
  self._selector = LexborCSSSelector()
40
39
  return self._selector
41
40
 
42
-
43
- cdef _parse_html(self, char *html, size_t html_len):
41
+ cdef int _parse_html(self, char *html, size_t html_len) except -1:
44
42
  cdef lxb_status_t status
45
43
 
46
44
  with nogil:
47
45
  self.document = lxb_html_document_create()
48
46
 
49
47
  if self.document == NULL:
50
- raise SelectolaxError("Failed to initialize object for HTML Document.")
48
+ PyErr_SetObject(SelectolaxError, "Failed to initialize object for HTML Document.")
49
+ return -1
51
50
 
52
51
  with nogil:
53
52
  status = lxb_html_document_parse(self.document, <lxb_char_t *> html, html_len)
53
+
54
54
  if status != 0x0000:
55
- raise SelectolaxError("Can't parse HTML.")
55
+ PyErr_SetObject(SelectolaxError, "Can't parse HTML.")
56
+ return -1
56
57
 
57
- assert self.document != NULL
58
+ if self.document == NULL:
59
+ PyErr_SetObject(RuntimeError, "document is NULL even after html was parsed correctly")
60
+ return -1
61
+ return 0
58
62
 
59
63
  def __dealloc__(self):
60
64
  if self.document != NULL:
@@ -68,7 +72,7 @@ cdef class LexborHTMLParser:
68
72
  """Returns root node."""
69
73
  if self.document == NULL:
70
74
  return None
71
- return LexborNode()._cinit(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
75
+ return LexborNode.new(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
72
76
 
73
77
  @property
74
78
  def body(self):
@@ -77,7 +81,7 @@ cdef class LexborHTMLParser:
77
81
  body = lxb_html_document_body_element_noi(self.document)
78
82
  if body == NULL:
79
83
  return None
80
- return LexborNode()._cinit(<lxb_dom_node_t *> body, self)
84
+ return LexborNode.new(<lxb_dom_node_t *> body, self)
81
85
 
82
86
  @property
83
87
  def head(self):
@@ -86,7 +90,7 @@ cdef class LexborHTMLParser:
86
90
  head = lxb_html_document_head_element_noi(self.document)
87
91
  if head == NULL:
88
92
  return None
89
- return LexborNode()._cinit(<lxb_dom_node_t *> head, self)
93
+ return LexborNode.new(<lxb_dom_node_t *> head, self)
90
94
 
91
95
  def tags(self, str name):
92
96
  """Returns a list of tags that match specified name.
@@ -96,6 +100,12 @@ cdef class LexborHTMLParser:
96
100
  name : str (e.g. div)
97
101
 
98
102
  """
103
+
104
+ if not name:
105
+ raise ValueError("Tag name cannot be empty")
106
+ if len(name) > 100:
107
+ raise ValueError("Tag name is too long")
108
+
99
109
  cdef lxb_dom_collection_t* collection = NULL
100
110
  cdef lxb_status_t status
101
111
  pybyte_name = name.encode('UTF-8')
@@ -116,7 +126,7 @@ cdef class LexborHTMLParser:
116
126
  raise SelectolaxError("Can't locate elements.")
117
127
 
118
128
  for i in range(lxb_dom_collection_length_noi(collection)):
119
- node = LexborNode()._cinit(
129
+ node = LexborNode.new(
120
130
  <lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i),
121
131
  self
122
132
  )
@@ -150,7 +160,7 @@ cdef class LexborHTMLParser:
150
160
  """Return HTML representation of the page."""
151
161
  if self.document == NULL:
152
162
  return None
153
- node = LexborNode()._cinit(<lxb_dom_node_t *> &self.document.dom_document, self)
163
+ node = LexborNode.new(<lxb_dom_node_t *> &self.document.dom_document, self)
154
164
  return node.html
155
165
 
156
166
  def css(self, str query):
@@ -159,6 +169,11 @@ cdef class LexborHTMLParser:
159
169
  Matches pattern `query` against HTML tree.
160
170
  `CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
161
171
 
172
+ Special selectors:
173
+
174
+ - parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
175
+ - parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
176
+
162
177
  Parameters
163
178
  ----------
164
179
  query : str
@@ -232,7 +247,7 @@ cdef class LexborHTMLParser:
232
247
 
233
248
  for i in range(lxb_dom_collection_length_noi(collection)):
234
249
  if recursive:
235
- lxb_dom_node_destroy_deep( <lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
250
+ lxb_dom_node_destroy_deep(<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
236
251
  else:
237
252
  lxb_dom_node_destroy(<lxb_dom_node_t *> lxb_dom_collection_element_noi(collection, i))
238
253
  lxb_dom_collection_destroy(collection, <bint> True)
@@ -273,7 +288,6 @@ cdef class LexborHTMLParser:
273
288
  """
274
289
  return self.root.scripts_contain(query)
275
290
 
276
-
277
291
  def script_srcs_contain(self, tuple queries):
278
292
  """Returns True if any of the script SRCs attributes contain one of the specified texts.
279
293
 
@@ -289,6 +303,26 @@ cdef class LexborHTMLParser:
289
303
  def css_matches(self, str selector):
290
304
  return self.root.css_matches(selector)
291
305
 
306
+ def merge_text_nodes(self):
307
+ """Iterates over all text nodes and merges all text nodes that are close to each other.
308
+
309
+ This is useful for text extraction.
310
+ Use it when you need to strip HTML tags and merge "dangling" text.
311
+
312
+ Examples
313
+ --------
314
+
315
+ >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
316
+ >>> node = tree.css_first('div')
317
+ >>> tree.unwrap_tags(["strong"])
318
+ >>> tree.text(deep=True, separator=" ", strip=True)
319
+ "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
320
+ >>> node.merge_text_nodes()
321
+ >>> tree.text(deep=True, separator=" ", strip=True)
322
+ "John Doe"
323
+ """
324
+ return self.root.merge_text_nodes()
325
+
292
326
  @staticmethod
293
327
  cdef LexborHTMLParser from_document(lxb_html_document_t *document, bytes raw_html):
294
328
  obj = <LexborHTMLParser> LexborHTMLParser.__new__(LexborHTMLParser)
@@ -303,6 +337,7 @@ cdef class LexborHTMLParser:
303
337
  """Clone the current tree."""
304
338
  cdef lxb_html_document_t* cloned_document
305
339
  cdef lxb_dom_node_t* cloned_node
340
+ cdef LexborHTMLParser cls
306
341
 
307
342
  with nogil:
308
343
  cloned_document = lxb_html_document_create()
@@ -327,7 +362,8 @@ cdef class LexborHTMLParser:
327
362
 
328
363
  cls = LexborHTMLParser.from_document(cloned_document, self.raw_html)
329
364
  return cls
330
- def unwrap_tags(self, list tags):
365
+
366
+ def unwrap_tags(self, list tags, delete_empty = False):
331
367
  """Unwraps specified tags from the HTML tree.
332
368
 
333
369
  Works the same as the ``unwrap`` method, but applied to a list of tags.
@@ -336,6 +372,8 @@ cdef class LexborHTMLParser:
336
372
  ----------
337
373
  tags : list
338
374
  List of tags to remove.
375
+ delete_empty : bool
376
+ Whether to delete empty tags.
339
377
 
340
378
  Examples
341
379
  --------
@@ -345,5 +383,6 @@ cdef class LexborHTMLParser:
345
383
  >>> tree.body.html
346
384
  '<body><div>Hello world!</div></body>'
347
385
  """
348
- if self.root is not None:
349
- self.root.unwrap_tags(tags)
386
+ # faster to check if the document is empty which should determine if we have a root
387
+ if self.document != NULL:
388
+ self.root.unwrap_tags(tags, delete_empty=delete_empty)
@@ -1,4 +1,5 @@
1
1
  cimport cython
2
+ from cpython.exc cimport PyErr_NoMemory
2
3
 
3
4
  from libc.stdlib cimport free
4
5
  from libc.stdlib cimport malloc
@@ -8,12 +9,15 @@ from libc.string cimport memcpy
8
9
  DEF _STACK_SIZE = 100
9
10
  DEF _ENCODING = 'UTF-8'
10
11
 
12
+
11
13
  @cython.final
12
14
  cdef class Stack:
13
15
  def __cinit__(self, size_t capacity=25):
14
16
  self.capacity = capacity
15
17
  self.top = 0
16
18
  self._stack = <myhtml_tree_node_t**> malloc(capacity * sizeof(myhtml_tree_node_t))
19
+ if self._stack == NULL:
20
+ raise MemoryError("Failed to allocate memory for stack")
17
21
 
18
22
  def __dealloc__(self):
19
23
  free(self._stack)
@@ -21,9 +25,10 @@ cdef class Stack:
21
25
  cdef bint is_empty(self):
22
26
  return self.top <= 0
23
27
 
24
- cdef push(self, myhtml_tree_node_t* res):
28
+ cdef int push(self, myhtml_tree_node_t* res) except -1:
25
29
  if self.top >= self.capacity:
26
- self.resize()
30
+ if self.resize() < 0:
31
+ return -1
27
32
  self._stack[self.top] = res
28
33
  self.top += 1
29
34
 
@@ -31,10 +36,13 @@ cdef class Stack:
31
36
  self.top = self.top - 1
32
37
  return self._stack[self.top]
33
38
 
34
- cdef resize(self):
39
+ cdef int resize(self) except -1:
35
40
  self.capacity *= 2
36
41
  self._stack = <myhtml_tree_node_t**> realloc(<void*> self._stack, self.capacity * sizeof(myhtml_tree_node_t))
37
-
42
+ if self._stack == NULL:
43
+ PyErr_NoMemory()
44
+ return -1
45
+ return 0
38
46
 
39
47
  cdef class _Attributes:
40
48
  """A dict-like object that represents attributes."""
@@ -128,25 +136,24 @@ cdef class _Attributes:
128
136
  tag_name = c_text.decode(_ENCODING, 'ignore') if c_text != NULL else 'unknown'
129
137
  return "<%s attributes, %s items>" % (tag_name, len(self))
130
138
 
131
-
132
-
133
139
  ctypedef fused str_or_Node:
134
- basestring
140
+ str
135
141
  bytes
136
142
  Node
137
143
 
138
-
139
144
  cdef class Node:
140
145
  """A class that represents HTML node (element)."""
141
146
  cdef myhtml_tree_node_t *node
142
147
  cdef public HTMLParser parser
143
148
 
144
-
145
- cdef _init(self, myhtml_tree_node_t *node, HTMLParser parser):
146
- # custom init, because __cinit__ doesn't accept C types
147
- self.node = node
149
+ @staticmethod
150
+ cdef Node new(myhtml_tree_node_t *node, HTMLParser parser):
151
+ # custom __init__ for C, because __cinit__ doesn't accept C types
152
+ cdef Node cls = Node.__new__(Node)
153
+ cls.node = node
148
154
  # Keep reference to the selector object, so myhtml structures will not be garbage collected prematurely
149
- self.parser = parser
155
+ cls.parser = parser
156
+ return cls
150
157
 
151
158
  @property
152
159
  def attributes(self):
@@ -286,7 +293,7 @@ cdef class Node:
286
293
  cdef inline _text_deep(self, myhtml_tree_node_t *node, separator='', strip=False):
287
294
  text = ""
288
295
  cdef Stack stack = Stack(_STACK_SIZE)
289
- cdef myhtml_tree_node_t* current_node = NULL;
296
+ cdef myhtml_tree_node_t* current_node = NULL
290
297
 
291
298
  if node.tag_id == MyHTML_TAG__TEXT:
292
299
  c_text = myhtml_node_text(node, NULL)
@@ -339,12 +346,10 @@ cdef class Node:
339
346
  node = node.next
340
347
  continue
341
348
 
342
- next_node = Node()
343
- next_node._init(node, self.parser)
349
+ next_node = Node.new(node, self.parser)
344
350
  yield next_node
345
351
  node = node.next
346
352
 
347
-
348
353
  def traverse(self, include_text=False):
349
354
  """Iterate over all child and next nodes starting from the current level.
350
355
 
@@ -358,16 +363,15 @@ cdef class Node:
358
363
  node
359
364
  """
360
365
  cdef Stack stack = Stack(_STACK_SIZE)
361
- cdef myhtml_tree_node_t* current_node = NULL;
362
- cdef Node next_node;
366
+ cdef myhtml_tree_node_t* current_node = NULL
367
+ cdef Node next_node
363
368
 
364
369
  stack.push(self.node)
365
370
 
366
371
  while not stack.is_empty():
367
372
  current_node = stack.pop()
368
373
  if current_node != NULL and not (current_node.tag_id == MyHTML_TAG__TEXT and not include_text):
369
- next_node = Node()
370
- next_node._init(current_node, self.parser)
374
+ next_node = Node.new(current_node, self.parser)
371
375
  yield next_node
372
376
 
373
377
  if current_node.next is not NULL:
@@ -396,8 +400,7 @@ cdef class Node:
396
400
  """Return the child node."""
397
401
  cdef Node node
398
402
  if self.node.child:
399
- node = Node()
400
- node._init(self.node.child, self.parser)
403
+ node = Node.new(self.node.child, self.parser)
401
404
  return node
402
405
  return None
403
406
 
@@ -406,8 +409,7 @@ cdef class Node:
406
409
  """Return the parent node."""
407
410
  cdef Node node
408
411
  if self.node.parent:
409
- node = Node()
410
- node._init(self.node.parent, self.parser)
412
+ node = Node.new(self.node.parent, self.parser)
411
413
  return node
412
414
  return None
413
415
 
@@ -416,8 +418,7 @@ cdef class Node:
416
418
  """Return next node."""
417
419
  cdef Node node
418
420
  if self.node.next:
419
- node = Node()
420
- node._init(self.node.next, self.parser)
421
+ node = Node.new(self.node.next, self.parser)
421
422
  return node
422
423
  return None
423
424
 
@@ -426,8 +427,7 @@ cdef class Node:
426
427
  """Return previous node."""
427
428
  cdef Node node
428
429
  if self.node.prev:
429
- node = Node()
430
- node._init(self.node.prev, self.parser)
430
+ node = Node.new(self.node.prev, self.parser)
431
431
  return node
432
432
  return None
433
433
 
@@ -436,8 +436,7 @@ cdef class Node:
436
436
  """Return last child node."""
437
437
  cdef Node node
438
438
  if self.node.last_child:
439
- node = Node()
440
- node._init(self.node.last_child, self.parser)
439
+ node = Node.new(self.node.last_child, self.parser)
441
440
  return node
442
441
  return None
443
442
 
@@ -515,9 +514,14 @@ cdef class Node:
515
514
  """An alias for the decompose method."""
516
515
  self.decompose(recursive)
517
516
 
518
- def unwrap(self):
517
+ def unwrap(self, delete_empty = False):
519
518
  """Replace node with whatever is inside this node.
520
519
 
520
+ Parameters
521
+ ----------
522
+ delete_empty : bool, default False
523
+ Whether to delete empty tags.
524
+
521
525
  Examples
522
526
  --------
523
527
 
@@ -526,11 +530,14 @@ cdef class Node:
526
530
  >>> tree.html
527
531
  '<html><head></head><body><div>Hello world!</div></body></html>'
528
532
 
533
+ Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
529
534
  """
530
535
  if self.node.child == NULL:
536
+ if delete_empty:
537
+ myhtml_node_delete(self.node)
531
538
  return
532
- cdef myhtml_tree_node_t* next_node;
533
- cdef myhtml_tree_node_t* current_node;
539
+ cdef myhtml_tree_node_t* next_node
540
+ cdef myhtml_tree_node_t* current_node
534
541
 
535
542
  if self.node.child.next != NULL:
536
543
  current_node = self.node.child
@@ -564,11 +571,13 @@ cdef class Node:
564
571
  '<html><body><div>Hello world!</div></body></html>'
565
572
 
566
573
  """
574
+ # ensure cython can recast element to a Node so that decompose will be called sooner.
575
+ cdef Node element
567
576
  for tag in tags:
568
577
  for element in self.css(tag):
569
578
  element.decompose(recursive=recursive)
570
579
 
571
- def unwrap_tags(self, list tags):
580
+ def unwrap_tags(self, list tags, delete_empty = False):
572
581
  """Unwraps specified tags from the HTML tree.
573
582
 
574
583
  Works the same as the ``unwrap`` method, but applied to a list of tags.
@@ -577,6 +586,8 @@ cdef class Node:
577
586
  ----------
578
587
  tags : list
579
588
  List of tags to remove.
589
+ delete_empty : bool, default False
590
+ Whether to delete empty tags.
580
591
 
581
592
  Examples
582
593
  --------
@@ -585,11 +596,13 @@ cdef class Node:
585
596
  >>> tree.body.unwrap_tags(['i','a'])
586
597
  >>> tree.body.html
587
598
  '<body><div>Hello world!</div></body>'
588
- """
589
599
 
600
+ Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
601
+ """
602
+ cdef Node element
590
603
  for tag in tags:
591
604
  for element in self.css(tag):
592
- element.unwrap()
605
+ element.unwrap(delete_empty)
593
606
 
594
607
  def replace_with(self, str_or_Node value):
595
608
  """Replace current Node with specified value.
@@ -752,7 +765,7 @@ cdef class Node:
752
765
  else:
753
766
  raise TypeError("Expected a string or Node instance, but %s found" % type(value).__name__)
754
767
 
755
- def unwrap_tags(self, list tags):
768
+ def unwrap_tags(self, list tags, delete_empty = False):
756
769
  """Unwraps specified tags from the HTML tree.
757
770
 
758
771
  Works the same as the ``unwrap`` method, but applied to a list of tags.
@@ -761,6 +774,8 @@ cdef class Node:
761
774
  ----------
762
775
  tags : list
763
776
  List of tags to remove.
777
+ delete_empty : bool, default False
778
+ Whether to delete empty tags.
764
779
 
765
780
  Examples
766
781
  --------
@@ -769,11 +784,13 @@ cdef class Node:
769
784
  >>> tree.body.unwrap_tags(['i','a'])
770
785
  >>> tree.body.html
771
786
  '<body><div>Hello world!</div></body>'
772
- """
773
787
 
788
+ Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
789
+ """
790
+ cdef Node element
774
791
  for tag in tags:
775
792
  for element in self.css(tag):
776
- element.unwrap()
793
+ element.unwrap(delete_empty)
777
794
 
778
795
  @property
779
796
  def raw_value(self):
@@ -829,6 +846,7 @@ cdef class Node:
829
846
  The query to check.
830
847
 
831
848
  """
849
+ cdef Node node
832
850
  if self.parser.cached_script_texts is None:
833
851
  nodes = find_nodes(self.parser, self.node, 'script')
834
852
  text_nodes = []
@@ -877,6 +895,7 @@ cdef class Node:
877
895
  if not isinstance(other, Node):
878
896
  return False
879
897
  return self.html == other.html
898
+
880
899
  @property
881
900
  def text_content(self):
882
901
  """Returns the text of the node if it is a text node.
@@ -930,8 +949,8 @@ cdef class Node:
930
949
  while not stack.is_empty():
931
950
  current_node = stack.pop()
932
951
 
933
- if current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and \
934
- current_node.prev.tag_id == MyHTML_TAG__TEXT:
952
+ if (current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and
953
+ current_node.prev.tag_id == MyHTML_TAG__TEXT):
935
954
  left_text = myhtml_node_text(current_node.prev, &left_length)
936
955
  right_text = myhtml_node_text(current_node, &right_length)
937
956
  if left_text and right_text:
@@ -962,8 +981,8 @@ cdef inline str append_text(str text, str node_text, str separator='', bint stri
962
981
 
963
982
  cdef inline bytes to_bytes(str_or_Node value):
964
983
  cdef bytes bytes_val
965
- if isinstance(value, (str, unicode)):
966
- bytes_val = value.encode(_ENCODING)
984
+ if isinstance(value, unicode):
985
+ bytes_val = <bytes>value.encode("utf-8")
967
986
  elif isinstance(value, bytes):
968
- bytes_val = <char*> value
987
+ bytes_val = <bytes>value
969
988
  return bytes_val
@@ -1,4 +1,6 @@
1
1
  cimport cython
2
+ from cpython.exc cimport PyErr_SetObject
3
+
2
4
 
3
5
  @cython.final
4
6
  cdef class CSSSelector:
@@ -28,35 +30,33 @@ cdef class CSSSelector:
28
30
 
29
31
  return collection
30
32
 
31
-
32
- cdef _create_css_parser(self):
33
+ cdef int _create_css_parser(self) except -1:
33
34
  cdef mystatus_t status
34
35
 
35
36
  cdef mycss_t *mycss = mycss_create()
36
37
  status = mycss_init(mycss)
37
38
 
38
39
  if status != 0:
39
- raise RuntimeError("Can't init MyCSS object.")
40
- # return
40
+ PyErr_SetObject(RuntimeError, "Can't init MyCSS object.")
41
+ return -1
41
42
 
42
43
  self.css_entry = mycss_entry_create()
43
44
  status = mycss_entry_init(mycss, self.css_entry)
44
45
 
45
46
  if status != 0:
46
- raise RuntimeError("Can't init MyCSS Entry object.")
47
-
48
-
47
+ PyErr_SetObject(RuntimeError, "Can't init MyCSS Entry object.")
48
+ return -1
49
+ return 0
49
50
 
50
- cdef _prepare_selector(self, mycss_entry_t *css_entry,
51
- const char *selector, size_t selector_size):
52
- cdef mystatus_t out_status;
53
- self.selectors_list = mycss_selectors_parse(mycss_entry_selectors(css_entry),
54
- myencoding_t.MyENCODING_UTF_8,
55
- selector, selector_size,
56
- &out_status)
51
+ cdef int _prepare_selector(self, mycss_entry_t *css_entry, const char *selector, size_t selector_size) except -1:
52
+ cdef mystatus_t out_status
53
+ self.selectors_list = mycss_selectors_parse(mycss_entry_selectors(css_entry), myencoding_t.MyENCODING_UTF_8,
54
+ selector, selector_size, &out_status)
57
55
 
58
56
  if (self.selectors_list == NULL) or (self.selectors_list.flags and MyCSS_SELECTORS_FLAGS_SELECTOR_BAD):
59
- raise ValueError("Bad CSS Selectors: %s" % self.c_selector.decode('utf-8'))
57
+ PyErr_SetObject(ValueError, "Bad CSS Selectors: %s" % self.c_selector.decode('utf-8'))
58
+ return -1
59
+ return 0
60
60
 
61
61
  def __dealloc__(self):
62
62
  mycss_selectors_list_destroy(mycss_entry_selectors(self.css_entry), self.selectors_list, 1)
@@ -77,12 +77,11 @@ cdef class Selector:
77
77
  cdef Node node
78
78
  cdef list nodes
79
79
 
80
- def __init__(self, Node node, query):
80
+ def __init__(self, Node node, str query):
81
81
  """custom init, because __cinit__ doesn't accept C types"""
82
82
  self.node = node
83
83
  self.nodes = find_nodes(node.parser, node.node, query) if query else [node, ]
84
84
 
85
-
86
85
  cpdef css(self, str query):
87
86
  """Evaluate CSS selector against current scope."""
88
87
  cdef Node current_node
@@ -106,6 +105,7 @@ cdef class Selector:
106
105
  def text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
107
106
  """Filter all current matches given text."""
108
107
  nodes = []
108
+ cdef Node node
109
109
  for node in self.nodes:
110
110
  node_text = node.text(deep=deep, separator=separator, strip=strip)
111
111
  if node_text and text in node_text:
@@ -116,6 +116,7 @@ cdef class Selector:
116
116
  def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
117
117
  """Returns True if any node in the current search scope contains specified text"""
118
118
  nodes = []
119
+ cdef Node node
119
120
  for node in self.nodes:
120
121
  node_text = node.text(deep=deep, separator=separator, strip=strip)
121
122
  if node_text and text in node_text:
@@ -142,7 +143,8 @@ cdef class Selector:
142
143
 
143
144
  Similar to `string-length` in XPath.
144
145
  """
145
- nodes = []
146
+ cdef list nodes = []
147
+ cdef Node node
146
148
  for node in self.nodes:
147
149
  attr = node.attributes.get(attribute)
148
150
  if attr and start and start in attr:
@@ -157,16 +159,15 @@ cdef class Selector:
157
159
  cdef find_nodes(HTMLParser parser, myhtml_tree_node_t *node, str query):
158
160
  cdef myhtml_collection_t *collection
159
161
  cdef CSSSelector selector = CSSSelector(query)
160
-
161
- result = list()
162
+ cdef Node n
163
+ cdef list result = []
162
164
  collection = selector.find(node)
163
165
 
164
166
  if collection == NULL:
165
167
  return result
166
168
 
167
169
  for i in range(collection.length):
168
- n = Node()
169
- n._init(collection.list[i], parser)
170
+ n = Node.new(collection.list[i], parser)
170
171
  result.append(n)
171
172
  myhtml_collection_destroy(collection)
172
173
  return result
@@ -176,6 +177,7 @@ cdef bool find_matches(HTMLParser parser, myhtml_tree_node_t *node, tuple select
176
177
  cdef myhtml_collection_t *collection
177
178
  cdef CSSSelector selector
178
179
  cdef int collection_size
180
+ cdef str query
179
181
 
180
182
  for query in selectors:
181
183
  selector = CSSSelector(query)
@@ -1,5 +1,6 @@
1
1
  include "../utils.pxi"
2
2
 
3
+
3
4
  def create_tag(tag: str):
4
5
  """
5
6
  Given an HTML tag name, e.g. `"div"`, create a single empty node for that tag,