selectolax 0.3.32__cp312-cp312-macosx_11_0_arm64.whl → 0.3.33__cp312-cp312-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of selectolax might be problematic. Click here for more details.

Binary file
selectolax/lexbor.pxd CHANGED
@@ -1,4 +1,4 @@
1
- from libc.stdint cimport uint32_t, uint8_t, uintptr_t
1
+ from libc.stdint cimport uint8_t, uint32_t, uintptr_t
2
2
 
3
3
 
4
4
  cdef extern from "lexbor/core/core.h" nogil:
@@ -31,7 +31,6 @@ cdef extern from "lexbor/core/core.h" nogil:
31
31
  lexbor_str_t* lexbor_str_create()
32
32
  lxb_char_t * lexbor_str_data_noi(lexbor_str_t *str)
33
33
 
34
-
35
34
  cdef extern from "lexbor/html/html.h" nogil:
36
35
  ctypedef unsigned int lxb_html_document_opt_t
37
36
 
@@ -54,14 +53,12 @@ cdef extern from "lexbor/html/html.h" nogil:
54
53
  void *events
55
54
 
56
55
  ctypedef struct lexbor_str_t:
57
- lxb_char_t *data;
58
- size_t length;
59
-
56
+ lxb_char_t *data
57
+ size_t length
60
58
 
61
59
  ctypedef struct lxb_dom_node_t:
62
60
  lxb_dom_event_target_t event_target
63
61
 
64
-
65
62
  uintptr_t local_name
66
63
  uintptr_t prefix
67
64
  uintptr_t ns
@@ -77,7 +74,6 @@ cdef extern from "lexbor/html/html.h" nogil:
77
74
 
78
75
  lxb_dom_node_type_t type
79
76
 
80
-
81
77
  ctypedef struct lxb_dom_document_t:
82
78
  lxb_dom_node_t node
83
79
 
@@ -104,7 +100,6 @@ cdef extern from "lexbor/html/html.h" nogil:
104
100
 
105
101
  bint scripting
106
102
 
107
-
108
103
  ctypedef struct lxb_html_document_t:
109
104
  lxb_dom_document_t dom_document
110
105
 
@@ -128,7 +123,6 @@ cdef extern from "lexbor/html/html.h" nogil:
128
123
  LXB_HTML_PARSER_STATE_FRAGMENT_PROCESS = 0x03
129
124
  LXB_HTML_PARSER_STATE_ERROR = 0x04
130
125
 
131
-
132
126
  ctypedef enum lxb_dom_node_type_t:
133
127
  LXB_DOM_NODE_TYPE_ELEMENT = 0x01
134
128
  LXB_DOM_NODE_TYPE_ATTRIBUTE = 0x02
@@ -175,10 +169,9 @@ cdef extern from "lexbor/html/html.h" nogil:
175
169
  size_t length
176
170
  size_t struct_size
177
171
 
178
-
179
172
  ctypedef struct lxb_html_tree_pending_table_t
180
- ctypedef bint lxb_html_tree_insertion_mode_f;
181
- ctypedef lxb_status_t lxb_html_tree_append_attr_f;
173
+ ctypedef bint lxb_html_tree_insertion_mode_f
174
+ ctypedef lxb_status_t lxb_html_tree_append_attr_f
182
175
 
183
176
  ctypedef struct lxb_html_tree_t:
184
177
 
@@ -189,13 +182,13 @@ cdef extern from "lexbor/html/html.h" nogil:
189
182
 
190
183
  lxb_html_form_element_t *form
191
184
 
192
- lexbor_array_t *open_elements;
193
- lexbor_array_t *active_formatting;
194
- lexbor_array_obj_t *template_insertion_modes;
185
+ lexbor_array_t *open_elements
186
+ lexbor_array_t *active_formatting
187
+ lexbor_array_obj_t *template_insertion_modes
195
188
 
196
- lxb_html_tree_pending_table_t *pending_table;
189
+ lxb_html_tree_pending_table_t *pending_table
197
190
 
198
- lexbor_array_obj_t *parse_errors;
191
+ lexbor_array_obj_t *parse_errors
199
192
 
200
193
  bint foster_parenting
201
194
  bint frameset_ok
@@ -232,9 +225,13 @@ cdef extern from "lexbor/html/html.h" nogil:
232
225
  lxb_status_t lxb_html_serialize_tree_str(lxb_dom_node_t *node, lexbor_str_t *str)
233
226
 
234
227
  cdef class LexborNode:
235
- cdef lxb_dom_node_t *node
236
- cdef public LexborHTMLParser parser
237
- cdef _cinit(self, lxb_dom_node_t *node, LexborHTMLParser parser)
228
+ cdef:
229
+ lxb_dom_node_t *node
230
+ public LexborHTMLParser parser
231
+
232
+ @staticmethod
233
+ cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser)
234
+
238
235
 
239
236
  cdef class LexborCSSSelector:
240
237
  cdef lxb_css_parser_t* parser
@@ -242,15 +239,15 @@ cdef class LexborCSSSelector:
242
239
  cdef lxb_css_selectors_t * css_selectors
243
240
  cdef public list results
244
241
  cdef public LexborNode current_node
245
- cdef _create_css_parser(self)
246
- cpdef find(self, str query, LexborNode node)
247
- cpdef any_matches(self, str query, LexborNode node)
242
+ cdef int _create_css_parser(self) except -1
243
+ cpdef list find(self, str query, LexborNode node)
244
+ cpdef int any_matches(self, str query, LexborNode node) except -1
248
245
 
249
246
  cdef class LexborHTMLParser:
250
247
  cdef lxb_html_document_t *document
251
248
  cdef public bytes raw_html
252
249
  cdef LexborCSSSelector _selector
253
- cdef _parse_html(self, char* html, size_t html_len)
250
+ cdef int _parse_html(self, char* html, size_t html_len) except -1
254
251
  cdef object cached_script_texts
255
252
  cdef object cached_script_srcs
256
253
 
@@ -267,8 +264,8 @@ cdef extern from "lexbor/dom/dom.h" nogil:
267
264
  ctypedef lexbor_action_t (*lxb_dom_node_simple_walker_f)(lxb_dom_node_t *node, void *ctx)
268
265
 
269
266
  ctypedef struct lxb_dom_character_data_t:
270
- lxb_dom_node_t node;
271
- lexbor_str_t data;
267
+ lxb_dom_node_t node
268
+ lexbor_str_t data
272
269
 
273
270
  ctypedef struct lxb_dom_text_t:
274
271
  lxb_dom_character_data_t char_data
@@ -289,19 +286,20 @@ cdef extern from "lexbor/dom/dom.h" nogil:
289
286
  lxb_dom_element_t *owner
290
287
 
291
288
  lxb_dom_attr_t *next
292
- lxb_dom_attr_t *prev;
293
-
289
+ lxb_dom_attr_t *prev
294
290
 
295
291
  lxb_dom_collection_t * lxb_dom_collection_make(lxb_dom_document_t *document, size_t start_list_size)
296
292
  lxb_char_t * lxb_dom_node_text_content(lxb_dom_node_t *node, size_t *len)
293
+ lxb_status_t lxb_dom_node_text_content_set(lxb_dom_node_t *node, const lxb_char_t *content, size_t len)
294
+ void lxb_dom_node_remove(lxb_dom_node_t *node)
297
295
  void * lxb_dom_document_destroy_text_noi(lxb_dom_document_t *document, lxb_char_t *text)
298
- lxb_dom_node_t * lxb_dom_document_root(lxb_dom_document_t *document)
296
+ lxb_dom_node_t * lxb_dom_document_root(lxb_dom_document_t *document)
299
297
  lxb_char_t * lxb_dom_element_qualified_name(lxb_dom_element_t *element, size_t *len)
300
298
  lxb_dom_node_t * lxb_dom_node_destroy(lxb_dom_node_t *node)
301
299
  lxb_dom_node_t * lxb_dom_node_destroy_deep(lxb_dom_node_t *root)
302
300
  lxb_dom_attr_t * lxb_dom_element_first_attribute_noi(lxb_dom_element_t *element)
303
301
 
304
- const lxb_char_t * lxb_dom_attr_local_name_noi(lxb_dom_attr_t *attr, size_t *len);
302
+ const lxb_char_t * lxb_dom_attr_local_name_noi(lxb_dom_attr_t *attr, size_t *len)
305
303
  const lxb_char_t * lxb_dom_attr_value_noi(lxb_dom_attr_t *attr, size_t *len)
306
304
 
307
305
  lxb_dom_attr_t * lxb_dom_element_set_attribute(lxb_dom_element_t *element,
@@ -314,7 +312,7 @@ cdef extern from "lexbor/dom/dom.h" nogil:
314
312
  lxb_tag_id_t lxb_dom_node_tag_id_noi(lxb_dom_node_t *node)
315
313
  lxb_dom_node_t * lxb_dom_document_import_node(lxb_dom_document_t *doc, lxb_dom_node_t *node, bint deep)
316
314
  void lxb_dom_node_insert_after(lxb_dom_node_t *to, lxb_dom_node_t *node)
317
- lxb_status_t lxb_dom_node_replace_all(lxb_dom_node_t *parent, lxb_dom_node_t *node);
315
+ lxb_status_t lxb_dom_node_replace_all(lxb_dom_node_t *parent, lxb_dom_node_t *node)
318
316
  void lxb_dom_node_insert_child(lxb_dom_node_t *to, lxb_dom_node_t *node)
319
317
  void lxb_dom_node_insert_before(lxb_dom_node_t *to, lxb_dom_node_t *node)
320
318
  void lxb_dom_node_insert_after(lxb_dom_node_t *to, lxb_dom_node_t *node)
@@ -347,7 +345,7 @@ cdef extern from "lexbor/css/css.h" nogil:
347
345
  lxb_css_parser_t * lxb_css_parser_create()
348
346
  lxb_status_t lxb_css_parser_init(lxb_css_parser_t *parser, lxb_css_syntax_tokenizer_t *tkz)
349
347
  lxb_css_parser_t * lxb_css_parser_destroy(lxb_css_parser_t *parser, bint self_destroy)
350
- lxb_css_memory_t * lxb_css_memory_destroy(lxb_css_memory_t *memory, bint self_destroy);
348
+ lxb_css_memory_t * lxb_css_memory_destroy(lxb_css_memory_t *memory, bint self_destroy)
351
349
  void lxb_css_selector_list_destroy_memory(lxb_css_selector_list_t *list)
352
350
 
353
351
 
@@ -558,8 +556,7 @@ cdef extern from "lexbor/selectors/selectors.h" nogil:
558
556
  ctypedef struct lxb_selectors_t
559
557
  ctypedef struct lxb_css_selector_list_t
560
558
  ctypedef struct lxb_css_selector_specificity_t
561
- ctypedef lxb_status_t (*lxb_selectors_cb_f)(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec,
562
- void *ctx)
559
+ ctypedef lxb_status_t (*lxb_selectors_cb_f)(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx)
563
560
  ctypedef enum lxb_selectors_opt_t:
564
561
  LXB_SELECTORS_OPT_DEFAULT = 0x00
565
562
  LXB_SELECTORS_OPT_MATCH_ROOT = 1 << 1
@@ -576,4 +573,4 @@ cdef extern from "lexbor/selectors/selectors.h" nogil:
576
573
  lxb_status_t lxb_selectors_init(lxb_selectors_t *selectors)
577
574
  lxb_selectors_t * lxb_selectors_destroy(lxb_selectors_t *selectors, bint self_destroy)
578
575
  lxb_status_t lxb_selectors_find(lxb_selectors_t *selectors, lxb_dom_node_t *root,
579
- lxb_css_selector_list_t *list, lxb_selectors_cb_f cb, void *ctx)
576
+ lxb_css_selector_list_t *list, lxb_selectors_cb_f cb, void *ctx)
selectolax/lexbor.pyi CHANGED
@@ -1,4 +1,4 @@
1
- from typing import Any, Iterator, Literal, TypeVar, NoReturn, overload, Optional
1
+ from typing import Any, Iterator, Literal, NoReturn, Optional, TypeVar, overload
2
2
 
3
3
  DefaultT = TypeVar("DefaultT")
4
4
 
@@ -384,6 +384,25 @@ class LexborNode:
384
384
  Note: by default, empty tags are ignored, use "delete_empty" to change this.
385
385
  """
386
386
  ...
387
+ def merge_text_nodes(self) -> None:
388
+ """Iterates over all text nodes and merges all text nodes that are close to each other.
389
+
390
+ This is useful for text extraction.
391
+ Use it when you need to strip HTML tags and merge "dangling" text.
392
+
393
+ Examples
394
+ --------
395
+
396
+ >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
397
+ >>> node = tree.css_first('div')
398
+ >>> tree.unwrap_tags(["strong"])
399
+ >>> tree.text(deep=True, separator=" ", strip=True)
400
+ "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
401
+ >>> node.merge_text_nodes()
402
+ >>> tree.text(deep=True, separator=" ", strip=True)
403
+ "John Doe"
404
+ """
405
+ ...
387
406
  def traverse(self, include_text: bool = False) -> Iterator[LexborNode]:
388
407
  """Iterate over all child and next nodes starting from the current level.
389
408
 
@@ -779,6 +798,25 @@ class LexborHTMLParser:
779
798
  """
780
799
  ...
781
800
  def css_matches(self, selector: str) -> bool: ...
801
+ def merge_text_nodes(self) -> None:
802
+ """Iterates over all text nodes and merges all text nodes that are close to each other.
803
+
804
+ This is useful for text extraction.
805
+ Use it when you need to strip HTML tags and merge "dangling" text.
806
+
807
+ Examples
808
+ --------
809
+
810
+ >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
811
+ >>> node = tree.css_first('div')
812
+ >>> tree.unwrap_tags(["strong"])
813
+ >>> tree.text(deep=True, separator=" ", strip=True)
814
+ "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
815
+ >>> node.merge_text_nodes()
816
+ >>> tree.text(deep=True, separator=" ", strip=True)
817
+ "John Doe"
818
+ """
819
+ ...
782
820
  def clone(self) -> LexborHTMLParser:
783
821
  """Clone the current tree."""
784
822
  ...
selectolax/lexbor.pyx CHANGED
@@ -1,4 +1,5 @@
1
- from cpython cimport bool
1
+ from cpython.bool cimport bool
2
+ from cpython.exc cimport PyErr_SetObject
2
3
 
3
4
  _ENCODING = 'UTF-8'
4
5
 
@@ -24,10 +25,8 @@ cdef class LexborHTMLParser:
24
25
  html : str (unicode) or bytes
25
26
  """
26
27
  def __init__(self, html):
27
-
28
28
  cdef size_t html_len
29
- cdef char* html_chars
30
-
29
+ cdef object bytes_html
31
30
  bytes_html, html_len = preprocess_input(html)
32
31
  self._parse_html(bytes_html, html_len)
33
32
  self.raw_html = bytes_html
@@ -39,22 +38,27 @@ cdef class LexborHTMLParser:
39
38
  self._selector = LexborCSSSelector()
40
39
  return self._selector
41
40
 
42
-
43
- cdef _parse_html(self, char *html, size_t html_len):
41
+ cdef int _parse_html(self, char *html, size_t html_len) except -1:
44
42
  cdef lxb_status_t status
45
43
 
46
44
  with nogil:
47
45
  self.document = lxb_html_document_create()
48
46
 
49
47
  if self.document == NULL:
50
- raise SelectolaxError("Failed to initialize object for HTML Document.")
48
+ PyErr_SetObject(SelectolaxError, "Failed to initialize object for HTML Document.")
49
+ return -1
51
50
 
52
51
  with nogil:
53
52
  status = lxb_html_document_parse(self.document, <lxb_char_t *> html, html_len)
53
+
54
54
  if status != 0x0000:
55
- raise SelectolaxError("Can't parse HTML.")
55
+ PyErr_SetObject(SelectolaxError, "Can't parse HTML.")
56
+ return -1
56
57
 
57
- assert self.document != NULL
58
+ if self.document == NULL:
59
+ PyErr_SetObject(RuntimeError, "document is NULL even after html was parsed correctly")
60
+ return -1
61
+ return 0
58
62
 
59
63
  def __dealloc__(self):
60
64
  if self.document != NULL:
@@ -68,7 +72,7 @@ cdef class LexborHTMLParser:
68
72
  """Returns root node."""
69
73
  if self.document == NULL:
70
74
  return None
71
- return LexborNode()._cinit(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
75
+ return LexborNode.new(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
72
76
 
73
77
  @property
74
78
  def body(self):
@@ -77,7 +81,7 @@ cdef class LexborHTMLParser:
77
81
  body = lxb_html_document_body_element_noi(self.document)
78
82
  if body == NULL:
79
83
  return None
80
- return LexborNode()._cinit(<lxb_dom_node_t *> body, self)
84
+ return LexborNode.new(<lxb_dom_node_t *> body, self)
81
85
 
82
86
  @property
83
87
  def head(self):
@@ -86,7 +90,7 @@ cdef class LexborHTMLParser:
86
90
  head = lxb_html_document_head_element_noi(self.document)
87
91
  if head == NULL:
88
92
  return None
89
- return LexborNode()._cinit(<lxb_dom_node_t *> head, self)
93
+ return LexborNode.new(<lxb_dom_node_t *> head, self)
90
94
 
91
95
  def tags(self, str name):
92
96
  """Returns a list of tags that match specified name.
@@ -122,7 +126,7 @@ cdef class LexborHTMLParser:
122
126
  raise SelectolaxError("Can't locate elements.")
123
127
 
124
128
  for i in range(lxb_dom_collection_length_noi(collection)):
125
- node = LexborNode()._cinit(
129
+ node = LexborNode.new(
126
130
  <lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i),
127
131
  self
128
132
  )
@@ -156,7 +160,7 @@ cdef class LexborHTMLParser:
156
160
  """Return HTML representation of the page."""
157
161
  if self.document == NULL:
158
162
  return None
159
- node = LexborNode()._cinit(<lxb_dom_node_t *> &self.document.dom_document, self)
163
+ node = LexborNode.new(<lxb_dom_node_t *> &self.document.dom_document, self)
160
164
  return node.html
161
165
 
162
166
  def css(self, str query):
@@ -238,7 +242,7 @@ cdef class LexborHTMLParser:
238
242
 
239
243
  for i in range(lxb_dom_collection_length_noi(collection)):
240
244
  if recursive:
241
- lxb_dom_node_destroy_deep( <lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
245
+ lxb_dom_node_destroy_deep(<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
242
246
  else:
243
247
  lxb_dom_node_destroy(<lxb_dom_node_t *> lxb_dom_collection_element_noi(collection, i))
244
248
  lxb_dom_collection_destroy(collection, <bint> True)
@@ -279,7 +283,6 @@ cdef class LexborHTMLParser:
279
283
  """
280
284
  return self.root.scripts_contain(query)
281
285
 
282
-
283
286
  def script_srcs_contain(self, tuple queries):
284
287
  """Returns True if any of the script SRCs attributes contain on of the specified text.
285
288
 
@@ -295,6 +298,26 @@ cdef class LexborHTMLParser:
295
298
  def css_matches(self, str selector):
296
299
  return self.root.css_matches(selector)
297
300
 
301
+ def merge_text_nodes(self):
302
+ """Iterates over all text nodes and merges all text nodes that are close to each other.
303
+
304
+ This is useful for text extraction.
305
+ Use it when you need to strip HTML tags and merge "dangling" text.
306
+
307
+ Examples
308
+ --------
309
+
310
+ >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
311
+ >>> node = tree.css_first('div')
312
+ >>> tree.unwrap_tags(["strong"])
313
+ >>> tree.text(deep=True, separator=" ", strip=True)
314
+ "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
315
+ >>> node.merge_text_nodes()
316
+ >>> tree.text(deep=True, separator=" ", strip=True)
317
+ "John Doe"
318
+ """
319
+ return self.root.merge_text_nodes()
320
+
298
321
  @staticmethod
299
322
  cdef LexborHTMLParser from_document(lxb_html_document_t *document, bytes raw_html):
300
323
  obj = <LexborHTMLParser> LexborHTMLParser.__new__(LexborHTMLParser)
@@ -309,6 +332,7 @@ cdef class LexborHTMLParser:
309
332
  """Clone the current tree."""
310
333
  cdef lxb_html_document_t* cloned_document
311
334
  cdef lxb_dom_node_t* cloned_node
335
+ cdef LexborHTMLParser cls
312
336
 
313
337
  with nogil:
314
338
  cloned_document = lxb_html_document_create()
@@ -333,6 +357,7 @@ cdef class LexborHTMLParser:
333
357
 
334
358
  cls = LexborHTMLParser.from_document(cloned_document, self.raw_html)
335
359
  return cls
360
+
336
361
  def unwrap_tags(self, list tags, delete_empty = False):
337
362
  """Unwraps specified tags from the HTML tree.
338
363
 
@@ -353,5 +378,6 @@ cdef class LexborHTMLParser:
353
378
  >>> tree.body.html
354
379
  '<body><div>Hello world!</div></body>'
355
380
  """
356
- if self.root is not None:
381
+ # faster to check if the document is empty which should determine if we have a root
382
+ if self.document != NULL:
357
383
  self.root.unwrap_tags(tags, delete_empty=delete_empty)
@@ -1,4 +1,5 @@
1
1
  cimport cython
2
+ from cpython.exc cimport PyErr_NoMemory
2
3
 
3
4
  from libc.stdlib cimport free
4
5
  from libc.stdlib cimport malloc
@@ -8,6 +9,7 @@ from libc.string cimport memcpy
8
9
  DEF _STACK_SIZE = 100
9
10
  DEF _ENCODING = 'UTF-8'
10
11
 
12
+
11
13
  @cython.final
12
14
  cdef class Stack:
13
15
  def __cinit__(self, size_t capacity=25):
@@ -23,9 +25,10 @@ cdef class Stack:
23
25
  cdef bint is_empty(self):
24
26
  return self.top <= 0
25
27
 
26
- cdef push(self, myhtml_tree_node_t* res):
28
+ cdef int push(self, myhtml_tree_node_t* res) except -1:
27
29
  if self.top >= self.capacity:
28
- self.resize()
30
+ if self.resize() < 0:
31
+ return -1
29
32
  self._stack[self.top] = res
30
33
  self.top += 1
31
34
 
@@ -33,10 +36,13 @@ cdef class Stack:
33
36
  self.top = self.top - 1
34
37
  return self._stack[self.top]
35
38
 
36
- cdef resize(self):
39
+ cdef int resize(self) except -1:
37
40
  self.capacity *= 2
38
41
  self._stack = <myhtml_tree_node_t**> realloc(<void*> self._stack, self.capacity * sizeof(myhtml_tree_node_t))
39
-
42
+ if self._stack == NULL:
43
+ PyErr_NoMemory()
44
+ return -1
45
+ return 0
40
46
 
41
47
  cdef class _Attributes:
42
48
  """A dict-like object that represents attributes."""
@@ -130,25 +136,24 @@ cdef class _Attributes:
130
136
  tag_name = c_text.decode(_ENCODING, 'ignore') if c_text != NULL else 'unknown'
131
137
  return "<%s attributes, %s items>" % (tag_name, len(self))
132
138
 
133
-
134
-
135
139
  ctypedef fused str_or_Node:
136
140
  str
137
141
  bytes
138
142
  Node
139
143
 
140
-
141
144
  cdef class Node:
142
145
  """A class that represents HTML node (element)."""
143
146
  cdef myhtml_tree_node_t *node
144
147
  cdef public HTMLParser parser
145
148
 
146
-
147
- cdef _init(self, myhtml_tree_node_t *node, HTMLParser parser):
148
- # custom init, because __cinit__ doesn't accept C types
149
- self.node = node
149
+ @staticmethod
150
+ cdef Node new(myhtml_tree_node_t *node, HTMLParser parser):
151
+ # custom __init__ for C, because __cinit__ doesn't accept C types
152
+ cdef Node cls = Node.__new__(Node)
153
+ cls.node = node
150
154
  # Keep reference to the selector object, so myhtml structures will not be garbage collected prematurely
151
- self.parser = parser
155
+ cls.parser = parser
156
+ return cls
152
157
 
153
158
  @property
154
159
  def attributes(self):
@@ -288,7 +293,7 @@ cdef class Node:
288
293
  cdef inline _text_deep(self, myhtml_tree_node_t *node, separator='', strip=False):
289
294
  text = ""
290
295
  cdef Stack stack = Stack(_STACK_SIZE)
291
- cdef myhtml_tree_node_t* current_node = NULL;
296
+ cdef myhtml_tree_node_t* current_node = NULL
292
297
 
293
298
  if node.tag_id == MyHTML_TAG__TEXT:
294
299
  c_text = myhtml_node_text(node, NULL)
@@ -341,12 +346,10 @@ cdef class Node:
341
346
  node = node.next
342
347
  continue
343
348
 
344
- next_node = Node()
345
- next_node._init(node, self.parser)
349
+ next_node = Node.new(node, self.parser)
346
350
  yield next_node
347
351
  node = node.next
348
352
 
349
-
350
353
  def traverse(self, include_text=False):
351
354
  """Iterate over all child and next nodes starting from the current level.
352
355
 
@@ -360,16 +363,15 @@ cdef class Node:
360
363
  node
361
364
  """
362
365
  cdef Stack stack = Stack(_STACK_SIZE)
363
- cdef myhtml_tree_node_t* current_node = NULL;
364
- cdef Node next_node;
366
+ cdef myhtml_tree_node_t* current_node = NULL
367
+ cdef Node next_node
365
368
 
366
369
  stack.push(self.node)
367
370
 
368
371
  while not stack.is_empty():
369
372
  current_node = stack.pop()
370
373
  if current_node != NULL and not (current_node.tag_id == MyHTML_TAG__TEXT and not include_text):
371
- next_node = Node()
372
- next_node._init(current_node, self.parser)
374
+ next_node = Node.new(current_node, self.parser)
373
375
  yield next_node
374
376
 
375
377
  if current_node.next is not NULL:
@@ -398,8 +400,7 @@ cdef class Node:
398
400
  """Return the child node."""
399
401
  cdef Node node
400
402
  if self.node.child:
401
- node = Node()
402
- node._init(self.node.child, self.parser)
403
+ node = Node.new(self.node.child, self.parser)
403
404
  return node
404
405
  return None
405
406
 
@@ -408,8 +409,7 @@ cdef class Node:
408
409
  """Return the parent node."""
409
410
  cdef Node node
410
411
  if self.node.parent:
411
- node = Node()
412
- node._init(self.node.parent, self.parser)
412
+ node = Node.new(self.node.parent, self.parser)
413
413
  return node
414
414
  return None
415
415
 
@@ -418,8 +418,7 @@ cdef class Node:
418
418
  """Return next node."""
419
419
  cdef Node node
420
420
  if self.node.next:
421
- node = Node()
422
- node._init(self.node.next, self.parser)
421
+ node = Node.new(self.node.next, self.parser)
423
422
  return node
424
423
  return None
425
424
 
@@ -428,8 +427,7 @@ cdef class Node:
428
427
  """Return previous node."""
429
428
  cdef Node node
430
429
  if self.node.prev:
431
- node = Node()
432
- node._init(self.node.prev, self.parser)
430
+ node = Node.new(self.node.prev, self.parser)
433
431
  return node
434
432
  return None
435
433
 
@@ -438,8 +436,7 @@ cdef class Node:
438
436
  """Return last child node."""
439
437
  cdef Node node
440
438
  if self.node.last_child:
441
- node = Node()
442
- node._init(self.node.last_child, self.parser)
439
+ node = Node.new(self.node.last_child, self.parser)
443
440
  return node
444
441
  return None
445
442
 
@@ -539,8 +536,8 @@ cdef class Node:
539
536
  if delete_empty:
540
537
  myhtml_node_delete(self.node)
541
538
  return
542
- cdef myhtml_tree_node_t* next_node;
543
- cdef myhtml_tree_node_t* current_node;
539
+ cdef myhtml_tree_node_t* next_node
540
+ cdef myhtml_tree_node_t* current_node
544
541
 
545
542
  if self.node.child.next != NULL:
546
543
  current_node = self.node.child
@@ -574,6 +571,8 @@ cdef class Node:
574
571
  '<html><body><div>Hello world!</div></body></html>'
575
572
 
576
573
  """
574
+ # ensure cython can recast element to a Node so that decompose will be called sooner.
575
+ cdef Node element
577
576
  for tag in tags:
578
577
  for element in self.css(tag):
579
578
  element.decompose(recursive=recursive)
@@ -600,7 +599,7 @@ cdef class Node:
600
599
 
601
600
  Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
602
601
  """
603
-
602
+ cdef Node element
604
603
  for tag in tags:
605
604
  for element in self.css(tag):
606
605
  element.unwrap(delete_empty)
@@ -788,7 +787,7 @@ cdef class Node:
788
787
 
789
788
  Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
790
789
  """
791
-
790
+ cdef Node element
792
791
  for tag in tags:
793
792
  for element in self.css(tag):
794
793
  element.unwrap(delete_empty)
@@ -847,6 +846,7 @@ cdef class Node:
847
846
  The query to check.
848
847
 
849
848
  """
849
+ cdef Node node
850
850
  if self.parser.cached_script_texts is None:
851
851
  nodes = find_nodes(self.parser, self.node, 'script')
852
852
  text_nodes = []
@@ -895,6 +895,7 @@ cdef class Node:
895
895
  if not isinstance(other, Node):
896
896
  return False
897
897
  return self.html == other.html
898
+
898
899
  @property
899
900
  def text_content(self):
900
901
  """Returns the text of the node if it is a text node.
@@ -948,8 +949,8 @@ cdef class Node:
948
949
  while not stack.is_empty():
949
950
  current_node = stack.pop()
950
951
 
951
- if current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and \
952
- current_node.prev.tag_id == MyHTML_TAG__TEXT:
952
+ if (current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and
953
+ current_node.prev.tag_id == MyHTML_TAG__TEXT):
953
954
  left_text = myhtml_node_text(current_node.prev, &left_length)
954
955
  right_text = myhtml_node_text(current_node, &right_length)
955
956
  if left_text and right_text: