selectolax 0.3.31__cp311-cp311-win_amd64.whl → 0.3.33__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of selectolax might be problematic. Click here for more details.

Binary file
selectolax/lexbor.pxd CHANGED
@@ -1,4 +1,4 @@
1
- from libc.stdint cimport uint32_t, uint8_t, uintptr_t
1
+ from libc.stdint cimport uint8_t, uint32_t, uintptr_t
2
2
 
3
3
 
4
4
  cdef extern from "lexbor/core/core.h" nogil:
@@ -31,7 +31,6 @@ cdef extern from "lexbor/core/core.h" nogil:
31
31
  lexbor_str_t* lexbor_str_create()
32
32
  lxb_char_t * lexbor_str_data_noi(lexbor_str_t *str)
33
33
 
34
-
35
34
  cdef extern from "lexbor/html/html.h" nogil:
36
35
  ctypedef unsigned int lxb_html_document_opt_t
37
36
 
@@ -54,14 +53,12 @@ cdef extern from "lexbor/html/html.h" nogil:
54
53
  void *events
55
54
 
56
55
  ctypedef struct lexbor_str_t:
57
- lxb_char_t *data;
58
- size_t length;
59
-
56
+ lxb_char_t *data
57
+ size_t length
60
58
 
61
59
  ctypedef struct lxb_dom_node_t:
62
60
  lxb_dom_event_target_t event_target
63
61
 
64
-
65
62
  uintptr_t local_name
66
63
  uintptr_t prefix
67
64
  uintptr_t ns
@@ -77,7 +74,6 @@ cdef extern from "lexbor/html/html.h" nogil:
77
74
 
78
75
  lxb_dom_node_type_t type
79
76
 
80
-
81
77
  ctypedef struct lxb_dom_document_t:
82
78
  lxb_dom_node_t node
83
79
 
@@ -104,7 +100,6 @@ cdef extern from "lexbor/html/html.h" nogil:
104
100
 
105
101
  bint scripting
106
102
 
107
-
108
103
  ctypedef struct lxb_html_document_t:
109
104
  lxb_dom_document_t dom_document
110
105
 
@@ -128,7 +123,6 @@ cdef extern from "lexbor/html/html.h" nogil:
128
123
  LXB_HTML_PARSER_STATE_FRAGMENT_PROCESS = 0x03
129
124
  LXB_HTML_PARSER_STATE_ERROR = 0x04
130
125
 
131
-
132
126
  ctypedef enum lxb_dom_node_type_t:
133
127
  LXB_DOM_NODE_TYPE_ELEMENT = 0x01
134
128
  LXB_DOM_NODE_TYPE_ATTRIBUTE = 0x02
@@ -175,10 +169,9 @@ cdef extern from "lexbor/html/html.h" nogil:
175
169
  size_t length
176
170
  size_t struct_size
177
171
 
178
-
179
172
  ctypedef struct lxb_html_tree_pending_table_t
180
- ctypedef bint lxb_html_tree_insertion_mode_f;
181
- ctypedef lxb_status_t lxb_html_tree_append_attr_f;
173
+ ctypedef bint lxb_html_tree_insertion_mode_f
174
+ ctypedef lxb_status_t lxb_html_tree_append_attr_f
182
175
 
183
176
  ctypedef struct lxb_html_tree_t:
184
177
 
@@ -189,13 +182,13 @@ cdef extern from "lexbor/html/html.h" nogil:
189
182
 
190
183
  lxb_html_form_element_t *form
191
184
 
192
- lexbor_array_t *open_elements;
193
- lexbor_array_t *active_formatting;
194
- lexbor_array_obj_t *template_insertion_modes;
185
+ lexbor_array_t *open_elements
186
+ lexbor_array_t *active_formatting
187
+ lexbor_array_obj_t *template_insertion_modes
195
188
 
196
- lxb_html_tree_pending_table_t *pending_table;
189
+ lxb_html_tree_pending_table_t *pending_table
197
190
 
198
- lexbor_array_obj_t *parse_errors;
191
+ lexbor_array_obj_t *parse_errors
199
192
 
200
193
  bint foster_parenting
201
194
  bint frameset_ok
@@ -232,9 +225,13 @@ cdef extern from "lexbor/html/html.h" nogil:
232
225
  lxb_status_t lxb_html_serialize_tree_str(lxb_dom_node_t *node, lexbor_str_t *str)
233
226
 
234
227
  cdef class LexborNode:
235
- cdef lxb_dom_node_t *node
236
- cdef public LexborHTMLParser parser
237
- cdef _cinit(self, lxb_dom_node_t *node, LexborHTMLParser parser)
228
+ cdef:
229
+ lxb_dom_node_t *node
230
+ public LexborHTMLParser parser
231
+
232
+ @staticmethod
233
+ cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser)
234
+
238
235
 
239
236
  cdef class LexborCSSSelector:
240
237
  cdef lxb_css_parser_t* parser
@@ -242,15 +239,15 @@ cdef class LexborCSSSelector:
242
239
  cdef lxb_css_selectors_t * css_selectors
243
240
  cdef public list results
244
241
  cdef public LexborNode current_node
245
- cdef _create_css_parser(self)
246
- cpdef find(self, str query, LexborNode node)
247
- cpdef any_matches(self, str query, LexborNode node)
242
+ cdef int _create_css_parser(self) except -1
243
+ cpdef list find(self, str query, LexborNode node)
244
+ cpdef int any_matches(self, str query, LexborNode node) except -1
248
245
 
249
246
  cdef class LexborHTMLParser:
250
247
  cdef lxb_html_document_t *document
251
248
  cdef public bytes raw_html
252
249
  cdef LexborCSSSelector _selector
253
- cdef _parse_html(self, char* html, size_t html_len)
250
+ cdef int _parse_html(self, char* html, size_t html_len) except -1
254
251
  cdef object cached_script_texts
255
252
  cdef object cached_script_srcs
256
253
 
@@ -267,8 +264,8 @@ cdef extern from "lexbor/dom/dom.h" nogil:
267
264
  ctypedef lexbor_action_t (*lxb_dom_node_simple_walker_f)(lxb_dom_node_t *node, void *ctx)
268
265
 
269
266
  ctypedef struct lxb_dom_character_data_t:
270
- lxb_dom_node_t node;
271
- lexbor_str_t data;
267
+ lxb_dom_node_t node
268
+ lexbor_str_t data
272
269
 
273
270
  ctypedef struct lxb_dom_text_t:
274
271
  lxb_dom_character_data_t char_data
@@ -289,19 +286,20 @@ cdef extern from "lexbor/dom/dom.h" nogil:
289
286
  lxb_dom_element_t *owner
290
287
 
291
288
  lxb_dom_attr_t *next
292
- lxb_dom_attr_t *prev;
293
-
289
+ lxb_dom_attr_t *prev
294
290
 
295
291
  lxb_dom_collection_t * lxb_dom_collection_make(lxb_dom_document_t *document, size_t start_list_size)
296
292
  lxb_char_t * lxb_dom_node_text_content(lxb_dom_node_t *node, size_t *len)
293
+ lxb_status_t lxb_dom_node_text_content_set(lxb_dom_node_t *node, const lxb_char_t *content, size_t len)
294
+ void lxb_dom_node_remove(lxb_dom_node_t *node)
297
295
  void * lxb_dom_document_destroy_text_noi(lxb_dom_document_t *document, lxb_char_t *text)
298
- lxb_dom_node_t * lxb_dom_document_root(lxb_dom_document_t *document)
296
+ lxb_dom_node_t * lxb_dom_document_root(lxb_dom_document_t *document)
299
297
  lxb_char_t * lxb_dom_element_qualified_name(lxb_dom_element_t *element, size_t *len)
300
298
  lxb_dom_node_t * lxb_dom_node_destroy(lxb_dom_node_t *node)
301
299
  lxb_dom_node_t * lxb_dom_node_destroy_deep(lxb_dom_node_t *root)
302
300
  lxb_dom_attr_t * lxb_dom_element_first_attribute_noi(lxb_dom_element_t *element)
303
301
 
304
- const lxb_char_t * lxb_dom_attr_local_name_noi(lxb_dom_attr_t *attr, size_t *len);
302
+ const lxb_char_t * lxb_dom_attr_local_name_noi(lxb_dom_attr_t *attr, size_t *len)
305
303
  const lxb_char_t * lxb_dom_attr_value_noi(lxb_dom_attr_t *attr, size_t *len)
306
304
 
307
305
  lxb_dom_attr_t * lxb_dom_element_set_attribute(lxb_dom_element_t *element,
@@ -314,7 +312,7 @@ cdef extern from "lexbor/dom/dom.h" nogil:
314
312
  lxb_tag_id_t lxb_dom_node_tag_id_noi(lxb_dom_node_t *node)
315
313
  lxb_dom_node_t * lxb_dom_document_import_node(lxb_dom_document_t *doc, lxb_dom_node_t *node, bint deep)
316
314
  void lxb_dom_node_insert_after(lxb_dom_node_t *to, lxb_dom_node_t *node)
317
- lxb_status_t lxb_dom_node_replace_all(lxb_dom_node_t *parent, lxb_dom_node_t *node);
315
+ lxb_status_t lxb_dom_node_replace_all(lxb_dom_node_t *parent, lxb_dom_node_t *node)
318
316
  void lxb_dom_node_insert_child(lxb_dom_node_t *to, lxb_dom_node_t *node)
319
317
  void lxb_dom_node_insert_before(lxb_dom_node_t *to, lxb_dom_node_t *node)
320
318
  void lxb_dom_node_insert_after(lxb_dom_node_t *to, lxb_dom_node_t *node)
@@ -347,7 +345,7 @@ cdef extern from "lexbor/css/css.h" nogil:
347
345
  lxb_css_parser_t * lxb_css_parser_create()
348
346
  lxb_status_t lxb_css_parser_init(lxb_css_parser_t *parser, lxb_css_syntax_tokenizer_t *tkz)
349
347
  lxb_css_parser_t * lxb_css_parser_destroy(lxb_css_parser_t *parser, bint self_destroy)
350
- lxb_css_memory_t * lxb_css_memory_destroy(lxb_css_memory_t *memory, bint self_destroy);
348
+ lxb_css_memory_t * lxb_css_memory_destroy(lxb_css_memory_t *memory, bint self_destroy)
351
349
  void lxb_css_selector_list_destroy_memory(lxb_css_selector_list_t *list)
352
350
 
353
351
 
@@ -558,8 +556,7 @@ cdef extern from "lexbor/selectors/selectors.h" nogil:
558
556
  ctypedef struct lxb_selectors_t
559
557
  ctypedef struct lxb_css_selector_list_t
560
558
  ctypedef struct lxb_css_selector_specificity_t
561
- ctypedef lxb_status_t (*lxb_selectors_cb_f)(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec,
562
- void *ctx)
559
+ ctypedef lxb_status_t (*lxb_selectors_cb_f)(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx)
563
560
  ctypedef enum lxb_selectors_opt_t:
564
561
  LXB_SELECTORS_OPT_DEFAULT = 0x00
565
562
  LXB_SELECTORS_OPT_MATCH_ROOT = 1 << 1
@@ -576,4 +573,4 @@ cdef extern from "lexbor/selectors/selectors.h" nogil:
576
573
  lxb_status_t lxb_selectors_init(lxb_selectors_t *selectors)
577
574
  lxb_selectors_t * lxb_selectors_destroy(lxb_selectors_t *selectors, bint self_destroy)
578
575
  lxb_status_t lxb_selectors_find(lxb_selectors_t *selectors, lxb_dom_node_t *root,
579
- lxb_css_selector_list_t *list, lxb_selectors_cb_f cb, void *ctx)
576
+ lxb_css_selector_list_t *list, lxb_selectors_cb_f cb, void *ctx)
selectolax/lexbor.pyi CHANGED
@@ -1,4 +1,4 @@
1
- from typing import Any, Iterator, Literal, TypeVar, NoReturn, overload, Optional
1
+ from typing import Any, Iterator, Literal, NoReturn, Optional, TypeVar, overload
2
2
 
3
3
  DefaultT = TypeVar("DefaultT")
4
4
 
@@ -158,11 +158,45 @@ class LexborNode:
158
158
  @overload
159
159
  def css_first(
160
160
  self, query: str, default: Any = ..., strict: Literal[True] = ...
161
- ) -> LexborNode: ...
161
+ ) -> LexborNode:
162
+ """Same as `css` but returns only the first match.
163
+
164
+ Parameters
165
+ ----------
166
+
167
+ query : str
168
+ default : Any, default None
169
+ Default value to return if there is no match.
170
+ strict: bool, default True
171
+ Set to True if you want to check if there is strictly only one match in the document.
172
+
173
+
174
+ Returns
175
+ -------
176
+ selector : `LexborNode` object
177
+ """
178
+ ...
162
179
  @overload
163
180
  def css_first(
164
181
  self, query: str, default: DefaultT, strict: bool = False
165
- ) -> LexborNode | DefaultT: ...
182
+ ) -> LexborNode | DefaultT:
183
+ """Same as `css` but returns only the first match.
184
+
185
+ Parameters
186
+ ----------
187
+
188
+ query : str
189
+ default : DefaultT
190
+ Default value to return if there is no match.
191
+ strict: bool, default False
192
+ Set to True if you want to check if there is strictly only one match in the document.
193
+
194
+
195
+ Returns
196
+ -------
197
+ selector : `LexborNode` object
198
+ """
199
+ ...
166
200
  @overload
167
201
  def css_first(
168
202
  self, query: str, default: None = ..., strict: bool = False
@@ -350,6 +384,25 @@ class LexborNode:
350
384
  Note: by default, empty tags are ignored, use "delete_empty" to change this.
351
385
  """
352
386
  ...
387
+ def merge_text_nodes(self) -> None:
388
+ """Iterates over all text nodes and merges all text nodes that are close to each other.
389
+
390
+ This is useful for text extraction.
391
+ Use it when you need to strip HTML tags and merge "dangling" text.
392
+
393
+ Examples
394
+ --------
395
+
396
+ >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
397
+ >>> node = tree.css_first('div')
398
+ >>> tree.unwrap_tags(["strong"])
399
+ >>> tree.text(deep=True, separator=" ", strip=True)
400
+ "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
401
+ >>> node.merge_text_nodes()
402
+ >>> tree.text(deep=True, separator=" ", strip=True)
403
+ "John Doe"
404
+ """
405
+ ...
353
406
  def traverse(self, include_text: bool = False) -> Iterator[LexborNode]:
354
407
  """Iterate over all child and next nodes starting from the current level.
355
408
 
@@ -625,11 +678,45 @@ class LexborHTMLParser:
625
678
  @overload
626
679
  def css_first(
627
680
  self, query: str, default: Any = ..., strict: Literal[True] = ...
628
- ) -> LexborNode: ...
681
+ ) -> LexborNode:
682
+ """Same as `css` but returns only the first match.
683
+
684
+ Parameters
685
+ ----------
686
+
687
+ query : str
688
+ default : Any, default None
689
+ Default value to return if there is no match.
690
+ strict: bool, default True
691
+ Set to True if you want to check if there is strictly only one match in the document.
692
+
693
+
694
+ Returns
695
+ -------
696
+ selector : `LexborNode` object
697
+ """
698
+ ...
629
699
  @overload
630
700
  def css_first(
631
701
  self, query: str, default: DefaultT, strict: bool = False
632
- ) -> LexborNode | DefaultT: ...
702
+ ) -> LexborNode | DefaultT:
703
+ """Same as `css` but returns only the first match.
704
+
705
+ Parameters
706
+ ----------
707
+
708
+ query : str
709
+ default : bool, default None
710
+ Default value to return if there is no match.
711
+ strict: bool, default True
712
+ Set to True if you want to check if there is strictly only one match in the document.
713
+
714
+
715
+ Returns
716
+ -------
717
+ selector : `LexborNode` object
718
+ """
719
+ ...
633
720
  @overload
634
721
  def css_first(
635
722
  self, query: str, default: None = ..., strict: bool = False
@@ -711,6 +798,25 @@ class LexborHTMLParser:
711
798
  """
712
799
  ...
713
800
  def css_matches(self, selector: str) -> bool: ...
801
+ def merge_text_nodes(self) -> None:
802
+ """Iterates over all text nodes and merges all text nodes that are close to each other.
803
+
804
+ This is useful for text extraction.
805
+ Use it when you need to strip HTML tags and merge "dangling" text.
806
+
807
+ Examples
808
+ --------
809
+
810
+ >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
811
+ >>> node = tree.css_first('div')
812
+ >>> tree.unwrap_tags(["strong"])
813
+ >>> tree.text(deep=True, separator=" ", strip=True)
814
+ "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
815
+ >>> node.merge_text_nodes()
816
+ >>> tree.text(deep=True, separator=" ", strip=True)
817
+ "John Doe"
818
+ """
819
+ ...
714
820
  def clone(self) -> LexborHTMLParser:
715
821
  """Clone the current tree."""
716
822
  ...
selectolax/lexbor.pyx CHANGED
@@ -1,4 +1,5 @@
1
- from cpython cimport bool
1
+ from cpython.bool cimport bool
2
+ from cpython.exc cimport PyErr_SetObject
2
3
 
3
4
  _ENCODING = 'UTF-8'
4
5
 
@@ -24,10 +25,8 @@ cdef class LexborHTMLParser:
24
25
  html : str (unicode) or bytes
25
26
  """
26
27
  def __init__(self, html):
27
-
28
28
  cdef size_t html_len
29
- cdef char* html_chars
30
-
29
+ cdef object bytes_html
31
30
  bytes_html, html_len = preprocess_input(html)
32
31
  self._parse_html(bytes_html, html_len)
33
32
  self.raw_html = bytes_html
@@ -39,22 +38,27 @@ cdef class LexborHTMLParser:
39
38
  self._selector = LexborCSSSelector()
40
39
  return self._selector
41
40
 
42
-
43
- cdef _parse_html(self, char *html, size_t html_len):
41
+ cdef int _parse_html(self, char *html, size_t html_len) except -1:
44
42
  cdef lxb_status_t status
45
43
 
46
44
  with nogil:
47
45
  self.document = lxb_html_document_create()
48
46
 
49
47
  if self.document == NULL:
50
- raise SelectolaxError("Failed to initialize object for HTML Document.")
48
+ PyErr_SetObject(SelectolaxError, "Failed to initialize object for HTML Document.")
49
+ return -1
51
50
 
52
51
  with nogil:
53
52
  status = lxb_html_document_parse(self.document, <lxb_char_t *> html, html_len)
53
+
54
54
  if status != 0x0000:
55
- raise SelectolaxError("Can't parse HTML.")
55
+ PyErr_SetObject(SelectolaxError, "Can't parse HTML.")
56
+ return -1
56
57
 
57
- assert self.document != NULL
58
+ if self.document == NULL:
59
+ PyErr_SetObject(RuntimeError, "document is NULL even after html was parsed correctly")
60
+ return -1
61
+ return 0
58
62
 
59
63
  def __dealloc__(self):
60
64
  if self.document != NULL:
@@ -68,7 +72,7 @@ cdef class LexborHTMLParser:
68
72
  """Returns root node."""
69
73
  if self.document == NULL:
70
74
  return None
71
- return LexborNode()._cinit(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
75
+ return LexborNode.new(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
72
76
 
73
77
  @property
74
78
  def body(self):
@@ -77,7 +81,7 @@ cdef class LexborHTMLParser:
77
81
  body = lxb_html_document_body_element_noi(self.document)
78
82
  if body == NULL:
79
83
  return None
80
- return LexborNode()._cinit(<lxb_dom_node_t *> body, self)
84
+ return LexborNode.new(<lxb_dom_node_t *> body, self)
81
85
 
82
86
  @property
83
87
  def head(self):
@@ -86,7 +90,7 @@ cdef class LexborHTMLParser:
86
90
  head = lxb_html_document_head_element_noi(self.document)
87
91
  if head == NULL:
88
92
  return None
89
- return LexborNode()._cinit(<lxb_dom_node_t *> head, self)
93
+ return LexborNode.new(<lxb_dom_node_t *> head, self)
90
94
 
91
95
  def tags(self, str name):
92
96
  """Returns a list of tags that match specified name.
@@ -122,7 +126,7 @@ cdef class LexborHTMLParser:
122
126
  raise SelectolaxError("Can't locate elements.")
123
127
 
124
128
  for i in range(lxb_dom_collection_length_noi(collection)):
125
- node = LexborNode()._cinit(
129
+ node = LexborNode.new(
126
130
  <lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i),
127
131
  self
128
132
  )
@@ -156,7 +160,7 @@ cdef class LexborHTMLParser:
156
160
  """Return HTML representation of the page."""
157
161
  if self.document == NULL:
158
162
  return None
159
- node = LexborNode()._cinit(<lxb_dom_node_t *> &self.document.dom_document, self)
163
+ node = LexborNode.new(<lxb_dom_node_t *> &self.document.dom_document, self)
160
164
  return node.html
161
165
 
162
166
  def css(self, str query):
@@ -238,7 +242,7 @@ cdef class LexborHTMLParser:
238
242
 
239
243
  for i in range(lxb_dom_collection_length_noi(collection)):
240
244
  if recursive:
241
- lxb_dom_node_destroy_deep( <lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
245
+ lxb_dom_node_destroy_deep(<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
242
246
  else:
243
247
  lxb_dom_node_destroy(<lxb_dom_node_t *> lxb_dom_collection_element_noi(collection, i))
244
248
  lxb_dom_collection_destroy(collection, <bint> True)
@@ -279,7 +283,6 @@ cdef class LexborHTMLParser:
279
283
  """
280
284
  return self.root.scripts_contain(query)
281
285
 
282
-
283
286
  def script_srcs_contain(self, tuple queries):
284
287
  """Returns True if any of the script SRCs attributes contain one of the specified text.
285
288
 
@@ -295,6 +298,26 @@ cdef class LexborHTMLParser:
295
298
  def css_matches(self, str selector):
296
299
  return self.root.css_matches(selector)
297
300
 
301
+ def merge_text_nodes(self):
302
+ """Iterates over all text nodes and merges all text nodes that are close to each other.
303
+
304
+ This is useful for text extraction.
305
+ Use it when you need to strip HTML tags and merge "dangling" text.
306
+
307
+ Examples
308
+ --------
309
+
310
+ >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
311
+ >>> node = tree.css_first('div')
312
+ >>> tree.unwrap_tags(["strong"])
313
+ >>> tree.text(deep=True, separator=" ", strip=True)
314
+ "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
315
+ >>> node.merge_text_nodes()
316
+ >>> tree.text(deep=True, separator=" ", strip=True)
317
+ "John Doe"
318
+ """
319
+ return self.root.merge_text_nodes()
320
+
298
321
  @staticmethod
299
322
  cdef LexborHTMLParser from_document(lxb_html_document_t *document, bytes raw_html):
300
323
  obj = <LexborHTMLParser> LexborHTMLParser.__new__(LexborHTMLParser)
@@ -309,6 +332,7 @@ cdef class LexborHTMLParser:
309
332
  """Clone the current tree."""
310
333
  cdef lxb_html_document_t* cloned_document
311
334
  cdef lxb_dom_node_t* cloned_node
335
+ cdef LexborHTMLParser cls
312
336
 
313
337
  with nogil:
314
338
  cloned_document = lxb_html_document_create()
@@ -333,6 +357,7 @@ cdef class LexborHTMLParser:
333
357
 
334
358
  cls = LexborHTMLParser.from_document(cloned_document, self.raw_html)
335
359
  return cls
360
+
336
361
  def unwrap_tags(self, list tags, delete_empty = False):
337
362
  """Unwraps specified tags from the HTML tree.
338
363
 
@@ -353,5 +378,6 @@ cdef class LexborHTMLParser:
353
378
  >>> tree.body.html
354
379
  '<body><div>Hello world!</div></body>'
355
380
  """
356
- if self.root is not None:
381
+ # Checking the document pointer for NULL is cheaper than building the root node just to test whether one exists.
382
+ if self.document != NULL:
357
383
  self.root.unwrap_tags(tags, delete_empty=delete_empty)