selectolax 0.3.28__cp39-cp39-musllinux_1_2_x86_64.whl → 0.4.0__cp39-cp39-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of selectolax might be problematic. Click here for more details.

selectolax/__init__.py CHANGED
@@ -2,9 +2,7 @@
2
2
 
3
3
 
4
4
  __author__ = """Artem Golubin"""
5
- __email__ = 'me@rushter.com'
6
- __version__ = '0.3.28'
5
+ __email__ = "me@rushter.com"
6
+ __version__ = "0.4.0"
7
7
 
8
- from . import parser
9
- from . import lexbor
10
- from . import modest
8
+ from . import lexbor, modest, parser
@@ -1,5 +1,6 @@
1
1
  cimport cython
2
2
 
3
+
3
4
  @cython.final
4
5
  cdef class LexborAttributes:
5
6
  """A dict-like object that represents attributes."""
@@ -23,16 +24,32 @@ cdef class LexborAttributes:
23
24
  yield key.decode(_ENCODING)
24
25
  attr = attr.next
25
26
 
26
- def __setitem__(self, str key, value):
27
- value = str(value)
27
+ def __setitem__(self, str key, object value):
28
+ value = value
28
29
  bytes_key = key.encode(_ENCODING)
29
- bytes_value = value.encode(_ENCODING)
30
-
31
- lxb_dom_element_set_attribute(
32
- <lxb_dom_element_t *> self.node,
33
- <lxb_char_t *> bytes_key, len(bytes_key),
34
- <lxb_char_t *> bytes_value, len(bytes_value),
35
- )
30
+ bytes_value = value.encode(_ENCODING) if value else b""
31
+ cdef lxb_dom_attr_t *attr
32
+ cdef lxb_dom_document_t *doc
33
+
34
+ if value is None:
35
+ # N.B. This is suboptimal, but there is no API to set empty attributes
36
+ attr = lxb_dom_element_set_attribute(
37
+ <lxb_dom_element_t *> self.node,
38
+ <lxb_char_t *> bytes_key, len(bytes_key),
39
+ NULL, 0
40
+ )
41
+ doc = (<lxb_dom_node_t*>attr).owner_document
42
+ lexbor_str_destroy(attr.value, doc.text, 0)
43
+ attr.value = NULL
44
+
45
+ elif isinstance(value, str) or isinstance(value, unicode) :
46
+ lxb_dom_element_set_attribute(
47
+ <lxb_dom_element_t *> self.node,
48
+ <lxb_char_t *> bytes_key, len(bytes_key),
49
+ <lxb_char_t *> bytes_value, len(bytes_value),
50
+ )
51
+ else:
52
+ raise TypeError("Expected str or unicode, got %s" % type(value))
36
53
 
37
54
  def __delitem__(self, key):
38
55
  try:
@@ -1,31 +1,43 @@
1
1
  cimport cython
2
+ from cpython.exc cimport PyErr_SetNone
3
+
4
+ import logging
5
+
6
+ logger = logging.getLogger("selectolax")
2
7
 
3
8
  _TAG_TO_NAME = {
4
- 0x0005: "- doctype",
9
+ 0x0005: "-doctype",
5
10
  0x0002: "-text",
6
11
  0x0004: "-comment",
7
12
  }
8
13
  ctypedef fused str_or_LexborNode:
9
- basestring
14
+ str
10
15
  bytes
11
16
  LexborNode
12
17
 
18
+ ctypedef fused str_or_bytes:
19
+ str
20
+ bytes
21
+
13
22
  cdef inline bytes to_bytes(str_or_LexborNode value):
14
23
  cdef bytes bytes_val
15
- if isinstance(value, (str, unicode)):
16
- bytes_val = value.encode(_ENCODING)
24
+ if isinstance(value, unicode):
25
+ bytes_val = <bytes>value.encode("utf-8")
17
26
  elif isinstance(value, bytes):
18
- bytes_val = <char*> value
27
+ bytes_val = <bytes>value
19
28
  return bytes_val
20
29
 
30
+
21
31
  @cython.final
22
32
  cdef class LexborNode:
23
33
  """A class that represents HTML node (element)."""
24
34
 
25
- cdef _cinit(self, lxb_dom_node_t *node, LexborHTMLParser parser):
26
- self.parser = parser
27
- self.node = node
28
- return self
35
+ @staticmethod
36
+ cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser):
37
+ cdef LexborNode lxbnode = LexborNode.__new__(LexborNode)
38
+ lxbnode.node = node
39
+ lxbnode.parser = parser
40
+ return lxbnode
29
41
 
30
42
  @property
31
43
  def mem_id(self):
@@ -33,7 +45,10 @@ cdef class LexborNode:
33
45
 
34
46
  @property
35
47
  def child(self):
36
- """Alias for the `first_child` property."""
48
+ """Alias for the `first_child` property.
49
+
50
+ **Deprecated**. Please use `first_child` instead.
51
+ """
37
52
  return self.first_child
38
53
 
39
54
  @property
@@ -41,8 +56,7 @@ cdef class LexborNode:
41
56
  """Return the first child node."""
42
57
  cdef LexborNode node
43
58
  if self.node.first_child:
44
- node = LexborNode()
45
- node._cinit(<lxb_dom_node_t *> self.node.first_child, self.parser)
59
+ node = LexborNode.new(<lxb_dom_node_t *> self.node.first_child, self.parser)
46
60
  return node
47
61
  return None
48
62
 
@@ -50,9 +64,8 @@ cdef class LexborNode:
50
64
  def parent(self):
51
65
  """Return the parent node."""
52
66
  cdef LexborNode node
53
- if self.node.parent:
54
- node = LexborNode()
55
- node._cinit(<lxb_dom_node_t *> self.node.parent, self.parser)
67
+ if self.node.parent != NULL:
68
+ node = LexborNode.new(<lxb_dom_node_t *> self.node.parent, self.parser)
56
69
  return node
57
70
  return None
58
71
 
@@ -60,9 +73,8 @@ cdef class LexborNode:
60
73
  def next(self):
61
74
  """Return next node."""
62
75
  cdef LexborNode node
63
- if self.node.next:
64
- node = LexborNode()
65
- node._cinit(<lxb_dom_node_t *> self.node.next, self.parser)
76
+ if self.node.next != NULL:
77
+ node = LexborNode.new(<lxb_dom_node_t *> self.node.next, self.parser)
66
78
  return node
67
79
  return None
68
80
 
@@ -70,9 +82,8 @@ cdef class LexborNode:
70
82
  def prev(self):
71
83
  """Return previous node."""
72
84
  cdef LexborNode node
73
- if self.node.prev:
74
- node = LexborNode()
75
- node._cinit(<lxb_dom_node_t *> self.node.prev, self.parser)
85
+ if self.node.prev != NULL:
86
+ node = LexborNode.new(<lxb_dom_node_t *> self.node.prev, self.parser)
76
87
  return node
77
88
  return None
78
89
 
@@ -80,9 +91,8 @@ cdef class LexborNode:
80
91
  def last_child(self):
81
92
  """Return last child node."""
82
93
  cdef LexborNode node
83
- if self.node.last_child:
84
- node = LexborNode()
85
- node._cinit(<lxb_dom_node_t *> self.node.last_child, self.parser)
94
+ if self.node.last_child != NULL:
95
+ node = LexborNode.new(<lxb_dom_node_t *> self.node.last_child, self.parser)
86
96
  return node
87
97
  return None
88
98
 
@@ -181,6 +191,12 @@ cdef class LexborNode:
181
191
  Matches pattern `query` against HTML tree.
182
192
  `CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
183
193
 
194
+ Special selectors:
195
+
196
+ - parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
197
+ - parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
198
+
199
+
184
200
  Parameters
185
201
  ----------
186
202
  query : str
@@ -195,13 +211,15 @@ cdef class LexborNode:
195
211
  def css_first(self, str query, default=None, bool strict=False):
196
212
  """Same as `css` but returns only the first match.
197
213
 
214
+ When `strict=False` stops at the first match. Works faster.
215
+
198
216
  Parameters
199
217
  ----------
200
218
 
201
219
  query : str
202
- default : bool, default None
220
+ default : Any, default None
203
221
  Default value to return if there is no match.
204
- strict: bool, default True
222
+ strict: bool, default False
205
223
  Set to True if you want to check if there is strictly only one match in the document.
206
224
 
207
225
 
@@ -209,8 +227,10 @@ cdef class LexborNode:
209
227
  -------
210
228
  selector : `LexborNode` object
211
229
  """
212
- # TODO: This can be improved.
213
- results = self.css(query)
230
+ if strict:
231
+ results = self.parser.selector.find(query, self)
232
+ else:
233
+ results = self.parser.selector.find_first(query, self)
214
234
  n_results = len(results)
215
235
  if n_results > 0:
216
236
  if strict and n_results > 1:
@@ -227,7 +247,7 @@ cdef class LexborNode:
227
247
 
228
248
  def css_matches(self, str selector):
229
249
  """Returns True if CSS selector matches a node."""
230
- return self.parser.selector.any_matches(selector, self)
250
+ return bool(self.parser.selector.any_matches(selector, self))
231
251
 
232
252
  def __repr__(self):
233
253
  return '<LexborNode %s>' % self.tag
@@ -241,6 +261,14 @@ cdef class LexborNode:
241
261
  def tag(self):
242
262
  """Return the name of the current tag (e.g. div, p, img).
243
263
 
264
+ For non-tag nodes, returns the following names:
265
+
266
+ * `-text` - text node
267
+ * `-document` - document node
268
+ * `-comment` - comment node
269
+
270
+ This
271
+
244
272
  Returns
245
273
  -------
246
274
  text : str
@@ -256,7 +284,6 @@ cdef class LexborNode:
256
284
  text = c_text.decode(_ENCODING)
257
285
  return text
258
286
 
259
-
260
287
  def decompose(self, bool recursive=True):
261
288
  """Remove the current node from the tree.
262
289
 
@@ -273,10 +300,13 @@ cdef class LexborNode:
273
300
  >>> tag.decompose()
274
301
 
275
302
  """
303
+ if self.node == <lxb_dom_node_t *> lxb_dom_document_root(&self.parser.document.dom_document):
304
+ raise SelectolaxError("Decomposing the root node is not allowed.")
305
+
276
306
  if recursive:
277
- lxb_dom_node_destroy_deep(<lxb_dom_node_t *> self.node)
307
+ node_remove_deep(<lxb_dom_node_t *> self.node)
278
308
  else:
279
- lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
309
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
280
310
 
281
311
  def strip_tags(self, list tags, bool recursive = False):
282
312
  """Remove specified tags from the HTML tree.
@@ -298,11 +328,11 @@ cdef class LexborNode:
298
328
  '<html><body><div>Hello world!</div></body></html>'
299
329
 
300
330
  """
331
+ cdef LexborNode element
301
332
  for tag in tags:
302
333
  for element in self.css(tag):
303
334
  element.decompose(recursive=recursive)
304
335
 
305
-
306
336
  @property
307
337
  def attributes(self):
308
338
  """Get all attributes that belong to the current node.
@@ -325,6 +355,9 @@ cdef class LexborNode:
325
355
  cdef size_t str_len = 0
326
356
  attributes = dict()
327
357
 
358
+ if self.node.type != LXB_DOM_NODE_TYPE_ELEMENT:
359
+ return attributes
360
+
328
361
  while attr != NULL:
329
362
  key = lxb_dom_attr_local_name_noi(attr, &str_len)
330
363
  value = lxb_dom_attr_value_noi(attr, &str_len)
@@ -410,15 +443,20 @@ cdef class LexborNode:
410
443
  node = node.next
411
444
  continue
412
445
 
413
- next_node = LexborNode()
414
- next_node._cinit(<lxb_dom_node_t *> node, self.parser)
446
+ next_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
415
447
  yield next_node
416
448
  node = node.next
417
449
 
418
-
419
- def unwrap(self):
450
+ def unwrap(self, bint delete_empty=False):
420
451
  """Replace node with whatever is inside this node.
421
452
 
453
+ Does nothing if you unwrap the same node a second time.
454
+
455
+ Parameters
456
+ ----------
457
+ delete_empty : bool, default False
458
+ If True, removes empty tags.
459
+
422
460
  Examples
423
461
  --------
424
462
 
@@ -427,11 +465,19 @@ cdef class LexborNode:
427
465
  >>> tree.html
428
466
  '<html><head></head><body><div>Hello world!</div></body></html>'
429
467
 
468
+ Note: by default, empty tags are ignored, use "delete_empty" to change this.
430
469
  """
470
+
471
+ if node_is_removed(<lxb_dom_node_t *> self.node) == 1:
472
+ logger.error("Attempt to unwrap removed node. Does nothing.")
473
+ return
474
+
431
475
  if self.node.first_child == NULL:
476
+ if delete_empty:
477
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
432
478
  return
433
- cdef lxb_dom_node_t* next_node;
434
- cdef lxb_dom_node_t* current_node;
479
+ cdef lxb_dom_node_t* next_node
480
+ cdef lxb_dom_node_t* current_node
435
481
 
436
482
  if self.node.first_child.next != NULL:
437
483
  current_node = self.node.first_child
@@ -443,9 +489,9 @@ cdef class LexborNode:
443
489
  current_node = next_node
444
490
  else:
445
491
  lxb_dom_node_insert_before(self.node, self.node.first_child)
446
- lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
492
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
447
493
 
448
- def unwrap_tags(self, list tags):
494
+ def unwrap_tags(self, list tags, bint delete_empty = False):
449
495
  """Unwraps specified tags from the HTML tree.
450
496
 
451
497
  Works the same as the ``unwrap`` method, but applied to a list of tags.
@@ -454,6 +500,8 @@ cdef class LexborNode:
454
500
  ----------
455
501
  tags : list
456
502
  List of tags to remove.
503
+ delete_empty : bool, default False
504
+ If True, removes empty tags.
457
505
 
458
506
  Examples
459
507
  --------
@@ -462,12 +510,56 @@ cdef class LexborNode:
462
510
  >>> tree.body.unwrap_tags(['i','a'])
463
511
  >>> tree.body.html
464
512
  '<body><div>Hello world!</div></body>'
465
- """
466
513
 
514
+ Note: by default, empty tags are ignored, use "delete_empty" to change this.
515
+ """
516
+ cdef LexborNode element
467
517
  for tag in tags:
468
518
  for element in self.css(tag):
469
- element.unwrap()
519
+ element.unwrap(delete_empty)
520
+
521
+ def merge_text_nodes(self):
522
+ """Iterates over all text nodes and merges all text nodes that are close to each other.
470
523
 
524
+ This is useful for text extraction.
525
+ Use it when you need to strip HTML tags and merge "dangling" text.
526
+
527
+ Examples
528
+ --------
529
+
530
+ >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
531
+ >>> node = tree.css_first('div')
532
+ >>> tree.unwrap_tags(["strong"])
533
+ >>> tree.text(deep=True, separator=" ", strip=True)
534
+ "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
535
+ >>> node.merge_text_nodes()
536
+ >>> tree.text(deep=True, separator=" ", strip=True)
537
+ "John Doe"
538
+ """
539
+ cdef lxb_dom_node_t *node = self.node.first_child
540
+ cdef lxb_dom_node_t *next_node
541
+ cdef lxb_char_t *left_text
542
+ cdef lxb_char_t *right_text
543
+ cdef size_t left_length, right_length
544
+
545
+ while node != NULL:
546
+ next_node = node.next
547
+ if node.type == LXB_DOM_NODE_TYPE_TEXT and node.prev and node.prev.type == LXB_DOM_NODE_TYPE_TEXT:
548
+ left_text = lxb_dom_node_text_content(node.prev, &left_length)
549
+ right_text = lxb_dom_node_text_content(node, &right_length)
550
+ if left_text and right_text:
551
+ combined = (<bytes>left_text[:left_length]) + (<bytes>right_text[:right_length])
552
+ lxb_dom_node_text_content_set(node, combined, len(combined))
553
+ lxb_dom_node_remove(node.prev)
554
+
555
+ if left_text is not NULL:
556
+ lxb_dom_document_destroy_text_noi(self.node.owner_document, left_text)
557
+ if right_text is not NULL:
558
+ lxb_dom_document_destroy_text_noi(self.node.owner_document, right_text)
559
+
560
+ if node.first_child:
561
+ LexborNode.new(node, self.parser).merge_text_nodes()
562
+ node = next_node
471
563
 
472
564
  def traverse(self, include_text=False):
473
565
  """Iterate over all child and next nodes starting from the current level.
@@ -487,8 +579,7 @@ cdef class LexborNode:
487
579
 
488
580
  while node != NULL:
489
581
  if not (not include_text and node.type == LXB_DOM_NODE_TYPE_TEXT):
490
- lxb_node = LexborNode()
491
- lxb_node._cinit(<lxb_dom_node_t *> node, self.parser)
582
+ lxb_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
492
583
  yield lxb_node
493
584
 
494
585
  if node.first_child != NULL:
@@ -538,7 +629,7 @@ cdef class LexborNode:
538
629
  if new_node == NULL:
539
630
  raise SelectolaxError("Can't create a new node")
540
631
  lxb_dom_node_insert_before(self.node, new_node)
541
- lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
632
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
542
633
  elif isinstance(value, LexborNode):
543
634
  new_node = lxb_dom_document_import_node(
544
635
  &self.parser.document.dom_document,
@@ -548,11 +639,10 @@ cdef class LexborNode:
548
639
  if new_node == NULL:
549
640
  raise SelectolaxError("Can't create a new node")
550
641
  lxb_dom_node_insert_before(self.node, <lxb_dom_node_t *> new_node)
551
- lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
642
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
552
643
  else:
553
644
  raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
554
645
 
555
-
556
646
  def insert_before(self, str_or_LexborNode value):
557
647
  """
558
648
  Insert a node before the current Node.
@@ -727,7 +817,7 @@ cdef class LexborNode:
727
817
  >>> selector.child.raw_value
728
818
  b'&#x3C;test&#x3E;'
729
819
  """
730
- raise SelectolaxError("This features is not supported by the lexbor backend. Please use Modest backend.")
820
+ raise NotImplementedError("This features is not supported by the lexbor backend. Please use Modest backend.")
731
821
 
732
822
  def scripts_contain(self, str query):
733
823
  """Returns True if any of the script tags contain specified text.
@@ -740,6 +830,7 @@ cdef class LexborNode:
740
830
  The query to check.
741
831
 
742
832
  """
833
+ cdef LexborNode node
743
834
  if self.parser.cached_script_texts is None:
744
835
  nodes = self.parser.selector.find('script', self)
745
836
  text_nodes = []
@@ -764,6 +855,7 @@ cdef class LexborNode:
764
855
  queries : tuple of str
765
856
 
766
857
  """
858
+ cdef LexborNode node
767
859
  if self.parser.cached_script_srcs is None:
768
860
  nodes = self.parser.selector.find('script', self)
769
861
  src_nodes = []
@@ -819,31 +911,99 @@ cdef class LexborNode:
819
911
  """
820
912
  cdef unsigned char * text
821
913
  cdef lxb_dom_node_t* node = <lxb_dom_node_t*> self.node.first_child
822
-
823
- container = TextContainer()
914
+ cdef TextContainer container
824
915
  if self.node == NULL or self.node.type != LXB_DOM_NODE_TYPE_TEXT:
825
916
  return None
917
+
826
918
  text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> self.node).data)
827
919
  if text != NULL:
920
+ container = TextContainer.new_with_defaults()
828
921
  py_text = text.decode(_ENCODING)
829
922
  container.append(py_text)
830
923
  return container.text
924
+
925
+ @property
926
+ def inner_html(self) -> str:
927
+ """Return HTML representation of the child nodes.
928
+
929
+ Works similar to innerHTML in JavaScript.
930
+ Unlike the `.html` property, does not include the current node.
931
+ Can be used to set HTML as well. See the setter docstring.
932
+
933
+ Returns
934
+ -------
935
+ text : str | None
936
+ """
937
+
938
+ cdef lexbor_str_t *lxb_str
939
+ cdef lxb_status_t status
940
+
941
+ lxb_str = lexbor_str_create()
942
+ status = lxb_html_serialize_deep_str(self.node, lxb_str)
943
+ if status == 0 and lxb_str.data:
944
+ html = lxb_str.data.decode(_ENCODING).replace('<-undef>', '')
945
+ lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)
946
+ return html
947
+ return None
948
+
949
+ @inner_html.setter
950
+ def inner_html(self, str html):
951
+ """Set inner HTML to the specified HTML.
952
+
953
+ Replaces existing data inside the node.
954
+ Works similar to innerHTML in JavaScript.
955
+
956
+ Parameters
957
+ ----------
958
+ html : str | None
959
+
960
+ """
961
+ cdef bytes bytes_val
962
+ bytes_val = <bytes>html.encode("utf-8")
963
+ lxb_html_element_inner_html_set(
964
+ <lxb_html_element_t *>self.node,
965
+ <lxb_char_t *> bytes_val, len(bytes_val)
966
+ )
967
+
968
+ def clone(self) -> LexborNode:
969
+ """Clone the current node.
970
+
971
+ You can use it to make temporary modifications without affecting the original HTML tree.
972
+
973
+ It is tied to the current parser instance.
974
+ Gets destroyed when parser instance is destroyed.
975
+ """
976
+ cdef lxb_dom_node_t* node
977
+ node = lxb_dom_node_clone(<lxb_dom_node_t *> self.node, 1)
978
+ return LexborNode.new(node, self.parser)
979
+
980
+
981
+ @cython.internal
831
982
  @cython.final
832
983
  cdef class TextContainer:
833
984
  cdef str _text
834
- cdef public str separator
835
- cdef public bool strip
985
+ cdef str separator
986
+ cdef bint strip
987
+
988
+ @staticmethod
989
+ cdef TextContainer new_with_defaults():
990
+ cdef TextContainer cls = TextContainer.__new__(TextContainer)
991
+ cls._text = ''
992
+ cls.separator = ''
993
+ cls.strip = False
994
+ return cls
836
995
 
837
996
  def __init__(self, str separator = '', bool strip = False):
838
997
  self._text = ""
839
998
  self.separator = separator
840
999
  self.strip = strip
841
1000
 
842
- def append(self, node_text):
1001
+ def append(self, str node_text):
843
1002
  if self.strip:
844
1003
  self._text += node_text.strip() + self.separator
845
1004
  else:
846
1005
  self._text += node_text + self.separator
1006
+
847
1007
  @property
848
1008
  def text(self):
849
1009
  if self.separator and self._text and self._text.endswith(self.separator):
@@ -852,7 +1012,7 @@ cdef class TextContainer:
852
1012
 
853
1013
 
854
1014
  cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
855
- cdef unsigned char *text;
1015
+ cdef unsigned char *text
856
1016
  cdef lxb_tag_id_t tag_id = lxb_dom_node_tag_id_noi(node)
857
1017
  if tag_id != LXB_TAG__TEXT:
858
1018
  return LEXBOR_ACTION_OK
@@ -860,8 +1020,15 @@ cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
860
1020
  text = <unsigned char*> lexbor_str_data_noi(&(<lxb_dom_text_t *> node).char_data.data)
861
1021
  if not text:
862
1022
  return LEXBOR_ACTION_OK
863
- py_str = text.decode(_ENCODING)
864
- cdef object cls
865
- cls = <object> ctx
1023
+
1024
+ try:
1025
+ py_str = text.decode(_ENCODING, "replace")
1026
+
1027
+ except Exception as e:
1028
+ PyErr_SetNone(e)
1029
+ return LEXBOR_ACTION_STOP
1030
+
1031
+ cdef TextContainer cls
1032
+ cls = <TextContainer> ctx
866
1033
  cls.append(py_str)
867
1034
  return LEXBOR_ACTION_OK
@@ -0,0 +1,29 @@
1
+
2
+ cdef lxb_dom_node_t * node_remove_deep(lxb_dom_node_t* root):
3
+ cdef lxb_dom_node_t *tmp
4
+ cdef lxb_dom_node_t *node = root
5
+
6
+ while node != NULL:
7
+ if node.first_child != NULL:
8
+ node = node.first_child
9
+ else:
10
+ while node != root and node.next == NULL:
11
+ tmp = node.parent
12
+ lxb_dom_node_remove(node)
13
+ node = tmp
14
+
15
+ if node == root:
16
+ lxb_dom_node_remove(node)
17
+ break
18
+
19
+ tmp = node.next
20
+ lxb_dom_node_remove(node)
21
+ node = tmp
22
+
23
+ return NULL
24
+
25
+ cdef bint node_is_removed(lxb_dom_node_t* node):
26
+ if node.parent == NULL and node.next == NULL \
27
+ and node.prev == NULL:
28
+ return 1
29
+ return 0