selectolax 0.3.34__cp310-cp310-win_arm64.whl → 0.4.0__cp310-cp310-win_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of selectolax might be problematic. Click here for more details.

selectolax/__init__.py CHANGED
@@ -3,6 +3,6 @@
3
3
 
4
4
  __author__ = """Artem Golubin"""
5
5
  __email__ = "me@rushter.com"
6
- __version__ = "0.3.34"
6
+ __version__ = "0.4.0"
7
7
 
8
8
  from . import lexbor, modest, parser
@@ -1,8 +1,12 @@
1
1
  cimport cython
2
2
  from cpython.exc cimport PyErr_SetNone
3
3
 
4
+ import logging
5
+
6
+ logger = logging.getLogger("selectolax")
7
+
4
8
  _TAG_TO_NAME = {
5
- 0x0005: "- doctype",
9
+ 0x0005: "-doctype",
6
10
  0x0002: "-text",
7
11
  0x0004: "-comment",
8
12
  }
@@ -11,6 +15,10 @@ ctypedef fused str_or_LexborNode:
11
15
  bytes
12
16
  LexborNode
13
17
 
18
+ ctypedef fused str_or_bytes:
19
+ str
20
+ bytes
21
+
14
22
  cdef inline bytes to_bytes(str_or_LexborNode value):
15
23
  cdef bytes bytes_val
16
24
  if isinstance(value, unicode):
@@ -37,7 +45,10 @@ cdef class LexborNode:
37
45
 
38
46
  @property
39
47
  def child(self):
40
- """Alias for the `first_child` property."""
48
+ """Alias for the `first_child` property.
49
+
50
+ **Deprecated**. Please use `first_child` instead.
51
+ """
41
52
  return self.first_child
42
53
 
43
54
  @property
@@ -200,13 +211,15 @@ cdef class LexborNode:
200
211
  def css_first(self, str query, default=None, bool strict=False):
201
212
  """Same as `css` but returns only the first match.
202
213
 
214
+ When `strict=False`, stops at the first match, which is faster.
215
+
203
216
  Parameters
204
217
  ----------
205
218
 
206
219
  query : str
207
- default : bool, default None
220
+ default : Any, default None
208
221
  Default value to return if there is no match.
209
- strict: bool, default True
222
+ strict: bool, default False
210
223
  Set to True if you want to check if there is strictly only one match in the document.
211
224
 
212
225
 
@@ -214,8 +227,10 @@ cdef class LexborNode:
214
227
  -------
215
228
  selector : `LexborNode` object
216
229
  """
217
- # TODO: This can be improved.
218
- results = self.css(query)
230
+ if strict:
231
+ results = self.parser.selector.find(query, self)
232
+ else:
233
+ results = self.parser.selector.find_first(query, self)
219
234
  n_results = len(results)
220
235
  if n_results > 0:
221
236
  if strict and n_results > 1:
@@ -232,7 +247,7 @@ cdef class LexborNode:
232
247
 
233
248
  def css_matches(self, str selector):
234
249
  """Returns True if CSS selector matches a node."""
235
- return self.parser.selector.any_matches(selector, self)
250
+ return bool(self.parser.selector.any_matches(selector, self))
236
251
 
237
252
  def __repr__(self):
238
253
  return '<LexborNode %s>' % self.tag
@@ -246,6 +261,14 @@ cdef class LexborNode:
246
261
  def tag(self):
247
262
  """Return the name of the current tag (e.g. div, p, img).
248
263
 
264
+ For non-tag nodes, returns the following names:
265
+
266
+ * `-text` - text node
267
+ * `-document` - document node
268
+ * `-comment` - comment node
269
+
270
+ This
271
+
249
272
  Returns
250
273
  -------
251
274
  text : str
@@ -281,9 +304,9 @@ cdef class LexborNode:
281
304
  raise SelectolaxError("Decomposing the root node is not allowed.")
282
305
 
283
306
  if recursive:
284
- lxb_dom_node_destroy_deep(<lxb_dom_node_t *> self.node)
307
+ node_remove_deep(<lxb_dom_node_t *> self.node)
285
308
  else:
286
- lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
309
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
287
310
 
288
311
  def strip_tags(self, list tags, bool recursive = False):
289
312
  """Remove specified tags from the HTML tree.
@@ -332,6 +355,9 @@ cdef class LexborNode:
332
355
  cdef size_t str_len = 0
333
356
  attributes = dict()
334
357
 
358
+ if self.node.type != LXB_DOM_NODE_TYPE_ELEMENT:
359
+ return attributes
360
+
335
361
  while attr != NULL:
336
362
  key = lxb_dom_attr_local_name_noi(attr, &str_len)
337
363
  value = lxb_dom_attr_value_noi(attr, &str_len)
@@ -424,6 +450,8 @@ cdef class LexborNode:
424
450
  def unwrap(self, bint delete_empty=False):
425
451
  """Replace node with whatever is inside this node.
426
452
 
453
+ Does nothing if you unwrap the same node a second time.
454
+
427
455
  Parameters
428
456
  ----------
429
457
  delete_empty : bool, default False
@@ -439,9 +467,14 @@ cdef class LexborNode:
439
467
 
440
468
  Note: by default, empty tags are ignored, use "delete_empty" to change this.
441
469
  """
470
+
471
+ if node_is_removed(<lxb_dom_node_t *> self.node) == 1:
472
+ logger.error("Attempt to unwrap removed node. Does nothing.")
473
+ return
474
+
442
475
  if self.node.first_child == NULL:
443
476
  if delete_empty:
444
- lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
477
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
445
478
  return
446
479
  cdef lxb_dom_node_t* next_node
447
480
  cdef lxb_dom_node_t* current_node
@@ -456,7 +489,7 @@ cdef class LexborNode:
456
489
  current_node = next_node
457
490
  else:
458
491
  lxb_dom_node_insert_before(self.node, self.node.first_child)
459
- lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
492
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
460
493
 
461
494
  def unwrap_tags(self, list tags, bint delete_empty = False):
462
495
  """Unwraps specified tags from the HTML tree.
@@ -518,6 +551,12 @@ cdef class LexborNode:
518
551
  combined = (<bytes>left_text[:left_length]) + (<bytes>right_text[:right_length])
519
552
  lxb_dom_node_text_content_set(node, combined, len(combined))
520
553
  lxb_dom_node_remove(node.prev)
554
+
555
+ if left_text is not NULL:
556
+ lxb_dom_document_destroy_text_noi(self.node.owner_document, left_text)
557
+ if right_text is not NULL:
558
+ lxb_dom_document_destroy_text_noi(self.node.owner_document, right_text)
559
+
521
560
  if node.first_child:
522
561
  LexborNode.new(node, self.parser).merge_text_nodes()
523
562
  node = next_node
@@ -590,7 +629,7 @@ cdef class LexborNode:
590
629
  if new_node == NULL:
591
630
  raise SelectolaxError("Can't create a new node")
592
631
  lxb_dom_node_insert_before(self.node, new_node)
593
- lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
632
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
594
633
  elif isinstance(value, LexborNode):
595
634
  new_node = lxb_dom_document_import_node(
596
635
  &self.parser.document.dom_document,
@@ -600,7 +639,7 @@ cdef class LexborNode:
600
639
  if new_node == NULL:
601
640
  raise SelectolaxError("Can't create a new node")
602
641
  lxb_dom_node_insert_before(self.node, <lxb_dom_node_t *> new_node)
603
- lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
642
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
604
643
  else:
605
644
  raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
606
645
 
@@ -883,6 +922,61 @@ cdef class LexborNode:
883
922
  container.append(py_text)
884
923
  return container.text
885
924
 
925
+ @property
926
+ def inner_html(self) -> str:
927
+ """Return HTML representation of the child nodes.
928
+
929
+ Works similar to innerHTML in JavaScript.
930
+ Unlike the `.html` property, does not include the current node.
931
+ Can be used to set HTML as well. See the setter docstring.
932
+
933
+ Returns
934
+ -------
935
+ text : str | None
936
+ """
937
+
938
+ cdef lexbor_str_t *lxb_str
939
+ cdef lxb_status_t status
940
+
941
+ lxb_str = lexbor_str_create()
942
+ status = lxb_html_serialize_deep_str(self.node, lxb_str)
943
+ if status == 0 and lxb_str.data:
944
+ html = lxb_str.data.decode(_ENCODING).replace('<-undef>', '')
945
+ lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)
946
+ return html
947
+ return None
948
+
949
+ @inner_html.setter
950
+ def inner_html(self, str html):
951
+ """Set inner HTML to the specified HTML.
952
+
953
+ Replaces existing data inside the node.
954
+ Works similar to innerHTML in JavaScript.
955
+
956
+ Parameters
957
+ ----------
958
+ html : str | None
959
+
960
+ """
961
+ cdef bytes bytes_val
962
+ bytes_val = <bytes>html.encode("utf-8")
963
+ lxb_html_element_inner_html_set(
964
+ <lxb_html_element_t *>self.node,
965
+ <lxb_char_t *> bytes_val, len(bytes_val)
966
+ )
967
+
968
+ def clone(self) -> LexborNode:
969
+ """Clone the current node.
970
+
971
+ You can use it to make temporary modifications without affecting the original HTML tree.
972
+
973
+ It is tied to the current parser instance.
974
+ Gets destroyed when parser instance is destroyed.
975
+ """
976
+ cdef lxb_dom_node_t* node
977
+ node = lxb_dom_node_clone(<lxb_dom_node_t *> self.node, 1)
978
+ return LexborNode.new(node, self.parser)
979
+
886
980
 
887
981
  @cython.internal
888
982
  @cython.final
@@ -928,7 +1022,7 @@ cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
928
1022
  return LEXBOR_ACTION_OK
929
1023
 
930
1024
  try:
931
- py_str = text.decode(_ENCODING)
1025
+ py_str = text.decode(_ENCODING, "replace")
932
1026
 
933
1027
  except Exception as e:
934
1028
  PyErr_SetNone(e)
@@ -0,0 +1,29 @@
1
+
2
+ cdef lxb_dom_node_t * node_remove_deep(lxb_dom_node_t* root):
3
+ cdef lxb_dom_node_t *tmp
4
+ cdef lxb_dom_node_t *node = root
5
+
6
+ while node != NULL:
7
+ if node.first_child != NULL:
8
+ node = node.first_child
9
+ else:
10
+ while node != root and node.next == NULL:
11
+ tmp = node.parent
12
+ lxb_dom_node_remove(node)
13
+ node = tmp
14
+
15
+ if node == root:
16
+ lxb_dom_node_remove(node)
17
+ break
18
+
19
+ tmp = node.next
20
+ lxb_dom_node_remove(node)
21
+ node = tmp
22
+
23
+ return NULL
24
+
25
+ cdef bint node_is_removed(lxb_dom_node_t* node):
26
+ if node.parent == NULL and node.next == NULL \
27
+ and node.prev == NULL:
28
+ return 1
29
+ return 0
@@ -39,6 +39,12 @@ cdef class LexborCSSSelector:
39
39
  return 0
40
40
 
41
41
  cpdef list find(self, str query, LexborNode node):
42
+ return self._find(query, node, 0)
43
+
44
+ cpdef list find_first(self, str query, LexborNode node):
45
+ return self._find(query, node, 1)
46
+
47
+ cpdef list _find(self, str query, LexborNode node, bint only_first):
42
48
  cdef lxb_css_selector_list_t* selectors
43
49
  cdef lxb_char_t* c_selector
44
50
  cdef lxb_css_selector_list_t * selectors_list
@@ -54,8 +60,12 @@ cdef class LexborCSSSelector:
54
60
 
55
61
  self.current_node = node
56
62
  self.results = []
57
- status = lxb_selectors_find(self.selectors, node.node, selectors_list,
58
- <lxb_selectors_cb_f>css_finder_callback, <void*>self)
63
+ if only_first:
64
+ status = lxb_selectors_find(self.selectors, node.node, selectors_list,
65
+ <lxb_selectors_cb_f>css_finder_callback_first, <void*>self)
66
+ else:
67
+ status = lxb_selectors_find(self.selectors, node.node, selectors_list,
68
+ <lxb_selectors_cb_f>css_finder_callback, <void*>self)
59
69
  results = list(self.results)
60
70
  self.results = []
61
71
  self.current_node = None
@@ -76,6 +86,7 @@ cdef class LexborCSSSelector:
76
86
 
77
87
  if selectors_list == NULL:
78
88
  PyErr_SetObject(SelectolaxError, "Can't parse CSS selector.")
89
+ return -1
79
90
 
80
91
  self.results = []
81
92
  status = lxb_selectors_find(self.selectors, node.node, selectors_list,
@@ -83,6 +94,8 @@ cdef class LexborCSSSelector:
83
94
  if status != LXB_STATUS_OK:
84
95
  lxb_css_selector_list_destroy_memory(selectors_list)
85
96
  PyErr_SetObject(SelectolaxError, "Can't parse CSS selector.")
97
+ return -1
98
+
86
99
  result = PyList_GET_SIZE(self.results) > 0
87
100
  self.results = []
88
101
  lxb_css_selector_list_destroy_memory(selectors_list)
@@ -185,6 +198,15 @@ cdef lxb_status_t css_finder_callback(lxb_dom_node_t *node, lxb_css_selector_spe
185
198
  cls.results.append(lxb_node)
186
199
  return LXB_STATUS_OK
187
200
 
201
+ cdef lxb_status_t css_finder_callback_first(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
202
+ cdef LexborNode lxb_node
203
+ cdef LexborCSSSelector cls
204
+ cls = <LexborCSSSelector> ctx
205
+ lxb_node = LexborNode.new(<lxb_dom_node_t *> node, cls.current_node.parser)
206
+ cls.results.append(lxb_node)
207
+ return LXB_STATUS_STOP
208
+
209
+
188
210
  cdef lxb_status_t css_matcher_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
189
211
  cdef LexborNode lxb_node
190
212
  cdef LexborCSSSelector cls