selectolax 0.3.33__cp310-cp310-win_amd64.whl → 0.4.0__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of selectolax might be problematic. Click here for more details.

selectolax/__init__.py CHANGED
@@ -3,6 +3,6 @@
3
3
 
4
4
  __author__ = """Artem Golubin"""
5
5
  __email__ = "me@rushter.com"
6
- __version__ = "0.3.33"
6
+ __version__ = "0.4.0"
7
7
 
8
8
  from . import lexbor, modest, parser
@@ -1,8 +1,12 @@
1
1
  cimport cython
2
2
  from cpython.exc cimport PyErr_SetNone
3
3
 
4
+ import logging
5
+
6
+ logger = logging.getLogger("selectolax")
7
+
4
8
  _TAG_TO_NAME = {
5
- 0x0005: "- doctype",
9
+ 0x0005: "-doctype",
6
10
  0x0002: "-text",
7
11
  0x0004: "-comment",
8
12
  }
@@ -11,6 +15,10 @@ ctypedef fused str_or_LexborNode:
11
15
  bytes
12
16
  LexborNode
13
17
 
18
+ ctypedef fused str_or_bytes:
19
+ str
20
+ bytes
21
+
14
22
  cdef inline bytes to_bytes(str_or_LexborNode value):
15
23
  cdef bytes bytes_val
16
24
  if isinstance(value, unicode):
@@ -37,7 +45,10 @@ cdef class LexborNode:
37
45
 
38
46
  @property
39
47
  def child(self):
40
- """Alias for the `first_child` property."""
48
+ """Alias for the `first_child` property.
49
+
50
+ **Deprecated**. Please use `first_child` instead.
51
+ """
41
52
  return self.first_child
42
53
 
43
54
  @property
@@ -180,6 +191,12 @@ cdef class LexborNode:
180
191
  Matches pattern `query` against HTML tree.
181
192
  `CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
182
193
 
194
+ Special selectors:
195
+
196
+ - parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
197
+ - parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
198
+
199
+
183
200
  Parameters
184
201
  ----------
185
202
  query : str
@@ -194,13 +211,15 @@ cdef class LexborNode:
194
211
  def css_first(self, str query, default=None, bool strict=False):
195
212
  """Same as `css` but returns only the first match.
196
213
 
214
+ When `strict=False`, the search stops at the first match, which is faster.
215
+
197
216
  Parameters
198
217
  ----------
199
218
 
200
219
  query : str
201
- default : bool, default None
220
+ default : Any, default None
202
221
  Default value to return if there is no match.
203
- strict: bool, default True
222
+ strict: bool, default False
204
223
  Set to True if you want to check if there is strictly only one match in the document.
205
224
 
206
225
 
@@ -208,8 +227,10 @@ cdef class LexborNode:
208
227
  -------
209
228
  selector : `LexborNode` object
210
229
  """
211
- # TODO: This can be improved.
212
- results = self.css(query)
230
+ if strict:
231
+ results = self.parser.selector.find(query, self)
232
+ else:
233
+ results = self.parser.selector.find_first(query, self)
213
234
  n_results = len(results)
214
235
  if n_results > 0:
215
236
  if strict and n_results > 1:
@@ -226,7 +247,7 @@ cdef class LexborNode:
226
247
 
227
248
  def css_matches(self, str selector):
228
249
  """Returns True if CSS selector matches a node."""
229
- return self.parser.selector.any_matches(selector, self)
250
+ return bool(self.parser.selector.any_matches(selector, self))
230
251
 
231
252
  def __repr__(self):
232
253
  return '<LexborNode %s>' % self.tag
@@ -240,6 +261,14 @@ cdef class LexborNode:
240
261
  def tag(self):
241
262
  """Return the name of the current tag (e.g. div, p, img).
242
263
 
264
+ For non-tag nodes, returns the following names:
265
+
266
+ * `-text` - text node
267
+ * `-document` - document node
268
+ * `-comment` - comment node
269
+
270
+ This
271
+
243
272
  Returns
244
273
  -------
245
274
  text : str
@@ -275,9 +304,9 @@ cdef class LexborNode:
275
304
  raise SelectolaxError("Decomposing the root node is not allowed.")
276
305
 
277
306
  if recursive:
278
- lxb_dom_node_destroy_deep(<lxb_dom_node_t *> self.node)
307
+ node_remove_deep(<lxb_dom_node_t *> self.node)
279
308
  else:
280
- lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
309
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
281
310
 
282
311
  def strip_tags(self, list tags, bool recursive = False):
283
312
  """Remove specified tags from the HTML tree.
@@ -326,6 +355,9 @@ cdef class LexborNode:
326
355
  cdef size_t str_len = 0
327
356
  attributes = dict()
328
357
 
358
+ if self.node.type != LXB_DOM_NODE_TYPE_ELEMENT:
359
+ return attributes
360
+
329
361
  while attr != NULL:
330
362
  key = lxb_dom_attr_local_name_noi(attr, &str_len)
331
363
  value = lxb_dom_attr_value_noi(attr, &str_len)
@@ -418,6 +450,8 @@ cdef class LexborNode:
418
450
  def unwrap(self, bint delete_empty=False):
419
451
  """Replace node with whatever is inside this node.
420
452
 
453
+ Does nothing if the same node is unwrapped a second time.
454
+
421
455
  Parameters
422
456
  ----------
423
457
  delete_empty : bool, default False
@@ -433,9 +467,14 @@ cdef class LexborNode:
433
467
 
434
468
  Note: by default, empty tags are ignored, use "delete_empty" to change this.
435
469
  """
470
+
471
+ if node_is_removed(<lxb_dom_node_t *> self.node) == 1:
472
+ logger.error("Attempt to unwrap removed node. Does nothing.")
473
+ return
474
+
436
475
  if self.node.first_child == NULL:
437
476
  if delete_empty:
438
- lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
477
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
439
478
  return
440
479
  cdef lxb_dom_node_t* next_node
441
480
  cdef lxb_dom_node_t* current_node
@@ -450,7 +489,7 @@ cdef class LexborNode:
450
489
  current_node = next_node
451
490
  else:
452
491
  lxb_dom_node_insert_before(self.node, self.node.first_child)
453
- lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
492
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
454
493
 
455
494
  def unwrap_tags(self, list tags, bint delete_empty = False):
456
495
  """Unwraps specified tags from the HTML tree.
@@ -512,6 +551,12 @@ cdef class LexborNode:
512
551
  combined = (<bytes>left_text[:left_length]) + (<bytes>right_text[:right_length])
513
552
  lxb_dom_node_text_content_set(node, combined, len(combined))
514
553
  lxb_dom_node_remove(node.prev)
554
+
555
+ if left_text is not NULL:
556
+ lxb_dom_document_destroy_text_noi(self.node.owner_document, left_text)
557
+ if right_text is not NULL:
558
+ lxb_dom_document_destroy_text_noi(self.node.owner_document, right_text)
559
+
515
560
  if node.first_child:
516
561
  LexborNode.new(node, self.parser).merge_text_nodes()
517
562
  node = next_node
@@ -584,7 +629,7 @@ cdef class LexborNode:
584
629
  if new_node == NULL:
585
630
  raise SelectolaxError("Can't create a new node")
586
631
  lxb_dom_node_insert_before(self.node, new_node)
587
- lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
632
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
588
633
  elif isinstance(value, LexborNode):
589
634
  new_node = lxb_dom_document_import_node(
590
635
  &self.parser.document.dom_document,
@@ -594,7 +639,7 @@ cdef class LexborNode:
594
639
  if new_node == NULL:
595
640
  raise SelectolaxError("Can't create a new node")
596
641
  lxb_dom_node_insert_before(self.node, <lxb_dom_node_t *> new_node)
597
- lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
642
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
598
643
  else:
599
644
  raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
600
645
 
@@ -877,6 +922,61 @@ cdef class LexborNode:
877
922
  container.append(py_text)
878
923
  return container.text
879
924
 
925
+ @property
926
+ def inner_html(self) -> str:
927
+ """Return HTML representation of the child nodes.
928
+
929
+ Works similarly to innerHTML in JavaScript.
930
+ Unlike the `.html` property, does not include the current node.
931
+ Can be used to set HTML as well. See the setter docstring.
932
+
933
+ Returns
934
+ -------
935
+ text : str | None
936
+ """
937
+
938
+ cdef lexbor_str_t *lxb_str
939
+ cdef lxb_status_t status
940
+
941
+ lxb_str = lexbor_str_create()
942
+ status = lxb_html_serialize_deep_str(self.node, lxb_str)
943
+ if status == 0 and lxb_str.data:
944
+ html = lxb_str.data.decode(_ENCODING).replace('<-undef>', '')
945
+ lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)
946
+ return html
947
+ return None
948
+
949
+ @inner_html.setter
950
+ def inner_html(self, str html):
951
+ """Set inner HTML to the specified HTML.
952
+
953
+ Replaces existing data inside the node.
954
+ Works similarly to innerHTML in JavaScript.
955
+
956
+ Parameters
957
+ ----------
958
+ html : str | None
959
+
960
+ """
961
+ cdef bytes bytes_val
962
+ bytes_val = <bytes>html.encode("utf-8")
963
+ lxb_html_element_inner_html_set(
964
+ <lxb_html_element_t *>self.node,
965
+ <lxb_char_t *> bytes_val, len(bytes_val)
966
+ )
967
+
968
+ def clone(self) -> LexborNode:
969
+ """Clone the current node.
970
+
971
+ You can use it to make temporary modifications without affecting the original HTML tree.
972
+
973
+ It is tied to the current parser instance.
974
+ Gets destroyed when parser instance is destroyed.
975
+ """
976
+ cdef lxb_dom_node_t* node
977
+ node = lxb_dom_node_clone(<lxb_dom_node_t *> self.node, 1)
978
+ return LexborNode.new(node, self.parser)
979
+
880
980
 
881
981
  @cython.internal
882
982
  @cython.final
@@ -922,7 +1022,7 @@ cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
922
1022
  return LEXBOR_ACTION_OK
923
1023
 
924
1024
  try:
925
- py_str = text.decode(_ENCODING)
1025
+ py_str = text.decode(_ENCODING, "replace")
926
1026
 
927
1027
  except Exception as e:
928
1028
  PyErr_SetNone(e)
@@ -0,0 +1,29 @@
1
+
2
+ cdef lxb_dom_node_t * node_remove_deep(lxb_dom_node_t* root):
3
+ cdef lxb_dom_node_t *tmp
4
+ cdef lxb_dom_node_t *node = root
5
+
6
+ while node != NULL:
7
+ if node.first_child != NULL:
8
+ node = node.first_child
9
+ else:
10
+ while node != root and node.next == NULL:
11
+ tmp = node.parent
12
+ lxb_dom_node_remove(node)
13
+ node = tmp
14
+
15
+ if node == root:
16
+ lxb_dom_node_remove(node)
17
+ break
18
+
19
+ tmp = node.next
20
+ lxb_dom_node_remove(node)
21
+ node = tmp
22
+
23
+ return NULL
24
+
25
+ cdef bint node_is_removed(lxb_dom_node_t* node):
26
+ if node.parent == NULL and node.next == NULL \
27
+ and node.prev == NULL:
28
+ return 1
29
+ return 0
@@ -39,6 +39,12 @@ cdef class LexborCSSSelector:
39
39
  return 0
40
40
 
41
41
  cpdef list find(self, str query, LexborNode node):
42
+ return self._find(query, node, 0)
43
+
44
+ cpdef list find_first(self, str query, LexborNode node):
45
+ return self._find(query, node, 1)
46
+
47
+ cpdef list _find(self, str query, LexborNode node, bint only_first):
42
48
  cdef lxb_css_selector_list_t* selectors
43
49
  cdef lxb_char_t* c_selector
44
50
  cdef lxb_css_selector_list_t * selectors_list
@@ -54,8 +60,12 @@ cdef class LexborCSSSelector:
54
60
 
55
61
  self.current_node = node
56
62
  self.results = []
57
- status = lxb_selectors_find(self.selectors, node.node, selectors_list,
58
- <lxb_selectors_cb_f>css_finder_callback, <void*>self)
63
+ if only_first:
64
+ status = lxb_selectors_find(self.selectors, node.node, selectors_list,
65
+ <lxb_selectors_cb_f>css_finder_callback_first, <void*>self)
66
+ else:
67
+ status = lxb_selectors_find(self.selectors, node.node, selectors_list,
68
+ <lxb_selectors_cb_f>css_finder_callback, <void*>self)
59
69
  results = list(self.results)
60
70
  self.results = []
61
71
  self.current_node = None
@@ -76,6 +86,7 @@ cdef class LexborCSSSelector:
76
86
 
77
87
  if selectors_list == NULL:
78
88
  PyErr_SetObject(SelectolaxError, "Can't parse CSS selector.")
89
+ return -1
79
90
 
80
91
  self.results = []
81
92
  status = lxb_selectors_find(self.selectors, node.node, selectors_list,
@@ -83,6 +94,8 @@ cdef class LexborCSSSelector:
83
94
  if status != LXB_STATUS_OK:
84
95
  lxb_css_selector_list_destroy_memory(selectors_list)
85
96
  PyErr_SetObject(SelectolaxError, "Can't parse CSS selector.")
97
+ return -1
98
+
86
99
  result = PyList_GET_SIZE(self.results) > 0
87
100
  self.results = []
88
101
  lxb_css_selector_list_destroy_memory(selectors_list)
@@ -185,6 +198,15 @@ cdef lxb_status_t css_finder_callback(lxb_dom_node_t *node, lxb_css_selector_spe
185
198
  cls.results.append(lxb_node)
186
199
  return LXB_STATUS_OK
187
200
 
201
+ cdef lxb_status_t css_finder_callback_first(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
202
+ cdef LexborNode lxb_node
203
+ cdef LexborCSSSelector cls
204
+ cls = <LexborCSSSelector> ctx
205
+ lxb_node = LexborNode.new(<lxb_dom_node_t *> node, cls.current_node.parser)
206
+ cls.results.append(lxb_node)
207
+ return LXB_STATUS_STOP
208
+
209
+
188
210
  cdef lxb_status_t css_matcher_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
189
211
  cdef LexborNode lxb_node
190
212
  cdef LexborCSSSelector cls