selectolax 0.3.32__cp313-cp313-macosx_10_13_x86_64.whl → 0.3.33__cp313-cp313-macosx_10_13_x86_64.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Potentially problematic release.


This version of selectolax might be problematic. Click here for more details.

selectolax/__init__.py CHANGED
@@ -2,9 +2,7 @@
2
2
 
3
3
 
4
4
  __author__ = """Artem Golubin"""
5
- __email__ = 'me@rushter.com'
6
- __version__ = '0.3.32'
5
+ __email__ = "me@rushter.com"
6
+ __version__ = "0.3.33"
7
7
 
8
- from . import parser
9
- from . import lexbor
10
- from . import modest
8
+ from . import lexbor, modest, parser
@@ -1,6 +1,5 @@
1
1
  cimport cython
2
2
 
3
- from typing import Optional
4
3
 
5
4
  @cython.final
6
5
  cdef class LexborAttributes:
@@ -1,4 +1,5 @@
1
1
  cimport cython
2
+ from cpython.exc cimport PyErr_SetNone
2
3
 
3
4
  _TAG_TO_NAME = {
4
5
  0x0005: "- doctype",
@@ -18,14 +19,17 @@ cdef inline bytes to_bytes(str_or_LexborNode value):
18
19
  bytes_val = <bytes>value
19
20
  return bytes_val
20
21
 
22
+
21
23
  @cython.final
22
24
  cdef class LexborNode:
23
25
  """A class that represents HTML node (element)."""
24
26
 
25
- cdef _cinit(self, lxb_dom_node_t *node, LexborHTMLParser parser):
26
- self.parser = parser
27
- self.node = node
28
- return self
27
+ @staticmethod
28
+ cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser):
29
+ cdef LexborNode lxbnode = LexborNode.__new__(LexborNode)
30
+ lxbnode.node = node
31
+ lxbnode.parser = parser
32
+ return lxbnode
29
33
 
30
34
  @property
31
35
  def mem_id(self):
@@ -41,8 +45,7 @@ cdef class LexborNode:
41
45
  """Return the first child node."""
42
46
  cdef LexborNode node
43
47
  if self.node.first_child:
44
- node = LexborNode()
45
- node._cinit(<lxb_dom_node_t *> self.node.first_child, self.parser)
48
+ node = LexborNode.new(<lxb_dom_node_t *> self.node.first_child, self.parser)
46
49
  return node
47
50
  return None
48
51
 
@@ -50,9 +53,8 @@ cdef class LexborNode:
50
53
  def parent(self):
51
54
  """Return the parent node."""
52
55
  cdef LexborNode node
53
- if self.node.parent:
54
- node = LexborNode()
55
- node._cinit(<lxb_dom_node_t *> self.node.parent, self.parser)
56
+ if self.node.parent != NULL:
57
+ node = LexborNode.new(<lxb_dom_node_t *> self.node.parent, self.parser)
56
58
  return node
57
59
  return None
58
60
 
@@ -60,9 +62,8 @@ cdef class LexborNode:
60
62
  def next(self):
61
63
  """Return next node."""
62
64
  cdef LexborNode node
63
- if self.node.next:
64
- node = LexborNode()
65
- node._cinit(<lxb_dom_node_t *> self.node.next, self.parser)
65
+ if self.node.next != NULL:
66
+ node = LexborNode.new(<lxb_dom_node_t *> self.node.next, self.parser)
66
67
  return node
67
68
  return None
68
69
 
@@ -70,9 +71,8 @@ cdef class LexborNode:
70
71
  def prev(self):
71
72
  """Return previous node."""
72
73
  cdef LexborNode node
73
- if self.node.prev:
74
- node = LexborNode()
75
- node._cinit(<lxb_dom_node_t *> self.node.prev, self.parser)
74
+ if self.node.prev != NULL:
75
+ node = LexborNode.new(<lxb_dom_node_t *> self.node.prev, self.parser)
76
76
  return node
77
77
  return None
78
78
 
@@ -80,9 +80,8 @@ cdef class LexborNode:
80
80
  def last_child(self):
81
81
  """Return last child node."""
82
82
  cdef LexborNode node
83
- if self.node.last_child:
84
- node = LexborNode()
85
- node._cinit(<lxb_dom_node_t *> self.node.last_child, self.parser)
83
+ if self.node.last_child != NULL:
84
+ node = LexborNode.new(<lxb_dom_node_t *> self.node.last_child, self.parser)
86
85
  return node
87
86
  return None
88
87
 
@@ -256,7 +255,6 @@ cdef class LexborNode:
256
255
  text = c_text.decode(_ENCODING)
257
256
  return text
258
257
 
259
-
260
258
  def decompose(self, bool recursive=True):
261
259
  """Remove the current node from the tree.
262
260
 
@@ -301,11 +299,11 @@ cdef class LexborNode:
301
299
  '<html><body><div>Hello world!</div></body></html>'
302
300
 
303
301
  """
302
+ cdef LexborNode element
304
303
  for tag in tags:
305
304
  for element in self.css(tag):
306
305
  element.decompose(recursive=recursive)
307
306
 
308
-
309
307
  @property
310
308
  def attributes(self):
311
309
  """Get all attributes that belong to the current node.
@@ -413,13 +411,11 @@ cdef class LexborNode:
413
411
  node = node.next
414
412
  continue
415
413
 
416
- next_node = LexborNode()
417
- next_node._cinit(<lxb_dom_node_t *> node, self.parser)
414
+ next_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
418
415
  yield next_node
419
416
  node = node.next
420
417
 
421
-
422
- def unwrap(self, delete_empty=False):
418
+ def unwrap(self, bint delete_empty=False):
423
419
  """Replace node with whatever is inside this node.
424
420
 
425
421
  Parameters
@@ -441,8 +437,8 @@ cdef class LexborNode:
441
437
  if delete_empty:
442
438
  lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
443
439
  return
444
- cdef lxb_dom_node_t* next_node;
445
- cdef lxb_dom_node_t* current_node;
440
+ cdef lxb_dom_node_t* next_node
441
+ cdef lxb_dom_node_t* current_node
446
442
 
447
443
  if self.node.first_child.next != NULL:
448
444
  current_node = self.node.first_child
@@ -456,7 +452,7 @@ cdef class LexborNode:
456
452
  lxb_dom_node_insert_before(self.node, self.node.first_child)
457
453
  lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
458
454
 
459
- def unwrap_tags(self, list tags, delete_empty = False):
455
+ def unwrap_tags(self, list tags, bint delete_empty = False):
460
456
  """Unwraps specified tags from the HTML tree.
461
457
 
462
458
  Works the same as the ``unwrap`` method, but applied to a list of tags.
@@ -478,11 +474,47 @@ cdef class LexborNode:
478
474
 
479
475
  Note: by default, empty tags are ignored, use "delete_empty" to change this.
480
476
  """
481
-
477
+ cdef LexborNode element
482
478
  for tag in tags:
483
479
  for element in self.css(tag):
484
480
  element.unwrap(delete_empty)
485
481
 
482
+ def merge_text_nodes(self):
483
+ """Iterates over all text nodes and merges all text nodes that are close to each other.
484
+
485
+ This is useful for text extraction.
486
+ Use it when you need to strip HTML tags and merge "dangling" text.
487
+
488
+ Examples
489
+ --------
490
+
491
+ >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
492
+ >>> node = tree.css_first('div')
493
+ >>> tree.unwrap_tags(["strong"])
494
+ >>> tree.text(deep=True, separator=" ", strip=True)
495
+ "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
496
+ >>> node.merge_text_nodes()
497
+ >>> tree.text(deep=True, separator=" ", strip=True)
498
+ "John Doe"
499
+ """
500
+ cdef lxb_dom_node_t *node = self.node.first_child
501
+ cdef lxb_dom_node_t *next_node
502
+ cdef lxb_char_t *left_text
503
+ cdef lxb_char_t *right_text
504
+ cdef size_t left_length, right_length
505
+
506
+ while node != NULL:
507
+ next_node = node.next
508
+ if node.type == LXB_DOM_NODE_TYPE_TEXT and node.prev and node.prev.type == LXB_DOM_NODE_TYPE_TEXT:
509
+ left_text = lxb_dom_node_text_content(node.prev, &left_length)
510
+ right_text = lxb_dom_node_text_content(node, &right_length)
511
+ if left_text and right_text:
512
+ combined = (<bytes>left_text[:left_length]) + (<bytes>right_text[:right_length])
513
+ lxb_dom_node_text_content_set(node, combined, len(combined))
514
+ lxb_dom_node_remove(node.prev)
515
+ if node.first_child:
516
+ LexborNode.new(node, self.parser).merge_text_nodes()
517
+ node = next_node
486
518
 
487
519
  def traverse(self, include_text=False):
488
520
  """Iterate over all child and next nodes starting from the current level.
@@ -502,8 +534,7 @@ cdef class LexborNode:
502
534
 
503
535
  while node != NULL:
504
536
  if not (not include_text and node.type == LXB_DOM_NODE_TYPE_TEXT):
505
- lxb_node = LexborNode()
506
- lxb_node._cinit(<lxb_dom_node_t *> node, self.parser)
537
+ lxb_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
507
538
  yield lxb_node
508
539
 
509
540
  if node.first_child != NULL:
@@ -567,7 +598,6 @@ cdef class LexborNode:
567
598
  else:
568
599
  raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
569
600
 
570
-
571
601
  def insert_before(self, str_or_LexborNode value):
572
602
  """
573
603
  Insert a node before the current Node.
@@ -742,7 +772,7 @@ cdef class LexborNode:
742
772
  >>> selector.child.raw_value
743
773
  b'&#x3C;test&#x3E;'
744
774
  """
745
- raise SelectolaxError("This features is not supported by the lexbor backend. Please use Modest backend.")
775
+ raise NotImplementedError("This features is not supported by the lexbor backend. Please use Modest backend.")
746
776
 
747
777
  def scripts_contain(self, str query):
748
778
  """Returns True if any of the script tags contain specified text.
@@ -755,6 +785,7 @@ cdef class LexborNode:
755
785
  The query to check.
756
786
 
757
787
  """
788
+ cdef LexborNode node
758
789
  if self.parser.cached_script_texts is None:
759
790
  nodes = self.parser.selector.find('script', self)
760
791
  text_nodes = []
@@ -779,6 +810,7 @@ cdef class LexborNode:
779
810
  queries : tuple of str
780
811
 
781
812
  """
813
+ cdef LexborNode node
782
814
  if self.parser.cached_script_srcs is None:
783
815
  nodes = self.parser.selector.find('script', self)
784
816
  src_nodes = []
@@ -834,31 +866,44 @@ cdef class LexborNode:
834
866
  """
835
867
  cdef unsigned char * text
836
868
  cdef lxb_dom_node_t* node = <lxb_dom_node_t*> self.node.first_child
837
-
838
- container = TextContainer()
869
+ cdef TextContainer container
839
870
  if self.node == NULL or self.node.type != LXB_DOM_NODE_TYPE_TEXT:
840
871
  return None
872
+
841
873
  text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> self.node).data)
842
874
  if text != NULL:
875
+ container = TextContainer.new_with_defaults()
843
876
  py_text = text.decode(_ENCODING)
844
877
  container.append(py_text)
845
878
  return container.text
879
+
880
+
881
+ @cython.internal
846
882
  @cython.final
847
883
  cdef class TextContainer:
848
884
  cdef str _text
849
- cdef public str separator
850
- cdef public bool strip
885
+ cdef str separator
886
+ cdef bint strip
887
+
888
+ @staticmethod
889
+ cdef TextContainer new_with_defaults():
890
+ cdef TextContainer cls = TextContainer.__new__(TextContainer)
891
+ cls._text = ''
892
+ cls.separator = ''
893
+ cls.strip = False
894
+ return cls
851
895
 
852
896
  def __init__(self, str separator = '', bool strip = False):
853
897
  self._text = ""
854
898
  self.separator = separator
855
899
  self.strip = strip
856
900
 
857
- def append(self, node_text):
901
+ def append(self, str node_text):
858
902
  if self.strip:
859
903
  self._text += node_text.strip() + self.separator
860
904
  else:
861
905
  self._text += node_text + self.separator
906
+
862
907
  @property
863
908
  def text(self):
864
909
  if self.separator and self._text and self._text.endswith(self.separator):
@@ -867,7 +912,7 @@ cdef class TextContainer:
867
912
 
868
913
 
869
914
  cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
870
- cdef unsigned char *text;
915
+ cdef unsigned char *text
871
916
  cdef lxb_tag_id_t tag_id = lxb_dom_node_tag_id_noi(node)
872
917
  if tag_id != LXB_TAG__TEXT:
873
918
  return LEXBOR_ACTION_OK
@@ -875,8 +920,15 @@ cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
875
920
  text = <unsigned char*> lexbor_str_data_noi(&(<lxb_dom_text_t *> node).char_data.data)
876
921
  if not text:
877
922
  return LEXBOR_ACTION_OK
878
- py_str = text.decode(_ENCODING)
879
- cdef object cls
880
- cls = <object> ctx
923
+
924
+ try:
925
+ py_str = text.decode(_ENCODING)
926
+
927
+ except Exception as e:
928
+ PyErr_SetNone(e)
929
+ return LEXBOR_ACTION_STOP
930
+
931
+ cdef TextContainer cls
932
+ cls = <TextContainer> ctx
881
933
  cls.append(py_str)
882
934
  return LEXBOR_ACTION_OK
@@ -1,4 +1,7 @@
1
1
  cimport cython
2
+ from cpython.exc cimport PyErr_SetObject
3
+ from cpython.list cimport PyList_GET_SIZE
4
+
2
5
 
3
6
  @cython.final
4
7
  cdef class LexborCSSSelector:
@@ -8,21 +11,22 @@ cdef class LexborCSSSelector:
8
11
  self.results = []
9
12
  self.current_node = None
10
13
 
11
- cdef _create_css_parser(self):
14
+ cdef int _create_css_parser(self) except -1:
12
15
  cdef lxb_status_t status
13
16
 
14
-
15
17
  self.parser = lxb_css_parser_create()
16
18
  status = lxb_css_parser_init(self.parser, NULL)
17
19
 
18
20
  if status != LXB_STATUS_OK:
19
- raise SelectolaxError("Can't initialize CSS parser.")
21
+ PyErr_SetObject(SelectolaxError, "Can't initialize CSS parser.")
22
+ return -1
20
23
 
21
24
  self.css_selectors = lxb_css_selectors_create()
22
25
  status = lxb_css_selectors_init(self.css_selectors)
23
26
 
24
27
  if status != LXB_STATUS_OK:
25
- raise SelectolaxError("Can't initialize CSS selector.")
28
+ PyErr_SetObject(SelectolaxError, "Can't initialize CSS selector.")
29
+ return -1
26
30
 
27
31
  lxb_css_parser_selectors_set(self.parser, self.css_selectors)
28
32
 
@@ -30,10 +34,11 @@ cdef class LexborCSSSelector:
30
34
  status = lxb_selectors_init(self.selectors)
31
35
  lxb_selectors_opt_set(self.selectors, LXB_SELECTORS_OPT_MATCH_ROOT)
32
36
  if status != LXB_STATUS_OK:
33
- raise SelectolaxError("Can't initialize CSS selector.")
37
+ PyErr_SetObject(SelectolaxError, "Can't initialize CSS selector.")
38
+ return -1
39
+ return 0
34
40
 
35
-
36
- cpdef find(self, str query, LexborNode node):
41
+ cpdef list find(self, str query, LexborNode node):
37
42
  cdef lxb_css_selector_list_t* selectors
38
43
  cdef lxb_char_t* c_selector
39
44
  cdef lxb_css_selector_list_t * selectors_list
@@ -57,10 +62,11 @@ cdef class LexborCSSSelector:
57
62
  lxb_css_selector_list_destroy_memory(selectors_list)
58
63
  return results
59
64
 
60
- cpdef any_matches(self, str query, LexborNode node):
65
+ cpdef int any_matches(self, str query, LexborNode node) except -1:
61
66
  cdef lxb_css_selector_list_t * selectors
62
67
  cdef lxb_char_t * c_selector
63
68
  cdef lxb_css_selector_list_t * selectors_list
69
+ cdef int result
64
70
 
65
71
  if not isinstance(query, str):
66
72
  raise TypeError("Query must be a string.")
@@ -69,20 +75,19 @@ cdef class LexborCSSSelector:
69
75
  selectors_list = lxb_css_selectors_parse(self.parser, <lxb_char_t *> bytes_query, <size_t> len(query))
70
76
 
71
77
  if selectors_list == NULL:
72
- raise SelectolaxError("Can't parse CSS selector.")
78
+ PyErr_SetObject(SelectolaxError, "Can't parse CSS selector.")
73
79
 
74
80
  self.results = []
75
81
  status = lxb_selectors_find(self.selectors, node.node, selectors_list,
76
82
  <lxb_selectors_cb_f> css_matcher_callback, <void *> self)
77
83
  if status != LXB_STATUS_OK:
78
84
  lxb_css_selector_list_destroy_memory(selectors_list)
79
- raise SelectolaxError("Can't parse CSS selector.")
80
- result = bool(self.results)
85
+ PyErr_SetObject(SelectolaxError, "Can't parse CSS selector.")
86
+ result = PyList_GET_SIZE(self.results) > 0
81
87
  self.results = []
82
88
  lxb_css_selector_list_destroy_memory(selectors_list)
83
89
  return result
84
90
 
85
-
86
91
  def __dealloc__(self):
87
92
  if self.selectors != NULL:
88
93
  lxb_selectors_destroy(self.selectors, True)
@@ -92,7 +97,6 @@ cdef class LexborCSSSelector:
92
97
  lxb_css_selectors_destroy(self.css_selectors, True)
93
98
 
94
99
 
95
-
96
100
  cdef class LexborSelector:
97
101
  """An advanced CSS selector that supports additional operations.
98
102
 
@@ -107,10 +111,9 @@ cdef class LexborSelector:
107
111
  self.node = node
108
112
  self.nodes = self.node.parser.selector.find(query, self.node) if query else [node, ]
109
113
 
110
-
111
114
  cpdef css(self, str query):
112
115
  """Evaluate CSS selector against current scope."""
113
- raise SelectolaxError("This features is not supported by the lexbor backend. Please use Modest backend.")
116
+ raise NotImplementedError("This features is not supported by the lexbor backend. Please use Modest backend.")
114
117
 
115
118
  @property
116
119
  def matches(self) -> list:
@@ -124,7 +127,7 @@ cdef class LexborSelector:
124
127
 
125
128
  def text_contains(self, str text, bool deep=True, str separator='', bool strip=False) -> LexborSelector:
126
129
  """Filter all current matches given text."""
127
- nodes = []
130
+ cdef list nodes = []
128
131
  for node in self.nodes:
129
132
  node_text = node.text(deep=deep, separator=separator, strip=strip)
130
133
  if node_text and text in node_text:
@@ -134,7 +137,7 @@ cdef class LexborSelector:
134
137
 
135
138
  def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False) -> bool:
136
139
  """Returns True if any node in the current search scope contains specified text"""
137
- nodes = []
140
+ cdef LexborNode node
138
141
  for node in self.nodes:
139
142
  node_text = node.text(deep=deep, separator=separator, strip=strip)
140
143
  if node_text and text in node_text:
@@ -146,7 +149,7 @@ cdef class LexborSelector:
146
149
 
147
150
  Similar to `string-length` in XPath.
148
151
  """
149
- nodes = []
152
+ cdef list nodes = []
150
153
  for node in self.nodes:
151
154
  attr = node.attributes.get(attribute)
152
155
  if attr and start and start in attr:
@@ -161,7 +164,7 @@ cdef class LexborSelector:
161
164
 
162
165
  Similar to `string-length` in XPath.
163
166
  """
164
- nodes = []
167
+ cdef LexborNode node
165
168
  for node in self.nodes:
166
169
  attr = node.attributes.get(attribute)
167
170
  if attr and start and start in attr:
@@ -176,16 +179,15 @@ cdef class LexborSelector:
176
179
 
177
180
  cdef lxb_status_t css_finder_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
178
181
  cdef LexborNode lxb_node
179
- cdef object cls
180
- cls = <object> ctx
181
- lxb_node = LexborNode()
182
- lxb_node._cinit(<lxb_dom_node_t *> node, cls.current_node.parser)
182
+ cdef LexborCSSSelector cls
183
+ cls = <LexborCSSSelector> ctx
184
+ lxb_node = LexborNode.new(<lxb_dom_node_t *> node, cls.current_node.parser)
183
185
  cls.results.append(lxb_node)
184
186
  return LXB_STATUS_OK
185
187
 
186
188
  cdef lxb_status_t css_matcher_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
187
189
  cdef LexborNode lxb_node
188
- cdef object cls
189
- cls = <object> ctx
190
+ cdef LexborCSSSelector cls
191
+ cls = <LexborCSSSelector> ctx
190
192
  cls.results.append(True)
191
193
  return LXB_STATUS_STOP
@@ -1,5 +1,6 @@
1
1
  include "../utils.pxi"
2
2
 
3
+
3
4
  def create_tag(tag: str):
4
5
  """
5
6
  Given an HTML tag name, e.g. `"div"`, create a single empty node for that tag,