selectolax 0.3.28__cp310-cp310-musllinux_1_2_aarch64.whl → 0.4.0__cp310-cp310-musllinux_1_2_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of selectolax might be problematic. Click here for more details.

selectolax/lexbor.pyx CHANGED
@@ -1,4 +1,6 @@
1
- from cpython cimport bool
1
+ from cpython.bool cimport bool
2
+ from cpython.exc cimport PyErr_SetObject
3
+
2
4
 
3
5
  _ENCODING = 'UTF-8'
4
6
 
@@ -8,6 +10,7 @@ include "lexbor/attrs.pxi"
8
10
  include "lexbor/node.pxi"
9
11
  include "lexbor/selection.pxi"
10
12
  include "lexbor/util.pxi"
13
+ include "lexbor/node_remove.pxi"
11
14
 
12
15
  # We don't inherit from HTMLParser here, because it also includes all the C code from Modest.
13
16
 
@@ -24,10 +27,8 @@ cdef class LexborHTMLParser:
24
27
  html : str (unicode) or bytes
25
28
  """
26
29
  def __init__(self, html):
27
-
28
30
  cdef size_t html_len
29
- cdef char* html_chars
30
-
31
+ cdef object bytes_html
31
32
  bytes_html, html_len = preprocess_input(html)
32
33
  self._parse_html(bytes_html, html_len)
33
34
  self.raw_html = bytes_html
@@ -39,22 +40,27 @@ cdef class LexborHTMLParser:
39
40
  self._selector = LexborCSSSelector()
40
41
  return self._selector
41
42
 
42
-
43
- cdef _parse_html(self, char *html, size_t html_len):
43
+ cdef int _parse_html(self, char *html, size_t html_len) except -1:
44
44
  cdef lxb_status_t status
45
45
 
46
46
  with nogil:
47
47
  self.document = lxb_html_document_create()
48
48
 
49
49
  if self.document == NULL:
50
- raise SelectolaxError("Failed to initialize object for HTML Document.")
50
+ PyErr_SetObject(SelectolaxError, "Failed to initialize object for HTML Document.")
51
+ return -1
51
52
 
52
53
  with nogil:
53
54
  status = lxb_html_document_parse(self.document, <lxb_char_t *> html, html_len)
55
+
54
56
  if status != 0x0000:
55
- raise SelectolaxError("Can't parse HTML.")
57
+ PyErr_SetObject(SelectolaxError, "Can't parse HTML.")
58
+ return -1
56
59
 
57
- assert self.document != NULL
60
+ if self.document == NULL:
61
+ PyErr_SetObject(RuntimeError, "document is NULL even after html was parsed correctly")
62
+ return -1
63
+ return 0
58
64
 
59
65
  def __dealloc__(self):
60
66
  if self.document != NULL:
@@ -68,7 +74,7 @@ cdef class LexborHTMLParser:
68
74
  """Returns root node."""
69
75
  if self.document == NULL:
70
76
  return None
71
- return LexborNode()._cinit(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
77
+ return LexborNode.new(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
72
78
 
73
79
  @property
74
80
  def body(self):
@@ -77,7 +83,7 @@ cdef class LexborHTMLParser:
77
83
  body = lxb_html_document_body_element_noi(self.document)
78
84
  if body == NULL:
79
85
  return None
80
- return LexborNode()._cinit(<lxb_dom_node_t *> body, self)
86
+ return LexborNode.new(<lxb_dom_node_t *> body, self)
81
87
 
82
88
  @property
83
89
  def head(self):
@@ -86,7 +92,7 @@ cdef class LexborHTMLParser:
86
92
  head = lxb_html_document_head_element_noi(self.document)
87
93
  if head == NULL:
88
94
  return None
89
- return LexborNode()._cinit(<lxb_dom_node_t *> head, self)
95
+ return LexborNode.new(<lxb_dom_node_t *> head, self)
90
96
 
91
97
  def tags(self, str name):
92
98
  """Returns a list of tags that match specified name.
@@ -96,6 +102,12 @@ cdef class LexborHTMLParser:
96
102
  name : str (e.g. div)
97
103
 
98
104
  """
105
+
106
+ if not name:
107
+ raise ValueError("Tag name cannot be empty")
108
+ if len(name) > 100:
109
+ raise ValueError("Tag name is too long")
110
+
99
111
  cdef lxb_dom_collection_t* collection = NULL
100
112
  cdef lxb_status_t status
101
113
  pybyte_name = name.encode('UTF-8')
@@ -116,7 +128,7 @@ cdef class LexborHTMLParser:
116
128
  raise SelectolaxError("Can't locate elements.")
117
129
 
118
130
  for i in range(lxb_dom_collection_length_noi(collection)):
119
- node = LexborNode()._cinit(
131
+ node = LexborNode.new(
120
132
  <lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i),
121
133
  self
122
134
  )
@@ -150,7 +162,7 @@ cdef class LexborHTMLParser:
150
162
  """Return HTML representation of the page."""
151
163
  if self.document == NULL:
152
164
  return None
153
- node = LexborNode()._cinit(<lxb_dom_node_t *> &self.document.dom_document, self)
165
+ node = LexborNode.new(<lxb_dom_node_t *> &self.document.dom_document, self)
154
166
  return node.html
155
167
 
156
168
  def css(self, str query):
@@ -159,6 +171,11 @@ cdef class LexborHTMLParser:
159
171
  Matches pattern `query` against HTML tree.
160
172
  `CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
161
173
 
174
+ Special selectors:
175
+
176
+ - parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
177
+ - parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
178
+
162
179
  Parameters
163
180
  ----------
164
181
  query : str
@@ -177,9 +194,9 @@ cdef class LexborHTMLParser:
177
194
  ----------
178
195
 
179
196
  query : str
180
- default : bool, default None
197
+ default : Any, default None
181
198
  Default value to return if there is no match.
182
- strict: bool, default True
199
+ strict: bool, default False
183
200
  Set to True if you want to check if there is strictly only one match in the document.
184
201
 
185
202
 
@@ -196,7 +213,7 @@ cdef class LexborHTMLParser:
196
213
  ----------
197
214
  tags : list of str
198
215
  List of tags to remove.
199
- recursive : bool, default True
216
+ recursive : bool, default False
200
217
  Whenever to delete all its child nodes
201
218
 
202
219
  Examples
@@ -232,7 +249,7 @@ cdef class LexborHTMLParser:
232
249
 
233
250
  for i in range(lxb_dom_collection_length_noi(collection)):
234
251
  if recursive:
235
- lxb_dom_node_destroy_deep( <lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
252
+ lxb_dom_node_destroy_deep(<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
236
253
  else:
237
254
  lxb_dom_node_destroy(<lxb_dom_node_t *> lxb_dom_collection_element_noi(collection, i))
238
255
  lxb_dom_collection_destroy(collection, <bint> True)
@@ -273,7 +290,6 @@ cdef class LexborHTMLParser:
273
290
  """
274
291
  return self.root.scripts_contain(query)
275
292
 
276
-
277
293
  def script_srcs_contain(self, tuple queries):
278
294
  """Returns True if any of the script SRCs attributes contain on of the specified text.
279
295
 
@@ -289,6 +305,26 @@ cdef class LexborHTMLParser:
289
305
  def css_matches(self, str selector):
290
306
  return self.root.css_matches(selector)
291
307
 
308
+ def merge_text_nodes(self):
309
+ """Iterates over all text nodes and merges all text nodes that are close to each other.
310
+
311
+ This is useful for text extraction.
312
+ Use it when you need to strip HTML tags and merge "dangling" text.
313
+
314
+ Examples
315
+ --------
316
+
317
+ >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
318
+ >>> node = tree.css_first('div')
319
+ >>> tree.unwrap_tags(["strong"])
320
+ >>> tree.text(deep=True, separator=" ", strip=True)
321
+ "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
322
+ >>> node.merge_text_nodes()
323
+ >>> tree.text(deep=True, separator=" ", strip=True)
324
+ "John Doe"
325
+ """
326
+ return self.root.merge_text_nodes()
327
+
292
328
  @staticmethod
293
329
  cdef LexborHTMLParser from_document(lxb_html_document_t *document, bytes raw_html):
294
330
  obj = <LexborHTMLParser> LexborHTMLParser.__new__(LexborHTMLParser)
@@ -300,9 +336,16 @@ cdef class LexborHTMLParser:
300
336
  return obj
301
337
 
302
338
  def clone(self):
303
- """Clone the current tree."""
339
+ """Clone the current node.
340
+
341
+ You can use to do temporary modifications without affecting the original HTML tree.
342
+
343
+ It is tied to the current parser instance.
344
+ Gets destroyed when parser instance is destroyed.
345
+ """
304
346
  cdef lxb_html_document_t* cloned_document
305
347
  cdef lxb_dom_node_t* cloned_node
348
+ cdef LexborHTMLParser cls
306
349
 
307
350
  with nogil:
308
351
  cloned_document = lxb_html_document_create()
@@ -327,7 +370,8 @@ cdef class LexborHTMLParser:
327
370
 
328
371
  cls = LexborHTMLParser.from_document(cloned_document, self.raw_html)
329
372
  return cls
330
- def unwrap_tags(self, list tags):
373
+
374
+ def unwrap_tags(self, list tags, delete_empty = False):
331
375
  """Unwraps specified tags from the HTML tree.
332
376
 
333
377
  Works the same as the ``unwrap`` method, but applied to a list of tags.
@@ -336,6 +380,8 @@ cdef class LexborHTMLParser:
336
380
  ----------
337
381
  tags : list
338
382
  List of tags to remove.
383
+ delete_empty : bool
384
+ Whenever to delete empty tags.
339
385
 
340
386
  Examples
341
387
  --------
@@ -345,5 +391,34 @@ cdef class LexborHTMLParser:
345
391
  >>> tree.body.html
346
392
  '<body><div>Hello world!</div></body>'
347
393
  """
348
- if self.root is not None:
349
- self.root.unwrap_tags(tags)
394
+ # faster to check if the document is empty which should determine if we have a root
395
+ if self.document != NULL:
396
+ self.root.unwrap_tags(tags, delete_empty=delete_empty)
397
+
398
+ @property
399
+ def inner_html(self) -> str:
400
+ """Return HTML representation of the child nodes.
401
+
402
+ Works similar to innerHTML in JavaScript.
403
+ Unlike the `.html` property, does not include the current node.
404
+ Can be used to set HTML as well. See the setter docstring.
405
+
406
+ Returns
407
+ -------
408
+ text : str | None
409
+ """
410
+ return self.root.inner_html
411
+
412
+ @inner_html.setter
413
+ def inner_html(self, str html):
414
+ """Set inner HTML to the specified HTML.
415
+
416
+ Replaces existing data inside the node.
417
+ Works similar to innerHTML in JavaScript.
418
+
419
+ Parameters
420
+ ----------
421
+ html : str
422
+
423
+ """
424
+ self.root.inner_html = html
@@ -1,4 +1,5 @@
1
1
  cimport cython
2
+ from cpython.exc cimport PyErr_NoMemory
2
3
 
3
4
  from libc.stdlib cimport free
4
5
  from libc.stdlib cimport malloc
@@ -8,12 +9,15 @@ from libc.string cimport memcpy
8
9
  DEF _STACK_SIZE = 100
9
10
  DEF _ENCODING = 'UTF-8'
10
11
 
12
+
11
13
  @cython.final
12
14
  cdef class Stack:
13
15
  def __cinit__(self, size_t capacity=25):
14
16
  self.capacity = capacity
15
17
  self.top = 0
16
18
  self._stack = <myhtml_tree_node_t**> malloc(capacity * sizeof(myhtml_tree_node_t))
19
+ if self._stack == NULL:
20
+ raise MemoryError("Failed to allocate memory for stack")
17
21
 
18
22
  def __dealloc__(self):
19
23
  free(self._stack)
@@ -21,9 +25,10 @@ cdef class Stack:
21
25
  cdef bint is_empty(self):
22
26
  return self.top <= 0
23
27
 
24
- cdef push(self, myhtml_tree_node_t* res):
28
+ cdef int push(self, myhtml_tree_node_t* res) except -1:
25
29
  if self.top >= self.capacity:
26
- self.resize()
30
+ if self.resize() < 0:
31
+ return -1
27
32
  self._stack[self.top] = res
28
33
  self.top += 1
29
34
 
@@ -31,10 +36,13 @@ cdef class Stack:
31
36
  self.top = self.top - 1
32
37
  return self._stack[self.top]
33
38
 
34
- cdef resize(self):
39
+ cdef int resize(self) except -1:
35
40
  self.capacity *= 2
36
41
  self._stack = <myhtml_tree_node_t**> realloc(<void*> self._stack, self.capacity * sizeof(myhtml_tree_node_t))
37
-
42
+ if self._stack == NULL:
43
+ PyErr_NoMemory()
44
+ return -1
45
+ return 0
38
46
 
39
47
  cdef class _Attributes:
40
48
  """A dict-like object that represents attributes."""
@@ -128,25 +136,24 @@ cdef class _Attributes:
128
136
  tag_name = c_text.decode(_ENCODING, 'ignore') if c_text != NULL else 'unknown'
129
137
  return "<%s attributes, %s items>" % (tag_name, len(self))
130
138
 
131
-
132
-
133
139
  ctypedef fused str_or_Node:
134
- basestring
140
+ str
135
141
  bytes
136
142
  Node
137
143
 
138
-
139
144
  cdef class Node:
140
145
  """A class that represents HTML node (element)."""
141
146
  cdef myhtml_tree_node_t *node
142
147
  cdef public HTMLParser parser
143
148
 
144
-
145
- cdef _init(self, myhtml_tree_node_t *node, HTMLParser parser):
146
- # custom init, because __cinit__ doesn't accept C types
147
- self.node = node
149
+ @staticmethod
150
+ cdef Node new(myhtml_tree_node_t *node, HTMLParser parser):
151
+ # custom __init__ for C, because __cinit__ doesn't accept C types
152
+ cdef Node cls = Node.__new__(Node)
153
+ cls.node = node
148
154
  # Keep reference to the selector object, so myhtml structures will not be garbage collected prematurely
149
- self.parser = parser
155
+ cls.parser = parser
156
+ return cls
150
157
 
151
158
  @property
152
159
  def attributes(self):
@@ -286,7 +293,7 @@ cdef class Node:
286
293
  cdef inline _text_deep(self, myhtml_tree_node_t *node, separator='', strip=False):
287
294
  text = ""
288
295
  cdef Stack stack = Stack(_STACK_SIZE)
289
- cdef myhtml_tree_node_t* current_node = NULL;
296
+ cdef myhtml_tree_node_t* current_node = NULL
290
297
 
291
298
  if node.tag_id == MyHTML_TAG__TEXT:
292
299
  c_text = myhtml_node_text(node, NULL)
@@ -339,12 +346,10 @@ cdef class Node:
339
346
  node = node.next
340
347
  continue
341
348
 
342
- next_node = Node()
343
- next_node._init(node, self.parser)
349
+ next_node = Node.new(node, self.parser)
344
350
  yield next_node
345
351
  node = node.next
346
352
 
347
-
348
353
  def traverse(self, include_text=False):
349
354
  """Iterate over all child and next nodes starting from the current level.
350
355
 
@@ -358,16 +363,15 @@ cdef class Node:
358
363
  node
359
364
  """
360
365
  cdef Stack stack = Stack(_STACK_SIZE)
361
- cdef myhtml_tree_node_t* current_node = NULL;
362
- cdef Node next_node;
366
+ cdef myhtml_tree_node_t* current_node = NULL
367
+ cdef Node next_node
363
368
 
364
369
  stack.push(self.node)
365
370
 
366
371
  while not stack.is_empty():
367
372
  current_node = stack.pop()
368
373
  if current_node != NULL and not (current_node.tag_id == MyHTML_TAG__TEXT and not include_text):
369
- next_node = Node()
370
- next_node._init(current_node, self.parser)
374
+ next_node = Node.new(current_node, self.parser)
371
375
  yield next_node
372
376
 
373
377
  if current_node.next is not NULL:
@@ -393,11 +397,13 @@ cdef class Node:
393
397
 
394
398
  @property
395
399
  def child(self):
396
- """Return the child node."""
400
+ """Alias for the `first_child` property.
401
+
402
+ **Deprecated**. Please use `first_child` instead.
403
+ """
397
404
  cdef Node node
398
405
  if self.node.child:
399
- node = Node()
400
- node._init(self.node.child, self.parser)
406
+ node = Node.new(self.node.child, self.parser)
401
407
  return node
402
408
  return None
403
409
 
@@ -406,8 +412,7 @@ cdef class Node:
406
412
  """Return the parent node."""
407
413
  cdef Node node
408
414
  if self.node.parent:
409
- node = Node()
410
- node._init(self.node.parent, self.parser)
415
+ node = Node.new(self.node.parent, self.parser)
411
416
  return node
412
417
  return None
413
418
 
@@ -416,8 +421,7 @@ cdef class Node:
416
421
  """Return next node."""
417
422
  cdef Node node
418
423
  if self.node.next:
419
- node = Node()
420
- node._init(self.node.next, self.parser)
424
+ node = Node.new(self.node.next, self.parser)
421
425
  return node
422
426
  return None
423
427
 
@@ -426,8 +430,7 @@ cdef class Node:
426
430
  """Return previous node."""
427
431
  cdef Node node
428
432
  if self.node.prev:
429
- node = Node()
430
- node._init(self.node.prev, self.parser)
433
+ node = Node.new(self.node.prev, self.parser)
431
434
  return node
432
435
  return None
433
436
 
@@ -436,8 +439,7 @@ cdef class Node:
436
439
  """Return last child node."""
437
440
  cdef Node node
438
441
  if self.node.last_child:
439
- node = Node()
440
- node._init(self.node.last_child, self.parser)
442
+ node = Node.new(self.node.last_child, self.parser)
441
443
  return node
442
444
  return None
443
445
 
@@ -515,9 +517,14 @@ cdef class Node:
515
517
  """An alias for the decompose method."""
516
518
  self.decompose(recursive)
517
519
 
518
- def unwrap(self):
520
+ def unwrap(self, delete_empty = False):
519
521
  """Replace node with whatever is inside this node.
520
522
 
523
+ Parameters
524
+ ----------
525
+ delete_empty : bool, default False
526
+ Whenever to delete empty tags.
527
+
521
528
  Examples
522
529
  --------
523
530
 
@@ -526,11 +533,14 @@ cdef class Node:
526
533
  >>> tree.html
527
534
  '<html><head></head><body><div>Hello world!</div></body></html>'
528
535
 
536
+ Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
529
537
  """
530
538
  if self.node.child == NULL:
539
+ if delete_empty:
540
+ myhtml_node_delete(self.node)
531
541
  return
532
- cdef myhtml_tree_node_t* next_node;
533
- cdef myhtml_tree_node_t* current_node;
542
+ cdef myhtml_tree_node_t* next_node
543
+ cdef myhtml_tree_node_t* current_node
534
544
 
535
545
  if self.node.child.next != NULL:
536
546
  current_node = self.node.child
@@ -564,11 +574,13 @@ cdef class Node:
564
574
  '<html><body><div>Hello world!</div></body></html>'
565
575
 
566
576
  """
577
+ # ensure cython can recast element to a Node so that decompose will be called sooner.
578
+ cdef Node element
567
579
  for tag in tags:
568
580
  for element in self.css(tag):
569
581
  element.decompose(recursive=recursive)
570
582
 
571
- def unwrap_tags(self, list tags):
583
+ def unwrap_tags(self, list tags, delete_empty = False):
572
584
  """Unwraps specified tags from the HTML tree.
573
585
 
574
586
  Works the same as the ``unwrap`` method, but applied to a list of tags.
@@ -577,6 +589,8 @@ cdef class Node:
577
589
  ----------
578
590
  tags : list
579
591
  List of tags to remove.
592
+ delete_empty : bool, default False
593
+ Whenever to delete empty tags.
580
594
 
581
595
  Examples
582
596
  --------
@@ -585,11 +599,13 @@ cdef class Node:
585
599
  >>> tree.body.unwrap_tags(['i','a'])
586
600
  >>> tree.body.html
587
601
  '<body><div>Hello world!</div></body>'
588
- """
589
602
 
603
+ Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
604
+ """
605
+ cdef Node element
590
606
  for tag in tags:
591
607
  for element in self.css(tag):
592
- element.unwrap()
608
+ element.unwrap(delete_empty)
593
609
 
594
610
  def replace_with(self, str_or_Node value):
595
611
  """Replace current Node with specified value.
@@ -752,7 +768,7 @@ cdef class Node:
752
768
  else:
753
769
  raise TypeError("Expected a string or Node instance, but %s found" % type(value).__name__)
754
770
 
755
- def unwrap_tags(self, list tags):
771
+ def unwrap_tags(self, list tags, delete_empty = False):
756
772
  """Unwraps specified tags from the HTML tree.
757
773
 
758
774
  Works the same as th ``unwrap`` method, but applied to a list of tags.
@@ -761,6 +777,8 @@ cdef class Node:
761
777
  ----------
762
778
  tags : list
763
779
  List of tags to remove.
780
+ delete_empty : bool, default False
781
+ Whenever to delete empty tags.
764
782
 
765
783
  Examples
766
784
  --------
@@ -769,11 +787,13 @@ cdef class Node:
769
787
  >>> tree.body.unwrap_tags(['i','a'])
770
788
  >>> tree.body.html
771
789
  '<body><div>Hello world!</div></body>'
772
- """
773
790
 
791
+ Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
792
+ """
793
+ cdef Node element
774
794
  for tag in tags:
775
795
  for element in self.css(tag):
776
- element.unwrap()
796
+ element.unwrap(delete_empty)
777
797
 
778
798
  @property
779
799
  def raw_value(self):
@@ -829,6 +849,7 @@ cdef class Node:
829
849
  The query to check.
830
850
 
831
851
  """
852
+ cdef Node node
832
853
  if self.parser.cached_script_texts is None:
833
854
  nodes = find_nodes(self.parser, self.node, 'script')
834
855
  text_nodes = []
@@ -877,6 +898,7 @@ cdef class Node:
877
898
  if not isinstance(other, Node):
878
899
  return False
879
900
  return self.html == other.html
901
+
880
902
  @property
881
903
  def text_content(self):
882
904
  """Returns the text of the node if it is a text node.
@@ -930,8 +952,8 @@ cdef class Node:
930
952
  while not stack.is_empty():
931
953
  current_node = stack.pop()
932
954
 
933
- if current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and \
934
- current_node.prev.tag_id == MyHTML_TAG__TEXT:
955
+ if (current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and
956
+ current_node.prev.tag_id == MyHTML_TAG__TEXT):
935
957
  left_text = myhtml_node_text(current_node.prev, &left_length)
936
958
  right_text = myhtml_node_text(current_node, &right_length)
937
959
  if left_text and right_text:
@@ -962,8 +984,8 @@ cdef inline str append_text(str text, str node_text, str separator='', bint stri
962
984
 
963
985
  cdef inline bytes to_bytes(str_or_Node value):
964
986
  cdef bytes bytes_val
965
- if isinstance(value, (str, unicode)):
966
- bytes_val = value.encode(_ENCODING)
987
+ if isinstance(value, unicode):
988
+ bytes_val = <bytes>value.encode("utf-8")
967
989
  elif isinstance(value, bytes):
968
- bytes_val = <char*> value
990
+ bytes_val = <bytes>value
969
991
  return bytes_val