selectolax 0.3.16__cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl → 0.3.29__cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of selectolax might be problematic. Click here for more details.

selectolax/__init__.py CHANGED
@@ -3,7 +3,7 @@
3
3
 
4
4
  __author__ = """Artem Golubin"""
5
5
  __email__ = 'me@rushter.com'
6
- __version__ = '0.3.16'
6
+ __version__ = '0.3.29'
7
7
 
8
8
  from . import parser
9
9
  from . import lexbor
@@ -19,8 +19,9 @@ cdef class LexborAttributes:
19
19
 
20
20
  while attr != NULL:
21
21
  key = lxb_dom_attr_local_name_noi(attr, &str_len)
22
+ if key is not NULL:
23
+ yield key.decode(_ENCODING)
22
24
  attr = attr.next
23
- yield key.decode(_ENCODING)
24
25
 
25
26
  def __setitem__(self, str key, value):
26
27
  value = str(value)
@@ -95,7 +95,7 @@ cdef class LexborNode:
95
95
  text : str
96
96
  """
97
97
  cdef lexbor_str_t *lxb_str
98
- cdef lxb_status_t lxb_status_t
98
+ cdef lxb_status_t status
99
99
 
100
100
  lxb_str = lexbor_str_create()
101
101
  status = lxb_html_serialize_tree_str(self.node, lxb_str)
@@ -416,9 +416,14 @@ cdef class LexborNode:
416
416
  node = node.next
417
417
 
418
418
 
419
- def unwrap(self):
419
+ def unwrap(self, delete_empty=False):
420
420
  """Replace node with whatever is inside this node.
421
421
 
422
+ Parameters
423
+ ----------
424
+ delete_empty : bool, default False
425
+ If True, removes empty tags.
426
+
422
427
  Examples
423
428
  --------
424
429
 
@@ -426,9 +431,12 @@ cdef class LexborNode:
426
431
  >>> tree.css_first('i').unwrap()
427
432
  >>> tree.html
428
433
  '<html><head></head><body><div>Hello world!</div></body></html>'
429
-
434
+
435
+ Note: by default, empty tags are ignored, use "delete_empty" to change this.
430
436
  """
431
437
  if self.node.first_child == NULL:
438
+ if delete_empty:
439
+ lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
432
440
  return
433
441
  cdef lxb_dom_node_t* next_node;
434
442
  cdef lxb_dom_node_t* current_node;
@@ -445,7 +453,7 @@ cdef class LexborNode:
445
453
  lxb_dom_node_insert_before(self.node, self.node.first_child)
446
454
  lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
447
455
 
448
- def unwrap_tags(self, list tags):
456
+ def unwrap_tags(self, list tags, delete_empty = False):
449
457
  """Unwraps specified tags from the HTML tree.
450
458
 
451
459
  Works the same as the ``unwrap`` method, but applied to a list of tags.
@@ -454,6 +462,8 @@ cdef class LexborNode:
454
462
  ----------
455
463
  tags : list
456
464
  List of tags to remove.
465
+ delete_empty : bool, default False
466
+ If True, removes empty tags.
457
467
 
458
468
  Examples
459
469
  --------
@@ -462,11 +472,13 @@ cdef class LexborNode:
462
472
  >>> tree.body.unwrap_tags(['i','a'])
463
473
  >>> tree.body.html
464
474
  '<body><div>Hello world!</div></body>'
475
+
476
+ Note: by default, empty tags are ignored, use "delete_empty" to change this.
465
477
  """
466
478
 
467
479
  for tag in tags:
468
480
  for element in self.css(tag):
469
- element.unwrap()
481
+ element.unwrap(delete_empty)
470
482
 
471
483
 
472
484
  def traverse(self, include_text=False):
@@ -655,6 +667,57 @@ cdef class LexborNode:
655
667
  else:
656
668
  raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
657
669
 
670
+ def insert_child(self, str_or_LexborNode value):
671
+ """
672
+ Insert a node inside (at the end of) the current Node.
673
+
674
+ Parameters
675
+ ----------
676
+ value : str, bytes or Node
677
+ The text or Node instance to insert inside the Node.
678
+ When a text string is passed, it's treated as text. All HTML tags will be escaped.
679
+ Convert and pass the ``Node`` object when you want to work with HTML.
680
+ Does not clone the ``Node`` object.
681
+ All future changes to the passed ``Node`` object will also be taken into account.
682
+
683
+ Examples
684
+ --------
685
+
686
+ >>> tree = LexborHTMLParser('<div>Get <img src=""></div>')
687
+ >>> div = tree.css_first('div')
688
+ >>> div.insert_child('Laptop')
689
+ >>> tree.body.child.html
690
+ '<div>Get <img src="">Laptop</div>'
691
+
692
+ >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"> <div>Laptop</div> </span></div>')
693
+ >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
694
+ >>> span_node = html_parser.css_first('span')
695
+ >>> span_node.insert_child(html_parser2.body.child)
696
+ <div>Get <span alt="Laptop"> <div>Laptop</div> <div>Test</div> </span></div>'
697
+ """
698
+ cdef lxb_dom_node_t * new_node
699
+
700
+ if isinstance(value, (str, bytes, unicode)):
701
+ bytes_val = to_bytes(value)
702
+ new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
703
+ &self.parser.document.dom_document,
704
+ <lxb_char_t *> bytes_val, len(bytes_val)
705
+ )
706
+ if new_node == NULL:
707
+ raise SelectolaxError("Can't create a new node")
708
+ lxb_dom_node_insert_child(self.node, new_node)
709
+ elif isinstance(value, LexborNode):
710
+ new_node = lxb_dom_document_import_node(
711
+ &self.parser.document.dom_document,
712
+ <lxb_dom_node_t *> value.node,
713
+ <bint> True
714
+ )
715
+ if new_node == NULL:
716
+ raise SelectolaxError("Can't create a new node")
717
+ lxb_dom_node_insert_child(self.node, <lxb_dom_node_t *> new_node)
718
+ else:
719
+ raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
720
+
658
721
  @property
659
722
  def raw_value(self):
660
723
  """Return the raw (unparsed, original) value of a node.
@@ -28,7 +28,7 @@ cdef class LexborCSSSelector:
28
28
 
29
29
  self.selectors = lxb_selectors_create()
30
30
  status = lxb_selectors_init(self.selectors)
31
-
31
+ lxb_selectors_opt_set(self.selectors, LXB_SELECTORS_OPT_MATCH_ROOT)
32
32
  if status != LXB_STATUS_OK:
33
33
  raise SelectolaxError("Can't initialize CSS selector.")
34
34
 
@@ -72,14 +72,17 @@ cdef class LexborCSSSelector:
72
72
  raise SelectolaxError("Can't parse CSS selector.")
73
73
  result = bool(self.results)
74
74
  self.results = []
75
+ lxb_css_selector_list_destroy_memory(selectors_list)
75
76
  return result
76
77
 
77
78
 
78
79
  def __dealloc__(self):
79
- lxb_selectors_destroy(self.selectors, True)
80
- lxb_css_parser_destroy(self.parser, True)
81
- lxb_css_selectors_destroy(self.css_selectors, True)
82
- # lxb_css_memory_destroy(, True)
80
+ if self.selectors != NULL:
81
+ lxb_selectors_destroy(self.selectors, True)
82
+ if self.parser != NULL:
83
+ lxb_css_parser_destroy(self.parser, True)
84
+ if self.css_selectors != NULL:
85
+ lxb_css_selectors_destroy(self.css_selectors, True)
83
86
 
84
87
 
85
88
 
@@ -103,16 +106,16 @@ cdef class LexborSelector:
103
106
  raise SelectolaxError("This features is not supported by the lexbor backend. Please use Modest backend.")
104
107
 
105
108
  @property
106
- def matches(self):
109
+ def matches(self) -> list:
107
110
  """Returns all possible matches"""
108
111
  return self.nodes
109
112
 
110
113
  @property
111
- def any_matches(self):
114
+ def any_matches(self) -> bool:
112
115
  """Returns True if there are any matches"""
113
116
  return bool(self.nodes)
114
117
 
115
- def text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
118
+ def text_contains(self, str text, bool deep=True, str separator='', bool strip=False) -> LexborSelector:
116
119
  """Filter all current matches given text."""
117
120
  nodes = []
118
121
  for node in self.nodes:
@@ -122,7 +125,7 @@ cdef class LexborSelector:
122
125
  self.nodes = nodes
123
126
  return self
124
127
 
125
- def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
128
+ def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False) -> bool:
126
129
  """Returns True if any node in the current search scope contains specified text"""
127
130
  nodes = []
128
131
  for node in self.nodes:
@@ -131,7 +134,7 @@ cdef class LexborSelector:
131
134
  return True
132
135
  return False
133
136
 
134
- def attribute_longer_than(self, str attribute, int length, str start = None):
137
+ def attribute_longer_than(self, str attribute, int length, str start = None) -> LexborSelector:
135
138
  """Filter all current matches by attribute length.
136
139
 
137
140
  Similar to `string-length` in XPath.
@@ -146,7 +149,7 @@ cdef class LexborSelector:
146
149
  self.nodes = nodes
147
150
  return self
148
151
 
149
- def any_attribute_longer_than(self, str attribute, int length, str start = None):
152
+ def any_attribute_longer_than(self, str attribute, int length, str start = None) -> bool:
150
153
  """Returns True any href attribute longer than a specified length.
151
154
 
152
155
  Similar to `string-length` in XPath.
@@ -0,0 +1,19 @@
1
+ include "../utils.pxi"
2
+
3
+ def create_tag(tag: str):
4
+ """
5
+ Given an HTML tag name, e.g. `"div"`, create a single empty node for that tag,
6
+ e.g. `"<div></div>"`.
7
+ """
8
+ return do_create_tag(tag, LexborHTMLParser)
9
+
10
+
11
+ def parse_fragment(html: str):
12
+ """
13
+ Given HTML, parse it into a list of Nodes, such that the nodes
14
+ correspond to the given HTML.
15
+
16
+ For contrast, HTMLParser adds `<html>`, `<head>`, and `<body>` tags
17
+ if they are missing. This function does not add these tags.
18
+ """
19
+ return do_parse_fragment(html, LexborHTMLParser)