selectolax 0.3.15__cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl → 0.3.28__cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of selectolax might be problematic. Click here for more details.

selectolax/__init__.py CHANGED
@@ -3,7 +3,7 @@
3
3
 
4
4
  __author__ = """Artem Golubin"""
5
5
  __email__ = 'me@rushter.com'
6
- __version__ = '0.3.15'
6
+ __version__ = '0.3.28'
7
7
 
8
8
  from . import parser
9
9
  from . import lexbor
@@ -19,8 +19,9 @@ cdef class LexborAttributes:
19
19
 
20
20
  while attr != NULL:
21
21
  key = lxb_dom_attr_local_name_noi(attr, &str_len)
22
+ if key is not NULL:
23
+ yield key.decode(_ENCODING)
22
24
  attr = attr.next
23
- yield key.decode(_ENCODING)
24
25
 
25
26
  def __setitem__(self, str key, value):
26
27
  value = str(value)
@@ -27,6 +27,10 @@ cdef class LexborNode:
27
27
  self.node = node
28
28
  return self
29
29
 
30
+ @property
31
+ def mem_id(self):
32
+ return <size_t> self.node
33
+
30
34
  @property
31
35
  def child(self):
32
36
  """Alias for the `first_child` property."""
@@ -91,7 +95,7 @@ cdef class LexborNode:
91
95
  text : str
92
96
  """
93
97
  cdef lexbor_str_t *lxb_str
94
- cdef lxb_status_t lxb_status_t
98
+ cdef lxb_status_t status
95
99
 
96
100
  lxb_str = lexbor_str_create()
97
101
  status = lxb_html_serialize_tree_str(self.node, lxb_str)
@@ -101,6 +105,9 @@ cdef class LexborNode:
101
105
  return html
102
106
  return None
103
107
 
108
+ def __hash__(self):
109
+ return self.mem_id
110
+
104
111
  def text_lexbor(self):
105
112
  """Returns the text of the node including text of all its child nodes.
106
113
 
@@ -648,6 +655,57 @@ cdef class LexborNode:
648
655
  else:
649
656
  raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
650
657
 
658
+ def insert_child(self, str_or_LexborNode value):
659
+ """
660
+ Insert a node inside (at the end of) the current Node.
661
+
662
+ Parameters
663
+ ----------
664
+ value : str, bytes or Node
665
+ The text or Node instance to insert inside the Node.
666
+ When a text string is passed, it's treated as text. All HTML tags will be escaped.
667
+ Convert and pass the ``Node`` object when you want to work with HTML.
668
+ Does not clone the ``Node`` object.
669
+ All future changes to the passed ``Node`` object will also be taken into account.
670
+
671
+ Examples
672
+ --------
673
+
674
+ >>> tree = LexborHTMLParser('<div>Get <img src=""></div>')
675
+ >>> div = tree.css_first('div')
676
+ >>> div.insert_child('Laptop')
677
+ >>> tree.body.child.html
678
+ '<div>Get <img src="">Laptop</div>'
679
+
680
+ >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"> <div>Laptop</div> </span></div>')
681
+ >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
682
+ >>> span_node = html_parser.css_first('span')
683
+ >>> span_node.insert_child(html_parser2.body.child)
684
+ <div>Get <span alt="Laptop"> <div>Laptop</div> <div>Test</div> </span></div>'
685
+ """
686
+ cdef lxb_dom_node_t * new_node
687
+
688
+ if isinstance(value, (str, bytes, unicode)):
689
+ bytes_val = to_bytes(value)
690
+ new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
691
+ &self.parser.document.dom_document,
692
+ <lxb_char_t *> bytes_val, len(bytes_val)
693
+ )
694
+ if new_node == NULL:
695
+ raise SelectolaxError("Can't create a new node")
696
+ lxb_dom_node_insert_child(self.node, new_node)
697
+ elif isinstance(value, LexborNode):
698
+ new_node = lxb_dom_document_import_node(
699
+ &self.parser.document.dom_document,
700
+ <lxb_dom_node_t *> value.node,
701
+ <bint> True
702
+ )
703
+ if new_node == NULL:
704
+ raise SelectolaxError("Can't create a new node")
705
+ lxb_dom_node_insert_child(self.node, <lxb_dom_node_t *> new_node)
706
+ else:
707
+ raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
708
+
651
709
  @property
652
710
  def raw_value(self):
653
711
  """Return the raw (unparsed, original) value of a node.
@@ -28,7 +28,7 @@ cdef class LexborCSSSelector:
28
28
 
29
29
  self.selectors = lxb_selectors_create()
30
30
  status = lxb_selectors_init(self.selectors)
31
-
31
+ lxb_selectors_opt_set(self.selectors, LXB_SELECTORS_OPT_MATCH_ROOT)
32
32
  if status != LXB_STATUS_OK:
33
33
  raise SelectolaxError("Can't initialize CSS selector.")
34
34
 
@@ -72,14 +72,17 @@ cdef class LexborCSSSelector:
72
72
  raise SelectolaxError("Can't parse CSS selector.")
73
73
  result = bool(self.results)
74
74
  self.results = []
75
+ lxb_css_selector_list_destroy_memory(selectors_list)
75
76
  return result
76
77
 
77
78
 
78
79
  def __dealloc__(self):
79
- lxb_selectors_destroy(self.selectors, True)
80
- lxb_css_parser_destroy(self.parser, True)
81
- lxb_css_selectors_destroy(self.css_selectors, True)
82
- # lxb_css_memory_destroy(, True)
80
+ if self.selectors != NULL:
81
+ lxb_selectors_destroy(self.selectors, True)
82
+ if self.parser != NULL:
83
+ lxb_css_parser_destroy(self.parser, True)
84
+ if self.css_selectors != NULL:
85
+ lxb_css_selectors_destroy(self.css_selectors, True)
83
86
 
84
87
 
85
88
 
@@ -103,16 +106,16 @@ cdef class LexborSelector:
103
106
  raise SelectolaxError("This features is not supported by the lexbor backend. Please use Modest backend.")
104
107
 
105
108
  @property
106
- def matches(self):
109
+ def matches(self) -> list:
107
110
  """Returns all possible matches"""
108
111
  return self.nodes
109
112
 
110
113
  @property
111
- def any_matches(self):
114
+ def any_matches(self) -> bool:
112
115
  """Returns True if there are any matches"""
113
116
  return bool(self.nodes)
114
117
 
115
- def text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
118
+ def text_contains(self, str text, bool deep=True, str separator='', bool strip=False) -> LexborSelector:
116
119
  """Filter all current matches given text."""
117
120
  nodes = []
118
121
  for node in self.nodes:
@@ -122,7 +125,7 @@ cdef class LexborSelector:
122
125
  self.nodes = nodes
123
126
  return self
124
127
 
125
- def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
128
+ def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False) -> bool:
126
129
  """Returns True if any node in the current search scope contains specified text"""
127
130
  nodes = []
128
131
  for node in self.nodes:
@@ -131,7 +134,7 @@ cdef class LexborSelector:
131
134
  return True
132
135
  return False
133
136
 
134
- def attribute_longer_than(self, str attribute, int length, str start = None):
137
+ def attribute_longer_than(self, str attribute, int length, str start = None) -> LexborSelector:
135
138
  """Filter all current matches by attribute length.
136
139
 
137
140
  Similar to `string-length` in XPath.
@@ -146,7 +149,7 @@ cdef class LexborSelector:
146
149
  self.nodes = nodes
147
150
  return self
148
151
 
149
- def any_attribute_longer_than(self, str attribute, int length, str start = None):
152
+ def any_attribute_longer_than(self, str attribute, int length, str start = None) -> bool:
150
153
  """Returns True any href attribute longer than a specified length.
151
154
 
152
155
  Similar to `string-length` in XPath.
@@ -0,0 +1,19 @@
1
+ include "../utils.pxi"
2
+
3
+ def create_tag(tag: str):
4
+ """
5
+ Given an HTML tag name, e.g. `"div"`, create a single empty node for that tag,
6
+ e.g. `"<div></div>"`.
7
+ """
8
+ return do_create_tag(tag, LexborHTMLParser)
9
+
10
+
11
+ def parse_fragment(html: str):
12
+ """
13
+ Given HTML, parse it into a list of Nodes, such that the nodes
14
+ correspond to the given HTML.
15
+
16
+ For contrast, HTMLParser adds `<html>`, `<head>`, and `<body>` tags
17
+ if they are missing. This function does not add these tags.
18
+ """
19
+ return do_parse_fragment(html, LexborHTMLParser)