PyPI - selectolax - Versions diffs - 0.3.29__cp310-cp310-win32.whl → 0.3.34__cp310-cp310-win32.whl - Mend

selectolax 0.3.29__cp310-cp310-win32.whl → 0.3.34__cp310-cp310-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of selectolax might be problematic. Click here for more details.

Files changed (26) hide show

selectolax/__init__.py +3 -5
selectolax/lexbor/attrs.pxi +26 -9
selectolax/lexbor/node.pxi +108 -47
selectolax/lexbor/selection.pxi +34 -25
selectolax/lexbor/util.pxi +1 -0
selectolax/lexbor.c +52987 -55311
selectolax/lexbor.cp310-win32.pyd +0 -0
selectolax/lexbor.pxd +36 -40
selectolax/lexbor.pyi +770 -65
selectolax/lexbor.pyx +54 -17
selectolax/modest/node.pxi +45 -42
selectolax/modest/selection.pxi +24 -22
selectolax/modest/util.pxi +1 -0
selectolax/parser.c +50190 -52325
selectolax/parser.cp310-win32.pyd +0 -0
selectolax/parser.pxd +17 -20
selectolax/parser.pyi +489 -45
selectolax/parser.pyx +39 -31
selectolax/utils.pxi +13 -3
selectolax-0.3.34.dist-info/METADATA +32 -0
selectolax-0.3.34.dist-info/RECORD +26 -0
{selectolax-0.3.29.dist-info → selectolax-0.3.34.dist-info}/WHEEL +1 -1
selectolax-0.3.29.dist-info/METADATA +0 -194
selectolax-0.3.29.dist-info/RECORD +0 -26
{selectolax-0.3.29.dist-info → selectolax-0.3.34.dist-info}/licenses/LICENSE +0 -0
{selectolax-0.3.29.dist-info → selectolax-0.3.34.dist-info}/top_level.txt +0 -0

selectolax/lexbor.pyx CHANGED Viewed

@@ -1,4 +1,5 @@
-from cpython cimport bool
+from cpython.bool cimport bool
+from cpython.exc cimport PyErr_SetObject
 _ENCODING = 'UTF-8'
@@ -24,10 +25,8 @@ cdef class LexborHTMLParser:
     html : str (unicode) or bytes
     """
     def __init__(self, html):
         cdef size_t html_len
-        cdef char* html_chars
+        cdef object bytes_html
         bytes_html, html_len = preprocess_input(html)
         self._parse_html(bytes_html, html_len)
         self.raw_html = bytes_html
@@ -39,22 +38,27 @@ cdef class LexborHTMLParser:
             self._selector = LexborCSSSelector()
         return self._selector
-    cdef _parse_html(self, char *html, size_t html_len):
+    cdef int _parse_html(self, char *html, size_t html_len) except -1:
         cdef lxb_status_t status
         with nogil:
             self.document = lxb_html_document_create()
         if self.document == NULL:
-            raise SelectolaxError("Failed to initialize object for HTML Document.")
+            PyErr_SetObject(SelectolaxError, "Failed to initialize object for HTML Document.")
+            return -1
         with nogil:
             status = lxb_html_document_parse(self.document, <lxb_char_t *> html, html_len)
         if status != 0x0000:
-            raise SelectolaxError("Can't parse HTML.")
+            PyErr_SetObject(SelectolaxError, "Can't parse HTML.")
+            return -1
-        assert self.document != NULL
+        if self.document == NULL:
+            PyErr_SetObject(RuntimeError, "document is NULL even after html was parsed correctly")
+            return -1
+        return 0
     def __dealloc__(self):
         if self.document != NULL:
@@ -68,7 +72,7 @@ cdef class LexborHTMLParser:
         """Returns root node."""
         if self.document == NULL:
             return None
-        return LexborNode()._cinit(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
+        return LexborNode.new(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
     @property
     def body(self):
@@ -77,7 +81,7 @@ cdef class LexborHTMLParser:
         body = lxb_html_document_body_element_noi(self.document)
         if body == NULL:
             return None
-        return LexborNode()._cinit(<lxb_dom_node_t *> body, self)
+        return LexborNode.new(<lxb_dom_node_t *> body, self)
     @property
     def head(self):
@@ -86,7 +90,7 @@ cdef class LexborHTMLParser:
         head = lxb_html_document_head_element_noi(self.document)
         if head == NULL:
             return None
-        return LexborNode()._cinit(<lxb_dom_node_t *> head, self)
+        return LexborNode.new(<lxb_dom_node_t *> head, self)
     def tags(self, str name):
         """Returns a list of tags that match specified name.
@@ -96,6 +100,12 @@ cdef class LexborHTMLParser:
         name : str (e.g. div)
         """
+        if not name:
+            raise ValueError("Tag name cannot be empty")
+        if len(name) > 100:
+            raise ValueError("Tag name is too long")
         cdef lxb_dom_collection_t* collection = NULL
         cdef lxb_status_t status
         pybyte_name = name.encode('UTF-8')
@@ -116,7 +126,7 @@ cdef class LexborHTMLParser:
             raise SelectolaxError("Can't locate elements.")
         for i in range(lxb_dom_collection_length_noi(collection)):
-            node = LexborNode()._cinit(
+            node = LexborNode.new(
                 <lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i),
                 self
             )
@@ -150,7 +160,7 @@ cdef class LexborHTMLParser:
         """Return HTML representation of the page."""
         if self.document == NULL:
             return None
-        node = LexborNode()._cinit(<lxb_dom_node_t *> &self.document.dom_document, self)
+        node = LexborNode.new(<lxb_dom_node_t *> &self.document.dom_document, self)
         return node.html
     def css(self, str query):
@@ -159,6 +169,11 @@ cdef class LexborHTMLParser:
         Matches pattern `query` against HTML tree.
         `CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
+        Special selectors:
+         - parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
+         - parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
         Parameters
         ----------
         query : str
@@ -232,7 +247,7 @@ cdef class LexborHTMLParser:
             for i in range(lxb_dom_collection_length_noi(collection)):
                 if recursive:
-                    lxb_dom_node_destroy_deep( <lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
+                    lxb_dom_node_destroy_deep(<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
                 else:
                     lxb_dom_node_destroy(<lxb_dom_node_t *> lxb_dom_collection_element_noi(collection, i))
             lxb_dom_collection_destroy(collection, <bint> True)
@@ -273,7 +288,6 @@ cdef class LexborHTMLParser:
         """
         return self.root.scripts_contain(query)
     def script_srcs_contain(self, tuple queries):
         """Returns True if any of the script SRCs attributes contain on of the specified text.
@@ -289,6 +303,26 @@ cdef class LexborHTMLParser:
     def css_matches(self, str selector):
         return self.root.css_matches(selector)
+    def merge_text_nodes(self):
+        """Iterates over all text nodes and merges all text nodes that are close to each other.
+        This is useful for text extraction.
+        Use it when you need to strip HTML tags and merge "dangling" text.
+        Examples
+        --------
+        >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
+        >>> node = tree.css_first('div')
+        >>> tree.unwrap_tags(["strong"])
+        >>> tree.text(deep=True, separator=" ", strip=True)
+        "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
+        >>> node.merge_text_nodes()
+        >>> tree.text(deep=True, separator=" ", strip=True)
+        "John Doe"
+        """
+        return self.root.merge_text_nodes()
     @staticmethod
     cdef LexborHTMLParser from_document(lxb_html_document_t *document, bytes raw_html):
         obj = <LexborHTMLParser> LexborHTMLParser.__new__(LexborHTMLParser)
@@ -303,6 +337,7 @@ cdef class LexborHTMLParser:
         """Clone the current tree."""
         cdef lxb_html_document_t* cloned_document
         cdef lxb_dom_node_t* cloned_node
+        cdef LexborHTMLParser cls
         with nogil:
             cloned_document = lxb_html_document_create()
@@ -327,6 +362,7 @@ cdef class LexborHTMLParser:
         cls = LexborHTMLParser.from_document(cloned_document, self.raw_html)
         return cls
     def unwrap_tags(self, list tags, delete_empty = False):
         """Unwraps specified tags from the HTML tree.
@@ -347,5 +383,6 @@ cdef class LexborHTMLParser:
         >>> tree.body.html
         '<body><div>Hello world!</div></body>'
         """
-        if self.root is not None:
+        # faster to check if the document is empty which should determine if we have a root
+        if self.document != NULL:
             self.root.unwrap_tags(tags, delete_empty=delete_empty)

selectolax/modest/node.pxi CHANGED Viewed

@@ -1,4 +1,5 @@
 cimport cython
+from cpython.exc cimport PyErr_NoMemory
 from libc.stdlib cimport free
 from libc.stdlib cimport malloc
@@ -8,12 +9,15 @@ from libc.string cimport memcpy
 DEF _STACK_SIZE = 100
 DEF _ENCODING = 'UTF-8'
 @cython.final
 cdef class Stack:
     def __cinit__(self, size_t capacity=25):
         self.capacity = capacity
         self.top = 0
         self._stack = <myhtml_tree_node_t**> malloc(capacity * sizeof(myhtml_tree_node_t))
+        if self._stack == NULL:
+            raise MemoryError("Failed to allocate memory for stack")
     def __dealloc__(self):
         free(self._stack)
@@ -21,9 +25,10 @@ cdef class Stack:
     cdef bint is_empty(self):
         return self.top <= 0
-    cdef push(self, myhtml_tree_node_t* res):
+    cdef int push(self, myhtml_tree_node_t* res) except -1:
         if self.top >= self.capacity:
-            self.resize()
+            if self.resize() < 0:
+                return -1
         self._stack[self.top] = res
         self.top += 1
@@ -31,10 +36,13 @@ cdef class Stack:
         self.top = self.top - 1
         return self._stack[self.top]
-    cdef resize(self):
+    cdef int resize(self) except -1:
         self.capacity *= 2
         self._stack = <myhtml_tree_node_t**> realloc(<void*> self._stack, self.capacity * sizeof(myhtml_tree_node_t))
+        if self._stack == NULL:
+            PyErr_NoMemory()
+            return -1
+        return 0
 cdef class _Attributes:
     """A dict-like object that represents attributes."""
@@ -128,25 +136,24 @@ cdef class _Attributes:
         tag_name = c_text.decode(_ENCODING, 'ignore') if c_text != NULL else 'unknown'
         return "<%s attributes, %s items>" % (tag_name, len(self))
 ctypedef fused str_or_Node:
-    basestring
+    str
     bytes
     Node
 cdef class Node:
     """A class that represents HTML node (element)."""
     cdef myhtml_tree_node_t *node
     cdef public HTMLParser parser
-    cdef _init(self, myhtml_tree_node_t *node, HTMLParser parser):
-        # custom init, because __cinit__ doesn't accept C types
-        self.node = node
+    @staticmethod
+    cdef Node new(myhtml_tree_node_t *node, HTMLParser parser):
+        # custom __init__ for C, because __cinit__ doesn't accept C types
+        cdef Node cls = Node.__new__(Node)
+        cls.node = node
         # Keep reference to the selector object, so myhtml structures will not be garbage collected prematurely
-        self.parser = parser
+        cls.parser = parser
+        return cls
     @property
     def attributes(self):
@@ -286,7 +293,7 @@ cdef class Node:
     cdef inline _text_deep(self, myhtml_tree_node_t *node, separator='', strip=False):
         text = ""
         cdef Stack stack = Stack(_STACK_SIZE)
-        cdef myhtml_tree_node_t* current_node = NULL;
+        cdef myhtml_tree_node_t* current_node = NULL
         if node.tag_id == MyHTML_TAG__TEXT:
             c_text = myhtml_node_text(node, NULL)
@@ -339,12 +346,10 @@ cdef class Node:
                 node = node.next
                 continue
-            next_node = Node()
-            next_node._init(node, self.parser)
+            next_node = Node.new(node, self.parser)
             yield next_node
             node = node.next
     def traverse(self, include_text=False):
         """Iterate over all child and next nodes starting from the current level.
@@ -358,16 +363,15 @@ cdef class Node:
         node
         """
         cdef Stack stack = Stack(_STACK_SIZE)
-        cdef myhtml_tree_node_t* current_node = NULL;
-        cdef Node next_node;
+        cdef myhtml_tree_node_t* current_node = NULL
+        cdef Node next_node
         stack.push(self.node)
         while not stack.is_empty():
             current_node = stack.pop()
             if current_node != NULL and not (current_node.tag_id == MyHTML_TAG__TEXT and not include_text):
-                next_node = Node()
-                next_node._init(current_node, self.parser)
+                next_node = Node.new(current_node, self.parser)
                 yield next_node
             if current_node.next is not NULL:
@@ -396,8 +400,7 @@ cdef class Node:
         """Return the child node."""
         cdef Node node
         if self.node.child:
-            node = Node()
-            node._init(self.node.child, self.parser)
+            node = Node.new(self.node.child, self.parser)
             return node
         return None
@@ -406,8 +409,7 @@ cdef class Node:
         """Return the parent node."""
         cdef Node node
         if self.node.parent:
-            node = Node()
-            node._init(self.node.parent, self.parser)
+            node = Node.new(self.node.parent, self.parser)
             return node
         return None
@@ -416,8 +418,7 @@ cdef class Node:
         """Return next node."""
         cdef Node node
         if self.node.next:
-            node = Node()
-            node._init(self.node.next, self.parser)
+            node = Node.new(self.node.next, self.parser)
             return node
         return None
@@ -426,8 +427,7 @@ cdef class Node:
         """Return previous node."""
         cdef Node node
         if self.node.prev:
-            node = Node()
-            node._init(self.node.prev, self.parser)
+            node = Node.new(self.node.prev, self.parser)
             return node
         return None
@@ -436,8 +436,7 @@ cdef class Node:
         """Return last child node."""
         cdef Node node
         if self.node.last_child:
-            node = Node()
-            node._init(self.node.last_child, self.parser)
+            node = Node.new(self.node.last_child, self.parser)
             return node
         return None
@@ -537,8 +536,8 @@ cdef class Node:
             if delete_empty:
                 myhtml_node_delete(self.node)
             return
-        cdef myhtml_tree_node_t* next_node;
-        cdef myhtml_tree_node_t* current_node;
+        cdef myhtml_tree_node_t* next_node
+        cdef myhtml_tree_node_t* current_node
         if self.node.child.next != NULL:
             current_node = self.node.child
@@ -572,6 +571,8 @@ cdef class Node:
         '<html><body><div>Hello world!</div></body></html>'
         """
+        # ensure cython can recast element to a Node so that decompose will be called sooner.
+        cdef Node element
         for tag in tags:
             for element in self.css(tag):
                 element.decompose(recursive=recursive)
@@ -595,10 +596,10 @@ cdef class Node:
         >>> tree.body.unwrap_tags(['i','a'])
         >>> tree.body.html
         '<body><div>Hello world!</div></body>'
         Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
         """
+        cdef Node element
         for tag in tags:
             for element in self.css(tag):
                 element.unwrap(delete_empty)
@@ -783,10 +784,10 @@ cdef class Node:
         >>> tree.body.unwrap_tags(['i','a'])
         >>> tree.body.html
         '<body><div>Hello world!</div></body>'
         Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
         """
+        cdef Node element
         for tag in tags:
             for element in self.css(tag):
                 element.unwrap(delete_empty)
@@ -845,6 +846,7 @@ cdef class Node:
             The query to check.
         """
+        cdef Node node
         if self.parser.cached_script_texts is None:
             nodes = find_nodes(self.parser, self.node, 'script')
             text_nodes = []
@@ -893,6 +895,7 @@ cdef class Node:
         if not isinstance(other, Node):
             return False
         return self.html == other.html
     @property
     def text_content(self):
         """Returns the text of the node if it is a text node.
@@ -946,8 +949,8 @@ cdef class Node:
         while not stack.is_empty():
             current_node = stack.pop()
-            if current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and \
-                current_node.prev.tag_id == MyHTML_TAG__TEXT:
+            if (current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and
+                    current_node.prev.tag_id == MyHTML_TAG__TEXT):
                 left_text = myhtml_node_text(current_node.prev, &left_length)
                 right_text = myhtml_node_text(current_node, &right_length)
                 if left_text and right_text:
@@ -978,8 +981,8 @@ cdef inline str append_text(str text, str node_text, str separator='', bint stri
 cdef inline bytes to_bytes(str_or_Node value):
     cdef bytes bytes_val
-    if isinstance(value, (str, unicode)):
-        bytes_val = value.encode(_ENCODING)
+    if isinstance(value, unicode):
+        bytes_val = <bytes>value.encode("utf-8")
     elif isinstance(value, bytes):
-        bytes_val =  <char*> value
+        bytes_val = <bytes>value
     return bytes_val

selectolax/modest/selection.pxi CHANGED Viewed

@@ -1,4 +1,6 @@
 cimport cython
+from cpython.exc cimport PyErr_SetObject
 @cython.final
 cdef class CSSSelector:
@@ -28,35 +30,33 @@ cdef class CSSSelector:
         return collection
-    cdef _create_css_parser(self):
+    cdef int _create_css_parser(self) except -1:
         cdef mystatus_t status
         cdef mycss_t *mycss = mycss_create()
         status = mycss_init(mycss)
         if status != 0:
-            raise RuntimeError("Can't init MyCSS object.")
-            # return
+            PyErr_SetObject(RuntimeError, "Can't init MyCSS object.")
+            return -1
         self.css_entry = mycss_entry_create()
         status = mycss_entry_init(mycss, self.css_entry)
         if status != 0:
-            raise RuntimeError("Can't init MyCSS Entry object.")
+            PyErr_SetObject(RuntimeError, "Can't init MyCSS Entry object.")
+            return -1
+        return 0
-    cdef _prepare_selector(self, mycss_entry_t *css_entry,
-                                                   const char *selector, size_t selector_size):
-        cdef mystatus_t out_status;
-        self.selectors_list = mycss_selectors_parse(mycss_entry_selectors(css_entry),
-                                                         myencoding_t.MyENCODING_UTF_8,
-                                                         selector, selector_size,
-                                                         &out_status)
+    cdef int _prepare_selector(self, mycss_entry_t *css_entry, const char *selector, size_t selector_size) except -1:
+        cdef mystatus_t out_status
+        self.selectors_list = mycss_selectors_parse(mycss_entry_selectors(css_entry), myencoding_t.MyENCODING_UTF_8,
+                                                    selector, selector_size, &out_status)
         if (self.selectors_list == NULL) or (self.selectors_list.flags and MyCSS_SELECTORS_FLAGS_SELECTOR_BAD):
-            raise  ValueError("Bad CSS Selectors: %s" % self.c_selector.decode('utf-8'))
+            PyErr_SetObject(ValueError, "Bad CSS Selectors: %s" % self.c_selector.decode('utf-8'))
+            return -1
+        return 0
     def __dealloc__(self):
         mycss_selectors_list_destroy(mycss_entry_selectors(self.css_entry), self.selectors_list, 1)
@@ -77,12 +77,11 @@ cdef class Selector:
     cdef Node node
     cdef list nodes
-    def __init__(self, Node node, query):
+    def __init__(self, Node node, str query):
         """custom init, because __cinit__ doesn't accept C types"""
         self.node = node
         self.nodes = find_nodes(node.parser, node.node, query) if query else [node, ]
     cpdef css(self, str query):
         """Evaluate CSS selector against current scope."""
         cdef Node current_node
@@ -106,6 +105,7 @@ cdef class Selector:
     def text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
         """Filter all current matches given text."""
         nodes = []
+        cdef Node node
         for node in self.nodes:
             node_text = node.text(deep=deep, separator=separator, strip=strip)
             if node_text and text in node_text:
@@ -116,6 +116,7 @@ cdef class Selector:
     def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
         """Returns True if any node in the current search scope contains specified text"""
         nodes = []
+        cdef Node node
         for node in self.nodes:
             node_text = node.text(deep=deep, separator=separator, strip=strip)
             if node_text and text in node_text:
@@ -142,7 +143,8 @@ cdef class Selector:
         Similar to `string-length` in XPath.
         """
-        nodes = []
+        cdef list nodes = []
+        cdef Node node
         for node in self.nodes:
             attr = node.attributes.get(attribute)
             if attr and start and start in attr:
@@ -157,16 +159,15 @@ cdef class Selector:
 cdef find_nodes(HTMLParser parser, myhtml_tree_node_t *node, str query):
     cdef myhtml_collection_t *collection
     cdef CSSSelector selector = CSSSelector(query)
-    result = list()
+    cdef Node n
+    cdef list result = []
     collection = selector.find(node)
     if collection == NULL:
         return result
     for i in range(collection.length):
-        n = Node()
-        n._init(collection.list[i], parser)
+        n = Node.new(collection.list[i], parser)
         result.append(n)
     myhtml_collection_destroy(collection)
     return result
@@ -176,6 +177,7 @@ cdef bool find_matches(HTMLParser parser, myhtml_tree_node_t *node, tuple select
     cdef myhtml_collection_t *collection
     cdef CSSSelector selector
     cdef int collection_size
+    cdef str query
     for query in selectors:
         selector = CSSSelector(query)

selectolax/modest/util.pxi CHANGED Viewed

@@ -1,5 +1,6 @@
 include "../utils.pxi"
 def create_tag(tag: str):
     """
     Given an HTML tag name, e.g. `"div"`, create a single empty node for that tag,

selectolax 0.3.29__cp310-cp310-win32.whl → 0.3.34__cp310-cp310-win32.whl

Potentially problematic release.

selectolax 0.3.29__cp310-cp310-win32.whl → 0.3.34__cp310-cp310-win32.whl