PyPI - selectolax - Versions diffs - 0.3.28__cp311-cp311-musllinux_1_2_aarch64.whl → 0.4.0__cp311-cp311-musllinux_1_2_aarch64.whl - Mend

selectolax 0.3.28__cp311-cp311-musllinux_1_2_aarch64.whl → 0.4.0__cp311-cp311-musllinux_1_2_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of selectolax might be problematic. Click here for more details.

Files changed (27) hide show

selectolax/__init__.py +3 -5
selectolax/lexbor/attrs.pxi +26 -9
selectolax/lexbor/node.pxi +225 -58
selectolax/lexbor/node_remove.pxi +29 -0
selectolax/lexbor/selection.pxi +57 -26
selectolax/lexbor/util.pxi +1 -0
selectolax/lexbor.c +21988 -22274
selectolax/lexbor.cpython-311-aarch64-linux-musl.so +0 -0
selectolax/lexbor.pxd +44 -40
selectolax/lexbor.pyi +847 -65
selectolax/lexbor.pyx +98 -23
selectolax/modest/node.pxi +68 -46
selectolax/modest/selection.pxi +24 -22
selectolax/modest/util.pxi +1 -0
selectolax/parser.c +18150 -20047
selectolax/parser.cpython-311-aarch64-linux-musl.so +0 -0
selectolax/parser.pxd +17 -20
selectolax/parser.pyi +493 -53
selectolax/parser.pyx +45 -35
selectolax/utils.pxi +13 -3
selectolax-0.4.0.dist-info/METADATA +32 -0
selectolax-0.4.0.dist-info/RECORD +27 -0
{selectolax-0.3.28.dist-info → selectolax-0.4.0.dist-info}/WHEEL +1 -1
selectolax-0.3.28.dist-info/METADATA +0 -183
selectolax-0.3.28.dist-info/RECORD +0 -26
{selectolax-0.3.28.dist-info → selectolax-0.4.0.dist-info/licenses}/LICENSE +0 -0
{selectolax-0.3.28.dist-info → selectolax-0.4.0.dist-info}/top_level.txt +0 -0

selectolax/lexbor.pyx CHANGED Viewed

@@ -1,4 +1,6 @@
-from cpython cimport bool
+from cpython.bool cimport bool
+from cpython.exc cimport PyErr_SetObject
 _ENCODING = 'UTF-8'
@@ -8,6 +10,7 @@ include "lexbor/attrs.pxi"
 include "lexbor/node.pxi"
 include "lexbor/selection.pxi"
 include "lexbor/util.pxi"
+include "lexbor/node_remove.pxi"
 # We don't inherit from HTMLParser here, because it also includes all the C code from Modest.
@@ -24,10 +27,8 @@ cdef class LexborHTMLParser:
     html : str (unicode) or bytes
     """
     def __init__(self, html):
         cdef size_t html_len
-        cdef char* html_chars
+        cdef object bytes_html
         bytes_html, html_len = preprocess_input(html)
         self._parse_html(bytes_html, html_len)
         self.raw_html = bytes_html
@@ -39,22 +40,27 @@ cdef class LexborHTMLParser:
             self._selector = LexborCSSSelector()
         return self._selector
-    cdef _parse_html(self, char *html, size_t html_len):
+    cdef int _parse_html(self, char *html, size_t html_len) except -1:
         cdef lxb_status_t status
         with nogil:
             self.document = lxb_html_document_create()
         if self.document == NULL:
-            raise SelectolaxError("Failed to initialize object for HTML Document.")
+            PyErr_SetObject(SelectolaxError, "Failed to initialize object for HTML Document.")
+            return -1
         with nogil:
             status = lxb_html_document_parse(self.document, <lxb_char_t *> html, html_len)
         if status != 0x0000:
-            raise SelectolaxError("Can't parse HTML.")
+            PyErr_SetObject(SelectolaxError, "Can't parse HTML.")
+            return -1
-        assert self.document != NULL
+        if self.document == NULL:
+            PyErr_SetObject(RuntimeError, "document is NULL even after html was parsed correctly")
+            return -1
+        return 0
     def __dealloc__(self):
         if self.document != NULL:
@@ -68,7 +74,7 @@ cdef class LexborHTMLParser:
         """Returns root node."""
         if self.document == NULL:
             return None
-        return LexborNode()._cinit(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
+        return LexborNode.new(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
     @property
     def body(self):
@@ -77,7 +83,7 @@ cdef class LexborHTMLParser:
         body = lxb_html_document_body_element_noi(self.document)
         if body == NULL:
             return None
-        return LexborNode()._cinit(<lxb_dom_node_t *> body, self)
+        return LexborNode.new(<lxb_dom_node_t *> body, self)
     @property
     def head(self):
@@ -86,7 +92,7 @@ cdef class LexborHTMLParser:
         head = lxb_html_document_head_element_noi(self.document)
         if head == NULL:
             return None
-        return LexborNode()._cinit(<lxb_dom_node_t *> head, self)
+        return LexborNode.new(<lxb_dom_node_t *> head, self)
     def tags(self, str name):
         """Returns a list of tags that match specified name.
@@ -96,6 +102,12 @@ cdef class LexborHTMLParser:
         name : str (e.g. div)
         """
+        if not name:
+            raise ValueError("Tag name cannot be empty")
+        if len(name) > 100:
+            raise ValueError("Tag name is too long")
         cdef lxb_dom_collection_t* collection = NULL
         cdef lxb_status_t status
         pybyte_name = name.encode('UTF-8')
@@ -116,7 +128,7 @@ cdef class LexborHTMLParser:
             raise SelectolaxError("Can't locate elements.")
         for i in range(lxb_dom_collection_length_noi(collection)):
-            node = LexborNode()._cinit(
+            node = LexborNode.new(
                 <lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i),
                 self
             )
@@ -150,7 +162,7 @@ cdef class LexborHTMLParser:
         """Return HTML representation of the page."""
         if self.document == NULL:
             return None
-        node = LexborNode()._cinit(<lxb_dom_node_t *> &self.document.dom_document, self)
+        node = LexborNode.new(<lxb_dom_node_t *> &self.document.dom_document, self)
         return node.html
     def css(self, str query):
@@ -159,6 +171,11 @@ cdef class LexborHTMLParser:
         Matches pattern `query` against HTML tree.
         `CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
+        Special selectors:
+         - parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
+         - parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
         Parameters
         ----------
         query : str
@@ -177,9 +194,9 @@ cdef class LexborHTMLParser:
         ----------
         query : str
-        default : bool, default None
+        default : Any, default None
             Default value to return if there is no match.
-        strict: bool, default True
+        strict: bool, default False
             Set to True if you want to check if there is strictly only one match in the document.
@@ -196,7 +213,7 @@ cdef class LexborHTMLParser:
         ----------
         tags : list of str
             List of tags to remove.
-        recursive : bool, default True
+        recursive : bool, default False
             Whether to delete all its child nodes
         Examples
@@ -232,7 +249,7 @@ cdef class LexborHTMLParser:
             for i in range(lxb_dom_collection_length_noi(collection)):
                 if recursive:
-                    lxb_dom_node_destroy_deep( <lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
+                    lxb_dom_node_destroy_deep(<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
                 else:
                     lxb_dom_node_destroy(<lxb_dom_node_t *> lxb_dom_collection_element_noi(collection, i))
             lxb_dom_collection_destroy(collection, <bint> True)
@@ -273,7 +290,6 @@ cdef class LexborHTMLParser:
         """
         return self.root.scripts_contain(query)
     def script_srcs_contain(self, tuple queries):
         """Returns True if any of the script SRCs attributes contain on of the specified text.
@@ -289,6 +305,26 @@ cdef class LexborHTMLParser:
     def css_matches(self, str selector):
         return self.root.css_matches(selector)
+    def merge_text_nodes(self):
+        """Iterates over all text nodes and merges all text nodes that are close to each other.
+        This is useful for text extraction.
+        Use it when you need to strip HTML tags and merge "dangling" text.
+        Examples
+        --------
+        >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
+        >>> node = tree.css_first('div')
+        >>> tree.unwrap_tags(["strong"])
+        >>> tree.text(deep=True, separator=" ", strip=True)
+        "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
+        >>> node.merge_text_nodes()
+        >>> tree.text(deep=True, separator=" ", strip=True)
+        "John Doe"
+        """
+        return self.root.merge_text_nodes()
     @staticmethod
     cdef LexborHTMLParser from_document(lxb_html_document_t *document, bytes raw_html):
         obj = <LexborHTMLParser> LexborHTMLParser.__new__(LexborHTMLParser)
@@ -300,9 +336,16 @@ cdef class LexborHTMLParser:
         return obj
     def clone(self):
-        """Clone the current tree."""
+        """Clone the current node.
+        You can use it to make temporary modifications without affecting the original HTML tree.
+        It is tied to the current parser instance.
+        Gets destroyed when parser instance is destroyed.
+        """
         cdef lxb_html_document_t* cloned_document
         cdef lxb_dom_node_t* cloned_node
+        cdef LexborHTMLParser cls
         with nogil:
             cloned_document = lxb_html_document_create()
@@ -327,7 +370,8 @@ cdef class LexborHTMLParser:
         cls = LexborHTMLParser.from_document(cloned_document, self.raw_html)
         return cls
-    def unwrap_tags(self, list tags):
+    def unwrap_tags(self, list tags, delete_empty = False):
         """Unwraps specified tags from the HTML tree.
         Works the same as the ``unwrap`` method, but applied to a list of tags.
@@ -336,6 +380,8 @@ cdef class LexborHTMLParser:
         ----------
         tags : list
             List of tags to remove.
+        delete_empty : bool
+            Whether to delete empty tags.
         Examples
         --------
@@ -345,5 +391,34 @@ cdef class LexborHTMLParser:
         >>> tree.body.html
         '<body><div>Hello world!</div></body>'
         """
-        if self.root is not None:
-            self.root.unwrap_tags(tags)
+        # faster to check if the document is empty which should determine if we have a root
+        if self.document != NULL:
+            self.root.unwrap_tags(tags, delete_empty=delete_empty)
+    @property
+    def inner_html(self) -> str:
+        """Return HTML representation of the child nodes.
+        Works similar to innerHTML in JavaScript.
+        Unlike the `.html` property, does not include the current node.
+        Can be used to set HTML as well. See the setter docstring.
+        Returns
+        -------
+        text : str | None
+        """
+        return self.root.inner_html
+    @inner_html.setter
+    def inner_html(self, str html):
+        """Set inner HTML to the specified HTML.
+        Replaces existing data inside the node.
+        Works similar to innerHTML in JavaScript.
+        Parameters
+        ----------
+        html : str
+        """
+        self.root.inner_html = html

selectolax/modest/node.pxi CHANGED Viewed

@@ -1,4 +1,5 @@
 cimport cython
+from cpython.exc cimport PyErr_NoMemory
 from libc.stdlib cimport free
 from libc.stdlib cimport malloc
@@ -8,12 +9,15 @@ from libc.string cimport memcpy
 DEF _STACK_SIZE = 100
 DEF _ENCODING = 'UTF-8'
 @cython.final
 cdef class Stack:
     def __cinit__(self, size_t capacity=25):
         self.capacity = capacity
         self.top = 0
         self._stack = <myhtml_tree_node_t**> malloc(capacity * sizeof(myhtml_tree_node_t))
+        if self._stack == NULL:
+            raise MemoryError("Failed to allocate memory for stack")
     def __dealloc__(self):
         free(self._stack)
@@ -21,9 +25,10 @@ cdef class Stack:
     cdef bint is_empty(self):
         return self.top <= 0
-    cdef push(self, myhtml_tree_node_t* res):
+    cdef int push(self, myhtml_tree_node_t* res) except -1:
         if self.top >= self.capacity:
-            self.resize()
+            if self.resize() < 0:
+                return -1
         self._stack[self.top] = res
         self.top += 1
@@ -31,10 +36,13 @@ cdef class Stack:
         self.top = self.top - 1
         return self._stack[self.top]
-    cdef resize(self):
+    cdef int resize(self) except -1:
         self.capacity *= 2
         self._stack = <myhtml_tree_node_t**> realloc(<void*> self._stack, self.capacity * sizeof(myhtml_tree_node_t))
+        if self._stack == NULL:
+            PyErr_NoMemory()
+            return -1
+        return 0
 cdef class _Attributes:
     """A dict-like object that represents attributes."""
@@ -128,25 +136,24 @@ cdef class _Attributes:
         tag_name = c_text.decode(_ENCODING, 'ignore') if c_text != NULL else 'unknown'
         return "<%s attributes, %s items>" % (tag_name, len(self))
 ctypedef fused str_or_Node:
-    basestring
+    str
     bytes
     Node
 cdef class Node:
     """A class that represents HTML node (element)."""
     cdef myhtml_tree_node_t *node
     cdef public HTMLParser parser
-    cdef _init(self, myhtml_tree_node_t *node, HTMLParser parser):
-        # custom init, because __cinit__ doesn't accept C types
-        self.node = node
+    @staticmethod
+    cdef Node new(myhtml_tree_node_t *node, HTMLParser parser):
+        # custom __init__ for C, because __cinit__ doesn't accept C types
+        cdef Node cls = Node.__new__(Node)
+        cls.node = node
         # Keep reference to the selector object, so myhtml structures will not be garbage collected prematurely
-        self.parser = parser
+        cls.parser = parser
+        return cls
     @property
     def attributes(self):
@@ -286,7 +293,7 @@ cdef class Node:
     cdef inline _text_deep(self, myhtml_tree_node_t *node, separator='', strip=False):
         text = ""
         cdef Stack stack = Stack(_STACK_SIZE)
-        cdef myhtml_tree_node_t* current_node = NULL;
+        cdef myhtml_tree_node_t* current_node = NULL
         if node.tag_id == MyHTML_TAG__TEXT:
             c_text = myhtml_node_text(node, NULL)
@@ -339,12 +346,10 @@ cdef class Node:
                 node = node.next
                 continue
-            next_node = Node()
-            next_node._init(node, self.parser)
+            next_node = Node.new(node, self.parser)
             yield next_node
             node = node.next
     def traverse(self, include_text=False):
         """Iterate over all child and next nodes starting from the current level.
@@ -358,16 +363,15 @@ cdef class Node:
         node
         """
         cdef Stack stack = Stack(_STACK_SIZE)
-        cdef myhtml_tree_node_t* current_node = NULL;
-        cdef Node next_node;
+        cdef myhtml_tree_node_t* current_node = NULL
+        cdef Node next_node
         stack.push(self.node)
         while not stack.is_empty():
             current_node = stack.pop()
             if current_node != NULL and not (current_node.tag_id == MyHTML_TAG__TEXT and not include_text):
-                next_node = Node()
-                next_node._init(current_node, self.parser)
+                next_node = Node.new(current_node, self.parser)
                 yield next_node
             if current_node.next is not NULL:
@@ -393,11 +397,13 @@ cdef class Node:
     @property
     def child(self):
-        """Return the child node."""
+        """Alias for the `first_child` property.
+        **Deprecated**. Please use `first_child` instead.
+        """
         cdef Node node
         if self.node.child:
-            node = Node()
-            node._init(self.node.child, self.parser)
+            node = Node.new(self.node.child, self.parser)
             return node
         return None
@@ -406,8 +412,7 @@ cdef class Node:
         """Return the parent node."""
         cdef Node node
         if self.node.parent:
-            node = Node()
-            node._init(self.node.parent, self.parser)
+            node = Node.new(self.node.parent, self.parser)
             return node
         return None
@@ -416,8 +421,7 @@ cdef class Node:
         """Return next node."""
         cdef Node node
         if self.node.next:
-            node = Node()
-            node._init(self.node.next, self.parser)
+            node = Node.new(self.node.next, self.parser)
             return node
         return None
@@ -426,8 +430,7 @@ cdef class Node:
         """Return previous node."""
         cdef Node node
         if self.node.prev:
-            node = Node()
-            node._init(self.node.prev, self.parser)
+            node = Node.new(self.node.prev, self.parser)
             return node
         return None
@@ -436,8 +439,7 @@ cdef class Node:
         """Return last child node."""
         cdef Node node
         if self.node.last_child:
-            node = Node()
-            node._init(self.node.last_child, self.parser)
+            node = Node.new(self.node.last_child, self.parser)
             return node
         return None
@@ -515,9 +517,14 @@ cdef class Node:
         """An alias for the decompose method."""
         self.decompose(recursive)
-    def unwrap(self):
+    def unwrap(self, delete_empty = False):
         """Replace node with whatever is inside this node.
+        Parameters
+        ----------
+        delete_empty : bool, default False
+            Whether to delete empty tags.
         Examples
         --------
@@ -526,11 +533,14 @@ cdef class Node:
         >>>  tree.html
         '<html><head></head><body><div>Hello world!</div></body></html>'
+        Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
         """
         if self.node.child == NULL:
+            if delete_empty:
+                myhtml_node_delete(self.node)
             return
-        cdef myhtml_tree_node_t* next_node;
-        cdef myhtml_tree_node_t* current_node;
+        cdef myhtml_tree_node_t* next_node
+        cdef myhtml_tree_node_t* current_node
         if self.node.child.next != NULL:
             current_node = self.node.child
@@ -564,11 +574,13 @@ cdef class Node:
         '<html><body><div>Hello world!</div></body></html>'
         """
+        # ensure cython can recast element to a Node so that decompose will be called sooner.
+        cdef Node element
         for tag in tags:
             for element in self.css(tag):
                 element.decompose(recursive=recursive)
-    def unwrap_tags(self, list tags):
+    def unwrap_tags(self, list tags, delete_empty = False):
         """Unwraps specified tags from the HTML tree.
         Works the same as the ``unwrap`` method, but applied to a list of tags.
@@ -577,6 +589,8 @@ cdef class Node:
         ----------
         tags : list
             List of tags to remove.
+        delete_empty : bool, default False
+            Whether to delete empty tags.
         Examples
         --------
@@ -585,11 +599,13 @@ cdef class Node:
         >>> tree.body.unwrap_tags(['i','a'])
         >>> tree.body.html
         '<body><div>Hello world!</div></body>'
-        """
+        Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
+        """
+        cdef Node element
         for tag in tags:
             for element in self.css(tag):
-                element.unwrap()
+                element.unwrap(delete_empty)
     def replace_with(self, str_or_Node value):
         """Replace current Node with specified value.
@@ -752,7 +768,7 @@ cdef class Node:
         else:
             raise TypeError("Expected a string or Node instance, but %s found" % type(value).__name__)
-    def unwrap_tags(self, list tags):
+    def unwrap_tags(self, list tags, delete_empty = False):
         """Unwraps specified tags from the HTML tree.
         Works the same as the ``unwrap`` method, but applied to a list of tags.
@@ -761,6 +777,8 @@ cdef class Node:
         ----------
         tags : list
             List of tags to remove.
+        delete_empty : bool, default False
+            Whether to delete empty tags.
         Examples
         --------
@@ -769,11 +787,13 @@ cdef class Node:
         >>> tree.body.unwrap_tags(['i','a'])
         >>> tree.body.html
         '<body><div>Hello world!</div></body>'
-        """
+        Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
+        """
+        cdef Node element
         for tag in tags:
             for element in self.css(tag):
-                element.unwrap()
+                element.unwrap(delete_empty)
     @property
     def raw_value(self):
@@ -829,6 +849,7 @@ cdef class Node:
             The query to check.
         """
+        cdef Node node
         if self.parser.cached_script_texts is None:
             nodes = find_nodes(self.parser, self.node, 'script')
             text_nodes = []
@@ -877,6 +898,7 @@ cdef class Node:
         if not isinstance(other, Node):
             return False
         return self.html == other.html
     @property
     def text_content(self):
         """Returns the text of the node if it is a text node.
@@ -930,8 +952,8 @@ cdef class Node:
         while not stack.is_empty():
             current_node = stack.pop()
-            if current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and \
-                current_node.prev.tag_id == MyHTML_TAG__TEXT:
+            if (current_node.tag_id == MyHTML_TAG__TEXT and current_node.prev and
+                    current_node.prev.tag_id == MyHTML_TAG__TEXT):
                 left_text = myhtml_node_text(current_node.prev, &left_length)
                 right_text = myhtml_node_text(current_node, &right_length)
                 if left_text and right_text:
@@ -962,8 +984,8 @@ cdef inline str append_text(str text, str node_text, str separator='', bint stri
 cdef inline bytes to_bytes(str_or_Node value):
     cdef bytes bytes_val
-    if isinstance(value, (str, unicode)):
-        bytes_val = value.encode(_ENCODING)
+    if isinstance(value, unicode):
+        bytes_val = <bytes>value.encode("utf-8")
     elif isinstance(value, bytes):
-        bytes_val =  <char*> value
+        bytes_val = <bytes>value
     return bytes_val