PyPI - selectolax - Versions diffs - 0.3.28__cp313-cp313-musllinux_1_2_x86_64.whl → 0.4.0__cp313-cp313-musllinux_1_2_x86_64.whl - Mend

selectolax 0.3.28__cp313-cp313-musllinux_1_2_x86_64.whl → 0.4.0__cp313-cp313-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of selectolax might be problematic. Click here for more details.

Files changed (27)

selectolax/__init__.py +3 -5
selectolax/lexbor/attrs.pxi +26 -9
selectolax/lexbor/node.pxi +225 -58
selectolax/lexbor/node_remove.pxi +29 -0
selectolax/lexbor/selection.pxi +57 -26
selectolax/lexbor/util.pxi +1 -0
selectolax/lexbor.c +22000 -22286
selectolax/lexbor.cpython-313-x86_64-linux-musl.so +0 -0
selectolax/lexbor.pxd +44 -40
selectolax/lexbor.pyi +847 -65
selectolax/lexbor.pyx +98 -23
selectolax/modest/node.pxi +68 -46
selectolax/modest/selection.pxi +24 -22
selectolax/modest/util.pxi +1 -0
selectolax/parser.c +18150 -20047
selectolax/parser.cpython-313-x86_64-linux-musl.so +0 -0
selectolax/parser.pxd +17 -20
selectolax/parser.pyi +493 -53
selectolax/parser.pyx +45 -35
selectolax/utils.pxi +13 -3
selectolax-0.4.0.dist-info/METADATA +32 -0
selectolax-0.4.0.dist-info/RECORD +27 -0
{selectolax-0.3.28.dist-info → selectolax-0.4.0.dist-info}/WHEEL +1 -1
selectolax-0.3.28.dist-info/METADATA +0 -193
selectolax-0.3.28.dist-info/RECORD +0 -26
{selectolax-0.3.28.dist-info → selectolax-0.4.0.dist-info/licenses}/LICENSE +0 -0
{selectolax-0.3.28.dist-info → selectolax-0.4.0.dist-info}/top_level.txt +0 -0

selectolax/__init__.py CHANGED

@@ -2,9 +2,7 @@
 __author__ = """Artem Golubin"""
-__email__ = 'me@rushter.com'
-__version__ = '0.3.28'
+__email__ = "me@rushter.com"
+__version__ = "0.4.0"
-from . import parser
-from . import lexbor
-from . import modest
+from . import lexbor, modest, parser

selectolax/lexbor/attrs.pxi CHANGED

@@ -1,5 +1,6 @@
 cimport cython
 @cython.final
 cdef class LexborAttributes:
     """A dict-like object that represents attributes."""
@@ -23,16 +24,32 @@ cdef class LexborAttributes:
                 yield key.decode(_ENCODING)
             attr = attr.next
-    def __setitem__(self, str key, value):
-        value = str(value)
+    def __setitem__(self, str key, object value):
+        value = value
         bytes_key = key.encode(_ENCODING)
-        bytes_value = value.encode(_ENCODING)
-        lxb_dom_element_set_attribute(
-            <lxb_dom_element_t *> self.node,
-            <lxb_char_t *> bytes_key, len(bytes_key),
-            <lxb_char_t *> bytes_value, len(bytes_value),
-        )
+        bytes_value = value.encode(_ENCODING) if value else b""
+        cdef lxb_dom_attr_t *attr
+        cdef lxb_dom_document_t *doc
+        if value is None:
+            # N.B. This is suboptimal, but there is not API to set empty attributes
+            attr = lxb_dom_element_set_attribute(
+                <lxb_dom_element_t *> self.node,
+                <lxb_char_t *> bytes_key, len(bytes_key),
+                NULL, 0
+            )
+            doc = (<lxb_dom_node_t*>attr).owner_document
+            lexbor_str_destroy(attr.value, doc.text, 0)
+            attr.value = NULL
+        elif isinstance(value, str) or isinstance(value, unicode) :
+            lxb_dom_element_set_attribute(
+                <lxb_dom_element_t *> self.node,
+                <lxb_char_t *> bytes_key, len(bytes_key),
+                <lxb_char_t *> bytes_value, len(bytes_value),
+            )
+        else:
+            raise TypeError("Expected str or unicode, got %s" % type(value))
     def __delitem__(self, key):
         try:

selectolax/lexbor/node.pxi CHANGED

@@ -1,31 +1,43 @@
 cimport cython
+from cpython.exc cimport PyErr_SetNone
+import logging
+logger = logging.getLogger("selectolax")
 _TAG_TO_NAME = {
-    0x0005: "- doctype",
+    0x0005: "-doctype",
     0x0002: "-text",
     0x0004: "-comment",
 }
 ctypedef fused str_or_LexborNode:
-    basestring
+    str
     bytes
     LexborNode
+ctypedef fused str_or_bytes:
+    str
+    bytes
 cdef inline bytes to_bytes(str_or_LexborNode value):
     cdef bytes bytes_val
-    if isinstance(value, (str, unicode)):
-        bytes_val = value.encode(_ENCODING)
+    if isinstance(value, unicode):
+        bytes_val = <bytes>value.encode("utf-8")
     elif isinstance(value, bytes):
-        bytes_val =  <char*> value
+        bytes_val = <bytes>value
     return bytes_val
 @cython.final
 cdef class LexborNode:
     """A class that represents HTML node (element)."""
-    cdef _cinit(self, lxb_dom_node_t *node, LexborHTMLParser parser):
-        self.parser = parser
-        self.node = node
-        return self
+    @staticmethod
+    cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser):
+        cdef LexborNode lxbnode = LexborNode.__new__(LexborNode)
+        lxbnode.node = node
+        lxbnode.parser = parser
+        return lxbnode
     @property
     def mem_id(self):
@@ -33,7 +45,10 @@ cdef class LexborNode:
     @property
     def child(self):
-        """Alias for the `first_child` property."""
+        """Alias for the `first_child` property.
+        **Deprecated**. Please use `first_child` instead.
+        """
         return self.first_child
     @property
@@ -41,8 +56,7 @@ cdef class LexborNode:
         """Return the first child node."""
         cdef LexborNode node
         if self.node.first_child:
-            node = LexborNode()
-            node._cinit(<lxb_dom_node_t *> self.node.first_child, self.parser)
+            node = LexborNode.new(<lxb_dom_node_t *> self.node.first_child, self.parser)
             return node
         return None
@@ -50,9 +64,8 @@ cdef class LexborNode:
     def parent(self):
         """Return the parent node."""
         cdef LexborNode node
-        if self.node.parent:
-            node = LexborNode()
-            node._cinit(<lxb_dom_node_t *> self.node.parent, self.parser)
+        if self.node.parent != NULL:
+            node = LexborNode.new(<lxb_dom_node_t *> self.node.parent, self.parser)
             return node
         return None
@@ -60,9 +73,8 @@ cdef class LexborNode:
     def next(self):
         """Return next node."""
         cdef LexborNode node
-        if self.node.next:
-            node = LexborNode()
-            node._cinit(<lxb_dom_node_t *> self.node.next, self.parser)
+        if self.node.next != NULL:
+            node = LexborNode.new(<lxb_dom_node_t *> self.node.next, self.parser)
             return node
         return None
@@ -70,9 +82,8 @@ cdef class LexborNode:
     def prev(self):
         """Return previous node."""
         cdef LexborNode node
-        if self.node.prev:
-            node = LexborNode()
-            node._cinit(<lxb_dom_node_t *> self.node.prev, self.parser)
+        if self.node.prev != NULL:
+            node = LexborNode.new(<lxb_dom_node_t *> self.node.prev, self.parser)
             return node
         return None
@@ -80,9 +91,8 @@ cdef class LexborNode:
     def last_child(self):
         """Return last child node."""
         cdef LexborNode node
-        if self.node.last_child:
-            node = LexborNode()
-            node._cinit(<lxb_dom_node_t *> self.node.last_child, self.parser)
+        if self.node.last_child != NULL:
+            node = LexborNode.new(<lxb_dom_node_t *> self.node.last_child, self.parser)
             return node
         return None
@@ -181,6 +191,12 @@ cdef class LexborNode:
         Matches pattern `query` against HTML tree.
         `CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
+        Special selectors:
+         - parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
+         - parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
         Parameters
         ----------
         query : str
@@ -195,13 +211,15 @@ cdef class LexborNode:
     def css_first(self, str query, default=None, bool strict=False):
         """Same as `css` but returns only the first match.
+        When `strict=False` stops at the first match. Works faster.
         Parameters
         ----------
         query : str
-        default : bool, default None
+        default : Any, default None
             Default value to return if there is no match.
-        strict: bool, default True
+        strict: bool, default False
             Set to True if you want to check if there is strictly only one match in the document.
@@ -209,8 +227,10 @@ cdef class LexborNode:
         -------
         selector : `LexborNode` object
         """
-        # TODO: This can be improved.
-        results = self.css(query)
+        if strict:
+            results = self.parser.selector.find(query, self)
+        else:
+            results = self.parser.selector.find_first(query, self)
         n_results = len(results)
         if n_results > 0:
             if strict and n_results > 1:
@@ -227,7 +247,7 @@ cdef class LexborNode:
     def css_matches(self, str selector):
         """Returns True if CSS selector matches a node."""
-        return self.parser.selector.any_matches(selector, self)
+        return bool(self.parser.selector.any_matches(selector, self))
     def __repr__(self):
         return '<LexborNode %s>' % self.tag
@@ -241,6 +261,14 @@ cdef class LexborNode:
     def tag(self):
         """Return the name of the current tag (e.g. div, p, img).
+        For for non-tag nodes, returns the following names:
+         * `-text` - text node
+         * `-document` - document node
+         * `-comment` - comment node
+        This
         Returns
         -------
         text : str
@@ -256,7 +284,6 @@ cdef class LexborNode:
             text = c_text.decode(_ENCODING)
         return text
     def decompose(self, bool recursive=True):
         """Remove the current node from the tree.
@@ -273,10 +300,13 @@ cdef class LexborNode:
         >>>     tag.decompose()
         """
+        if self.node == <lxb_dom_node_t *> lxb_dom_document_root(&self.parser.document.dom_document):
+            raise SelectolaxError("Decomposing the root node is not allowed.")
         if recursive:
-            lxb_dom_node_destroy_deep(<lxb_dom_node_t *> self.node)
+            node_remove_deep(<lxb_dom_node_t *> self.node)
         else:
-            lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
+            lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
     def strip_tags(self, list tags, bool recursive = False):
         """Remove specified tags from the HTML tree.
@@ -298,11 +328,11 @@ cdef class LexborNode:
         '<html><body><div>Hello world!</div></body></html>'
         """
+        cdef LexborNode element
         for tag in tags:
             for element in self.css(tag):
                 element.decompose(recursive=recursive)
     @property
     def attributes(self):
         """Get all attributes that belong to the current node.
@@ -325,6 +355,9 @@ cdef class LexborNode:
         cdef size_t str_len = 0
         attributes = dict()
+        if self.node.type != LXB_DOM_NODE_TYPE_ELEMENT:
+            return attributes
         while attr != NULL:
             key = lxb_dom_attr_local_name_noi(attr, &str_len)
             value = lxb_dom_attr_value_noi(attr, &str_len)
@@ -410,15 +443,20 @@ cdef class LexborNode:
                 node = node.next
                 continue
-            next_node = LexborNode()
-            next_node._cinit(<lxb_dom_node_t *> node, self.parser)
+            next_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
             yield next_node
             node = node.next
-    def unwrap(self):
+    def unwrap(self, bint delete_empty=False):
         """Replace node with whatever is inside this node.
+        Does nothing if you perform unwrapping second time on the same node.
+        Parameters
+        ----------
+        delete_empty : bool, default False
+            If True, removes empty tags.
         Examples
         --------
@@ -427,11 +465,19 @@ cdef class LexborNode:
         >>>  tree.html
         '<html><head></head><body><div>Hello world!</div></body></html>'
+        Note: by default, empty tags are ignored, use "delete_empty" to change this.
         """
+        if node_is_removed(<lxb_dom_node_t *> self.node) == 1:
+            logger.error("Attempt to unwrap removed node. Does nothing.")
+            return
         if self.node.first_child == NULL:
+            if delete_empty:
+                lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
             return
-        cdef lxb_dom_node_t* next_node;
-        cdef lxb_dom_node_t* current_node;
+        cdef lxb_dom_node_t* next_node
+        cdef lxb_dom_node_t* current_node
         if self.node.first_child.next != NULL:
             current_node = self.node.first_child
@@ -443,9 +489,9 @@ cdef class LexborNode:
                 current_node = next_node
         else:
             lxb_dom_node_insert_before(self.node, self.node.first_child)
-        lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
+        lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
-    def unwrap_tags(self, list tags):
+    def unwrap_tags(self, list tags, bint delete_empty = False):
         """Unwraps specified tags from the HTML tree.
         Works the same as the ``unwrap`` method, but applied to a list of tags.
@@ -454,6 +500,8 @@ cdef class LexborNode:
         ----------
         tags : list
             List of tags to remove.
+        delete_empty : bool, default False
+            If True, removes empty tags.
         Examples
         --------
@@ -462,12 +510,56 @@ cdef class LexborNode:
         >>> tree.body.unwrap_tags(['i','a'])
         >>> tree.body.html
         '<body><div>Hello world!</div></body>'
-        """
+        Note: by default, empty tags are ignored, use "delete_empty" to change this.
+        """
+        cdef LexborNode element
         for tag in tags:
             for element in self.css(tag):
-                element.unwrap()
+                element.unwrap(delete_empty)
+    def merge_text_nodes(self):
+        """Iterates over all text nodes and merges all text nodes that are close to each other.
+        This is useful for text extraction.
+        Use it when you need to strip HTML tags and merge "dangling" text.
+        Examples
+        --------
+        >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
+        >>> node = tree.css_first('div')
+        >>> tree.unwrap_tags(["strong"])
+        >>> tree.text(deep=True, separator=" ", strip=True)
+        "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
+        >>> node.merge_text_nodes()
+        >>> tree.text(deep=True, separator=" ", strip=True)
+        "John Doe"
+        """
+        cdef lxb_dom_node_t *node = self.node.first_child
+        cdef lxb_dom_node_t *next_node
+        cdef lxb_char_t *left_text
+        cdef lxb_char_t *right_text
+        cdef size_t left_length, right_length
+        while node != NULL:
+            next_node = node.next
+            if node.type == LXB_DOM_NODE_TYPE_TEXT and node.prev and node.prev.type == LXB_DOM_NODE_TYPE_TEXT:
+                left_text = lxb_dom_node_text_content(node.prev, &left_length)
+                right_text = lxb_dom_node_text_content(node, &right_length)
+                if left_text and right_text:
+                    combined = (<bytes>left_text[:left_length]) + (<bytes>right_text[:right_length])
+                    lxb_dom_node_text_content_set(node, combined, len(combined))
+                    lxb_dom_node_remove(node.prev)
+                if left_text is not NULL:
+                    lxb_dom_document_destroy_text_noi(self.node.owner_document, left_text)
+                if right_text is not NULL:
+                    lxb_dom_document_destroy_text_noi(self.node.owner_document, right_text)
+            if node.first_child:
+                LexborNode.new(node, self.parser).merge_text_nodes()
+            node = next_node
     def traverse(self, include_text=False):
         """Iterate over all child and next nodes starting from the current level.
@@ -487,8 +579,7 @@ cdef class LexborNode:
         while node != NULL:
             if not (not include_text and node.type == LXB_DOM_NODE_TYPE_TEXT):
-                lxb_node = LexborNode()
-                lxb_node._cinit(<lxb_dom_node_t *> node, self.parser)
+                lxb_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
                 yield lxb_node
             if node.first_child != NULL:
@@ -538,7 +629,7 @@ cdef class LexborNode:
             if new_node == NULL:
                 raise SelectolaxError("Can't create a new node")
             lxb_dom_node_insert_before(self.node,  new_node)
-            lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
+            lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
         elif isinstance(value, LexborNode):
             new_node = lxb_dom_document_import_node(
                 &self.parser.document.dom_document,
@@ -548,11 +639,10 @@ cdef class LexborNode:
             if new_node == NULL:
                 raise SelectolaxError("Can't create a new node")
             lxb_dom_node_insert_before(self.node, <lxb_dom_node_t *> new_node)
-            lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
+            lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
         else:
             raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
     def insert_before(self, str_or_LexborNode value):
         """
         Insert a node before the current Node.
@@ -727,7 +817,7 @@ cdef class LexborNode:
         >>> selector.child.raw_value
         b'&#x3C;test&#x3E;'
         """
-        raise SelectolaxError("This features is not supported by the lexbor backend. Please use Modest backend.")
+        raise NotImplementedError("This features is not supported by the lexbor backend. Please use Modest backend.")
     def scripts_contain(self, str query):
         """Returns True if any of the script tags contain specified text.
@@ -740,6 +830,7 @@ cdef class LexborNode:
             The query to check.
         """
+        cdef LexborNode node
         if self.parser.cached_script_texts is None:
             nodes = self.parser.selector.find('script', self)
             text_nodes = []
@@ -764,6 +855,7 @@ cdef class LexborNode:
         queries : tuple of str
         """
+        cdef LexborNode node
         if self.parser.cached_script_srcs is None:
             nodes = self.parser.selector.find('script', self)
             src_nodes = []
@@ -819,31 +911,99 @@ cdef class LexborNode:
         """
         cdef unsigned char * text
         cdef lxb_dom_node_t* node = <lxb_dom_node_t*> self.node.first_child
-        container = TextContainer()
+        cdef TextContainer container
         if self.node == NULL or self.node.type != LXB_DOM_NODE_TYPE_TEXT:
             return None
         text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> self.node).data)
         if text != NULL:
+            container = TextContainer.new_with_defaults()
             py_text = text.decode(_ENCODING)
             container.append(py_text)
             return container.text
+    @property
+    def inner_html(self) -> str:
+        """Return HTML representation of the child nodes.
+        Works similar to innerHTML in JavaScript.
+        Unlike the `.html` property, does not include the current node.
+        Can be used to set HTML as well. See the setter docstring.
+        Returns
+        -------
+        text : str | None
+        """
+        cdef lexbor_str_t *lxb_str
+        cdef lxb_status_t status
+        lxb_str = lexbor_str_create()
+        status = lxb_html_serialize_deep_str(self.node, lxb_str)
+        if status == 0 and lxb_str.data:
+            html = lxb_str.data.decode(_ENCODING).replace('<-undef>', '')
+            lexbor_str_destroy(lxb_str,  self.node.owner_document.text, True)
+            return html
+        return None
+    @inner_html.setter
+    def inner_html(self, str html):
+        """Set inner HTML to the specified HTML.
+        Replaces existing data inside the node.
+        Works similar to innerHTML in JavaScript.
+        Parameters
+        ----------
+        html : str | None
+        """
+        cdef bytes bytes_val
+        bytes_val = <bytes>html.encode("utf-8")
+        lxb_html_element_inner_html_set(
+                <lxb_html_element_t  *>self.node,
+                <lxb_char_t *> bytes_val, len(bytes_val)
+        )
+    def clone(self) -> LexborNode:
+        """Clone the current node.
+        You can use to do temporary modifications without affecting the original HTML tree.
+        It is tied to the current parser instance.
+        Gets destroyed when parser instance is destroyed.
+        """
+        cdef lxb_dom_node_t* node
+        node = lxb_dom_node_clone(<lxb_dom_node_t *> self.node, 1)
+        return LexborNode.new(node, self.parser)
+@cython.internal
 @cython.final
 cdef class TextContainer:
     cdef str _text
-    cdef public str separator
-    cdef public bool strip
+    cdef str separator
+    cdef bint strip
+    @staticmethod
+    cdef TextContainer new_with_defaults():
+        cdef TextContainer cls = TextContainer.__new__(TextContainer)
+        cls._text = ''
+        cls.separator = ''
+        cls.strip = False
+        return cls
     def __init__(self, str separator = '', bool strip = False):
         self._text = ""
         self.separator = separator
         self.strip = strip
-    def append(self, node_text):
+    def append(self, str node_text):
         if self.strip:
             self._text += node_text.strip() + self.separator
         else:
             self._text += node_text + self.separator
     @property
     def text(self):
         if self.separator and self._text and self._text.endswith(self.separator):
@@ -852,7 +1012,7 @@ cdef class TextContainer:
 cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
-    cdef unsigned char *text;
+    cdef unsigned char *text
     cdef lxb_tag_id_t tag_id = lxb_dom_node_tag_id_noi(node)
     if tag_id != LXB_TAG__TEXT:
         return LEXBOR_ACTION_OK
@@ -860,8 +1020,15 @@ cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
     text = <unsigned char*> lexbor_str_data_noi(&(<lxb_dom_text_t *> node).char_data.data)
     if not text:
         return LEXBOR_ACTION_OK
-    py_str = text.decode(_ENCODING)
-    cdef object cls
-    cls = <object> ctx
+    try:
+        py_str = text.decode(_ENCODING, "replace")
+    except Exception as e:
+        PyErr_SetNone(e)
+        return LEXBOR_ACTION_STOP
+    cdef TextContainer cls
+    cls = <TextContainer> ctx
     cls.append(py_str)
     return LEXBOR_ACTION_OK

selectolax/lexbor/node_remove.pxi ADDED

@@ -0,0 +1,29 @@
+cdef lxb_dom_node_t * node_remove_deep(lxb_dom_node_t* root):
+    cdef lxb_dom_node_t *tmp
+    cdef lxb_dom_node_t *node = root
+    while node != NULL:
+        if node.first_child != NULL:
+            node = node.first_child
+        else:
+            while node != root and node.next == NULL:
+                tmp = node.parent
+                lxb_dom_node_remove(node)
+                node = tmp
+            if node == root:
+                lxb_dom_node_remove(node)
+                break
+            tmp = node.next
+            lxb_dom_node_remove(node)
+            node = tmp
+    return NULL
+cdef bint node_is_removed(lxb_dom_node_t* node):
+    if node.parent == NULL and node.next == NULL \
+       and node.prev == NULL:
+        return 1
+    return 0