PyPI - selectolax - Versions diffs - 0.3.29__cp312-cp312-win32.whl → 0.3.34__cp312-cp312-win32.whl - Mend

selectolax 0.3.29__cp312-cp312-win32.whl → 0.3.34__cp312-cp312-win32.whl

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of selectolax might be problematic. Click here for more details.

Files changed (26) hide show

selectolax/__init__.py +3 -5
selectolax/lexbor/attrs.pxi +26 -9
selectolax/lexbor/node.pxi +108 -47
selectolax/lexbor/selection.pxi +34 -25
selectolax/lexbor/util.pxi +1 -0
selectolax/lexbor.c +52987 -55311
selectolax/lexbor.cp312-win32.pyd +0 -0
selectolax/lexbor.pxd +36 -40
selectolax/lexbor.pyi +770 -65
selectolax/lexbor.pyx +54 -17
selectolax/modest/node.pxi +45 -42
selectolax/modest/selection.pxi +24 -22
selectolax/modest/util.pxi +1 -0
selectolax/parser.c +50190 -52325
selectolax/parser.cp312-win32.pyd +0 -0
selectolax/parser.pxd +17 -20
selectolax/parser.pyi +489 -45
selectolax/parser.pyx +39 -31
selectolax/utils.pxi +13 -3
selectolax-0.3.34.dist-info/METADATA +32 -0
selectolax-0.3.34.dist-info/RECORD +26 -0
{selectolax-0.3.29.dist-info → selectolax-0.3.34.dist-info}/WHEEL +1 -1
selectolax-0.3.29.dist-info/METADATA +0 -194
selectolax-0.3.29.dist-info/RECORD +0 -26
{selectolax-0.3.29.dist-info → selectolax-0.3.34.dist-info}/licenses/LICENSE +0 -0
{selectolax-0.3.29.dist-info → selectolax-0.3.34.dist-info}/top_level.txt +0 -0

selectolax/__init__.py CHANGED Viewed

@@ -2,9 +2,7 @@
 __author__ = """Artem Golubin"""
-__email__ = 'me@rushter.com'
-__version__ = '0.3.29'
+__email__ = "me@rushter.com"
+__version__ = "0.3.34"
-from . import parser
-from . import lexbor
-from . import modest
+from . import lexbor, modest, parser

selectolax/lexbor/attrs.pxi CHANGED Viewed

@@ -1,5 +1,6 @@
 cimport cython
 @cython.final
 cdef class LexborAttributes:
     """A dict-like object that represents attributes."""
@@ -23,16 +24,32 @@ cdef class LexborAttributes:
                 yield key.decode(_ENCODING)
             attr = attr.next
-    def __setitem__(self, str key, value):
-        value = str(value)
+    def __setitem__(self, str key, object value):
+        value = value
         bytes_key = key.encode(_ENCODING)
-        bytes_value = value.encode(_ENCODING)
-        lxb_dom_element_set_attribute(
-            <lxb_dom_element_t *> self.node,
-            <lxb_char_t *> bytes_key, len(bytes_key),
-            <lxb_char_t *> bytes_value, len(bytes_value),
-        )
+        bytes_value = value.encode(_ENCODING) if value else b""
+        cdef lxb_dom_attr_t *attr
+        cdef lxb_dom_document_t *doc
+        if value is None:
+            # N.B. This is suboptimal, but there is not API to set empty attributes
+            attr = lxb_dom_element_set_attribute(
+                <lxb_dom_element_t *> self.node,
+                <lxb_char_t *> bytes_key, len(bytes_key),
+                NULL, 0
+            )
+            doc = (<lxb_dom_node_t*>attr).owner_document
+            lexbor_str_destroy(attr.value, doc.text, 0)
+            attr.value = NULL
+        elif isinstance(value, str) or isinstance(value, unicode) :
+            lxb_dom_element_set_attribute(
+                <lxb_dom_element_t *> self.node,
+                <lxb_char_t *> bytes_key, len(bytes_key),
+                <lxb_char_t *> bytes_value, len(bytes_value),
+            )
+        else:
+            raise TypeError("Expected str or unicode, got %s" % type(value))
     def __delitem__(self, key):
         try:

selectolax/lexbor/node.pxi CHANGED Viewed

@@ -1,4 +1,5 @@
 cimport cython
+from cpython.exc cimport PyErr_SetNone
 _TAG_TO_NAME = {
     0x0005: "- doctype",
@@ -6,26 +7,29 @@ _TAG_TO_NAME = {
     0x0004: "-comment",
 }
 ctypedef fused str_or_LexborNode:
-    basestring
+    str
     bytes
     LexborNode
 cdef inline bytes to_bytes(str_or_LexborNode value):
     cdef bytes bytes_val
-    if isinstance(value, (str, unicode)):
-        bytes_val = value.encode(_ENCODING)
+    if isinstance(value, unicode):
+        bytes_val = <bytes>value.encode("utf-8")
     elif isinstance(value, bytes):
-        bytes_val =  <char*> value
+        bytes_val = <bytes>value
     return bytes_val
 @cython.final
 cdef class LexborNode:
     """A class that represents HTML node (element)."""
-    cdef _cinit(self, lxb_dom_node_t *node, LexborHTMLParser parser):
-        self.parser = parser
-        self.node = node
-        return self
+    @staticmethod
+    cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser):
+        cdef LexborNode lxbnode = LexborNode.__new__(LexborNode)
+        lxbnode.node = node
+        lxbnode.parser = parser
+        return lxbnode
     @property
     def mem_id(self):
@@ -41,8 +45,7 @@ cdef class LexborNode:
         """Return the first child node."""
         cdef LexborNode node
         if self.node.first_child:
-            node = LexborNode()
-            node._cinit(<lxb_dom_node_t *> self.node.first_child, self.parser)
+            node = LexborNode.new(<lxb_dom_node_t *> self.node.first_child, self.parser)
             return node
         return None
@@ -50,9 +53,8 @@ cdef class LexborNode:
     def parent(self):
         """Return the parent node."""
         cdef LexborNode node
-        if self.node.parent:
-            node = LexborNode()
-            node._cinit(<lxb_dom_node_t *> self.node.parent, self.parser)
+        if self.node.parent != NULL:
+            node = LexborNode.new(<lxb_dom_node_t *> self.node.parent, self.parser)
             return node
         return None
@@ -60,9 +62,8 @@ cdef class LexborNode:
     def next(self):
         """Return next node."""
         cdef LexborNode node
-        if self.node.next:
-            node = LexborNode()
-            node._cinit(<lxb_dom_node_t *> self.node.next, self.parser)
+        if self.node.next != NULL:
+            node = LexborNode.new(<lxb_dom_node_t *> self.node.next, self.parser)
             return node
         return None
@@ -70,9 +71,8 @@ cdef class LexborNode:
     def prev(self):
         """Return previous node."""
         cdef LexborNode node
-        if self.node.prev:
-            node = LexborNode()
-            node._cinit(<lxb_dom_node_t *> self.node.prev, self.parser)
+        if self.node.prev != NULL:
+            node = LexborNode.new(<lxb_dom_node_t *> self.node.prev, self.parser)
             return node
         return None
@@ -80,9 +80,8 @@ cdef class LexborNode:
     def last_child(self):
         """Return last child node."""
         cdef LexborNode node
-        if self.node.last_child:
-            node = LexborNode()
-            node._cinit(<lxb_dom_node_t *> self.node.last_child, self.parser)
+        if self.node.last_child != NULL:
+            node = LexborNode.new(<lxb_dom_node_t *> self.node.last_child, self.parser)
             return node
         return None
@@ -181,6 +180,12 @@ cdef class LexborNode:
         Matches pattern `query` against HTML tree.
         `CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
+        Special selectors:
+         - parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
+         - parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
         Parameters
         ----------
         query : str
@@ -256,7 +261,6 @@ cdef class LexborNode:
             text = c_text.decode(_ENCODING)
         return text
     def decompose(self, bool recursive=True):
         """Remove the current node from the tree.
@@ -273,6 +277,9 @@ cdef class LexborNode:
         >>>     tag.decompose()
         """
+        if self.node == <lxb_dom_node_t *> lxb_dom_document_root(&self.parser.document.dom_document):
+            raise SelectolaxError("Decomposing the root node is not allowed.")
         if recursive:
             lxb_dom_node_destroy_deep(<lxb_dom_node_t *> self.node)
         else:
@@ -298,11 +305,11 @@ cdef class LexborNode:
         '<html><body><div>Hello world!</div></body></html>'
         """
+        cdef LexborNode element
         for tag in tags:
             for element in self.css(tag):
                 element.decompose(recursive=recursive)
     @property
     def attributes(self):
         """Get all attributes that belong to the current node.
@@ -410,13 +417,11 @@ cdef class LexborNode:
                 node = node.next
                 continue
-            next_node = LexborNode()
-            next_node._cinit(<lxb_dom_node_t *> node, self.parser)
+            next_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
             yield next_node
             node = node.next
-    def unwrap(self, delete_empty=False):
+    def unwrap(self, bint delete_empty=False):
         """Replace node with whatever is inside this node.
         Parameters
@@ -431,15 +436,15 @@ cdef class LexborNode:
         >>>  tree.css_first('i').unwrap()
         >>>  tree.html
         '<html><head></head><body><div>Hello world!</div></body></html>'
         Note: by default, empty tags are ignored, use "delete_empty" to change this.
         """
         if self.node.first_child == NULL:
             if delete_empty:
                 lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
             return
-        cdef lxb_dom_node_t* next_node;
-        cdef lxb_dom_node_t* current_node;
+        cdef lxb_dom_node_t* next_node
+        cdef lxb_dom_node_t* current_node
         if self.node.first_child.next != NULL:
             current_node = self.node.first_child
@@ -453,7 +458,7 @@ cdef class LexborNode:
             lxb_dom_node_insert_before(self.node, self.node.first_child)
         lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
-    def unwrap_tags(self, list tags, delete_empty = False):
+    def unwrap_tags(self, list tags, bint delete_empty = False):
         """Unwraps specified tags from the HTML tree.
         Works the same as the ``unwrap`` method, but applied to a list of tags.
@@ -472,14 +477,50 @@ cdef class LexborNode:
         >>> tree.body.unwrap_tags(['i','a'])
         >>> tree.body.html
         '<body><div>Hello world!</div></body>'
         Note: by default, empty tags are ignored, use "delete_empty" to change this.
         """
+        cdef LexborNode element
         for tag in tags:
             for element in self.css(tag):
                 element.unwrap(delete_empty)
+    def merge_text_nodes(self):
+        """Iterates over all text nodes and merges all text nodes that are close to each other.
+        This is useful for text extraction.
+        Use it when you need to strip HTML tags and merge "dangling" text.
+        Examples
+        --------
+        >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
+        >>> node = tree.css_first('div')
+        >>> tree.unwrap_tags(["strong"])
+        >>> tree.text(deep=True, separator=" ", strip=True)
+        "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
+        >>> node.merge_text_nodes()
+        >>> tree.text(deep=True, separator=" ", strip=True)
+        "John Doe"
+        """
+        cdef lxb_dom_node_t *node = self.node.first_child
+        cdef lxb_dom_node_t *next_node
+        cdef lxb_char_t *left_text
+        cdef lxb_char_t *right_text
+        cdef size_t left_length, right_length
+        while node != NULL:
+            next_node = node.next
+            if node.type == LXB_DOM_NODE_TYPE_TEXT and node.prev and node.prev.type == LXB_DOM_NODE_TYPE_TEXT:
+                left_text = lxb_dom_node_text_content(node.prev, &left_length)
+                right_text = lxb_dom_node_text_content(node, &right_length)
+                if left_text and right_text:
+                    combined = (<bytes>left_text[:left_length]) + (<bytes>right_text[:right_length])
+                    lxb_dom_node_text_content_set(node, combined, len(combined))
+                    lxb_dom_node_remove(node.prev)
+            if node.first_child:
+                LexborNode.new(node, self.parser).merge_text_nodes()
+            node = next_node
     def traverse(self, include_text=False):
         """Iterate over all child and next nodes starting from the current level.
@@ -499,8 +540,7 @@ cdef class LexborNode:
         while node != NULL:
             if not (not include_text and node.type == LXB_DOM_NODE_TYPE_TEXT):
-                lxb_node = LexborNode()
-                lxb_node._cinit(<lxb_dom_node_t *> node, self.parser)
+                lxb_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
                 yield lxb_node
             if node.first_child != NULL:
@@ -564,7 +604,6 @@ cdef class LexborNode:
         else:
             raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
     def insert_before(self, str_or_LexborNode value):
         """
         Insert a node before the current Node.
@@ -739,7 +778,7 @@ cdef class LexborNode:
         >>> selector.child.raw_value
         b'&#x3C;test&#x3E;'
         """
-        raise SelectolaxError("This features is not supported by the lexbor backend. Please use Modest backend.")
+        raise NotImplementedError("This features is not supported by the lexbor backend. Please use Modest backend.")
     def scripts_contain(self, str query):
         """Returns True if any of the script tags contain specified text.
@@ -752,6 +791,7 @@ cdef class LexborNode:
             The query to check.
         """
+        cdef LexborNode node
         if self.parser.cached_script_texts is None:
             nodes = self.parser.selector.find('script', self)
             text_nodes = []
@@ -776,6 +816,7 @@ cdef class LexborNode:
         queries : tuple of str
         """
+        cdef LexborNode node
         if self.parser.cached_script_srcs is None:
             nodes = self.parser.selector.find('script', self)
             src_nodes = []
@@ -831,31 +872,44 @@ cdef class LexborNode:
         """
         cdef unsigned char * text
         cdef lxb_dom_node_t* node = <lxb_dom_node_t*> self.node.first_child
-        container = TextContainer()
+        cdef TextContainer container
         if self.node == NULL or self.node.type != LXB_DOM_NODE_TYPE_TEXT:
             return None
         text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> self.node).data)
         if text != NULL:
+            container = TextContainer.new_with_defaults()
             py_text = text.decode(_ENCODING)
             container.append(py_text)
             return container.text
+@cython.internal
 @cython.final
 cdef class TextContainer:
     cdef str _text
-    cdef public str separator
-    cdef public bool strip
+    cdef str separator
+    cdef bint strip
+    @staticmethod
+    cdef TextContainer new_with_defaults():
+        cdef TextContainer cls = TextContainer.__new__(TextContainer)
+        cls._text = ''
+        cls.separator = ''
+        cls.strip = False
+        return cls
     def __init__(self, str separator = '', bool strip = False):
         self._text = ""
         self.separator = separator
         self.strip = strip
-    def append(self, node_text):
+    def append(self, str node_text):
         if self.strip:
             self._text += node_text.strip() + self.separator
         else:
             self._text += node_text + self.separator
     @property
     def text(self):
         if self.separator and self._text and self._text.endswith(self.separator):
@@ -864,7 +918,7 @@ cdef class TextContainer:
 cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
-    cdef unsigned char *text;
+    cdef unsigned char *text
     cdef lxb_tag_id_t tag_id = lxb_dom_node_tag_id_noi(node)
     if tag_id != LXB_TAG__TEXT:
         return LEXBOR_ACTION_OK
@@ -872,8 +926,15 @@ cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
     text = <unsigned char*> lexbor_str_data_noi(&(<lxb_dom_text_t *> node).char_data.data)
     if not text:
         return LEXBOR_ACTION_OK
-    py_str = text.decode(_ENCODING)
-    cdef object cls
-    cls = <object> ctx
+    try:
+        py_str = text.decode(_ENCODING)
+    except Exception as e:
+        PyErr_SetNone(e)
+        return LEXBOR_ACTION_STOP
+    cdef TextContainer cls
+    cls = <TextContainer> ctx
     cls.append(py_str)
     return LEXBOR_ACTION_OK

selectolax/lexbor/selection.pxi CHANGED Viewed

@@ -1,4 +1,7 @@
 cimport cython
+from cpython.exc cimport PyErr_SetObject
+from cpython.list cimport PyList_GET_SIZE
 @cython.final
 cdef class LexborCSSSelector:
@@ -8,21 +11,22 @@ cdef class LexborCSSSelector:
         self.results = []
         self.current_node = None
-    cdef _create_css_parser(self):
+    cdef int _create_css_parser(self) except -1:
         cdef lxb_status_t status
         self.parser = lxb_css_parser_create()
         status = lxb_css_parser_init(self.parser, NULL)
         if status != LXB_STATUS_OK:
-            raise SelectolaxError("Can't initialize CSS parser.")
+            PyErr_SetObject(SelectolaxError, "Can't initialize CSS parser.")
+            return -1
         self.css_selectors = lxb_css_selectors_create()
         status = lxb_css_selectors_init(self.css_selectors)
         if status != LXB_STATUS_OK:
-            raise SelectolaxError("Can't initialize CSS selector.")
+            PyErr_SetObject(SelectolaxError, "Can't initialize CSS selector.")
+            return -1
         lxb_css_parser_selectors_set(self.parser, self.css_selectors)
@@ -30,14 +34,18 @@ cdef class LexborCSSSelector:
         status = lxb_selectors_init(self.selectors)
         lxb_selectors_opt_set(self.selectors, LXB_SELECTORS_OPT_MATCH_ROOT)
         if status != LXB_STATUS_OK:
-            raise SelectolaxError("Can't initialize CSS selector.")
+            PyErr_SetObject(SelectolaxError, "Can't initialize CSS selector.")
+            return -1
+        return 0
-    cpdef find(self, str query, LexborNode node):
+    cpdef list find(self, str query, LexborNode node):
         cdef lxb_css_selector_list_t* selectors
         cdef lxb_char_t* c_selector
         cdef lxb_css_selector_list_t * selectors_list
+        if not isinstance(query, str):
+            raise TypeError("Query must be a string.")
         bytes_query = query.encode(_ENCODING)
         selectors_list = lxb_css_selectors_parse(self.parser, <lxb_char_t *> bytes_query, <size_t>len(query))
@@ -54,28 +62,32 @@ cdef class LexborCSSSelector:
         lxb_css_selector_list_destroy_memory(selectors_list)
         return results
-    cpdef any_matches(self, str query, LexborNode node):
+    cpdef int any_matches(self, str query, LexborNode node) except -1:
         cdef lxb_css_selector_list_t * selectors
         cdef lxb_char_t * c_selector
         cdef lxb_css_selector_list_t * selectors_list
+        cdef int result
+        if not isinstance(query, str):
+            raise TypeError("Query must be a string.")
         bytes_query = query.encode(_ENCODING)
         selectors_list = lxb_css_selectors_parse(self.parser, <lxb_char_t *> bytes_query, <size_t> len(query))
         if selectors_list == NULL:
-            raise SelectolaxError("Can't parse CSS selector.")
+            PyErr_SetObject(SelectolaxError, "Can't parse CSS selector.")
         self.results = []
         status = lxb_selectors_find(self.selectors, node.node, selectors_list,
                                     <lxb_selectors_cb_f> css_matcher_callback, <void *> self)
         if status != LXB_STATUS_OK:
-            raise SelectolaxError("Can't parse CSS selector.")
-        result = bool(self.results)
+            lxb_css_selector_list_destroy_memory(selectors_list)
+            PyErr_SetObject(SelectolaxError, "Can't parse CSS selector.")
+        result = PyList_GET_SIZE(self.results) > 0
         self.results = []
         lxb_css_selector_list_destroy_memory(selectors_list)
         return result
     def __dealloc__(self):
         if self.selectors != NULL:
             lxb_selectors_destroy(self.selectors, True)
@@ -85,7 +97,6 @@ cdef class LexborCSSSelector:
             lxb_css_selectors_destroy(self.css_selectors, True)
 cdef class LexborSelector:
     """An advanced CSS selector that supports additional operations.
@@ -100,10 +111,9 @@ cdef class LexborSelector:
         self.node = node
         self.nodes = self.node.parser.selector.find(query, self.node) if query else [node, ]
     cpdef css(self, str query):
         """Evaluate CSS selector against current scope."""
-        raise SelectolaxError("This features is not supported by the lexbor backend. Please use Modest backend.")
+        raise NotImplementedError("This features is not supported by the lexbor backend. Please use Modest backend.")
     @property
     def matches(self) -> list:
@@ -117,7 +127,7 @@ cdef class LexborSelector:
     def text_contains(self, str text, bool deep=True, str separator='', bool strip=False) -> LexborSelector:
         """Filter all current matches given text."""
-        nodes = []
+        cdef list nodes = []
         for node in self.nodes:
             node_text = node.text(deep=deep, separator=separator, strip=strip)
             if node_text and text in node_text:
@@ -127,7 +137,7 @@ cdef class LexborSelector:
     def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False) -> bool:
         """Returns True if any node in the current search scope contains specified text"""
-        nodes = []
+        cdef LexborNode node
         for node in self.nodes:
             node_text = node.text(deep=deep, separator=separator, strip=strip)
             if node_text and text in node_text:
@@ -139,7 +149,7 @@ cdef class LexborSelector:
         Similar to `string-length` in XPath.
         """
-        nodes = []
+        cdef list nodes = []
         for node in self.nodes:
             attr = node.attributes.get(attribute)
             if attr and start and start in attr:
@@ -154,7 +164,7 @@ cdef class LexborSelector:
         Similar to `string-length` in XPath.
         """
-        nodes = []
+        cdef LexborNode node
         for node in self.nodes:
             attr = node.attributes.get(attribute)
             if attr and start and start in attr:
@@ -169,16 +179,15 @@ cdef class LexborSelector:
 cdef lxb_status_t css_finder_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
     cdef LexborNode lxb_node
-    cdef object cls
-    cls = <object> ctx
-    lxb_node = LexborNode()
-    lxb_node._cinit(<lxb_dom_node_t *> node, cls.current_node.parser)
+    cdef LexborCSSSelector cls
+    cls = <LexborCSSSelector> ctx
+    lxb_node = LexborNode.new(<lxb_dom_node_t *> node, cls.current_node.parser)
     cls.results.append(lxb_node)
     return LXB_STATUS_OK
 cdef lxb_status_t css_matcher_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
     cdef LexborNode lxb_node
-    cdef object cls
-    cls = <object> ctx
+    cdef LexborCSSSelector cls
+    cls = <LexborCSSSelector> ctx
     cls.results.append(True)
     return LXB_STATUS_STOP

selectolax/lexbor/util.pxi CHANGED Viewed

@@ -1,5 +1,6 @@
 include "../utils.pxi"
 def create_tag(tag: str):
     """
     Given an HTML tag name, e.g. `"div"`, create a single empty node for that tag,

selectolax 0.3.29__cp312-cp312-win32.whl → 0.3.34__cp312-cp312-win32.whl

Potentially problematic release.

selectolax 0.3.29__cp312-cp312-win32.whl → 0.3.34__cp312-cp312-win32.whl