PyPI - selectolax - Versions diffs - 0.3.31__cp311-cp311-win_amd64.whl → 0.3.33__cp311-cp311-win_amd64.whl - Mend

selectolax 0.3.31__cp311-cp311-win_amd64.whl → 0.3.33__cp311-cp311-win_amd64.whl

This diff shows the changes between two publicly released versions of this package, exactly as they appear in the public registry they were published to. It is provided for informational purposes only.

Potentially problematic release.

This version of selectolax might be problematic. Click here for more details.

Files changed (25)

selectolax/__init__.py +3 -5
selectolax/lexbor/attrs.pxi +0 -1
selectolax/lexbor/node.pxi +93 -41
selectolax/lexbor/selection.pxi +27 -25
selectolax/lexbor/util.pxi +1 -0
selectolax/lexbor.c +6367 -6672
selectolax/lexbor.cp311-win_amd64.pyd +0 -0
selectolax/lexbor.pxd +32 -35
selectolax/lexbor.pyi +111 -5
selectolax/lexbor.pyx +43 -17
selectolax/modest/node.pxi +37 -36
selectolax/modest/selection.pxi +24 -22
selectolax/modest/util.pxi +1 -0
selectolax/parser.c +4484 -5266
selectolax/parser.cp311-win_amd64.pyd +0 -0
selectolax/parser.pxd +17 -20
selectolax/parser.pyi +2 -2
selectolax/parser.pyx +28 -31
selectolax/utils.pxi +13 -3
{selectolax-0.3.31.dist-info → selectolax-0.3.33.dist-info}/METADATA +3 -3
selectolax-0.3.33.dist-info/RECORD +26 -0
selectolax-0.3.31.dist-info/RECORD +0 -26
{selectolax-0.3.31.dist-info → selectolax-0.3.33.dist-info}/WHEEL +0 -0
{selectolax-0.3.31.dist-info → selectolax-0.3.33.dist-info}/licenses/LICENSE +0 -0
{selectolax-0.3.31.dist-info → selectolax-0.3.33.dist-info}/top_level.txt +0 -0

selectolax/lexbor.cp311-win_amd64.pyd CHANGED Viewed

Binary file

selectolax/lexbor.pxd CHANGED Viewed

@@ -1,4 +1,4 @@
-from libc.stdint cimport uint32_t, uint8_t, uintptr_t
+from libc.stdint cimport uint8_t, uint32_t, uintptr_t
 cdef extern from "lexbor/core/core.h" nogil:
@@ -31,7 +31,6 @@ cdef extern from "lexbor/core/core.h" nogil:
     lexbor_str_t* lexbor_str_create()
     lxb_char_t * lexbor_str_data_noi(lexbor_str_t *str)
 cdef extern from "lexbor/html/html.h" nogil:
     ctypedef unsigned int lxb_html_document_opt_t
@@ -54,14 +53,12 @@ cdef extern from "lexbor/html/html.h" nogil:
         void *events
     ctypedef struct lexbor_str_t:
-        lxb_char_t *data;
-        size_t     length;
+        lxb_char_t *data
+        size_t     length
     ctypedef struct lxb_dom_node_t:
         lxb_dom_event_target_t event_target
         uintptr_t              local_name
         uintptr_t              prefix
         uintptr_t              ns
@@ -77,7 +74,6 @@ cdef extern from "lexbor/html/html.h" nogil:
         lxb_dom_node_type_t    type
     ctypedef struct lxb_dom_document_t:
         lxb_dom_node_t              node
@@ -104,7 +100,6 @@ cdef extern from "lexbor/html/html.h" nogil:
         bint                        scripting
     ctypedef  struct lxb_html_document_t:
         lxb_dom_document_t dom_document
@@ -128,7 +123,6 @@ cdef extern from "lexbor/html/html.h" nogil:
         LXB_HTML_PARSER_STATE_FRAGMENT_PROCESS = 0x03
         LXB_HTML_PARSER_STATE_ERROR            = 0x04
     ctypedef enum lxb_dom_node_type_t:
         LXB_DOM_NODE_TYPE_ELEMENT                = 0x01
         LXB_DOM_NODE_TYPE_ATTRIBUTE              = 0x02
@@ -175,10 +169,9 @@ cdef extern from "lexbor/html/html.h" nogil:
         size_t  length
         size_t  struct_size
     ctypedef struct lxb_html_tree_pending_table_t
-    ctypedef bint lxb_html_tree_insertion_mode_f;
-    ctypedef lxb_status_t lxb_html_tree_append_attr_f;
+    ctypedef bint lxb_html_tree_insertion_mode_f
+    ctypedef lxb_status_t lxb_html_tree_append_attr_f
     ctypedef struct lxb_html_tree_t:
@@ -189,13 +182,13 @@ cdef extern from "lexbor/html/html.h" nogil:
         lxb_html_form_element_t *form
-        lexbor_array_t *open_elements;
-        lexbor_array_t *active_formatting;
-        lexbor_array_obj_t *template_insertion_modes;
+        lexbor_array_t *open_elements
+        lexbor_array_t *active_formatting
+        lexbor_array_obj_t *template_insertion_modes
-        lxb_html_tree_pending_table_t *pending_table;
+        lxb_html_tree_pending_table_t *pending_table
-        lexbor_array_obj_t *parse_errors;
+        lexbor_array_obj_t *parse_errors
         bint foster_parenting
         bint frameset_ok
@@ -232,9 +225,13 @@ cdef extern from "lexbor/html/html.h" nogil:
     lxb_status_t lxb_html_serialize_tree_str(lxb_dom_node_t *node, lexbor_str_t *str)
 cdef class LexborNode:
-    cdef lxb_dom_node_t *node
-    cdef public LexborHTMLParser parser
-    cdef _cinit(self, lxb_dom_node_t *node, LexborHTMLParser parser)
+    cdef:
+        lxb_dom_node_t *node
+        public LexborHTMLParser parser
+    @staticmethod
+    cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser)
 cdef class LexborCSSSelector:
     cdef lxb_css_parser_t* parser
@@ -242,15 +239,15 @@ cdef class LexborCSSSelector:
     cdef lxb_css_selectors_t * css_selectors
     cdef public list results
     cdef public LexborNode current_node
-    cdef _create_css_parser(self)
-    cpdef find(self, str query, LexborNode node)
-    cpdef any_matches(self, str query, LexborNode node)
+    cdef int _create_css_parser(self) except -1
+    cpdef list find(self, str query, LexborNode node)
+    cpdef int any_matches(self, str query, LexborNode node) except -1
 cdef class LexborHTMLParser:
     cdef lxb_html_document_t *document
     cdef public bytes raw_html
     cdef LexborCSSSelector _selector
-    cdef _parse_html(self, char* html, size_t html_len)
+    cdef int _parse_html(self, char* html, size_t html_len) except -1
     cdef object cached_script_texts
     cdef object cached_script_srcs
@@ -267,8 +264,8 @@ cdef extern from "lexbor/dom/dom.h" nogil:
     ctypedef lexbor_action_t (*lxb_dom_node_simple_walker_f)(lxb_dom_node_t *node, void *ctx)
     ctypedef struct lxb_dom_character_data_t:
-        lxb_dom_node_t node;
-        lexbor_str_t   data;
+        lxb_dom_node_t node
+        lexbor_str_t   data
     ctypedef struct lxb_dom_text_t:
         lxb_dom_character_data_t char_data
@@ -289,19 +286,20 @@ cdef extern from "lexbor/dom/dom.h" nogil:
         lxb_dom_element_t *owner
         lxb_dom_attr_t *next
-        lxb_dom_attr_t *prev;
+        lxb_dom_attr_t *prev
     lxb_dom_collection_t * lxb_dom_collection_make(lxb_dom_document_t *document, size_t start_list_size)
     lxb_char_t * lxb_dom_node_text_content(lxb_dom_node_t *node, size_t *len)
+    lxb_status_t lxb_dom_node_text_content_set(lxb_dom_node_t *node, const lxb_char_t *content, size_t len)
+    void lxb_dom_node_remove(lxb_dom_node_t *node)
     void * lxb_dom_document_destroy_text_noi(lxb_dom_document_t *document, lxb_char_t *text)
-    lxb_dom_node_t *  lxb_dom_document_root(lxb_dom_document_t *document)
+    lxb_dom_node_t * lxb_dom_document_root(lxb_dom_document_t *document)
     lxb_char_t * lxb_dom_element_qualified_name(lxb_dom_element_t *element, size_t *len)
     lxb_dom_node_t * lxb_dom_node_destroy(lxb_dom_node_t *node)
     lxb_dom_node_t * lxb_dom_node_destroy_deep(lxb_dom_node_t *root)
     lxb_dom_attr_t * lxb_dom_element_first_attribute_noi(lxb_dom_element_t *element)
-    const lxb_char_t * lxb_dom_attr_local_name_noi(lxb_dom_attr_t *attr, size_t *len);
+    const lxb_char_t * lxb_dom_attr_local_name_noi(lxb_dom_attr_t *attr, size_t *len)
     const lxb_char_t * lxb_dom_attr_value_noi(lxb_dom_attr_t *attr, size_t *len)
     lxb_dom_attr_t * lxb_dom_element_set_attribute(lxb_dom_element_t *element,
@@ -314,7 +312,7 @@ cdef extern from "lexbor/dom/dom.h" nogil:
     lxb_tag_id_t lxb_dom_node_tag_id_noi(lxb_dom_node_t *node)
     lxb_dom_node_t * lxb_dom_document_import_node(lxb_dom_document_t *doc, lxb_dom_node_t *node, bint deep)
     void lxb_dom_node_insert_after(lxb_dom_node_t *to, lxb_dom_node_t *node)
-    lxb_status_t lxb_dom_node_replace_all(lxb_dom_node_t *parent, lxb_dom_node_t *node);
+    lxb_status_t lxb_dom_node_replace_all(lxb_dom_node_t *parent, lxb_dom_node_t *node)
     void lxb_dom_node_insert_child(lxb_dom_node_t *to, lxb_dom_node_t *node)
     void lxb_dom_node_insert_before(lxb_dom_node_t *to, lxb_dom_node_t *node)
     void lxb_dom_node_insert_after(lxb_dom_node_t *to, lxb_dom_node_t *node)
@@ -347,7 +345,7 @@ cdef extern from "lexbor/css/css.h" nogil:
     lxb_css_parser_t * lxb_css_parser_create()
     lxb_status_t lxb_css_parser_init(lxb_css_parser_t *parser, lxb_css_syntax_tokenizer_t *tkz)
     lxb_css_parser_t * lxb_css_parser_destroy(lxb_css_parser_t *parser, bint self_destroy)
-    lxb_css_memory_t * lxb_css_memory_destroy(lxb_css_memory_t *memory, bint self_destroy);
+    lxb_css_memory_t * lxb_css_memory_destroy(lxb_css_memory_t *memory, bint self_destroy)
     void lxb_css_selector_list_destroy_memory(lxb_css_selector_list_t *list)
@@ -558,8 +556,7 @@ cdef extern from "lexbor/selectors/selectors.h" nogil:
     ctypedef struct lxb_selectors_t
     ctypedef struct lxb_css_selector_list_t
     ctypedef struct lxb_css_selector_specificity_t
-    ctypedef lxb_status_t (*lxb_selectors_cb_f)(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec,
-                      void *ctx)
+    ctypedef lxb_status_t (*lxb_selectors_cb_f)(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx)
     ctypedef enum lxb_selectors_opt_t:
         LXB_SELECTORS_OPT_DEFAULT = 0x00
         LXB_SELECTORS_OPT_MATCH_ROOT = 1 << 1
@@ -576,4 +573,4 @@ cdef extern from "lexbor/selectors/selectors.h" nogil:
     lxb_status_t lxb_selectors_init(lxb_selectors_t *selectors)
     lxb_selectors_t * lxb_selectors_destroy(lxb_selectors_t *selectors, bint self_destroy)
     lxb_status_t lxb_selectors_find(lxb_selectors_t *selectors, lxb_dom_node_t *root,
-                   lxb_css_selector_list_t *list, lxb_selectors_cb_f cb, void *ctx)
+                                    lxb_css_selector_list_t *list, lxb_selectors_cb_f cb, void *ctx)

selectolax/lexbor.pyi CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import Any, Iterator, Literal, TypeVar, NoReturn, overload, Optional
+from typing import Any, Iterator, Literal, NoReturn, Optional, TypeVar, overload
 DefaultT = TypeVar("DefaultT")
@@ -158,11 +158,45 @@ class LexborNode:
     @overload
     def css_first(
         self, query: str, default: Any = ..., strict: Literal[True] = ...
-    ) -> LexborNode: ...
+    ) -> LexborNode:
+        """Same as `css` but returns only the first match.
+        Parameters
+        ----------
+        query : str
+        default : bool, default None
+            Default value to return if there is no match.
+        strict: bool, default True
+            Set to True if you want to check if there is strictly only one match in the document.
+        Returns
+        -------
+        selector : `LexborNode` object
+        """
+        ...
     @overload
     def css_first(
         self, query: str, default: DefaultT, strict: bool = False
-    ) -> LexborNode | DefaultT: ...
+    ) -> LexborNode | DefaultT:
+        """Same as `css` but returns only the first match.
+        Parameters
+        ----------
+        query : str
+        default : bool, default None
+            Default value to return if there is no match.
+        strict: bool, default True
+            Set to True if you want to check if there is strictly only one match in the document.
+        Returns
+        -------
+        selector : `LexborNode` object
+        """
+        ...
     @overload
     def css_first(
         self, query: str, default: None = ..., strict: bool = False
@@ -350,6 +384,25 @@ class LexborNode:
         Note: by default, empty tags are ignored, use "delete_empty" to change this.
         """
         ...
+    def merge_text_nodes(self) -> None:
+        """Iterates over all text nodes and merges all text nodes that are close to each other.
+        This is useful for text extraction.
+        Use it when you need to strip HTML tags and merge "dangling" text.
+        Examples
+        --------
+        >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
+        >>> node = tree.css_first('div')
+        >>> tree.unwrap_tags(["strong"])
+        >>> tree.text(deep=True, separator=" ", strip=True)
+        "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
+        >>> node.merge_text_nodes()
+        >>> tree.text(deep=True, separator=" ", strip=True)
+        "John Doe"
+        """
+        ...
     def traverse(self, include_text: bool = False) -> Iterator[LexborNode]:
         """Iterate over all child and next nodes starting from the current level.
@@ -625,11 +678,45 @@ class LexborHTMLParser:
     @overload
     def css_first(
         self, query: str, default: Any = ..., strict: Literal[True] = ...
-    ) -> LexborNode: ...
+    ) -> LexborNode:
+        """Same as `css` but returns only the first match.
+        Parameters
+        ----------
+        query : str
+        default : bool, default None
+            Default value to return if there is no match.
+        strict: bool, default True
+            Set to True if you want to check if there is strictly only one match in the document.
+        Returns
+        -------
+        selector : `LexborNode` object
+        """
+        ...
     @overload
     def css_first(
         self, query: str, default: DefaultT, strict: bool = False
-    ) -> LexborNode | DefaultT: ...
+    ) -> LexborNode | DefaultT:
+        """Same as `css` but returns only the first match.
+        Parameters
+        ----------
+        query : str
+        default : bool, default None
+            Default value to return if there is no match.
+        strict: bool, default True
+            Set to True if you want to check if there is strictly only one match in the document.
+        Returns
+        -------
+        selector : `LexborNode` object
+        """
+        ...
     @overload
     def css_first(
         self, query: str, default: None = ..., strict: bool = False
@@ -711,6 +798,25 @@ class LexborHTMLParser:
         """
         ...
     def css_matches(self, selector: str) -> bool: ...
+    def merge_text_nodes(self) -> None:
+        """Iterates over all text nodes and merges all text nodes that are close to each other.
+        This is useful for text extraction.
+        Use it when you need to strip HTML tags and merge "dangling" text.
+        Examples
+        --------
+        >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
+        >>> node = tree.css_first('div')
+        >>> tree.unwrap_tags(["strong"])
+        >>> tree.text(deep=True, separator=" ", strip=True)
+        "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
+        >>> node.merge_text_nodes()
+        >>> tree.text(deep=True, separator=" ", strip=True)
+        "John Doe"
+        """
+        ...
     def clone(self) -> LexborHTMLParser:
         """Clone the current tree."""
         ...

selectolax/lexbor.pyx CHANGED Viewed

@@ -1,4 +1,5 @@
-from cpython cimport bool
+from cpython.bool cimport bool
+from cpython.exc cimport PyErr_SetObject
 _ENCODING = 'UTF-8'
@@ -24,10 +25,8 @@ cdef class LexborHTMLParser:
     html : str (unicode) or bytes
     """
     def __init__(self, html):
         cdef size_t html_len
-        cdef char* html_chars
+        cdef object bytes_html
         bytes_html, html_len = preprocess_input(html)
         self._parse_html(bytes_html, html_len)
         self.raw_html = bytes_html
@@ -39,22 +38,27 @@ cdef class LexborHTMLParser:
             self._selector = LexborCSSSelector()
         return self._selector
-    cdef _parse_html(self, char *html, size_t html_len):
+    cdef int _parse_html(self, char *html, size_t html_len) except -1:
         cdef lxb_status_t status
         with nogil:
             self.document = lxb_html_document_create()
         if self.document == NULL:
-            raise SelectolaxError("Failed to initialize object for HTML Document.")
+            PyErr_SetObject(SelectolaxError, "Failed to initialize object for HTML Document.")
+            return -1
         with nogil:
             status = lxb_html_document_parse(self.document, <lxb_char_t *> html, html_len)
         if status != 0x0000:
-            raise SelectolaxError("Can't parse HTML.")
+            PyErr_SetObject(SelectolaxError, "Can't parse HTML.")
+            return -1
-        assert self.document != NULL
+        if self.document == NULL:
+            PyErr_SetObject(RuntimeError, "document is NULL even after html was parsed correctly")
+            return -1
+        return 0
     def __dealloc__(self):
         if self.document != NULL:
@@ -68,7 +72,7 @@ cdef class LexborHTMLParser:
         """Returns root node."""
         if self.document == NULL:
             return None
-        return LexborNode()._cinit(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
+        return LexborNode.new(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
     @property
     def body(self):
@@ -77,7 +81,7 @@ cdef class LexborHTMLParser:
         body = lxb_html_document_body_element_noi(self.document)
         if body == NULL:
             return None
-        return LexborNode()._cinit(<lxb_dom_node_t *> body, self)
+        return LexborNode.new(<lxb_dom_node_t *> body, self)
     @property
     def head(self):
@@ -86,7 +90,7 @@ cdef class LexborHTMLParser:
         head = lxb_html_document_head_element_noi(self.document)
         if head == NULL:
             return None
-        return LexborNode()._cinit(<lxb_dom_node_t *> head, self)
+        return LexborNode.new(<lxb_dom_node_t *> head, self)
     def tags(self, str name):
         """Returns a list of tags that match specified name.
@@ -122,7 +126,7 @@ cdef class LexborHTMLParser:
             raise SelectolaxError("Can't locate elements.")
         for i in range(lxb_dom_collection_length_noi(collection)):
-            node = LexborNode()._cinit(
+            node = LexborNode.new(
                 <lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i),
                 self
             )
@@ -156,7 +160,7 @@ cdef class LexborHTMLParser:
         """Return HTML representation of the page."""
         if self.document == NULL:
             return None
-        node = LexborNode()._cinit(<lxb_dom_node_t *> &self.document.dom_document, self)
+        node = LexborNode.new(<lxb_dom_node_t *> &self.document.dom_document, self)
         return node.html
     def css(self, str query):
@@ -238,7 +242,7 @@ cdef class LexborHTMLParser:
             for i in range(lxb_dom_collection_length_noi(collection)):
                 if recursive:
-                    lxb_dom_node_destroy_deep( <lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
+                    lxb_dom_node_destroy_deep(<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
                 else:
                     lxb_dom_node_destroy(<lxb_dom_node_t *> lxb_dom_collection_element_noi(collection, i))
             lxb_dom_collection_destroy(collection, <bint> True)
@@ -279,7 +283,6 @@ cdef class LexborHTMLParser:
         """
         return self.root.scripts_contain(query)
     def script_srcs_contain(self, tuple queries):
         """Returns True if any of the script SRCs attributes contain on of the specified text.
@@ -295,6 +298,26 @@ cdef class LexborHTMLParser:
     def css_matches(self, str selector):
         return self.root.css_matches(selector)
+    def merge_text_nodes(self):
+        """Iterates over all text nodes and merges all text nodes that are close to each other.
+        This is useful for text extraction.
+        Use it when you need to strip HTML tags and merge "dangling" text.
+        Examples
+        --------
+        >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
+        >>> node = tree.css_first('div')
+        >>> tree.unwrap_tags(["strong"])
+        >>> tree.text(deep=True, separator=" ", strip=True)
+        "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
+        >>> node.merge_text_nodes()
+        >>> tree.text(deep=True, separator=" ", strip=True)
+        "John Doe"
+        """
+        return self.root.merge_text_nodes()
     @staticmethod
     cdef LexborHTMLParser from_document(lxb_html_document_t *document, bytes raw_html):
         obj = <LexborHTMLParser> LexborHTMLParser.__new__(LexborHTMLParser)
@@ -309,6 +332,7 @@ cdef class LexborHTMLParser:
         """Clone the current tree."""
         cdef lxb_html_document_t* cloned_document
         cdef lxb_dom_node_t* cloned_node
+        cdef LexborHTMLParser cls
         with nogil:
             cloned_document = lxb_html_document_create()
@@ -333,6 +357,7 @@ cdef class LexborHTMLParser:
         cls = LexborHTMLParser.from_document(cloned_document, self.raw_html)
         return cls
     def unwrap_tags(self, list tags, delete_empty = False):
         """Unwraps specified tags from the HTML tree.
@@ -353,5 +378,6 @@ cdef class LexborHTMLParser:
         >>> tree.body.html
         '<body><div>Hello world!</div></body>'
         """
-        if self.root is not None:
+        # faster to check if the document is empty which should determine if we have a root
+        if self.document != NULL:
             self.root.unwrap_tags(tags, delete_empty=delete_empty)