PyPI - selectolax - Versions diffs - 0.3.28__cp310-cp310-win_amd64.whl → 0.3.30__cp310-cp310-win_amd64.whl - Mend

selectolax 0.3.28__cp310-cp310-win_amd64.whl → 0.3.30__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of selectolax might be problematic. Click here for more details.

Files changed (18)

selectolax/__init__.py +1 -1
selectolax/lexbor/node.pxi +19 -7
selectolax/lexbor/selection.pxi +6 -0
selectolax/lexbor.c +3073 -2792
selectolax/lexbor.cp310-win_amd64.pyd +0 -0
selectolax/lexbor.pyi +7 -2
selectolax/lexbor.pyx +10 -2
selectolax/modest/node.pxi +27 -9
selectolax/parser.c +3591 -3122
selectolax/parser.cp310-win_amd64.pyd +0 -0
selectolax/parser.pyi +13 -20
selectolax/parser.pyx +19 -6
{selectolax-0.3.28.dist-info → selectolax-0.3.30.dist-info}/METADATA +22 -18
selectolax-0.3.30.dist-info/RECORD +26 -0
{selectolax-0.3.28.dist-info → selectolax-0.3.30.dist-info}/WHEEL +1 -1
selectolax-0.3.28.dist-info/RECORD +0 -26
{selectolax-0.3.28.dist-info → selectolax-0.3.30.dist-info/licenses}/LICENSE +0 -0
{selectolax-0.3.28.dist-info → selectolax-0.3.30.dist-info}/top_level.txt +0 -0

selectolax/lexbor.cp310-win_amd64.pyd CHANGED Viewed

Binary file

selectolax/lexbor.pyi CHANGED Viewed

@@ -101,7 +101,7 @@ class LexborNode:
     def id(self) -> str | None: ...
     def iter(self, include_text: bool = False) -> Iterator[LexborNode]: ...
     def unwrap(self) -> None: ...
-    def unwrap_tags(self, tags: list[str]) -> None: ...
+    def unwrap_tags(self, tags: list[str], delete_empty : bool = False) -> None: ...
     def traverse(self, include_text: bool = False) -> Iterator[LexborNode]: ...
     def replace_with(self, value: bytes | str | LexborNode) -> None: ...
     def insert_before(self, value: bytes | str | LexborNode) -> None: ...
@@ -152,7 +152,7 @@ class LexborHTMLParser:
     def scripts_srcs_contain(self, queries: tuple[str]) -> bool: ...
     def css_matches(self, selector: str) -> bool: ...
     def clone(self) -> LexborHTMLParser: ...
-    def unwrap_tags(self, tags: list[str]) -> None: ...
+    def unwrap_tags(self, tags: list[str], delete_empty : bool = False) -> None: ...
 def create_tag(tag: str) -> LexborNode:
     """
@@ -170,3 +170,8 @@ def parse_fragment(html: str) -> list[LexborNode]:
     if they are missing. This function does not add these tags.
     """
     ...
+class SelectolaxError(Exception):
+    """An exception that indicates error."""
+    pass

selectolax/lexbor.pyx CHANGED Viewed

@@ -96,6 +96,12 @@ cdef class LexborHTMLParser:
         name : str (e.g. div)
         """
+        if not name:
+            raise ValueError("Tag name cannot be empty")
+        if len(name) > 100:
+            raise ValueError("Tag name is too long")
         cdef lxb_dom_collection_t* collection = NULL
         cdef lxb_status_t status
         pybyte_name = name.encode('UTF-8')
@@ -327,7 +333,7 @@ cdef class LexborHTMLParser:
         cls = LexborHTMLParser.from_document(cloned_document, self.raw_html)
         return cls
-    def unwrap_tags(self, list tags):
+    def unwrap_tags(self, list tags, delete_empty = False):
         """Unwraps specified tags from the HTML tree.
         Works the same as the ``unwrap`` method, but applied to a list of tags.
@@ -336,6 +342,8 @@ cdef class LexborHTMLParser:
         ----------
         tags : list
             List of tags to remove.
+        delete_empty : bool
+            Whenever to delete empty tags.
         Examples
         --------
@@ -346,4 +354,4 @@ cdef class LexborHTMLParser:
         '<body><div>Hello world!</div></body>'
         """
         if self.root is not None:
-            self.root.unwrap_tags(tags)
+            self.root.unwrap_tags(tags, delete_empty=delete_empty)

selectolax/modest/node.pxi CHANGED Viewed

@@ -14,6 +14,8 @@ cdef class Stack:
         self.capacity = capacity
         self.top = 0
         self._stack = <myhtml_tree_node_t**> malloc(capacity * sizeof(myhtml_tree_node_t))
+        if self._stack == NULL:
+            raise MemoryError("Failed to allocate memory for stack")
     def __dealloc__(self):
         free(self._stack)
@@ -131,7 +133,7 @@ cdef class _Attributes:
 ctypedef fused str_or_Node:
-    basestring
+    str
     bytes
     Node
@@ -515,9 +517,14 @@ cdef class Node:
         """An alias for the decompose method."""
         self.decompose(recursive)
-    def unwrap(self):
+    def unwrap(self, delete_empty = False):
         """Replace node with whatever is inside this node.
+        Parameters
+        ----------
+        delete_empty : bool, default False
+            Whenever to delete empty tags.
         Examples
         --------
@@ -526,8 +533,11 @@ cdef class Node:
         >>>  tree.html
         '<html><head></head><body><div>Hello world!</div></body></html>'
+        Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
         """
         if self.node.child == NULL:
+            if delete_empty:
+                myhtml_node_delete(self.node)
             return
         cdef myhtml_tree_node_t* next_node;
         cdef myhtml_tree_node_t* current_node;
@@ -568,7 +578,7 @@ cdef class Node:
             for element in self.css(tag):
                 element.decompose(recursive=recursive)
-    def unwrap_tags(self, list tags):
+    def unwrap_tags(self, list tags, delete_empty = False):
         """Unwraps specified tags from the HTML tree.
         Works the same as the ``unwrap`` method, but applied to a list of tags.
@@ -577,6 +587,8 @@ cdef class Node:
         ----------
         tags : list
             List of tags to remove.
+        delete_empty : bool, default False
+            Whenever to delete empty tags.
         Examples
         --------
@@ -585,11 +597,13 @@ cdef class Node:
         >>> tree.body.unwrap_tags(['i','a'])
         >>> tree.body.html
         '<body><div>Hello world!</div></body>'
+        Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
         """
         for tag in tags:
             for element in self.css(tag):
-                element.unwrap()
+                element.unwrap(delete_empty)
     def replace_with(self, str_or_Node value):
         """Replace current Node with specified value.
@@ -752,7 +766,7 @@ cdef class Node:
         else:
             raise TypeError("Expected a string or Node instance, but %s found" % type(value).__name__)
-    def unwrap_tags(self, list tags):
+    def unwrap_tags(self, list tags, delete_empty = False):
         """Unwraps specified tags from the HTML tree.
         Works the same as th ``unwrap`` method, but applied to a list of tags.
@@ -761,6 +775,8 @@ cdef class Node:
         ----------
         tags : list
             List of tags to remove.
+        delete_empty : bool, default False
+            Whenever to delete empty tags.
         Examples
         --------
@@ -769,11 +785,13 @@ cdef class Node:
         >>> tree.body.unwrap_tags(['i','a'])
         >>> tree.body.html
         '<body><div>Hello world!</div></body>'
+        Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
         """
         for tag in tags:
             for element in self.css(tag):
-                element.unwrap()
+                element.unwrap(delete_empty)
     @property
     def raw_value(self):
@@ -962,8 +980,8 @@ cdef inline str append_text(str text, str node_text, str separator='', bint stri
 cdef inline bytes to_bytes(str_or_Node value):
     cdef bytes bytes_val
-    if isinstance(value, (str, unicode)):
-        bytes_val = value.encode(_ENCODING)
+    if isinstance(value, unicode):
+        bytes_val = <bytes>value.encode("utf-8")
     elif isinstance(value, bytes):
-        bytes_val =  <char*> value
+        bytes_val = <bytes>value
     return bytes_val