selectolax 0.3.28__cp39-cp39-win_amd64.whl → 0.3.30__cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +1 -1
- selectolax/lexbor/node.pxi +19 -7
- selectolax/lexbor/selection.pxi +6 -0
- selectolax/lexbor.c +3073 -2792
- selectolax/lexbor.cp39-win_amd64.pyd +0 -0
- selectolax/lexbor.pyi +7 -2
- selectolax/lexbor.pyx +10 -2
- selectolax/modest/node.pxi +27 -9
- selectolax/parser.c +3591 -3122
- selectolax/parser.cp39-win_amd64.pyd +0 -0
- selectolax/parser.pyi +13 -20
- selectolax/parser.pyx +19 -6
- {selectolax-0.3.28.dist-info → selectolax-0.3.30.dist-info}/METADATA +22 -18
- selectolax-0.3.30.dist-info/RECORD +26 -0
- {selectolax-0.3.28.dist-info → selectolax-0.3.30.dist-info}/WHEEL +1 -1
- selectolax-0.3.28.dist-info/RECORD +0 -26
- {selectolax-0.3.28.dist-info → selectolax-0.3.30.dist-info/licenses}/LICENSE +0 -0
- {selectolax-0.3.28.dist-info → selectolax-0.3.30.dist-info}/top_level.txt +0 -0
|
Binary file
|
selectolax/lexbor.pyi
CHANGED
|
@@ -101,7 +101,7 @@ class LexborNode:
|
|
|
101
101
|
def id(self) -> str | None: ...
|
|
102
102
|
def iter(self, include_text: bool = False) -> Iterator[LexborNode]: ...
|
|
103
103
|
def unwrap(self) -> None: ...
|
|
104
|
-
def unwrap_tags(self, tags: list[str]) -> None: ...
|
|
104
|
+
def unwrap_tags(self, tags: list[str], delete_empty : bool = False) -> None: ...
|
|
105
105
|
def traverse(self, include_text: bool = False) -> Iterator[LexborNode]: ...
|
|
106
106
|
def replace_with(self, value: bytes | str | LexborNode) -> None: ...
|
|
107
107
|
def insert_before(self, value: bytes | str | LexborNode) -> None: ...
|
|
@@ -152,7 +152,7 @@ class LexborHTMLParser:
|
|
|
152
152
|
def scripts_srcs_contain(self, queries: tuple[str]) -> bool: ...
|
|
153
153
|
def css_matches(self, selector: str) -> bool: ...
|
|
154
154
|
def clone(self) -> LexborHTMLParser: ...
|
|
155
|
-
def unwrap_tags(self, tags: list[str]) -> None: ...
|
|
155
|
+
def unwrap_tags(self, tags: list[str], delete_empty : bool = False) -> None: ...
|
|
156
156
|
|
|
157
157
|
def create_tag(tag: str) -> LexborNode:
|
|
158
158
|
"""
|
|
@@ -170,3 +170,8 @@ def parse_fragment(html: str) -> list[LexborNode]:
|
|
|
170
170
|
if they are missing. This function does not add these tags.
|
|
171
171
|
"""
|
|
172
172
|
...
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
class SelectolaxError(Exception):
|
|
176
|
+
"""An exception that indicates error."""
|
|
177
|
+
pass
|
selectolax/lexbor.pyx
CHANGED
|
@@ -96,6 +96,12 @@ cdef class LexborHTMLParser:
|
|
|
96
96
|
name : str (e.g. div)
|
|
97
97
|
|
|
98
98
|
"""
|
|
99
|
+
|
|
100
|
+
if not name:
|
|
101
|
+
raise ValueError("Tag name cannot be empty")
|
|
102
|
+
if len(name) > 100:
|
|
103
|
+
raise ValueError("Tag name is too long")
|
|
104
|
+
|
|
99
105
|
cdef lxb_dom_collection_t* collection = NULL
|
|
100
106
|
cdef lxb_status_t status
|
|
101
107
|
pybyte_name = name.encode('UTF-8')
|
|
@@ -327,7 +333,7 @@ cdef class LexborHTMLParser:
|
|
|
327
333
|
|
|
328
334
|
cls = LexborHTMLParser.from_document(cloned_document, self.raw_html)
|
|
329
335
|
return cls
|
|
330
|
-
def unwrap_tags(self, list tags):
|
|
336
|
+
def unwrap_tags(self, list tags, delete_empty = False):
|
|
331
337
|
"""Unwraps specified tags from the HTML tree.
|
|
332
338
|
|
|
333
339
|
Works the same as the ``unwrap`` method, but applied to a list of tags.
|
|
@@ -336,6 +342,8 @@ cdef class LexborHTMLParser:
|
|
|
336
342
|
----------
|
|
337
343
|
tags : list
|
|
338
344
|
List of tags to remove.
|
|
345
|
+
delete_empty : bool, default False
|
|
346
|
+
Whether to delete empty tags.
|
|
339
347
|
|
|
340
348
|
Examples
|
|
341
349
|
--------
|
|
@@ -346,4 +354,4 @@ cdef class LexborHTMLParser:
|
|
|
346
354
|
'<body><div>Hello world!</div></body>'
|
|
347
355
|
"""
|
|
348
356
|
if self.root is not None:
|
|
349
|
-
self.root.unwrap_tags(tags)
|
|
357
|
+
self.root.unwrap_tags(tags, delete_empty=delete_empty)
|
selectolax/modest/node.pxi
CHANGED
|
@@ -14,6 +14,8 @@ cdef class Stack:
|
|
|
14
14
|
self.capacity = capacity
|
|
15
15
|
self.top = 0
|
|
16
16
|
self._stack = <myhtml_tree_node_t**> malloc(capacity * sizeof(myhtml_tree_node_t))
|
|
17
|
+
if self._stack == NULL:
|
|
18
|
+
raise MemoryError("Failed to allocate memory for stack")
|
|
17
19
|
|
|
18
20
|
def __dealloc__(self):
|
|
19
21
|
free(self._stack)
|
|
@@ -131,7 +133,7 @@ cdef class _Attributes:
|
|
|
131
133
|
|
|
132
134
|
|
|
133
135
|
ctypedef fused str_or_Node:
|
|
134
|
-
|
|
136
|
+
str
|
|
135
137
|
bytes
|
|
136
138
|
Node
|
|
137
139
|
|
|
@@ -515,9 +517,14 @@ cdef class Node:
|
|
|
515
517
|
"""An alias for the decompose method."""
|
|
516
518
|
self.decompose(recursive)
|
|
517
519
|
|
|
518
|
-
def unwrap(self):
|
|
520
|
+
def unwrap(self, delete_empty = False):
|
|
519
521
|
"""Replace node with whatever is inside this node.
|
|
520
522
|
|
|
523
|
+
Parameters
|
|
524
|
+
----------
|
|
525
|
+
delete_empty : bool, default False
|
|
526
|
+
Whether to delete empty tags.
|
|
527
|
+
|
|
521
528
|
Examples
|
|
522
529
|
--------
|
|
523
530
|
|
|
@@ -526,8 +533,11 @@ cdef class Node:
|
|
|
526
533
|
>>> tree.html
|
|
527
534
|
'<html><head></head><body><div>Hello world!</div></body></html>'
|
|
528
535
|
|
|
536
|
+
Note: by default, empty tags are ignored; set "delete_empty" to "True" to change this.
|
|
529
537
|
"""
|
|
530
538
|
if self.node.child == NULL:
|
|
539
|
+
if delete_empty:
|
|
540
|
+
myhtml_node_delete(self.node)
|
|
531
541
|
return
|
|
532
542
|
cdef myhtml_tree_node_t* next_node;
|
|
533
543
|
cdef myhtml_tree_node_t* current_node;
|
|
@@ -568,7 +578,7 @@ cdef class Node:
|
|
|
568
578
|
for element in self.css(tag):
|
|
569
579
|
element.decompose(recursive=recursive)
|
|
570
580
|
|
|
571
|
-
def unwrap_tags(self, list tags):
|
|
581
|
+
def unwrap_tags(self, list tags, delete_empty = False):
|
|
572
582
|
"""Unwraps specified tags from the HTML tree.
|
|
573
583
|
|
|
574
584
|
Works the same as the ``unwrap`` method, but applied to a list of tags.
|
|
@@ -577,6 +587,8 @@ cdef class Node:
|
|
|
577
587
|
----------
|
|
578
588
|
tags : list
|
|
579
589
|
List of tags to remove.
|
|
590
|
+
delete_empty : bool, default False
|
|
591
|
+
Whether to delete empty tags.
|
|
580
592
|
|
|
581
593
|
Examples
|
|
582
594
|
--------
|
|
@@ -585,11 +597,13 @@ cdef class Node:
|
|
|
585
597
|
>>> tree.body.unwrap_tags(['i','a'])
|
|
586
598
|
>>> tree.body.html
|
|
587
599
|
'<body><div>Hello world!</div></body>'
|
|
600
|
+
|
|
601
|
+
Note: by default, empty tags are ignored; set "delete_empty" to "True" to change this.
|
|
588
602
|
"""
|
|
589
603
|
|
|
590
604
|
for tag in tags:
|
|
591
605
|
for element in self.css(tag):
|
|
592
|
-
element.unwrap()
|
|
606
|
+
element.unwrap(delete_empty)
|
|
593
607
|
|
|
594
608
|
def replace_with(self, str_or_Node value):
|
|
595
609
|
"""Replace current Node with specified value.
|
|
@@ -752,7 +766,7 @@ cdef class Node:
|
|
|
752
766
|
else:
|
|
753
767
|
raise TypeError("Expected a string or Node instance, but %s found" % type(value).__name__)
|
|
754
768
|
|
|
755
|
-
def unwrap_tags(self, list tags):
|
|
769
|
+
def unwrap_tags(self, list tags, delete_empty = False):
|
|
756
770
|
"""Unwraps specified tags from the HTML tree.
|
|
757
771
|
|
|
758
772
|
Works the same as the ``unwrap`` method, but applied to a list of tags.
|
|
@@ -761,6 +775,8 @@ cdef class Node:
|
|
|
761
775
|
----------
|
|
762
776
|
tags : list
|
|
763
777
|
List of tags to remove.
|
|
778
|
+
delete_empty : bool, default False
|
|
779
|
+
Whether to delete empty tags.
|
|
764
780
|
|
|
765
781
|
Examples
|
|
766
782
|
--------
|
|
@@ -769,11 +785,13 @@ cdef class Node:
|
|
|
769
785
|
>>> tree.body.unwrap_tags(['i','a'])
|
|
770
786
|
>>> tree.body.html
|
|
771
787
|
'<body><div>Hello world!</div></body>'
|
|
788
|
+
|
|
789
|
+
Note: by default, empty tags are ignored; set "delete_empty" to "True" to change this.
|
|
772
790
|
"""
|
|
773
791
|
|
|
774
792
|
for tag in tags:
|
|
775
793
|
for element in self.css(tag):
|
|
776
|
-
element.unwrap()
|
|
794
|
+
element.unwrap(delete_empty)
|
|
777
795
|
|
|
778
796
|
@property
|
|
779
797
|
def raw_value(self):
|
|
@@ -962,8 +980,8 @@ cdef inline str append_text(str text, str node_text, str separator='', bint stri
|
|
|
962
980
|
|
|
963
981
|
cdef inline bytes to_bytes(str_or_Node value):
|
|
964
982
|
cdef bytes bytes_val
|
|
965
|
-
if isinstance(value,
|
|
966
|
-
bytes_val = value.encode(
|
|
983
|
+
if isinstance(value, unicode):
|
|
984
|
+
bytes_val = <bytes>value.encode("utf-8")
|
|
967
985
|
elif isinstance(value, bytes):
|
|
968
|
-
bytes_val =
|
|
986
|
+
bytes_val = <bytes>value
|
|
969
987
|
return bytes_val
|