selectolax 0.3.24__cp312-cp312-musllinux_1_2_i686.whl → 0.3.25__cp312-cp312-musllinux_1_2_i686.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +1 -1
- selectolax/lexbor/util.pxi +19 -0
- selectolax/lexbor.c +21671 -16297
- selectolax/lexbor.cpython-312-i386-linux-musl.so +0 -0
- selectolax/lexbor.pyi +19 -0
- selectolax/lexbor.pyx +1 -0
- selectolax/modest/util.pxi +19 -0
- selectolax/parser.c +6700 -1322
- selectolax/parser.cpython-312-i386-linux-musl.so +0 -0
- selectolax/parser.pyi +21 -0
- selectolax/parser.pyx +1 -0
- selectolax/utils.pxi +94 -0
- {selectolax-0.3.24.dist-info → selectolax-0.3.25.dist-info}/METADATA +7 -2
- selectolax-0.3.25.dist-info/RECORD +26 -0
- selectolax-0.3.24.dist-info/RECORD +0 -24
- {selectolax-0.3.24.dist-info → selectolax-0.3.25.dist-info}/LICENSE +0 -0
- {selectolax-0.3.24.dist-info → selectolax-0.3.25.dist-info}/WHEEL +0 -0
- {selectolax-0.3.24.dist-info → selectolax-0.3.25.dist-info}/top_level.txt +0 -0
|
Binary file
|
selectolax/parser.pyi
CHANGED
|
@@ -56,6 +56,7 @@ class Selector:
|
|
|
56
56
|
...
|
|
57
57
|
|
|
58
58
|
class Node:
|
|
59
|
+
parser: "HTMLParser"
|
|
59
60
|
@property
|
|
60
61
|
def attributes(self) -> dict[str, None | str]:
|
|
61
62
|
"""Get all attributes that belong to the current node.
|
|
@@ -161,6 +162,9 @@ class Node:
|
|
|
161
162
|
def insert_after(self, value: str | bytes | None) -> None:
|
|
162
163
|
"""Insert a node after the current Node."""
|
|
163
164
|
...
|
|
165
|
+
def insert_child(self, value: str | bytes | None) -> None:
|
|
166
|
+
"""Insert a node inside (at the end of) the current Node."""
|
|
167
|
+
...
|
|
164
168
|
@property
|
|
165
169
|
def raw_value(self) -> bytes:
|
|
166
170
|
"""Return the raw (unparsed, original) value of a node.
|
|
@@ -276,3 +280,20 @@ class HTMLParser:
|
|
|
276
280
|
|
|
277
281
|
This is useful for text extraction."""
|
|
278
282
|
...
|
|
283
|
+
|
|
284
|
+
def create_tag(tag: str) -> "Node":
|
|
285
|
+
"""
|
|
286
|
+
Given an HTML tag name, e.g. `"div"`, create a single empty node for that tag,
|
|
287
|
+
e.g. `"<div></div>"`.
|
|
288
|
+
"""
|
|
289
|
+
...
|
|
290
|
+
|
|
291
|
+
def parse_fragment(html: str) -> list["Node"]:
|
|
292
|
+
"""
|
|
293
|
+
Given HTML, parse it into a list of Nodes, such that the nodes
|
|
294
|
+
correspond to the given HTML.
|
|
295
|
+
|
|
296
|
+
For contrast, HTMLParser adds `<html>`, `<head>`, and `<body>` tags
|
|
297
|
+
if they are missing. This function does not add these tags.
|
|
298
|
+
"""
|
|
299
|
+
...
|
selectolax/parser.pyx
CHANGED
selectolax/utils.pxi
CHANGED
|
@@ -1,5 +1,11 @@
|
|
|
1
|
+
from typing import Literal, Optional, Union, Type
|
|
2
|
+
|
|
1
3
|
MAX_HTML_INPUT_SIZE = 250e+7
|
|
2
4
|
|
|
5
|
+
ParserCls = Union[Type["HTMLParser"], Type["LexborHTMLParser"]]
|
|
6
|
+
Parser = Union["HTMLParser", "LexborHTMLParser"]
|
|
7
|
+
|
|
8
|
+
|
|
3
9
|
def preprocess_input(html, decode_errors='ignore'):
|
|
4
10
|
if isinstance(html, (str, unicode)):
|
|
5
11
|
bytes_html = html.encode('UTF-8', errors=decode_errors)
|
|
@@ -11,3 +17,91 @@ def preprocess_input(html, decode_errors='ignore'):
|
|
|
11
17
|
if html_len > MAX_HTML_INPUT_SIZE:
|
|
12
18
|
raise ValueError("The specified HTML input is too large to be processed (%d bytes)" % html_len)
|
|
13
19
|
return bytes_html, html_len
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def do_create_tag(tag: str, parser_cls: ParserCls):
    """
    Create a single empty node for the given tag name.

    The tag name is wrapped into an open/close pair (``"div"`` becomes
    ``"<div></div>"``), the markup is handed to ``do_parse_fragment`` and
    the first node of the returned list is given back to the caller.

    :param tag: name of the tag to create; must not be empty.
    :param parser_cls: parser class forwarded to ``do_parse_fragment``.
    :raises ValueError: if ``tag`` is empty.
    """
    if tag:
        markup = f"<{tag}></{tag}>"
        nodes = do_parse_fragment(markup, parser_cls)
        return nodes[0]
    raise ValueError("Tag name cannot be empty")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def get_fragment_type(
    html: str,
    parser_cls: ParserCls,
    tree: Optional[Parser] = None,
) -> Literal["document", "fragment", "head", "body", "head_and_body", "document_no_head", "document_no_body", "document_no_head_no_body"]:
    """
    Classify ``html`` by which of the ``<html>``, ``<head>`` and ``<body>``
    opening tags are literally present in the source text.

    The classification is purely textual: the markup is scanned with a
    regular expression and the parsed tree is not inspected.

    :param html: markup to classify.
    :param parser_cls: parser class used to build a tree when ``tree`` is
        not supplied.
    :param tree: an already-parsed tree for ``html``; when given, ``html``
        is not parsed again.
    :return: one of the literal names listed in the return annotation.
        ``document*`` variants mean ``<html>`` was found, ``head`` /
        ``body`` / ``head_and_body`` mean it was not, and ``fragment``
        means none of the three tags were found.
    """
    if tree is None:
        # The tree is not consulted below; the parse is kept so that input
        # the parser rejects still raises here exactly as it did before.
        tree = parser_cls(html)

    import re

    # A tag name ends at whitespace, "/" or ">" (or at end of input), so the
    # lookahead keeps "<header>" or "<body-x>" from being counted as
    # "<head>" / "<body>".
    tag_re = re.compile(r"<(html|head|body)(?=[\s/>]|\Z)", re.IGNORECASE)

    seen = set()
    for match in tag_re.finditer(html):
        # The pattern is case-insensitive, so normalise before recording;
        # comparing the raw match text would miss "<HTML>" or "<Body>".
        seen.add(match[1].lower())
        if len(seen) == 3:
            break

    has_html = "html" in seen
    has_head = "head" in seen
    has_body = "body" in seen

    if has_html:
        if has_head and has_body:
            return "document"
        if has_body:
            return "document_no_head"
        if has_head:
            return "document_no_body"
        return "document_no_head_no_body"

    if has_head and has_body:
        return "head_and_body"
    if has_head:
        return "head"
    if has_body:
        return "body"
    return "fragment"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def do_parse_fragment(html: str, parser_cls: ParserCls):
    """
    Parse HTML into a list of nodes that corresponds to the markup given.

    A regular ``HTMLParser`` tree always carries ``<html>``, ``<head>`` and
    ``<body>``, adding whichever of them the input lacks. This function does
    not hand those added tags back: based on the result of
    ``get_fragment_type`` it decomposes the head and/or body that the input
    did not spell out and returns only the matching part of the tree.

    :param html: markup to parse; surrounding whitespace is stripped first.
    :param parser_cls: parser class used to build the tree.
    :return: a list of nodes: the root for document-like input, the head
        and/or body when only those tags were present, otherwise the nodes
        yielded by iterating the head followed by those of the body.
    """
    stripped = html.strip()
    tree = parser_cls(stripped)
    kind = get_fragment_type(stripped, parser_cls, tree)

    # Step 1: prune the sections the input did not contain (head before body).
    if kind in ("document_no_head", "document_no_head_no_body", "body"):
        tree.head.decompose(recursive=True)
    if kind in ("document_no_body", "document_no_head_no_body", "head"):
        tree.body.decompose(recursive=True)

    # Step 2: choose what to hand back.
    if kind in (
        "document",
        "document_no_head",
        "document_no_body",
        "document_no_head_no_body",
    ):
        return [tree.root]
    if kind == "head":
        return [tree.head]
    if kind == "body":
        return [tree.body]
    if kind == "head_and_body":
        return [tree.head, tree.body]

    # Plain fragment: head nodes first, then body nodes, text nodes included.
    nodes = list(tree.head.iter(include_text=True))
    nodes.extend(tree.body.iter(include_text=True))
    return nodes
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: selectolax
|
|
3
|
-
Version: 0.3.24
|
|
3
|
+
Version: 0.3.25
|
|
4
4
|
Summary: Fast HTML5 parser with CSS selectors.
|
|
5
5
|
Home-page: https://github.com/rushter/selectolax
|
|
6
6
|
Author: Artem Golubin
|
|
@@ -22,6 +22,7 @@ Classifier: Programming Language :: Python :: 3.9
|
|
|
22
22
|
Classifier: Programming Language :: Python :: 3.10
|
|
23
23
|
Classifier: Programming Language :: Python :: 3.11
|
|
24
24
|
Classifier: Programming Language :: Python :: 3.12
|
|
25
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
25
26
|
License-File: LICENSE
|
|
26
27
|
Provides-Extra: cython
|
|
27
28
|
Requires-Dist: Cython==3.0.11; extra == "cython"
|
|
@@ -122,6 +123,10 @@ Available backends
|
|
|
122
123
|
Selectolax supports two backends: ``Modest`` and ``Lexbor``. By default, all examples use the Modest backend.
|
|
123
124
|
Most of the features between backends are almost identical, but there are still some differences.
|
|
124
125
|
|
|
126
|
+
As of 2024, the preferred backend is ``Lexbor``. The ``Modest`` backend is still available for compatibility reasons
|
|
127
|
+
although the underlying C library that it relies on is no longer maintained.
|
|
128
|
+
|
|
129
|
+
|
|
125
130
|
To use ``lexbor``, just import the parser and use it in a similar way to ``HTMLParser``.
|
|
126
131
|
|
|
127
132
|
.. code:: python
|
|
@@ -147,7 +152,7 @@ Simple Benchmark
|
|
|
147
152
|
Package Time
|
|
148
153
|
============================ ===========
|
|
149
154
|
Beautiful Soup (html.parser) 61.02 sec.
|
|
150
|
-
lxml
|
|
155
|
+
lxml / Beautiful Soup (lxml) 9.09 sec.
|
|
151
156
|
html5_parser 16.10 sec.
|
|
152
157
|
selectolax (Modest) 2.94 sec.
|
|
153
158
|
selectolax (Lexbor) 2.39 sec.
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
selectolax/lexbor.pxd,sha256=PwygBdb1blWAQcxXubZS5uffhgcXaqgySNMPFMT02-c,20958
|
|
2
|
+
selectolax/lexbor.cpython-312-i386-linux-musl.so,sha256=LEKcn1NJBLa5JfT4hdPcjfIOUz-yHgJ0gTD6-ai_9V4,16891572
|
|
3
|
+
selectolax/parser.cpython-312-i386-linux-musl.so,sha256=sK8FEzgOl-BsNHFegqwmTMLyXOQOL8I_ecT1lay82M8,6246756
|
|
4
|
+
selectolax/parser.pyi,sha256=gDTx5Qde0rKrWDMSpZZDS5XJbPlERnwSQCN8bg5U4AA,10558
|
|
5
|
+
selectolax/parser.pxd,sha256=zZlg1vHUg6o4MXaiwKAo5S5hO_DqBGc4_E10qJ2EcM4,24564
|
|
6
|
+
selectolax/base.pxi,sha256=eiPKlY9gG3l49qJoRQVLl1Ljza6z1k0A-met6sDPcqE,89
|
|
7
|
+
selectolax/parser.pyx,sha256=o1HkYE_nQr3TS7EPlldJx2-ygU9B5FI2uWYFzdF-VaI,12953
|
|
8
|
+
selectolax/__init__.py,sha256=oWCGdiVLnaTCmGs0dtRQpNGri89OdU7AwSRoqhmbajI,175
|
|
9
|
+
selectolax/lexbor.c,sha256=7rs-9d175hT0mRx49Lwnzbw2Hbd9kqpo6TYhn9oI3UU,2353590
|
|
10
|
+
selectolax/lexbor.pyx,sha256=ffEzBnZjGTsI-H5qck7bfjVRE9vteOhQnDp6RjVD7G0,10750
|
|
11
|
+
selectolax/lexbor.pyi,sha256=FFVEZfXI8BwvUI0AtNQRUaTTzf66sXq2PWXiggaglug,5543
|
|
12
|
+
selectolax/utils.pxi,sha256=rPNMFqS0PRLkQPugwPfj-pnHCzkQzQ2cjIRMPZdR6R8,3453
|
|
13
|
+
selectolax/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
+
selectolax/parser.c,sha256=7APT8EQr5vuogGiSwCRNUi_S32l0YAVV8f1CYtVSkNM,2214750
|
|
15
|
+
selectolax/lexbor/util.pxi,sha256=Zq7S-zlyU3wOo49wGHQHnmmhpbkrcJm59ZCTPENcZQA,563
|
|
16
|
+
selectolax/lexbor/node.pxi,sha256=1XNzUwCbTYXy4D6rZtHxMpoJ9M-xoprB9wjdsiaWhr0,29346
|
|
17
|
+
selectolax/lexbor/selection.pxi,sha256=PqjvpL6H9uFcmcQWVGfML8FDsTO7tGoZujpA00g9pWk,6444
|
|
18
|
+
selectolax/lexbor/attrs.pxi,sha256=-518D5v70GgMJhtsxWrWcgIMnXg8afECpUubzq8kqqs,3102
|
|
19
|
+
selectolax/modest/util.pxi,sha256=aX9UnRNTITImHVBTlIs9efOd3EyugLq_Lwuo0zVTiuQ,551
|
|
20
|
+
selectolax/modest/node.pxi,sha256=NrMzJnQJDCmgTHpUxpMHDyAfQ_AS_n_Cr_2ryEKjyL0,32550
|
|
21
|
+
selectolax/modest/selection.pxi,sha256=S55MMxEW2B1oPExB_DRwPM46WoWZU73J3rFRZU1URuQ,6393
|
|
22
|
+
selectolax-0.3.25.dist-info/top_level.txt,sha256=e5MuEM2PrQzoDlWetkFli9uXSlxa_ktW5jJEihhaI1c,11
|
|
23
|
+
selectolax-0.3.25.dist-info/METADATA,sha256=XbR_TA9gBumRbVEUX7o7IlMBJgDVyAFF0sgmFxCP-v8,5928
|
|
24
|
+
selectolax-0.3.25.dist-info/WHEEL,sha256=keQAt_ShtBm9e5UyXQHlogOYqntwDnvEGVyQY3Tkxig,110
|
|
25
|
+
selectolax-0.3.25.dist-info/RECORD,,
|
|
26
|
+
selectolax-0.3.25.dist-info/LICENSE,sha256=kYggm2ZJzBgL79x1gCsYsx8rFIYP2IE-BdXRV3Rm0NU,1077
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
selectolax/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
selectolax/lexbor.pyi,sha256=N2u6ru6bMnJWsjRmzoctM9AHju41FZVr6Iij-gbkNOs,4945
|
|
3
|
-
selectolax/parser.pyx,sha256=2pKAUhY64htRxF5v64Ke7NRNSm1BHAS60wxrmmapJi0,12927
|
|
4
|
-
selectolax/base.pxi,sha256=eiPKlY9gG3l49qJoRQVLl1Ljza6z1k0A-met6sDPcqE,89
|
|
5
|
-
selectolax/utils.pxi,sha256=BCIzfmduFEuCxu_9vIEmjzrcwawOtyW31lDmfEU1wk4,544
|
|
6
|
-
selectolax/parser.c,sha256=NodaVPpvDLnKD8kX08eu9sZNYbzKhb3sT-rBNrRqTAU,1995870
|
|
7
|
-
selectolax/lexbor.cpython-312-i386-linux-musl.so,sha256=Rf4GJEohyyLpQNaWVJZ7mHERzqfOilpgT21MD9y-lFs,16576008
|
|
8
|
-
selectolax/parser.pyi,sha256=g8AED0DK6tpLxlgbYtmpzvuUQ1TMGX7hQBi6u5jlbHQ,9908
|
|
9
|
-
selectolax/lexbor.pyx,sha256=HhP93LGdiUmRY3F1ybFQHgz7NG25Ae1RFqhxx7HE9BY,10724
|
|
10
|
-
selectolax/lexbor.pxd,sha256=PwygBdb1blWAQcxXubZS5uffhgcXaqgySNMPFMT02-c,20958
|
|
11
|
-
selectolax/lexbor.c,sha256=Is-9U-NXnZNUGZDh7_HU7ztvIYUMXQVwOOTqujR895c,2134829
|
|
12
|
-
selectolax/parser.cpython-312-i386-linux-musl.so,sha256=DJsH37WQXEWpcUGi8InX1OJJfPTUQSjI165jzmPkdok,5911600
|
|
13
|
-
selectolax/__init__.py,sha256=0DJWRFlhuIq2te-rSQYt7CZ4a7_wDJZFxi5tMdQTiHM,175
|
|
14
|
-
selectolax/parser.pxd,sha256=zZlg1vHUg6o4MXaiwKAo5S5hO_DqBGc4_E10qJ2EcM4,24564
|
|
15
|
-
selectolax/modest/selection.pxi,sha256=S55MMxEW2B1oPExB_DRwPM46WoWZU73J3rFRZU1URuQ,6393
|
|
16
|
-
selectolax/modest/node.pxi,sha256=NrMzJnQJDCmgTHpUxpMHDyAfQ_AS_n_Cr_2ryEKjyL0,32550
|
|
17
|
-
selectolax/lexbor/selection.pxi,sha256=PqjvpL6H9uFcmcQWVGfML8FDsTO7tGoZujpA00g9pWk,6444
|
|
18
|
-
selectolax/lexbor/attrs.pxi,sha256=-518D5v70GgMJhtsxWrWcgIMnXg8afECpUubzq8kqqs,3102
|
|
19
|
-
selectolax/lexbor/node.pxi,sha256=1XNzUwCbTYXy4D6rZtHxMpoJ9M-xoprB9wjdsiaWhr0,29346
|
|
20
|
-
selectolax-0.3.24.dist-info/LICENSE,sha256=kYggm2ZJzBgL79x1gCsYsx8rFIYP2IE-BdXRV3Rm0NU,1077
|
|
21
|
-
selectolax-0.3.24.dist-info/top_level.txt,sha256=e5MuEM2PrQzoDlWetkFli9uXSlxa_ktW5jJEihhaI1c,11
|
|
22
|
-
selectolax-0.3.24.dist-info/RECORD,,
|
|
23
|
-
selectolax-0.3.24.dist-info/METADATA,sha256=tTXFaywjSCLtoHn-mCX-jZCSuhWVZ8CVJocJit3e4Po,5681
|
|
24
|
-
selectolax-0.3.24.dist-info/WHEEL,sha256=keQAt_ShtBm9e5UyXQHlogOYqntwDnvEGVyQY3Tkxig,110
|
|
File without changes
|
|
File without changes
|
|
File without changes
|