selectolax 0.3.29__cp311-cp311-musllinux_1_2_aarch64.whl → 0.4.0__cp311-cp311-musllinux_1_2_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +3 -5
- selectolax/lexbor/attrs.pxi +26 -9
- selectolax/lexbor/node.pxi +215 -60
- selectolax/lexbor/node_remove.pxi +29 -0
- selectolax/lexbor/selection.pxi +57 -26
- selectolax/lexbor/util.pxi +1 -0
- selectolax/lexbor.c +24654 -25072
- selectolax/lexbor.cpython-311-aarch64-linux-musl.so +0 -0
- selectolax/lexbor.pxd +44 -40
- selectolax/lexbor.pyi +847 -65
- selectolax/lexbor.pyx +94 -21
- selectolax/modest/node.pxi +49 -43
- selectolax/modest/selection.pxi +24 -22
- selectolax/modest/util.pxi +1 -0
- selectolax/parser.c +18015 -20066
- selectolax/parser.cpython-311-aarch64-linux-musl.so +0 -0
- selectolax/parser.pxd +17 -20
- selectolax/parser.pyi +493 -46
- selectolax/parser.pyx +41 -33
- selectolax/utils.pxi +13 -3
- selectolax-0.4.0.dist-info/METADATA +32 -0
- selectolax-0.4.0.dist-info/RECORD +27 -0
- {selectolax-0.3.29.dist-info → selectolax-0.4.0.dist-info}/WHEEL +1 -1
- selectolax-0.3.29.dist-info/METADATA +0 -183
- selectolax-0.3.29.dist-info/RECORD +0 -26
- {selectolax-0.3.29.dist-info → selectolax-0.4.0.dist-info/licenses}/LICENSE +0 -0
- {selectolax-0.3.29.dist-info → selectolax-0.4.0.dist-info}/top_level.txt +0 -0
selectolax/parser.pyx
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
|
|
2
|
-
from cpython cimport bool
|
|
2
|
+
from cpython.bool cimport bool
|
|
3
|
+
from cpython.exc cimport PyErr_SetObject
|
|
3
4
|
|
|
4
5
|
include "modest/selection.pxi"
|
|
5
6
|
include "modest/node.pxi"
|
|
@@ -61,8 +62,7 @@ cdef class HTMLParser:
|
|
|
61
62
|
|
|
62
63
|
"""
|
|
63
64
|
|
|
64
|
-
node = Node()
|
|
65
|
-
node._init(self.html_tree.node_html, self)
|
|
65
|
+
cdef Node node = Node.new(self.html_tree.node_html, self)
|
|
66
66
|
return node.css(query)
|
|
67
67
|
|
|
68
68
|
def css_first(self, str query, default=None, strict=False):
|
|
@@ -72,9 +72,9 @@ cdef class HTMLParser:
|
|
|
72
72
|
----------
|
|
73
73
|
|
|
74
74
|
query : str
|
|
75
|
-
default :
|
|
75
|
+
default : Any, default None
|
|
76
76
|
Default value to return if there is no match.
|
|
77
|
-
strict: bool, default
|
|
77
|
+
strict: bool, default False
|
|
78
78
|
Set to True if you want to check if there is strictly only one match in the document.
|
|
79
79
|
|
|
80
80
|
|
|
@@ -84,12 +84,11 @@ cdef class HTMLParser:
|
|
|
84
84
|
|
|
85
85
|
"""
|
|
86
86
|
|
|
87
|
-
node = Node()
|
|
88
|
-
node._init(self.html_tree.node_html, self)
|
|
87
|
+
cdef Node node = Node.new(self.html_tree.node_html, self)
|
|
89
88
|
return node.css_first(query, default, strict)
|
|
90
89
|
|
|
91
90
|
cdef void _detect_encoding(self, char* html, size_t html_len) nogil:
|
|
92
|
-
cdef myencoding_t encoding = MyENCODING_DEFAULT
|
|
91
|
+
cdef myencoding_t encoding = MyENCODING_DEFAULT
|
|
93
92
|
|
|
94
93
|
if self.use_meta_tags:
|
|
95
94
|
encoding = myencoding_prescan_stream_to_determine_encoding(html, html_len)
|
|
@@ -102,7 +101,7 @@ cdef class HTMLParser:
|
|
|
102
101
|
|
|
103
102
|
self._encoding = encoding
|
|
104
103
|
|
|
105
|
-
cdef _parse_html(self, char* html, size_t html_len):
|
|
104
|
+
cdef int _parse_html(self, char* html, size_t html_len) except -1:
|
|
106
105
|
cdef myhtml_t* myhtml
|
|
107
106
|
cdef mystatus_t status
|
|
108
107
|
|
|
@@ -111,23 +110,28 @@ cdef class HTMLParser:
|
|
|
111
110
|
status = myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0)
|
|
112
111
|
|
|
113
112
|
if status != 0:
|
|
114
|
-
|
|
113
|
+
PyErr_SetObject(RuntimeError, "Can't init MyHTML object.")
|
|
114
|
+
return -1
|
|
115
115
|
|
|
116
116
|
with nogil:
|
|
117
117
|
self.html_tree = myhtml_tree_create()
|
|
118
118
|
status = myhtml_tree_init(self.html_tree, myhtml)
|
|
119
119
|
|
|
120
120
|
if status != 0:
|
|
121
|
-
|
|
121
|
+
PyErr_SetObject(RuntimeError, "Can't init MyHTML Tree object.")
|
|
122
|
+
return -1
|
|
122
123
|
|
|
123
124
|
with nogil:
|
|
124
125
|
status = myhtml_parse(self.html_tree, self._encoding, html, html_len)
|
|
125
126
|
|
|
126
127
|
if status != 0:
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
assert self.html_tree.node_html != NULL
|
|
128
|
+
PyErr_SetObject(RuntimeError, "Can't parse HTML (status code: %d)" % status)
|
|
129
|
+
return -1
|
|
130
130
|
|
|
131
|
+
if self.html_tree.node_html == NULL:
|
|
132
|
+
PyErr_SetObject(RuntimeError, "html_tree is still NULL even after parsing ")
|
|
133
|
+
return -1
|
|
134
|
+
return 0
|
|
131
135
|
|
|
132
136
|
@property
|
|
133
137
|
def input_encoding(self):
|
|
@@ -147,9 +151,11 @@ cdef class HTMLParser:
|
|
|
147
151
|
def root(self):
|
|
148
152
|
"""Returns root node."""
|
|
149
153
|
if self.html_tree and self.html_tree.node_html:
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
154
|
+
try:
|
|
155
|
+
return Node.new(self.html_tree.node_html, self)
|
|
156
|
+
except Exception:
|
|
157
|
+
# If Node creation or initialization fails, return None
|
|
158
|
+
return None
|
|
153
159
|
return None
|
|
154
160
|
|
|
155
161
|
@property
|
|
@@ -159,9 +165,7 @@ cdef class HTMLParser:
|
|
|
159
165
|
head = myhtml_tree_get_node_head(self.html_tree)
|
|
160
166
|
|
|
161
167
|
if head != NULL:
|
|
162
|
-
|
|
163
|
-
node._init(head, self)
|
|
164
|
-
return node
|
|
168
|
+
return Node.new(head, self)
|
|
165
169
|
return None
|
|
166
170
|
|
|
167
171
|
@property
|
|
@@ -171,10 +175,7 @@ cdef class HTMLParser:
|
|
|
171
175
|
body = myhtml_tree_get_node_body(self.html_tree)
|
|
172
176
|
|
|
173
177
|
if body != NULL:
|
|
174
|
-
|
|
175
|
-
node._init(body, self)
|
|
176
|
-
return node
|
|
177
|
-
|
|
178
|
+
return Node.new(body, self)
|
|
178
179
|
return None
|
|
179
180
|
|
|
180
181
|
def tags(self, str name):
|
|
@@ -185,9 +186,15 @@ cdef class HTMLParser:
|
|
|
185
186
|
name : str (e.g. div)
|
|
186
187
|
|
|
187
188
|
"""
|
|
189
|
+
# Validate tag name
|
|
190
|
+
if not name:
|
|
191
|
+
raise ValueError("Tag name cannot be empty")
|
|
192
|
+
if len(name) > 100: # Reasonable limit for tag names
|
|
193
|
+
raise ValueError("Tag name is too long")
|
|
194
|
+
|
|
188
195
|
cdef myhtml_collection_t* collection = NULL
|
|
189
196
|
pybyte_name = name.encode('UTF-8')
|
|
190
|
-
cdef mystatus_t status = 0
|
|
197
|
+
cdef mystatus_t status = 0
|
|
191
198
|
|
|
192
199
|
result = list()
|
|
193
200
|
collection = myhtml_get_nodes_by_name(self.html_tree, NULL, pybyte_name, len(pybyte_name), &status)
|
|
@@ -197,8 +204,7 @@ cdef class HTMLParser:
|
|
|
197
204
|
|
|
198
205
|
if status == 0:
|
|
199
206
|
for i in range(collection.length):
|
|
200
|
-
node = Node()
|
|
201
|
-
node._init(collection.list[i], self)
|
|
207
|
+
node = Node.new(collection.list[i], self)
|
|
202
208
|
result.append(node)
|
|
203
209
|
|
|
204
210
|
myhtml_collection_destroy(collection)
|
|
@@ -248,7 +254,7 @@ cdef class HTMLParser:
|
|
|
248
254
|
"""
|
|
249
255
|
cdef myhtml_collection_t* collection = NULL
|
|
250
256
|
|
|
251
|
-
cdef mystatus_t status = 0
|
|
257
|
+
cdef mystatus_t status = 0
|
|
252
258
|
|
|
253
259
|
for tag in tags:
|
|
254
260
|
pybyte_name = tag.encode('UTF-8')
|
|
@@ -268,7 +274,6 @@ cdef class HTMLParser:
|
|
|
268
274
|
|
|
269
275
|
myhtml_collection_destroy(collection)
|
|
270
276
|
|
|
271
|
-
|
|
272
277
|
def unwrap_tags(self, list tags, delete_empty : bool = False):
|
|
273
278
|
"""Unwraps specified tags from the HTML tree.
|
|
274
279
|
|
|
@@ -295,9 +300,9 @@ cdef class HTMLParser:
|
|
|
295
300
|
@property
|
|
296
301
|
def html(self):
|
|
297
302
|
"""Return HTML representation of the page."""
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
node.
|
|
303
|
+
cdef Node node
|
|
304
|
+
if self.html_tree != NULL and self.html_tree.document != NULL:
|
|
305
|
+
node = Node.new(self.html_tree.document, self)
|
|
301
306
|
return node.html
|
|
302
307
|
return None
|
|
303
308
|
|
|
@@ -351,6 +356,7 @@ cdef class HTMLParser:
|
|
|
351
356
|
|
|
352
357
|
def css_matches(self, str selector):
|
|
353
358
|
return self.root.css_matches(selector)
|
|
359
|
+
|
|
354
360
|
def merge_text_nodes(self):
|
|
355
361
|
"""Iterates over all text nodes and merges all text nodes that are close to each other.
|
|
356
362
|
|
|
@@ -370,6 +376,7 @@ cdef class HTMLParser:
|
|
|
370
376
|
"John Doe"
|
|
371
377
|
"""
|
|
372
378
|
return self.root.merge_text_nodes()
|
|
379
|
+
|
|
373
380
|
@staticmethod
|
|
374
381
|
cdef HTMLParser from_tree(
|
|
375
382
|
myhtml_tree_t * tree, bytes raw_html, bint detect_encoding, bint use_meta_tags, str decode_errors,
|
|
@@ -386,13 +393,13 @@ cdef class HTMLParser:
|
|
|
386
393
|
obj.cached_script_srcs = None
|
|
387
394
|
return obj
|
|
388
395
|
|
|
389
|
-
|
|
390
396
|
def clone(self):
|
|
391
397
|
"""Clone the current tree."""
|
|
392
398
|
cdef myhtml_t* myhtml
|
|
393
399
|
cdef mystatus_t status
|
|
394
400
|
cdef myhtml_tree_t* html_tree
|
|
395
401
|
cdef myhtml_tree_node_t* node
|
|
402
|
+
cdef HTMLParser cls
|
|
396
403
|
|
|
397
404
|
with nogil:
|
|
398
405
|
myhtml = myhtml_create()
|
|
@@ -428,6 +435,7 @@ cdef class HTMLParser:
|
|
|
428
435
|
if self.html_tree != NULL:
|
|
429
436
|
myhtml = self.html_tree.myhtml
|
|
430
437
|
myhtml_tree_destroy(self.html_tree)
|
|
438
|
+
self.html_tree = NULL # Prevent double-free
|
|
431
439
|
if myhtml != NULL:
|
|
432
440
|
myhtml_destroy(myhtml)
|
|
433
441
|
|
selectolax/utils.pxi
CHANGED
|
@@ -4,6 +4,16 @@ MAX_HTML_INPUT_SIZE = 250e+7
|
|
|
4
4
|
|
|
5
5
|
ParserCls = Union[Type["HTMLParser"], Type["LexborHTMLParser"]]
|
|
6
6
|
Parser = Union["HTMLParser", "LexborHTMLParser"]
|
|
7
|
+
FRAGMENT = Literal[
|
|
8
|
+
"document",
|
|
9
|
+
"fragment",
|
|
10
|
+
"head",
|
|
11
|
+
"body",
|
|
12
|
+
"head_and_body",
|
|
13
|
+
"document_no_head",
|
|
14
|
+
"document_no_body",
|
|
15
|
+
"document_no_head_no_body",
|
|
16
|
+
]
|
|
7
17
|
|
|
8
18
|
|
|
9
19
|
def preprocess_input(html, decode_errors='ignore'):
|
|
@@ -29,10 +39,10 @@ def get_fragment_type(
|
|
|
29
39
|
html: str,
|
|
30
40
|
parser_cls: ParserCls,
|
|
31
41
|
tree: Optional[Parser] = None,
|
|
32
|
-
) ->
|
|
42
|
+
) -> FRAGMENT:
|
|
33
43
|
if not tree:
|
|
34
44
|
tree = parser_cls(html)
|
|
35
|
-
|
|
45
|
+
|
|
36
46
|
import re
|
|
37
47
|
html_re = re.compile(r"<html|<body|<head(?!er)", re.IGNORECASE)
|
|
38
48
|
|
|
@@ -49,7 +59,7 @@ def get_fragment_type(
|
|
|
49
59
|
|
|
50
60
|
if has_html and has_head and has_body:
|
|
51
61
|
break
|
|
52
|
-
|
|
62
|
+
|
|
53
63
|
if has_html and has_head and has_body:
|
|
54
64
|
return "document"
|
|
55
65
|
elif has_html and not has_head and has_body:
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: selectolax
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: Fast HTML5 parser with CSS selectors.
|
|
5
|
+
Home-page: https://github.com/rushter/selectolax
|
|
6
|
+
Author: Artem Golubin
|
|
7
|
+
Author-email: Artem Golubin <me@rushter.com>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Project-URL: Repository, https://github.com/rushter/selectolax
|
|
10
|
+
Project-URL: Documentation, https://selectolax.readthedocs.io/en/latest/parser.html
|
|
11
|
+
Project-URL: Changelog, https://github.com/rushter/selectolax/blob/master/CHANGES.md
|
|
12
|
+
Keywords: selectolax,html,parser,css,fast
|
|
13
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
14
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
15
|
+
Classifier: Topic :: Internet
|
|
16
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
17
|
+
Classifier: Intended Audience :: Developers
|
|
18
|
+
Classifier: Natural Language :: English
|
|
19
|
+
Classifier: Programming Language :: Python :: 3
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
25
|
+
Requires-Python: >=3.9
|
|
26
|
+
Description-Content-Type: text/x-rst
|
|
27
|
+
License-File: LICENSE
|
|
28
|
+
Provides-Extra: cython
|
|
29
|
+
Requires-Dist: Cython; extra == "cython"
|
|
30
|
+
Dynamic: author
|
|
31
|
+
Dynamic: home-page
|
|
32
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
selectolax/__init__.py,sha256=CnY6a5BeJexKaFN_b2L28F5AVD1jPM1lFz9kfS6RC5w,148
|
|
2
|
+
selectolax/base.pxi,sha256=eiPKlY9gG3l49qJoRQVLl1Ljza6z1k0A-met6sDPcqE,89
|
|
3
|
+
selectolax/lexbor.c,sha256=uyvVzFr0FZyq_yL-qT-ZZCizC69Noq0aeM5jSruq6wM,2445661
|
|
4
|
+
selectolax/lexbor.cpython-311-aarch64-linux-musl.so,sha256=KDsBiCGiSGMaX6LTdJvEpiB_nr1iwbXwCOU5YhI6IEc,4135632
|
|
5
|
+
selectolax/lexbor.pxd,sha256=cAitQeHgGxp5Aac-o5aaOyg6_IiOpp3Rg0JNlz8Cstk,21652
|
|
6
|
+
selectolax/lexbor.pyi,sha256=MDQ4YQWcywG3oeSITWifMkCsa09MmPbyXMQq06wqwAY,30092
|
|
7
|
+
selectolax/lexbor.pyx,sha256=-QsF8Ru8DvWEEy3AIjXDdoKTG5saocX-HkTE_feS6tQ,13468
|
|
8
|
+
selectolax/parser.c,sha256=pFUbwlf1TmgyaPICOdTdEAZIjaytK6foTCSck95-iZ4,2210172
|
|
9
|
+
selectolax/parser.cpython-311-aarch64-linux-musl.so,sha256=v8HPOUvMZKWxkyV7zPCWuUPmexBti-agX12zP5_AXCo,3348576
|
|
10
|
+
selectolax/parser.pxd,sha256=BQSlDGibVfqFDhfqX6l5sBnfkHEETxlj-eSpGWERKEs,24618
|
|
11
|
+
selectolax/parser.pyi,sha256=qi9AHy_DWalANSOVTN6gbtbf-YJZCfE7i12aWJbrUp8,24929
|
|
12
|
+
selectolax/parser.pyx,sha256=sBq2_HR83Ek8yqnFBYrG2xBBCM4S6Jiiie_xq5O_twE,13684
|
|
13
|
+
selectolax/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
+
selectolax/utils.pxi,sha256=_g-ZLprPgbqv7BLs-WEe8IhbDd_QTcfirz_NEyR1Yww,3506
|
|
15
|
+
selectolax/lexbor/attrs.pxi,sha256=eH90zJYHicffTzC7peIitHkOqyIw3xzomhJHxJv9hP8,3858
|
|
16
|
+
selectolax/lexbor/node.pxi,sha256=Sj5Kx_I2vBarZRNrYhPk2TufhEYYNlV9wnSbLACyZMQ,35311
|
|
17
|
+
selectolax/lexbor/node_remove.pxi,sha256=iqJ2PPNvQmK2dq8kJLXiZawoGf1Az3MpbrlQI6k4jDM,760
|
|
18
|
+
selectolax/lexbor/selection.pxi,sha256=ZJ5ed7YgxvcsOW_qPbMhUQRKgChl9cih1n1d5elfTZ8,8030
|
|
19
|
+
selectolax/lexbor/util.pxi,sha256=hqMQU1O_5O82ThjUzk8NxQPl-Kg29DDGFFpC46LcejI,564
|
|
20
|
+
selectolax/modest/node.pxi,sha256=l0aQf2Ojpzxh-L-0KxLetG7uGgGhkV7Cqgfy8O_5ch4,33786
|
|
21
|
+
selectolax/modest/selection.pxi,sha256=m4GDpl0aI7lSWHFeBBheroUKDrZgJcc6uVubtzrXL1M,6508
|
|
22
|
+
selectolax/modest/util.pxi,sha256=di9cLmAyuGFXmiuptZ7Fz1SgkCf7hmiZLnpKCKEKsUc,552
|
|
23
|
+
selectolax-0.4.0.dist-info/METADATA,sha256=xFNpvUeepMxneDCjZoP5kTv4nBQccf5_wyWieI3BFnY,1287
|
|
24
|
+
selectolax-0.4.0.dist-info/WHEEL,sha256=LNZuceeFd58B9QVekwESAWqB09LEv_tgGDcYQrpkG9U,113
|
|
25
|
+
selectolax-0.4.0.dist-info/top_level.txt,sha256=e5MuEM2PrQzoDlWetkFli9uXSlxa_ktW5jJEihhaI1c,11
|
|
26
|
+
selectolax-0.4.0.dist-info/RECORD,,
|
|
27
|
+
selectolax-0.4.0.dist-info/licenses/LICENSE,sha256=MYCcM-Cv_rC2-lQiwDumin0E-rMXAhK-qIGGA29434Y,1077
|
|
@@ -1,183 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.1
|
|
2
|
-
Name: selectolax
|
|
3
|
-
Version: 0.3.29
|
|
4
|
-
Summary: Fast HTML5 parser with CSS selectors.
|
|
5
|
-
Home-page: https://github.com/rushter/selectolax
|
|
6
|
-
Author: Artem Golubin
|
|
7
|
-
Author-email: me@rushter.com
|
|
8
|
-
License: MIT license
|
|
9
|
-
Project-URL: Source code, https://github.com/rushter/selectolax
|
|
10
|
-
Keywords: selectolax
|
|
11
|
-
Classifier: Development Status :: 5 - Production/Stable
|
|
12
|
-
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
13
|
-
Classifier: Topic :: Internet
|
|
14
|
-
Classifier: Topic :: Internet :: WWW/HTTP
|
|
15
|
-
Classifier: Intended Audience :: Developers
|
|
16
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
-
Classifier: Natural Language :: English
|
|
18
|
-
Classifier: Programming Language :: Python :: 3
|
|
19
|
-
Classifier: Programming Language :: Python :: 3.7
|
|
20
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
21
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
22
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
23
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
24
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
25
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
26
|
-
License-File: LICENSE
|
|
27
|
-
Provides-Extra: cython
|
|
28
|
-
Requires-Dist: Cython==3.0.11; extra == "cython"
|
|
29
|
-
|
|
30
|
-
.. image:: docs/logo.png
|
|
31
|
-
:alt: selectolax logo
|
|
32
|
-
|
|
33
|
-
-------------------------
|
|
34
|
-
|
|
35
|
-
.. image:: https://img.shields.io/pypi/v/selectolax.svg
|
|
36
|
-
:target: https://pypi.python.org/pypi/selectolax
|
|
37
|
-
|
|
38
|
-
A fast HTML5 parser with CSS selectors using `Modest <https://github.com/lexborisov/Modest/>`_ and
|
|
39
|
-
`Lexbor <https://github.com/lexbor/lexbor>`_ engines.
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
Installation
|
|
43
|
-
------------
|
|
44
|
-
From PyPI using pip:
|
|
45
|
-
|
|
46
|
-
.. code-block:: bash
|
|
47
|
-
|
|
48
|
-
pip install selectolax
|
|
49
|
-
|
|
50
|
-
If installation fails due to compilation errors, you may need to install `Cython <https://github.com/cython/cython>`_:
|
|
51
|
-
|
|
52
|
-
.. code-block:: bash
|
|
53
|
-
|
|
54
|
-
pip install selectolax[cython]
|
|
55
|
-
|
|
56
|
-
This usually happens when you try to install an outdated version of selectolax on a newer version of Python.
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
Development version from GitHub:
|
|
60
|
-
|
|
61
|
-
.. code-block:: bash
|
|
62
|
-
|
|
63
|
-
git clone --recursive https://github.com/rushter/selectolax
|
|
64
|
-
cd selectolax
|
|
65
|
-
pip install -r requirements_dev.txt
|
|
66
|
-
python setup.py install
|
|
67
|
-
|
|
68
|
-
How to compile selectolax while developing:
|
|
69
|
-
|
|
70
|
-
.. code-block:: bash
|
|
71
|
-
|
|
72
|
-
make clean
|
|
73
|
-
make dev
|
|
74
|
-
|
|
75
|
-
Basic examples
|
|
76
|
-
--------------
|
|
77
|
-
|
|
78
|
-
Here are some basic examples to get you started with selectolax:
|
|
79
|
-
|
|
80
|
-
Parsing HTML and extracting text:
|
|
81
|
-
|
|
82
|
-
.. code:: python
|
|
83
|
-
|
|
84
|
-
In [1]: from selectolax.parser import HTMLParser
|
|
85
|
-
...:
|
|
86
|
-
...: html = """
|
|
87
|
-
...: <h1 id="title" data-updated="20201101">Hi there</h1>
|
|
88
|
-
...: <div class="post">Lorem Ipsum is simply dummy text of the printing and typesetting industry. </div>
|
|
89
|
-
...: <div class="post">Lorem ipsum dolor sit amet, consectetur adipiscing elit.</div>
|
|
90
|
-
...: """
|
|
91
|
-
...: tree = HTMLParser(html)
|
|
92
|
-
|
|
93
|
-
In [2]: tree.css_first('h1#title').text()
|
|
94
|
-
Out[2]: 'Hi there'
|
|
95
|
-
|
|
96
|
-
In [3]: tree.css_first('h1#title').attributes
|
|
97
|
-
Out[3]: {'id': 'title', 'data-updated': '20201101'}
|
|
98
|
-
|
|
99
|
-
In [4]: [node.text() for node in tree.css('.post')]
|
|
100
|
-
Out[4]:
|
|
101
|
-
['Lorem Ipsum is simply dummy text of the printing and typesetting industry. ',
|
|
102
|
-
'Lorem ipsum dolor sit amet, consectetur adipiscing elit.']
|
|
103
|
-
|
|
104
|
-
Using advanced CSS selectors:
|
|
105
|
-
|
|
106
|
-
.. code:: python
|
|
107
|
-
|
|
108
|
-
In [1]: html = "<div><p id=p1><p id=p2><p id=p3><a>link</a><p id=p4><p id=p5>text<p id=p6></div>"
|
|
109
|
-
...: selector = "div > :nth-child(2n+1):not(:has(a))"
|
|
110
|
-
|
|
111
|
-
In [2]: for node in HTMLParser(html).css(selector):
|
|
112
|
-
...: print(node.attributes, node.text(), node.tag)
|
|
113
|
-
...: print(node.parent.tag)
|
|
114
|
-
...: print(node.html)
|
|
115
|
-
...:
|
|
116
|
-
{'id': 'p1'} p
|
|
117
|
-
div
|
|
118
|
-
<p id="p1"></p>
|
|
119
|
-
{'id': 'p5'} text p
|
|
120
|
-
div
|
|
121
|
-
<p id="p5">text</p>
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
* `Detailed overview <https://github.com/rushter/selectolax/blob/master/examples/walkthrough.ipynb>`_
|
|
125
|
-
|
|
126
|
-
Available backends
|
|
127
|
-
------------------
|
|
128
|
-
|
|
129
|
-
Selectolax supports two backends: ``Modest`` and ``Lexbor``. By default, all examples use the Modest backend.
|
|
130
|
-
Most of the features between backends are almost identical, but there are still some differences.
|
|
131
|
-
|
|
132
|
-
As of 2024, the preferred backend is ``Lexbor``. The ``Modest`` backend is still available for compatibility reasons
|
|
133
|
-
and the underlying C library that selectolax uses is not maintained anymore.
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
To use ``lexbor``, just import the parser and use it in the similar way to the `HTMLParser`.
|
|
137
|
-
|
|
138
|
-
.. code:: python
|
|
139
|
-
|
|
140
|
-
In [1]: from selectolax.lexbor import LexborHTMLParser
|
|
141
|
-
|
|
142
|
-
In [2]: html = """
|
|
143
|
-
...: <title>Hi there</title>
|
|
144
|
-
...: <div id="updated">2021-08-15</div>
|
|
145
|
-
...: """
|
|
146
|
-
|
|
147
|
-
In [3]: parser = LexborHTMLParser(html)
|
|
148
|
-
In [4]: parser.root.css_first("#updated").text()
|
|
149
|
-
Out[4]: '2021-08-15'
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
Simple Benchmark
|
|
153
|
-
----------------
|
|
154
|
-
|
|
155
|
-
* Extract title, links, scripts and a meta tag from main pages of top 754 domains. See ``examples/benchmark.py`` for more information.
|
|
156
|
-
|
|
157
|
-
============================ ===========
|
|
158
|
-
Package Time
|
|
159
|
-
============================ ===========
|
|
160
|
-
Beautiful Soup (html.parser) 61.02 sec.
|
|
161
|
-
lxml / Beautiful Soup (lxml) 9.09 sec.
|
|
162
|
-
html5_parser 16.10 sec.
|
|
163
|
-
selectolax (Modest) 2.94 sec.
|
|
164
|
-
selectolax (Lexbor) 2.39 sec.
|
|
165
|
-
============================ ===========
|
|
166
|
-
|
|
167
|
-
Links
|
|
168
|
-
-----
|
|
169
|
-
|
|
170
|
-
* `selectolax API reference <http://selectolax.readthedocs.io/en/latest/parser.html>`_
|
|
171
|
-
* `Video introduction to web scraping using selectolax <https://youtu.be/HpRsfpPuUzE>`_
|
|
172
|
-
* `How to Scrape 7k Products with Python using selectolax and httpx <https://www.youtube.com/watch?v=XpGvq755J2U>`_
|
|
173
|
-
* `Detailed overview <https://github.com/rushter/selectolax/blob/master/examples/walkthrough.ipynb>`_
|
|
174
|
-
* `Modest introduction <https://lexborisov.github.io/Modest/>`_
|
|
175
|
-
* `Modest benchmark <http://lexborisov.github.io/benchmark-html-persers/>`_
|
|
176
|
-
* `Python benchmark <https://rushter.com/blog/python-fast-html-parser/>`_
|
|
177
|
-
* `Another Python benchmark <https://www.peterbe.com/plog/selectolax-or-pyquery>`_
|
|
178
|
-
|
|
179
|
-
License
|
|
180
|
-
-------
|
|
181
|
-
|
|
182
|
-
* Modest engine — `LGPL2.1 <https://github.com/lexborisov/Modest/blob/master/LICENSE>`_
|
|
183
|
-
* selectolax - `MIT <https://github.com/rushter/selectolax/blob/master/LICENSE>`_
|
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
selectolax-0.3.29.dist-info/top_level.txt,sha256=e5MuEM2PrQzoDlWetkFli9uXSlxa_ktW5jJEihhaI1c,11
|
|
2
|
-
selectolax-0.3.29.dist-info/METADATA,sha256=psjueWhyKDLmG66FhXi1EKCDxfmA_4SLEXS84DQQZt0,6060
|
|
3
|
-
selectolax-0.3.29.dist-info/LICENSE,sha256=MYCcM-Cv_rC2-lQiwDumin0E-rMXAhK-qIGGA29434Y,1077
|
|
4
|
-
selectolax-0.3.29.dist-info/RECORD,,
|
|
5
|
-
selectolax-0.3.29.dist-info/WHEEL,sha256=LOj1sxDbNUQjIM1BHAja1a1FTFvtDnFFrOvKZxi7pio,113
|
|
6
|
-
selectolax/utils.pxi,sha256=uB0-0naFQPy1JpR2DiIlKnyLyC76yWLnUHSuH11xg6s,3459
|
|
7
|
-
selectolax/lexbor.pyi,sha256=WFfpFEmhmvUc3qRJJ4mZxiXAePqdg2_Ud35eQ4jlaqU,6610
|
|
8
|
-
selectolax/base.pxi,sha256=eiPKlY9gG3l49qJoRQVLl1Ljza6z1k0A-met6sDPcqE,89
|
|
9
|
-
selectolax/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
-
selectolax/parser.pxd,sha256=zZlg1vHUg6o4MXaiwKAo5S5hO_DqBGc4_E10qJ2EcM4,24564
|
|
11
|
-
selectolax/parser.c,sha256=Xl4naiO53VdzRcVWqws6koZNRRqpErvTy09qHIrvbtQ,2224633
|
|
12
|
-
selectolax/lexbor.pyx,sha256=EnFRvKRVoRhxg6r4vcQ89eWYUDFRlCRDm4cBRlQZnDY,11002
|
|
13
|
-
selectolax/lexbor.pxd,sha256=PwygBdb1blWAQcxXubZS5uffhgcXaqgySNMPFMT02-c,20958
|
|
14
|
-
selectolax/parser.cpython-311-aarch64-linux-musl.so,sha256=3yKg2EVh0kV_RzxAtf78hRQl7IIyL73gY5bqbAk_O3E,7575120
|
|
15
|
-
selectolax/lexbor.c,sha256=nmcIPSc9LZpbSFQLylDDVTDVU0uy0xHp4NiS_n6NM3k,2368108
|
|
16
|
-
selectolax/parser.pyx,sha256=GCdlRtpNKgCYsRS6iOnjKr_5GhZNcAaFMBQZSWLye8A,13093
|
|
17
|
-
selectolax/parser.pyi,sha256=6S9RKAevzv9zBYL1v12qQojkMst35yzy3TnD3HtZZo4,11275
|
|
18
|
-
selectolax/lexbor.cpython-311-aarch64-linux-musl.so,sha256=lEC-T6Xn1dS5EVi7TrTBrio-hVIofmnIPOTYhNa4dAo,9239736
|
|
19
|
-
selectolax/__init__.py,sha256=J5aFJ2fot0JTvAyn1K0rx3Ux6jaDJXJF1Uo4Zct_1Jw,175
|
|
20
|
-
selectolax/modest/selection.pxi,sha256=S55MMxEW2B1oPExB_DRwPM46WoWZU73J3rFRZU1URuQ,6393
|
|
21
|
-
selectolax/modest/node.pxi,sha256=8lX5cmGbX_X4Z9OuPpZ-P-5jne5k_-ck1hU-152e20Y,33315
|
|
22
|
-
selectolax/modest/util.pxi,sha256=aX9UnRNTITImHVBTlIs9efOd3EyugLq_Lwuo0zVTiuQ,551
|
|
23
|
-
selectolax/lexbor/selection.pxi,sha256=FA6npHtXjJjvS8H2_e_LS53i5zbpGYgb5zTh5Tf_XQY,6571
|
|
24
|
-
selectolax/lexbor/attrs.pxi,sha256=Ol2RNzXZAcWaqJdDBUe0ChOCcA8HC990Hjncj98XAkw,3138
|
|
25
|
-
selectolax/lexbor/node.pxi,sha256=_-zlshCku6gmCcpIuxfbkqZHVyihReyDmEK3QgYLVdg,29884
|
|
26
|
-
selectolax/lexbor/util.pxi,sha256=Zq7S-zlyU3wOo49wGHQHnmmhpbkrcJm59ZCTPENcZQA,563
|
|
File without changes
|
|
File without changes
|