selectolax 0.3.32__cp310-cp310-win_amd64.whl → 0.3.34__cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +3 -5
- selectolax/lexbor/attrs.pxi +0 -1
- selectolax/lexbor/node.pxi +99 -41
- selectolax/lexbor/selection.pxi +27 -25
- selectolax/lexbor/util.pxi +1 -0
- selectolax/lexbor.c +6412 -6702
- selectolax/lexbor.cp310-win_amd64.pyd +0 -0
- selectolax/lexbor.pxd +32 -35
- selectolax/lexbor.pyi +51 -1
- selectolax/lexbor.pyx +48 -17
- selectolax/modest/node.pxi +37 -36
- selectolax/modest/selection.pxi +24 -22
- selectolax/modest/util.pxi +1 -0
- selectolax/parser.c +4524 -5291
- selectolax/parser.cp310-win_amd64.pyd +0 -0
- selectolax/parser.pxd +17 -20
- selectolax/parser.pyi +2 -2
- selectolax/parser.pyx +28 -31
- selectolax/utils.pxi +13 -3
- selectolax-0.3.34.dist-info/METADATA +32 -0
- selectolax-0.3.34.dist-info/RECORD +26 -0
- selectolax-0.3.32.dist-info/METADATA +0 -187
- selectolax-0.3.32.dist-info/RECORD +0 -26
- {selectolax-0.3.32.dist-info → selectolax-0.3.34.dist-info}/WHEEL +0 -0
- {selectolax-0.3.32.dist-info → selectolax-0.3.34.dist-info}/licenses/LICENSE +0 -0
- {selectolax-0.3.32.dist-info → selectolax-0.3.34.dist-info}/top_level.txt +0 -0
|
Binary file
|
selectolax/parser.pxd
CHANGED
|
@@ -33,9 +33,9 @@ cdef extern from "myhtml/myhtml.h" nogil:
|
|
|
33
33
|
MyHTML_OPTIONS_PARSE_MODE_SEPARATELY = 0x04
|
|
34
34
|
|
|
35
35
|
ctypedef struct myhtml_collection_t:
|
|
36
|
-
myhtml_tree_node_t **list
|
|
37
|
-
size_t size
|
|
38
|
-
size_t length
|
|
36
|
+
myhtml_tree_node_t **list
|
|
37
|
+
size_t size
|
|
38
|
+
size_t length
|
|
39
39
|
|
|
40
40
|
ctypedef struct myhtml_tree_node_t:
|
|
41
41
|
myhtml_tree_node_flags flags
|
|
@@ -86,7 +86,6 @@ cdef extern from "myhtml/myhtml.h" nogil:
|
|
|
86
86
|
MyHTML_TOKEN_TYPE_COMMENT = 0x400
|
|
87
87
|
MyHTML_TOKEN_TYPE_NULL = 0x800
|
|
88
88
|
|
|
89
|
-
|
|
90
89
|
ctypedef enum myhtml_tags:
|
|
91
90
|
MyHTML_TAG__UNDEF = 0x000
|
|
92
91
|
MyHTML_TAG__TEXT = 0x001
|
|
@@ -392,8 +391,6 @@ cdef extern from "myhtml/myhtml.h" nogil:
|
|
|
392
391
|
size_t raw_value_begin
|
|
393
392
|
size_t raw_value_length
|
|
394
393
|
|
|
395
|
-
|
|
396
|
-
|
|
397
394
|
myhtml_t * myhtml_create()
|
|
398
395
|
mystatus_t myhtml_init(myhtml_t* myhtml, myhtml_options opt, size_t thread_count, size_t queue_size)
|
|
399
396
|
myhtml_tree_t * myhtml_tree_create()
|
|
@@ -415,7 +412,7 @@ cdef extern from "myhtml/myhtml.h" nogil:
|
|
|
415
412
|
myhtml_tree_node_t* myhtml_tree_get_node_head(myhtml_tree_t* tree)
|
|
416
413
|
|
|
417
414
|
myhtml_collection_t* myhtml_get_nodes_by_name(myhtml_tree_t* tree, myhtml_collection_t *collection,
|
|
418
|
-
|
|
415
|
+
const char* name, size_t length, mystatus_t *status)
|
|
419
416
|
|
|
420
417
|
void myhtml_node_delete(myhtml_tree_node_t *node)
|
|
421
418
|
void myhtml_node_delete_recursive(myhtml_tree_node_t *node)
|
|
@@ -427,7 +424,7 @@ cdef extern from "myhtml/myhtml.h" nogil:
|
|
|
427
424
|
myhtml_tree_node_t * myhtml_node_append_child(myhtml_tree_node_t* target, myhtml_tree_node_t* node)
|
|
428
425
|
|
|
429
426
|
mycore_string_t * myhtml_node_text_set(myhtml_tree_node_t *node, const char* text, size_t length,
|
|
430
|
-
|
|
427
|
+
myencoding_t encoding)
|
|
431
428
|
myhtml_tree_attr_t * myhtml_attribute_by_key(myhtml_tree_node_t *node, const char *key, size_t key_len)
|
|
432
429
|
myhtml_tree_attr_t * myhtml_attribute_remove_by_key(myhtml_tree_node_t *node, const char *key, size_t key_len)
|
|
433
430
|
myhtml_tree_attr_t * myhtml_attribute_add(myhtml_tree_node_t *node, const char *key, size_t key_len,
|
|
@@ -515,16 +512,16 @@ cdef extern from "mycss/mycss.h" nogil:
|
|
|
515
512
|
ctypedef mycss_selectors_flags mycss_selectors_flags_t
|
|
516
513
|
|
|
517
514
|
ctypedef struct mycss_selectors_list_t:
|
|
518
|
-
mycss_selectors_entries_list_t* entries_list
|
|
519
|
-
size_t entries_list_length
|
|
515
|
+
mycss_selectors_entries_list_t* entries_list
|
|
516
|
+
size_t entries_list_length
|
|
520
517
|
|
|
521
|
-
mycss_declaration_entry_t* declaration_entry
|
|
518
|
+
mycss_declaration_entry_t* declaration_entry
|
|
522
519
|
|
|
523
|
-
mycss_selectors_flags_t flags
|
|
520
|
+
mycss_selectors_flags_t flags
|
|
524
521
|
|
|
525
|
-
mycss_selectors_list_t* parent
|
|
526
|
-
mycss_selectors_list_t* next
|
|
527
|
-
mycss_selectors_list_t* prev
|
|
522
|
+
mycss_selectors_list_t* parent
|
|
523
|
+
mycss_selectors_list_t* next
|
|
524
|
+
mycss_selectors_list_t* prev
|
|
528
525
|
|
|
529
526
|
# CSS init routines
|
|
530
527
|
mycss_t * mycss_create()
|
|
@@ -542,12 +539,11 @@ cdef extern from "mycss/mycss.h" nogil:
|
|
|
542
539
|
mycss_t * mycss_destroy(mycss_t* mycss, bint self_destroy)
|
|
543
540
|
|
|
544
541
|
|
|
545
|
-
|
|
546
542
|
cdef extern from "modest/finder/finder.h" nogil:
|
|
547
543
|
ctypedef struct modest_finder_t
|
|
548
544
|
modest_finder_t* modest_finder_create_simple()
|
|
549
545
|
mystatus_t modest_finder_by_selectors_list(modest_finder_t* finder, myhtml_tree_node_t* scope_node,
|
|
550
|
-
|
|
546
|
+
mycss_selectors_list_t* selector_list, myhtml_collection_t** collection)
|
|
551
547
|
modest_finder_t * modest_finder_destroy(modest_finder_t* finder, bint self_destroy)
|
|
552
548
|
|
|
553
549
|
|
|
@@ -562,7 +558,8 @@ cdef class HTMLParser:
|
|
|
562
558
|
cdef object cached_script_srcs
|
|
563
559
|
|
|
564
560
|
cdef void _detect_encoding(self, char* html, size_t html_len) nogil
|
|
565
|
-
cdef _parse_html(self, char* html, size_t html_len)
|
|
561
|
+
cdef int _parse_html(self, char* html, size_t html_len) except -1
|
|
562
|
+
|
|
566
563
|
@staticmethod
|
|
567
564
|
cdef HTMLParser from_tree(
|
|
568
565
|
myhtml_tree_t * tree, bytes raw_html, bint detect_encoding, bint use_meta_tags, str decode_errors,
|
|
@@ -576,6 +573,6 @@ cdef class Stack:
|
|
|
576
573
|
cdef myhtml_tree_node_t ** _stack
|
|
577
574
|
|
|
578
575
|
cdef bint is_empty(self)
|
|
579
|
-
cdef push(self, myhtml_tree_node_t* res)
|
|
576
|
+
cdef int push(self, myhtml_tree_node_t* res) except -1
|
|
580
577
|
cdef myhtml_tree_node_t * pop(self)
|
|
581
|
-
cdef resize(self)
|
|
578
|
+
cdef int resize(self) except -1
|
selectolax/parser.pyi
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Iterator,
|
|
1
|
+
from typing import Iterator, Literal, TypeVar, overload
|
|
2
2
|
|
|
3
3
|
DefaultT = TypeVar("DefaultT")
|
|
4
4
|
|
|
@@ -268,7 +268,7 @@ class Node:
|
|
|
268
268
|
def remove(self, recursive: bool = True) -> None:
|
|
269
269
|
"""An alias for the decompose method."""
|
|
270
270
|
...
|
|
271
|
-
def unwrap(self) -> None:
|
|
271
|
+
def unwrap(self, delete_empty: bool = False) -> None:
|
|
272
272
|
"""Replace node with whatever is inside this node.
|
|
273
273
|
|
|
274
274
|
Parameters
|
selectolax/parser.pyx
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
|
|
2
|
-
from cpython cimport bool
|
|
2
|
+
from cpython.bool cimport bool
|
|
3
|
+
from cpython.exc cimport PyErr_SetObject
|
|
3
4
|
|
|
4
5
|
include "modest/selection.pxi"
|
|
5
6
|
include "modest/node.pxi"
|
|
@@ -61,8 +62,7 @@ cdef class HTMLParser:
|
|
|
61
62
|
|
|
62
63
|
"""
|
|
63
64
|
|
|
64
|
-
node = Node()
|
|
65
|
-
node._init(self.html_tree.node_html, self)
|
|
65
|
+
cdef Node node = Node.new(self.html_tree.node_html, self)
|
|
66
66
|
return node.css(query)
|
|
67
67
|
|
|
68
68
|
def css_first(self, str query, default=None, strict=False):
|
|
@@ -84,12 +84,11 @@ cdef class HTMLParser:
|
|
|
84
84
|
|
|
85
85
|
"""
|
|
86
86
|
|
|
87
|
-
node = Node()
|
|
88
|
-
node._init(self.html_tree.node_html, self)
|
|
87
|
+
cdef Node node = Node.new(self.html_tree.node_html, self)
|
|
89
88
|
return node.css_first(query, default, strict)
|
|
90
89
|
|
|
91
90
|
cdef void _detect_encoding(self, char* html, size_t html_len) nogil:
|
|
92
|
-
cdef myencoding_t encoding = MyENCODING_DEFAULT
|
|
91
|
+
cdef myencoding_t encoding = MyENCODING_DEFAULT
|
|
93
92
|
|
|
94
93
|
if self.use_meta_tags:
|
|
95
94
|
encoding = myencoding_prescan_stream_to_determine_encoding(html, html_len)
|
|
@@ -102,7 +101,7 @@ cdef class HTMLParser:
|
|
|
102
101
|
|
|
103
102
|
self._encoding = encoding
|
|
104
103
|
|
|
105
|
-
cdef _parse_html(self, char* html, size_t html_len):
|
|
104
|
+
cdef int _parse_html(self, char* html, size_t html_len) except -1:
|
|
106
105
|
cdef myhtml_t* myhtml
|
|
107
106
|
cdef mystatus_t status
|
|
108
107
|
|
|
@@ -111,23 +110,28 @@ cdef class HTMLParser:
|
|
|
111
110
|
status = myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0)
|
|
112
111
|
|
|
113
112
|
if status != 0:
|
|
114
|
-
|
|
113
|
+
PyErr_SetObject(RuntimeError, "Can't init MyHTML object.")
|
|
114
|
+
return -1
|
|
115
115
|
|
|
116
116
|
with nogil:
|
|
117
117
|
self.html_tree = myhtml_tree_create()
|
|
118
118
|
status = myhtml_tree_init(self.html_tree, myhtml)
|
|
119
119
|
|
|
120
120
|
if status != 0:
|
|
121
|
-
|
|
121
|
+
PyErr_SetObject(RuntimeError, "Can't init MyHTML Tree object.")
|
|
122
|
+
return -1
|
|
122
123
|
|
|
123
124
|
with nogil:
|
|
124
125
|
status = myhtml_parse(self.html_tree, self._encoding, html, html_len)
|
|
125
126
|
|
|
126
127
|
if status != 0:
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
assert self.html_tree.node_html != NULL
|
|
128
|
+
PyErr_SetObject(RuntimeError, "Can't parse HTML (status code: %d)" % status)
|
|
129
|
+
return -1
|
|
130
130
|
|
|
131
|
+
if self.html_tree.node_html == NULL:
|
|
132
|
+
PyErr_SetObject(RuntimeError, "html_tree is still NULL even after parsing ")
|
|
133
|
+
return -1
|
|
134
|
+
return 0
|
|
131
135
|
|
|
132
136
|
@property
|
|
133
137
|
def input_encoding(self):
|
|
@@ -148,9 +152,7 @@ cdef class HTMLParser:
|
|
|
148
152
|
"""Returns root node."""
|
|
149
153
|
if self.html_tree and self.html_tree.node_html:
|
|
150
154
|
try:
|
|
151
|
-
|
|
152
|
-
node._init(self.html_tree.node_html, self)
|
|
153
|
-
return node
|
|
155
|
+
return Node.new(self.html_tree.node_html, self)
|
|
154
156
|
except Exception:
|
|
155
157
|
# If Node creation or initialization fails, return None
|
|
156
158
|
return None
|
|
@@ -163,9 +165,7 @@ cdef class HTMLParser:
|
|
|
163
165
|
head = myhtml_tree_get_node_head(self.html_tree)
|
|
164
166
|
|
|
165
167
|
if head != NULL:
|
|
166
|
-
|
|
167
|
-
node._init(head, self)
|
|
168
|
-
return node
|
|
168
|
+
return Node.new(head, self)
|
|
169
169
|
return None
|
|
170
170
|
|
|
171
171
|
@property
|
|
@@ -175,10 +175,7 @@ cdef class HTMLParser:
|
|
|
175
175
|
body = myhtml_tree_get_node_body(self.html_tree)
|
|
176
176
|
|
|
177
177
|
if body != NULL:
|
|
178
|
-
|
|
179
|
-
node._init(body, self)
|
|
180
|
-
return node
|
|
181
|
-
|
|
178
|
+
return Node.new(body, self)
|
|
182
179
|
return None
|
|
183
180
|
|
|
184
181
|
def tags(self, str name):
|
|
@@ -197,7 +194,7 @@ cdef class HTMLParser:
|
|
|
197
194
|
|
|
198
195
|
cdef myhtml_collection_t* collection = NULL
|
|
199
196
|
pybyte_name = name.encode('UTF-8')
|
|
200
|
-
cdef mystatus_t status = 0
|
|
197
|
+
cdef mystatus_t status = 0
|
|
201
198
|
|
|
202
199
|
result = list()
|
|
203
200
|
collection = myhtml_get_nodes_by_name(self.html_tree, NULL, pybyte_name, len(pybyte_name), &status)
|
|
@@ -207,8 +204,7 @@ cdef class HTMLParser:
|
|
|
207
204
|
|
|
208
205
|
if status == 0:
|
|
209
206
|
for i in range(collection.length):
|
|
210
|
-
node = Node()
|
|
211
|
-
node._init(collection.list[i], self)
|
|
207
|
+
node = Node.new(collection.list[i], self)
|
|
212
208
|
result.append(node)
|
|
213
209
|
|
|
214
210
|
myhtml_collection_destroy(collection)
|
|
@@ -258,7 +254,7 @@ cdef class HTMLParser:
|
|
|
258
254
|
"""
|
|
259
255
|
cdef myhtml_collection_t* collection = NULL
|
|
260
256
|
|
|
261
|
-
cdef mystatus_t status = 0
|
|
257
|
+
cdef mystatus_t status = 0
|
|
262
258
|
|
|
263
259
|
for tag in tags:
|
|
264
260
|
pybyte_name = tag.encode('UTF-8')
|
|
@@ -278,7 +274,6 @@ cdef class HTMLParser:
|
|
|
278
274
|
|
|
279
275
|
myhtml_collection_destroy(collection)
|
|
280
276
|
|
|
281
|
-
|
|
282
277
|
def unwrap_tags(self, list tags, delete_empty : bool = False):
|
|
283
278
|
"""Unwraps specified tags from the HTML tree.
|
|
284
279
|
|
|
@@ -305,9 +300,9 @@ cdef class HTMLParser:
|
|
|
305
300
|
@property
|
|
306
301
|
def html(self):
|
|
307
302
|
"""Return HTML representation of the page."""
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
node.
|
|
303
|
+
cdef Node node
|
|
304
|
+
if self.html_tree != NULL and self.html_tree.document != NULL:
|
|
305
|
+
node = Node.new(self.html_tree.document, self)
|
|
311
306
|
return node.html
|
|
312
307
|
return None
|
|
313
308
|
|
|
@@ -361,6 +356,7 @@ cdef class HTMLParser:
|
|
|
361
356
|
|
|
362
357
|
def css_matches(self, str selector):
|
|
363
358
|
return self.root.css_matches(selector)
|
|
359
|
+
|
|
364
360
|
def merge_text_nodes(self):
|
|
365
361
|
"""Iterates over all text nodes and merges all text nodes that are close to each other.
|
|
366
362
|
|
|
@@ -380,6 +376,7 @@ cdef class HTMLParser:
|
|
|
380
376
|
"John Doe"
|
|
381
377
|
"""
|
|
382
378
|
return self.root.merge_text_nodes()
|
|
379
|
+
|
|
383
380
|
@staticmethod
|
|
384
381
|
cdef HTMLParser from_tree(
|
|
385
382
|
myhtml_tree_t * tree, bytes raw_html, bint detect_encoding, bint use_meta_tags, str decode_errors,
|
|
@@ -396,13 +393,13 @@ cdef class HTMLParser:
|
|
|
396
393
|
obj.cached_script_srcs = None
|
|
397
394
|
return obj
|
|
398
395
|
|
|
399
|
-
|
|
400
396
|
def clone(self):
|
|
401
397
|
"""Clone the current tree."""
|
|
402
398
|
cdef myhtml_t* myhtml
|
|
403
399
|
cdef mystatus_t status
|
|
404
400
|
cdef myhtml_tree_t* html_tree
|
|
405
401
|
cdef myhtml_tree_node_t* node
|
|
402
|
+
cdef HTMLParser cls
|
|
406
403
|
|
|
407
404
|
with nogil:
|
|
408
405
|
myhtml = myhtml_create()
|
selectolax/utils.pxi
CHANGED
|
@@ -4,6 +4,16 @@ MAX_HTML_INPUT_SIZE = 250e+7
|
|
|
4
4
|
|
|
5
5
|
ParserCls = Union[Type["HTMLParser"], Type["LexborHTMLParser"]]
|
|
6
6
|
Parser = Union["HTMLParser", "LexborHTMLParser"]
|
|
7
|
+
FRAGMENT = Literal[
|
|
8
|
+
"document",
|
|
9
|
+
"fragment",
|
|
10
|
+
"head",
|
|
11
|
+
"body",
|
|
12
|
+
"head_and_body",
|
|
13
|
+
"document_no_head",
|
|
14
|
+
"document_no_body",
|
|
15
|
+
"document_no_head_no_body",
|
|
16
|
+
]
|
|
7
17
|
|
|
8
18
|
|
|
9
19
|
def preprocess_input(html, decode_errors='ignore'):
|
|
@@ -29,10 +39,10 @@ def get_fragment_type(
|
|
|
29
39
|
html: str,
|
|
30
40
|
parser_cls: ParserCls,
|
|
31
41
|
tree: Optional[Parser] = None,
|
|
32
|
-
) ->
|
|
42
|
+
) -> FRAGMENT:
|
|
33
43
|
if not tree:
|
|
34
44
|
tree = parser_cls(html)
|
|
35
|
-
|
|
45
|
+
|
|
36
46
|
import re
|
|
37
47
|
html_re = re.compile(r"<html|<body|<head(?!er)", re.IGNORECASE)
|
|
38
48
|
|
|
@@ -49,7 +59,7 @@ def get_fragment_type(
|
|
|
49
59
|
|
|
50
60
|
if has_html and has_head and has_body:
|
|
51
61
|
break
|
|
52
|
-
|
|
62
|
+
|
|
53
63
|
if has_html and has_head and has_body:
|
|
54
64
|
return "document"
|
|
55
65
|
elif has_html and not has_head and has_body:
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: selectolax
|
|
3
|
+
Version: 0.3.34
|
|
4
|
+
Summary: Fast HTML5 parser with CSS selectors.
|
|
5
|
+
Home-page: https://github.com/rushter/selectolax
|
|
6
|
+
Author: Artem Golubin
|
|
7
|
+
Author-email: Artem Golubin <me@rushter.com>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Project-URL: Repository, https://github.com/rushter/selectolax
|
|
10
|
+
Project-URL: Documentation, https://selectolax.readthedocs.io/en/latest/parser.html
|
|
11
|
+
Project-URL: Changelog, https://github.com/rushter/selectolax/blob/main/CHANGES.md
|
|
12
|
+
Keywords: selectolax,html,parser,css,fast
|
|
13
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
14
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
15
|
+
Classifier: Topic :: Internet
|
|
16
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
17
|
+
Classifier: Intended Audience :: Developers
|
|
18
|
+
Classifier: Natural Language :: English
|
|
19
|
+
Classifier: Programming Language :: Python :: 3
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
25
|
+
Requires-Python: >=3.9
|
|
26
|
+
Description-Content-Type: text/x-rst
|
|
27
|
+
License-File: LICENSE
|
|
28
|
+
Provides-Extra: cython
|
|
29
|
+
Requires-Dist: Cython; extra == "cython"
|
|
30
|
+
Dynamic: author
|
|
31
|
+
Dynamic: home-page
|
|
32
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
selectolax/__init__.py,sha256=iI6pQ10gimevS2gTf4K4_1cXh4NBRFj_5GjkmhrvU94,157
|
|
2
|
+
selectolax/base.pxi,sha256=zOj3BrCA71xd-mJFtkMIAglP4ZybfrHVoCoy6ljTBDQ,93
|
|
3
|
+
selectolax/lexbor.c,sha256=ohdT--aXa69etvB2pMZU-vBU8P0urBCVeNIsKJzIe3k,2419508
|
|
4
|
+
selectolax/lexbor.cp310-win_amd64.pyd,sha256=h0bKQ_UgJiQ8nkBlSwF_woJ3to9gOZE5KwLC5XaxL68,3149312
|
|
5
|
+
selectolax/lexbor.pxd,sha256=BcqAzhlUVq0GVWiJHWXNhs4jY-gi6k0BELEnQtSYJAI,21720
|
|
6
|
+
selectolax/lexbor.pyi,sha256=dRNzLXJEbFRR7QcItuX8Ews9E9I6h6G4vA3X1hijzj4,28990
|
|
7
|
+
selectolax/lexbor.pyx,sha256=XLZ2vGwLoWdctnmU-gfizjD6tMjehR_bzNOapDJ_YOQ,12891
|
|
8
|
+
selectolax/parser.c,sha256=E6Jy4u2RriHcudgyhxtAxpEyCvwH3lPxxjPIfemuK7A,2259306
|
|
9
|
+
selectolax/parser.cp310-win_amd64.pyd,sha256=MZlpP-wdrtOC8wNs18WhA8lF5IfLXn1_xGziFYR4ZMw,2105344
|
|
10
|
+
selectolax/parser.pxd,sha256=T7GoQdaOkhp_W2TBlRY0tZqom97PkHrytYaXQlyVnbI,25196
|
|
11
|
+
selectolax/parser.pyi,sha256=-qutpjrK1dD4rrl3SsHWQt2FT5lv6meaACkQzk1Bt6o,25612
|
|
12
|
+
selectolax/parser.pyx,sha256=nIWuhaEFRwlfo64WmgrSOM0A8mUw0eWw9j_fWyLV-Ro,14127
|
|
13
|
+
selectolax/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
+
selectolax/utils.pxi,sha256=hkzKfycdpwH1P-E_pP-9NTGsmiajt6EJNZSlkxlRapA,3623
|
|
15
|
+
selectolax/lexbor/attrs.pxi,sha256=d59V77aGkpp7YsYsd6t_z4-tRnUoQTJZKsvMC8nyttM,3978
|
|
16
|
+
selectolax/lexbor/node.pxi,sha256=KODqPk3yZ_owwdSxqNr2Ih6qAOhu9CJ-jrHtqQJcWmY,33407
|
|
17
|
+
selectolax/lexbor/selection.pxi,sha256=BeUDypw5_P0CTmi-ACLcd7pK2NnG9ASrwWOdLdweAZY,7378
|
|
18
|
+
selectolax/lexbor/util.pxi,sha256=q2EYVNdnROg9y30mWpGwlNA0W00nJ7ZRNEEDrOEG14s,584
|
|
19
|
+
selectolax/modest/node.pxi,sha256=iX_yRPIPVkG0ALW7hEfmXiVperw6RjkSGATkxzLokz0,34691
|
|
20
|
+
selectolax/modest/selection.pxi,sha256=PfHUN1uuNA7YfcxTu7JZjhxevVbFRP1bHd3kyyFdO7E,6703
|
|
21
|
+
selectolax/modest/util.pxi,sha256=zab67Wzo8FcipA2VS8ClptaC19lZirbNqFEGQ3hW2Is,572
|
|
22
|
+
selectolax-0.3.34.dist-info/licenses/LICENSE,sha256=A7Jb3WZcENcLfZRc7QPdm9zJdwfpIyPodPJu-kdMH6E,1087
|
|
23
|
+
selectolax-0.3.34.dist-info/METADATA,sha256=rAqskRB9wMSn7tEZLxZswUJD4wFzN4fizyXjiBv4L4o,1318
|
|
24
|
+
selectolax-0.3.34.dist-info/WHEEL,sha256=KUuBC6lxAbHCKilKua8R9W_TM71_-9Sg5uEP3uDWcoU,101
|
|
25
|
+
selectolax-0.3.34.dist-info/top_level.txt,sha256=e5MuEM2PrQzoDlWetkFli9uXSlxa_ktW5jJEihhaI1c,11
|
|
26
|
+
selectolax-0.3.34.dist-info/RECORD,,
|
|
@@ -1,187 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: selectolax
|
|
3
|
-
Version: 0.3.32
|
|
4
|
-
Summary: Fast HTML5 parser with CSS selectors.
|
|
5
|
-
Home-page: https://github.com/rushter/selectolax
|
|
6
|
-
Author: Artem Golubin
|
|
7
|
-
Author-email: Artem Golubin <me@rushter.com>
|
|
8
|
-
License: MIT
|
|
9
|
-
Project-URL: Repository, https://github.com/rushter/selectolax
|
|
10
|
-
Project-URL: Documentation, https://selectolax.readthedocs.io/en/latest/parser.html
|
|
11
|
-
Project-URL: Changelog, https://github.com/rushter/selectolax/blob/main/CHANGES.rst
|
|
12
|
-
Keywords: selectolax,html,parser,css,fast
|
|
13
|
-
Classifier: Development Status :: 5 - Production/Stable
|
|
14
|
-
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
15
|
-
Classifier: Topic :: Internet
|
|
16
|
-
Classifier: Topic :: Internet :: WWW/HTTP
|
|
17
|
-
Classifier: Intended Audience :: Developers
|
|
18
|
-
Classifier: Natural Language :: English
|
|
19
|
-
Classifier: Programming Language :: Python :: 3
|
|
20
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
21
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
22
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
23
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
24
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
25
|
-
Requires-Python: >=3.9
|
|
26
|
-
Description-Content-Type: text/x-rst
|
|
27
|
-
License-File: LICENSE
|
|
28
|
-
Provides-Extra: cython
|
|
29
|
-
Requires-Dist: Cython; extra == "cython"
|
|
30
|
-
Dynamic: author
|
|
31
|
-
Dynamic: home-page
|
|
32
|
-
Dynamic: license-file
|
|
33
|
-
|
|
34
|
-
.. image:: docs/logo.png
|
|
35
|
-
:alt: selectolax logo
|
|
36
|
-
|
|
37
|
-
-------------------------
|
|
38
|
-
|
|
39
|
-
.. image:: https://img.shields.io/pypi/v/selectolax.svg
|
|
40
|
-
:target: https://pypi.python.org/pypi/selectolax
|
|
41
|
-
|
|
42
|
-
A fast HTML5 parser with CSS selectors using `Modest <https://github.com/lexborisov/Modest/>`_ and
|
|
43
|
-
`Lexbor <https://github.com/lexbor/lexbor>`_ engines.
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
Installation
|
|
47
|
-
------------
|
|
48
|
-
From PyPI using pip:
|
|
49
|
-
|
|
50
|
-
.. code-block:: bash
|
|
51
|
-
|
|
52
|
-
pip install selectolax
|
|
53
|
-
|
|
54
|
-
If installation fails due to compilation errors, you may need to install `Cython <https://github.com/cython/cython>`_:
|
|
55
|
-
|
|
56
|
-
.. code-block:: bash
|
|
57
|
-
|
|
58
|
-
pip install selectolax[cython]
|
|
59
|
-
|
|
60
|
-
This usually happens when you try to install an outdated version of selectolax on a newer version of Python.
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
Development version from GitHub:
|
|
64
|
-
|
|
65
|
-
.. code-block:: bash
|
|
66
|
-
|
|
67
|
-
git clone --recursive https://github.com/rushter/selectolax
|
|
68
|
-
cd selectolax
|
|
69
|
-
pip install -r requirements_dev.txt
|
|
70
|
-
python setup.py install
|
|
71
|
-
|
|
72
|
-
How to compile selectolax while developing:
|
|
73
|
-
|
|
74
|
-
.. code-block:: bash
|
|
75
|
-
|
|
76
|
-
make clean
|
|
77
|
-
make dev
|
|
78
|
-
|
|
79
|
-
Basic examples
|
|
80
|
-
--------------
|
|
81
|
-
|
|
82
|
-
Here are some basic examples to get you started with selectolax:
|
|
83
|
-
|
|
84
|
-
Parsing HTML and extracting text:
|
|
85
|
-
|
|
86
|
-
.. code:: python
|
|
87
|
-
|
|
88
|
-
In [1]: from selectolax.parser import HTMLParser
|
|
89
|
-
...:
|
|
90
|
-
...: html = """
|
|
91
|
-
...: <h1 id="title" data-updated="20201101">Hi there</h1>
|
|
92
|
-
...: <div class="post">Lorem Ipsum is simply dummy text of the printing and typesetting industry. </div>
|
|
93
|
-
...: <div class="post">Lorem ipsum dolor sit amet, consectetur adipiscing elit.</div>
|
|
94
|
-
...: """
|
|
95
|
-
...: tree = HTMLParser(html)
|
|
96
|
-
|
|
97
|
-
In [2]: tree.css_first('h1#title').text()
|
|
98
|
-
Out[2]: 'Hi there'
|
|
99
|
-
|
|
100
|
-
In [3]: tree.css_first('h1#title').attributes
|
|
101
|
-
Out[3]: {'id': 'title', 'data-updated': '20201101'}
|
|
102
|
-
|
|
103
|
-
In [4]: [node.text() for node in tree.css('.post')]
|
|
104
|
-
Out[4]:
|
|
105
|
-
['Lorem Ipsum is simply dummy text of the printing and typesetting industry. ',
|
|
106
|
-
'Lorem ipsum dolor sit amet, consectetur adipiscing elit.']
|
|
107
|
-
|
|
108
|
-
Using advanced CSS selectors:
|
|
109
|
-
|
|
110
|
-
.. code:: python
|
|
111
|
-
|
|
112
|
-
In [1]: html = "<div><p id=p1><p id=p2><p id=p3><a>link</a><p id=p4><p id=p5>text<p id=p6></div>"
|
|
113
|
-
...: selector = "div > :nth-child(2n+1):not(:has(a))"
|
|
114
|
-
|
|
115
|
-
In [2]: for node in HTMLParser(html).css(selector):
|
|
116
|
-
...: print(node.attributes, node.text(), node.tag)
|
|
117
|
-
...: print(node.parent.tag)
|
|
118
|
-
...: print(node.html)
|
|
119
|
-
...:
|
|
120
|
-
{'id': 'p1'} p
|
|
121
|
-
div
|
|
122
|
-
<p id="p1"></p>
|
|
123
|
-
{'id': 'p5'} text p
|
|
124
|
-
div
|
|
125
|
-
<p id="p5">text</p>
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
* `Detailed overview <https://github.com/rushter/selectolax/blob/master/examples/walkthrough.ipynb>`_
|
|
129
|
-
|
|
130
|
-
Available backends
|
|
131
|
-
------------------
|
|
132
|
-
|
|
133
|
-
Selectolax supports two backends: ``Modest`` and ``Lexbor``. By default, all examples use the Modest backend.
|
|
134
|
-
Most of the features between backends are almost identical, but there are still some differences.
|
|
135
|
-
|
|
136
|
-
As of 2024, the preferred backend is ``Lexbor``. The ``Modest`` backend is still available for compatibility reasons
|
|
137
|
-
and the underlying C library that selectolax uses is not maintained anymore.
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
To use ``lexbor``, just import the parser and use it in the similar way to the `HTMLParser`.
|
|
141
|
-
|
|
142
|
-
.. code:: python
|
|
143
|
-
|
|
144
|
-
In [1]: from selectolax.lexbor import LexborHTMLParser
|
|
145
|
-
|
|
146
|
-
In [2]: html = """
|
|
147
|
-
...: <title>Hi there</title>
|
|
148
|
-
...: <div id="updated">2021-08-15</div>
|
|
149
|
-
...: """
|
|
150
|
-
|
|
151
|
-
In [3]: parser = LexborHTMLParser(html)
|
|
152
|
-
In [4]: parser.root.css_first("#updated").text()
|
|
153
|
-
Out[4]: '2021-08-15'
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
Simple Benchmark
|
|
157
|
-
----------------
|
|
158
|
-
|
|
159
|
-
* Extract title, links, scripts and a meta tag from main pages of top 754 domains. See ``examples/benchmark.py`` for more information.
|
|
160
|
-
|
|
161
|
-
============================ ===========
|
|
162
|
-
Package Time
|
|
163
|
-
============================ ===========
|
|
164
|
-
Beautiful Soup (html.parser) 61.02 sec.
|
|
165
|
-
lxml / Beautiful Soup (lxml) 9.09 sec.
|
|
166
|
-
html5_parser 16.10 sec.
|
|
167
|
-
selectolax (Modest) 2.94 sec.
|
|
168
|
-
selectolax (Lexbor) 2.39 sec.
|
|
169
|
-
============================ ===========
|
|
170
|
-
|
|
171
|
-
Links
|
|
172
|
-
-----
|
|
173
|
-
|
|
174
|
-
* `selectolax API reference <https://selectolax.readthedocs.io/en/latest/index.html>`_
|
|
175
|
-
* `Video introduction to web scraping using selectolax <https://youtu.be/HpRsfpPuUzE>`_
|
|
176
|
-
* `How to Scrape 7k Products with Python using selectolax and httpx <https://www.youtube.com/watch?v=XpGvq755J2U>`_
|
|
177
|
-
* `Detailed overview <https://github.com/rushter/selectolax/blob/master/examples/walkthrough.ipynb>`_
|
|
178
|
-
* `Modest introduction <https://lexborisov.github.io/Modest/>`_
|
|
179
|
-
* `Modest benchmark <https://lexborisov.github.io/benchmark-html-parsers/>`_
|
|
180
|
-
* `Python benchmark <https://rushter.com/blog/python-fast-html-parser/>`_
|
|
181
|
-
* `Another Python benchmark <https://www.peterbe.com/plog/selectolax-or-pyquery>`_
|
|
182
|
-
|
|
183
|
-
License
|
|
184
|
-
-------
|
|
185
|
-
|
|
186
|
-
* Modest engine — `LGPL2.1 <https://github.com/lexborisov/Modest/blob/master/LICENSE>`_
|
|
187
|
-
* selectolax - `MIT <https://github.com/rushter/selectolax/blob/master/LICENSE>`_
|
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
selectolax/__init__.py,sha256=-HUZIEaQkWhTkyAc3ZYkjSig-PwZhip6R_VUo5KmwKk,185
|
|
2
|
-
selectolax/base.pxi,sha256=zOj3BrCA71xd-mJFtkMIAglP4ZybfrHVoCoy6ljTBDQ,93
|
|
3
|
-
selectolax/lexbor.c,sha256=rvkQp6ZbkbXMmMBAZXNY8U8UBD41YbE60kwHji94CSk,2419967
|
|
4
|
-
selectolax/lexbor.cp310-win_amd64.pyd,sha256=VNaCjknrMwgxtzXQu_ImIPyhw8HLm0N73i4Q64cNcuI,3144192
|
|
5
|
-
selectolax/lexbor.pxd,sha256=cG264E-tFNOFTy0k5bAqV_sZnz4G6a4a21WEhqTL-NI,21516
|
|
6
|
-
selectolax/lexbor.pyi,sha256=NS2pI6PL7klkk6xXThHE72Jsi8583xLUdQ6gvCAQovY,27028
|
|
7
|
-
selectolax/lexbor.pyx,sha256=-O-g03mLCQKc9F19eMvo3PyoLDtF09IIuFziXJAl6Ao,11520
|
|
8
|
-
selectolax/parser.c,sha256=xGQmCxrEmyQCi9I11d3py7cIcFumqhhVpdystsQ1dp0,2287084
|
|
9
|
-
selectolax/parser.cp310-win_amd64.pyd,sha256=t_VCxIXCbrQmzz087ukSMGpnXUDxGWpReztFfOqrp6w,2108416
|
|
10
|
-
selectolax/parser.pxd,sha256=4pM_CcZlvJlaR8EMjZCnSmnCcJbwcYOldRTBEbfwm48,25145
|
|
11
|
-
selectolax/parser.pyi,sha256=5Czf63278MQC01IxY-CHzoDyTS1oHiYRD2OxEscyL1o,25584
|
|
12
|
-
selectolax/parser.pyx,sha256=bS2n70o_5OPJ6JuXTBAVUTc-XhxqC4DXzPE4H3-e5Ek,13987
|
|
13
|
-
selectolax/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
-
selectolax/utils.pxi,sha256=4rtdRcLWuemxN1qe7Eul5jvAmHZ65r7Gvf67_Wg8Bt4,3566
|
|
15
|
-
selectolax/lexbor/attrs.pxi,sha256=KvQaCo0jM3Bva6_xG3TGzkCVFjNQ4kgTxWO95gsGbkw,4007
|
|
16
|
-
selectolax/lexbor/node.pxi,sha256=BuJIYcTUucvHUP4w2wl90uBwjcqpv1vRyt4No2O_Quo,30921
|
|
17
|
-
selectolax/lexbor/selection.pxi,sha256=4I8cjYMjC2Gz7xLrYWrl7jbnwDoVmab-GvrJKiTALTI,7017
|
|
18
|
-
selectolax/lexbor/util.pxi,sha256=0I4ElWIwXxrZCfMmGCtyDU127oMsPCqC3IcUk4QmMAc,582
|
|
19
|
-
selectolax/modest/node.pxi,sha256=Da2b3cdmggCX736x0htGvac51SEeGCcY5l-LA5H4HNI,34376
|
|
20
|
-
selectolax/modest/selection.pxi,sha256=0elY7JwnpPVaw0QZE1T7A78s9FIph5uWIhwy4sEXGU8,6586
|
|
21
|
-
selectolax/modest/util.pxi,sha256=o2nPGGGtRlLqOCa7yPk94CfBzNlVr7ull7osFy6NRX4,570
|
|
22
|
-
selectolax-0.3.32.dist-info/licenses/LICENSE,sha256=A7Jb3WZcENcLfZRc7QPdm9zJdwfpIyPodPJu-kdMH6E,1087
|
|
23
|
-
selectolax-0.3.32.dist-info/METADATA,sha256=mB27GyUKzOKfZK92CEHDlQ6wrwG1um9kesSgMr8yb1c,6402
|
|
24
|
-
selectolax-0.3.32.dist-info/WHEEL,sha256=KUuBC6lxAbHCKilKua8R9W_TM71_-9Sg5uEP3uDWcoU,101
|
|
25
|
-
selectolax-0.3.32.dist-info/top_level.txt,sha256=e5MuEM2PrQzoDlWetkFli9uXSlxa_ktW5jJEihhaI1c,11
|
|
26
|
-
selectolax-0.3.32.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|