selectolax 0.3.32__cp311-cp311-win_arm64.whl → 0.3.34__cp311-cp311-win_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of selectolax might be problematic. Click here for more details.

Binary file
selectolax/parser.pxd CHANGED
@@ -33,9 +33,9 @@ cdef extern from "myhtml/myhtml.h" nogil:
33
33
  MyHTML_OPTIONS_PARSE_MODE_SEPARATELY = 0x04
34
34
 
35
35
  ctypedef struct myhtml_collection_t:
36
- myhtml_tree_node_t **list;
37
- size_t size;
38
- size_t length;
36
+ myhtml_tree_node_t **list
37
+ size_t size
38
+ size_t length
39
39
 
40
40
  ctypedef struct myhtml_tree_node_t:
41
41
  myhtml_tree_node_flags flags
@@ -86,7 +86,6 @@ cdef extern from "myhtml/myhtml.h" nogil:
86
86
  MyHTML_TOKEN_TYPE_COMMENT = 0x400
87
87
  MyHTML_TOKEN_TYPE_NULL = 0x800
88
88
 
89
-
90
89
  ctypedef enum myhtml_tags:
91
90
  MyHTML_TAG__UNDEF = 0x000
92
91
  MyHTML_TAG__TEXT = 0x001
@@ -392,8 +391,6 @@ cdef extern from "myhtml/myhtml.h" nogil:
392
391
  size_t raw_value_begin
393
392
  size_t raw_value_length
394
393
 
395
-
396
-
397
394
  myhtml_t * myhtml_create()
398
395
  mystatus_t myhtml_init(myhtml_t* myhtml, myhtml_options opt, size_t thread_count, size_t queue_size)
399
396
  myhtml_tree_t * myhtml_tree_create()
@@ -415,7 +412,7 @@ cdef extern from "myhtml/myhtml.h" nogil:
415
412
  myhtml_tree_node_t* myhtml_tree_get_node_head(myhtml_tree_t* tree)
416
413
 
417
414
  myhtml_collection_t* myhtml_get_nodes_by_name(myhtml_tree_t* tree, myhtml_collection_t *collection,
418
- const char* name, size_t length, mystatus_t *status)
415
+ const char* name, size_t length, mystatus_t *status)
419
416
 
420
417
  void myhtml_node_delete(myhtml_tree_node_t *node)
421
418
  void myhtml_node_delete_recursive(myhtml_tree_node_t *node)
@@ -427,7 +424,7 @@ cdef extern from "myhtml/myhtml.h" nogil:
427
424
  myhtml_tree_node_t * myhtml_node_append_child(myhtml_tree_node_t* target, myhtml_tree_node_t* node)
428
425
 
429
426
  mycore_string_t * myhtml_node_text_set(myhtml_tree_node_t *node, const char* text, size_t length,
430
- myencoding_t encoding)
427
+ myencoding_t encoding)
431
428
  myhtml_tree_attr_t * myhtml_attribute_by_key(myhtml_tree_node_t *node, const char *key, size_t key_len)
432
429
  myhtml_tree_attr_t * myhtml_attribute_remove_by_key(myhtml_tree_node_t *node, const char *key, size_t key_len)
433
430
  myhtml_tree_attr_t * myhtml_attribute_add(myhtml_tree_node_t *node, const char *key, size_t key_len,
@@ -515,16 +512,16 @@ cdef extern from "mycss/mycss.h" nogil:
515
512
  ctypedef mycss_selectors_flags mycss_selectors_flags_t
516
513
 
517
514
  ctypedef struct mycss_selectors_list_t:
518
- mycss_selectors_entries_list_t* entries_list;
519
- size_t entries_list_length;
515
+ mycss_selectors_entries_list_t* entries_list
516
+ size_t entries_list_length
520
517
 
521
- mycss_declaration_entry_t* declaration_entry;
518
+ mycss_declaration_entry_t* declaration_entry
522
519
 
523
- mycss_selectors_flags_t flags;
520
+ mycss_selectors_flags_t flags
524
521
 
525
- mycss_selectors_list_t* parent;
526
- mycss_selectors_list_t* next;
527
- mycss_selectors_list_t* prev;
522
+ mycss_selectors_list_t* parent
523
+ mycss_selectors_list_t* next
524
+ mycss_selectors_list_t* prev
528
525
 
529
526
  # CSS init routines
530
527
  mycss_t * mycss_create()
@@ -542,12 +539,11 @@ cdef extern from "mycss/mycss.h" nogil:
542
539
  mycss_t * mycss_destroy(mycss_t* mycss, bint self_destroy)
543
540
 
544
541
 
545
-
546
542
  cdef extern from "modest/finder/finder.h" nogil:
547
543
  ctypedef struct modest_finder_t
548
544
  modest_finder_t* modest_finder_create_simple()
549
545
  mystatus_t modest_finder_by_selectors_list(modest_finder_t* finder, myhtml_tree_node_t* scope_node,
550
- mycss_selectors_list_t* selector_list, myhtml_collection_t** collection)
546
+ mycss_selectors_list_t* selector_list, myhtml_collection_t** collection)
551
547
  modest_finder_t * modest_finder_destroy(modest_finder_t* finder, bint self_destroy)
552
548
 
553
549
 
@@ -562,7 +558,8 @@ cdef class HTMLParser:
562
558
  cdef object cached_script_srcs
563
559
 
564
560
  cdef void _detect_encoding(self, char* html, size_t html_len) nogil
565
- cdef _parse_html(self, char* html, size_t html_len)
561
+ cdef int _parse_html(self, char* html, size_t html_len) except -1
562
+
566
563
  @staticmethod
567
564
  cdef HTMLParser from_tree(
568
565
  myhtml_tree_t * tree, bytes raw_html, bint detect_encoding, bint use_meta_tags, str decode_errors,
@@ -576,6 +573,6 @@ cdef class Stack:
576
573
  cdef myhtml_tree_node_t ** _stack
577
574
 
578
575
  cdef bint is_empty(self)
579
- cdef push(self, myhtml_tree_node_t* res)
576
+ cdef int push(self, myhtml_tree_node_t* res) except -1
580
577
  cdef myhtml_tree_node_t * pop(self)
581
- cdef resize(self)
578
+ cdef int resize(self) except -1
selectolax/parser.pyi CHANGED
@@ -1,4 +1,4 @@
1
- from typing import Iterator, TypeVar, Literal, overload
1
+ from typing import Iterator, Literal, TypeVar, overload
2
2
 
3
3
  DefaultT = TypeVar("DefaultT")
4
4
 
@@ -268,7 +268,7 @@ class Node:
268
268
  def remove(self, recursive: bool = True) -> None:
269
269
  """An alias for the decompose method."""
270
270
  ...
271
- def unwrap(self) -> None:
271
+ def unwrap(self, delete_empty: bool = False) -> None:
272
272
  """Replace node with whatever is inside this node.
273
273
 
274
274
  Parameters
selectolax/parser.pyx CHANGED
@@ -1,5 +1,6 @@
1
1
 
2
- from cpython cimport bool
2
+ from cpython.bool cimport bool
3
+ from cpython.exc cimport PyErr_SetObject
3
4
 
4
5
  include "modest/selection.pxi"
5
6
  include "modest/node.pxi"
@@ -61,8 +62,7 @@ cdef class HTMLParser:
61
62
 
62
63
  """
63
64
 
64
- node = Node()
65
- node._init(self.html_tree.node_html, self)
65
+ cdef Node node = Node.new(self.html_tree.node_html, self)
66
66
  return node.css(query)
67
67
 
68
68
  def css_first(self, str query, default=None, strict=False):
@@ -84,12 +84,11 @@ cdef class HTMLParser:
84
84
 
85
85
  """
86
86
 
87
- node = Node()
88
- node._init(self.html_tree.node_html, self)
87
+ cdef Node node = Node.new(self.html_tree.node_html, self)
89
88
  return node.css_first(query, default, strict)
90
89
 
91
90
  cdef void _detect_encoding(self, char* html, size_t html_len) nogil:
92
- cdef myencoding_t encoding = MyENCODING_DEFAULT;
91
+ cdef myencoding_t encoding = MyENCODING_DEFAULT
93
92
 
94
93
  if self.use_meta_tags:
95
94
  encoding = myencoding_prescan_stream_to_determine_encoding(html, html_len)
@@ -102,7 +101,7 @@ cdef class HTMLParser:
102
101
 
103
102
  self._encoding = encoding
104
103
 
105
- cdef _parse_html(self, char* html, size_t html_len):
104
+ cdef int _parse_html(self, char* html, size_t html_len) except -1:
106
105
  cdef myhtml_t* myhtml
107
106
  cdef mystatus_t status
108
107
 
@@ -111,23 +110,28 @@ cdef class HTMLParser:
111
110
  status = myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0)
112
111
 
113
112
  if status != 0:
114
- raise RuntimeError("Can't init MyHTML object.")
113
+ PyErr_SetObject(RuntimeError, "Can't init MyHTML object.")
114
+ return -1
115
115
 
116
116
  with nogil:
117
117
  self.html_tree = myhtml_tree_create()
118
118
  status = myhtml_tree_init(self.html_tree, myhtml)
119
119
 
120
120
  if status != 0:
121
- raise RuntimeError("Can't init MyHTML Tree object.")
121
+ PyErr_SetObject(RuntimeError, "Can't init MyHTML Tree object.")
122
+ return -1
122
123
 
123
124
  with nogil:
124
125
  status = myhtml_parse(self.html_tree, self._encoding, html, html_len)
125
126
 
126
127
  if status != 0:
127
- raise RuntimeError("Can't parse HTML (status code: %d)" % status)
128
-
129
- assert self.html_tree.node_html != NULL
128
+ PyErr_SetObject(RuntimeError, "Can't parse HTML (status code: %d)" % status)
129
+ return -1
130
130
 
131
+ if self.html_tree.node_html == NULL:
132
+ PyErr_SetObject(RuntimeError, "html_tree is still NULL even after parsing ")
133
+ return -1
134
+ return 0
131
135
 
132
136
  @property
133
137
  def input_encoding(self):
@@ -148,9 +152,7 @@ cdef class HTMLParser:
148
152
  """Returns root node."""
149
153
  if self.html_tree and self.html_tree.node_html:
150
154
  try:
151
- node = Node()
152
- node._init(self.html_tree.node_html, self)
153
- return node
155
+ return Node.new(self.html_tree.node_html, self)
154
156
  except Exception:
155
157
  # If Node creation or initialization fails, return None
156
158
  return None
@@ -163,9 +165,7 @@ cdef class HTMLParser:
163
165
  head = myhtml_tree_get_node_head(self.html_tree)
164
166
 
165
167
  if head != NULL:
166
- node = Node()
167
- node._init(head, self)
168
- return node
168
+ return Node.new(head, self)
169
169
  return None
170
170
 
171
171
  @property
@@ -175,10 +175,7 @@ cdef class HTMLParser:
175
175
  body = myhtml_tree_get_node_body(self.html_tree)
176
176
 
177
177
  if body != NULL:
178
- node = Node()
179
- node._init(body, self)
180
- return node
181
-
178
+ return Node.new(body, self)
182
179
  return None
183
180
 
184
181
  def tags(self, str name):
@@ -197,7 +194,7 @@ cdef class HTMLParser:
197
194
 
198
195
  cdef myhtml_collection_t* collection = NULL
199
196
  pybyte_name = name.encode('UTF-8')
200
- cdef mystatus_t status = 0;
197
+ cdef mystatus_t status = 0
201
198
 
202
199
  result = list()
203
200
  collection = myhtml_get_nodes_by_name(self.html_tree, NULL, pybyte_name, len(pybyte_name), &status)
@@ -207,8 +204,7 @@ cdef class HTMLParser:
207
204
 
208
205
  if status == 0:
209
206
  for i in range(collection.length):
210
- node = Node()
211
- node._init(collection.list[i], self)
207
+ node = Node.new(collection.list[i], self)
212
208
  result.append(node)
213
209
 
214
210
  myhtml_collection_destroy(collection)
@@ -258,7 +254,7 @@ cdef class HTMLParser:
258
254
  """
259
255
  cdef myhtml_collection_t* collection = NULL
260
256
 
261
- cdef mystatus_t status = 0;
257
+ cdef mystatus_t status = 0
262
258
 
263
259
  for tag in tags:
264
260
  pybyte_name = tag.encode('UTF-8')
@@ -278,7 +274,6 @@ cdef class HTMLParser:
278
274
 
279
275
  myhtml_collection_destroy(collection)
280
276
 
281
-
282
277
  def unwrap_tags(self, list tags, delete_empty : bool = False):
283
278
  """Unwraps specified tags from the HTML tree.
284
279
 
@@ -305,9 +300,9 @@ cdef class HTMLParser:
305
300
  @property
306
301
  def html(self):
307
302
  """Return HTML representation of the page."""
308
- if self.html_tree and self.html_tree.document:
309
- node = Node()
310
- node._init(self.html_tree.document, self)
303
+ cdef Node node
304
+ if self.html_tree != NULL and self.html_tree.document != NULL:
305
+ node = Node.new(self.html_tree.document, self)
311
306
  return node.html
312
307
  return None
313
308
 
@@ -361,6 +356,7 @@ cdef class HTMLParser:
361
356
 
362
357
  def css_matches(self, str selector):
363
358
  return self.root.css_matches(selector)
359
+
364
360
  def merge_text_nodes(self):
365
361
  """Iterates over all text nodes and merges all text nodes that are close to each other.
366
362
 
@@ -380,6 +376,7 @@ cdef class HTMLParser:
380
376
  "John Doe"
381
377
  """
382
378
  return self.root.merge_text_nodes()
379
+
383
380
  @staticmethod
384
381
  cdef HTMLParser from_tree(
385
382
  myhtml_tree_t * tree, bytes raw_html, bint detect_encoding, bint use_meta_tags, str decode_errors,
@@ -396,13 +393,13 @@ cdef class HTMLParser:
396
393
  obj.cached_script_srcs = None
397
394
  return obj
398
395
 
399
-
400
396
  def clone(self):
401
397
  """Clone the current tree."""
402
398
  cdef myhtml_t* myhtml
403
399
  cdef mystatus_t status
404
400
  cdef myhtml_tree_t* html_tree
405
401
  cdef myhtml_tree_node_t* node
402
+ cdef HTMLParser cls
406
403
 
407
404
  with nogil:
408
405
  myhtml = myhtml_create()
selectolax/utils.pxi CHANGED
@@ -4,6 +4,16 @@ MAX_HTML_INPUT_SIZE = 250e+7
4
4
 
5
5
  ParserCls = Union[Type["HTMLParser"], Type["LexborHTMLParser"]]
6
6
  Parser = Union["HTMLParser", "LexborHTMLParser"]
7
+ FRAGMENT = Literal[
8
+ "document",
9
+ "fragment",
10
+ "head",
11
+ "body",
12
+ "head_and_body",
13
+ "document_no_head",
14
+ "document_no_body",
15
+ "document_no_head_no_body",
16
+ ]
7
17
 
8
18
 
9
19
  def preprocess_input(html, decode_errors='ignore'):
@@ -29,10 +39,10 @@ def get_fragment_type(
29
39
  html: str,
30
40
  parser_cls: ParserCls,
31
41
  tree: Optional[Parser] = None,
32
- ) -> Literal["document", "fragment", "head", "body", "head_and_body", "document_no_head", "document_no_body", "document_no_head_no_body"]:
42
+ ) -> FRAGMENT:
33
43
  if not tree:
34
44
  tree = parser_cls(html)
35
-
45
+
36
46
  import re
37
47
  html_re = re.compile(r"<html|<body|<head(?!er)", re.IGNORECASE)
38
48
 
@@ -49,7 +59,7 @@ def get_fragment_type(
49
59
 
50
60
  if has_html and has_head and has_body:
51
61
  break
52
-
62
+
53
63
  if has_html and has_head and has_body:
54
64
  return "document"
55
65
  elif has_html and not has_head and has_body:
@@ -0,0 +1,32 @@
1
+ Metadata-Version: 2.4
2
+ Name: selectolax
3
+ Version: 0.3.34
4
+ Summary: Fast HTML5 parser with CSS selectors.
5
+ Home-page: https://github.com/rushter/selectolax
6
+ Author: Artem Golubin
7
+ Author-email: Artem Golubin <me@rushter.com>
8
+ License-Expression: MIT
9
+ Project-URL: Repository, https://github.com/rushter/selectolax
10
+ Project-URL: Documentation, https://selectolax.readthedocs.io/en/latest/parser.html
11
+ Project-URL: Changelog, https://github.com/rushter/selectolax/blob/main/CHANGES.md
12
+ Keywords: selectolax,html,parser,css,fast
13
+ Classifier: Development Status :: 5 - Production/Stable
14
+ Classifier: Topic :: Text Processing :: Markup :: HTML
15
+ Classifier: Topic :: Internet
16
+ Classifier: Topic :: Internet :: WWW/HTTP
17
+ Classifier: Intended Audience :: Developers
18
+ Classifier: Natural Language :: English
19
+ Classifier: Programming Language :: Python :: 3
20
+ Classifier: Programming Language :: Python :: 3.9
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Programming Language :: Python :: 3.11
23
+ Classifier: Programming Language :: Python :: 3.12
24
+ Classifier: Programming Language :: Python :: 3.13
25
+ Requires-Python: >=3.9
26
+ Description-Content-Type: text/x-rst
27
+ License-File: LICENSE
28
+ Provides-Extra: cython
29
+ Requires-Dist: Cython; extra == "cython"
30
+ Dynamic: author
31
+ Dynamic: home-page
32
+ Dynamic: license-file
@@ -0,0 +1,26 @@
1
+ selectolax/__init__.py,sha256=iI6pQ10gimevS2gTf4K4_1cXh4NBRFj_5GjkmhrvU94,157
2
+ selectolax/base.pxi,sha256=zOj3BrCA71xd-mJFtkMIAglP4ZybfrHVoCoy6ljTBDQ,93
3
+ selectolax/lexbor.c,sha256=Kz7IFiUGbVTJvAH3WTwu188zD4xQm08Fs6ab6Jo6jyE,2419433
4
+ selectolax/lexbor.cp311-win_arm64.pyd,sha256=MQId555MXy5_gOSYom1MYwGodic5Grov2-8GNrKK41I,3189760
5
+ selectolax/lexbor.pxd,sha256=BcqAzhlUVq0GVWiJHWXNhs4jY-gi6k0BELEnQtSYJAI,21720
6
+ selectolax/lexbor.pyi,sha256=dRNzLXJEbFRR7QcItuX8Ews9E9I6h6G4vA3X1hijzj4,28990
7
+ selectolax/lexbor.pyx,sha256=XLZ2vGwLoWdctnmU-gfizjD6tMjehR_bzNOapDJ_YOQ,12891
8
+ selectolax/parser.c,sha256=zUJAqFbI1vy5-cjgPwJVfYassgbP7Gdnr2eRYv5D3W4,2259231
9
+ selectolax/parser.cp311-win_arm64.pyd,sha256=u4RlwwAmELrofK2sxxPrzosyU-b2s1IA_69-NXzjF1Y,2131968
10
+ selectolax/parser.pxd,sha256=T7GoQdaOkhp_W2TBlRY0tZqom97PkHrytYaXQlyVnbI,25196
11
+ selectolax/parser.pyi,sha256=-qutpjrK1dD4rrl3SsHWQt2FT5lv6meaACkQzk1Bt6o,25612
12
+ selectolax/parser.pyx,sha256=nIWuhaEFRwlfo64WmgrSOM0A8mUw0eWw9j_fWyLV-Ro,14127
13
+ selectolax/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ selectolax/utils.pxi,sha256=hkzKfycdpwH1P-E_pP-9NTGsmiajt6EJNZSlkxlRapA,3623
15
+ selectolax/lexbor/attrs.pxi,sha256=d59V77aGkpp7YsYsd6t_z4-tRnUoQTJZKsvMC8nyttM,3978
16
+ selectolax/lexbor/node.pxi,sha256=KODqPk3yZ_owwdSxqNr2Ih6qAOhu9CJ-jrHtqQJcWmY,33407
17
+ selectolax/lexbor/selection.pxi,sha256=BeUDypw5_P0CTmi-ACLcd7pK2NnG9ASrwWOdLdweAZY,7378
18
+ selectolax/lexbor/util.pxi,sha256=q2EYVNdnROg9y30mWpGwlNA0W00nJ7ZRNEEDrOEG14s,584
19
+ selectolax/modest/node.pxi,sha256=iX_yRPIPVkG0ALW7hEfmXiVperw6RjkSGATkxzLokz0,34691
20
+ selectolax/modest/selection.pxi,sha256=PfHUN1uuNA7YfcxTu7JZjhxevVbFRP1bHd3kyyFdO7E,6703
21
+ selectolax/modest/util.pxi,sha256=zab67Wzo8FcipA2VS8ClptaC19lZirbNqFEGQ3hW2Is,572
22
+ selectolax-0.3.34.dist-info/licenses/LICENSE,sha256=A7Jb3WZcENcLfZRc7QPdm9zJdwfpIyPodPJu-kdMH6E,1087
23
+ selectolax-0.3.34.dist-info/METADATA,sha256=rAqskRB9wMSn7tEZLxZswUJD4wFzN4fizyXjiBv4L4o,1318
24
+ selectolax-0.3.34.dist-info/WHEEL,sha256=_6dVEvfjMkp6KZZXihi2C2UP-ewiZXAMezDMkPqYmGo,101
25
+ selectolax-0.3.34.dist-info/top_level.txt,sha256=e5MuEM2PrQzoDlWetkFli9uXSlxa_ktW5jJEihhaI1c,11
26
+ selectolax-0.3.34.dist-info/RECORD,,
@@ -1,187 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: selectolax
3
- Version: 0.3.32
4
- Summary: Fast HTML5 parser with CSS selectors.
5
- Home-page: https://github.com/rushter/selectolax
6
- Author: Artem Golubin
7
- Author-email: Artem Golubin <me@rushter.com>
8
- License: MIT
9
- Project-URL: Repository, https://github.com/rushter/selectolax
10
- Project-URL: Documentation, https://selectolax.readthedocs.io/en/latest/parser.html
11
- Project-URL: Changelog, https://github.com/rushter/selectolax/blob/main/CHANGES.rst
12
- Keywords: selectolax,html,parser,css,fast
13
- Classifier: Development Status :: 5 - Production/Stable
14
- Classifier: Topic :: Text Processing :: Markup :: HTML
15
- Classifier: Topic :: Internet
16
- Classifier: Topic :: Internet :: WWW/HTTP
17
- Classifier: Intended Audience :: Developers
18
- Classifier: Natural Language :: English
19
- Classifier: Programming Language :: Python :: 3
20
- Classifier: Programming Language :: Python :: 3.9
21
- Classifier: Programming Language :: Python :: 3.10
22
- Classifier: Programming Language :: Python :: 3.11
23
- Classifier: Programming Language :: Python :: 3.12
24
- Classifier: Programming Language :: Python :: 3.13
25
- Requires-Python: >=3.9
26
- Description-Content-Type: text/x-rst
27
- License-File: LICENSE
28
- Provides-Extra: cython
29
- Requires-Dist: Cython; extra == "cython"
30
- Dynamic: author
31
- Dynamic: home-page
32
- Dynamic: license-file
33
-
34
- .. image:: docs/logo.png
35
- :alt: selectolax logo
36
-
37
- -------------------------
38
-
39
- .. image:: https://img.shields.io/pypi/v/selectolax.svg
40
- :target: https://pypi.python.org/pypi/selectolax
41
-
42
- A fast HTML5 parser with CSS selectors using `Modest <https://github.com/lexborisov/Modest/>`_ and
43
- `Lexbor <https://github.com/lexbor/lexbor>`_ engines.
44
-
45
-
46
- Installation
47
- ------------
48
- From PyPI using pip:
49
-
50
- .. code-block:: bash
51
-
52
- pip install selectolax
53
-
54
- If installation fails due to compilation errors, you may need to install `Cython <https://github.com/cython/cython>`_:
55
-
56
- .. code-block:: bash
57
-
58
- pip install selectolax[cython]
59
-
60
- This usually happens when you try to install an outdated version of selectolax on a newer version of Python.
61
-
62
-
63
- Development version from GitHub:
64
-
65
- .. code-block:: bash
66
-
67
- git clone --recursive https://github.com/rushter/selectolax
68
- cd selectolax
69
- pip install -r requirements_dev.txt
70
- python setup.py install
71
-
72
- How to compile selectolax while developing:
73
-
74
- .. code-block:: bash
75
-
76
- make clean
77
- make dev
78
-
79
- Basic examples
80
- --------------
81
-
82
- Here are some basic examples to get you started with selectolax:
83
-
84
- Parsing HTML and extracting text:
85
-
86
- .. code:: python
87
-
88
- In [1]: from selectolax.parser import HTMLParser
89
- ...:
90
- ...: html = """
91
- ...: <h1 id="title" data-updated="20201101">Hi there</h1>
92
- ...: <div class="post">Lorem Ipsum is simply dummy text of the printing and typesetting industry. </div>
93
- ...: <div class="post">Lorem ipsum dolor sit amet, consectetur adipiscing elit.</div>
94
- ...: """
95
- ...: tree = HTMLParser(html)
96
-
97
- In [2]: tree.css_first('h1#title').text()
98
- Out[2]: 'Hi there'
99
-
100
- In [3]: tree.css_first('h1#title').attributes
101
- Out[3]: {'id': 'title', 'data-updated': '20201101'}
102
-
103
- In [4]: [node.text() for node in tree.css('.post')]
104
- Out[4]:
105
- ['Lorem Ipsum is simply dummy text of the printing and typesetting industry. ',
106
- 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.']
107
-
108
- Using advanced CSS selectors:
109
-
110
- .. code:: python
111
-
112
- In [1]: html = "<div><p id=p1><p id=p2><p id=p3><a>link</a><p id=p4><p id=p5>text<p id=p6></div>"
113
- ...: selector = "div > :nth-child(2n+1):not(:has(a))"
114
-
115
- In [2]: for node in HTMLParser(html).css(selector):
116
- ...: print(node.attributes, node.text(), node.tag)
117
- ...: print(node.parent.tag)
118
- ...: print(node.html)
119
- ...:
120
- {'id': 'p1'} p
121
- div
122
- <p id="p1"></p>
123
- {'id': 'p5'} text p
124
- div
125
- <p id="p5">text</p>
126
-
127
-
128
- * `Detailed overview <https://github.com/rushter/selectolax/blob/master/examples/walkthrough.ipynb>`_
129
-
130
- Available backends
131
- ------------------
132
-
133
- Selectolax supports two backends: ``Modest`` and ``Lexbor``. By default, all examples use the Modest backend.
134
- Most of the features between backends are almost identical, but there are still some differences.
135
-
136
- As of 2024, the preferred backend is ``Lexbor``. The ``Modest`` backend is still available for compatibility reasons
137
- and the underlying C library that selectolax uses is not maintained anymore.
138
-
139
-
140
- To use ``lexbor``, just import the parser and use it in the similar way to the `HTMLParser`.
141
-
142
- .. code:: python
143
-
144
- In [1]: from selectolax.lexbor import LexborHTMLParser
145
-
146
- In [2]: html = """
147
- ...: <title>Hi there</title>
148
- ...: <div id="updated">2021-08-15</div>
149
- ...: """
150
-
151
- In [3]: parser = LexborHTMLParser(html)
152
- In [4]: parser.root.css_first("#updated").text()
153
- Out[4]: '2021-08-15'
154
-
155
-
156
- Simple Benchmark
157
- ----------------
158
-
159
- * Extract title, links, scripts and a meta tag from main pages of top 754 domains. See ``examples/benchmark.py`` for more information.
160
-
161
- ============================ ===========
162
- Package Time
163
- ============================ ===========
164
- Beautiful Soup (html.parser) 61.02 sec.
165
- lxml / Beautiful Soup (lxml) 9.09 sec.
166
- html5_parser 16.10 sec.
167
- selectolax (Modest) 2.94 sec.
168
- selectolax (Lexbor) 2.39 sec.
169
- ============================ ===========
170
-
171
- Links
172
- -----
173
-
174
- * `selectolax API reference <https://selectolax.readthedocs.io/en/latest/index.html>`_
175
- * `Video introduction to web scraping using selectolax <https://youtu.be/HpRsfpPuUzE>`_
176
- * `How to Scrape 7k Products with Python using selectolax and httpx <https://www.youtube.com/watch?v=XpGvq755J2U>`_
177
- * `Detailed overview <https://github.com/rushter/selectolax/blob/master/examples/walkthrough.ipynb>`_
178
- * `Modest introduction <https://lexborisov.github.io/Modest/>`_
179
- * `Modest benchmark <https://lexborisov.github.io/benchmark-html-parsers/>`_
180
- * `Python benchmark <https://rushter.com/blog/python-fast-html-parser/>`_
181
- * `Another Python benchmark <https://www.peterbe.com/plog/selectolax-or-pyquery>`_
182
-
183
- License
184
- -------
185
-
186
- * Modest engine — `LGPL2.1 <https://github.com/lexborisov/Modest/blob/master/LICENSE>`_
187
- * selectolax - `MIT <https://github.com/rushter/selectolax/blob/master/LICENSE>`_
@@ -1,26 +0,0 @@
1
- selectolax/__init__.py,sha256=-HUZIEaQkWhTkyAc3ZYkjSig-PwZhip6R_VUo5KmwKk,185
2
- selectolax/base.pxi,sha256=zOj3BrCA71xd-mJFtkMIAglP4ZybfrHVoCoy6ljTBDQ,93
3
- selectolax/lexbor.c,sha256=LMGmz45kAbWJmyUerq7Uo80jdwhz9Q3S5uHo87X38_s,2419892
4
- selectolax/lexbor.cp311-win_arm64.pyd,sha256=zC02oQXe1-chh0I2EwLjNHTBjmm4_pPef9ONAq39c8Y,3186688
5
- selectolax/lexbor.pxd,sha256=cG264E-tFNOFTy0k5bAqV_sZnz4G6a4a21WEhqTL-NI,21516
6
- selectolax/lexbor.pyi,sha256=NS2pI6PL7klkk6xXThHE72Jsi8583xLUdQ6gvCAQovY,27028
7
- selectolax/lexbor.pyx,sha256=-O-g03mLCQKc9F19eMvo3PyoLDtF09IIuFziXJAl6Ao,11520
8
- selectolax/parser.c,sha256=QCsXkpNkYP1shOgzDCyU2gG6_vCXm9QmtCe_iBli73Y,2287009
9
- selectolax/parser.cp311-win_arm64.pyd,sha256=vVLgw1EGfxImnHK8T3wvdkbKBsIcY7jBHRufcQGASaU,2134528
10
- selectolax/parser.pxd,sha256=4pM_CcZlvJlaR8EMjZCnSmnCcJbwcYOldRTBEbfwm48,25145
11
- selectolax/parser.pyi,sha256=5Czf63278MQC01IxY-CHzoDyTS1oHiYRD2OxEscyL1o,25584
12
- selectolax/parser.pyx,sha256=bS2n70o_5OPJ6JuXTBAVUTc-XhxqC4DXzPE4H3-e5Ek,13987
13
- selectolax/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
- selectolax/utils.pxi,sha256=4rtdRcLWuemxN1qe7Eul5jvAmHZ65r7Gvf67_Wg8Bt4,3566
15
- selectolax/lexbor/attrs.pxi,sha256=KvQaCo0jM3Bva6_xG3TGzkCVFjNQ4kgTxWO95gsGbkw,4007
16
- selectolax/lexbor/node.pxi,sha256=BuJIYcTUucvHUP4w2wl90uBwjcqpv1vRyt4No2O_Quo,30921
17
- selectolax/lexbor/selection.pxi,sha256=4I8cjYMjC2Gz7xLrYWrl7jbnwDoVmab-GvrJKiTALTI,7017
18
- selectolax/lexbor/util.pxi,sha256=0I4ElWIwXxrZCfMmGCtyDU127oMsPCqC3IcUk4QmMAc,582
19
- selectolax/modest/node.pxi,sha256=Da2b3cdmggCX736x0htGvac51SEeGCcY5l-LA5H4HNI,34376
20
- selectolax/modest/selection.pxi,sha256=0elY7JwnpPVaw0QZE1T7A78s9FIph5uWIhwy4sEXGU8,6586
21
- selectolax/modest/util.pxi,sha256=o2nPGGGtRlLqOCa7yPk94CfBzNlVr7ull7osFy6NRX4,570
22
- selectolax-0.3.32.dist-info/licenses/LICENSE,sha256=A7Jb3WZcENcLfZRc7QPdm9zJdwfpIyPodPJu-kdMH6E,1087
23
- selectolax-0.3.32.dist-info/METADATA,sha256=mB27GyUKzOKfZK92CEHDlQ6wrwG1um9kesSgMr8yb1c,6402
24
- selectolax-0.3.32.dist-info/WHEEL,sha256=_6dVEvfjMkp6KZZXihi2C2UP-ewiZXAMezDMkPqYmGo,101
25
- selectolax-0.3.32.dist-info/top_level.txt,sha256=e5MuEM2PrQzoDlWetkFli9uXSlxa_ktW5jJEihhaI1c,11
26
- selectolax-0.3.32.dist-info/RECORD,,