selectolax 0.3.24__cp38-cp38-win_amd64.whl → 0.3.25__cp38-cp38-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of selectolax might be problematic. Click here for more details.

Binary file
selectolax/parser.pyi CHANGED
@@ -56,6 +56,7 @@ class Selector:
56
56
  ...
57
57
 
58
58
  class Node:
59
+ parser: "HTMLParser"
59
60
  @property
60
61
  def attributes(self) -> dict[str, None | str]:
61
62
  """Get all attributes that belong to the current node.
@@ -161,6 +162,9 @@ class Node:
161
162
  def insert_after(self, value: str | bytes | None) -> None:
162
163
  """Insert a node after the current Node."""
163
164
  ...
165
def insert_child(self, value: str | bytes | None) -> None:
    """Insert a node inside (at the end of) the current Node."""
    ...
164
168
  @property
165
169
  def raw_value(self) -> bytes:
166
170
  """Return the raw (unparsed, original) value of a node.
@@ -276,3 +280,20 @@ class HTMLParser:
276
280
 
277
281
  This is useful for text extraction."""
278
282
  ...
283
+
284
def create_tag(tag: str) -> "Node":
    """
    Create a single empty node for an HTML tag name.

    Given a tag name, e.g. `"div"`, the returned node corresponds to
    `"<div></div>"`.
    """
    ...
290
+
291
def parse_fragment(html: str) -> list["Node"]:
    """
    Parse HTML into a list of Nodes that correspond to the given HTML.

    For contrast, HTMLParser adds `<html>`, `<head>`, and `<body>` tags
    if they are missing. This function does not add these tags.
    """
    ...
selectolax/parser.pyx CHANGED
@@ -3,6 +3,7 @@ from cpython cimport bool
3
3
 
4
4
  include "modest/selection.pxi"
5
5
  include "modest/node.pxi"
6
+ include "modest/util.pxi"
6
7
  include "utils.pxi"
7
8
 
8
9
  cdef class HTMLParser:
selectolax/utils.pxi CHANGED
@@ -1,5 +1,11 @@
1
+ from typing import Literal, Optional, Union, Type
2
+
1
3
  MAX_HTML_INPUT_SIZE = 250e+7
2
4
 
5
+ ParserCls = Union[Type["HTMLParser"], Type["LexborHTMLParser"]]
6
+ Parser = Union["HTMLParser", "LexborHTMLParser"]
7
+
8
+
3
9
  def preprocess_input(html, decode_errors='ignore'):
4
10
  if isinstance(html, (str, unicode)):
5
11
  bytes_html = html.encode('UTF-8', errors=decode_errors)
@@ -11,3 +17,91 @@ def preprocess_input(html, decode_errors='ignore'):
11
17
  if html_len > MAX_HTML_INPUT_SIZE:
12
18
  raise ValueError("The specified HTML input is too large to be processed (%d bytes)" % html_len)
13
19
  return bytes_html, html_len
20
+
21
+
22
def do_create_tag(tag: str, parser_cls: ParserCls):
    """
    Build one empty element for ``tag`` using the given parser class.

    The tag name is wrapped into ``<tag></tag>`` markup, handed to
    ``do_parse_fragment``, and the first node of the result is returned.

    Raises:
        ValueError: if ``tag`` is empty.
    """
    if tag:
        markup = f"<{tag}></{tag}>"
        nodes = do_parse_fragment(markup, parser_cls)
        return nodes[0]
    raise ValueError("Tag name cannot be empty")
26
+
27
+
28
def get_fragment_type(
    html: str,
    parser_cls: "ParserCls",
    tree: "Optional[Parser]" = None,
) -> Literal["document", "fragment", "head", "body", "head_and_body", "document_no_head", "document_no_body", "document_no_head_no_body"]:
    """
    Classify ``html`` by which of the ``<html>``, ``<head>`` and ``<body>``
    opening tags occur in its text.

    The check is purely textual and case-insensitive: the markup is scanned
    with a regular expression, so a tag that appears inside a comment or an
    attribute value is counted as well.

    Args:
        html: The markup to classify.
        parser_cls: Not consulted by the classification; kept so existing
            callers keep working.
        tree: Not consulted by the classification; kept so existing callers
            keep working.

    Returns:
        ``"document"`` when all three tags are present; one of the
        ``"document_no_*"`` values when ``<html>`` is present but ``<head>``
        and/or ``<body>`` is missing; ``"head"``, ``"body"`` or
        ``"head_and_body"`` when ``<html>`` is absent but those tags are
        present; ``"fragment"`` when none of the three tags occurs.
    """
    import re

    # A tag name ends at whitespace, "/" or ">". The lookahead keeps longer
    # names such as <header> from being mistaken for <head>.
    structural_tag_re = re.compile(r"<(html|head|body)(?=[\s/>])", re.IGNORECASE)

    seen = set()
    for match in structural_tag_re.finditer(html):
        # The pattern is case-insensitive, so normalise before recording;
        # otherwise "<HTML" would match but never be recognised.
        seen.add(match.group(1).lower())
        if len(seen) == 3:
            # All three tags found; nothing more to learn from the rest.
            break

    has_html = "html" in seen
    has_head = "head" in seen
    has_body = "body" in seen

    if has_html:
        if has_head and has_body:
            return "document"
        if has_body:
            return "document_no_head"
        if has_head:
            return "document_no_body"
        return "document_no_head_no_body"

    if has_head and has_body:
        return "head_and_body"
    if has_head:
        return "head"
    if has_body:
        return "body"
    return "fragment"
69
+
70
+
71
def do_parse_fragment(html: str, parser_cls: ParserCls):
    """
    Parse HTML into a list of Nodes that correspond to the given HTML.

    For contrast, HTMLParser adds `<html>`, `<head>`, and `<body>` tags
    if they are missing. This function does not add these tags.

    The input is stripped of surrounding whitespace, parsed with
    ``parser_cls`` and classified with ``get_fragment_type``; the
    classification decides which parts of the parsed tree are decomposed
    and which nodes are returned.
    """
    markup = html.strip()
    tree = parser_cls(markup)
    kind = get_fragment_type(markup, parser_cls, tree)

    # Input without <html>: return the head and/or body element itself.
    if kind == "head_and_body":
        return [tree.head, tree.body]
    if kind == "head":
        tree.body.decompose(recursive=True)
        return [tree.head]
    if kind == "body":
        tree.head.decompose(recursive=True)
        return [tree.body]

    # Input with <html>: decompose whichever of head/body the input did not
    # contain (head first, then body) and return the root.
    drop_head = kind in ("document_no_head", "document_no_head_no_body")
    drop_body = kind in ("document_no_body", "document_no_head_no_body")
    if kind == "document" or drop_head or drop_body:
        if drop_head:
            tree.head.decompose(recursive=True)
        if drop_body:
            tree.body.decompose(recursive=True)
        return [tree.root]

    # Anything else is a bare fragment: collect what iterating head and
    # then body yields, text nodes included.
    nodes = list(tree.head.iter(include_text=True))
    nodes.extend(tree.body.iter(include_text=True))
    return nodes
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: selectolax
3
- Version: 0.3.24
3
+ Version: 0.3.25
4
4
  Summary: Fast HTML5 parser with CSS selectors.
5
5
  Home-page: https://github.com/rushter/selectolax
6
6
  Author: Artem Golubin
@@ -22,6 +22,7 @@ Classifier: Programming Language :: Python :: 3.9
22
22
  Classifier: Programming Language :: Python :: 3.10
23
23
  Classifier: Programming Language :: Python :: 3.11
24
24
  Classifier: Programming Language :: Python :: 3.12
25
+ Classifier: Programming Language :: Python :: 3.13
25
26
  License-File: LICENSE
26
27
  Provides-Extra: cython
27
28
  Requires-Dist: Cython==3.0.11; extra == "cython"
@@ -122,6 +123,10 @@ Available backends
122
123
  Selectolax supports two backends: ``Modest`` and ``Lexbor``. By default, all examples use the Modest backend.
123
124
  Most of the features between backends are almost identical, but there are still some differences.
124
125
 
126
+ As of 2024, the preferred backend is ``Lexbor``. The ``Modest`` backend is still available for compatibility reasons,
127
+ but the underlying C library that selectolax uses for it is no longer maintained.
128
+
129
+
125
130
  To use ``lexbor``, just import the parser and use it in a similar way to `HTMLParser`.
126
131
 
127
132
  .. code:: python
@@ -147,7 +152,7 @@ Simple Benchmark
147
152
  Package Time
148
153
  ============================ ===========
149
154
  Beautiful Soup (html.parser) 61.02 sec.
150
- lxml 9.09 sec.
155
+ lxml / Beautiful Soup (lxml) 9.09 sec.
151
156
  html5_parser 16.10 sec.
152
157
  selectolax (Modest) 2.94 sec.
153
158
  selectolax (Lexbor) 2.39 sec.
@@ -0,0 +1,26 @@
1
+ selectolax/__init__.py,sha256=bnh02Y32fkV7Y_ibTNnzgBjLMKlZDKhtxELxiRpL-JU,185
2
+ selectolax/base.pxi,sha256=zOj3BrCA71xd-mJFtkMIAglP4ZybfrHVoCoy6ljTBDQ,93
3
+ selectolax/lexbor.c,sha256=OXp79KKhS3Zg5JY0cbv05VGROvfgFOnFZAmPs-JdQyY,2354322
4
+ selectolax/lexbor.cp38-win_amd64.pyd,sha256=mCkCq6kWtLscwm0cT5KAkM8p2CIvk4P3GquMiMjFTPc,6932480
5
+ selectolax/lexbor.pxd,sha256=1d9nvZd9rZl27gwPwVV5BlbR2LAi6jDK69Xm9Guz5Kk,21538
6
+ selectolax/lexbor.pyi,sha256=VvVQ1HwZlDK2oXByWE2zFPozRgLL8nyhoXGrNLmwk5Q,5684
7
+ selectolax/lexbor.pyx,sha256=H3-Y78orz2Hop0Qqf8JulJo6f3yBR0kbHRqNPyDOsAc,11097
8
+ selectolax/parser.c,sha256=Y96r4R41n15eztCveImVl1zFJId6TERe9HQp9GFJD04,2215023
9
+ selectolax/parser.cp38-win_amd64.pyd,sha256=Igg3kQa8cb5k-WP2yjV8bRha2t4L0bbuU2Wi69OzuMQ,2143232
10
+ selectolax/parser.pxd,sha256=4pM_CcZlvJlaR8EMjZCnSmnCcJbwcYOldRTBEbfwm48,25145
11
+ selectolax/parser.pyi,sha256=FFpUXXVatpajijEXN2WpKKsesuJv82aE3TZKz_oyY2o,10857
12
+ selectolax/parser.pyx,sha256=lQW4qJ6nCDraCupvcT61zUkgo-S-KIzl9JIGV6hh6hA,13386
13
+ selectolax/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ selectolax/utils.pxi,sha256=GFDsBSjUCNh_Zmi7mKvWN-ww2meWKe955DqXlDqhNHc,3560
15
+ selectolax/lexbor/attrs.pxi,sha256=TEJUCGAkFwb14Emecyx4yljKSJMRmFbq8mOcNs35G_c,3204
16
+ selectolax/lexbor/node.pxi,sha256=P-KXzAk6fOo8ilEKAm0yjFQa90xkAXhWhSdce-YDauo,30213
17
+ selectolax/lexbor/selection.pxi,sha256=nRGiDYvpSAQcsWQ_2Z9-4kqebahIJmKWXQBtd3MPsis,6626
18
+ selectolax/lexbor/util.pxi,sha256=0I4ElWIwXxrZCfMmGCtyDU127oMsPCqC3IcUk4QmMAc,582
19
+ selectolax/modest/node.pxi,sha256=xDJTFhcvkF7FGSObnCftZJSIBj7wV1--IdLJbFoIItw,33519
20
+ selectolax/modest/selection.pxi,sha256=0elY7JwnpPVaw0QZE1T7A78s9FIph5uWIhwy4sEXGU8,6586
21
+ selectolax/modest/util.pxi,sha256=o2nPGGGtRlLqOCa7yPk94CfBzNlVr7ull7osFy6NRX4,570
22
+ selectolax-0.3.25.dist-info/LICENSE,sha256=Gy4WGsmAwV9QtqH0HaBHJQ35bt_0irn77fIt1iBncUo,1087
23
+ selectolax-0.3.25.dist-info/METADATA,sha256=sOZWvfD77Qj0NZcS83InlhhHCfhJlvadw2PYrZu2ScQ,6105
24
+ selectolax-0.3.25.dist-info/WHEEL,sha256=BiKefS7XHc6W0zY4ROW4fxXTZBRH27JLKtM6pqE4hBs,99
25
+ selectolax-0.3.25.dist-info/top_level.txt,sha256=e5MuEM2PrQzoDlWetkFli9uXSlxa_ktW5jJEihhaI1c,11
26
+ selectolax-0.3.25.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.1.0)
2
+ Generator: setuptools (75.2.0)
3
3
  Root-Is-Purelib: false
4
4
  Tag: cp38-cp38-win_amd64
5
5
 
@@ -1,24 +0,0 @@
1
- selectolax/__init__.py,sha256=EZ333_G4h_fipJTzF54Lms67-gm6cAP6ywKd8LNLZUs,185
2
- selectolax/base.pxi,sha256=zOj3BrCA71xd-mJFtkMIAglP4ZybfrHVoCoy6ljTBDQ,93
3
- selectolax/lexbor.c,sha256=QbF8u3wW5Q5VE78CEkhTIgARbFvK9OiAUv_MagPVSvU,2135549
4
- selectolax/lexbor.cp38-win_amd64.pyd,sha256=RhktZ_cTTRgUuuCtU8FvvhKFBw0r-f7A2mQsJCzPAkM,6888960
5
- selectolax/lexbor.pxd,sha256=1d9nvZd9rZl27gwPwVV5BlbR2LAi6jDK69Xm9Guz5Kk,21538
6
- selectolax/lexbor.pyi,sha256=BTeyX7yNXYERH2tO3M623j_xpGTRils6pfQv1g50ITY,5067
7
- selectolax/lexbor.pyx,sha256=LY6k1YmL5WOfSaciSr0pf_vf5MX64gJFPjdGINRlUWg,11070
8
- selectolax/parser.c,sha256=QFGyWiy4ahdGzijmS8Rq3GYWHRVVLC5wXvH2UpDZzSQ,1996131
9
- selectolax/parser.cp38-win_amd64.pyd,sha256=tvYAzgsmbnhuxYn2LFrhsHNgElIZX9Iphb1V_Ur597U,2096640
10
- selectolax/parser.pxd,sha256=4pM_CcZlvJlaR8EMjZCnSmnCcJbwcYOldRTBEbfwm48,25145
11
- selectolax/parser.pyi,sha256=G9u1rUaqEogyz1R1qkbrtjxTBLPYNfJgR0DwtD_QEQY,10186
12
- selectolax/parser.pyx,sha256=VC4lwfu86f0uh3z515IkPUgOe9FwtrjFzNrnvhot7iE,13359
13
- selectolax/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
- selectolax/utils.pxi,sha256=pyiggj5At9fABheMq1Gx2Nx0O2U7kfAPgLAzb_JXKpc,557
15
- selectolax/lexbor/attrs.pxi,sha256=TEJUCGAkFwb14Emecyx4yljKSJMRmFbq8mOcNs35G_c,3204
16
- selectolax/lexbor/node.pxi,sha256=P-KXzAk6fOo8ilEKAm0yjFQa90xkAXhWhSdce-YDauo,30213
17
- selectolax/lexbor/selection.pxi,sha256=nRGiDYvpSAQcsWQ_2Z9-4kqebahIJmKWXQBtd3MPsis,6626
18
- selectolax/modest/node.pxi,sha256=xDJTFhcvkF7FGSObnCftZJSIBj7wV1--IdLJbFoIItw,33519
19
- selectolax/modest/selection.pxi,sha256=0elY7JwnpPVaw0QZE1T7A78s9FIph5uWIhwy4sEXGU8,6586
20
- selectolax-0.3.24.dist-info/LICENSE,sha256=Gy4WGsmAwV9QtqH0HaBHJQ35bt_0irn77fIt1iBncUo,1087
21
- selectolax-0.3.24.dist-info/METADATA,sha256=JXm1yZt3pSsSkdX8CJnf7kvQDem7_r1Z2JTiKiy0Gho,5853
22
- selectolax-0.3.24.dist-info/WHEEL,sha256=1rwUt2BQv3N5S0pE0N9WlEu1uiKd-mQhEp4H-4r377U,99
23
- selectolax-0.3.24.dist-info/top_level.txt,sha256=e5MuEM2PrQzoDlWetkFli9uXSlxa_ktW5jJEihhaI1c,11
24
- selectolax-0.3.24.dist-info/RECORD,,