selectolax 0.3.30__cp310-cp310-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of selectolax might be problematic. Click here for more details.

selectolax/lexbor.pyi ADDED
@@ -0,0 +1,177 @@
1
+ from typing import Any, Iterator, Literal, TypeVar, NoReturn, overload
2
+
3
+ DefaultT = TypeVar("DefaultT")
4
+
5
class LexborAttributes:
    """Typed interface of the mapping-like attribute container.

    Keys are annotated as ``str`` and values as ``str | None``.
    """

    @staticmethod
    def create(node: LexborAttributes) -> LexborAttributes: ...

    # Container protocol.
    def __len__(self) -> int: ...
    def __iter__(self) -> Iterator[str]: ...
    def __contains__(self, key: str) -> bool: ...
    def __getitem__(self, key: str) -> str | None: ...
    def __setitem__(self, key: str, value: str) -> None: ...
    def __delitem__(self, key: str) -> None: ...
    def __repr__(self) -> str: ...

    # Dict-style iteration helpers.
    def keys(self) -> Iterator[str]: ...
    def values(self) -> Iterator[str | None]: ...
    def items(self) -> Iterator[tuple[str, str | None]]: ...

    # Lookups with a fallback. Overload order matters to type checkers,
    # so it is kept exactly as declared.
    @overload
    def get(self, key: str, default: DefaultT) -> DefaultT | str | None: ...
    @overload
    def get(self, key: str, default: None = ...) -> str | None: ...
    @overload
    def sget(self, key: str, default: str | DefaultT) -> str | DefaultT: ...
    @overload
    def sget(self, key: str, default: str = "") -> str: ...
26
+
27
class LexborSelector:
    """Typed interface of the chainable selector object.

    The filter methods return ``LexborSelector`` so calls can be chained;
    the ``any_*`` variants return a ``bool`` instead.
    """

    # ``LexborHTMLParser.select`` forwards its optional ``query`` (default
    # ``None``) straight to this constructor, so ``None`` has to be accepted.
    def __init__(self, node: LexborNode, query: str | None) -> None: ...

    # Annotated ``NoReturn``: this method never returns normally.
    def css(self, query: str) -> NoReturn: ...

    @property
    def matches(self) -> list[LexborNode]: ...
    @property
    def any_matches(self) -> bool: ...

    def text_contains(
        self, text: str, deep: bool = True, separator: str = "", strip: bool = False
    ) -> LexborSelector: ...
    def any_text_contains(
        self, text: str, deep: bool = True, separator: str = "", strip: bool = False
    ) -> bool: ...
    def attribute_longer_than(
        self, attribute: str, length: int, start: str | None = None
    ) -> LexborSelector: ...
    def any_attribute_longer_than(
        self, attribute: str, length: int, start: str | None = None
    ) -> bool: ...
46
+
47
class LexborCSSSelector:
    """Typed interface of the CSS selector engine wrapper."""

    # Explicit ``-> None`` so strict type checkers treat the constructor as typed.
    def __init__(self) -> None: ...
    def find(self, query: str, node: LexborNode) -> list[LexborNode]: ...
    def any_matches(self, query: str, node: LexborNode) -> bool: ...
51
+
52
class LexborNode:
    """Typed interface of a single node in a lexbor-parsed tree."""

    parser: LexborHTMLParser

    # --- identity and navigation -------------------------------------------
    @property
    def mem_id(self) -> int: ...
    @property
    def child(self) -> LexborNode | None: ...
    @property
    def first_child(self) -> LexborNode | None: ...
    @property
    def parent(self) -> LexborNode | None: ...
    @property
    def next(self) -> LexborNode | None: ...
    @property
    def prev(self) -> LexborNode | None: ...
    @property
    def last_child(self) -> LexborNode | None: ...
    @property
    def html(self) -> str | None: ...
    def __hash__(self) -> int: ...

    # --- text extraction ----------------------------------------------------
    def text_lexbor(self) -> str: ...
    def text(
        self, deep: bool = True, separator: str = "", strip: bool = False
    ) -> str: ...

    # --- CSS queries --------------------------------------------------------
    def css(self, query: str) -> list[LexborNode]: ...
    # Only an explicit ``strict=True`` narrows the result to ``LexborNode``.
    # ``strict`` must not carry a default in these two overloads: the other
    # overloads declare ``strict: bool = False``, and a defaulted
    # ``Literal[True]`` would make a plain ``css_first(query)`` look non-optional.
    @overload
    def css_first(
        self, query: str, default: Any, strict: Literal[True]
    ) -> LexborNode: ...
    @overload
    def css_first(
        self, query: str, default: Any = ..., *, strict: Literal[True]
    ) -> LexborNode: ...
    @overload
    def css_first(
        self, query: str, default: DefaultT, strict: bool = False
    ) -> LexborNode | DefaultT: ...
    @overload
    def css_first(
        self, query: str, default: None = ..., strict: bool = False
    ) -> LexborNode | None: ...
    # ``tuple[str, ...]`` accepts a tuple of any length; ``tuple[str]`` would
    # only accept a 1-tuple.
    def any_css_matches(self, selectors: tuple[str, ...]) -> bool: ...
    def css_matches(self, selector: str) -> bool: ...

    # --- tag and attributes -------------------------------------------------
    @property
    def tag_id(self) -> int: ...
    @property
    def tag(self) -> str | None: ...
    @property
    def attributes(self) -> dict[str, str | None]: ...
    @property
    def attrs(self) -> LexborAttributes: ...
    @property
    def id(self) -> str | None: ...

    # --- traversal ----------------------------------------------------------
    def iter(self, include_text: bool = False) -> Iterator[LexborNode]: ...
    def traverse(self, include_text: bool = False) -> Iterator[LexborNode]: ...

    # --- tree mutation ------------------------------------------------------
    def decompose(self, recursive: bool = True) -> None: ...
    def remove(self, recursive: bool = True) -> None: ...
    def strip_tags(self, tags: list[str], recursive: bool = False) -> None: ...
    def unwrap(self) -> None: ...
    def unwrap_tags(self, tags: list[str], delete_empty: bool = False) -> None: ...
    def replace_with(self, value: bytes | str | LexborNode) -> None: ...
    def insert_before(self, value: bytes | str | LexborNode) -> None: ...
    def insert_after(self, value: bytes | str | LexborNode) -> None: ...
    def insert_child(self, value: bytes | str | LexborNode) -> None: ...

    # --- misc ---------------------------------------------------------------
    # Annotated ``NoReturn``: accessing this property never returns normally.
    @property
    def raw_value(self) -> NoReturn: ...
    def scripts_contain(self, query: str) -> bool: ...
    # ``LexborHTMLParser.script_srcs_contain`` calls this name on its root node.
    def script_srcs_contain(self, queries: tuple[str, ...]) -> bool: ...
    # NOTE(review): spelling from the original stub, kept for backward
    # compatibility -- confirm that it actually exists on the runtime class.
    def scripts_srcs_contain(self, queries: tuple[str, ...]) -> bool: ...
    def select(self, query: str | None = None) -> LexborSelector: ...
    @property
    def text_content(self) -> str | None: ...
118
+
119
class LexborHTMLParser:
    """Typed interface of the lexbor-backed HTML parser."""

    def __init__(self, html: str | bytes) -> None: ...

    # --- document access ----------------------------------------------------
    @property
    def selector(self) -> LexborCSSSelector: ...
    @property
    def root(self) -> LexborNode | None: ...
    @property
    def body(self) -> LexborNode | None: ...
    @property
    def head(self) -> LexborNode | None: ...
    @property
    def html(self) -> str | None: ...
    def tags(self, name: str) -> list[LexborNode]: ...
    def text(
        self, deep: bool = True, separator: str = "", strip: bool = False
    ) -> str: ...

    # --- CSS queries --------------------------------------------------------
    def css(self, query: str) -> list[LexborNode]: ...
    # Only an explicit ``strict=True`` narrows the result to ``LexborNode``.
    # The implementation defaults ``strict`` to ``False``, so ``strict`` must
    # not carry a default in these two overloads; otherwise a plain
    # ``css_first(query)`` would be typed as never returning ``None``.
    @overload
    def css_first(
        self, query: str, default: Any, strict: Literal[True]
    ) -> LexborNode: ...
    @overload
    def css_first(
        self, query: str, default: Any = ..., *, strict: Literal[True]
    ) -> LexborNode: ...
    @overload
    def css_first(
        self, query: str, default: DefaultT, strict: bool = False
    ) -> LexborNode | DefaultT: ...
    @overload
    def css_first(
        self, query: str, default: None = ..., strict: bool = False
    ) -> LexborNode | None: ...
    def select(self, query: str | None = None) -> LexborSelector | None: ...
    # ``tuple[str, ...]`` accepts a tuple of any length; ``tuple[str]`` would
    # only accept a 1-tuple.
    def any_css_matches(self, selectors: tuple[str, ...]) -> bool: ...
    def css_matches(self, selector: str) -> bool: ...
    def scripts_contain(self, query: str) -> bool: ...
    # The implementation defines ``script_srcs_contain`` (singular "script");
    # the stub previously declared the misspelled ``scripts_srcs_contain``.
    def script_srcs_contain(self, queries: tuple[str, ...]) -> bool: ...

    # --- tree mutation and copying ------------------------------------------
    def strip_tags(self, tags: list[str], recursive: bool = False) -> None: ...
    def unwrap_tags(self, tags: list[str], delete_empty: bool = False) -> None: ...
    def clone(self) -> LexborHTMLParser: ...
156
+
157
def create_tag(tag: str) -> LexborNode:
    """Create a single empty node for the given HTML tag name.

    For example, the tag name `"div"` yields a node equivalent to
    `"<div></div>"`.
    """
    ...
163
+
164
def parse_fragment(html: str) -> list[LexborNode]:
    """Parse HTML into a list of nodes that correspond to the given markup.

    Unlike HTMLParser, which adds `<html>`, `<head>`, and `<body>` tags
    when they are missing, this function does not add these tags.
    """
    ...
173
+
174
+
175
class SelectolaxError(Exception):
    """Exception type used to signal an error."""
selectolax/lexbor.pyx ADDED
@@ -0,0 +1,357 @@
1
+ from cpython cimport bool
2
+
3
+ _ENCODING = 'UTF-8'
4
+
5
+ include "base.pxi"
6
+ include "utils.pxi"
7
+ include "lexbor/attrs.pxi"
8
+ include "lexbor/node.pxi"
9
+ include "lexbor/selection.pxi"
10
+ include "lexbor/util.pxi"
11
+
12
+ # We don't inherit from HTMLParser here, because it also includes all the C code from Modest.
13
+
14
cdef class LexborHTMLParser:
    """The lexbor HTML parser.

    Use this class to parse raw HTML.

    This parser mimics most of the ``HTMLParser`` API but does not inherit from it.

    Parameters
    ----------

    html : str (unicode) or bytes
    """
    def __init__(self, html):
        cdef size_t html_len

        bytes_html, html_len = preprocess_input(html)
        self._parse_html(bytes_html, html_len)
        # Keep the input bytes; ``clone`` hands them to the copy.
        self.raw_html = bytes_html
        self._selector = None

    @property
    def selector(self):
        """Return the ``LexborCSSSelector`` of this parser, creating it on first access."""
        if self._selector is None:
            self._selector = LexborCSSSelector()
        return self._selector

    cdef _parse_html(self, char *html, size_t html_len):
        """Create the lexbor document and parse ``html`` into it.

        Raises
        ------
        SelectolaxError
            If the document cannot be created or the HTML cannot be parsed.
        """
        cdef lxb_status_t status

        with nogil:
            self.document = lxb_html_document_create()

        if self.document == NULL:
            raise SelectolaxError("Failed to initialize object for HTML Document.")

        with nogil:
            status = lxb_html_document_parse(self.document, <lxb_char_t *> html, html_len)
        # A non-zero status means the parse failed. The document stays attached
        # to ``self`` and is released by ``__dealloc__``.
        if status != 0x0000:
            raise SelectolaxError("Can't parse HTML.")

    def __dealloc__(self):
        if self.document != NULL:
            lxb_html_document_destroy(self.document)

    def __repr__(self):
        # ``root`` (and its ``html``) may be None; never let ``repr`` raise.
        root = self.root
        markup = root.html if root is not None else None
        return '<LexborHTMLParser chars=%s>' % (len(markup) if markup is not None else 0)

    @property
    def root(self):
        """Returns root node."""
        if self.document == NULL:
            return None
        return LexborNode()._cinit(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)

    @property
    def body(self):
        """Returns document body."""
        cdef lxb_html_body_element_t* body
        body = lxb_html_document_body_element_noi(self.document)
        if body == NULL:
            return None
        return LexborNode()._cinit(<lxb_dom_node_t *> body, self)

    @property
    def head(self):
        """Returns document head."""
        cdef lxb_html_head_element_t* head
        head = lxb_html_document_head_element_noi(self.document)
        if head == NULL:
            return None
        return LexborNode()._cinit(<lxb_dom_node_t *> head, self)

    def tags(self, str name):
        """Returns a list of tags that match specified name.

        Parameters
        ----------
        name : str (e.g. div)

        Returns
        -------
        nodes : list of `LexborNode` objects
            Empty if the DOM collection cannot be allocated.

        Raises
        ------
        ValueError
            If ``name`` is empty or longer than 100 characters.
        SelectolaxError
            If the element lookup fails.
        """
        cdef lxb_dom_collection_t* collection = NULL
        cdef lxb_status_t status
        cdef size_t i

        if not name:
            raise ValueError("Tag name cannot be empty")
        if len(name) > 100:
            raise ValueError("Tag name is too long")

        pybyte_name = name.encode('UTF-8')

        result = list()
        collection = lxb_dom_collection_make(&self.document.dom_document, 128)

        if collection == NULL:
            return result
        status = lxb_dom_elements_by_tag_name(
            <lxb_dom_element_t *> self.document,
            collection,
            <lxb_char_t *> pybyte_name,
            len(pybyte_name)
        )
        if status != 0x0000:
            # Release the collection before propagating the failure.
            lxb_dom_collection_destroy(collection, <bint> True)
            raise SelectolaxError("Can't locate elements.")

        for i in range(lxb_dom_collection_length_noi(collection)):
            node = LexborNode()._cinit(
                <lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i),
                self
            )
            result.append(node)
        lxb_dom_collection_destroy(collection, <bint> True)
        return result

    def text(self, bool deep=True, str separator='', bool strip=False):
        """Returns the text of the node including text of all its child nodes.

        Parameters
        ----------
        strip : bool, default False
            If true, calls ``str.strip()`` on each text part to remove extra white spaces.
        separator : str, default ''
            The separator to use when joining text from different nodes.
        deep : bool, default True
            If True, includes text from all child nodes.

        Returns
        -------
        text : str
            Empty string if the document has no body.

        """
        # Fetch the body once; every ``self.body`` access builds a new wrapper.
        body = self.body
        if body is None:
            return ""
        return body.text(deep=deep, separator=separator, strip=strip)

    @property
    def html(self):
        """Return HTML representation of the page."""
        if self.document == NULL:
            return None
        node = LexborNode()._cinit(<lxb_dom_node_t *> &self.document.dom_document, self)
        return node.html

    def css(self, str query):
        """A CSS selector.

        Matches pattern `query` against HTML tree.
        `CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.

        Parameters
        ----------
        query : str
            CSS selector (e.g. "div > :nth-child(2n+1):not(:has(a))").

        Returns
        -------
        selector : list of `Node` objects
        """
        return self.root.css(query)

    def css_first(self, str query, default=None, strict=False):
        """Same as `css` but returns only the first match.

        Parameters
        ----------

        query : str
        default : Any, default None
            Default value to return if there is no match.
        strict: bool, default False
            Set to True if you want to check if there is strictly only one match in the document.


        Returns
        -------
        selector : `LexborNode` object
        """
        return self.root.css_first(query, default, strict)

    def strip_tags(self, list tags, bool recursive = False):
        """Remove specified tags from the node.

        Parameters
        ----------
        tags : list of str
            List of tags to remove.
        recursive : bool, default False
            Whether to delete all its child nodes

        Raises
        ------
        SelectolaxError
            If the DOM collection cannot be created or the element lookup fails.

        Examples
        --------

        >>> tree = LexborHTMLParser('<html><head></head><body><script></script><div>Hello world!</div></body></html>')
        >>> tags = ['head', 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes']
        >>> tree.strip_tags(tags)
        >>> tree.html
        '<html><body><div>Hello world!</div></body></html>'

        """
        cdef lxb_dom_collection_t* collection = NULL
        cdef lxb_status_t status
        cdef size_t i

        for tag in tags:
            pybyte_name = tag.encode('UTF-8')

            # A fresh collection is built (and destroyed) for every tag name.
            collection = lxb_dom_collection_make(&self.document.dom_document, 128)

            if collection == NULL:
                raise SelectolaxError("Can't initialize DOM collection.")

            status = lxb_dom_elements_by_tag_name(
                <lxb_dom_element_t *> self.document,
                collection,
                <lxb_char_t *> pybyte_name,
                len(pybyte_name)
            )
            if status != 0x0000:
                lxb_dom_collection_destroy(collection, <bint> True)
                raise SelectolaxError("Can't locate elements.")

            for i in range(lxb_dom_collection_length_noi(collection)):
                if recursive:
                    lxb_dom_node_destroy_deep(<lxb_dom_node_t *> lxb_dom_collection_element_noi(collection, i))
                else:
                    lxb_dom_node_destroy(<lxb_dom_node_t *> lxb_dom_collection_element_noi(collection, i))
            lxb_dom_collection_destroy(collection, <bint> True)

    def select(self, query=None):
        """Select nodes given a CSS selector.

        Works similarly to the ``css`` method, but supports chained filtering and extra features.

        Parameters
        ----------
        query : str or None
            The CSS selector to use when searching for nodes.

        Returns
        -------
        selector : The `Selector` class, or None when the root node is falsy.
        """
        cdef LexborNode node
        node = self.root
        if not node:
            return None
        return LexborSelector(node, query)

    def any_css_matches(self, tuple selectors):
        """Returns True if any of the specified CSS selectors matches a node."""
        return self.root.any_css_matches(selectors)

    def scripts_contain(self, str query):
        """Returns True if any of the script tags contain specified text.

        Caches script tags on the first call to improve performance.

        Parameters
        ----------
        query : str
            The query to check.

        """
        return self.root.scripts_contain(query)

    def script_srcs_contain(self, tuple queries):
        """Returns True if any of the script SRCs attributes contain one of the specified text.

        Caches values on the first call to improve performance.

        Parameters
        ----------
        queries : tuple of str

        """
        return self.root.script_srcs_contain(queries)

    def css_matches(self, str selector):
        """Delegate to ``css_matches`` of the root node."""
        return self.root.css_matches(selector)

    @staticmethod
    cdef LexborHTMLParser from_document(lxb_html_document_t *document, bytes raw_html):
        # ``__new__`` bypasses ``__init__``, so nothing is parsed here: the
        # instance adopts ``document`` and every Python-level field is set by hand.
        obj = <LexborHTMLParser> LexborHTMLParser.__new__(LexborHTMLParser)
        obj.document = document
        obj.raw_html = raw_html
        obj.cached_script_texts = None
        obj.cached_script_srcs = None
        obj._selector = None
        return obj

    def clone(self):
        """Clone the current tree.

        Raises
        ------
        SelectolaxError
            If the new document cannot be created or the tree cannot be imported.
        """
        cdef lxb_html_document_t* cloned_document
        cdef lxb_dom_node_t* cloned_node

        with nogil:
            cloned_document = lxb_html_document_create()

        if cloned_document == NULL:
            raise SelectolaxError("Can't create a new document")

        cloned_document.ready_state = LXB_HTML_DOCUMENT_READY_STATE_COMPLETE

        with nogil:
            cloned_node = lxb_dom_document_import_node(
                &cloned_document.dom_document,
                <lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document),
                <bint> True
            )

        if cloned_node == NULL:
            # No parser object owns the new document yet, so it has to be
            # released here or it leaks.
            lxb_html_document_destroy(cloned_document)
            raise SelectolaxError("Can't create a new document")

        with nogil:
            lxb_dom_node_insert_child(<lxb_dom_node_t *> cloned_document, cloned_node)

        return LexborHTMLParser.from_document(cloned_document, self.raw_html)

    def unwrap_tags(self, list tags, delete_empty = False):
        """Unwraps specified tags from the HTML tree.

        Works the same as the ``unwrap`` method, but applied to a list of tags.

        Parameters
        ----------
        tags : list
            List of tags to remove.
        delete_empty : bool
            Whether to delete empty tags.

        Examples
        --------

        >>> tree = LexborHTMLParser('<div><a href="">Hello</a> <i>world</i>!</div>')
        >>> tree.body.unwrap_tags(['i','a'])
        >>> tree.body.html
        '<body><div>Hello world!</div></body>'
        """
        # Fetch the root once; every ``self.root`` access builds a new wrapper.
        root = self.root
        if root is not None:
            root.unwrap_tags(tags, delete_empty=delete_empty)