selectolax 0.3.28__cp38-cp38-musllinux_1_2_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of selectolax might be problematic. Click here for more details.

selectolax/lexbor.pyi ADDED
@@ -0,0 +1,172 @@
1
+ from typing import Any, Iterator, Literal, TypeVar, NoReturn, overload
2
+
3
+ DefaultT = TypeVar("DefaultT")
4
+
5
+ class LexborAttributes:  # Mapping-style interface: str keys, values typed as str | None.
6
+ @staticmethod
7
+ def create(node: LexborAttributes) -> LexborAttributes: ...  # NOTE(review): parameter is named `node` but annotated LexborAttributes — confirm the intended type.
8
+ def keys(self) -> Iterator[str]: ...
9
+ def items(self) -> Iterator[tuple[str, str | None]]: ...
10
+ def values(self) -> Iterator[str | None]: ...
11
+ def __iter__(self) -> Iterator[str]: ...
12
+ def __len__(self) -> int: ...
13
+ def __getitem__(self, key: str) -> str | None: ...
14
+ def __setitem__(self, key: str, value: str) -> None: ...
15
+ def __delitem__(self, key: str) -> None: ...
16
+ def __contains__(self, key: str) -> bool: ...
17
+ def __repr__(self) -> str: ...
18
+ @overload
19
+ def get(self, key: str, default: DefaultT) -> DefaultT | str | None: ...
20
+ @overload
21
+ def get(self, key: str, default: None = ...) -> str | None: ...
22
+ @overload
23
+ def sget(self, key: str, default: str | DefaultT) -> str | DefaultT: ...  # Unlike get(), sget's annotations add no implicit None: the result is str or the caller's default.
24
+ @overload
25
+ def sget(self, key: str, default: str = "") -> str: ...  # With no explicit default the annotated fallback is the empty string.
26
+
27
+ class LexborSelector:  # Chainable selection object: filter methods return LexborSelector, any_* methods return bool.
28
+ def __init__(self, node: LexborNode, query: str) -> None: ...
29
+ def css(self, query: str) -> NoReturn: ...  # Annotated NoReturn, i.e. never returns normally; presumably always raises — confirm in the implementation.
30
+ @property
31
+ def matches(self) -> list[LexborNode]: ...
32
+ @property
33
+ def any_matches(self) -> bool: ...
34
+ def text_contains(
35
+ self, text: str, deep: bool = True, separator: str = "", strip: bool = False
36
+ ) -> LexborSelector: ...
37
+ def any_text_contains(
38
+ self, text: str, deep: bool = True, separator: str = "", strip: bool = False
39
+ ) -> bool: ...
40
+ def attribute_longer_than(
41
+ self, attribute: str, length: int, start: str | None = None
42
+ ) -> LexborSelector: ...
43
+ def any_attribute_longer_than(
44
+ self, attribute: str, length: int, start: str | None = None
45
+ ) -> bool: ...
46
+
47
+ class LexborCSSSelector:  # Runs a CSS query string against a given LexborNode.
48
+ def __init__(self) -> None: ...
49
+ def find(self, query: str, node: LexborNode) -> list[LexborNode]: ...
50
+ def any_matches(self, query: str, node: LexborNode) -> bool: ...
51
+
52
+ class LexborNode:
53
+ parser: LexborHTMLParser
54
+ @property
55
+ def mem_id(self) -> int: ...
56
+ @property
57
+ def child(self) -> LexborNode | None: ...
58
+ @property
59
+ def first_child(self) -> LexborNode | None: ...
60
+ @property
61
+ def parent(self) -> LexborNode | None: ...
62
+ @property
63
+ def next(self) -> LexborNode | None: ...
64
+ @property
65
+ def prev(self) -> LexborNode | None: ...
66
+ @property
67
+ def last_child(self) -> LexborNode | None: ...
68
+ @property
69
+ def html(self) -> str | None: ...
70
+ def __hash__(self) -> int: ...
71
+ def text_lexbor(self) -> str: ...
72
+ def text(
73
+ self, deep: bool = True, separator: str = "", strip: bool = False
74
+ ) -> str: ...
75
+ def css(self, query: str) -> list[LexborNode]: ...
76
+ @overload
77
+ def css_first(
78
+ self, query: str, default: Any = ..., strict: Literal[True] = ...
79
+ ) -> LexborNode: ...
80
+ @overload
81
+ def css_first(
82
+ self, query: str, default: DefaultT, strict: bool = False
83
+ ) -> LexborNode | DefaultT: ...
84
+ @overload
85
+ def css_first(
86
+ self, query: str, default: None = ..., strict: bool = False
87
+ ) -> LexborNode | None: ...
88
+ def any_css_matches(self, selectors: tuple[str]) -> bool: ...
89
+ def css_matches(self, selector: str) -> bool: ...
90
+ @property
91
+ def tag_id(self) -> int: ...
92
+ @property
93
+ def tag(self) -> str | None: ...
94
+ def decompose(self, recursive: bool = True) -> None: ...
95
+ def strip_tags(self, tags: list[str], recursive: bool = False) -> None: ...
96
+ @property
97
+ def attributes(self) -> dict[str, str | None]: ...
98
+ @property
99
+ def attrs(self) -> LexborAttributes: ...
100
+ @property
101
+ def id(self) -> str | None: ...
102
+ def iter(self, include_text: bool = False) -> Iterator[LexborNode]: ...
103
+ def unwrap(self) -> None: ...
104
+ def unwrap_tags(self, tags: list[str]) -> None: ...
105
+ def traverse(self, include_text: bool = False) -> Iterator[LexborNode]: ...
106
+ def replace_with(self, value: bytes | str | LexborNode) -> None: ...
107
+ def insert_before(self, value: bytes | str | LexborNode) -> None: ...
108
+ def insert_after(self, value: bytes | str | LexborNode) -> None: ...
109
+ def insert_child(self, value: bytes | str | LexborNode) -> None: ...
110
+ @property
111
+ def raw_value(self) -> NoReturn: ...
112
+ def scripts_contain(self, query: str) -> bool: ...
113
+ def scripts_srcs_contain(self, queries: tuple[str]) -> bool: ...
114
+ def remove(self, recursive: bool = True) -> None: ...
115
+ def select(self, query: str | None = None) -> LexborSelector: ...
116
+ @property
117
+ def text_content(self) -> str | None: ...
118
+
119
+ class LexborHTMLParser:
120
+ def __init__(self, html: str| bytes ): ...
121
+ @property
122
+ def selector(self) -> "LexborCSSSelector": ...
123
+ @property
124
+ def root(self) -> LexborNode | None: ...
125
+ @property
126
+ def body(self) -> LexborNode | None: ...
127
+ @property
128
+ def head(self) -> LexborNode | None: ...
129
+ def tags(self, name: str) -> list[LexborNode]: ...
130
+ def text(
131
+ self, deep: bool = True, separator: str = "", strip: bool = False
132
+ ) -> str: ...
133
+ @property
134
+ def html(self) -> str | None: ...
135
+ def css(self, query: str) -> list[LexborNode]: ...
136
+ @overload
137
+ def css_first(
138
+ self, query: str, default: Any = ..., strict: Literal[True] = ...
139
+ ) -> LexborNode: ...
140
+ @overload
141
+ def css_first(
142
+ self, query: str, default: DefaultT, strict: bool = False
143
+ ) -> LexborNode | DefaultT: ...
144
+ @overload
145
+ def css_first(
146
+ self, query: str, default: None = ..., strict: bool = False
147
+ ) -> LexborNode | None: ...
148
+ def strip_tags(self, tags: list[str], recursive: bool = False) -> None: ...
149
+ def select(self, query: str | None = None) -> LexborSelector | None: ...
150
+ def any_css_matches(self, selectors: tuple[str]) -> bool: ...
151
+ def scripts_contain(self, query: str) -> bool: ...
152
+ def scripts_srcs_contain(self, queries: tuple[str]) -> bool: ...
153
+ def css_matches(self, selector: str) -> bool: ...
154
+ def clone(self) -> LexborHTMLParser: ...
155
+ def unwrap_tags(self, tags: list[str]) -> None: ...
156
+
157
+ def create_tag(tag: str) -> LexborNode:
158
+ """
159
+ Create a single empty node for the given HTML tag name: for example,
160
+ the tag name `"div"` produces the node `"<div></div>"`.
161
+ """
162
+ ...
163
+
164
+ def parse_fragment(html: str) -> list[LexborNode]:
165
+ """
166
+ Parse the given HTML into a list of nodes, such that the nodes
167
+ correspond to the given HTML.
168
+
169
+ In contrast, HTMLParser adds `<html>`, `<head>`, and `<body>` tags
170
+ if they are missing. This function does not add these tags.
171
+ """
172
+ ...
selectolax/lexbor.pyx ADDED
@@ -0,0 +1,349 @@
1
+ from cpython cimport bool
2
+
3
+ _ENCODING = 'UTF-8'
4
+
5
+ include "base.pxi"
6
+ include "utils.pxi"
7
+ include "lexbor/attrs.pxi"
8
+ include "lexbor/node.pxi"
9
+ include "lexbor/selection.pxi"
10
+ include "lexbor/util.pxi"
11
+
12
+ # We don't inherit from HTMLParser here, because it also includes all the C code from Modest.
13
+
14
+ cdef class LexborHTMLParser:
15
+ """The lexbor HTML parser.
16
+
17
+ Use this class to parse raw HTML.
18
+
19
+     This parser mimics most of the ``HTMLParser`` API but does not inherit from it directly.
20
+
21
+ Parameters
22
+ ----------
23
+
24
+ html : str (unicode) or bytes
25
+ """
26
+     def __init__(self, html):  # Parses `html` right away and keeps the preprocessed bytes on the instance.
27
+
28
+         cdef size_t html_len
29
+         cdef char* html_chars  # NOTE(review): declared but not referenced anywhere in this method.
30
+
31
+         bytes_html, html_len = preprocess_input(html)  # helper from an included .pxi; presumably normalizes str/bytes input to bytes — confirm.
32
+         self._parse_html(bytes_html, html_len)
33
+         self.raw_html = bytes_html  # the same bytes that were handed to the parser
34
+         self._selector = None  # filled in on first access of the `selector` property
35
+
36
+     @property
37
+     def selector(self):  # Creates a LexborCSSSelector on first access and returns the same object afterwards.
38
+         if self._selector is None:
39
+             self._selector = LexborCSSSelector()
40
+         return self._selector
41
+
42
+
43
+     cdef _parse_html(self, char *html, size_t html_len):  # Creates self.document and parses `html_len` bytes of `html` into it; raises SelectolaxError on failure.
44
+         cdef lxb_status_t status
45
+
46
+         with nogil:  # the C-level document creation runs without the GIL
47
+             self.document = lxb_html_document_create()
48
+
49
+         if self.document == NULL:
50
+             raise SelectolaxError("Failed to initialize object for HTML Document.")
51
+
52
+         with nogil:  # parsing also runs without the GIL
53
+             status = lxb_html_document_parse(self.document, <lxb_char_t *> html, html_len)
54
+         if status != 0x0000:  # any non-zero status is treated as a parse failure
55
+             raise SelectolaxError("Can't parse HTML.")
56
+
57
+         assert self.document != NULL
58
+
59
+     def __dealloc__(self):  # Destroys the lexbor document when the parser object is deallocated, if one exists.
60
+         if self.document != NULL:
61
+             lxb_html_document_destroy(self.document)
62
+
63
+     def __repr__(self):  # Shows the length of the root node's HTML; reports 0 instead of raising when there is no root or no HTML.
64
+         return '<LexborHTMLParser chars=%s>' % len(getattr(self.root, 'html', None) or '')
65
+
66
+     @property
67
+     def root(self):
68
+         """Return the document's root node, or None when there is no document."""
69
+         if self.document == NULL:
70
+             return None
71
+         return LexborNode()._cinit(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)  # a new LexborNode wrapper is built on every access
72
+
73
+     @property
74
+     def body(self):
75
+         """Return the document's body node, or None when the document has no body element."""
76
+         cdef lxb_html_body_element_t* body
77
+         body = lxb_html_document_body_element_noi(self.document)  # NOTE(review): unlike `root`, self.document is not checked for NULL before this call.
78
+         if body == NULL:
79
+             return None
80
+         return LexborNode()._cinit(<lxb_dom_node_t *> body, self)
81
+
82
+     @property
83
+     def head(self):
84
+         """Return the document's head node, or None when the document has no head element."""
85
+         cdef lxb_html_head_element_t* head
86
+         head = lxb_html_document_head_element_noi(self.document)  # NOTE(review): unlike `root`, self.document is not checked for NULL before this call.
87
+         if head == NULL:
88
+             return None
89
+         return LexborNode()._cinit(<lxb_dom_node_t *> head, self)
90
+
91
+     def tags(self, str name):
92
+         """Return a list of nodes whose tag matches the specified name.
93
+
94
+         Parameters
95
+         ----------
96
+         name : str (e.g. div)
97
+
98
+         """
99
+         cdef lxb_dom_collection_t* collection = NULL
100
+         cdef lxb_status_t status
101
+         pybyte_name = name.encode('UTF-8')
102
+
103
+         result = list()
104
+         collection = lxb_dom_collection_make(&self.document.dom_document, 128)  # 128 is presumably the initial capacity — confirm against lexbor.
105
+
106
+         if collection == NULL:
107
+             return result  # NOTE(review): returns an empty list here, whereas strip_tags raises SelectolaxError for the same condition.
108
+         status = lxb_dom_elements_by_tag_name(
109
+                 <lxb_dom_element_t *> self.document,
110
+                 collection,
111
+                 <lxb_char_t *> pybyte_name,
112
+                 len(pybyte_name)
113
+             )
114
+         if status != 0x0000:
115
+             lxb_dom_collection_destroy(collection, <bint> True)  # release the collection before raising
116
+             raise SelectolaxError("Can't locate elements.")
117
+
118
+         for i in range(lxb_dom_collection_length_noi(collection)):
119
+             node = LexborNode()._cinit(
120
+                 <lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i),
121
+                 self
122
+             )
123
+             result.append(node)
124
+         lxb_dom_collection_destroy(collection, <bint> True)  # the collection is only needed while the result list is built
125
+         return result
126
+
127
+     def text(self, bool deep=True, str separator='', bool strip=False):
128
+         """Return the text of the document body, including text of all its child nodes.
129
+
130
+         Parameters
131
+         ----------
132
+         strip : bool, default False
133
+             If true, calls ``str.strip()`` on each text part to remove extra white spaces.
134
+         separator : str, default ''
135
+             The separator to use when joining text from different nodes.
136
+         deep : bool, default True
137
+             If True, includes text from all child nodes.
138
+
139
+         Returns
140
+         -------
141
+         text : str
142
+             An empty string when the document has no body.
143
+         """
144
+         if self.body is None:
145
+             return ""
146
+         return self.body.text(deep=deep, separator=separator, strip=strip)  # the work is delegated to the body node
147
+
148
+     @property
149
+     def html(self):
150
+         """Return the HTML representation of the page, or None when there is no document."""
151
+         if self.document == NULL:
152
+             return None
153
+         node = LexborNode()._cinit(<lxb_dom_node_t *> &self.document.dom_document, self)  # wraps the document itself, not the root element
154
+         return node.html
155
+
156
+     def css(self, str query):
157
+         """A CSS selector.
158
+
159
+         Matches pattern `query` against the HTML tree, starting at the root node.
160
+         `CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
161
+
162
+         Parameters
163
+         ----------
164
+         query : str
165
+             CSS selector (e.g. "div > :nth-child(2n+1):not(:has(a))").
166
+
167
+         Returns
168
+         -------
169
+         selector : list of `LexborNode` objects
170
+         """
171
+         return self.root.css(query)
172
+
173
+     def css_first(self, str query, default=None, strict=False):
174
+         """Same as `css` but returns only the first match.
175
+
176
+         Parameters
177
+         ----------
178
+
179
+         query : str
180
+         default : Any, default None
181
+             Default value to return if there is no match.
182
+         strict: bool, default False
183
+             Set to True if you want to check if there is strictly only one match in the document.
184
+
185
+
186
+         Returns
187
+         -------
188
+         selector : `LexborNode` object
189
+         """
190
+         return self.root.css_first(query, default, strict)
191
+
192
+     def strip_tags(self, list tags, bool recursive = False):
193
+         """Remove specified tags from the node.
194
+
195
+         Parameters
196
+         ----------
197
+         tags : list of str
198
+             List of tags to remove.
199
+         recursive : bool, default False
200
+             Whether to delete all its child nodes
201
+
202
+         Examples
203
+         --------
204
+
205
+         >>> tree = LexborHTMLParser('<html><head></head><body><script></script><div>Hello world!</div></body></html>')
206
+         >>> tags = ['head', 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes']
207
+         >>> tree.strip_tags(tags)
208
+         >>> tree.html
209
+         '<html><body><div>Hello world!</div></body></html>'
210
+
211
+         """
212
+         cdef lxb_dom_collection_t* collection = NULL
213
+         cdef lxb_status_t status
214
+
215
+         for tag in tags:
216
+             pybyte_name = tag.encode('UTF-8')
217
+
218
+             collection = lxb_dom_collection_make(&self.document.dom_document, 128)  # a fresh collection is made for every tag name
219
+
220
+             if collection == NULL:
221
+                 raise SelectolaxError("Can't initialize DOM collection.")
222
+
223
+             status = lxb_dom_elements_by_tag_name(
224
+                 <lxb_dom_element_t *> self.document,
225
+                 collection,
226
+                 <lxb_char_t *> pybyte_name,
227
+                 len(pybyte_name)
228
+             )
229
+             if status != 0x0000:
230
+                 lxb_dom_collection_destroy(collection, <bint> True)  # release the collection before raising
231
+                 raise SelectolaxError("Can't locate elements.")
232
+
233
+             for i in range(lxb_dom_collection_length_noi(collection)):
234
+                 if recursive:
235
+                     lxb_dom_node_destroy_deep( <lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))  # NOTE(review): if a match is nested inside another match, it may already be gone after the outer deep destroy — confirm against lexbor.
236
+                 else:
237
+                     lxb_dom_node_destroy(<lxb_dom_node_t *> lxb_dom_collection_element_noi(collection, i))
238
+             lxb_dom_collection_destroy(collection, <bint> True)  # done with this tag name's matches
239
+
240
+     def select(self, query=None):
241
+         """Select nodes given a CSS selector.
242
+
243
+         Works similarly to the ``css`` method, but supports chained filtering and extra features.
244
+
245
+         Parameters
246
+         ----------
247
+         query : str or None
248
+             The CSS selector to use when searching for nodes.
249
+
250
+         Returns
251
+         -------
252
+         selector : a `LexborSelector` built on the root node, or None when there is no root node.
253
+         """
254
+         cdef LexborNode node
255
+         node = self.root
256
+         if node:
257
+             return LexborSelector(node, query)
258
+
259
+     def any_css_matches(self, tuple selectors):
260
+         """Return True if any of the specified CSS selectors matches a node (checked from the root node)."""
261
+         return self.root.any_css_matches(selectors)
262
+
263
+     def scripts_contain(self, str query):
264
+         """Returns True if any of the script tags contain specified text.
265
+
266
+         Caches script tags on the first call to improve performance.
267
+
268
+         Parameters
269
+         ----------
270
+         query : str
271
+             The query to check.
272
+
273
+         """
274
+         return self.root.scripts_contain(query)  # the check itself is delegated to the root node
275
+
276
+
277
+     def script_srcs_contain(self, tuple queries):
278
+         """Returns True if any of the script SRC attributes contains one of the specified texts.
279
+
280
+         Caches values on the first call to improve performance.
281
+
282
+         Parameters
283
+         ----------
284
+         queries : tuple of str
285
+
286
+         """
287
+         return self.root.script_srcs_contain(queries)  # the check itself is delegated to the root node
288
+
289
+     def css_matches(self, str selector):  # Delegates to the root node's css_matches and returns its result.
290
+         return self.root.css_matches(selector)
291
+
292
+     @staticmethod
293
+     cdef LexborHTMLParser from_document(lxb_html_document_t *document, bytes raw_html):  # Wraps an existing lexbor document in a parser object; used by clone().
294
+         obj = <LexborHTMLParser> LexborHTMLParser.__new__(LexborHTMLParser)  # __new__ only: __init__ (and therefore parsing) is not run
295
+         obj.document = document
296
+         obj.raw_html = raw_html
297
+         obj.cached_script_texts = None
298
+         obj.cached_script_srcs = None
299
+         obj._selector = None
300
+         return obj
301
+
302
+     def clone(self):
303
+         """Clone the current tree and return it as a new LexborHTMLParser; raises SelectolaxError on failure."""
304
+         cdef lxb_html_document_t* cloned_document
305
+         cdef lxb_dom_node_t* cloned_node
306
+
307
+         with nogil:
308
+             cloned_document = lxb_html_document_create()
309
+
310
+         if cloned_document == NULL:
311
+             raise SelectolaxError("Can't create a new document")
312
+
313
+         cloned_document.ready_state = LXB_HTML_DOCUMENT_READY_STATE_COMPLETE
314
+
315
+         with nogil:
316
+             cloned_node = lxb_dom_document_import_node(
317
+                 &cloned_document.dom_document,
318
+                 <lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document),
319
+                 <bint> True
320
+             )
321
+
322
+         if cloned_node == NULL:
323
+             lxb_html_document_destroy(cloned_document)  # nothing owns the new document yet, so free it before raising
324
+             raise SelectolaxError("Can't create a new document")
325
+         with nogil:
326
+             lxb_dom_node_insert_child(<lxb_dom_node_t * > cloned_document, cloned_node)
327
+
328
+         cls = LexborHTMLParser.from_document(cloned_document, self.raw_html)  # the clone takes over cloned_document and shares raw_html
329
+         return cls
330
+     def unwrap_tags(self, list tags):
331
+         """Unwraps specified tags from the HTML tree.
332
+
333
+         Works the same as the ``unwrap`` method, but applied to a list of tags.
334
+
335
+         Parameters
336
+         ----------
337
+         tags : list
338
+             List of tag names to unwrap.
339
+
340
+         Examples
341
+         --------
342
+
343
+         >>> tree = LexborHTMLParser('<div><a href="">Hello</a> <i>world</i>!</div>')
344
+         >>> tree.body.unwrap_tags(['i','a'])
345
+         >>> tree.body.html
346
+         '<body><div>Hello world!</div></body>'
347
+         """
348
+         if self.root is not None:
349
+             self.root.unwrap_tags(tags)