selectolax 0.3.23__cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of selectolax might be problematic. Click here for more details.

selectolax/lexbor.pyx ADDED
@@ -0,0 +1,346 @@
1
+ from cpython cimport bool
2
+
3
+ _ENCODING = 'UTF-8'
4
+
5
+ include "base.pxi"
6
+ include "utils.pxi"
7
+ include "lexbor/attrs.pxi"
8
+ include "lexbor/node.pxi"
9
+ include "lexbor/selection.pxi"
10
+
11
+ # We don't inherit from HTMLParser here, because it also includes all the C code from Modest.
12
+
13
cdef class LexborHTMLParser:
    """The lexbor HTML parser.

    Use this class to parse raw HTML.

    This parser mimics most of the ``HTMLParser`` API but does not inherit from it directly.

    Parameters
    ----------

    html : str (unicode) or bytes
    """
    def __init__(self, html):
        cdef size_t html_len

        bytes_html, html_len = preprocess_input(html)
        self._parse_html(bytes_html, html_len)
        # Keep a reference to the input bytes for as long as the parser lives.
        self.raw_html = bytes_html
        self._selector = None

    @property
    def selector(self):
        """Return the CSS selector engine, creating it on first access."""
        if self._selector is None:
            self._selector = LexborCSSSelector()
        return self._selector

    cdef _parse_html(self, char *html, size_t html_len):
        """Create a lexbor document and parse ``html_len`` bytes of ``html`` into it.

        Raises
        ------
        SelectolaxError
            If the document can't be created or the parser reports a non-zero status.
        """
        cdef lxb_status_t status

        with nogil:
            self.document = lxb_html_document_create()

        if self.document == NULL:
            raise SelectolaxError("Failed to initialize object for HTML Document.")

        with nogil:
            status = lxb_html_document_parse(self.document, <lxb_char_t *> html, html_len)
        if status != 0x0000:
            raise SelectolaxError("Can't parse HTML.")

        assert self.document != NULL

    def __dealloc__(self):
        if self.document != NULL:
            lxb_html_document_destroy(self.document)

    def __repr__(self):
        return '<LexborHTMLParser chars=%s>' % len(self.root.html)

    @property
    def root(self):
        """Returns root node."""
        if self.document == NULL:
            return None
        return LexborNode()._cinit(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)

    @property
    def body(self):
        """Returns document body."""
        cdef lxb_html_body_element_t* body
        # Same guard as ``root``/``html``: never dereference a missing document.
        if self.document == NULL:
            return None
        body = lxb_html_document_body_element_noi(self.document)
        if body == NULL:
            return None
        return LexborNode()._cinit(<lxb_dom_node_t *> body, self)

    @property
    def head(self):
        """Returns document head."""
        cdef lxb_html_head_element_t* head
        # Same guard as ``root``/``html``: never dereference a missing document.
        if self.document == NULL:
            return None
        head = lxb_html_document_head_element_noi(self.document)
        if head == NULL:
            return None
        return LexborNode()._cinit(<lxb_dom_node_t *> head, self)

    def tags(self, str name):
        """Returns a list of tags that match specified name.

        Parameters
        ----------
        name : str (e.g. div)

        Returns
        -------
        nodes : list of `LexborNode` objects
            Empty if the DOM collection can't be allocated.

        Raises
        ------
        SelectolaxError
            If the element lookup reports a non-zero status.
        """
        cdef lxb_dom_collection_t* collection = NULL
        cdef lxb_status_t status
        cdef size_t i
        pybyte_name = name.encode('UTF-8')

        result = list()
        collection = lxb_dom_collection_make(&self.document.dom_document, 128)

        if collection == NULL:
            return result

        # try/finally so the collection is released even when the lookup fails.
        try:
            status = lxb_dom_elements_by_tag_name(
                <lxb_dom_element_t *> self.document,
                collection,
                <lxb_char_t *> pybyte_name,
                len(pybyte_name)
            )
            if status != 0x0000:
                raise SelectolaxError("Can't locate elements.")

            for i in range(lxb_dom_collection_length_noi(collection)):
                node = LexborNode()._cinit(
                    <lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i),
                    self
                )
                result.append(node)
        finally:
            lxb_dom_collection_destroy(collection, <bint> True)
        return result

    def text(self, bool deep=True, str separator='', bool strip=False):
        """Returns the text of the node including text of all its child nodes.

        Parameters
        ----------
        strip : bool, default False
            If true, calls ``str.strip()`` on each text part to remove extra white spaces.
        separator : str, default ''
            The separator to use when joining text from different nodes.
        deep : bool, default True
            If True, includes text from all child nodes.

        Returns
        -------
        text : str
            Empty string when the document has no body.

        """
        if self.body is None:
            return ""
        return self.body.text(deep=deep, separator=separator, strip=strip)

    @property
    def html(self):
        """Return HTML representation of the page."""
        if self.document == NULL:
            return None
        node = LexborNode()._cinit(<lxb_dom_node_t *> &self.document.dom_document, self)
        return node.html

    def css(self, str query):
        """A CSS selector.

        Matches pattern `query` against HTML tree.
        `CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.

        Parameters
        ----------
        query : str
            CSS selector (e.g. "div > :nth-child(2n+1):not(:has(a))").

        Returns
        -------
        selector : list of `LexborNode` objects
        """
        return self.root.css(query)

    def css_first(self, str query, default=None, strict=False):
        """Same as `css` but returns only the first match.

        Parameters
        ----------

        query : str
        default : object, default None
            Default value to return if there is no match.
        strict : bool, default False
            Set to True if you want to check if there is strictly only one match in the document.


        Returns
        -------
        selector : `LexborNode` object
        """
        return self.root.css_first(query, default, strict)

    def strip_tags(self, list tags, bool recursive = False):
        """Remove specified tags from the tree.

        Parameters
        ----------
        tags : list of str
            List of tags to remove.
        recursive : bool, default False
            Whether to also destroy all child nodes of each removed tag.

        Raises
        ------
        SelectolaxError
            If a DOM collection can't be allocated or the element lookup fails.

        Examples
        --------

        >>> tree = LexborHTMLParser('<html><head></head><body><script></script><div>Hello world!</div></body></html>')
        >>> tags = ['head', 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes']
        >>> tree.strip_tags(tags)
        >>> tree.html
        '<html><body><div>Hello world!</div></body></html>'

        """
        cdef lxb_dom_collection_t* collection = NULL
        cdef lxb_status_t status
        cdef size_t i

        for tag in tags:
            pybyte_name = tag.encode('UTF-8')

            collection = lxb_dom_collection_make(&self.document.dom_document, 128)

            if collection == NULL:
                raise SelectolaxError("Can't initialize DOM collection.")

            # try/finally so the collection is released even when the lookup fails.
            try:
                status = lxb_dom_elements_by_tag_name(
                    <lxb_dom_element_t *> self.document,
                    collection,
                    <lxb_char_t *> pybyte_name,
                    len(pybyte_name)
                )
                if status != 0x0000:
                    raise SelectolaxError("Can't locate elements.")

                for i in range(lxb_dom_collection_length_noi(collection)):
                    # ``recursive`` selects the deep variant; the original had the
                    # two branches swapped.
                    if recursive:
                        lxb_dom_node_destroy_deep(<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
                    else:
                        lxb_dom_node_destroy(<lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i))
            finally:
                lxb_dom_collection_destroy(collection, <bint> True)

    def select(self, query=None):
        """Select nodes given a CSS selector.

        Works similarly to the ``css`` method, but supports chained filtering and extra features.

        Parameters
        ----------
        query : str or None
            The CSS selector to use when searching for nodes.

        Returns
        -------
        selector : `LexborSelector` or None
            None when the root node is falsy.
        """
        cdef LexborNode node
        node = self.root
        if node:
            return LexborSelector(node, query)
        return None

    def any_css_matches(self, tuple selectors):
        """Returns True if any of the specified CSS selectors matches a node."""
        return self.root.any_css_matches(selectors)

    def scripts_contain(self, str query):
        """Returns True if any of the script tags contain specified text.

        Caches script tags on the first call to improve performance.

        Parameters
        ----------
        query : str
            The query to check.

        """
        return self.root.scripts_contain(query)

    def script_srcs_contain(self, tuple queries):
        """Returns True if any of the script SRC attributes contain one of the specified texts.

        Caches values on the first call to improve performance.

        Parameters
        ----------
        queries : tuple of str

        """
        return self.root.script_srcs_contain(queries)

    def css_matches(self, str selector):
        """Delegate to the root node's ``css_matches`` with the given selector."""
        return self.root.css_matches(selector)

    @staticmethod
    cdef LexborHTMLParser from_document(lxb_html_document_t *document, bytes raw_html):
        """Wrap an existing lexbor document in a new parser object without re-parsing."""
        # ``__new__`` bypasses ``__init__``, so every attribute is set explicitly here.
        obj = <LexborHTMLParser> LexborHTMLParser.__new__(LexborHTMLParser)
        obj.document = document
        obj.raw_html = raw_html
        obj.cached_script_texts = None
        obj.cached_script_srcs = None
        obj._selector = None
        return obj

    def clone(self):
        """Clone the current tree.

        Returns
        -------
        parser : `LexborHTMLParser`
            A new parser wrapping a deep copy of the current document's root.

        Raises
        ------
        SelectolaxError
            If the new document can't be created or the root can't be imported.
        """
        cdef lxb_html_document_t* cloned_document
        cdef lxb_dom_node_t* cloned_node

        with nogil:
            cloned_document = lxb_html_document_create()

        if cloned_document == NULL:
            raise SelectolaxError("Can't create a new document")

        cloned_document.ready_state = LXB_HTML_DOCUMENT_READY_STATE_COMPLETE

        with nogil:
            cloned_node = lxb_dom_document_import_node(
                &cloned_document.dom_document,
                <lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document),
                <bint> True
            )

        if cloned_node == NULL:
            # Nothing owns the half-built document yet, so release it before raising.
            lxb_html_document_destroy(cloned_document)
            raise SelectolaxError("Can't create a new document")

        with nogil:
            lxb_dom_node_insert_child(<lxb_dom_node_t * > cloned_document, cloned_node)

        cls = LexborHTMLParser.from_document(cloned_document, self.raw_html)
        return cls

    def unwrap_tags(self, list tags):
        """Unwraps specified tags from the HTML tree.

        Works the same as the ``unwrap`` method, but applied to a list of tags.

        Parameters
        ----------
        tags : list
            List of tags to remove.

        Examples
        --------

        >>> tree = LexborHTMLParser('<div><a href="">Hello</a> <i>world</i>!</div>')
        >>> tree.unwrap_tags(['i','a'])
        >>> tree.body.html
        '<body><div>Hello world!</div></body>'
        """
        if self.root is not None:
            self.root.unwrap_tags(tags)