selectolax 0.4.4__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
selectolax/lexbor.pyx ADDED
@@ -0,0 +1,677 @@
1
+ from cpython.bool cimport bool
2
+
3
+ _ENCODING = 'UTF-8'
4
+
5
+ include "base.pxi"
6
+ include "utils.pxi"
7
+ include "lexbor/attrs.pxi"
8
+ include "lexbor/node.pxi"
9
+ include "lexbor/selection.pxi"
10
+ include "lexbor/util.pxi"
11
+ include "lexbor/node_remove.pxi"
12
+
13
+ # We don't inherit from HTMLParser here, because it also includes all the C code from Modest.
14
+
15
+ cdef class LexborHTMLParser:
16
+ """The lexbor HTML parser.
17
+
18
+ Use this class to parse raw HTML.
19
+
20
+ This parser mimics most of the ``HTMLParser`` API but does not inherit from it directly.
21
+
22
+ Parameters
23
+ ----------
24
+
25
+ html : str (unicode) or bytes
26
+ """
27
def __init__(self, html: str | bytes, is_fragment: bool = False):
    """Build a parser and parse ``html`` immediately.

    Parameters
    ----------
    html : str or bytes
        HTML content to parse.
    is_fragment : bool, optional
        When ``False`` (default), the input is parsed as a full HTML document:
        missing required elements (such as `<html>`, `<head>`, and `<body>`)
        are inserted, as browsers do when loading a page.
        When ``True``, the input is parsed as an HTML fragment and no missing
        elements are inserted.
    """
    cdef object encoded
    cdef size_t encoded_len

    # Reset the Python-level state before any native object is created.
    self._selector = None
    self._is_fragment = is_fragment

    self._new_html_document()

    # ``preprocess_input`` is defined outside this view; it is used here as
    # returning the input as a bytes-like object together with its length.
    encoded, encoded_len = preprocess_input(html)
    self._parse_html(encoded, encoded_len)

    # Keep the processed input alongside the parsed tree.
    self.raw_html = encoded
51
+
52
cdef inline void _new_html_document(self):
    """Allocate a new, empty Lexbor HTML document into ``self.document``.

    Returns
    -------
    None

    Raises
    ------
    SelectolaxError
        Set as the pending Python error when ``lxb_html_document_create``
        returns ``NULL``.
    """
    # The allocation touches no Python objects, so the GIL is released.
    with nogil:
        self.document = lxb_html_document_create()

    if self.document != NULL:
        return

    PyErr_SetObject(SelectolaxError, "Failed to initialize object for HTML Document.")
69
+
70
cdef int _parse_html(self, char *html, size_t html_len) except -1:
    """Parse HTML content into the internal document.

    Dispatches to ``_parse_html_fragment`` or ``_parse_html_document``
    depending on ``self._is_fragment``.

    Parameters
    ----------
    html : char *
        Pointer to the HTML bytes to parse.
    html_len : size_t
        Length of the HTML buffer.

    Returns
    -------
    int
        ``0`` on success; ``-1`` (with a Python exception set) on failure.

    Raises
    ------
    SelectolaxError
        If the document has not been created, or if Lexbor returns a
        non-OK status.
    RuntimeError
        If the internal document is ``NULL`` after a successful parse.
    """
    cdef lxb_status_t status

    if self.document == NULL:
        # ``except -1`` requires a pending exception whenever -1 is returned;
        # returning it bare would surface as an unrelated SystemError.
        PyErr_SetObject(SelectolaxError, "Can't parse HTML: document is not initialized.")
        return -1

    # Parsing touches no Python objects, so the GIL is released.
    with nogil:
        if self._is_fragment:
            status = self._parse_html_fragment(html, html_len)
        else:
            status = self._parse_html_document(html, html_len)

    if status != LXB_STATUS_OK:
        PyErr_SetObject(SelectolaxError, "Can't parse HTML.")
        return -1

    # The fragment path reassigns ``self.document``, so check it again.
    if self.document == NULL:
        PyErr_SetObject(RuntimeError, "document is NULL even after html was parsed correctly")
        return -1
    return 0
111
+
112
cdef inline lxb_status_t _parse_html_document(self, char *html, size_t html_len) nogil:
    """Parse ``html`` into ``self.document`` as a full HTML document.

    Per the public constructor documentation, missing required elements
    (such as `<html>`, `<head>`, and `<body>`) are inserted in this mode.

    Parameters
    ----------
    html : char *
        Pointer to the HTML bytes to parse.
    html_len : size_t
        Length of the HTML buffer.

    Returns
    -------
    lxb_status_t
        The status code returned by ``lxb_html_document_parse``.
    """
    cdef lxb_status_t outcome

    outcome = lxb_html_document_parse(self.document, <lxb_char_t *> html, html_len)
    return outcome
132
+
133
cdef inline lxb_status_t _parse_html_fragment(self, char *html, size_t html_len) nogil:
    """Parse ``html`` as an HTML fragment.

    An element with an empty name is created and handed to
    ``lxb_html_document_parse_fragment``. The pointer returned by that call
    then replaces ``self.document``.

    Parameters
    ----------
    html : char *
        Pointer to the HTML bytes to parse.
    html_len : size_t
        Length of the HTML buffer.

    Returns
    -------
    lxb_status_t
        ``LXB_STATUS_OK`` on success; ``LXB_STATUS_ERROR`` if the placeholder
        element cannot be created or the fragment parse returns ``NULL``.
    """
    cdef const lxb_char_t *placeholder_name = <const lxb_char_t *> ""
    cdef size_t placeholder_len = 0
    cdef lxb_html_element_t *placeholder = NULL
    cdef lxb_dom_node_t *parsed_fragment = NULL

    placeholder = lxb_html_document_create_element(
        self.document, placeholder_name, placeholder_len, NULL
    )
    if placeholder == NULL:
        return LXB_STATUS_ERROR

    parsed_fragment = lxb_html_document_parse_fragment(
        self.document, <lxb_dom_element_t *> placeholder, <lxb_char_t *> html, html_len
    )
    if parsed_fragment == NULL:
        return LXB_STATUS_ERROR

    # Use the fragment document returned by lexbor as the parser document.
    # NOTE(review): the previous ``self.document`` pointer is overwritten here
    # without being destroyed - confirm Lexbor's ownership rules for it.
    self.document = <lxb_html_document_t *> parsed_fragment
    return LXB_STATUS_OK
173
+
174
def __dealloc__(self):
    """Release the underlying Lexbor HTML document.

    Returns
    -------
    None

    Notes
    -----
    Does nothing when the document pointer is ``NULL``. The pointer is
    cleared after destruction so a repeated call cannot free it twice.
    """
    if self.document != NULL:
        lxb_html_document_destroy(self.document)
        # Clear the pointer so the guard above holds on any later call.
        self.document = NULL
188
+
189
def __repr__(self):
    """Return a short description of the parser.

    Returns
    -------
    str
        ``<LexborHTMLParser chars='N'>`` where ``N`` is the length of the
        root node's serialized HTML.
    """
    serialized = self.root.html
    return f"<LexborHTMLParser chars='{len(serialized)}'>"
198
+
199
@property
def selector(self):
    """Return the CSS selector helper, creating it on first access.

    Returns
    -------
    LexborCSSSelector
        The instance stored on this parser; the same object is returned on
        every later access.
    """
    if self._selector is not None:
        return self._selector

    self._selector = LexborCSSSelector()
    return self._selector
211
+
212
@property
def root(self):
    """Return the document root node.

    Returns
    -------
    LexborNode or None
        Node wrapping the result of ``lxb_dom_document_root``, or ``None``
        when the parser holds no document.
    """
    cdef lxb_dom_node_t *root_node

    if self.document == NULL:
        return None

    root_node = <lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document)
    return LexborNode.new(root_node, self)
224
+
225
@property
def body(self):
    """Return document body.

    Returns
    -------
    LexborNode or None
        ``<body>`` element when present; ``None`` when it is missing or when
        the parser holds no document.
    """
    cdef lxb_html_body_element_t* body

    # Same guard as ``root`` and ``html``: never hand a NULL document to Lexbor.
    if self.document == NULL:
        return None

    body = lxb_html_document_body_element_noi(self.document)
    if body == NULL:
        return None
    return LexborNode.new(<lxb_dom_node_t *> body, self)
239
+
240
@property
def head(self):
    """Return document head.

    Returns
    -------
    LexborNode or None
        ``<head>`` element when present; ``None`` when it is missing or when
        the parser holds no document.
    """
    cdef lxb_html_head_element_t* head

    # Same guard as ``root`` and ``html``: never hand a NULL document to Lexbor.
    if self.document == NULL:
        return None

    head = lxb_html_document_head_element_noi(self.document)
    if head == NULL:
        return None
    return LexborNode.new(<lxb_dom_node_t *> head, self)
254
+
255
def tags(self, str name):
    """Return all tags that match the provided name.

    Parameters
    ----------
    name : str
        Tag name to search for (e.g., ``"div"``).

    Returns
    -------
    list of LexborNode
        Matching elements, in the order Lexbor stored them in the
        collection. Empty when the parser holds no document or the
        collection cannot be allocated.

    Raises
    ------
    ValueError
        If ``name`` is empty or longer than 100 characters.
    SelectolaxError
        If Lexbor cannot locate the elements.
    """
    cdef lxb_dom_collection_t* collection = NULL
    cdef lxb_status_t status
    cdef size_t i

    if not name:
        raise ValueError("Tag name cannot be empty")
    if len(name) > 100:
        raise ValueError("Tag name is too long")

    result = list()
    if self.document == NULL:
        return result

    pybyte_name = name.encode('UTF-8')
    collection = lxb_dom_collection_make(&self.document.dom_document, 128)
    if collection == NULL:
        return result

    # ``finally`` frees the collection on every path, including when wrapping
    # a node raises part-way through the loop.
    try:
        status = lxb_dom_elements_by_tag_name(
            <lxb_dom_element_t *> self.document,
            collection,
            <lxb_char_t *> pybyte_name,
            len(pybyte_name)
        )
        if status != LXB_STATUS_OK:
            raise SelectolaxError("Can't locate elements.")

        for i in range(lxb_dom_collection_length_noi(collection)):
            node = LexborNode.new(
                <lxb_dom_node_t*> lxb_dom_collection_element_noi(collection, i),
                self
            )
            result.append(node)
    finally:
        lxb_dom_collection_destroy(collection, <bint> True)
    return result
308
+
309
def text(
    self,
    deep: bool = True,
    separator: str = "",
    strip: bool = False,
    skip_empty: bool = False,
) -> str:
    """Return the text of the document by delegating to the root node.

    Parameters
    ----------
    deep : bool, default True
        If True, includes text from all child nodes.
    separator : str, default ''
        The separator to use when joining text from different nodes.
    strip : bool, default False
        If true, calls ``str.strip()`` on each text part to remove extra white spaces.
    skip_empty : bool, optional
        Exclude text nodes that ``lxb_dom_node_is_empty`` considers empty when
        ``True``. Defaults to ``False``.

    Returns
    -------
    text : str
        The root node's ``text(...)`` result, or ``""`` when there is no root.
    """
    root_node = self.root
    if root_node is None:
        return ""
    return root_node.text(deep=deep, separator=separator, strip=strip, skip_empty=skip_empty)
338
+
339
@property
def html(self):
    """Return HTML representation of the page.

    Returns
    -------
    str or None
        ``None`` when the parser holds no document. For a fragment parser,
        the root node's ``html``; otherwise the ``html`` of a node wrapping
        the document itself.
    """
    if self.document == NULL:
        return None

    if not self._is_fragment:
        # Serialize starting from the document node rather than the root element.
        document_node = LexborNode.new(<lxb_dom_node_t *> &self.document.dom_document, self)
        return document_node.html

    return self.root.html
354
+
355
def css(self, str query):
    """Evaluate a CSS selector against the whole tree.

    Delegates to the root node's ``css`` method.
    `CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.

    Special selectors:

     - parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
     - parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains

    Parameters
    ----------
    query : str
        CSS selector (e.g. "div > :nth-child(2n+1):not(:has(a))").

    Returns
    -------
    selector : list of `Node` objects
    """
    root_node = self.root
    return root_node.css(query)
376
+
377
def css_first(self, str query, default=None, strict=False):
    """Same as `css` but returns only the first match.

    Delegates to the root node's ``css_first`` method.

    Parameters
    ----------
    query : str
        CSS selector to evaluate.
    default : Any, default None
        Default value to return if there is no match.
    strict : bool, default False
        Set to True if you want to check if there is strictly only one match in the document.

    Returns
    -------
    selector : `LexborNode` object
    """
    root_node = self.root
    return root_node.css_first(query, default, strict)
395
+
396
def strip_tags(self, list tags, bool recursive = False):
    """Remove every element whose tag name appears in ``tags``.

    Parameters
    ----------
    tags : list of str
        List of tags to remove.
    recursive : bool, default False
        When true, matched nodes are destroyed with
        ``lxb_dom_node_destroy_deep``; otherwise with ``lxb_dom_node_destroy``.

    Examples
    --------

    >>> tree = LexborHTMLParser('<html><head></head><body><script></script><div>Hello world!</div></body></html>')
    >>> tags = ['head', 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes']
    >>> tree.strip_tags(tags)
    >>> tree.html
    '<html><body><div>Hello world!</div></body></html>'

    Returns
    -------
    None

    Raises
    ------
    SelectolaxError
        If a DOM collection cannot be created or the element lookup fails.
    """
    cdef lxb_dom_collection_t* found = NULL
    cdef lxb_status_t lookup_status
    cdef lxb_dom_node_t* victim = NULL

    for tag in tags:
        pybyte_name = tag.encode('UTF-8')

        # A fresh collection is made for each tag name.
        found = lxb_dom_collection_make(&self.document.dom_document, 128)
        if found == NULL:
            raise SelectolaxError("Can't initialize DOM collection.")

        try:
            lookup_status = lxb_dom_elements_by_tag_name(
                <lxb_dom_element_t *> self.document,
                found,
                <lxb_char_t *> pybyte_name,
                len(pybyte_name)
            )
            # 0x0000 is compared literally here; presumably LXB_STATUS_OK.
            if lookup_status != 0x0000:
                raise SelectolaxError("Can't locate elements.")

            for i in range(lxb_dom_collection_length_noi(found)):
                victim = <lxb_dom_node_t *> lxb_dom_collection_element_noi(found, i)
                if recursive:
                    lxb_dom_node_destroy_deep(victim)
                else:
                    lxb_dom_node_destroy(victim)
        finally:
            # The collection is destroyed on both the success and error paths.
            lxb_dom_collection_destroy(found, <bint> True)
446
+
447
def select(self, query=None):
    """Select nodes given a CSS selector.

    Works similarly to the ``css`` method, but supports chained filtering and extra features.

    Parameters
    ----------
    query : str or None
        The CSS selector to use when searching for nodes.

    Returns
    -------
    LexborSelector or None
        Selector built from the root node and ``query``, or ``None`` when the
        root node is falsy.
    """
    cdef LexborNode node = self.root

    if not node:
        return None
    return LexborSelector(node, query)
467
+
468
def any_css_matches(self, tuple selectors):
    """Return ``True`` if any of the specified CSS selectors match.

    Delegates to the root node's ``any_css_matches`` method.

    Parameters
    ----------
    selectors : tuple[str]
        CSS selectors to evaluate.

    Returns
    -------
    bool
        ``True`` when at least one selector matches.
    """
    root_node = self.root
    return root_node.any_css_matches(selectors)
482
+
483
def scripts_contain(self, str query):
    """Return ``True`` if any script tag contains the given text.

    Delegates to the root node's ``scripts_contain`` method; the caching of
    script tags mentioned in the API docs is implemented there, outside
    this method.

    Parameters
    ----------
    query : str
        Text to search for within script contents.

    Returns
    -------
    bool
        ``True`` when a matching script tag is found.
    """
    root_node = self.root
    return root_node.scripts_contain(query)
499
+
500
def script_srcs_contain(self, tuple queries):
    """Return ``True`` if any script ``src`` contains one of the strings.

    Delegates to the root node's ``script_srcs_contain`` method; the caching
    of values mentioned in the API docs is implemented there, outside this
    method.

    Parameters
    ----------
    queries : tuple of str
        Strings to look for inside ``src`` attributes.

    Returns
    -------
    bool
        ``True`` when a matching source value is found.
    """
    root_node = self.root
    return root_node.script_srcs_contain(queries)
516
+
517
def css_matches(self, str selector):
    """Return ``True`` if the document matches the selector at least once.

    Delegates to the root node's ``css_matches`` method.

    Parameters
    ----------
    selector : str
        CSS selector to test.

    Returns
    -------
    bool
        ``True`` when a match exists.
    """
    root_node = self.root
    return root_node.css_matches(selector)
531
+
532
def merge_text_nodes(self):
    """Merge adjacent text nodes across the whole tree.

    Delegates to the root node's ``merge_text_nodes`` method.
    This is useful for text extraction.
    Use it when you need to strip HTML tags and merge "dangling" text.

    Examples
    --------

    >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
    >>> node = tree.css_first('div')
    >>> tree.unwrap_tags(["strong"])
    >>> tree.text(deep=True, separator=" ", strip=True)
    "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
    >>> node.merge_text_nodes()
    >>> tree.text(deep=True, separator=" ", strip=True)
    "John Doe"

    Returns
    -------
    None
    """
    root_node = self.root
    return root_node.merge_text_nodes()
555
+
556
@staticmethod
cdef LexborHTMLParser from_document(lxb_html_document_t *document, bytes raw_html):
    """Wrap an existing Lexbor document in a new parser without running ``__init__``.

    Parameters
    ----------
    document : lxb_html_document_t *
        Pointer to an initialized Lexbor HTML document. It is stored on the
        new instance, whose ``__dealloc__`` destroys it, so the caller must
        not free it separately.
    raw_html : bytes
        Original HTML bytes backing the document.

    Returns
    -------
    LexborHTMLParser
        Parser instance wrapping the provided document, with empty script
        caches and no selector helper.
    """
    cdef LexborHTMLParser parser

    # ``__new__`` skips ``__init__``, so no second document is created or parsed.
    parser = <LexborHTMLParser> LexborHTMLParser.__new__(LexborHTMLParser)
    parser._selector = None
    parser.cached_script_texts = None
    parser.cached_script_srcs = None
    parser.raw_html = raw_html
    parser.document = document
    return parser
579
+
580
def clone(self):
    """Clone the current document tree.

    Use the clone to make temporary modifications without affecting the
    original HTML tree. The cloned document belongs to the returned parser
    and is destroyed when that parser is deallocated.

    Returns
    -------
    LexborHTMLParser
        A parser instance backed by a deep-copied document.

    Raises
    ------
    SelectolaxError
        If the new document cannot be created or the root node cannot be
        imported into it.
    """
    cdef lxb_html_document_t* cloned_document
    cdef lxb_dom_node_t* cloned_node
    cdef LexborHTMLParser cls

    with nogil:
        cloned_document = lxb_html_document_create()

    if cloned_document == NULL:
        raise SelectolaxError("Can't create a new document")

    cloned_document.ready_state = LXB_HTML_DOCUMENT_READY_STATE_COMPLETE

    # Deep-import (third argument true) the current root into the new document.
    with nogil:
        cloned_node = lxb_dom_document_import_node(
            &cloned_document.dom_document,
            <lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document),
            <bint> True
        )

    if cloned_node == NULL:
        # No parser owns the new document yet, so free it here to avoid a leak.
        lxb_html_document_destroy(cloned_document)
        raise SelectolaxError("Can't create a new document")

    with nogil:
        lxb_dom_node_insert_child(<lxb_dom_node_t * > cloned_document, cloned_node)

    cls = LexborHTMLParser.from_document(cloned_document, self.raw_html)
    return cls
619
+
620
def unwrap_tags(self, list tags, delete_empty = False):
    """Unwrap the specified tags throughout the HTML tree.

    Delegates to the root node's ``unwrap_tags`` method, which works like
    ``unwrap`` applied to a list of tags. Does nothing when the parser holds
    no document.

    Parameters
    ----------
    tags : list
        List of tags to remove.
    delete_empty : bool
        Whether to delete empty tags.

    Examples
    --------

    >>> tree = LexborHTMLParser('<div><a href="">Hello</a> <i>world</i>!</div>')
    >>> tree.unwrap_tags(['i','a'])
    >>> tree.body.html
    '<body><div>Hello world!</div></body>'

    Returns
    -------
    None
    """
    # Checking the document pointer is cheaper than building a root node
    # just to test for its existence.
    if self.document == NULL:
        return
    self.root.unwrap_tags(tags, delete_empty=delete_empty)
647
+
648
@property
def inner_html(self) -> str:
    """Return HTML representation of the root node's children.

    Delegates to the root node's ``inner_html`` property.
    Works similar to innerHTML in JavaScript.
    Unlike the `.html` property, does not include the current node.
    Can be used to set HTML as well. See the setter docstring.

    Returns
    -------
    text : str | None
    """
    root_node = self.root
    return root_node.inner_html

@inner_html.setter
def inner_html(self, str html):
    """Set the root node's inner HTML to the specified HTML.

    Delegates to the root node's ``inner_html`` setter, which replaces the
    existing data inside the node.
    Works similar to innerHTML in JavaScript.

    Parameters
    ----------
    html : str
        Markup assigned to the root node's ``inner_html``.

    Returns
    -------
    None
    """
    root_node = self.root
    root_node.inner_html = html