selectolax 0.4.4__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
selectolax/lexbor.pyi ADDED
@@ -0,0 +1,1248 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Iterator, Literal, NoReturn, Optional, TypeVar, overload
4
+
5
+ DefaultT = TypeVar("DefaultT")
6
+
7
+ class LexborAttributes:
8
+ """A dict-like object that represents attributes."""
9
+
10
+ @staticmethod
11
+ def create(node: LexborAttributes) -> LexborAttributes: ...
12
+ def keys(self) -> Iterator[str]: ...
13
+ def items(self) -> Iterator[tuple[str, str | None]]: ...
14
+ def values(self) -> Iterator[str | None]: ...
15
+ def __iter__(self) -> Iterator[str]: ...
16
+ def __len__(self) -> int: ...
17
+ def __getitem__(self, key: str) -> str | None: ...
18
+ def __setitem__(self, key: str, value: Optional[str]) -> None: ...
19
+ def __delitem__(self, key: str) -> None: ...
20
+ def __contains__(self, key: str) -> bool: ...
21
+ def __repr__(self) -> str: ...
22
+ @overload
23
+ def get(self, key: str, default: DefaultT) -> DefaultT | str | None: ...
24
+ @overload
25
+ def get(self, key: str, default: None = ...) -> str | None: ...
26
+ @overload
27
+ def sget(self, key: str, default: str | DefaultT) -> str | DefaultT: ...
28
+ @overload
29
+ def sget(self, key: str, default: str = "") -> str: ...
30
+
31
+ class LexborSelector:
32
+ """An advanced CSS selector that supports additional operations.
33
+
34
+ Think of it as a toolkit that mimics some of the features of XPath.
35
+
36
+ Please note, this is an experimental feature that can change in the future.
37
+ """
38
+
39
+ def __init__(self, node: LexborNode, query: str): ...
40
+ def css(self, query: str) -> NoReturn: ...
41
+ @property
42
+ def matches(self) -> list[LexborNode]:
43
+ """Returns all possible matches"""
44
+ ...
45
+
46
+ @property
47
+ def any_matches(self) -> bool:
48
+ """Returns True if there are any matches"""
49
+ ...
50
+
51
+ def text_contains(
52
+ self, text: str, deep: bool = True, separator: str = "", strip: bool = False
53
+ ) -> LexborSelector:
54
+ """Filter all current matches given text."""
55
+ ...
56
+
57
+ def any_text_contains(
58
+ self, text: str, deep: bool = True, separator: str = "", strip: bool = False
59
+ ) -> bool:
60
+ """Returns True if any node in the current search scope contains specified text"""
61
+ ...
62
+
63
+ def attribute_longer_than(
64
+ self, attribute: str, length: int, start: str | None = None
65
+ ) -> LexborSelector:
66
+ """Filter all current matches by attribute length.
67
+
68
+ Similar to string-length in XPath.
69
+ """
70
+ ...
71
+
72
+ def any_attribute_longer_than(
73
+ self, attribute: str, length: int, start: str | None = None
74
+ ) -> bool:
75
+ Returns True if any `attribute` value is longer than a specified length.
76
+
77
+ Similar to string-length in XPath.
78
+ """
79
+ ...
80
+
81
+ @property
82
+ def inner_html(self) -> str | None:
83
+ """Return HTML representation of the child nodes.
84
+
85
+ Works similar to innerHTML in JavaScript.
86
+ Unlike the `.html` property, does not include the current node.
87
+ Can be used to set HTML as well. See the setter docstring.
88
+
89
+ Returns
90
+ -------
91
+ text : str or None
92
+ """
93
+ ...
94
+
95
+ @inner_html.setter
96
+ def inner_html(self, html: str):
97
+ """Set inner HTML to the specified HTML.
98
+
99
+ Replaces existing data inside the node.
100
+ Works similar to innerHTML in JavaScript.
101
+
102
+ Parameters
103
+ ----------
104
+ html : str
105
+
106
+ """
107
+ ...
108
+
109
+ class LexborCSSSelector:
110
+ def __init__(self): ...
111
+ def find(self, query: str, node: LexborNode) -> list[LexborNode]: ...
112
+ def any_matches(self, query: str, node: LexborNode) -> bool: ...
113
+
114
+ class LexborNode:
115
+ """A class that represents HTML node (element)."""
116
+
117
+ parser: LexborHTMLParser
118
+
119
+ @property
120
+ def mem_id(self) -> int: ...
121
+ @property
122
+ def child(self) -> LexborNode | None:
123
+ """Alias for the `first_child` property.
124
+
125
+ **Deprecated**. Please use `first_child` instead.
126
+ """
127
+ ...
128
+
129
+ @property
130
+ def first_child(self) -> LexborNode | None:
131
+ """Return the first child node."""
132
+ ...
133
+
134
+ @property
135
+ def parent(self) -> LexborNode | None:
136
+ """Return the parent node."""
137
+ ...
138
+
139
+ @property
140
+ def next(self) -> LexborNode | None:
141
+ """Return next node."""
142
+ ...
143
+
144
+ @property
145
+ def prev(self) -> LexborNode | None:
146
+ """Return previous node."""
147
+ ...
148
+
149
+ @property
150
+ def last_child(self) -> LexborNode | None:
151
+ """Return last child node."""
152
+ ...
153
+
154
+ @property
155
+ def html(self) -> str | None:
156
+ """Return HTML representation of the current node including all its child nodes.
157
+
158
+ Returns
159
+ -------
160
+ text : str
161
+ """
162
+ ...
163
+
164
+ def __hash__(self) -> int: ...
165
+ def text_lexbor(self) -> str:
166
+ """Returns the text of the node including text of all its child nodes.
167
+
168
+ Uses builtin method from lexbor.
169
+ """
170
+ ...
171
+
172
+ def text(
173
+ self,
174
+ deep: bool = True,
175
+ separator: str = "",
176
+ strip: bool = False,
177
+ skip_empty: bool = False,
178
+ ) -> str:
179
+ """Return concatenated text from this node.
180
+
181
+ Parameters
182
+ ----------
183
+ deep : bool, optional
184
+ When ``True`` (default), include text from all descendant nodes; when
185
+ ``False``, only include direct children.
186
+ separator : str, optional
187
+ String inserted between successive text fragments.
188
+ strip : bool, optional
189
+ If ``True``, apply ``str.strip()`` to each fragment before joining to
190
+ remove surrounding whitespace. Defaults to ``False``.
191
+ skip_empty : bool, optional
192
+ Exclude text nodes that ``lxb_dom_node_is_empty`` considers empty when
193
+ ``True``. Defaults to ``False``.
194
+
195
+ Returns
196
+ -------
197
+ text : str
198
+ Combined textual content assembled according to the provided options.
199
+ """
200
+ ...
201
+
202
+ def css(self, query: str) -> list[LexborNode]:
203
+ """Evaluate CSS selector against current node and its child nodes.
204
+
205
+ Matches pattern `query` against HTML tree.
206
+ `CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
207
+
208
+ Special selectors:
209
+
210
+ - parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
211
+ - parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
212
+
213
+
214
+ Parameters
215
+ ----------
216
+ query : str
217
+ CSS selector (e.g. "div > :nth-child(2n+1):not(:has(a))").
218
+
219
+ Returns
220
+ -------
221
+ selector : list of `Node` objects
222
+ """
223
+ ...
224
+
225
+ @overload
226
+ def css_first(
227
+ self, query: str, default: Any = ..., strict: Literal[True] = ...
228
+ ) -> LexborNode:
229
+ """Same as `css` but returns only the first match.
230
+
231
+ Parameters
232
+ ----------
233
+
234
+ query : str
235
+ default : Any, default None
236
+ Default value to return if there is no match.
237
+ strict: bool, default False
238
+ Set to True if you want to check if there is strictly only one match in the document.
239
+
240
+
241
+ Returns
242
+ -------
243
+ selector : `LexborNode` object
244
+ """
245
+ ...
246
+
247
+ @overload
248
+ def css_first(
249
+ self, query: str, default: DefaultT, strict: bool = False
250
+ ) -> LexborNode | DefaultT:
251
+ """Same as `css` but returns only the first match.
252
+
253
+ Parameters
254
+ ----------
255
+
256
+ query : str
257
+ default : Any, default None
258
+ Default value to return if there is no match.
259
+ strict: bool, default False
260
+ Set to True if you want to check if there is strictly only one match in the document.
261
+
262
+
263
+ Returns
264
+ -------
265
+ selector : `LexborNode` object
266
+ """
267
+ ...
268
+
269
+ @overload
270
+ def css_first(
271
+ self, query: str, default: None = ..., strict: bool = False
272
+ ) -> LexborNode | None:
273
+ """Same as `css` but returns only the first match.
274
+
275
+ Parameters
276
+ ----------
277
+
278
+ query : str
279
+ default : Any, default None
280
+ Default value to return if there is no match.
281
+ strict: bool, default False
282
+ Set to True if you want to check if there is strictly only one match in the document.
283
+
284
+
285
+ Returns
286
+ -------
287
+ selector : `LexborNode` object
288
+ """
289
+ ...
290
+
291
+ def any_css_matches(self, selectors: tuple[str]) -> bool:
292
+ """Returns True if any of CSS selectors matches a node"""
293
+ ...
294
+
295
+ def css_matches(self, selector: str) -> bool:
296
+ """Returns True if CSS selector matches a node."""
297
+ ...
298
+
299
+ @property
300
+ def tag_id(self) -> int: ...
301
+ @property
302
+ def tag(self) -> str | None:
303
+ """Return the name of the current tag (e.g. div, p, img).
304
+
305
+ For non-tag nodes, returns the following names:
306
+
307
+ * `-text` - text node
308
+ * `-document` - document node
309
+ * `-comment` - comment node
310
+
311
+ Returns
312
+ -------
313
+ text : str
314
+ """
315
+ ...
316
+
317
+ def decompose(self, recursive: bool = True) -> None:
318
+ """Remove the current node from the tree.
319
+
320
+ Parameters
321
+ ----------
322
+ recursive : bool, default True
323
+ Whether to delete all its child nodes
324
+
325
+ Examples
326
+ --------
327
+
328
+ >>> tree = LexborHTMLParser(html)
329
+ >>> for tag in tree.css('script'):
330
+ >>> tag.decompose()
331
+ """
332
+ ...
333
+
334
+ def strip_tags(self, tags: list[str], recursive: bool = False) -> None:
335
+ """Remove specified tags from the HTML tree.
336
+
337
+ Parameters
338
+ ----------
339
+ tags : list
340
+ List of tags to remove.
341
+ recursive : bool, default False
342
+ Whether to delete all its child nodes
343
+
344
+ Examples
345
+ --------
346
+
347
+ >>> tree = LexborHTMLParser('<html><head></head><body><script></script><div>Hello world!</div></body></html>')
348
+ >>> tags = ['head', 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes']
349
+ >>> tree.strip_tags(tags)
350
+ >>> tree.html
351
+ '<html><body><div>Hello world!</div></body></html>'
352
+ """
353
+ ...
354
+
355
+ @property
356
+ def attributes(self) -> dict[str, str | None]:
357
+ """Get all attributes that belong to the current node.
358
+
359
+ The value of empty attributes is None.
360
+
361
+ Returns
362
+ -------
363
+ attributes : dictionary of all attributes.
364
+
365
+ Examples
366
+ --------
367
+
368
+ >>> tree = LexborHTMLParser("<div data id='my_id'></div>")
369
+ >>> node = tree.css_first('div')
370
+ >>> node.attributes
371
+ {'data': None, 'id': 'my_id'}
372
+ """
373
+ ...
374
+
375
+ @property
376
+ def attrs(self) -> LexborAttributes:
377
+ """A dict-like object that is similar to the ``attributes`` property, but operates directly on the Node data.
378
+
379
+ .. warning:: Use ``attributes`` instead, if you don't want to modify Node attributes.
380
+
381
+ Returns
382
+ -------
383
+ attributes : Attributes mapping object.
384
+
385
+ Examples
386
+ --------
387
+
388
+ >>> tree = LexborHTMLParser("<div id='a'></div>")
389
+ >>> node = tree.css_first('div')
390
+ >>> node.attrs
391
+ <div attributes, 1 items>
392
+ >>> node.attrs['id']
393
+ 'a'
394
+ >>> node.attrs['foo'] = 'bar'
395
+ >>> del node.attrs['id']
396
+ >>> node.attributes
397
+ {'foo': 'bar'}
398
+ >>> node.attrs['id'] = 'new_id'
399
+ >>> node.html
400
+ '<div foo="bar" id="new_id"></div>'
401
+ """
402
+ ...
403
+
404
+ @property
405
+ def id(self) -> str | None:
406
+ """Get the id attribute of the node.
407
+
408
+ Returns None if id is not set.
409
+
410
+ Returns
411
+ -------
412
+ text : str
413
+ """
414
+ ...
415
+
416
+ def iter(
417
+ self, include_text: bool = False, skip_empty: bool = False
418
+ ) -> Iterator[LexborNode]:
419
+ """Iterate over direct children of this node.
420
+
421
+ Parameters
422
+ ----------
423
+ include_text : bool, optional
424
+ When ``True``, yield text nodes in addition to element nodes. Defaults
425
+ to ``False``.
426
+ skip_empty : bool, optional
427
+ When ``include_text`` is ``True``, ignore text nodes that
428
+ ``lxb_dom_node_is_empty`` deems empty. Defaults to ``False``.
429
+
430
+ Yields
431
+ ------
432
+ LexborNode
433
+ Child nodes on the same tree level as this node, filtered according
434
+ to the provided options.
435
+ """
436
+ ...
437
+
438
+ def unwrap(self, delete_empty: bool = False) -> None:
439
+ """Replace node with whatever is inside this node.
440
+
441
+ Does nothing if you perform unwrapping second time on the same node.
442
+
443
+ Parameters
444
+ ----------
445
+ delete_empty : bool, default False
446
+ If True, removes empty tags.
447
+
448
+ Examples
449
+ --------
450
+
451
+ >>> tree = LexborHTMLParser("<div>Hello <i>world</i>!</div>")
452
+ >>> tree.css_first('i').unwrap()
453
+ >>> tree.html
454
+ '<html><head></head><body><div>Hello world!</div></body></html>'
455
+
456
+ Note: by default, empty tags are ignored, use "delete_empty" to change this.
457
+ """
458
+ ...
459
+
460
+ def unwrap_tags(self, tags: list[str], delete_empty: bool = False) -> None:
461
+ """Unwraps specified tags from the HTML tree.
462
+
463
+ Works the same as the ``unwrap`` method, but applied to a list of tags.
464
+
465
+ Parameters
466
+ ----------
467
+ tags : list
468
+ List of tags to remove.
469
+ delete_empty : bool, default False
470
+ If True, removes empty tags.
471
+
472
+ Examples
473
+ --------
474
+
475
+ >>> tree = LexborHTMLParser('<div><a href="">Hello</a> <i>world</i>!</div>')
476
+ >>> tree.body.unwrap_tags(['i','a'])
477
+ >>> tree.body.html
478
+ '<body><div>Hello world!</div></body>'
479
+
480
+ Note: by default, empty tags are ignored, use "delete_empty" to change this.
481
+ """
482
+ ...
483
+
484
+ def merge_text_nodes(self) -> None:
485
+ """Iterates over all text nodes and merges all text nodes that are close to each other.
486
+
487
+ This is useful for text extraction.
488
+ Use it when you need to strip HTML tags and merge "dangling" text.
489
+
490
+ Examples
491
+ --------
492
+
493
+ >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
494
+ >>> node = tree.css_first('div')
495
+ >>> tree.unwrap_tags(["strong"])
496
+ >>> tree.text(deep=True, separator=" ", strip=True)
497
+ "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
498
+ >>> node.merge_text_nodes()
499
+ >>> tree.text(deep=True, separator=" ", strip=True)
500
+ "John Doe"
501
+ """
502
+ ...
503
+
504
+ def traverse(
505
+ self, include_text: bool = False, skip_empty: bool = False
506
+ ) -> Iterator[LexborNode]:
507
+ """Depth-first traversal starting at the current node.
508
+
509
+ Parameters
510
+ ----------
511
+ include_text : bool, optional
512
+ When ``True``, include text nodes in the traversal sequence. Defaults
513
+ to ``False``.
514
+ skip_empty : bool, optional
515
+ Skip empty text nodes (as determined by ``lxb_dom_node_is_empty``)
516
+ when ``include_text`` is ``True``. Defaults to ``False``.
517
+
518
+ Yields
519
+ ------
520
+ LexborNode
521
+ Nodes encountered in depth-first order beginning with the current
522
+ node, filtered according to the provided options.
523
+ """
524
+ ...
525
+
526
+ def replace_with(self, value: bytes | str | LexborNode) -> None:
527
+ """Replace current Node with specified value.
528
+
529
+ Parameters
530
+ ----------
531
+ value : str, bytes or Node
532
+ The text or Node instance to replace the Node with.
533
+ When a text string is passed, it's treated as text. All HTML tags will be escaped.
534
+ Convert and pass the ``Node`` object when you want to work with HTML.
535
+ Does not clone the ``Node`` object.
536
+ All future changes to the passed ``Node`` object will also be taken into account.
537
+
538
+ Examples
539
+ --------
540
+
541
+ >>> tree = LexborHTMLParser('<div>Get <img src="" alt="Laptop"></div>')
542
+ >>> img = tree.css_first('img')
543
+ >>> img.replace_with(img.attributes.get('alt', ''))
544
+ >>> tree.body.child.html
545
+ '<div>Get Laptop</div>'
546
+
547
+ >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
548
+ >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
549
+ >>> img_node = html_parser.css_first('img')
550
+ >>> img_node.replace_with(html_parser2.body.child)
551
+ '<div>Get <span alt="Laptop"><div>Test</div> <div></div></span></div>'
552
+ """
553
+ ...
554
+
555
+ def insert_before(self, value: bytes | str | LexborNode) -> None:
556
+ """Insert a node before the current Node.
557
+
558
+ Parameters
559
+ ----------
560
+ value : str, bytes or Node
561
+ The text or Node instance to insert before the Node.
562
+ When a text string is passed, it's treated as text. All HTML tags will be escaped.
563
+ Convert and pass the ``Node`` object when you want to work with HTML.
564
+ Does not clone the ``Node`` object.
565
+ All future changes to the passed ``Node`` object will also be taken into account.
566
+
567
+ Examples
568
+ --------
569
+
570
+ >>> tree = LexborHTMLParser('<div>Get <img src="" alt="Laptop"></div>')
571
+ >>> img = tree.css_first('img')
572
+ >>> img.insert_before(img.attributes.get('alt', ''))
573
+ >>> tree.body.child.html
574
+ '<div>Get Laptop<img src="" alt="Laptop"></div>'
575
+
576
+ >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
577
+ >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
578
+ >>> img_node = html_parser.css_first('img')
579
+ >>> img_node.insert_before(html_parser2.body.child)
580
+ '<div>Get <span alt="Laptop"><div>Test</div><img src="/jpg"> <div></div></span></div>'
581
+ """
582
+ ...
583
+
584
+ def insert_after(self, value: bytes | str | LexborNode) -> None:
585
+ """Insert a node after the current Node.
586
+
587
+ Parameters
588
+ ----------
589
+ value : str, bytes or Node
590
+ The text or Node instance to insert after the Node.
591
+ When a text string is passed, it's treated as text. All HTML tags will be escaped.
592
+ Convert and pass the ``Node`` object when you want to work with HTML.
593
+ Does not clone the ``Node`` object.
594
+ All future changes to the passed ``Node`` object will also be taken into account.
595
+
596
+ Examples
597
+ --------
598
+
599
+ >>> tree = LexborHTMLParser('<div>Get <img src="" alt="Laptop"></div>')
600
+ >>> img = tree.css_first('img')
601
+ >>> img.insert_after(img.attributes.get('alt', ''))
602
+ >>> tree.body.child.html
603
+ '<div>Get <img src="" alt="Laptop">Laptop</div>'
604
+
605
+ >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
606
+ >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
607
+ >>> img_node = html_parser.css_first('img')
608
+ >>> img_node.insert_after(html_parser2.body.child)
609
+ '<div>Get <span alt="Laptop"><img src="/jpg"><div>Test</div> <div></div></span></div>'
610
+ """
611
+ ...
612
+
613
+ def insert_child(self, value: bytes | str | LexborNode) -> None:
614
+ """Insert a node inside (at the end of) the current Node.
615
+
616
+ Parameters
617
+ ----------
618
+ value : str, bytes or Node
619
+ The text or Node instance to insert inside the Node.
620
+ When a text string is passed, it's treated as text. All HTML tags will be escaped.
621
+ Convert and pass the ``Node`` object when you want to work with HTML.
622
+ Does not clone the ``Node`` object.
623
+ All future changes to the passed ``Node`` object will also be taken into account.
624
+
625
+ Examples
626
+ --------
627
+
628
+ >>> tree = LexborHTMLParser('<div>Get <img src=""></div>')
629
+ >>> div = tree.css_first('div')
630
+ >>> div.insert_child('Laptop')
631
+ >>> tree.body.child.html
632
+ '<div>Get <img src="">Laptop</div>'
633
+
634
+ >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"> <div>Laptop</div> </span></div>')
635
+ >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
636
+ >>> span_node = html_parser.css_first('span')
637
+ >>> span_node.insert_child(html_parser2.body.child)
638
+ '<div>Get <span alt="Laptop"> <div>Laptop</div> <div>Test</div> </span></div>'
639
+ """
640
+ ...
641
+
642
+ @property
643
+ def raw_value(self) -> NoReturn:
644
+ """Return the raw (unparsed, original) value of a node.
645
+
646
+ Currently, works on text nodes only.
647
+
648
+ Returns
649
+ -------
650
+
651
+ raw_value : bytes
652
+
653
+ Examples
654
+ --------
655
+
656
+ >>> html_parser = LexborHTMLParser('<div>&#x3C;test&#x3E;</div>')
657
+ >>> selector = html_parser.css_first('div')
658
+ >>> selector.child.html
659
+ '&lt;test&gt;'
660
+ >>> selector.child.raw_value
661
+ b'&#x3C;test&#x3E;'
662
+ """
663
+ ...
664
+
665
+ def scripts_contain(self, query: str) -> bool:
666
+ """Returns True if any of the script tags contain specified text.
667
+
668
+ Caches script tags on the first call to improve performance.
669
+
670
+ Parameters
671
+ ----------
672
+ query : str
673
+ The query to check.
674
+ """
675
+ ...
676
+
677
+ def script_srcs_contain(self, queries: tuple[str]) -> bool:
678
+ """Returns True if any of the script SRC attributes contain one of the specified texts.
679
+
680
+ Caches values on the first call to improve performance.
681
+
682
+ Parameters
683
+ ----------
684
+ queries : tuple of str
685
+ """
686
+ ...
687
+
688
+ def remove(self, recursive: bool = True) -> None:
689
+ """An alias for the decompose method."""
690
+ ...
691
+
692
+ def select(self, query: str | None = None) -> LexborSelector:
693
+ """Select nodes given a CSS selector.
694
+
695
+ Works similarly to the ``css`` method, but supports chained filtering and extra features.
696
+
697
+ Parameters
698
+ ----------
699
+ query : str or None
700
+ The CSS selector to use when searching for nodes.
701
+
702
+ Returns
703
+ -------
704
+ selector : The `Selector` class.
705
+ """
706
+ ...
707
+
708
+ @property
709
+ def text_content(self) -> str | None:
710
+ """Returns the text of the node if it is a text node.
711
+
712
+ Returns None for other nodes.
713
+ Unlike the ``text`` method, does not include child nodes.
714
+
715
+ Returns
716
+ -------
717
+ text : str or None.
718
+ """
719
+ ...
720
+
721
+ @property
722
+ def inner_html(self) -> str | None:
723
+ """Return HTML representation of the child nodes.
724
+
725
+ Works similar to innerHTML in JavaScript.
726
+ Unlike the `.html` property, does not include the current node.
727
+ Can be used to set HTML as well. See the setter docstring.
728
+
729
+ Returns
730
+ -------
731
+ text : str or None
732
+ """
733
+ ...
734
+
735
+ @inner_html.setter
736
+ def inner_html(self, html: str):
737
+ """Set inner HTML to the specified HTML.
738
+
739
+ Replaces existing data inside the node.
740
+ Works similar to innerHTML in JavaScript.
741
+
742
+ Parameters
743
+ ----------
744
+ html : str
745
+
746
+ """
747
+ ...
748
+
749
+ def clone(self) -> LexborNode:
750
+ """Clone the current node.
751
+
752
+ You can use it to make temporary modifications without affecting the original HTML tree.
753
+
754
+ It is tied to the current parser instance.
755
+ Gets destroyed when parser instance is destroyed.
756
+ """
757
+ ...
758
+
759
+ @property
760
+ def is_element_node(self) -> bool:
761
+ """Return True if the node represents an element node."""
762
+ ...
763
+
764
+ @property
765
+ def is_text_node(self) -> bool:
766
+ """Return True if the node represents a text node."""
767
+ ...
768
+
769
+ @property
770
+ def is_comment_node(self) -> bool:
771
+ """Return True if the node represents a comment node."""
772
+ ...
773
+
774
+ @property
775
+ def is_document_node(self) -> bool:
776
+ """Return True if the node represents a document node."""
777
+ ...
778
+
779
+ @property
780
+ def is_empty_text_node(self) -> bool:
781
+ """Check whether the current node is an empty text node.
782
+
783
+ Returns
784
+ -------
785
+ bool
786
+ ``True`` when the node is a text node and ``lxb_dom_node_is_empty``
787
+ reports that it contains no characters.
788
+ """
789
+ ...
790
+
791
+ class LexborHTMLParser:
792
+ """The lexbor HTML parser.
793
+
794
+ Use this class to parse raw HTML.
795
+
796
+ This parser mimics most of the stuff from ``HTMLParser`` but does not inherit from it directly.
797
+
798
+ Parameters
799
+ ----------
800
+
801
+ html : str (unicode) or bytes
802
+ """
803
+
804
+ raw_html: bytes
805
+
806
+ def __init__(self, html: str | bytes, is_fragment: bool = False) -> None:
807
+ """Create a parser and load HTML.
808
+
809
+ Parameters
810
+ ----------
811
+ html : str or bytes
812
+ HTML content to parse.
813
+ is_fragment : bool, optional
814
+ When ``False`` (default), the input is parsed as a full HTML document.
815
+ If the input is only a fragment, the parser still accepts it and inserts any missing required elements,
816
+ (such as `<html>`, `<head>`, and `<body>`) into the tree,
817
+ according to the HTML parsing rules in the HTML Standard.
818
+ This matches how browsers construct the DOM when they load an HTML page.
819
+ When ``True``, the input is parsed as an HTML fragment.
820
+ The parser does not insert any missing required HTML elements.
821
+ """
822
+ ...
823
+
824
+ def __repr__(self) -> str:
825
+ """Return a concise representation of the parsed document.
826
+
827
+ Returns
828
+ -------
829
+ str
830
+ A string showing the number of characters in the parsed HTML.
831
+ """
832
+ ...
833
+
834
+ @property
835
+ def selector(self) -> LexborCSSSelector:
836
+ """Return a lazily created CSS selector helper.
837
+
838
+ Returns
839
+ -------
840
+ LexborCSSSelector
841
+ Selector instance bound to this parser.
842
+ """
843
+ ...
844
+
845
+ @property
846
+ def root(self) -> LexborNode | None:
847
+ """Return the document root node.
848
+
849
+ Returns
850
+ -------
851
+ LexborNode or None
852
+ Root of the parsed document, or ``None`` if unavailable.
853
+ """
854
+ ...
855
+
856
+ @property
857
+ def body(self) -> LexborNode | None:
858
+ """Return document body.
859
+
860
+ Returns
861
+ -------
862
+ LexborNode or None
863
+ ``<body>`` element when present, otherwise ``None``.
864
+ """
865
+ ...
866
+
867
+ @property
868
+ def head(self) -> LexborNode | None:
869
+ """Return document head.
870
+
871
+ Returns
872
+ -------
873
+ LexborNode or None
874
+ ``<head>`` element when present, otherwise ``None``.
875
+ """
876
+ ...
877
+
878
+ def tags(self, name: str) -> list[LexborNode]:
879
+ """Return all tags that match the provided name.
880
+
881
+ Parameters
882
+ ----------
883
+ name : str
884
+ Tag name to search for (e.g., ``"div"``).
885
+
886
+ Returns
887
+ -------
888
+ list of LexborNode
889
+ Matching elements in document order.
890
+
891
+ Raises
892
+ ------
893
+ ValueError
894
+ If ``name`` is empty or longer than 100 characters.
895
+ SelectolaxError
896
+ If Lexbor cannot locate the elements.
897
+ """
898
+ ...
899
+
900
+ def text(
901
+ self,
902
+ deep: bool = True,
903
+ separator: str = "",
904
+ strip: bool = False,
905
+ skip_empty: bool = False,
906
+ ) -> str:
907
+ """Returns the text of the node including text of all its child nodes.
908
+
909
+ Parameters
910
+ ----------
911
+ strip : bool, default False
912
+ If true, calls ``str.strip()`` on each text part to remove extra white spaces.
913
+ separator : str, default ''
914
+ The separator to use when joining text from different nodes.
915
+ deep : bool, default True
916
+ If True, includes text from all child nodes.
917
+ skip_empty : bool, optional
918
+ Exclude text nodes that ``lxb_dom_node_is_empty`` considers empty when
919
+ ``True``. Defaults to ``False``.
920
+
921
+ Returns
922
+ -------
923
+ text : str
924
+ Combined textual content assembled according to the provided options.
925
+ """
926
+ ...
927
+
928
+ @property
929
+ def html(self) -> str | None:
930
+ """Return HTML representation of the page.
931
+
932
+ Returns
933
+ -------
934
+ str or None
935
+ Serialized HTML of the current document.
936
+ """
937
+ ...
938
+
939
+ def css(self, query: str) -> list[LexborNode]:
940
+ """A CSS selector.
941
+
942
+ Matches pattern `query` against HTML tree.
943
+ `CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
944
+
945
+ Special selectors:
946
+
947
+ - parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
948
+ - parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
949
+
950
+ Parameters
951
+ ----------
952
+ query : str
953
+ CSS selector (e.g. "div > :nth-child(2n+1):not(:has(a))").
954
+
955
+ Returns
956
+ -------
957
+ selector : list of `Node` objects
958
+ """
959
+ ...
960
+
961
+ @overload
962
+ def css_first(
963
+ self, query: str, default: Any = ..., strict: Literal[True] = ...
964
+ ) -> LexborNode:
965
+ """Same as `css` but returns only the first match.
966
+
967
+ Parameters
968
+ ----------
969
+
970
+ query : str
971
+ default : Any, default None
972
+ Default value to return if there is no match.
973
+ strict: bool, default False
974
+ Set to True if you want to check if there is strictly only one match in the document.
975
+
976
+
977
+ Returns
978
+ -------
979
+ selector : `LexborNode` object
980
+ """
981
+ ...
982
+
983
+ @overload
984
+ def css_first(
985
+ self, query: str, default: DefaultT, strict: bool = False
986
+ ) -> LexborNode | DefaultT:
987
+ """Same as `css` but returns only the first match.
988
+
989
+ Parameters
990
+ ----------
991
+
992
+ query : str
993
+ default : Any, default None
994
+ Default value to return if there is no match.
995
+ strict: bool, default False
996
+ Set to True if you want to check if there is strictly only one match in the document.
997
+
998
+
999
+ Returns
1000
+ -------
1001
+ selector : `LexborNode` object
1002
+ """
1003
+ ...
1004
+
1005
+ @overload
1006
+ def css_first(
1007
+ self, query: str, default: None = ..., strict: bool = False
1008
+ ) -> LexborNode | None:
1009
+ """Same as `css` but returns only the first match.
1010
+
1011
+ Parameters
1012
+ ----------
1013
+
1014
+ query : str
1015
+ default : Any, default None
1016
+ Default value to return if there is no match.
1017
+ strict: bool, default False
1018
+ Set to True if you want to check if there is strictly only one match in the document.
1019
+
1020
+
1021
+ Returns
1022
+ -------
1023
+ selector : `LexborNode` object
1024
+ """
1025
+ ...
1026
+
1027
+ def strip_tags(self, tags: list[str], recursive: bool = False) -> None:
1028
+ """Remove specified tags from the node.
1029
+
1030
+ Parameters
1031
+ ----------
1032
+ tags : list of str
1033
+ List of tags to remove.
1034
+ recursive : bool, default False
1035
+ Whether to delete all its child nodes
1036
+
1037
+ Examples
1038
+ --------
1039
+
1040
+ >>> tree = LexborHTMLParser('<html><head></head><body><script></script><div>Hello world!</div></body></html>')
1041
+ >>> tags = ['head', 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes']
1042
+ >>> tree.strip_tags(tags)
1043
+ >>> tree.html
1044
+ '<html><body><div>Hello world!</div></body></html>'
1045
+
1046
+ Returns
1047
+ -------
1048
+ None
1049
+ """
1050
+ ...
1051
+
1052
+ def select(self, query: str | None = None) -> LexborSelector | None:
1053
+ """Select nodes given a CSS selector.
1054
+
1055
+ Works similarly to the ``css`` method, but supports chained filtering and extra features.
1056
+
1057
+ Parameters
1058
+ ----------
1059
+ query : str or None
1060
+ The CSS selector to use when searching for nodes.
1061
+
1062
+ Returns
1063
+ -------
1064
+ LexborSelector or None
1065
+ Selector bound to the root node, or ``None`` if the document is empty.
1066
+ """
1067
+ ...
1068
+
1069
+ def any_css_matches(self, selectors: tuple[str]) -> bool:
1070
+ """Return ``True`` if any of the specified CSS selectors match.
1071
+
1072
+ Parameters
1073
+ ----------
1074
+ selectors : tuple of str
1075
+ CSS selectors to evaluate.
1076
+
1077
+ Returns
1078
+ -------
1079
+ bool
1080
+ ``True`` when at least one selector matches.
1081
+ """
1082
+ ...
1083
+
1084
+ def scripts_contain(self, query: str) -> bool:
1085
+ """Return ``True`` if any script tag contains the given text.
1086
+
1087
+ Caches script tags on the first call to improve performance.
1088
+
1089
+ Parameters
1090
+ ----------
1091
+ query : str
1092
+ Text to search for within script contents.
1093
+
1094
+ Returns
1095
+ -------
1096
+ bool
1097
+ ``True`` when a matching script tag is found.
1098
+ """
1099
+ ...
1100
+
1101
+ def script_srcs_contain(self, queries: tuple[str]) -> bool:
1102
+ """Return ``True`` if any script ``src`` contains one of the strings.
1103
+
1104
+ Caches values on the first call to improve performance.
1105
+
1106
+ Parameters
1107
+ ----------
1108
+ queries : tuple of str
1109
+ Strings to look for inside ``src`` attributes.
1110
+
1111
+ Returns
1112
+ -------
1113
+ bool
1114
+ ``True`` when a matching source value is found.
1115
+ """
1116
+ ...
1117
+
1118
+ def css_matches(self, selector: str) -> bool:
1119
+ """Return ``True`` if the document matches the selector at least once.
1120
+
1121
+ Parameters
1122
+ ----------
1123
+ selector : str
1124
+ CSS selector to test.
1125
+
1126
+ Returns
1127
+ -------
1128
+ bool
1129
+ ``True`` when a match exists.
1130
+ """
1131
+ ...
1132
+
1133
+ def merge_text_nodes(self) -> None:
1134
+ """Iterates over all text nodes and merges all text nodes that are close to each other.
1135
+
1136
+ This is useful for text extraction.
1137
+ Use it when you need to strip HTML tags and merge "dangling" text.
1138
+
1139
+ Examples
1140
+ --------
1141
+
1142
+ >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
1143
+ >>> node = tree.css_first('div')
1144
+ >>> tree.unwrap_tags(["strong"])
1145
+ >>> tree.text(deep=True, separator=" ", strip=True)
1146
+ "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
1147
+ >>> node.merge_text_nodes()
1148
+ >>> tree.text(deep=True, separator=" ", strip=True)
1149
+ "John Doe"
1150
+
1151
+ Returns
1152
+ -------
1153
+ None
1154
+ """
1155
+ ...
1156
+
1157
+ def clone(self) -> LexborHTMLParser:
1158
+ """Clone the current document tree.
1159
+
1160
+ You can use it to do temporary modifications without affecting the original HTML tree.
1161
+ It is tied to the current parser instance.
1162
+ Gets destroyed when the parser instance is destroyed.
1163
+
1164
+ Returns
1165
+ -------
1166
+ LexborHTMLParser
1167
+ A parser instance backed by a deep-copied document.
1168
+ """
1169
+ ...
1170
+
1171
+ def unwrap_tags(self, tags: list[str], delete_empty: bool = False) -> None:
1172
+ """Unwraps specified tags from the HTML tree.
1173
+
1174
+ Works the same as the ``unwrap`` method, but applied to a list of tags.
1175
+
1176
+ Parameters
1177
+ ----------
1178
+ tags : list of str
1179
+ List of tags to unwrap.
1180
+ delete_empty : bool, default False
1181
+ Whether to delete empty tags.
1182
+
1183
+ Examples
1184
+ --------
1185
+
1186
+ >>> tree = LexborHTMLParser('<div><a href="">Hello</a> <i>world</i>!</div>')
1187
+ >>> tree.body.unwrap_tags(['i','a'])
1188
+ >>> tree.body.html
1189
+ '<body><div>Hello world!</div></body>'
1190
+
1191
+ Returns
1192
+ -------
1193
+ None
1194
+ """
1195
+ ...
1196
+
1197
+ @property
1198
+ def inner_html(self) -> str:
1199
+ """Return HTML representation of the child nodes.
1200
+
1201
+ Works similar to innerHTML in JavaScript.
1202
+ Unlike the `.html` property, does not include the current node.
1203
+ Can be used to set HTML as well. See the setter docstring.
1204
+
1205
+ Returns
1206
+ -------
1207
+ text : str | None
1208
+ """
1209
+ ...
1210
+
1211
+ @inner_html.setter
1212
+ def inner_html(self, html: str) -> None:
1213
+ """Set inner HTML to the specified HTML.
1214
+
1215
+ Replaces existing data inside the node.
1216
+ Works similar to innerHTML in JavaScript.
1217
+
1218
+ Parameters
1219
+ ----------
1220
+ html : str
1221
+
1222
+ Returns
1223
+ -------
1224
+ None
1225
+ """
1226
+ ...
1227
+
1228
+ def create_tag(tag: str) -> LexborNode:
1229
+ """
1230
+ Given an HTML tag name, e.g. `"div"`, create a single empty node for that tag,
1231
+ e.g. `"<div></div>"`.
1232
+ """
1233
+ ...
1234
+
1235
+ def parse_fragment(html: str) -> list[LexborNode]:
1236
+ """
1237
+ Given HTML, parse it into a list of Nodes, such that the nodes
1238
+ correspond to the given HTML.
1239
+
1240
+ By contrast, HTMLParser adds `<html>`, `<head>`, and `<body>` tags
1241
+ if they are missing. This function does not add these tags.
1242
+ """
1243
+ ...
1244
+
1245
+ class SelectolaxError(Exception):
1246
+ """An exception that indicates an error."""
1247
+
1248
+ pass