selectolax 0.3.28__cp310-cp310-win32.whl → 0.3.34__cp310-cp310-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +3 -5
- selectolax/lexbor/attrs.pxi +26 -9
- selectolax/lexbor/node.pxi +119 -46
- selectolax/lexbor/selection.pxi +34 -25
- selectolax/lexbor/util.pxi +1 -0
- selectolax/lexbor.c +52987 -55179
- selectolax/lexbor.cp310-win32.pyd +0 -0
- selectolax/lexbor.pxd +36 -40
- selectolax/lexbor.pyi +770 -65
- selectolax/lexbor.pyx +58 -19
- selectolax/modest/node.pxi +64 -45
- selectolax/modest/selection.pxi +24 -22
- selectolax/modest/util.pxi +1 -0
- selectolax/parser.c +50190 -52171
- selectolax/parser.cp310-win32.pyd +0 -0
- selectolax/parser.pxd +17 -20
- selectolax/parser.pyi +489 -52
- selectolax/parser.pyx +43 -33
- selectolax/utils.pxi +13 -3
- selectolax-0.3.34.dist-info/METADATA +32 -0
- selectolax-0.3.34.dist-info/RECORD +26 -0
- {selectolax-0.3.28.dist-info → selectolax-0.3.34.dist-info}/WHEEL +1 -1
- selectolax-0.3.28.dist-info/METADATA +0 -193
- selectolax-0.3.28.dist-info/RECORD +0 -26
- {selectolax-0.3.28.dist-info → selectolax-0.3.34.dist-info/licenses}/LICENSE +0 -0
- {selectolax-0.3.28.dist-info → selectolax-0.3.34.dist-info}/top_level.txt +0 -0
selectolax/parser.pyi
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
|
-
from typing import
|
|
1
|
+
from typing import Iterator, Literal, TypeVar, overload
|
|
2
2
|
|
|
3
3
|
DefaultT = TypeVar("DefaultT")
|
|
4
4
|
|
|
5
5
|
class _Attributes:
|
|
6
|
+
"""A dict-like object that represents attributes."""
|
|
7
|
+
|
|
6
8
|
@staticmethod
|
|
7
9
|
def create(node: Node, decode_errors: str) -> _Attributes: ...
|
|
8
10
|
def keys(self) -> Iterator[str]: ...
|
|
@@ -22,7 +24,9 @@ class _Attributes:
|
|
|
22
24
|
@overload
|
|
23
25
|
def sget(self, key: str, default: str | DefaultT) -> str | DefaultT: ...
|
|
24
26
|
@overload
|
|
25
|
-
def sget(self, key: str, default: str = "") -> str:
|
|
27
|
+
def sget(self, key: str, default: str = "") -> str:
|
|
28
|
+
"""Same as get, but returns empty strings instead of None values for empty attributes."""
|
|
29
|
+
...
|
|
26
30
|
|
|
27
31
|
class Selector:
|
|
28
32
|
"""An advanced CSS selector that supports additional operations.
|
|
@@ -69,47 +73,133 @@ class Selector:
|
|
|
69
73
|
...
|
|
70
74
|
|
|
71
75
|
class Node:
|
|
76
|
+
"""A class that represents HTML node (element)."""
|
|
77
|
+
|
|
72
78
|
parser: HTMLParser
|
|
73
79
|
@property
|
|
74
80
|
def attributes(self) -> dict[str, str | None]:
|
|
75
81
|
"""Get all attributes that belong to the current node.
|
|
76
82
|
|
|
77
|
-
The value of empty attributes is None.
|
|
83
|
+
The value of empty attributes is None.
|
|
84
|
+
|
|
85
|
+
Returns
|
|
86
|
+
-------
|
|
87
|
+
attributes : dictionary of all attributes.
|
|
88
|
+
|
|
89
|
+
Examples
|
|
90
|
+
--------
|
|
91
|
+
|
|
92
|
+
>>> tree = HTMLParser("<div data id='my_id'></div>")
|
|
93
|
+
>>> node = tree.css_first('div')
|
|
94
|
+
>>> node.attributes
|
|
95
|
+
{'data': None, 'id': 'my_id'}
|
|
96
|
+
"""
|
|
78
97
|
...
|
|
79
98
|
@property
|
|
80
99
|
def attrs(self) -> _Attributes:
|
|
81
|
-
"""A dict-like object that is similar to the attributes property, but operates directly on the Node data.
|
|
100
|
+
"""A dict-like object that is similar to the ``attributes`` property, but operates directly on the Node data.
|
|
101
|
+
|
|
102
|
+
.. warning:: Use ``attributes`` instead, if you don't want to modify Node attributes.
|
|
103
|
+
|
|
104
|
+
Returns
|
|
105
|
+
-------
|
|
106
|
+
attributes : Attributes mapping object.
|
|
107
|
+
|
|
108
|
+
Examples
|
|
109
|
+
--------
|
|
110
|
+
|
|
111
|
+
>>> tree = HTMLParser("<div id='a'></div>")
|
|
112
|
+
>>> node = tree.css_first('div')
|
|
113
|
+
>>> node.attrs
|
|
114
|
+
<div attributes, 1 items>
|
|
115
|
+
>>> node.attrs['id']
|
|
116
|
+
'a'
|
|
117
|
+
>>> node.attrs['foo'] = 'bar'
|
|
118
|
+
>>> del node.attrs['id']
|
|
119
|
+
>>> node.attributes
|
|
120
|
+
{'foo': 'bar'}
|
|
121
|
+
>>> node.attrs['id'] = 'new_id'
|
|
122
|
+
>>> node.html
|
|
123
|
+
'<div foo="bar" id="new_id"></div>'
|
|
124
|
+
"""
|
|
82
125
|
...
|
|
83
126
|
@property
|
|
84
127
|
def id(self) -> str | None:
|
|
85
128
|
"""Get the id attribute of the node.
|
|
86
129
|
|
|
87
|
-
Returns None if id does not set.
|
|
130
|
+
Returns None if the id is not set.
|
|
131
|
+
|
|
132
|
+
Returns
|
|
133
|
+
-------
|
|
134
|
+
text : str
|
|
135
|
+
"""
|
|
88
136
|
...
|
|
89
137
|
|
|
90
138
|
def mem_id(self) -> int:
|
|
91
|
-
"""Get the mem_id of the node.
|
|
139
|
+
"""Get the mem_id attribute of the node.
|
|
92
140
|
|
|
93
|
-
Returns
|
|
141
|
+
Returns
|
|
142
|
+
-------
|
|
143
|
+
text : int
|
|
144
|
+
"""
|
|
94
145
|
...
|
|
95
146
|
|
|
96
147
|
def __hash__(self) -> int:
|
|
97
|
-
"""
|
|
148
|
+
"""Get the hash of this node
|
|
98
149
|
:return: int
|
|
99
150
|
"""
|
|
100
151
|
...
|
|
101
152
|
def text(self, deep: bool = True, separator: str = "", strip: bool = False) -> str:
|
|
102
|
-
"""Returns the text of the node including text of all its child nodes.
|
|
153
|
+
"""Returns the text of the node including text of all its child nodes.
|
|
154
|
+
|
|
155
|
+
Parameters
|
|
156
|
+
----------
|
|
157
|
+
strip : bool, default False
|
|
158
|
+
If true, calls ``str.strip()`` on each text part to remove extra white spaces.
|
|
159
|
+
separator : str, default ''
|
|
160
|
+
The separator to use when joining text from different nodes.
|
|
161
|
+
deep : bool, default True
|
|
162
|
+
If True, includes text from all child nodes.
|
|
163
|
+
|
|
164
|
+
Returns
|
|
165
|
+
-------
|
|
166
|
+
text : str
|
|
167
|
+
"""
|
|
103
168
|
...
|
|
104
169
|
def iter(self, include_text: bool = False) -> Iterator[Node]:
|
|
105
|
-
"""Iterate over nodes on the current level.
|
|
170
|
+
"""Iterate over nodes on the current level.
|
|
171
|
+
|
|
172
|
+
Parameters
|
|
173
|
+
----------
|
|
174
|
+
include_text : bool
|
|
175
|
+
If True, includes text nodes as well.
|
|
176
|
+
|
|
177
|
+
Yields
|
|
178
|
+
-------
|
|
179
|
+
node
|
|
180
|
+
"""
|
|
106
181
|
...
|
|
107
182
|
def traverse(self, include_text: bool = False) -> Iterator[Node]:
|
|
108
|
-
"""Iterate over all child and next nodes starting from the current level.
|
|
183
|
+
"""Iterate over all child and next nodes starting from the current level.
|
|
184
|
+
|
|
185
|
+
Parameters
|
|
186
|
+
----------
|
|
187
|
+
include_text : bool
|
|
188
|
+
If True, includes text nodes as well.
|
|
189
|
+
|
|
190
|
+
Yields
|
|
191
|
+
-------
|
|
192
|
+
node
|
|
193
|
+
"""
|
|
109
194
|
...
|
|
110
195
|
@property
|
|
111
196
|
def tag(self) -> str:
|
|
112
|
-
"""Return the name of the current tag (e.g. div, p, img).
|
|
197
|
+
"""Return the name of the current tag (e.g. div, p, img).
|
|
198
|
+
|
|
199
|
+
Returns
|
|
200
|
+
-------
|
|
201
|
+
text : str
|
|
202
|
+
"""
|
|
113
203
|
...
|
|
114
204
|
@property
|
|
115
205
|
def child(self) -> Node | None:
|
|
@@ -133,7 +223,12 @@ class Node:
|
|
|
133
223
|
...
|
|
134
224
|
@property
|
|
135
225
|
def html(self) -> str | None:
|
|
136
|
-
"""Return HTML representation of the current node including all its child nodes.
|
|
226
|
+
"""Return HTML representation of the current node including all its child nodes.
|
|
227
|
+
|
|
228
|
+
Returns
|
|
229
|
+
-------
|
|
230
|
+
text : str
|
|
231
|
+
"""
|
|
137
232
|
...
|
|
138
233
|
def css(self, query: str) -> list[Node]:
|
|
139
234
|
"""Evaluate CSS selector against current node and its child nodes."""
|
|
@@ -145,84 +240,315 @@ class Node:
|
|
|
145
240
|
"""Returns True if CSS selector matches a node."""
|
|
146
241
|
...
|
|
147
242
|
@overload
|
|
148
|
-
def css_first(
|
|
149
|
-
self, query: str, default: Any = ..., strict: Literal[True] = ...
|
|
150
|
-
) -> Node: ...
|
|
151
|
-
@overload
|
|
152
243
|
def css_first(
|
|
153
244
|
self, query: str, default: DefaultT, strict: bool = False
|
|
154
245
|
) -> Node | DefaultT: ...
|
|
155
246
|
@overload
|
|
156
247
|
def css_first(
|
|
157
|
-
self, query: str, default: None =
|
|
158
|
-
) -> Node | None:
|
|
248
|
+
self, query: str, default: None = None, strict: bool = False
|
|
249
|
+
) -> Node | None | DefaultT:
|
|
159
250
|
"""Evaluate CSS selector against current node and its child nodes."""
|
|
160
251
|
...
|
|
161
252
|
def decompose(self, recursive: bool = True) -> None:
|
|
162
|
-
"""Remove a Node from the tree.
|
|
253
|
+
"""Remove a Node from the tree.
|
|
254
|
+
|
|
255
|
+
Parameters
|
|
256
|
+
----------
|
|
257
|
+
recursive : bool, default True
|
|
258
|
+
Whether to delete all its child nodes.
|
|
259
|
+
|
|
260
|
+
Examples
|
|
261
|
+
--------
|
|
262
|
+
|
|
263
|
+
>>> tree = HTMLParser(html)
|
|
264
|
+
>>> for tag in tree.css('script'):
|
|
265
|
+
...     tag.decompose()
|
|
266
|
+
"""
|
|
163
267
|
...
|
|
164
268
|
def remove(self, recursive: bool = True) -> None:
|
|
165
269
|
"""An alias for the decompose method."""
|
|
166
270
|
...
|
|
167
|
-
def unwrap(self) -> None:
|
|
168
|
-
"""Replace node with whatever is inside this node.
|
|
271
|
+
def unwrap(self, delete_empty: bool = False) -> None:
|
|
272
|
+
"""Replace node with whatever is inside this node.
|
|
273
|
+
|
|
274
|
+
Parameters
|
|
275
|
+
----------
|
|
276
|
+
delete_empty : bool, default False
|
|
277
|
+
Whether to delete empty tags.
|
|
278
|
+
|
|
279
|
+
Examples
|
|
280
|
+
--------
|
|
281
|
+
|
|
282
|
+
>>> tree = HTMLParser("<div>Hello <i>world</i>!</div>")
|
|
283
|
+
>>> tree.css_first('i').unwrap()
|
|
284
|
+
>>> tree.html
|
|
285
|
+
'<html><head></head><body><div>Hello world!</div></body></html>'
|
|
286
|
+
|
|
287
|
+
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
|
|
288
|
+
"""
|
|
169
289
|
...
|
|
170
290
|
def strip_tags(self, tags: list[str], recursive: bool = False) -> None:
|
|
171
|
-
"""Remove specified tags from the HTML tree.
|
|
291
|
+
"""Remove specified tags from the HTML tree.
|
|
292
|
+
|
|
293
|
+
Parameters
|
|
294
|
+
----------
|
|
295
|
+
tags : list
|
|
296
|
+
List of tags to remove.
|
|
297
|
+
recursive : bool, default False
|
|
298
|
+
Whether to delete all its child nodes.
|
|
299
|
+
|
|
300
|
+
Examples
|
|
301
|
+
--------
|
|
302
|
+
|
|
303
|
+
>>> tree = HTMLParser('<html><head></head><body><script></script><div>Hello world!</div></body></html>')
|
|
304
|
+
>>> tags = ['head', 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes']
|
|
305
|
+
>>> tree.strip_tags(tags)
|
|
306
|
+
>>> tree.html
|
|
307
|
+
'<html><body><div>Hello world!</div></body></html>'
|
|
308
|
+
"""
|
|
172
309
|
...
|
|
173
|
-
def unwrap_tags(self, tags: list[str]) -> None:
|
|
310
|
+
def unwrap_tags(self, tags: list[str], delete_empty: bool = False) -> None:
|
|
174
311
|
"""Unwraps specified tags from the HTML tree.
|
|
175
312
|
|
|
176
|
-
Works the same as the unwrap method, but applied to a list of tags.
|
|
313
|
+
Works the same as the unwrap method, but applied to a list of tags.
|
|
314
|
+
|
|
315
|
+
Parameters
|
|
316
|
+
----------
|
|
317
|
+
tags : list
|
|
318
|
+
List of tags to remove.
|
|
319
|
+
delete_empty : bool, default False
|
|
320
|
+
Whether to delete empty tags.
|
|
321
|
+
|
|
322
|
+
Examples
|
|
323
|
+
--------
|
|
324
|
+
|
|
325
|
+
>>> tree = HTMLParser('<div><a href="">Hello</a> <i>world</i>!</div>')
|
|
326
|
+
>>> tree.body.unwrap_tags(['i','a'])
|
|
327
|
+
>>> tree.body.html
|
|
328
|
+
'<body><div>Hello world!</div></body>'
|
|
329
|
+
|
|
330
|
+
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
|
|
331
|
+
"""
|
|
177
332
|
...
|
|
178
333
|
def replace_with(self, value: str | bytes | None) -> None:
|
|
179
|
-
"""Replace current Node with specified value.
|
|
334
|
+
"""Replace current Node with specified value.
|
|
335
|
+
|
|
336
|
+
Parameters
|
|
337
|
+
----------
|
|
338
|
+
value : str, bytes or Node
|
|
339
|
+
The text or Node instance to replace the Node with.
|
|
340
|
+
When a text string is passed, it's treated as text. All HTML tags will be escaped.
|
|
341
|
+
Convert and pass the ``Node`` object when you want to work with HTML.
|
|
342
|
+
Does not clone the ``Node`` object.
|
|
343
|
+
All future changes to the passed ``Node`` object will also be taken into account.
|
|
344
|
+
|
|
345
|
+
Examples
|
|
346
|
+
--------
|
|
347
|
+
|
|
348
|
+
>>> tree = HTMLParser('<div>Get <img src="" alt="Laptop"></div>')
|
|
349
|
+
>>> img = tree.css_first('img')
|
|
350
|
+
>>> img.replace_with(img.attributes.get('alt', ''))
|
|
351
|
+
>>> tree.body.child.html
|
|
352
|
+
'<div>Get Laptop</div>'
|
|
353
|
+
|
|
354
|
+
>>> html_parser = HTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
|
|
355
|
+
>>> html_parser2 = HTMLParser('<div>Test</div>')
|
|
356
|
+
>>> img_node = html_parser.css_first('img')
|
|
357
|
+
>>> img_node.replace_with(html_parser2.body.child)
|
|
358
|
+
'<div>Get <span alt="Laptop"><div>Test</div> <div></div></span></div>'
|
|
359
|
+
"""
|
|
180
360
|
...
|
|
181
361
|
def insert_before(self, value: str | bytes | None) -> None:
|
|
182
|
-
"""Insert a node before the current Node.
|
|
362
|
+
"""Insert a node before the current Node.
|
|
363
|
+
|
|
364
|
+
Parameters
|
|
365
|
+
----------
|
|
366
|
+
value : str, bytes or Node
|
|
367
|
+
The text or Node instance to insert before the Node.
|
|
368
|
+
When a text string is passed, it's treated as text. All HTML tags will be escaped.
|
|
369
|
+
Convert and pass the ``Node`` object when you want to work with HTML.
|
|
370
|
+
Does not clone the ``Node`` object.
|
|
371
|
+
All future changes to the passed ``Node`` object will also be taken into account.
|
|
372
|
+
|
|
373
|
+
Examples
|
|
374
|
+
--------
|
|
375
|
+
|
|
376
|
+
>>> tree = HTMLParser('<div>Get <img src="" alt="Laptop"></div>')
|
|
377
|
+
>>> img = tree.css_first('img')
|
|
378
|
+
>>> img.insert_before(img.attributes.get('alt', ''))
|
|
379
|
+
>>> tree.body.child.html
|
|
380
|
+
'<div>Get Laptop<img src="" alt="Laptop"></div>'
|
|
381
|
+
|
|
382
|
+
>>> html_parser = HTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
|
|
383
|
+
>>> html_parser2 = HTMLParser('<div>Test</div>')
|
|
384
|
+
>>> img_node = html_parser.css_first('img')
|
|
385
|
+
>>> img_node.insert_before(html_parser2.body.child)
|
|
386
|
+
'<div>Get <span alt="Laptop"><div>Test</div><img src="/jpg"> <div></div></span></div>'
|
|
387
|
+
"""
|
|
183
388
|
...
|
|
184
389
|
def insert_after(self, value: str | bytes | None) -> None:
|
|
185
|
-
"""Insert a node after the current Node.
|
|
390
|
+
"""Insert a node after the current Node.
|
|
391
|
+
|
|
392
|
+
Parameters
|
|
393
|
+
----------
|
|
394
|
+
value : str, bytes or Node
|
|
395
|
+
The text or Node instance to insert after the Node.
|
|
396
|
+
When a text string is passed, it's treated as text. All HTML tags will be escaped.
|
|
397
|
+
Convert and pass the ``Node`` object when you want to work with HTML.
|
|
398
|
+
Does not clone the ``Node`` object.
|
|
399
|
+
All future changes to the passed ``Node`` object will also be taken into account.
|
|
400
|
+
|
|
401
|
+
Examples
|
|
402
|
+
--------
|
|
403
|
+
|
|
404
|
+
>>> tree = HTMLParser('<div>Get <img src="" alt="Laptop"></div>')
|
|
405
|
+
>>> img = tree.css_first('img')
|
|
406
|
+
>>> img.insert_after(img.attributes.get('alt', ''))
|
|
407
|
+
>>> tree.body.child.html
|
|
408
|
+
'<div>Get <img src="" alt="Laptop">Laptop</div>'
|
|
409
|
+
|
|
410
|
+
>>> html_parser = HTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
|
|
411
|
+
>>> html_parser2 = HTMLParser('<div>Test</div>')
|
|
412
|
+
>>> img_node = html_parser.css_first('img')
|
|
413
|
+
>>> img_node.insert_after(html_parser2.body.child)
|
|
414
|
+
'<div>Get <span alt="Laptop"><img src="/jpg"><div>Test</div> <div></div></span></div>'
|
|
415
|
+
"""
|
|
186
416
|
...
|
|
187
417
|
def insert_child(self, value: str | bytes | None) -> None:
|
|
188
|
-
"""Insert a node inside (at the end of) the current Node
|
|
418
|
+
"""Insert a node inside (at the end of) the current Node.
|
|
419
|
+
|
|
420
|
+
Parameters
|
|
421
|
+
----------
|
|
422
|
+
value : str, bytes or Node
|
|
423
|
+
The text or Node instance to insert inside the Node.
|
|
424
|
+
When a text string is passed, it's treated as text. All HTML tags will be escaped.
|
|
425
|
+
Convert and pass the ``Node`` object when you want to work with HTML.
|
|
426
|
+
Does not clone the ``Node`` object.
|
|
427
|
+
All future changes to the passed ``Node`` object will also be taken into account.
|
|
428
|
+
|
|
429
|
+
Examples
|
|
430
|
+
--------
|
|
431
|
+
|
|
432
|
+
>>> tree = HTMLParser('<div>Get <img src=""></div>')
|
|
433
|
+
>>> div = tree.css_first('div')
|
|
434
|
+
>>> div.insert_child('Laptop')
|
|
435
|
+
>>> tree.body.child.html
|
|
436
|
+
'<div>Get <img src="">Laptop</div>'
|
|
437
|
+
|
|
438
|
+
>>> html_parser = HTMLParser('<div>Get <span alt="Laptop"> <div>Laptop</div> </span></div>')
|
|
439
|
+
>>> html_parser2 = HTMLParser('<div>Test</div>')
|
|
440
|
+
>>> span_node = html_parser.css_first('span')
|
|
441
|
+
>>> span_node.insert_child(html_parser2.body.child)
|
|
442
|
+
'<div>Get <span alt="Laptop"> <div>Laptop</div> <div>Test</div> </span></div>'
|
|
443
|
+
"""
|
|
189
444
|
...
|
|
190
445
|
@property
|
|
191
446
|
def raw_value(self) -> bytes:
|
|
192
447
|
"""Return the raw (unparsed, original) value of a node.
|
|
193
448
|
|
|
194
|
-
Currently, works on text nodes only.
|
|
449
|
+
Currently, works on text nodes only.
|
|
450
|
+
|
|
451
|
+
Returns
|
|
452
|
+
-------
|
|
453
|
+
|
|
454
|
+
raw_value : bytes
|
|
455
|
+
|
|
456
|
+
Examples
|
|
457
|
+
--------
|
|
458
|
+
|
|
459
|
+
>>> html_parser = HTMLParser('<div><test></div>')
|
|
460
|
+
>>> selector = html_parser.css_first('div')
|
|
461
|
+
>>> selector.child.html
|
|
462
|
+
'<test>'
|
|
463
|
+
>>> selector.child.raw_value
|
|
464
|
+
b'<test>'
|
|
465
|
+
"""
|
|
195
466
|
...
|
|
196
467
|
def select(self, query: str | None = None) -> Selector:
|
|
197
468
|
"""Select nodes given a CSS selector.
|
|
198
469
|
|
|
199
470
|
Works similarly to the css method, but supports chained filtering and extra features.
|
|
471
|
+
|
|
472
|
+
Parameters
|
|
473
|
+
----------
|
|
474
|
+
query : str or None
|
|
475
|
+
The CSS selector to use when searching for nodes.
|
|
476
|
+
|
|
477
|
+
Returns
|
|
478
|
+
-------
|
|
479
|
+
selector : The `Selector` class.
|
|
200
480
|
"""
|
|
201
481
|
...
|
|
202
482
|
def scripts_contain(self, query: str) -> bool:
|
|
203
483
|
"""Returns True if any of the script tags contain specified text.
|
|
204
484
|
|
|
205
|
-
Caches script tags on the first call to improve performance.
|
|
485
|
+
Caches script tags on the first call to improve performance.
|
|
486
|
+
|
|
487
|
+
Parameters
|
|
488
|
+
----------
|
|
489
|
+
query : str
|
|
490
|
+
The query to check.
|
|
491
|
+
"""
|
|
206
492
|
...
|
|
207
493
|
def script_srcs_contain(self, queries: tuple[str]) -> bool:
|
|
208
494
|
"""Returns True if any of the script SRC attributes contain one of the specified texts.
|
|
209
495
|
|
|
210
|
-
Caches values on the first call to improve performance.
|
|
496
|
+
Caches values on the first call to improve performance.
|
|
497
|
+
|
|
498
|
+
Parameters
|
|
499
|
+
----------
|
|
500
|
+
queries : tuple of str
|
|
501
|
+
"""
|
|
211
502
|
...
|
|
212
503
|
@property
|
|
213
504
|
def text_content(self) -> str | None:
|
|
214
505
|
"""Returns the text of the node if it is a text node.
|
|
215
506
|
|
|
216
|
-
Returns None for other nodes.
|
|
507
|
+
Returns None for other nodes.
|
|
508
|
+
Unlike the ``text`` method, does not include child nodes.
|
|
509
|
+
|
|
510
|
+
Returns
|
|
511
|
+
-------
|
|
512
|
+
text : str or None.
|
|
217
513
|
"""
|
|
218
514
|
...
|
|
219
515
|
def merge_text_nodes(self):
|
|
220
516
|
"""Iterates over all text nodes and merges all text nodes that are close to each other.
|
|
221
517
|
|
|
222
|
-
This is useful for text extraction.
|
|
518
|
+
This is useful for text extraction.
|
|
519
|
+
Use it when you need to strip HTML tags and merge "dangling" text.
|
|
520
|
+
|
|
521
|
+
Examples
|
|
522
|
+
--------
|
|
523
|
+
|
|
524
|
+
>>> tree = HTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
|
|
525
|
+
>>> node = tree.css_first('div')
|
|
526
|
+
>>> tree.unwrap_tags(["strong"])
|
|
527
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
528
|
+
"J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
|
|
529
|
+
>>> node.merge_text_nodes()
|
|
530
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
531
|
+
"John Doe"
|
|
532
|
+
"""
|
|
223
533
|
...
|
|
224
534
|
|
|
225
535
|
class HTMLParser:
|
|
536
|
+
"""The HTML parser.
|
|
537
|
+
|
|
538
|
+
Use this class to parse raw HTML.
|
|
539
|
+
|
|
540
|
+
Parameters
|
|
541
|
+
----------
|
|
542
|
+
|
|
543
|
+
html : str (unicode) or bytes
|
|
544
|
+
detect_encoding : bool, default True
|
|
545
|
+
If `True` and html type is `bytes` then encoding will be detected automatically.
|
|
546
|
+
use_meta_tags : bool, default True
|
|
547
|
+
Whether to use meta tags in encoding detection process.
|
|
548
|
+
decode_errors : str, default 'ignore'
|
|
549
|
+
Same as in builtin's str.decode, i.e. 'strict', 'ignore' or 'replace'.
|
|
550
|
+
"""
|
|
551
|
+
|
|
226
552
|
def __init__(
|
|
227
553
|
self,
|
|
228
554
|
html: bytes | str,
|
|
@@ -233,27 +559,50 @@ class HTMLParser:
|
|
|
233
559
|
def css(self, query: str) -> list[Node]:
|
|
234
560
|
"""A CSS selector.
|
|
235
561
|
|
|
236
|
-
Matches pattern query against HTML tree.
|
|
562
|
+
Matches pattern `query` against HTML tree.
|
|
563
|
+
`CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
|
|
564
|
+
|
|
565
|
+
Parameters
|
|
566
|
+
----------
|
|
567
|
+
query : str
|
|
568
|
+
CSS selector (e.g. "div > :nth-child(2n+1):not(:has(a))").
|
|
569
|
+
|
|
570
|
+
Returns
|
|
571
|
+
-------
|
|
572
|
+
selector : list of `Node` objects
|
|
573
|
+
"""
|
|
237
574
|
...
|
|
238
575
|
@overload
|
|
239
|
-
def css_first(
|
|
240
|
-
self, query: str, default: Any = ..., strict: Literal[True] = ...
|
|
241
|
-
) -> Node: ...
|
|
242
|
-
@overload
|
|
243
576
|
def css_first(
|
|
244
577
|
self, query: str, default: DefaultT, strict: bool = False
|
|
245
578
|
) -> Node | DefaultT: ...
|
|
246
579
|
@overload
|
|
247
580
|
def css_first(
|
|
248
|
-
self, query: str, default: None =
|
|
249
|
-
) -> Node | None:
|
|
250
|
-
"""Same as css but returns only the first match.
|
|
581
|
+
self, query: str, default: None = None, strict: bool = False
|
|
582
|
+
) -> Node | None | DefaultT:
|
|
583
|
+
"""Same as `css` but returns only the first match.
|
|
584
|
+
|
|
585
|
+
Parameters
|
|
586
|
+
----------
|
|
587
|
+
|
|
588
|
+
query : str
|
|
589
|
+
default : Any, default None
|
|
590
|
+
Default value to return if there is no match.
|
|
591
|
+
strict : bool, default False
|
|
592
|
+
Set to True if you want to check if there is strictly only one match in the document.
|
|
593
|
+
|
|
594
|
+
|
|
595
|
+
Returns
|
|
596
|
+
-------
|
|
597
|
+
selector : `Node` object
|
|
598
|
+
"""
|
|
251
599
|
...
|
|
252
600
|
@property
|
|
253
601
|
def input_encoding(self) -> str:
|
|
254
602
|
"""Return encoding of the HTML document.
|
|
255
603
|
|
|
256
|
-
Returns unknown in case the encoding is not determined.
|
|
604
|
+
Returns `unknown` in case the encoding is not determined.
|
|
605
|
+
"""
|
|
257
606
|
...
|
|
258
607
|
@property
|
|
259
608
|
def root(self) -> Node | None:
|
|
@@ -268,16 +617,70 @@ class HTMLParser:
|
|
|
268
617
|
"""Returns document body."""
|
|
269
618
|
...
|
|
270
619
|
def tags(self, name: str) -> list[Node]:
|
|
271
|
-
"""Returns a list of tags that match specified name.
|
|
620
|
+
"""Returns a list of tags that match specified name.
|
|
621
|
+
|
|
622
|
+
Parameters
|
|
623
|
+
----------
|
|
624
|
+
name : str (e.g. div)
|
|
625
|
+
"""
|
|
272
626
|
...
|
|
273
627
|
def text(self, deep: bool = True, separator: str = "", strip: bool = False) -> str:
|
|
274
|
-
"""Returns the text of the node including text of all its child nodes.
|
|
628
|
+
"""Returns the text of the node including text of all its child nodes.
|
|
629
|
+
|
|
630
|
+
Parameters
|
|
631
|
+
----------
|
|
632
|
+
strip : bool, default False
|
|
633
|
+
If true, calls ``str.strip()`` on each text part to remove extra white spaces.
|
|
634
|
+
separator : str, default ''
|
|
635
|
+
The separator to use when joining text from different nodes.
|
|
636
|
+
deep : bool, default True
|
|
637
|
+
If True, includes text from all child nodes.
|
|
638
|
+
|
|
639
|
+
Returns
|
|
640
|
+
-------
|
|
641
|
+
text : str
|
|
642
|
+
"""
|
|
275
643
|
...
|
|
276
|
-
def strip_tags(self, tags: list[str], recursive: bool = False) -> None:
|
|
277
|
-
|
|
644
|
+
def strip_tags(self, tags: list[str], recursive: bool = False) -> None:
|
|
645
|
+
"""Remove specified tags from the node.
|
|
646
|
+
|
|
647
|
+
Parameters
|
|
648
|
+
----------
|
|
649
|
+
tags : list of str
|
|
650
|
+
List of tags to remove.
|
|
651
|
+
recursive : bool, default False
|
|
652
|
+
Whether to delete all its child nodes.
|
|
653
|
+
|
|
654
|
+
Examples
|
|
655
|
+
--------
|
|
656
|
+
|
|
657
|
+
>>> tree = HTMLParser('<html><head></head><body><script></script><div>Hello world!</div></body></html>')
|
|
658
|
+
>>> tags = ['head', 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes']
|
|
659
|
+
>>> tree.strip_tags(tags)
|
|
660
|
+
>>> tree.html
|
|
661
|
+
'<html><body><div>Hello world!</div></body></html>'
|
|
662
|
+
"""
|
|
663
|
+
...
|
|
664
|
+
def unwrap_tags(self, tags: list[str], delete_empty: bool = False) -> None:
|
|
278
665
|
"""Unwraps specified tags from the HTML tree.
|
|
279
666
|
|
|
280
|
-
Works the same as th unwrap method, but applied to a list of tags.
|
|
667
|
+
Works the same as the unwrap method, but applied to a list of tags.
|
|
668
|
+
|
|
669
|
+
Parameters
|
|
670
|
+
----------
|
|
671
|
+
tags : list
|
|
672
|
+
List of tags to remove.
|
|
673
|
+
delete_empty : bool, default False
|
|
674
|
+
If True, removes empty tags.
|
|
675
|
+
|
|
676
|
+
Examples
|
|
677
|
+
--------
|
|
678
|
+
|
|
679
|
+
>>> tree = HTMLParser('<div><a href="">Hello</a> <i>world</i>!</div>')
|
|
680
|
+
>>> tree.body.unwrap_tags(['i','a'])
|
|
681
|
+
>>> tree.body.html
|
|
682
|
+
'<body><div>Hello world!</div></body>'
|
|
683
|
+
"""
|
|
281
684
|
...
|
|
282
685
|
@property
|
|
283
686
|
def html(self) -> str | None:
|
|
@@ -286,7 +689,16 @@ class HTMLParser:
|
|
|
286
689
|
def select(self, query: str | None = None) -> Selector | None:
|
|
287
690
|
"""Select nodes given a CSS selector.
|
|
288
691
|
|
|
289
|
-
Works similarly to
|
|
692
|
+
Works similarly to the ``css`` method, but supports chained filtering and extra features.
|
|
693
|
+
|
|
694
|
+
Parameters
|
|
695
|
+
----------
|
|
696
|
+
query : str or None
|
|
697
|
+
The CSS selector to use when searching for nodes.
|
|
698
|
+
|
|
699
|
+
Returns
|
|
700
|
+
-------
|
|
701
|
+
selector : The `Selector` class.
|
|
290
702
|
"""
|
|
291
703
|
...
|
|
292
704
|
def any_css_matches(self, selectors: tuple[str]) -> bool:
|
|
@@ -295,12 +707,23 @@ class HTMLParser:
|
|
|
295
707
|
def scripts_contain(self, query: str) -> bool:
|
|
296
708
|
"""Returns True if any of the script tags contain specified text.
|
|
297
709
|
|
|
298
|
-
Caches script tags on the first call to improve performance.
|
|
710
|
+
Caches script tags on the first call to improve performance.
|
|
711
|
+
|
|
712
|
+
Parameters
|
|
713
|
+
----------
|
|
714
|
+
query : str
|
|
715
|
+
The query to check.
|
|
716
|
+
"""
|
|
299
717
|
...
|
|
300
718
|
def scripts_srcs_contain(self, queries: tuple[str]) -> bool:
|
|
301
719
|
"""Returns True if any of the script SRC attributes contain one of the specified texts.
|
|
302
720
|
|
|
303
|
-
Caches values on the first call to improve performance.
|
|
721
|
+
Caches values on the first call to improve performance.
|
|
722
|
+
|
|
723
|
+
Parameters
|
|
724
|
+
----------
|
|
725
|
+
queries : tuple of str
|
|
726
|
+
"""
|
|
304
727
|
...
|
|
305
728
|
def css_matches(self, selector: str) -> bool: ...
|
|
306
729
|
def clone(self) -> HTMLParser:
|
|
@@ -309,7 +732,21 @@ class HTMLParser:
|
|
|
309
732
|
def merge_text_nodes(self):
|
|
310
733
|
"""Iterates over all text nodes and merges all text nodes that are close to each other.
|
|
311
734
|
|
|
312
|
-
This is useful for text extraction.
|
|
735
|
+
This is useful for text extraction.
|
|
736
|
+
Use it when you need to strip HTML tags and merge "dangling" text.
|
|
737
|
+
|
|
738
|
+
Examples
|
|
739
|
+
--------
|
|
740
|
+
|
|
741
|
+
>>> tree = HTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
|
|
742
|
+
>>> node = tree.css_first('div')
|
|
743
|
+
>>> tree.unwrap_tags(["strong"])
|
|
744
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
745
|
+
"J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
|
|
746
|
+
>>> node.merge_text_nodes()
|
|
747
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
748
|
+
"John Doe"
|
|
749
|
+
"""
|
|
313
750
|
...
|
|
314
751
|
|
|
315
752
|
def create_tag(tag: str) -> Node:
|