selectolax 0.3.29__cp310-cp310-win32.whl → 0.3.34__cp310-cp310-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +3 -5
- selectolax/lexbor/attrs.pxi +26 -9
- selectolax/lexbor/node.pxi +108 -47
- selectolax/lexbor/selection.pxi +34 -25
- selectolax/lexbor/util.pxi +1 -0
- selectolax/lexbor.c +52987 -55311
- selectolax/lexbor.cp310-win32.pyd +0 -0
- selectolax/lexbor.pxd +36 -40
- selectolax/lexbor.pyi +770 -65
- selectolax/lexbor.pyx +54 -17
- selectolax/modest/node.pxi +45 -42
- selectolax/modest/selection.pxi +24 -22
- selectolax/modest/util.pxi +1 -0
- selectolax/parser.c +50190 -52325
- selectolax/parser.cp310-win32.pyd +0 -0
- selectolax/parser.pxd +17 -20
- selectolax/parser.pyi +489 -45
- selectolax/parser.pyx +39 -31
- selectolax/utils.pxi +13 -3
- selectolax-0.3.34.dist-info/METADATA +32 -0
- selectolax-0.3.34.dist-info/RECORD +26 -0
- {selectolax-0.3.29.dist-info → selectolax-0.3.34.dist-info}/WHEEL +1 -1
- selectolax-0.3.29.dist-info/METADATA +0 -194
- selectolax-0.3.29.dist-info/RECORD +0 -26
- {selectolax-0.3.29.dist-info → selectolax-0.3.34.dist-info}/licenses/LICENSE +0 -0
- {selectolax-0.3.29.dist-info → selectolax-0.3.34.dist-info}/top_level.txt +0 -0
selectolax/parser.pyi
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
|
-
from typing import Iterator,
|
|
1
|
+
from typing import Iterator, Literal, TypeVar, overload
|
|
2
2
|
|
|
3
3
|
DefaultT = TypeVar("DefaultT")
|
|
4
4
|
|
|
5
5
|
class _Attributes:
|
|
6
|
+
"""A dict-like object that represents attributes."""
|
|
7
|
+
|
|
6
8
|
@staticmethod
|
|
7
9
|
def create(node: Node, decode_errors: str) -> _Attributes: ...
|
|
8
10
|
def keys(self) -> Iterator[str]: ...
|
|
@@ -22,7 +24,9 @@ class _Attributes:
|
|
|
22
24
|
@overload
|
|
23
25
|
def sget(self, key: str, default: str | DefaultT) -> str | DefaultT: ...
|
|
24
26
|
@overload
|
|
25
|
-
def sget(self, key: str, default: str = "") -> str:
|
|
27
|
+
def sget(self, key: str, default: str = "") -> str:
|
|
28
|
+
"""Same as get, but returns empty strings instead of None values for empty attributes."""
|
|
29
|
+
...
|
|
26
30
|
|
|
27
31
|
class Selector:
|
|
28
32
|
"""An advanced CSS selector that supports additional operations.
|
|
@@ -69,47 +73,133 @@ class Selector:
|
|
|
69
73
|
...
|
|
70
74
|
|
|
71
75
|
class Node:
|
|
76
|
+
"""A class that represents HTML node (element)."""
|
|
77
|
+
|
|
72
78
|
parser: HTMLParser
|
|
73
79
|
@property
|
|
74
80
|
def attributes(self) -> dict[str, str | None]:
|
|
75
81
|
"""Get all attributes that belong to the current node.
|
|
76
82
|
|
|
77
|
-
The value of empty attributes is None.
|
|
83
|
+
The value of empty attributes is None.
|
|
84
|
+
|
|
85
|
+
Returns
|
|
86
|
+
-------
|
|
87
|
+
attributes : dictionary of all attributes.
|
|
88
|
+
|
|
89
|
+
Examples
|
|
90
|
+
--------
|
|
91
|
+
|
|
92
|
+
>>> tree = HTMLParser("<div data id='my_id'></div>")
|
|
93
|
+
>>> node = tree.css_first('div')
|
|
94
|
+
>>> node.attributes
|
|
95
|
+
{'data': None, 'id': 'my_id'}
|
|
96
|
+
"""
|
|
78
97
|
...
|
|
79
98
|
@property
|
|
80
99
|
def attrs(self) -> _Attributes:
|
|
81
|
-
"""A dict-like object that is similar to the attributes property, but operates directly on the Node data.
|
|
100
|
+
"""A dict-like object that is similar to the ``attributes`` property, but operates directly on the Node data.
|
|
101
|
+
|
|
102
|
+
.. warning:: Use ``attributes`` instead, if you don't want to modify Node attributes.
|
|
103
|
+
|
|
104
|
+
Returns
|
|
105
|
+
-------
|
|
106
|
+
attributes : Attributes mapping object.
|
|
107
|
+
|
|
108
|
+
Examples
|
|
109
|
+
--------
|
|
110
|
+
|
|
111
|
+
>>> tree = HTMLParser("<div id='a'></div>")
|
|
112
|
+
>>> node = tree.css_first('div')
|
|
113
|
+
>>> node.attrs
|
|
114
|
+
<div attributes, 1 items>
|
|
115
|
+
>>> node.attrs['id']
|
|
116
|
+
'a'
|
|
117
|
+
>>> node.attrs['foo'] = 'bar'
|
|
118
|
+
>>> del node.attrs['id']
|
|
119
|
+
>>> node.attributes
|
|
120
|
+
{'foo': 'bar'}
|
|
121
|
+
>>> node.attrs['id'] = 'new_id'
|
|
122
|
+
>>> node.html
|
|
123
|
+
'<div foo="bar" id="new_id"></div>'
|
|
124
|
+
"""
|
|
82
125
|
...
|
|
83
126
|
@property
|
|
84
127
|
def id(self) -> str | None:
|
|
85
128
|
"""Get the id attribute of the node.
|
|
86
129
|
|
|
87
|
-
Returns None if id does not set.
|
|
130
|
+
Returns None if the id is not set.
|
|
131
|
+
|
|
132
|
+
Returns
|
|
133
|
+
-------
|
|
134
|
+
text : str
|
|
135
|
+
"""
|
|
88
136
|
...
|
|
89
137
|
|
|
90
138
|
def mem_id(self) -> int:
|
|
91
|
-
"""Get the mem_id of the node.
|
|
139
|
+
"""Get the mem_id attribute of the node.
|
|
92
140
|
|
|
93
|
-
Returns
|
|
141
|
+
Returns
|
|
142
|
+
-------
|
|
143
|
+
text : int
|
|
144
|
+
"""
|
|
94
145
|
...
|
|
95
146
|
|
|
96
147
|
def __hash__(self) -> int:
|
|
97
|
-
"""
|
|
148
|
+
"""Get the hash of this node
|
|
98
149
|
:return: int
|
|
99
150
|
"""
|
|
100
151
|
...
|
|
101
152
|
def text(self, deep: bool = True, separator: str = "", strip: bool = False) -> str:
|
|
102
|
-
"""Returns the text of the node including text of all its child nodes.
|
|
153
|
+
"""Returns the text of the node including text of all its child nodes.
|
|
154
|
+
|
|
155
|
+
Parameters
|
|
156
|
+
----------
|
|
157
|
+
strip : bool, default False
|
|
158
|
+
If true, calls ``str.strip()`` on each text part to remove extra white spaces.
|
|
159
|
+
separator : str, default ''
|
|
160
|
+
The separator to use when joining text from different nodes.
|
|
161
|
+
deep : bool, default True
|
|
162
|
+
If True, includes text from all child nodes.
|
|
163
|
+
|
|
164
|
+
Returns
|
|
165
|
+
-------
|
|
166
|
+
text : str
|
|
167
|
+
"""
|
|
103
168
|
...
|
|
104
169
|
def iter(self, include_text: bool = False) -> Iterator[Node]:
|
|
105
|
-
"""Iterate over nodes on the current level.
|
|
170
|
+
"""Iterate over nodes on the current level.
|
|
171
|
+
|
|
172
|
+
Parameters
|
|
173
|
+
----------
|
|
174
|
+
include_text : bool
|
|
175
|
+
If True, includes text nodes as well.
|
|
176
|
+
|
|
177
|
+
Yields
|
|
178
|
+
-------
|
|
179
|
+
node
|
|
180
|
+
"""
|
|
106
181
|
...
|
|
107
182
|
def traverse(self, include_text: bool = False) -> Iterator[Node]:
|
|
108
|
-
"""Iterate over all child and next nodes starting from the current level.
|
|
183
|
+
"""Iterate over all child and next nodes starting from the current level.
|
|
184
|
+
|
|
185
|
+
Parameters
|
|
186
|
+
----------
|
|
187
|
+
include_text : bool
|
|
188
|
+
If True, includes text nodes as well.
|
|
189
|
+
|
|
190
|
+
Yields
|
|
191
|
+
-------
|
|
192
|
+
node
|
|
193
|
+
"""
|
|
109
194
|
...
|
|
110
195
|
@property
|
|
111
196
|
def tag(self) -> str:
|
|
112
|
-
"""Return the name of the current tag (e.g. div, p, img).
|
|
197
|
+
"""Return the name of the current tag (e.g. div, p, img).
|
|
198
|
+
|
|
199
|
+
Returns
|
|
200
|
+
-------
|
|
201
|
+
text : str
|
|
202
|
+
"""
|
|
113
203
|
...
|
|
114
204
|
@property
|
|
115
205
|
def child(self) -> Node | None:
|
|
@@ -133,7 +223,12 @@ class Node:
|
|
|
133
223
|
...
|
|
134
224
|
@property
|
|
135
225
|
def html(self) -> str | None:
|
|
136
|
-
"""Return HTML representation of the current node including all its child nodes.
|
|
226
|
+
"""Return HTML representation of the current node including all its child nodes.
|
|
227
|
+
|
|
228
|
+
Returns
|
|
229
|
+
-------
|
|
230
|
+
text : str
|
|
231
|
+
"""
|
|
137
232
|
...
|
|
138
233
|
def css(self, query: str) -> list[Node]:
|
|
139
234
|
"""Evaluate CSS selector against current node and its child nodes."""
|
|
@@ -146,79 +241,314 @@ class Node:
|
|
|
146
241
|
...
|
|
147
242
|
@overload
|
|
148
243
|
def css_first(
|
|
149
|
-
|
|
150
|
-
) -> Node | DefaultT:
|
|
151
|
-
...
|
|
244
|
+
self, query: str, default: DefaultT, strict: bool = False
|
|
245
|
+
) -> Node | DefaultT: ...
|
|
152
246
|
@overload
|
|
153
247
|
def css_first(
|
|
154
|
-
|
|
248
|
+
self, query: str, default: None = None, strict: bool = False
|
|
155
249
|
) -> Node | None | DefaultT:
|
|
250
|
+
"""Evaluate CSS selector against current node and its child nodes."""
|
|
156
251
|
...
|
|
157
252
|
def decompose(self, recursive: bool = True) -> None:
|
|
158
|
-
"""Remove a Node from the tree.
|
|
253
|
+
"""Remove a Node from the tree.
|
|
254
|
+
|
|
255
|
+
Parameters
|
|
256
|
+
----------
|
|
257
|
+
recursive : bool, default True
|
|
258
|
+
Whether to delete all its child nodes.
|
|
259
|
+
|
|
260
|
+
Examples
|
|
261
|
+
--------
|
|
262
|
+
|
|
263
|
+
>>> tree = HTMLParser(html)
|
|
264
|
+
>>> for tag in tree.css('script'):
|
|
265
|
+
>>> tag.decompose()
|
|
266
|
+
"""
|
|
159
267
|
...
|
|
160
268
|
def remove(self, recursive: bool = True) -> None:
|
|
161
269
|
"""An alias for the decompose method."""
|
|
162
270
|
...
|
|
163
|
-
def unwrap(self) -> None:
|
|
164
|
-
"""Replace node with whatever is inside this node.
|
|
271
|
+
def unwrap(self, delete_empty: bool = False) -> None:
|
|
272
|
+
"""Replace node with whatever is inside this node.
|
|
273
|
+
|
|
274
|
+
Parameters
|
|
275
|
+
----------
|
|
276
|
+
delete_empty : bool, default False
|
|
277
|
+
Whether to delete empty tags.
|
|
278
|
+
|
|
279
|
+
Examples
|
|
280
|
+
--------
|
|
281
|
+
|
|
282
|
+
>>> tree = HTMLParser("<div>Hello <i>world</i>!</div>")
|
|
283
|
+
>>> tree.css_first('i').unwrap()
|
|
284
|
+
>>> tree.html
|
|
285
|
+
'<html><head></head><body><div>Hello world!</div></body></html>'
|
|
286
|
+
|
|
287
|
+
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
|
|
288
|
+
"""
|
|
165
289
|
...
|
|
166
290
|
def strip_tags(self, tags: list[str], recursive: bool = False) -> None:
|
|
167
|
-
"""Remove specified tags from the HTML tree.
|
|
291
|
+
"""Remove specified tags from the HTML tree.
|
|
292
|
+
|
|
293
|
+
Parameters
|
|
294
|
+
----------
|
|
295
|
+
tags : list
|
|
296
|
+
List of tags to remove.
|
|
297
|
+
recursive : bool, default False
|
|
298
|
+
Whether to delete all its child nodes.
|
|
299
|
+
|
|
300
|
+
Examples
|
|
301
|
+
--------
|
|
302
|
+
|
|
303
|
+
>>> tree = HTMLParser('<html><head></head><body><script></script><div>Hello world!</div></body></html>')
|
|
304
|
+
>>> tags = ['head', 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes']
|
|
305
|
+
>>> tree.strip_tags(tags)
|
|
306
|
+
>>> tree.html
|
|
307
|
+
'<html><body><div>Hello world!</div></body></html>'
|
|
308
|
+
"""
|
|
168
309
|
...
|
|
169
310
|
def unwrap_tags(self, tags: list[str], delete_empty: bool = False) -> None:
|
|
170
311
|
"""Unwraps specified tags from the HTML tree.
|
|
171
312
|
|
|
172
|
-
Works the same as the unwrap method, but applied to a list of tags.
|
|
313
|
+
Works the same as the unwrap method, but applied to a list of tags.
|
|
314
|
+
|
|
315
|
+
Parameters
|
|
316
|
+
----------
|
|
317
|
+
tags : list
|
|
318
|
+
List of tags to remove.
|
|
319
|
+
delete_empty : bool, default False
|
|
320
|
+
Whether to delete empty tags.
|
|
321
|
+
|
|
322
|
+
Examples
|
|
323
|
+
--------
|
|
324
|
+
|
|
325
|
+
>>> tree = HTMLParser('<div><a href="">Hello</a> <i>world</i>!</div>')
|
|
326
|
+
>>> tree.body.unwrap_tags(['i','a'])
|
|
327
|
+
>>> tree.body.html
|
|
328
|
+
'<body><div>Hello world!</div></body>'
|
|
329
|
+
|
|
330
|
+
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
|
|
331
|
+
"""
|
|
173
332
|
...
|
|
174
333
|
def replace_with(self, value: str | bytes | None) -> None:
|
|
175
|
-
"""Replace current Node with specified value.
|
|
334
|
+
"""Replace current Node with specified value.
|
|
335
|
+
|
|
336
|
+
Parameters
|
|
337
|
+
----------
|
|
338
|
+
value : str, bytes or Node
|
|
339
|
+
The text or Node instance to replace the Node with.
|
|
340
|
+
When a text string is passed, it's treated as text. All HTML tags will be escaped.
|
|
341
|
+
Convert and pass the ``Node`` object when you want to work with HTML.
|
|
342
|
+
Does not clone the ``Node`` object.
|
|
343
|
+
All future changes to the passed ``Node`` object will also be taken into account.
|
|
344
|
+
|
|
345
|
+
Examples
|
|
346
|
+
--------
|
|
347
|
+
|
|
348
|
+
>>> tree = HTMLParser('<div>Get <img src="" alt="Laptop"></div>')
|
|
349
|
+
>>> img = tree.css_first('img')
|
|
350
|
+
>>> img.replace_with(img.attributes.get('alt', ''))
|
|
351
|
+
>>> tree.body.child.html
|
|
352
|
+
'<div>Get Laptop</div>'
|
|
353
|
+
|
|
354
|
+
>>> html_parser = HTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
|
|
355
|
+
>>> html_parser2 = HTMLParser('<div>Test</div>')
|
|
356
|
+
>>> img_node = html_parser.css_first('img')
|
|
357
|
+
>>> img_node.replace_with(html_parser2.body.child)
|
|
358
|
+
'<div>Get <span alt="Laptop"><div>Test</div> <div></div></span></div>'
|
|
359
|
+
"""
|
|
176
360
|
...
|
|
177
361
|
def insert_before(self, value: str | bytes | None) -> None:
|
|
178
|
-
"""Insert a node before the current Node.
|
|
362
|
+
"""Insert a node before the current Node.
|
|
363
|
+
|
|
364
|
+
Parameters
|
|
365
|
+
----------
|
|
366
|
+
value : str, bytes or Node
|
|
367
|
+
The text or Node instance to insert before the Node.
|
|
368
|
+
When a text string is passed, it's treated as text. All HTML tags will be escaped.
|
|
369
|
+
Convert and pass the ``Node`` object when you want to work with HTML.
|
|
370
|
+
Does not clone the ``Node`` object.
|
|
371
|
+
All future changes to the passed ``Node`` object will also be taken into account.
|
|
372
|
+
|
|
373
|
+
Examples
|
|
374
|
+
--------
|
|
375
|
+
|
|
376
|
+
>>> tree = HTMLParser('<div>Get <img src="" alt="Laptop"></div>')
|
|
377
|
+
>>> img = tree.css_first('img')
|
|
378
|
+
>>> img.insert_before(img.attributes.get('alt', ''))
|
|
379
|
+
>>> tree.body.child.html
|
|
380
|
+
'<div>Get Laptop<img src="" alt="Laptop"></div>'
|
|
381
|
+
|
|
382
|
+
>>> html_parser = HTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
|
|
383
|
+
>>> html_parser2 = HTMLParser('<div>Test</div>')
|
|
384
|
+
>>> img_node = html_parser.css_first('img')
|
|
385
|
+
>>> img_node.insert_before(html_parser2.body.child)
|
|
386
|
+
'<div>Get <span alt="Laptop"><div>Test</div><img src="/jpg"> <div></div></span></div>'
|
|
387
|
+
"""
|
|
179
388
|
...
|
|
180
389
|
def insert_after(self, value: str | bytes | None) -> None:
|
|
181
|
-
"""Insert a node after the current Node.
|
|
390
|
+
"""Insert a node after the current Node.
|
|
391
|
+
|
|
392
|
+
Parameters
|
|
393
|
+
----------
|
|
394
|
+
value : str, bytes or Node
|
|
395
|
+
The text or Node instance to insert after the Node.
|
|
396
|
+
When a text string is passed, it's treated as text. All HTML tags will be escaped.
|
|
397
|
+
Convert and pass the ``Node`` object when you want to work with HTML.
|
|
398
|
+
Does not clone the ``Node`` object.
|
|
399
|
+
All future changes to the passed ``Node`` object will also be taken into account.
|
|
400
|
+
|
|
401
|
+
Examples
|
|
402
|
+
--------
|
|
403
|
+
|
|
404
|
+
>>> tree = HTMLParser('<div>Get <img src="" alt="Laptop"></div>')
|
|
405
|
+
>>> img = tree.css_first('img')
|
|
406
|
+
>>> img.insert_after(img.attributes.get('alt', ''))
|
|
407
|
+
>>> tree.body.child.html
|
|
408
|
+
'<div>Get <img src="" alt="Laptop">Laptop</div>'
|
|
409
|
+
|
|
410
|
+
>>> html_parser = HTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
|
|
411
|
+
>>> html_parser2 = HTMLParser('<div>Test</div>')
|
|
412
|
+
>>> img_node = html_parser.css_first('img')
|
|
413
|
+
>>> img_node.insert_after(html_parser2.body.child)
|
|
414
|
+
'<div>Get <span alt="Laptop"><img src="/jpg"><div>Test</div> <div></div></span></div>'
|
|
415
|
+
"""
|
|
182
416
|
...
|
|
183
417
|
def insert_child(self, value: str | bytes | None) -> None:
|
|
184
|
-
"""Insert a node inside (at the end of) the current Node
|
|
418
|
+
"""Insert a node inside (at the end of) the current Node.
|
|
419
|
+
|
|
420
|
+
Parameters
|
|
421
|
+
----------
|
|
422
|
+
value : str, bytes or Node
|
|
423
|
+
The text or Node instance to insert inside the Node.
|
|
424
|
+
When a text string is passed, it's treated as text. All HTML tags will be escaped.
|
|
425
|
+
Convert and pass the ``Node`` object when you want to work with HTML.
|
|
426
|
+
Does not clone the ``Node`` object.
|
|
427
|
+
All future changes to the passed ``Node`` object will also be taken into account.
|
|
428
|
+
|
|
429
|
+
Examples
|
|
430
|
+
--------
|
|
431
|
+
|
|
432
|
+
>>> tree = HTMLParser('<div>Get <img src=""></div>')
|
|
433
|
+
>>> div = tree.css_first('div')
|
|
434
|
+
>>> div.insert_child('Laptop')
|
|
435
|
+
>>> tree.body.child.html
|
|
436
|
+
'<div>Get <img src="">Laptop</div>'
|
|
437
|
+
|
|
438
|
+
>>> html_parser = HTMLParser('<div>Get <span alt="Laptop"> <div>Laptop</div> </span></div>')
|
|
439
|
+
>>> html_parser2 = HTMLParser('<div>Test</div>')
|
|
440
|
+
>>> span_node = html_parser.css_first('span')
|
|
441
|
+
>>> span_node.insert_child(html_parser2.body.child)
|
|
442
|
+
'<div>Get <span alt="Laptop"> <div>Laptop</div> <div>Test</div> </span></div>'
|
|
443
|
+
"""
|
|
185
444
|
...
|
|
186
445
|
@property
|
|
187
446
|
def raw_value(self) -> bytes:
|
|
188
447
|
"""Return the raw (unparsed, original) value of a node.
|
|
189
448
|
|
|
190
|
-
Currently, works on text nodes only.
|
|
449
|
+
Currently, works on text nodes only.
|
|
450
|
+
|
|
451
|
+
Returns
|
|
452
|
+
-------
|
|
453
|
+
|
|
454
|
+
raw_value : bytes
|
|
455
|
+
|
|
456
|
+
Examples
|
|
457
|
+
--------
|
|
458
|
+
|
|
459
|
+
>>> html_parser = HTMLParser('<div><test></div>')
|
|
460
|
+
>>> selector = html_parser.css_first('div')
|
|
461
|
+
>>> selector.child.html
|
|
462
|
+
'<test>'
|
|
463
|
+
>>> selector.child.raw_value
|
|
464
|
+
b'<test>'
|
|
465
|
+
"""
|
|
191
466
|
...
|
|
192
467
|
def select(self, query: str | None = None) -> Selector:
|
|
193
468
|
"""Select nodes given a CSS selector.
|
|
194
469
|
|
|
195
470
|
Works similarly to the css method, but supports chained filtering and extra features.
|
|
471
|
+
|
|
472
|
+
Parameters
|
|
473
|
+
----------
|
|
474
|
+
query : str or None
|
|
475
|
+
The CSS selector to use when searching for nodes.
|
|
476
|
+
|
|
477
|
+
Returns
|
|
478
|
+
-------
|
|
479
|
+
selector : The `Selector` class.
|
|
196
480
|
"""
|
|
197
481
|
...
|
|
198
482
|
def scripts_contain(self, query: str) -> bool:
|
|
199
483
|
"""Returns True if any of the script tags contain specified text.
|
|
200
484
|
|
|
201
|
-
Caches script tags on the first call to improve performance.
|
|
485
|
+
Caches script tags on the first call to improve performance.
|
|
486
|
+
|
|
487
|
+
Parameters
|
|
488
|
+
----------
|
|
489
|
+
query : str
|
|
490
|
+
The query to check.
|
|
491
|
+
"""
|
|
202
492
|
...
|
|
203
493
|
def script_srcs_contain(self, queries: tuple[str]) -> bool:
|
|
204
494
|
"""Returns True if any of the script SRCs attributes contain one of the specified texts.
|
|
205
495
|
|
|
206
|
-
Caches values on the first call to improve performance.
|
|
496
|
+
Caches values on the first call to improve performance.
|
|
497
|
+
|
|
498
|
+
Parameters
|
|
499
|
+
----------
|
|
500
|
+
queries : tuple of str
|
|
501
|
+
"""
|
|
207
502
|
...
|
|
208
503
|
@property
|
|
209
504
|
def text_content(self) -> str | None:
|
|
210
505
|
"""Returns the text of the node if it is a text node.
|
|
211
506
|
|
|
212
|
-
Returns None for other nodes.
|
|
507
|
+
Returns None for other nodes.
|
|
508
|
+
Unlike the ``text`` method, does not include child nodes.
|
|
509
|
+
|
|
510
|
+
Returns
|
|
511
|
+
-------
|
|
512
|
+
text : str or None.
|
|
213
513
|
"""
|
|
214
514
|
...
|
|
215
515
|
def merge_text_nodes(self):
|
|
216
516
|
"""Iterates over all text nodes and merges all text nodes that are close to each other.
|
|
217
517
|
|
|
218
|
-
This is useful for text extraction.
|
|
518
|
+
This is useful for text extraction.
|
|
519
|
+
Use it when you need to strip HTML tags and merge "dangling" text.
|
|
520
|
+
|
|
521
|
+
Examples
|
|
522
|
+
--------
|
|
523
|
+
|
|
524
|
+
>>> tree = HTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
|
|
525
|
+
>>> node = tree.css_first('div')
|
|
526
|
+
>>> tree.unwrap_tags(["strong"])
|
|
527
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
528
|
+
"J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
|
|
529
|
+
>>> node.merge_text_nodes()
|
|
530
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
531
|
+
"John Doe"
|
|
532
|
+
"""
|
|
219
533
|
...
|
|
220
534
|
|
|
221
535
|
class HTMLParser:
|
|
536
|
+
"""The HTML parser.
|
|
537
|
+
|
|
538
|
+
Use this class to parse raw HTML.
|
|
539
|
+
|
|
540
|
+
Parameters
|
|
541
|
+
----------
|
|
542
|
+
|
|
543
|
+
html : str (unicode) or bytes
|
|
544
|
+
detect_encoding : bool, default True
|
|
545
|
+
If `True` and html type is `bytes` then encoding will be detected automatically.
|
|
546
|
+
use_meta_tags : bool, default True
|
|
547
|
+
Whether to use meta tags in encoding detection process.
|
|
548
|
+
decode_errors : str, default 'ignore'
|
|
549
|
+
Same as in builtin's str.decode, i.e 'strict', 'ignore' or 'replace'.
|
|
550
|
+
"""
|
|
551
|
+
|
|
222
552
|
def __init__(
|
|
223
553
|
self,
|
|
224
554
|
html: bytes | str,
|
|
@@ -229,24 +559,50 @@ class HTMLParser:
|
|
|
229
559
|
def css(self, query: str) -> list[Node]:
|
|
230
560
|
"""A CSS selector.
|
|
231
561
|
|
|
232
|
-
Matches pattern query against HTML tree.
|
|
562
|
+
Matches pattern `query` against HTML tree.
|
|
563
|
+
`CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
|
|
564
|
+
|
|
565
|
+
Parameters
|
|
566
|
+
----------
|
|
567
|
+
query : str
|
|
568
|
+
CSS selector (e.g. "div > :nth-child(2n+1):not(:has(a))").
|
|
569
|
+
|
|
570
|
+
Returns
|
|
571
|
+
-------
|
|
572
|
+
selector : list of `Node` objects
|
|
573
|
+
"""
|
|
233
574
|
...
|
|
234
575
|
@overload
|
|
235
576
|
def css_first(
|
|
236
577
|
self, query: str, default: DefaultT, strict: bool = False
|
|
237
|
-
) -> Node | DefaultT:
|
|
238
|
-
...
|
|
239
|
-
|
|
578
|
+
) -> Node | DefaultT: ...
|
|
240
579
|
@overload
|
|
241
580
|
def css_first(
|
|
242
|
-
|
|
581
|
+
self, query: str, default: None = None, strict: bool = False
|
|
243
582
|
) -> Node | None | DefaultT:
|
|
583
|
+
"""Same as `css` but returns only the first match.
|
|
584
|
+
|
|
585
|
+
Parameters
|
|
586
|
+
----------
|
|
587
|
+
|
|
588
|
+
query : str
|
|
589
|
+
default : Any, default None
|
|
590
|
+
Default value to return if there is no match.
|
|
591
|
+
strict : bool, default False
|
|
592
|
+
Set to True if you want to check if there is strictly only one match in the document.
|
|
593
|
+
|
|
594
|
+
|
|
595
|
+
Returns
|
|
596
|
+
-------
|
|
597
|
+
selector : `Node` object
|
|
598
|
+
"""
|
|
244
599
|
...
|
|
245
600
|
@property
|
|
246
601
|
def input_encoding(self) -> str:
|
|
247
602
|
"""Return encoding of the HTML document.
|
|
248
603
|
|
|
249
|
-
Returns unknown in case the encoding is not determined.
|
|
604
|
+
Returns `unknown` in case the encoding is not determined.
|
|
605
|
+
"""
|
|
250
606
|
...
|
|
251
607
|
@property
|
|
252
608
|
def root(self) -> Node | None:
|
|
@@ -261,16 +617,70 @@ class HTMLParser:
|
|
|
261
617
|
"""Returns document body."""
|
|
262
618
|
...
|
|
263
619
|
def tags(self, name: str) -> list[Node]:
|
|
264
|
-
"""Returns a list of tags that match specified name.
|
|
620
|
+
"""Returns a list of tags that match specified name.
|
|
621
|
+
|
|
622
|
+
Parameters
|
|
623
|
+
----------
|
|
624
|
+
name : str (e.g. div)
|
|
625
|
+
"""
|
|
265
626
|
...
|
|
266
627
|
def text(self, deep: bool = True, separator: str = "", strip: bool = False) -> str:
|
|
267
|
-
"""Returns the text of the node including text of all its child nodes.
|
|
628
|
+
"""Returns the text of the node including text of all its child nodes.
|
|
629
|
+
|
|
630
|
+
Parameters
|
|
631
|
+
----------
|
|
632
|
+
strip : bool, default False
|
|
633
|
+
If true, calls ``str.strip()`` on each text part to remove extra white spaces.
|
|
634
|
+
separator : str, default ''
|
|
635
|
+
The separator to use when joining text from different nodes.
|
|
636
|
+
deep : bool, default True
|
|
637
|
+
If True, includes text from all child nodes.
|
|
638
|
+
|
|
639
|
+
Returns
|
|
640
|
+
-------
|
|
641
|
+
text : str
|
|
642
|
+
"""
|
|
643
|
+
...
|
|
644
|
+
def strip_tags(self, tags: list[str], recursive: bool = False) -> None:
|
|
645
|
+
"""Remove specified tags from the node.
|
|
646
|
+
|
|
647
|
+
Parameters
|
|
648
|
+
----------
|
|
649
|
+
tags : list of str
|
|
650
|
+
List of tags to remove.
|
|
651
|
+
recursive : bool, default False
|
|
652
|
+
Whether to delete all its child nodes.
|
|
653
|
+
|
|
654
|
+
Examples
|
|
655
|
+
--------
|
|
656
|
+
|
|
657
|
+
>>> tree = HTMLParser('<html><head></head><body><script></script><div>Hello world!</div></body></html>')
|
|
658
|
+
>>> tags = ['head', 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes']
|
|
659
|
+
>>> tree.strip_tags(tags)
|
|
660
|
+
>>> tree.html
|
|
661
|
+
'<html><body><div>Hello world!</div></body></html>'
|
|
662
|
+
"""
|
|
268
663
|
...
|
|
269
|
-
def strip_tags(self, tags: list[str], recursive: bool = False) -> None: ...
|
|
270
664
|
def unwrap_tags(self, tags: list[str], delete_empty: bool = False) -> None:
|
|
271
665
|
"""Unwraps specified tags from the HTML tree.
|
|
272
666
|
|
|
273
|
-
Works the same as th unwrap method, but applied to a list of tags.
|
|
667
|
+
Works the same as the unwrap method, but applied to a list of tags.
|
|
668
|
+
|
|
669
|
+
Parameters
|
|
670
|
+
----------
|
|
671
|
+
tags : list
|
|
672
|
+
List of tags to remove.
|
|
673
|
+
delete_empty : bool, default False
|
|
674
|
+
If True, removes empty tags.
|
|
675
|
+
|
|
676
|
+
Examples
|
|
677
|
+
--------
|
|
678
|
+
|
|
679
|
+
>>> tree = HTMLParser('<div><a href="">Hello</a> <i>world</i>!</div>')
|
|
680
|
+
>>> tree.body.unwrap_tags(['i','a'])
|
|
681
|
+
>>> tree.body.html
|
|
682
|
+
'<body><div>Hello world!</div></body>'
|
|
683
|
+
"""
|
|
274
684
|
...
|
|
275
685
|
@property
|
|
276
686
|
def html(self) -> str | None:
|
|
@@ -279,7 +689,16 @@ class HTMLParser:
|
|
|
279
689
|
def select(self, query: str | None = None) -> Selector | None:
|
|
280
690
|
"""Select nodes given a CSS selector.
|
|
281
691
|
|
|
282
|
-
Works similarly to
|
|
692
|
+
Works similarly to the ``css`` method, but supports chained filtering and extra features.
|
|
693
|
+
|
|
694
|
+
Parameters
|
|
695
|
+
----------
|
|
696
|
+
query : str or None
|
|
697
|
+
The CSS selector to use when searching for nodes.
|
|
698
|
+
|
|
699
|
+
Returns
|
|
700
|
+
-------
|
|
701
|
+
selector : The `Selector` class.
|
|
283
702
|
"""
|
|
284
703
|
...
|
|
285
704
|
def any_css_matches(self, selectors: tuple[str]) -> bool:
|
|
@@ -288,12 +707,23 @@ class HTMLParser:
|
|
|
288
707
|
def scripts_contain(self, query: str) -> bool:
|
|
289
708
|
"""Returns True if any of the script tags contain specified text.
|
|
290
709
|
|
|
291
|
-
Caches script tags on the first call to improve performance.
|
|
710
|
+
Caches script tags on the first call to improve performance.
|
|
711
|
+
|
|
712
|
+
Parameters
|
|
713
|
+
----------
|
|
714
|
+
query : str
|
|
715
|
+
The query to check.
|
|
716
|
+
"""
|
|
292
717
|
...
|
|
293
718
|
def scripts_srcs_contain(self, queries: tuple[str]) -> bool:
|
|
294
719
|
"""Returns True if any of the script SRCs attributes contain one of the specified texts.
|
|
295
720
|
|
|
296
|
-
Caches values on the first call to improve performance.
|
|
721
|
+
Caches values on the first call to improve performance.
|
|
722
|
+
|
|
723
|
+
Parameters
|
|
724
|
+
----------
|
|
725
|
+
queries : tuple of str
|
|
726
|
+
"""
|
|
297
727
|
...
|
|
298
728
|
def css_matches(self, selector: str) -> bool: ...
|
|
299
729
|
def clone(self) -> HTMLParser:
|
|
@@ -302,7 +732,21 @@ class HTMLParser:
|
|
|
302
732
|
def merge_text_nodes(self):
|
|
303
733
|
"""Iterates over all text nodes and merges all text nodes that are close to each other.
|
|
304
734
|
|
|
305
|
-
This is useful for text extraction.
|
|
735
|
+
This is useful for text extraction.
|
|
736
|
+
Use it when you need to strip HTML tags and merge "dangling" text.
|
|
737
|
+
|
|
738
|
+
Examples
|
|
739
|
+
--------
|
|
740
|
+
|
|
741
|
+
>>> tree = HTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
|
|
742
|
+
>>> node = tree.css_first('div')
|
|
743
|
+
>>> tree.unwrap_tags(["strong"])
|
|
744
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
745
|
+
"J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
|
|
746
|
+
>>> node.merge_text_nodes()
|
|
747
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
748
|
+
"John Doe"
|
|
749
|
+
"""
|
|
306
750
|
...
|
|
307
751
|
|
|
308
752
|
def create_tag(tag: str) -> Node:
|