selectolax 0.3.28__cp39-cp39-musllinux_1_2_aarch64.whl → 0.4.0__cp39-cp39-musllinux_1_2_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of selectolax might be problematic. Click here for more details.
- selectolax/__init__.py +3 -5
- selectolax/lexbor/attrs.pxi +26 -9
- selectolax/lexbor/node.pxi +225 -58
- selectolax/lexbor/node_remove.pxi +29 -0
- selectolax/lexbor/selection.pxi +57 -26
- selectolax/lexbor/util.pxi +1 -0
- selectolax/lexbor.c +21988 -22274
- selectolax/lexbor.cpython-39-aarch64-linux-gnu.so +0 -0
- selectolax/lexbor.pxd +44 -40
- selectolax/lexbor.pyi +847 -65
- selectolax/lexbor.pyx +98 -23
- selectolax/modest/node.pxi +68 -46
- selectolax/modest/selection.pxi +24 -22
- selectolax/modest/util.pxi +1 -0
- selectolax/parser.c +18150 -20047
- selectolax/parser.cpython-39-aarch64-linux-gnu.so +0 -0
- selectolax/parser.pxd +17 -20
- selectolax/parser.pyi +493 -53
- selectolax/parser.pyx +45 -35
- selectolax/utils.pxi +13 -3
- selectolax-0.4.0.dist-info/METADATA +32 -0
- selectolax-0.4.0.dist-info/RECORD +27 -0
- {selectolax-0.3.28.dist-info → selectolax-0.4.0.dist-info}/WHEEL +1 -1
- selectolax-0.3.28.dist-info/METADATA +0 -183
- selectolax-0.3.28.dist-info/RECORD +0 -26
- {selectolax-0.3.28.dist-info → selectolax-0.4.0.dist-info/licenses}/LICENSE +0 -0
- {selectolax-0.3.28.dist-info → selectolax-0.4.0.dist-info}/top_level.txt +0 -0
selectolax/parser.pyi
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
|
-
from typing import
|
|
1
|
+
from typing import Iterator, Literal, TypeVar, overload
|
|
2
2
|
|
|
3
3
|
DefaultT = TypeVar("DefaultT")
|
|
4
4
|
|
|
5
5
|
class _Attributes:
|
|
6
|
+
"""A dict-like object that represents attributes."""
|
|
7
|
+
|
|
6
8
|
@staticmethod
|
|
7
9
|
def create(node: Node, decode_errors: str) -> _Attributes: ...
|
|
8
10
|
def keys(self) -> Iterator[str]: ...
|
|
@@ -22,7 +24,9 @@ class _Attributes:
|
|
|
22
24
|
@overload
|
|
23
25
|
def sget(self, key: str, default: str | DefaultT) -> str | DefaultT: ...
|
|
24
26
|
@overload
|
|
25
|
-
def sget(self, key: str, default: str = "") -> str:
|
|
27
|
+
def sget(self, key: str, default: str = "") -> str:
|
|
28
|
+
"""Same as get, but returns empty strings instead of None values for empty attributes."""
|
|
29
|
+
...
|
|
26
30
|
|
|
27
31
|
class Selector:
|
|
28
32
|
"""An advanced CSS selector that supports additional operations.
|
|
@@ -69,51 +73,140 @@ class Selector:
|
|
|
69
73
|
...
|
|
70
74
|
|
|
71
75
|
class Node:
|
|
76
|
+
"""A class that represents HTML node (element)."""
|
|
77
|
+
|
|
72
78
|
parser: HTMLParser
|
|
73
79
|
@property
|
|
74
80
|
def attributes(self) -> dict[str, str | None]:
|
|
75
81
|
"""Get all attributes that belong to the current node.
|
|
76
82
|
|
|
77
|
-
The value of empty attributes is None.
|
|
83
|
+
The value of empty attributes is None.
|
|
84
|
+
|
|
85
|
+
Returns
|
|
86
|
+
-------
|
|
87
|
+
attributes : dictionary of all attributes.
|
|
88
|
+
|
|
89
|
+
Examples
|
|
90
|
+
--------
|
|
91
|
+
|
|
92
|
+
>>> tree = HTMLParser("<div data id='my_id'></div>")
|
|
93
|
+
>>> node = tree.css_first('div')
|
|
94
|
+
>>> node.attributes
|
|
95
|
+
{'data': None, 'id': 'my_id'}
|
|
96
|
+
"""
|
|
78
97
|
...
|
|
79
98
|
@property
|
|
80
99
|
def attrs(self) -> _Attributes:
|
|
81
|
-
"""A dict-like object that is similar to the attributes property, but operates directly on the Node data.
|
|
100
|
+
"""A dict-like object that is similar to the ``attributes`` property, but operates directly on the Node data.
|
|
101
|
+
|
|
102
|
+
.. warning:: Use ``attributes`` instead, if you don't want to modify Node attributes.
|
|
103
|
+
|
|
104
|
+
Returns
|
|
105
|
+
-------
|
|
106
|
+
attributes : Attributes mapping object.
|
|
107
|
+
|
|
108
|
+
Examples
|
|
109
|
+
--------
|
|
110
|
+
|
|
111
|
+
>>> tree = HTMLParser("<div id='a'></div>")
|
|
112
|
+
>>> node = tree.css_first('div')
|
|
113
|
+
>>> node.attrs
|
|
114
|
+
<div attributes, 1 items>
|
|
115
|
+
>>> node.attrs['id']
|
|
116
|
+
'a'
|
|
117
|
+
>>> node.attrs['foo'] = 'bar'
|
|
118
|
+
>>> del node.attrs['id']
|
|
119
|
+
>>> node.attributes
|
|
120
|
+
{'foo': 'bar'}
|
|
121
|
+
>>> node.attrs['id'] = 'new_id'
|
|
122
|
+
>>> node.html
|
|
123
|
+
'<div foo="bar" id="new_id"></div>'
|
|
124
|
+
"""
|
|
82
125
|
...
|
|
83
126
|
@property
|
|
84
127
|
def id(self) -> str | None:
|
|
85
128
|
"""Get the id attribute of the node.
|
|
86
129
|
|
|
87
|
-
Returns None if id does not set.
|
|
130
|
+
Returns None if the id is not set.
|
|
131
|
+
|
|
132
|
+
Returns
|
|
133
|
+
-------
|
|
134
|
+
text : str
|
|
135
|
+
"""
|
|
88
136
|
...
|
|
89
137
|
|
|
90
138
|
def mem_id(self) -> int:
|
|
91
|
-
"""Get the mem_id of the node.
|
|
139
|
+
"""Get the mem_id attribute of the node.
|
|
92
140
|
|
|
93
|
-
Returns
|
|
141
|
+
Returns
|
|
142
|
+
-------
|
|
143
|
+
text : int
|
|
144
|
+
"""
|
|
94
145
|
...
|
|
95
146
|
|
|
96
147
|
def __hash__(self) -> int:
|
|
97
|
-
"""
|
|
148
|
+
"""Get the hash of this node
|
|
98
149
|
:return: int
|
|
99
150
|
"""
|
|
100
151
|
...
|
|
101
152
|
def text(self, deep: bool = True, separator: str = "", strip: bool = False) -> str:
|
|
102
|
-
"""Returns the text of the node including text of all its child nodes.
|
|
153
|
+
"""Returns the text of the node including text of all its child nodes.
|
|
154
|
+
|
|
155
|
+
Parameters
|
|
156
|
+
----------
|
|
157
|
+
strip : bool, default False
|
|
158
|
+
If true, calls ``str.strip()`` on each text part to remove extra white spaces.
|
|
159
|
+
separator : str, default ''
|
|
160
|
+
The separator to use when joining text from different nodes.
|
|
161
|
+
deep : bool, default True
|
|
162
|
+
If True, includes text from all child nodes.
|
|
163
|
+
|
|
164
|
+
Returns
|
|
165
|
+
-------
|
|
166
|
+
text : str
|
|
167
|
+
"""
|
|
103
168
|
...
|
|
104
169
|
def iter(self, include_text: bool = False) -> Iterator[Node]:
|
|
105
|
-
"""Iterate over nodes on the current level.
|
|
170
|
+
"""Iterate over nodes on the current level.
|
|
171
|
+
|
|
172
|
+
Parameters
|
|
173
|
+
----------
|
|
174
|
+
include_text : bool
|
|
175
|
+
If True, includes text nodes as well.
|
|
176
|
+
|
|
177
|
+
Yields
|
|
178
|
+
-------
|
|
179
|
+
node
|
|
180
|
+
"""
|
|
106
181
|
...
|
|
107
182
|
def traverse(self, include_text: bool = False) -> Iterator[Node]:
|
|
108
|
-
"""Iterate over all child and next nodes starting from the current level.
|
|
183
|
+
"""Iterate over all child and next nodes starting from the current level.
|
|
184
|
+
|
|
185
|
+
Parameters
|
|
186
|
+
----------
|
|
187
|
+
include_text : bool
|
|
188
|
+
If True, includes text nodes as well.
|
|
189
|
+
|
|
190
|
+
Yields
|
|
191
|
+
-------
|
|
192
|
+
node
|
|
193
|
+
"""
|
|
109
194
|
...
|
|
110
195
|
@property
|
|
111
196
|
def tag(self) -> str:
|
|
112
|
-
"""Return the name of the current tag (e.g. div, p, img).
|
|
197
|
+
"""Return the name of the current tag (e.g. div, p, img).
|
|
198
|
+
|
|
199
|
+
Returns
|
|
200
|
+
-------
|
|
201
|
+
text : str
|
|
202
|
+
"""
|
|
113
203
|
...
|
|
114
204
|
@property
|
|
115
205
|
def child(self) -> Node | None:
|
|
116
|
-
"""
|
|
206
|
+
"""Alias for the `first_child` property.
|
|
207
|
+
|
|
208
|
+
**Deprecated**. Please use `first_child` instead.
|
|
209
|
+
"""
|
|
117
210
|
...
|
|
118
211
|
@property
|
|
119
212
|
def parent(self) -> Node | None:
|
|
@@ -133,7 +226,12 @@ class Node:
|
|
|
133
226
|
...
|
|
134
227
|
@property
|
|
135
228
|
def html(self) -> str | None:
|
|
136
|
-
"""Return HTML representation of the current node including all its child nodes.
|
|
229
|
+
"""Return HTML representation of the current node including all its child nodes.
|
|
230
|
+
|
|
231
|
+
Returns
|
|
232
|
+
-------
|
|
233
|
+
text : str
|
|
234
|
+
"""
|
|
137
235
|
...
|
|
138
236
|
def css(self, query: str) -> list[Node]:
|
|
139
237
|
"""Evaluate CSS selector against current node and its child nodes."""
|
|
@@ -145,84 +243,315 @@ class Node:
|
|
|
145
243
|
"""Returns True if CSS selector matches a node."""
|
|
146
244
|
...
|
|
147
245
|
@overload
|
|
148
|
-
def css_first(
|
|
149
|
-
self, query: str, default: Any = ..., strict: Literal[True] = ...
|
|
150
|
-
) -> Node: ...
|
|
151
|
-
@overload
|
|
152
246
|
def css_first(
|
|
153
247
|
self, query: str, default: DefaultT, strict: bool = False
|
|
154
248
|
) -> Node | DefaultT: ...
|
|
155
249
|
@overload
|
|
156
250
|
def css_first(
|
|
157
|
-
self, query: str, default: None =
|
|
158
|
-
) -> Node | None:
|
|
251
|
+
self, query: str, default: None = None, strict: bool = False
|
|
252
|
+
) -> Node | None | DefaultT:
|
|
159
253
|
"""Evaluate CSS selector against current node and its child nodes."""
|
|
160
254
|
...
|
|
161
255
|
def decompose(self, recursive: bool = True) -> None:
|
|
162
|
-
"""Remove a Node from the tree.
|
|
256
|
+
"""Remove a Node from the tree.
|
|
257
|
+
|
|
258
|
+
Parameters
|
|
259
|
+
----------
|
|
260
|
+
recursive : bool, default True
|
|
261
|
+
Whether to delete all its child nodes.
|
|
262
|
+
|
|
263
|
+
Examples
|
|
264
|
+
--------
|
|
265
|
+
|
|
266
|
+
>>> tree = HTMLParser(html)
|
|
267
|
+
>>> for tag in tree.css('script'):
|
|
268
|
+
...     tag.decompose()
|
|
269
|
+
"""
|
|
163
270
|
...
|
|
164
271
|
def remove(self, recursive: bool = True) -> None:
|
|
165
272
|
"""An alias for the decompose method."""
|
|
166
273
|
...
|
|
167
|
-
def unwrap(self) -> None:
|
|
168
|
-
"""Replace node with whatever is inside this node.
|
|
274
|
+
def unwrap(self, delete_empty: bool = False) -> None:
|
|
275
|
+
"""Replace node with whatever is inside this node.
|
|
276
|
+
|
|
277
|
+
Parameters
|
|
278
|
+
----------
|
|
279
|
+
delete_empty : bool, default False
|
|
280
|
+
Whether to delete empty tags.
|
|
281
|
+
|
|
282
|
+
Examples
|
|
283
|
+
--------
|
|
284
|
+
|
|
285
|
+
>>> tree = HTMLParser("<div>Hello <i>world</i>!</div>")
|
|
286
|
+
>>> tree.css_first('i').unwrap()
|
|
287
|
+
>>> tree.html
|
|
288
|
+
'<html><head></head><body><div>Hello world!</div></body></html>'
|
|
289
|
+
|
|
290
|
+
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
|
|
291
|
+
"""
|
|
169
292
|
...
|
|
170
293
|
def strip_tags(self, tags: list[str], recursive: bool = False) -> None:
|
|
171
|
-
"""Remove specified tags from the HTML tree.
|
|
294
|
+
"""Remove specified tags from the HTML tree.
|
|
295
|
+
|
|
296
|
+
Parameters
|
|
297
|
+
----------
|
|
298
|
+
tags : list
|
|
299
|
+
List of tags to remove.
|
|
300
|
+
recursive : bool, default False
|
|
301
|
+
Whether to delete all its child nodes.
|
|
302
|
+
|
|
303
|
+
Examples
|
|
304
|
+
--------
|
|
305
|
+
|
|
306
|
+
>>> tree = HTMLParser('<html><head></head><body><script></script><div>Hello world!</div></body></html>')
|
|
307
|
+
>>> tags = ['head', 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes']
|
|
308
|
+
>>> tree.strip_tags(tags)
|
|
309
|
+
>>> tree.html
|
|
310
|
+
'<html><body><div>Hello world!</div></body></html>'
|
|
311
|
+
"""
|
|
172
312
|
...
|
|
173
|
-
def unwrap_tags(self, tags: list[str]) -> None:
|
|
313
|
+
def unwrap_tags(self, tags: list[str], delete_empty: bool = False) -> None:
|
|
174
314
|
"""Unwraps specified tags from the HTML tree.
|
|
175
315
|
|
|
176
|
-
Works the same as the unwrap method, but applied to a list of tags.
|
|
316
|
+
Works the same as the unwrap method, but applied to a list of tags.
|
|
317
|
+
|
|
318
|
+
Parameters
|
|
319
|
+
----------
|
|
320
|
+
tags : list
|
|
321
|
+
List of tags to remove.
|
|
322
|
+
delete_empty : bool, default False
|
|
323
|
+
Whether to delete empty tags.
|
|
324
|
+
|
|
325
|
+
Examples
|
|
326
|
+
--------
|
|
327
|
+
|
|
328
|
+
>>> tree = HTMLParser('<div><a href="">Hello</a> <i>world</i>!</div>')
|
|
329
|
+
>>> tree.body.unwrap_tags(['i','a'])
|
|
330
|
+
>>> tree.body.html
|
|
331
|
+
'<body><div>Hello world!</div></body>'
|
|
332
|
+
|
|
333
|
+
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
|
|
334
|
+
"""
|
|
177
335
|
...
|
|
178
336
|
def replace_with(self, value: str | bytes | None) -> None:
|
|
179
|
-
"""Replace current Node with specified value.
|
|
337
|
+
"""Replace current Node with specified value.
|
|
338
|
+
|
|
339
|
+
Parameters
|
|
340
|
+
----------
|
|
341
|
+
value : str, bytes or Node
|
|
342
|
+
The text or Node instance to replace the Node with.
|
|
343
|
+
When a text string is passed, it's treated as text. All HTML tags will be escaped.
|
|
344
|
+
Convert and pass the ``Node`` object when you want to work with HTML.
|
|
345
|
+
Does not clone the ``Node`` object.
|
|
346
|
+
All future changes to the passed ``Node`` object will also be taken into account.
|
|
347
|
+
|
|
348
|
+
Examples
|
|
349
|
+
--------
|
|
350
|
+
|
|
351
|
+
>>> tree = HTMLParser('<div>Get <img src="" alt="Laptop"></div>')
|
|
352
|
+
>>> img = tree.css_first('img')
|
|
353
|
+
>>> img.replace_with(img.attributes.get('alt', ''))
|
|
354
|
+
>>> tree.body.child.html
|
|
355
|
+
'<div>Get Laptop</div>'
|
|
356
|
+
|
|
357
|
+
>>> html_parser = HTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
|
|
358
|
+
>>> html_parser2 = HTMLParser('<div>Test</div>')
|
|
359
|
+
>>> img_node = html_parser.css_first('img')
|
|
360
|
+
>>> img_node.replace_with(html_parser2.body.child)
|
|
361
|
+
'<div>Get <span alt="Laptop"><div>Test</div> <div></div></span></div>'
|
|
362
|
+
"""
|
|
180
363
|
...
|
|
181
364
|
def insert_before(self, value: str | bytes | None) -> None:
|
|
182
|
-
"""Insert a node before the current Node.
|
|
365
|
+
"""Insert a node before the current Node.
|
|
366
|
+
|
|
367
|
+
Parameters
|
|
368
|
+
----------
|
|
369
|
+
value : str, bytes or Node
|
|
370
|
+
The text or Node instance to insert before the Node.
|
|
371
|
+
When a text string is passed, it's treated as text. All HTML tags will be escaped.
|
|
372
|
+
Convert and pass the ``Node`` object when you want to work with HTML.
|
|
373
|
+
Does not clone the ``Node`` object.
|
|
374
|
+
All future changes to the passed ``Node`` object will also be taken into account.
|
|
375
|
+
|
|
376
|
+
Examples
|
|
377
|
+
--------
|
|
378
|
+
|
|
379
|
+
>>> tree = HTMLParser('<div>Get <img src="" alt="Laptop"></div>')
|
|
380
|
+
>>> img = tree.css_first('img')
|
|
381
|
+
>>> img.insert_before(img.attributes.get('alt', ''))
|
|
382
|
+
>>> tree.body.child.html
|
|
383
|
+
'<div>Get Laptop<img src="" alt="Laptop"></div>'
|
|
384
|
+
|
|
385
|
+
>>> html_parser = HTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
|
|
386
|
+
>>> html_parser2 = HTMLParser('<div>Test</div>')
|
|
387
|
+
>>> img_node = html_parser.css_first('img')
|
|
388
|
+
>>> img_node.insert_before(html_parser2.body.child)
|
|
389
|
+
'<div>Get <span alt="Laptop"><div>Test</div><img src="/jpg"> <div></div></span></div>'
|
|
390
|
+
"""
|
|
183
391
|
...
|
|
184
392
|
def insert_after(self, value: str | bytes | None) -> None:
|
|
185
|
-
"""Insert a node after the current Node.
|
|
393
|
+
"""Insert a node after the current Node.
|
|
394
|
+
|
|
395
|
+
Parameters
|
|
396
|
+
----------
|
|
397
|
+
value : str, bytes or Node
|
|
398
|
+
The text or Node instance to insert after the Node.
|
|
399
|
+
When a text string is passed, it's treated as text. All HTML tags will be escaped.
|
|
400
|
+
Convert and pass the ``Node`` object when you want to work with HTML.
|
|
401
|
+
Does not clone the ``Node`` object.
|
|
402
|
+
All future changes to the passed ``Node`` object will also be taken into account.
|
|
403
|
+
|
|
404
|
+
Examples
|
|
405
|
+
--------
|
|
406
|
+
|
|
407
|
+
>>> tree = HTMLParser('<div>Get <img src="" alt="Laptop"></div>')
|
|
408
|
+
>>> img = tree.css_first('img')
|
|
409
|
+
>>> img.insert_after(img.attributes.get('alt', ''))
|
|
410
|
+
>>> tree.body.child.html
|
|
411
|
+
'<div>Get <img src="" alt="Laptop">Laptop</div>'
|
|
412
|
+
|
|
413
|
+
>>> html_parser = HTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
|
|
414
|
+
>>> html_parser2 = HTMLParser('<div>Test</div>')
|
|
415
|
+
>>> img_node = html_parser.css_first('img')
|
|
416
|
+
>>> img_node.insert_after(html_parser2.body.child)
|
|
417
|
+
'<div>Get <span alt="Laptop"><img src="/jpg"><div>Test</div> <div></div></span></div>'
|
|
418
|
+
"""
|
|
186
419
|
...
|
|
187
420
|
def insert_child(self, value: str | bytes | None) -> None:
|
|
188
|
-
"""Insert a node inside (at the end of) the current Node
|
|
421
|
+
"""Insert a node inside (at the end of) the current Node.
|
|
422
|
+
|
|
423
|
+
Parameters
|
|
424
|
+
----------
|
|
425
|
+
value : str, bytes or Node
|
|
426
|
+
The text or Node instance to insert inside the Node.
|
|
427
|
+
When a text string is passed, it's treated as text. All HTML tags will be escaped.
|
|
428
|
+
Convert and pass the ``Node`` object when you want to work with HTML.
|
|
429
|
+
Does not clone the ``Node`` object.
|
|
430
|
+
All future changes to the passed ``Node`` object will also be taken into account.
|
|
431
|
+
|
|
432
|
+
Examples
|
|
433
|
+
--------
|
|
434
|
+
|
|
435
|
+
>>> tree = HTMLParser('<div>Get <img src=""></div>')
|
|
436
|
+
>>> div = tree.css_first('div')
|
|
437
|
+
>>> div.insert_child('Laptop')
|
|
438
|
+
>>> tree.body.child.html
|
|
439
|
+
'<div>Get <img src="">Laptop</div>'
|
|
440
|
+
|
|
441
|
+
>>> html_parser = HTMLParser('<div>Get <span alt="Laptop"> <div>Laptop</div> </span></div>')
|
|
442
|
+
>>> html_parser2 = HTMLParser('<div>Test</div>')
|
|
443
|
+
>>> span_node = html_parser.css_first('span')
|
|
444
|
+
>>> span_node.insert_child(html_parser2.body.child)
|
|
445
|
+
'<div>Get <span alt="Laptop"> <div>Laptop</div> <div>Test</div> </span></div>'
|
|
446
|
+
"""
|
|
189
447
|
...
|
|
190
448
|
@property
|
|
191
449
|
def raw_value(self) -> bytes:
|
|
192
450
|
"""Return the raw (unparsed, original) value of a node.
|
|
193
451
|
|
|
194
|
-
Currently, works on text nodes only.
|
|
452
|
+
Currently, works on text nodes only.
|
|
453
|
+
|
|
454
|
+
Returns
|
|
455
|
+
-------
|
|
456
|
+
|
|
457
|
+
raw_value : bytes
|
|
458
|
+
|
|
459
|
+
Examples
|
|
460
|
+
--------
|
|
461
|
+
|
|
462
|
+
>>> html_parser = HTMLParser('<div><test></div>')
|
|
463
|
+
>>> selector = html_parser.css_first('div')
|
|
464
|
+
>>> selector.child.html
|
|
465
|
+
'<test>'
|
|
466
|
+
>>> selector.child.raw_value
|
|
467
|
+
b'<test>'
|
|
468
|
+
"""
|
|
195
469
|
...
|
|
196
470
|
def select(self, query: str | None = None) -> Selector:
|
|
197
471
|
"""Select nodes given a CSS selector.
|
|
198
472
|
|
|
199
473
|
Works similarly to the css method, but supports chained filtering and extra features.
|
|
474
|
+
|
|
475
|
+
Parameters
|
|
476
|
+
----------
|
|
477
|
+
query : str or None
|
|
478
|
+
The CSS selector to use when searching for nodes.
|
|
479
|
+
|
|
480
|
+
Returns
|
|
481
|
+
-------
|
|
482
|
+
selector : The `Selector` class.
|
|
200
483
|
"""
|
|
201
484
|
...
|
|
202
485
|
def scripts_contain(self, query: str) -> bool:
|
|
203
486
|
"""Returns True if any of the script tags contain specified text.
|
|
204
487
|
|
|
205
|
-
Caches script tags on the first call to improve performance.
|
|
488
|
+
Caches script tags on the first call to improve performance.
|
|
489
|
+
|
|
490
|
+
Parameters
|
|
491
|
+
----------
|
|
492
|
+
query : str
|
|
493
|
+
The query to check.
|
|
494
|
+
"""
|
|
206
495
|
...
|
|
207
496
|
def script_srcs_contain(self, queries: tuple[str]) -> bool:
|
|
208
497
|
"""Returns True if any of the script SRCs attributes contain one of the specified texts.
|
|
209
498
|
|
|
210
|
-
Caches values on the first call to improve performance.
|
|
499
|
+
Caches values on the first call to improve performance.
|
|
500
|
+
|
|
501
|
+
Parameters
|
|
502
|
+
----------
|
|
503
|
+
queries : tuple of str
|
|
504
|
+
"""
|
|
211
505
|
...
|
|
212
506
|
@property
|
|
213
507
|
def text_content(self) -> str | None:
|
|
214
508
|
"""Returns the text of the node if it is a text node.
|
|
215
509
|
|
|
216
|
-
Returns None for other nodes.
|
|
510
|
+
Returns None for other nodes.
|
|
511
|
+
Unlike the ``text`` method, does not include child nodes.
|
|
512
|
+
|
|
513
|
+
Returns
|
|
514
|
+
-------
|
|
515
|
+
text : str or None.
|
|
217
516
|
"""
|
|
218
517
|
...
|
|
219
518
|
def merge_text_nodes(self):
|
|
220
519
|
"""Iterates over all text nodes and merges all text nodes that are close to each other.
|
|
221
520
|
|
|
222
|
-
This is useful for text extraction.
|
|
521
|
+
This is useful for text extraction.
|
|
522
|
+
Use it when you need to strip HTML tags and merge "dangling" text.
|
|
523
|
+
|
|
524
|
+
Examples
|
|
525
|
+
--------
|
|
526
|
+
|
|
527
|
+
>>> tree = HTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
|
|
528
|
+
>>> node = tree.css_first('div')
|
|
529
|
+
>>> tree.unwrap_tags(["strong"])
|
|
530
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
531
|
+
"J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
|
|
532
|
+
>>> node.merge_text_nodes()
|
|
533
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
534
|
+
"John Doe"
|
|
535
|
+
"""
|
|
223
536
|
...
|
|
224
537
|
|
|
225
538
|
class HTMLParser:
|
|
539
|
+
"""The HTML parser.
|
|
540
|
+
|
|
541
|
+
Use this class to parse raw HTML.
|
|
542
|
+
|
|
543
|
+
Parameters
|
|
544
|
+
----------
|
|
545
|
+
|
|
546
|
+
html : str (unicode) or bytes
|
|
547
|
+
detect_encoding : bool, default True
|
|
548
|
+
If `True` and html type is `bytes` then encoding will be detected automatically.
|
|
549
|
+
use_meta_tags : bool, default True
|
|
550
|
+
Whether to use meta tags in encoding detection process.
|
|
551
|
+
decode_errors : str, default 'ignore'
|
|
552
|
+
Same as in builtin's str.decode, i.e. 'strict', 'ignore' or 'replace'.
|
|
553
|
+
"""
|
|
554
|
+
|
|
226
555
|
def __init__(
|
|
227
556
|
self,
|
|
228
557
|
html: bytes | str,
|
|
@@ -233,27 +562,50 @@ class HTMLParser:
|
|
|
233
562
|
def css(self, query: str) -> list[Node]:
|
|
234
563
|
"""A CSS selector.
|
|
235
564
|
|
|
236
|
-
Matches pattern query against HTML tree.
|
|
565
|
+
Matches pattern `query` against HTML tree.
|
|
566
|
+
`CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
|
|
567
|
+
|
|
568
|
+
Parameters
|
|
569
|
+
----------
|
|
570
|
+
query : str
|
|
571
|
+
CSS selector (e.g. "div > :nth-child(2n+1):not(:has(a))").
|
|
572
|
+
|
|
573
|
+
Returns
|
|
574
|
+
-------
|
|
575
|
+
selector : list of `Node` objects
|
|
576
|
+
"""
|
|
237
577
|
...
|
|
238
578
|
@overload
|
|
239
|
-
def css_first(
|
|
240
|
-
self, query: str, default: Any = ..., strict: Literal[True] = ...
|
|
241
|
-
) -> Node: ...
|
|
242
|
-
@overload
|
|
243
579
|
def css_first(
|
|
244
580
|
self, query: str, default: DefaultT, strict: bool = False
|
|
245
581
|
) -> Node | DefaultT: ...
|
|
246
582
|
@overload
|
|
247
583
|
def css_first(
|
|
248
|
-
self, query: str, default: None =
|
|
249
|
-
) -> Node | None:
|
|
250
|
-
"""Same as css but returns only the first match.
|
|
584
|
+
self, query: str, default: None = None, strict: bool = False
|
|
585
|
+
) -> Node | None | DefaultT:
|
|
586
|
+
"""Same as `css` but returns only the first match.
|
|
587
|
+
|
|
588
|
+
Parameters
|
|
589
|
+
----------
|
|
590
|
+
|
|
591
|
+
query : str
|
|
592
|
+
default : Any, default None
|
|
593
|
+
Default value to return if there is no match.
|
|
594
|
+
strict: bool, default False
|
|
595
|
+
Set to True if you want to check if there is strictly only one match in the document.
|
|
596
|
+
|
|
597
|
+
|
|
598
|
+
Returns
|
|
599
|
+
-------
|
|
600
|
+
selector : `Node` object
|
|
601
|
+
"""
|
|
251
602
|
...
|
|
252
603
|
@property
|
|
253
604
|
def input_encoding(self) -> str:
|
|
254
605
|
"""Return encoding of the HTML document.
|
|
255
606
|
|
|
256
|
-
Returns unknown in case the encoding is not determined.
|
|
607
|
+
Returns `unknown` in case the encoding is not determined.
|
|
608
|
+
"""
|
|
257
609
|
...
|
|
258
610
|
@property
|
|
259
611
|
def root(self) -> Node | None:
|
|
@@ -268,16 +620,70 @@ class HTMLParser:
|
|
|
268
620
|
"""Returns document body."""
|
|
269
621
|
...
|
|
270
622
|
def tags(self, name: str) -> list[Node]:
|
|
271
|
-
"""Returns a list of tags that match specified name.
|
|
623
|
+
"""Returns a list of tags that match specified name.
|
|
624
|
+
|
|
625
|
+
Parameters
|
|
626
|
+
----------
|
|
627
|
+
name : str (e.g. div)
|
|
628
|
+
"""
|
|
272
629
|
...
|
|
273
630
|
def text(self, deep: bool = True, separator: str = "", strip: bool = False) -> str:
|
|
274
|
-
"""Returns the text of the node including text of all its child nodes.
|
|
631
|
+
"""Returns the text of the node including text of all its child nodes.
|
|
632
|
+
|
|
633
|
+
Parameters
|
|
634
|
+
----------
|
|
635
|
+
strip : bool, default False
|
|
636
|
+
If true, calls ``str.strip()`` on each text part to remove extra white spaces.
|
|
637
|
+
separator : str, default ''
|
|
638
|
+
The separator to use when joining text from different nodes.
|
|
639
|
+
deep : bool, default True
|
|
640
|
+
If True, includes text from all child nodes.
|
|
641
|
+
|
|
642
|
+
Returns
|
|
643
|
+
-------
|
|
644
|
+
text : str
|
|
645
|
+
"""
|
|
646
|
+
...
|
|
647
|
+
def strip_tags(self, tags: list[str], recursive: bool = False) -> None:
|
|
648
|
+
"""Remove specified tags from the node.
|
|
649
|
+
|
|
650
|
+
Parameters
|
|
651
|
+
----------
|
|
652
|
+
tags : list of str
|
|
653
|
+
List of tags to remove.
|
|
654
|
+
recursive : bool, default False
|
|
655
|
+
Whether to delete all its child nodes.
|
|
656
|
+
|
|
657
|
+
Examples
|
|
658
|
+
--------
|
|
659
|
+
|
|
660
|
+
>>> tree = HTMLParser('<html><head></head><body><script></script><div>Hello world!</div></body></html>')
|
|
661
|
+
>>> tags = ['head', 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes']
|
|
662
|
+
>>> tree.strip_tags(tags)
|
|
663
|
+
>>> tree.html
|
|
664
|
+
'<html><body><div>Hello world!</div></body></html>'
|
|
665
|
+
"""
|
|
275
666
|
...
|
|
276
|
-
def
|
|
277
|
-
def unwrap_tags(self, tags: list[str]) -> None:
|
|
667
|
+
def unwrap_tags(self, tags: list[str], delete_empty: bool = False) -> None:
|
|
278
668
|
"""Unwraps specified tags from the HTML tree.
|
|
279
669
|
|
|
280
|
-
Works the same as th unwrap method, but applied to a list of tags.
|
|
670
|
+
Works the same as the unwrap method, but applied to a list of tags.
|
|
671
|
+
|
|
672
|
+
Parameters
|
|
673
|
+
----------
|
|
674
|
+
tags : list
|
|
675
|
+
List of tags to remove.
|
|
676
|
+
delete_empty : bool, default False
|
|
677
|
+
If True, removes empty tags.
|
|
678
|
+
|
|
679
|
+
Examples
|
|
680
|
+
--------
|
|
681
|
+
|
|
682
|
+
>>> tree = HTMLParser('<div><a href="">Hello</a> <i>world</i>!</div>')
|
|
683
|
+
>>> tree.body.unwrap_tags(['i','a'])
|
|
684
|
+
>>> tree.body.html
|
|
685
|
+
'<body><div>Hello world!</div></body>'
|
|
686
|
+
"""
|
|
281
687
|
...
|
|
282
688
|
@property
|
|
283
689
|
def html(self) -> str | None:
|
|
@@ -286,7 +692,16 @@ class HTMLParser:
|
|
|
286
692
|
def select(self, query: str | None = None) -> Selector | None:
|
|
287
693
|
"""Select nodes given a CSS selector.
|
|
288
694
|
|
|
289
|
-
Works similarly to
|
|
695
|
+
Works similarly to the ``css`` method, but supports chained filtering and extra features.
|
|
696
|
+
|
|
697
|
+
Parameters
|
|
698
|
+
----------
|
|
699
|
+
query : str or None
|
|
700
|
+
The CSS selector to use when searching for nodes.
|
|
701
|
+
|
|
702
|
+
Returns
|
|
703
|
+
-------
|
|
704
|
+
selector : The `Selector` class.
|
|
290
705
|
"""
|
|
291
706
|
...
|
|
292
707
|
def any_css_matches(self, selectors: tuple[str]) -> bool:
|
|
@@ -295,12 +710,23 @@ class HTMLParser:
|
|
|
295
710
|
def scripts_contain(self, query: str) -> bool:
|
|
296
711
|
"""Returns True if any of the script tags contain specified text.
|
|
297
712
|
|
|
298
|
-
Caches script tags on the first call to improve performance.
|
|
713
|
+
Caches script tags on the first call to improve performance.
|
|
714
|
+
|
|
715
|
+
Parameters
|
|
716
|
+
----------
|
|
717
|
+
query : str
|
|
718
|
+
The query to check.
|
|
719
|
+
"""
|
|
299
720
|
...
|
|
300
721
|
def scripts_srcs_contain(self, queries: tuple[str]) -> bool:
|
|
301
722
|
"""Returns True if any of the script SRCs attributes contain one of the specified texts.
|
|
302
723
|
|
|
303
|
-
Caches values on the first call to improve performance.
|
|
724
|
+
Caches values on the first call to improve performance.
|
|
725
|
+
|
|
726
|
+
Parameters
|
|
727
|
+
----------
|
|
728
|
+
queries : tuple of str
|
|
729
|
+
"""
|
|
304
730
|
...
|
|
305
731
|
def css_matches(self, selector: str) -> bool: ...
|
|
306
732
|
def clone(self) -> HTMLParser:
|
|
@@ -309,7 +735,21 @@ class HTMLParser:
|
|
|
309
735
|
def merge_text_nodes(self):
|
|
310
736
|
"""Iterates over all text nodes and merges all text nodes that are close to each other.
|
|
311
737
|
|
|
312
|
-
This is useful for text extraction.
|
|
738
|
+
This is useful for text extraction.
|
|
739
|
+
Use it when you need to strip HTML tags and merge "dangling" text.
|
|
740
|
+
|
|
741
|
+
Examples
|
|
742
|
+
--------
|
|
743
|
+
|
|
744
|
+
>>> tree = HTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
|
|
745
|
+
>>> node = tree.css_first('div')
|
|
746
|
+
>>> tree.unwrap_tags(["strong"])
|
|
747
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
748
|
+
"J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
|
|
749
|
+
>>> node.merge_text_nodes()
|
|
750
|
+
>>> tree.text(deep=True, separator=" ", strip=True)
|
|
751
|
+
"John Doe"
|
|
752
|
+
"""
|
|
313
753
|
...
|
|
314
754
|
|
|
315
755
|
def create_tag(tag: str) -> Node:
|