selectolax 0.4.4__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1112 @@
1
+ cimport cython
2
+ from cpython.exc cimport PyErr_SetNone
3
+
4
+ import logging
5
+
6
+ logger = logging.getLogger("selectolax")
7
+
8
# Display names reported for nodes that are not regular elements, keyed by
# numeric tag id.
# NOTE(review): the `tag` property indexes this table with LXB_TAG__EM_DOCTYPE,
# LXB_TAG__TEXT and LXB_TAG__EM_COMMENT, so the hex keys presumably equal those
# constants -- confirm against the lexbor headers.
_TAG_TO_NAME = {
    0x0005: "-doctype",
    0x0002: "-text",
    0x0004: "-comment",
}
# Fused type accepted by the node-manipulation methods (replace_with,
# insert_before, insert_after, insert_child) and by to_bytes().
ctypedef fused str_or_LexborNode:
    str
    bytes
    LexborNode

# Fused type for values that may be given either as text or as raw bytes.
ctypedef fused str_or_bytes:
    str
    bytes
21
+
22
cdef inline bytes to_bytes(str_or_LexborNode value):
    """Convert ``value`` to a ``bytes`` object.

    Text is encoded as UTF-8; ``bytes`` input is handed back unchanged.
    No conversion is performed for any other specialization of the fused type.
    """
    cdef bytes encoded
    if isinstance(value, unicode):
        encoded = <bytes> value.encode("utf-8")
    elif isinstance(value, bytes):
        encoded = <bytes> value
    return encoded
29
+
30
+
31
+ @cython.final
32
+ cdef class LexborNode:
33
+ """A class that represents HTML node (element)."""
34
+
35
+ @staticmethod
36
+ cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser):
37
+ cdef LexborNode lxbnode = LexborNode.__new__(LexborNode)
38
+ lxbnode.node = node
39
+ lxbnode.parser = parser
40
+ return lxbnode
41
+
42
+ @property
43
+ def mem_id(self):
44
+ return <size_t> self.node
45
+
46
+ @property
47
+ def child(self):
48
+ """Alias for the `first_child` property.
49
+
50
+ **Deprecated**. Please use `first_child` instead.
51
+ """
52
+ return self.first_child
53
+
54
+ @property
55
+ def first_child(self):
56
+ """Return the first child node."""
57
+ cdef LexborNode node
58
+ if self.node.first_child:
59
+ node = LexborNode.new(<lxb_dom_node_t *> self.node.first_child, self.parser)
60
+ return node
61
+ return None
62
+
63
+ @property
64
+ def parent(self):
65
+ """Return the parent node."""
66
+ cdef LexborNode node
67
+ if self.node.parent != NULL:
68
+ node = LexborNode.new(<lxb_dom_node_t *> self.node.parent, self.parser)
69
+ return node
70
+ return None
71
+
72
+ @property
73
+ def next(self):
74
+ """Return next node."""
75
+ cdef LexborNode node
76
+ if self.node.next != NULL:
77
+ node = LexborNode.new(<lxb_dom_node_t *> self.node.next, self.parser)
78
+ return node
79
+ return None
80
+
81
+ @property
82
+ def prev(self):
83
+ """Return previous node."""
84
+ cdef LexborNode node
85
+ if self.node.prev != NULL:
86
+ node = LexborNode.new(<lxb_dom_node_t *> self.node.prev, self.parser)
87
+ return node
88
+ return None
89
+
90
+ @property
91
+ def last_child(self):
92
+ """Return last child node."""
93
+ cdef LexborNode node
94
+ if self.node.last_child != NULL:
95
+ node = LexborNode.new(<lxb_dom_node_t *> self.node.last_child, self.parser)
96
+ return node
97
+ return None
98
+
99
+ @property
100
+ def html(self):
101
+ """Return HTML representation of the current node including all its child nodes.
102
+
103
+ Returns
104
+ -------
105
+ text : str
106
+ """
107
+ cdef lexbor_str_t *lxb_str
108
+ cdef lxb_status_t status
109
+
110
+ lxb_str = lexbor_str_create()
111
+ status = lxb_html_serialize_tree_str(self.node, lxb_str)
112
+ if status == 0 and lxb_str.data:
113
+ html = lxb_str.data.decode(_ENCODING).replace('<-undef>', '')
114
+ lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)
115
+ return html
116
+ return None
117
+
118
+ def __hash__(self):
119
+ return self.mem_id
120
+
121
+ def text_lexbor(self):
122
+ """Returns the text of the node including text of all its child nodes.
123
+
124
+ Uses builtin method from lexbor.
125
+ """
126
+
127
+ cdef size_t str_len = 0
128
+ cdef lxb_char_t * text
129
+
130
+ text = lxb_dom_node_text_content(self.node, &str_len)
131
+ if <int> str_len == 0:
132
+ raise RuntimeError("Can't extract text")
133
+
134
+ unicode_text = text.decode(_ENCODING)
135
+ lxb_dom_document_destroy_text_noi(self.node.owner_document, text)
136
+ return unicode_text
137
+
138
+ def text(self, bool deep=True, str separator='', bool strip=False, bool skip_empty=False):
139
+ """Return concatenated text from this node.
140
+
141
+ Parameters
142
+ ----------
143
+ deep : bool, optional
144
+ When ``True`` (default), include text from all descendant nodes; when
145
+ ``False``, only include direct children.
146
+ separator : str, optional
147
+ String inserted between successive text fragments.
148
+ strip : bool, optional
149
+ If ``True``, apply ``str.strip()`` to each fragment before joining to
150
+ remove surrounding whitespace. Defaults to ``False``.
151
+ skip_empty : bool, optional
152
+ Exclude text nodes that ``lxb_dom_node_is_empty`` considers empty when
153
+ ``True``. Defaults to ``False``.
154
+
155
+ Returns
156
+ -------
157
+ text : str
158
+ Combined textual content assembled according to the provided options.
159
+
160
+ """
161
+ cdef unsigned char * text
162
+ cdef lxb_dom_node_t * node = <lxb_dom_node_t *> self.node.first_child
163
+
164
+ if not deep:
165
+ container = TextContainer(separator, strip)
166
+ if self._is_node_type(LXB_DOM_NODE_TYPE_TEXT):
167
+ text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> self.node).data)
168
+ if text != NULL:
169
+ if not skip_empty or not self.is_empty_text_node:
170
+ py_text = text.decode(_ENCODING)
171
+ container.append(py_text)
172
+
173
+ while node != NULL:
174
+ if node.type == LXB_DOM_NODE_TYPE_TEXT:
175
+ text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> node).data)
176
+ if text != NULL:
177
+ if not skip_empty or not self._is_empty_text_node(node):
178
+ py_text = text.decode(_ENCODING)
179
+ container.append(py_text)
180
+ node = node.next
181
+ return container.text
182
+ else:
183
+ container = TextContainer(separator, strip)
184
+ if self._is_node_type(LXB_DOM_NODE_TYPE_TEXT):
185
+ text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> self.node).data)
186
+ if text != NULL:
187
+ if not skip_empty or not self.is_empty_text_node:
188
+ container.append(text.decode(_ENCODING))
189
+
190
+ lxb_dom_node_simple_walk(
191
+ <lxb_dom_node_t *> self.node,
192
+ <lxb_dom_node_simple_walker_f> text_callback,
193
+ <void *> container
194
+ )
195
+ return container.text
196
+
197
+ def css(self, str query):
198
+ """Evaluate CSS selector against current node and its child nodes.
199
+
200
+ Matches pattern `query` against HTML tree.
201
+ `CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
202
+
203
+ Special selectors:
204
+
205
+ - parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
206
+ - parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
207
+
208
+
209
+ Parameters
210
+ ----------
211
+ query : str
212
+ CSS selector (e.g. "div > :nth-child(2n+1):not(:has(a))").
213
+
214
+ Returns
215
+ -------
216
+ selector : list of `Node` objects
217
+ """
218
+ return self.parser.selector.find(query, self)
219
+
220
+ def css_first(self, str query, default=None, bool strict=False):
221
+ """Same as `css` but returns only the first match.
222
+
223
+ When `strict=False` stops at the first match. Works faster.
224
+
225
+ Parameters
226
+ ----------
227
+
228
+ query : str
229
+ default : Any, default None
230
+ Default value to return if there is no match.
231
+ strict: bool, default False
232
+ Set to True if you want to check if there is strictly only one match in the document.
233
+
234
+
235
+ Returns
236
+ -------
237
+ selector : `LexborNode` object
238
+ """
239
+ if strict:
240
+ results = self.parser.selector.find(query, self)
241
+ else:
242
+ results = self.parser.selector.find_first(query, self)
243
+ n_results = len(results)
244
+ if n_results > 0:
245
+ if strict and n_results > 1:
246
+ raise ValueError("Expected 1 match, but found %s matches" % n_results)
247
+ return results[0]
248
+ return default
249
+
250
+ def any_css_matches(self, tuple selectors):
251
+ """Returns True if any of CSS selectors matches a node"""
252
+ for selector in selectors:
253
+ if self.parser.selector.any_matches(selector, self):
254
+ return True
255
+ return False
256
+
257
+ def css_matches(self, str selector):
258
+ """Returns True if CSS selector matches a node."""
259
+ return bool(self.parser.selector.any_matches(selector, self))
260
+
261
+ def __repr__(self):
262
+ return '<LexborNode %s>' % self.tag
263
+
264
+ @property
265
+ def tag_id(self):
266
+ cdef lxb_tag_id_t tag_id = lxb_dom_node_tag_id_noi(self.node)
267
+ return tag_id
268
+
269
+ @property
270
+ def tag(self):
271
+ """Return the name of the current tag (e.g. div, p, img).
272
+
273
+ For for non-tag nodes, returns the following names:
274
+
275
+ * `-text` - text node
276
+ * `-document` - document node
277
+ * `-comment` - comment node
278
+
279
+ This
280
+
281
+ Returns
282
+ -------
283
+ text : str
284
+ """
285
+
286
+ cdef lxb_char_t *c_text
287
+ cdef size_t str_len = 0
288
+ if self.tag_id in [LXB_TAG__EM_DOCTYPE, LXB_TAG__TEXT, LXB_TAG__EM_COMMENT]:
289
+ return _TAG_TO_NAME[self.tag_id]
290
+ c_text = lxb_dom_element_qualified_name(<lxb_dom_element_t *> self.node, &str_len)
291
+ text = None
292
+ if c_text:
293
+ text = c_text.decode(_ENCODING)
294
+ return text
295
+
296
+ def decompose(self, bool recursive=True):
297
+ """Remove the current node from the tree.
298
+
299
+ Parameters
300
+ ----------
301
+ recursive : bool, default True
302
+ Whenever to delete all its child nodes
303
+
304
+ Examples
305
+ --------
306
+
307
+ >>> tree = LexborHTMLParser(html)
308
+ >>> for tag in tree.css('script'):
309
+ >>> tag.decompose()
310
+
311
+ """
312
+ if self.node == <lxb_dom_node_t *> lxb_dom_document_root(&self.parser.document.dom_document):
313
+ raise SelectolaxError("Decomposing the root node is not allowed.")
314
+
315
+ if recursive:
316
+ node_remove_deep(<lxb_dom_node_t *> self.node)
317
+ else:
318
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
319
+
320
+ def strip_tags(self, list tags, bool recursive = False):
321
+ """Remove specified tags from the HTML tree.
322
+
323
+ Parameters
324
+ ----------
325
+ tags : list
326
+ List of tags to remove.
327
+ recursive : bool, default True
328
+ Whenever to delete all its child nodes
329
+
330
+ Examples
331
+ --------
332
+
333
+ >>> tree = LexborHTMLParser('<html><head></head><body><script></script><div>Hello world!</div></body></html>')
334
+ >>> tags = ['head', 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes']
335
+ >>> tree.strip_tags(tags)
336
+ >>> tree.html
337
+ '<html><body><div>Hello world!</div></body></html>'
338
+
339
+ """
340
+ cdef LexborNode element
341
+ for tag in tags:
342
+ for element in self.css(tag):
343
+ element.decompose(recursive=recursive)
344
+
345
+ @property
346
+ def attributes(self):
347
+ """Get all attributes that belong to the current node.
348
+
349
+ The value of empty attributes is None.
350
+
351
+ Returns
352
+ -------
353
+ attributes : dictionary of all attributes.
354
+
355
+ Examples
356
+ --------
357
+
358
+ >>> tree = LexborHTMLParser("<div data id='my_id'></div>")
359
+ >>> node = tree.css_first('div')
360
+ >>> node.attributes
361
+ {'data': None, 'id': 'my_id'}
362
+ """
363
+ cdef lxb_dom_attr_t *attr = lxb_dom_element_first_attribute_noi(<lxb_dom_element_t *> self.node)
364
+ cdef size_t str_len = 0
365
+ attributes = dict()
366
+
367
+ if not self._is_node_type(LXB_DOM_NODE_TYPE_ELEMENT):
368
+ return attributes
369
+
370
+ while attr != NULL:
371
+ key = lxb_dom_attr_local_name_noi(attr, &str_len)
372
+ value = lxb_dom_attr_value_noi(attr, &str_len)
373
+
374
+ if value:
375
+ py_value = value.decode(_ENCODING)
376
+ else:
377
+ py_value = None
378
+ attributes[key.decode(_ENCODING)] = py_value
379
+
380
+ attr = attr.next
381
+ return attributes
382
+
383
+ @property
384
+ def attrs(self):
385
+ """A dict-like object that is similar to the ``attributes`` property, but operates directly on the Node data.
386
+
387
+ .. warning:: Use ``attributes`` instead, if you don't want to modify Node attributes.
388
+
389
+ Returns
390
+ -------
391
+ attributes : Attributes mapping object.
392
+
393
+ Examples
394
+ --------
395
+
396
+ >>> tree = LexborHTMLParser("<div id='a'></div>")
397
+ >>> node = tree.css_first('div')
398
+ >>> node.attrs
399
+ <div attributes, 1 items>
400
+ >>> node.attrs['id']
401
+ 'a'
402
+ >>> node.attrs['foo'] = 'bar'
403
+ >>> del node.attrs['id']
404
+ >>> node.attributes
405
+ {'foo': 'bar'}
406
+ >>> node.attrs['id'] = 'new_id'
407
+ >>> node.html
408
+ '<div foo="bar" id="new_id"></div>'
409
+ """
410
+ cdef LexborAttributes attributes = LexborAttributes.create(<lxb_dom_node_t *> self.node)
411
+ return attributes
412
+
413
+ @property
414
+ def id(self):
415
+ """Get the id attribute of the node.
416
+
417
+ Returns None if id does not set.
418
+
419
+ Returns
420
+ -------
421
+ text : str
422
+ """
423
+ cdef char * key = 'id'
424
+ cdef size_t str_len
425
+ cdef lxb_dom_attr_t * attr = lxb_dom_element_attr_by_name(
426
+ <lxb_dom_element_t *> self.node,
427
+ <lxb_char_t *> key, 2
428
+ )
429
+ if attr != NULL:
430
+ value = lxb_dom_attr_value_noi(attr, &str_len)
431
+ return value.decode(_ENCODING) if value else None
432
+ return None
433
+
434
+ def iter(self, bool include_text = False, bool skip_empty = False):
435
+ """Iterate over direct children of this node.
436
+
437
+ Parameters
438
+ ----------
439
+ include_text : bool, optional
440
+ When ``True``, yield text nodes in addition to element nodes. Defaults
441
+ to ``False``.
442
+ skip_empty : bool, optional
443
+ When ``include_text`` is ``True``, ignore text nodes that
444
+ ``lxb_dom_node_is_empty`` deems empty. Defaults to ``False``.
445
+
446
+ Yields
447
+ ------
448
+ LexborNode
449
+ Child nodes on the same tree level as this node, filtered according
450
+ to the provided options.
451
+ """
452
+
453
+ cdef lxb_dom_node_t *node = self.node.first_child
454
+ cdef LexborNode next_node
455
+
456
+ while node != NULL:
457
+ if node.type == LXB_DOM_NODE_TYPE_TEXT and not include_text:
458
+ node = node.next
459
+ continue
460
+ if node.type == LXB_DOM_NODE_TYPE_TEXT and include_text and skip_empty and self._is_empty_text_node(node):
461
+ node = node.next
462
+ continue
463
+
464
+ next_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
465
+ yield next_node
466
+ node = node.next
467
+
468
+ def __iter__(self):
469
+ return self.iter()
470
+
471
+ def __next__(self):
472
+ return self.next
473
+
474
+ def unwrap(self, bint delete_empty=False):
475
+ """Replace node with whatever is inside this node.
476
+
477
+ Does nothing if you perform unwrapping second time on the same node.
478
+
479
+ Parameters
480
+ ----------
481
+ delete_empty : bool, default False
482
+ If True, removes empty tags.
483
+
484
+ Examples
485
+ --------
486
+
487
+ >>> tree = LexborHTMLParser("<div>Hello <i>world</i>!</div>")
488
+ >>> tree.css_first('i').unwrap()
489
+ >>> tree.html
490
+ '<html><head></head><body><div>Hello world!</div></body></html>'
491
+
492
+ Note: by default, empty tags are ignored, use "delete_empty" to change this.
493
+ """
494
+
495
+ if node_is_removed(<lxb_dom_node_t *> self.node) == 1:
496
+ logger.error("Attempt to unwrap removed node. Does nothing.")
497
+ return
498
+
499
+ if self.node.first_child == NULL:
500
+ if delete_empty:
501
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
502
+ return
503
+ cdef lxb_dom_node_t * next_node
504
+ cdef lxb_dom_node_t * current_node
505
+
506
+ if self.node.first_child.next != NULL:
507
+ current_node = self.node.first_child
508
+ next_node = current_node.next
509
+
510
+ while next_node != NULL:
511
+ next_node = current_node.next
512
+ lxb_dom_node_insert_before(self.node, current_node)
513
+ current_node = next_node
514
+ else:
515
+ lxb_dom_node_insert_before(self.node, self.node.first_child)
516
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
517
+
518
+ def unwrap_tags(self, list tags, bint delete_empty = False):
519
+ """Unwraps specified tags from the HTML tree.
520
+
521
+ Works the same as the ``unwrap`` method, but applied to a list of tags.
522
+
523
+ Parameters
524
+ ----------
525
+ tags : list
526
+ List of tags to remove.
527
+ delete_empty : bool, default False
528
+ If True, removes empty tags.
529
+
530
+ Examples
531
+ --------
532
+
533
+ >>> tree = LexborHTMLParser("<div><a href="">Hello</a> <i>world</i>!</div>")
534
+ >>> tree.body.unwrap_tags(['i','a'])
535
+ >>> tree.body.html
536
+ '<body><div>Hello world!</div></body>'
537
+
538
+ Note: by default, empty tags are ignored, use "delete_empty" to change this.
539
+ """
540
+ cdef LexborNode element
541
+ for tag in tags:
542
+ for element in self.css(tag):
543
+ element.unwrap(delete_empty)
544
+
545
+ def merge_text_nodes(self):
546
+ """Iterates over all text nodes and merges all text nodes that are close to each other.
547
+
548
+ This is useful for text extraction.
549
+ Use it when you need to strip HTML tags and merge "dangling" text.
550
+
551
+ Examples
552
+ --------
553
+
554
+ >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
555
+ >>> node = tree.css_first('div')
556
+ >>> tree.unwrap_tags(["strong"])
557
+ >>> tree.text(deep=True, separator=" ", strip=True)
558
+ "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
559
+ >>> node.merge_text_nodes()
560
+ >>> tree.text(deep=True, separator=" ", strip=True)
561
+ "John Doe"
562
+ """
563
+ cdef lxb_dom_node_t *node = self.node.first_child
564
+ cdef lxb_dom_node_t *next_node
565
+ cdef lxb_char_t *left_text
566
+ cdef lxb_char_t *right_text
567
+ cdef size_t left_length, right_length
568
+
569
+ while node != NULL:
570
+ next_node = node.next
571
+ if node.type == LXB_DOM_NODE_TYPE_TEXT and node.prev and node.prev.type == LXB_DOM_NODE_TYPE_TEXT:
572
+ left_text = lxb_dom_node_text_content(node.prev, &left_length)
573
+ right_text = lxb_dom_node_text_content(node, &right_length)
574
+ if left_text and right_text:
575
+ combined = (<bytes> left_text[:left_length]) + (<bytes> right_text[:right_length])
576
+ lxb_dom_node_text_content_set(node, combined, len(combined))
577
+ lxb_dom_node_remove(node.prev)
578
+
579
+ if left_text is not NULL:
580
+ lxb_dom_document_destroy_text_noi(self.node.owner_document, left_text)
581
+ if right_text is not NULL:
582
+ lxb_dom_document_destroy_text_noi(self.node.owner_document, right_text)
583
+
584
+ if node.first_child:
585
+ LexborNode.new(node, self.parser).merge_text_nodes()
586
+ node = next_node
587
+
588
+ def traverse(self, bool include_text = False, bool skip_empty = False):
589
+ """Depth-first traversal starting at the current node.
590
+
591
+ Parameters
592
+ ----------
593
+ include_text : bool, optional
594
+ When ``True``, include text nodes in the traversal sequence. Defaults
595
+ to ``False``.
596
+ skip_empty : bool, optional
597
+ Skip empty text nodes (as determined by ``lxb_dom_node_is_empty``)
598
+ when ``include_text`` is ``True``. Defaults to ``False``.
599
+
600
+ Yields
601
+ ------
602
+ LexborNode
603
+ Nodes encountered in depth-first order beginning with the current
604
+ node, filtered according to the provided options.
605
+ """
606
+ cdef lxb_dom_node_t * root = self.node
607
+ cdef lxb_dom_node_t * node = root
608
+ cdef LexborNode lxb_node
609
+
610
+ while node != NULL:
611
+ if include_text or node.type != LXB_DOM_NODE_TYPE_TEXT:
612
+ if not skip_empty or not self._is_empty_text_node(node):
613
+ lxb_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
614
+ yield lxb_node
615
+
616
+ if node.first_child != NULL:
617
+ node = node.first_child
618
+ else:
619
+ while node != root and node.next == NULL:
620
+ node = node.parent
621
+ if node == root:
622
+ break
623
+ node = node.next
624
+
625
+ def replace_with(self, str_or_LexborNode value):
626
+ """Replace current Node with specified value.
627
+
628
+ Parameters
629
+ ----------
630
+ value : str, bytes or Node
631
+ The text or Node instance to replace the Node with.
632
+ When a text string is passed, it's treated as text. All HTML tags will be escaped.
633
+ Convert and pass the ``Node`` object when you want to work with HTML.
634
+ Does not clone the ``Node`` object.
635
+ All future changes to the passed ``Node`` object will also be taken into account.
636
+
637
+ Examples
638
+ --------
639
+
640
+ >>> tree = LexborHTMLParser('<div>Get <img src="" alt="Laptop"></div>')
641
+ >>> img = tree.css_first('img')
642
+ >>> img.replace_with(img.attributes.get('alt', ''))
643
+ >>> tree.body.child.html
644
+ '<div>Get Laptop</div>'
645
+
646
+ >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
647
+ >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
648
+ >>> img_node = html_parser.css_first('img')
649
+ >>> img_node.replace_with(html_parser2.body.child)
650
+ '<div>Get <span alt="Laptop"><div>Test</div> <div></div></span></div>'
651
+ """
652
+ cdef lxb_dom_node_t * new_node
653
+
654
+ if isinstance(value, (str, bytes, unicode)):
655
+ bytes_val = to_bytes(value)
656
+ new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
657
+ &self.parser.document.dom_document,
658
+ <lxb_char_t *> bytes_val, len(bytes_val)
659
+ )
660
+ if new_node == NULL:
661
+ raise SelectolaxError("Can't create a new node")
662
+ lxb_dom_node_insert_before(self.node, new_node)
663
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
664
+ elif isinstance(value, LexborNode):
665
+ new_node = lxb_dom_document_import_node(
666
+ &self.parser.document.dom_document,
667
+ <lxb_dom_node_t *> value.node,
668
+ <bint> True
669
+ )
670
+ if new_node == NULL:
671
+ raise SelectolaxError("Can't create a new node")
672
+ lxb_dom_node_insert_before(self.node, <lxb_dom_node_t *> new_node)
673
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
674
+ else:
675
+ raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
676
+
677
+ def insert_before(self, str_or_LexborNode value):
678
+ """
679
+ Insert a node before the current Node.
680
+
681
+ Parameters
682
+ ----------
683
+ value : str, bytes or Node
684
+ The text or Node instance to insert before the Node.
685
+ When a text string is passed, it's treated as text. All HTML tags will be escaped.
686
+ Convert and pass the ``Node`` object when you want to work with HTML.
687
+ Does not clone the ``Node`` object.
688
+ All future changes to the passed ``Node`` object will also be taken into account.
689
+
690
+ Examples
691
+ --------
692
+
693
+ >>> tree = LexborHTMLParser('<div>Get <img src="" alt="Laptop"></div>')
694
+ >>> img = tree.css_first('img')
695
+ >>> img.insert_before(img.attributes.get('alt', ''))
696
+ >>> tree.body.child.html
697
+ '<div>Get Laptop<img src="" alt="Laptop"></div>'
698
+
699
+ >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
700
+ >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
701
+ >>> img_node = html_parser.css_first('img')
702
+ >>> img_node.insert_before(html_parser2.body.child)
703
+ <div>Get <span alt="Laptop"><div>Test</div><img src="/jpg"> <div></div></span></div>'
704
+ """
705
+ cdef lxb_dom_node_t * new_node
706
+
707
+ if isinstance(value, (str, bytes, unicode)):
708
+ bytes_val = to_bytes(value)
709
+ new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
710
+ &self.parser.document.dom_document,
711
+ <lxb_char_t *> bytes_val, len(bytes_val)
712
+ )
713
+ if new_node == NULL:
714
+ raise SelectolaxError("Can't create a new node")
715
+ lxb_dom_node_insert_before(self.node, new_node)
716
+ elif isinstance(value, LexborNode):
717
+ new_node = lxb_dom_document_import_node(
718
+ &self.parser.document.dom_document,
719
+ <lxb_dom_node_t *> value.node,
720
+ <bint> True
721
+ )
722
+ if new_node == NULL:
723
+ raise SelectolaxError("Can't create a new node")
724
+ lxb_dom_node_insert_before(self.node, <lxb_dom_node_t *> new_node)
725
+ else:
726
+ raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
727
+
728
+ def insert_after(self, str_or_LexborNode value):
729
+ """
730
+ Insert a node after the current Node.
731
+
732
+ Parameters
733
+ ----------
734
+ value : str, bytes or Node
735
+ The text or Node instance to insert after the Node.
736
+ When a text string is passed, it's treated as text. All HTML tags will be escaped.
737
+ Convert and pass the ``Node`` object when you want to work with HTML.
738
+ Does not clone the ``Node`` object.
739
+ All future changes to the passed ``Node`` object will also be taken into account.
740
+
741
+ Examples
742
+ --------
743
+
744
+ >>> tree = LexborHTMLParser('<div>Get <img src="" alt="Laptop"></div>')
745
+ >>> img = tree.css_first('img')
746
+ >>> img.insert_after(img.attributes.get('alt', ''))
747
+ >>> tree.body.child.html
748
+ '<div>Get <img src="" alt="Laptop">Laptop</div>'
749
+
750
+ >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
751
+ >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
752
+ >>> img_node = html_parser.css_first('img')
753
+ >>> img_node.insert_after(html_parser2.body.child)
754
+ <div>Get <span alt="Laptop"><img src="/jpg"><div>Test</div> <div></div></span></div>'
755
+ """
756
+ cdef lxb_dom_node_t * new_node
757
+
758
+ if isinstance(value, (str, bytes, unicode)):
759
+ bytes_val = to_bytes(value)
760
+ new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
761
+ &self.parser.document.dom_document,
762
+ <lxb_char_t *> bytes_val, len(bytes_val)
763
+ )
764
+ if new_node == NULL:
765
+ raise SelectolaxError("Can't create a new node")
766
+ lxb_dom_node_insert_after(self.node, new_node)
767
+ elif isinstance(value, LexborNode):
768
+ new_node = lxb_dom_document_import_node(
769
+ &self.parser.document.dom_document,
770
+ <lxb_dom_node_t *> value.node,
771
+ <bint> True
772
+ )
773
+ if new_node == NULL:
774
+ raise SelectolaxError("Can't create a new node")
775
+ lxb_dom_node_insert_after(self.node, <lxb_dom_node_t *> new_node)
776
+ else:
777
+ raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
778
+
779
+ def insert_child(self, str_or_LexborNode value):
780
+ """
781
+ Insert a node inside (at the end of) the current Node.
782
+
783
+ Parameters
784
+ ----------
785
+ value : str, bytes or Node
786
+ The text or Node instance to insert inside the Node.
787
+ When a text string is passed, it's treated as text. All HTML tags will be escaped.
788
+ Convert and pass the ``Node`` object when you want to work with HTML.
789
+ Does not clone the ``Node`` object.
790
+ All future changes to the passed ``Node`` object will also be taken into account.
791
+
792
+ Examples
793
+ --------
794
+
795
+ >>> tree = LexborHTMLParser('<div>Get <img src=""></div>')
796
+ >>> div = tree.css_first('div')
797
+ >>> div.insert_child('Laptop')
798
+ >>> tree.body.child.html
799
+ '<div>Get <img src="">Laptop</div>'
800
+
801
+ >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"> <div>Laptop</div> </span></div>')
802
+ >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
803
+ >>> span_node = html_parser.css_first('span')
804
+ >>> span_node.insert_child(html_parser2.body.child)
805
+ <div>Get <span alt="Laptop"> <div>Laptop</div> <div>Test</div> </span></div>'
806
+ """
807
+ cdef lxb_dom_node_t * new_node
808
+
809
+ if isinstance(value, (str, bytes, unicode)):
810
+ bytes_val = to_bytes(value)
811
+ new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
812
+ &self.parser.document.dom_document,
813
+ <lxb_char_t *> bytes_val, len(bytes_val)
814
+ )
815
+ if new_node == NULL:
816
+ raise SelectolaxError("Can't create a new node")
817
+ lxb_dom_node_insert_child(self.node, new_node)
818
+ elif isinstance(value, LexborNode):
819
+ new_node = lxb_dom_document_import_node(
820
+ &self.parser.document.dom_document,
821
+ <lxb_dom_node_t *> value.node,
822
+ <bint> True
823
+ )
824
+ if new_node == NULL:
825
+ raise SelectolaxError("Can't create a new node")
826
+ lxb_dom_node_insert_child(self.node, <lxb_dom_node_t *> new_node)
827
+ else:
828
+ raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
829
+
830
+ @property
831
+ def raw_value(self):
832
+ """Return the raw (unparsed, original) value of a node.
833
+
834
+ Currently, works on text nodes only.
835
+
836
+ Returns
837
+ -------
838
+
839
+ raw_value : bytes
840
+
841
+ Examples
842
+ --------
843
+
844
+ >>> html_parser = LexborHTMLParser('<div>&#x3C;test&#x3E;</div>')
845
+ >>> selector = html_parser.css_first('div')
846
+ >>> selector.child.html
847
+ '&lt;test&gt;'
848
+ >>> selector.child.raw_value
849
+ b'&#x3C;test&#x3E;'
850
+ """
851
+ raise NotImplementedError("This features is not supported by the lexbor backend. Please use Modest backend.")
852
+
853
+ def scripts_contain(self, str query):
854
+ """Returns True if any of the script tags contain specified text.
855
+
856
+ Caches script tags on the first call to improve performance.
857
+
858
+ Parameters
859
+ ----------
860
+ query : str
861
+ The query to check.
862
+
863
+ """
864
+ cdef LexborNode node
865
+ if self.parser.cached_script_texts is None:
866
+ nodes = self.parser.selector.find('script', self)
867
+ text_nodes = []
868
+ for node in nodes:
869
+ node_text = node.text(deep=True)
870
+ if node_text:
871
+ text_nodes.append(node_text)
872
+ self.parser.cached_script_texts = text_nodes
873
+
874
+ for text in self.parser.cached_script_texts:
875
+ if query in text:
876
+ return True
877
+ return False
878
+
879
+ def script_srcs_contain(self, tuple queries):
880
+ """Returns True if any of the script SRCs attributes contain on of the specified text.
881
+
882
+ Caches values on the first call to improve performance.
883
+
884
+ Parameters
885
+ ----------
886
+ queries : tuple of str
887
+
888
+ """
889
+ cdef LexborNode node
890
+ if self.parser.cached_script_srcs is None:
891
+ nodes = self.parser.selector.find('script', self)
892
+ src_nodes = []
893
+ for node in nodes:
894
+ node_src = node.attrs.get('src')
895
+ if node_src:
896
+ src_nodes.append(node_src)
897
+ self.parser.cached_script_srcs = src_nodes
898
+
899
+ for text in self.parser.cached_script_srcs:
900
+ for query in queries:
901
+ if query in text:
902
+ return True
903
+ return False
904
+
905
+ def remove(self, bool recursive=True):
906
+ """An alias for the decompose method."""
907
+ self.decompose(recursive)
908
+
909
+ def select(self, query=None):
910
+ """Select nodes given a CSS selector.
911
+
912
+ Works similarly to the the ``css`` method, but supports chained filtering and extra features.
913
+
914
+ Parameters
915
+ ----------
916
+ query : str or None
917
+ The CSS selector to use when searching for nodes.
918
+
919
+ Returns
920
+ -------
921
+ selector : The `Selector` class.
922
+ """
923
+ return LexborSelector(self, query)
924
+
925
+ def __eq__(self, other):
926
+ if isinstance(other, str):
927
+ return self.html == other
928
+ if not isinstance(other, LexborNode):
929
+ return False
930
+ return self.html == other.html
931
+
932
@property
def text_content(self):
    """Returns the text of the node if it is a text node.

    Returns None for other nodes.
    Unlike the ``text`` method, does not include child nodes.

    Returns
    -------
    text : str or None.
        The node's character data decoded with ``_ENCODING``, or ``None``
        when the node is not a text node or carries no data.
    """
    cdef unsigned char *text

    # Guard first: `_is_node_type` also rejects a NULL node pointer, so
    # nothing below dereferences `self.node` before it has been checked.
    if not self._is_node_type(LXB_DOM_NODE_TYPE_TEXT):
        return None

    text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> self.node).data)
    if text == NULL:
        return None
    return text.decode(_ENCODING)
956
+
957
@property
def inner_html(self) -> str | None:
    """Return HTML representation of the child nodes.

    Works similar to innerHTML in JavaScript.
    Unlike the `.html` property, does not include the current node.
    Can be used to set HTML as well. See the setter docstring.

    Any ``<-undef>`` marker present in the serialized output is removed.

    Returns
    -------
    text : str | None
        The serialized markup, or ``None`` when the buffer could not be
        allocated, serialization reported a non-zero status, or no data
        was produced.
    """
    cdef lexbor_str_t *lxb_str
    cdef lxb_status_t status

    lxb_str = lexbor_str_create()
    if lxb_str == NULL:
        # Allocation failed; there is nothing to serialize into.
        return None

    try:
        status = lxb_html_serialize_deep_str(self.node, lxb_str)
        if status == 0 and lxb_str.data:
            return lxb_str.data.decode(_ENCODING).replace('<-undef>', '')
        return None
    finally:
        # Release the buffer on every path (success, failed serialization,
        # or a decoding error) so the lexbor string is never leaked.
        lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)
980
+
981
@inner_html.setter
def inner_html(self, str html) -> None:
    """Set inner HTML to the specified HTML.

    Replaces existing data inside the node.
    Works similar to innerHTML in JavaScript.

    Parameters
    ----------
    html : str
        Markup to place inside the node. It is encoded as UTF-8 before
        being handed to lexbor.

    """
    cdef bytes encoded = <bytes> html.encode("utf-8")
    lxb_html_element_inner_html_set(
        <lxb_html_element_t *> self.node,
        <lxb_char_t *> encoded,
        len(encoded)
    )
999
+
1000
def clone(self) -> LexborNode:
    """Clone the current node.

    You can use it to do temporary modifications without affecting the original HTML tree.

    It is tied to the current parser instance.
    Gets destroyed when parser instance is destroyed.

    Raises
    ------
    RuntimeError
        If lexbor fails to produce the clone.
    """
    cdef lxb_dom_node_t *cloned
    # The second argument (1) requests a deep clone.
    cloned = lxb_dom_node_clone(<lxb_dom_node_t *> self.node, 1)
    if cloned == NULL:
        # Never wrap a NULL pointer: any later access through the returned
        # node would dereference it.
        raise RuntimeError("Can't clone the node")
    return LexborNode.new(cloned, self.parser)
1011
+
1012
cdef inline bint _is_node_type(self, lxb_dom_node_type_t expected_type):
    """Return True when a node is attached and its type equals ``expected_type``."""
    if self.node == NULL:
        return False
    return self.node.type == expected_type
1014
+
1015
@property
def is_element_node(self) -> bool:
    """Whether this node has the type ``LXB_DOM_NODE_TYPE_ELEMENT``."""
    cdef bint matches = self._is_node_type(LXB_DOM_NODE_TYPE_ELEMENT)
    return matches
1019
+
1020
@property
def is_text_node(self) -> bool:
    """Whether this node has the type ``LXB_DOM_NODE_TYPE_TEXT``."""
    cdef bint matches = self._is_node_type(LXB_DOM_NODE_TYPE_TEXT)
    return matches
1024
+
1025
@property
def is_comment_node(self) -> bool:
    """Whether this node has the type ``LXB_DOM_NODE_TYPE_COMMENT``."""
    cdef bint matches = self._is_node_type(LXB_DOM_NODE_TYPE_COMMENT)
    return matches
1029
+
1030
@property
def is_document_node(self) -> bool:
    """Whether this node has the type ``LXB_DOM_NODE_TYPE_DOCUMENT``."""
    cdef bint matches = self._is_node_type(LXB_DOM_NODE_TYPE_DOCUMENT)
    return matches
1034
+
1035
@property
def is_empty_text_node(self) -> bool:
    """Check whether the current node is an empty text node.

    Delegates to ``_is_empty_text_node`` with this node's pointer.

    Returns
    -------
    bool
        ``True`` when the node is a text node and
        ``lxb_dom_node_is_empty`` reports that its parent subtree contains
        only whitespace (or nothing).
    """
    cdef bint empty = self._is_empty_text_node(self.node)
    return empty
1047
+
1048
cdef inline bint _is_empty_text_node(self, lxb_dom_node_t *node):
    """Return True when ``node`` is a text node that lexbor reports as empty.

    A NULL pointer or any non-text node yields False.
    """
    # Reject NULL before touching `node.type`, mirroring the guard used by
    # `_is_node_type`.
    if node == NULL or node.type != LXB_DOM_NODE_TYPE_TEXT:
        return False

    # lexbor's emptiness check walks children of the passed node; for a
    # text node we need to evaluate its parent so the text itself is
    # inspected.
    if node.parent != NULL:
        return lxb_dom_node_is_empty(node.parent)
    return lxb_dom_node_is_empty(node)
1058
+
1059
+
1060
@cython.internal
@cython.final
cdef class TextContainer:
    """Accumulate text fragments into a single string.

    Every appended fragment is followed by ``separator``; the ``text``
    property returns the accumulated string without the final separator.
    """
    # Accumulated text; each appended fragment is followed by `separator`.
    cdef str _text
    # String inserted after every appended fragment.
    cdef str separator
    # When true, each fragment is stripped before being appended.
    cdef bint strip

    @staticmethod
    cdef TextContainer new_with_defaults():
        """Create a container with an empty separator and stripping disabled."""
        cdef TextContainer cls = TextContainer.__new__(TextContainer)
        cls._text = ''
        cls.separator = ''
        cls.strip = False
        return cls

    def __init__(self, str separator = '', bool strip = False):
        """Start an empty container with the given separator and strip flag."""
        self._text = ""
        self.separator = separator
        self.strip = strip

    def append(self, str node_text):
        """Append ``node_text`` (stripped when ``strip`` is set) plus the separator."""
        if self.strip:
            node_text = node_text.strip()
        self._text += node_text + self.separator

    @property
    def text(self):
        """Return the accumulated text without the trailing separator.

        Reading this property does not modify the container, so repeated
        reads return the same value and fragments appended afterwards are
        still separated from the earlier ones.
        """
        if self.separator and self._text and self._text.endswith(self.separator):
            return self._text[:-len(self.separator)]
        return self._text
1091
+
1092
cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
    """Append the text of a text node to the ``TextContainer`` passed via ``ctx``.

    Nodes whose tag id is not ``LXB_TAG__TEXT``, and text nodes without
    data, are skipped. The text is decoded with ``_ENCODING`` using the
    ``"replace"`` error handler.

    Returns ``LEXBOR_ACTION_OK`` after handling or skipping the node, and
    ``LEXBOR_ACTION_STOP`` (with a Python error set) when decoding raised.
    """
    cdef unsigned char *text
    cdef TextContainer container
    cdef lxb_tag_id_t tag_id = lxb_dom_node_tag_id_noi(node)
    if tag_id != LXB_TAG__TEXT:
        return LEXBOR_ACTION_OK

    text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_text_t *> node).char_data.data)
    if not text:
        return LEXBOR_ACTION_OK

    try:
        py_str = text.decode(_ENCODING, "replace")

    except Exception as e:
        # PyErr_SetNone expects an exception *class*; passing the instance
        # is invalid, so hand it the class of the caught exception.
        PyErr_SetNone(type(e))
        return LEXBOR_ACTION_STOP

    container = <TextContainer> ctx
    container.append(py_str)
    return LEXBOR_ACTION_OK