selectolax 0.4.0__cp314-cp314-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of selectolax might be problematic. Click here for more details.

@@ -0,0 +1,1034 @@
1
+ cimport cython
2
+ from cpython.exc cimport PyErr_SetNone
3
+
4
+ import logging
5
+
6
+ logger = logging.getLogger("selectolax")
7
+
8
+ _TAG_TO_NAME = {
9
+ 0x0005: "-doctype",
10
+ 0x0002: "-text",
11
+ 0x0004: "-comment",
12
+ }
13
+ ctypedef fused str_or_LexborNode:
14
+ str
15
+ bytes
16
+ LexborNode
17
+
18
+ ctypedef fused str_or_bytes:
19
+ str
20
+ bytes
21
+
22
cdef inline bytes to_bytes(str_or_LexborNode value):
    """Convert ``value`` to ``bytes``.

    ``str`` values are encoded as UTF-8; ``bytes`` values are returned unchanged.

    Raises
    ------
    TypeError
        If ``value`` is neither ``str`` nor ``bytes`` (e.g. a ``LexborNode``).
    """
    cdef bytes bytes_val
    if isinstance(value, unicode):
        bytes_val = <bytes>value.encode("utf-8")
    elif isinstance(value, bytes):
        bytes_val = <bytes>value
    else:
        # The fused type also admits LexborNode; previously that case fell
        # through and returned a variable that was never assigned.
        raise TypeError("Expected str or bytes, but %s found" % type(value).__name__)
    return bytes_val
29
+
30
+
31
+ @cython.final
32
+ cdef class LexborNode:
33
+ """A class that represents HTML node (element)."""
34
+
35
+ @staticmethod
36
+ cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser):
37
+ cdef LexborNode lxbnode = LexborNode.__new__(LexborNode)
38
+ lxbnode.node = node
39
+ lxbnode.parser = parser
40
+ return lxbnode
41
+
42
+ @property
43
+ def mem_id(self):
44
+ return <size_t> self.node
45
+
46
+ @property
47
+ def child(self):
48
+ """Alias for the `first_child` property.
49
+
50
+ **Deprecated**. Please use `first_child` instead.
51
+ """
52
+ return self.first_child
53
+
54
+ @property
55
+ def first_child(self):
56
+ """Return the first child node."""
57
+ cdef LexborNode node
58
+ if self.node.first_child:
59
+ node = LexborNode.new(<lxb_dom_node_t *> self.node.first_child, self.parser)
60
+ return node
61
+ return None
62
+
63
+ @property
64
+ def parent(self):
65
+ """Return the parent node."""
66
+ cdef LexborNode node
67
+ if self.node.parent != NULL:
68
+ node = LexborNode.new(<lxb_dom_node_t *> self.node.parent, self.parser)
69
+ return node
70
+ return None
71
+
72
+ @property
73
+ def next(self):
74
+ """Return next node."""
75
+ cdef LexborNode node
76
+ if self.node.next != NULL:
77
+ node = LexborNode.new(<lxb_dom_node_t *> self.node.next, self.parser)
78
+ return node
79
+ return None
80
+
81
+ @property
82
+ def prev(self):
83
+ """Return previous node."""
84
+ cdef LexborNode node
85
+ if self.node.prev != NULL:
86
+ node = LexborNode.new(<lxb_dom_node_t *> self.node.prev, self.parser)
87
+ return node
88
+ return None
89
+
90
+ @property
91
+ def last_child(self):
92
+ """Return last child node."""
93
+ cdef LexborNode node
94
+ if self.node.last_child != NULL:
95
+ node = LexborNode.new(<lxb_dom_node_t *> self.node.last_child, self.parser)
96
+ return node
97
+ return None
98
+
99
+ @property
100
+ def html(self):
101
+ """Return HTML representation of the current node including all its child nodes.
102
+
103
+ Returns
104
+ -------
105
+ text : str
106
+ """
107
+ cdef lexbor_str_t *lxb_str
108
+ cdef lxb_status_t status
109
+
110
+ lxb_str = lexbor_str_create()
111
+ status = lxb_html_serialize_tree_str(self.node, lxb_str)
112
+ if status == 0 and lxb_str.data:
113
+ html = lxb_str.data.decode(_ENCODING).replace('<-undef>', '')
114
+ lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)
115
+ return html
116
+ return None
117
+
118
+ def __hash__(self):
119
+ return self.mem_id
120
+
121
+ def text_lexbor(self):
122
+ """Returns the text of the node including text of all its child nodes.
123
+
124
+ Uses builtin method from lexbor.
125
+ """
126
+
127
+ cdef size_t str_len = 0
128
+ cdef lxb_char_t * text
129
+
130
+ text = lxb_dom_node_text_content(self.node, &str_len)
131
+ if <int>str_len == 0:
132
+ raise RuntimeError("Can't extract text")
133
+
134
+ unicode_text = text.decode(_ENCODING)
135
+ lxb_dom_document_destroy_text_noi(self.node.owner_document, text)
136
+ return unicode_text
137
+
138
+ def text(self, bool deep=True, str separator='', bool strip=False):
139
+ """Returns the text of the node including text of all its child nodes.
140
+
141
+ Parameters
142
+ ----------
143
+ strip : bool, default False
144
+ If true, calls ``str.strip()`` on each text part to remove extra white spaces.
145
+ separator : str, default ''
146
+ The separator to use when joining text from different nodes.
147
+ deep : bool, default True
148
+ If True, includes text from all child nodes.
149
+
150
+ Returns
151
+ -------
152
+ text : str
153
+
154
+ """
155
+ cdef unsigned char * text
156
+ cdef lxb_dom_node_t* node = <lxb_dom_node_t*> self.node.first_child
157
+
158
+ if not deep:
159
+ container = TextContainer(separator, strip)
160
+ if self.node != NULL and self.node.type == LXB_DOM_NODE_TYPE_TEXT:
161
+ text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> self.node).data)
162
+ if text != NULL:
163
+ py_text = text.decode(_ENCODING)
164
+ container.append(py_text)
165
+
166
+ while node != NULL:
167
+ if node.type == LXB_DOM_NODE_TYPE_TEXT:
168
+ text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> node).data)
169
+ if text != NULL:
170
+ py_text = text.decode(_ENCODING)
171
+ container.append(py_text)
172
+ node = node.next
173
+ return container.text
174
+ else:
175
+ container = TextContainer(separator, strip)
176
+ if self.node.type == LXB_DOM_NODE_TYPE_TEXT:
177
+ text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> self.node).data)
178
+ if text != NULL:
179
+ container.append(text.decode(_ENCODING))
180
+
181
+ lxb_dom_node_simple_walk(
182
+ <lxb_dom_node_t *> self.node,
183
+ <lxb_dom_node_simple_walker_f>text_callback,
184
+ <void *>container
185
+ )
186
+ return container.text
187
+
188
+ def css(self, str query):
189
+ """Evaluate CSS selector against current node and its child nodes.
190
+
191
+ Matches pattern `query` against HTML tree.
192
+ `CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
193
+
194
+ Special selectors:
195
+
196
+ - parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
197
+ - parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
198
+
199
+
200
+ Parameters
201
+ ----------
202
+ query : str
203
+ CSS selector (e.g. "div > :nth-child(2n+1):not(:has(a))").
204
+
205
+ Returns
206
+ -------
207
+ selector : list of `Node` objects
208
+ """
209
+ return self.parser.selector.find(query, self)
210
+
211
+ def css_first(self, str query, default=None, bool strict=False):
212
+ """Same as `css` but returns only the first match.
213
+
214
+ When `strict=False` stops at the first match. Works faster.
215
+
216
+ Parameters
217
+ ----------
218
+
219
+ query : str
220
+ default : Any, default None
221
+ Default value to return if there is no match.
222
+ strict: bool, default False
223
+ Set to True if you want to check if there is strictly only one match in the document.
224
+
225
+
226
+ Returns
227
+ -------
228
+ selector : `LexborNode` object
229
+ """
230
+ if strict:
231
+ results = self.parser.selector.find(query, self)
232
+ else:
233
+ results = self.parser.selector.find_first(query, self)
234
+ n_results = len(results)
235
+ if n_results > 0:
236
+ if strict and n_results > 1:
237
+ raise ValueError("Expected 1 match, but found %s matches" % n_results)
238
+ return results[0]
239
+ return default
240
+
241
+ def any_css_matches(self, tuple selectors):
242
+ """Returns True if any of CSS selectors matches a node"""
243
+ for selector in selectors:
244
+ if self.parser.selector.any_matches(selector, self):
245
+ return True
246
+ return False
247
+
248
+ def css_matches(self, str selector):
249
+ """Returns True if CSS selector matches a node."""
250
+ return bool(self.parser.selector.any_matches(selector, self))
251
+
252
+ def __repr__(self):
253
+ return '<LexborNode %s>' % self.tag
254
+
255
+ @property
256
+ def tag_id(self):
257
+ cdef lxb_tag_id_t tag_id = lxb_dom_node_tag_id_noi(self.node)
258
+ return tag_id
259
+
260
+ @property
261
+ def tag(self):
262
+ """Return the name of the current tag (e.g. div, p, img).
263
+
264
+ For for non-tag nodes, returns the following names:
265
+
266
+ * `-text` - text node
267
+ * `-document` - document node
268
+ * `-comment` - comment node
269
+
270
+ This
271
+
272
+ Returns
273
+ -------
274
+ text : str
275
+ """
276
+
277
+ cdef lxb_char_t *c_text
278
+ cdef size_t str_len = 0
279
+ if self.tag_id in [LXB_TAG__EM_DOCTYPE, LXB_TAG__TEXT, LXB_TAG__EM_COMMENT]:
280
+ return _TAG_TO_NAME[self.tag_id]
281
+ c_text = lxb_dom_element_qualified_name(<lxb_dom_element_t *> self.node, &str_len)
282
+ text = None
283
+ if c_text:
284
+ text = c_text.decode(_ENCODING)
285
+ return text
286
+
287
+ def decompose(self, bool recursive=True):
288
+ """Remove the current node from the tree.
289
+
290
+ Parameters
291
+ ----------
292
+ recursive : bool, default True
293
+ Whenever to delete all its child nodes
294
+
295
+ Examples
296
+ --------
297
+
298
+ >>> tree = LexborHTMLParser(html)
299
+ >>> for tag in tree.css('script'):
300
+ >>> tag.decompose()
301
+
302
+ """
303
+ if self.node == <lxb_dom_node_t *> lxb_dom_document_root(&self.parser.document.dom_document):
304
+ raise SelectolaxError("Decomposing the root node is not allowed.")
305
+
306
+ if recursive:
307
+ node_remove_deep(<lxb_dom_node_t *> self.node)
308
+ else:
309
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
310
+
311
+ def strip_tags(self, list tags, bool recursive = False):
312
+ """Remove specified tags from the HTML tree.
313
+
314
+ Parameters
315
+ ----------
316
+ tags : list
317
+ List of tags to remove.
318
+ recursive : bool, default True
319
+ Whenever to delete all its child nodes
320
+
321
+ Examples
322
+ --------
323
+
324
+ >>> tree = LexborHTMLParser('<html><head></head><body><script></script><div>Hello world!</div></body></html>')
325
+ >>> tags = ['head', 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes']
326
+ >>> tree.strip_tags(tags)
327
+ >>> tree.html
328
+ '<html><body><div>Hello world!</div></body></html>'
329
+
330
+ """
331
+ cdef LexborNode element
332
+ for tag in tags:
333
+ for element in self.css(tag):
334
+ element.decompose(recursive=recursive)
335
+
336
+ @property
337
+ def attributes(self):
338
+ """Get all attributes that belong to the current node.
339
+
340
+ The value of empty attributes is None.
341
+
342
+ Returns
343
+ -------
344
+ attributes : dictionary of all attributes.
345
+
346
+ Examples
347
+ --------
348
+
349
+ >>> tree = LexborHTMLParser("<div data id='my_id'></div>")
350
+ >>> node = tree.css_first('div')
351
+ >>> node.attributes
352
+ {'data': None, 'id': 'my_id'}
353
+ """
354
+ cdef lxb_dom_attr_t *attr = lxb_dom_element_first_attribute_noi(<lxb_dom_element_t *> self.node)
355
+ cdef size_t str_len = 0
356
+ attributes = dict()
357
+
358
+ if self.node.type != LXB_DOM_NODE_TYPE_ELEMENT:
359
+ return attributes
360
+
361
+ while attr != NULL:
362
+ key = lxb_dom_attr_local_name_noi(attr, &str_len)
363
+ value = lxb_dom_attr_value_noi(attr, &str_len)
364
+
365
+ if value:
366
+ py_value = value.decode(_ENCODING)
367
+ else:
368
+ py_value = None
369
+ attributes[key.decode(_ENCODING)] = py_value
370
+
371
+ attr = attr.next
372
+ return attributes
373
+
374
+ @property
375
+ def attrs(self):
376
+ """A dict-like object that is similar to the ``attributes`` property, but operates directly on the Node data.
377
+
378
+ .. warning:: Use ``attributes`` instead, if you don't want to modify Node attributes.
379
+
380
+ Returns
381
+ -------
382
+ attributes : Attributes mapping object.
383
+
384
+ Examples
385
+ --------
386
+
387
+ >>> tree = LexborHTMLParser("<div id='a'></div>")
388
+ >>> node = tree.css_first('div')
389
+ >>> node.attrs
390
+ <div attributes, 1 items>
391
+ >>> node.attrs['id']
392
+ 'a'
393
+ >>> node.attrs['foo'] = 'bar'
394
+ >>> del node.attrs['id']
395
+ >>> node.attributes
396
+ {'foo': 'bar'}
397
+ >>> node.attrs['id'] = 'new_id'
398
+ >>> node.html
399
+ '<div foo="bar" id="new_id"></div>'
400
+ """
401
+ cdef LexborAttributes attributes = LexborAttributes.create(<lxb_dom_node_t *>self.node)
402
+ return attributes
403
+
404
+ @property
405
+ def id(self):
406
+ """Get the id attribute of the node.
407
+
408
+ Returns None if id does not set.
409
+
410
+ Returns
411
+ -------
412
+ text : str
413
+ """
414
+ cdef char * key = 'id'
415
+ cdef size_t str_len
416
+ cdef lxb_dom_attr_t * attr = lxb_dom_element_attr_by_name(
417
+ <lxb_dom_element_t *> self.node,
418
+ <lxb_char_t *> key, 2
419
+ )
420
+ if attr != NULL:
421
+ value = lxb_dom_attr_value_noi(attr, &str_len)
422
+ return value.decode(_ENCODING) if value else None
423
+ return None
424
+
425
+ def iter(self, include_text=False):
426
+ """Iterate over nodes on the current level.
427
+
428
+ Parameters
429
+ ----------
430
+ include_text : bool
431
+ If True, includes text nodes as well.
432
+
433
+ Yields
434
+ -------
435
+ node
436
+ """
437
+
438
+ cdef lxb_dom_node_t *node = self.node.first_child
439
+ cdef LexborNode next_node
440
+
441
+ while node != NULL:
442
+ if node.type == LXB_DOM_NODE_TYPE_TEXT and not include_text:
443
+ node = node.next
444
+ continue
445
+
446
+ next_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
447
+ yield next_node
448
+ node = node.next
449
+
450
+ def unwrap(self, bint delete_empty=False):
451
+ """Replace node with whatever is inside this node.
452
+
453
+ Does nothing if you perform unwrapping second time on the same node.
454
+
455
+ Parameters
456
+ ----------
457
+ delete_empty : bool, default False
458
+ If True, removes empty tags.
459
+
460
+ Examples
461
+ --------
462
+
463
+ >>> tree = LexborHTMLParser("<div>Hello <i>world</i>!</div>")
464
+ >>> tree.css_first('i').unwrap()
465
+ >>> tree.html
466
+ '<html><head></head><body><div>Hello world!</div></body></html>'
467
+
468
+ Note: by default, empty tags are ignored, use "delete_empty" to change this.
469
+ """
470
+
471
+ if node_is_removed(<lxb_dom_node_t *> self.node) == 1:
472
+ logger.error("Attempt to unwrap removed node. Does nothing.")
473
+ return
474
+
475
+ if self.node.first_child == NULL:
476
+ if delete_empty:
477
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
478
+ return
479
+ cdef lxb_dom_node_t* next_node
480
+ cdef lxb_dom_node_t* current_node
481
+
482
+ if self.node.first_child.next != NULL:
483
+ current_node = self.node.first_child
484
+ next_node = current_node.next
485
+
486
+ while next_node != NULL:
487
+ next_node = current_node.next
488
+ lxb_dom_node_insert_before(self.node, current_node)
489
+ current_node = next_node
490
+ else:
491
+ lxb_dom_node_insert_before(self.node, self.node.first_child)
492
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
493
+
494
+ def unwrap_tags(self, list tags, bint delete_empty = False):
495
+ """Unwraps specified tags from the HTML tree.
496
+
497
+ Works the same as the ``unwrap`` method, but applied to a list of tags.
498
+
499
+ Parameters
500
+ ----------
501
+ tags : list
502
+ List of tags to remove.
503
+ delete_empty : bool, default False
504
+ If True, removes empty tags.
505
+
506
+ Examples
507
+ --------
508
+
509
+ >>> tree = LexborHTMLParser("<div><a href="">Hello</a> <i>world</i>!</div>")
510
+ >>> tree.body.unwrap_tags(['i','a'])
511
+ >>> tree.body.html
512
+ '<body><div>Hello world!</div></body>'
513
+
514
+ Note: by default, empty tags are ignored, use "delete_empty" to change this.
515
+ """
516
+ cdef LexborNode element
517
+ for tag in tags:
518
+ for element in self.css(tag):
519
+ element.unwrap(delete_empty)
520
+
521
+ def merge_text_nodes(self):
522
+ """Iterates over all text nodes and merges all text nodes that are close to each other.
523
+
524
+ This is useful for text extraction.
525
+ Use it when you need to strip HTML tags and merge "dangling" text.
526
+
527
+ Examples
528
+ --------
529
+
530
+ >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
531
+ >>> node = tree.css_first('div')
532
+ >>> tree.unwrap_tags(["strong"])
533
+ >>> tree.text(deep=True, separator=" ", strip=True)
534
+ "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
535
+ >>> node.merge_text_nodes()
536
+ >>> tree.text(deep=True, separator=" ", strip=True)
537
+ "John Doe"
538
+ """
539
+ cdef lxb_dom_node_t *node = self.node.first_child
540
+ cdef lxb_dom_node_t *next_node
541
+ cdef lxb_char_t *left_text
542
+ cdef lxb_char_t *right_text
543
+ cdef size_t left_length, right_length
544
+
545
+ while node != NULL:
546
+ next_node = node.next
547
+ if node.type == LXB_DOM_NODE_TYPE_TEXT and node.prev and node.prev.type == LXB_DOM_NODE_TYPE_TEXT:
548
+ left_text = lxb_dom_node_text_content(node.prev, &left_length)
549
+ right_text = lxb_dom_node_text_content(node, &right_length)
550
+ if left_text and right_text:
551
+ combined = (<bytes>left_text[:left_length]) + (<bytes>right_text[:right_length])
552
+ lxb_dom_node_text_content_set(node, combined, len(combined))
553
+ lxb_dom_node_remove(node.prev)
554
+
555
+ if left_text is not NULL:
556
+ lxb_dom_document_destroy_text_noi(self.node.owner_document, left_text)
557
+ if right_text is not NULL:
558
+ lxb_dom_document_destroy_text_noi(self.node.owner_document, right_text)
559
+
560
+ if node.first_child:
561
+ LexborNode.new(node, self.parser).merge_text_nodes()
562
+ node = next_node
563
+
564
+ def traverse(self, include_text=False):
565
+ """Iterate over all child and next nodes starting from the current level.
566
+
567
+ Parameters
568
+ ----------
569
+ include_text : bool
570
+ If True, includes text nodes as well.
571
+
572
+ Yields
573
+ -------
574
+ node
575
+ """
576
+ cdef lxb_dom_node_t * root = self.node
577
+ cdef lxb_dom_node_t * node = root
578
+ cdef LexborNode lxb_node
579
+
580
+ while node != NULL:
581
+ if not (not include_text and node.type == LXB_DOM_NODE_TYPE_TEXT):
582
+ lxb_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
583
+ yield lxb_node
584
+
585
+ if node.first_child != NULL:
586
+ node = node.first_child
587
+ else:
588
+ while node != root and node.next == NULL:
589
+ node = node.parent
590
+ if node == root:
591
+ break
592
+ node = node.next
593
+
594
+ def replace_with(self, str_or_LexborNode value):
595
+ """Replace current Node with specified value.
596
+
597
+ Parameters
598
+ ----------
599
+ value : str, bytes or Node
600
+ The text or Node instance to replace the Node with.
601
+ When a text string is passed, it's treated as text. All HTML tags will be escaped.
602
+ Convert and pass the ``Node`` object when you want to work with HTML.
603
+ Does not clone the ``Node`` object.
604
+ All future changes to the passed ``Node`` object will also be taken into account.
605
+
606
+ Examples
607
+ --------
608
+
609
+ >>> tree = LexborHTMLParser('<div>Get <img src="" alt="Laptop"></div>')
610
+ >>> img = tree.css_first('img')
611
+ >>> img.replace_with(img.attributes.get('alt', ''))
612
+ >>> tree.body.child.html
613
+ '<div>Get Laptop</div>'
614
+
615
+ >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
616
+ >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
617
+ >>> img_node = html_parser.css_first('img')
618
+ >>> img_node.replace_with(html_parser2.body.child)
619
+ '<div>Get <span alt="Laptop"><div>Test</div> <div></div></span></div>'
620
+ """
621
+ cdef lxb_dom_node_t * new_node
622
+
623
+ if isinstance(value, (str, bytes, unicode)):
624
+ bytes_val = to_bytes(value)
625
+ new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
626
+ &self.parser.document.dom_document,
627
+ <lxb_char_t *> bytes_val, len(bytes_val)
628
+ )
629
+ if new_node == NULL:
630
+ raise SelectolaxError("Can't create a new node")
631
+ lxb_dom_node_insert_before(self.node, new_node)
632
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
633
+ elif isinstance(value, LexborNode):
634
+ new_node = lxb_dom_document_import_node(
635
+ &self.parser.document.dom_document,
636
+ <lxb_dom_node_t *> value.node,
637
+ <bint> True
638
+ )
639
+ if new_node == NULL:
640
+ raise SelectolaxError("Can't create a new node")
641
+ lxb_dom_node_insert_before(self.node, <lxb_dom_node_t *> new_node)
642
+ lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
643
+ else:
644
+ raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
645
+
646
+ def insert_before(self, str_or_LexborNode value):
647
+ """
648
+ Insert a node before the current Node.
649
+
650
+ Parameters
651
+ ----------
652
+ value : str, bytes or Node
653
+ The text or Node instance to insert before the Node.
654
+ When a text string is passed, it's treated as text. All HTML tags will be escaped.
655
+ Convert and pass the ``Node`` object when you want to work with HTML.
656
+ Does not clone the ``Node`` object.
657
+ All future changes to the passed ``Node`` object will also be taken into account.
658
+
659
+ Examples
660
+ --------
661
+
662
+ >>> tree = LexborHTMLParser('<div>Get <img src="" alt="Laptop"></div>')
663
+ >>> img = tree.css_first('img')
664
+ >>> img.insert_before(img.attributes.get('alt', ''))
665
+ >>> tree.body.child.html
666
+ '<div>Get Laptop<img src="" alt="Laptop"></div>'
667
+
668
+ >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
669
+ >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
670
+ >>> img_node = html_parser.css_first('img')
671
+ >>> img_node.insert_before(html_parser2.body.child)
672
+ <div>Get <span alt="Laptop"><div>Test</div><img src="/jpg"> <div></div></span></div>'
673
+ """
674
+ cdef lxb_dom_node_t * new_node
675
+
676
+ if isinstance(value, (str, bytes, unicode)):
677
+ bytes_val = to_bytes(value)
678
+ new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
679
+ &self.parser.document.dom_document,
680
+ <lxb_char_t *> bytes_val, len(bytes_val)
681
+ )
682
+ if new_node == NULL:
683
+ raise SelectolaxError("Can't create a new node")
684
+ lxb_dom_node_insert_before(self.node, new_node)
685
+ elif isinstance(value, LexborNode):
686
+ new_node = lxb_dom_document_import_node(
687
+ &self.parser.document.dom_document,
688
+ <lxb_dom_node_t *> value.node,
689
+ <bint> True
690
+ )
691
+ if new_node == NULL:
692
+ raise SelectolaxError("Can't create a new node")
693
+ lxb_dom_node_insert_before(self.node, <lxb_dom_node_t *> new_node)
694
+ else:
695
+ raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
696
+
697
+ def insert_after(self, str_or_LexborNode value):
698
+ """
699
+ Insert a node after the current Node.
700
+
701
+ Parameters
702
+ ----------
703
+ value : str, bytes or Node
704
+ The text or Node instance to insert after the Node.
705
+ When a text string is passed, it's treated as text. All HTML tags will be escaped.
706
+ Convert and pass the ``Node`` object when you want to work with HTML.
707
+ Does not clone the ``Node`` object.
708
+ All future changes to the passed ``Node`` object will also be taken into account.
709
+
710
+ Examples
711
+ --------
712
+
713
+ >>> tree = LexborHTMLParser('<div>Get <img src="" alt="Laptop"></div>')
714
+ >>> img = tree.css_first('img')
715
+ >>> img.insert_after(img.attributes.get('alt', ''))
716
+ >>> tree.body.child.html
717
+ '<div>Get <img src="" alt="Laptop">Laptop</div>'
718
+
719
+ >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
720
+ >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
721
+ >>> img_node = html_parser.css_first('img')
722
+ >>> img_node.insert_after(html_parser2.body.child)
723
+ <div>Get <span alt="Laptop"><img src="/jpg"><div>Test</div> <div></div></span></div>'
724
+ """
725
+ cdef lxb_dom_node_t * new_node
726
+
727
+ if isinstance(value, (str, bytes, unicode)):
728
+ bytes_val = to_bytes(value)
729
+ new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
730
+ &self.parser.document.dom_document,
731
+ <lxb_char_t *> bytes_val, len(bytes_val)
732
+ )
733
+ if new_node == NULL:
734
+ raise SelectolaxError("Can't create a new node")
735
+ lxb_dom_node_insert_after(self.node, new_node)
736
+ elif isinstance(value, LexborNode):
737
+ new_node = lxb_dom_document_import_node(
738
+ &self.parser.document.dom_document,
739
+ <lxb_dom_node_t *> value.node,
740
+ <bint> True
741
+ )
742
+ if new_node == NULL:
743
+ raise SelectolaxError("Can't create a new node")
744
+ lxb_dom_node_insert_after(self.node, <lxb_dom_node_t *> new_node)
745
+ else:
746
+ raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
747
+
748
+ def insert_child(self, str_or_LexborNode value):
749
+ """
750
+ Insert a node inside (at the end of) the current Node.
751
+
752
+ Parameters
753
+ ----------
754
+ value : str, bytes or Node
755
+ The text or Node instance to insert inside the Node.
756
+ When a text string is passed, it's treated as text. All HTML tags will be escaped.
757
+ Convert and pass the ``Node`` object when you want to work with HTML.
758
+ Does not clone the ``Node`` object.
759
+ All future changes to the passed ``Node`` object will also be taken into account.
760
+
761
+ Examples
762
+ --------
763
+
764
+ >>> tree = LexborHTMLParser('<div>Get <img src=""></div>')
765
+ >>> div = tree.css_first('div')
766
+ >>> div.insert_child('Laptop')
767
+ >>> tree.body.child.html
768
+ '<div>Get <img src="">Laptop</div>'
769
+
770
+ >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"> <div>Laptop</div> </span></div>')
771
+ >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
772
+ >>> span_node = html_parser.css_first('span')
773
+ >>> span_node.insert_child(html_parser2.body.child)
774
+ <div>Get <span alt="Laptop"> <div>Laptop</div> <div>Test</div> </span></div>'
775
+ """
776
+ cdef lxb_dom_node_t * new_node
777
+
778
+ if isinstance(value, (str, bytes, unicode)):
779
+ bytes_val = to_bytes(value)
780
+ new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
781
+ &self.parser.document.dom_document,
782
+ <lxb_char_t *> bytes_val, len(bytes_val)
783
+ )
784
+ if new_node == NULL:
785
+ raise SelectolaxError("Can't create a new node")
786
+ lxb_dom_node_insert_child(self.node, new_node)
787
+ elif isinstance(value, LexborNode):
788
+ new_node = lxb_dom_document_import_node(
789
+ &self.parser.document.dom_document,
790
+ <lxb_dom_node_t *> value.node,
791
+ <bint> True
792
+ )
793
+ if new_node == NULL:
794
+ raise SelectolaxError("Can't create a new node")
795
+ lxb_dom_node_insert_child(self.node, <lxb_dom_node_t *> new_node)
796
+ else:
797
+ raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
798
+
799
+ @property
800
+ def raw_value(self):
801
+ """Return the raw (unparsed, original) value of a node.
802
+
803
+ Currently, works on text nodes only.
804
+
805
+ Returns
806
+ -------
807
+
808
+ raw_value : bytes
809
+
810
+ Examples
811
+ --------
812
+
813
+ >>> html_parser = LexborHTMLParser('<div>&#x3C;test&#x3E;</div>')
814
+ >>> selector = html_parser.css_first('div')
815
+ >>> selector.child.html
816
+ '&lt;test&gt;'
817
+ >>> selector.child.raw_value
818
+ b'&#x3C;test&#x3E;'
819
+ """
820
+ raise NotImplementedError("This features is not supported by the lexbor backend. Please use Modest backend.")
821
+
822
+ def scripts_contain(self, str query):
823
+ """Returns True if any of the script tags contain specified text.
824
+
825
+ Caches script tags on the first call to improve performance.
826
+
827
+ Parameters
828
+ ----------
829
+ query : str
830
+ The query to check.
831
+
832
+ """
833
+ cdef LexborNode node
834
+ if self.parser.cached_script_texts is None:
835
+ nodes = self.parser.selector.find('script', self)
836
+ text_nodes = []
837
+ for node in nodes:
838
+ node_text = node.text(deep=True)
839
+ if node_text:
840
+ text_nodes.append(node_text)
841
+ self.parser.cached_script_texts = text_nodes
842
+
843
+ for text in self.parser.cached_script_texts:
844
+ if query in text:
845
+ return True
846
+ return False
847
+
848
+ def script_srcs_contain(self, tuple queries):
849
+ """Returns True if any of the script SRCs attributes contain on of the specified text.
850
+
851
+ Caches values on the first call to improve performance.
852
+
853
+ Parameters
854
+ ----------
855
+ queries : tuple of str
856
+
857
+ """
858
+ cdef LexborNode node
859
+ if self.parser.cached_script_srcs is None:
860
+ nodes = self.parser.selector.find('script', self)
861
+ src_nodes = []
862
+ for node in nodes:
863
+ node_src = node.attrs.get('src')
864
+ if node_src:
865
+ src_nodes.append(node_src)
866
+ self.parser.cached_script_srcs = src_nodes
867
+
868
+ for text in self.parser.cached_script_srcs:
869
+ for query in queries:
870
+ if query in text:
871
+ return True
872
+ return False
873
+
874
+ def remove(self, bool recursive=True):
875
+ """An alias for the decompose method."""
876
+ self.decompose(recursive)
877
+
878
+ def select(self, query=None):
879
+ """Select nodes given a CSS selector.
880
+
881
+ Works similarly to the the ``css`` method, but supports chained filtering and extra features.
882
+
883
+ Parameters
884
+ ----------
885
+ query : str or None
886
+ The CSS selector to use when searching for nodes.
887
+
888
+ Returns
889
+ -------
890
+ selector : The `Selector` class.
891
+ """
892
+ return LexborSelector(self, query)
893
+
894
+ def __eq__(self, other):
895
+ if isinstance(other, str):
896
+ return self.html == other
897
+ if not isinstance(other, LexborNode):
898
+ return False
899
+ return self.html == other.html
900
+
901
+ @property
902
+ def text_content(self):
903
+ """Returns the text of the node if it is a text node.
904
+
905
+ Returns None for other nodes.
906
+ Unlike the ``text`` method, does not include child nodes.
907
+
908
+ Returns
909
+ -------
910
+ text : str or None.
911
+ """
912
+ cdef unsigned char * text
913
+ cdef lxb_dom_node_t* node = <lxb_dom_node_t*> self.node.first_child
914
+ cdef TextContainer container
915
+ if self.node == NULL or self.node.type != LXB_DOM_NODE_TYPE_TEXT:
916
+ return None
917
+
918
+ text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> self.node).data)
919
+ if text != NULL:
920
+ container = TextContainer.new_with_defaults()
921
+ py_text = text.decode(_ENCODING)
922
+ container.append(py_text)
923
+ return container.text
924
+
925
+ @property
926
+ def inner_html(self) -> str:
927
+ """Return HTML representation of the child nodes.
928
+
929
+ Works similar to innerHTML in JavaScript.
930
+ Unlike the `.html` property, does not include the current node.
931
+ Can be used to set HTML as well. See the setter docstring.
932
+
933
+ Returns
934
+ -------
935
+ text : str | None
936
+ """
937
+
938
+ cdef lexbor_str_t *lxb_str
939
+ cdef lxb_status_t status
940
+
941
+ lxb_str = lexbor_str_create()
942
+ status = lxb_html_serialize_deep_str(self.node, lxb_str)
943
+ if status == 0 and lxb_str.data:
944
+ html = lxb_str.data.decode(_ENCODING).replace('<-undef>', '')
945
+ lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)
946
+ return html
947
+ return None
948
+
949
+ @inner_html.setter
950
+ def inner_html(self, str html):
951
+ """Set inner HTML to the specified HTML.
952
+
953
+ Replaces existing data inside the node.
954
+ Works similar to innerHTML in JavaScript.
955
+
956
+ Parameters
957
+ ----------
958
+ html : str | None
959
+
960
+ """
961
+ cdef bytes bytes_val
962
+ bytes_val = <bytes>html.encode("utf-8")
963
+ lxb_html_element_inner_html_set(
964
+ <lxb_html_element_t *>self.node,
965
+ <lxb_char_t *> bytes_val, len(bytes_val)
966
+ )
967
+
968
+ def clone(self) -> LexborNode:
969
+ """Clone the current node.
970
+
971
+ You can use to do temporary modifications without affecting the original HTML tree.
972
+
973
+ It is tied to the current parser instance.
974
+ Gets destroyed when parser instance is destroyed.
975
+ """
976
+ cdef lxb_dom_node_t* node
977
+ node = lxb_dom_node_clone(<lxb_dom_node_t *> self.node, 1)
978
+ return LexborNode.new(node, self.parser)
979
+
980
+
981
@cython.internal
@cython.final
cdef class TextContainer:
    """Collects text fragments and joins them with a separator.

    Fragments are kept in a list and joined on demand, so reading ``text``
    does not change the container and can be repeated safely.
    """
    # Collected fragments, already stripped when ``strip`` is set.
    cdef list _parts
    # String placed between fragments when they are joined.
    cdef str separator
    # Whether ``str.strip()`` is applied to each fragment on append.
    cdef bint strip

    @staticmethod
    cdef TextContainer new_with_defaults():
        """Create a container with an empty separator and no stripping."""
        cdef TextContainer cls = TextContainer.__new__(TextContainer)
        cls._parts = []
        cls.separator = ''
        cls.strip = False
        return cls

    def __init__(self, str separator = '', bool strip = False):
        self._parts = []
        self.separator = separator
        self.strip = strip

    def append(self, str node_text):
        """Add one text fragment, stripping it first if requested."""
        if self.strip:
            self._parts.append(node_text.strip())
        else:
            self._parts.append(node_text)

    @property
    def text(self):
        """Return all fragments joined by the separator ('' when nothing was appended)."""
        if not self._parts:
            return ''
        return self.separator.join(self._parts)
1012
+
1013
+
1014
cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
    """Walker callback that appends the text of text nodes to a ``TextContainer``.

    ``ctx`` is cast to ``TextContainer``. Nodes whose tag id is not
    ``LXB_TAG__TEXT``, and text nodes without data, are skipped.
    Returns ``LEXBOR_ACTION_STOP`` if decoding raises, ``LEXBOR_ACTION_OK`` otherwise.
    """
    cdef unsigned char *text
    cdef TextContainer container
    cdef lxb_tag_id_t tag_id = lxb_dom_node_tag_id_noi(node)

    if tag_id != LXB_TAG__TEXT:
        return LEXBOR_ACTION_OK

    text = <unsigned char*> lexbor_str_data_noi(&(<lxb_dom_text_t *> node).char_data.data)
    if not text:
        return LEXBOR_ACTION_OK

    try:
        py_str = text.decode(_ENCODING, "replace")
    except Exception as e:
        # NOTE(review): PyErr_SetNone is documented to take an exception type,
        # while ``e`` is an instance — confirm this sets the intended error.
        PyErr_SetNone(e)
        return LEXBOR_ACTION_STOP

    container = <TextContainer> ctx
    container.append(py_str)
    return LEXBOR_ACTION_OK