selectolax 0.3.30__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of selectolax might be problematic. Click here for more details.

@@ -0,0 +1,879 @@
1
+ cimport cython
2
+
3
# Display names for node kinds that have no element name of their own.
# The ``LexborNode.tag`` property indexes this table with the node's tag id
# when that id is LXB_TAG__EM_DOCTYPE, LXB_TAG__TEXT or LXB_TAG__EM_COMMENT.
# NOTE(review): "- doctype" contains a space, unlike the other two names --
# confirm whether that is intentional; callers may compare against the exact
# string, so it is left untouched here.
_TAG_TO_NAME = {
    0x0005: "- doctype",
    0x0002: "-text",
    0x0004: "-comment",
}
8
# Fused argument type used by ``to_bytes`` and by the node-insertion methods
# (``replace_with``, ``insert_before``, ``insert_after``, ``insert_child``):
# the value is either text (str / bytes) or another LexborNode.
ctypedef fused str_or_LexborNode:
    str
    bytes
    LexborNode
12
+
13
# Convert a text argument to ``bytes``: ``unicode`` values are encoded as
# UTF-8, ``bytes`` values are passed through unchanged.
# NOTE(review): no branch assigns ``bytes_val`` when ``value`` is a
# LexborNode; the callers visible in this file only call this helper after an
# ``isinstance(value, (str, bytes, unicode))`` check.
cdef inline bytes to_bytes(str_or_LexborNode value):
    cdef bytes bytes_val
    if isinstance(value, unicode):
        bytes_val = <bytes>value.encode("utf-8")
    elif isinstance(value, bytes):
        bytes_val = <bytes>value
    return bytes_val
20
+
21
+ @cython.final
22
+ cdef class LexborNode:
23
+ """A class that represents HTML node (element)."""
24
+
25
# Bind this wrapper to a C DOM node and to the parser object it belongs to.
# Returns ``self`` so the call can be chained after ``LexborNode()``.
cdef _cinit(self, lxb_dom_node_t *node, LexborHTMLParser parser):
    self.parser = parser
    self.node = node
    return self
29
+
30
@property
def mem_id(self):
    """Return the address of the underlying C node as an integer."""
    return <size_t> self.node
33
+
34
@property
def child(self):
    """Alias for the `first_child` property.

    Returns whatever `first_child` returns: a node, or None when there is no child.
    """
    return self.first_child
38
+
39
@property
def first_child(self):
    """Return the first child node, or None when this node has no children."""
    cdef LexborNode wrapper
    if self.node.first_child == NULL:
        return None
    wrapper = LexborNode()
    wrapper._cinit(<lxb_dom_node_t *> self.node.first_child, self.parser)
    return wrapper
48
+
49
@property
def parent(self):
    """Return the parent node, or None when this node has no parent."""
    cdef LexborNode wrapper
    if self.node.parent == NULL:
        return None
    wrapper = LexborNode()
    wrapper._cinit(<lxb_dom_node_t *> self.node.parent, self.parser)
    return wrapper
58
+
59
@property
def next(self):
    """Return the next sibling node, or None when this is the last sibling."""
    cdef LexborNode wrapper
    if self.node.next == NULL:
        return None
    wrapper = LexborNode()
    wrapper._cinit(<lxb_dom_node_t *> self.node.next, self.parser)
    return wrapper
68
+
69
@property
def prev(self):
    """Return the previous sibling node, or None when this is the first sibling."""
    cdef LexborNode wrapper
    if self.node.prev == NULL:
        return None
    wrapper = LexborNode()
    wrapper._cinit(<lxb_dom_node_t *> self.node.prev, self.parser)
    return wrapper
78
+
79
@property
def last_child(self):
    """Return the last child node, or None when this node has no children."""
    cdef LexborNode wrapper
    if self.node.last_child == NULL:
        return None
    wrapper = LexborNode()
    wrapper._cinit(<lxb_dom_node_t *> self.node.last_child, self.parser)
    return wrapper
88
+
89
@property
def html(self):
    """Return HTML representation of the current node including all its child nodes.

    The serializer's ``<-undef>`` placeholder is removed from the result.

    Returns
    -------
    text : str or None
        The serialized markup, or None when serialization fails or yields no data.

    Raises
    ------
    MemoryError
        If the temporary string object cannot be allocated.
    """
    cdef lexbor_str_t *lxb_str
    cdef lxb_status_t status

    lxb_str = lexbor_str_create()
    if lxb_str == NULL:
        raise MemoryError("Can't allocate a buffer for serialization")
    try:
        status = lxb_html_serialize_tree_str(self.node, lxb_str)
        if status == 0 and lxb_str.data:
            return lxb_str.data.decode(_ENCODING).replace('<-undef>', '')
        return None
    finally:
        # Release the string on every path (failure status, empty data, or a
        # decoding error); previously it was only freed on success.
        lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)
107
+
108
def __hash__(self):
    # The hash is the address of the underlying C node (see ``mem_id``).
    # NOTE(review): ``__eq__`` compares serialized HTML instead, so two
    # different nodes with identical markup compare equal but hash differently.
    return self.mem_id
110
+
111
def text_lexbor(self):
    """Returns the text of the node including text of all its child nodes.

    Uses builtin method from lexbor.

    Returns
    -------
    text : str

    Raises
    ------
    RuntimeError
        If lexbor returns no buffer or the extracted text is empty.
    """

    cdef size_t str_len = 0
    cdef lxb_char_t * text

    text = lxb_dom_node_text_content(self.node, &str_len)
    if text == NULL:
        raise RuntimeError("Can't extract text")

    try:
        if str_len == 0:
            raise RuntimeError("Can't extract text")
        return text.decode(_ENCODING)
    finally:
        # Hand the buffer back on every path, including the empty-text and
        # decode-error paths that used to skip this call.
        lxb_dom_document_destroy_text_noi(self.node.owner_document, text)
127
+
128
def text(self, bool deep=True, str separator='', bool strip=False):
    """Returns the text of the node including text of all its child nodes.

    Parameters
    ----------
    strip : bool, default False
        If true, calls ``str.strip()`` on each text part to remove extra white spaces.
    separator : str, default ''
        The separator to use when joining text from different nodes.
    deep : bool, default True
        If True, includes text from all child nodes.
        If False, only the node's own text (when it is a text node) and the
        text of its direct text children are included.

    Returns
    -------
    text : str

    """
    cdef unsigned char * text
    cdef lxb_dom_node_t* node

    container = TextContainer(separator, strip)
    # Check the pointer before touching any of its fields; the child pointer
    # used to be read ahead of this check.
    if self.node == NULL:
        return container.text

    # A text node contributes its own data first in both modes.
    if self.node.type == LXB_DOM_NODE_TYPE_TEXT:
        text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> self.node).data)
        if text != NULL:
            container.append(text.decode(_ENCODING))

    if deep:
        # text_callback appends every text node found by the walker.
        lxb_dom_node_simple_walk(
            <lxb_dom_node_t *> self.node,
            <lxb_dom_node_simple_walker_f>text_callback,
            <void *>container
        )
    else:
        # Shallow mode: look at direct children only.
        node = <lxb_dom_node_t*> self.node.first_child
        while node != NULL:
            if node.type == LXB_DOM_NODE_TYPE_TEXT:
                text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> node).data)
                if text != NULL:
                    container.append(text.decode(_ENCODING))
            node = node.next
    return container.text
177
+
178
def css(self, str query):
    """Evaluate CSS selector against current node and its child nodes.

    Matches pattern `query` against HTML tree.
    `CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.

    The search is delegated to ``self.parser.selector.find(query, self)``.

    Parameters
    ----------
    query : str
        CSS selector (e.g. "div > :nth-child(2n+1):not(:has(a))").

    Returns
    -------
    selector : list of `Node` objects
    """
    return self.parser.selector.find(query, self)
194
+
195
def css_first(self, str query, default=None, bool strict=False):
    """Same as `css` but returns only the first match.

    Parameters
    ----------

    query : str
    default : object, default None
        Value to return if there is no match.
    strict : bool, default False
        Set to True to raise an error when the selector matches more than one node.

    Returns
    -------
    selector : `LexborNode` object, or ``default`` when nothing matches

    Raises
    ------
    ValueError
        If ``strict`` is True and more than one node matches.
    """
    # TODO: This can be improved.
    matches = self.css(query)
    total = len(matches)
    if total == 0:
        return default
    if strict and total > 1:
        raise ValueError("Expected 1 match, but found %s matches" % total)
    return matches[0]
220
+
221
def any_css_matches(self, tuple selectors):
    """Returns True if any of CSS selectors matches a node.

    Selectors are tried in order and evaluation stops at the first match.
    """
    return any(
        self.parser.selector.any_matches(candidate, self)
        for candidate in selectors
    )
227
+
228
def css_matches(self, str selector):
    """Returns True if CSS selector matches a node.

    Delegates to ``self.parser.selector.any_matches(selector, self)`` and
    returns its result unchanged.
    """
    return self.parser.selector.any_matches(selector, self)
231
+
232
def __repr__(self):
    # Shows the value of the ``tag`` property, e.g. ``<LexborNode div>``.
    return '<LexborNode %s>' % self.tag
234
+
235
@property
def tag_id(self):
    """Return the numeric tag id that ``lxb_dom_node_tag_id_noi`` reports for this node."""
    cdef lxb_tag_id_t tag_id = lxb_dom_node_tag_id_noi(self.node)
    return tag_id
239
+
240
@property
def tag(self):
    """Return the name of the current tag (e.g. div, p, img).

    Doctype, text and comment nodes are reported with the fixed names
    stored in ``_TAG_TO_NAME``.

    Returns
    -------
    text : str or None
        None when lexbor returns no qualified name.
    """

    cdef lxb_char_t *c_text
    cdef size_t str_len = 0

    current_id = self.tag_id
    if current_id in (LXB_TAG__EM_DOCTYPE, LXB_TAG__TEXT, LXB_TAG__EM_COMMENT):
        return _TAG_TO_NAME[current_id]

    c_text = lxb_dom_element_qualified_name(<lxb_dom_element_t *> self.node, &str_len)
    if c_text == NULL:
        return None
    return c_text.decode(_ENCODING)
258
+
259
+
260
def decompose(self, bool recursive=True):
    """Remove the current node from the tree.

    Parameters
    ----------
    recursive : bool, default True
        Whenever to delete all its child nodes

    Examples
    --------

    >>> tree = LexborHTMLParser(html)
    >>> for tag in tree.css('script'):
    >>>     tag.decompose()

    """
    # NOTE(review): ``self.node`` is not reset after the destroy call, so this
    # wrapper presumably must not be used afterwards -- confirm.
    if recursive:
        lxb_dom_node_destroy_deep(<lxb_dom_node_t *> self.node)
    else:
        lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
280
+
281
def strip_tags(self, list tags, bool recursive = False):
    """Remove specified tags from the HTML tree.

    Parameters
    ----------
    tags : list
        List of tags to remove. Each entry is passed to ``css`` as a selector.
    recursive : bool, default False
        Whenever to delete all its child nodes

    Examples
    --------

    >>> tree = LexborHTMLParser('<html><head></head><body><script></script><div>Hello world!</div></body></html>')
    >>> tags = ['head', 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes']
    >>> tree.strip_tags(tags)
    >>> tree.html
    '<html><body><div>Hello world!</div></body></html>'

    """
    for tag in tags:
        for element in self.css(tag):
            element.decompose(recursive=recursive)
304
+
305
+
306
@property
def attributes(self):
    """Get all attributes that belong to the current node.

    Builds a fresh dictionary on every access. Attributes that carry no
    value are mapped to None.

    Returns
    -------
    attributes : dictionary of all attributes.

    Examples
    --------

    >>> tree = LexborHTMLParser("<div data id='my_id'></div>")
    >>> node = tree.css_first('div')
    >>> node.attributes
    {'data': None, 'id': 'my_id'}
    """
    cdef lxb_dom_attr_t *attr = lxb_dom_element_first_attribute_noi(<lxb_dom_element_t *> self.node)
    cdef size_t str_len = 0
    collected = {}

    # Walk the attribute list through its ``next`` links.
    while attr != NULL:
        key = lxb_dom_attr_local_name_noi(attr, &str_len)
        value = lxb_dom_attr_value_noi(attr, &str_len)
        collected[key.decode(_ENCODING)] = value.decode(_ENCODING) if value else None
        attr = attr.next
    return collected
340
+
341
@property
def attrs(self):
    """A dict-like object that is similar to the ``attributes`` property, but operates directly on the Node data.

    A new ``LexborAttributes`` object is created for this node on every access.

    .. warning:: Use ``attributes`` instead, if you don't want to modify Node attributes.

    Returns
    -------
    attributes : Attributes mapping object.

    Examples
    --------

    >>> tree = LexborHTMLParser("<div id='a'></div>")
    >>> node = tree.css_first('div')
    >>> node.attrs
    <div attributes, 1 items>
    >>> node.attrs['id']
    'a'
    >>> node.attrs['foo'] = 'bar'
    >>> del node.attrs['id']
    >>> node.attributes
    {'foo': 'bar'}
    >>> node.attrs['id'] = 'new_id'
    >>> node.html
    '<div foo="bar" id="new_id"></div>'
    """
    cdef LexborAttributes attributes = LexborAttributes.create(<lxb_dom_node_t *>self.node)
    return attributes
370
+
371
@property
def id(self):
    """Get the id attribute of the node.

    Returns None if the attribute is missing or has no value.

    Returns
    -------
    text : str
    """
    cdef char * key = 'id'
    cdef size_t str_len
    cdef lxb_dom_attr_t * attr = lxb_dom_element_attr_by_name(
        <lxb_dom_element_t *> self.node,
        <lxb_char_t *> key, 2
    )
    if attr == NULL:
        return None
    value = lxb_dom_attr_value_noi(attr, &str_len)
    if value:
        return value.decode(_ENCODING)
    return None
391
+
392
def iter(self, include_text=False):
    """Iterate over nodes on the current level.

    Walks the direct children of this node from first to last.

    Parameters
    ----------
    include_text : bool
        If True, includes text nodes as well.

    Yields
    -------
    node
    """

    cdef lxb_dom_node_t *current = self.node.first_child
    cdef LexborNode wrapper

    while current != NULL:
        if current.type != LXB_DOM_NODE_TYPE_TEXT or include_text:
            wrapper = LexborNode()
            wrapper._cinit(<lxb_dom_node_t *> current, self.parser)
            yield wrapper
        current = current.next
417
+
418
+
419
def unwrap(self, delete_empty=False):
    """Replace node with whatever is inside this node.

    Parameters
    ----------
    delete_empty : bool, default False
        If True, removes empty tags.

    Examples
    --------

    >>> tree = LexborHTMLParser("<div>Hello <i>world</i>!</div>")
    >>> tree.css_first('i').unwrap()
    >>> tree.html
    '<html><head></head><body><div>Hello world!</div></body></html>'

    Note: by default, empty tags are ignored, use "delete_empty" to change this.
    """
    cdef lxb_dom_node_t* child
    cdef lxb_dom_node_t* following

    child = self.node.first_child
    if child == NULL:
        # Nothing to lift out; the node itself is removed only on request.
        if delete_empty:
            lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
        return

    # Move every child in front of this node, in order.
    while child != NULL:
        # The next sibling is captured before the insert, as the original
        # code did -- presumably the insert relinks ``child.next``.
        following = child.next
        lxb_dom_node_insert_before(self.node, child)
        child = following
    lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
455
+
456
def unwrap_tags(self, list tags, delete_empty = False):
    """Unwraps specified tags from the HTML tree.

    Works the same as the ``unwrap`` method, but applied to a list of tags.

    Parameters
    ----------
    tags : list
        List of tags to remove. Each entry is passed to ``css`` as a selector.
    delete_empty : bool, default False
        If True, removes empty tags.

    Examples
    --------

    >>> tree = LexborHTMLParser('<div><a href="">Hello</a> <i>world</i>!</div>')
    >>> tree.body.unwrap_tags(['i','a'])
    >>> tree.body.html
    '<body><div>Hello world!</div></body>'

    Note: by default, empty tags are ignored, use "delete_empty" to change this.
    """

    for tag in tags:
        for element in self.css(tag):
            element.unwrap(delete_empty)
482
+
483
+
484
def traverse(self, include_text=False):
    """Iterate over the current node and all nodes below it.

    The current node is visited first, then its descendants depth-first;
    iteration stops once the walk climbs back to the current node.

    Parameters
    ----------
    include_text : bool
        If True, includes text nodes as well.

    Yields
    -------
    node
    """
    cdef lxb_dom_node_t * root = self.node
    cdef lxb_dom_node_t * cursor = root
    cdef LexborNode wrapper

    while cursor != NULL:
        if include_text or cursor.type != LXB_DOM_NODE_TYPE_TEXT:
            wrapper = LexborNode()
            wrapper._cinit(<lxb_dom_node_t *> cursor, self.parser)
            yield wrapper

        if cursor.first_child != NULL:
            # Descend first.
            cursor = cursor.first_child
            continue

        # No children: climb until a node with a next sibling (or the root).
        while cursor != root and cursor.next == NULL:
            cursor = cursor.parent
        if cursor == root:
            break
        cursor = cursor.next
514
+
515
def replace_with(self, str_or_LexborNode value):
    """Replace current Node with specified value.

    Parameters
    ----------
    value : str, bytes or Node
        The text or Node instance to replace the Node with.
        When a text string is passed, it is inserted as a text node (it is not parsed as HTML).
        Convert and pass the ``Node`` object when you want to work with HTML.
        For a ``Node`` argument, the node returned by ``lxb_dom_document_import_node``
        (called with deep=True) is inserted, not ``value.node`` itself.
        NOTE(review): this docstring used to say the node is not cloned and that later
        changes to ``value`` are reflected; import_node presumably returns a copy -- confirm.

    Raises
    ------
    SelectolaxError
        If the new node cannot be created or ``value`` has an unsupported type.

    Examples
    --------

    >>> tree = LexborHTMLParser('<div>Get <img src="" alt="Laptop"></div>')
    >>> img = tree.css_first('img')
    >>> img.replace_with(img.attributes.get('alt', ''))
    >>> tree.body.child.html
    '<div>Get Laptop</div>'

    >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
    >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
    >>> img_node = html_parser.css_first('img')
    >>> img_node.replace_with(html_parser2.body.child)
    >>> html_parser.body.child.html
    '<div>Get <span alt="Laptop"><div>Test</div> <div></div></span></div>'
    """
    cdef lxb_dom_node_t * new_node

    if isinstance(value, (str, bytes, unicode)):
        bytes_val = to_bytes(value)
        new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
                &self.parser.document.dom_document,
                <lxb_char_t *> bytes_val, len(bytes_val)
        )
        if new_node == NULL:
            raise SelectolaxError("Can't create a new node")
        # Put the replacement in front of this node, then drop this node.
        lxb_dom_node_insert_before(self.node, new_node)
        lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
    elif isinstance(value, LexborNode):
        new_node = lxb_dom_document_import_node(
            &self.parser.document.dom_document,
            <lxb_dom_node_t *> value.node,
            <bint> True
        )
        if new_node == NULL:
            raise SelectolaxError("Can't create a new node")
        lxb_dom_node_insert_before(self.node, <lxb_dom_node_t *> new_node)
        lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
    else:
        raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
566
+
567
+
568
def insert_before(self, str_or_LexborNode value):
    """
    Insert a node before the current Node.

    Parameters
    ----------
    value : str, bytes or Node
        The text or Node instance to insert before the Node.
        When a text string is passed, it is inserted as a text node (it is not parsed as HTML).
        Convert and pass the ``Node`` object when you want to work with HTML.
        For a ``Node`` argument, the node returned by ``lxb_dom_document_import_node``
        (called with deep=True) is inserted, not ``value.node`` itself.
        NOTE(review): this docstring used to say the node is not cloned and that later
        changes to ``value`` are reflected; import_node presumably returns a copy -- confirm.

    Raises
    ------
    SelectolaxError
        If the new node cannot be created or ``value`` has an unsupported type.

    Examples
    --------

    >>> tree = LexborHTMLParser('<div>Get <img src="" alt="Laptop"></div>')
    >>> img = tree.css_first('img')
    >>> img.insert_before(img.attributes.get('alt', ''))
    >>> tree.body.child.html
    '<div>Get Laptop<img src="" alt="Laptop"></div>'

    >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
    >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
    >>> img_node = html_parser.css_first('img')
    >>> img_node.insert_before(html_parser2.body.child)
    >>> html_parser.body.child.html
    '<div>Get <span alt="Laptop"><div>Test</div><img src="/jpg"> <div></div></span></div>'
    """
    cdef lxb_dom_node_t * new_node

    if isinstance(value, (str, bytes, unicode)):
        bytes_val = to_bytes(value)
        new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
                &self.parser.document.dom_document,
                <lxb_char_t *> bytes_val, len(bytes_val)
        )
        if new_node == NULL:
            raise SelectolaxError("Can't create a new node")
        lxb_dom_node_insert_before(self.node, new_node)
    elif isinstance(value, LexborNode):
        new_node = lxb_dom_document_import_node(
            &self.parser.document.dom_document,
            <lxb_dom_node_t *> value.node,
            <bint> True
        )
        if new_node == NULL:
            raise SelectolaxError("Can't create a new node")
        lxb_dom_node_insert_before(self.node, <lxb_dom_node_t *> new_node)
    else:
        raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
618
+
619
def insert_after(self, str_or_LexborNode value):
    """
    Insert a node after the current Node.

    Parameters
    ----------
    value : str, bytes or Node
        The text or Node instance to insert after the Node.
        When a text string is passed, it is inserted as a text node (it is not parsed as HTML).
        Convert and pass the ``Node`` object when you want to work with HTML.
        For a ``Node`` argument, the node returned by ``lxb_dom_document_import_node``
        (called with deep=True) is inserted, not ``value.node`` itself.
        NOTE(review): this docstring used to say the node is not cloned and that later
        changes to ``value`` are reflected; import_node presumably returns a copy -- confirm.

    Raises
    ------
    SelectolaxError
        If the new node cannot be created or ``value`` has an unsupported type.

    Examples
    --------

    >>> tree = LexborHTMLParser('<div>Get <img src="" alt="Laptop"></div>')
    >>> img = tree.css_first('img')
    >>> img.insert_after(img.attributes.get('alt', ''))
    >>> tree.body.child.html
    '<div>Get <img src="" alt="Laptop">Laptop</div>'

    >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
    >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
    >>> img_node = html_parser.css_first('img')
    >>> img_node.insert_after(html_parser2.body.child)
    >>> html_parser.body.child.html
    '<div>Get <span alt="Laptop"><img src="/jpg"><div>Test</div> <div></div></span></div>'
    """
    cdef lxb_dom_node_t * new_node

    if isinstance(value, (str, bytes, unicode)):
        bytes_val = to_bytes(value)
        new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
                &self.parser.document.dom_document,
                <lxb_char_t *> bytes_val, len(bytes_val)
        )
        if new_node == NULL:
            raise SelectolaxError("Can't create a new node")
        lxb_dom_node_insert_after(self.node, new_node)
    elif isinstance(value, LexborNode):
        new_node = lxb_dom_document_import_node(
            &self.parser.document.dom_document,
            <lxb_dom_node_t *> value.node,
            <bint> True
        )
        if new_node == NULL:
            raise SelectolaxError("Can't create a new node")
        lxb_dom_node_insert_after(self.node, <lxb_dom_node_t *> new_node)
    else:
        raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
669
+
670
def insert_child(self, str_or_LexborNode value):
    """
    Insert a node inside (at the end of) the current Node.

    Parameters
    ----------
    value : str, bytes or Node
        The text or Node instance to insert inside the Node.
        When a text string is passed, it is inserted as a text node (it is not parsed as HTML).
        Convert and pass the ``Node`` object when you want to work with HTML.
        For a ``Node`` argument, the node returned by ``lxb_dom_document_import_node``
        (called with deep=True) is inserted, not ``value.node`` itself.
        NOTE(review): this docstring used to say the node is not cloned and that later
        changes to ``value`` are reflected; import_node presumably returns a copy -- confirm.

    Raises
    ------
    SelectolaxError
        If the new node cannot be created or ``value`` has an unsupported type.

    Examples
    --------

    >>> tree = LexborHTMLParser('<div>Get <img src=""></div>')
    >>> div = tree.css_first('div')
    >>> div.insert_child('Laptop')
    >>> tree.body.child.html
    '<div>Get <img src="">Laptop</div>'

    >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"> <div>Laptop</div> </span></div>')
    >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
    >>> span_node = html_parser.css_first('span')
    >>> span_node.insert_child(html_parser2.body.child)
    >>> html_parser.body.child.html
    '<div>Get <span alt="Laptop"> <div>Laptop</div> <div>Test</div> </span></div>'
    """
    cdef lxb_dom_node_t * new_node

    if isinstance(value, (str, bytes, unicode)):
        bytes_val = to_bytes(value)
        new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
                &self.parser.document.dom_document,
                <lxb_char_t *> bytes_val, len(bytes_val)
        )
        if new_node == NULL:
            raise SelectolaxError("Can't create a new node")
        lxb_dom_node_insert_child(self.node, new_node)
    elif isinstance(value, LexborNode):
        new_node = lxb_dom_document_import_node(
            &self.parser.document.dom_document,
            <lxb_dom_node_t *> value.node,
            <bint> True
        )
        if new_node == NULL:
            raise SelectolaxError("Can't create a new node")
        lxb_dom_node_insert_child(self.node, <lxb_dom_node_t *> new_node)
    else:
        raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
720
+
721
@property
def raw_value(self):
    """Return the raw (unparsed, original) value of a node.

    Not implemented for this backend: accessing the property always raises.

    Raises
    ------
    SelectolaxError
        Always.
    """
    raise SelectolaxError("This features is not supported by the lexbor backend. Please use Modest backend.")
743
+
744
def scripts_contain(self, str query):
    """Returns True if any of the script tags contain specified text.

    Caches script tags on the first call to improve performance.
    The cache is stored on the parser object (``cached_script_texts``).

    Parameters
    ----------
    query : str
        The query to check.

    """
    if self.parser.cached_script_texts is None:
        script_texts = [
            script.text(deep=True)
            for script in self.parser.selector.find('script', self)
        ]
        # Empty script bodies are not cached.
        self.parser.cached_script_texts = [body for body in script_texts if body]

    return any(query in body for body in self.parser.cached_script_texts)
768
+
769
def script_srcs_contain(self, tuple queries):
    """Returns True if any of the script SRCs attributes contain one of the specified text.

    Caches values on the first call to improve performance.
    The cache is stored on the parser object (``cached_script_srcs``).

    Parameters
    ----------
    queries : tuple of str

    """
    if self.parser.cached_script_srcs is None:
        sources = [
            script.attrs.get('src')
            for script in self.parser.selector.find('script', self)
        ]
        # Scripts without a (non-empty) src are not cached.
        self.parser.cached_script_srcs = [src for src in sources if src]

    return any(
        needle in src
        for src in self.parser.cached_script_srcs
        for needle in queries
    )
793
+
794
def remove(self, bool recursive=True):
    """An alias for the decompose method.

    ``recursive`` is forwarded to ``decompose`` unchanged.
    """
    self.decompose(recursive)
797
+
798
def select(self, query=None):
    """Select nodes given a CSS selector.

    Works similarly to the ``css`` method, but supports chained filtering and extra features.

    Parameters
    ----------
    query : str or None
        The CSS selector to use when searching for nodes.

    Returns
    -------
    selector : a ``LexborSelector`` built from this node and ``query``.
    """
    return LexborSelector(self, query)
813
+
814
def __eq__(self, other):
    # Nodes compare by serialized HTML: against another node's HTML, or
    # directly against a string. Anything else is unequal.
    if isinstance(other, LexborNode):
        return self.html == other.html
    if isinstance(other, str):
        return self.html == other
    return False
820
+
821
@property
def text_content(self):
    """Returns the text of the node if it is a text node.

    Returns None for other nodes.
    Unlike the ``text`` method, does not include child nodes.

    Returns
    -------
    text : str or None.
        An empty string is returned when a text node carries no data.
    """
    cdef unsigned char * text

    # The pointer is checked before any field access; an unused local used to
    # read ``self.node.first_child`` ahead of this check.
    if self.node == NULL or self.node.type != LXB_DOM_NODE_TYPE_TEXT:
        return None

    text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> self.node).data)
    if text == NULL:
        return ""
    return text.decode(_ENCODING)
843
@cython.final
cdef class TextContainer:
    """Collects text fragments and joins them with a separator.

    Fragments are kept in a list and joined when ``text`` is read, so reading
    ``text`` never changes the stored content and may be repeated or
    interleaved with further ``append`` calls.

    Parameters
    ----------
    separator : str, default ''
        Placed between consecutive fragments.
    strip : bool, default False
        If True, ``str.strip()`` is applied to every fragment as it is appended.
    """
    cdef list _parts
    cdef public str separator
    cdef public bool strip

    def __init__(self, str separator = '', bool strip = False):
        self._parts = []
        self.separator = separator
        self.strip = strip

    def append(self, node_text):
        """Add one text fragment (stripped first when ``strip`` is set)."""
        self._parts.append(node_text.strip() if self.strip else node_text)

    @property
    def text(self):
        """Return all fragments joined by ``separator`` ('' when nothing was appended)."""
        if not self._parts:
            return ""
        return self.separator.join(self._parts)
864
+
865
+
866
# Walker callback used by ``LexborNode.text``: for every text node visited it
# decodes the node's character data and appends it to the container object
# passed through ``ctx``. Always tells the walker to continue.
cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
    cdef unsigned char *raw_text
    cdef object container

    if lxb_dom_node_tag_id_noi(node) != LXB_TAG__TEXT:
        return LEXBOR_ACTION_OK

    raw_text = <unsigned char*> lexbor_str_data_noi(&(<lxb_dom_text_t *> node).char_data.data)
    if raw_text == NULL:
        return LEXBOR_ACTION_OK

    decoded = raw_text.decode(_ENCODING)
    container = <object> ctx
    container.append(decoded)
    return LEXBOR_ACTION_OK