selectolax 0.3.28__cp38-cp38-musllinux_1_2_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of selectolax might be problematic. Click here for more details.

@@ -0,0 +1,867 @@
1
+ cimport cython
2
+
3
# Names reported by ``LexborNode.tag`` for nodes that are not regular elements.
# The dictionary is indexed with the value of ``LexborNode.tag_id``.
_TAG_TO_NAME = {
    0x0005: "- doctype",
    0x0002: "-text",
    0x0004: "-comment",
}

# Fused argument type used by the node-manipulation methods
# (``replace_with``, ``insert_before``, ``insert_after``, ``insert_child``):
# either text (str/bytes) or another ``LexborNode``.
ctypedef fused str_or_LexborNode:
    basestring
    bytes
    LexborNode
12
+
13
cdef inline bytes to_bytes(str_or_LexborNode value):
    """Convert a text value to ``bytes``.

    Unicode strings are encoded with ``_ENCODING``; ``bytes`` input is copied.
    For any other input no branch assigns the result, so the declared-but-unset
    ``bytes`` variable is returned unchanged.
    """
    cdef bytes encoded
    if isinstance(value, (str, unicode)):
        encoded = value.encode(_ENCODING)
    elif isinstance(value, bytes):
        # NOTE(review): going through ``char*`` presumably stops at the first
        # NUL byte when converting back to ``bytes`` -- confirm whether
        # embedded NULs need to be preserved.
        encoded = <char*> value
    return encoded
20
+
21
@cython.final
cdef class LexborNode:
    """A class that represents HTML node (element)."""

    cdef _cinit(self, lxb_dom_node_t *node, LexborHTMLParser parser):
        """Bind this wrapper to a lexbor DOM node and its parser; returns ``self``."""
        self.parser = parser
        self.node = node
        return self

    @property
    def mem_id(self):
        """Return the address of the underlying lexbor node as an integer."""
        return <size_t> self.node

    @property
    def child(self):
        """Alias for the `first_child` property."""
        return self.first_child

    @property
    def first_child(self):
        """Return the first child node, or None if there is none."""
        cdef LexborNode node
        if self.node.first_child:
            node = LexborNode()
            node._cinit(<lxb_dom_node_t *> self.node.first_child, self.parser)
            return node
        return None

    @property
    def parent(self):
        """Return the parent node, or None if there is none."""
        cdef LexborNode node
        if self.node.parent:
            node = LexborNode()
            node._cinit(<lxb_dom_node_t *> self.node.parent, self.parser)
            return node
        return None

    @property
    def next(self):
        """Return next node, or None if there is none."""
        cdef LexborNode node
        if self.node.next:
            node = LexborNode()
            node._cinit(<lxb_dom_node_t *> self.node.next, self.parser)
            return node
        return None

    @property
    def prev(self):
        """Return previous node, or None if there is none."""
        cdef LexborNode node
        if self.node.prev:
            node = LexborNode()
            node._cinit(<lxb_dom_node_t *> self.node.prev, self.parser)
            return node
        return None

    @property
    def last_child(self):
        """Return last child node, or None if there is none."""
        cdef LexborNode node
        if self.node.last_child:
            node = LexborNode()
            node._cinit(<lxb_dom_node_t *> self.node.last_child, self.parser)
            return node
        return None

    @property
    def html(self):
        """Return HTML representation of the current node including all its child nodes.

        Returns
        -------
        text : str or None
            None is returned when the node could not be serialized.
        """
        cdef lexbor_str_t *lxb_str
        cdef lxb_status_t status

        lxb_str = lexbor_str_create()
        if lxb_str == NULL:
            return None
        try:
            status = lxb_html_serialize_tree_str(self.node, lxb_str)
            if status == 0 and lxb_str.data:
                return lxb_str.data.decode(_ENCODING).replace('<-undef>', '')
            return None
        finally:
            # Release the buffer on every path, including serialization
            # failure and decoding errors.
            lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)

    def __hash__(self):
        return self.mem_id

    def text_lexbor(self):
        """Returns the text of the node including text of all its child nodes.

        Uses builtin method from lexbor.

        Raises
        ------
        RuntimeError
            If lexbor reports a text length of zero.
        """
        cdef size_t str_len = 0
        cdef lxb_char_t * text

        text = lxb_dom_node_text_content(self.node, &str_len)
        if <int> str_len == 0:
            # Do not leak a buffer that was allocated for empty content.
            if text != NULL:
                lxb_dom_document_destroy_text_noi(self.node.owner_document, text)
            raise RuntimeError("Can't extract text")

        try:
            return text.decode(_ENCODING)
        finally:
            lxb_dom_document_destroy_text_noi(self.node.owner_document, text)

    def text(self, bool deep=True, str separator='', bool strip=False):
        """Returns the text of the node including text of all its child nodes.

        Parameters
        ----------
        strip : bool, default False
            If true, calls ``str.strip()`` on each text part to remove extra white spaces.
        separator : str, default ''
            The separator to use when joining text from different nodes.
        deep : bool, default True
            If True, includes text from all child nodes.

        Returns
        -------
        text : str

        """
        cdef unsigned char * text
        cdef lxb_dom_node_t* node = <lxb_dom_node_t*> self.node.first_child

        container = TextContainer(separator, strip)

        # A text node contributes its own character data first.
        if self.node != NULL and self.node.type == LXB_DOM_NODE_TYPE_TEXT:
            text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> self.node).data)
            if text != NULL:
                container.append(text.decode(_ENCODING))

        if deep:
            # Collect text from descendants via the lexbor walker.
            lxb_dom_node_simple_walk(
                <lxb_dom_node_t *> self.node,
                <lxb_dom_node_simple_walker_f>text_callback,
                <void *>container
            )
        else:
            # Only direct children that are text nodes.
            while node != NULL:
                if node.type == LXB_DOM_NODE_TYPE_TEXT:
                    text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> node).data)
                    if text != NULL:
                        container.append(text.decode(_ENCODING))
                node = node.next
        return container.text

    def css(self, str query):
        """Evaluate CSS selector against current node and its child nodes.

        Matches pattern `query` against HTML tree.
        `CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.

        Parameters
        ----------
        query : str
            CSS selector (e.g. "div > :nth-child(2n+1):not(:has(a))").

        Returns
        -------
        selector : list of `Node` objects
        """
        return self.parser.selector.find(query, self)

    def css_first(self, str query, default=None, bool strict=False):
        """Same as `css` but returns only the first match.

        Parameters
        ----------

        query : str
        default : object, default None
            Default value to return if there is no match.
        strict: bool, default False
            Set to True if you want to check if there is strictly only one match in the document.


        Returns
        -------
        selector : `LexborNode` object

        Raises
        ------
        ValueError
            If ``strict`` is True and more than one node matches.
        """
        # TODO: This can be improved.
        results = self.css(query)
        n_results = len(results)
        if n_results > 0:
            if strict and n_results > 1:
                raise ValueError("Expected 1 match, but found %s matches" % n_results)
            return results[0]
        return default

    def any_css_matches(self, tuple selectors):
        """Returns True if any of CSS selectors matches a node"""
        for selector in selectors:
            if self.parser.selector.any_matches(selector, self):
                return True
        return False

    def css_matches(self, str selector):
        """Returns True if CSS selector matches a node."""
        return self.parser.selector.any_matches(selector, self)

    def __repr__(self):
        return '<LexborNode %s>' % self.tag

    @property
    def tag_id(self):
        """Return the lexbor tag id of the current node."""
        cdef lxb_tag_id_t tag_id = lxb_dom_node_tag_id_noi(self.node)
        return tag_id

    @property
    def tag(self):
        """Return the name of the current tag (e.g. div, p, img).

        Returns
        -------
        text : str
        """
        cdef lxb_char_t *c_text
        cdef size_t str_len = 0
        # Read the tag id once instead of evaluating the property twice.
        cdef lxb_tag_id_t tag_id = lxb_dom_node_tag_id_noi(self.node)

        if tag_id in [LXB_TAG__EM_DOCTYPE, LXB_TAG__TEXT, LXB_TAG__EM_COMMENT]:
            return _TAG_TO_NAME[tag_id]
        c_text = lxb_dom_element_qualified_name(<lxb_dom_element_t *> self.node, &str_len)
        text = None
        if c_text:
            text = c_text.decode(_ENCODING)
        return text

    def decompose(self, bool recursive=True):
        """Remove the current node from the tree.

        Parameters
        ----------
        recursive : bool, default True
            Whenever to delete all its child nodes

        Examples
        --------

        >>> tree = LexborHTMLParser(html)
        >>> for tag in tree.css('script'):
        ...     tag.decompose()

        """
        if recursive:
            lxb_dom_node_destroy_deep(<lxb_dom_node_t *> self.node)
        else:
            lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)

    def strip_tags(self, list tags, bool recursive = False):
        """Remove specified tags from the HTML tree.

        Parameters
        ----------
        tags : list
            List of tags to remove.
        recursive : bool, default False
            Whenever to delete all its child nodes

        Examples
        --------

        >>> tree = LexborHTMLParser('<html><head></head><body><script></script><div>Hello world!</div></body></html>')
        >>> tags = ['head', 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes']
        >>> tree.strip_tags(tags)
        >>> tree.html
        '<html><body><div>Hello world!</div></body></html>'

        """
        for tag in tags:
            for element in self.css(tag):
                element.decompose(recursive=recursive)

    @property
    def attributes(self):
        """Get all attributes that belong to the current node.

        The value of empty attributes is None.

        Returns
        -------
        attributes : dictionary of all attributes.

        Examples
        --------

        >>> tree = LexborHTMLParser("<div data id='my_id'></div>")
        >>> node = tree.css_first('div')
        >>> node.attributes
        {'data': None, 'id': 'my_id'}
        """
        cdef lxb_dom_attr_t *attr = lxb_dom_element_first_attribute_noi(<lxb_dom_element_t *> self.node)
        cdef size_t str_len = 0
        attributes = dict()

        while attr != NULL:
            key = lxb_dom_attr_local_name_noi(attr, &str_len)
            value = lxb_dom_attr_value_noi(attr, &str_len)

            if value:
                py_value = value.decode(_ENCODING)
            else:
                py_value = None
            attributes[key.decode(_ENCODING)] = py_value

            attr = attr.next
        return attributes

    @property
    def attrs(self):
        """A dict-like object that is similar to the ``attributes`` property, but operates directly on the Node data.

        .. warning:: Use ``attributes`` instead, if you don't want to modify Node attributes.

        Returns
        -------
        attributes : Attributes mapping object.

        Examples
        --------

        >>> tree = LexborHTMLParser("<div id='a'></div>")
        >>> node = tree.css_first('div')
        >>> node.attrs
        <div attributes, 1 items>
        >>> node.attrs['id']
        'a'
        >>> node.attrs['foo'] = 'bar'
        >>> del node.attrs['id']
        >>> node.attributes
        {'foo': 'bar'}
        >>> node.attrs['id'] = 'new_id'
        >>> node.html
        '<div foo="bar" id="new_id"></div>'
        """
        cdef LexborAttributes attributes = LexborAttributes.create(<lxb_dom_node_t *>self.node)
        return attributes

    @property
    def id(self):
        """Get the id attribute of the node.

        Returns None if id does not set.

        Returns
        -------
        text : str
        """
        cdef char * key = 'id'
        cdef size_t str_len
        cdef lxb_dom_attr_t * attr = lxb_dom_element_attr_by_name(
            <lxb_dom_element_t *> self.node,
            <lxb_char_t *> key, 2  # 2 == length of the name "id"
        )
        if attr != NULL:
            value = lxb_dom_attr_value_noi(attr, &str_len)
            return value.decode(_ENCODING) if value else None
        return None

    def iter(self, include_text=False):
        """Iterate over nodes on the current level.

        Parameters
        ----------
        include_text : bool
            If True, includes text nodes as well.

        Yields
        -------
        node
        """
        cdef lxb_dom_node_t *node = self.node.first_child
        cdef LexborNode next_node

        while node != NULL:
            # Text nodes are skipped unless explicitly requested.
            if include_text or node.type != LXB_DOM_NODE_TYPE_TEXT:
                next_node = LexborNode()
                next_node._cinit(<lxb_dom_node_t *> node, self.parser)
                yield next_node
            node = node.next

    def unwrap(self):
        """Replace node with whatever is inside this node.

        Does nothing when the node has no children.

        Examples
        --------

        >>> tree = LexborHTMLParser("<div>Hello <i>world</i>!</div>")
        >>> tree.css_first('i').unwrap()
        >>> tree.html
        '<html><head></head><body><div>Hello world!</div></body></html>'

        """
        cdef lxb_dom_node_t* current_node = self.node.first_child
        cdef lxb_dom_node_t* next_node

        if current_node == NULL:
            return

        # Move every child in front of this node. The sibling pointer is read
        # before the move, as the loop still needs the original next child.
        while current_node != NULL:
            next_node = current_node.next
            lxb_dom_node_insert_before(self.node, current_node)
            current_node = next_node
        lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)

    def unwrap_tags(self, list tags):
        """Unwraps specified tags from the HTML tree.

        Works the same as the ``unwrap`` method, but applied to a list of tags.

        Parameters
        ----------
        tags : list
            List of tags to remove.

        Examples
        --------

        >>> tree = LexborHTMLParser("<div><a href="">Hello</a> <i>world</i>!</div>")
        >>> tree.body.unwrap_tags(['i','a'])
        >>> tree.body.html
        '<body><div>Hello world!</div></body>'
        """
        for tag in tags:
            for element in self.css(tag):
                element.unwrap()

    def traverse(self, include_text=False):
        """Iterate over all child and next nodes starting from the current level.

        Parameters
        ----------
        include_text : bool
            If True, includes text nodes as well.

        Yields
        -------
        node
        """
        cdef lxb_dom_node_t * root = self.node
        cdef lxb_dom_node_t * node = root
        cdef LexborNode lxb_node

        while node != NULL:
            if include_text or node.type != LXB_DOM_NODE_TYPE_TEXT:
                lxb_node = LexborNode()
                lxb_node._cinit(<lxb_dom_node_t *> node, self.parser)
                yield lxb_node

            if node.first_child != NULL:
                # Descend first.
                node = node.first_child
            else:
                # Climb until a node with a following sibling is found,
                # never leaving the subtree rooted at ``root``.
                while node != root and node.next == NULL:
                    node = node.parent
                if node == root:
                    break
                node = node.next

    def replace_with(self, str_or_LexborNode value):
        """Replace current Node with specified value.

        Parameters
        ----------
        value : str, bytes or Node
            The text or Node instance to replace the Node with.
            When a text string is passed, it's treated as text. All HTML tags will be escaped.
            Convert and pass the ``Node`` object when you want to work with HTML.
            Does not clone the ``Node`` object.
            All future changes to the passed ``Node`` object will also be taken into account.

        Examples
        --------

        >>> tree = LexborHTMLParser('<div>Get <img src="" alt="Laptop"></div>')
        >>> img = tree.css_first('img')
        >>> img.replace_with(img.attributes.get('alt', ''))
        >>> tree.body.child.html
        '<div>Get Laptop</div>'

        >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
        >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
        >>> img_node = html_parser.css_first('img')
        >>> img_node.replace_with(html_parser2.body.child)
        '<div>Get <span alt="Laptop"><div>Test</div> <div></div></span></div>'
        """
        cdef lxb_dom_node_t * new_node

        if isinstance(value, (str, bytes, unicode)):
            bytes_val = to_bytes(value)
            new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
                &self.parser.document.dom_document,
                <lxb_char_t *> bytes_val, len(bytes_val)
            )
            if new_node == NULL:
                raise SelectolaxError("Can't create a new node")
            lxb_dom_node_insert_before(self.node, new_node)
            lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
        elif isinstance(value, LexborNode):
            # NOTE(review): the node goes through import_node with deep=True,
            # which presumably copies it into this document -- confirm that the
            # "does not clone" statement in the docstring still holds.
            new_node = lxb_dom_document_import_node(
                &self.parser.document.dom_document,
                <lxb_dom_node_t *> value.node,
                <bint> True
            )
            if new_node == NULL:
                raise SelectolaxError("Can't create a new node")
            lxb_dom_node_insert_before(self.node, <lxb_dom_node_t *> new_node)
            lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
        else:
            raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)

    def insert_before(self, str_or_LexborNode value):
        """
        Insert a node before the current Node.

        Parameters
        ----------
        value : str, bytes or Node
            The text or Node instance to insert before the Node.
            When a text string is passed, it's treated as text. All HTML tags will be escaped.
            Convert and pass the ``Node`` object when you want to work with HTML.
            Does not clone the ``Node`` object.
            All future changes to the passed ``Node`` object will also be taken into account.

        Examples
        --------

        >>> tree = LexborHTMLParser('<div>Get <img src="" alt="Laptop"></div>')
        >>> img = tree.css_first('img')
        >>> img.insert_before(img.attributes.get('alt', ''))
        >>> tree.body.child.html
        '<div>Get Laptop<img src="" alt="Laptop"></div>'

        >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
        >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
        >>> img_node = html_parser.css_first('img')
        >>> img_node.insert_before(html_parser2.body.child)
        '<div>Get <span alt="Laptop"><div>Test</div><img src="/jpg"> <div></div></span></div>'
        """
        cdef lxb_dom_node_t * new_node

        if isinstance(value, (str, bytes, unicode)):
            bytes_val = to_bytes(value)
            new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
                &self.parser.document.dom_document,
                <lxb_char_t *> bytes_val, len(bytes_val)
            )
            if new_node == NULL:
                raise SelectolaxError("Can't create a new node")
            lxb_dom_node_insert_before(self.node, new_node)
        elif isinstance(value, LexborNode):
            # NOTE(review): import_node(deep=True) presumably copies the node
            # into this document -- confirm against the docstring.
            new_node = lxb_dom_document_import_node(
                &self.parser.document.dom_document,
                <lxb_dom_node_t *> value.node,
                <bint> True
            )
            if new_node == NULL:
                raise SelectolaxError("Can't create a new node")
            lxb_dom_node_insert_before(self.node, <lxb_dom_node_t *> new_node)
        else:
            raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)

    def insert_after(self, str_or_LexborNode value):
        """
        Insert a node after the current Node.

        Parameters
        ----------
        value : str, bytes or Node
            The text or Node instance to insert after the Node.
            When a text string is passed, it's treated as text. All HTML tags will be escaped.
            Convert and pass the ``Node`` object when you want to work with HTML.
            Does not clone the ``Node`` object.
            All future changes to the passed ``Node`` object will also be taken into account.

        Examples
        --------

        >>> tree = LexborHTMLParser('<div>Get <img src="" alt="Laptop"></div>')
        >>> img = tree.css_first('img')
        >>> img.insert_after(img.attributes.get('alt', ''))
        >>> tree.body.child.html
        '<div>Get <img src="" alt="Laptop">Laptop</div>'

        >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
        >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
        >>> img_node = html_parser.css_first('img')
        >>> img_node.insert_after(html_parser2.body.child)
        '<div>Get <span alt="Laptop"><img src="/jpg"><div>Test</div> <div></div></span></div>'
        """
        cdef lxb_dom_node_t * new_node

        if isinstance(value, (str, bytes, unicode)):
            bytes_val = to_bytes(value)
            new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
                &self.parser.document.dom_document,
                <lxb_char_t *> bytes_val, len(bytes_val)
            )
            if new_node == NULL:
                raise SelectolaxError("Can't create a new node")
            lxb_dom_node_insert_after(self.node, new_node)
        elif isinstance(value, LexborNode):
            # NOTE(review): import_node(deep=True) presumably copies the node
            # into this document -- confirm against the docstring.
            new_node = lxb_dom_document_import_node(
                &self.parser.document.dom_document,
                <lxb_dom_node_t *> value.node,
                <bint> True
            )
            if new_node == NULL:
                raise SelectolaxError("Can't create a new node")
            lxb_dom_node_insert_after(self.node, <lxb_dom_node_t *> new_node)
        else:
            raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)

    def insert_child(self, str_or_LexborNode value):
        """
        Insert a node inside (at the end of) the current Node.

        Parameters
        ----------
        value : str, bytes or Node
            The text or Node instance to insert inside the Node.
            When a text string is passed, it's treated as text. All HTML tags will be escaped.
            Convert and pass the ``Node`` object when you want to work with HTML.
            Does not clone the ``Node`` object.
            All future changes to the passed ``Node`` object will also be taken into account.

        Examples
        --------

        >>> tree = LexborHTMLParser('<div>Get <img src=""></div>')
        >>> div = tree.css_first('div')
        >>> div.insert_child('Laptop')
        >>> tree.body.child.html
        '<div>Get <img src="">Laptop</div>'

        >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"> <div>Laptop</div> </span></div>')
        >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
        >>> span_node = html_parser.css_first('span')
        >>> span_node.insert_child(html_parser2.body.child)
        '<div>Get <span alt="Laptop"> <div>Laptop</div> <div>Test</div> </span></div>'
        """
        cdef lxb_dom_node_t * new_node

        if isinstance(value, (str, bytes, unicode)):
            bytes_val = to_bytes(value)
            new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
                &self.parser.document.dom_document,
                <lxb_char_t *> bytes_val, len(bytes_val)
            )
            if new_node == NULL:
                raise SelectolaxError("Can't create a new node")
            lxb_dom_node_insert_child(self.node, new_node)
        elif isinstance(value, LexborNode):
            # NOTE(review): import_node(deep=True) presumably copies the node
            # into this document -- confirm against the docstring.
            new_node = lxb_dom_document_import_node(
                &self.parser.document.dom_document,
                <lxb_dom_node_t *> value.node,
                <bint> True
            )
            if new_node == NULL:
                raise SelectolaxError("Can't create a new node")
            lxb_dom_node_insert_child(self.node, <lxb_dom_node_t *> new_node)
        else:
            raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)

    @property
    def raw_value(self):
        """Return the raw (unparsed, original) value of a node.

        Currently, works on text nodes only.

        This backend always raises ``SelectolaxError``; the example below
        describes the intended behaviour.

        Returns
        -------

        raw_value : bytes

        Examples
        --------

        >>> html_parser = LexborHTMLParser('<div>&#x3C;test&#x3E;</div>')
        >>> selector = html_parser.css_first('div')
        >>> selector.child.html
        '&lt;test&gt;'
        >>> selector.child.raw_value
        b'&#x3C;test&#x3E;'
        """
        raise SelectolaxError("This features is not supported by the lexbor backend. Please use Modest backend.")

    def scripts_contain(self, str query):
        """Returns True if any of the script tags contain specified text.

        Caches script tags on the first call to improve performance.

        Parameters
        ----------
        query : str
            The query to check.

        """
        # NOTE(review): the cache lives on the parser but is filled from the
        # subtree rooted at this node, so the first caller decides what every
        # later call sees -- confirm this is intended.
        if self.parser.cached_script_texts is None:
            nodes = self.parser.selector.find('script', self)
            text_nodes = []
            for node in nodes:
                node_text = node.text(deep=True)
                if node_text:
                    text_nodes.append(node_text)
            self.parser.cached_script_texts = text_nodes

        for text in self.parser.cached_script_texts:
            if query in text:
                return True
        return False

    def script_srcs_contain(self, tuple queries):
        """Returns True if any of the script SRCs attributes contain one of the specified text.

        Caches values on the first call to improve performance.

        Parameters
        ----------
        queries : tuple of str

        """
        # NOTE(review): same parser-level caching caveat as ``scripts_contain``.
        if self.parser.cached_script_srcs is None:
            nodes = self.parser.selector.find('script', self)
            src_nodes = []
            for node in nodes:
                node_src = node.attrs.get('src')
                if node_src:
                    src_nodes.append(node_src)
            self.parser.cached_script_srcs = src_nodes

        for text in self.parser.cached_script_srcs:
            for query in queries:
                if query in text:
                    return True
        return False

    def remove(self, bool recursive=True):
        """An alias for the decompose method."""
        self.decompose(recursive)

    def select(self, query=None):
        """Select nodes given a CSS selector.

        Works similarly to the ``css`` method, but supports chained filtering and extra features.

        Parameters
        ----------
        query : str or None
            The CSS selector to use when searching for nodes.

        Returns
        -------
        selector : The `Selector` class.
        """
        return LexborSelector(self, query)

    def __eq__(self, other):
        # Nodes compare by their serialized HTML; a plain string is compared
        # against this node's HTML directly.
        if isinstance(other, str):
            return self.html == other
        if not isinstance(other, LexborNode):
            return False
        return self.html == other.html

    @property
    def text_content(self):
        """Returns the text of the node if it is a text node.

        Returns None for other nodes.
        Unlike the ``text`` method, does not include child nodes.

        Returns
        -------
        text : str or None.
        """
        cdef unsigned char * text

        # Check the pointer before touching any of the node's fields.
        if self.node == NULL or self.node.type != LXB_DOM_NODE_TYPE_TEXT:
            return None
        text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> self.node).data)
        if text == NULL:
            return ""
        return text.decode(_ENCODING)
831
@cython.final
cdef class TextContainer:
    """Accumulates text fragments and joins them with a separator.

    Parameters
    ----------
    separator : str, default ''
        Appended after every fragment; a single trailing separator is removed
        from the value returned by ``text``.
    strip : bool, default False
        If True, ``strip()`` is called on every fragment before it is stored.
    """
    cdef str _text
    cdef public str separator
    cdef public bool strip

    def __init__(self, str separator = '', bool strip = False):
        self._text = ""
        self.separator = separator
        self.strip = strip

    def append(self, node_text):
        """Add one text fragment followed by the separator."""
        if self.strip:
            node_text = node_text.strip()
        self._text += node_text + self.separator

    @property
    def text(self):
        """Return the accumulated text without the trailing separator.

        Reading the property does not modify the stored text, so repeated reads
        return the same value and fragments appended later are still separated
        from the earlier ones.
        """
        if self.separator and self._text.endswith(self.separator):
            return self._text[:-len(self.separator)]
        return self._text
852
+
853
+
854
cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
    """Walker callback that appends the text of text nodes to a container.

    ``ctx`` is cast back to a Python object and its ``append`` method is called
    with the decoded text. Nodes whose tag id is not ``LXB_TAG__TEXT`` and text
    nodes without data are skipped. Always returns ``LEXBOR_ACTION_OK``.
    """
    cdef unsigned char *raw_text
    cdef object container

    if lxb_dom_node_tag_id_noi(node) != LXB_TAG__TEXT:
        return LEXBOR_ACTION_OK

    raw_text = <unsigned char*> lexbor_str_data_noi(&(<lxb_dom_text_t *> node).char_data.data)
    if not raw_text:
        return LEXBOR_ACTION_OK

    container = <object> ctx
    container.append(raw_text.decode(_ENCODING))
    return LEXBOR_ACTION_OK