selectolax 0.3.34__cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of selectolax might be problematic. Click here for more details.

@@ -0,0 +1,940 @@
1
+ cimport cython
2
+ from cpython.exc cimport PyErr_SetNone
3
+
4
# Display names for node kinds that are not regular elements.
# The `tag` property of LexborNode returns these strings when the node's tag id
# is LXB_TAG__EM_DOCTYPE, LXB_TAG__TEXT or LXB_TAG__EM_COMMENT.
# NOTE(review): the numeric keys presumably mirror those lexbor constants, and
# "- doctype" contains a space unlike the other two names — confirm both.
_TAG_TO_NAME = {
    0x0005: "- doctype",
    0x0002: "-text",
    0x0004: "-comment",
}

# Fused argument type accepted by the node-insertion methods of LexborNode
# (replace_with, insert_before, insert_after, insert_child): text given as
# str or bytes, or another LexborNode.
ctypedef fused str_or_LexborNode:
    str
    bytes
    LexborNode
13
+
14
cdef inline bytes to_bytes(str_or_LexborNode value):
    """Convert a text value to ``bytes``.

    ``str`` input is encoded as UTF-8; ``bytes`` input is returned unchanged.
    """
    cdef bytes bytes_val
    if isinstance(value, unicode):
        bytes_val = <bytes>value.encode("utf-8")
    elif isinstance(value, bytes):
        bytes_val = <bytes>value
    # NOTE(review): for the LexborNode specialization neither branch assigns
    # bytes_val. The callers visible in this file only call to_bytes() after an
    # isinstance(value, (str, bytes, unicode)) check — confirm no other caller
    # can reach this point with a node.
    return bytes_val
21
+
22
+
23
@cython.final
cdef class LexborNode:
    """A class that represents HTML node (element)."""

    @staticmethod
    cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser):
        """Wrap a raw lexbor node pointer together with the parser it is used with."""
        cdef LexborNode lxbnode = LexborNode.__new__(LexborNode)
        lxbnode.node = node
        lxbnode.parser = parser
        return lxbnode

    @property
    def mem_id(self):
        """Return the address of the underlying lexbor node as an integer."""
        return <size_t> self.node

    @property
    def child(self):
        """Alias for the `first_child` property."""
        return self.first_child

    @property
    def first_child(self):
        """Return the first child node."""
        cdef LexborNode node
        if self.node.first_child != NULL:
            node = LexborNode.new(<lxb_dom_node_t *> self.node.first_child, self.parser)
            return node
        return None

    @property
    def parent(self):
        """Return the parent node."""
        cdef LexborNode node
        if self.node.parent != NULL:
            node = LexborNode.new(<lxb_dom_node_t *> self.node.parent, self.parser)
            return node
        return None

    @property
    def next(self):
        """Return next node."""
        cdef LexborNode node
        if self.node.next != NULL:
            node = LexborNode.new(<lxb_dom_node_t *> self.node.next, self.parser)
            return node
        return None

    @property
    def prev(self):
        """Return previous node."""
        cdef LexborNode node
        if self.node.prev != NULL:
            node = LexborNode.new(<lxb_dom_node_t *> self.node.prev, self.parser)
            return node
        return None

    @property
    def last_child(self):
        """Return last child node."""
        cdef LexborNode node
        if self.node.last_child != NULL:
            node = LexborNode.new(<lxb_dom_node_t *> self.node.last_child, self.parser)
            return node
        return None

    @property
    def html(self):
        """Return HTML representation of the current node including all its child nodes.

        Returns
        -------
        text : str or None
            ``None`` when the node could not be serialized.
        """
        cdef lexbor_str_t *lxb_str
        cdef lxb_status_t status

        lxb_str = lexbor_str_create()
        if lxb_str == NULL:
            return None
        try:
            status = lxb_html_serialize_tree_str(self.node, lxb_str)
            if status == 0 and lxb_str.data:
                return lxb_str.data.decode(_ENCODING).replace('<-undef>', '')
            return None
        finally:
            # Release the buffer on every path: success, serialization failure
            # and decoding errors alike.
            lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)

    def __hash__(self):
        # NOTE(review): the hash is identity based (node address) while __eq__
        # compares serialized HTML, so two nodes that compare equal can hash
        # differently. Left unchanged because callers may rely on either.
        return self.mem_id

    def text_lexbor(self):
        """Returns the text of the node including text of all its child nodes.

        Uses builtin method from lexbor.

        Raises
        ------
        RuntimeError
            If lexbor returns no text for the node.
        """

        cdef size_t str_len = 0
        cdef lxb_char_t * text

        text = lxb_dom_node_text_content(self.node, &str_len)
        if text == NULL:
            raise RuntimeError("Can't extract text")
        try:
            if str_len == 0:
                raise RuntimeError("Can't extract text")
            return text.decode(_ENCODING)
        finally:
            # The buffer is handed back to the document on every path,
            # including the error and decode-failure paths.
            lxb_dom_document_destroy_text_noi(self.node.owner_document, text)

    def text(self, bool deep=True, str separator='', bool strip=False):
        """Returns the text of the node including text of all its child nodes.

        Parameters
        ----------
        strip : bool, default False
            If true, calls ``str.strip()`` on each text part to remove extra white spaces.
        separator : str, default ''
            The separator to use when joining text from different nodes.
        deep : bool, default True
            If True, includes text from all child nodes.

        Returns
        -------
        text : str

        """
        cdef unsigned char * text
        cdef lxb_dom_node_t* node = <lxb_dom_node_t*> self.node.first_child

        container = TextContainer(separator, strip)

        # A text node contributes its own data first, in both modes.
        if self.node.type == LXB_DOM_NODE_TYPE_TEXT:
            text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> self.node).data)
            if text != NULL:
                container.append(text.decode(_ENCODING))

        if deep:
            # text_callback appends the data of every text node it is given.
            lxb_dom_node_simple_walk(
                <lxb_dom_node_t *> self.node,
                <lxb_dom_node_simple_walker_f>text_callback,
                <void *>container
            )
            return container.text

        # Shallow mode: only direct children that are text nodes.
        while node != NULL:
            if node.type == LXB_DOM_NODE_TYPE_TEXT:
                text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> node).data)
                if text != NULL:
                    container.append(text.decode(_ENCODING))
            node = node.next
        return container.text

    def css(self, str query):
        """Evaluate CSS selector against current node and its child nodes.

        Matches pattern `query` against HTML tree.
        `CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.

        Special selectors:

         - parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
         - parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains


        Parameters
        ----------
        query : str
            CSS selector (e.g. "div > :nth-child(2n+1):not(:has(a))").

        Returns
        -------
        selector : list of `Node` objects
        """
        return self.parser.selector.find(query, self)

    def css_first(self, str query, default=None, bool strict=False):
        """Same as `css` but returns only the first match.

        Parameters
        ----------

        query : str
        default : object, default None
            Default value to return if there is no match.
        strict: bool, default False
            Set to True if you want to check if there is strictly only one match in the document.


        Returns
        -------
        selector : `LexborNode` object

        Raises
        ------
        ValueError
            If ``strict`` is True and more than one node matches.
        """
        # TODO: This can be improved.
        results = self.css(query)
        n_results = len(results)
        if n_results > 0:
            if strict and n_results > 1:
                raise ValueError("Expected 1 match, but found %s matches" % n_results)
            return results[0]
        return default

    def any_css_matches(self, tuple selectors):
        """Returns True if any of CSS selectors matches a node"""
        for selector in selectors:
            if self.parser.selector.any_matches(selector, self):
                return True
        return False

    def css_matches(self, str selector):
        """Returns True if CSS selector matches a node."""
        return self.parser.selector.any_matches(selector, self)

    def __repr__(self):
        return '<LexborNode %s>' % self.tag

    @property
    def tag_id(self):
        """Return the numeric lexbor tag id of the node."""
        cdef lxb_tag_id_t tag_id = lxb_dom_node_tag_id_noi(self.node)
        return tag_id

    @property
    def tag(self):
        """Return the name of the current tag (e.g. div, p, img).

        Doctype, text and comment nodes return the names stored in ``_TAG_TO_NAME``.

        Returns
        -------
        text : str
        """

        cdef lxb_char_t *c_text
        cdef size_t str_len = 0
        cdef lxb_tag_id_t tag_id = lxb_dom_node_tag_id_noi(self.node)

        if tag_id in (LXB_TAG__EM_DOCTYPE, LXB_TAG__TEXT, LXB_TAG__EM_COMMENT):
            return _TAG_TO_NAME[tag_id]
        c_text = lxb_dom_element_qualified_name(<lxb_dom_element_t *> self.node, &str_len)
        text = None
        if c_text:
            text = c_text.decode(_ENCODING)
        return text

    def decompose(self, bool recursive=True):
        """Remove the current node from the tree.

        Parameters
        ----------
        recursive : bool, default True
            Whenever to delete all its child nodes

        Raises
        ------
        SelectolaxError
            If the node is the document root.

        Examples
        --------

        >>> tree = LexborHTMLParser(html)
        >>> for tag in tree.css('script'):
        >>>     tag.decompose()

        """
        if self.node == <lxb_dom_node_t *> lxb_dom_document_root(&self.parser.document.dom_document):
            raise SelectolaxError("Decomposing the root node is not allowed.")

        if recursive:
            lxb_dom_node_destroy_deep(<lxb_dom_node_t *> self.node)
        else:
            lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)

    def strip_tags(self, list tags, bool recursive = False):
        """Remove specified tags from the HTML tree.

        Parameters
        ----------
        tags : list
            List of tags to remove.
        recursive : bool, default False
            Whenever to delete all its child nodes

        Examples
        --------

        >>> tree = LexborHTMLParser('<html><head></head><body><script></script><div>Hello world!</div></body></html>')
        >>> tags = ['head', 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes']
        >>> tree.strip_tags(tags)
        >>> tree.html
        '<html><body><div>Hello world!</div></body></html>'

        """
        cdef LexborNode element
        for tag in tags:
            for element in self.css(tag):
                element.decompose(recursive=recursive)

    @property
    def attributes(self):
        """Get all attributes that belong to the current node.

        The value of empty attributes is None.

        Returns
        -------
        attributes : dictionary of all attributes.

        Examples
        --------

        >>> tree = LexborHTMLParser("<div data id='my_id'></div>")
        >>> node = tree.css_first('div')
        >>> node.attributes
        {'data': None, 'id': 'my_id'}
        """
        cdef lxb_dom_attr_t *attr = lxb_dom_element_first_attribute_noi(<lxb_dom_element_t *> self.node)
        cdef size_t str_len = 0
        attributes = dict()

        while attr != NULL:
            key = lxb_dom_attr_local_name_noi(attr, &str_len)
            value = lxb_dom_attr_value_noi(attr, &str_len)

            if value:
                py_value = value.decode(_ENCODING)
            else:
                py_value = None
            attributes[key.decode(_ENCODING)] = py_value

            attr = attr.next
        return attributes

    @property
    def attrs(self):
        """A dict-like object that is similar to the ``attributes`` property, but operates directly on the Node data.

        .. warning:: Use ``attributes`` instead, if you don't want to modify Node attributes.

        Returns
        -------
        attributes : Attributes mapping object.

        Examples
        --------

        >>> tree = LexborHTMLParser("<div id='a'></div>")
        >>> node = tree.css_first('div')
        >>> node.attrs
        <div attributes, 1 items>
        >>> node.attrs['id']
        'a'
        >>> node.attrs['foo'] = 'bar'
        >>> del node.attrs['id']
        >>> node.attributes
        {'foo': 'bar'}
        >>> node.attrs['id'] = 'new_id'
        >>> node.html
        '<div foo="bar" id="new_id"></div>'
        """
        cdef LexborAttributes attributes = LexborAttributes.create(<lxb_dom_node_t *>self.node)
        return attributes

    @property
    def id(self):
        """Get the id attribute of the node.

        Returns None if the id is not set.

        Returns
        -------
        text : str
        """
        cdef char * key = 'id'
        cdef size_t str_len = 0
        cdef lxb_dom_attr_t * attr = lxb_dom_element_attr_by_name(
            <lxb_dom_element_t *> self.node,
            <lxb_char_t *> key, 2  # 2 == length of "id"
        )
        if attr != NULL:
            value = lxb_dom_attr_value_noi(attr, &str_len)
            return value.decode(_ENCODING) if value else None
        return None

    def iter(self, include_text=False):
        """Iterate over nodes on the current level.

        Parameters
        ----------
        include_text : bool
            If True, includes text nodes as well.

        Yields
        -------
        node
        """

        cdef lxb_dom_node_t *node = self.node.first_child
        cdef LexborNode next_node

        while node != NULL:
            if node.type == LXB_DOM_NODE_TYPE_TEXT and not include_text:
                node = node.next
                continue

            next_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
            yield next_node
            node = node.next

    def unwrap(self, bint delete_empty=False):
        """Replace node with whatever is inside this node.

        Parameters
        ----------
        delete_empty : bool, default False
            If True, removes empty tags.

        Examples
        --------

        >>> tree = LexborHTMLParser("<div>Hello <i>world</i>!</div>")
        >>> tree.css_first('i').unwrap()
        >>> tree.html
        '<html><head></head><body><div>Hello world!</div></body></html>'

        Note: by default, empty tags are ignored, use "delete_empty" to change this.
        """
        cdef lxb_dom_node_t* current_node
        cdef lxb_dom_node_t* next_node

        if self.node.first_child == NULL:
            if delete_empty:
                lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
            return

        # Move every child in front of this node, in document order.
        # The sibling pointer is saved before each insertion so the walk does
        # not depend on how the insertion rewrites the moved node's links.
        current_node = self.node.first_child
        while current_node != NULL:
            next_node = current_node.next
            lxb_dom_node_insert_before(self.node, current_node)
            current_node = next_node
        lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)

    def unwrap_tags(self, list tags, bint delete_empty = False):
        """Unwraps specified tags from the HTML tree.

        Works the same as the ``unwrap`` method, but applied to a list of tags.

        Parameters
        ----------
        tags : list
            List of tags to remove.
        delete_empty : bool, default False
            If True, removes empty tags.

        Examples
        --------

        >>> tree = LexborHTMLParser('<div><a href="">Hello</a> <i>world</i>!</div>')
        >>> tree.body.unwrap_tags(['i','a'])
        >>> tree.body.html
        '<body><div>Hello world!</div></body>'

        Note: by default, empty tags are ignored, use "delete_empty" to change this.
        """
        cdef LexborNode element
        for tag in tags:
            for element in self.css(tag):
                element.unwrap(delete_empty)

    def merge_text_nodes(self):
        """Iterates over all text nodes and merges all text nodes that are close to each other.

        This is useful for text extraction.
        Use it when you need to strip HTML tags and merge "dangling" text.

        Examples
        --------

        >>> tree = LexborHTMLParser("<div><p><strong>J</strong>ohn</p><p>Doe</p></div>")
        >>> node = tree.css_first('div')
        >>> tree.unwrap_tags(["strong"])
        >>> tree.text(deep=True, separator=" ", strip=True)
        "J ohn Doe" # Text extraction produces an extra space because the strong tag was removed.
        >>> node.merge_text_nodes()
        >>> tree.text(deep=True, separator=" ", strip=True)
        "John Doe"
        """
        cdef lxb_dom_node_t *node = self.node.first_child
        cdef lxb_dom_node_t *next_node
        cdef lxb_char_t *left_text
        cdef lxb_char_t *right_text
        cdef size_t left_length = 0
        cdef size_t right_length = 0

        while node != NULL:
            next_node = node.next
            if (node.type == LXB_DOM_NODE_TYPE_TEXT and node.prev != NULL
                    and node.prev.type == LXB_DOM_NODE_TYPE_TEXT):
                left_length = 0
                right_length = 0
                left_text = lxb_dom_node_text_content(node.prev, &left_length)
                right_text = lxb_dom_node_text_content(node, &right_length)
                try:
                    if left_text != NULL and right_text != NULL:
                        # `combined` is a Python-owned copy, so the lexbor
                        # buffers are no longer needed once it is built.
                        combined = (<bytes>left_text[:left_length]) + (<bytes>right_text[:right_length])
                        lxb_dom_node_text_content_set(node, combined, len(combined))
                        lxb_dom_node_remove(node.prev)
                finally:
                    # Hand the temporary buffers back to the document, as
                    # text_lexbor() does for the same lexbor call.
                    if left_text != NULL:
                        lxb_dom_document_destroy_text_noi(node.owner_document, left_text)
                    if right_text != NULL:
                        lxb_dom_document_destroy_text_noi(node.owner_document, right_text)
            if node.first_child != NULL:
                LexborNode.new(node, self.parser).merge_text_nodes()
            node = next_node

    def traverse(self, include_text=False):
        """Iterate over all child and next nodes starting from the current level.

        Parameters
        ----------
        include_text : bool
            If True, includes text nodes as well.

        Yields
        -------
        node
        """
        cdef lxb_dom_node_t * root = self.node
        cdef lxb_dom_node_t * node = root
        cdef LexborNode lxb_node

        while node != NULL:
            if not (not include_text and node.type == LXB_DOM_NODE_TYPE_TEXT):
                lxb_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
                yield lxb_node

            if node.first_child != NULL:
                # Descend first (pre-order walk).
                node = node.first_child
            else:
                # Climb until a node with a next sibling is found, never
                # leaving the subtree rooted at `root`.
                while node != root and node.next == NULL:
                    node = node.parent
                if node == root:
                    break
                node = node.next

    def replace_with(self, str_or_LexborNode value):
        """Replace current Node with specified value.

        Parameters
        ----------
        value : str, bytes or Node
            The text or Node instance to replace the Node with.
            When a text string is passed, it's treated as text. All HTML tags will be escaped.
            Convert and pass the ``Node`` object when you want to work with HTML.
            A ``Node`` is imported into this document with
            ``lxb_dom_document_import_node`` (deep import) before insertion.
            NOTE(review): earlier wording said the node is not cloned and that later
            changes to it are reflected; the import call suggests a copy -- confirm.

        Raises
        ------
        SelectolaxError
            If the new node cannot be created.

        Examples
        --------

        >>> tree = LexborHTMLParser('<div>Get <img src="" alt="Laptop"></div>')
        >>> img = tree.css_first('img')
        >>> img.replace_with(img.attributes.get('alt', ''))
        >>> tree.body.child.html
        '<div>Get Laptop</div>'

        >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
        >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
        >>> img_node = html_parser.css_first('img')
        >>> img_node.replace_with(html_parser2.body.child)
        >>> html_parser.body.child.html
        '<div>Get <span alt="Laptop"><div>Test</div> <div></div></span></div>'
        """
        cdef lxb_dom_node_t * new_node

        if isinstance(value, (str, bytes, unicode)):
            bytes_val = to_bytes(value)
            new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
                    &self.parser.document.dom_document,
                    <lxb_char_t *> bytes_val, len(bytes_val)
            )
            if new_node == NULL:
                raise SelectolaxError("Can't create a new node")
            lxb_dom_node_insert_before(self.node, new_node)
            lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
        elif isinstance(value, LexborNode):
            new_node = lxb_dom_document_import_node(
                &self.parser.document.dom_document,
                <lxb_dom_node_t *> value.node,
                <bint> True
            )
            if new_node == NULL:
                raise SelectolaxError("Can't create a new node")
            lxb_dom_node_insert_before(self.node, <lxb_dom_node_t *> new_node)
            lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
        else:
            raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)

    def insert_before(self, str_or_LexborNode value):
        """
        Insert a node before the current Node.

        Parameters
        ----------
        value : str, bytes or Node
            The text or Node instance to insert before the Node.
            When a text string is passed, it's treated as text. All HTML tags will be escaped.
            Convert and pass the ``Node`` object when you want to work with HTML.
            A ``Node`` is imported into this document with
            ``lxb_dom_document_import_node`` (deep import) before insertion.
            NOTE(review): earlier wording said the node is not cloned -- confirm.

        Raises
        ------
        SelectolaxError
            If the new node cannot be created.

        Examples
        --------

        >>> tree = LexborHTMLParser('<div>Get <img src="" alt="Laptop"></div>')
        >>> img = tree.css_first('img')
        >>> img.insert_before(img.attributes.get('alt', ''))
        >>> tree.body.child.html
        '<div>Get Laptop<img src="" alt="Laptop"></div>'

        >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
        >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
        >>> img_node = html_parser.css_first('img')
        >>> img_node.insert_before(html_parser2.body.child)
        >>> html_parser.body.child.html
        '<div>Get <span alt="Laptop"><div>Test</div><img src="/jpg"> <div></div></span></div>'
        """
        cdef lxb_dom_node_t * new_node

        if isinstance(value, (str, bytes, unicode)):
            bytes_val = to_bytes(value)
            new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
                    &self.parser.document.dom_document,
                    <lxb_char_t *> bytes_val, len(bytes_val)
            )
            if new_node == NULL:
                raise SelectolaxError("Can't create a new node")
            lxb_dom_node_insert_before(self.node, new_node)
        elif isinstance(value, LexborNode):
            new_node = lxb_dom_document_import_node(
                &self.parser.document.dom_document,
                <lxb_dom_node_t *> value.node,
                <bint> True
            )
            if new_node == NULL:
                raise SelectolaxError("Can't create a new node")
            lxb_dom_node_insert_before(self.node, <lxb_dom_node_t *> new_node)
        else:
            raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)

    def insert_after(self, str_or_LexborNode value):
        """
        Insert a node after the current Node.

        Parameters
        ----------
        value : str, bytes or Node
            The text or Node instance to insert after the Node.
            When a text string is passed, it's treated as text. All HTML tags will be escaped.
            Convert and pass the ``Node`` object when you want to work with HTML.
            A ``Node`` is imported into this document with
            ``lxb_dom_document_import_node`` (deep import) before insertion.
            NOTE(review): earlier wording said the node is not cloned -- confirm.

        Raises
        ------
        SelectolaxError
            If the new node cannot be created.

        Examples
        --------

        >>> tree = LexborHTMLParser('<div>Get <img src="" alt="Laptop"></div>')
        >>> img = tree.css_first('img')
        >>> img.insert_after(img.attributes.get('alt', ''))
        >>> tree.body.child.html
        '<div>Get <img src="" alt="Laptop">Laptop</div>'

        >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
        >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
        >>> img_node = html_parser.css_first('img')
        >>> img_node.insert_after(html_parser2.body.child)
        >>> html_parser.body.child.html
        '<div>Get <span alt="Laptop"><img src="/jpg"><div>Test</div> <div></div></span></div>'
        """
        cdef lxb_dom_node_t * new_node

        if isinstance(value, (str, bytes, unicode)):
            bytes_val = to_bytes(value)
            new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
                    &self.parser.document.dom_document,
                    <lxb_char_t *> bytes_val, len(bytes_val)
            )
            if new_node == NULL:
                raise SelectolaxError("Can't create a new node")
            lxb_dom_node_insert_after(self.node, new_node)
        elif isinstance(value, LexborNode):
            new_node = lxb_dom_document_import_node(
                &self.parser.document.dom_document,
                <lxb_dom_node_t *> value.node,
                <bint> True
            )
            if new_node == NULL:
                raise SelectolaxError("Can't create a new node")
            lxb_dom_node_insert_after(self.node, <lxb_dom_node_t *> new_node)
        else:
            raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)

    def insert_child(self, str_or_LexborNode value):
        """
        Insert a node inside (at the end of) the current Node.

        Parameters
        ----------
        value : str, bytes or Node
            The text or Node instance to insert inside the Node.
            When a text string is passed, it's treated as text. All HTML tags will be escaped.
            Convert and pass the ``Node`` object when you want to work with HTML.
            A ``Node`` is imported into this document with
            ``lxb_dom_document_import_node`` (deep import) before insertion.
            NOTE(review): earlier wording said the node is not cloned -- confirm.

        Raises
        ------
        SelectolaxError
            If the new node cannot be created.

        Examples
        --------

        >>> tree = LexborHTMLParser('<div>Get <img src=""></div>')
        >>> div = tree.css_first('div')
        >>> div.insert_child('Laptop')
        >>> tree.body.child.html
        '<div>Get <img src="">Laptop</div>'

        >>> html_parser = LexborHTMLParser('<div>Get <span alt="Laptop"> <div>Laptop</div> </span></div>')
        >>> html_parser2 = LexborHTMLParser('<div>Test</div>')
        >>> span_node = html_parser.css_first('span')
        >>> span_node.insert_child(html_parser2.body.child)
        >>> html_parser.body.child.html
        '<div>Get <span alt="Laptop"> <div>Laptop</div> <div>Test</div> </span></div>'
        """
        cdef lxb_dom_node_t * new_node

        if isinstance(value, (str, bytes, unicode)):
            bytes_val = to_bytes(value)
            new_node = <lxb_dom_node_t *> lxb_dom_document_create_text_node(
                    &self.parser.document.dom_document,
                    <lxb_char_t *> bytes_val, len(bytes_val)
            )
            if new_node == NULL:
                raise SelectolaxError("Can't create a new node")
            lxb_dom_node_insert_child(self.node, new_node)
        elif isinstance(value, LexborNode):
            new_node = lxb_dom_document_import_node(
                &self.parser.document.dom_document,
                <lxb_dom_node_t *> value.node,
                <bint> True
            )
            if new_node == NULL:
                raise SelectolaxError("Can't create a new node")
            lxb_dom_node_insert_child(self.node, <lxb_dom_node_t *> new_node)
        else:
            raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)

    @property
    def raw_value(self):
        """Return the raw (unparsed, original) value of a node.

        Not available with the lexbor backend: accessing this property always
        raises ``NotImplementedError``. The example below shows the behaviour
        this property is meant to have.

        Returns
        -------

        raw_value : bytes

        Examples
        --------

        >>> html_parser = LexborHTMLParser('<div>&#x3C;test&#x3E;</div>')
        >>> selector = html_parser.css_first('div')
        >>> selector.child.html
        '&lt;test&gt;'
        >>> selector.child.raw_value
        b'&#x3C;test&#x3E;'
        """
        raise NotImplementedError("This features is not supported by the lexbor backend. Please use Modest backend.")

    def scripts_contain(self, str query):
        """Returns True if any of the script tags contain specified text.

        Caches script tags on the first call to improve performance.
        The cache is stored on the parser, so it is filled from whichever node
        makes the first call.

        Parameters
        ----------
        query : str
            The query to check.

        """
        cdef LexborNode node
        if self.parser.cached_script_texts is None:
            nodes = self.parser.selector.find('script', self)
            text_nodes = []
            for node in nodes:
                node_text = node.text(deep=True)
                if node_text:
                    text_nodes.append(node_text)
            self.parser.cached_script_texts = text_nodes

        for text in self.parser.cached_script_texts:
            if query in text:
                return True
        return False

    def script_srcs_contain(self, tuple queries):
        """Returns True if any of the script SRCs attributes contain one of the specified text.

        Caches values on the first call to improve performance.
        The cache is stored on the parser, so it is filled from whichever node
        makes the first call.

        Parameters
        ----------
        queries : tuple of str

        """
        cdef LexborNode node
        if self.parser.cached_script_srcs is None:
            nodes = self.parser.selector.find('script', self)
            src_nodes = []
            for node in nodes:
                node_src = node.attrs.get('src')
                if node_src:
                    src_nodes.append(node_src)
            self.parser.cached_script_srcs = src_nodes

        for text in self.parser.cached_script_srcs:
            for query in queries:
                if query in text:
                    return True
        return False

    def remove(self, bool recursive=True):
        """An alias for the decompose method."""
        self.decompose(recursive)

    def select(self, query=None):
        """Select nodes given a CSS selector.

        Works similarly to the ``css`` method, but supports chained filtering and extra features.

        Parameters
        ----------
        query : str or None
            The CSS selector to use when searching for nodes.

        Returns
        -------
        selector : The `Selector` class.
        """
        return LexborSelector(self, query)

    def __eq__(self, other):
        # Equality is structural: a node equals a string, or another node,
        # when their serialized HTML is the same.
        if isinstance(other, str):
            return self.html == other
        if not isinstance(other, LexborNode):
            return False
        return self.html == other.html

    @property
    def text_content(self):
        """Returns the text of the node if it is a text node.

        Returns None for other nodes.
        Unlike the ``text`` method, does not include child nodes.

        Returns
        -------
        text : str or None.
        """
        cdef unsigned char * text

        if self.node == NULL or self.node.type != LXB_DOM_NODE_TYPE_TEXT:
            return None

        text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> self.node).data)
        if text == NULL:
            # A text node without data has nothing to return.
            return None
        return text.decode(_ENCODING)
885
+
886
+
887
@cython.internal
@cython.final
cdef class TextContainer:
    """Accumulates text fragments and joins them with a separator.

    Fragments are kept in a list and joined on demand, so reading ``text``
    has no side effects: it can be read repeatedly, and fragments appended
    afterwards are still separated correctly.
    """
    cdef list _parts     # collected fragments, already stripped when `strip` is set
    cdef str separator   # inserted between fragments by `text`
    cdef bint strip      # whether append() strips each fragment

    @staticmethod
    cdef TextContainer new_with_defaults():
        """Create a container with an empty separator and no stripping, bypassing ``__init__``."""
        cdef TextContainer cls = TextContainer.__new__(TextContainer)
        cls._parts = []
        cls.separator = ''
        cls.strip = False
        return cls

    def __init__(self, str separator = '', bool strip = False):
        self._parts = []
        self.separator = separator
        self.strip = strip

    def append(self, str node_text):
        """Add one fragment, applying ``str.strip()`` first when stripping is enabled."""
        if self.strip:
            self._parts.append(node_text.strip())
        else:
            self._parts.append(node_text)

    @property
    def text(self):
        """Return all fragments joined by the separator, with no trailing separator."""
        return self.separator.join(self._parts)
918
+
919
+
920
cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
    """Walker callback that collects the data of text nodes.

    Passed to ``lxb_dom_node_simple_walk`` with a ``TextContainer`` as ``ctx``.
    Nodes that are not text nodes, and text nodes without data, are skipped.

    Returns ``LEXBOR_ACTION_OK`` to continue the walk, or ``LEXBOR_ACTION_STOP``
    (with a Python error set) when the text cannot be decoded.
    """
    cdef unsigned char *text
    cdef TextContainer container
    cdef lxb_tag_id_t tag_id = lxb_dom_node_tag_id_noi(node)

    if tag_id != LXB_TAG__TEXT:
        return LEXBOR_ACTION_OK

    text = <unsigned char*> lexbor_str_data_noi(&(<lxb_dom_text_t *> node).char_data.data)
    if not text:
        return LEXBOR_ACTION_OK

    try:
        py_str = text.decode(_ENCODING)
    except Exception as e:
        # PyErr_SetNone expects the exception *type*, not the instance.
        PyErr_SetNone(type(e))
        return LEXBOR_ACTION_STOP

    container = <TextContainer> ctx
    container.append(py_str)
    return LEXBOR_ACTION_OK