nokolexbor 0.4.2 → 0.5.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4d9b715c95c44d85153796fbb2bd0760cde7c83d4a58d0df026a5bd5cddbd7dc
4
- data.tar.gz: '01958cbe391c1dd9b57419ee84bc7db14c961428ec3c91c916d8a34cc36afddb'
3
+ metadata.gz: f8686e83a70fe40537a072997e55f11ae40839ccf0b66125e18a251d9ba2c265
4
+ data.tar.gz: edfedec732ab50ee7a68a56c92b0d71166fdcf091c29105b0995e64096c5ebac
5
5
  SHA512:
6
- metadata.gz: 8bbb957b15cac859b5565c57ccd0830b0ce184db5e63bf5b609b788ac4854e9620ba60e5477de1f85492357b216c4d39cded91ea2297142cef7ee529a0aa1692
7
- data.tar.gz: 9dc2200b3506a5ff3eb14ea4438b09798f6845715238193e09d76468943d08e125992ce758bd3a946ad5b5efcb6443a58fa3aae68d2d7b381410883a974d98da
6
+ metadata.gz: be9df7bc7747b0d155764069f6782e27127127c1c2ec74394b1d9ae01c56b3428811ee4aec1bf4e89fbc8bd6be61508d6dd26b93bd9929c2fdd70cb4e1b28a7d
7
+ data.tar.gz: 91d16cbe5143e480bd52f5fdbcfe7aa57ca8127cb8f4ea9f249280f027264297426c00f5c5525cf365969668d4240e80352738b86385e2a489e5f8e5d89a6e11
@@ -22,6 +22,11 @@
22
22
  extern "C" {
23
23
  #endif
24
24
 
25
+ static size_t tmp_len;
26
+ #define NODE_NAME(node) lxb_dom_node_name_qualified((node), &tmp_len)
27
+ #define NODE_NS_HREF(node) ((node)->prefix ? lxb_ns_by_id((node)->owner_document->ns, (node)->ns, &tmp_len) : NULL)
28
+ #define NODE_NS_PREFIX(node) lxb_ns_by_id((node)->owner_document->prefix, (node)->prefix, &tmp_len)
29
+
25
30
  /*
26
31
  * Some of the basic types pointer to structures:
27
32
  */
@@ -918,7 +923,7 @@ XMLPUBFUN long XMLCALL
918
923
  xmlGetLineNo (const xmlNode *node);
919
924
  #if defined(LIBXML_TREE_ENABLED) || defined(LIBXML_DEBUG_ENABLED)
920
925
  XMLPUBFUN xmlChar * XMLCALL
921
- xmlGetNodePath (const xmlNode *node);
926
+ nl_xmlGetNodePath (const lxb_dom_node_t *node);
922
927
  #endif /* defined(LIBXML_TREE_ENABLED) || defined(LIBXML_DEBUG_ENABLED) */
923
928
  XMLPUBFUN lxb_dom_node_t_ptr XMLCALL
924
929
  nl_xmlDocGetRootElement (const lxb_dom_document_t *doc);
@@ -1,9 +1,15 @@
1
1
  #include "nokolexbor.h"
2
+ #include "config.h"
2
3
 
3
4
  extern VALUE mNokolexbor;
4
5
  extern VALUE cNokolexborNode;
5
6
  VALUE cNokolexborDocument;
6
7
 
8
+ #ifdef HAVE_PTHREAD_H
9
+ #include <pthread.h>
10
+ pthread_key_t p_key_parser;
11
+ #endif
12
+
7
13
  static void
8
14
  free_nl_document(lxb_html_document_t *document)
9
15
  {
@@ -44,18 +50,27 @@ nl_document_parse(VALUE self, VALUE rb_string_or_io)
44
50
  const char *html_c = StringValuePtr(rb_html);
45
51
  size_t html_len = RSTRING_LEN(rb_html);
46
52
 
47
- lxb_html_document_t *document;
48
-
49
- document = lxb_html_document_create();
50
- if (document == NULL) {
51
- rb_raise(rb_eRuntimeError, "Error creating document");
53
+ #ifdef HAVE_PTHREAD_H
54
+ lxb_html_parser_t *g_parser = (lxb_html_parser_t *)pthread_getspecific(p_key_parser);
55
+ #else
56
+ lxb_html_parser_t *g_parser = NULL;
57
+ #endif
58
+ if (g_parser == NULL) {
59
+ g_parser = lxb_html_parser_create();
60
+ lxb_status_t status = lxb_html_parser_init(g_parser);
61
+ if (status != LXB_STATUS_OK) {
62
+ nl_raise_lexbor_error(status);
63
+ }
64
+ g_parser->tree->scripting = true;
65
+ #ifdef HAVE_PTHREAD_H
66
+ pthread_setspecific(p_key_parser, g_parser);
67
+ #endif
52
68
  }
53
69
 
54
- lxb_dom_document_scripting_set(lxb_dom_interface_document(document), true);
70
+ lxb_html_document_t *document = lxb_html_parse(g_parser, (const lxb_char_t *)html_c, html_len);
55
71
 
56
- lxb_status_t status = lxb_html_document_parse(document, (const lxb_char_t *)html_c, html_len);
57
- if (status != LXB_STATUS_OK) {
58
- nl_raise_lexbor_error(status);
72
+ if (document == NULL) {
73
+ rb_raise(rb_eRuntimeError, "Error parsing document");
59
74
  }
60
75
 
61
76
  return TypedData_Wrap_Struct(cNokolexborDocument, &nl_document_type, document);
@@ -127,8 +142,21 @@ nl_document_root(VALUE self)
127
142
  return nl_rb_node_create(lxb_dom_document_root(doc), self);
128
143
  }
129
144
 
145
+ static void
146
+ free_parser(void *data)
147
+ {
148
+ lxb_html_parser_t *g_parser = (lxb_html_parser_t *)data;
149
+ if (g_parser != NULL) {
150
+ g_parser = lxb_html_parser_destroy(g_parser);
151
+ }
152
+ }
153
+
130
154
  void Init_nl_document(void)
131
155
  {
156
+ #ifdef HAVE_PTHREAD_H
157
+ pthread_key_create(&p_key_parser, free_parser);
158
+ #endif
159
+
132
160
  cNokolexborDocument = rb_define_class_under(mNokolexbor, "Document", cNokolexborNode);
133
161
  rb_define_singleton_method(cNokolexborDocument, "new", nl_document_new, 0);
134
162
  rb_define_singleton_method(cNokolexborDocument, "parse", nl_document_parse, 1);
@@ -1,4 +1,5 @@
1
1
  #include "nokolexbor.h"
2
+ #include "libxml/tree.h"
2
3
 
3
4
  #define SORT_NAME nl_css_result
4
5
  #define SORT_TYPE lxb_dom_node_t *
@@ -871,6 +872,9 @@ nl_node_destroy(VALUE self)
871
872
  static VALUE
872
873
  nl_node_equals(VALUE self, VALUE other)
873
874
  {
875
+ if (!rb_obj_is_kind_of(other, cNokolexborNode)) {
876
+ return Qfalse;
877
+ }
874
878
  lxb_dom_node_t *node1 = nl_rb_node_unwrap(self);
875
879
  lxb_dom_node_t *node2 = nl_rb_node_unwrap(other);
876
880
  return node1 == node2 ? Qtrue : Qfalse;
@@ -1141,6 +1145,22 @@ nl_node_source_location(VALUE self)
1141
1145
  return ULONG2NUM(node->source_location);
1142
1146
  }
1143
1147
 
1148
+ /**
1149
+ * @return [String] The path associated with this Node.
1150
+ */
1151
+ static VALUE
1152
+ nl_node_path(VALUE self)
1153
+ {
1154
+ lxb_dom_node_t *node = nl_rb_node_unwrap(self);
1155
+ char* path = nl_xmlGetNodePath(node);
1156
+ if (path == NULL) {
1157
+ return Qnil;
1158
+ }
1159
+ VALUE ret = rb_utf8_str_new_cstr(path);
1160
+ nl_xmlFree(path);
1161
+ return ret;
1162
+ }
1163
+
1144
1164
  void Init_nl_node(void)
1145
1165
  {
1146
1166
  cNokolexborNode = rb_define_class_under(mNokolexbor, "Node", rb_cObject);
@@ -1186,6 +1206,7 @@ void Init_nl_node(void)
1186
1206
  rb_define_method(cNokolexborNode, "clone", nl_node_clone, 0);
1187
1207
  rb_define_method(cNokolexborNode, "inspect", nl_node_inspect, -1);
1188
1208
  rb_define_method(cNokolexborNode, "source_location", nl_node_source_location, 0);
1209
+ rb_define_method(cNokolexborNode, "path", nl_node_path, 0);
1189
1210
 
1190
1211
  rb_define_alias(cNokolexborNode, "attr", "[]");
1191
1212
  rb_define_alias(cNokolexborNode, "get_attribute", "[]");
@@ -1195,6 +1216,7 @@ void Init_nl_node(void)
1195
1216
  rb_define_alias(cNokolexborNode, "delete", "remove_attr");
1196
1217
  rb_define_alias(cNokolexborNode, "elements", "element_children");
1197
1218
  rb_define_alias(cNokolexborNode, "remove_attribute", "remove_attr");
1219
+ rb_define_alias(cNokolexborNode, "node_name", "name");
1198
1220
  rb_define_alias(cNokolexborNode, "text", "content");
1199
1221
  rb_define_alias(cNokolexborNode, "inner_text", "content");
1200
1222
  rb_define_alias(cNokolexborNode, "to_str", "content");
@@ -157,4 +157,228 @@ nl_xmlDocGetRootElement(const lxb_dom_document_t *doc) {
157
157
  void
158
158
  nl_xmlFreeNodeList(lxb_dom_node_t_ptr cur) {
159
159
  // Should never be called
160
+ }
161
+
162
+ /**
163
+ * nl_xmlGetNodePath:
164
+ * @node: a node
165
+ *
166
+ * Build a structure-based path for the given node
167
+ *
168
+ * Returns the new path or NULL in case of error. The caller must free
169
+ * the returned string with nl_xmlFree()
170
+ */
171
+ xmlChar *
172
+ nl_xmlGetNodePath(const lxb_dom_node_t *node)
173
+ {
174
+ const lxb_dom_node_t *cur, *tmp, *next;
175
+ xmlChar *buffer = NULL, *temp;
176
+ size_t buf_len;
177
+ xmlChar *buf;
178
+ const char *sep;
179
+ const char *name;
180
+ char nametemp[100];
181
+ int occur = 0, generic;
182
+
183
+ if ((node == NULL) || (node->type == XML_NAMESPACE_DECL))
184
+ return (NULL);
185
+
186
+ buf_len = 500;
187
+ buffer = (xmlChar *) nl_xmlMallocAtomic(buf_len);
188
+ if (buffer == NULL) {
189
+ xmlTreeErrMemory("getting node path");
190
+ return (NULL);
191
+ }
192
+ buf = (xmlChar *) nl_xmlMallocAtomic(buf_len);
193
+ if (buf == NULL) {
194
+ xmlTreeErrMemory("getting node path");
195
+ nl_xmlFree(buffer);
196
+ return (NULL);
197
+ }
198
+
199
+ buffer[0] = 0;
200
+ cur = node;
201
+ do {
202
+ name = "";
203
+ sep = "?";
204
+ occur = 0;
205
+ const lxb_char_t* cur_name = NODE_NAME(cur);
206
+ const lxb_char_t* cur_ns_prefix = NODE_NS_PREFIX(cur);
207
+ if ((cur->type == LXB_DOM_NODE_TYPE_DOCUMENT) ||
208
+ (cur->type == XML_HTML_DOCUMENT_NODE)) {
209
+ if (buffer[0] == '/')
210
+ break;
211
+ sep = "/";
212
+ next = NULL;
213
+ } else if (cur->type == LXB_DOM_NODE_TYPE_ELEMENT) {
214
+ generic = 0;
215
+ sep = "/";
216
+ name = (const char *) cur_name;
217
+ next = cur->parent;
218
+
219
+ /*
220
+ * Thumbler index computation
221
+ * TODO: the occurrence test seems bogus for namespaced names
222
+ */
223
+ tmp = cur->prev;
224
+ while (tmp != NULL) {
225
+ if ((tmp->type == LXB_DOM_NODE_TYPE_ELEMENT) &&
226
+ (generic ||
227
+ (nl_xmlStrEqual(cur_name, NODE_NAME(tmp)) &&
228
+ ((tmp->ns == cur->ns) ||
229
+ ((tmp->ns != NULL) && (cur->ns != NULL) &&
230
+ (nl_xmlStrEqual(cur_ns_prefix, NODE_NS_PREFIX(tmp))))))))
231
+ occur++;
232
+ tmp = tmp->prev;
233
+ }
234
+ if (occur == 0) {
235
+ tmp = cur->next;
236
+ while (tmp != NULL && occur == 0) {
237
+ if ((tmp->type == LXB_DOM_NODE_TYPE_ELEMENT) &&
238
+ (generic ||
239
+ (nl_xmlStrEqual(cur_name, NODE_NAME(tmp)) &&
240
+ ((tmp->ns == cur->ns) ||
241
+ ((tmp->ns != NULL) && (cur->ns != NULL) &&
242
+ (nl_xmlStrEqual(cur_ns_prefix, NODE_NS_PREFIX(tmp))))))))
243
+ occur++;
244
+ tmp = tmp->next;
245
+ }
246
+ if (occur != 0)
247
+ occur = 1;
248
+ } else
249
+ occur++;
250
+ } else if (cur->type == LXB_DOM_NODE_TYPE_COMMENT) {
251
+ sep = "/";
252
+ name = "comment()";
253
+ next = cur->parent;
254
+
255
+ /*
256
+ * Thumbler index computation
257
+ */
258
+ tmp = cur->prev;
259
+ while (tmp != NULL) {
260
+ if (tmp->type == LXB_DOM_NODE_TYPE_COMMENT)
261
+ occur++;
262
+ tmp = tmp->prev;
263
+ }
264
+ if (occur == 0) {
265
+ tmp = cur->next;
266
+ while (tmp != NULL && occur == 0) {
267
+ if (tmp->type == LXB_DOM_NODE_TYPE_COMMENT)
268
+ occur++;
269
+ tmp = tmp->next;
270
+ }
271
+ if (occur != 0)
272
+ occur = 1;
273
+ } else
274
+ occur++;
275
+ } else if ((cur->type == LXB_DOM_NODE_TYPE_TEXT) ||
276
+ (cur->type == LXB_DOM_NODE_TYPE_CDATA_SECTION)) {
277
+ sep = "/";
278
+ name = "text()";
279
+ next = cur->parent;
280
+
281
+ /*
282
+ * Thumbler index computation
283
+ */
284
+ tmp = cur->prev;
285
+ while (tmp != NULL) {
286
+ if ((tmp->type == LXB_DOM_NODE_TYPE_TEXT) ||
287
+ (tmp->type == LXB_DOM_NODE_TYPE_CDATA_SECTION))
288
+ occur++;
289
+ tmp = tmp->prev;
290
+ }
291
+ /*
292
+ * Evaluate if this is the only text- or CDATA-section-node;
293
+ * if yes, then we'll get "text()", otherwise "text()[1]".
294
+ */
295
+ if (occur == 0) {
296
+ tmp = cur->next;
297
+ while (tmp != NULL) {
298
+ if ((tmp->type == LXB_DOM_NODE_TYPE_TEXT) ||
299
+ (tmp->type == LXB_DOM_NODE_TYPE_CDATA_SECTION))
300
+ {
301
+ occur = 1;
302
+ break;
303
+ }
304
+ tmp = tmp->next;
305
+ }
306
+ } else
307
+ occur++;
308
+ } else if (cur->type == LXB_DOM_NODE_TYPE_PROCESSING_INSTRUCTION) {
309
+ sep = "/";
310
+ snprintf(nametemp, sizeof(nametemp) - 1,
311
+ "processing-instruction('%s')", (char *)cur_name);
312
+ nametemp[sizeof(nametemp) - 1] = 0;
313
+ name = nametemp;
314
+
315
+ next = cur->parent;
316
+
317
+ /*
318
+ * Thumbler index computation
319
+ */
320
+ tmp = cur->prev;
321
+ while (tmp != NULL) {
322
+ if ((tmp->type == LXB_DOM_NODE_TYPE_PROCESSING_INSTRUCTION) &&
323
+ (nl_xmlStrEqual(cur_name, NODE_NAME(tmp))))
324
+ occur++;
325
+ tmp = tmp->prev;
326
+ }
327
+ if (occur == 0) {
328
+ tmp = cur->next;
329
+ while (tmp != NULL && occur == 0) {
330
+ if ((tmp->type == LXB_DOM_NODE_TYPE_PROCESSING_INSTRUCTION) &&
331
+ (nl_xmlStrEqual(cur_name, NODE_NAME(tmp))))
332
+ occur++;
333
+ tmp = tmp->next;
334
+ }
335
+ if (occur != 0)
336
+ occur = 1;
337
+ } else
338
+ occur++;
339
+
340
+ } else if (cur->type == LXB_DOM_NODE_TYPE_ATTRIBUTE) {
341
+ sep = "/@";
342
+ name = (const char *) lxb_dom_attr_qualified_name(cur, &tmp_len);
343
+ next = ((lxb_dom_attr_t_ptr)cur)->owner;
344
+ } else {
345
+ nl_xmlFree(buf);
346
+ nl_xmlFree(buffer);
347
+ return (NULL);
348
+ }
349
+
350
+ /*
351
+ * Make sure there is enough room
352
+ */
353
+ if (nl_xmlStrlen(buffer) + sizeof(nametemp) + 20 > buf_len) {
354
+ buf_len =
355
+ 2 * buf_len + nl_xmlStrlen(buffer) + sizeof(nametemp) + 20;
356
+ temp = (xmlChar *) nl_xmlRealloc(buffer, buf_len);
357
+ if (temp == NULL) {
358
+ xmlTreeErrMemory("getting node path");
359
+ nl_xmlFree(buf);
360
+ nl_xmlFree(buffer);
361
+ return (NULL);
362
+ }
363
+ buffer = temp;
364
+ temp = (xmlChar *) nl_xmlRealloc(buf, buf_len);
365
+ if (temp == NULL) {
366
+ xmlTreeErrMemory("getting node path");
367
+ nl_xmlFree(buf);
368
+ nl_xmlFree(buffer);
369
+ return (NULL);
370
+ }
371
+ buf = temp;
372
+ }
373
+ if (occur == 0)
374
+ snprintf((char *) buf, buf_len, "%s%s%s",
375
+ sep, name, (char *) buffer);
376
+ else
377
+ snprintf((char *) buf, buf_len, "%s%s[%d]%s",
378
+ sep, name, occur, (char *) buffer);
379
+ snprintf((char *) buffer, buf_len, "%s", (char *)buf);
380
+ cur = next;
381
+ } while (cur != NULL);
382
+ nl_xmlFree(buf);
383
+ return (buffer);
160
384
  }
@@ -139,12 +139,6 @@
139
139
  #define XPATH_MAX_RECURSION_DEPTH 5000
140
140
  #endif
141
141
 
142
- static size_t tmp_len;
143
-
144
- #define NODE_NAME(node) lxb_dom_node_name_qualified((node), &tmp_len)
145
- #define NODE_NS_HREF(node) ((node)->prefix ? lxb_ns_by_id((node)->owner_document->ns, (node)->ns, &tmp_len) : NULL)
146
- #define NODE_NS_PREFIX(node) lxb_ns_by_id((node)->owner_document->prefix, (node)->prefix, &tmp_len)
147
-
148
142
  /*
149
143
  * TODO:
150
144
  * There are a few spots where some tests are done which depend upon ascii
@@ -51,12 +51,20 @@ module Nokolexbor
51
51
  def element?
52
52
  type == ELEMENT_NODE
53
53
  end
54
+ alias_method :elem?, :element?
54
55
 
55
56
  # @return true if this is a {Document}
56
57
  def document?
57
58
  is_a?(Nokolexbor::Document)
58
59
  end
59
60
 
61
+ # Get the path to this node as a CSS expression
62
+ def css_path
63
+ path.split(%r{/}).filter_map do |part|
64
+ part.empty? ? nil : part.gsub(/\[(\d+)\]/, ':nth-of-type(\1)')
65
+ end.join(" > ")
66
+ end
67
+
60
68
  # Get a list of ancestor Node of this Node
61
69
  #
62
70
  # @param [String, nil] selector The selector to match ancestors
@@ -285,6 +293,11 @@ module Nokolexbor
285
293
  parent_node.add_child(self)
286
294
  end
287
295
 
296
+ # @return true if this Node's attributes include <value>
297
+ def value?(value)
298
+ values.include?(value)
299
+ end
300
+
288
301
  # Iterate over each attribute name and value pair of this Node.
289
302
  #
290
303
  # @yield [String,String] The name and value of the current attribute.
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Nokolexbor
4
- VERSION = '0.4.2'
4
+ VERSION = '0.5.2'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nokolexbor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.5.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yicheng Zhou
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-04-05 00:00:00.000000000 Z
11
+ date: 2023-10-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake-compiler