makiri 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/release.yml +12 -7
  3. data/CHANGELOG.md +93 -14
  4. data/README.md +173 -7
  5. data/Rakefile +103 -7
  6. data/ext/makiri/bridge/bridge.h +28 -0
  7. data/ext/makiri/bridge/ruby_string.c +217 -0
  8. data/ext/makiri/core/mkr_alloc.h +1 -1
  9. data/ext/makiri/core/mkr_buf.c +35 -1
  10. data/ext/makiri/core/mkr_buf.h +37 -3
  11. data/ext/makiri/core/mkr_core.h +1 -1
  12. data/ext/makiri/core/mkr_hash.h +1 -1
  13. data/ext/makiri/core/mkr_text.h +8 -8
  14. data/ext/makiri/extconf.rb +20 -2
  15. data/ext/makiri/glue/glue.h +47 -11
  16. data/ext/makiri/glue/ruby_doc.c +117 -43
  17. data/ext/makiri/glue/ruby_html_css.c +246 -0
  18. data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +242 -51
  19. data/ext/makiri/glue/ruby_html_node.c +888 -0
  20. data/ext/makiri/glue/ruby_html_serialize.c +154 -0
  21. data/ext/makiri/glue/ruby_node.c +54 -748
  22. data/ext/makiri/glue/ruby_node_set.c +167 -32
  23. data/ext/makiri/glue/ruby_xml.c +420 -0
  24. data/ext/makiri/glue/ruby_xml_node.c +1386 -0
  25. data/ext/makiri/glue/ruby_xpath.c +59 -26
  26. data/ext/makiri/glue/ruby_xpath.h +19 -0
  27. data/ext/makiri/lexbor_compat/compat.h +42 -9
  28. data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
  29. data/ext/makiri/lexbor_compat/dom_index.c +2 -2
  30. data/ext/makiri/lexbor_compat/post_parse.c +100 -10
  31. data/ext/makiri/lexbor_compat/source_loc.c +13 -9
  32. data/ext/makiri/lexbor_compat/text_index.c +14 -8
  33. data/ext/makiri/lexbor_compat/utf8_input.c +85 -26
  34. data/ext/makiri/makiri.c +139 -6
  35. data/ext/makiri/makiri.h +43 -2
  36. data/ext/makiri/xml/mkr_xml.h +126 -0
  37. data/ext/makiri/xml/mkr_xml_chars.c +225 -0
  38. data/ext/makiri/xml/mkr_xml_mutate.c +875 -0
  39. data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
  40. data/ext/makiri/xml/mkr_xml_node.c +267 -0
  41. data/ext/makiri/xml/mkr_xml_node.h +119 -0
  42. data/ext/makiri/xml/mkr_xml_tree.c +1479 -0
  43. data/ext/makiri/xpath/mkr_xpath.c +59 -32
  44. data/ext/makiri/xpath/mkr_xpath.h +96 -4
  45. data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
  46. data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
  47. data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +202 -175
  48. data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +110 -86
  49. data/ext/makiri/xpath/mkr_xpath_internal.h +91 -200
  50. data/ext/makiri/xpath/mkr_xpath_lex.c +2 -2
  51. data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
  52. data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +142 -0
  53. data/ext/makiri/xpath/mkr_xpath_parse.c +5 -5
  54. data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
  55. data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
  56. data/ext/makiri/xpath/mkr_xpath_shared.c +593 -0
  57. data/ext/makiri/xpath/{mkr_xpath_value.c → mkr_xpath_value_body.h} +145 -656
  58. data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
  59. data/lib/makiri/{attribute.rb → attr.rb} +7 -3
  60. data/lib/makiri/cdata_section.rb +21 -0
  61. data/lib/makiri/comment.rb +12 -0
  62. data/lib/makiri/compat_aliases.rb +30 -0
  63. data/lib/makiri/document.rb +4 -76
  64. data/lib/makiri/document_fragment.rb +14 -9
  65. data/lib/makiri/element.rb +5 -3
  66. data/lib/makiri/html/document.rb +106 -0
  67. data/lib/makiri/html/node_methods.rb +19 -0
  68. data/lib/makiri/html.rb +12 -0
  69. data/lib/makiri/node.rb +58 -15
  70. data/lib/makiri/node_set.rb +8 -0
  71. data/lib/makiri/processing_instruction.rb +12 -0
  72. data/lib/makiri/text.rb +2 -0
  73. data/lib/makiri/version.rb +1 -1
  74. data/lib/makiri/xml/document.rb +24 -0
  75. data/lib/makiri/xml/node_methods.rb +37 -0
  76. data/lib/makiri/xml.rb +10 -0
  77. data/lib/makiri/xpath_context.rb +1 -1
  78. data/lib/makiri.rb +23 -5
  79. data/script/build_native_gem.rb +2 -2
  80. data/script/check_c_safety.rb +32 -0
  81. data/script/check_c_safety_allowlist.yml +83 -0
  82. metadata +35 -9
  83. data/ext/makiri/glue/ruby_css.c +0 -185
  84. data/ext/makiri/glue/ruby_serialize.c +0 -92
  85. data/lib/makiri/cdata.rb +0 -6
@@ -1,9 +1,17 @@
1
+ /* ruby_node.c - the shared, representation-neutral node core.
2
+ *
3
+ * HTML (Lexbor) nodes and XML (custom-arena) nodes are two representations of the
4
+ * same Ruby-facing Node abstraction. This file owns what is common to BOTH: the
5
+ * TypedData types that distinguish the two wrappers (so a representation-specific
6
+ * accessor rejects the wrong kind via Ruby's own type machinery), the shared GC
7
+ * functions, and the kind-agnostic accessors used for identity and document
8
+ * lookup. The HTML node implementation (wrap/unwrap + reader methods) lives in
9
+ * ruby_html_node.c, the XML one in ruby_xml_node.c. */
1
10
  #include "glue.h"
2
-
3
- #include <lexbor/ns/ns.h> /* lxb_ns_by_id, LXB_NS__UNDEF (namespaceURI) */
11
+ #include "../xml/mkr_xml_node.h" /* mkr_xml_doc_t::doc_node, for the kind-aware mkr_node_raw */
4
12
 
5
13
  /* ------------------------------------------------------------------ */
6
- /* Node wrapper type */
14
+ /* GC + TypedData types */
7
15
  /* ------------------------------------------------------------------ */
8
16
 
9
17
  static void
@@ -16,7 +24,7 @@ mkr_node_gc_mark(void *ptr)
16
24
  static void
17
25
  mkr_node_gc_free(void *ptr)
18
26
  {
19
- /* The lxb_dom_node_t is owned by the document arena; never freed here. */
27
+ /* The node is owned by the document arena (HTML or XML); never freed here. */
20
28
  xfree(ptr);
21
29
  }
22
30
 
@@ -27,61 +35,65 @@ mkr_node_memsize(const void *ptr)
27
35
  return sizeof(mkr_node_data_t);
28
36
  }
29
37
 
38
+ /* HTML and XML nodes share the mkr_node_data_t layout (node pointer + keepalive
39
+ * Document) and the same GC functions, but are wrapped under DISTINCT TypedData
40
+ * types so the representation is checked by Ruby's own type machinery: an HTML
41
+ * accessor (mkr_html_node_unwrap, via mkr_html_node_type) raises TypeError when
42
+ * handed an XML node and vice versa - it is structurally impossible to read one
43
+ * representation's pointer as the other's. mkr_node_type is the shared base (both
44
+ * derive from it via .parent), so the kind-agnostic identity accessors accept
45
+ * either. This is the single source of HTML/XML node-pointer safety; there is no
46
+ * ambiguous "return an lxb_dom_node_t for any node" unwrap. */
30
47
  const rb_data_type_t mkr_node_type = {
31
48
  "Makiri::Node",
32
49
  { mkr_node_gc_mark, mkr_node_gc_free, mkr_node_memsize, },
33
50
  0, 0, RUBY_TYPED_FREE_IMMEDIATELY,
34
51
  };
52
+ const rb_data_type_t mkr_html_node_type = {
53
+ "Makiri::HTML::Node",
54
+ { mkr_node_gc_mark, mkr_node_gc_free, mkr_node_memsize, },
55
+ &mkr_node_type, 0, RUBY_TYPED_FREE_IMMEDIATELY,
56
+ };
57
+ const rb_data_type_t mkr_xml_node_type = {
58
+ "Makiri::XML::Node",
59
+ { mkr_node_gc_mark, mkr_node_gc_free, mkr_node_memsize, },
60
+ &mkr_node_type, 0, RUBY_TYPED_FREE_IMMEDIATELY,
61
+ };
35
62
 
36
63
  /* ------------------------------------------------------------------ */
37
- /* wrap / unwrap */
64
+ /* kind-agnostic accessors (identity / document) */
38
65
  /* ------------------------------------------------------------------ */
39
66
 
40
- VALUE
41
- mkr_wrap_node(lxb_dom_node_t *node, VALUE document)
42
- {
43
- if (node == NULL) {
44
- return Qnil;
45
- }
46
-
47
- /* The document node maps back onto the Ruby Document object. */
48
- if (node->type == LXB_DOM_NODE_TYPE_DOCUMENT) {
49
- return document;
50
- }
51
-
52
- VALUE klass;
53
- switch (node->type) {
54
- case LXB_DOM_NODE_TYPE_ELEMENT: klass = mkr_cElement; break;
55
- case LXB_DOM_NODE_TYPE_ATTRIBUTE: klass = mkr_cAttribute; break;
56
- case LXB_DOM_NODE_TYPE_TEXT: klass = mkr_cText; break;
57
- case LXB_DOM_NODE_TYPE_COMMENT: klass = mkr_cComment; break;
58
- case LXB_DOM_NODE_TYPE_CDATA_SECTION: klass = mkr_cCData; break;
59
- case LXB_DOM_NODE_TYPE_PROCESSING_INSTRUCTION:
60
- klass = mkr_cProcessingInstruction; break;
61
- case LXB_DOM_NODE_TYPE_DOCUMENT_TYPE: klass = mkr_cDocumentType; break;
62
- case LXB_DOM_NODE_TYPE_DOCUMENT_FRAGMENT:
63
- klass = mkr_cDocumentFragment; break;
64
- default: klass = mkr_cNode; break;
65
- }
66
-
67
- mkr_node_data_t *nd;
68
- VALUE obj = TypedData_Make_Struct(klass, mkr_node_data_t, &mkr_node_type, nd);
69
- nd->node = node;
70
- nd->document = document;
71
- return obj;
72
- }
73
-
74
- lxb_dom_node_t *
75
- mkr_node_unwrap(VALUE rb_node)
67
+ /* The kind-AGNOSTIC raw node pointer (base mkr_node_type, accepts HTML or XML),
68
+ * as an opaque void* - dereferencing it requires an explicit cast, so it cannot be
69
+ * mistaken for a typed pointer. Only for the few sites where the representation is
70
+ * either irrelevant (identity comparison) or already guaranteed by an external
71
+ * same-document/kind check (the XPath context node). The Document branch is
72
+ * kind-aware (XML Document -> its arena document node, HTML -> the Lexbor one). */
73
+ void *
74
+ mkr_node_raw(VALUE rb_node)
76
75
  {
77
76
  if (rb_obj_is_kind_of(rb_node, mkr_cDocument)) {
78
- return (lxb_dom_node_t *)mkr_doc_unwrap(rb_node);
77
+ mkr_parsed_t *parsed = mkr_doc_parsed(rb_node);
78
+ if (mkr_parsed_kind(parsed) == MKR_DOC_XML) {
79
+ mkr_xml_doc_t *xdoc = mkr_parsed_xml_doc(parsed);
80
+ return xdoc ? (void *)xdoc->doc_node : NULL;
81
+ }
82
+ return (void *)mkr_html_doc_unwrap(rb_node);
79
83
  }
80
84
  mkr_node_data_t *nd;
81
85
  TypedData_Get_Struct(rb_node, mkr_node_data_t, &mkr_node_type, nd);
82
86
  return nd->node;
83
87
  }
84
88
 
89
+ /* Node identity as an integer, for #==/#eql?/#hash/#pointer_id - kind-agnostic and
90
+ * never dereferenced. */
91
+ uintptr_t
92
+ mkr_node_id(VALUE rb_node)
93
+ {
94
+ return (uintptr_t)mkr_node_raw(rb_node);
95
+ }
96
+
85
97
  VALUE
86
98
  mkr_node_document(VALUE rb_node)
87
99
  {
@@ -92,709 +104,3 @@ mkr_node_document(VALUE rb_node)
92
104
  TypedData_Get_Struct(rb_node, mkr_node_data_t, &mkr_node_type, nd);
93
105
  return nd->document;
94
106
  }
95
-
96
- /* ------------------------------------------------------------------ */
97
- /* name / type / content */
98
- /* ------------------------------------------------------------------ */
99
-
100
- /*
101
- * Node name. Matches Nokogiri: lowercase tag name for HTML elements
102
- * (Lexbor lowercases during tokenization), and the un-prefixed DOM names
103
- * "text"/"comment"/"#cdata-section"/"document" for the other kinds.
104
- */
105
- static VALUE
106
- mkr_node_name(VALUE self)
107
- {
108
- lxb_dom_node_t *node = mkr_node_unwrap(self);
109
- size_t len = 0;
110
- const lxb_char_t *name;
111
-
112
- switch (node->type) {
113
- case LXB_DOM_NODE_TYPE_ELEMENT:
114
- name = lxb_dom_element_qualified_name(lxb_dom_interface_element(node), &len);
115
- return mkr_ruby_str_from_borrowed(mkr_borrowed_text((const char *)name, len));
116
- case LXB_DOM_NODE_TYPE_ATTRIBUTE:
117
- name = lxb_dom_attr_qualified_name(lxb_dom_interface_attr(node), &len);
118
- return mkr_ruby_str_from_borrowed(mkr_borrowed_text((const char *)name, len));
119
- case LXB_DOM_NODE_TYPE_TEXT:
120
- return rb_utf8_str_new_cstr("text");
121
- case LXB_DOM_NODE_TYPE_COMMENT:
122
- return rb_utf8_str_new_cstr("comment");
123
- case LXB_DOM_NODE_TYPE_CDATA_SECTION:
124
- return rb_utf8_str_new_cstr("#cdata-section");
125
- case LXB_DOM_NODE_TYPE_DOCUMENT:
126
- return rb_utf8_str_new_cstr("document");
127
- default:
128
- name = lxb_dom_node_name(node, &len);
129
- return mkr_ruby_str_from_borrowed(mkr_borrowed_text((const char *)name, len));
130
- }
131
- }
132
-
133
- /* ------------------------------------------------------------------ */
134
- /* namespace (WHATWG DOM Element/Attr: namespaceURI/prefix/localName) */
135
- /* ------------------------------------------------------------------ */
136
-
137
- /*
138
- * Local name (DOM `localName`): the name without any prefix — "div" for
139
- * <div>, "path" for an SVG <path>, "href" for an xlink:href attribute.
140
- * Defined on Element and Attribute only; nil for the other node kinds (the DOM
141
- * gives a Text/Comment/Document no localName).
142
- */
143
- static VALUE
144
- mkr_node_local_name(VALUE self)
145
- {
146
- lxb_dom_node_t *node = mkr_node_unwrap(self);
147
- size_t len = 0;
148
- const lxb_char_t *name;
149
-
150
- switch (node->type) {
151
- case LXB_DOM_NODE_TYPE_ELEMENT:
152
- name = lxb_dom_element_local_name(lxb_dom_interface_element(node), &len);
153
- break;
154
- case LXB_DOM_NODE_TYPE_ATTRIBUTE:
155
- name = lxb_dom_attr_local_name(lxb_dom_interface_attr(node), &len);
156
- break;
157
- default:
158
- return Qnil;
159
- }
160
- return mkr_ruby_str_from_borrowed(mkr_borrowed_text((const char *)name, len));
161
- }
162
-
163
- /*
164
- * Namespace prefix (DOM `prefix`): nil unless the qualified name is
165
- * `prefix:local` — typically nil for HTML5-parsed content. Derived from the
166
- * qualified-vs-local length (qualified == prefix ":" local), so a colon inside
167
- * a local name can't be mistaken for a separator. Element/Attribute only.
168
- */
169
- static VALUE
170
- mkr_node_prefix(VALUE self)
171
- {
172
- lxb_dom_node_t *node = mkr_node_unwrap(self);
173
- const lxb_char_t *q = NULL;
174
- size_t qlen = 0, llen = 0;
175
-
176
- switch (node->type) {
177
- case LXB_DOM_NODE_TYPE_ELEMENT: {
178
- lxb_dom_element_t *el = lxb_dom_interface_element(node);
179
- q = lxb_dom_element_qualified_name(el, &qlen);
180
- (void) lxb_dom_element_local_name(el, &llen);
181
- break;
182
- }
183
- case LXB_DOM_NODE_TYPE_ATTRIBUTE: {
184
- lxb_dom_attr_t *at = lxb_dom_interface_attr(node);
185
- q = lxb_dom_attr_qualified_name(at, &qlen);
186
- (void) lxb_dom_attr_local_name(at, &llen);
187
- break;
188
- }
189
- default:
190
- return Qnil;
191
- }
192
- if (q == NULL || qlen <= llen + 1) { /* no "prefix:" segment */
193
- return Qnil;
194
- }
195
- return mkr_ruby_str_from_borrowed(
196
- mkr_borrowed_text((const char *)q, qlen - llen - 1));
197
- }
198
-
199
- /*
200
- * The fixed namespaces the HTML parser assigns to foreign-content attributes by
201
- * prefix (the "adjust foreign attributes" step). Lexbor tags an attribute node
202
- * with its *element's* ns rather than the attribute's own, so an attribute's
203
- * namespaceURI is resolved from its prefix here, not from node->ns. Returns
204
- * NULL (=> DOM null) for any other prefix.
205
- */
206
- static const char *
207
- mkr_attr_ns_for_prefix(const char *p, size_t n)
208
- {
209
- if (n == 5 && memcmp(p, "xlink", 5) == 0) return "http://www.w3.org/1999/xlink";
210
- if (n == 3 && memcmp(p, "xml", 3) == 0) return "http://www.w3.org/XML/1998/namespace";
211
- if (n == 5 && memcmp(p, "xmlns", 5) == 0) return "http://www.w3.org/2000/xmlns/";
212
- return NULL;
213
- }
214
-
215
- /*
216
- * Namespace URI (DOM `namespaceURI`).
217
- *
218
- * Element: resolved from node->ns, so — DOM-faithfully — an HTML element is in
219
- * the XHTML namespace ("http://www.w3.org/1999/xhtml"), not nil (an HTML
220
- * element is never namespaceless; this is what browsers' DOM and `namespace-uri()`
221
- * return). SVG/MathML elements get their own URI; nil only when truly
222
- * unnamespaced (LXB_NS__UNDEF).
223
- *
224
- * Attribute: nil for an unprefixed attribute (class, id, ...); for a prefixed
225
- * one, the parser-assigned foreign-content namespace keyed on the prefix
226
- * (xlink/xml/xmlns), else nil.
227
- *
228
- * Other node kinds: nil.
229
- */
230
- static VALUE
231
- mkr_node_namespace_uri(VALUE self)
232
- {
233
- lxb_dom_node_t *node = mkr_node_unwrap(self);
234
-
235
- if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
236
- if (node->ns == LXB_NS__UNDEF) {
237
- return Qnil;
238
- }
239
- lxb_dom_document_t *doc = node->owner_document;
240
- if (doc == NULL || doc->ns == NULL) {
241
- return Qnil;
242
- }
243
- size_t len = 0;
244
- const lxb_char_t *uri = lxb_ns_by_id(doc->ns, node->ns, &len);
245
- if (uri == NULL || len == 0) {
246
- return Qnil;
247
- }
248
- return mkr_ruby_str_from_borrowed(mkr_borrowed_text((const char *)uri, len));
249
- }
250
-
251
- if (node->type == LXB_DOM_NODE_TYPE_ATTRIBUTE) {
252
- lxb_dom_attr_t *at = lxb_dom_interface_attr(node);
253
- size_t qlen = 0, llen = 0;
254
- const lxb_char_t *q = lxb_dom_attr_qualified_name(at, &qlen);
255
- (void) lxb_dom_attr_local_name(at, &llen);
256
- if (q == NULL || qlen <= llen + 1) {
257
- return Qnil; /* unprefixed attribute => no namespace */
258
- }
259
- const char *uri = mkr_attr_ns_for_prefix((const char *)q, qlen - llen - 1);
260
- return uri ? rb_utf8_str_new_cstr(uri) : Qnil;
261
- }
262
-
263
- return Qnil;
264
- }
265
-
266
- /*
267
- * Element#tag_name (DOM `tagName`): the qualified name, uppercased for an HTML
268
- * element in an HTML document ("DIV"), as the DOM specifies — unlike #name,
269
- * which is the lowercase qualified name. SVG/MathML elements keep their case.
270
- * nil for non-element nodes.
271
- */
272
- static VALUE
273
- mkr_node_tag_name(VALUE self)
274
- {
275
- lxb_dom_node_t *node = mkr_node_unwrap(self);
276
- if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
277
- return Qnil;
278
- }
279
- size_t len = 0;
280
- const lxb_char_t *name =
281
- lxb_dom_element_tag_name(lxb_dom_interface_element(node), &len);
282
- if (name == NULL) {
283
- return Qnil;
284
- }
285
- return mkr_ruby_str_from_borrowed(mkr_borrowed_text((const char *)name, len));
286
- }
287
-
288
- /*
289
- * ProcessingInstruction#target (DOM `target`): the PI's target name
290
- * (the "xml" in <?xml ...?>). nil for non-PI nodes. The PI's data is read via
291
- * #content / #text like any character-data node.
292
- */
293
- static VALUE
294
- mkr_node_pi_target(VALUE self)
295
- {
296
- lxb_dom_node_t *node = mkr_node_unwrap(self);
297
- if (node->type != LXB_DOM_NODE_TYPE_PROCESSING_INSTRUCTION) {
298
- return Qnil;
299
- }
300
- size_t len = 0;
301
- const lxb_char_t *t = lxb_dom_processing_instruction_target(
302
- lxb_dom_interface_processing_instruction(node), &len);
303
- return mkr_ruby_str_from_borrowed(mkr_borrowed_text((const char *)t, len));
304
- }
305
-
306
- /* Numeric DOM node type (LXB_DOM_NODE_TYPE_*). */
307
- static VALUE
308
- mkr_node_get_type(VALUE self)
309
- {
310
- return INT2NUM((int)mkr_node_unwrap(self)->type);
311
- }
312
-
313
- /*
314
- * DocumentType public / system identifiers (WHATWG DOM `publicId`/`systemId`).
315
- * Returns the String, or nil when the doctype carries no such identifier.
316
- * Lexbor represents a missing id inconsistently (NULL after `SYSTEM`, but an
317
- * empty string for a bare `<!DOCTYPE html>`), so we treat empty as absent and
318
- * return nil for both — matching Nokogiri (which also reports nil for an empty
319
- * or missing id). Defined only on Makiri::DocumentType, so the receiver is
320
- * always a doctype node; the guard is belt-and-suspenders.
321
- */
322
- static VALUE
323
- mkr_doctype_id(VALUE self, int system)
324
- {
325
- lxb_dom_node_t *node = mkr_node_unwrap(self);
326
- if (node->type != LXB_DOM_NODE_TYPE_DOCUMENT_TYPE) {
327
- return Qnil;
328
- }
329
- lxb_dom_document_type_t *dt = lxb_dom_interface_document_type(node);
330
- size_t len = 0;
331
- const lxb_char_t *id = system ? lxb_dom_document_type_system_id(dt, &len)
332
- : lxb_dom_document_type_public_id(dt, &len);
333
- return (id == NULL || len == 0)
334
- ? Qnil
335
- : mkr_ruby_str_from_borrowed(mkr_borrowed_text((const char *)id, len));
336
- }
337
-
338
- static VALUE
339
- mkr_doctype_public_id(VALUE self)
340
- {
341
- return mkr_doctype_id(self, 0);
342
- }
343
-
344
- static VALUE
345
- mkr_doctype_system_id(VALUE self)
346
- {
347
- return mkr_doctype_id(self, 1);
348
- }
349
-
350
- /*
351
- * A <template> element's "template contents" — the separate DocumentFragment
352
- * the HTML parser fills instead of making the parsed nodes children of the
353
- * <template> (WHATWG DOM `HTMLTemplateElement.content`; browsers behave the
354
- * same: template.children is empty, template.content holds the nodes). Lexbor
355
- * stores it on the template interface; we surface it as a Makiri::DocumentFragment
356
- * so it can be traversed/queried (`tpl.content_fragment.css("p")`).
357
- *
358
- * Returns nil for any node that is not an HTML <template>. Note: CSS/XPath over
359
- * the *template element itself* deliberately do NOT descend into the content
360
- * (matching the DOM, and unavoidable for CSS since it runs Lexbor's selector
361
- * engine over the real tree) — query the fragment instead.
362
- */
363
- static VALUE
364
- mkr_node_content_fragment(VALUE self)
365
- {
366
- lxb_dom_node_t *node = mkr_node_unwrap(self);
367
- if (node->type != LXB_DOM_NODE_TYPE_ELEMENT
368
- || node->local_name != LXB_TAG_TEMPLATE
369
- || node->ns != LXB_NS_HTML) {
370
- return Qnil;
371
- }
372
- lxb_dom_document_fragment_t *content = lxb_html_interface_template(node)->content;
373
- if (content == NULL) {
374
- return Qnil;
375
- }
376
- return mkr_wrap_node((lxb_dom_node_t *)content, mkr_node_document(self));
377
- }
378
-
379
- /* Concatenated text content of this node and its descendants. The DOM spec
380
- * makes a Document's textContent null; we instead return the text of the root
381
- * element (matching the intuitive, Nokogiri-like Document#text). */
382
- static VALUE
383
- mkr_node_content(VALUE self)
384
- {
385
- lxb_dom_node_t *node = mkr_node_unwrap(self);
386
- if (node->type == LXB_DOM_NODE_TYPE_DOCUMENT) {
387
- node = lxb_dom_document_root((lxb_dom_document_t *)node);
388
- if (node == NULL) {
389
- return rb_utf8_str_new("", 0);
390
- }
391
- }
392
-
393
- /* Fast path for elements / fragments (the common case, incl. document text).
394
- *
395
- * Preferred: the per-document text index (lexbor_compat/text_index.c) maps
396
- * this node to the contiguous, document-order run of its descendants' text
397
- * slices, so we serve a single pre-sized memcpy run with no per-extraction
398
- * tree walk — the walk is otherwise the dominant, cache-bound cost. Built
399
- * lazily on first use and dropped on any mutation, so a slice can never
400
- * point at reallocated/detached storage.
401
- *
402
- * Fallback (index unavailable — node outside the indexed tree, e.g. a
403
- * fragment, or a build OOM): stream each descendant text/CDATA node's data
404
- * straight into the Ruby string via an iterative pre-order walk (stack-safe;
405
- * skips Lexbor's intermediate arena buffer + copy). */
406
- if (node->type == LXB_DOM_NODE_TYPE_ELEMENT
407
- || node->type == LXB_DOM_NODE_TYPE_DOCUMENT_FRAGMENT) {
408
- mkr_parsed_t *parsed = mkr_doc_parsed(mkr_node_document(self));
409
- const mkr_borrowed_text_t *slices;
410
- size_t nslices, total;
411
- if (parsed != NULL
412
- && mkr_parsed_text_slices(parsed, node, &slices, &nslices, &total)) {
413
- return mkr_ruby_str_from_slices(slices, nslices, total);
414
- }
415
-
416
- VALUE str = rb_utf8_str_new(NULL, 0);
417
- for (lxb_dom_node_t *n = node->first_child; n != NULL;) {
418
- if (n->type == LXB_DOM_NODE_TYPE_TEXT
419
- || n->type == LXB_DOM_NODE_TYPE_CDATA_SECTION) {
420
- const lexbor_str_t *d = &lxb_dom_interface_character_data(n)->data;
421
- if (d->data != NULL && d->length != 0) {
422
- rb_str_cat(str, (const char *)d->data, (long)d->length);
423
- }
424
- }
425
- if (n->first_child != NULL) { n = n->first_child; continue; }
426
- while (n != node && n->next == NULL) { n = n->parent; }
427
- if (n == node) { break; }
428
- n = n->next;
429
- }
430
- return str;
431
- }
432
-
433
- /* Character-data and other node kinds keep the general (proven) path. */
434
- size_t len = 0;
435
- lxb_char_t *text = lxb_dom_node_text_content(node, &len);
436
- if (text == NULL) {
437
- return rb_utf8_str_new("", 0);
438
- }
439
- VALUE str = rb_utf8_str_new((const char *)text, len);
440
- lxb_dom_document_destroy_text(node->owner_document, text);
441
- return str;
442
- }
443
-
444
- /* ------------------------------------------------------------------ */
445
- /* tree navigation */
446
- /* ------------------------------------------------------------------ */
447
-
448
- static VALUE
449
- mkr_node_get_document(VALUE self)
450
- {
451
- return mkr_node_document(self);
452
- }
453
-
454
- static VALUE
455
- mkr_node_parent(VALUE self)
456
- {
457
- lxb_dom_node_t *node = mkr_node_unwrap(self);
458
- VALUE document = mkr_node_document(self);
459
-
460
- /* Lexbor never links an attribute back to its element, so node->parent is
461
- * NULL for attributes. Resolve via the compat attr->owner index. */
462
- if (node->type == LXB_DOM_NODE_TYPE_ATTRIBUTE) {
463
- lxb_dom_node_t *owner =
464
- mkr_parsed_attr_owner(mkr_doc_parsed(document),
465
- lxb_dom_interface_attr(node));
466
- return mkr_wrap_node(owner, document);
467
- }
468
-
469
- return mkr_wrap_node(node->parent, document);
470
- }
471
-
472
- static VALUE
473
- mkr_node_next(VALUE self)
474
- {
475
- lxb_dom_node_t *node = mkr_node_unwrap(self);
476
- return mkr_wrap_node(node->next, mkr_node_document(self));
477
- }
478
-
479
- static VALUE
480
- mkr_node_previous(VALUE self)
481
- {
482
- lxb_dom_node_t *node = mkr_node_unwrap(self);
483
- return mkr_wrap_node(node->prev, mkr_node_document(self));
484
- }
485
-
486
- static VALUE
487
- mkr_node_next_element(VALUE self)
488
- {
489
- lxb_dom_node_t *node = mkr_node_unwrap(self)->next;
490
- while (node != NULL && node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
491
- node = node->next;
492
- }
493
- return mkr_wrap_node(node, mkr_node_document(self));
494
- }
495
-
496
- static VALUE
497
- mkr_node_previous_element(VALUE self)
498
- {
499
- lxb_dom_node_t *node = mkr_node_unwrap(self)->prev;
500
- while (node != NULL && node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
501
- node = node->prev;
502
- }
503
- return mkr_wrap_node(node, mkr_node_document(self));
504
- }
505
-
506
- /* First child node (any type), or nil. */
507
- static VALUE
508
- mkr_node_child(VALUE self)
509
- {
510
- lxb_dom_node_t *node = mkr_node_unwrap(self);
511
- return mkr_wrap_node(node->first_child, mkr_node_document(self));
512
- }
513
-
514
- /* All child nodes as a NodeSet. */
515
- static VALUE
516
- mkr_node_children(VALUE self)
517
- {
518
- lxb_dom_node_t *node = mkr_node_unwrap(self);
519
- VALUE document = mkr_node_document(self);
520
- VALUE set = mkr_node_set_new(document);
521
- for (lxb_dom_node_t *c = node->first_child; c != NULL; c = c->next) {
522
- mkr_node_set_push(set, c);
523
- }
524
- return set;
525
- }
526
-
527
- /* Child elements only, as a NodeSet. */
528
- static VALUE
529
- mkr_node_element_children(VALUE self)
530
- {
531
- lxb_dom_node_t *node = mkr_node_unwrap(self);
532
- VALUE document = mkr_node_document(self);
533
- VALUE set = mkr_node_set_new(document);
534
- for (lxb_dom_node_t *c = node->first_child; c != NULL; c = c->next) {
535
- if (c->type == LXB_DOM_NODE_TYPE_ELEMENT) {
536
- mkr_node_set_push(set, c);
537
- }
538
- }
539
- return set;
540
- }
541
-
542
- /* Ancestor elements, nearest first (parent, grandparent, ... root). */
543
- static VALUE
544
- mkr_node_ancestors(VALUE self)
545
- {
546
- lxb_dom_node_t *node = mkr_node_unwrap(self);
547
- VALUE document = mkr_node_document(self);
548
- VALUE set = mkr_node_set_new(document);
549
- for (lxb_dom_node_t *p = node->parent; p != NULL; p = p->parent) {
550
- if (p->type == LXB_DOM_NODE_TYPE_ELEMENT) {
551
- mkr_node_set_push(set, p);
552
- }
553
- }
554
- return set;
555
- }
556
-
557
- static VALUE
558
- mkr_node_first_element_child(VALUE self)
559
- {
560
- lxb_dom_node_t *c = mkr_node_unwrap(self)->first_child;
561
- while (c != NULL && c->type != LXB_DOM_NODE_TYPE_ELEMENT) {
562
- c = c->next;
563
- }
564
- return mkr_wrap_node(c, mkr_node_document(self));
565
- }
566
-
567
- static VALUE
568
- mkr_node_last_element_child(VALUE self)
569
- {
570
- lxb_dom_node_t *c = mkr_node_unwrap(self)->last_child;
571
- while (c != NULL && c->type != LXB_DOM_NODE_TYPE_ELEMENT) {
572
- c = c->prev;
573
- }
574
- return mkr_wrap_node(c, mkr_node_document(self));
575
- }
576
-
577
- /* ------------------------------------------------------------------ */
578
- /* attributes (read-only) */
579
- /* ------------------------------------------------------------------ */
580
-
581
- /* node[name] -> String or nil (nil when not an element or absent). */
582
- static VALUE
583
- mkr_node_aref(VALUE self, VALUE rb_name)
584
- {
585
- lxb_dom_node_t *node = mkr_node_unwrap(self);
586
- if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
587
- return Qnil;
588
- }
589
-
590
- mkr_ruby_borrowed_text_t nv = mkr_ruby_verified_text(rb_name, "attribute name");
591
- const lxb_char_t *nm = (const lxb_char_t *)nv.ptr;
592
- size_t nlen = nv.len;
593
-
594
- lxb_dom_element_t *el = lxb_dom_interface_element(node);
595
- if (!lxb_dom_element_has_attribute(el, nm, nlen)) {
596
- return Qnil;
597
- }
598
-
599
- size_t vlen = 0;
600
- const lxb_char_t *val = lxb_dom_element_get_attribute(el, nm, nlen, &vlen);
601
- RB_GC_GUARD(nv.value);
602
- return mkr_ruby_str_from_borrowed(mkr_borrowed_text((const char *)val, vlen));
603
- }
604
-
605
- /* node.key?(name) -> true/false */
606
- static VALUE
607
- mkr_node_has_key(VALUE self, VALUE rb_name)
608
- {
609
- lxb_dom_node_t *node = mkr_node_unwrap(self);
610
- if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
611
- return Qfalse;
612
- }
613
- mkr_ruby_borrowed_text_t nv = mkr_ruby_verified_text(rb_name, "attribute name");
614
- lxb_dom_element_t *el = lxb_dom_interface_element(node);
615
- bool has = lxb_dom_element_has_attribute(el, (const lxb_char_t *)nv.ptr, nv.len);
616
- RB_GC_GUARD(nv.value);
617
- return has ? Qtrue : Qfalse;
618
- }
619
-
620
- /* node.keys -> [String, ...] of attribute names (document order). */
621
- static VALUE
622
- mkr_node_keys(VALUE self)
623
- {
624
- lxb_dom_node_t *node = mkr_node_unwrap(self);
625
- VALUE ary = rb_ary_new();
626
- if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
627
- return ary;
628
- }
629
- lxb_dom_attr_t *attr =
630
- lxb_dom_element_first_attribute(lxb_dom_interface_element(node));
631
- while (attr != NULL) {
632
- size_t len = 0;
633
- const lxb_char_t *name = lxb_dom_attr_qualified_name(attr, &len);
634
- rb_ary_push(ary, mkr_ruby_str_from_borrowed(
635
- mkr_borrowed_text((const char *)name, len)));
636
- attr = lxb_dom_element_next_attribute(attr);
637
- }
638
- return ary;
639
- }
640
-
641
- /* node.values -> [String, ...] of attribute values (document order). */
642
- static VALUE
643
- mkr_node_values(VALUE self)
644
- {
645
- lxb_dom_node_t *node = mkr_node_unwrap(self);
646
- VALUE ary = rb_ary_new();
647
- if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
648
- return ary;
649
- }
650
- lxb_dom_attr_t *attr =
651
- lxb_dom_element_first_attribute(lxb_dom_interface_element(node));
652
- while (attr != NULL) {
653
- size_t len = 0;
654
- const lxb_char_t *val = lxb_dom_attr_value(attr, &len);
655
- rb_ary_push(ary, mkr_ruby_str_from_borrowed(
656
- mkr_borrowed_text((const char *)val, len)));
657
- attr = lxb_dom_element_next_attribute(attr);
658
- }
659
- return ary;
660
- }
661
-
662
- /* element.attribute_nodes -> NodeSet of Attribute nodes (document order).
663
- * Empty for non-elements. These wrap the bare lxb_dom_attr_t; navigating back
664
- * with Attribute#parent goes through the compat attr->owner index. */
665
- static VALUE
666
- mkr_node_attribute_nodes(VALUE self)
667
- {
668
- lxb_dom_node_t *node = mkr_node_unwrap(self);
669
- VALUE document = mkr_node_document(self);
670
- VALUE set = mkr_node_set_new(document);
671
- if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
672
- return set;
673
- }
674
- lxb_dom_attr_t *attr =
675
- lxb_dom_element_first_attribute(lxb_dom_interface_element(node));
676
- while (attr != NULL) {
677
- mkr_node_set_push(set, lxb_dom_interface_node(attr));
678
- attr = lxb_dom_element_next_attribute(attr);
679
- }
680
- return set;
681
- }
682
-
683
- /* attr.value -> the attribute's value String. For non-attribute nodes, falls
684
- * back to text content (matching the loose Nokogiri-ish meaning of #value). */
685
- static VALUE
686
- mkr_node_value(VALUE self)
687
- {
688
- lxb_dom_node_t *node = mkr_node_unwrap(self);
689
- if (node->type == LXB_DOM_NODE_TYPE_ATTRIBUTE) {
690
- size_t len = 0;
691
- const lxb_char_t *val =
692
- lxb_dom_attr_value(lxb_dom_interface_attr(node), &len);
693
- return mkr_ruby_str_from_borrowed(mkr_borrowed_text((const char *)val, len));
694
- }
695
- return mkr_node_content(self);
696
- }
697
-
698
- /* node.line -> 1-based source line, or nil when unknown.
699
- *
700
- * The line comes from the byte offset stamped onto the node at parse time
701
- * (source-location tracking) resolved against the document's line table.
702
- * Returns nil for nodes the tracker could not place (e.g. parser-inserted
703
- * implicit <html>/<head>/<body>, or any node when tracking was disabled). */
704
- static VALUE
705
- mkr_node_line(VALUE self)
706
- {
707
- lxb_dom_node_t *node = mkr_node_unwrap(self);
708
- mkr_parsed_t *p = mkr_doc_parsed(mkr_node_document(self));
709
- size_t line = mkr_parsed_node_line(p, node);
710
- return line == 0 ? Qnil : ULONG2NUM(line);
711
- }
712
-
713
- /* ------------------------------------------------------------------ */
714
- /* identity */
715
- /* ------------------------------------------------------------------ */
716
-
717
- /* Pointer identity: equal iff both wrap the same lxb_dom_node_t. */
718
- static VALUE
719
- mkr_node_equals(VALUE self, VALUE other)
720
- {
721
- if (!rb_obj_is_kind_of(other, mkr_cNode)) {
722
- return Qfalse;
723
- }
724
- return mkr_node_unwrap(self) == mkr_node_unwrap(other) ? Qtrue : Qfalse;
725
- }
726
-
727
- /* Nokogiri-compatible identity: the underlying lxb_dom_node_t pointer as an
728
- * Integer. Stable for the node's lifetime and unique among currently-live
729
- * nodes; a freed-then-reallocated node may reuse an address (same caveat as
730
- * Nokogiri::XML::Node#pointer_id). a.pointer_id == b.pointer_id iff a.eql?(b). */
731
- static VALUE
732
- mkr_node_pointer_id(VALUE self)
733
- {
734
- lxb_dom_node_t *node = mkr_node_unwrap(self);
735
- return ULL2NUM((unsigned long long)(uintptr_t)node);
736
- }
737
-
738
- /* Stable hash derived from the node pointer, so a == b implies a.hash ==
739
- * b.hash even across separately-created wrappers. Shares the pointer value
740
- * with #pointer_id. */
741
- static VALUE
742
- mkr_node_hash(VALUE self)
743
- {
744
- return mkr_node_pointer_id(self);
745
- }
746
-
747
- void
748
- mkr_init_node(void)
749
- {
750
- rb_define_method(mkr_cNode, "name", mkr_node_name, 0);
751
- rb_define_method(mkr_cNode, "namespace_uri", mkr_node_namespace_uri, 0);
752
- rb_define_method(mkr_cNode, "prefix", mkr_node_prefix, 0);
753
- rb_define_method(mkr_cNode, "local_name", mkr_node_local_name, 0);
754
- rb_define_method(mkr_cNode, "tag_name", mkr_node_tag_name, 0);
755
- rb_define_method(mkr_cNode, "target", mkr_node_pi_target, 0);
756
- rb_define_method(mkr_cNode, "node_type", mkr_node_get_type, 0);
757
- rb_define_method(mkr_cNode, "content", mkr_node_content, 0);
758
- rb_define_method(mkr_cNode, "text", mkr_node_content, 0);
759
- rb_define_method(mkr_cNode, "inner_text", mkr_node_content, 0);
760
-
761
- rb_define_method(mkr_cNode, "document", mkr_node_get_document, 0);
762
- rb_define_method(mkr_cNode, "parent", mkr_node_parent, 0);
763
- rb_define_method(mkr_cNode, "next", mkr_node_next, 0);
764
- rb_define_method(mkr_cNode, "next_sibling", mkr_node_next, 0);
765
- rb_define_method(mkr_cNode, "previous", mkr_node_previous, 0);
766
- rb_define_method(mkr_cNode, "previous_sibling", mkr_node_previous, 0);
767
- rb_define_method(mkr_cNode, "next_element", mkr_node_next_element, 0);
768
- rb_define_method(mkr_cNode, "previous_element", mkr_node_previous_element, 0);
769
-
770
- rb_define_method(mkr_cNode, "child", mkr_node_child, 0);
771
- rb_define_method(mkr_cNode, "children", mkr_node_children, 0);
772
- rb_define_method(mkr_cNode, "element_children", mkr_node_element_children, 0);
773
- rb_define_method(mkr_cNode, "elements", mkr_node_element_children, 0);
774
- rb_define_method(mkr_cNode, "first_element_child", mkr_node_first_element_child, 0);
775
- rb_define_method(mkr_cNode, "last_element_child", mkr_node_last_element_child, 0);
776
- rb_define_method(mkr_cNode, "ancestors", mkr_node_ancestors, 0);
777
-
778
- rb_define_method(mkr_cNode, "[]", mkr_node_aref, 1);
779
- rb_define_method(mkr_cNode, "key?", mkr_node_has_key, 1);
780
- rb_define_method(mkr_cNode, "keys", mkr_node_keys, 0);
781
- rb_define_method(mkr_cNode, "values", mkr_node_values, 0);
782
- rb_define_method(mkr_cNode, "attribute_nodes", mkr_node_attribute_nodes, 0);
783
- rb_define_method(mkr_cNode, "value", mkr_node_value, 0);
784
- rb_define_method(mkr_cNode, "line", mkr_node_line, 0);
785
-
786
- rb_define_method(mkr_cNode, "==", mkr_node_equals, 1);
787
- rb_define_method(mkr_cNode, "eql?", mkr_node_equals, 1);
788
- rb_define_method(mkr_cNode, "hash", mkr_node_hash, 0);
789
- rb_define_method(mkr_cNode, "pointer_id", mkr_node_pointer_id, 0);
790
- rb_define_method(mkr_cNode, "clone_node", mkr_node_clone_node, -1);
791
-
792
- /* DocumentType identifiers (WHATWG DOM names; external_id is the
793
- * Nokogiri-compatible alias for public_id). */
794
- rb_define_method(mkr_cDocumentType, "public_id", mkr_doctype_public_id, 0);
795
- rb_define_method(mkr_cDocumentType, "external_id", mkr_doctype_public_id, 0);
796
- rb_define_method(mkr_cDocumentType, "system_id", mkr_doctype_system_id, 0);
797
-
798
- /* <template> contents (WHATWG DOM HTMLTemplateElement.content). */
799
- rb_define_method(mkr_cElement, "content_fragment", mkr_node_content_fragment, 0);
800
- }