makiri 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/release.yml +12 -7
- data/CHANGELOG.md +93 -14
- data/README.md +173 -7
- data/Rakefile +103 -7
- data/ext/makiri/bridge/bridge.h +28 -0
- data/ext/makiri/bridge/ruby_string.c +217 -0
- data/ext/makiri/core/mkr_alloc.h +1 -1
- data/ext/makiri/core/mkr_buf.c +35 -1
- data/ext/makiri/core/mkr_buf.h +37 -3
- data/ext/makiri/core/mkr_core.h +1 -1
- data/ext/makiri/core/mkr_hash.h +1 -1
- data/ext/makiri/core/mkr_text.h +8 -8
- data/ext/makiri/extconf.rb +20 -2
- data/ext/makiri/glue/glue.h +47 -11
- data/ext/makiri/glue/ruby_doc.c +117 -43
- data/ext/makiri/glue/ruby_html_css.c +246 -0
- data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +242 -51
- data/ext/makiri/glue/ruby_html_node.c +888 -0
- data/ext/makiri/glue/ruby_html_serialize.c +154 -0
- data/ext/makiri/glue/ruby_node.c +54 -748
- data/ext/makiri/glue/ruby_node_set.c +167 -32
- data/ext/makiri/glue/ruby_xml.c +420 -0
- data/ext/makiri/glue/ruby_xml_node.c +1386 -0
- data/ext/makiri/glue/ruby_xpath.c +59 -26
- data/ext/makiri/glue/ruby_xpath.h +19 -0
- data/ext/makiri/lexbor_compat/compat.h +42 -9
- data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
- data/ext/makiri/lexbor_compat/dom_index.c +2 -2
- data/ext/makiri/lexbor_compat/post_parse.c +100 -10
- data/ext/makiri/lexbor_compat/source_loc.c +13 -9
- data/ext/makiri/lexbor_compat/text_index.c +14 -8
- data/ext/makiri/lexbor_compat/utf8_input.c +85 -26
- data/ext/makiri/makiri.c +139 -6
- data/ext/makiri/makiri.h +43 -2
- data/ext/makiri/xml/mkr_xml.h +126 -0
- data/ext/makiri/xml/mkr_xml_chars.c +225 -0
- data/ext/makiri/xml/mkr_xml_mutate.c +875 -0
- data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
- data/ext/makiri/xml/mkr_xml_node.c +267 -0
- data/ext/makiri/xml/mkr_xml_node.h +119 -0
- data/ext/makiri/xml/mkr_xml_tree.c +1479 -0
- data/ext/makiri/xpath/mkr_xpath.c +59 -32
- data/ext/makiri/xpath/mkr_xpath.h +96 -4
- data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
- data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
- data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +202 -175
- data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +110 -86
- data/ext/makiri/xpath/mkr_xpath_internal.h +91 -200
- data/ext/makiri/xpath/mkr_xpath_lex.c +2 -2
- data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
- data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +142 -0
- data/ext/makiri/xpath/mkr_xpath_parse.c +5 -5
- data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
- data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
- data/ext/makiri/xpath/mkr_xpath_shared.c +593 -0
- data/ext/makiri/xpath/{mkr_xpath_value.c → mkr_xpath_value_body.h} +145 -656
- data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
- data/lib/makiri/{attribute.rb → attr.rb} +7 -3
- data/lib/makiri/cdata_section.rb +21 -0
- data/lib/makiri/comment.rb +12 -0
- data/lib/makiri/compat_aliases.rb +30 -0
- data/lib/makiri/document.rb +4 -76
- data/lib/makiri/document_fragment.rb +14 -9
- data/lib/makiri/element.rb +5 -3
- data/lib/makiri/html/document.rb +106 -0
- data/lib/makiri/html/node_methods.rb +19 -0
- data/lib/makiri/html.rb +12 -0
- data/lib/makiri/node.rb +58 -15
- data/lib/makiri/node_set.rb +8 -0
- data/lib/makiri/processing_instruction.rb +12 -0
- data/lib/makiri/text.rb +2 -0
- data/lib/makiri/version.rb +1 -1
- data/lib/makiri/xml/document.rb +24 -0
- data/lib/makiri/xml/node_methods.rb +37 -0
- data/lib/makiri/xml.rb +10 -0
- data/lib/makiri/xpath_context.rb +1 -1
- data/lib/makiri.rb +23 -5
- data/script/build_native_gem.rb +2 -2
- data/script/check_c_safety.rb +32 -0
- data/script/check_c_safety_allowlist.yml +83 -0
- metadata +35 -9
- data/ext/makiri/glue/ruby_css.c +0 -185
- data/ext/makiri/glue/ruby_serialize.c +0 -92
- data/lib/makiri/cdata.rb +0 -6
|
@@ -0,0 +1,888 @@
|
|
|
1
|
+
/* ruby_html_node.c - the HTML (Lexbor) node representation: wrapping an
|
|
2
|
+
* lxb_dom_node_t into a Makiri::HTML::* leaf, the HTML node-pointer accessor, and
|
|
3
|
+
* all the HTML node reader methods. The XML counterpart is ruby_xml_node.c; the
|
|
4
|
+
* shared, representation-neutral node type system (the TypedData types plus the
|
|
5
|
+
* kind-agnostic mkr_node_raw / mkr_node_id / mkr_node_document accessors) lives in
|
|
6
|
+
* ruby_node.c. */
|
|
7
|
+
#include "glue.h"
|
|
8
|
+
|
|
9
|
+
#include <lexbor/ns/ns.h> /* lxb_ns_by_id, LXB_NS__UNDEF (namespaceURI) */
|
|
10
|
+
|
|
11
|
+
/* ------------------------------------------------------------------ */
|
|
12
|
+
/* wrap / unwrap */
|
|
13
|
+
/* ------------------------------------------------------------------ */
|
|
14
|
+
|
|
15
|
+
VALUE
|
|
16
|
+
mkr_wrap_html_node(lxb_dom_node_t *node, VALUE document)
|
|
17
|
+
{
|
|
18
|
+
if (node == NULL) {
|
|
19
|
+
return Qnil;
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
/* The document node maps back onto the Ruby Document object. */
|
|
23
|
+
if (node->type == LXB_DOM_NODE_TYPE_DOCUMENT) {
|
|
24
|
+
return document;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
/* An HTML (lxb_dom) node wraps to a Makiri::HTML::* leaf; the leaf carries the
|
|
28
|
+
* lxb_dom reader methods via the included mkr_mHtmlNodeMethods module. XML
|
|
29
|
+
* nodes get their own wrap path (Makiri::XML::* leaves) in step 2. An uncommon
|
|
30
|
+
* DOM node type with no specific leaf (entity/notation - Lexbor's HTML parser
|
|
31
|
+
* does not produce these) falls back to the generic Makiri::HTML::Node rather
|
|
32
|
+
* than being misclassified as an Element. */
|
|
33
|
+
VALUE klass;
|
|
34
|
+
switch (node->type) {
|
|
35
|
+
case LXB_DOM_NODE_TYPE_ELEMENT: klass = mkr_cHtmlElement; break;
|
|
36
|
+
case LXB_DOM_NODE_TYPE_ATTRIBUTE: klass = mkr_cHtmlAttr; break;
|
|
37
|
+
case LXB_DOM_NODE_TYPE_TEXT: klass = mkr_cHtmlText; break;
|
|
38
|
+
case LXB_DOM_NODE_TYPE_COMMENT: klass = mkr_cHtmlComment; break;
|
|
39
|
+
case LXB_DOM_NODE_TYPE_CDATA_SECTION: klass = mkr_cHtmlCDATASection; break;
|
|
40
|
+
case LXB_DOM_NODE_TYPE_PROCESSING_INSTRUCTION:
|
|
41
|
+
klass = mkr_cHtmlProcessingInstruction; break;
|
|
42
|
+
case LXB_DOM_NODE_TYPE_DOCUMENT_TYPE: klass = mkr_cHtmlDocumentType; break;
|
|
43
|
+
case LXB_DOM_NODE_TYPE_DOCUMENT_FRAGMENT:
|
|
44
|
+
klass = mkr_cHtmlDocumentFragment; break;
|
|
45
|
+
default: klass = mkr_cHtmlNode; break;
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
mkr_node_data_t *nd;
|
|
49
|
+
VALUE obj = TypedData_Make_Struct(klass, mkr_node_data_t, &mkr_html_node_type, nd);
|
|
50
|
+
nd->node = (mkr_raw_node_t *)node;
|
|
51
|
+
nd->document = document;
|
|
52
|
+
return obj;
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
/* The HTML node-pointer accessor: returns the lxb_dom_node_t for an HTML node or
|
|
56
|
+
* HTML Document, and RAISES TypeError for an XML node/Document (TypedData_Get_Struct
|
|
57
|
+
* checks mkr_html_node_type, which an XML node - wrapped under mkr_xml_node_type -
|
|
58
|
+
* does not satisfy). Every HTML-glue site that dereferences a node or hands its
|
|
59
|
+
* pointer to Lexbor MUST use this, for `self` and for arguments alike. */
|
|
60
|
+
lxb_dom_node_t *
|
|
61
|
+
mkr_html_node_unwrap(VALUE rb_node)
|
|
62
|
+
{
|
|
63
|
+
if (rb_obj_is_kind_of(rb_node, mkr_cDocument)) {
|
|
64
|
+
if (rb_obj_is_kind_of(rb_node, mkr_cXmlDocument)) {
|
|
65
|
+
rb_raise(rb_eTypeError, "expected an HTML node, got a Makiri::XML::Document");
|
|
66
|
+
}
|
|
67
|
+
return (lxb_dom_node_t *)mkr_html_doc_unwrap(rb_node);
|
|
68
|
+
}
|
|
69
|
+
mkr_node_data_t *nd;
|
|
70
|
+
TypedData_Get_Struct(rb_node, mkr_node_data_t, &mkr_html_node_type, nd);
|
|
71
|
+
return (lxb_dom_node_t *)nd->node;
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
/* mkr_node_raw / mkr_node_id / mkr_node_document (the kind-agnostic accessors) and
|
|
75
|
+
* the TypedData types live in ruby_node.c (the shared node core). */
|
|
76
|
+
|
|
77
|
+
/* ------------------------------------------------------------------ */
|
|
78
|
+
/* name / type / content */
|
|
79
|
+
/* ------------------------------------------------------------------ */
|
|
80
|
+
|
|
81
|
+
/*
|
|
82
|
+
* Node name. Matches Nokogiri: lowercase tag name for HTML elements
|
|
83
|
+
* (Lexbor lowercases during tokenization), and the un-prefixed DOM names
|
|
84
|
+
* "text"/"comment"/"#cdata-section"/"document" for the other kinds.
|
|
85
|
+
*/
|
|
86
|
+
static VALUE
|
|
87
|
+
mkr_node_name(VALUE self)
|
|
88
|
+
{
|
|
89
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
90
|
+
size_t len = 0;
|
|
91
|
+
const lxb_char_t *name;
|
|
92
|
+
|
|
93
|
+
switch (node->type) {
|
|
94
|
+
case LXB_DOM_NODE_TYPE_ELEMENT:
|
|
95
|
+
name = lxb_dom_element_qualified_name(lxb_dom_interface_element(node), &len);
|
|
96
|
+
return mkr_ruby_str_from_borrowed(mkr_borrowed_text((const char *)name, len));
|
|
97
|
+
case LXB_DOM_NODE_TYPE_ATTRIBUTE:
|
|
98
|
+
name = lxb_dom_attr_qualified_name(lxb_dom_interface_attr(node), &len);
|
|
99
|
+
return mkr_ruby_str_from_borrowed(mkr_borrowed_text((const char *)name, len));
|
|
100
|
+
case LXB_DOM_NODE_TYPE_TEXT:
|
|
101
|
+
return rb_utf8_str_new_cstr("text");
|
|
102
|
+
case LXB_DOM_NODE_TYPE_COMMENT:
|
|
103
|
+
return rb_utf8_str_new_cstr("comment");
|
|
104
|
+
case LXB_DOM_NODE_TYPE_CDATA_SECTION:
|
|
105
|
+
return rb_utf8_str_new_cstr("#cdata-section");
|
|
106
|
+
case LXB_DOM_NODE_TYPE_DOCUMENT:
|
|
107
|
+
return rb_utf8_str_new_cstr("document");
|
|
108
|
+
default:
|
|
109
|
+
name = lxb_dom_node_name(node, &len);
|
|
110
|
+
return mkr_ruby_str_from_borrowed(mkr_borrowed_text((const char *)name, len));
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
/* ------------------------------------------------------------------ */
|
|
115
|
+
/* namespace (WHATWG DOM Element/Attr: namespaceURI/prefix/localName) */
|
|
116
|
+
/* ------------------------------------------------------------------ */
|
|
117
|
+
|
|
118
|
+
/*
|
|
119
|
+
* Local name (DOM `localName`): the name without any prefix - "div" for
|
|
120
|
+
* <div>, "path" for an SVG <path>, "href" for an xlink:href attribute.
|
|
121
|
+
* Defined on Element and Attribute only; nil for the other node kinds (the DOM
|
|
122
|
+
* gives a Text/Comment/Document no localName).
|
|
123
|
+
*/
|
|
124
|
+
static VALUE
|
|
125
|
+
mkr_node_local_name(VALUE self)
|
|
126
|
+
{
|
|
127
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
128
|
+
size_t len = 0;
|
|
129
|
+
const lxb_char_t *name;
|
|
130
|
+
|
|
131
|
+
switch (node->type) {
|
|
132
|
+
case LXB_DOM_NODE_TYPE_ELEMENT:
|
|
133
|
+
name = lxb_dom_element_local_name(lxb_dom_interface_element(node), &len);
|
|
134
|
+
break;
|
|
135
|
+
case LXB_DOM_NODE_TYPE_ATTRIBUTE: {
|
|
136
|
+
/* The case-preserved local name is the suffix of the qualified name;
|
|
137
|
+
* Lexbor's stored local_name is lower-cased even when the qualified name
|
|
138
|
+
* keeps its case (set_attribute_ns is case-sensitive). */
|
|
139
|
+
lxb_dom_attr_t *at = lxb_dom_interface_attr(node);
|
|
140
|
+
size_t qlen = 0, llen = 0;
|
|
141
|
+
const lxb_char_t *q = lxb_dom_attr_qualified_name(at, &qlen);
|
|
142
|
+
(void) lxb_dom_attr_local_name(at, &llen);
|
|
143
|
+
if (q != NULL && qlen >= llen) {
|
|
144
|
+
name = q + (qlen - llen);
|
|
145
|
+
len = llen;
|
|
146
|
+
}
|
|
147
|
+
else {
|
|
148
|
+
name = lxb_dom_attr_local_name(at, &len);
|
|
149
|
+
}
|
|
150
|
+
break;
|
|
151
|
+
}
|
|
152
|
+
default:
|
|
153
|
+
return Qnil;
|
|
154
|
+
}
|
|
155
|
+
return mkr_ruby_str_from_borrowed(mkr_borrowed_text((const char *)name, len));
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
/*
|
|
159
|
+
* Namespace prefix (DOM `prefix`): nil unless the qualified name is
|
|
160
|
+
* `prefix:local` - typically nil for HTML5-parsed content. Derived from the
|
|
161
|
+
* qualified-vs-local length (qualified == prefix ":" local), so a colon inside
|
|
162
|
+
* a local name can't be mistaken for a separator. Element/Attribute only.
|
|
163
|
+
*/
|
|
164
|
+
static VALUE
|
|
165
|
+
mkr_node_prefix(VALUE self)
|
|
166
|
+
{
|
|
167
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
168
|
+
const lxb_char_t *q = NULL;
|
|
169
|
+
size_t qlen = 0, llen = 0;
|
|
170
|
+
|
|
171
|
+
switch (node->type) {
|
|
172
|
+
case LXB_DOM_NODE_TYPE_ELEMENT: {
|
|
173
|
+
lxb_dom_element_t *el = lxb_dom_interface_element(node);
|
|
174
|
+
q = lxb_dom_element_qualified_name(el, &qlen);
|
|
175
|
+
(void) lxb_dom_element_local_name(el, &llen);
|
|
176
|
+
break;
|
|
177
|
+
}
|
|
178
|
+
case LXB_DOM_NODE_TYPE_ATTRIBUTE: {
|
|
179
|
+
lxb_dom_attr_t *at = lxb_dom_interface_attr(node);
|
|
180
|
+
q = lxb_dom_attr_qualified_name(at, &qlen);
|
|
181
|
+
(void) lxb_dom_attr_local_name(at, &llen);
|
|
182
|
+
break;
|
|
183
|
+
}
|
|
184
|
+
default:
|
|
185
|
+
return Qnil;
|
|
186
|
+
}
|
|
187
|
+
if (q == NULL || qlen <= llen + 1) { /* no "prefix:" segment */
|
|
188
|
+
return Qnil;
|
|
189
|
+
}
|
|
190
|
+
return mkr_ruby_str_from_borrowed(
|
|
191
|
+
mkr_borrowed_text((const char *)q, qlen - llen - 1));
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
/*
|
|
195
|
+
* The fixed namespaces the HTML parser assigns to foreign-content attributes by
|
|
196
|
+
* prefix (the "adjust foreign attributes" step). Lexbor tags an attribute node
|
|
197
|
+
* with its *element's* ns rather than the attribute's own, so an attribute's
|
|
198
|
+
* namespaceURI is resolved from its prefix here, not from node->ns. Returns
|
|
199
|
+
* NULL (=> DOM null) for any other prefix.
|
|
200
|
+
*/
|
|
201
|
+
static const char *
mkr_attr_ns_for_prefix(const char *p, size_t n)
{
    /* The recognised prefixes and the namespace URI each one maps to. */
    static const struct {
        const char *prefix;
        size_t      len;
        const char *uri;
    } known[] = {
        { "xlink", 5, "http://www.w3.org/1999/xlink" },
        { "xml",   3, "http://www.w3.org/XML/1998/namespace" },
        { "xmlns", 5, "http://www.w3.org/2000/xmlns/" },
    };

    /* The match is exact and case-sensitive: the length must agree before
     * the bytes are compared, so `p` is never read when `n` matches no entry. */
    for (size_t i = 0; i < sizeof known / sizeof known[0]; i++) {
        if (n == known[i].len && memcmp(p, known[i].prefix, n) == 0) {
            return known[i].uri;
        }
    }
    return NULL;
}
|
|
209
|
+
|
|
210
|
+
/*
|
|
211
|
+
* Namespace URI (DOM `namespaceURI`).
|
|
212
|
+
*
|
|
213
|
+
* Element: resolved from node->ns, so - DOM-faithfully - an HTML element is in
|
|
214
|
+
* the XHTML namespace ("http://www.w3.org/1999/xhtml"), not nil (an HTML
|
|
215
|
+
* element is never namespaceless; this is what browsers' DOM and `namespace-uri()`
|
|
216
|
+
* return). SVG/MathML elements get their own URI; nil only when truly
|
|
217
|
+
* unnamespaced (LXB_NS__UNDEF).
|
|
218
|
+
*
|
|
219
|
+
* Attribute: nil for an unprefixed attribute (class, id, ...); for a prefixed
|
|
220
|
+
* one, the parser-assigned foreign-content namespace keyed on the prefix
|
|
221
|
+
* (xlink/xml/xmlns), else nil.
|
|
222
|
+
*
|
|
223
|
+
* Other node kinds: nil.
|
|
224
|
+
*/
|
|
225
|
+
static VALUE
|
|
226
|
+
mkr_node_namespace_uri(VALUE self)
|
|
227
|
+
{
|
|
228
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
229
|
+
|
|
230
|
+
if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
|
|
231
|
+
if (node->ns == LXB_NS__UNDEF) {
|
|
232
|
+
return Qnil;
|
|
233
|
+
}
|
|
234
|
+
lxb_dom_document_t *doc = node->owner_document;
|
|
235
|
+
if (doc == NULL || doc->ns == NULL) {
|
|
236
|
+
return Qnil;
|
|
237
|
+
}
|
|
238
|
+
size_t len = 0;
|
|
239
|
+
const lxb_char_t *uri = lxb_ns_by_id(doc->ns, node->ns, &len);
|
|
240
|
+
if (uri == NULL || len == 0) {
|
|
241
|
+
return Qnil;
|
|
242
|
+
}
|
|
243
|
+
return mkr_ruby_str_from_borrowed(mkr_borrowed_text((const char *)uri, len));
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
if (node->type == LXB_DOM_NODE_TYPE_ATTRIBUTE) {
|
|
247
|
+
lxb_dom_attr_t *at = lxb_dom_interface_attr(node);
|
|
248
|
+
|
|
249
|
+
/* An attribute set via set_attribute_ns records its OWN namespace on the
|
|
250
|
+
* attr node - distinguishable because it differs from the owner element's
|
|
251
|
+
* ns (a normally-set/parsed attr inherits the element's). Resolve it from
|
|
252
|
+
* the interned id; LXB_NS__UNDEF (set by set_attribute_ns(nil, ...)) is
|
|
253
|
+
* the null namespace. */
|
|
254
|
+
if (at->owner != NULL && node->ns != at->owner->node.ns) {
|
|
255
|
+
if (node->ns == LXB_NS__UNDEF) {
|
|
256
|
+
return Qnil;
|
|
257
|
+
}
|
|
258
|
+
lxb_dom_document_t *doc = node->owner_document;
|
|
259
|
+
if (doc != NULL && doc->ns != NULL) {
|
|
260
|
+
size_t len = 0;
|
|
261
|
+
const lxb_char_t *uri = lxb_ns_by_id(doc->ns, node->ns, &len);
|
|
262
|
+
if (uri != NULL && len != 0) {
|
|
263
|
+
return mkr_ruby_str_from_borrowed(mkr_borrowed_text((const char *)uri, len));
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
return Qnil;
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
size_t qlen = 0, llen = 0;
|
|
270
|
+
const lxb_char_t *q = lxb_dom_attr_qualified_name(at, &qlen);
|
|
271
|
+
(void) lxb_dom_attr_local_name(at, &llen);
|
|
272
|
+
if (q == NULL || qlen <= llen + 1) {
|
|
273
|
+
return Qnil; /* unprefixed attribute => no namespace */
|
|
274
|
+
}
|
|
275
|
+
const char *uri = mkr_attr_ns_for_prefix((const char *)q, qlen - llen - 1);
|
|
276
|
+
return uri ? rb_utf8_str_new_cstr(uri) : Qnil;
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
return Qnil;
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
/*
|
|
283
|
+
* Element#tag_name (DOM `tagName`): the qualified name, uppercased for an HTML
|
|
284
|
+
* element in an HTML document ("DIV"), as the DOM specifies - unlike #name,
|
|
285
|
+
* which is the lowercase qualified name. SVG/MathML elements keep their case.
|
|
286
|
+
* nil for non-element nodes.
|
|
287
|
+
*/
|
|
288
|
+
static VALUE
|
|
289
|
+
mkr_node_tag_name(VALUE self)
|
|
290
|
+
{
|
|
291
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
292
|
+
if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
|
|
293
|
+
return Qnil;
|
|
294
|
+
}
|
|
295
|
+
size_t len = 0;
|
|
296
|
+
const lxb_char_t *name =
|
|
297
|
+
lxb_dom_element_tag_name(lxb_dom_interface_element(node), &len);
|
|
298
|
+
if (name == NULL) {
|
|
299
|
+
return Qnil;
|
|
300
|
+
}
|
|
301
|
+
return mkr_ruby_str_from_borrowed(mkr_borrowed_text((const char *)name, len));
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
/*
|
|
305
|
+
* ProcessingInstruction#target (DOM `target`): the PI's target name
|
|
306
|
+
* (the "xml" in <?xml ...?>). nil for non-PI nodes. The PI's data is read via
|
|
307
|
+
* #content / #text like any character-data node.
|
|
308
|
+
*/
|
|
309
|
+
static VALUE
|
|
310
|
+
mkr_node_pi_target(VALUE self)
|
|
311
|
+
{
|
|
312
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
313
|
+
if (node->type != LXB_DOM_NODE_TYPE_PROCESSING_INSTRUCTION) {
|
|
314
|
+
return Qnil;
|
|
315
|
+
}
|
|
316
|
+
size_t len = 0;
|
|
317
|
+
const lxb_char_t *t = lxb_dom_processing_instruction_target(
|
|
318
|
+
lxb_dom_interface_processing_instruction(node), &len);
|
|
319
|
+
return mkr_ruby_str_from_borrowed(mkr_borrowed_text((const char *)t, len));
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
/* Numeric DOM node type (LXB_DOM_NODE_TYPE_*). */
|
|
323
|
+
static VALUE
|
|
324
|
+
mkr_node_get_type(VALUE self)
|
|
325
|
+
{
|
|
326
|
+
return INT2NUM((int)mkr_html_node_unwrap(self)->type);
|
|
327
|
+
}
|
|
328
|
+
|
|
329
|
+
/*
|
|
330
|
+
* DocumentType public / system identifiers (WHATWG DOM `publicId`/`systemId`).
|
|
331
|
+
* Returns the String, or nil when the doctype carries no such identifier.
|
|
332
|
+
* Lexbor represents a missing id inconsistently (NULL after `SYSTEM`, but an
|
|
333
|
+
* empty string for a bare `<!DOCTYPE html>`), so we treat empty as absent and
|
|
334
|
+
* return nil for both - matching Nokogiri (which also reports nil for an empty
|
|
335
|
+
* or missing id). Defined only on Makiri::DocumentType, so the receiver is
|
|
336
|
+
* always a doctype node; the guard is belt-and-suspenders.
|
|
337
|
+
*/
|
|
338
|
+
static VALUE
|
|
339
|
+
mkr_doctype_id(VALUE self, int system)
|
|
340
|
+
{
|
|
341
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
342
|
+
if (node->type != LXB_DOM_NODE_TYPE_DOCUMENT_TYPE) {
|
|
343
|
+
return Qnil;
|
|
344
|
+
}
|
|
345
|
+
lxb_dom_document_type_t *dt = lxb_dom_interface_document_type(node);
|
|
346
|
+
size_t len = 0;
|
|
347
|
+
const lxb_char_t *id = system ? lxb_dom_document_type_system_id(dt, &len)
|
|
348
|
+
: lxb_dom_document_type_public_id(dt, &len);
|
|
349
|
+
return (id == NULL || len == 0)
|
|
350
|
+
? Qnil
|
|
351
|
+
: mkr_ruby_str_from_borrowed(mkr_borrowed_text((const char *)id, len));
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
static VALUE
|
|
355
|
+
mkr_doctype_public_id(VALUE self)
|
|
356
|
+
{
|
|
357
|
+
return mkr_doctype_id(self, 0);
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
static VALUE
|
|
361
|
+
mkr_doctype_system_id(VALUE self)
|
|
362
|
+
{
|
|
363
|
+
return mkr_doctype_id(self, 1);
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
/*
|
|
367
|
+
* A <template> element's "template contents" - the separate DocumentFragment
|
|
368
|
+
* the HTML parser fills instead of making the parsed nodes children of the
|
|
369
|
+
* <template> (WHATWG DOM `HTMLTemplateElement.content`; browsers behave the
|
|
370
|
+
* same: template.children is empty, template.content holds the nodes). Lexbor
|
|
371
|
+
* stores it on the template interface; we surface it as a Makiri::DocumentFragment
|
|
372
|
+
* so it can be traversed/queried (`tpl.content_fragment.css("p")`).
|
|
373
|
+
*
|
|
374
|
+
* Returns nil for any node that is not an HTML <template>. Note: CSS/XPath over
|
|
375
|
+
* the *template element itself* deliberately do NOT descend into the content
|
|
376
|
+
* (matching the DOM, and unavoidable for CSS since it runs Lexbor's selector
|
|
377
|
+
* engine over the real tree) - query the fragment instead.
|
|
378
|
+
*/
|
|
379
|
+
static VALUE
|
|
380
|
+
mkr_node_content_fragment(VALUE self)
|
|
381
|
+
{
|
|
382
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
383
|
+
if (node->type != LXB_DOM_NODE_TYPE_ELEMENT
|
|
384
|
+
|| node->local_name != LXB_TAG_TEMPLATE
|
|
385
|
+
|| node->ns != LXB_NS_HTML) {
|
|
386
|
+
return Qnil;
|
|
387
|
+
}
|
|
388
|
+
lxb_dom_document_fragment_t *content = lxb_html_interface_template(node)->content;
|
|
389
|
+
if (content == NULL) {
|
|
390
|
+
return Qnil;
|
|
391
|
+
}
|
|
392
|
+
return mkr_wrap_html_node((lxb_dom_node_t *)content, mkr_node_document(self));
|
|
393
|
+
}
|
|
394
|
+
|
|
395
|
+
/* Concatenated text content of this node and its descendants. The DOM spec
|
|
396
|
+
* makes a Document's textContent null; we instead return the text of the root
|
|
397
|
+
* element (matching the intuitive, Nokogiri-like Document#text). */
|
|
398
|
+
static VALUE
|
|
399
|
+
mkr_node_content(VALUE self)
|
|
400
|
+
{
|
|
401
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
402
|
+
if (node->type == LXB_DOM_NODE_TYPE_DOCUMENT) {
|
|
403
|
+
node = lxb_dom_document_root((lxb_dom_document_t *)node);
|
|
404
|
+
if (node == NULL) {
|
|
405
|
+
return rb_utf8_str_new("", 0);
|
|
406
|
+
}
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
/* Fast path for elements / fragments (the common case, incl. document text).
|
|
410
|
+
*
|
|
411
|
+
* Preferred: the per-document text index (lexbor_compat/text_index.c) maps
|
|
412
|
+
* this node to the contiguous, document-order run of its descendants' text
|
|
413
|
+
* slices, so we serve a single pre-sized memcpy run with no per-extraction
|
|
414
|
+
* tree walk - the walk is otherwise the dominant, cache-bound cost. Built
|
|
415
|
+
* lazily on first use and dropped on any mutation, so a slice can never
|
|
416
|
+
* point at reallocated/detached storage.
|
|
417
|
+
*
|
|
418
|
+
* Fallback (index unavailable - node outside the indexed tree, e.g. a
|
|
419
|
+
* fragment, or a build OOM): stream each descendant text/CDATA node's data
|
|
420
|
+
* straight into the Ruby string via an iterative pre-order walk (stack-safe;
|
|
421
|
+
* skips Lexbor's intermediate arena buffer + copy). */
|
|
422
|
+
if (node->type == LXB_DOM_NODE_TYPE_ELEMENT
|
|
423
|
+
|| node->type == LXB_DOM_NODE_TYPE_DOCUMENT_FRAGMENT) {
|
|
424
|
+
mkr_parsed_t *parsed = mkr_doc_parsed(mkr_node_document(self));
|
|
425
|
+
const mkr_borrowed_text_t *slices;
|
|
426
|
+
size_t nslices, total;
|
|
427
|
+
if (parsed != NULL
|
|
428
|
+
&& mkr_parsed_text_slices(parsed, node, &slices, &nslices, &total)) {
|
|
429
|
+
return mkr_ruby_str_from_slices(slices, nslices, total);
|
|
430
|
+
}
|
|
431
|
+
|
|
432
|
+
VALUE str = rb_utf8_str_new(NULL, 0);
|
|
433
|
+
for (lxb_dom_node_t *n = node->first_child; n != NULL;) {
|
|
434
|
+
if (n->type == LXB_DOM_NODE_TYPE_TEXT
|
|
435
|
+
|| n->type == LXB_DOM_NODE_TYPE_CDATA_SECTION) {
|
|
436
|
+
const lexbor_str_t *d = &lxb_dom_interface_character_data(n)->data;
|
|
437
|
+
if (d->data != NULL && d->length != 0) {
|
|
438
|
+
rb_str_cat(str, (const char *)d->data, (long)d->length);
|
|
439
|
+
}
|
|
440
|
+
}
|
|
441
|
+
if (n->first_child != NULL) { n = n->first_child; continue; }
|
|
442
|
+
while (n != node && n->next == NULL) { n = n->parent; }
|
|
443
|
+
if (n == node) { break; }
|
|
444
|
+
n = n->next;
|
|
445
|
+
}
|
|
446
|
+
return str;
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
/* Character-data and other node kinds keep the general (proven) path. */
|
|
450
|
+
size_t len = 0;
|
|
451
|
+
lxb_char_t *text = lxb_dom_node_text_content(node, &len);
|
|
452
|
+
if (text == NULL) {
|
|
453
|
+
return rb_utf8_str_new("", 0);
|
|
454
|
+
}
|
|
455
|
+
VALUE str = rb_utf8_str_new((const char *)text, len);
|
|
456
|
+
lxb_dom_document_destroy_text(node->owner_document, text);
|
|
457
|
+
return str;
|
|
458
|
+
}
|
|
459
|
+
|
|
460
|
+
/* ------------------------------------------------------------------ */
|
|
461
|
+
/* tree navigation */
|
|
462
|
+
/* ------------------------------------------------------------------ */
|
|
463
|
+
|
|
464
|
+
static VALUE
|
|
465
|
+
mkr_node_get_document(VALUE self)
|
|
466
|
+
{
|
|
467
|
+
return mkr_node_document(self);
|
|
468
|
+
}
|
|
469
|
+
|
|
470
|
+
static VALUE
|
|
471
|
+
mkr_node_parent(VALUE self)
|
|
472
|
+
{
|
|
473
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
474
|
+
VALUE document = mkr_node_document(self);
|
|
475
|
+
|
|
476
|
+
/* Lexbor never links an attribute back to its element, so node->parent is
|
|
477
|
+
* NULL for attributes. Resolve via the compat attr->owner index. */
|
|
478
|
+
if (node->type == LXB_DOM_NODE_TYPE_ATTRIBUTE) {
|
|
479
|
+
lxb_dom_node_t *owner =
|
|
480
|
+
mkr_parsed_attr_owner(mkr_doc_parsed(document),
|
|
481
|
+
lxb_dom_interface_attr(node));
|
|
482
|
+
return mkr_wrap_html_node(owner, document);
|
|
483
|
+
}
|
|
484
|
+
|
|
485
|
+
return mkr_wrap_html_node(node->parent, document);
|
|
486
|
+
}
|
|
487
|
+
|
|
488
|
+
static VALUE
|
|
489
|
+
mkr_node_next(VALUE self)
|
|
490
|
+
{
|
|
491
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
492
|
+
return mkr_wrap_html_node(node->next, mkr_node_document(self));
|
|
493
|
+
}
|
|
494
|
+
|
|
495
|
+
static VALUE
|
|
496
|
+
mkr_node_previous(VALUE self)
|
|
497
|
+
{
|
|
498
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
499
|
+
return mkr_wrap_html_node(node->prev, mkr_node_document(self));
|
|
500
|
+
}
|
|
501
|
+
|
|
502
|
+
static VALUE
|
|
503
|
+
mkr_node_next_element(VALUE self)
|
|
504
|
+
{
|
|
505
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self)->next;
|
|
506
|
+
while (node != NULL && node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
|
|
507
|
+
node = node->next;
|
|
508
|
+
}
|
|
509
|
+
return mkr_wrap_html_node(node, mkr_node_document(self));
|
|
510
|
+
}
|
|
511
|
+
|
|
512
|
+
static VALUE
|
|
513
|
+
mkr_node_previous_element(VALUE self)
|
|
514
|
+
{
|
|
515
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self)->prev;
|
|
516
|
+
while (node != NULL && node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
|
|
517
|
+
node = node->prev;
|
|
518
|
+
}
|
|
519
|
+
return mkr_wrap_html_node(node, mkr_node_document(self));
|
|
520
|
+
}
|
|
521
|
+
|
|
522
|
+
/* First child node (any type), or nil. */
|
|
523
|
+
static VALUE
|
|
524
|
+
mkr_node_child(VALUE self)
|
|
525
|
+
{
|
|
526
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
527
|
+
return mkr_wrap_html_node(node->first_child, mkr_node_document(self));
|
|
528
|
+
}
|
|
529
|
+
|
|
530
|
+
/* All child nodes as a NodeSet. */
|
|
531
|
+
static VALUE
|
|
532
|
+
mkr_node_children(VALUE self)
|
|
533
|
+
{
|
|
534
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
535
|
+
VALUE document = mkr_node_document(self);
|
|
536
|
+
VALUE set = mkr_node_set_new(document);
|
|
537
|
+
for (lxb_dom_node_t *c = node->first_child; c != NULL; c = c->next) {
|
|
538
|
+
mkr_node_set_push(set, (mkr_raw_node_t *)c);
|
|
539
|
+
}
|
|
540
|
+
return set;
|
|
541
|
+
}
|
|
542
|
+
|
|
543
|
+
/* Child elements only, as a NodeSet. */
|
|
544
|
+
static VALUE
|
|
545
|
+
mkr_node_element_children(VALUE self)
|
|
546
|
+
{
|
|
547
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
548
|
+
VALUE document = mkr_node_document(self);
|
|
549
|
+
VALUE set = mkr_node_set_new(document);
|
|
550
|
+
for (lxb_dom_node_t *c = node->first_child; c != NULL; c = c->next) {
|
|
551
|
+
if (c->type == LXB_DOM_NODE_TYPE_ELEMENT) {
|
|
552
|
+
mkr_node_set_push(set, (mkr_raw_node_t *)c);
|
|
553
|
+
}
|
|
554
|
+
}
|
|
555
|
+
return set;
|
|
556
|
+
}
|
|
557
|
+
|
|
558
|
+
/* Ancestor elements, nearest first (parent, grandparent, ... root). */
|
|
559
|
+
static VALUE
|
|
560
|
+
mkr_node_ancestors(VALUE self)
|
|
561
|
+
{
|
|
562
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
563
|
+
VALUE document = mkr_node_document(self);
|
|
564
|
+
VALUE set = mkr_node_set_new(document);
|
|
565
|
+
for (lxb_dom_node_t *p = node->parent; p != NULL; p = p->parent) {
|
|
566
|
+
if (p->type == LXB_DOM_NODE_TYPE_ELEMENT) {
|
|
567
|
+
mkr_node_set_push(set, (mkr_raw_node_t *)p);
|
|
568
|
+
}
|
|
569
|
+
}
|
|
570
|
+
return set;
|
|
571
|
+
}
|
|
572
|
+
|
|
573
|
+
static VALUE
|
|
574
|
+
mkr_node_first_element_child(VALUE self)
|
|
575
|
+
{
|
|
576
|
+
lxb_dom_node_t *c = mkr_html_node_unwrap(self)->first_child;
|
|
577
|
+
while (c != NULL && c->type != LXB_DOM_NODE_TYPE_ELEMENT) {
|
|
578
|
+
c = c->next;
|
|
579
|
+
}
|
|
580
|
+
return mkr_wrap_html_node(c, mkr_node_document(self));
|
|
581
|
+
}
|
|
582
|
+
|
|
583
|
+
static VALUE
|
|
584
|
+
mkr_node_last_element_child(VALUE self)
|
|
585
|
+
{
|
|
586
|
+
lxb_dom_node_t *c = mkr_html_node_unwrap(self)->last_child;
|
|
587
|
+
while (c != NULL && c->type != LXB_DOM_NODE_TYPE_ELEMENT) {
|
|
588
|
+
c = c->prev;
|
|
589
|
+
}
|
|
590
|
+
return mkr_wrap_html_node(c, mkr_node_document(self));
|
|
591
|
+
}
|
|
592
|
+
|
|
593
|
+
/* ------------------------------------------------------------------ */
|
|
594
|
+
/* attributes (read-only) */
|
|
595
|
+
/* ------------------------------------------------------------------ */
|
|
596
|
+
|
|
597
|
+
/* node[name] -> String or nil (nil when not an element or absent). */
|
|
598
|
+
static VALUE
|
|
599
|
+
mkr_node_aref(VALUE self, VALUE rb_name)
|
|
600
|
+
{
|
|
601
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
602
|
+
if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
|
|
603
|
+
return Qnil;
|
|
604
|
+
}
|
|
605
|
+
|
|
606
|
+
mkr_ruby_borrowed_text_t nv = mkr_ruby_verified_text(rb_name, "attribute name");
|
|
607
|
+
const lxb_char_t *nm = (const lxb_char_t *)nv.ptr;
|
|
608
|
+
size_t nlen = nv.len;
|
|
609
|
+
|
|
610
|
+
lxb_dom_element_t *el = lxb_dom_interface_element(node);
|
|
611
|
+
if (!lxb_dom_element_has_attribute(el, nm, nlen)) {
|
|
612
|
+
return Qnil;
|
|
613
|
+
}
|
|
614
|
+
|
|
615
|
+
size_t vlen = 0;
|
|
616
|
+
const lxb_char_t *val = lxb_dom_element_get_attribute(el, nm, nlen, &vlen);
|
|
617
|
+
RB_GC_GUARD(nv.value);
|
|
618
|
+
return mkr_ruby_str_from_borrowed(mkr_borrowed_text((const char *)val, vlen));
|
|
619
|
+
}
|
|
620
|
+
|
|
621
|
+
/* node.key?(name) -> true/false */
|
|
622
|
+
static VALUE
|
|
623
|
+
mkr_node_has_key(VALUE self, VALUE rb_name)
|
|
624
|
+
{
|
|
625
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
626
|
+
if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
|
|
627
|
+
return Qfalse;
|
|
628
|
+
}
|
|
629
|
+
mkr_ruby_borrowed_text_t nv = mkr_ruby_verified_text(rb_name, "attribute name");
|
|
630
|
+
lxb_dom_element_t *el = lxb_dom_interface_element(node);
|
|
631
|
+
bool has = lxb_dom_element_has_attribute(el, (const lxb_char_t *)nv.ptr, nv.len);
|
|
632
|
+
RB_GC_GUARD(nv.value);
|
|
633
|
+
return has ? Qtrue : Qfalse;
|
|
634
|
+
}
|
|
635
|
+
|
|
636
|
+
/* node.keys -> [String, ...] of attribute names (document order). */
|
|
637
|
+
static VALUE
|
|
638
|
+
mkr_node_keys(VALUE self)
|
|
639
|
+
{
|
|
640
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
641
|
+
VALUE ary = rb_ary_new();
|
|
642
|
+
if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
|
|
643
|
+
return ary;
|
|
644
|
+
}
|
|
645
|
+
lxb_dom_attr_t *attr =
|
|
646
|
+
lxb_dom_element_first_attribute(lxb_dom_interface_element(node));
|
|
647
|
+
while (attr != NULL) {
|
|
648
|
+
size_t len = 0;
|
|
649
|
+
const lxb_char_t *name = lxb_dom_attr_qualified_name(attr, &len);
|
|
650
|
+
rb_ary_push(ary, mkr_ruby_str_from_borrowed(
|
|
651
|
+
mkr_borrowed_text((const char *)name, len)));
|
|
652
|
+
attr = lxb_dom_element_next_attribute(attr);
|
|
653
|
+
}
|
|
654
|
+
return ary;
|
|
655
|
+
}
|
|
656
|
+
|
|
657
|
+
/* node.values -> [String, ...] of attribute values (document order). */
|
|
658
|
+
static VALUE
|
|
659
|
+
mkr_node_values(VALUE self)
|
|
660
|
+
{
|
|
661
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
662
|
+
VALUE ary = rb_ary_new();
|
|
663
|
+
if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
|
|
664
|
+
return ary;
|
|
665
|
+
}
|
|
666
|
+
lxb_dom_attr_t *attr =
|
|
667
|
+
lxb_dom_element_first_attribute(lxb_dom_interface_element(node));
|
|
668
|
+
while (attr != NULL) {
|
|
669
|
+
size_t len = 0;
|
|
670
|
+
const lxb_char_t *val = lxb_dom_attr_value(attr, &len);
|
|
671
|
+
rb_ary_push(ary, mkr_ruby_str_from_borrowed(
|
|
672
|
+
mkr_borrowed_text((const char *)val, len)));
|
|
673
|
+
attr = lxb_dom_element_next_attribute(attr);
|
|
674
|
+
}
|
|
675
|
+
return ary;
|
|
676
|
+
}
|
|
677
|
+
|
|
678
|
+
/* element.attribute_nodes -> NodeSet of Attribute nodes (document order).
|
|
679
|
+
* Empty for non-elements. These wrap the bare lxb_dom_attr_t; navigating back
|
|
680
|
+
* with Attribute#parent goes through the compat attr->owner index. */
|
|
681
|
+
static VALUE
|
|
682
|
+
mkr_node_attribute_nodes(VALUE self)
|
|
683
|
+
{
|
|
684
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
685
|
+
VALUE document = mkr_node_document(self);
|
|
686
|
+
VALUE set = mkr_node_set_new(document);
|
|
687
|
+
if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
|
|
688
|
+
return set;
|
|
689
|
+
}
|
|
690
|
+
lxb_dom_attr_t *attr =
|
|
691
|
+
lxb_dom_element_first_attribute(lxb_dom_interface_element(node));
|
|
692
|
+
while (attr != NULL) {
|
|
693
|
+
mkr_node_set_push(set, (mkr_raw_node_t *)lxb_dom_interface_node(attr));
|
|
694
|
+
attr = lxb_dom_element_next_attribute(attr);
|
|
695
|
+
}
|
|
696
|
+
return set;
|
|
697
|
+
}
|
|
698
|
+
|
|
699
|
+
/* attr.value -> the attribute's value String. For non-attribute nodes, falls
|
|
700
|
+
* back to text content (matching the loose Nokogiri-ish meaning of #value). */
|
|
701
|
+
static VALUE
|
|
702
|
+
mkr_node_value(VALUE self)
|
|
703
|
+
{
|
|
704
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
705
|
+
if (node->type == LXB_DOM_NODE_TYPE_ATTRIBUTE) {
|
|
706
|
+
size_t len = 0;
|
|
707
|
+
const lxb_char_t *val =
|
|
708
|
+
lxb_dom_attr_value(lxb_dom_interface_attr(node), &len);
|
|
709
|
+
return mkr_ruby_str_from_borrowed(mkr_borrowed_text((const char *)val, len));
|
|
710
|
+
}
|
|
711
|
+
return mkr_node_content(self);
|
|
712
|
+
}
|
|
713
|
+
|
|
714
|
+
/* node.line -> 1-based source line, or nil when unknown.
|
|
715
|
+
*
|
|
716
|
+
* The line comes from the byte offset stamped onto the node at parse time
|
|
717
|
+
* (source-location tracking) resolved against the document's line table.
|
|
718
|
+
* Returns nil for nodes the tracker could not place (e.g. parser-inserted
|
|
719
|
+
* implicit <html>/<head>/<body>, or any node when tracking was disabled). */
|
|
720
|
+
static VALUE
|
|
721
|
+
mkr_node_line(VALUE self)
|
|
722
|
+
{
|
|
723
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
724
|
+
mkr_parsed_t *p = mkr_doc_parsed(mkr_node_document(self));
|
|
725
|
+
size_t line = mkr_parsed_node_line(p, node);
|
|
726
|
+
return line == 0 ? Qnil : ULONG2NUM(line);
|
|
727
|
+
}
|
|
728
|
+
|
|
729
|
+
/* ------------------------------------------------------------------ */
|
|
730
|
+
/* identity */
|
|
731
|
+
/* ------------------------------------------------------------------ */
|
|
732
|
+
|
|
733
|
+
/* Pointer identity: equal iff both wrap the same lxb_dom_node_t. */
|
|
734
|
+
static VALUE
|
|
735
|
+
mkr_node_equals(VALUE self, VALUE other)
|
|
736
|
+
{
|
|
737
|
+
if (!rb_obj_is_kind_of(other, mkr_cNode)) {
|
|
738
|
+
return Qfalse;
|
|
739
|
+
}
|
|
740
|
+
/* Identity by pointer, kind-agnostic (an HTML node is simply never equal to an
|
|
741
|
+
* XML node) - mkr_node_id never dereferences, so comparing across
|
|
742
|
+
* representations is safe. */
|
|
743
|
+
return mkr_node_id(self) == mkr_node_id(other) ? Qtrue : Qfalse;
|
|
744
|
+
}
|
|
745
|
+
|
|
746
|
+
/* Distance from `n` to the root (a node with no parent). */
|
|
747
|
+
static size_t
|
|
748
|
+
mkr_node_depth(const lxb_dom_node_t *n)
|
|
749
|
+
{
|
|
750
|
+
size_t d = 0;
|
|
751
|
+
for (const lxb_dom_node_t *p = n->parent; p != NULL; p = p->parent) {
|
|
752
|
+
d++;
|
|
753
|
+
}
|
|
754
|
+
return d;
|
|
755
|
+
}
|
|
756
|
+
|
|
757
|
+
/*
|
|
758
|
+
* Node#<=> : document (pre-order) position, so an array of nodes can be sorted.
|
|
759
|
+
* Returns -1 / 0 / 1, or nil when the nodes are not comparable: a non-node,
|
|
760
|
+
* different documents or detached subtrees (no common root), or an attribute
|
|
761
|
+
* node (attributes are not in the first_child/next chain, so their order is not
|
|
762
|
+
* defined here). Included via Comparable, which gives <, >, between?, etc.
|
|
763
|
+
*/
|
|
764
|
+
static VALUE
|
|
765
|
+
mkr_node_spaceship(VALUE self, VALUE other)
|
|
766
|
+
{
|
|
767
|
+
if (!rb_obj_is_kind_of(other, mkr_cNode)
|
|
768
|
+
|| rb_obj_is_kind_of(mkr_node_document(other), mkr_cXmlDocument)) {
|
|
769
|
+
return Qnil; /* not a node, or an XML node - never order-comparable to HTML */
|
|
770
|
+
}
|
|
771
|
+
lxb_dom_node_t *a = mkr_html_node_unwrap(self);
|
|
772
|
+
lxb_dom_node_t *b = mkr_html_node_unwrap(other);
|
|
773
|
+
if (a == b) {
|
|
774
|
+
return INT2FIX(0);
|
|
775
|
+
}
|
|
776
|
+
if (a->type == LXB_DOM_NODE_TYPE_ATTRIBUTE
|
|
777
|
+
|| b->type == LXB_DOM_NODE_TYPE_ATTRIBUTE
|
|
778
|
+
|| a->owner_document != b->owner_document) {
|
|
779
|
+
return Qnil;
|
|
780
|
+
}
|
|
781
|
+
|
|
782
|
+
size_t da = mkr_node_depth(a), db = mkr_node_depth(b);
|
|
783
|
+
lxb_dom_node_t *pa = a, *pb = b;
|
|
784
|
+
|
|
785
|
+
/* Raise the deeper node to the other's depth; if it lands on the other,
|
|
786
|
+
* that other is an ancestor and so comes first in pre-order. */
|
|
787
|
+
if (da > db) {
|
|
788
|
+
for (size_t k = 0; k < da - db; k++) pa = pa->parent;
|
|
789
|
+
if (pa == b) return INT2FIX(1); /* b is an ancestor of a */
|
|
790
|
+
} else if (db > da) {
|
|
791
|
+
for (size_t k = 0; k < db - da; k++) pb = pb->parent;
|
|
792
|
+
if (pb == a) return INT2FIX(-1); /* a is an ancestor of b */
|
|
793
|
+
}
|
|
794
|
+
|
|
795
|
+
/* Climb both until they share a parent (the lowest common ancestor). */
|
|
796
|
+
while (pa->parent != pb->parent) {
|
|
797
|
+
if (pa->parent == NULL || pb->parent == NULL) {
|
|
798
|
+
return Qnil; /* different trees */
|
|
799
|
+
}
|
|
800
|
+
pa = pa->parent;
|
|
801
|
+
pb = pb->parent;
|
|
802
|
+
}
|
|
803
|
+
if (pa->parent == NULL) {
|
|
804
|
+
return Qnil; /* two distinct roots */
|
|
805
|
+
}
|
|
806
|
+
|
|
807
|
+
/* pa and pb are distinct siblings: earlier in the child list comes first. */
|
|
808
|
+
for (lxb_dom_node_t *c = pa->parent->first_child; c != NULL; c = c->next) {
|
|
809
|
+
if (c == pa) return INT2FIX(-1);
|
|
810
|
+
if (c == pb) return INT2FIX(1);
|
|
811
|
+
}
|
|
812
|
+
return Qnil; /* unreachable for a well-formed tree */
|
|
813
|
+
}
|
|
814
|
+
|
|
815
|
+
/* Nokogiri-compatible identity: the underlying lxb_dom_node_t pointer as an
|
|
816
|
+
* Integer. Stable for the node's lifetime and unique among currently-live
|
|
817
|
+
* nodes; a freed-then-reallocated node may reuse an address (same caveat as
|
|
818
|
+
* Nokogiri::XML::Node#pointer_id). a.pointer_id == b.pointer_id iff a.eql?(b). */
|
|
819
|
+
static VALUE
|
|
820
|
+
mkr_node_pointer_id(VALUE self)
|
|
821
|
+
{
|
|
822
|
+
return ULL2NUM((unsigned long long)mkr_node_id(self));
|
|
823
|
+
}
|
|
824
|
+
|
|
825
|
+
/* Stable hash derived from the node pointer, so a == b implies a.hash ==
|
|
826
|
+
* b.hash even across separately-created wrappers. Shares the pointer value
|
|
827
|
+
* with #pointer_id. */
|
|
828
|
+
static VALUE
|
|
829
|
+
mkr_node_hash(VALUE self)
|
|
830
|
+
{
|
|
831
|
+
return mkr_node_pointer_id(self);
|
|
832
|
+
}
|
|
833
|
+
|
|
834
|
+
void
|
|
835
|
+
mkr_init_node(void)
|
|
836
|
+
{
|
|
837
|
+
rb_define_method(mkr_mHtmlNodeMethods, "name", mkr_node_name, 0);
|
|
838
|
+
rb_define_method(mkr_mHtmlNodeMethods, "namespace_uri", mkr_node_namespace_uri, 0);
|
|
839
|
+
rb_define_method(mkr_mHtmlNodeMethods, "prefix", mkr_node_prefix, 0);
|
|
840
|
+
rb_define_method(mkr_mHtmlNodeMethods, "local_name", mkr_node_local_name, 0);
|
|
841
|
+
rb_define_method(mkr_mHtmlNodeMethods, "tag_name", mkr_node_tag_name, 0);
|
|
842
|
+
rb_define_method(mkr_mHtmlNodeMethods, "target", mkr_node_pi_target, 0);
|
|
843
|
+
rb_define_method(mkr_mHtmlNodeMethods, "node_type", mkr_node_get_type, 0);
|
|
844
|
+
rb_define_method(mkr_mHtmlNodeMethods, "content", mkr_node_content, 0);
|
|
845
|
+
rb_define_method(mkr_mHtmlNodeMethods, "text", mkr_node_content, 0);
|
|
846
|
+
rb_define_method(mkr_mHtmlNodeMethods, "inner_text", mkr_node_content, 0);
|
|
847
|
+
|
|
848
|
+
rb_define_method(mkr_mHtmlNodeMethods, "document", mkr_node_get_document, 0);
|
|
849
|
+
rb_define_method(mkr_mHtmlNodeMethods, "parent", mkr_node_parent, 0);
|
|
850
|
+
rb_define_method(mkr_mHtmlNodeMethods, "next", mkr_node_next, 0);
|
|
851
|
+
rb_define_method(mkr_mHtmlNodeMethods, "next_sibling", mkr_node_next, 0);
|
|
852
|
+
rb_define_method(mkr_mHtmlNodeMethods, "previous", mkr_node_previous, 0);
|
|
853
|
+
rb_define_method(mkr_mHtmlNodeMethods, "previous_sibling", mkr_node_previous, 0);
|
|
854
|
+
rb_define_method(mkr_mHtmlNodeMethods, "next_element", mkr_node_next_element, 0);
|
|
855
|
+
rb_define_method(mkr_mHtmlNodeMethods, "previous_element", mkr_node_previous_element, 0);
|
|
856
|
+
|
|
857
|
+
rb_define_method(mkr_mHtmlNodeMethods, "child", mkr_node_child, 0);
|
|
858
|
+
rb_define_method(mkr_mHtmlNodeMethods, "children", mkr_node_children, 0);
|
|
859
|
+
rb_define_method(mkr_mHtmlNodeMethods, "element_children", mkr_node_element_children, 0);
|
|
860
|
+
rb_define_method(mkr_mHtmlNodeMethods, "elements", mkr_node_element_children, 0);
|
|
861
|
+
rb_define_method(mkr_mHtmlNodeMethods, "first_element_child", mkr_node_first_element_child, 0);
|
|
862
|
+
rb_define_method(mkr_mHtmlNodeMethods, "last_element_child", mkr_node_last_element_child, 0);
|
|
863
|
+
rb_define_method(mkr_mHtmlNodeMethods, "ancestors", mkr_node_ancestors, 0);
|
|
864
|
+
|
|
865
|
+
rb_define_method(mkr_mHtmlNodeMethods, "[]", mkr_node_aref, 1);
|
|
866
|
+
rb_define_method(mkr_mHtmlNodeMethods, "key?", mkr_node_has_key, 1);
|
|
867
|
+
rb_define_method(mkr_mHtmlNodeMethods, "keys", mkr_node_keys, 0);
|
|
868
|
+
rb_define_method(mkr_mHtmlNodeMethods, "values", mkr_node_values, 0);
|
|
869
|
+
rb_define_method(mkr_mHtmlNodeMethods, "attribute_nodes", mkr_node_attribute_nodes, 0);
|
|
870
|
+
rb_define_method(mkr_mHtmlNodeMethods, "value", mkr_node_value, 0);
|
|
871
|
+
rb_define_method(mkr_mHtmlNodeMethods, "line", mkr_node_line, 0);
|
|
872
|
+
|
|
873
|
+
rb_define_method(mkr_mHtmlNodeMethods, "==", mkr_node_equals, 1);
|
|
874
|
+
rb_define_method(mkr_mHtmlNodeMethods, "eql?", mkr_node_equals, 1);
|
|
875
|
+
rb_define_method(mkr_mHtmlNodeMethods, "<=>", mkr_node_spaceship, 1);
|
|
876
|
+
rb_define_method(mkr_mHtmlNodeMethods, "hash", mkr_node_hash, 0);
|
|
877
|
+
rb_define_method(mkr_mHtmlNodeMethods, "pointer_id", mkr_node_pointer_id, 0);
|
|
878
|
+
rb_define_method(mkr_mHtmlNodeMethods, "clone_node", mkr_node_clone_node, -1);
|
|
879
|
+
|
|
880
|
+
/* DocumentType identifiers (WHATWG DOM names; external_id is the
|
|
881
|
+
* Nokogiri-compatible alias for public_id). */
|
|
882
|
+
rb_define_method(mkr_cHtmlDocumentType, "public_id", mkr_doctype_public_id, 0);
|
|
883
|
+
rb_define_method(mkr_cHtmlDocumentType, "external_id", mkr_doctype_public_id, 0);
|
|
884
|
+
rb_define_method(mkr_cHtmlDocumentType, "system_id", mkr_doctype_system_id, 0);
|
|
885
|
+
|
|
886
|
+
/* <template> contents (WHATWG DOM HTMLTemplateElement.content). */
|
|
887
|
+
rb_define_method(mkr_cHtmlElement, "content_fragment", mkr_node_content_fragment, 0);
|
|
888
|
+
}
|