makiri 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/valgrind.yml +49 -46
  3. data/CHANGELOG.md +68 -1
  4. data/README.md +14 -0
  5. data/Rakefile +13 -0
  6. data/ext/makiri/bridge/ruby_string.c +80 -54
  7. data/ext/makiri/core/mkr_alloc.h +1 -1
  8. data/ext/makiri/core/mkr_utf8.c +1 -1
  9. data/ext/makiri/core/mkr_utf8.h +1 -1
  10. data/ext/makiri/{lexbor_compat → dom_adapter}/compat.h +4 -4
  11. data/ext/makiri/{lexbor_compat → dom_adapter}/compat_internal.h +1 -1
  12. data/ext/makiri/dom_adapter/cross_import.c +434 -0
  13. data/ext/makiri/dom_adapter/cross_import.h +35 -0
  14. data/ext/makiri/{lexbor_compat → dom_adapter}/text_index.c +1 -1
  15. data/ext/makiri/fuzz/Makefile +1 -1
  16. data/ext/makiri/glue/cross_import.h +30 -0
  17. data/ext/makiri/glue/glue.h +1 -1
  18. data/ext/makiri/glue/ruby_doc.c +11 -3
  19. data/ext/makiri/glue/ruby_html_mutate.c +6 -0
  20. data/ext/makiri/glue/ruby_html_node.c +1 -1
  21. data/ext/makiri/glue/ruby_lexbor_css.c +462 -0
  22. data/ext/makiri/glue/ruby_node.c +14 -0
  23. data/ext/makiri/glue/ruby_xml.c +31 -2
  24. data/ext/makiri/glue/ruby_xml_node.c +87 -2
  25. data/ext/makiri/glue/ruby_xpath.c +16 -1
  26. data/ext/makiri/makiri.c +3 -0
  27. data/ext/makiri/makiri.h +5 -0
  28. data/ext/makiri/xml/mkr_xml.h +5 -0
  29. data/ext/makiri/xml/mkr_xml_chars.c +22 -0
  30. data/ext/makiri/xml/mkr_xml_mutate.c +160 -50
  31. data/ext/makiri/xml/mkr_xml_mutate.h +24 -0
  32. data/ext/makiri/xml/mkr_xml_tree.c +63 -27
  33. data/ext/makiri/xpath/mkr_xpath.c +28 -0
  34. data/ext/makiri/xpath/mkr_xpath.h +5 -1
  35. data/ext/makiri/xpath/mkr_xpath_eval_body.h +11 -1
  36. data/lib/makiri/html/document.rb +11 -12
  37. data/lib/makiri/html/node_methods.rb +0 -1
  38. data/lib/makiri/node_set.rb +14 -9
  39. data/lib/makiri/processing_instruction.rb +8 -0
  40. data/lib/makiri/version.rb +1 -1
  41. data/lib/makiri/xml/builder.rb +29 -21
  42. data/lib/makiri/xpath_context.rb +12 -4
  43. data/script/check_c_safety.rb +1 -1
  44. data/script/check_c_safety_allowlist.yml +8 -5
  45. data/script/leaks_harness.rb +7 -0
  46. data/suppressions/ruby.supp +140 -0
  47. metadata +13 -8
  48. /data/ext/makiri/{lexbor_compat → dom_adapter}/dom_index.c +0 -0
  49. /data/ext/makiri/{lexbor_compat → dom_adapter}/post_parse.c +0 -0
  50. /data/ext/makiri/{lexbor_compat → dom_adapter}/source_loc.c +0 -0
  51. /data/ext/makiri/{lexbor_compat → dom_adapter}/utf8_input.c +0 -0
@@ -0,0 +1,434 @@
1
+ /* cross_import.c - cross-kind subtree translation for Document#import_node.
2
+ *
3
+ * Makiri keeps HTML nodes (Lexbor lxb_dom_node_t) and XML nodes (mkr_xml_node_t)
4
+ * as distinct C representations that cannot share a tree. import_node bridges
5
+ * them: it deep/shallow-copies a subtree from one representation into the other,
6
+ * owned by the target document, returning a DETACHED copy (the caller links it).
7
+ *
8
+ * Ruby-FREE and lives in dom_adapter (not glue): it reads/writes BOTH lexbor and
9
+ * the XML arena, exactly the bridge this layer is for. The glue import_node entries
10
+ * (ruby_doc.c / ruby_xml_node.c) do the Ruby-VALUE kind check (mkr_node_kind), call
11
+ * these, and wrap/raise. Both directions:
12
+ * - build the destination subtree DETACHED, then return it (never linking into a
13
+ * live tree mid-build), so a failure abandons a self-contained partial subtree
14
+ * in the destination arena (HTML mraw or the XML node arena), freed with the
15
+ * document - the same fail-closed model the XML deep-copy uses;
16
+ * - walk the source with an explicit heap stack (no C recursion -> no stack DoS),
17
+ * freed on every path;
18
+ * - report failure via mkr_xml_mut_status_t; the Ruby entry maps it with
19
+ * mkr_xml_mut_check.
20
+ *
21
+ * Namespaces (phase 4): preserved across the two representations.
22
+ * - HTML->XML: an mkr node's namespace is resolved from xmlns declarations at
23
+ * insertion time (resolve_node_ns), so a directly-set ns_uri would be
24
+ * overwritten when the imported subtree is later linked. We therefore SYNTHESIZE
25
+ * xmlns declarations: each element declares xmlns="URI" when its namespace
26
+ * differs from the one inherited from its translated parent (so unprefixed
27
+ * elements resolve correctly), and a foreign-prefixed attribute (e.g. xlink:*)
28
+ * gets an xmlns:PREFIX declaration on its element. The predefined xml: prefix
29
+ * needs none.
30
+ * - XML->HTML: Lexbor stores a namespace as an id, so the element's node.ns is set
31
+ * from the URI (interning any URI via lxb_ns_append) and a namespaced attribute
32
+ * is built with lxb_dom_attr_set_name_ns.
33
+ */
34
+ #include "cross_import.h"
35
+ #include "../core/mkr_core.h" /* mkr_grow_reserve, mkr_reallocarray, MKR_OK */
36
+
37
+ #include <lexbor/ns/ns.h> /* lxb_ns_by_id, LXB_NS_* */
38
+ #include <stdlib.h> /* free (the xmlns:PREFIX scratch; alloc via mkr_reallocarray) */
39
+ #include <string.h> /* memcmp / memcpy / memchr */
40
+
41
+ /* Exported by Lexbor but omitted from its public headers: names an attribute from
42
+ * (namespace URI, qualified name), splitting prefix/local and interning the ns.
43
+ * (Same forward declaration ruby_html_mutate.c uses.) */
44
+ extern lxb_status_t
45
+ lxb_dom_attr_set_name_ns(lxb_dom_attr_t *attr, const lxb_char_t *link,
46
+ size_t link_length, const lxb_char_t *name,
47
+ size_t name_length, bool to_lowercase);
48
+
49
+ /* Also Lexbor-internal: intern a namespace URI in the document's ns table,
50
+ * returning the entry (whose ns_id we set on a translated element's node.ns). */
51
+ extern const lxb_ns_data_t *
52
+ lxb_ns_append(lexbor_hash_t *hash, const lxb_char_t *link, size_t length);
53
+
54
+ /* A DOM name/value slice must fit uint32 (the mkr arena's per-slice cap and the
55
+ * factory signatures). A >4 GiB slice is rejected fail-closed rather than wrapped. */
56
+ #define MKR_FITS_U32(n) ((n) <= UINT32_MAX)
57
+
58
+ /* Intern +uri+ in the destination HTML document's namespace table and return its
59
+ * Lexbor id, so an element's namespace survives translation for ANY URI (not just
60
+ * the few Lexbor knows by default) - the same interning lxb_dom_attr_set_name_ns
61
+ * does for attributes. A null/empty URI (or an intern OOM) is the null namespace. */
62
+ static lxb_ns_id_t
63
+ x2h_ns_id(lxb_dom_document_t *hdoc, const char *uri, uint32_t len)
64
+ {
65
+ if (uri == NULL || len == 0) return LXB_NS__UNDEF;
66
+ const lxb_ns_data_t *d = lxb_ns_append(hdoc->ns, (const lxb_char_t *)uri, len);
67
+ return (d != NULL) ? d->ns_id : LXB_NS__UNDEF; /* fail-soft on OOM */
68
+ }
69
+
70
+ /* The URI string for an HTML node's namespace id (borrowed from the source
71
+ * document's interned ns table - stable for that document's lifetime), or NULL/0
72
+ * for the null namespace. */
73
+ static const char *
74
+ mkr_html_ns_uri(lxb_dom_node_t *n, uint32_t *out_len)
75
+ {
76
+ *out_len = 0;
77
+ if (n->ns == LXB_NS__UNDEF) return NULL;
78
+ size_t len = 0;
79
+ const lxb_char_t *u = lxb_ns_by_id(n->owner_document->ns, n->ns, &len);
80
+ if (u == NULL || !MKR_FITS_U32(len)) return NULL;
81
+ *out_len = (uint32_t)len;
82
+ return (const char *)u;
83
+ }
84
+
85
/* Byte-wise equality of two length-delimited URI slices.
 *
 * Returns 1 when both slices have the same length and the same bytes, else 0.
 * Two zero-length slices are equal without either pointer being read, so a NULL
 * pointer is acceptable for an empty slice. */
static int
mkr_uri_eq(const char *a, uint32_t al, const char *b, uint32_t bl)
{
    if (al != bl) {
        return 0;
    }
    if (al == 0) {
        return 1;   /* both empty: never hand a possibly-NULL pointer to memcmp */
    }
    return memcmp(a, b, al) == 0;
}
90
+
91
+ /* ----- explicit (src, dst) work stack shared by both directions --------------
92
+ * +def+/+deflen+ carry, for HTML->XML, the default-namespace URI in scope for the
93
+ * destination node's children (so a child only redeclares xmlns when it differs);
94
+ * unused (NULL/0) for XML->HTML. */
95
+ typedef struct { void *s; void *d; const char *def; uint32_t deflen; } mkr_xframe_t;
96
+ typedef struct { mkr_xframe_t *v; size_t n, cap; } mkr_xstack_t;
97
+
98
+ static int
99
+ mkr_xstack_push(mkr_xstack_t *st, void *s, void *d, const char *def, uint32_t deflen)
100
+ {
101
+ if (mkr_grow_reserve((void **)&st->v, &st->cap, st->n + 1, sizeof(*st->v)) != MKR_OK) {
102
+ return -1;
103
+ }
104
+ st->v[st->n].s = s;
105
+ st->v[st->n].d = d;
106
+ st->v[st->n].def = def;
107
+ st->v[st->n].deflen = deflen;
108
+ st->n++;
109
+ return 0;
110
+ }
111
+
112
+ /* ===================== HTML (lxb) -> XML (mkr) =============================== */
113
+
114
+ /* Declare xmlns (prefix NULL/plen 0) or xmlns:PREFIX = uri on the detached mkr
115
+ * element +el+, as an ordinary attribute, so the inserted subtree's prefix-based
116
+ * namespace resolution reproduces +uri+. */
117
+ static mkr_xml_mut_status_t
118
+ h2x_declare_ns(mkr_xml_doc_t *xdoc, mkr_xml_node_t *el,
119
+ const char *prefix, uint32_t plen, const char *uri, uint32_t ulen)
120
+ {
121
+ if (plen == 0) {
122
+ return mkr_xml_set_attribute(xdoc, el, "xmlns", 5, uri != NULL ? uri : "", ulen, NULL);
123
+ }
124
+ size_t nlen = (size_t)6 + plen; /* "xmlns:" + prefix */
125
+ if (!MKR_FITS_U32(nlen)) return MKR_XML_MUT_OOM;
126
+ char *nm = mkr_reallocarray(NULL, nlen, 1); /* overflow-checked safe alloc */
127
+ if (nm == NULL) return MKR_XML_MUT_OOM;
128
+ memcpy(nm, "xmlns:", 6);
129
+ memcpy(nm + 6, prefix, plen);
130
+ mkr_xml_mut_status_t st = mkr_xml_set_attribute(xdoc, el, nm, (uint32_t)nlen,
131
+ uri != NULL ? uri : "", ulen, NULL);
132
+ free(nm);
133
+ return st;
134
+ }
135
+
136
+ /* Copy +s+'s attributes onto the translated mkr element +el+, declaring an
137
+ * xmlns:PREFIX for each foreign-prefixed attribute so resolution at link time
138
+ * succeeds (the predefined xml: prefix needs none). */
139
+ static mkr_xml_mut_status_t
140
+ h2x_copy_attrs(mkr_xml_doc_t *xdoc, lxb_dom_node_t *s, mkr_xml_node_t *el)
141
+ {
142
+ for (lxb_dom_attr_t *a = lxb_dom_element_first_attribute(lxb_dom_interface_element(s));
143
+ a != NULL; a = lxb_dom_element_next_attribute(a)) {
144
+ size_t anl, avl;
145
+ const lxb_char_t *an = lxb_dom_attr_qualified_name(a, &anl);
146
+ const lxb_char_t *av = lxb_dom_attr_value(a, &avl);
147
+ if (!MKR_FITS_U32(anl) || !MKR_FITS_U32(avl)) return MKR_XML_MUT_OOM;
148
+
149
+ lxb_ns_id_t ans = a->node.ns;
150
+ if (ans != LXB_NS__UNDEF && ans != LXB_NS_HTML && ans != LXB_NS_XML) {
151
+ const lxb_char_t *colon = memchr(an, ':', anl);
152
+ if (colon != NULL) {
153
+ uint32_t ulen;
154
+ const char *uri = mkr_html_ns_uri(&a->node, &ulen);
155
+ if (uri != NULL) {
156
+ mkr_xml_mut_status_t st = h2x_declare_ns(
157
+ xdoc, el, (const char *)an, (uint32_t)(colon - an), uri, ulen);
158
+ if (st != MKR_XML_MUT_OK) return st;
159
+ }
160
+ }
161
+ }
162
+ mkr_xml_mut_status_t st = mkr_xml_set_attribute(
163
+ xdoc, el, (const char *)an, (uint32_t)anl,
164
+ av != NULL ? (const char *)av : "", (uint32_t)avl, NULL);
165
+ if (st != MKR_XML_MUT_OK) return st;
166
+ }
167
+ return MKR_XML_MUT_OK;
168
+ }
169
+
170
+ /* Translate ONE lxb node into a fresh mkr node (own fields + attributes, NOT its
171
+ * children). +pdef+/+pdef_len+ is the default namespace inherited from the
172
+ * translated parent; *cdef / *cdef_len receive the default namespace in scope for
173
+ * THIS node's children. *out is the new node, or NULL to SKIP an unsupported type;
174
+ * an error status fails the whole import. */
175
+ static mkr_xml_mut_status_t
176
+ h2x_make(mkr_xml_doc_t *xdoc, lxb_dom_node_t *s, const char *pdef, uint32_t pdef_len,
177
+ const char **cdef, uint32_t *cdef_len, mkr_xml_node_t **out)
178
+ {
179
+ *out = NULL;
180
+ *cdef = pdef; /* default: a non-element does not change the scope */
181
+ *cdef_len = pdef_len;
182
+
183
+ switch (s->type) {
184
+ case LXB_DOM_NODE_TYPE_ELEMENT: {
185
+ size_t nl;
186
+ const lxb_char_t *nm = lxb_dom_element_qualified_name(lxb_dom_interface_element(s), &nl);
187
+ if (!MKR_FITS_U32(nl)) return MKR_XML_MUT_OOM;
188
+ mkr_xml_node_t *el = NULL;
189
+ mkr_xml_mut_status_t st = mkr_xml_new_element(xdoc, (const char *)nm, (uint32_t)nl, &el);
190
+ if (st != MKR_XML_MUT_OK) return st;
191
+
192
+ /* Declare the element's default namespace iff it differs from the inherited
193
+ * one, so this (unprefixed, like all HTML elements) element resolves to it.
194
+ * An element with no namespace under an inherited default undeclares (xmlns=""). */
195
+ uint32_t eul;
196
+ const char *euri = mkr_html_ns_uri(s, &eul);
197
+ if (!mkr_uri_eq(euri, eul, pdef, pdef_len)) {
198
+ st = h2x_declare_ns(xdoc, el, NULL, 0, euri, eul);
199
+ if (st != MKR_XML_MUT_OK) return st;
200
+ *cdef = (euri != NULL) ? euri : "";
201
+ *cdef_len = eul;
202
+ }
203
+
204
+ st = h2x_copy_attrs(xdoc, s, el);
205
+ if (st != MKR_XML_MUT_OK) return st;
206
+ *out = el;
207
+ return MKR_XML_MUT_OK;
208
+ }
209
+ case LXB_DOM_NODE_TYPE_TEXT:
210
+ case LXB_DOM_NODE_TYPE_CDATA_SECTION:
211
+ case LXB_DOM_NODE_TYPE_COMMENT: {
212
+ const lexbor_str_t *d = &lxb_dom_interface_character_data(s)->data;
213
+ if (!MKR_FITS_U32(d->length)) return MKR_XML_MUT_OOM;
214
+ uint8_t t = (s->type == LXB_DOM_NODE_TYPE_TEXT) ? MKR_XML_NODE_TYPE_TEXT
215
+ : (s->type == LXB_DOM_NODE_TYPE_CDATA_SECTION) ? MKR_XML_NODE_TYPE_CDATA_SECTION
216
+ : MKR_XML_NODE_TYPE_COMMENT;
217
+ return mkr_xml_new_chardata(xdoc, t, d->data != NULL ? (const char *)d->data : "",
218
+ (uint32_t)d->length, out);
219
+ }
220
+ case LXB_DOM_NODE_TYPE_PROCESSING_INSTRUCTION: {
221
+ size_t tl;
222
+ const lxb_char_t *tg = lxb_dom_processing_instruction_target(
223
+ lxb_dom_interface_processing_instruction(s), &tl);
224
+ const lexbor_str_t *d = &lxb_dom_interface_character_data(s)->data;
225
+ if (!MKR_FITS_U32(tl) || !MKR_FITS_U32(d->length)) return MKR_XML_MUT_OOM;
226
+ return mkr_xml_new_pi(xdoc, (const char *)tg, (uint32_t)tl,
227
+ d->data != NULL ? (const char *)d->data : "", (uint32_t)d->length, out);
228
+ }
229
+ case LXB_DOM_NODE_TYPE_DOCUMENT_FRAGMENT: {
230
+ mkr_xml_node_t *f = mkr_xml_arena_node(xdoc, MKR_XML_NODE_TYPE_DOCUMENT_FRAGMENT);
231
+ if (f == NULL) return MKR_XML_MUT_OOM;
232
+ *out = f;
233
+ return MKR_XML_MUT_OK;
234
+ }
235
+ default:
236
+ return MKR_XML_MUT_OK; /* unsupported descendant type: skip (*out stays NULL) */
237
+ }
238
+ }
239
+
240
+ /* The children to translate under +s+. An HTML <template> keeps its content in a
241
+ * separate document fragment, NOT the normal child chain, so a plain first_child
242
+ * walk would silently drop template contents. We descend into the content fragment
243
+ * instead: mkr (XML) has no template-content concept, so the contents become
244
+ * ordinary children of the translated element (lossless, the natural XML shape). */
245
+ static lxb_dom_node_t *
246
+ h2x_children_of(lxb_dom_node_t *s)
247
+ {
248
+ if (s->type == LXB_DOM_NODE_TYPE_ELEMENT
249
+ && s->local_name == LXB_TAG_TEMPLATE && s->ns == LXB_NS_HTML) {
250
+ lxb_dom_document_fragment_t *content = lxb_html_interface_template(s)->content;
251
+ return content != NULL ? lxb_dom_interface_node(content)->first_child : NULL;
252
+ }
253
+ return s->first_child;
254
+ }
255
+
256
+ mkr_xml_mut_status_t
257
+ mkr_cross_html_to_xml(mkr_xml_doc_t *xdoc, lxb_dom_node_t *src, int deep, mkr_xml_node_t **out)
258
+ {
259
+ *out = NULL;
260
+ mkr_xml_node_t *root = NULL;
261
+ const char *rdef = NULL; uint32_t rdef_len = 0;
262
+ mkr_xml_mut_status_t st = h2x_make(xdoc, src, NULL, 0, &rdef, &rdef_len, &root);
263
+ if (st != MKR_XML_MUT_OK) return st;
264
+ if (root == NULL) return MKR_XML_MUT_TYPE; /* root node type has no XML counterpart */
265
+
266
+ if (deep) {
267
+ mkr_xstack_t stk = { NULL, 0, 0 };
268
+ if (mkr_xstack_push(&stk, src, root, rdef, rdef_len) != 0) { free(stk.v); return MKR_XML_MUT_OOM; }
269
+ while (stk.n > 0) {
270
+ mkr_xframe_t f = stk.v[--stk.n];
271
+ lxb_dom_node_t *s = (lxb_dom_node_t *)f.s;
272
+ mkr_xml_node_t *d = (mkr_xml_node_t *)f.d;
273
+ for (lxb_dom_node_t *c = h2x_children_of(s); c != NULL; c = c->next) {
274
+ mkr_xml_node_t *dc = NULL;
275
+ const char *cdef = NULL; uint32_t cdef_len = 0;
276
+ st = h2x_make(xdoc, c, f.def, f.deflen, &cdef, &cdef_len, &dc);
277
+ if (st != MKR_XML_MUT_OK) goto done;
278
+ if (dc == NULL) continue; /* skipped node type */
279
+ st = mkr_xml_insert_child(xdoc, d, dc); /* detached parent: ns deferred */
280
+ if (st != MKR_XML_MUT_OK) goto done;
281
+ if (h2x_children_of(c) != NULL
282
+ && mkr_xstack_push(&stk, c, dc, cdef, cdef_len) != 0) {
283
+ st = MKR_XML_MUT_OOM; goto done;
284
+ }
285
+ }
286
+ }
287
+ done:
288
+ free(stk.v);
289
+ if (st != MKR_XML_MUT_OK) return st; /* partial subtree abandoned in the arena */
290
+ }
291
+ *out = root;
292
+ return MKR_XML_MUT_OK;
293
+ }
294
+
295
+ /* ===================== XML (mkr) -> HTML (lxb) =============================== */
296
+
297
+ /* Copy +s+'s attributes onto the translated lxb element +el+, preserving each
298
+ * attribute's namespace (a null-namespace attribute via set_attribute, a
299
+ * namespaced one via an explicit lxb_dom_attr_set_name_ns). */
300
+ static mkr_xml_mut_status_t
301
+ x2h_copy_attrs(lxb_dom_document_t *hdoc, const mkr_xml_node_t *s, lxb_dom_element_t *el)
302
+ {
303
+ for (const mkr_xml_node_t *a = s->attrs; a != NULL; a = a->next) {
304
+ const char *val = a->value != NULL ? a->value : "";
305
+ if (a->ns_uri_len == 0) {
306
+ if (lxb_dom_element_set_attribute(el, (const lxb_char_t *)a->qname, a->qname_len,
307
+ (const lxb_char_t *)val, a->value_len) == NULL) {
308
+ return MKR_XML_MUT_OOM;
309
+ }
310
+ continue;
311
+ }
312
+ lxb_dom_attr_t *at = lxb_dom_attr_interface_create(hdoc);
313
+ if (at == NULL) return MKR_XML_MUT_OOM;
314
+ if (lxb_dom_attr_set_name_ns(at, (const lxb_char_t *)a->ns_uri, a->ns_uri_len,
315
+ (const lxb_char_t *)a->qname, a->qname_len, false) != LXB_STATUS_OK
316
+ || lxb_dom_attr_set_value(at, (const lxb_char_t *)val, a->value_len) != LXB_STATUS_OK) {
317
+ return MKR_XML_MUT_OOM; /* the un-appended attr is abandoned in mraw */
318
+ }
319
+ lxb_dom_element_attr_append(el, at);
320
+ }
321
+ return MKR_XML_MUT_OK;
322
+ }
323
+
324
+ /* Translate ONE mkr node into a fresh, detached lxb node (own fields + attributes,
325
+ * NOT children). *out is the new node, or NULL to SKIP an unsupported type. An XML
326
+ * CDATA section has no HTML counterpart, so it fails closed (MKR_XML_MUT_TYPE). */
327
+ static mkr_xml_mut_status_t
328
+ x2h_make(lxb_dom_document_t *hdoc, const mkr_xml_node_t *s, lxb_dom_node_t **out)
329
+ {
330
+ *out = NULL;
331
+ switch (s->type) {
332
+ case MKR_XML_NODE_TYPE_ELEMENT: {
333
+ lxb_dom_element_t *el = lxb_dom_document_create_element(
334
+ hdoc, (const lxb_char_t *)s->qname, s->qname_len, NULL);
335
+ if (el == NULL) return MKR_XML_MUT_OOM;
336
+ /* Preserve the namespace as a Lexbor id (any URI, interned; else null). */
337
+ lxb_dom_interface_node(el)->ns = x2h_ns_id(hdoc, s->ns_uri, s->ns_uri_len);
338
+ mkr_xml_mut_status_t st = x2h_copy_attrs(hdoc, s, el);
339
+ if (st != MKR_XML_MUT_OK) return st;
340
+ *out = lxb_dom_interface_node(el);
341
+ return MKR_XML_MUT_OK;
342
+ }
343
+ case MKR_XML_NODE_TYPE_TEXT: {
344
+ lxb_dom_text_t *t = lxb_dom_document_create_text_node(
345
+ hdoc, (const lxb_char_t *)(s->value != NULL ? s->value : ""), s->value_len);
346
+ if (t == NULL) return MKR_XML_MUT_OOM;
347
+ *out = lxb_dom_interface_node(t);
348
+ return MKR_XML_MUT_OK;
349
+ }
350
+ case MKR_XML_NODE_TYPE_COMMENT: {
351
+ lxb_dom_comment_t *c = lxb_dom_document_create_comment(
352
+ hdoc, (const lxb_char_t *)(s->value != NULL ? s->value : ""), s->value_len);
353
+ if (c == NULL) return MKR_XML_MUT_OOM;
354
+ *out = lxb_dom_interface_node(c);
355
+ return MKR_XML_MUT_OK;
356
+ }
357
+ case MKR_XML_NODE_TYPE_PI: {
358
+ /* The PI target is the node's name (local == qname for a PI); data is value. */
359
+ lxb_dom_processing_instruction_t *pi = lxb_dom_document_create_processing_instruction(
360
+ hdoc, (const lxb_char_t *)s->local, s->local_len,
361
+ (const lxb_char_t *)(s->value != NULL ? s->value : ""), s->value_len);
362
+ if (pi == NULL) return MKR_XML_MUT_OOM;
363
+ *out = lxb_dom_interface_node(pi);
364
+ return MKR_XML_MUT_OK;
365
+ }
366
+ case MKR_XML_NODE_TYPE_CDATA_SECTION:
367
+ return MKR_XML_MUT_TYPE; /* HTML has no CDATA section: fail closed */
368
+ case MKR_XML_NODE_TYPE_DOCUMENT_FRAGMENT: {
369
+ lxb_dom_document_fragment_t *f = lxb_dom_document_create_document_fragment(hdoc);
370
+ if (f == NULL) return MKR_XML_MUT_OOM;
371
+ *out = lxb_dom_interface_node(f);
372
+ return MKR_XML_MUT_OK;
373
+ }
374
+ default:
375
+ return MKR_XML_MUT_OK; /* unsupported descendant type: skip (*out stays NULL) */
376
+ }
377
+ }
378
+
379
+ /* Where a translated element's CHILDREN attach. An HTML <template> holds its
380
+ * content in a separate document fragment (HTMLTemplateElement.content), not the
381
+ * normal child chain, so children go there - matching a parsed template and the
382
+ * HTML->HTML import_node fixup (mkr_fixup_template_content). Other elements link
383
+ * children directly. */
384
+ static lxb_dom_node_t *
385
+ x2h_link_target(lxb_dom_node_t *el)
386
+ {
387
+ if (el->type == LXB_DOM_NODE_TYPE_ELEMENT
388
+ && el->local_name == LXB_TAG_TEMPLATE && el->ns == LXB_NS_HTML) {
389
+ lxb_dom_document_fragment_t *content = lxb_html_interface_template(el)->content;
390
+ if (content != NULL) return lxb_dom_interface_node(content);
391
+ }
392
+ return el;
393
+ }
394
+
395
+ mkr_xml_mut_status_t
396
+ mkr_cross_xml_to_html(lxb_dom_document_t *hdoc, const mkr_xml_node_t *src, int deep,
397
+ lxb_dom_node_t **out)
398
+ {
399
+ *out = NULL;
400
+ lxb_dom_node_t *root = NULL;
401
+ mkr_xml_mut_status_t st = x2h_make(hdoc, src, &root);
402
+ if (st != MKR_XML_MUT_OK) return st;
403
+ if (root == NULL) return MKR_XML_MUT_TYPE; /* root node type has no HTML counterpart */
404
+
405
+ if (deep) {
406
+ mkr_xstack_t stk = { NULL, 0, 0 };
407
+ /* The frame's d is the link target for the source node's children (a
408
+ * template element's content fragment, else the element itself). */
409
+ if (mkr_xstack_push(&stk, (void *)src, x2h_link_target(root), NULL, 0) != 0) {
410
+ free(stk.v); return MKR_XML_MUT_OOM;
411
+ }
412
+ while (stk.n > 0) {
413
+ mkr_xframe_t f = stk.v[--stk.n];
414
+ const mkr_xml_node_t *s = (const mkr_xml_node_t *)f.s;
415
+ lxb_dom_node_t *d = (lxb_dom_node_t *)f.d;
416
+ for (const mkr_xml_node_t *c = s->first_child; c != NULL; c = c->next) {
417
+ lxb_dom_node_t *dc = NULL;
418
+ st = x2h_make(hdoc, c, &dc);
419
+ if (st != MKR_XML_MUT_OK) goto done;
420
+ if (dc == NULL) continue; /* skipped node type */
421
+ lxb_dom_node_insert_child(d, dc);
422
+ if (c->first_child != NULL
423
+ && mkr_xstack_push(&stk, (void *)c, x2h_link_target(dc), NULL, 0) != 0) {
424
+ st = MKR_XML_MUT_OOM; goto done;
425
+ }
426
+ }
427
+ }
428
+ done:
429
+ free(stk.v);
430
+ if (st != MKR_XML_MUT_OK) return st; /* partial subtree abandoned in mraw */
431
+ }
432
+ *out = root;
433
+ return MKR_XML_MUT_OK;
434
+ }
@@ -0,0 +1,35 @@
1
+ #ifndef MAKIRI_DOM_ADAPTER_CROSS_IMPORT_H
2
+ #define MAKIRI_DOM_ADAPTER_CROSS_IMPORT_H
3
+
4
+ /* Cross-kind subtree translation for Document#import_node. Ruby-FREE: it reads and
5
+ * writes BOTH the HTML (Lexbor lxb_dom_node_t) and XML (mkr_xml_node_t)
6
+ * representations, so it lives in dom_adapter - the layer that already bridges
7
+ * Lexbor and the XML arena - rather than in the Ruby glue. The glue import_node
8
+ * entries call these after a kind check and wrap the result. */
9
+
10
+ #include <lexbor/html/html.h> /* lxb_dom_* + the <template> content interface */
11
+ #include <lexbor/dom/dom.h>
12
+ #include "../xml/mkr_xml_node.h" /* mkr_xml_node_t, mkr_xml_doc_t */
13
+ #include "../xml/mkr_xml_mutate.h" /* mkr_xml_mut_status_t + node factories */
14
+
15
+ #ifdef __cplusplus
16
+ extern "C" {
17
+ #endif
18
+
19
+ /* Build a DETACHED deep (or shallow, deep == 0) copy of +src+ in the OTHER
20
+ * representation, owned by the destination document, returned in *out (set only on
21
+ * MKR_XML_MUT_OK). Iterative (no C recursion -> no stack DoS) and fail-closed: a
22
+ * failure abandons a self-contained partial subtree in the destination arena (the
23
+ * XML node arena or Lexbor's mraw), freed with the document. Namespaces are
24
+ * preserved across the translation; an XML CDATA section has no HTML counterpart,
25
+ * so mkr_cross_xml_to_html fails closed (MKR_XML_MUT_TYPE) when it meets one. */
26
+ mkr_xml_mut_status_t mkr_cross_html_to_xml(mkr_xml_doc_t *xdoc, lxb_dom_node_t *src,
27
+ int deep, mkr_xml_node_t **out);
28
+ mkr_xml_mut_status_t mkr_cross_xml_to_html(lxb_dom_document_t *hdoc, const mkr_xml_node_t *src,
29
+ int deep, lxb_dom_node_t **out);
30
+
31
+ #ifdef __cplusplus
32
+ }
33
+ #endif
34
+
35
+ #endif /* MAKIRI_DOM_ADAPTER_CROSS_IMPORT_H */
@@ -7,7 +7,7 @@
7
7
  #include <string.h>
8
8
 
9
9
  /*
10
- * Per-document text-extraction index (lexbor_compat/text_index.c).
10
+ * Per-document text-extraction index (dom_adapter/text_index.c).
11
11
  *
12
12
  * Descendant-text aggregation (Node#text, XPath string-value) walks every node
13
13
  * of a subtree chasing pointers through Lexbor's 96-byte nodes - at scale this
@@ -30,7 +30,7 @@ CFLAGS = -O2 -g -Wall -Wextra \
30
30
  -I$(EXT_DIR)/core \
31
31
  -I$(EXT_DIR)/xml \
32
32
  -I$(EXT_DIR)/xpath \
33
- -I$(EXT_DIR)/lexbor_compat
33
+ -I$(EXT_DIR)/dom_adapter
34
34
 
35
35
  # Linker flags: libFuzzer driver + sanitizers.
36
36
  LDFLAGS = -fsanitize=fuzzer,$(SANITIZE)
@@ -0,0 +1,30 @@
1
+ #ifndef MAKIRI_CROSS_IMPORT_H
2
+ #define MAKIRI_CROSS_IMPORT_H
3
+
4
+ /* Glue-side helpers for cross-kind Document#import_node. The Ruby-FREE subtree
5
+ * translators live in dom_adapter/cross_import.c (they read/write both Lexbor
6
+ * and the XML arena); this header adds the Ruby-boundary pieces the import_node
7
+ * entries (ruby_doc.c / ruby_xml_node.c) need on top of them. */
8
+
9
+ #include "../makiri.h"
10
+ #include "../dom_adapter/cross_import.h" /* mkr_cross_html_to_xml / _xml_to_html */
11
+
12
+ #ifdef __cplusplus
13
+ extern "C" {
14
+ #endif
15
+
16
+ /* Which representation a wrapped Ruby node is, by its TypedData type (NOT by Ruby
17
+ * class). A Document VALUE, a NodeSet, or any non-node is MKR_NODE_KIND_OTHER.
18
+ * Defined in ruby_node.c. */
19
+ typedef enum { MKR_NODE_KIND_OTHER = 0, MKR_NODE_KIND_HTML, MKR_NODE_KIND_XML } mkr_node_kind_t;
20
+ mkr_node_kind_t mkr_node_kind(VALUE v);
21
+
22
+ /* Raise a Ruby exception for a non-OK mutation status (no-op on OK). Defined in
23
+ * ruby_xml_node.c; shared by the XML mutators and the cross-import entries. */
24
+ void mkr_xml_mut_check(mkr_xml_mut_status_t st);
25
+
26
+ #ifdef __cplusplus
27
+ }
28
+ #endif
29
+
30
+ #endif /* MAKIRI_CROSS_IMPORT_H */
@@ -2,7 +2,7 @@
2
2
  #define MAKIRI_GLUE_H
3
3
 
4
4
  #include "../makiri.h"
5
- #include "../lexbor_compat/compat.h"
5
+ #include "../dom_adapter/compat.h"
6
6
 
7
7
  #ifdef __cplusplus
8
8
  extern "C" {
@@ -1,5 +1,6 @@
1
1
  #include "glue.h"
2
- #include "../lexbor_compat/compat_internal.h" /* mkr_dom_preorder_next */
2
+ #include "cross_import.h" /* cross-kind import_node (XML node -> this HTML doc) */
3
+ #include "../dom_adapter/compat_internal.h" /* mkr_dom_preorder_next */
3
4
  #include "../core/mkr_core.h"
4
5
  #include "../xml/mkr_xml.h" /* mkr_xml_doc_memsize for an XML-backed document */
5
6
 
@@ -348,10 +349,17 @@ mkr_doc_import_node(int argc, VALUE *argv, VALUE self)
348
349
  VALUE node_v, deep_v;
349
350
  rb_scan_args(argc, argv, "11", &node_v, &deep_v);
350
351
  bool deep = RTEST(deep_v);
351
-
352
- lxb_dom_node_t *src = mkr_html_node_unwrap(node_v); /* reject an XML node before lxb use */
353
352
  lxb_dom_document_t *doc = mkr_html_doc_unwrap(self);
354
353
 
354
+ /* An XML node is TRANSLATED across representations (mkr -> lxb) by
355
+ * dom_adapter/cross_import.c into a detached lxb subtree owned by this document. */
356
+ if (mkr_node_kind(node_v) == MKR_NODE_KIND_XML) {
357
+ lxb_dom_node_t *imp = NULL;
358
+ mkr_xml_mut_check(mkr_cross_xml_to_html(doc, mkr_xml_node_unwrap(node_v), deep, &imp));
359
+ return mkr_wrap_html_node(imp, self);
360
+ }
361
+
362
+ lxb_dom_node_t *src = mkr_html_node_unwrap(node_v); /* HTML node (raises on a non-node) */
355
363
  lxb_dom_node_t *imp = lxb_dom_document_import_node(doc, src, deep);
356
364
  if (imp == NULL) {
357
365
  rb_raise(mkr_eError, "failed to import node");
@@ -426,6 +426,12 @@ mkr_node_set_name(VALUE self, VALUE rb_name)
426
426
  el->qualified_name = fresh->qualified_name;
427
427
 
428
428
  lxb_dom_node_destroy(lxb_dom_interface_node(fresh));
429
+ /* The element's tag id (local_name) is the key the element-by-tag index
430
+ * buckets on and the //tag fast path serves from; renaming changes it, so
431
+ * the persisted index would otherwise miss the element under its new name
432
+ * (a truncated, wrong //newtag result). Drop the indexes like every other
433
+ * mutator. */
434
+ mkr_invalidate_index(self);
429
435
  return rb_name;
430
436
  }
431
437
 
@@ -408,7 +408,7 @@ mkr_node_content(VALUE self)
408
408
 
409
409
  /* Fast path for elements / fragments (the common case, incl. document text).
410
410
  *
411
- * Preferred: the per-document text index (lexbor_compat/text_index.c) maps
411
+ * Preferred: the per-document text index (dom_adapter/text_index.c) maps
412
412
  * this node to the contiguous, document-order run of its descendants' text
413
413
  * slices, so we serve a single pre-sized memcpy run with no per-extraction
414
414
  * tree walk - the walk is otherwise the dominant, cache-bound cost. Built