makiri 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/valgrind.yml +49 -46
- data/CHANGELOG.md +68 -1
- data/README.md +14 -0
- data/Rakefile +13 -0
- data/ext/makiri/bridge/ruby_string.c +80 -54
- data/ext/makiri/core/mkr_alloc.h +1 -1
- data/ext/makiri/core/mkr_utf8.c +1 -1
- data/ext/makiri/core/mkr_utf8.h +1 -1
- data/ext/makiri/{lexbor_compat → dom_adapter}/compat.h +4 -4
- data/ext/makiri/{lexbor_compat → dom_adapter}/compat_internal.h +1 -1
- data/ext/makiri/dom_adapter/cross_import.c +434 -0
- data/ext/makiri/dom_adapter/cross_import.h +35 -0
- data/ext/makiri/{lexbor_compat → dom_adapter}/text_index.c +1 -1
- data/ext/makiri/fuzz/Makefile +1 -1
- data/ext/makiri/glue/cross_import.h +30 -0
- data/ext/makiri/glue/glue.h +1 -1
- data/ext/makiri/glue/ruby_doc.c +11 -3
- data/ext/makiri/glue/ruby_html_mutate.c +6 -0
- data/ext/makiri/glue/ruby_html_node.c +1 -1
- data/ext/makiri/glue/ruby_lexbor_css.c +462 -0
- data/ext/makiri/glue/ruby_node.c +14 -0
- data/ext/makiri/glue/ruby_xml.c +31 -2
- data/ext/makiri/glue/ruby_xml_node.c +87 -2
- data/ext/makiri/glue/ruby_xpath.c +16 -1
- data/ext/makiri/makiri.c +3 -0
- data/ext/makiri/makiri.h +5 -0
- data/ext/makiri/xml/mkr_xml.h +5 -0
- data/ext/makiri/xml/mkr_xml_chars.c +22 -0
- data/ext/makiri/xml/mkr_xml_mutate.c +160 -50
- data/ext/makiri/xml/mkr_xml_mutate.h +24 -0
- data/ext/makiri/xml/mkr_xml_tree.c +63 -27
- data/ext/makiri/xpath/mkr_xpath.c +28 -0
- data/ext/makiri/xpath/mkr_xpath.h +5 -1
- data/ext/makiri/xpath/mkr_xpath_eval_body.h +11 -1
- data/lib/makiri/html/document.rb +11 -12
- data/lib/makiri/html/node_methods.rb +0 -1
- data/lib/makiri/node_set.rb +14 -9
- data/lib/makiri/processing_instruction.rb +8 -0
- data/lib/makiri/version.rb +1 -1
- data/lib/makiri/xml/builder.rb +29 -21
- data/lib/makiri/xpath_context.rb +12 -4
- data/script/check_c_safety.rb +1 -1
- data/script/check_c_safety_allowlist.yml +8 -5
- data/script/leaks_harness.rb +7 -0
- data/suppressions/ruby.supp +140 -0
- metadata +13 -8
- /data/ext/makiri/{lexbor_compat → dom_adapter}/dom_index.c +0 -0
- /data/ext/makiri/{lexbor_compat → dom_adapter}/post_parse.c +0 -0
- /data/ext/makiri/{lexbor_compat → dom_adapter}/source_loc.c +0 -0
- /data/ext/makiri/{lexbor_compat → dom_adapter}/utf8_input.c +0 -0
|
@@ -0,0 +1,434 @@
|
|
|
1
|
+
/* cross_import.c - cross-kind subtree translation for Document#import_node.
|
|
2
|
+
*
|
|
3
|
+
* Makiri keeps HTML nodes (Lexbor lxb_dom_node_t) and XML nodes (mkr_xml_node_t)
|
|
4
|
+
* as distinct C representations that cannot share a tree. import_node bridges
|
|
5
|
+
* them: it deep/shallow-copies a subtree from one representation into the other,
|
|
6
|
+
* owned by the target document, returning a DETACHED copy (the caller links it).
|
|
7
|
+
*
|
|
8
|
+
* Ruby-FREE and lives in dom_adapter (not glue): it reads/writes BOTH lexbor and
|
|
9
|
+
* the XML arena, exactly the bridge this layer is for. The glue import_node entries
|
|
10
|
+
* (ruby_doc.c / ruby_xml_node.c) do the Ruby-VALUE kind check (mkr_node_kind), call
|
|
11
|
+
* these, and wrap/raise. Both directions:
|
|
12
|
+
* - build the destination subtree DETACHED, then return it (never linking into a
|
|
13
|
+
* live tree mid-build), so a failure abandons a self-contained partial subtree
|
|
14
|
+
* in the destination arena (HTML mraw or the XML node arena), freed with the
|
|
15
|
+
* document - the same fail-closed model the XML deep-copy uses;
|
|
16
|
+
* - walk the source with an explicit heap stack (no C recursion -> no stack DoS),
|
|
17
|
+
* freed on every path;
|
|
18
|
+
* - report failure via mkr_xml_mut_status_t; the Ruby entry maps it with
|
|
19
|
+
* mkr_xml_mut_check.
|
|
20
|
+
*
|
|
21
|
+
* Namespaces (phase 4): preserved across the two representations.
|
|
22
|
+
* - HTML->XML: an mkr node's namespace is resolved from xmlns declarations at
|
|
23
|
+
* insertion time (resolve_node_ns), so a directly-set ns_uri would be
|
|
24
|
+
* overwritten when the imported subtree is later linked. We therefore SYNTHESIZE
|
|
25
|
+
* xmlns declarations: each element declares xmlns="URI" when its namespace
|
|
26
|
+
* differs from the one inherited from its translated parent (so unprefixed
|
|
27
|
+
* elements resolve correctly), and a foreign-prefixed attribute (e.g. xlink:*)
|
|
28
|
+
* gets an xmlns:PREFIX declaration on its element. The predefined xml: prefix
|
|
29
|
+
* needs none.
|
|
30
|
+
* - XML->HTML: Lexbor stores a namespace as an id, so the element's node.ns is set
|
|
31
|
+
* from the URI (interning any URI via lxb_ns_append) and a namespaced attribute
|
|
32
|
+
* is built with lxb_dom_attr_set_name_ns.
|
|
33
|
+
*/
|
|
34
|
+
#include "cross_import.h"
|
|
35
|
+
#include "../core/mkr_core.h" /* mkr_grow_reserve, mkr_reallocarray, MKR_OK */
|
|
36
|
+
|
|
37
|
+
#include <lexbor/ns/ns.h> /* lxb_ns_by_id, LXB_NS_* */
|
|
38
|
+
#include <stdlib.h> /* free (the xmlns:PREFIX scratch; alloc via mkr_reallocarray) */
|
|
39
|
+
#include <string.h> /* memcmp / memcpy / memchr */
|
|
40
|
+
|
|
41
|
+
/* Exported by Lexbor but omitted from its public headers: names an attribute from
|
|
42
|
+
* (namespace URI, qualified name), splitting prefix/local and interning the ns.
|
|
43
|
+
* (Same forward declaration ruby_html_mutate.c uses.) */
|
|
44
|
+
extern lxb_status_t
|
|
45
|
+
lxb_dom_attr_set_name_ns(lxb_dom_attr_t *attr, const lxb_char_t *link,
|
|
46
|
+
size_t link_length, const lxb_char_t *name,
|
|
47
|
+
size_t name_length, bool to_lowercase);
|
|
48
|
+
|
|
49
|
+
/* Also Lexbor-internal: intern a namespace URI in the document's ns table,
|
|
50
|
+
* returning the entry (whose ns_id we set on a translated element's node.ns). */
|
|
51
|
+
extern const lxb_ns_data_t *
|
|
52
|
+
lxb_ns_append(lexbor_hash_t *hash, const lxb_char_t *link, size_t length);
|
|
53
|
+
|
|
54
|
+
/* A DOM name/value slice must fit uint32 (the mkr arena's per-slice cap and the
|
|
55
|
+
* factory signatures). A >4 GiB slice is rejected fail-closed rather than wrapped. */
|
|
56
|
+
#define MKR_FITS_U32(n) ((n) <= UINT32_MAX)
|
|
57
|
+
|
|
58
|
+
/* Intern +uri+ in the destination HTML document's namespace table and return its
|
|
59
|
+
* Lexbor id, so an element's namespace survives translation for ANY URI (not just
|
|
60
|
+
* the few Lexbor knows by default) - the same interning lxb_dom_attr_set_name_ns
|
|
61
|
+
* does for attributes. A null/empty URI (or an intern OOM) is the null namespace. */
|
|
62
|
+
static lxb_ns_id_t
|
|
63
|
+
x2h_ns_id(lxb_dom_document_t *hdoc, const char *uri, uint32_t len)
|
|
64
|
+
{
|
|
65
|
+
if (uri == NULL || len == 0) return LXB_NS__UNDEF;
|
|
66
|
+
const lxb_ns_data_t *d = lxb_ns_append(hdoc->ns, (const lxb_char_t *)uri, len);
|
|
67
|
+
return (d != NULL) ? d->ns_id : LXB_NS__UNDEF; /* fail-soft on OOM */
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
/* The URI string for an HTML node's namespace id (borrowed from the source
|
|
71
|
+
* document's interned ns table - stable for that document's lifetime), or NULL/0
|
|
72
|
+
* for the null namespace. */
|
|
73
|
+
static const char *
|
|
74
|
+
mkr_html_ns_uri(lxb_dom_node_t *n, uint32_t *out_len)
|
|
75
|
+
{
|
|
76
|
+
*out_len = 0;
|
|
77
|
+
if (n->ns == LXB_NS__UNDEF) return NULL;
|
|
78
|
+
size_t len = 0;
|
|
79
|
+
const lxb_char_t *u = lxb_ns_by_id(n->owner_document->ns, n->ns, &len);
|
|
80
|
+
if (u == NULL || !MKR_FITS_U32(len)) return NULL;
|
|
81
|
+
*out_len = (uint32_t)len;
|
|
82
|
+
return (const char *)u;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
/* Byte-wise equality of two length-delimited URI slices.
 *
 * Returns 1 when both slices have the same length and the same bytes, else 0.
 * Two zero-length slices compare equal without their pointers being read, so a
 * NULL pointer paired with length 0 (the null namespace) is a valid argument. */
static int
mkr_uri_eq(const char *a, uint32_t al, const char *b, uint32_t bl)
{
    if (al != bl) {
        return 0;
    }
    if (al == 0) {
        return 1; /* both empty: nothing to compare, pointers may be NULL */
    }
    return memcmp(a, b, al) == 0;
}
|
|
90
|
+
|
|
91
|
+
/* ----- explicit (src, dst) work stack shared by both directions -------------- */

/* One pending unit of work: a source node whose children still have to be
 * translated, paired with the destination node those children attach to. */
typedef struct {
    void *s;          /* source node (lxb_dom_node_t * or mkr_xml_node_t *) */
    void *d;          /* destination node the translated children are linked under */
    const char *def;  /* HTML->XML only: default-namespace URI in scope for d's
                       * children, so a child redeclares xmlns only when it
                       * differs; NULL for XML->HTML */
    uint32_t deflen;  /* byte length of def; 0 for XML->HTML */
} mkr_xframe_t;

/* Growable array of frames used as a LIFO stack. Field order matters: the stack
 * is created with the positional initializer { NULL, 0, 0 }. */
typedef struct {
    mkr_xframe_t *v;  /* frame storage; released with free() by the owner */
    size_t n;         /* frames currently on the stack */
    size_t cap;       /* capacity bookkeeping handed to mkr_grow_reserve */
} mkr_xstack_t;
|
|
97
|
+
|
|
98
|
+
static int
|
|
99
|
+
mkr_xstack_push(mkr_xstack_t *st, void *s, void *d, const char *def, uint32_t deflen)
|
|
100
|
+
{
|
|
101
|
+
if (mkr_grow_reserve((void **)&st->v, &st->cap, st->n + 1, sizeof(*st->v)) != MKR_OK) {
|
|
102
|
+
return -1;
|
|
103
|
+
}
|
|
104
|
+
st->v[st->n].s = s;
|
|
105
|
+
st->v[st->n].d = d;
|
|
106
|
+
st->v[st->n].def = def;
|
|
107
|
+
st->v[st->n].deflen = deflen;
|
|
108
|
+
st->n++;
|
|
109
|
+
return 0;
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
/* ===================== HTML (lxb) -> XML (mkr) =============================== */
|
|
113
|
+
|
|
114
|
+
/* Declare xmlns (prefix NULL/plen 0) or xmlns:PREFIX = uri on the detached mkr
|
|
115
|
+
* element +el+, as an ordinary attribute, so the inserted subtree's prefix-based
|
|
116
|
+
* namespace resolution reproduces +uri+. */
|
|
117
|
+
static mkr_xml_mut_status_t
|
|
118
|
+
h2x_declare_ns(mkr_xml_doc_t *xdoc, mkr_xml_node_t *el,
|
|
119
|
+
const char *prefix, uint32_t plen, const char *uri, uint32_t ulen)
|
|
120
|
+
{
|
|
121
|
+
if (plen == 0) {
|
|
122
|
+
return mkr_xml_set_attribute(xdoc, el, "xmlns", 5, uri != NULL ? uri : "", ulen, NULL);
|
|
123
|
+
}
|
|
124
|
+
size_t nlen = (size_t)6 + plen; /* "xmlns:" + prefix */
|
|
125
|
+
if (!MKR_FITS_U32(nlen)) return MKR_XML_MUT_OOM;
|
|
126
|
+
char *nm = mkr_reallocarray(NULL, nlen, 1); /* overflow-checked safe alloc */
|
|
127
|
+
if (nm == NULL) return MKR_XML_MUT_OOM;
|
|
128
|
+
memcpy(nm, "xmlns:", 6);
|
|
129
|
+
memcpy(nm + 6, prefix, plen);
|
|
130
|
+
mkr_xml_mut_status_t st = mkr_xml_set_attribute(xdoc, el, nm, (uint32_t)nlen,
|
|
131
|
+
uri != NULL ? uri : "", ulen, NULL);
|
|
132
|
+
free(nm);
|
|
133
|
+
return st;
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
/* Copy +s+'s attributes onto the translated mkr element +el+, declaring an
|
|
137
|
+
* xmlns:PREFIX for each foreign-prefixed attribute so resolution at link time
|
|
138
|
+
* succeeds (the predefined xml: prefix needs none). */
|
|
139
|
+
static mkr_xml_mut_status_t
|
|
140
|
+
h2x_copy_attrs(mkr_xml_doc_t *xdoc, lxb_dom_node_t *s, mkr_xml_node_t *el)
|
|
141
|
+
{
|
|
142
|
+
for (lxb_dom_attr_t *a = lxb_dom_element_first_attribute(lxb_dom_interface_element(s));
|
|
143
|
+
a != NULL; a = lxb_dom_element_next_attribute(a)) {
|
|
144
|
+
size_t anl, avl;
|
|
145
|
+
const lxb_char_t *an = lxb_dom_attr_qualified_name(a, &anl);
|
|
146
|
+
const lxb_char_t *av = lxb_dom_attr_value(a, &avl);
|
|
147
|
+
if (!MKR_FITS_U32(anl) || !MKR_FITS_U32(avl)) return MKR_XML_MUT_OOM;
|
|
148
|
+
|
|
149
|
+
lxb_ns_id_t ans = a->node.ns;
|
|
150
|
+
if (ans != LXB_NS__UNDEF && ans != LXB_NS_HTML && ans != LXB_NS_XML) {
|
|
151
|
+
const lxb_char_t *colon = memchr(an, ':', anl);
|
|
152
|
+
if (colon != NULL) {
|
|
153
|
+
uint32_t ulen;
|
|
154
|
+
const char *uri = mkr_html_ns_uri(&a->node, &ulen);
|
|
155
|
+
if (uri != NULL) {
|
|
156
|
+
mkr_xml_mut_status_t st = h2x_declare_ns(
|
|
157
|
+
xdoc, el, (const char *)an, (uint32_t)(colon - an), uri, ulen);
|
|
158
|
+
if (st != MKR_XML_MUT_OK) return st;
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
mkr_xml_mut_status_t st = mkr_xml_set_attribute(
|
|
163
|
+
xdoc, el, (const char *)an, (uint32_t)anl,
|
|
164
|
+
av != NULL ? (const char *)av : "", (uint32_t)avl, NULL);
|
|
165
|
+
if (st != MKR_XML_MUT_OK) return st;
|
|
166
|
+
}
|
|
167
|
+
return MKR_XML_MUT_OK;
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
/* Translate ONE lxb node into a fresh mkr node (own fields + attributes, NOT its
|
|
171
|
+
* children). +pdef+/+pdef_len+ is the default namespace inherited from the
|
|
172
|
+
* translated parent; *cdef / *cdef_len receive the default namespace in scope for
|
|
173
|
+
* THIS node's children. *out is the new node, or NULL to SKIP an unsupported type;
|
|
174
|
+
* an error status fails the whole import. */
|
|
175
|
+
static mkr_xml_mut_status_t
|
|
176
|
+
h2x_make(mkr_xml_doc_t *xdoc, lxb_dom_node_t *s, const char *pdef, uint32_t pdef_len,
|
|
177
|
+
const char **cdef, uint32_t *cdef_len, mkr_xml_node_t **out)
|
|
178
|
+
{
|
|
179
|
+
*out = NULL;
|
|
180
|
+
*cdef = pdef; /* default: a non-element does not change the scope */
|
|
181
|
+
*cdef_len = pdef_len;
|
|
182
|
+
|
|
183
|
+
switch (s->type) {
|
|
184
|
+
case LXB_DOM_NODE_TYPE_ELEMENT: {
|
|
185
|
+
size_t nl;
|
|
186
|
+
const lxb_char_t *nm = lxb_dom_element_qualified_name(lxb_dom_interface_element(s), &nl);
|
|
187
|
+
if (!MKR_FITS_U32(nl)) return MKR_XML_MUT_OOM;
|
|
188
|
+
mkr_xml_node_t *el = NULL;
|
|
189
|
+
mkr_xml_mut_status_t st = mkr_xml_new_element(xdoc, (const char *)nm, (uint32_t)nl, &el);
|
|
190
|
+
if (st != MKR_XML_MUT_OK) return st;
|
|
191
|
+
|
|
192
|
+
/* Declare the element's default namespace iff it differs from the inherited
|
|
193
|
+
* one, so this (unprefixed, like all HTML elements) element resolves to it.
|
|
194
|
+
* An element with no namespace under an inherited default undeclares (xmlns=""). */
|
|
195
|
+
uint32_t eul;
|
|
196
|
+
const char *euri = mkr_html_ns_uri(s, &eul);
|
|
197
|
+
if (!mkr_uri_eq(euri, eul, pdef, pdef_len)) {
|
|
198
|
+
st = h2x_declare_ns(xdoc, el, NULL, 0, euri, eul);
|
|
199
|
+
if (st != MKR_XML_MUT_OK) return st;
|
|
200
|
+
*cdef = (euri != NULL) ? euri : "";
|
|
201
|
+
*cdef_len = eul;
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
st = h2x_copy_attrs(xdoc, s, el);
|
|
205
|
+
if (st != MKR_XML_MUT_OK) return st;
|
|
206
|
+
*out = el;
|
|
207
|
+
return MKR_XML_MUT_OK;
|
|
208
|
+
}
|
|
209
|
+
case LXB_DOM_NODE_TYPE_TEXT:
|
|
210
|
+
case LXB_DOM_NODE_TYPE_CDATA_SECTION:
|
|
211
|
+
case LXB_DOM_NODE_TYPE_COMMENT: {
|
|
212
|
+
const lexbor_str_t *d = &lxb_dom_interface_character_data(s)->data;
|
|
213
|
+
if (!MKR_FITS_U32(d->length)) return MKR_XML_MUT_OOM;
|
|
214
|
+
uint8_t t = (s->type == LXB_DOM_NODE_TYPE_TEXT) ? MKR_XML_NODE_TYPE_TEXT
|
|
215
|
+
: (s->type == LXB_DOM_NODE_TYPE_CDATA_SECTION) ? MKR_XML_NODE_TYPE_CDATA_SECTION
|
|
216
|
+
: MKR_XML_NODE_TYPE_COMMENT;
|
|
217
|
+
return mkr_xml_new_chardata(xdoc, t, d->data != NULL ? (const char *)d->data : "",
|
|
218
|
+
(uint32_t)d->length, out);
|
|
219
|
+
}
|
|
220
|
+
case LXB_DOM_NODE_TYPE_PROCESSING_INSTRUCTION: {
|
|
221
|
+
size_t tl;
|
|
222
|
+
const lxb_char_t *tg = lxb_dom_processing_instruction_target(
|
|
223
|
+
lxb_dom_interface_processing_instruction(s), &tl);
|
|
224
|
+
const lexbor_str_t *d = &lxb_dom_interface_character_data(s)->data;
|
|
225
|
+
if (!MKR_FITS_U32(tl) || !MKR_FITS_U32(d->length)) return MKR_XML_MUT_OOM;
|
|
226
|
+
return mkr_xml_new_pi(xdoc, (const char *)tg, (uint32_t)tl,
|
|
227
|
+
d->data != NULL ? (const char *)d->data : "", (uint32_t)d->length, out);
|
|
228
|
+
}
|
|
229
|
+
case LXB_DOM_NODE_TYPE_DOCUMENT_FRAGMENT: {
|
|
230
|
+
mkr_xml_node_t *f = mkr_xml_arena_node(xdoc, MKR_XML_NODE_TYPE_DOCUMENT_FRAGMENT);
|
|
231
|
+
if (f == NULL) return MKR_XML_MUT_OOM;
|
|
232
|
+
*out = f;
|
|
233
|
+
return MKR_XML_MUT_OK;
|
|
234
|
+
}
|
|
235
|
+
default:
|
|
236
|
+
return MKR_XML_MUT_OK; /* unsupported descendant type: skip (*out stays NULL) */
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
/* The children to translate under +s+. An HTML <template> keeps its content in a
|
|
241
|
+
* separate document fragment, NOT the normal child chain, so a plain first_child
|
|
242
|
+
* walk would silently drop template contents. We descend into the content fragment
|
|
243
|
+
* instead: mkr (XML) has no template-content concept, so the contents become
|
|
244
|
+
* ordinary children of the translated element (lossless, the natural XML shape). */
|
|
245
|
+
static lxb_dom_node_t *
|
|
246
|
+
h2x_children_of(lxb_dom_node_t *s)
|
|
247
|
+
{
|
|
248
|
+
if (s->type == LXB_DOM_NODE_TYPE_ELEMENT
|
|
249
|
+
&& s->local_name == LXB_TAG_TEMPLATE && s->ns == LXB_NS_HTML) {
|
|
250
|
+
lxb_dom_document_fragment_t *content = lxb_html_interface_template(s)->content;
|
|
251
|
+
return content != NULL ? lxb_dom_interface_node(content)->first_child : NULL;
|
|
252
|
+
}
|
|
253
|
+
return s->first_child;
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
mkr_xml_mut_status_t
|
|
257
|
+
mkr_cross_html_to_xml(mkr_xml_doc_t *xdoc, lxb_dom_node_t *src, int deep, mkr_xml_node_t **out)
|
|
258
|
+
{
|
|
259
|
+
*out = NULL;
|
|
260
|
+
mkr_xml_node_t *root = NULL;
|
|
261
|
+
const char *rdef = NULL; uint32_t rdef_len = 0;
|
|
262
|
+
mkr_xml_mut_status_t st = h2x_make(xdoc, src, NULL, 0, &rdef, &rdef_len, &root);
|
|
263
|
+
if (st != MKR_XML_MUT_OK) return st;
|
|
264
|
+
if (root == NULL) return MKR_XML_MUT_TYPE; /* root node type has no XML counterpart */
|
|
265
|
+
|
|
266
|
+
if (deep) {
|
|
267
|
+
mkr_xstack_t stk = { NULL, 0, 0 };
|
|
268
|
+
if (mkr_xstack_push(&stk, src, root, rdef, rdef_len) != 0) { free(stk.v); return MKR_XML_MUT_OOM; }
|
|
269
|
+
while (stk.n > 0) {
|
|
270
|
+
mkr_xframe_t f = stk.v[--stk.n];
|
|
271
|
+
lxb_dom_node_t *s = (lxb_dom_node_t *)f.s;
|
|
272
|
+
mkr_xml_node_t *d = (mkr_xml_node_t *)f.d;
|
|
273
|
+
for (lxb_dom_node_t *c = h2x_children_of(s); c != NULL; c = c->next) {
|
|
274
|
+
mkr_xml_node_t *dc = NULL;
|
|
275
|
+
const char *cdef = NULL; uint32_t cdef_len = 0;
|
|
276
|
+
st = h2x_make(xdoc, c, f.def, f.deflen, &cdef, &cdef_len, &dc);
|
|
277
|
+
if (st != MKR_XML_MUT_OK) goto done;
|
|
278
|
+
if (dc == NULL) continue; /* skipped node type */
|
|
279
|
+
st = mkr_xml_insert_child(xdoc, d, dc); /* detached parent: ns deferred */
|
|
280
|
+
if (st != MKR_XML_MUT_OK) goto done;
|
|
281
|
+
if (h2x_children_of(c) != NULL
|
|
282
|
+
&& mkr_xstack_push(&stk, c, dc, cdef, cdef_len) != 0) {
|
|
283
|
+
st = MKR_XML_MUT_OOM; goto done;
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
done:
|
|
288
|
+
free(stk.v);
|
|
289
|
+
if (st != MKR_XML_MUT_OK) return st; /* partial subtree abandoned in the arena */
|
|
290
|
+
}
|
|
291
|
+
*out = root;
|
|
292
|
+
return MKR_XML_MUT_OK;
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
/* ===================== XML (mkr) -> HTML (lxb) =============================== */
|
|
296
|
+
|
|
297
|
+
/* Copy +s+'s attributes onto the translated lxb element +el+, preserving each
|
|
298
|
+
* attribute's namespace (a null-namespace attribute via set_attribute, a
|
|
299
|
+
* namespaced one via an explicit lxb_dom_attr_set_name_ns). */
|
|
300
|
+
static mkr_xml_mut_status_t
|
|
301
|
+
x2h_copy_attrs(lxb_dom_document_t *hdoc, const mkr_xml_node_t *s, lxb_dom_element_t *el)
|
|
302
|
+
{
|
|
303
|
+
for (const mkr_xml_node_t *a = s->attrs; a != NULL; a = a->next) {
|
|
304
|
+
const char *val = a->value != NULL ? a->value : "";
|
|
305
|
+
if (a->ns_uri_len == 0) {
|
|
306
|
+
if (lxb_dom_element_set_attribute(el, (const lxb_char_t *)a->qname, a->qname_len,
|
|
307
|
+
(const lxb_char_t *)val, a->value_len) == NULL) {
|
|
308
|
+
return MKR_XML_MUT_OOM;
|
|
309
|
+
}
|
|
310
|
+
continue;
|
|
311
|
+
}
|
|
312
|
+
lxb_dom_attr_t *at = lxb_dom_attr_interface_create(hdoc);
|
|
313
|
+
if (at == NULL) return MKR_XML_MUT_OOM;
|
|
314
|
+
if (lxb_dom_attr_set_name_ns(at, (const lxb_char_t *)a->ns_uri, a->ns_uri_len,
|
|
315
|
+
(const lxb_char_t *)a->qname, a->qname_len, false) != LXB_STATUS_OK
|
|
316
|
+
|| lxb_dom_attr_set_value(at, (const lxb_char_t *)val, a->value_len) != LXB_STATUS_OK) {
|
|
317
|
+
return MKR_XML_MUT_OOM; /* the un-appended attr is abandoned in mraw */
|
|
318
|
+
}
|
|
319
|
+
lxb_dom_element_attr_append(el, at);
|
|
320
|
+
}
|
|
321
|
+
return MKR_XML_MUT_OK;
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
/* Translate ONE mkr node into a fresh, detached lxb node (own fields + attributes,
|
|
325
|
+
* NOT children). *out is the new node, or NULL to SKIP an unsupported type. An XML
|
|
326
|
+
* CDATA section has no HTML counterpart, so it fails closed (MKR_XML_MUT_TYPE). */
|
|
327
|
+
static mkr_xml_mut_status_t
|
|
328
|
+
x2h_make(lxb_dom_document_t *hdoc, const mkr_xml_node_t *s, lxb_dom_node_t **out)
|
|
329
|
+
{
|
|
330
|
+
*out = NULL;
|
|
331
|
+
switch (s->type) {
|
|
332
|
+
case MKR_XML_NODE_TYPE_ELEMENT: {
|
|
333
|
+
lxb_dom_element_t *el = lxb_dom_document_create_element(
|
|
334
|
+
hdoc, (const lxb_char_t *)s->qname, s->qname_len, NULL);
|
|
335
|
+
if (el == NULL) return MKR_XML_MUT_OOM;
|
|
336
|
+
/* Preserve the namespace as a Lexbor id (any URI, interned; else null). */
|
|
337
|
+
lxb_dom_interface_node(el)->ns = x2h_ns_id(hdoc, s->ns_uri, s->ns_uri_len);
|
|
338
|
+
mkr_xml_mut_status_t st = x2h_copy_attrs(hdoc, s, el);
|
|
339
|
+
if (st != MKR_XML_MUT_OK) return st;
|
|
340
|
+
*out = lxb_dom_interface_node(el);
|
|
341
|
+
return MKR_XML_MUT_OK;
|
|
342
|
+
}
|
|
343
|
+
case MKR_XML_NODE_TYPE_TEXT: {
|
|
344
|
+
lxb_dom_text_t *t = lxb_dom_document_create_text_node(
|
|
345
|
+
hdoc, (const lxb_char_t *)(s->value != NULL ? s->value : ""), s->value_len);
|
|
346
|
+
if (t == NULL) return MKR_XML_MUT_OOM;
|
|
347
|
+
*out = lxb_dom_interface_node(t);
|
|
348
|
+
return MKR_XML_MUT_OK;
|
|
349
|
+
}
|
|
350
|
+
case MKR_XML_NODE_TYPE_COMMENT: {
|
|
351
|
+
lxb_dom_comment_t *c = lxb_dom_document_create_comment(
|
|
352
|
+
hdoc, (const lxb_char_t *)(s->value != NULL ? s->value : ""), s->value_len);
|
|
353
|
+
if (c == NULL) return MKR_XML_MUT_OOM;
|
|
354
|
+
*out = lxb_dom_interface_node(c);
|
|
355
|
+
return MKR_XML_MUT_OK;
|
|
356
|
+
}
|
|
357
|
+
case MKR_XML_NODE_TYPE_PI: {
|
|
358
|
+
/* The PI target is the node's name (local == qname for a PI); data is value. */
|
|
359
|
+
lxb_dom_processing_instruction_t *pi = lxb_dom_document_create_processing_instruction(
|
|
360
|
+
hdoc, (const lxb_char_t *)s->local, s->local_len,
|
|
361
|
+
(const lxb_char_t *)(s->value != NULL ? s->value : ""), s->value_len);
|
|
362
|
+
if (pi == NULL) return MKR_XML_MUT_OOM;
|
|
363
|
+
*out = lxb_dom_interface_node(pi);
|
|
364
|
+
return MKR_XML_MUT_OK;
|
|
365
|
+
}
|
|
366
|
+
case MKR_XML_NODE_TYPE_CDATA_SECTION:
|
|
367
|
+
return MKR_XML_MUT_TYPE; /* HTML has no CDATA section: fail closed */
|
|
368
|
+
case MKR_XML_NODE_TYPE_DOCUMENT_FRAGMENT: {
|
|
369
|
+
lxb_dom_document_fragment_t *f = lxb_dom_document_create_document_fragment(hdoc);
|
|
370
|
+
if (f == NULL) return MKR_XML_MUT_OOM;
|
|
371
|
+
*out = lxb_dom_interface_node(f);
|
|
372
|
+
return MKR_XML_MUT_OK;
|
|
373
|
+
}
|
|
374
|
+
default:
|
|
375
|
+
return MKR_XML_MUT_OK; /* unsupported descendant type: skip (*out stays NULL) */
|
|
376
|
+
}
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
/* Where a translated element's CHILDREN attach. An HTML <template> holds its
|
|
380
|
+
* content in a separate document fragment (HTMLTemplateElement.content), not the
|
|
381
|
+
* normal child chain, so children go there - matching a parsed template and the
|
|
382
|
+
* HTML->HTML import_node fixup (mkr_fixup_template_content). Other elements link
|
|
383
|
+
* children directly. */
|
|
384
|
+
static lxb_dom_node_t *
|
|
385
|
+
x2h_link_target(lxb_dom_node_t *el)
|
|
386
|
+
{
|
|
387
|
+
if (el->type == LXB_DOM_NODE_TYPE_ELEMENT
|
|
388
|
+
&& el->local_name == LXB_TAG_TEMPLATE && el->ns == LXB_NS_HTML) {
|
|
389
|
+
lxb_dom_document_fragment_t *content = lxb_html_interface_template(el)->content;
|
|
390
|
+
if (content != NULL) return lxb_dom_interface_node(content);
|
|
391
|
+
}
|
|
392
|
+
return el;
|
|
393
|
+
}
|
|
394
|
+
|
|
395
|
+
mkr_xml_mut_status_t
|
|
396
|
+
mkr_cross_xml_to_html(lxb_dom_document_t *hdoc, const mkr_xml_node_t *src, int deep,
|
|
397
|
+
lxb_dom_node_t **out)
|
|
398
|
+
{
|
|
399
|
+
*out = NULL;
|
|
400
|
+
lxb_dom_node_t *root = NULL;
|
|
401
|
+
mkr_xml_mut_status_t st = x2h_make(hdoc, src, &root);
|
|
402
|
+
if (st != MKR_XML_MUT_OK) return st;
|
|
403
|
+
if (root == NULL) return MKR_XML_MUT_TYPE; /* root node type has no HTML counterpart */
|
|
404
|
+
|
|
405
|
+
if (deep) {
|
|
406
|
+
mkr_xstack_t stk = { NULL, 0, 0 };
|
|
407
|
+
/* The frame's d is the link target for the source node's children (a
|
|
408
|
+
* template element's content fragment, else the element itself). */
|
|
409
|
+
if (mkr_xstack_push(&stk, (void *)src, x2h_link_target(root), NULL, 0) != 0) {
|
|
410
|
+
free(stk.v); return MKR_XML_MUT_OOM;
|
|
411
|
+
}
|
|
412
|
+
while (stk.n > 0) {
|
|
413
|
+
mkr_xframe_t f = stk.v[--stk.n];
|
|
414
|
+
const mkr_xml_node_t *s = (const mkr_xml_node_t *)f.s;
|
|
415
|
+
lxb_dom_node_t *d = (lxb_dom_node_t *)f.d;
|
|
416
|
+
for (const mkr_xml_node_t *c = s->first_child; c != NULL; c = c->next) {
|
|
417
|
+
lxb_dom_node_t *dc = NULL;
|
|
418
|
+
st = x2h_make(hdoc, c, &dc);
|
|
419
|
+
if (st != MKR_XML_MUT_OK) goto done;
|
|
420
|
+
if (dc == NULL) continue; /* skipped node type */
|
|
421
|
+
lxb_dom_node_insert_child(d, dc);
|
|
422
|
+
if (c->first_child != NULL
|
|
423
|
+
&& mkr_xstack_push(&stk, (void *)c, x2h_link_target(dc), NULL, 0) != 0) {
|
|
424
|
+
st = MKR_XML_MUT_OOM; goto done;
|
|
425
|
+
}
|
|
426
|
+
}
|
|
427
|
+
}
|
|
428
|
+
done:
|
|
429
|
+
free(stk.v);
|
|
430
|
+
if (st != MKR_XML_MUT_OK) return st; /* partial subtree abandoned in mraw */
|
|
431
|
+
}
|
|
432
|
+
*out = root;
|
|
433
|
+
return MKR_XML_MUT_OK;
|
|
434
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
#ifndef MAKIRI_DOM_ADAPTER_CROSS_IMPORT_H
|
|
2
|
+
#define MAKIRI_DOM_ADAPTER_CROSS_IMPORT_H
|
|
3
|
+
|
|
4
|
+
/* Cross-kind subtree translation for Document#import_node. Ruby-FREE: it reads and
|
|
5
|
+
* writes BOTH the HTML (Lexbor lxb_dom_node_t) and XML (mkr_xml_node_t)
|
|
6
|
+
* representations, so it lives in dom_adapter - the layer that already bridges
|
|
7
|
+
* Lexbor and the XML arena - rather than in the Ruby glue. The glue import_node
|
|
8
|
+
* entries call these after a kind check and wrap the result. */
|
|
9
|
+
|
|
10
|
+
#include <lexbor/html/html.h> /* lxb_dom_* + the <template> content interface */
|
|
11
|
+
#include <lexbor/dom/dom.h>
|
|
12
|
+
#include "../xml/mkr_xml_node.h" /* mkr_xml_node_t, mkr_xml_doc_t */
|
|
13
|
+
#include "../xml/mkr_xml_mutate.h" /* mkr_xml_mut_status_t + node factories */
|
|
14
|
+
|
|
15
|
+
#ifdef __cplusplus
|
|
16
|
+
extern "C" {
|
|
17
|
+
#endif
|
|
18
|
+
|
|
19
|
+
/* Build a DETACHED deep (or shallow, deep == 0) copy of +src+ in the OTHER
|
|
20
|
+
* representation, owned by the destination document, returned in *out (set only on
|
|
21
|
+
* MKR_XML_MUT_OK). Iterative (no C recursion -> no stack DoS) and fail-closed: a
|
|
22
|
+
* failure abandons a self-contained partial subtree in the destination arena (the
|
|
23
|
+
* XML node arena or Lexbor's mraw), freed with the document. Namespaces are
|
|
24
|
+
* preserved across the translation; an XML CDATA section has no HTML counterpart,
|
|
25
|
+
* so mkr_cross_xml_to_html fails closed (MKR_XML_MUT_TYPE) when it meets one. */
|
|
26
|
+
mkr_xml_mut_status_t mkr_cross_html_to_xml(mkr_xml_doc_t *xdoc, lxb_dom_node_t *src,
|
|
27
|
+
int deep, mkr_xml_node_t **out);
|
|
28
|
+
mkr_xml_mut_status_t mkr_cross_xml_to_html(lxb_dom_document_t *hdoc, const mkr_xml_node_t *src,
|
|
29
|
+
int deep, lxb_dom_node_t **out);
|
|
30
|
+
|
|
31
|
+
#ifdef __cplusplus
|
|
32
|
+
}
|
|
33
|
+
#endif
|
|
34
|
+
|
|
35
|
+
#endif /* MAKIRI_DOM_ADAPTER_CROSS_IMPORT_H */
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
#include <string.h>
|
|
8
8
|
|
|
9
9
|
/*
|
|
10
|
-
* Per-document text-extraction index (
|
|
10
|
+
* Per-document text-extraction index (dom_adapter/text_index.c).
|
|
11
11
|
*
|
|
12
12
|
* Descendant-text aggregation (Node#text, XPath string-value) walks every node
|
|
13
13
|
* of a subtree chasing pointers through Lexbor's 96-byte nodes - at scale this
|
data/ext/makiri/fuzz/Makefile
CHANGED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
#ifndef MAKIRI_CROSS_IMPORT_H
|
|
2
|
+
#define MAKIRI_CROSS_IMPORT_H
|
|
3
|
+
|
|
4
|
+
/* Glue-side helpers for cross-kind Document#import_node. The Ruby-FREE subtree
|
|
5
|
+
* translators live in dom_adapter/cross_import.c (they read/write both Lexbor
|
|
6
|
+
* and the XML arena); this header adds the Ruby-boundary pieces the import_node
|
|
7
|
+
* entries (ruby_doc.c / ruby_xml_node.c) need on top of them. */
|
|
8
|
+
|
|
9
|
+
#include "../makiri.h"
|
|
10
|
+
#include "../dom_adapter/cross_import.h" /* mkr_cross_html_to_xml / _xml_to_html */
|
|
11
|
+
|
|
12
|
+
#ifdef __cplusplus
|
|
13
|
+
extern "C" {
|
|
14
|
+
#endif
|
|
15
|
+
|
|
16
|
+
/* Which representation a wrapped Ruby node is, by its TypedData type (NOT by Ruby
|
|
17
|
+
* class). A Document VALUE, a NodeSet, or any non-node is MKR_NODE_KIND_OTHER.
|
|
18
|
+
* Defined in ruby_node.c. */
|
|
19
|
+
typedef enum {
    MKR_NODE_KIND_OTHER = 0, /* not a wrapped node (e.g. a Document VALUE or a NodeSet) */
    MKR_NODE_KIND_HTML,      /* wrapped HTML node */
    MKR_NODE_KIND_XML        /* wrapped XML node */
} mkr_node_kind_t;
|
|
20
|
+
mkr_node_kind_t mkr_node_kind(VALUE v);
|
|
21
|
+
|
|
22
|
+
/* Raise a Ruby exception for a non-OK mutation status (no-op on OK). Defined in
|
|
23
|
+
* ruby_xml_node.c; shared by the XML mutators and the cross-import entries. */
|
|
24
|
+
void mkr_xml_mut_check(mkr_xml_mut_status_t st);
|
|
25
|
+
|
|
26
|
+
#ifdef __cplusplus
|
|
27
|
+
}
|
|
28
|
+
#endif
|
|
29
|
+
|
|
30
|
+
#endif /* MAKIRI_CROSS_IMPORT_H */
|
data/ext/makiri/glue/glue.h
CHANGED
data/ext/makiri/glue/ruby_doc.c
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
#include "glue.h"
|
|
2
|
-
#include "
|
|
2
|
+
#include "cross_import.h" /* cross-kind import_node (XML node -> this HTML doc) */
|
|
3
|
+
#include "../dom_adapter/compat_internal.h" /* mkr_dom_preorder_next */
|
|
3
4
|
#include "../core/mkr_core.h"
|
|
4
5
|
#include "../xml/mkr_xml.h" /* mkr_xml_doc_memsize for an XML-backed document */
|
|
5
6
|
|
|
@@ -348,10 +349,17 @@ mkr_doc_import_node(int argc, VALUE *argv, VALUE self)
|
|
|
348
349
|
VALUE node_v, deep_v;
|
|
349
350
|
rb_scan_args(argc, argv, "11", &node_v, &deep_v);
|
|
350
351
|
bool deep = RTEST(deep_v);
|
|
351
|
-
|
|
352
|
-
lxb_dom_node_t *src = mkr_html_node_unwrap(node_v); /* reject an XML node before lxb use */
|
|
353
352
|
lxb_dom_document_t *doc = mkr_html_doc_unwrap(self);
|
|
354
353
|
|
|
354
|
+
/* An XML node is TRANSLATED across representations (mkr -> lxb) by
|
|
355
|
+
* mkr_cross_xml_to_html (dom_adapter/cross_import.c) into a detached lxb subtree owned by this document. */
|
|
356
|
+
if (mkr_node_kind(node_v) == MKR_NODE_KIND_XML) {
|
|
357
|
+
lxb_dom_node_t *imp = NULL;
|
|
358
|
+
mkr_xml_mut_check(mkr_cross_xml_to_html(doc, mkr_xml_node_unwrap(node_v), deep, &imp));
|
|
359
|
+
return mkr_wrap_html_node(imp, self);
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
lxb_dom_node_t *src = mkr_html_node_unwrap(node_v); /* HTML node (raises on a non-node) */
|
|
355
363
|
lxb_dom_node_t *imp = lxb_dom_document_import_node(doc, src, deep);
|
|
356
364
|
if (imp == NULL) {
|
|
357
365
|
rb_raise(mkr_eError, "failed to import node");
|
|
@@ -426,6 +426,12 @@ mkr_node_set_name(VALUE self, VALUE rb_name)
|
|
|
426
426
|
el->qualified_name = fresh->qualified_name;
|
|
427
427
|
|
|
428
428
|
lxb_dom_node_destroy(lxb_dom_interface_node(fresh));
|
|
429
|
+
/* The element's tag id (local_name) is the key the element-by-tag index
|
|
430
|
+
* buckets on and the //tag fast path serves from; renaming changes it, so
|
|
431
|
+
* the persisted index would otherwise miss the element under its new name
|
|
432
|
+
* (a truncated, wrong //newtag result). Drop the indexes like every other
|
|
433
|
+
* mutator. */
|
|
434
|
+
mkr_invalidate_index(self);
|
|
429
435
|
return rb_name;
|
|
430
436
|
}
|
|
431
437
|
|
|
@@ -408,7 +408,7 @@ mkr_node_content(VALUE self)
|
|
|
408
408
|
|
|
409
409
|
/* Fast path for elements / fragments (the common case, incl. document text).
|
|
410
410
|
*
|
|
411
|
-
* Preferred: the per-document text index (
|
|
411
|
+
* Preferred: the per-document text index (dom_adapter/text_index.c) maps
|
|
412
412
|
* this node to the contiguous, document-order run of its descendants' text
|
|
413
413
|
* slices, so we serve a single pre-sized memcpy run with no per-extraction
|
|
414
414
|
* tree walk - the walk is otherwise the dominant, cache-bound cost. Built
|