makiri 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/conformance.yml +22 -0
- data/.github/workflows/libfuzzer.yml +83 -0
- data/.github/workflows/release.yml +12 -7
- data/.github/workflows/security.yml +88 -3
- data/.github/workflows/valgrind.yml +135 -0
- data/CHANGELOG.md +152 -15
- data/README.md +183 -13
- data/Rakefile +294 -7
- data/ext/makiri/bridge/bridge.h +28 -0
- data/ext/makiri/bridge/ruby_string.c +282 -12
- data/ext/makiri/core/mkr_alloc.c +40 -3
- data/ext/makiri/core/mkr_alloc.h +28 -5
- data/ext/makiri/core/mkr_buf.c +47 -3
- data/ext/makiri/core/mkr_buf.h +112 -3
- data/ext/makiri/core/mkr_core.c +143 -0
- data/ext/makiri/core/mkr_core.h +11 -2
- data/ext/makiri/core/mkr_hash.h +1 -1
- data/ext/makiri/core/mkr_span.h +186 -0
- data/ext/makiri/core/mkr_text.h +8 -8
- data/ext/makiri/core/mkr_utf8.c +101 -0
- data/ext/makiri/core/mkr_utf8.h +88 -0
- data/ext/makiri/extconf.rb +123 -10
- data/ext/makiri/fuzz/Makefile +95 -0
- data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
- data/ext/makiri/fuzz/xml_fuzz.c +24 -0
- data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
- data/ext/makiri/glue/glue.h +55 -11
- data/ext/makiri/glue/ruby_doc.c +129 -59
- data/ext/makiri/glue/ruby_html_css.c +292 -0
- data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +248 -52
- data/ext/makiri/glue/ruby_html_node.c +859 -0
- data/ext/makiri/glue/ruby_html_serialize.c +154 -0
- data/ext/makiri/glue/ruby_node.c +74 -729
- data/ext/makiri/glue/ruby_node_set.c +167 -32
- data/ext/makiri/glue/ruby_xml.c +602 -0
- data/ext/makiri/glue/ruby_xml_node.c +1373 -0
- data/ext/makiri/glue/ruby_xpath.c +63 -30
- data/ext/makiri/glue/ruby_xpath.h +19 -0
- data/ext/makiri/lexbor_compat/compat.h +42 -9
- data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
- data/ext/makiri/lexbor_compat/dom_index.c +2 -2
- data/ext/makiri/lexbor_compat/post_parse.c +100 -10
- data/ext/makiri/lexbor_compat/source_loc.c +15 -13
- data/ext/makiri/lexbor_compat/text_index.c +14 -8
- data/ext/makiri/lexbor_compat/utf8_input.c +19 -33
- data/ext/makiri/makiri.c +184 -6
- data/ext/makiri/makiri.h +43 -2
- data/ext/makiri/xml/mkr_xml.h +125 -0
- data/ext/makiri/xml/mkr_xml_chars.c +195 -0
- data/ext/makiri/xml/mkr_xml_index.c +169 -0
- data/ext/makiri/xml/mkr_xml_index.h +48 -0
- data/ext/makiri/xml/mkr_xml_mutate.c +817 -0
- data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
- data/ext/makiri/xml/mkr_xml_node.c +399 -0
- data/ext/makiri/xml/mkr_xml_node.h +184 -0
- data/ext/makiri/xml/mkr_xml_tree.c +1515 -0
- data/ext/makiri/xpath/mkr_css.c +1023 -0
- data/ext/makiri/xpath/mkr_css.h +65 -0
- data/ext/makiri/xpath/mkr_xpath.c +96 -32
- data/ext/makiri/xpath/mkr_xpath.h +109 -4
- data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
- data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
- data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +551 -241
- data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +318 -276
- data/ext/makiri/xpath/mkr_xpath_internal.h +177 -206
- data/ext/makiri/xpath/mkr_xpath_lex.c +95 -125
- data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
- data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +145 -0
- data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
- data/ext/makiri/xpath/mkr_xpath_parse.c +83 -94
- data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
- data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
- data/ext/makiri/xpath/mkr_xpath_shared.c +609 -0
- data/ext/makiri/xpath/mkr_xpath_value_body.h +801 -0
- data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
- data/lib/makiri/{attribute.rb → attr.rb} +7 -3
- data/lib/makiri/cdata_section.rb +19 -0
- data/lib/makiri/comment.rb +10 -0
- data/lib/makiri/compat_aliases.rb +30 -0
- data/lib/makiri/document.rb +9 -73
- data/lib/makiri/document_fragment.rb +14 -9
- data/lib/makiri/element.rb +4 -4
- data/lib/makiri/html/document.rb +106 -0
- data/lib/makiri/html/node_methods.rb +19 -0
- data/lib/makiri/html.rb +12 -0
- data/lib/makiri/node.rb +58 -15
- data/lib/makiri/node_set.rb +8 -0
- data/lib/makiri/processing_instruction.rb +10 -0
- data/lib/makiri/text.rb +1 -1
- data/lib/makiri/version.rb +1 -1
- data/lib/makiri/xml/builder.rb +263 -0
- data/lib/makiri/xml/document.rb +24 -0
- data/lib/makiri/xml/node_methods.rb +84 -0
- data/lib/makiri/xml.rb +10 -0
- data/lib/makiri/xpath_context.rb +1 -1
- data/lib/makiri.rb +24 -5
- data/script/build_native_gem.rb +2 -2
- data/script/check_alloc_failures.rb +266 -0
- data/script/check_c_safety.rb +77 -2
- data/script/check_c_safety_allowlist.yml +102 -0
- data/script/check_leaks.rb +64 -0
- data/script/leaks_harness.rb +64 -0
- data/vendor/lexbor/CMakeLists.txt +6 -0
- data/vendor/lexbor/README.md +12 -0
- data/vendor/lexbor/config.cmake +1 -1
- data/vendor/lexbor/source/lexbor/core/base.h +1 -1
- data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
- data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
- data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
- data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
- data/vendor/lexbor/source/lexbor/html/base.h +1 -1
- data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
- data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
- data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
- data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
- data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
- data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
- data/vendor/lexbor/source/lexbor/url/base.h +1 -1
- data/vendor/lexbor/source/lexbor/url/url.c +5 -2
- data/vendor/lexbor/source/lexbor/url/url.h +9 -0
- data/vendor/lexbor/version +1 -1
- metadata +53 -9
- data/ext/makiri/glue/ruby_css.c +0 -185
- data/ext/makiri/glue/ruby_serialize.c +0 -92
- data/ext/makiri/xpath/mkr_xpath_value.c +0 -1286
- data/lib/makiri/cdata.rb +0 -6
|
@@ -1,1286 +0,0 @@
|
|
|
1
|
-
#include "mkr_xpath_internal.h"
|
|
2
|
-
#include "../core/mkr_core.h"
|
|
3
|
-
|
|
4
|
-
#include <lexbor/dom/dom.h>
|
|
5
|
-
#include <ctype.h>
|
|
6
|
-
#include <math.h>
|
|
7
|
-
#include <stdio.h>
|
|
8
|
-
#include <stdlib.h>
|
|
9
|
-
#include <string.h>
|
|
10
|
-
|
|
11
|
-
/*
|
|
12
|
-
* Runtime values: node-sets, type coercions, and node string-values.
|
|
13
|
-
* Also hosts the small AST destructor helpers.
|
|
14
|
-
*/
|
|
15
|
-
|
|
16
|
-
/* ---------- node-set ---------- */
|
|
17
|
-
|
|
18
|
-
void
|
|
19
|
-
mkr_nodeset_init(mkr_nodeset_t *ns)
|
|
20
|
-
{
|
|
21
|
-
ns->items = NULL;
|
|
22
|
-
ns->count = 0;
|
|
23
|
-
ns->capacity = 0;
|
|
24
|
-
}
|
|
25
|
-
|
|
26
|
-
int
|
|
27
|
-
mkr_nodeset_push(mkr_nodeset_t *ns, lxb_dom_node_t *node,
|
|
28
|
-
mkr_xpath_limits_t *limits, mkr_xpath_error_t *err)
|
|
29
|
-
{
|
|
30
|
-
if (node == NULL) return 0;
|
|
31
|
-
if (limits != NULL && mkr_limit_check_nodeset_size(limits, ns->count + 1, err) != 0) {
|
|
32
|
-
return -1;
|
|
33
|
-
}
|
|
34
|
-
if (mkr_grow_reserve((void **)&ns->items, &ns->capacity, ns->count + 1,
|
|
35
|
-
sizeof(*ns->items)) != MKR_OK) {
|
|
36
|
-
mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory growing node-set");
|
|
37
|
-
return -1;
|
|
38
|
-
}
|
|
39
|
-
ns->items[ns->count++] = node;
|
|
40
|
-
return 0;
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
void
|
|
44
|
-
mkr_nodeset_clear(mkr_nodeset_t *ns)
|
|
45
|
-
{
|
|
46
|
-
if (ns == NULL) return;
|
|
47
|
-
free(ns->items);
|
|
48
|
-
ns->items = NULL;
|
|
49
|
-
ns->count = 0;
|
|
50
|
-
ns->capacity = 0;
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
void
|
|
54
|
-
mkr_owned_text_init(mkr_owned_text_t *t)
|
|
55
|
-
{
|
|
56
|
-
if (t == NULL) return;
|
|
57
|
-
t->ptr = NULL;
|
|
58
|
-
t->len = 0;
|
|
59
|
-
}
|
|
60
|
-
|
|
61
|
-
void
|
|
62
|
-
mkr_owned_text_clear(mkr_owned_text_t *t)
|
|
63
|
-
{
|
|
64
|
-
if (t == NULL) return;
|
|
65
|
-
free(t->ptr);
|
|
66
|
-
t->ptr = NULL;
|
|
67
|
-
t->len = 0;
|
|
68
|
-
}
|
|
69
|
-
|
|
70
|
-
int
|
|
71
|
-
mkr_borrowed_text_eq(mkr_borrowed_text_t a, mkr_borrowed_text_t b)
|
|
72
|
-
{
|
|
73
|
-
if (a.ptr == NULL || b.ptr == NULL) return a.ptr == b.ptr;
|
|
74
|
-
return a.len == b.len && memcmp(a.ptr, b.ptr, a.len) == 0;
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
/* Copy an already-valid borrowed text into owned storage. Taking
|
|
78
|
-
* mkr_borrowed_text_t (not raw char*+len) keeps the type contract: an
|
|
79
|
-
* mkr_owned_text_t can only be minted from text the caller has asserted valid
|
|
80
|
-
* (via mkr_borrowed_text / mkr_borrowed_text_from_verified /
|
|
81
|
-
* mkr_borrowed_text_from_owned), so every raw-bytes -> text entry point is
|
|
82
|
-
* greppable. */
|
|
83
|
-
int
|
|
84
|
-
mkr_owned_text_from_borrowed_copy(mkr_owned_text_t *out, mkr_borrowed_text_t t,
|
|
85
|
-
mkr_xpath_error_t *err, const char *what)
|
|
86
|
-
{
|
|
87
|
-
if (out == NULL) {
|
|
88
|
-
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_owned_text_from_borrowed_copy: bad args");
|
|
89
|
-
return -1;
|
|
90
|
-
}
|
|
91
|
-
mkr_owned_text_init(out);
|
|
92
|
-
const char *s = t.ptr ? t.ptr : "";
|
|
93
|
-
size_t len = t.ptr ? t.len : 0;
|
|
94
|
-
char *p = mkr_strndup(s, len);
|
|
95
|
-
if (p == NULL) {
|
|
96
|
-
mkr_err_set(err, MKR_XPATH_ERR_OOM, what ? what : "out of memory copying text");
|
|
97
|
-
return -1;
|
|
98
|
-
}
|
|
99
|
-
out->ptr = p;
|
|
100
|
-
out->len = len;
|
|
101
|
-
return 0;
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
int
|
|
105
|
-
mkr_owned_text_from_buf_steal(mkr_owned_text_t *out, mkr_buf_t *buf,
|
|
106
|
-
mkr_xpath_error_t *err, const char *what)
|
|
107
|
-
{
|
|
108
|
-
if (out == NULL || buf == NULL) {
|
|
109
|
-
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_owned_text_from_buf_steal: bad args");
|
|
110
|
-
return -1;
|
|
111
|
-
}
|
|
112
|
-
mkr_owned_text_init(out);
|
|
113
|
-
size_t len = 0;
|
|
114
|
-
char *p = mkr_buf_steal(buf, &len);
|
|
115
|
-
if (p == NULL) {
|
|
116
|
-
mkr_err_set(err, MKR_XPATH_ERR_OOM, what ? what : "out of memory stealing text buffer");
|
|
117
|
-
return -1;
|
|
118
|
-
}
|
|
119
|
-
out->ptr = p;
|
|
120
|
-
out->len = len;
|
|
121
|
-
return 0;
|
|
122
|
-
}
|
|
123
|
-
|
|
124
|
-
void
|
|
125
|
-
mkr_val_set_owned_text(mkr_val_t *v, mkr_owned_text_t text)
|
|
126
|
-
{
|
|
127
|
-
if (v == NULL) return;
|
|
128
|
-
v->type = MKR_XPATH_TYPE_STRING;
|
|
129
|
-
v->u.string = text;
|
|
130
|
-
}
|
|
131
|
-
|
|
132
|
-
/* Set +v+ to a STRING by copying a borrowed view: the engine allocates and owns
|
|
133
|
-
* the copy. This is how callers outside the engine (the glue handler bridge)
|
|
134
|
-
* hand a string into a value — they pass what they have, a borrowed slice, and
|
|
135
|
-
* never construct an mkr_owned_text_t themselves. Keeping the copy-and-own step
|
|
136
|
-
* here keeps allocation and freeing of owned strings in one layer. Returns 0 on
|
|
137
|
-
* success, -1 on OOM (err populated; +v+ left untouched). */
|
|
138
|
-
int
|
|
139
|
-
mkr_val_set_borrowed_text_copy(mkr_val_t *v, mkr_borrowed_text_t text,
|
|
140
|
-
mkr_xpath_error_t *err, const char *what)
|
|
141
|
-
{
|
|
142
|
-
if (v == NULL) {
|
|
143
|
-
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_val_set_borrowed_text_copy: bad args");
|
|
144
|
-
return -1;
|
|
145
|
-
}
|
|
146
|
-
mkr_owned_text_t owned;
|
|
147
|
-
if (mkr_owned_text_from_borrowed_copy(&owned, text, err, what) != 0) {
|
|
148
|
-
return -1;
|
|
149
|
-
}
|
|
150
|
-
mkr_val_set_owned_text(v, owned);
|
|
151
|
-
return 0;
|
|
152
|
-
}
|
|
153
|
-
|
|
154
|
-
/* ---------- value ---------- */
|
|
155
|
-
|
|
156
|
-
void
|
|
157
|
-
mkr_val_clear(mkr_val_t *v)
|
|
158
|
-
{
|
|
159
|
-
if (v == NULL) return;
|
|
160
|
-
switch (v->type) {
|
|
161
|
-
case MKR_XPATH_TYPE_NODESET:
|
|
162
|
-
mkr_nodeset_clear(&v->u.nodeset);
|
|
163
|
-
break;
|
|
164
|
-
case MKR_XPATH_TYPE_STRING:
|
|
165
|
-
mkr_owned_text_clear(&v->u.string);
|
|
166
|
-
break;
|
|
167
|
-
default:
|
|
168
|
-
break;
|
|
169
|
-
}
|
|
170
|
-
memset(v, 0, sizeof(*v));
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
int
|
|
174
|
-
mkr_val_clone(const mkr_val_t *src, mkr_val_t *dst, mkr_xpath_error_t *err)
|
|
175
|
-
{
|
|
176
|
-
if (src == NULL || dst == NULL) {
|
|
177
|
-
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_val_clone: bad args");
|
|
178
|
-
return -1;
|
|
179
|
-
}
|
|
180
|
-
memset(dst, 0, sizeof(*dst));
|
|
181
|
-
dst->type = src->type;
|
|
182
|
-
switch (src->type) {
|
|
183
|
-
case MKR_XPATH_TYPE_STRING: {
|
|
184
|
-
mkr_owned_text_t text;
|
|
185
|
-
if (mkr_owned_text_from_borrowed_copy(&text, mkr_borrowed_text_from_owned(src->u.string),
|
|
186
|
-
err, "out of memory cloning string value") != 0) return -1;
|
|
187
|
-
mkr_val_set_owned_text(dst, text);
|
|
188
|
-
return 0;
|
|
189
|
-
}
|
|
190
|
-
case MKR_XPATH_TYPE_NUMBER:
|
|
191
|
-
dst->u.number = src->u.number;
|
|
192
|
-
return 0;
|
|
193
|
-
case MKR_XPATH_TYPE_BOOLEAN:
|
|
194
|
-
dst->u.boolean = src->u.boolean;
|
|
195
|
-
return 0;
|
|
196
|
-
case MKR_XPATH_TYPE_NODESET: {
|
|
197
|
-
size_t n = src->u.nodeset.count;
|
|
198
|
-
mkr_nodeset_init(&dst->u.nodeset);
|
|
199
|
-
if (n == 0) return 0;
|
|
200
|
-
lxb_dom_node_t **items;
|
|
201
|
-
size_t items_bytes;
|
|
202
|
-
if (!mkr_size_mul(n, sizeof(*items), &items_bytes)) {
|
|
203
|
-
mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory cloning node-set");
|
|
204
|
-
return -1;
|
|
205
|
-
}
|
|
206
|
-
items = mkr_reallocarray(NULL, n, sizeof(*items));
|
|
207
|
-
if (items == NULL) {
|
|
208
|
-
mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory cloning node-set");
|
|
209
|
-
return -1;
|
|
210
|
-
}
|
|
211
|
-
memcpy(items, src->u.nodeset.items, items_bytes);
|
|
212
|
-
dst->u.nodeset.items = items;
|
|
213
|
-
dst->u.nodeset.count = n;
|
|
214
|
-
dst->u.nodeset.capacity = n;
|
|
215
|
-
return 0;
|
|
216
|
-
}
|
|
217
|
-
}
|
|
218
|
-
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_val_clone: unknown value type");
|
|
219
|
-
return -1;
|
|
220
|
-
}
|
|
221
|
-
|
|
222
|
-
/* ---------- node string-value (XPath 1.0 §5) ---------- */
|
|
223
|
-
|
|
224
|
-
/* ---------- node string-value (XPath 1.0 §5) ----------
|
|
225
|
-
*
|
|
226
|
-
* Built into an mkr_buf_t whose `max` is the per-evaluate byte cap: append fails
|
|
227
|
-
* closed with MKR_ERR_LIMIT past the cap and MKR_ERR_OOM on allocation failure,
|
|
228
|
-
* so there is never a partial/truncated result. Lexbor-allocated text is freed
|
|
229
|
-
* after each append (otherwise we'd leak document-arena memory on every XPath
|
|
230
|
-
* that touches text content). */
|
|
231
|
-
|
|
232
|
-
/* Append `node`'s own text content. */
|
|
233
|
-
static mkr_status_t
|
|
234
|
-
append_text_content(lxb_dom_node_t *node, mkr_buf_t *buf)
|
|
235
|
-
{
|
|
236
|
-
size_t tlen = 0;
|
|
237
|
-
lxb_char_t *t = lxb_dom_node_text_content(node, &tlen);
|
|
238
|
-
if (t == NULL) return MKR_OK;
|
|
239
|
-
mkr_status_t st = mkr_buf_append(buf, t, tlen);
|
|
240
|
-
lxb_dom_document_destroy_text(node->owner_document, t);
|
|
241
|
-
return st;
|
|
242
|
-
}
|
|
243
|
-
|
|
244
|
-
/* Append the string-value of every TEXT descendant of `node`, in document
|
|
245
|
-
* order. Iterative (parent-pointer) pre-order walk rather than C recursion, so
|
|
246
|
-
* an adversarially deep tree cannot overflow the stack (fail-closed / no DoS);
|
|
247
|
-
* O(1) extra space. Descends only into elements, matching the original. */
|
|
248
|
-
static mkr_status_t
|
|
249
|
-
append_text_descendants(lxb_dom_node_t *node, mkr_buf_t *buf)
|
|
250
|
-
{
|
|
251
|
-
lxb_dom_node_t *cur = node->first_child;
|
|
252
|
-
while (cur != NULL) {
|
|
253
|
-
if (cur->type == LXB_DOM_NODE_TYPE_TEXT) {
|
|
254
|
-
mkr_status_t st = append_text_content(cur, buf);
|
|
255
|
-
if (st != MKR_OK) return st; /* LIMIT or OOM — caller fails closed */
|
|
256
|
-
}
|
|
257
|
-
if (cur->type == LXB_DOM_NODE_TYPE_ELEMENT && cur->first_child != NULL) {
|
|
258
|
-
cur = cur->first_child;
|
|
259
|
-
continue;
|
|
260
|
-
}
|
|
261
|
-
while (cur != node && cur->next == NULL) {
|
|
262
|
-
cur = cur->parent;
|
|
263
|
-
}
|
|
264
|
-
if (cur == node) return MKR_OK;
|
|
265
|
-
cur = cur->next;
|
|
266
|
-
}
|
|
267
|
-
return MKR_OK;
|
|
268
|
-
}
|
|
269
|
-
|
|
270
|
-
/* Build node's string-value into `buf` (cap carried by buf->max). */
|
|
271
|
-
static mkr_status_t
|
|
272
|
-
build_string_value(const lxb_dom_node_t *node, mkr_buf_t *buf)
|
|
273
|
-
{
|
|
274
|
-
if (node == NULL) return MKR_OK;
|
|
275
|
-
|
|
276
|
-
switch (node->type) {
|
|
277
|
-
case LXB_DOM_NODE_TYPE_ATTRIBUTE: {
|
|
278
|
-
lxb_dom_attr_t *attr = (lxb_dom_attr_t *)node;
|
|
279
|
-
size_t vlen = 0;
|
|
280
|
-
const lxb_char_t *v = lxb_dom_attr_value(attr, &vlen);
|
|
281
|
-
return mkr_buf_append(buf, v ? (const char *)v : "", vlen);
|
|
282
|
-
}
|
|
283
|
-
case LXB_DOM_NODE_TYPE_TEXT:
|
|
284
|
-
case LXB_DOM_NODE_TYPE_CDATA_SECTION:
|
|
285
|
-
case LXB_DOM_NODE_TYPE_COMMENT:
|
|
286
|
-
case LXB_DOM_NODE_TYPE_PROCESSING_INSTRUCTION:
|
|
287
|
-
return append_text_content((lxb_dom_node_t *)node, buf);
|
|
288
|
-
default:
|
|
289
|
-
return append_text_descendants((lxb_dom_node_t *)node, buf);
|
|
290
|
-
}
|
|
291
|
-
}
|
|
292
|
-
|
|
293
|
-
static void
|
|
294
|
-
mkr_build_node_text_unchecked(const lxb_dom_node_t *node, mkr_owned_text_t *out)
|
|
295
|
-
{
|
|
296
|
-
/* Uncapped, best-effort: callers (number/string coercion) require a non-NULL
|
|
297
|
-
* text, so on any failure fall back to an owned "" rather than NULL. */
|
|
298
|
-
mkr_owned_text_init(out);
|
|
299
|
-
mkr_buf_t buf;
|
|
300
|
-
mkr_buf_init(&buf, 0);
|
|
301
|
-
if (build_string_value(node, &buf) != MKR_OK) {
|
|
302
|
-
mkr_buf_free(&buf);
|
|
303
|
-
(void)mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit(""), NULL, NULL);
|
|
304
|
-
return;
|
|
305
|
-
}
|
|
306
|
-
if (mkr_owned_text_from_buf_steal(out, &buf, NULL, NULL) != 0) {
|
|
307
|
-
(void)mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit(""), NULL, NULL);
|
|
308
|
-
}
|
|
309
|
-
}
|
|
310
|
-
|
|
311
|
-
int
|
|
312
|
-
mkr_node_to_owned_text_or_fail(const lxb_dom_node_t *node,
|
|
313
|
-
mkr_xpath_limits_t *limits,
|
|
314
|
-
mkr_xpath_error_t *err,
|
|
315
|
-
mkr_owned_text_t *out)
|
|
316
|
-
{
|
|
317
|
-
if (out == NULL) {
|
|
318
|
-
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_node_to_owned_text_or_fail: bad args");
|
|
319
|
-
return -1;
|
|
320
|
-
}
|
|
321
|
-
mkr_owned_text_init(out);
|
|
322
|
-
mkr_buf_t buf;
|
|
323
|
-
mkr_buf_init(&buf, (limits != NULL) ? limits->max_string_bytes : 0);
|
|
324
|
-
mkr_status_t st = build_string_value(node, &buf);
|
|
325
|
-
if (st == MKR_ERR_LIMIT) {
|
|
326
|
-
mkr_buf_free(&buf);
|
|
327
|
-
mkr_err_setf(err, MKR_XPATH_ERR_LIMIT,
|
|
328
|
-
"string size limit exceeded (%zu bytes) while building node string-value",
|
|
329
|
-
limits->max_string_bytes);
|
|
330
|
-
return -1;
|
|
331
|
-
}
|
|
332
|
-
if (st != MKR_OK) {
|
|
333
|
-
mkr_buf_free(&buf);
|
|
334
|
-
mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory building node string-value");
|
|
335
|
-
return -1;
|
|
336
|
-
}
|
|
337
|
-
return mkr_owned_text_from_buf_steal(out, &buf, err, "out of memory building node string-value");
|
|
338
|
-
}
|
|
339
|
-
|
|
340
|
-
int
|
|
341
|
-
mkr_val_to_owned_text_or_fail(const mkr_val_t *v,
|
|
342
|
-
mkr_xpath_limits_t *limits,
|
|
343
|
-
mkr_xpath_error_t *err,
|
|
344
|
-
mkr_owned_text_t *out)
|
|
345
|
-
{
|
|
346
|
-
if (out == NULL) {
|
|
347
|
-
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_val_to_owned_text_or_fail: bad args");
|
|
348
|
-
return -1;
|
|
349
|
-
}
|
|
350
|
-
mkr_owned_text_init(out);
|
|
351
|
-
if (v == NULL) {
|
|
352
|
-
return mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit(""), err, "out of memory converting value to string");
|
|
353
|
-
}
|
|
354
|
-
switch (v->type) {
|
|
355
|
-
case MKR_XPATH_TYPE_STRING: {
|
|
356
|
-
mkr_borrowed_text_t text = mkr_borrowed_text_from_owned(v->u.string);
|
|
357
|
-
if (text.ptr == NULL) text.len = 0;
|
|
358
|
-
if (limits != NULL && mkr_limit_check_string_bytes(limits, text.len, err) != 0) return -1;
|
|
359
|
-
return mkr_owned_text_from_borrowed_copy(out, text,
|
|
360
|
-
err, "out of memory copying string value");
|
|
361
|
-
}
|
|
362
|
-
case MKR_XPATH_TYPE_BOOLEAN:
|
|
363
|
-
return v->u.boolean
|
|
364
|
-
? mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit("true"), err, "out of memory converting boolean to string")
|
|
365
|
-
: mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit("false"), err, "out of memory converting boolean to string");
|
|
366
|
-
case MKR_XPATH_TYPE_NUMBER: {
|
|
367
|
-
double d = v->u.number;
|
|
368
|
-
if (isnan(d)) {
|
|
369
|
-
return mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit("NaN"), err, "out of memory converting number to string");
|
|
370
|
-
}
|
|
371
|
-
if (isinf(d)) {
|
|
372
|
-
return d < 0
|
|
373
|
-
? mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit("-Infinity"), err, "out of memory converting number to string")
|
|
374
|
-
: mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit("Infinity"), err, "out of memory converting number to string");
|
|
375
|
-
}
|
|
376
|
-
if (d == 0.0) {
|
|
377
|
-
return mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit("0"), err, "out of memory converting number to string");
|
|
378
|
-
}
|
|
379
|
-
char buf[64];
|
|
380
|
-
int n;
|
|
381
|
-
if (d == floor(d) && fabs(d) < 1e15) {
|
|
382
|
-
n = snprintf(buf, sizeof(buf), "%lld", (long long)d);
|
|
383
|
-
} else {
|
|
384
|
-
n = snprintf(buf, sizeof(buf), "%.15g", d);
|
|
385
|
-
}
|
|
386
|
-
if (n < 0 || (size_t)n >= sizeof(buf)) {
|
|
387
|
-
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "number string conversion overflow");
|
|
388
|
-
return -1;
|
|
389
|
-
}
|
|
390
|
-
char *p = mkr_strndup(buf, (size_t)n);
|
|
391
|
-
if (p == NULL) { mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory converting number to string"); return -1; }
|
|
392
|
-
*out = mkr_owned_text(p, (size_t)n);
|
|
393
|
-
return 0;
|
|
394
|
-
}
|
|
395
|
-
case MKR_XPATH_TYPE_NODESET:
|
|
396
|
-
if (v->u.nodeset.count == 0) {
|
|
397
|
-
return mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit(""), err, "out of memory");
|
|
398
|
-
}
|
|
399
|
-
/* XPath 1.0 §4.2: string(node-set) = string-value of first node in doc order. */
|
|
400
|
-
return mkr_node_to_owned_text_or_fail(v->u.nodeset.items[0], limits, err, out);
|
|
401
|
-
}
|
|
402
|
-
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "unknown value type");
|
|
403
|
-
return -1;
|
|
404
|
-
}
|
|
405
|
-
|
|
406
|
-
int
|
|
407
|
-
mkr_val_to_number_or_fail(const mkr_val_t *v,
|
|
408
|
-
mkr_xpath_limits_t *limits,
|
|
409
|
-
mkr_xpath_error_t *err,
|
|
410
|
-
double *out)
|
|
411
|
-
{
|
|
412
|
-
if (v == NULL || out == NULL) {
|
|
413
|
-
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_val_to_number_or_fail: bad args");
|
|
414
|
-
return -1;
|
|
415
|
-
}
|
|
416
|
-
if (v->type == MKR_XPATH_TYPE_NODESET) {
|
|
417
|
-
if (v->u.nodeset.count == 0) {
|
|
418
|
-
*out = (double)NAN;
|
|
419
|
-
return 0;
|
|
420
|
-
}
|
|
421
|
-
mkr_owned_text_t text;
|
|
422
|
-
if (mkr_node_to_owned_text_or_fail(v->u.nodeset.items[0], limits, err, &text) != 0) return -1;
|
|
423
|
-
*out = mkr_borrowed_text_to_number(mkr_borrowed_text_from_owned(text));
|
|
424
|
-
mkr_owned_text_clear(&text);
|
|
425
|
-
return 0;
|
|
426
|
-
}
|
|
427
|
-
*out = mkr_val_to_number_unchecked(v);
|
|
428
|
-
return 0;
|
|
429
|
-
}
|
|
430
|
-
|
|
431
|
-
/* ---------- coercions ---------- */
|
|
432
|
-
|
|
433
|
-
double
|
|
434
|
-
mkr_borrowed_text_to_number(mkr_borrowed_text_t t)
|
|
435
|
-
{
|
|
436
|
-
if (t.ptr == NULL) return (double)NAN;
|
|
437
|
-
const char *s = t.ptr;
|
|
438
|
-
while (*s && isspace((unsigned char)*s)) s++;
|
|
439
|
-
if (*s == '\0') return (double)NAN;
|
|
440
|
-
char *end = NULL;
|
|
441
|
-
double d = strtod(s, &end);
|
|
442
|
-
if (end == s) return (double)NAN;
|
|
443
|
-
while (*end && isspace((unsigned char)*end)) end++;
|
|
444
|
-
if (*end != '\0') return (double)NAN;
|
|
445
|
-
return d;
|
|
446
|
-
}
|
|
447
|
-
|
|
448
|
-
double
|
|
449
|
-
mkr_val_to_number_unchecked(const mkr_val_t *v)
|
|
450
|
-
{
|
|
451
|
-
switch (v->type) {
|
|
452
|
-
case MKR_XPATH_TYPE_NUMBER:
|
|
453
|
-
return v->u.number;
|
|
454
|
-
case MKR_XPATH_TYPE_BOOLEAN:
|
|
455
|
-
return v->u.boolean ? 1.0 : 0.0;
|
|
456
|
-
case MKR_XPATH_TYPE_STRING:
|
|
457
|
-
return mkr_borrowed_text_to_number(mkr_borrowed_text_from_owned(v->u.string));
|
|
458
|
-
case MKR_XPATH_TYPE_NODESET: {
|
|
459
|
-
if (v->u.nodeset.count == 0) return (double)NAN;
|
|
460
|
-
/* string-value of first node in document order */
|
|
461
|
-
mkr_owned_text_t text;
|
|
462
|
-
mkr_build_node_text_unchecked(v->u.nodeset.items[0], &text);
|
|
463
|
-
double d = mkr_borrowed_text_to_number(mkr_borrowed_text_from_owned(text));
|
|
464
|
-
mkr_owned_text_clear(&text);
|
|
465
|
-
return d;
|
|
466
|
-
}
|
|
467
|
-
}
|
|
468
|
-
return (double)NAN;
|
|
469
|
-
}
|
|
470
|
-
|
|
471
|
-
int
|
|
472
|
-
mkr_val_to_boolean(const mkr_val_t *v)
|
|
473
|
-
{
|
|
474
|
-
switch (v->type) {
|
|
475
|
-
case MKR_XPATH_TYPE_BOOLEAN:
|
|
476
|
-
return v->u.boolean;
|
|
477
|
-
case MKR_XPATH_TYPE_NUMBER:
|
|
478
|
-
return !(v->u.number == 0.0 || isnan(v->u.number));
|
|
479
|
-
case MKR_XPATH_TYPE_STRING:
|
|
480
|
-
return v->u.string.ptr != NULL && v->u.string.ptr[0] != '\0';
|
|
481
|
-
case MKR_XPATH_TYPE_NODESET:
|
|
482
|
-
return v->u.nodeset.count > 0;
|
|
483
|
-
}
|
|
484
|
-
return 0;
|
|
485
|
-
}
|
|
486
|
-
|
|
487
|
-
/* ---------- document order ---------- */
|
|
488
|
-
|
|
489
|
-
/*
|
|
490
|
-
* Treat an attribute node as positioned "with" its owner element for
|
|
491
|
-
* cross-subtree comparisons; only when both belong to the same element
|
|
492
|
-
* does the attribute-vs-attribute or attribute-vs-descendant rule kick in.
|
|
493
|
-
*/
|
|
494
|
-
static const lxb_dom_node_t *
|
|
495
|
-
anchor_for_cmp(const lxb_dom_node_t *n)
|
|
496
|
-
{
|
|
497
|
-
if (n->type == LXB_DOM_NODE_TYPE_ATTRIBUTE) {
|
|
498
|
-
return n->parent ? n->parent : n;
|
|
499
|
-
}
|
|
500
|
-
return n;
|
|
501
|
-
}
|
|
502
|
-
|
|
503
|
-
static int
|
|
504
|
-
depth_of(const lxb_dom_node_t *n)
|
|
505
|
-
{
|
|
506
|
-
int d = 0;
|
|
507
|
-
while (n->parent) { d++; n = n->parent; }
|
|
508
|
-
return d;
|
|
509
|
-
}
|
|
510
|
-
|
|
511
|
-
static int
|
|
512
|
-
doc_order_cmp(const lxb_dom_node_t *a, const lxb_dom_node_t *b)
|
|
513
|
-
{
|
|
514
|
-
if (a == b) return 0;
|
|
515
|
-
const lxb_dom_node_t *aa = anchor_for_cmp(a);
|
|
516
|
-
const lxb_dom_node_t *bb = anchor_for_cmp(b);
|
|
517
|
-
|
|
518
|
-
/* If the anchors are the same element, decide by node type. A non-attribute
|
|
519
|
-
* node that anchors to the same element E can ONLY be E itself: any other
|
|
520
|
-
* node (a child/descendant) anchors to itself, not to E, so it would not
|
|
521
|
-
* reach this branch (the attribute-vs-descendant case is handled below by
|
|
522
|
-
* the depth-normalisation walk). Per XPath 1.0 §5.1 document order is
|
|
523
|
-
* "element, then its attribute nodes, then its children", so an attribute
|
|
524
|
-
* comes AFTER its own owner element. */
|
|
525
|
-
if (aa == bb) {
|
|
526
|
-
int a_attr = (a->type == LXB_DOM_NODE_TYPE_ATTRIBUTE);
|
|
527
|
-
int b_attr = (b->type == LXB_DOM_NODE_TYPE_ATTRIBUTE);
|
|
528
|
-
if (a_attr && !b_attr) return 1; /* b is the owner element E; a (its attr) follows */
|
|
529
|
-
if (b_attr && !a_attr) return -1; /* a is the owner element E; b (its attr) follows */
|
|
530
|
-
/* Both attributes of the same element: relative order is
|
|
531
|
-
* implementation-defined. Use insertion order via attr linked list. */
|
|
532
|
-
if (a_attr && b_attr) {
|
|
533
|
-
for (const lxb_dom_attr_t *at = ((const lxb_dom_element_t *)aa)->first_attr;
|
|
534
|
-
at != NULL; at = at->next) {
|
|
535
|
-
if ((const lxb_dom_node_t *)at == a) return -1;
|
|
536
|
-
if ((const lxb_dom_node_t *)at == b) return 1;
|
|
537
|
-
}
|
|
538
|
-
return 0;
|
|
539
|
-
}
|
|
540
|
-
/* aa == bb but neither is an attribute means a == b, handled above. */
|
|
541
|
-
return 0;
|
|
542
|
-
}
|
|
543
|
-
|
|
544
|
-
int da = depth_of(aa), db = depth_of(bb);
|
|
545
|
-
while (da > db) { aa = aa->parent; da--; }
|
|
546
|
-
while (db > da) { bb = bb->parent; db--; }
|
|
547
|
-
if (aa == bb) {
|
|
548
|
-
/* One is ancestor of the other; ancestor comes first. */
|
|
549
|
-
return (aa == anchor_for_cmp(a)) ? -1 : 1;
|
|
550
|
-
}
|
|
551
|
-
while (aa->parent != bb->parent) {
|
|
552
|
-
aa = aa->parent;
|
|
553
|
-
bb = bb->parent;
|
|
554
|
-
}
|
|
555
|
-
/* Resolve sibling order. Scan outward from aa and bb in lockstep (via ->next)
|
|
556
|
-
* rather than forward from parent->first_child: the cost is then O(distance
|
|
557
|
-
* between aa and bb), not O(distance from the first child. The latter is
|
|
558
|
-
* quadratic when sorting nodes that sit deep in a wide, flat parent (e.g. a
|
|
559
|
-
* predicate result picking scattered <li> from a 2000-child <ul>), which the
|
|
560
|
-
* doc-order index would only avoid once a single sort reaches its build
|
|
561
|
-
* threshold. */
|
|
562
|
-
if (aa->parent == NULL) {
|
|
563
|
-
/* Different documents/roots — undefined; keep stable. */
|
|
564
|
-
return 0;
|
|
565
|
-
}
|
|
566
|
-
const lxb_dom_node_t *fa = aa, *fb = bb;
|
|
567
|
-
for (;;) {
|
|
568
|
-
fa = fa ? fa->next : NULL;
|
|
569
|
-
fb = fb ? fb->next : NULL;
|
|
570
|
-
if (fa == bb) return -1; /* bb lies after aa -> aa first */
|
|
571
|
-
if (fb == aa) return 1; /* aa lies after bb -> bb first */
|
|
572
|
-
if (fa == NULL && fb == NULL) return 0; /* unreachable for same-parent nodes */
|
|
573
|
-
}
|
|
574
|
-
}
|
|
575
|
-
|
|
576
|
-
/* ---------- per-evaluate document-order index ---------- */
|
|
577
|
-
|
|
578
|
-
/* Hash a pointer value to 32 bits with two xor-shift/multiply rounds and a
 * final xor-shift; cheap and adequate for pointer keys. */
static uint32_t
pointer_hash(const void *p)
{
    uintptr_t bits = (uintptr_t)p;

    bits ^= bits >> 16;
    bits *= 0x9E3779B9u;

    bits ^= bits >> 13;
    bits *= 0x85EBCA6Bu;

    bits ^= bits >> 16;
    return (uint32_t)bits;
}
|
|
587
|
-
|
|
588
|
-
void
|
|
589
|
-
mkr_doc_order_index_init(mkr_doc_order_index_t *idx)
|
|
590
|
-
{
|
|
591
|
-
idx->buckets = NULL;
|
|
592
|
-
idx->cap = 0;
|
|
593
|
-
idx->count = 0;
|
|
594
|
-
idx->built = 0;
|
|
595
|
-
}
|
|
596
|
-
|
|
597
|
-
void
|
|
598
|
-
mkr_doc_order_index_clear(mkr_doc_order_index_t *idx)
|
|
599
|
-
{
|
|
600
|
-
if (idx == NULL) return;
|
|
601
|
-
free(idx->buckets);
|
|
602
|
-
idx->buckets = NULL;
|
|
603
|
-
idx->cap = 0;
|
|
604
|
-
idx->count = 0;
|
|
605
|
-
idx->built = 0;
|
|
606
|
-
}
|
|
607
|
-
|
|
608
|
-
/* Insert (node, ord) into the open-addressing table. Grows when load
|
|
609
|
-
* factor exceeds 3/4. Returns 0 on success, -1 on OOM. */
|
|
610
|
-
static int
|
|
611
|
-
order_index_insert(mkr_doc_order_index_t *idx, const lxb_dom_node_t *node, size_t ord)
|
|
612
|
-
{
|
|
613
|
-
if (idx->cap == 0 || idx->count * 4 >= idx->cap * 3) {
|
|
614
|
-
size_t new_cap = 256;
|
|
615
|
-
if (idx->cap != 0 && !mkr_size_mul(idx->cap, 2, &new_cap)) {
|
|
616
|
-
return -1; /* overflow */
|
|
617
|
-
}
|
|
618
|
-
void *new_buckets = mkr_callocarray(new_cap, sizeof(*idx->buckets));
|
|
619
|
-
if (new_buckets == NULL) return -1;
|
|
620
|
-
/* Rehash. */
|
|
621
|
-
typeof(idx->buckets) old_buckets = idx->buckets;
|
|
622
|
-
size_t old_cap = idx->cap;
|
|
623
|
-
idx->buckets = new_buckets;
|
|
624
|
-
idx->cap = new_cap;
|
|
625
|
-
idx->count = 0;
|
|
626
|
-
for (size_t i = 0; i < old_cap; ++i) {
|
|
627
|
-
if (old_buckets[i].node != NULL) {
|
|
628
|
-
size_t mask = new_cap - 1;
|
|
629
|
-
size_t j = pointer_hash(old_buckets[i].node) & mask;
|
|
630
|
-
while (idx->buckets[j].node != NULL) j = (j + 1) & mask;
|
|
631
|
-
idx->buckets[j].node = old_buckets[i].node;
|
|
632
|
-
idx->buckets[j].ord = old_buckets[i].ord;
|
|
633
|
-
idx->count++;
|
|
634
|
-
}
|
|
635
|
-
}
|
|
636
|
-
free(old_buckets);
|
|
637
|
-
}
|
|
638
|
-
size_t mask = idx->cap - 1;
|
|
639
|
-
size_t j = pointer_hash(node) & mask;
|
|
640
|
-
while (idx->buckets[j].node != NULL) {
|
|
641
|
-
if (idx->buckets[j].node == node) return 0; /* already present */
|
|
642
|
-
j = (j + 1) & mask;
|
|
643
|
-
}
|
|
644
|
-
idx->buckets[j].node = node;
|
|
645
|
-
idx->buckets[j].ord = ord;
|
|
646
|
-
idx->count++;
|
|
647
|
-
return 0;
|
|
648
|
-
}
|
|
649
|
-
|
|
650
|
-
static int
|
|
651
|
-
order_index_lookup(const mkr_doc_order_index_t *idx, const lxb_dom_node_t *node,
|
|
652
|
-
size_t *out_ord)
|
|
653
|
-
{
|
|
654
|
-
if (idx->cap == 0) return -1;
|
|
655
|
-
size_t mask = idx->cap - 1;
|
|
656
|
-
size_t j = pointer_hash(node) & mask;
|
|
657
|
-
while (idx->buckets[j].node != NULL) {
|
|
658
|
-
if (idx->buckets[j].node == node) {
|
|
659
|
-
if (out_ord) *out_ord = idx->buckets[j].ord;
|
|
660
|
-
return 0;
|
|
661
|
-
}
|
|
662
|
-
j = (j + 1) & mask;
|
|
663
|
-
}
|
|
664
|
-
return -1;
|
|
665
|
-
}
|
|
666
|
-
|
|
667
|
-
/* DFS pre-order: assign ordinal to the element, then its attributes
|
|
668
|
-
* (in linked-list order, before children), then descendants. This
|
|
669
|
-
* matches doc_order_cmp's attribute placement.
|
|
670
|
-
*
|
|
671
|
-
* Iterative (parent-pointer) walk rather than C recursion, so an adversarially
|
|
672
|
-
* deep tree cannot overflow the stack (fail-closed / no DoS); O(1) extra space.
|
|
673
|
-
* The traversal stays within the subtree rooted at `root` (it never follows
|
|
674
|
-
* root->next). */
|
|
675
|
-
static int
|
|
676
|
-
order_index_walk(mkr_doc_order_index_t *idx, lxb_dom_node_t *root, size_t *next_ord)
|
|
677
|
-
{
|
|
678
|
-
lxb_dom_node_t *cur = root;
|
|
679
|
-
while (cur != NULL) {
|
|
680
|
-
/* Visit (pre-order): the node, then its attributes before any child. */
|
|
681
|
-
if (order_index_insert(idx, cur, (*next_ord)++) != 0) return -1;
|
|
682
|
-
if (cur->type == LXB_DOM_NODE_TYPE_ELEMENT) {
|
|
683
|
-
lxb_dom_element_t *el = (lxb_dom_element_t *)cur;
|
|
684
|
-
for (lxb_dom_attr_t *a = el->first_attr; a != NULL; a = a->next) {
|
|
685
|
-
if (order_index_insert(idx, (lxb_dom_node_t *)a, (*next_ord)++) != 0) return -1;
|
|
686
|
-
}
|
|
687
|
-
}
|
|
688
|
-
if (cur->first_child != NULL) {
|
|
689
|
-
cur = cur->first_child;
|
|
690
|
-
continue;
|
|
691
|
-
}
|
|
692
|
-
while (cur != root && cur->next == NULL) {
|
|
693
|
-
cur = cur->parent;
|
|
694
|
-
}
|
|
695
|
-
if (cur == root) break;
|
|
696
|
-
cur = cur->next;
|
|
697
|
-
}
|
|
698
|
-
return 0;
|
|
699
|
-
}
|
|
700
|
-
|
|
701
|
-
static int
|
|
702
|
-
order_index_build(mkr_doc_order_index_t *idx, lxb_dom_node_t *root,
|
|
703
|
-
mkr_xpath_error_t *err)
|
|
704
|
-
{
|
|
705
|
-
if (idx->built) return 0;
|
|
706
|
-
if (root == NULL) return -1;
|
|
707
|
-
size_t next_ord = 0;
|
|
708
|
-
if (order_index_walk(idx, root, &next_ord) != 0) {
|
|
709
|
-
mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory building document order index");
|
|
710
|
-
mkr_doc_order_index_clear(idx);
|
|
711
|
-
return -1;
|
|
712
|
-
}
|
|
713
|
-
idx->built = 1;
|
|
714
|
-
return 0;
|
|
715
|
-
}
|
|
716
|
-
|
|
717
|
-
/* Indexed comparator. Falls back to doc_order_cmp on any miss
|
|
718
|
-
* (e.g., synthesised nodes or cross-document compares). */
|
|
719
|
-
static int
|
|
720
|
-
doc_order_cmp_ctx(mkr_xpath_context_t *ctx, const lxb_dom_node_t *a, const lxb_dom_node_t *b)
|
|
721
|
-
{
|
|
722
|
-
if (a == b) return 0;
|
|
723
|
-
if (ctx == NULL) return doc_order_cmp(a, b);
|
|
724
|
-
mkr_doc_order_index_t *idx = mkr_ctx_order_index(ctx);
|
|
725
|
-
if (idx == NULL || !idx->built) return doc_order_cmp(a, b);
|
|
726
|
-
size_t oa, ob;
|
|
727
|
-
if (order_index_lookup(idx, a, &oa) != 0) return doc_order_cmp(a, b);
|
|
728
|
-
if (order_index_lookup(idx, b, &ob) != 0) return doc_order_cmp(a, b);
|
|
729
|
-
/* Safe comparison — compare, don't subtract (unsigned difference wraps). */
|
|
730
|
-
if (oa < ob) return -1;
|
|
731
|
-
if (oa > ob) return 1;
|
|
732
|
-
return 0;
|
|
733
|
-
}
|
|
734
|
-
|
|
735
|
-
/* Bottom-up merge sort. Threading ctx through avoids the qsort_r /
|
|
736
|
-
* thread-local hack and keeps everything reentrant. Stable as a
|
|
737
|
-
* bonus: ties (same ord — only possible for synthesised nodes that
|
|
738
|
-
* weren't in the index) preserve insertion order. */
|
|
739
|
-
static void
|
|
740
|
-
ms_merge(lxb_dom_node_t **arr, lxb_dom_node_t **tmp,
|
|
741
|
-
size_t lo, size_t mid, size_t hi, mkr_xpath_context_t *ctx)
|
|
742
|
-
{
|
|
743
|
-
size_t i = lo, j = mid, k = lo;
|
|
744
|
-
while (i < mid && j < hi) {
|
|
745
|
-
if (doc_order_cmp_ctx(ctx, arr[i], arr[j]) <= 0) tmp[k++] = arr[i++];
|
|
746
|
-
else tmp[k++] = arr[j++];
|
|
747
|
-
}
|
|
748
|
-
while (i < mid) tmp[k++] = arr[i++];
|
|
749
|
-
while (j < hi) tmp[k++] = arr[j++];
|
|
750
|
-
for (size_t x = lo; x < hi; ++x) arr[x] = tmp[x];
|
|
751
|
-
}
|
|
752
|
-
|
|
753
|
-
static void
|
|
754
|
-
ms_sort(lxb_dom_node_t **arr, lxb_dom_node_t **tmp,
|
|
755
|
-
size_t lo, size_t hi, mkr_xpath_context_t *ctx)
|
|
756
|
-
{
|
|
757
|
-
if (hi - lo < 2) return;
|
|
758
|
-
size_t mid = lo + (hi - lo) / 2;
|
|
759
|
-
ms_sort(arr, tmp, lo, mid, ctx);
|
|
760
|
-
ms_sort(arr, tmp, mid, hi, ctx);
|
|
761
|
-
ms_merge(arr, tmp, lo, mid, hi, ctx);
|
|
762
|
-
}
|
|
763
|
-
|
|
764
|
-
/* qsort fallback used only when tmp-buffer allocation fails. */
|
|
765
|
-
static int
|
|
766
|
-
doc_order_qsort_cb_fallback(const void *pa, const void *pb)
|
|
767
|
-
{
|
|
768
|
-
const lxb_dom_node_t *a = *(const lxb_dom_node_t * const *)pa;
|
|
769
|
-
const lxb_dom_node_t *b = *(const lxb_dom_node_t * const *)pb;
|
|
770
|
-
return doc_order_cmp(a, b);
|
|
771
|
-
}
|
|
772
|
-
|
|
773
|
-
/* Threshold for building the doc-order index. Below this we expect
|
|
774
|
-
* N log N parent-chain compares to be cheaper than the O(D) full-doc
|
|
775
|
-
* walk that the index requires (D = total nodes in document, which is
|
|
776
|
-
* typically 6000+ on real pages). Empirically the crossover sits
|
|
777
|
-
* somewhere between N=100 and N=300 on coffee.html; we pick a safe
|
|
778
|
-
* point that keeps small unions and reverse-axis dedups off the slow
|
|
779
|
-
* build path. Once the index IS built (e.g., by a larger sort earlier
|
|
780
|
-
* in the same evaluate), subsequent small sorts naturally reuse it. */
|
|
781
|
-
#define MKR_INDEX_BUILD_MIN 200
|
|
782
|
-
|
|
783
|
-
void
|
|
784
|
-
mkr_nodeset_sort_doc_order(mkr_xpath_context_t *ctx, mkr_nodeset_t *ns)
|
|
785
|
-
{
|
|
786
|
-
if (ns == NULL || ns->count < 2) return;
|
|
787
|
-
|
|
788
|
-
/* Lazy build of the doc-order index. Only worth doing when the sort
|
|
789
|
-
* itself is large enough to amortise the full-doc walk; smaller
|
|
790
|
-
* sorts fall through to parent-chain compares via doc_order_cmp_ctx
|
|
791
|
-
* (which sees an unbuilt index and dispatches accordingly). */
|
|
792
|
-
mkr_doc_order_index_t *idx = mkr_ctx_order_index(ctx);
|
|
793
|
-
if (idx != NULL && !idx->built && ns->count >= MKR_INDEX_BUILD_MIN) {
|
|
794
|
-
lxb_dom_node_t *root = (lxb_dom_node_t *)mkr_ctx_document(ctx);
|
|
795
|
-
mkr_xpath_error_t ierr = {0};
|
|
796
|
-
(void)order_index_build(idx, root, &ierr);
|
|
797
|
-
mkr_xpath_error_clear(&ierr); /* index is best-effort; on OOM we fall through to parent-chain cmp */
|
|
798
|
-
}
|
|
799
|
-
|
|
800
|
-
lxb_dom_node_t **tmp = mkr_reallocarray(NULL, ns->count, sizeof(*tmp));
|
|
801
|
-
if (tmp == NULL) {
|
|
802
|
-
/* Fall back to in-place qsort with parent-chain compare (slow but
|
|
803
|
-
* correct). Should be a very rare path. */
|
|
804
|
-
qsort(ns->items, ns->count, sizeof(ns->items[0]), doc_order_qsort_cb_fallback);
|
|
805
|
-
return;
|
|
806
|
-
}
|
|
807
|
-
ms_sort(ns->items, tmp, 0, ns->count, ctx);
|
|
808
|
-
free(tmp);
|
|
809
|
-
}
|
|
810
|
-
|
|
811
|
-
void
|
|
812
|
-
mkr_nodeset_unique_sorted(mkr_xpath_context_t *ctx, mkr_nodeset_t *ns)
|
|
813
|
-
{
|
|
814
|
-
if (ns == NULL || ns->count < 2) return;
|
|
815
|
-
mkr_nodeset_sort_doc_order(ctx, ns);
|
|
816
|
-
size_t w = 1;
|
|
817
|
-
for (size_t r = 1; r < ns->count; ++r) {
|
|
818
|
-
if (ns->items[r] != ns->items[r - 1]) {
|
|
819
|
-
ns->items[w++] = ns->items[r];
|
|
820
|
-
}
|
|
821
|
-
}
|
|
822
|
-
ns->count = w;
|
|
823
|
-
}
|
|
824
|
-
|
|
825
|
-
/* ---------- per-evaluation string-value cache ---------- */
|
|
826
|
-
|
|
827
|
-
void
|
|
828
|
-
mkr_str_cache_init(mkr_str_cache_t *c)
|
|
829
|
-
{
|
|
830
|
-
c->entries = NULL;
|
|
831
|
-
c->count = 0;
|
|
832
|
-
c->cap = 0;
|
|
833
|
-
c->buckets = NULL;
|
|
834
|
-
c->bucket_cap = 0;
|
|
835
|
-
}
|
|
836
|
-
|
|
837
|
-
/* Insert entry index `idx` (keyed by entries[idx].node) into the index. The
|
|
838
|
-
* index must have room (callers grow/rehash first). */
|
|
839
|
-
static void
|
|
840
|
-
mkr_str_cache_index_put(mkr_str_cache_t *c, size_t idx)
|
|
841
|
-
{
|
|
842
|
-
size_t mask = c->bucket_cap - 1;
|
|
843
|
-
size_t j = pointer_hash(c->entries[idx].node) & mask;
|
|
844
|
-
while (c->buckets[j] != 0) {
|
|
845
|
-
j = (j + 1) & mask;
|
|
846
|
-
}
|
|
847
|
-
c->buckets[j] = idx + 1;
|
|
848
|
-
}
|
|
849
|
-
|
|
850
|
-
/* Rebuild the index from entries[0, count). Returns -1 on OOM. */
|
|
851
|
-
static int
|
|
852
|
-
mkr_str_cache_reindex(mkr_str_cache_t *c, size_t bucket_cap)
|
|
853
|
-
{
|
|
854
|
-
size_t *buckets = mkr_callocarray(bucket_cap, sizeof(*buckets));
|
|
855
|
-
if (buckets == NULL) return -1;
|
|
856
|
-
free(c->buckets);
|
|
857
|
-
c->buckets = buckets;
|
|
858
|
-
c->bucket_cap = bucket_cap;
|
|
859
|
-
for (size_t i = 0; i < c->count; ++i) {
|
|
860
|
-
mkr_str_cache_index_put(c, i);
|
|
861
|
-
}
|
|
862
|
-
return 0;
|
|
863
|
-
}
|
|
864
|
-
|
|
865
|
-
void
|
|
866
|
-
mkr_str_cache_truncate(mkr_str_cache_t *c, size_t target_count)
|
|
867
|
-
{
|
|
868
|
-
if (c == NULL || target_count >= c->count) return;
|
|
869
|
-
for (size_t i = target_count; i < c->count; ++i) {
|
|
870
|
-
free(c->entries[i].str);
|
|
871
|
-
}
|
|
872
|
-
c->count = target_count;
|
|
873
|
-
/* Drop the removed nodes from the index. A full truncate just clears it;
|
|
874
|
-
* a partial one (nested-eval snapshot restore) rebuilds from what remains. */
|
|
875
|
-
if (c->buckets != NULL) {
|
|
876
|
-
if (target_count == 0) {
|
|
877
|
-
size_t buckets_bytes;
|
|
878
|
-
if (!mkr_size_mul(c->bucket_cap, sizeof(*c->buckets), &buckets_bytes)) {
|
|
879
|
-
free(c->buckets);
|
|
880
|
-
c->buckets = NULL;
|
|
881
|
-
c->bucket_cap = 0;
|
|
882
|
-
return;
|
|
883
|
-
}
|
|
884
|
-
memset(c->buckets, 0, buckets_bytes);
|
|
885
|
-
} else {
|
|
886
|
-
mkr_str_cache_reindex(c, c->bucket_cap);
|
|
887
|
-
}
|
|
888
|
-
}
|
|
889
|
-
}
|
|
890
|
-
|
|
891
|
-
void
|
|
892
|
-
mkr_str_cache_clear(mkr_str_cache_t *c)
|
|
893
|
-
{
|
|
894
|
-
if (c == NULL) return;
|
|
895
|
-
for (size_t i = 0; i < c->count; ++i) {
|
|
896
|
-
free(c->entries[i].str);
|
|
897
|
-
}
|
|
898
|
-
free(c->entries);
|
|
899
|
-
free(c->buckets);
|
|
900
|
-
c->entries = NULL;
|
|
901
|
-
c->count = 0;
|
|
902
|
-
c->cap = 0;
|
|
903
|
-
c->buckets = NULL;
|
|
904
|
-
c->bucket_cap = 0;
|
|
905
|
-
}
|
|
906
|
-
|
|
907
|
-
int
|
|
908
|
-
mkr_get_cached_node_text(mkr_xpath_context_t *ctx,
|
|
909
|
-
lxb_dom_node_t *node,
|
|
910
|
-
mkr_borrowed_text_t *out,
|
|
911
|
-
mkr_xpath_error_t *err)
|
|
912
|
-
{
|
|
913
|
-
if (out == NULL) {
|
|
914
|
-
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_get_cached_node_text: bad args");
|
|
915
|
-
return -1;
|
|
916
|
-
}
|
|
917
|
-
*out = mkr_borrowed_text(NULL, 0);
|
|
918
|
-
/* Contract: ctx is non-NULL when called from the evaluator (the only
|
|
919
|
-
* intended caller). A NULL ctx is a programming error; surface it. */
|
|
920
|
-
mkr_str_cache_t *c = mkr_ctx_str_cache(ctx);
|
|
921
|
-
if (c == NULL) {
|
|
922
|
-
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL,
|
|
923
|
-
"mkr_get_cached_node_text called without a context");
|
|
924
|
-
return -1;
|
|
925
|
-
}
|
|
926
|
-
|
|
927
|
-
/* O(1) lookup via the pointer-keyed index. */
|
|
928
|
-
if (c->bucket_cap != 0) {
|
|
929
|
-
size_t mask = c->bucket_cap - 1;
|
|
930
|
-
size_t j = pointer_hash(node) & mask;
|
|
931
|
-
while (c->buckets[j] != 0) {
|
|
932
|
-
mkr_str_cache_entry_t *e = &c->entries[c->buckets[j] - 1];
|
|
933
|
-
if (e->node == node) {
|
|
934
|
-
*out = mkr_borrowed_text(e->str, e->len);
|
|
935
|
-
return 0;
|
|
936
|
-
}
|
|
937
|
-
j = (j + 1) & mask;
|
|
938
|
-
}
|
|
939
|
-
}
|
|
940
|
-
|
|
941
|
-
mkr_owned_text_t text;
|
|
942
|
-
if (mkr_node_to_owned_text_or_fail(node, mkr_ctx_limits(ctx), err, &text) != 0) return -1;
|
|
943
|
-
|
|
944
|
-
if (mkr_grow_reserve((void **)&c->entries, &c->cap, c->count + 1,
|
|
945
|
-
sizeof(*c->entries)) != MKR_OK) {
|
|
946
|
-
mkr_owned_text_clear(&text);
|
|
947
|
-
mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory in node string cache");
|
|
948
|
-
return -1;
|
|
949
|
-
}
|
|
950
|
-
c->entries[c->count].node = node;
|
|
951
|
-
c->entries[c->count].str = text.ptr;
|
|
952
|
-
c->entries[c->count].len = text.len;
|
|
953
|
-
|
|
954
|
-
/* Grow / build the index, keeping load factor <= 1/2. */
|
|
955
|
-
if (c->bucket_cap == 0 || (c->count + 1) * 2 > c->bucket_cap) {
|
|
956
|
-
size_t new_bucket_cap = 64;
|
|
957
|
-
if (c->bucket_cap != 0 && !mkr_size_mul(c->bucket_cap, 2, &new_bucket_cap)) {
|
|
958
|
-
mkr_owned_text_clear(&text);
|
|
959
|
-
c->entries[c->count].node = NULL;
|
|
960
|
-
c->entries[c->count].str = NULL;
|
|
961
|
-
c->entries[c->count].len = 0;
|
|
962
|
-
mkr_err_set(err, MKR_XPATH_ERR_OOM, "node string cache index overflow");
|
|
963
|
-
return -1;
|
|
964
|
-
}
|
|
965
|
-
if (mkr_str_cache_reindex(c, new_bucket_cap) != 0) {
|
|
966
|
-
mkr_owned_text_clear(&text);
|
|
967
|
-
c->entries[c->count].node = NULL;
|
|
968
|
-
c->entries[c->count].str = NULL;
|
|
969
|
-
c->entries[c->count].len = 0;
|
|
970
|
-
mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory indexing node string cache");
|
|
971
|
-
return -1;
|
|
972
|
-
}
|
|
973
|
-
}
|
|
974
|
-
mkr_str_cache_index_put(c, c->count);
|
|
975
|
-
c->count++;
|
|
976
|
-
|
|
977
|
-
*out = mkr_borrowed_text_from_owned(text);
|
|
978
|
-
return 0;
|
|
979
|
-
}
|
|
980
|
-
|
|
981
|
-
/* ---------- AST destructors ---------- */
|
|
982
|
-
|
|
983
|
-
void
|
|
984
|
-
mkr_step_clear(mkr_step_t *s)
|
|
985
|
-
{
|
|
986
|
-
if (s == NULL) return;
|
|
987
|
-
mkr_owned_text_clear(&s->test.prefix);
|
|
988
|
-
mkr_owned_text_clear(&s->test.local);
|
|
989
|
-
mkr_owned_text_clear(&s->test.pi_target);
|
|
990
|
-
for (size_t i = 0; i < s->npredicates; ++i) {
|
|
991
|
-
mkr_node_free(s->predicates[i]);
|
|
992
|
-
}
|
|
993
|
-
free(s->predicates);
|
|
994
|
-
memset(s, 0, sizeof(*s));
|
|
995
|
-
}
|
|
996
|
-
|
|
997
|
-
/* ---------- AST hoisting helpers ---------- */
|
|
998
|
-
|
|
999
|
-
/* Decide whether an unprefixed built-in call may be hoisted when all of its
 * arguments are context-independent (the caller checks the arguments).
 *
 * The list is explicit and deliberately conservative. Functions that read
 * the context node (last/position, the 0-arg forms of string /
 * normalize-space / local-name etc., lang) or that may depend on dynamic
 * state (id, handler-routed functions) are intentionally absent.
 *
 * With zero arguments only "true" and "false" qualify. With one or more
 * arguments the name must appear in the table below. Returns 1 when
 * hoistable, 0 otherwise (including a NULL name). */
static int
is_pure_builtin_name(const char *name, size_t nargs)
{
    static const char *const hoistable[] = {
        "count", "string-length", "number", "boolean", "not",
        "floor", "ceiling", "round", "sum",
        "concat", "starts-with", "contains",
        "substring-before", "substring-after", "substring",
        "translate",
    };

    if (name == NULL) {
        return 0;
    }
    if (nargs == 0) {
        /* These two take no input at all. */
        return !strcmp(name, "true") || !strcmp(name, "false");
    }

    const size_t total = sizeof(hoistable) / sizeof(hoistable[0]);
    for (size_t i = 0; i < total; ++i) {
        if (strcmp(hoistable[i], name) == 0) {
            return 1;
        }
    }
    return 0;
}
|
|
1027
|
-
|
|
1028
|
-
static void
|
|
1029
|
-
mark_step_predicates(mkr_step_t *s)
|
|
1030
|
-
{
|
|
1031
|
-
for (size_t i = 0; i < s->npredicates; ++i) {
|
|
1032
|
-
mkr_mark_context_independent(s->predicates[i]);
|
|
1033
|
-
}
|
|
1034
|
-
}
|
|
1035
|
-
|
|
1036
|
-
void
|
|
1037
|
-
mkr_mark_context_independent(mkr_node_t *n)
|
|
1038
|
-
{
|
|
1039
|
-
if (n == NULL) return;
|
|
1040
|
-
int ci = 0;
|
|
1041
|
-
switch (n->kind) {
|
|
1042
|
-
case MKR_NK_LITERAL_STR:
|
|
1043
|
-
case MKR_NK_LITERAL_NUM:
|
|
1044
|
-
ci = 1;
|
|
1045
|
-
break;
|
|
1046
|
-
case MKR_NK_VARREF:
|
|
1047
|
-
/* Conservative: variables not hoisted even though XPath 1.0 says
|
|
1048
|
-
* they're fixed per evaluation. */
|
|
1049
|
-
ci = 0;
|
|
1050
|
-
break;
|
|
1051
|
-
case MKR_NK_FNCALL: {
|
|
1052
|
-
/* Recurse first so subtrees get their own CI marks even when this
|
|
1053
|
-
* call itself is not hoistable. */
|
|
1054
|
-
for (size_t i = 0; i < n->u.fncall.nargs; ++i) {
|
|
1055
|
-
mkr_mark_context_independent(n->u.fncall.args[i]);
|
|
1056
|
-
}
|
|
1057
|
-
if (n->u.fncall.prefix.ptr != NULL) {
|
|
1058
|
-
ci = 0; /* Handler-routed or namespaced builtins → non-CI. */
|
|
1059
|
-
break;
|
|
1060
|
-
}
|
|
1061
|
-
if (!is_pure_builtin_name(n->u.fncall.name.ptr, n->u.fncall.nargs)) {
|
|
1062
|
-
ci = 0;
|
|
1063
|
-
break;
|
|
1064
|
-
}
|
|
1065
|
-
ci = 1;
|
|
1066
|
-
for (size_t i = 0; i < n->u.fncall.nargs; ++i) {
|
|
1067
|
-
if (!n->u.fncall.args[i]->is_context_independent) { ci = 0; break; }
|
|
1068
|
-
}
|
|
1069
|
-
break;
|
|
1070
|
-
}
|
|
1071
|
-
case MKR_NK_UNARY:
|
|
1072
|
-
mkr_mark_context_independent(n->u.unary.expr);
|
|
1073
|
-
ci = n->u.unary.expr ? n->u.unary.expr->is_context_independent : 0;
|
|
1074
|
-
break;
|
|
1075
|
-
case MKR_NK_BINOP:
|
|
1076
|
-
mkr_mark_context_independent(n->u.binop.lhs);
|
|
1077
|
-
mkr_mark_context_independent(n->u.binop.rhs);
|
|
1078
|
-
ci = (n->u.binop.lhs && n->u.binop.lhs->is_context_independent)
|
|
1079
|
-
&& (n->u.binop.rhs && n->u.binop.rhs->is_context_independent);
|
|
1080
|
-
break;
|
|
1081
|
-
case MKR_NK_PATH:
|
|
1082
|
-
/* Absolute path is CI: seed is the document root regardless of
|
|
1083
|
-
* outer context. Relative paths use the outer context node and
|
|
1084
|
-
* are not hoistable. Predicates inside the path are evaluated
|
|
1085
|
-
* against the path's own context, so their position()/last() do
|
|
1086
|
-
* not leak — recurse so any pure sub-expressions still get marks. */
|
|
1087
|
-
ci = n->u.path.absolute ? 1 : 0;
|
|
1088
|
-
for (size_t i = 0; i < n->u.path.nsteps; ++i) {
|
|
1089
|
-
mark_step_predicates(&n->u.path.steps[i]);
|
|
1090
|
-
}
|
|
1091
|
-
break;
|
|
1092
|
-
case MKR_NK_FILTER:
|
|
1093
|
-
/* Conservative: filter expressions are not hoisted in v1. */
|
|
1094
|
-
ci = 0;
|
|
1095
|
-
mkr_mark_context_independent(n->u.filter.expr);
|
|
1096
|
-
for (size_t i = 0; i < n->u.filter.npreds; ++i) {
|
|
1097
|
-
mkr_mark_context_independent(n->u.filter.preds[i]);
|
|
1098
|
-
}
|
|
1099
|
-
for (size_t i = 0; i < n->u.filter.npath; ++i) {
|
|
1100
|
-
mark_step_predicates(&n->u.filter.path_steps[i]);
|
|
1101
|
-
}
|
|
1102
|
-
break;
|
|
1103
|
-
}
|
|
1104
|
-
n->is_context_independent = (uint8_t)ci;
|
|
1105
|
-
}
|
|
1106
|
-
|
|
1107
|
-
static void
|
|
1108
|
-
clear_memos_step(mkr_step_t *s)
|
|
1109
|
-
{
|
|
1110
|
-
for (size_t i = 0; i < s->npredicates; ++i) {
|
|
1111
|
-
mkr_node_clear_memos(s->predicates[i]);
|
|
1112
|
-
}
|
|
1113
|
-
}
|
|
1114
|
-
|
|
1115
|
-
/* ---------- peephole: //X fusion ---------- */
|
|
1116
|
-
|
|
1117
|
-
/*
|
|
1118
|
-
* Collapse pairs of consecutive steps:
|
|
1119
|
-
* (axis=descendant-or-self, test=node(), no predicates)
|
|
1120
|
-
* (axis=child, test=*, no predicates)
|
|
1121
|
-
* into a single
|
|
1122
|
-
* (axis=descendant, test=*, no predicates)
|
|
1123
|
-
*
|
|
1124
|
-
* The fusion is safe per XPath 1.0 only when the child step has no
|
|
1125
|
-
* predicates: otherwise '//X[1]' would change meaning ("first X per
|
|
1126
|
-
* parent" vs "first X in doc order"). The synthesised // step always
|
|
1127
|
-
* has no predicates by construction, so we don't need to check the
|
|
1128
|
-
* first step's predicate list — only the child step's.
|
|
1129
|
-
*/
|
|
1130
|
-
static void
|
|
1131
|
-
fuse_descendant_or_self_steps(mkr_step_t *steps, size_t *nsteps_ptr)
|
|
1132
|
-
{
|
|
1133
|
-
if (steps == NULL || *nsteps_ptr < 2) return;
|
|
1134
|
-
size_t nsteps = *nsteps_ptr;
|
|
1135
|
-
size_t w = 0, r = 0;
|
|
1136
|
-
while (r < nsteps) {
|
|
1137
|
-
if (r + 1 < nsteps
|
|
1138
|
-
&& steps[r].axis == MKR_AXIS_DESCENDANT_OR_SELF
|
|
1139
|
-
&& steps[r].test.kind == MKR_NT_NODE
|
|
1140
|
-
&& steps[r].test.prefix.ptr == NULL
|
|
1141
|
-
&& steps[r].npredicates == 0
|
|
1142
|
-
&& steps[r + 1].axis == MKR_AXIS_CHILD
|
|
1143
|
-
&& steps[r + 1].npredicates == 0) {
|
|
1144
|
-
/* Drop the desc-or-self step and promote the child step. */
|
|
1145
|
-
mkr_step_clear(&steps[r]);
|
|
1146
|
-
steps[w] = steps[r + 1];
|
|
1147
|
-
memset(&steps[r + 1], 0, sizeof(steps[r + 1]));
|
|
1148
|
-
steps[w].axis = MKR_AXIS_DESCENDANT;
|
|
1149
|
-
w++;
|
|
1150
|
-
r += 2;
|
|
1151
|
-
} else {
|
|
1152
|
-
if (w != r) {
|
|
1153
|
-
steps[w] = steps[r];
|
|
1154
|
-
memset(&steps[r], 0, sizeof(steps[r]));
|
|
1155
|
-
}
|
|
1156
|
-
w++;
|
|
1157
|
-
r++;
|
|
1158
|
-
}
|
|
1159
|
-
}
|
|
1160
|
-
*nsteps_ptr = w;
|
|
1161
|
-
}
|
|
1162
|
-
|
|
1163
|
-
void
|
|
1164
|
-
mkr_apply_peephole(mkr_node_t *n)
|
|
1165
|
-
{
|
|
1166
|
-
if (n == NULL) return;
|
|
1167
|
-
switch (n->kind) {
|
|
1168
|
-
case MKR_NK_FNCALL:
|
|
1169
|
-
for (size_t i = 0; i < n->u.fncall.nargs; ++i) mkr_apply_peephole(n->u.fncall.args[i]);
|
|
1170
|
-
break;
|
|
1171
|
-
case MKR_NK_UNARY:
|
|
1172
|
-
mkr_apply_peephole(n->u.unary.expr);
|
|
1173
|
-
break;
|
|
1174
|
-
case MKR_NK_BINOP:
|
|
1175
|
-
mkr_apply_peephole(n->u.binop.lhs);
|
|
1176
|
-
mkr_apply_peephole(n->u.binop.rhs);
|
|
1177
|
-
break;
|
|
1178
|
-
case MKR_NK_PATH:
|
|
1179
|
-
fuse_descendant_or_self_steps(n->u.path.steps, &n->u.path.nsteps);
|
|
1180
|
-
for (size_t i = 0; i < n->u.path.nsteps; ++i) {
|
|
1181
|
-
for (size_t j = 0; j < n->u.path.steps[i].npredicates; ++j) {
|
|
1182
|
-
mkr_apply_peephole(n->u.path.steps[i].predicates[j]);
|
|
1183
|
-
}
|
|
1184
|
-
}
|
|
1185
|
-
break;
|
|
1186
|
-
case MKR_NK_FILTER:
|
|
1187
|
-
mkr_apply_peephole(n->u.filter.expr);
|
|
1188
|
-
for (size_t i = 0; i < n->u.filter.npreds; ++i) mkr_apply_peephole(n->u.filter.preds[i]);
|
|
1189
|
-
fuse_descendant_or_self_steps(n->u.filter.path_steps, &n->u.filter.npath);
|
|
1190
|
-
for (size_t i = 0; i < n->u.filter.npath; ++i) {
|
|
1191
|
-
for (size_t j = 0; j < n->u.filter.path_steps[i].npredicates; ++j) {
|
|
1192
|
-
mkr_apply_peephole(n->u.filter.path_steps[i].predicates[j]);
|
|
1193
|
-
}
|
|
1194
|
-
}
|
|
1195
|
-
break;
|
|
1196
|
-
default:
|
|
1197
|
-
break;
|
|
1198
|
-
}
|
|
1199
|
-
}
|
|
1200
|
-
|
|
1201
|
-
void
|
|
1202
|
-
mkr_node_clear_memos(mkr_node_t *n)
|
|
1203
|
-
{
|
|
1204
|
-
if (n == NULL) return;
|
|
1205
|
-
if (n->memoized) {
|
|
1206
|
-
mkr_val_clear(&n->memo_value);
|
|
1207
|
-
n->memoized = 0;
|
|
1208
|
-
}
|
|
1209
|
-
switch (n->kind) {
|
|
1210
|
-
case MKR_NK_FNCALL:
|
|
1211
|
-
for (size_t i = 0; i < n->u.fncall.nargs; ++i) mkr_node_clear_memos(n->u.fncall.args[i]);
|
|
1212
|
-
break;
|
|
1213
|
-
case MKR_NK_UNARY:
|
|
1214
|
-
mkr_node_clear_memos(n->u.unary.expr);
|
|
1215
|
-
break;
|
|
1216
|
-
case MKR_NK_BINOP:
|
|
1217
|
-
mkr_node_clear_memos(n->u.binop.lhs);
|
|
1218
|
-
mkr_node_clear_memos(n->u.binop.rhs);
|
|
1219
|
-
break;
|
|
1220
|
-
case MKR_NK_PATH:
|
|
1221
|
-
for (size_t i = 0; i < n->u.path.nsteps; ++i) clear_memos_step(&n->u.path.steps[i]);
|
|
1222
|
-
break;
|
|
1223
|
-
case MKR_NK_FILTER:
|
|
1224
|
-
mkr_node_clear_memos(n->u.filter.expr);
|
|
1225
|
-
for (size_t i = 0; i < n->u.filter.npreds; ++i) mkr_node_clear_memos(n->u.filter.preds[i]);
|
|
1226
|
-
for (size_t i = 0; i < n->u.filter.npath; ++i) clear_memos_step(&n->u.filter.path_steps[i]);
|
|
1227
|
-
break;
|
|
1228
|
-
default:
|
|
1229
|
-
break;
|
|
1230
|
-
}
|
|
1231
|
-
}
|
|
1232
|
-
|
|
1233
|
-
void
|
|
1234
|
-
mkr_node_free(mkr_node_t *n)
|
|
1235
|
-
{
|
|
1236
|
-
if (n == NULL) return;
|
|
1237
|
-
/* Free any memoized value first (idempotent). */
|
|
1238
|
-
if (n->memoized) {
|
|
1239
|
-
mkr_val_clear(&n->memo_value);
|
|
1240
|
-
n->memoized = 0;
|
|
1241
|
-
}
|
|
1242
|
-
switch (n->kind) {
|
|
1243
|
-
case MKR_NK_LITERAL_STR:
|
|
1244
|
-
mkr_owned_text_clear(&n->u.literal);
|
|
1245
|
-
break;
|
|
1246
|
-
case MKR_NK_LITERAL_NUM:
|
|
1247
|
-
break;
|
|
1248
|
-
case MKR_NK_VARREF:
|
|
1249
|
-
mkr_owned_text_clear(&n->u.varref.prefix);
|
|
1250
|
-
mkr_owned_text_clear(&n->u.varref.name);
|
|
1251
|
-
break;
|
|
1252
|
-
case MKR_NK_FNCALL:
|
|
1253
|
-
mkr_owned_text_clear(&n->u.fncall.prefix);
|
|
1254
|
-
mkr_owned_text_clear(&n->u.fncall.name);
|
|
1255
|
-
for (size_t i = 0; i < n->u.fncall.nargs; ++i) {
|
|
1256
|
-
mkr_node_free(n->u.fncall.args[i]);
|
|
1257
|
-
}
|
|
1258
|
-
free(n->u.fncall.args);
|
|
1259
|
-
break;
|
|
1260
|
-
case MKR_NK_UNARY:
|
|
1261
|
-
mkr_node_free(n->u.unary.expr);
|
|
1262
|
-
break;
|
|
1263
|
-
case MKR_NK_BINOP:
|
|
1264
|
-
mkr_node_free(n->u.binop.lhs);
|
|
1265
|
-
mkr_node_free(n->u.binop.rhs);
|
|
1266
|
-
break;
|
|
1267
|
-
case MKR_NK_PATH:
|
|
1268
|
-
for (size_t i = 0; i < n->u.path.nsteps; ++i) {
|
|
1269
|
-
mkr_step_clear(&n->u.path.steps[i]);
|
|
1270
|
-
}
|
|
1271
|
-
free(n->u.path.steps);
|
|
1272
|
-
break;
|
|
1273
|
-
case MKR_NK_FILTER:
|
|
1274
|
-
mkr_node_free(n->u.filter.expr);
|
|
1275
|
-
for (size_t i = 0; i < n->u.filter.npreds; ++i) {
|
|
1276
|
-
mkr_node_free(n->u.filter.preds[i]);
|
|
1277
|
-
}
|
|
1278
|
-
free(n->u.filter.preds);
|
|
1279
|
-
for (size_t i = 0; i < n->u.filter.npath; ++i) {
|
|
1280
|
-
mkr_step_clear(&n->u.filter.path_steps[i]);
|
|
1281
|
-
}
|
|
1282
|
-
free(n->u.filter.path_steps);
|
|
1283
|
-
break;
|
|
1284
|
-
}
|
|
1285
|
-
free(n);
|
|
1286
|
-
}
|