makiri 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/conformance.yml +22 -0
  3. data/.github/workflows/libfuzzer.yml +83 -0
  4. data/.github/workflows/release.yml +12 -7
  5. data/.github/workflows/security.yml +88 -3
  6. data/.github/workflows/valgrind.yml +135 -0
  7. data/CHANGELOG.md +152 -15
  8. data/README.md +183 -13
  9. data/Rakefile +294 -7
  10. data/ext/makiri/bridge/bridge.h +28 -0
  11. data/ext/makiri/bridge/ruby_string.c +282 -12
  12. data/ext/makiri/core/mkr_alloc.c +40 -3
  13. data/ext/makiri/core/mkr_alloc.h +28 -5
  14. data/ext/makiri/core/mkr_buf.c +47 -3
  15. data/ext/makiri/core/mkr_buf.h +112 -3
  16. data/ext/makiri/core/mkr_core.c +143 -0
  17. data/ext/makiri/core/mkr_core.h +11 -2
  18. data/ext/makiri/core/mkr_hash.h +1 -1
  19. data/ext/makiri/core/mkr_span.h +186 -0
  20. data/ext/makiri/core/mkr_text.h +8 -8
  21. data/ext/makiri/core/mkr_utf8.c +101 -0
  22. data/ext/makiri/core/mkr_utf8.h +88 -0
  23. data/ext/makiri/extconf.rb +123 -10
  24. data/ext/makiri/fuzz/Makefile +95 -0
  25. data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
  26. data/ext/makiri/fuzz/xml_fuzz.c +24 -0
  27. data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
  28. data/ext/makiri/glue/glue.h +55 -11
  29. data/ext/makiri/glue/ruby_doc.c +129 -59
  30. data/ext/makiri/glue/ruby_html_css.c +292 -0
  31. data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +248 -52
  32. data/ext/makiri/glue/ruby_html_node.c +859 -0
  33. data/ext/makiri/glue/ruby_html_serialize.c +154 -0
  34. data/ext/makiri/glue/ruby_node.c +74 -729
  35. data/ext/makiri/glue/ruby_node_set.c +167 -32
  36. data/ext/makiri/glue/ruby_xml.c +602 -0
  37. data/ext/makiri/glue/ruby_xml_node.c +1373 -0
  38. data/ext/makiri/glue/ruby_xpath.c +63 -30
  39. data/ext/makiri/glue/ruby_xpath.h +19 -0
  40. data/ext/makiri/lexbor_compat/compat.h +42 -9
  41. data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
  42. data/ext/makiri/lexbor_compat/dom_index.c +2 -2
  43. data/ext/makiri/lexbor_compat/post_parse.c +100 -10
  44. data/ext/makiri/lexbor_compat/source_loc.c +15 -13
  45. data/ext/makiri/lexbor_compat/text_index.c +14 -8
  46. data/ext/makiri/lexbor_compat/utf8_input.c +19 -33
  47. data/ext/makiri/makiri.c +184 -6
  48. data/ext/makiri/makiri.h +43 -2
  49. data/ext/makiri/xml/mkr_xml.h +125 -0
  50. data/ext/makiri/xml/mkr_xml_chars.c +195 -0
  51. data/ext/makiri/xml/mkr_xml_index.c +169 -0
  52. data/ext/makiri/xml/mkr_xml_index.h +48 -0
  53. data/ext/makiri/xml/mkr_xml_mutate.c +817 -0
  54. data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
  55. data/ext/makiri/xml/mkr_xml_node.c +399 -0
  56. data/ext/makiri/xml/mkr_xml_node.h +184 -0
  57. data/ext/makiri/xml/mkr_xml_tree.c +1515 -0
  58. data/ext/makiri/xpath/mkr_css.c +1023 -0
  59. data/ext/makiri/xpath/mkr_css.h +65 -0
  60. data/ext/makiri/xpath/mkr_xpath.c +96 -32
  61. data/ext/makiri/xpath/mkr_xpath.h +109 -4
  62. data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
  63. data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
  64. data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +551 -241
  65. data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +318 -276
  66. data/ext/makiri/xpath/mkr_xpath_internal.h +177 -206
  67. data/ext/makiri/xpath/mkr_xpath_lex.c +95 -125
  68. data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
  69. data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +145 -0
  70. data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
  71. data/ext/makiri/xpath/mkr_xpath_parse.c +83 -94
  72. data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
  73. data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
  74. data/ext/makiri/xpath/mkr_xpath_shared.c +609 -0
  75. data/ext/makiri/xpath/mkr_xpath_value_body.h +801 -0
  76. data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
  77. data/lib/makiri/{attribute.rb → attr.rb} +7 -3
  78. data/lib/makiri/cdata_section.rb +19 -0
  79. data/lib/makiri/comment.rb +10 -0
  80. data/lib/makiri/compat_aliases.rb +30 -0
  81. data/lib/makiri/document.rb +9 -73
  82. data/lib/makiri/document_fragment.rb +14 -9
  83. data/lib/makiri/element.rb +4 -4
  84. data/lib/makiri/html/document.rb +106 -0
  85. data/lib/makiri/html/node_methods.rb +19 -0
  86. data/lib/makiri/html.rb +12 -0
  87. data/lib/makiri/node.rb +58 -15
  88. data/lib/makiri/node_set.rb +8 -0
  89. data/lib/makiri/processing_instruction.rb +10 -0
  90. data/lib/makiri/text.rb +1 -1
  91. data/lib/makiri/version.rb +1 -1
  92. data/lib/makiri/xml/builder.rb +263 -0
  93. data/lib/makiri/xml/document.rb +24 -0
  94. data/lib/makiri/xml/node_methods.rb +84 -0
  95. data/lib/makiri/xml.rb +10 -0
  96. data/lib/makiri/xpath_context.rb +1 -1
  97. data/lib/makiri.rb +24 -5
  98. data/script/build_native_gem.rb +2 -2
  99. data/script/check_alloc_failures.rb +266 -0
  100. data/script/check_c_safety.rb +77 -2
  101. data/script/check_c_safety_allowlist.yml +102 -0
  102. data/script/check_leaks.rb +64 -0
  103. data/script/leaks_harness.rb +64 -0
  104. data/vendor/lexbor/CMakeLists.txt +6 -0
  105. data/vendor/lexbor/README.md +12 -0
  106. data/vendor/lexbor/config.cmake +1 -1
  107. data/vendor/lexbor/source/lexbor/core/base.h +1 -1
  108. data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
  109. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
  110. data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
  111. data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
  112. data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
  113. data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
  114. data/vendor/lexbor/source/lexbor/html/base.h +1 -1
  115. data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
  116. data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
  117. data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
  118. data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
  119. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
  120. data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
  121. data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
  122. data/vendor/lexbor/source/lexbor/url/base.h +1 -1
  123. data/vendor/lexbor/source/lexbor/url/url.c +5 -2
  124. data/vendor/lexbor/source/lexbor/url/url.h +9 -0
  125. data/vendor/lexbor/version +1 -1
  126. metadata +53 -9
  127. data/ext/makiri/glue/ruby_css.c +0 -185
  128. data/ext/makiri/glue/ruby_serialize.c +0 -92
  129. data/ext/makiri/xpath/mkr_xpath_value.c +0 -1286
  130. data/lib/makiri/cdata.rb +0 -6
@@ -1,1286 +0,0 @@
1
- #include "mkr_xpath_internal.h"
2
- #include "../core/mkr_core.h"
3
-
4
- #include <lexbor/dom/dom.h>
5
- #include <ctype.h>
6
- #include <math.h>
7
- #include <stdio.h>
8
- #include <stdlib.h>
9
- #include <string.h>
10
-
11
- /*
12
- * Runtime values: node-sets, type coercions, and node string-values.
13
- * Also hosts the small AST destructor helpers.
14
- */
15
-
16
- /* ---------- node-set ---------- */
17
-
18
- void
19
- mkr_nodeset_init(mkr_nodeset_t *ns)
20
- {
21
- ns->items = NULL;
22
- ns->count = 0;
23
- ns->capacity = 0;
24
- }
25
-
26
- int
27
- mkr_nodeset_push(mkr_nodeset_t *ns, lxb_dom_node_t *node,
28
- mkr_xpath_limits_t *limits, mkr_xpath_error_t *err)
29
- {
30
- if (node == NULL) return 0;
31
- if (limits != NULL && mkr_limit_check_nodeset_size(limits, ns->count + 1, err) != 0) {
32
- return -1;
33
- }
34
- if (mkr_grow_reserve((void **)&ns->items, &ns->capacity, ns->count + 1,
35
- sizeof(*ns->items)) != MKR_OK) {
36
- mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory growing node-set");
37
- return -1;
38
- }
39
- ns->items[ns->count++] = node;
40
- return 0;
41
- }
42
-
43
- void
44
- mkr_nodeset_clear(mkr_nodeset_t *ns)
45
- {
46
- if (ns == NULL) return;
47
- free(ns->items);
48
- ns->items = NULL;
49
- ns->count = 0;
50
- ns->capacity = 0;
51
- }
52
-
53
- void
54
- mkr_owned_text_init(mkr_owned_text_t *t)
55
- {
56
- if (t == NULL) return;
57
- t->ptr = NULL;
58
- t->len = 0;
59
- }
60
-
61
- void
62
- mkr_owned_text_clear(mkr_owned_text_t *t)
63
- {
64
- if (t == NULL) return;
65
- free(t->ptr);
66
- t->ptr = NULL;
67
- t->len = 0;
68
- }
69
-
70
- int
71
- mkr_borrowed_text_eq(mkr_borrowed_text_t a, mkr_borrowed_text_t b)
72
- {
73
- if (a.ptr == NULL || b.ptr == NULL) return a.ptr == b.ptr;
74
- return a.len == b.len && memcmp(a.ptr, b.ptr, a.len) == 0;
75
- }
76
-
77
- /* Copy an already-valid borrowed text into owned storage. Taking
78
- * mkr_borrowed_text_t (not raw char*+len) keeps the type contract: an
79
- * mkr_owned_text_t can only be minted from text the caller has asserted valid
80
- * (via mkr_borrowed_text / mkr_borrowed_text_from_verified /
81
- * mkr_borrowed_text_from_owned), so every raw-bytes -> text entry point is
82
- * greppable. */
83
- int
84
- mkr_owned_text_from_borrowed_copy(mkr_owned_text_t *out, mkr_borrowed_text_t t,
85
- mkr_xpath_error_t *err, const char *what)
86
- {
87
- if (out == NULL) {
88
- mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_owned_text_from_borrowed_copy: bad args");
89
- return -1;
90
- }
91
- mkr_owned_text_init(out);
92
- const char *s = t.ptr ? t.ptr : "";
93
- size_t len = t.ptr ? t.len : 0;
94
- char *p = mkr_strndup(s, len);
95
- if (p == NULL) {
96
- mkr_err_set(err, MKR_XPATH_ERR_OOM, what ? what : "out of memory copying text");
97
- return -1;
98
- }
99
- out->ptr = p;
100
- out->len = len;
101
- return 0;
102
- }
103
-
104
- int
105
- mkr_owned_text_from_buf_steal(mkr_owned_text_t *out, mkr_buf_t *buf,
106
- mkr_xpath_error_t *err, const char *what)
107
- {
108
- if (out == NULL || buf == NULL) {
109
- mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_owned_text_from_buf_steal: bad args");
110
- return -1;
111
- }
112
- mkr_owned_text_init(out);
113
- size_t len = 0;
114
- char *p = mkr_buf_steal(buf, &len);
115
- if (p == NULL) {
116
- mkr_err_set(err, MKR_XPATH_ERR_OOM, what ? what : "out of memory stealing text buffer");
117
- return -1;
118
- }
119
- out->ptr = p;
120
- out->len = len;
121
- return 0;
122
- }
123
-
124
- void
125
- mkr_val_set_owned_text(mkr_val_t *v, mkr_owned_text_t text)
126
- {
127
- if (v == NULL) return;
128
- v->type = MKR_XPATH_TYPE_STRING;
129
- v->u.string = text;
130
- }
131
-
132
- /* Set +v+ to a STRING by copying a borrowed view: the engine allocates and owns
133
- * the copy. This is how callers outside the engine (the glue handler bridge)
134
- * hand a string into a value — they pass what they have, a borrowed slice, and
135
- * never construct an mkr_owned_text_t themselves. Keeping the copy-and-own step
136
- * here keeps allocation and freeing of owned strings in one layer. Returns 0 on
137
- * success, -1 on OOM (err populated; +v+ left untouched). */
138
- int
139
- mkr_val_set_borrowed_text_copy(mkr_val_t *v, mkr_borrowed_text_t text,
140
- mkr_xpath_error_t *err, const char *what)
141
- {
142
- if (v == NULL) {
143
- mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_val_set_borrowed_text_copy: bad args");
144
- return -1;
145
- }
146
- mkr_owned_text_t owned;
147
- if (mkr_owned_text_from_borrowed_copy(&owned, text, err, what) != 0) {
148
- return -1;
149
- }
150
- mkr_val_set_owned_text(v, owned);
151
- return 0;
152
- }
153
-
154
- /* ---------- value ---------- */
155
-
156
- void
157
- mkr_val_clear(mkr_val_t *v)
158
- {
159
- if (v == NULL) return;
160
- switch (v->type) {
161
- case MKR_XPATH_TYPE_NODESET:
162
- mkr_nodeset_clear(&v->u.nodeset);
163
- break;
164
- case MKR_XPATH_TYPE_STRING:
165
- mkr_owned_text_clear(&v->u.string);
166
- break;
167
- default:
168
- break;
169
- }
170
- memset(v, 0, sizeof(*v));
171
- }
172
-
173
- int
174
- mkr_val_clone(const mkr_val_t *src, mkr_val_t *dst, mkr_xpath_error_t *err)
175
- {
176
- if (src == NULL || dst == NULL) {
177
- mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_val_clone: bad args");
178
- return -1;
179
- }
180
- memset(dst, 0, sizeof(*dst));
181
- dst->type = src->type;
182
- switch (src->type) {
183
- case MKR_XPATH_TYPE_STRING: {
184
- mkr_owned_text_t text;
185
- if (mkr_owned_text_from_borrowed_copy(&text, mkr_borrowed_text_from_owned(src->u.string),
186
- err, "out of memory cloning string value") != 0) return -1;
187
- mkr_val_set_owned_text(dst, text);
188
- return 0;
189
- }
190
- case MKR_XPATH_TYPE_NUMBER:
191
- dst->u.number = src->u.number;
192
- return 0;
193
- case MKR_XPATH_TYPE_BOOLEAN:
194
- dst->u.boolean = src->u.boolean;
195
- return 0;
196
- case MKR_XPATH_TYPE_NODESET: {
197
- size_t n = src->u.nodeset.count;
198
- mkr_nodeset_init(&dst->u.nodeset);
199
- if (n == 0) return 0;
200
- lxb_dom_node_t **items;
201
- size_t items_bytes;
202
- if (!mkr_size_mul(n, sizeof(*items), &items_bytes)) {
203
- mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory cloning node-set");
204
- return -1;
205
- }
206
- items = mkr_reallocarray(NULL, n, sizeof(*items));
207
- if (items == NULL) {
208
- mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory cloning node-set");
209
- return -1;
210
- }
211
- memcpy(items, src->u.nodeset.items, items_bytes);
212
- dst->u.nodeset.items = items;
213
- dst->u.nodeset.count = n;
214
- dst->u.nodeset.capacity = n;
215
- return 0;
216
- }
217
- }
218
- mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_val_clone: unknown value type");
219
- return -1;
220
- }
221
-
222
- /* ---------- node string-value (XPath 1.0 §5) ---------- */
223
-
224
- /* ---------- node string-value (XPath 1.0 §5) ----------
225
- *
226
- * Built into an mkr_buf_t whose `max` is the per-evaluate byte cap: append fails
227
- * closed with MKR_ERR_LIMIT past the cap and MKR_ERR_OOM on allocation failure,
228
- * so there is never a partial/truncated result. Lexbor-allocated text is freed
229
- * after each append (otherwise we'd leak document-arena memory on every XPath
230
- * that touches text content). */
231
-
232
- /* Append `node`'s own text content. */
233
- static mkr_status_t
234
- append_text_content(lxb_dom_node_t *node, mkr_buf_t *buf)
235
- {
236
- size_t tlen = 0;
237
- lxb_char_t *t = lxb_dom_node_text_content(node, &tlen);
238
- if (t == NULL) return MKR_OK;
239
- mkr_status_t st = mkr_buf_append(buf, t, tlen);
240
- lxb_dom_document_destroy_text(node->owner_document, t);
241
- return st;
242
- }
243
-
244
- /* Append the string-value of every TEXT descendant of `node`, in document
245
- * order. Iterative (parent-pointer) pre-order walk rather than C recursion, so
246
- * an adversarially deep tree cannot overflow the stack (fail-closed / no DoS);
247
- * O(1) extra space. Descends only into elements, matching the original. */
248
- static mkr_status_t
249
- append_text_descendants(lxb_dom_node_t *node, mkr_buf_t *buf)
250
- {
251
- lxb_dom_node_t *cur = node->first_child;
252
- while (cur != NULL) {
253
- if (cur->type == LXB_DOM_NODE_TYPE_TEXT) {
254
- mkr_status_t st = append_text_content(cur, buf);
255
- if (st != MKR_OK) return st; /* LIMIT or OOM — caller fails closed */
256
- }
257
- if (cur->type == LXB_DOM_NODE_TYPE_ELEMENT && cur->first_child != NULL) {
258
- cur = cur->first_child;
259
- continue;
260
- }
261
- while (cur != node && cur->next == NULL) {
262
- cur = cur->parent;
263
- }
264
- if (cur == node) return MKR_OK;
265
- cur = cur->next;
266
- }
267
- return MKR_OK;
268
- }
269
-
270
- /* Build node's string-value into `buf` (cap carried by buf->max). */
271
- static mkr_status_t
272
- build_string_value(const lxb_dom_node_t *node, mkr_buf_t *buf)
273
- {
274
- if (node == NULL) return MKR_OK;
275
-
276
- switch (node->type) {
277
- case LXB_DOM_NODE_TYPE_ATTRIBUTE: {
278
- lxb_dom_attr_t *attr = (lxb_dom_attr_t *)node;
279
- size_t vlen = 0;
280
- const lxb_char_t *v = lxb_dom_attr_value(attr, &vlen);
281
- return mkr_buf_append(buf, v ? (const char *)v : "", vlen);
282
- }
283
- case LXB_DOM_NODE_TYPE_TEXT:
284
- case LXB_DOM_NODE_TYPE_CDATA_SECTION:
285
- case LXB_DOM_NODE_TYPE_COMMENT:
286
- case LXB_DOM_NODE_TYPE_PROCESSING_INSTRUCTION:
287
- return append_text_content((lxb_dom_node_t *)node, buf);
288
- default:
289
- return append_text_descendants((lxb_dom_node_t *)node, buf);
290
- }
291
- }
292
-
293
- static void
294
- mkr_build_node_text_unchecked(const lxb_dom_node_t *node, mkr_owned_text_t *out)
295
- {
296
- /* Uncapped, best-effort: callers (number/string coercion) require a non-NULL
297
- * text, so on any failure fall back to an owned "" rather than NULL. */
298
- mkr_owned_text_init(out);
299
- mkr_buf_t buf;
300
- mkr_buf_init(&buf, 0);
301
- if (build_string_value(node, &buf) != MKR_OK) {
302
- mkr_buf_free(&buf);
303
- (void)mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit(""), NULL, NULL);
304
- return;
305
- }
306
- if (mkr_owned_text_from_buf_steal(out, &buf, NULL, NULL) != 0) {
307
- (void)mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit(""), NULL, NULL);
308
- }
309
- }
310
-
311
- int
312
- mkr_node_to_owned_text_or_fail(const lxb_dom_node_t *node,
313
- mkr_xpath_limits_t *limits,
314
- mkr_xpath_error_t *err,
315
- mkr_owned_text_t *out)
316
- {
317
- if (out == NULL) {
318
- mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_node_to_owned_text_or_fail: bad args");
319
- return -1;
320
- }
321
- mkr_owned_text_init(out);
322
- mkr_buf_t buf;
323
- mkr_buf_init(&buf, (limits != NULL) ? limits->max_string_bytes : 0);
324
- mkr_status_t st = build_string_value(node, &buf);
325
- if (st == MKR_ERR_LIMIT) {
326
- mkr_buf_free(&buf);
327
- mkr_err_setf(err, MKR_XPATH_ERR_LIMIT,
328
- "string size limit exceeded (%zu bytes) while building node string-value",
329
- limits->max_string_bytes);
330
- return -1;
331
- }
332
- if (st != MKR_OK) {
333
- mkr_buf_free(&buf);
334
- mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory building node string-value");
335
- return -1;
336
- }
337
- return mkr_owned_text_from_buf_steal(out, &buf, err, "out of memory building node string-value");
338
- }
339
-
340
- int
341
- mkr_val_to_owned_text_or_fail(const mkr_val_t *v,
342
- mkr_xpath_limits_t *limits,
343
- mkr_xpath_error_t *err,
344
- mkr_owned_text_t *out)
345
- {
346
- if (out == NULL) {
347
- mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_val_to_owned_text_or_fail: bad args");
348
- return -1;
349
- }
350
- mkr_owned_text_init(out);
351
- if (v == NULL) {
352
- return mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit(""), err, "out of memory converting value to string");
353
- }
354
- switch (v->type) {
355
- case MKR_XPATH_TYPE_STRING: {
356
- mkr_borrowed_text_t text = mkr_borrowed_text_from_owned(v->u.string);
357
- if (text.ptr == NULL) text.len = 0;
358
- if (limits != NULL && mkr_limit_check_string_bytes(limits, text.len, err) != 0) return -1;
359
- return mkr_owned_text_from_borrowed_copy(out, text,
360
- err, "out of memory copying string value");
361
- }
362
- case MKR_XPATH_TYPE_BOOLEAN:
363
- return v->u.boolean
364
- ? mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit("true"), err, "out of memory converting boolean to string")
365
- : mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit("false"), err, "out of memory converting boolean to string");
366
- case MKR_XPATH_TYPE_NUMBER: {
367
- double d = v->u.number;
368
- if (isnan(d)) {
369
- return mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit("NaN"), err, "out of memory converting number to string");
370
- }
371
- if (isinf(d)) {
372
- return d < 0
373
- ? mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit("-Infinity"), err, "out of memory converting number to string")
374
- : mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit("Infinity"), err, "out of memory converting number to string");
375
- }
376
- if (d == 0.0) {
377
- return mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit("0"), err, "out of memory converting number to string");
378
- }
379
- char buf[64];
380
- int n;
381
- if (d == floor(d) && fabs(d) < 1e15) {
382
- n = snprintf(buf, sizeof(buf), "%lld", (long long)d);
383
- } else {
384
- n = snprintf(buf, sizeof(buf), "%.15g", d);
385
- }
386
- if (n < 0 || (size_t)n >= sizeof(buf)) {
387
- mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "number string conversion overflow");
388
- return -1;
389
- }
390
- char *p = mkr_strndup(buf, (size_t)n);
391
- if (p == NULL) { mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory converting number to string"); return -1; }
392
- *out = mkr_owned_text(p, (size_t)n);
393
- return 0;
394
- }
395
- case MKR_XPATH_TYPE_NODESET:
396
- if (v->u.nodeset.count == 0) {
397
- return mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit(""), err, "out of memory");
398
- }
399
- /* XPath 1.0 §4.2: string(node-set) = string-value of first node in doc order. */
400
- return mkr_node_to_owned_text_or_fail(v->u.nodeset.items[0], limits, err, out);
401
- }
402
- mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "unknown value type");
403
- return -1;
404
- }
405
-
406
- int
407
- mkr_val_to_number_or_fail(const mkr_val_t *v,
408
- mkr_xpath_limits_t *limits,
409
- mkr_xpath_error_t *err,
410
- double *out)
411
- {
412
- if (v == NULL || out == NULL) {
413
- mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_val_to_number_or_fail: bad args");
414
- return -1;
415
- }
416
- if (v->type == MKR_XPATH_TYPE_NODESET) {
417
- if (v->u.nodeset.count == 0) {
418
- *out = (double)NAN;
419
- return 0;
420
- }
421
- mkr_owned_text_t text;
422
- if (mkr_node_to_owned_text_or_fail(v->u.nodeset.items[0], limits, err, &text) != 0) return -1;
423
- *out = mkr_borrowed_text_to_number(mkr_borrowed_text_from_owned(text));
424
- mkr_owned_text_clear(&text);
425
- return 0;
426
- }
427
- *out = mkr_val_to_number_unchecked(v);
428
- return 0;
429
- }
430
-
431
- /* ---------- coercions ---------- */
432
-
433
- double
434
- mkr_borrowed_text_to_number(mkr_borrowed_text_t t)
435
- {
436
- if (t.ptr == NULL) return (double)NAN;
437
- const char *s = t.ptr;
438
- while (*s && isspace((unsigned char)*s)) s++;
439
- if (*s == '\0') return (double)NAN;
440
- char *end = NULL;
441
- double d = strtod(s, &end);
442
- if (end == s) return (double)NAN;
443
- while (*end && isspace((unsigned char)*end)) end++;
444
- if (*end != '\0') return (double)NAN;
445
- return d;
446
- }
447
-
448
- double
449
- mkr_val_to_number_unchecked(const mkr_val_t *v)
450
- {
451
- switch (v->type) {
452
- case MKR_XPATH_TYPE_NUMBER:
453
- return v->u.number;
454
- case MKR_XPATH_TYPE_BOOLEAN:
455
- return v->u.boolean ? 1.0 : 0.0;
456
- case MKR_XPATH_TYPE_STRING:
457
- return mkr_borrowed_text_to_number(mkr_borrowed_text_from_owned(v->u.string));
458
- case MKR_XPATH_TYPE_NODESET: {
459
- if (v->u.nodeset.count == 0) return (double)NAN;
460
- /* string-value of first node in document order */
461
- mkr_owned_text_t text;
462
- mkr_build_node_text_unchecked(v->u.nodeset.items[0], &text);
463
- double d = mkr_borrowed_text_to_number(mkr_borrowed_text_from_owned(text));
464
- mkr_owned_text_clear(&text);
465
- return d;
466
- }
467
- }
468
- return (double)NAN;
469
- }
470
-
471
- int
472
- mkr_val_to_boolean(const mkr_val_t *v)
473
- {
474
- switch (v->type) {
475
- case MKR_XPATH_TYPE_BOOLEAN:
476
- return v->u.boolean;
477
- case MKR_XPATH_TYPE_NUMBER:
478
- return !(v->u.number == 0.0 || isnan(v->u.number));
479
- case MKR_XPATH_TYPE_STRING:
480
- return v->u.string.ptr != NULL && v->u.string.ptr[0] != '\0';
481
- case MKR_XPATH_TYPE_NODESET:
482
- return v->u.nodeset.count > 0;
483
- }
484
- return 0;
485
- }
486
-
487
- /* ---------- document order ---------- */
488
-
489
- /*
490
- * Treat an attribute node as positioned "with" its owner element for
491
- * cross-subtree comparisons; only when both belong to the same element
492
- * does the attribute-vs-attribute or attribute-vs-descendant rule kick in.
493
- */
494
- static const lxb_dom_node_t *
495
- anchor_for_cmp(const lxb_dom_node_t *n)
496
- {
497
- if (n->type == LXB_DOM_NODE_TYPE_ATTRIBUTE) {
498
- return n->parent ? n->parent : n;
499
- }
500
- return n;
501
- }
502
-
503
- static int
504
- depth_of(const lxb_dom_node_t *n)
505
- {
506
- int d = 0;
507
- while (n->parent) { d++; n = n->parent; }
508
- return d;
509
- }
510
-
511
/* Compare two nodes by document order using only tree links (no index).
 *
 * Returns a negative value when `a` precedes `b`, a positive value when `a`
 * follows `b`, and 0 when they are the same node or their order cannot be
 * determined (nodes under different roots, or attributes not found on the
 * shared owner element).
 *
 * Outline: map attributes to their owner element, handle the "same anchor"
 * case by node type, then bring both anchors to the same depth, detect the
 * ancestor/descendant case, climb to the pair of siblings under the common
 * parent, and finally scan the sibling list to see which comes first. */
static int
doc_order_cmp(const lxb_dom_node_t *a, const lxb_dom_node_t *b)
{
    if (a == b) return 0;
    const lxb_dom_node_t *aa = anchor_for_cmp(a);
    const lxb_dom_node_t *bb = anchor_for_cmp(b);

    /* If the anchors are the same element, decide by node type. A non-attribute
     * node that anchors to the same element E can ONLY be E itself: any other
     * node (a child/descendant) anchors to itself, not to E, so it would not
     * reach this branch (the attribute-vs-descendant case is handled below by
     * the depth-normalisation walk). Per XPath 1.0 §5.1 document order is
     * "element, then its attribute nodes, then its children", so an attribute
     * comes AFTER its own owner element. */
    if (aa == bb) {
        int a_attr = (a->type == LXB_DOM_NODE_TYPE_ATTRIBUTE);
        int b_attr = (b->type == LXB_DOM_NODE_TYPE_ATTRIBUTE);
        if (a_attr && !b_attr) return 1; /* b is the owner element E; a (its attr) follows */
        if (b_attr && !a_attr) return -1; /* a is the owner element E; b (its attr) follows */
        /* Both attributes of the same element: relative order is
         * implementation-defined. Use insertion order via the attr linked list:
         * whichever of the two is met first while walking the list sorts first. */
        if (a_attr && b_attr) {
            for (const lxb_dom_attr_t *at = ((const lxb_dom_element_t *)aa)->first_attr;
                 at != NULL; at = at->next) {
                if ((const lxb_dom_node_t *)at == a) return -1;
                if ((const lxb_dom_node_t *)at == b) return 1;
            }
            /* Neither attribute is on the element's list. */
            return 0;
        }
        /* aa == bb but neither is an attribute means a == b, handled above. */
        return 0;
    }

    /* Lift the deeper anchor until both sit at the same depth. */
    int da = depth_of(aa), db = depth_of(bb);
    while (da > db) { aa = aa->parent; da--; }
    while (db > da) { bb = bb->parent; db--; }
    if (aa == bb) {
        /* One is ancestor of the other; ancestor comes first. If aa did not
         * move, a's anchor is the ancestor; otherwise b's anchor is. */
        return (aa == anchor_for_cmp(a)) ? -1 : 1;
    }
    /* Climb in lockstep until aa and bb are siblings under a common parent
     * (or both have run out of parents). */
    while (aa->parent != bb->parent) {
        aa = aa->parent;
        bb = bb->parent;
    }
    /* Resolve sibling order. Scan outward from aa and bb in lockstep (via ->next)
     * rather than forward from parent->first_child: the cost is then O(distance
     * between aa and bb), not O(distance from the first child). The latter is
     * quadratic when sorting nodes that sit deep in a wide, flat parent (e.g. a
     * predicate result picking scattered <li> from a 2000-child <ul>), which the
     * doc-order index would only avoid once a single sort reaches its build
     * threshold. */
    if (aa->parent == NULL) {
        /* Different documents/roots — undefined; keep stable. */
        return 0;
    }
    const lxb_dom_node_t *fa = aa, *fb = bb;
    for (;;) {
        /* Advance both cursors one sibling; a cursor that ran off the end stays NULL. */
        fa = fa ? fa->next : NULL;
        fb = fb ? fb->next : NULL;
        if (fa == bb) return -1; /* bb lies after aa -> aa first */
        if (fb == aa) return 1; /* aa lies after bb -> bb first */
        if (fa == NULL && fb == NULL) return 0; /* unreachable for same-parent nodes */
    }
}
575
-
576
- /* ---------- per-evaluate document-order index ---------- */
577
-
578
/* Hash a pointer value to 32 bits for use as an open-addressing table key.
 * Two xor-shift/multiply rounds followed by a final xor-shift; the result is
 * the low 32 bits of the mixed value. */
static uint32_t
pointer_hash(const void *p)
{
    uintptr_t bits = (uintptr_t)p;

    bits ^= bits >> 16;
    bits *= 0x9E3779B9u;

    bits ^= bits >> 13;
    bits *= 0x85EBCA6Bu;

    bits ^= bits >> 16;
    return (uint32_t)bits;
}
587
-
588
- void
589
- mkr_doc_order_index_init(mkr_doc_order_index_t *idx)
590
- {
591
- idx->buckets = NULL;
592
- idx->cap = 0;
593
- idx->count = 0;
594
- idx->built = 0;
595
- }
596
-
597
- void
598
- mkr_doc_order_index_clear(mkr_doc_order_index_t *idx)
599
- {
600
- if (idx == NULL) return;
601
- free(idx->buckets);
602
- idx->buckets = NULL;
603
- idx->cap = 0;
604
- idx->count = 0;
605
- idx->built = 0;
606
- }
607
-
608
- /* Insert (node, ord) into the open-addressing table. Grows when load
609
- * factor exceeds 3/4. Returns 0 on success, -1 on OOM. */
610
- static int
611
- order_index_insert(mkr_doc_order_index_t *idx, const lxb_dom_node_t *node, size_t ord)
612
- {
613
- if (idx->cap == 0 || idx->count * 4 >= idx->cap * 3) {
614
- size_t new_cap = 256;
615
- if (idx->cap != 0 && !mkr_size_mul(idx->cap, 2, &new_cap)) {
616
- return -1; /* overflow */
617
- }
618
- void *new_buckets = mkr_callocarray(new_cap, sizeof(*idx->buckets));
619
- if (new_buckets == NULL) return -1;
620
- /* Rehash. */
621
- typeof(idx->buckets) old_buckets = idx->buckets;
622
- size_t old_cap = idx->cap;
623
- idx->buckets = new_buckets;
624
- idx->cap = new_cap;
625
- idx->count = 0;
626
- for (size_t i = 0; i < old_cap; ++i) {
627
- if (old_buckets[i].node != NULL) {
628
- size_t mask = new_cap - 1;
629
- size_t j = pointer_hash(old_buckets[i].node) & mask;
630
- while (idx->buckets[j].node != NULL) j = (j + 1) & mask;
631
- idx->buckets[j].node = old_buckets[i].node;
632
- idx->buckets[j].ord = old_buckets[i].ord;
633
- idx->count++;
634
- }
635
- }
636
- free(old_buckets);
637
- }
638
- size_t mask = idx->cap - 1;
639
- size_t j = pointer_hash(node) & mask;
640
- while (idx->buckets[j].node != NULL) {
641
- if (idx->buckets[j].node == node) return 0; /* already present */
642
- j = (j + 1) & mask;
643
- }
644
- idx->buckets[j].node = node;
645
- idx->buckets[j].ord = ord;
646
- idx->count++;
647
- return 0;
648
- }
649
-
650
- static int
651
- order_index_lookup(const mkr_doc_order_index_t *idx, const lxb_dom_node_t *node,
652
- size_t *out_ord)
653
- {
654
- if (idx->cap == 0) return -1;
655
- size_t mask = idx->cap - 1;
656
- size_t j = pointer_hash(node) & mask;
657
- while (idx->buckets[j].node != NULL) {
658
- if (idx->buckets[j].node == node) {
659
- if (out_ord) *out_ord = idx->buckets[j].ord;
660
- return 0;
661
- }
662
- j = (j + 1) & mask;
663
- }
664
- return -1;
665
- }
666
-
667
- /* DFS pre-order: assign ordinal to the element, then its attributes
668
- * (in linked-list order, before children), then descendants. This
669
- * matches doc_order_cmp's attribute placement.
670
- *
671
- * Iterative (parent-pointer) walk rather than C recursion, so an adversarially
672
- * deep tree cannot overflow the stack (fail-closed / no DoS); O(1) extra space.
673
- * The traversal stays within the subtree rooted at `root` (it never follows
674
- * root->next). */
675
- static int
676
- order_index_walk(mkr_doc_order_index_t *idx, lxb_dom_node_t *root, size_t *next_ord)
677
- {
678
- lxb_dom_node_t *cur = root;
679
- while (cur != NULL) {
680
- /* Visit (pre-order): the node, then its attributes before any child. */
681
- if (order_index_insert(idx, cur, (*next_ord)++) != 0) return -1;
682
- if (cur->type == LXB_DOM_NODE_TYPE_ELEMENT) {
683
- lxb_dom_element_t *el = (lxb_dom_element_t *)cur;
684
- for (lxb_dom_attr_t *a = el->first_attr; a != NULL; a = a->next) {
685
- if (order_index_insert(idx, (lxb_dom_node_t *)a, (*next_ord)++) != 0) return -1;
686
- }
687
- }
688
- if (cur->first_child != NULL) {
689
- cur = cur->first_child;
690
- continue;
691
- }
692
- while (cur != root && cur->next == NULL) {
693
- cur = cur->parent;
694
- }
695
- if (cur == root) break;
696
- cur = cur->next;
697
- }
698
- return 0;
699
- }
700
-
701
- static int
702
- order_index_build(mkr_doc_order_index_t *idx, lxb_dom_node_t *root,
703
- mkr_xpath_error_t *err)
704
- {
705
- if (idx->built) return 0;
706
- if (root == NULL) return -1;
707
- size_t next_ord = 0;
708
- if (order_index_walk(idx, root, &next_ord) != 0) {
709
- mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory building document order index");
710
- mkr_doc_order_index_clear(idx);
711
- return -1;
712
- }
713
- idx->built = 1;
714
- return 0;
715
- }
716
-
717
- /* Indexed comparator. Falls back to doc_order_cmp on any miss
718
- * (e.g., synthesised nodes or cross-document compares). */
719
- static int
720
- doc_order_cmp_ctx(mkr_xpath_context_t *ctx, const lxb_dom_node_t *a, const lxb_dom_node_t *b)
721
- {
722
- if (a == b) return 0;
723
- if (ctx == NULL) return doc_order_cmp(a, b);
724
- mkr_doc_order_index_t *idx = mkr_ctx_order_index(ctx);
725
- if (idx == NULL || !idx->built) return doc_order_cmp(a, b);
726
- size_t oa, ob;
727
- if (order_index_lookup(idx, a, &oa) != 0) return doc_order_cmp(a, b);
728
- if (order_index_lookup(idx, b, &ob) != 0) return doc_order_cmp(a, b);
729
- /* Safe comparison — compare, don't subtract (unsigned difference wraps). */
730
- if (oa < ob) return -1;
731
- if (oa > ob) return 1;
732
- return 0;
733
- }
734
-
735
- /* Bottom-up merge sort. Threading ctx through avoids the qsort_r /
736
- * thread-local hack and keeps everything reentrant. Stable as a
737
- * bonus: ties (same ord — only possible for synthesised nodes that
738
- * weren't in the index) preserve insertion order. */
739
- static void
740
- ms_merge(lxb_dom_node_t **arr, lxb_dom_node_t **tmp,
741
- size_t lo, size_t mid, size_t hi, mkr_xpath_context_t *ctx)
742
- {
743
- size_t i = lo, j = mid, k = lo;
744
- while (i < mid && j < hi) {
745
- if (doc_order_cmp_ctx(ctx, arr[i], arr[j]) <= 0) tmp[k++] = arr[i++];
746
- else tmp[k++] = arr[j++];
747
- }
748
- while (i < mid) tmp[k++] = arr[i++];
749
- while (j < hi) tmp[k++] = arr[j++];
750
- for (size_t x = lo; x < hi; ++x) arr[x] = tmp[x];
751
- }
752
-
753
- static void
754
- ms_sort(lxb_dom_node_t **arr, lxb_dom_node_t **tmp,
755
- size_t lo, size_t hi, mkr_xpath_context_t *ctx)
756
- {
757
- if (hi - lo < 2) return;
758
- size_t mid = lo + (hi - lo) / 2;
759
- ms_sort(arr, tmp, lo, mid, ctx);
760
- ms_sort(arr, tmp, mid, hi, ctx);
761
- ms_merge(arr, tmp, lo, mid, hi, ctx);
762
- }
763
-
764
- /* qsort fallback used only when tmp-buffer allocation fails. */
765
- static int
766
- doc_order_qsort_cb_fallback(const void *pa, const void *pb)
767
- {
768
- const lxb_dom_node_t *a = *(const lxb_dom_node_t * const *)pa;
769
- const lxb_dom_node_t *b = *(const lxb_dom_node_t * const *)pb;
770
- return doc_order_cmp(a, b);
771
- }
772
-
773
- /* Threshold for building the doc-order index. Below this we expect
774
- * N log N parent-chain compares to be cheaper than the O(D) full-doc
775
- * walk that the index requires (D = total nodes in document, which is
776
- * typically 6000+ on real pages). Empirically the crossover sits
777
- * somewhere between N=100 and N=300 on coffee.html; we pick a safe
778
- * point that keeps small unions and reverse-axis dedups off the slow
779
- * build path. Once the index IS built (e.g., by a larger sort earlier
780
- * in the same evaluate), subsequent small sorts naturally reuse it. */
781
- #define MKR_INDEX_BUILD_MIN 200
782
-
783
- void
784
- mkr_nodeset_sort_doc_order(mkr_xpath_context_t *ctx, mkr_nodeset_t *ns)
785
- {
786
- if (ns == NULL || ns->count < 2) return;
787
-
788
- /* Lazy build of the doc-order index. Only worth doing when the sort
789
- * itself is large enough to amortise the full-doc walk; smaller
790
- * sorts fall through to parent-chain compares via doc_order_cmp_ctx
791
- * (which sees an unbuilt index and dispatches accordingly). */
792
- mkr_doc_order_index_t *idx = mkr_ctx_order_index(ctx);
793
- if (idx != NULL && !idx->built && ns->count >= MKR_INDEX_BUILD_MIN) {
794
- lxb_dom_node_t *root = (lxb_dom_node_t *)mkr_ctx_document(ctx);
795
- mkr_xpath_error_t ierr = {0};
796
- (void)order_index_build(idx, root, &ierr);
797
- mkr_xpath_error_clear(&ierr); /* index is best-effort; on OOM we fall through to parent-chain cmp */
798
- }
799
-
800
- lxb_dom_node_t **tmp = mkr_reallocarray(NULL, ns->count, sizeof(*tmp));
801
- if (tmp == NULL) {
802
- /* Fall back to in-place qsort with parent-chain compare (slow but
803
- * correct). Should be a very rare path. */
804
- qsort(ns->items, ns->count, sizeof(ns->items[0]), doc_order_qsort_cb_fallback);
805
- return;
806
- }
807
- ms_sort(ns->items, tmp, 0, ns->count, ctx);
808
- free(tmp);
809
- }
810
-
811
- void
812
- mkr_nodeset_unique_sorted(mkr_xpath_context_t *ctx, mkr_nodeset_t *ns)
813
- {
814
- if (ns == NULL || ns->count < 2) return;
815
- mkr_nodeset_sort_doc_order(ctx, ns);
816
- size_t w = 1;
817
- for (size_t r = 1; r < ns->count; ++r) {
818
- if (ns->items[r] != ns->items[r - 1]) {
819
- ns->items[w++] = ns->items[r];
820
- }
821
- }
822
- ns->count = w;
823
- }
824
-
825
- /* ---------- per-evaluation string-value cache ---------- */
826
-
827
- void
828
- mkr_str_cache_init(mkr_str_cache_t *c)
829
- {
830
- c->entries = NULL;
831
- c->count = 0;
832
- c->cap = 0;
833
- c->buckets = NULL;
834
- c->bucket_cap = 0;
835
- }
836
-
837
- /* Insert entry index `idx` (keyed by entries[idx].node) into the index. The
838
- * index must have room (callers grow/rehash first). */
839
- static void
840
- mkr_str_cache_index_put(mkr_str_cache_t *c, size_t idx)
841
- {
842
- size_t mask = c->bucket_cap - 1;
843
- size_t j = pointer_hash(c->entries[idx].node) & mask;
844
- while (c->buckets[j] != 0) {
845
- j = (j + 1) & mask;
846
- }
847
- c->buckets[j] = idx + 1;
848
- }
849
-
850
- /* Rebuild the index from entries[0, count). Returns -1 on OOM. */
851
- static int
852
- mkr_str_cache_reindex(mkr_str_cache_t *c, size_t bucket_cap)
853
- {
854
- size_t *buckets = mkr_callocarray(bucket_cap, sizeof(*buckets));
855
- if (buckets == NULL) return -1;
856
- free(c->buckets);
857
- c->buckets = buckets;
858
- c->bucket_cap = bucket_cap;
859
- for (size_t i = 0; i < c->count; ++i) {
860
- mkr_str_cache_index_put(c, i);
861
- }
862
- return 0;
863
- }
864
-
865
- void
866
- mkr_str_cache_truncate(mkr_str_cache_t *c, size_t target_count)
867
- {
868
- if (c == NULL || target_count >= c->count) return;
869
- for (size_t i = target_count; i < c->count; ++i) {
870
- free(c->entries[i].str);
871
- }
872
- c->count = target_count;
873
- /* Drop the removed nodes from the index. A full truncate just clears it;
874
- * a partial one (nested-eval snapshot restore) rebuilds from what remains. */
875
- if (c->buckets != NULL) {
876
- if (target_count == 0) {
877
- size_t buckets_bytes;
878
- if (!mkr_size_mul(c->bucket_cap, sizeof(*c->buckets), &buckets_bytes)) {
879
- free(c->buckets);
880
- c->buckets = NULL;
881
- c->bucket_cap = 0;
882
- return;
883
- }
884
- memset(c->buckets, 0, buckets_bytes);
885
- } else {
886
- mkr_str_cache_reindex(c, c->bucket_cap);
887
- }
888
- }
889
- }
890
-
891
- void
892
- mkr_str_cache_clear(mkr_str_cache_t *c)
893
- {
894
- if (c == NULL) return;
895
- for (size_t i = 0; i < c->count; ++i) {
896
- free(c->entries[i].str);
897
- }
898
- free(c->entries);
899
- free(c->buckets);
900
- c->entries = NULL;
901
- c->count = 0;
902
- c->cap = 0;
903
- c->buckets = NULL;
904
- c->bucket_cap = 0;
905
- }
906
-
907
- int
908
- mkr_get_cached_node_text(mkr_xpath_context_t *ctx,
909
- lxb_dom_node_t *node,
910
- mkr_borrowed_text_t *out,
911
- mkr_xpath_error_t *err)
912
- {
913
- if (out == NULL) {
914
- mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_get_cached_node_text: bad args");
915
- return -1;
916
- }
917
- *out = mkr_borrowed_text(NULL, 0);
918
- /* Contract: ctx is non-NULL when called from the evaluator (the only
919
- * intended caller). A NULL ctx is a programming error; surface it. */
920
- mkr_str_cache_t *c = mkr_ctx_str_cache(ctx);
921
- if (c == NULL) {
922
- mkr_err_set(err, MKR_XPATH_ERR_INTERNAL,
923
- "mkr_get_cached_node_text called without a context");
924
- return -1;
925
- }
926
-
927
- /* O(1) lookup via the pointer-keyed index. */
928
- if (c->bucket_cap != 0) {
929
- size_t mask = c->bucket_cap - 1;
930
- size_t j = pointer_hash(node) & mask;
931
- while (c->buckets[j] != 0) {
932
- mkr_str_cache_entry_t *e = &c->entries[c->buckets[j] - 1];
933
- if (e->node == node) {
934
- *out = mkr_borrowed_text(e->str, e->len);
935
- return 0;
936
- }
937
- j = (j + 1) & mask;
938
- }
939
- }
940
-
941
- mkr_owned_text_t text;
942
- if (mkr_node_to_owned_text_or_fail(node, mkr_ctx_limits(ctx), err, &text) != 0) return -1;
943
-
944
- if (mkr_grow_reserve((void **)&c->entries, &c->cap, c->count + 1,
945
- sizeof(*c->entries)) != MKR_OK) {
946
- mkr_owned_text_clear(&text);
947
- mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory in node string cache");
948
- return -1;
949
- }
950
- c->entries[c->count].node = node;
951
- c->entries[c->count].str = text.ptr;
952
- c->entries[c->count].len = text.len;
953
-
954
- /* Grow / build the index, keeping load factor <= 1/2. */
955
- if (c->bucket_cap == 0 || (c->count + 1) * 2 > c->bucket_cap) {
956
- size_t new_bucket_cap = 64;
957
- if (c->bucket_cap != 0 && !mkr_size_mul(c->bucket_cap, 2, &new_bucket_cap)) {
958
- mkr_owned_text_clear(&text);
959
- c->entries[c->count].node = NULL;
960
- c->entries[c->count].str = NULL;
961
- c->entries[c->count].len = 0;
962
- mkr_err_set(err, MKR_XPATH_ERR_OOM, "node string cache index overflow");
963
- return -1;
964
- }
965
- if (mkr_str_cache_reindex(c, new_bucket_cap) != 0) {
966
- mkr_owned_text_clear(&text);
967
- c->entries[c->count].node = NULL;
968
- c->entries[c->count].str = NULL;
969
- c->entries[c->count].len = 0;
970
- mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory indexing node string cache");
971
- return -1;
972
- }
973
- }
974
- mkr_str_cache_index_put(c, c->count);
975
- c->count++;
976
-
977
- *out = mkr_borrowed_text_from_owned(text);
978
- return 0;
979
- }
980
-
981
- /* ---------- AST destructors ---------- */
982
-
983
- void
984
- mkr_step_clear(mkr_step_t *s)
985
- {
986
- if (s == NULL) return;
987
- mkr_owned_text_clear(&s->test.prefix);
988
- mkr_owned_text_clear(&s->test.local);
989
- mkr_owned_text_clear(&s->test.pi_target);
990
- for (size_t i = 0; i < s->npredicates; ++i) {
991
- mkr_node_free(s->predicates[i]);
992
- }
993
- free(s->predicates);
994
- memset(s, 0, sizeof(*s));
995
- }
996
-
997
- /* ---------- AST hoisting helpers ---------- */
998
-
999
/* Decide whether an unprefixed XPath 1.0 built-in is pure, i.e. safe to
 * hoist once all of its arguments are context-independent (the caller
 * checks the arguments). Returns 1 for a hoistable name, 0 otherwise,
 * including for a NULL name.
 *
 * The allow-list is deliberately explicit and conservative. Left out on
 * purpose: functions that read the context node (last/position, the 0-arg
 * forms of string/normalize-space/local-name/etc., lang) and functions that
 * may depend on dynamic state (id, handler-routed calls). */
static int
is_pure_builtin_name(const char *name, size_t nargs)
{
    /* Pure when called with at least one argument. */
    static const char *const hoistable[] = {
        "count", "string-length", "number", "boolean", "not",
        "floor", "ceiling", "round", "sum",
        "concat", "starts-with", "contains",
        "substring-before", "substring-after", "substring",
        "translate",
    };

    if (name == NULL) {
        return 0;
    }

    /* With no arguments only the two constants qualify: they read no input. */
    if (nargs == 0) {
        if (strcmp(name, "true") == 0) {
            return 1;
        }
        return strcmp(name, "false") == 0;
    }

    const size_t n = sizeof(hoistable) / sizeof(hoistable[0]);
    for (size_t i = 0; i < n; ++i) {
        if (strcmp(name, hoistable[i]) == 0) {
            return 1;
        }
    }
    return 0;
}
1027
-
1028
- static void
1029
- mark_step_predicates(mkr_step_t *s)
1030
- {
1031
- for (size_t i = 0; i < s->npredicates; ++i) {
1032
- mkr_mark_context_independent(s->predicates[i]);
1033
- }
1034
- }
1035
-
1036
- void
1037
- mkr_mark_context_independent(mkr_node_t *n)
1038
- {
1039
- if (n == NULL) return;
1040
- int ci = 0;
1041
- switch (n->kind) {
1042
- case MKR_NK_LITERAL_STR:
1043
- case MKR_NK_LITERAL_NUM:
1044
- ci = 1;
1045
- break;
1046
- case MKR_NK_VARREF:
1047
- /* Conservative: variables not hoisted even though XPath 1.0 says
1048
- * they're fixed per evaluation. */
1049
- ci = 0;
1050
- break;
1051
- case MKR_NK_FNCALL: {
1052
- /* Recurse first so subtrees get their own CI marks even when this
1053
- * call itself is not hoistable. */
1054
- for (size_t i = 0; i < n->u.fncall.nargs; ++i) {
1055
- mkr_mark_context_independent(n->u.fncall.args[i]);
1056
- }
1057
- if (n->u.fncall.prefix.ptr != NULL) {
1058
- ci = 0; /* Handler-routed or namespaced builtins → non-CI. */
1059
- break;
1060
- }
1061
- if (!is_pure_builtin_name(n->u.fncall.name.ptr, n->u.fncall.nargs)) {
1062
- ci = 0;
1063
- break;
1064
- }
1065
- ci = 1;
1066
- for (size_t i = 0; i < n->u.fncall.nargs; ++i) {
1067
- if (!n->u.fncall.args[i]->is_context_independent) { ci = 0; break; }
1068
- }
1069
- break;
1070
- }
1071
- case MKR_NK_UNARY:
1072
- mkr_mark_context_independent(n->u.unary.expr);
1073
- ci = n->u.unary.expr ? n->u.unary.expr->is_context_independent : 0;
1074
- break;
1075
- case MKR_NK_BINOP:
1076
- mkr_mark_context_independent(n->u.binop.lhs);
1077
- mkr_mark_context_independent(n->u.binop.rhs);
1078
- ci = (n->u.binop.lhs && n->u.binop.lhs->is_context_independent)
1079
- && (n->u.binop.rhs && n->u.binop.rhs->is_context_independent);
1080
- break;
1081
- case MKR_NK_PATH:
1082
- /* Absolute path is CI: seed is the document root regardless of
1083
- * outer context. Relative paths use the outer context node and
1084
- * are not hoistable. Predicates inside the path are evaluated
1085
- * against the path's own context, so their position()/last() do
1086
- * not leak — recurse so any pure sub-expressions still get marks. */
1087
- ci = n->u.path.absolute ? 1 : 0;
1088
- for (size_t i = 0; i < n->u.path.nsteps; ++i) {
1089
- mark_step_predicates(&n->u.path.steps[i]);
1090
- }
1091
- break;
1092
- case MKR_NK_FILTER:
1093
- /* Conservative: filter expressions are not hoisted in v1. */
1094
- ci = 0;
1095
- mkr_mark_context_independent(n->u.filter.expr);
1096
- for (size_t i = 0; i < n->u.filter.npreds; ++i) {
1097
- mkr_mark_context_independent(n->u.filter.preds[i]);
1098
- }
1099
- for (size_t i = 0; i < n->u.filter.npath; ++i) {
1100
- mark_step_predicates(&n->u.filter.path_steps[i]);
1101
- }
1102
- break;
1103
- }
1104
- n->is_context_independent = (uint8_t)ci;
1105
- }
1106
-
1107
- static void
1108
- clear_memos_step(mkr_step_t *s)
1109
- {
1110
- for (size_t i = 0; i < s->npredicates; ++i) {
1111
- mkr_node_clear_memos(s->predicates[i]);
1112
- }
1113
- }
1114
-
1115
- /* ---------- peephole: //X fusion ---------- */
1116
-
1117
- /*
1118
- * Collapse pairs of consecutive steps:
1119
- * (axis=descendant-or-self, test=node(), no predicates)
1120
- * (axis=child, test=*, no predicates)
1121
- * into a single
1122
- * (axis=descendant, test=*, no predicates)
1123
- *
1124
- * The fusion is safe per XPath 1.0 only when the child step has no
1125
- * predicates: otherwise '//X[1]' would change meaning ("first X per
1126
- * parent" vs "first X in doc order"). The synthesised // step always
1127
- * has no predicates by construction, so we don't need to check the
1128
- * first step's predicate list — only the child step's.
1129
- */
1130
- static void
1131
- fuse_descendant_or_self_steps(mkr_step_t *steps, size_t *nsteps_ptr)
1132
- {
1133
- if (steps == NULL || *nsteps_ptr < 2) return;
1134
- size_t nsteps = *nsteps_ptr;
1135
- size_t w = 0, r = 0;
1136
- while (r < nsteps) {
1137
- if (r + 1 < nsteps
1138
- && steps[r].axis == MKR_AXIS_DESCENDANT_OR_SELF
1139
- && steps[r].test.kind == MKR_NT_NODE
1140
- && steps[r].test.prefix.ptr == NULL
1141
- && steps[r].npredicates == 0
1142
- && steps[r + 1].axis == MKR_AXIS_CHILD
1143
- && steps[r + 1].npredicates == 0) {
1144
- /* Drop the desc-or-self step and promote the child step. */
1145
- mkr_step_clear(&steps[r]);
1146
- steps[w] = steps[r + 1];
1147
- memset(&steps[r + 1], 0, sizeof(steps[r + 1]));
1148
- steps[w].axis = MKR_AXIS_DESCENDANT;
1149
- w++;
1150
- r += 2;
1151
- } else {
1152
- if (w != r) {
1153
- steps[w] = steps[r];
1154
- memset(&steps[r], 0, sizeof(steps[r]));
1155
- }
1156
- w++;
1157
- r++;
1158
- }
1159
- }
1160
- *nsteps_ptr = w;
1161
- }
1162
-
1163
- void
1164
- mkr_apply_peephole(mkr_node_t *n)
1165
- {
1166
- if (n == NULL) return;
1167
- switch (n->kind) {
1168
- case MKR_NK_FNCALL:
1169
- for (size_t i = 0; i < n->u.fncall.nargs; ++i) mkr_apply_peephole(n->u.fncall.args[i]);
1170
- break;
1171
- case MKR_NK_UNARY:
1172
- mkr_apply_peephole(n->u.unary.expr);
1173
- break;
1174
- case MKR_NK_BINOP:
1175
- mkr_apply_peephole(n->u.binop.lhs);
1176
- mkr_apply_peephole(n->u.binop.rhs);
1177
- break;
1178
- case MKR_NK_PATH:
1179
- fuse_descendant_or_self_steps(n->u.path.steps, &n->u.path.nsteps);
1180
- for (size_t i = 0; i < n->u.path.nsteps; ++i) {
1181
- for (size_t j = 0; j < n->u.path.steps[i].npredicates; ++j) {
1182
- mkr_apply_peephole(n->u.path.steps[i].predicates[j]);
1183
- }
1184
- }
1185
- break;
1186
- case MKR_NK_FILTER:
1187
- mkr_apply_peephole(n->u.filter.expr);
1188
- for (size_t i = 0; i < n->u.filter.npreds; ++i) mkr_apply_peephole(n->u.filter.preds[i]);
1189
- fuse_descendant_or_self_steps(n->u.filter.path_steps, &n->u.filter.npath);
1190
- for (size_t i = 0; i < n->u.filter.npath; ++i) {
1191
- for (size_t j = 0; j < n->u.filter.path_steps[i].npredicates; ++j) {
1192
- mkr_apply_peephole(n->u.filter.path_steps[i].predicates[j]);
1193
- }
1194
- }
1195
- break;
1196
- default:
1197
- break;
1198
- }
1199
- }
1200
-
1201
- void
1202
- mkr_node_clear_memos(mkr_node_t *n)
1203
- {
1204
- if (n == NULL) return;
1205
- if (n->memoized) {
1206
- mkr_val_clear(&n->memo_value);
1207
- n->memoized = 0;
1208
- }
1209
- switch (n->kind) {
1210
- case MKR_NK_FNCALL:
1211
- for (size_t i = 0; i < n->u.fncall.nargs; ++i) mkr_node_clear_memos(n->u.fncall.args[i]);
1212
- break;
1213
- case MKR_NK_UNARY:
1214
- mkr_node_clear_memos(n->u.unary.expr);
1215
- break;
1216
- case MKR_NK_BINOP:
1217
- mkr_node_clear_memos(n->u.binop.lhs);
1218
- mkr_node_clear_memos(n->u.binop.rhs);
1219
- break;
1220
- case MKR_NK_PATH:
1221
- for (size_t i = 0; i < n->u.path.nsteps; ++i) clear_memos_step(&n->u.path.steps[i]);
1222
- break;
1223
- case MKR_NK_FILTER:
1224
- mkr_node_clear_memos(n->u.filter.expr);
1225
- for (size_t i = 0; i < n->u.filter.npreds; ++i) mkr_node_clear_memos(n->u.filter.preds[i]);
1226
- for (size_t i = 0; i < n->u.filter.npath; ++i) clear_memos_step(&n->u.filter.path_steps[i]);
1227
- break;
1228
- default:
1229
- break;
1230
- }
1231
- }
1232
-
1233
- void
1234
- mkr_node_free(mkr_node_t *n)
1235
- {
1236
- if (n == NULL) return;
1237
- /* Free any memoized value first (idempotent). */
1238
- if (n->memoized) {
1239
- mkr_val_clear(&n->memo_value);
1240
- n->memoized = 0;
1241
- }
1242
- switch (n->kind) {
1243
- case MKR_NK_LITERAL_STR:
1244
- mkr_owned_text_clear(&n->u.literal);
1245
- break;
1246
- case MKR_NK_LITERAL_NUM:
1247
- break;
1248
- case MKR_NK_VARREF:
1249
- mkr_owned_text_clear(&n->u.varref.prefix);
1250
- mkr_owned_text_clear(&n->u.varref.name);
1251
- break;
1252
- case MKR_NK_FNCALL:
1253
- mkr_owned_text_clear(&n->u.fncall.prefix);
1254
- mkr_owned_text_clear(&n->u.fncall.name);
1255
- for (size_t i = 0; i < n->u.fncall.nargs; ++i) {
1256
- mkr_node_free(n->u.fncall.args[i]);
1257
- }
1258
- free(n->u.fncall.args);
1259
- break;
1260
- case MKR_NK_UNARY:
1261
- mkr_node_free(n->u.unary.expr);
1262
- break;
1263
- case MKR_NK_BINOP:
1264
- mkr_node_free(n->u.binop.lhs);
1265
- mkr_node_free(n->u.binop.rhs);
1266
- break;
1267
- case MKR_NK_PATH:
1268
- for (size_t i = 0; i < n->u.path.nsteps; ++i) {
1269
- mkr_step_clear(&n->u.path.steps[i]);
1270
- }
1271
- free(n->u.path.steps);
1272
- break;
1273
- case MKR_NK_FILTER:
1274
- mkr_node_free(n->u.filter.expr);
1275
- for (size_t i = 0; i < n->u.filter.npreds; ++i) {
1276
- mkr_node_free(n->u.filter.preds[i]);
1277
- }
1278
- free(n->u.filter.preds);
1279
- for (size_t i = 0; i < n->u.filter.npath; ++i) {
1280
- mkr_step_clear(&n->u.filter.path_steps[i]);
1281
- }
1282
- free(n->u.filter.path_steps);
1283
- break;
1284
- }
1285
- free(n);
1286
- }