makiri 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/conformance.yml +22 -0
  3. data/.github/workflows/libfuzzer.yml +83 -0
  4. data/.github/workflows/release.yml +12 -7
  5. data/.github/workflows/security.yml +88 -3
  6. data/.github/workflows/valgrind.yml +135 -0
  7. data/CHANGELOG.md +152 -15
  8. data/README.md +183 -13
  9. data/Rakefile +294 -7
  10. data/ext/makiri/bridge/bridge.h +28 -0
  11. data/ext/makiri/bridge/ruby_string.c +282 -12
  12. data/ext/makiri/core/mkr_alloc.c +40 -3
  13. data/ext/makiri/core/mkr_alloc.h +28 -5
  14. data/ext/makiri/core/mkr_buf.c +47 -3
  15. data/ext/makiri/core/mkr_buf.h +112 -3
  16. data/ext/makiri/core/mkr_core.c +143 -0
  17. data/ext/makiri/core/mkr_core.h +11 -2
  18. data/ext/makiri/core/mkr_hash.h +1 -1
  19. data/ext/makiri/core/mkr_span.h +186 -0
  20. data/ext/makiri/core/mkr_text.h +8 -8
  21. data/ext/makiri/core/mkr_utf8.c +101 -0
  22. data/ext/makiri/core/mkr_utf8.h +88 -0
  23. data/ext/makiri/extconf.rb +123 -10
  24. data/ext/makiri/fuzz/Makefile +95 -0
  25. data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
  26. data/ext/makiri/fuzz/xml_fuzz.c +24 -0
  27. data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
  28. data/ext/makiri/glue/glue.h +55 -11
  29. data/ext/makiri/glue/ruby_doc.c +129 -59
  30. data/ext/makiri/glue/ruby_html_css.c +292 -0
  31. data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +248 -52
  32. data/ext/makiri/glue/ruby_html_node.c +859 -0
  33. data/ext/makiri/glue/ruby_html_serialize.c +154 -0
  34. data/ext/makiri/glue/ruby_node.c +74 -729
  35. data/ext/makiri/glue/ruby_node_set.c +167 -32
  36. data/ext/makiri/glue/ruby_xml.c +602 -0
  37. data/ext/makiri/glue/ruby_xml_node.c +1373 -0
  38. data/ext/makiri/glue/ruby_xpath.c +63 -30
  39. data/ext/makiri/glue/ruby_xpath.h +19 -0
  40. data/ext/makiri/lexbor_compat/compat.h +42 -9
  41. data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
  42. data/ext/makiri/lexbor_compat/dom_index.c +2 -2
  43. data/ext/makiri/lexbor_compat/post_parse.c +100 -10
  44. data/ext/makiri/lexbor_compat/source_loc.c +15 -13
  45. data/ext/makiri/lexbor_compat/text_index.c +14 -8
  46. data/ext/makiri/lexbor_compat/utf8_input.c +19 -33
  47. data/ext/makiri/makiri.c +184 -6
  48. data/ext/makiri/makiri.h +43 -2
  49. data/ext/makiri/xml/mkr_xml.h +125 -0
  50. data/ext/makiri/xml/mkr_xml_chars.c +195 -0
  51. data/ext/makiri/xml/mkr_xml_index.c +169 -0
  52. data/ext/makiri/xml/mkr_xml_index.h +48 -0
  53. data/ext/makiri/xml/mkr_xml_mutate.c +817 -0
  54. data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
  55. data/ext/makiri/xml/mkr_xml_node.c +399 -0
  56. data/ext/makiri/xml/mkr_xml_node.h +184 -0
  57. data/ext/makiri/xml/mkr_xml_tree.c +1515 -0
  58. data/ext/makiri/xpath/mkr_css.c +1023 -0
  59. data/ext/makiri/xpath/mkr_css.h +65 -0
  60. data/ext/makiri/xpath/mkr_xpath.c +96 -32
  61. data/ext/makiri/xpath/mkr_xpath.h +109 -4
  62. data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
  63. data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
  64. data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +551 -241
  65. data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +318 -276
  66. data/ext/makiri/xpath/mkr_xpath_internal.h +177 -206
  67. data/ext/makiri/xpath/mkr_xpath_lex.c +95 -125
  68. data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
  69. data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +145 -0
  70. data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
  71. data/ext/makiri/xpath/mkr_xpath_parse.c +83 -94
  72. data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
  73. data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
  74. data/ext/makiri/xpath/mkr_xpath_shared.c +609 -0
  75. data/ext/makiri/xpath/mkr_xpath_value_body.h +801 -0
  76. data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
  77. data/lib/makiri/{attribute.rb → attr.rb} +7 -3
  78. data/lib/makiri/cdata_section.rb +19 -0
  79. data/lib/makiri/comment.rb +10 -0
  80. data/lib/makiri/compat_aliases.rb +30 -0
  81. data/lib/makiri/document.rb +9 -73
  82. data/lib/makiri/document_fragment.rb +14 -9
  83. data/lib/makiri/element.rb +4 -4
  84. data/lib/makiri/html/document.rb +106 -0
  85. data/lib/makiri/html/node_methods.rb +19 -0
  86. data/lib/makiri/html.rb +12 -0
  87. data/lib/makiri/node.rb +58 -15
  88. data/lib/makiri/node_set.rb +8 -0
  89. data/lib/makiri/processing_instruction.rb +10 -0
  90. data/lib/makiri/text.rb +1 -1
  91. data/lib/makiri/version.rb +1 -1
  92. data/lib/makiri/xml/builder.rb +263 -0
  93. data/lib/makiri/xml/document.rb +24 -0
  94. data/lib/makiri/xml/node_methods.rb +84 -0
  95. data/lib/makiri/xml.rb +10 -0
  96. data/lib/makiri/xpath_context.rb +1 -1
  97. data/lib/makiri.rb +24 -5
  98. data/script/build_native_gem.rb +2 -2
  99. data/script/check_alloc_failures.rb +266 -0
  100. data/script/check_c_safety.rb +77 -2
  101. data/script/check_c_safety_allowlist.yml +102 -0
  102. data/script/check_leaks.rb +64 -0
  103. data/script/leaks_harness.rb +64 -0
  104. data/vendor/lexbor/CMakeLists.txt +6 -0
  105. data/vendor/lexbor/README.md +12 -0
  106. data/vendor/lexbor/config.cmake +1 -1
  107. data/vendor/lexbor/source/lexbor/core/base.h +1 -1
  108. data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
  109. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
  110. data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
  111. data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
  112. data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
  113. data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
  114. data/vendor/lexbor/source/lexbor/html/base.h +1 -1
  115. data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
  116. data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
  117. data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
  118. data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
  119. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
  120. data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
  121. data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
  122. data/vendor/lexbor/source/lexbor/url/base.h +1 -1
  123. data/vendor/lexbor/source/lexbor/url/url.c +5 -2
  124. data/vendor/lexbor/source/lexbor/url/url.h +9 -0
  125. data/vendor/lexbor/version +1 -1
  126. metadata +53 -9
  127. data/ext/makiri/glue/ruby_css.c +0 -185
  128. data/ext/makiri/glue/ruby_serialize.c +0 -92
  129. data/ext/makiri/xpath/mkr_xpath_value.c +0 -1286
  130. data/lib/makiri/cdata.rb +0 -6
@@ -0,0 +1,609 @@
1
+ /* mkr_xpath_shared.c - the representation-independent engine primitives.
2
+ *
3
+ * Compiled exactly ONCE (one normal .c, not a monomorphized body): none of
4
+ * these functions dereferences a DOM node. They move node *pointers* (node-set
5
+ * build/clone/free), own/compare engine strings, manage the per-eval
6
+ * string-value cache and document-order-index lifecycles, and walk/destroy the
7
+ * AST. A pointer is a pointer whichever representation it points at, so the
8
+ * machine code is identical for HTML and XML -- one shared copy, not two.
9
+ *
10
+ * Contrast the engine bodies (mkr_xpath_{value,funcs,eval}_body.h): those are
11
+ * .h files precisely because they ARE compiled twice (mkr_xpath_engine_html.c /
12
+ * _xml.c include them with MKR_NODE_* bound to each representation). This file
13
+ * is compiled once, so its code lives directly in the .c -- there is nothing to
14
+ * include twice. mkr_xpath_internal.h is included WITHOUT a prelude, so
15
+ * MKR_DOM_NODE stays the neutral `void` default; the node pointers below are
16
+ * never dereferenced, so void* is exact.
17
+ *
18
+ * The driver (mkr_xpath.c), the parser/lexer, AND both engine instances call
19
+ * these by their bare names. Two are extern rather than file-static
20
+ * (mkr_str_cache_index_put, mkr_str_cache_reindex): the string-value cache
21
+ * splits its pure index bookkeeping (here) from its node-dereferencing insert
22
+ * (mkr_get_cached_node_text, in the per-instance value body), so both sides
23
+ * share the one index implementation. Pointer hashing is mkr_ptr_hash
24
+ * (core/mkr_hash.h) - the single pointer hash for every pointer-keyed index.
25
+ */
26
+ #include "mkr_xpath_internal.h"
27
+ #include "../core/mkr_core.h"
28
+
29
+ #include <stdint.h>
30
+ #include <stdlib.h>
31
+ #include <string.h>
32
+
33
+ /* Pointer hashing for the str-cache + doc-order index is shared: mkr_ptr_hash
34
+ * (core/mkr_hash.h), the one pointer hash for every pointer-keyed index. */
35
+
36
+ /* ---------- node-set ---------- */
37
+
38
+ void
39
+ mkr_nodeset_init(mkr_nodeset_t *ns)
40
+ {
41
+ ns->items = NULL;
42
+ ns->count = 0;
43
+ ns->capacity = 0;
44
+ }
45
+
46
+ int
47
+ mkr_nodeset_push(mkr_nodeset_t *ns, MKR_DOM_NODE *node,
48
+ mkr_xpath_limits_t *limits, mkr_xpath_error_t *err)
49
+ {
50
+ if (node == NULL) return 0;
51
+ if (limits != NULL && mkr_limit_check_nodeset_size(limits, ns->count + 1, err) != 0) {
52
+ return -1;
53
+ }
54
+ if (mkr_grow_reserve((void **)&ns->items, &ns->capacity, ns->count + 1,
55
+ sizeof(*ns->items)) != MKR_OK) {
56
+ mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory growing node-set");
57
+ return -1;
58
+ }
59
+ ns->items[ns->count++] = node;
60
+ return 0;
61
+ }
62
+
63
+ void
64
+ mkr_nodeset_clear(mkr_nodeset_t *ns)
65
+ {
66
+ if (ns == NULL) return;
67
+ free(ns->items);
68
+ ns->items = NULL;
69
+ ns->count = 0;
70
+ ns->capacity = 0;
71
+ }
72
+
73
+ /* ---------- owned / borrowed text ---------- */
74
+
75
+ void
76
+ mkr_owned_text_init(mkr_owned_text_t *t)
77
+ {
78
+ if (t == NULL) return;
79
+ t->ptr = NULL;
80
+ t->len = 0;
81
+ }
82
+
83
+ void
84
+ mkr_owned_text_clear(mkr_owned_text_t *t)
85
+ {
86
+ if (t == NULL) return;
87
+ free(t->ptr);
88
+ t->ptr = NULL;
89
+ t->len = 0;
90
+ }
91
+
92
+ int
93
+ mkr_borrowed_text_eq(mkr_borrowed_text_t a, mkr_borrowed_text_t b)
94
+ {
95
+ /* Delegate to the audited core primitive: equal lengths AND (len 0 OR
96
+ * memcmp). A zero-length view is equal regardless of ptr, so a NULL-repr
97
+ * empty and a ""-repr empty compare equal and NULL is never dereferenced. */
98
+ return mkr_bytes_eq(a.ptr, a.len, b.ptr, b.len);
99
+ }
100
+
101
+ /* Copy an already-valid borrowed text into owned storage. Taking
102
+ * mkr_borrowed_text_t (not raw char*+len) keeps the type contract: an
103
+ * mkr_owned_text_t can only be minted from text the caller has asserted valid
104
+ * (via mkr_borrowed_text / mkr_borrowed_text_from_verified /
105
+ * mkr_borrowed_text_from_owned), so every raw-bytes -> text entry point is
106
+ * greppable. */
107
+ int
108
+ mkr_owned_text_from_borrowed_copy(mkr_owned_text_t *out, mkr_borrowed_text_t t,
109
+ mkr_xpath_error_t *err, const char *what)
110
+ {
111
+ if (out == NULL) {
112
+ mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_owned_text_from_borrowed_copy: bad args");
113
+ return -1;
114
+ }
115
+ mkr_owned_text_init(out);
116
+ const char *s = t.ptr ? t.ptr : "";
117
+ size_t len = t.ptr ? t.len : 0;
118
+ char *p = mkr_strndup(s, len);
119
+ if (p == NULL) {
120
+ mkr_err_set(err, MKR_XPATH_ERR_OOM, what ? what : "out of memory copying text");
121
+ return -1;
122
+ }
123
+ out->ptr = p;
124
+ out->len = len;
125
+ return 0;
126
+ }
127
+
128
+ /* ---------- value ---------- */
129
+
130
+ void
131
+ mkr_val_clear(mkr_val_t *v)
132
+ {
133
+ if (v == NULL) return;
134
+ switch (v->type) {
135
+ case MKR_XPATH_TYPE_NODESET:
136
+ mkr_nodeset_clear(&v->u.nodeset);
137
+ break;
138
+ case MKR_XPATH_TYPE_STRING:
139
+ mkr_owned_text_clear(&v->u.string);
140
+ break;
141
+ default:
142
+ break;
143
+ }
144
+ memset(v, 0, sizeof(*v));
145
+ }
146
+
147
+ void
148
+ mkr_val_set_owned_text(mkr_val_t *v, mkr_owned_text_t text)
149
+ {
150
+ if (v == NULL) return;
151
+ v->type = MKR_XPATH_TYPE_STRING;
152
+ v->u.string = text;
153
+ }
154
+
155
+ /* Set +v+ to a STRING by copying a borrowed view: the engine allocates and owns
156
+ * the copy. This is how callers outside the engine (the glue handler bridge)
157
+ * hand a string into a value - they pass what they have, a borrowed slice, and
158
+ * never construct an mkr_owned_text_t themselves. Keeping the copy-and-own step
159
+ * here keeps allocation and freeing of owned strings in one layer. Returns 0 on
160
+ * success, -1 on OOM (err populated; +v+ left untouched). */
161
+ int
162
+ mkr_val_set_borrowed_text_copy(mkr_val_t *v, mkr_borrowed_text_t text,
163
+ mkr_xpath_error_t *err, const char *what)
164
+ {
165
+ if (v == NULL) {
166
+ mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_val_set_borrowed_text_copy: bad args");
167
+ return -1;
168
+ }
169
+ mkr_owned_text_t owned;
170
+ if (mkr_owned_text_from_borrowed_copy(&owned, text, err, what) != 0) {
171
+ return -1;
172
+ }
173
+ mkr_val_set_owned_text(v, owned);
174
+ return 0;
175
+ }
176
+
177
+ /* ---------- per-evaluate document-order index (lifecycle) ---------- */
178
+
179
+ void
180
+ mkr_doc_order_index_init(mkr_doc_order_index_t *idx)
181
+ {
182
+ idx->buckets = NULL;
183
+ idx->cap = 0;
184
+ idx->count = 0;
185
+ idx->built = 0;
186
+ }
187
+
188
+ void
189
+ mkr_doc_order_index_clear(mkr_doc_order_index_t *idx)
190
+ {
191
+ if (idx == NULL) return;
192
+ free(idx->buckets);
193
+ idx->buckets = NULL;
194
+ idx->cap = 0;
195
+ idx->count = 0;
196
+ idx->built = 0;
197
+ }
198
+
199
+ /* ---------- per-evaluation string-value cache (lifecycle + index) ---------- */
200
+
201
+ void
202
+ mkr_str_cache_init(mkr_str_cache_t *c)
203
+ {
204
+ c->entries = NULL;
205
+ c->count = 0;
206
+ c->cap = 0;
207
+ c->buckets = NULL;
208
+ c->bucket_cap = 0;
209
+ c->total_bytes = 0;
210
+ }
211
+
212
+ /* Insert entry index `idx` (keyed by entries[idx].node) into the index. The
213
+ * index must have room (callers grow/rehash first). Extern: shared by the pure
214
+ * reindex below and the node-dereferencing insert in the per-instance body. */
215
+ void
216
+ mkr_str_cache_index_put(mkr_str_cache_t *c, size_t idx)
217
+ {
218
+ size_t mask = c->bucket_cap - 1;
219
+ size_t j = mkr_ptr_hash(c->entries[idx].node) & mask;
220
+ while (c->buckets[j] != 0) {
221
+ j = (j + 1) & mask;
222
+ }
223
+ c->buckets[j] = idx + 1;
224
+ }
225
+
226
+ /* Rebuild the index from entries[0, count). Returns -1 on OOM. Extern: see above. */
227
+ int
228
+ mkr_str_cache_reindex(mkr_str_cache_t *c, size_t bucket_cap)
229
+ {
230
+ size_t *buckets = mkr_callocarray(bucket_cap, sizeof(*buckets));
231
+ if (buckets == NULL) return -1;
232
+ free(c->buckets);
233
+ c->buckets = buckets;
234
+ c->bucket_cap = bucket_cap;
235
+ for (size_t i = 0; i < c->count; ++i) {
236
+ mkr_str_cache_index_put(c, i);
237
+ }
238
+ return 0;
239
+ }
240
+
241
+ void
242
+ mkr_str_cache_truncate(mkr_str_cache_t *c, size_t target_count)
243
+ {
244
+ if (c == NULL || target_count >= c->count) return;
245
+ for (size_t i = target_count; i < c->count; ++i) {
246
+ if (c->total_bytes >= c->entries[i].len) {
247
+ c->total_bytes -= c->entries[i].len;
248
+ } else {
249
+ c->total_bytes = 0;
250
+ }
251
+ free(c->entries[i].str);
252
+ }
253
+ c->count = target_count;
254
+ /* Drop the removed nodes from the index. A full truncate just clears it;
255
+ * a partial one (nested-eval snapshot restore) rebuilds from what remains. */
256
+ if (c->buckets != NULL) {
257
+ if (target_count == 0) {
258
+ size_t buckets_bytes;
259
+ if (!mkr_size_mul(c->bucket_cap, sizeof(*c->buckets), &buckets_bytes)) {
260
+ free(c->buckets);
261
+ c->buckets = NULL;
262
+ c->bucket_cap = 0;
263
+ c->total_bytes = 0;
264
+ return;
265
+ }
266
+ memset(c->buckets, 0, buckets_bytes);
267
+ c->total_bytes = 0;
268
+ } else {
269
+ mkr_str_cache_reindex(c, c->bucket_cap);
270
+ }
271
+ }
272
+ }
273
+
274
+ void
275
+ mkr_str_cache_clear(mkr_str_cache_t *c)
276
+ {
277
+ if (c == NULL) return;
278
+ for (size_t i = 0; i < c->count; ++i) {
279
+ free(c->entries[i].str);
280
+ }
281
+ free(c->entries);
282
+ free(c->buckets);
283
+ c->entries = NULL;
284
+ c->count = 0;
285
+ c->cap = 0;
286
+ c->buckets = NULL;
287
+ c->bucket_cap = 0;
288
+ c->total_bytes = 0;
289
+ }
290
+
291
+ /* ---------- AST destructors ---------- */
292
+
293
+ void
294
+ mkr_step_clear(mkr_step_t *s)
295
+ {
296
+ if (s == NULL) return;
297
+ mkr_owned_text_clear(&s->test.prefix);
298
+ mkr_owned_text_clear(&s->test.local);
299
+ mkr_owned_text_clear(&s->test.pi_target);
300
+ for (size_t i = 0; i < s->npredicates; ++i) {
301
+ mkr_node_free(s->predicates[i]);
302
+ }
303
+ free(s->predicates);
304
+ memset(s, 0, sizeof(*s));
305
+ }
306
+
307
+ /* ---------- AST hoisting helpers ---------- */
308
+
309
/* Decide whether the unprefixed function +name+, called with +nargs+
 * arguments, belongs to the conservative allowlist of built-ins that may be
 * hoisted when all of their arguments are context-independent.
 *
 * With zero arguments only "true" and "false" qualify. With one or more
 * arguments the name must appear in the explicit table below; the arity
 * itself is not validated here. Anything not listed (including functions that
 * read the context node or dynamic state) yields 0, as does a NULL +name+.
 *
 * Returns 1 when the call is on the allowlist, 0 otherwise. */
static int
is_pure_builtin_name(const char *name, size_t nargs)
{
    /* NULL-terminated allowlist for calls that take at least one argument;
     * the caller checks that those arguments are themselves CI. */
    static const char *const hoistable[] = {
        "count", "string-length", "number", "boolean", "not",
        "floor", "ceiling", "round", "sum",
        "concat", "starts-with", "contains",
        "substring-before", "substring-after", "substring",
        "translate",
        NULL,
    };

    if (name == NULL) {
        return 0;
    }

    if (nargs == 0) {
        /* Zero-argument calls: only the two constants read no input. */
        if (strcmp(name, "true") == 0) {
            return 1;
        }
        return strcmp(name, "false") == 0;
    }

    const char *const *entry = hoistable;
    while (*entry != NULL) {
        if (strcmp(*entry, name) == 0) {
            return 1;
        }
        entry++;
    }
    return 0;
}
337
+
338
+ static void
339
+ mark_step_predicates(mkr_step_t *s)
340
+ {
341
+ for (size_t i = 0; i < s->npredicates; ++i) {
342
+ mkr_mark_context_independent(s->predicates[i]);
343
+ }
344
+ }
345
+
346
+ void
347
+ mkr_mark_context_independent(mkr_node_t *n)
348
+ {
349
+ if (n == NULL) return;
350
+ int ci = 0;
351
+ switch (n->kind) {
352
+ case MKR_NK_LITERAL_STR:
353
+ case MKR_NK_LITERAL_NUM:
354
+ ci = 1;
355
+ break;
356
+ case MKR_NK_VARREF:
357
+ /* Conservative: variables not hoisted even though XPath 1.0 says
358
+ * they're fixed per evaluation. */
359
+ ci = 0;
360
+ break;
361
+ case MKR_NK_FNCALL: {
362
+ /* Recurse first so subtrees get their own CI marks even when this
363
+ * call itself is not hoistable. */
364
+ for (size_t i = 0; i < n->u.fncall.nargs; ++i) {
365
+ mkr_mark_context_independent(n->u.fncall.args[i]);
366
+ }
367
+ if (n->u.fncall.prefix.ptr != NULL) {
368
+ ci = 0; /* Handler-routed or namespaced builtins → non-CI. */
369
+ break;
370
+ }
371
+ if (!is_pure_builtin_name(n->u.fncall.name.ptr, n->u.fncall.nargs)) {
372
+ ci = 0;
373
+ break;
374
+ }
375
+ ci = 1;
376
+ for (size_t i = 0; i < n->u.fncall.nargs; ++i) {
377
+ if (!n->u.fncall.args[i]->is_context_independent) { ci = 0; break; }
378
+ }
379
+ break;
380
+ }
381
+ case MKR_NK_UNARY:
382
+ mkr_mark_context_independent(n->u.unary.expr);
383
+ ci = n->u.unary.expr ? n->u.unary.expr->is_context_independent : 0;
384
+ break;
385
+ case MKR_NK_BINOP:
386
+ mkr_mark_context_independent(n->u.binop.lhs);
387
+ mkr_mark_context_independent(n->u.binop.rhs);
388
+ ci = (n->u.binop.lhs && n->u.binop.lhs->is_context_independent)
389
+ && (n->u.binop.rhs && n->u.binop.rhs->is_context_independent);
390
+ break;
391
+ case MKR_NK_PATH:
392
+ /* Absolute path is CI: seed is the document root regardless of
393
+ * outer context. Relative paths use the outer context node and
394
+ * are not hoistable. Predicates inside the path are evaluated
395
+ * against the path's own context, so their position()/last() do
396
+ * not leak - recurse so any pure sub-expressions still get marks. */
397
+ ci = n->u.path.absolute ? 1 : 0;
398
+ for (size_t i = 0; i < n->u.path.nsteps; ++i) {
399
+ mark_step_predicates(&n->u.path.steps[i]);
400
+ }
401
+ break;
402
+ case MKR_NK_FILTER:
403
+ /* Conservative: filter expressions are not hoisted in v1. */
404
+ ci = 0;
405
+ mkr_mark_context_independent(n->u.filter.expr);
406
+ for (size_t i = 0; i < n->u.filter.npreds; ++i) {
407
+ mkr_mark_context_independent(n->u.filter.preds[i]);
408
+ }
409
+ for (size_t i = 0; i < n->u.filter.npath; ++i) {
410
+ mark_step_predicates(&n->u.filter.path_steps[i]);
411
+ }
412
+ break;
413
+ }
414
+ n->is_context_independent = (uint8_t)ci;
415
+ }
416
+
417
+ static void
418
+ clear_memos_step(mkr_step_t *s)
419
+ {
420
+ for (size_t i = 0; i < s->npredicates; ++i) {
421
+ mkr_node_clear_memos(s->predicates[i]);
422
+ }
423
+ }
424
+
425
+ /* ---------- peephole: //X fusion ---------- */
426
+
427
+ /*
428
+ * Collapse pairs of consecutive steps:
429
+ * (axis=descendant-or-self, test=node(), no predicates)
430
+ * (axis=child, test=*, no predicates)
431
+ * into a single
432
+ * (axis=descendant, test=*, no predicates)
433
+ *
434
+ * The fusion is safe per XPath 1.0 only when the child step has no
435
+ * predicates: otherwise '//X[1]' would change meaning ("first X per
436
+ * parent" vs "first X in doc order"). The synthesised // step always
437
+ * has no predicates by construction, so we don't need to check the
438
+ * first step's predicate list - only the child step's.
439
+ */
440
+ static void
441
+ fuse_descendant_or_self_steps(mkr_step_t *steps, size_t *nsteps_ptr)
442
+ {
443
+ if (steps == NULL || *nsteps_ptr < 2) return;
444
+ size_t nsteps = *nsteps_ptr;
445
+ size_t w = 0, r = 0;
446
+ while (r < nsteps) {
447
+ if (r + 1 < nsteps
448
+ && steps[r].axis == MKR_AXIS_DESCENDANT_OR_SELF
449
+ && steps[r].test.kind == MKR_NT_NODE
450
+ && steps[r].test.prefix.ptr == NULL
451
+ && steps[r].npredicates == 0
452
+ && steps[r + 1].axis == MKR_AXIS_CHILD
453
+ && steps[r + 1].npredicates == 0) {
454
+ /* Drop the desc-or-self step and promote the child step. */
455
+ mkr_step_clear(&steps[r]);
456
+ steps[w] = steps[r + 1];
457
+ memset(&steps[r + 1], 0, sizeof(steps[r + 1]));
458
+ steps[w].axis = MKR_AXIS_DESCENDANT;
459
+ w++;
460
+ r += 2;
461
+ } else {
462
+ if (w != r) {
463
+ steps[w] = steps[r];
464
+ memset(&steps[r], 0, sizeof(steps[r]));
465
+ }
466
+ w++;
467
+ r++;
468
+ }
469
+ }
470
+ *nsteps_ptr = w;
471
+ }
472
+
473
+ void
474
+ mkr_apply_peephole(mkr_node_t *n)
475
+ {
476
+ if (n == NULL) return;
477
+ switch (n->kind) {
478
+ case MKR_NK_FNCALL:
479
+ for (size_t i = 0; i < n->u.fncall.nargs; ++i) mkr_apply_peephole(n->u.fncall.args[i]);
480
+ break;
481
+ case MKR_NK_UNARY:
482
+ mkr_apply_peephole(n->u.unary.expr);
483
+ break;
484
+ case MKR_NK_BINOP:
485
+ mkr_apply_peephole(n->u.binop.lhs);
486
+ mkr_apply_peephole(n->u.binop.rhs);
487
+ break;
488
+ case MKR_NK_PATH:
489
+ fuse_descendant_or_self_steps(n->u.path.steps, &n->u.path.nsteps);
490
+ for (size_t i = 0; i < n->u.path.nsteps; ++i) {
491
+ for (size_t j = 0; j < n->u.path.steps[i].npredicates; ++j) {
492
+ mkr_apply_peephole(n->u.path.steps[i].predicates[j]);
493
+ }
494
+ }
495
+ break;
496
+ case MKR_NK_FILTER:
497
+ mkr_apply_peephole(n->u.filter.expr);
498
+ for (size_t i = 0; i < n->u.filter.npreds; ++i) mkr_apply_peephole(n->u.filter.preds[i]);
499
+ fuse_descendant_or_self_steps(n->u.filter.path_steps, &n->u.filter.npath);
500
+ for (size_t i = 0; i < n->u.filter.npath; ++i) {
501
+ for (size_t j = 0; j < n->u.filter.path_steps[i].npredicates; ++j) {
502
+ mkr_apply_peephole(n->u.filter.path_steps[i].predicates[j]);
503
+ }
504
+ }
505
+ break;
506
+ default:
507
+ break;
508
+ }
509
+ }
510
+
511
+ mkr_node_t *
512
+ mkr_node_alloc(mkr_xpath_limits_t *limits, mkr_xpath_error_t *err, mkr_nk_t kind)
513
+ {
514
+ if (mkr_limit_ast_node(limits, err) != 0) return NULL;
515
+ mkr_node_t *n = mkr_callocarray(1, sizeof(*n));
516
+ if (n == NULL) {
517
+ mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory allocating AST node");
518
+ return NULL;
519
+ }
520
+ n->kind = kind;
521
+ return n;
522
+ }
523
+
524
+ void
525
+ mkr_node_clear_memos(mkr_node_t *n)
526
+ {
527
+ if (n == NULL) return;
528
+ if (n->memoized) {
529
+ mkr_val_clear(&n->memo_value);
530
+ n->memoized = 0;
531
+ }
532
+ switch (n->kind) {
533
+ case MKR_NK_FNCALL:
534
+ for (size_t i = 0; i < n->u.fncall.nargs; ++i) mkr_node_clear_memos(n->u.fncall.args[i]);
535
+ break;
536
+ case MKR_NK_UNARY:
537
+ mkr_node_clear_memos(n->u.unary.expr);
538
+ break;
539
+ case MKR_NK_BINOP:
540
+ mkr_node_clear_memos(n->u.binop.lhs);
541
+ mkr_node_clear_memos(n->u.binop.rhs);
542
+ break;
543
+ case MKR_NK_PATH:
544
+ for (size_t i = 0; i < n->u.path.nsteps; ++i) clear_memos_step(&n->u.path.steps[i]);
545
+ break;
546
+ case MKR_NK_FILTER:
547
+ mkr_node_clear_memos(n->u.filter.expr);
548
+ for (size_t i = 0; i < n->u.filter.npreds; ++i) mkr_node_clear_memos(n->u.filter.preds[i]);
549
+ for (size_t i = 0; i < n->u.filter.npath; ++i) clear_memos_step(&n->u.filter.path_steps[i]);
550
+ break;
551
+ default:
552
+ break;
553
+ }
554
+ }
555
+
556
+ void
557
+ mkr_node_free(mkr_node_t *n)
558
+ {
559
+ if (n == NULL) return;
560
+ /* Free any memoized value first (idempotent). */
561
+ if (n->memoized) {
562
+ mkr_val_clear(&n->memo_value);
563
+ n->memoized = 0;
564
+ }
565
+ switch (n->kind) {
566
+ case MKR_NK_LITERAL_STR:
567
+ mkr_owned_text_clear(&n->u.literal);
568
+ break;
569
+ case MKR_NK_LITERAL_NUM:
570
+ break;
571
+ case MKR_NK_VARREF:
572
+ mkr_owned_text_clear(&n->u.varref.prefix);
573
+ mkr_owned_text_clear(&n->u.varref.name);
574
+ break;
575
+ case MKR_NK_FNCALL:
576
+ mkr_owned_text_clear(&n->u.fncall.prefix);
577
+ mkr_owned_text_clear(&n->u.fncall.name);
578
+ for (size_t i = 0; i < n->u.fncall.nargs; ++i) {
579
+ mkr_node_free(n->u.fncall.args[i]);
580
+ }
581
+ free(n->u.fncall.args);
582
+ break;
583
+ case MKR_NK_UNARY:
584
+ mkr_node_free(n->u.unary.expr);
585
+ break;
586
+ case MKR_NK_BINOP:
587
+ mkr_node_free(n->u.binop.lhs);
588
+ mkr_node_free(n->u.binop.rhs);
589
+ break;
590
+ case MKR_NK_PATH:
591
+ for (size_t i = 0; i < n->u.path.nsteps; ++i) {
592
+ mkr_step_clear(&n->u.path.steps[i]);
593
+ }
594
+ free(n->u.path.steps);
595
+ break;
596
+ case MKR_NK_FILTER:
597
+ mkr_node_free(n->u.filter.expr);
598
+ for (size_t i = 0; i < n->u.filter.npreds; ++i) {
599
+ mkr_node_free(n->u.filter.preds[i]);
600
+ }
601
+ free(n->u.filter.preds);
602
+ for (size_t i = 0; i < n->u.filter.npath; ++i) {
603
+ mkr_step_clear(&n->u.filter.path_steps[i]);
604
+ }
605
+ free(n->u.filter.path_steps);
606
+ break;
607
+ }
608
+ free(n);
609
+ }