makiri 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/conformance.yml +22 -0
- data/.github/workflows/libfuzzer.yml +83 -0
- data/.github/workflows/release.yml +12 -7
- data/.github/workflows/security.yml +88 -3
- data/.github/workflows/valgrind.yml +135 -0
- data/CHANGELOG.md +152 -15
- data/README.md +183 -13
- data/Rakefile +294 -7
- data/ext/makiri/bridge/bridge.h +28 -0
- data/ext/makiri/bridge/ruby_string.c +282 -12
- data/ext/makiri/core/mkr_alloc.c +40 -3
- data/ext/makiri/core/mkr_alloc.h +28 -5
- data/ext/makiri/core/mkr_buf.c +47 -3
- data/ext/makiri/core/mkr_buf.h +112 -3
- data/ext/makiri/core/mkr_core.c +143 -0
- data/ext/makiri/core/mkr_core.h +11 -2
- data/ext/makiri/core/mkr_hash.h +1 -1
- data/ext/makiri/core/mkr_span.h +186 -0
- data/ext/makiri/core/mkr_text.h +8 -8
- data/ext/makiri/core/mkr_utf8.c +101 -0
- data/ext/makiri/core/mkr_utf8.h +88 -0
- data/ext/makiri/extconf.rb +123 -10
- data/ext/makiri/fuzz/Makefile +95 -0
- data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
- data/ext/makiri/fuzz/xml_fuzz.c +24 -0
- data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
- data/ext/makiri/glue/glue.h +55 -11
- data/ext/makiri/glue/ruby_doc.c +129 -59
- data/ext/makiri/glue/ruby_html_css.c +292 -0
- data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +248 -52
- data/ext/makiri/glue/ruby_html_node.c +859 -0
- data/ext/makiri/glue/ruby_html_serialize.c +154 -0
- data/ext/makiri/glue/ruby_node.c +74 -729
- data/ext/makiri/glue/ruby_node_set.c +167 -32
- data/ext/makiri/glue/ruby_xml.c +602 -0
- data/ext/makiri/glue/ruby_xml_node.c +1373 -0
- data/ext/makiri/glue/ruby_xpath.c +63 -30
- data/ext/makiri/glue/ruby_xpath.h +19 -0
- data/ext/makiri/lexbor_compat/compat.h +42 -9
- data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
- data/ext/makiri/lexbor_compat/dom_index.c +2 -2
- data/ext/makiri/lexbor_compat/post_parse.c +100 -10
- data/ext/makiri/lexbor_compat/source_loc.c +15 -13
- data/ext/makiri/lexbor_compat/text_index.c +14 -8
- data/ext/makiri/lexbor_compat/utf8_input.c +19 -33
- data/ext/makiri/makiri.c +184 -6
- data/ext/makiri/makiri.h +43 -2
- data/ext/makiri/xml/mkr_xml.h +125 -0
- data/ext/makiri/xml/mkr_xml_chars.c +195 -0
- data/ext/makiri/xml/mkr_xml_index.c +169 -0
- data/ext/makiri/xml/mkr_xml_index.h +48 -0
- data/ext/makiri/xml/mkr_xml_mutate.c +817 -0
- data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
- data/ext/makiri/xml/mkr_xml_node.c +399 -0
- data/ext/makiri/xml/mkr_xml_node.h +184 -0
- data/ext/makiri/xml/mkr_xml_tree.c +1515 -0
- data/ext/makiri/xpath/mkr_css.c +1023 -0
- data/ext/makiri/xpath/mkr_css.h +65 -0
- data/ext/makiri/xpath/mkr_xpath.c +96 -32
- data/ext/makiri/xpath/mkr_xpath.h +109 -4
- data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
- data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
- data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +551 -241
- data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +318 -276
- data/ext/makiri/xpath/mkr_xpath_internal.h +177 -206
- data/ext/makiri/xpath/mkr_xpath_lex.c +95 -125
- data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
- data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +145 -0
- data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
- data/ext/makiri/xpath/mkr_xpath_parse.c +83 -94
- data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
- data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
- data/ext/makiri/xpath/mkr_xpath_shared.c +609 -0
- data/ext/makiri/xpath/mkr_xpath_value_body.h +801 -0
- data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
- data/lib/makiri/{attribute.rb → attr.rb} +7 -3
- data/lib/makiri/cdata_section.rb +19 -0
- data/lib/makiri/comment.rb +10 -0
- data/lib/makiri/compat_aliases.rb +30 -0
- data/lib/makiri/document.rb +9 -73
- data/lib/makiri/document_fragment.rb +14 -9
- data/lib/makiri/element.rb +4 -4
- data/lib/makiri/html/document.rb +106 -0
- data/lib/makiri/html/node_methods.rb +19 -0
- data/lib/makiri/html.rb +12 -0
- data/lib/makiri/node.rb +58 -15
- data/lib/makiri/node_set.rb +8 -0
- data/lib/makiri/processing_instruction.rb +10 -0
- data/lib/makiri/text.rb +1 -1
- data/lib/makiri/version.rb +1 -1
- data/lib/makiri/xml/builder.rb +263 -0
- data/lib/makiri/xml/document.rb +24 -0
- data/lib/makiri/xml/node_methods.rb +84 -0
- data/lib/makiri/xml.rb +10 -0
- data/lib/makiri/xpath_context.rb +1 -1
- data/lib/makiri.rb +24 -5
- data/script/build_native_gem.rb +2 -2
- data/script/check_alloc_failures.rb +266 -0
- data/script/check_c_safety.rb +77 -2
- data/script/check_c_safety_allowlist.yml +102 -0
- data/script/check_leaks.rb +64 -0
- data/script/leaks_harness.rb +64 -0
- data/vendor/lexbor/CMakeLists.txt +6 -0
- data/vendor/lexbor/README.md +12 -0
- data/vendor/lexbor/config.cmake +1 -1
- data/vendor/lexbor/source/lexbor/core/base.h +1 -1
- data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
- data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
- data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
- data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
- data/vendor/lexbor/source/lexbor/html/base.h +1 -1
- data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
- data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
- data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
- data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
- data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
- data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
- data/vendor/lexbor/source/lexbor/url/base.h +1 -1
- data/vendor/lexbor/source/lexbor/url/url.c +5 -2
- data/vendor/lexbor/source/lexbor/url/url.h +9 -0
- data/vendor/lexbor/version +1 -1
- metadata +53 -9
- data/ext/makiri/glue/ruby_css.c +0 -185
- data/ext/makiri/glue/ruby_serialize.c +0 -92
- data/ext/makiri/xpath/mkr_xpath_value.c +0 -1286
- data/lib/makiri/cdata.rb +0 -6
|
@@ -0,0 +1,801 @@
|
|
|
1
|
+
#include "mkr_xpath_internal.h"
|
|
2
|
+
#include "../core/mkr_core.h"
|
|
3
|
+
|
|
4
|
+
#include <lexbor/dom/dom.h>
|
|
5
|
+
#include <math.h>
|
|
6
|
+
#include <stdio.h>
|
|
7
|
+
#include <stdlib.h>
|
|
8
|
+
#include <string.h>
|
|
9
|
+
|
|
10
|
+
/*
|
|
11
|
+
* Per-instance value model: the node-DEREFERENCING half of the runtime values
|
|
12
|
+
* - node string-value construction (XPath 1.0 §5), the value coercions that
|
|
13
|
+
* read a node-set's first node, document-order comparison/sort, and the
|
|
14
|
+
* string-value cache's node-keyed insert. Compiled once per representation
|
|
15
|
+
* (HTML / XML) with MKR_NODE_* bound by the including prelude.
|
|
16
|
+
*
|
|
17
|
+
* Every function here is file-static: it is reachable only from the other
|
|
18
|
+
* per-instance bodies (funcs / eval) in the same merged engine translation
|
|
19
|
+
* unit. The representation-INDEPENDENT primitives it leans on (node-set build,
|
|
20
|
+
* owned/borrowed text, str-cache + doc-order lifecycle, AST destructors) are the
|
|
21
|
+
* shared, bare-named functions in mkr_xpath_shared.c, declared in
|
|
22
|
+
* mkr_xpath_internal.h.
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
/* Forward declarations for the two coercions used before their definition
|
|
26
|
+
* (mkr_val_to_number_or_fail reads both). They are static, so there is no
|
|
27
|
+
* declaration in the shared internal header to cover the forward reference. */
|
|
28
|
+
static double mkr_borrowed_text_to_number(mkr_borrowed_text_t t);
|
|
29
|
+
static double mkr_val_to_number_unchecked(const mkr_val_t *v);
|
|
30
|
+
|
|
31
|
+
/* ---------- owned-text from a steal-able buffer ---------- */
|
|
32
|
+
|
|
33
|
+
static int
|
|
34
|
+
mkr_owned_text_from_buf_steal(mkr_owned_text_t *out, mkr_buf_t *buf,
|
|
35
|
+
mkr_xpath_error_t *err, const char *what)
|
|
36
|
+
{
|
|
37
|
+
if (out == NULL || buf == NULL) {
|
|
38
|
+
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_owned_text_from_buf_steal: bad args");
|
|
39
|
+
return -1;
|
|
40
|
+
}
|
|
41
|
+
mkr_owned_text_init(out);
|
|
42
|
+
size_t len = 0;
|
|
43
|
+
char *p = mkr_buf_steal(buf, &len);
|
|
44
|
+
if (p == NULL) {
|
|
45
|
+
mkr_err_set(err, MKR_XPATH_ERR_OOM, what ? what : "out of memory stealing text buffer");
|
|
46
|
+
return -1;
|
|
47
|
+
}
|
|
48
|
+
out->ptr = p;
|
|
49
|
+
out->len = len;
|
|
50
|
+
return 0;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
/* ---------- value clone ---------- */
|
|
54
|
+
|
|
55
|
+
static int
|
|
56
|
+
mkr_val_clone(const mkr_val_t *src, mkr_val_t *dst, mkr_xpath_error_t *err)
|
|
57
|
+
{
|
|
58
|
+
if (src == NULL || dst == NULL) {
|
|
59
|
+
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_val_clone: bad args");
|
|
60
|
+
return -1;
|
|
61
|
+
}
|
|
62
|
+
memset(dst, 0, sizeof(*dst));
|
|
63
|
+
dst->type = src->type;
|
|
64
|
+
switch (src->type) {
|
|
65
|
+
case MKR_XPATH_TYPE_STRING: {
|
|
66
|
+
mkr_owned_text_t text;
|
|
67
|
+
if (mkr_owned_text_from_borrowed_copy(&text, mkr_borrowed_text_from_owned(src->u.string),
|
|
68
|
+
err, "out of memory cloning string value") != 0) return -1;
|
|
69
|
+
mkr_val_set_owned_text(dst, text);
|
|
70
|
+
return 0;
|
|
71
|
+
}
|
|
72
|
+
case MKR_XPATH_TYPE_NUMBER:
|
|
73
|
+
dst->u.number = src->u.number;
|
|
74
|
+
return 0;
|
|
75
|
+
case MKR_XPATH_TYPE_BOOLEAN:
|
|
76
|
+
dst->u.boolean = src->u.boolean;
|
|
77
|
+
return 0;
|
|
78
|
+
case MKR_XPATH_TYPE_NODESET: {
|
|
79
|
+
size_t n = src->u.nodeset.count;
|
|
80
|
+
mkr_nodeset_init(&dst->u.nodeset);
|
|
81
|
+
if (n == 0) return 0;
|
|
82
|
+
void **items;
|
|
83
|
+
size_t items_bytes;
|
|
84
|
+
if (!mkr_size_mul(n, sizeof(*items), &items_bytes)) {
|
|
85
|
+
mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory cloning node-set");
|
|
86
|
+
return -1;
|
|
87
|
+
}
|
|
88
|
+
items = mkr_reallocarray(NULL, n, sizeof(*items));
|
|
89
|
+
if (items == NULL) {
|
|
90
|
+
mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory cloning node-set");
|
|
91
|
+
return -1;
|
|
92
|
+
}
|
|
93
|
+
memcpy(items, src->u.nodeset.items, items_bytes);
|
|
94
|
+
dst->u.nodeset.items = items;
|
|
95
|
+
dst->u.nodeset.count = n;
|
|
96
|
+
dst->u.nodeset.capacity = n;
|
|
97
|
+
return 0;
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_val_clone: unknown value type");
|
|
101
|
+
return -1;
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
/* ---------- node string-value (XPath 1.0 §5) ----------
|
|
105
|
+
*
|
|
106
|
+
* Built into an mkr_buf_t whose `max` is the per-evaluate byte cap: append fails
|
|
107
|
+
* closed with MKR_ERR_LIMIT past the cap and MKR_ERR_OOM on allocation failure,
|
|
108
|
+
* so there is never a partial/truncated result. Lexbor-allocated text is freed
|
|
109
|
+
* after each append (otherwise we'd leak document-arena memory on every XPath
|
|
110
|
+
* that touches text content). */
|
|
111
|
+
|
|
112
|
+
/* Append `node`'s own text content. */
|
|
113
|
+
static mkr_status_t
|
|
114
|
+
append_text_content(MKR_DOM_NODE *node, mkr_buf_t *buf)
|
|
115
|
+
{
|
|
116
|
+
mkr_status_t st;
|
|
117
|
+
MKR_NODE_APPEND_OWN_TEXT(node, buf, st);
|
|
118
|
+
return st;
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
/* Append the string-value of every character-data descendant of `node`, in
|
|
122
|
+
* document order. Both TEXT and CDATA-section nodes are character data (XPath
|
|
123
|
+
* 1.0 §3 / §5: a CDATA section is text, not a distinct node type), so both
|
|
124
|
+
* contribute - matching the text index that backs Node#text. Iterative
|
|
125
|
+
* (parent-pointer) pre-order walk rather than C recursion, so an adversarially
|
|
126
|
+
* deep tree cannot overflow the stack (fail-closed / no DoS); O(1) extra space.
|
|
127
|
+
* Descends only into elements. */
|
|
128
|
+
static mkr_status_t
|
|
129
|
+
append_text_descendants(MKR_DOM_NODE *node, mkr_buf_t *buf)
|
|
130
|
+
{
|
|
131
|
+
MKR_DOM_NODE *cur = MKR_NODE_FIRST_CHILD(node);
|
|
132
|
+
while (cur != NULL) {
|
|
133
|
+
if (MKR_NODE_TYPE(cur) == MKR_NTYPE_TEXT
|
|
134
|
+
|| MKR_NODE_TYPE(cur) == MKR_NTYPE_CDATA_SECTION) {
|
|
135
|
+
mkr_status_t st = append_text_content(cur, buf);
|
|
136
|
+
if (st != MKR_OK) return st; /* LIMIT or OOM - caller fails closed */
|
|
137
|
+
}
|
|
138
|
+
if (MKR_NODE_TYPE(cur) == MKR_NTYPE_ELEMENT && MKR_NODE_FIRST_CHILD(cur) != NULL) {
|
|
139
|
+
cur = MKR_NODE_FIRST_CHILD(cur);
|
|
140
|
+
continue;
|
|
141
|
+
}
|
|
142
|
+
while (cur != node && MKR_NODE_NEXT(cur) == NULL) {
|
|
143
|
+
cur = MKR_NODE_PARENT(cur);
|
|
144
|
+
}
|
|
145
|
+
if (cur == node) return MKR_OK;
|
|
146
|
+
cur = MKR_NODE_NEXT(cur);
|
|
147
|
+
}
|
|
148
|
+
return MKR_OK;
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
/* Build node's string-value into `buf` (cap carried by buf->max). */
|
|
152
|
+
static mkr_status_t
|
|
153
|
+
build_string_value(const MKR_DOM_NODE *node, mkr_buf_t *buf)
|
|
154
|
+
{
|
|
155
|
+
if (node == NULL) return MKR_OK;
|
|
156
|
+
|
|
157
|
+
switch (MKR_NODE_TYPE(node)) {
|
|
158
|
+
case MKR_NTYPE_ATTRIBUTE: {
|
|
159
|
+
MKR_DOM_ATTR *attr = (MKR_DOM_ATTR *)node;
|
|
160
|
+
size_t vlen = 0;
|
|
161
|
+
const lxb_char_t *v = MKR_ATTR_VALUE(attr, &vlen);
|
|
162
|
+
return mkr_buf_append(buf, v ? (const char *)v : "", vlen);
|
|
163
|
+
}
|
|
164
|
+
case MKR_NTYPE_TEXT:
|
|
165
|
+
case MKR_NTYPE_CDATA_SECTION:
|
|
166
|
+
case MKR_NTYPE_COMMENT:
|
|
167
|
+
case MKR_NTYPE_PI:
|
|
168
|
+
return append_text_content((MKR_DOM_NODE *)node, buf);
|
|
169
|
+
default:
|
|
170
|
+
return append_text_descendants((MKR_DOM_NODE *)node, buf);
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
static void
|
|
175
|
+
mkr_build_node_text_unchecked(const MKR_DOM_NODE *node, mkr_owned_text_t *out)
|
|
176
|
+
{
|
|
177
|
+
/* Best-effort node string-value, used only for NUMBER coercion (its sole
|
|
178
|
+
* caller): the text is parsed straight to a double, so mkr_buf's conservative
|
|
179
|
+
* default ceiling (max == 0) is ample - a node whose text exceeds it was never a
|
|
180
|
+
* valid number, and the build then falls back to an owned "" (-> NaN), which is
|
|
181
|
+
* the correct coercion result anyway. On any failure return "" rather than NULL,
|
|
182
|
+
* since callers require a non-NULL text. */
|
|
183
|
+
mkr_owned_text_init(out);
|
|
184
|
+
mkr_buf_t buf;
|
|
185
|
+
mkr_buf_init(&buf, 0);
|
|
186
|
+
if (build_string_value(node, &buf) != MKR_OK) {
|
|
187
|
+
mkr_buf_free(&buf);
|
|
188
|
+
(void)mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit(""), NULL, NULL);
|
|
189
|
+
return;
|
|
190
|
+
}
|
|
191
|
+
if (mkr_owned_text_from_buf_steal(out, &buf, NULL, NULL) != 0) {
|
|
192
|
+
(void)mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit(""), NULL, NULL);
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
static int
|
|
197
|
+
mkr_node_to_owned_text_or_fail(const MKR_DOM_NODE *node,
|
|
198
|
+
mkr_xpath_limits_t *limits,
|
|
199
|
+
mkr_xpath_error_t *err,
|
|
200
|
+
mkr_owned_text_t *out)
|
|
201
|
+
{
|
|
202
|
+
if (out == NULL) {
|
|
203
|
+
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_node_to_owned_text_or_fail: bad args");
|
|
204
|
+
return -1;
|
|
205
|
+
}
|
|
206
|
+
mkr_owned_text_init(out);
|
|
207
|
+
mkr_buf_t buf;
|
|
208
|
+
mkr_buf_init(&buf, (limits != NULL) ? limits->max_string_bytes : 0);
|
|
209
|
+
mkr_status_t st = build_string_value(node, &buf);
|
|
210
|
+
if (st == MKR_ERR_LIMIT) {
|
|
211
|
+
mkr_buf_free(&buf);
|
|
212
|
+
mkr_err_setf(err, MKR_XPATH_ERR_LIMIT,
|
|
213
|
+
"string size limit exceeded (%zu bytes) while building node string-value",
|
|
214
|
+
limits->max_string_bytes);
|
|
215
|
+
return -1;
|
|
216
|
+
}
|
|
217
|
+
if (st != MKR_OK) {
|
|
218
|
+
mkr_buf_free(&buf);
|
|
219
|
+
mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory building node string-value");
|
|
220
|
+
return -1;
|
|
221
|
+
}
|
|
222
|
+
return mkr_owned_text_from_buf_steal(out, &buf, err, "out of memory building node string-value");
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
static int
|
|
226
|
+
mkr_val_to_owned_text_or_fail(const mkr_val_t *v,
|
|
227
|
+
mkr_xpath_limits_t *limits,
|
|
228
|
+
mkr_xpath_error_t *err,
|
|
229
|
+
mkr_owned_text_t *out)
|
|
230
|
+
{
|
|
231
|
+
if (out == NULL) {
|
|
232
|
+
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_val_to_owned_text_or_fail: bad args");
|
|
233
|
+
return -1;
|
|
234
|
+
}
|
|
235
|
+
mkr_owned_text_init(out);
|
|
236
|
+
if (v == NULL) {
|
|
237
|
+
return mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit(""), err, "out of memory converting value to string");
|
|
238
|
+
}
|
|
239
|
+
switch (v->type) {
|
|
240
|
+
case MKR_XPATH_TYPE_STRING: {
|
|
241
|
+
mkr_borrowed_text_t text = mkr_borrowed_text_from_owned(v->u.string);
|
|
242
|
+
if (text.ptr == NULL) text.len = 0;
|
|
243
|
+
if (limits != NULL && mkr_limit_check_string_bytes(limits, text.len, err) != 0) return -1;
|
|
244
|
+
return mkr_owned_text_from_borrowed_copy(out, text,
|
|
245
|
+
err, "out of memory copying string value");
|
|
246
|
+
}
|
|
247
|
+
case MKR_XPATH_TYPE_BOOLEAN:
|
|
248
|
+
return v->u.boolean
|
|
249
|
+
? mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit("true"), err, "out of memory converting boolean to string")
|
|
250
|
+
: mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit("false"), err, "out of memory converting boolean to string");
|
|
251
|
+
case MKR_XPATH_TYPE_NUMBER: {
|
|
252
|
+
double d = v->u.number;
|
|
253
|
+
if (isnan(d)) {
|
|
254
|
+
return mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit("NaN"), err, "out of memory converting number to string");
|
|
255
|
+
}
|
|
256
|
+
if (isinf(d)) {
|
|
257
|
+
return d < 0
|
|
258
|
+
? mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit("-Infinity"), err, "out of memory converting number to string")
|
|
259
|
+
: mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit("Infinity"), err, "out of memory converting number to string");
|
|
260
|
+
}
|
|
261
|
+
if (d == 0.0) {
|
|
262
|
+
return mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit("0"), err, "out of memory converting number to string");
|
|
263
|
+
}
|
|
264
|
+
char buf[64];
|
|
265
|
+
int n;
|
|
266
|
+
if (d == floor(d) && fabs(d) < 1e15) {
|
|
267
|
+
n = snprintf(buf, sizeof(buf), "%lld", (long long)d);
|
|
268
|
+
} else {
|
|
269
|
+
n = snprintf(buf, sizeof(buf), "%.15g", d);
|
|
270
|
+
}
|
|
271
|
+
if (n < 0 || (size_t)n >= sizeof(buf)) {
|
|
272
|
+
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "number string conversion overflow");
|
|
273
|
+
return -1;
|
|
274
|
+
}
|
|
275
|
+
char *p = mkr_strndup(buf, (size_t)n);
|
|
276
|
+
if (p == NULL) { mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory converting number to string"); return -1; }
|
|
277
|
+
*out = mkr_owned_text(p, (size_t)n);
|
|
278
|
+
return 0;
|
|
279
|
+
}
|
|
280
|
+
case MKR_XPATH_TYPE_NODESET:
|
|
281
|
+
if (v->u.nodeset.count == 0) {
|
|
282
|
+
return mkr_owned_text_from_borrowed_copy(out, mkr_borrowed_text_lit(""), err, "out of memory");
|
|
283
|
+
}
|
|
284
|
+
/* XPath 1.0 §4.2: string(node-set) = string-value of first node in doc order. */
|
|
285
|
+
return mkr_node_to_owned_text_or_fail(v->u.nodeset.items[0], limits, err, out);
|
|
286
|
+
}
|
|
287
|
+
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "unknown value type");
|
|
288
|
+
return -1;
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
static int
|
|
292
|
+
mkr_val_to_number_or_fail(const mkr_val_t *v,
|
|
293
|
+
mkr_xpath_limits_t *limits,
|
|
294
|
+
mkr_xpath_error_t *err,
|
|
295
|
+
double *out)
|
|
296
|
+
{
|
|
297
|
+
if (v == NULL || out == NULL) {
|
|
298
|
+
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_val_to_number_or_fail: bad args");
|
|
299
|
+
return -1;
|
|
300
|
+
}
|
|
301
|
+
if (v->type == MKR_XPATH_TYPE_NODESET) {
|
|
302
|
+
if (v->u.nodeset.count == 0) {
|
|
303
|
+
*out = (double)NAN;
|
|
304
|
+
return 0;
|
|
305
|
+
}
|
|
306
|
+
mkr_owned_text_t text;
|
|
307
|
+
if (mkr_node_to_owned_text_or_fail(v->u.nodeset.items[0], limits, err, &text) != 0) return -1;
|
|
308
|
+
*out = mkr_borrowed_text_to_number(mkr_borrowed_text_from_owned(text));
|
|
309
|
+
mkr_owned_text_clear(&text);
|
|
310
|
+
return 0;
|
|
311
|
+
}
|
|
312
|
+
*out = mkr_val_to_number_unchecked(v);
|
|
313
|
+
return 0;
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
/* ---------- coercions ---------- */
|
|
317
|
+
|
|
318
|
+
/* string -> number coercion (XPath 1.0 §4.4): optional leading whitespace, an
|
|
319
|
+
* optional single '-' (NO whitespace between it and the digits, and NO '+'),
|
|
320
|
+
* then a Number, then optional trailing whitespace - anything else is NaN. The
|
|
321
|
+
* Number scan/convert uses the same grammar-exact, locale-independent helpers as
|
|
322
|
+
* the lexer, so "0x10" / "1e3" / "INF" all coerce to NaN (the extent stops
|
|
323
|
+
* before x/e and the trailing garbage trips the end check). All reads go through
|
|
324
|
+
* the bounded span. */
|
|
325
|
+
static double
|
|
326
|
+
mkr_borrowed_text_to_number(mkr_borrowed_text_t t)
|
|
327
|
+
{
|
|
328
|
+
if (t.ptr == NULL) return (double)NAN;
|
|
329
|
+
mkr_span_t s = mkr_span(t.ptr, t.len);
|
|
330
|
+
|
|
331
|
+
mkr_span_skip_xpath_ws(&s);
|
|
332
|
+
|
|
333
|
+
int neg = 0;
|
|
334
|
+
if (mkr_span_peek(&s) == '-') { neg = 1; mkr_span_skip(&s, 1); }
|
|
335
|
+
|
|
336
|
+
const char *mark = mkr_span_mark(&s);
|
|
337
|
+
size_t extent = mkr_xpath_number_extent(mark, mkr_span_left(&s));
|
|
338
|
+
if (extent == 0) return (double)NAN;
|
|
339
|
+
double d = mkr_xpath_number_from_extent(mark, extent);
|
|
340
|
+
mkr_span_skip(&s, extent);
|
|
341
|
+
|
|
342
|
+
mkr_span_skip_xpath_ws(&s);
|
|
343
|
+
if (mkr_span_peek(&s) != -1) return (double)NAN; /* trailing garbage */
|
|
344
|
+
|
|
345
|
+
return neg ? -d : d;
|
|
346
|
+
}
|
|
347
|
+
|
|
348
|
+
static double
|
|
349
|
+
mkr_val_to_number_unchecked(const mkr_val_t *v)
|
|
350
|
+
{
|
|
351
|
+
switch (v->type) {
|
|
352
|
+
case MKR_XPATH_TYPE_NUMBER:
|
|
353
|
+
return v->u.number;
|
|
354
|
+
case MKR_XPATH_TYPE_BOOLEAN:
|
|
355
|
+
return v->u.boolean ? 1.0 : 0.0;
|
|
356
|
+
case MKR_XPATH_TYPE_STRING:
|
|
357
|
+
return mkr_borrowed_text_to_number(mkr_borrowed_text_from_owned(v->u.string));
|
|
358
|
+
case MKR_XPATH_TYPE_NODESET: {
|
|
359
|
+
if (v->u.nodeset.count == 0) return (double)NAN;
|
|
360
|
+
/* string-value of first node in document order */
|
|
361
|
+
mkr_owned_text_t text;
|
|
362
|
+
mkr_build_node_text_unchecked(v->u.nodeset.items[0], &text);
|
|
363
|
+
double d = mkr_borrowed_text_to_number(mkr_borrowed_text_from_owned(text));
|
|
364
|
+
mkr_owned_text_clear(&text);
|
|
365
|
+
return d;
|
|
366
|
+
}
|
|
367
|
+
}
|
|
368
|
+
return (double)NAN;
|
|
369
|
+
}
|
|
370
|
+
|
|
371
|
+
static int
|
|
372
|
+
mkr_val_to_boolean(const mkr_val_t *v)
|
|
373
|
+
{
|
|
374
|
+
switch (v->type) {
|
|
375
|
+
case MKR_XPATH_TYPE_BOOLEAN:
|
|
376
|
+
return v->u.boolean;
|
|
377
|
+
case MKR_XPATH_TYPE_NUMBER:
|
|
378
|
+
return !(v->u.number == 0.0 || isnan(v->u.number));
|
|
379
|
+
case MKR_XPATH_TYPE_STRING:
|
|
380
|
+
return v->u.string.ptr != NULL && v->u.string.ptr[0] != '\0';
|
|
381
|
+
case MKR_XPATH_TYPE_NODESET:
|
|
382
|
+
return v->u.nodeset.count > 0;
|
|
383
|
+
}
|
|
384
|
+
return 0;
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
/* ---------- document order ---------- */
|
|
388
|
+
|
|
389
|
+
/*
|
|
390
|
+
* Treat an attribute node as positioned "with" its owner element for
|
|
391
|
+
* cross-subtree comparisons; only when both belong to the same element
|
|
392
|
+
* does the attribute-vs-attribute or attribute-vs-descendant rule kick in.
|
|
393
|
+
*/
|
|
394
|
+
static const MKR_DOM_NODE *
|
|
395
|
+
anchor_for_cmp(const MKR_DOM_NODE *n)
|
|
396
|
+
{
|
|
397
|
+
if (MKR_NODE_TYPE(n) == MKR_NTYPE_ATTRIBUTE) {
|
|
398
|
+
return MKR_NODE_PARENT(n) ? MKR_NODE_PARENT(n) : n;
|
|
399
|
+
}
|
|
400
|
+
return n;
|
|
401
|
+
}
|
|
402
|
+
|
|
403
|
+
static int
|
|
404
|
+
depth_of(const MKR_DOM_NODE *n)
|
|
405
|
+
{
|
|
406
|
+
int d = 0;
|
|
407
|
+
while (MKR_NODE_PARENT(n)) { d++; n = MKR_NODE_PARENT(n); }
|
|
408
|
+
return d;
|
|
409
|
+
}
|
|
410
|
+
|
|
411
|
+
/*
 * Compare two nodes by document order without any index.
 *
 * Returns a negative value when `a` precedes `b`, a positive value when `a`
 * follows `b`, and 0 when they are the same node or their order cannot be
 * decided (nodes under different roots, or two attributes that are not found
 * in their shared owner's attribute list).
 *
 * Steps: (1) map attributes to their owner element via anchor_for_cmp();
 * (2) if both anchors coincide, order by "element, then its attributes";
 * (3) otherwise lift the deeper anchor to the shallower one's depth - if they
 * meet, one anchor is an ancestor of the other; (4) otherwise lift both in
 * lockstep until they share a parent and resolve the sibling order.
 */
static int
doc_order_cmp(const MKR_DOM_NODE *a, const MKR_DOM_NODE *b)
{
    if (a == b) return 0;
    const MKR_DOM_NODE *aa = anchor_for_cmp(a);
    const MKR_DOM_NODE *bb = anchor_for_cmp(b);

    /* If the anchors are the same element, decide by node type. A non-attribute
     * node that anchors to the same element E can ONLY be E itself: any other
     * node (a child/descendant) anchors to itself, not to E, so it would not
     * reach this branch (the attribute-vs-descendant case is handled below by
     * the depth-normalisation walk). Per XPath 1.0 §5.1 document order is
     * "element, then its attribute nodes, then its children", so an attribute
     * comes AFTER its own owner element. */
    if (aa == bb) {
        int a_attr = (MKR_NODE_TYPE(a) == MKR_NTYPE_ATTRIBUTE);
        int b_attr = (MKR_NODE_TYPE(b) == MKR_NTYPE_ATTRIBUTE);
        if (a_attr && !b_attr) return 1; /* b is the owner element E; a (its attr) follows */
        if (b_attr && !a_attr) return -1; /* a is the owner element E; b (its attr) follows */
        /* Both attributes of the same element: relative order is
         * implementation-defined. Use insertion order via attr linked list. */
        if (a_attr && b_attr) {
            for (const MKR_DOM_ATTR *at = MKR_ELEM_FIRST_ATTR((const MKR_DOM_ELEMENT *)aa);
                 at != NULL; at = MKR_ATTR_NEXT(at)) {
                if ((const MKR_DOM_NODE *)at == a) return -1;
                if ((const MKR_DOM_NODE *)at == b) return 1;
            }
            /* Neither attribute was found in the owner's list. */
            return 0;
        }
        /* aa == bb but neither is an attribute means a == b, handled above. */
        return 0;
    }

    /* Bring both anchors to the same depth before comparing them. */
    int da = depth_of(aa), db = depth_of(bb);
    while (da > db) { aa = MKR_NODE_PARENT(aa); da--; }
    while (db > da) { bb = MKR_NODE_PARENT(bb); db--; }
    if (aa == bb) {
        /* One is ancestor of the other; ancestor comes first. `aa` still
         * equals a's anchor only if it was never lifted, i.e. a's anchor is
         * the ancestor. */
        return (aa == anchor_for_cmp(a)) ? -1 : 1;
    }
    /* Same depth, different nodes: climb together until they are siblings
     * (or both are parentless roots). */
    while (MKR_NODE_PARENT(aa) != MKR_NODE_PARENT(bb)) {
        aa = MKR_NODE_PARENT(aa);
        bb = MKR_NODE_PARENT(bb);
    }
    /* Resolve sibling order. Scan outward from aa and bb in lockstep (via ->next)
     * rather than forward from parent->first_child: the cost is then O(distance
     * between aa and bb), not O(distance from the first child). The latter is
     * quadratic when sorting nodes that sit deep in a wide, flat parent (e.g. a
     * predicate result picking scattered <li> from a 2000-child <ul>), which the
     * doc-order index would only avoid once a single sort reaches its build
     * threshold. */
    if (MKR_NODE_PARENT(aa) == NULL) {
        /* Different documents/roots - undefined; keep stable. */
        return 0;
    }
    const MKR_DOM_NODE *fa = aa, *fb = bb;
    for (;;) {
        /* Advance both cursors one sibling; a cursor that ran off the end
         * stays NULL. */
        fa = fa ? MKR_NODE_NEXT(fa) : NULL;
        fb = fb ? MKR_NODE_NEXT(fb) : NULL;
        if (fa == bb) return -1; /* bb lies after aa -> aa first */
        if (fb == aa) return 1; /* aa lies after bb -> bb first */
        if (fa == NULL && fb == NULL) return 0; /* unreachable for same-parent nodes */
    }
}
|
|
475
|
+
|
|
476
|
+
/* ---------- per-evaluate document-order index (build/lookup/sort) ---------- */
|
|
477
|
+
|
|
478
|
+
/* Insert (node, ord) into the open-addressing table. Grows when load
|
|
479
|
+
* factor exceeds 3/4. Returns 0 on success, -1 on OOM. */
|
|
480
|
+
static int
|
|
481
|
+
order_index_insert(mkr_doc_order_index_t *idx, const MKR_DOM_NODE *node, size_t ord)
|
|
482
|
+
{
|
|
483
|
+
if (idx->cap == 0 || idx->count * 4 >= idx->cap * 3) {
|
|
484
|
+
size_t new_cap = 256;
|
|
485
|
+
if (idx->cap != 0 && !mkr_size_mul(idx->cap, 2, &new_cap)) {
|
|
486
|
+
return -1; /* overflow */
|
|
487
|
+
}
|
|
488
|
+
void *new_buckets = mkr_callocarray(new_cap, sizeof(*idx->buckets));
|
|
489
|
+
if (new_buckets == NULL) return -1;
|
|
490
|
+
/* Rehash. */
|
|
491
|
+
typeof(idx->buckets) old_buckets = idx->buckets;
|
|
492
|
+
size_t old_cap = idx->cap;
|
|
493
|
+
idx->buckets = new_buckets;
|
|
494
|
+
idx->cap = new_cap;
|
|
495
|
+
idx->count = 0;
|
|
496
|
+
for (size_t i = 0; i < old_cap; ++i) {
|
|
497
|
+
if (old_buckets[i].node != NULL) {
|
|
498
|
+
size_t mask = new_cap - 1;
|
|
499
|
+
size_t j = mkr_ptr_hash(old_buckets[i].node) & mask;
|
|
500
|
+
while (idx->buckets[j].node != NULL) j = (j + 1) & mask;
|
|
501
|
+
idx->buckets[j].node = old_buckets[i].node;
|
|
502
|
+
idx->buckets[j].ord = old_buckets[i].ord;
|
|
503
|
+
idx->count++;
|
|
504
|
+
}
|
|
505
|
+
}
|
|
506
|
+
free(old_buckets);
|
|
507
|
+
}
|
|
508
|
+
size_t mask = idx->cap - 1;
|
|
509
|
+
size_t j = mkr_ptr_hash(node) & mask;
|
|
510
|
+
while (idx->buckets[j].node != NULL) {
|
|
511
|
+
if (idx->buckets[j].node == node) return 0; /* already present */
|
|
512
|
+
j = (j + 1) & mask;
|
|
513
|
+
}
|
|
514
|
+
idx->buckets[j].node = node;
|
|
515
|
+
idx->buckets[j].ord = ord;
|
|
516
|
+
idx->count++;
|
|
517
|
+
return 0;
|
|
518
|
+
}
|
|
519
|
+
|
|
520
|
+
static int
|
|
521
|
+
order_index_lookup(const mkr_doc_order_index_t *idx, const MKR_DOM_NODE *node,
|
|
522
|
+
size_t *out_ord)
|
|
523
|
+
{
|
|
524
|
+
if (idx->cap == 0) return -1;
|
|
525
|
+
size_t mask = idx->cap - 1;
|
|
526
|
+
size_t j = mkr_ptr_hash(node) & mask;
|
|
527
|
+
while (idx->buckets[j].node != NULL) {
|
|
528
|
+
if (idx->buckets[j].node == node) {
|
|
529
|
+
if (out_ord) *out_ord = idx->buckets[j].ord;
|
|
530
|
+
return 0;
|
|
531
|
+
}
|
|
532
|
+
j = (j + 1) & mask;
|
|
533
|
+
}
|
|
534
|
+
return -1;
|
|
535
|
+
}
|
|
536
|
+
|
|
537
|
+
/* DFS pre-order: assign ordinal to the element, then its attributes
|
|
538
|
+
* (in linked-list order, before children), then descendants. This
|
|
539
|
+
* matches doc_order_cmp's attribute placement.
|
|
540
|
+
*
|
|
541
|
+
* Iterative (parent-pointer) walk rather than C recursion, so an adversarially
|
|
542
|
+
* deep tree cannot overflow the stack (fail-closed / no DoS); O(1) extra space.
|
|
543
|
+
* The traversal stays within the subtree rooted at `root` (it never follows
|
|
544
|
+
* root->next). */
|
|
545
|
+
static int
|
|
546
|
+
order_index_walk(mkr_doc_order_index_t *idx, MKR_DOM_NODE *root, size_t *next_ord)
|
|
547
|
+
{
|
|
548
|
+
MKR_DOM_NODE *cur = root;
|
|
549
|
+
while (cur != NULL) {
|
|
550
|
+
/* Visit (pre-order): the node, then its attributes before any child. */
|
|
551
|
+
if (order_index_insert(idx, cur, (*next_ord)++) != 0) return -1;
|
|
552
|
+
if (MKR_NODE_TYPE(cur) == MKR_NTYPE_ELEMENT) {
|
|
553
|
+
MKR_DOM_ELEMENT *el = (MKR_DOM_ELEMENT *)cur;
|
|
554
|
+
for (MKR_DOM_ATTR *a = MKR_ELEM_FIRST_ATTR(el); a != NULL; a = MKR_ATTR_NEXT(a)) {
|
|
555
|
+
if (order_index_insert(idx, (MKR_DOM_NODE *)a, (*next_ord)++) != 0) return -1;
|
|
556
|
+
}
|
|
557
|
+
}
|
|
558
|
+
if (MKR_NODE_FIRST_CHILD(cur) != NULL) {
|
|
559
|
+
cur = MKR_NODE_FIRST_CHILD(cur);
|
|
560
|
+
continue;
|
|
561
|
+
}
|
|
562
|
+
while (cur != root && MKR_NODE_NEXT(cur) == NULL) {
|
|
563
|
+
cur = MKR_NODE_PARENT(cur);
|
|
564
|
+
}
|
|
565
|
+
if (cur == root) break;
|
|
566
|
+
cur = MKR_NODE_NEXT(cur);
|
|
567
|
+
}
|
|
568
|
+
return 0;
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
static int
|
|
572
|
+
order_index_build(mkr_doc_order_index_t *idx, MKR_DOM_NODE *root,
|
|
573
|
+
mkr_xpath_error_t *err)
|
|
574
|
+
{
|
|
575
|
+
if (idx->built) return 0;
|
|
576
|
+
if (root == NULL) return -1;
|
|
577
|
+
size_t next_ord = 0;
|
|
578
|
+
if (order_index_walk(idx, root, &next_ord) != 0) {
|
|
579
|
+
mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory building document order index");
|
|
580
|
+
mkr_doc_order_index_clear(idx);
|
|
581
|
+
return -1;
|
|
582
|
+
}
|
|
583
|
+
idx->built = 1;
|
|
584
|
+
return 0;
|
|
585
|
+
}
|
|
586
|
+
|
|
587
|
+
/* Indexed comparator. Falls back to doc_order_cmp on any miss
|
|
588
|
+
* (e.g., synthesised nodes or cross-document compares). */
|
|
589
|
+
static int
|
|
590
|
+
doc_order_cmp_ctx(mkr_xpath_context_t *ctx, const MKR_DOM_NODE *a, const MKR_DOM_NODE *b)
|
|
591
|
+
{
|
|
592
|
+
if (a == b) return 0;
|
|
593
|
+
if (ctx == NULL) return doc_order_cmp(a, b);
|
|
594
|
+
mkr_doc_order_index_t *idx = mkr_ctx_order_index(ctx);
|
|
595
|
+
if (idx == NULL || !idx->built) return doc_order_cmp(a, b);
|
|
596
|
+
size_t oa, ob;
|
|
597
|
+
if (order_index_lookup(idx, a, &oa) != 0) return doc_order_cmp(a, b);
|
|
598
|
+
if (order_index_lookup(idx, b, &ob) != 0) return doc_order_cmp(a, b);
|
|
599
|
+
/* Safe comparison - compare, don't subtract (unsigned difference wraps). */
|
|
600
|
+
if (oa < ob) return -1;
|
|
601
|
+
if (oa > ob) return 1;
|
|
602
|
+
return 0;
|
|
603
|
+
}
|
|
604
|
+
|
|
605
|
+
/* Bottom-up merge sort. Threading ctx through avoids the qsort_r /
|
|
606
|
+
* thread-local hack and keeps everything reentrant. Stable as a
|
|
607
|
+
* bonus: ties (same ord - only possible for synthesised nodes that
|
|
608
|
+
* weren't in the index) preserve insertion order. */
|
|
609
|
+
static void
|
|
610
|
+
ms_merge(void **arr, void **tmp,
|
|
611
|
+
size_t lo, size_t mid, size_t hi, mkr_xpath_context_t *ctx)
|
|
612
|
+
{
|
|
613
|
+
size_t i = lo, j = mid, k = lo;
|
|
614
|
+
while (i < mid && j < hi) {
|
|
615
|
+
if (doc_order_cmp_ctx(ctx, arr[i], arr[j]) <= 0) tmp[k++] = arr[i++];
|
|
616
|
+
else tmp[k++] = arr[j++];
|
|
617
|
+
}
|
|
618
|
+
while (i < mid) tmp[k++] = arr[i++];
|
|
619
|
+
while (j < hi) tmp[k++] = arr[j++];
|
|
620
|
+
for (size_t x = lo; x < hi; ++x) arr[x] = tmp[x];
|
|
621
|
+
}
|
|
622
|
+
|
|
623
|
+
static void
|
|
624
|
+
ms_sort(void **arr, void **tmp,
|
|
625
|
+
size_t lo, size_t hi, mkr_xpath_context_t *ctx)
|
|
626
|
+
{
|
|
627
|
+
if (hi - lo < 2) return;
|
|
628
|
+
size_t mid = lo + (hi - lo) / 2;
|
|
629
|
+
ms_sort(arr, tmp, lo, mid, ctx);
|
|
630
|
+
ms_sort(arr, tmp, mid, hi, ctx);
|
|
631
|
+
ms_merge(arr, tmp, lo, mid, hi, ctx);
|
|
632
|
+
}
|
|
633
|
+
|
|
634
|
+
/* qsort fallback used only when tmp-buffer allocation fails. */
|
|
635
|
+
static int
|
|
636
|
+
doc_order_qsort_cb_fallback(const void *pa, const void *pb)
|
|
637
|
+
{
|
|
638
|
+
const MKR_DOM_NODE *a = *(const MKR_DOM_NODE * const *)pa;
|
|
639
|
+
const MKR_DOM_NODE *b = *(const MKR_DOM_NODE * const *)pb;
|
|
640
|
+
return doc_order_cmp(a, b);
|
|
641
|
+
}
|
|
642
|
+
|
|
643
|
+
/* Threshold for building the doc-order index. Below this we expect
|
|
644
|
+
* N log N parent-chain compares to be cheaper than the O(D) full-doc
|
|
645
|
+
* walk that the index requires (D = total nodes in document, which is
|
|
646
|
+
* typically 6000+ on real pages). Empirically the crossover sits
|
|
647
|
+
* somewhere between N=100 and N=300 on coffee.html; we pick a safe
|
|
648
|
+
* point that keeps small unions and reverse-axis dedups off the slow
|
|
649
|
+
* build path. Once the index IS built (e.g., by a larger sort earlier
|
|
650
|
+
* in the same evaluate), subsequent small sorts naturally reuse it. */
|
|
651
|
+
#define MKR_INDEX_BUILD_MIN 200
|
|
652
|
+
|
|
653
|
+
static void
|
|
654
|
+
mkr_nodeset_sort_doc_order(mkr_xpath_context_t *ctx, mkr_nodeset_t *ns)
|
|
655
|
+
{
|
|
656
|
+
if (ns == NULL || ns->count < 2) return;
|
|
657
|
+
|
|
658
|
+
/* Already-sorted fast path. A relative step over a multi-node context
|
|
659
|
+
* (e.g. the child step of //li/a or //a:entry/a:title) collects its
|
|
660
|
+
* forward-axis results context-by-context in document order, so when the
|
|
661
|
+
* contexts are non-nested the concatenation is ALREADY in document order and
|
|
662
|
+
* the O(n log n) sort is pure waste. An O(n) scan confirms it: if every
|
|
663
|
+
* adjacent pair is already in order we return without sorting (and without
|
|
664
|
+
* building the doc-order index). Reverse axes and interleaved (nested-context)
|
|
665
|
+
* results fail the scan early and fall through to the full sort below. The
|
|
666
|
+
* scan uses the same comparator the sort would, so it can only skip work,
|
|
667
|
+
* never change the result. This is the libxml2-parity win for multi-step
|
|
668
|
+
* paths, where the sort otherwise dominates (profiled). */
|
|
669
|
+
int already_ordered = 1;
|
|
670
|
+
for (size_t i = 1; i < ns->count; ++i) {
|
|
671
|
+
if (doc_order_cmp_ctx(ctx, ns->items[i - 1], ns->items[i]) > 0) {
|
|
672
|
+
already_ordered = 0;
|
|
673
|
+
break;
|
|
674
|
+
}
|
|
675
|
+
}
|
|
676
|
+
if (already_ordered) return;
|
|
677
|
+
|
|
678
|
+
/* Lazy build of the doc-order index. Only worth doing when the sort
|
|
679
|
+
* itself is large enough to amortise the full-doc walk; smaller
|
|
680
|
+
* sorts fall through to parent-chain compares via doc_order_cmp_ctx
|
|
681
|
+
* (which sees an unbuilt index and dispatches accordingly). */
|
|
682
|
+
mkr_doc_order_index_t *idx = mkr_ctx_order_index(ctx);
|
|
683
|
+
if (idx != NULL && !idx->built && ns->count >= MKR_INDEX_BUILD_MIN) {
|
|
684
|
+
MKR_DOM_NODE *root = (MKR_DOM_NODE *)mkr_ctx_document(ctx);
|
|
685
|
+
mkr_xpath_error_t ierr = {0};
|
|
686
|
+
(void)order_index_build(idx, root, &ierr);
|
|
687
|
+
mkr_xpath_error_clear(&ierr); /* index is best-effort; on OOM we fall through to parent-chain cmp */
|
|
688
|
+
}
|
|
689
|
+
|
|
690
|
+
void **tmp = mkr_reallocarray(NULL, ns->count, sizeof(*tmp));
|
|
691
|
+
if (tmp == NULL) {
|
|
692
|
+
/* Fall back to in-place qsort with parent-chain compare (slow but
|
|
693
|
+
* correct). Should be a very rare path. */
|
|
694
|
+
qsort(ns->items, ns->count, sizeof(ns->items[0]), doc_order_qsort_cb_fallback);
|
|
695
|
+
return;
|
|
696
|
+
}
|
|
697
|
+
ms_sort(ns->items, tmp, 0, ns->count, ctx);
|
|
698
|
+
free(tmp);
|
|
699
|
+
}
|
|
700
|
+
|
|
701
|
+
static void
|
|
702
|
+
mkr_nodeset_unique_sorted(mkr_xpath_context_t *ctx, mkr_nodeset_t *ns)
|
|
703
|
+
{
|
|
704
|
+
if (ns == NULL || ns->count < 2) return;
|
|
705
|
+
mkr_nodeset_sort_doc_order(ctx, ns);
|
|
706
|
+
size_t w = 1;
|
|
707
|
+
for (size_t r = 1; r < ns->count; ++r) {
|
|
708
|
+
if (ns->items[r] != ns->items[r - 1]) {
|
|
709
|
+
ns->items[w++] = ns->items[r];
|
|
710
|
+
}
|
|
711
|
+
}
|
|
712
|
+
ns->count = w;
|
|
713
|
+
}
|
|
714
|
+
|
|
715
|
+
/* ---------- string-value cache: node-keyed insert (dereferences `node`) ---------- */
|
|
716
|
+
|
|
717
|
+
static int
|
|
718
|
+
mkr_get_cached_node_text(mkr_xpath_context_t *ctx,
|
|
719
|
+
MKR_DOM_NODE *node,
|
|
720
|
+
mkr_borrowed_text_t *out,
|
|
721
|
+
mkr_xpath_error_t *err)
|
|
722
|
+
{
|
|
723
|
+
if (out == NULL) {
|
|
724
|
+
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL, "mkr_get_cached_node_text: bad args");
|
|
725
|
+
return -1;
|
|
726
|
+
}
|
|
727
|
+
*out = mkr_borrowed_text(NULL, 0);
|
|
728
|
+
/* Contract: ctx is non-NULL when called from the evaluator (the only
|
|
729
|
+
* intended caller). A NULL ctx is a programming error; surface it. */
|
|
730
|
+
mkr_str_cache_t *c = mkr_ctx_str_cache(ctx);
|
|
731
|
+
if (c == NULL) {
|
|
732
|
+
mkr_err_set(err, MKR_XPATH_ERR_INTERNAL,
|
|
733
|
+
"mkr_get_cached_node_text called without a context");
|
|
734
|
+
return -1;
|
|
735
|
+
}
|
|
736
|
+
|
|
737
|
+
/* O(1) lookup via the pointer-keyed index. */
|
|
738
|
+
if (c->bucket_cap != 0) {
|
|
739
|
+
size_t mask = c->bucket_cap - 1;
|
|
740
|
+
size_t j = mkr_ptr_hash(node) & mask;
|
|
741
|
+
while (c->buckets[j] != 0) {
|
|
742
|
+
mkr_str_cache_entry_t *e = &c->entries[c->buckets[j] - 1];
|
|
743
|
+
if (e->node == node) {
|
|
744
|
+
*out = mkr_borrowed_text(e->str, e->len);
|
|
745
|
+
return 0;
|
|
746
|
+
}
|
|
747
|
+
j = (j + 1) & mask;
|
|
748
|
+
}
|
|
749
|
+
}
|
|
750
|
+
|
|
751
|
+
mkr_owned_text_t text;
|
|
752
|
+
if (mkr_node_to_owned_text_or_fail(node, mkr_ctx_limits(ctx), err, &text) != 0) return -1;
|
|
753
|
+
|
|
754
|
+
if (mkr_grow_reserve((void **)&c->entries, &c->cap, c->count + 1,
|
|
755
|
+
sizeof(*c->entries)) != MKR_OK) {
|
|
756
|
+
mkr_owned_text_clear(&text);
|
|
757
|
+
mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory in node string cache");
|
|
758
|
+
return -1;
|
|
759
|
+
}
|
|
760
|
+
|
|
761
|
+
/* Enforce a total cap on the cached string bytes (fail-closed). */
|
|
762
|
+
size_t new_total;
|
|
763
|
+
if (!mkr_size_add(c->total_bytes, text.len, &new_total)
|
|
764
|
+
|| mkr_limit_check_string_bytes(mkr_ctx_limits(ctx), new_total, err) != 0) {
|
|
765
|
+
mkr_owned_text_clear(&text);
|
|
766
|
+
return -1;
|
|
767
|
+
}
|
|
768
|
+
|
|
769
|
+
/* Grow / build the index FIRST. It rebuilds only from the already-committed
|
|
770
|
+
* [0, count) entries (mkr_str_cache_reindex), so doing it before the new
|
|
771
|
+
* entry is written means every fallible step happens while the slot at
|
|
772
|
+
* [count] is still untouched - the entry is committed only once nothing can
|
|
773
|
+
* fail, eliminating the tentative-write-then-null-out rollback. Load factor
|
|
774
|
+
* is kept <= 1/2. */
|
|
775
|
+
if (c->bucket_cap == 0 || (c->count + 1) * 2 > c->bucket_cap) {
|
|
776
|
+
size_t new_bucket_cap = 64;
|
|
777
|
+
if (c->bucket_cap != 0 && !mkr_size_mul(c->bucket_cap, 2, &new_bucket_cap)) {
|
|
778
|
+
mkr_owned_text_clear(&text);
|
|
779
|
+
mkr_err_set(err, MKR_XPATH_ERR_OOM, "node string cache index overflow");
|
|
780
|
+
return -1;
|
|
781
|
+
}
|
|
782
|
+
if (mkr_str_cache_reindex(c, new_bucket_cap) != 0) {
|
|
783
|
+
mkr_owned_text_clear(&text);
|
|
784
|
+
mkr_err_set(err, MKR_XPATH_ERR_OOM, "out of memory indexing node string cache");
|
|
785
|
+
return -1;
|
|
786
|
+
}
|
|
787
|
+
}
|
|
788
|
+
|
|
789
|
+
/* Commit: no fallible step remains, so write the entry, index it, and bump
|
|
790
|
+
* the counters. mkr_str_cache_index_put reads entries[count].node, so the
|
|
791
|
+
* write must precede it. */
|
|
792
|
+
c->entries[c->count].node = node;
|
|
793
|
+
c->entries[c->count].str = text.ptr;
|
|
794
|
+
c->entries[c->count].len = text.len;
|
|
795
|
+
mkr_str_cache_index_put(c, c->count);
|
|
796
|
+
c->total_bytes += text.len;
|
|
797
|
+
c->count++;
|
|
798
|
+
|
|
799
|
+
*out = mkr_borrowed_text_from_owned(text);
|
|
800
|
+
return 0;
|
|
801
|
+
}
|