makiri 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/conformance.yml +22 -0
- data/.github/workflows/libfuzzer.yml +83 -0
- data/.github/workflows/release.yml +12 -7
- data/.github/workflows/security.yml +88 -3
- data/.github/workflows/valgrind.yml +135 -0
- data/CHANGELOG.md +152 -15
- data/README.md +183 -13
- data/Rakefile +294 -7
- data/ext/makiri/bridge/bridge.h +28 -0
- data/ext/makiri/bridge/ruby_string.c +282 -12
- data/ext/makiri/core/mkr_alloc.c +40 -3
- data/ext/makiri/core/mkr_alloc.h +28 -5
- data/ext/makiri/core/mkr_buf.c +47 -3
- data/ext/makiri/core/mkr_buf.h +112 -3
- data/ext/makiri/core/mkr_core.c +143 -0
- data/ext/makiri/core/mkr_core.h +11 -2
- data/ext/makiri/core/mkr_hash.h +1 -1
- data/ext/makiri/core/mkr_span.h +186 -0
- data/ext/makiri/core/mkr_text.h +8 -8
- data/ext/makiri/core/mkr_utf8.c +101 -0
- data/ext/makiri/core/mkr_utf8.h +88 -0
- data/ext/makiri/extconf.rb +123 -10
- data/ext/makiri/fuzz/Makefile +95 -0
- data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
- data/ext/makiri/fuzz/xml_fuzz.c +24 -0
- data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
- data/ext/makiri/glue/glue.h +55 -11
- data/ext/makiri/glue/ruby_doc.c +129 -59
- data/ext/makiri/glue/ruby_html_css.c +292 -0
- data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +248 -52
- data/ext/makiri/glue/ruby_html_node.c +859 -0
- data/ext/makiri/glue/ruby_html_serialize.c +154 -0
- data/ext/makiri/glue/ruby_node.c +74 -729
- data/ext/makiri/glue/ruby_node_set.c +167 -32
- data/ext/makiri/glue/ruby_xml.c +602 -0
- data/ext/makiri/glue/ruby_xml_node.c +1373 -0
- data/ext/makiri/glue/ruby_xpath.c +63 -30
- data/ext/makiri/glue/ruby_xpath.h +19 -0
- data/ext/makiri/lexbor_compat/compat.h +42 -9
- data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
- data/ext/makiri/lexbor_compat/dom_index.c +2 -2
- data/ext/makiri/lexbor_compat/post_parse.c +100 -10
- data/ext/makiri/lexbor_compat/source_loc.c +15 -13
- data/ext/makiri/lexbor_compat/text_index.c +14 -8
- data/ext/makiri/lexbor_compat/utf8_input.c +19 -33
- data/ext/makiri/makiri.c +184 -6
- data/ext/makiri/makiri.h +43 -2
- data/ext/makiri/xml/mkr_xml.h +125 -0
- data/ext/makiri/xml/mkr_xml_chars.c +195 -0
- data/ext/makiri/xml/mkr_xml_index.c +169 -0
- data/ext/makiri/xml/mkr_xml_index.h +48 -0
- data/ext/makiri/xml/mkr_xml_mutate.c +817 -0
- data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
- data/ext/makiri/xml/mkr_xml_node.c +399 -0
- data/ext/makiri/xml/mkr_xml_node.h +184 -0
- data/ext/makiri/xml/mkr_xml_tree.c +1515 -0
- data/ext/makiri/xpath/mkr_css.c +1023 -0
- data/ext/makiri/xpath/mkr_css.h +65 -0
- data/ext/makiri/xpath/mkr_xpath.c +96 -32
- data/ext/makiri/xpath/mkr_xpath.h +109 -4
- data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
- data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
- data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +551 -241
- data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +318 -276
- data/ext/makiri/xpath/mkr_xpath_internal.h +177 -206
- data/ext/makiri/xpath/mkr_xpath_lex.c +95 -125
- data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
- data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +145 -0
- data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
- data/ext/makiri/xpath/mkr_xpath_parse.c +83 -94
- data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
- data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
- data/ext/makiri/xpath/mkr_xpath_shared.c +609 -0
- data/ext/makiri/xpath/mkr_xpath_value_body.h +801 -0
- data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
- data/lib/makiri/{attribute.rb → attr.rb} +7 -3
- data/lib/makiri/cdata_section.rb +19 -0
- data/lib/makiri/comment.rb +10 -0
- data/lib/makiri/compat_aliases.rb +30 -0
- data/lib/makiri/document.rb +9 -73
- data/lib/makiri/document_fragment.rb +14 -9
- data/lib/makiri/element.rb +4 -4
- data/lib/makiri/html/document.rb +106 -0
- data/lib/makiri/html/node_methods.rb +19 -0
- data/lib/makiri/html.rb +12 -0
- data/lib/makiri/node.rb +58 -15
- data/lib/makiri/node_set.rb +8 -0
- data/lib/makiri/processing_instruction.rb +10 -0
- data/lib/makiri/text.rb +1 -1
- data/lib/makiri/version.rb +1 -1
- data/lib/makiri/xml/builder.rb +263 -0
- data/lib/makiri/xml/document.rb +24 -0
- data/lib/makiri/xml/node_methods.rb +84 -0
- data/lib/makiri/xml.rb +10 -0
- data/lib/makiri/xpath_context.rb +1 -1
- data/lib/makiri.rb +24 -5
- data/script/build_native_gem.rb +2 -2
- data/script/check_alloc_failures.rb +266 -0
- data/script/check_c_safety.rb +77 -2
- data/script/check_c_safety_allowlist.yml +102 -0
- data/script/check_leaks.rb +64 -0
- data/script/leaks_harness.rb +64 -0
- data/vendor/lexbor/CMakeLists.txt +6 -0
- data/vendor/lexbor/README.md +12 -0
- data/vendor/lexbor/config.cmake +1 -1
- data/vendor/lexbor/source/lexbor/core/base.h +1 -1
- data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
- data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
- data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
- data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
- data/vendor/lexbor/source/lexbor/html/base.h +1 -1
- data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
- data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
- data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
- data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
- data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
- data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
- data/vendor/lexbor/source/lexbor/url/base.h +1 -1
- data/vendor/lexbor/source/lexbor/url/url.c +5 -2
- data/vendor/lexbor/source/lexbor/url/url.h +9 -0
- data/vendor/lexbor/version +1 -1
- metadata +53 -9
- data/ext/makiri/glue/ruby_css.c +0 -185
- data/ext/makiri/glue/ruby_serialize.c +0 -92
- data/ext/makiri/xpath/mkr_xpath_value.c +0 -1286
- data/lib/makiri/cdata.rb +0 -6
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
/* mkr_xpath_number.c - XPath 1.0 Number parsing, grammar-exact and
|
|
2
|
+
* locale-independent.
|
|
3
|
+
*
|
|
4
|
+
* The Number production is `Digits ('.' Digits?)? | '.' Digits` - no sign, no
|
|
5
|
+
* exponent, no hex, decimal point only. C strtod accepts a superset of that
|
|
6
|
+
* (hex floats, exponents, INF/NAN words) and honours LC_NUMERIC, so the engine
|
|
7
|
+
* never hands strtod an unscanned buffer: the extent scanner below bounds the
|
|
8
|
+
* exact grammar bytes first, and the converter parses only those.
|
|
9
|
+
*
|
|
10
|
+
* This file is the ONE home of strtod in the engine (the lexer and the
|
|
11
|
+
* string->number coercion both come through here). It is compiled once and is
|
|
12
|
+
* representation-independent: it never touches a DOM node, only bytes.
|
|
13
|
+
*/
|
|
14
|
+
#include "mkr_xpath_internal.h"
|
|
15
|
+
#include "../core/mkr_core.h"
|
|
16
|
+
|
|
17
|
+
#include <math.h>
|
|
18
|
+
#include <stdlib.h>
|
|
19
|
+
|
|
20
|
+
static inline int
|
|
21
|
+
mkr_is_ascii_digit(int c)
|
|
22
|
+
{
|
|
23
|
+
return c >= '0' && c <= '9';
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
size_t
|
|
27
|
+
mkr_xpath_number_extent(const char *p, size_t len)
|
|
28
|
+
{
|
|
29
|
+
/* All reads through the bounded span: mkr_span_peek is -1 past the end, so
|
|
30
|
+
* mkr_is_ascii_digit fails closed there and no scan can leave [p, p+len). */
|
|
31
|
+
mkr_span_t s = mkr_span(p, len);
|
|
32
|
+
const char *start = mkr_span_mark(&s);
|
|
33
|
+
|
|
34
|
+
if (mkr_is_ascii_digit(mkr_span_peek(&s))) {
|
|
35
|
+
/* Digits ('.' Digits?)? -> "5", "5.", "5.5" */
|
|
36
|
+
while (mkr_is_ascii_digit(mkr_span_peek(&s))) mkr_span_skip(&s, 1);
|
|
37
|
+
if (mkr_span_peek(&s) == '.') {
|
|
38
|
+
mkr_span_skip(&s, 1);
|
|
39
|
+
while (mkr_is_ascii_digit(mkr_span_peek(&s))) mkr_span_skip(&s, 1);
|
|
40
|
+
}
|
|
41
|
+
return mkr_span_since(&s, start);
|
|
42
|
+
}
|
|
43
|
+
if (mkr_span_peek(&s) == '.') {
|
|
44
|
+
/* '.' Digits -> ".5" (a bare "." is NOT a Number) */
|
|
45
|
+
mkr_span_skip(&s, 1);
|
|
46
|
+
if (!mkr_is_ascii_digit(mkr_span_peek(&s))) return 0;
|
|
47
|
+
while (mkr_is_ascii_digit(mkr_span_peek(&s))) mkr_span_skip(&s, 1);
|
|
48
|
+
return mkr_span_since(&s, start);
|
|
49
|
+
}
|
|
50
|
+
return 0;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
/* Allocation-free, locale-independent assembly of the (already grammar-checked)
|
|
54
|
+
* extent. Reached only when LC_NUMERIC != C makes the libc '.' parse fail, or
|
|
55
|
+
* when the isolating reparse copy can't be allocated (OOM). This corner trades
|
|
56
|
+
* correctly-rounded parsing for locale independence and never-failing: it builds
|
|
57
|
+
* the value digit-by-digit (libxml2-precision-class), so it can never raise and
|
|
58
|
+
* never mis-classifies the grammar. */
|
|
59
|
+
static double
|
|
60
|
+
mkr_xpath_number_manual(const char *p, size_t extent)
|
|
61
|
+
{
|
|
62
|
+
double v = 0.0;
|
|
63
|
+
size_t i = 0;
|
|
64
|
+
for (; i < extent && p[i] >= '0' && p[i] <= '9'; i++) {
|
|
65
|
+
v = v * 10.0 + (double)(p[i] - '0');
|
|
66
|
+
}
|
|
67
|
+
if (i < extent && p[i] == '.') {
|
|
68
|
+
i++;
|
|
69
|
+
double scale = 1.0;
|
|
70
|
+
for (; i < extent && p[i] >= '0' && p[i] <= '9'; i++) {
|
|
71
|
+
scale /= 10.0;
|
|
72
|
+
v += (double)(p[i] - '0') * scale;
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
return v;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
double
|
|
79
|
+
mkr_xpath_number_from_extent(const char *p, size_t extent)
|
|
80
|
+
{
|
|
81
|
+
if (p == NULL || extent == 0) return (double)NAN;
|
|
82
|
+
|
|
83
|
+
/* Fast path (the hot, common case): the engine text contract NUL-terminates
|
|
84
|
+
* the buffer at/after p+extent, so this strtod is bounded and cannot leave the
|
|
85
|
+
* buffer; in the C locale it parses exactly the grammar bytes - the extent
|
|
86
|
+
* holds no sign/exponent/hex/INF/NAN for strtod to over-consume. When it
|
|
87
|
+
* consumed precisely `extent` bytes we are done, with no allocation. */
|
|
88
|
+
char *end = NULL;
|
|
89
|
+
double v = strtod(p, &end);
|
|
90
|
+
if (end != NULL && (size_t)(end - p) == extent) return v;
|
|
91
|
+
|
|
92
|
+
/* Disagreement. Either strtod consumed MORE (a number-like continuation abuts
|
|
93
|
+
* the extent in the surrounding buffer, e.g. "1e3" / "0x10" where only "1" /
|
|
94
|
+
* "0" is the Number) - excluded by the grammar - or LESS (a comma-decimal
|
|
95
|
+
* LC_NUMERIC stopped at '.'). Reparse exactly the extent in isolation. The
|
|
96
|
+
* copy is made ONLY here, off the hot path. */
|
|
97
|
+
char *copy = mkr_strndup(p, extent);
|
|
98
|
+
if (copy != NULL) {
|
|
99
|
+
char *cend = NULL;
|
|
100
|
+
double cv = strtod(copy, &cend);
|
|
101
|
+
int full = (cend != NULL && (size_t)(cend - copy) == extent);
|
|
102
|
+
free(copy);
|
|
103
|
+
if (full) return cv;
|
|
104
|
+
/* parsed but stopped short -> LC_NUMERIC '.' failure: fall through. */
|
|
105
|
+
}
|
|
106
|
+
/* OOM on the copy, or the locale '.' failure: assemble by hand (no alloc), so
|
|
107
|
+
* we fail closed rather than turning an OOM into a NaN or a raise. */
|
|
108
|
+
return mkr_xpath_number_manual(p, extent);
|
|
109
|
+
}
|
|
@@ -35,7 +35,7 @@ P_strndup(mkr_parser_t *P, const char *s, size_t n)
|
|
|
35
35
|
|
|
36
36
|
/* strndup into an owned-text AST slot (node-test name, literal, varref/fncall
|
|
37
37
|
* name). Returns 0 on success, -1 on OOM (P->err set, slot left {NULL,0}).
|
|
38
|
-
* Callers MUST propagate the failure
|
|
38
|
+
* Callers MUST propagate the failure - a {NULL,0} slot left in the AST would
|
|
39
39
|
* silently mis-compare at evaluation, so the parse fails closed instead. The
|
|
40
40
|
* stored length lets the evaluator compare names without a per-node strlen.
|
|
41
41
|
* `text` is a slice of the already-validated expr buffer, so the copy is valid. */
|
|
@@ -64,19 +64,12 @@ P_eat(mkr_parser_t *P, mkr_tok_kind_t k, const char *what)
|
|
|
64
64
|
return P_advance(P);
|
|
65
65
|
}
|
|
66
66
|
|
|
67
|
-
/* AST node allocator
|
|
68
|
-
* OOM and LIMIT distinctly. */
|
|
67
|
+
/* AST node allocator - the shared mkr_node_alloc with this parser's limits/err
|
|
68
|
+
* (counts against max_ast_nodes, reports OOM and LIMIT distinctly). */
|
|
69
69
|
static mkr_node_t *
|
|
70
70
|
new_node(mkr_parser_t *P, mkr_nk_t kind)
|
|
71
71
|
{
|
|
72
|
-
|
|
73
|
-
mkr_node_t *n = mkr_callocarray(1, sizeof(*n));
|
|
74
|
-
if (n == NULL) {
|
|
75
|
-
mkr_err_set(P->err, MKR_XPATH_ERR_OOM, "out of memory allocating AST node");
|
|
76
|
-
return NULL;
|
|
77
|
-
}
|
|
78
|
-
n->kind = kind;
|
|
79
|
-
return n;
|
|
72
|
+
return mkr_node_alloc(P->limits, P->err, kind);
|
|
80
73
|
}
|
|
81
74
|
|
|
82
75
|
/* ---------- axis lookup ---------- */
|
|
@@ -84,7 +77,7 @@ new_node(mkr_parser_t *P, mkr_nk_t kind)
|
|
|
84
77
|
static int
|
|
85
78
|
axis_by_name(const char *s, size_t n, mkr_axis_t *out)
|
|
86
79
|
{
|
|
87
|
-
#define A(name, val) do { if (n
|
|
80
|
+
#define A(name, val) do { if (mkr_bytes_eq(s, n, name, sizeof(name)-1)) { *out = val; return 1; } } while (0)
|
|
88
81
|
A("child", MKR_AXIS_CHILD);
|
|
89
82
|
A("descendant", MKR_AXIS_DESCENDANT);
|
|
90
83
|
A("parent", MKR_AXIS_PARENT);
|
|
@@ -105,10 +98,10 @@ axis_by_name(const char *s, size_t n, mkr_axis_t *out)
|
|
|
105
98
|
static int
|
|
106
99
|
is_nodetype_name(const char *s, size_t n)
|
|
107
100
|
{
|
|
108
|
-
return (
|
|
109
|
-
|| (
|
|
110
|
-
|| (
|
|
111
|
-
|| (
|
|
101
|
+
return mkr_bytes_eq(s, n, "node", 4)
|
|
102
|
+
|| mkr_bytes_eq(s, n, "text", 4)
|
|
103
|
+
|| mkr_bytes_eq(s, n, "comment", 7)
|
|
104
|
+
|| mkr_bytes_eq(s, n, "processing-instruction", 22);
|
|
112
105
|
}
|
|
113
106
|
|
|
114
107
|
/* ---------- forward decls ---------- */
|
|
@@ -146,6 +139,31 @@ make_implicit_step(mkr_step_t *out, mkr_axis_t axis, mkr_nt_kind_t nt_kind)
|
|
|
146
139
|
return 0;
|
|
147
140
|
}
|
|
148
141
|
|
|
142
|
+
/* Parse a run of `('/' | '//') Step` continuations onto the step array,
|
|
143
|
+
* expanding each `//` to an implicit descendant-or-self::node() step. TOK(P)
|
|
144
|
+
* must be positioned at the (possible) leading separator; a non-separator token
|
|
145
|
+
* makes this a no-op (zero iterations). On failure the steps pushed so far stay
|
|
146
|
+
* in the array - the caller's owning node frees them (the path/filter node, via
|
|
147
|
+
* mkr_node_free). This is the single home for the slash-step loop that the
|
|
148
|
+
* relative-path, absolute `//`, and filter-expr trailing-path forms all share. */
|
|
149
|
+
static int
|
|
150
|
+
parse_step_tail(mkr_parser_t *P, mkr_step_t **steps, size_t *nsteps, size_t *cap)
|
|
151
|
+
{
|
|
152
|
+
while (TOK(P).kind == MKR_TK_SLASH || TOK(P).kind == MKR_TK_DSLASH) {
|
|
153
|
+
int dslash = (TOK(P).kind == MKR_TK_DSLASH);
|
|
154
|
+
if (P_advance(P) != 0) return -1;
|
|
155
|
+
if (dslash) {
|
|
156
|
+
mkr_step_t implicit;
|
|
157
|
+
make_implicit_step(&implicit, MKR_AXIS_DESCENDANT_OR_SELF, MKR_NT_NODE);
|
|
158
|
+
if (push_step(P, steps, nsteps, cap, implicit) != 0) return -1;
|
|
159
|
+
}
|
|
160
|
+
mkr_step_t next = {0};
|
|
161
|
+
if (parse_step(P, &next) != 0) return -1;
|
|
162
|
+
if (push_step(P, steps, nsteps, cap, next) != 0) { mkr_step_clear(&next); return -1; }
|
|
163
|
+
}
|
|
164
|
+
return 0;
|
|
165
|
+
}
|
|
166
|
+
|
|
149
167
|
/* ---------- node-test parsing ---------- */
|
|
150
168
|
|
|
151
169
|
/*
|
|
@@ -173,9 +191,9 @@ parse_node_test(mkr_parser_t *P, mkr_axis_t axis, mkr_nodetest_t *out)
|
|
|
173
191
|
if (P_advance(P) != 0) return -1;
|
|
174
192
|
if (TOK(P).kind == MKR_TK_LPAREN) {
|
|
175
193
|
if (P_advance(P) != 0) return -1;
|
|
176
|
-
if (saved.text.
|
|
177
|
-
else if (saved.text.
|
|
178
|
-
else if (saved.text.
|
|
194
|
+
if (mkr_bytes_eq(saved.text.ptr, saved.text.len, "node", 4)) out->kind = MKR_NT_NODE;
|
|
195
|
+
else if (mkr_bytes_eq(saved.text.ptr, saved.text.len, "text", 4)) out->kind = MKR_NT_TEXT;
|
|
196
|
+
else if (mkr_bytes_eq(saved.text.ptr, saved.text.len, "comment", 7)) out->kind = MKR_NT_COMMENT;
|
|
179
197
|
else /* processing-instruction */ {
|
|
180
198
|
out->kind = MKR_NT_PI;
|
|
181
199
|
if (TOK(P).kind == MKR_TK_LITERAL) {
|
|
@@ -198,14 +216,15 @@ parse_node_test(mkr_parser_t *P, mkr_axis_t axis, mkr_nodetest_t *out)
|
|
|
198
216
|
}
|
|
199
217
|
|
|
200
218
|
if (TOK(P).kind == MKR_TK_QNAME) {
|
|
201
|
-
/* QName name test: `prefix:local` or `prefix:*`
|
|
219
|
+
/* QName name test: `prefix:local` or `prefix:*` - split at the colon. */
|
|
202
220
|
const char *s = TOK(P).text.ptr;
|
|
203
221
|
size_t n = TOK(P).text.len;
|
|
204
|
-
|
|
205
|
-
|
|
222
|
+
mkr_span_t sp = mkr_span(s, n);
|
|
223
|
+
size_t colon;
|
|
224
|
+
if (!mkr_span_find(&sp, ':', &colon)) colon = n;
|
|
206
225
|
if (P_fill_owned_text(P, mkr_borrowed_text(s, colon), &out->prefix) != 0) return -1;
|
|
207
226
|
if (n - colon - 1 == 1 && s[colon + 1] == '*') {
|
|
208
|
-
/* prefix:*
|
|
227
|
+
/* prefix:* - any element in the prefix's namespace. */
|
|
209
228
|
out->kind = MKR_NT_WILDCARD;
|
|
210
229
|
} else {
|
|
211
230
|
out->kind = MKR_NT_NAME;
|
|
@@ -249,8 +268,25 @@ parse_predicates(mkr_parser_t *P, mkr_node_t ***preds, size_t *npreds)
|
|
|
249
268
|
|
|
250
269
|
/* ---------- step ---------- */
|
|
251
270
|
|
|
271
|
+
static int parse_step_inner(mkr_parser_t *P, mkr_step_t *out);
|
|
272
|
+
|
|
273
|
+
/* Parse one Step into *out. On failure *out is CLEARED here - a failing
|
|
274
|
+
* parse_step_inner can leave a partially-built step behind (an owned name-test
|
|
275
|
+
* text already strndup'd, predicates already pushed) and the callers all just
|
|
276
|
+
* bail, so freeing the partial step in one place keeps every error path
|
|
277
|
+
* leak-free without per-call-site cleanup. */
|
|
252
278
|
static int
|
|
253
279
|
parse_step(mkr_parser_t *P, mkr_step_t *out)
|
|
280
|
+
{
|
|
281
|
+
if (parse_step_inner(P, out) == 0) {
|
|
282
|
+
return 0;
|
|
283
|
+
}
|
|
284
|
+
mkr_step_clear(out);
|
|
285
|
+
return -1;
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
static int
|
|
289
|
+
parse_step_inner(mkr_parser_t *P, mkr_step_t *out)
|
|
254
290
|
{
|
|
255
291
|
memset(out, 0, sizeof(*out));
|
|
256
292
|
|
|
@@ -292,9 +328,9 @@ parse_step(mkr_parser_t *P, mkr_step_t *out)
|
|
|
292
328
|
* but without re-consuming (we already advanced). */
|
|
293
329
|
if (is_nodetype_name(saved.text.ptr, saved.text.len) && TOK(P).kind == MKR_TK_LPAREN) {
|
|
294
330
|
if (P_advance(P) != 0) return -1;
|
|
295
|
-
if (saved.text.
|
|
296
|
-
else if (saved.text.
|
|
297
|
-
else if (saved.text.
|
|
331
|
+
if (mkr_bytes_eq(saved.text.ptr, saved.text.len, "node", 4)) out->test.kind = MKR_NT_NODE;
|
|
332
|
+
else if (mkr_bytes_eq(saved.text.ptr, saved.text.len, "text", 4)) out->test.kind = MKR_NT_TEXT;
|
|
333
|
+
else if (mkr_bytes_eq(saved.text.ptr, saved.text.len, "comment", 7)) out->test.kind = MKR_NT_COMMENT;
|
|
298
334
|
else {
|
|
299
335
|
out->test.kind = MKR_NT_PI;
|
|
300
336
|
if (TOK(P).kind == MKR_TK_LITERAL) {
|
|
@@ -330,19 +366,7 @@ parse_relative_path(mkr_parser_t *P, mkr_step_t **steps, size_t *nsteps)
|
|
|
330
366
|
if (parse_step(P, &s) != 0) return -1;
|
|
331
367
|
if (push_step(P, steps, nsteps, &cap, s) != 0) { mkr_step_clear(&s); return -1; }
|
|
332
368
|
|
|
333
|
-
|
|
334
|
-
int dslash = (TOK(P).kind == MKR_TK_DSLASH);
|
|
335
|
-
if (P_advance(P) != 0) return -1;
|
|
336
|
-
if (dslash) {
|
|
337
|
-
mkr_step_t implicit;
|
|
338
|
-
make_implicit_step(&implicit, MKR_AXIS_DESCENDANT_OR_SELF, MKR_NT_NODE);
|
|
339
|
-
if (push_step(P, steps, nsteps, &cap, implicit) != 0) return -1;
|
|
340
|
-
}
|
|
341
|
-
mkr_step_t next = {0};
|
|
342
|
-
if (parse_step(P, &next) != 0) return -1;
|
|
343
|
-
if (push_step(P, steps, nsteps, &cap, next) != 0) { mkr_step_clear(&next); return -1; }
|
|
344
|
-
}
|
|
345
|
-
return 0;
|
|
369
|
+
return parse_step_tail(P, steps, nsteps, &cap);
|
|
346
370
|
}
|
|
347
371
|
|
|
348
372
|
/* Returns 1 if the current token can begin a Step. */
|
|
@@ -373,28 +397,11 @@ parse_location_path(mkr_parser_t *P)
|
|
|
373
397
|
return n;
|
|
374
398
|
}
|
|
375
399
|
if (TOK(P).kind == MKR_TK_DSLASH) {
|
|
400
|
+
/* '//' = '/descendant-or-self::node()/'. Leave TOK at the DSLASH so the
|
|
401
|
+
* shared loop expands it (implicit step + the following step) itself. */
|
|
376
402
|
n->u.path.absolute = 1;
|
|
377
|
-
if (P_advance(P) != 0) goto fail;
|
|
378
|
-
/* '//' = '/descendant-or-self::node()/' */
|
|
379
403
|
size_t cap = 0;
|
|
380
|
-
|
|
381
|
-
make_implicit_step(&implicit, MKR_AXIS_DESCENDANT_OR_SELF, MKR_NT_NODE);
|
|
382
|
-
if (push_step(P, &n->u.path.steps, &n->u.path.nsteps, &cap, implicit) != 0) goto fail;
|
|
383
|
-
mkr_step_t s = {0};
|
|
384
|
-
if (parse_step(P, &s) != 0) goto fail;
|
|
385
|
-
if (push_step(P, &n->u.path.steps, &n->u.path.nsteps, &cap, s) != 0) { mkr_step_clear(&s); goto fail; }
|
|
386
|
-
while (TOK(P).kind == MKR_TK_SLASH || TOK(P).kind == MKR_TK_DSLASH) {
|
|
387
|
-
int dslash = (TOK(P).kind == MKR_TK_DSLASH);
|
|
388
|
-
if (P_advance(P) != 0) goto fail;
|
|
389
|
-
if (dslash) {
|
|
390
|
-
mkr_step_t imp2;
|
|
391
|
-
make_implicit_step(&imp2, MKR_AXIS_DESCENDANT_OR_SELF, MKR_NT_NODE);
|
|
392
|
-
if (push_step(P, &n->u.path.steps, &n->u.path.nsteps, &cap, imp2) != 0) goto fail;
|
|
393
|
-
}
|
|
394
|
-
mkr_step_t s2 = {0};
|
|
395
|
-
if (parse_step(P, &s2) != 0) goto fail;
|
|
396
|
-
if (push_step(P, &n->u.path.steps, &n->u.path.nsteps, &cap, s2) != 0) { mkr_step_clear(&s2); goto fail; }
|
|
397
|
-
}
|
|
404
|
+
if (parse_step_tail(P, &n->u.path.steps, &n->u.path.nsteps, &cap) != 0) goto fail;
|
|
398
405
|
return n;
|
|
399
406
|
}
|
|
400
407
|
/* Relative location path. */
|
|
@@ -423,8 +430,9 @@ parse_function_call(mkr_parser_t *P, mkr_token_t name_tok)
|
|
|
423
430
|
if (n == NULL) return NULL;
|
|
424
431
|
|
|
425
432
|
if (name_tok.kind == MKR_TK_QNAME) {
|
|
426
|
-
|
|
427
|
-
|
|
433
|
+
mkr_span_t sp = mkr_span(name_tok.text.ptr, name_tok.text.len);
|
|
434
|
+
size_t colon;
|
|
435
|
+
if (!mkr_span_find(&sp, ':', &colon)) colon = name_tok.text.len;
|
|
428
436
|
if (P_fill_owned_text(P, mkr_borrowed_text(name_tok.text.ptr, colon), &n->u.fncall.prefix) != 0) goto fail;
|
|
429
437
|
if (P_fill_owned_text(P, mkr_borrowed_text(name_tok.text.ptr + colon + 1, name_tok.text.len - colon - 1), &n->u.fncall.name) != 0) goto fail;
|
|
430
438
|
} else {
|
|
@@ -469,8 +477,9 @@ parse_primary(mkr_parser_t *P)
|
|
|
469
477
|
n = new_node(P, MKR_NK_VARREF);
|
|
470
478
|
if (n == NULL) return NULL;
|
|
471
479
|
if (TOK(P).kind == MKR_TK_QNAME) {
|
|
472
|
-
|
|
473
|
-
|
|
480
|
+
mkr_span_t sp = mkr_span(TOK(P).text.ptr, TOK(P).text.len);
|
|
481
|
+
size_t colon;
|
|
482
|
+
if (!mkr_span_find(&sp, ':', &colon)) colon = TOK(P).text.len;
|
|
474
483
|
if (P_fill_owned_text(P, mkr_borrowed_text(TOK(P).text.ptr, colon), &n->u.varref.prefix) != 0) { mkr_node_free(n); return NULL; }
|
|
475
484
|
if (P_fill_owned_text(P, mkr_borrowed_text(TOK(P).text.ptr + colon + 1, TOK(P).text.len - colon - 1), &n->u.varref.name) != 0) { mkr_node_free(n); return NULL; }
|
|
476
485
|
} else {
|
|
@@ -533,30 +542,12 @@ parse_filter_expr(mkr_parser_t *P)
|
|
|
533
542
|
mkr_node_free(f);
|
|
534
543
|
return NULL;
|
|
535
544
|
}
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
make_implicit_step(&implicit, MKR_AXIS_DESCENDANT_OR_SELF, MKR_NT_NODE);
|
|
543
|
-
if (push_step(P, &f->u.filter.path_steps, &f->u.filter.npath, &cap, implicit) != 0) { mkr_node_free(f); return NULL; }
|
|
544
|
-
}
|
|
545
|
-
mkr_step_t s = {0};
|
|
546
|
-
if (parse_step(P, &s) != 0) { mkr_node_free(f); return NULL; }
|
|
547
|
-
if (push_step(P, &f->u.filter.path_steps, &f->u.filter.npath, &cap, s) != 0) { mkr_step_clear(&s); mkr_node_free(f); return NULL; }
|
|
548
|
-
while (TOK(P).kind == MKR_TK_SLASH || TOK(P).kind == MKR_TK_DSLASH) {
|
|
549
|
-
int dd = (TOK(P).kind == MKR_TK_DSLASH);
|
|
550
|
-
if (P_advance(P) != 0) { mkr_node_free(f); return NULL; }
|
|
551
|
-
if (dd) {
|
|
552
|
-
mkr_step_t imp2;
|
|
553
|
-
make_implicit_step(&imp2, MKR_AXIS_DESCENDANT_OR_SELF, MKR_NT_NODE);
|
|
554
|
-
if (push_step(P, &f->u.filter.path_steps, &f->u.filter.npath, &cap, imp2) != 0) { mkr_node_free(f); return NULL; }
|
|
555
|
-
}
|
|
556
|
-
mkr_step_t s2 = {0};
|
|
557
|
-
if (parse_step(P, &s2) != 0) { mkr_node_free(f); return NULL; }
|
|
558
|
-
if (push_step(P, &f->u.filter.path_steps, &f->u.filter.npath, &cap, s2) != 0) { mkr_step_clear(&s2); mkr_node_free(f); return NULL; }
|
|
559
|
-
}
|
|
545
|
+
/* Optional trailing location path (e.g. `$x/foo`, `(expr)//bar`). The shared
|
|
546
|
+
* loop is a no-op when no separator follows, so call it unconditionally. */
|
|
547
|
+
size_t cap = 0;
|
|
548
|
+
if (parse_step_tail(P, &f->u.filter.path_steps, &f->u.filter.npath, &cap) != 0) {
|
|
549
|
+
mkr_node_free(f);
|
|
550
|
+
return NULL;
|
|
560
551
|
}
|
|
561
552
|
return f;
|
|
562
553
|
}
|
|
@@ -581,10 +572,8 @@ looks_like_filter_expr(mkr_parser_t *P)
|
|
|
581
572
|
if (TOK(P).kind == MKR_TK_NAME && is_nodetype_name(s, n)) {
|
|
582
573
|
return 0;
|
|
583
574
|
}
|
|
584
|
-
/* peek next
|
|
585
|
-
|
|
586
|
-
while (*p && (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r')) p++;
|
|
587
|
-
return (*p == '(');
|
|
575
|
+
/* peek next byte (skipping ws) for '(' - cheaper than running the lexer. */
|
|
576
|
+
return (mkr_lexer_peek_nonws(&P->L) == '(');
|
|
588
577
|
}
|
|
589
578
|
default:
|
|
590
579
|
return 0;
|
|
@@ -787,8 +776,8 @@ mkr_parse(mkr_verified_text_t expr, mkr_xpath_limits_t *limits, mkr_xpath_error_
|
|
|
787
776
|
mkr_parser_t P;
|
|
788
777
|
P.err = err;
|
|
789
778
|
P.limits = limits;
|
|
790
|
-
/* expr
|
|
791
|
-
mkr_lexer_init(&P.L, expr.ptr, err);
|
|
779
|
+
/* expr is a validated, NUL-terminated text of known length (mkr_verified_text_t). */
|
|
780
|
+
mkr_lexer_init(&P.L, expr.ptr, expr.len, err);
|
|
792
781
|
if (!P.L.good) return NULL;
|
|
793
782
|
|
|
794
783
|
mkr_node_t *root = parse_expr(&P);
|
|
@@ -804,7 +793,7 @@ mkr_parse(mkr_verified_text_t expr, mkr_xpath_limits_t *limits, mkr_xpath_error_
|
|
|
804
793
|
* pass so the latter sees the rewritten step structure. */
|
|
805
794
|
mkr_apply_peephole(root);
|
|
806
795
|
/* Static hoisting pass: marks subtrees that can be memoized during
|
|
807
|
-
* eval. Cheap (single AST walk) and runs once per parse
|
|
796
|
+
* eval. Cheap (single AST walk) and runs once per parse - cached by
|
|
808
797
|
* the wrapper-level AST cache. */
|
|
809
798
|
mkr_mark_context_independent(root);
|
|
810
799
|
return root;
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/* mkr_xpath_prelude_html.h - HTML engine-instance prelude.
|
|
2
|
+
*
|
|
3
|
+
* Included at the top of mkr_xpath_engine_html.c BEFORE the engine bodies,
|
|
4
|
+
* symmetric with mkr_xpath_prelude_xml.h. It binds the DOM types + the
|
|
5
|
+
* MKR_NODE_* node-access contract to Lexbor's lxb_dom, so the bodies compile
|
|
6
|
+
* against lxb_dom.
|
|
7
|
+
*
|
|
8
|
+
* The engine's internals are file-static (one merged TU per instance), so they
|
|
9
|
+
* never collide with the XML instance and need no renaming. Only the two
|
|
10
|
+
* node-dereferencing ENTRY points the driver dispatches on are external; the
|
|
11
|
+
* prelude suffixes them _html so they coexist with the XML instance's _xml
|
|
12
|
+
* pair. (mkr_xpath.c declares both and selects by engine_kind.) */
|
|
13
|
+
#ifndef MKR_XPATH_PRELUDE_HTML_H
|
|
14
|
+
#define MKR_XPATH_PRELUDE_HTML_H
|
|
15
|
+
|
|
16
|
+
#include <lexbor/dom/dom.h>
|
|
17
|
+
|
|
18
|
+
/* DOM types -> Lexbor (the type counterpart of MKR_NODE_*). */
|
|
19
|
+
#define MKR_DOM_NODE lxb_dom_node_t
|
|
20
|
+
#define MKR_DOM_ELEMENT lxb_dom_element_t
|
|
21
|
+
#define MKR_DOM_ATTR lxb_dom_attr_t
|
|
22
|
+
#define MKR_DOM_DOCUMENT lxb_dom_document_t
|
|
23
|
+
|
|
24
|
+
/* The two external entry points (the only symbols not file-static). */
|
|
25
|
+
#define mkr_eval_ast mkr_eval_ast_html
|
|
26
|
+
#define mkr_try_first_match mkr_try_first_match_html
|
|
27
|
+
|
|
28
|
+
#include "mkr_xpath_node_access_html.h" /* MKR_NODE_* for lxb_dom */
|
|
29
|
+
|
|
30
|
+
#endif /* MKR_XPATH_PRELUDE_HTML_H */
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/* mkr_xpath_prelude_xml.h - XML engine-instance prelude.
|
|
2
|
+
*
|
|
3
|
+
* Included at the top of mkr_xpath_engine_xml.c BEFORE the engine bodies. It
|
|
4
|
+
* binds the DOM types + MKR_NODE_* node-access contract to the custom
|
|
5
|
+
* mkr_xml_node_t (and selects the engine's XML host-policy via MKR_HOST_XML in
|
|
6
|
+
* the node-access header), so the same bodies compile for XML.
|
|
7
|
+
*
|
|
8
|
+
* The engine's internals are file-static (one merged TU per instance), so they
|
|
9
|
+
* coexist with the HTML instance without renaming. Only the two
|
|
10
|
+
* node-dereferencing ENTRY points the driver dispatches on are external; the
|
|
11
|
+
* prelude suffixes them _xml (the HTML prelude suffixes the same pair _html).
|
|
12
|
+
*/
|
|
13
|
+
#ifndef MKR_XPATH_PRELUDE_XML_H
|
|
14
|
+
#define MKR_XPATH_PRELUDE_XML_H
|
|
15
|
+
|
|
16
|
+
/* DOM types -> the custom node (the type counterpart of MKR_NODE_*). */
|
|
17
|
+
#define MKR_DOM_NODE mkr_xml_node_t
|
|
18
|
+
#define MKR_DOM_ELEMENT mkr_xml_node_t
|
|
19
|
+
#define MKR_DOM_ATTR mkr_xml_node_t
|
|
20
|
+
#define MKR_DOM_DOCUMENT mkr_xml_doc_t
|
|
21
|
+
|
|
22
|
+
/* The two external entry points (the only symbols not file-static). */
|
|
23
|
+
#define mkr_eval_ast mkr_eval_ast_xml
|
|
24
|
+
#define mkr_try_first_match mkr_try_first_match_xml
|
|
25
|
+
|
|
26
|
+
#include "mkr_xpath_node_access_xml.h" /* MKR_NODE_* + MKR_HOST_XML for the custom node */
|
|
27
|
+
|
|
28
|
+
#endif /* MKR_XPATH_PRELUDE_XML_H */
|