makiri 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/conformance.yml +22 -0
  3. data/.github/workflows/libfuzzer.yml +83 -0
  4. data/.github/workflows/release.yml +12 -7
  5. data/.github/workflows/security.yml +88 -3
  6. data/.github/workflows/valgrind.yml +135 -0
  7. data/CHANGELOG.md +152 -15
  8. data/README.md +183 -13
  9. data/Rakefile +294 -7
  10. data/ext/makiri/bridge/bridge.h +28 -0
  11. data/ext/makiri/bridge/ruby_string.c +282 -12
  12. data/ext/makiri/core/mkr_alloc.c +40 -3
  13. data/ext/makiri/core/mkr_alloc.h +28 -5
  14. data/ext/makiri/core/mkr_buf.c +47 -3
  15. data/ext/makiri/core/mkr_buf.h +112 -3
  16. data/ext/makiri/core/mkr_core.c +143 -0
  17. data/ext/makiri/core/mkr_core.h +11 -2
  18. data/ext/makiri/core/mkr_hash.h +1 -1
  19. data/ext/makiri/core/mkr_span.h +186 -0
  20. data/ext/makiri/core/mkr_text.h +8 -8
  21. data/ext/makiri/core/mkr_utf8.c +101 -0
  22. data/ext/makiri/core/mkr_utf8.h +88 -0
  23. data/ext/makiri/extconf.rb +123 -10
  24. data/ext/makiri/fuzz/Makefile +95 -0
  25. data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
  26. data/ext/makiri/fuzz/xml_fuzz.c +24 -0
  27. data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
  28. data/ext/makiri/glue/glue.h +55 -11
  29. data/ext/makiri/glue/ruby_doc.c +129 -59
  30. data/ext/makiri/glue/ruby_html_css.c +292 -0
  31. data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +248 -52
  32. data/ext/makiri/glue/ruby_html_node.c +859 -0
  33. data/ext/makiri/glue/ruby_html_serialize.c +154 -0
  34. data/ext/makiri/glue/ruby_node.c +74 -729
  35. data/ext/makiri/glue/ruby_node_set.c +167 -32
  36. data/ext/makiri/glue/ruby_xml.c +602 -0
  37. data/ext/makiri/glue/ruby_xml_node.c +1373 -0
  38. data/ext/makiri/glue/ruby_xpath.c +63 -30
  39. data/ext/makiri/glue/ruby_xpath.h +19 -0
  40. data/ext/makiri/lexbor_compat/compat.h +42 -9
  41. data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
  42. data/ext/makiri/lexbor_compat/dom_index.c +2 -2
  43. data/ext/makiri/lexbor_compat/post_parse.c +100 -10
  44. data/ext/makiri/lexbor_compat/source_loc.c +15 -13
  45. data/ext/makiri/lexbor_compat/text_index.c +14 -8
  46. data/ext/makiri/lexbor_compat/utf8_input.c +19 -33
  47. data/ext/makiri/makiri.c +184 -6
  48. data/ext/makiri/makiri.h +43 -2
  49. data/ext/makiri/xml/mkr_xml.h +125 -0
  50. data/ext/makiri/xml/mkr_xml_chars.c +195 -0
  51. data/ext/makiri/xml/mkr_xml_index.c +169 -0
  52. data/ext/makiri/xml/mkr_xml_index.h +48 -0
  53. data/ext/makiri/xml/mkr_xml_mutate.c +817 -0
  54. data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
  55. data/ext/makiri/xml/mkr_xml_node.c +399 -0
  56. data/ext/makiri/xml/mkr_xml_node.h +184 -0
  57. data/ext/makiri/xml/mkr_xml_tree.c +1515 -0
  58. data/ext/makiri/xpath/mkr_css.c +1023 -0
  59. data/ext/makiri/xpath/mkr_css.h +65 -0
  60. data/ext/makiri/xpath/mkr_xpath.c +96 -32
  61. data/ext/makiri/xpath/mkr_xpath.h +109 -4
  62. data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
  63. data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
  64. data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +551 -241
  65. data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +318 -276
  66. data/ext/makiri/xpath/mkr_xpath_internal.h +177 -206
  67. data/ext/makiri/xpath/mkr_xpath_lex.c +95 -125
  68. data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
  69. data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +145 -0
  70. data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
  71. data/ext/makiri/xpath/mkr_xpath_parse.c +83 -94
  72. data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
  73. data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
  74. data/ext/makiri/xpath/mkr_xpath_shared.c +609 -0
  75. data/ext/makiri/xpath/mkr_xpath_value_body.h +801 -0
  76. data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
  77. data/lib/makiri/{attribute.rb → attr.rb} +7 -3
  78. data/lib/makiri/cdata_section.rb +19 -0
  79. data/lib/makiri/comment.rb +10 -0
  80. data/lib/makiri/compat_aliases.rb +30 -0
  81. data/lib/makiri/document.rb +9 -73
  82. data/lib/makiri/document_fragment.rb +14 -9
  83. data/lib/makiri/element.rb +4 -4
  84. data/lib/makiri/html/document.rb +106 -0
  85. data/lib/makiri/html/node_methods.rb +19 -0
  86. data/lib/makiri/html.rb +12 -0
  87. data/lib/makiri/node.rb +58 -15
  88. data/lib/makiri/node_set.rb +8 -0
  89. data/lib/makiri/processing_instruction.rb +10 -0
  90. data/lib/makiri/text.rb +1 -1
  91. data/lib/makiri/version.rb +1 -1
  92. data/lib/makiri/xml/builder.rb +263 -0
  93. data/lib/makiri/xml/document.rb +24 -0
  94. data/lib/makiri/xml/node_methods.rb +84 -0
  95. data/lib/makiri/xml.rb +10 -0
  96. data/lib/makiri/xpath_context.rb +1 -1
  97. data/lib/makiri.rb +24 -5
  98. data/script/build_native_gem.rb +2 -2
  99. data/script/check_alloc_failures.rb +266 -0
  100. data/script/check_c_safety.rb +77 -2
  101. data/script/check_c_safety_allowlist.yml +102 -0
  102. data/script/check_leaks.rb +64 -0
  103. data/script/leaks_harness.rb +64 -0
  104. data/vendor/lexbor/CMakeLists.txt +6 -0
  105. data/vendor/lexbor/README.md +12 -0
  106. data/vendor/lexbor/config.cmake +1 -1
  107. data/vendor/lexbor/source/lexbor/core/base.h +1 -1
  108. data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
  109. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
  110. data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
  111. data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
  112. data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
  113. data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
  114. data/vendor/lexbor/source/lexbor/html/base.h +1 -1
  115. data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
  116. data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
  117. data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
  118. data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
  119. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
  120. data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
  121. data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
  122. data/vendor/lexbor/source/lexbor/url/base.h +1 -1
  123. data/vendor/lexbor/source/lexbor/url/url.c +5 -2
  124. data/vendor/lexbor/source/lexbor/url/url.h +9 -0
  125. data/vendor/lexbor/version +1 -1
  126. metadata +53 -9
  127. data/ext/makiri/glue/ruby_css.c +0 -185
  128. data/ext/makiri/glue/ruby_serialize.c +0 -92
  129. data/ext/makiri/xpath/mkr_xpath_value.c +0 -1286
  130. data/lib/makiri/cdata.rb +0 -6
@@ -0,0 +1,109 @@
1
+ /* mkr_xpath_number.c - XPath 1.0 Number parsing, grammar-exact and
2
+ * locale-independent.
3
+ *
4
+ * The Number production is `Digits ('.' Digits?)? | '.' Digits` - no sign, no
5
+ * exponent, no hex, decimal point only. C strtod accepts a superset of that
6
+ * (hex floats, exponents, INF/NAN words) and honours LC_NUMERIC, so the engine
7
+ * never hands strtod an unscanned buffer: the extent scanner below bounds the
8
+ * exact grammar bytes first, and the converter parses only those.
9
+ *
10
+ * This file is the ONE home of strtod in the engine (the lexer and the
11
+ * string->number coercion both come through here). It is compiled once and is
12
+ * representation-independent: it never touches a DOM node, only bytes.
13
+ */
14
+ #include "mkr_xpath_internal.h"
15
+ #include "../core/mkr_core.h"
16
+
17
+ #include <math.h>
18
+ #include <stdlib.h>
19
+
20
+ static inline int
21
+ mkr_is_ascii_digit(int c)
22
+ {
23
+ return c >= '0' && c <= '9';
24
+ }
25
+
26
+ size_t
27
+ mkr_xpath_number_extent(const char *p, size_t len)
28
+ {
29
+ /* All reads through the bounded span: mkr_span_peek is -1 past the end, so
30
+ * mkr_is_ascii_digit fails closed there and no scan can leave [p, p+len). */
31
+ mkr_span_t s = mkr_span(p, len);
32
+ const char *start = mkr_span_mark(&s);
33
+
34
+ if (mkr_is_ascii_digit(mkr_span_peek(&s))) {
35
+ /* Digits ('.' Digits?)? -> "5", "5.", "5.5" */
36
+ while (mkr_is_ascii_digit(mkr_span_peek(&s))) mkr_span_skip(&s, 1);
37
+ if (mkr_span_peek(&s) == '.') {
38
+ mkr_span_skip(&s, 1);
39
+ while (mkr_is_ascii_digit(mkr_span_peek(&s))) mkr_span_skip(&s, 1);
40
+ }
41
+ return mkr_span_since(&s, start);
42
+ }
43
+ if (mkr_span_peek(&s) == '.') {
44
+ /* '.' Digits -> ".5" (a bare "." is NOT a Number) */
45
+ mkr_span_skip(&s, 1);
46
+ if (!mkr_is_ascii_digit(mkr_span_peek(&s))) return 0;
47
+ while (mkr_is_ascii_digit(mkr_span_peek(&s))) mkr_span_skip(&s, 1);
48
+ return mkr_span_since(&s, start);
49
+ }
50
+ return 0;
51
+ }
52
+
53
+ /* Allocation-free, locale-independent assembly of the (already grammar-checked)
54
+ * extent. Reached only when LC_NUMERIC != C makes the libc '.' parse fail, or
55
+ * when the isolating reparse copy can't be allocated (OOM). This corner trades
56
+ * correctly-rounded parsing for locale independence and never-failing: it builds
57
+ * the value digit-by-digit (libxml2-precision-class), so it can never raise and
58
+ * never mis-classifies the grammar. */
59
+ static double
60
+ mkr_xpath_number_manual(const char *p, size_t extent)
61
+ {
62
+ double v = 0.0;
63
+ size_t i = 0;
64
+ for (; i < extent && p[i] >= '0' && p[i] <= '9'; i++) {
65
+ v = v * 10.0 + (double)(p[i] - '0');
66
+ }
67
+ if (i < extent && p[i] == '.') {
68
+ i++;
69
+ double scale = 1.0;
70
+ for (; i < extent && p[i] >= '0' && p[i] <= '9'; i++) {
71
+ scale /= 10.0;
72
+ v += (double)(p[i] - '0') * scale;
73
+ }
74
+ }
75
+ return v;
76
+ }
77
+
78
+ double
79
+ mkr_xpath_number_from_extent(const char *p, size_t extent)
80
+ {
81
+ if (p == NULL || extent == 0) return (double)NAN;
82
+
83
+ /* Fast path (the hot, common case): the engine text contract NUL-terminates
84
+ * the buffer at/after p+extent, so this strtod is bounded and cannot leave the
85
+ * buffer; in the C locale it parses exactly the grammar bytes - the extent
86
+ * holds no sign/exponent/hex/INF/NAN for strtod to over-consume. When it
87
+ * consumed precisely `extent` bytes we are done, with no allocation. */
88
+ char *end = NULL;
89
+ double v = strtod(p, &end);
90
+ if (end != NULL && (size_t)(end - p) == extent) return v;
91
+
92
+ /* Disagreement. Either strtod consumed MORE (a number-like continuation abuts
93
+ * the extent in the surrounding buffer, e.g. "1e3" / "0x10" where only "1" /
94
+ * "0" is the Number) - excluded by the grammar - or LESS (a comma-decimal
95
+ * LC_NUMERIC stopped at '.'). Reparse exactly the extent in isolation. The
96
+ * copy is made ONLY here, off the hot path. */
97
+ char *copy = mkr_strndup(p, extent);
98
+ if (copy != NULL) {
99
+ char *cend = NULL;
100
+ double cv = strtod(copy, &cend);
101
+ int full = (cend != NULL && (size_t)(cend - copy) == extent);
102
+ free(copy);
103
+ if (full) return cv;
104
+ /* parsed but stopped short -> LC_NUMERIC '.' failure: fall through. */
105
+ }
106
+ /* OOM on the copy, or the locale '.' failure: assemble by hand (no alloc), so
107
+ * we fail closed rather than turning an OOM into a NaN or a raise. */
108
+ return mkr_xpath_number_manual(p, extent);
109
+ }
@@ -35,7 +35,7 @@ P_strndup(mkr_parser_t *P, const char *s, size_t n)
35
35
 
36
36
  /* strndup into an owned-text AST slot (node-test name, literal, varref/fncall
37
37
  * name). Returns 0 on success, -1 on OOM (P->err set, slot left {NULL,0}).
38
- * Callers MUST propagate the failure — a {NULL,0} slot left in the AST would
38
+ * Callers MUST propagate the failure - a {NULL,0} slot left in the AST would
39
39
  * silently mis-compare at evaluation, so the parse fails closed instead. The
40
40
  * stored length lets the evaluator compare names without a per-node strlen.
41
41
  * `text` is a slice of the already-validated expr buffer, so the copy is valid. */
@@ -64,19 +64,12 @@ P_eat(mkr_parser_t *P, mkr_tok_kind_t k, const char *what)
64
64
  return P_advance(P);
65
65
  }
66
66
 
67
- /* AST node allocator. Counts against max_ast_nodes and reports both
68
- * OOM and LIMIT distinctly. */
67
+ /* AST node allocator - the shared mkr_node_alloc with this parser's limits/err
68
+ * (counts against max_ast_nodes, reports OOM and LIMIT distinctly). */
69
69
  static mkr_node_t *
70
70
  new_node(mkr_parser_t *P, mkr_nk_t kind)
71
71
  {
72
- if (mkr_limit_ast_node(P->limits, P->err) != 0) return NULL;
73
- mkr_node_t *n = mkr_callocarray(1, sizeof(*n));
74
- if (n == NULL) {
75
- mkr_err_set(P->err, MKR_XPATH_ERR_OOM, "out of memory allocating AST node");
76
- return NULL;
77
- }
78
- n->kind = kind;
79
- return n;
72
+ return mkr_node_alloc(P->limits, P->err, kind);
80
73
  }
81
74
 
82
75
  /* ---------- axis lookup ---------- */
@@ -84,7 +77,7 @@ new_node(mkr_parser_t *P, mkr_nk_t kind)
84
77
  static int
85
78
  axis_by_name(const char *s, size_t n, mkr_axis_t *out)
86
79
  {
87
- #define A(name, val) do { if (n == sizeof(name)-1 && memcmp(s, name, n) == 0) { *out = val; return 1; } } while (0)
80
+ #define A(name, val) do { if (mkr_bytes_eq(s, n, name, sizeof(name)-1)) { *out = val; return 1; } } while (0)
88
81
  A("child", MKR_AXIS_CHILD);
89
82
  A("descendant", MKR_AXIS_DESCENDANT);
90
83
  A("parent", MKR_AXIS_PARENT);
@@ -105,10 +98,10 @@ axis_by_name(const char *s, size_t n, mkr_axis_t *out)
105
98
  static int
106
99
  is_nodetype_name(const char *s, size_t n)
107
100
  {
108
- return (n == 4 && memcmp(s, "node", 4) == 0)
109
- || (n == 4 && memcmp(s, "text", 4) == 0)
110
- || (n == 7 && memcmp(s, "comment", 7) == 0)
111
- || (n == 22 && memcmp(s, "processing-instruction", 22) == 0);
101
+ return mkr_bytes_eq(s, n, "node", 4)
102
+ || mkr_bytes_eq(s, n, "text", 4)
103
+ || mkr_bytes_eq(s, n, "comment", 7)
104
+ || mkr_bytes_eq(s, n, "processing-instruction", 22);
112
105
  }
113
106
 
114
107
  /* ---------- forward decls ---------- */
@@ -146,6 +139,31 @@ make_implicit_step(mkr_step_t *out, mkr_axis_t axis, mkr_nt_kind_t nt_kind)
146
139
  return 0;
147
140
  }
148
141
 
142
+ /* Parse a run of `('/' | '//') Step` continuations onto the step array,
143
+ * expanding each `//` to an implicit descendant-or-self::node() step. TOK(P)
144
+ * must be positioned at the (possible) leading separator; a non-separator token
145
+ * makes this a no-op (zero iterations). On failure the steps pushed so far stay
146
+ * in the array - the caller's owning node frees them (the path/filter node, via
147
+ * mkr_node_free). This is the single home for the slash-step loop that the
148
+ * relative-path, absolute `//`, and filter-expr trailing-path forms all share. */
149
+ static int
150
+ parse_step_tail(mkr_parser_t *P, mkr_step_t **steps, size_t *nsteps, size_t *cap)
151
+ {
152
+ while (TOK(P).kind == MKR_TK_SLASH || TOK(P).kind == MKR_TK_DSLASH) {
153
+ int dslash = (TOK(P).kind == MKR_TK_DSLASH);
154
+ if (P_advance(P) != 0) return -1;
155
+ if (dslash) {
156
+ mkr_step_t implicit;
157
+ make_implicit_step(&implicit, MKR_AXIS_DESCENDANT_OR_SELF, MKR_NT_NODE);
158
+ if (push_step(P, steps, nsteps, cap, implicit) != 0) return -1;
159
+ }
160
+ mkr_step_t next = {0};
161
+ if (parse_step(P, &next) != 0) return -1;
162
+ if (push_step(P, steps, nsteps, cap, next) != 0) { mkr_step_clear(&next); return -1; }
163
+ }
164
+ return 0;
165
+ }
166
+
149
167
  /* ---------- node-test parsing ---------- */
150
168
 
151
169
  /*
@@ -173,9 +191,9 @@ parse_node_test(mkr_parser_t *P, mkr_axis_t axis, mkr_nodetest_t *out)
173
191
  if (P_advance(P) != 0) return -1;
174
192
  if (TOK(P).kind == MKR_TK_LPAREN) {
175
193
  if (P_advance(P) != 0) return -1;
176
- if (saved.text.len == 4 && memcmp(saved.text.ptr, "node", 4) == 0) out->kind = MKR_NT_NODE;
177
- else if (saved.text.len == 4 && memcmp(saved.text.ptr, "text", 4) == 0) out->kind = MKR_NT_TEXT;
178
- else if (saved.text.len == 7 && memcmp(saved.text.ptr, "comment", 7) == 0) out->kind = MKR_NT_COMMENT;
194
+ if (mkr_bytes_eq(saved.text.ptr, saved.text.len, "node", 4)) out->kind = MKR_NT_NODE;
195
+ else if (mkr_bytes_eq(saved.text.ptr, saved.text.len, "text", 4)) out->kind = MKR_NT_TEXT;
196
+ else if (mkr_bytes_eq(saved.text.ptr, saved.text.len, "comment", 7)) out->kind = MKR_NT_COMMENT;
179
197
  else /* processing-instruction */ {
180
198
  out->kind = MKR_NT_PI;
181
199
  if (TOK(P).kind == MKR_TK_LITERAL) {
@@ -198,14 +216,15 @@ parse_node_test(mkr_parser_t *P, mkr_axis_t axis, mkr_nodetest_t *out)
198
216
  }
199
217
 
200
218
  if (TOK(P).kind == MKR_TK_QNAME) {
201
- /* QName name test: `prefix:local` or `prefix:*` — split at the colon. */
219
+ /* QName name test: `prefix:local` or `prefix:*` - split at the colon. */
202
220
  const char *s = TOK(P).text.ptr;
203
221
  size_t n = TOK(P).text.len;
204
- size_t colon = 0;
205
- while (colon < n && s[colon] != ':') colon++;
222
+ mkr_span_t sp = mkr_span(s, n);
223
+ size_t colon;
224
+ if (!mkr_span_find(&sp, ':', &colon)) colon = n;
206
225
  if (P_fill_owned_text(P, mkr_borrowed_text(s, colon), &out->prefix) != 0) return -1;
207
226
  if (n - colon - 1 == 1 && s[colon + 1] == '*') {
208
- /* prefix:* — any element in the prefix's namespace. */
227
+ /* prefix:* - any element in the prefix's namespace. */
209
228
  out->kind = MKR_NT_WILDCARD;
210
229
  } else {
211
230
  out->kind = MKR_NT_NAME;
@@ -249,8 +268,25 @@ parse_predicates(mkr_parser_t *P, mkr_node_t ***preds, size_t *npreds)
249
268
 
250
269
  /* ---------- step ---------- */
251
270
 
271
+ static int parse_step_inner(mkr_parser_t *P, mkr_step_t *out);
272
+
273
+ /* Parse one Step into *out. On failure *out is CLEARED here - a failing
274
+ * parse_step_inner can leave a partially-built step behind (an owned name-test
275
+ * text already strndup'd, predicates already pushed) and the callers all just
276
+ * bail, so freeing the partial step in one place keeps every error path
277
+ * leak-free without per-call-site cleanup. */
252
278
  static int
253
279
  parse_step(mkr_parser_t *P, mkr_step_t *out)
280
+ {
281
+ if (parse_step_inner(P, out) == 0) {
282
+ return 0;
283
+ }
284
+ mkr_step_clear(out);
285
+ return -1;
286
+ }
287
+
288
+ static int
289
+ parse_step_inner(mkr_parser_t *P, mkr_step_t *out)
254
290
  {
255
291
  memset(out, 0, sizeof(*out));
256
292
 
@@ -292,9 +328,9 @@ parse_step(mkr_parser_t *P, mkr_step_t *out)
292
328
  * but without re-consuming (we already advanced). */
293
329
  if (is_nodetype_name(saved.text.ptr, saved.text.len) && TOK(P).kind == MKR_TK_LPAREN) {
294
330
  if (P_advance(P) != 0) return -1;
295
- if (saved.text.len == 4 && memcmp(saved.text.ptr, "node", 4) == 0) out->test.kind = MKR_NT_NODE;
296
- else if (saved.text.len == 4 && memcmp(saved.text.ptr, "text", 4) == 0) out->test.kind = MKR_NT_TEXT;
297
- else if (saved.text.len == 7 && memcmp(saved.text.ptr, "comment", 7) == 0) out->test.kind = MKR_NT_COMMENT;
331
+ if (mkr_bytes_eq(saved.text.ptr, saved.text.len, "node", 4)) out->test.kind = MKR_NT_NODE;
332
+ else if (mkr_bytes_eq(saved.text.ptr, saved.text.len, "text", 4)) out->test.kind = MKR_NT_TEXT;
333
+ else if (mkr_bytes_eq(saved.text.ptr, saved.text.len, "comment", 7)) out->test.kind = MKR_NT_COMMENT;
298
334
  else {
299
335
  out->test.kind = MKR_NT_PI;
300
336
  if (TOK(P).kind == MKR_TK_LITERAL) {
@@ -330,19 +366,7 @@ parse_relative_path(mkr_parser_t *P, mkr_step_t **steps, size_t *nsteps)
330
366
  if (parse_step(P, &s) != 0) return -1;
331
367
  if (push_step(P, steps, nsteps, &cap, s) != 0) { mkr_step_clear(&s); return -1; }
332
368
 
333
- while (TOK(P).kind == MKR_TK_SLASH || TOK(P).kind == MKR_TK_DSLASH) {
334
- int dslash = (TOK(P).kind == MKR_TK_DSLASH);
335
- if (P_advance(P) != 0) return -1;
336
- if (dslash) {
337
- mkr_step_t implicit;
338
- make_implicit_step(&implicit, MKR_AXIS_DESCENDANT_OR_SELF, MKR_NT_NODE);
339
- if (push_step(P, steps, nsteps, &cap, implicit) != 0) return -1;
340
- }
341
- mkr_step_t next = {0};
342
- if (parse_step(P, &next) != 0) return -1;
343
- if (push_step(P, steps, nsteps, &cap, next) != 0) { mkr_step_clear(&next); return -1; }
344
- }
345
- return 0;
369
+ return parse_step_tail(P, steps, nsteps, &cap);
346
370
  }
347
371
 
348
372
  /* Returns 1 if the current token can begin a Step. */
@@ -373,28 +397,11 @@ parse_location_path(mkr_parser_t *P)
373
397
  return n;
374
398
  }
375
399
  if (TOK(P).kind == MKR_TK_DSLASH) {
400
+ /* '//' = '/descendant-or-self::node()/'. Leave TOK at the DSLASH so the
401
+ * shared loop expands it (implicit step + the following step) itself. */
376
402
  n->u.path.absolute = 1;
377
- if (P_advance(P) != 0) goto fail;
378
- /* '//' = '/descendant-or-self::node()/' */
379
403
  size_t cap = 0;
380
- mkr_step_t implicit;
381
- make_implicit_step(&implicit, MKR_AXIS_DESCENDANT_OR_SELF, MKR_NT_NODE);
382
- if (push_step(P, &n->u.path.steps, &n->u.path.nsteps, &cap, implicit) != 0) goto fail;
383
- mkr_step_t s = {0};
384
- if (parse_step(P, &s) != 0) goto fail;
385
- if (push_step(P, &n->u.path.steps, &n->u.path.nsteps, &cap, s) != 0) { mkr_step_clear(&s); goto fail; }
386
- while (TOK(P).kind == MKR_TK_SLASH || TOK(P).kind == MKR_TK_DSLASH) {
387
- int dslash = (TOK(P).kind == MKR_TK_DSLASH);
388
- if (P_advance(P) != 0) goto fail;
389
- if (dslash) {
390
- mkr_step_t imp2;
391
- make_implicit_step(&imp2, MKR_AXIS_DESCENDANT_OR_SELF, MKR_NT_NODE);
392
- if (push_step(P, &n->u.path.steps, &n->u.path.nsteps, &cap, imp2) != 0) goto fail;
393
- }
394
- mkr_step_t s2 = {0};
395
- if (parse_step(P, &s2) != 0) goto fail;
396
- if (push_step(P, &n->u.path.steps, &n->u.path.nsteps, &cap, s2) != 0) { mkr_step_clear(&s2); goto fail; }
397
- }
404
+ if (parse_step_tail(P, &n->u.path.steps, &n->u.path.nsteps, &cap) != 0) goto fail;
398
405
  return n;
399
406
  }
400
407
  /* Relative location path. */
@@ -423,8 +430,9 @@ parse_function_call(mkr_parser_t *P, mkr_token_t name_tok)
423
430
  if (n == NULL) return NULL;
424
431
 
425
432
  if (name_tok.kind == MKR_TK_QNAME) {
426
- size_t colon = 0;
427
- while (colon < name_tok.text.len && name_tok.text.ptr[colon] != ':') colon++;
433
+ mkr_span_t sp = mkr_span(name_tok.text.ptr, name_tok.text.len);
434
+ size_t colon;
435
+ if (!mkr_span_find(&sp, ':', &colon)) colon = name_tok.text.len;
428
436
  if (P_fill_owned_text(P, mkr_borrowed_text(name_tok.text.ptr, colon), &n->u.fncall.prefix) != 0) goto fail;
429
437
  if (P_fill_owned_text(P, mkr_borrowed_text(name_tok.text.ptr + colon + 1, name_tok.text.len - colon - 1), &n->u.fncall.name) != 0) goto fail;
430
438
  } else {
@@ -469,8 +477,9 @@ parse_primary(mkr_parser_t *P)
469
477
  n = new_node(P, MKR_NK_VARREF);
470
478
  if (n == NULL) return NULL;
471
479
  if (TOK(P).kind == MKR_TK_QNAME) {
472
- size_t colon = 0;
473
- while (colon < TOK(P).text.len && TOK(P).text.ptr[colon] != ':') colon++;
480
+ mkr_span_t sp = mkr_span(TOK(P).text.ptr, TOK(P).text.len);
481
+ size_t colon;
482
+ if (!mkr_span_find(&sp, ':', &colon)) colon = TOK(P).text.len;
474
483
  if (P_fill_owned_text(P, mkr_borrowed_text(TOK(P).text.ptr, colon), &n->u.varref.prefix) != 0) { mkr_node_free(n); return NULL; }
475
484
  if (P_fill_owned_text(P, mkr_borrowed_text(TOK(P).text.ptr + colon + 1, TOK(P).text.len - colon - 1), &n->u.varref.name) != 0) { mkr_node_free(n); return NULL; }
476
485
  } else {
@@ -533,30 +542,12 @@ parse_filter_expr(mkr_parser_t *P)
533
542
  mkr_node_free(f);
534
543
  return NULL;
535
544
  }
536
- if (TOK(P).kind == MKR_TK_SLASH || TOK(P).kind == MKR_TK_DSLASH) {
537
- int dslash = (TOK(P).kind == MKR_TK_DSLASH);
538
- if (P_advance(P) != 0) { mkr_node_free(f); return NULL; }
539
- size_t cap = 0;
540
- if (dslash) {
541
- mkr_step_t implicit;
542
- make_implicit_step(&implicit, MKR_AXIS_DESCENDANT_OR_SELF, MKR_NT_NODE);
543
- if (push_step(P, &f->u.filter.path_steps, &f->u.filter.npath, &cap, implicit) != 0) { mkr_node_free(f); return NULL; }
544
- }
545
- mkr_step_t s = {0};
546
- if (parse_step(P, &s) != 0) { mkr_node_free(f); return NULL; }
547
- if (push_step(P, &f->u.filter.path_steps, &f->u.filter.npath, &cap, s) != 0) { mkr_step_clear(&s); mkr_node_free(f); return NULL; }
548
- while (TOK(P).kind == MKR_TK_SLASH || TOK(P).kind == MKR_TK_DSLASH) {
549
- int dd = (TOK(P).kind == MKR_TK_DSLASH);
550
- if (P_advance(P) != 0) { mkr_node_free(f); return NULL; }
551
- if (dd) {
552
- mkr_step_t imp2;
553
- make_implicit_step(&imp2, MKR_AXIS_DESCENDANT_OR_SELF, MKR_NT_NODE);
554
- if (push_step(P, &f->u.filter.path_steps, &f->u.filter.npath, &cap, imp2) != 0) { mkr_node_free(f); return NULL; }
555
- }
556
- mkr_step_t s2 = {0};
557
- if (parse_step(P, &s2) != 0) { mkr_node_free(f); return NULL; }
558
- if (push_step(P, &f->u.filter.path_steps, &f->u.filter.npath, &cap, s2) != 0) { mkr_step_clear(&s2); mkr_node_free(f); return NULL; }
559
- }
545
+ /* Optional trailing location path (e.g. `$x/foo`, `(expr)//bar`). The shared
546
+ * loop is a no-op when no separator follows, so call it unconditionally. */
547
+ size_t cap = 0;
548
+ if (parse_step_tail(P, &f->u.filter.path_steps, &f->u.filter.npath, &cap) != 0) {
549
+ mkr_node_free(f);
550
+ return NULL;
560
551
  }
561
552
  return f;
562
553
  }
@@ -581,10 +572,8 @@ looks_like_filter_expr(mkr_parser_t *P)
581
572
  if (TOK(P).kind == MKR_TK_NAME && is_nodetype_name(s, n)) {
582
573
  return 0;
583
574
  }
584
- /* peek next char (skipping ws) for '(' — cheaper than running the lexer. */
585
- const char *p = P->L.cur;
586
- while (*p && (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r')) p++;
587
- return (*p == '(');
575
+ /* peek next byte (skipping ws) for '(' - cheaper than running the lexer. */
576
+ return (mkr_lexer_peek_nonws(&P->L) == '(');
588
577
  }
589
578
  default:
590
579
  return 0;
@@ -787,8 +776,8 @@ mkr_parse(mkr_verified_text_t expr, mkr_xpath_limits_t *limits, mkr_xpath_error_
787
776
  mkr_parser_t P;
788
777
  P.err = err;
789
778
  P.limits = limits;
790
- /* expr.ptr is a validated, NUL-terminated C string (mkr_verified_text_t). */
791
- mkr_lexer_init(&P.L, expr.ptr, err);
779
+ /* expr is a validated, NUL-terminated text of known length (mkr_verified_text_t). */
780
+ mkr_lexer_init(&P.L, expr.ptr, expr.len, err);
792
781
  if (!P.L.good) return NULL;
793
782
 
794
783
  mkr_node_t *root = parse_expr(&P);
@@ -804,7 +793,7 @@ mkr_parse(mkr_verified_text_t expr, mkr_xpath_limits_t *limits, mkr_xpath_error_
804
793
  * pass so the latter sees the rewritten step structure. */
805
794
  mkr_apply_peephole(root);
806
795
  /* Static hoisting pass: marks subtrees that can be memoized during
807
- * eval. Cheap (single AST walk) and runs once per parse — cached by
796
+ * eval. Cheap (single AST walk) and runs once per parse - cached by
808
797
  * the wrapper-level AST cache. */
809
798
  mkr_mark_context_independent(root);
810
799
  return root;
@@ -0,0 +1,30 @@
1
+ /* mkr_xpath_prelude_html.h - HTML engine-instance prelude.
2
+ *
3
+ * Included at the top of mkr_xpath_engine_html.c BEFORE the engine bodies,
4
+ * symmetric with mkr_xpath_prelude_xml.h. It binds the DOM types + the
5
+ * MKR_NODE_* node-access contract to Lexbor's lxb_dom, so the bodies compile
6
+ * against lxb_dom.
7
+ *
8
+ * The engine's internals are file-static (one merged TU per instance), so they
9
+ * never collide with the XML instance and need no renaming. Only the two
10
+ * node-dereferencing ENTRY points the driver dispatches on are external; the
11
+ * prelude suffixes them _html so they coexist with the XML instance's _xml
12
+ * pair. (mkr_xpath.c declares both and selects by engine_kind.) */
13
+ #ifndef MKR_XPATH_PRELUDE_HTML_H
14
+ #define MKR_XPATH_PRELUDE_HTML_H
15
+
16
+ #include <lexbor/dom/dom.h>
17
+
18
+ /* DOM types -> Lexbor (the type counterpart of MKR_NODE_*). */
19
+ #define MKR_DOM_NODE lxb_dom_node_t
20
+ #define MKR_DOM_ELEMENT lxb_dom_element_t
21
+ #define MKR_DOM_ATTR lxb_dom_attr_t
22
+ #define MKR_DOM_DOCUMENT lxb_dom_document_t
23
+
24
+ /* The two external entry points (the only symbols not file-static). */
25
+ #define mkr_eval_ast mkr_eval_ast_html
26
+ #define mkr_try_first_match mkr_try_first_match_html
27
+
28
+ #include "mkr_xpath_node_access_html.h" /* MKR_NODE_* for lxb_dom */
29
+
30
+ #endif /* MKR_XPATH_PRELUDE_HTML_H */
@@ -0,0 +1,28 @@
1
+ /* mkr_xpath_prelude_xml.h - XML engine-instance prelude.
2
+ *
3
+ * Included at the top of mkr_xpath_engine_xml.c BEFORE the engine bodies. It
4
+ * binds the DOM types + MKR_NODE_* node-access contract to the custom
5
+ * mkr_xml_node_t (and selects the engine's XML host-policy via MKR_HOST_XML in
6
+ * the node-access header), so the same bodies compile for XML.
7
+ *
8
+ * The engine's internals are file-static (one merged TU per instance), so they
9
+ * coexist with the HTML instance without renaming. Only the two
10
+ * node-dereferencing ENTRY points the driver dispatches on are external; the
11
+ * prelude suffixes them _xml (the HTML prelude suffixes the same pair _html).
12
+ */
13
+ #ifndef MKR_XPATH_PRELUDE_XML_H
14
+ #define MKR_XPATH_PRELUDE_XML_H
15
+
16
+ /* DOM types -> the custom node (the type counterpart of MKR_NODE_*). */
17
+ #define MKR_DOM_NODE mkr_xml_node_t
18
+ #define MKR_DOM_ELEMENT mkr_xml_node_t
19
+ #define MKR_DOM_ATTR mkr_xml_node_t
20
+ #define MKR_DOM_DOCUMENT mkr_xml_doc_t
21
+
22
+ /* The two external entry points (the only symbols not file-static). */
23
+ #define mkr_eval_ast mkr_eval_ast_xml
24
+ #define mkr_try_first_match mkr_try_first_match_xml
25
+
26
+ #include "mkr_xpath_node_access_xml.h" /* MKR_NODE_* + MKR_HOST_XML for the custom node */
27
+
28
+ #endif /* MKR_XPATH_PRELUDE_XML_H */