makiri 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/conformance.yml +22 -0
  3. data/.github/workflows/libfuzzer.yml +83 -0
  4. data/.github/workflows/security.yml +88 -3
  5. data/.github/workflows/valgrind.yml +135 -0
  6. data/CHANGELOG.md +60 -2
  7. data/README.md +81 -77
  8. data/Rakefile +194 -3
  9. data/ext/makiri/bridge/ruby_string.c +119 -66
  10. data/ext/makiri/core/mkr_alloc.c +40 -3
  11. data/ext/makiri/core/mkr_alloc.h +27 -4
  12. data/ext/makiri/core/mkr_buf.c +13 -3
  13. data/ext/makiri/core/mkr_buf.h +80 -5
  14. data/ext/makiri/core/mkr_core.c +143 -0
  15. data/ext/makiri/core/mkr_core.h +10 -1
  16. data/ext/makiri/core/mkr_span.h +186 -0
  17. data/ext/makiri/core/mkr_utf8.c +101 -0
  18. data/ext/makiri/core/mkr_utf8.h +88 -0
  19. data/ext/makiri/extconf.rb +104 -9
  20. data/ext/makiri/fuzz/Makefile +95 -0
  21. data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
  22. data/ext/makiri/fuzz/xml_fuzz.c +24 -0
  23. data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
  24. data/ext/makiri/glue/glue.h +8 -0
  25. data/ext/makiri/glue/ruby_doc.c +20 -24
  26. data/ext/makiri/glue/ruby_html_css.c +58 -12
  27. data/ext/makiri/glue/ruby_html_mutate.c +11 -6
  28. data/ext/makiri/glue/ruby_html_node.c +3 -32
  29. data/ext/makiri/glue/ruby_node.c +39 -0
  30. data/ext/makiri/glue/ruby_xml.c +198 -16
  31. data/ext/makiri/glue/ruby_xml_node.c +46 -59
  32. data/ext/makiri/glue/ruby_xpath.c +4 -4
  33. data/ext/makiri/lexbor_compat/source_loc.c +14 -16
  34. data/ext/makiri/lexbor_compat/utf8_input.c +5 -78
  35. data/ext/makiri/makiri.c +45 -0
  36. data/ext/makiri/xml/mkr_xml.h +2 -3
  37. data/ext/makiri/xml/mkr_xml_chars.c +67 -97
  38. data/ext/makiri/xml/mkr_xml_index.c +169 -0
  39. data/ext/makiri/xml/mkr_xml_index.h +48 -0
  40. data/ext/makiri/xml/mkr_xml_mutate.c +63 -121
  41. data/ext/makiri/xml/mkr_xml_node.c +147 -15
  42. data/ext/makiri/xml/mkr_xml_node.h +71 -6
  43. data/ext/makiri/xml/mkr_xml_tree.c +185 -149
  44. data/ext/makiri/xpath/mkr_css.c +1023 -0
  45. data/ext/makiri/xpath/mkr_css.h +65 -0
  46. data/ext/makiri/xpath/mkr_xpath.c +37 -0
  47. data/ext/makiri/xpath/mkr_xpath.h +13 -0
  48. data/ext/makiri/xpath/mkr_xpath_eval_body.h +373 -90
  49. data/ext/makiri/xpath/mkr_xpath_funcs_body.h +249 -231
  50. data/ext/makiri/xpath/mkr_xpath_internal.h +89 -9
  51. data/ext/makiri/xpath/mkr_xpath_lex.c +94 -124
  52. data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +6 -3
  53. data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
  54. data/ext/makiri/xpath/mkr_xpath_parse.c +79 -90
  55. data/ext/makiri/xpath/mkr_xpath_shared.c +40 -24
  56. data/ext/makiri/xpath/mkr_xpath_value_body.h +50 -24
  57. data/lib/makiri/cdata_section.rb +1 -3
  58. data/lib/makiri/comment.rb +1 -3
  59. data/lib/makiri/document.rb +8 -0
  60. data/lib/makiri/element.rb +1 -3
  61. data/lib/makiri/processing_instruction.rb +1 -3
  62. data/lib/makiri/text.rb +1 -3
  63. data/lib/makiri/version.rb +1 -1
  64. data/lib/makiri/xml/builder.rb +263 -0
  65. data/lib/makiri/xml/node_methods.rb +47 -0
  66. data/lib/makiri.rb +1 -0
  67. data/script/check_alloc_failures.rb +266 -0
  68. data/script/check_c_safety.rb +45 -2
  69. data/script/check_c_safety_allowlist.yml +19 -0
  70. data/script/check_leaks.rb +64 -0
  71. data/script/leaks_harness.rb +64 -0
  72. data/vendor/lexbor/CMakeLists.txt +6 -0
  73. data/vendor/lexbor/README.md +12 -0
  74. data/vendor/lexbor/config.cmake +1 -1
  75. data/vendor/lexbor/source/lexbor/core/base.h +1 -1
  76. data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
  77. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
  78. data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
  79. data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
  80. data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
  81. data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
  82. data/vendor/lexbor/source/lexbor/html/base.h +1 -1
  83. data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
  84. data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
  85. data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
  86. data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
  87. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
  88. data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
  89. data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
  90. data/vendor/lexbor/source/lexbor/url/base.h +1 -1
  91. data/vendor/lexbor/source/lexbor/url/url.c +5 -2
  92. data/vendor/lexbor/source/lexbor/url/url.h +9 -0
  93. data/vendor/lexbor/version +1 -1
  94. metadata +19 -1
@@ -16,85 +16,12 @@
16
16
  * via mkr_utf8_sanitize (declared in compat.h).
17
17
  */
18
18
 
19
- /* Is `src` (len bytes) well-formed UTF-8? A dedicated validator (the Unicode
20
- * "well-formed UTF-8 byte sequences" table, RFC 3629 / WHATWG): it rejects bad
21
- * continuation bytes, overlong forms, surrogates (U+D800..U+DFFF) and code
22
- * points above U+10FFFF, and an incomplete trailing sequence. This is the same
23
- * accept set as Lexbor's decoder but validate-only - it never materialises code
24
- * points, and rips through ASCII (the common case) a machine word at a time, so
25
- * it is much cheaper than decode-and-discard. NUL bytes are valid here and are
26
- * left for the HTML tokenizer to handle per the spec.
27
- *
28
- * The contract that matters: this returns true *only* for input that
19
+ /* UTF-8 validation: the shared, allocation-free validator lives in core
20
+ * (core/mkr_utf8.c, via mkr_core.h) so this fast path and the Ruby bridge's
21
+ * strict input gate (mkr_verify_text) run ONE implementation. Its contract
22
+ * here is unchanged: it returns true *only* for input that
29
23
  * mkr_utf8_replace_invalid would leave byte-identical, so "valid" can safely
30
- * skip the transcode. */
31
- static bool
32
- mkr_utf8_valid(const lxb_char_t *src, size_t len)
33
- {
34
- const unsigned char *p = (const unsigned char *)src;
35
- const unsigned char *const end = p + len;
36
-
37
- while (p < end) {
38
- unsigned char b = *p;
39
-
40
- if (b < 0x80) {
41
- /* ASCII fast path: skip a run of ASCII bytes a word at a time
42
- * (any high bit set ends the run), then byte-wise for the tail. */
43
- while ((size_t)(end - p) >= sizeof(size_t)) {
44
- size_t w;
45
- memcpy(&w, p, sizeof(w));
46
- if (w & (size_t)0x8080808080808080ULL) {
47
- break;
48
- }
49
- p += sizeof(size_t);
50
- }
51
- while (p < end && *p < 0x80) {
52
- p++;
53
- }
54
- continue;
55
- }
56
-
57
- /* Multi-byte: decide length and validate the (length-dependent) ranges
58
- * that exclude overlong forms, surrogates and > U+10FFFF. */
59
- size_t n;
60
- if (b >= 0xC2 && b <= 0xDF) { /* U+0080..U+07FF */
61
- n = 2;
62
- if (end - p < 2 || (p[1] & 0xC0) != 0x80) return false;
63
- } else if (b == 0xE0) { /* U+0800..U+0FFF */
64
- n = 3;
65
- if (end - p < 3 || p[1] < 0xA0 || p[1] > 0xBF
66
- || (p[2] & 0xC0) != 0x80) return false;
67
- } else if (b >= 0xE1 && b <= 0xEC) { /* U+1000..U+CFFF */
68
- n = 3;
69
- if (end - p < 3 || (p[1] & 0xC0) != 0x80
70
- || (p[2] & 0xC0) != 0x80) return false;
71
- } else if (b == 0xED) { /* U+D000..U+D7FF */
72
- n = 3; /* (excludes surrogates) */
73
- if (end - p < 3 || p[1] < 0x80 || p[1] > 0x9F
74
- || (p[2] & 0xC0) != 0x80) return false;
75
- } else if (b == 0xEE || b == 0xEF) { /* U+E000..U+FFFF */
76
- n = 3;
77
- if (end - p < 3 || (p[1] & 0xC0) != 0x80
78
- || (p[2] & 0xC0) != 0x80) return false;
79
- } else if (b == 0xF0) { /* U+10000..U+3FFFF */
80
- n = 4;
81
- if (end - p < 4 || p[1] < 0x90 || p[1] > 0xBF
82
- || (p[2] & 0xC0) != 0x80 || (p[3] & 0xC0) != 0x80) return false;
83
- } else if (b >= 0xF1 && b <= 0xF3) { /* U+40000..U+FFFFF */
84
- n = 4;
85
- if (end - p < 4 || (p[1] & 0xC0) != 0x80 || (p[2] & 0xC0) != 0x80
86
- || (p[3] & 0xC0) != 0x80) return false;
87
- } else if (b == 0xF4) { /* U+100000..U+10FFFF */
88
- n = 4;
89
- if (end - p < 4 || p[1] < 0x80 || p[1] > 0x8F
90
- || (p[2] & 0xC0) != 0x80 || (p[3] & 0xC0) != 0x80) return false;
91
- } else { /* C0,C1,F5..FF,stray 80..BF */
92
- return false;
93
- }
94
- p += n;
95
- }
96
- return true;
97
- }
24
+ * skip the transcode. (lxb_char_t is unsigned char, so the call is direct.) */
98
25
 
99
26
  /* Transcode UTF-8 -> UTF-8 replacing every invalid sequence with U+FFFD
100
27
  * (WHATWG byte-stream decoding), into a freshly malloc'd, NUL-terminated
data/ext/makiri/makiri.c CHANGED
@@ -79,6 +79,48 @@ mkr_c_selftest(VALUE self)
79
79
  return Qtrue;
80
80
  }
81
81
 
82
+ /* Makiri.__alloc_inject(n) / __alloc_inject_calls / __alloc_inject? - the OOM
83
+ * sweep harness's controls (script/check_alloc_failures.rb, `rake oom`): arm
84
+ * "the nth core allocation fails once", and read how many core allocations a
85
+ * workload attempted. Compiled to a real hook only under MKR_ALLOC_INJECT
86
+ * (extconf: MAKIRI_ALLOC_INJECT=1); in a normal build __alloc_inject? is
87
+ * false and the others raise, so the harness fails loudly on the wrong build
88
+ * instead of sweeping nothing. Test hooks only. */
89
+ static VALUE
90
+ mkr_s_alloc_inject_p(VALUE self)
91
+ {
92
+ (void)self;
93
+ #ifdef MKR_ALLOC_INJECT
94
+ return Qtrue;
95
+ #else
96
+ return Qfalse;
97
+ #endif
98
+ }
99
+
100
+ static VALUE
101
+ mkr_s_alloc_inject(VALUE self, VALUE nth)
102
+ {
103
+ (void)self;
104
+ #ifdef MKR_ALLOC_INJECT
105
+ mkr_alloc_inject_arm((long long)NUM2LL(nth));
106
+ return Qnil;
107
+ #else
108
+ (void)nth;
109
+ rb_raise(rb_eNotImpError, "rebuild with MAKIRI_ALLOC_INJECT=1 (rake oom does this)");
110
+ #endif
111
+ }
112
+
113
+ static VALUE
114
+ mkr_s_alloc_inject_calls(VALUE self)
115
+ {
116
+ (void)self;
117
+ #ifdef MKR_ALLOC_INJECT
118
+ return ULL2NUM(mkr_alloc_inject_calls());
119
+ #else
120
+ rb_raise(rb_eNotImpError, "rebuild with MAKIRI_ALLOC_INJECT=1 (rake oom does this)");
121
+ #endif
122
+ }
123
+
82
124
  /* Makiri::XML.__decode(str) -> validated, UTF-8-tagged String, or raises
83
125
  * Makiri::XML::SyntaxError. Internal test hook exercising the strict input
84
126
  * decode (§2.1) on its own, until the full Makiri::XML(...) parse pipeline
@@ -223,5 +265,8 @@ Init_makiri(void)
223
265
  mkr_init_xml_node();
224
266
 
225
267
  rb_define_singleton_method(mkr_mMakiri, "__c_selftest", mkr_c_selftest, 0);
268
+ rb_define_singleton_method(mkr_mMakiri, "__alloc_inject?", mkr_s_alloc_inject_p, 0);
269
+ rb_define_singleton_method(mkr_mMakiri, "__alloc_inject", mkr_s_alloc_inject, 1);
270
+ rb_define_singleton_method(mkr_mMakiri, "__alloc_inject_calls", mkr_s_alloc_inject_calls, 0);
226
271
  rb_define_singleton_method(mkr_mXML, "__decode", mkr_xml_s_decode, 1);
227
272
  }
@@ -99,9 +99,8 @@ int mkr_xml_is_char(uint32_t c);
99
99
  * recognition (comment/CDATA/PI content, where '&'/'<' are literal). 0 / -1. */
100
100
  int mkr_xml_validate_chars(const char *src, uint32_t len);
101
101
 
102
- /* Bounds-checked one-codepoint UTF-8 decode (byte length 1-4, or 0 at end /
103
- * on malformed input) + XML 1.0 §2.3 NameStartChar / NameChar (§9.2b). */
104
- int mkr_xml_utf8_decode(const char *p, const char *end, uint32_t *cp);
102
+ /* XML 1.0 §2.3 NameStartChar / NameChar (§9.2b). One-codepoint decoding is the
103
+ * core mkr_utf8_decode1 / mkr_utf8_decode1_span (strict, bounds-checked). */
105
104
  int mkr_xml_is_name_start(uint32_t c);
106
105
  int mkr_xml_is_name_char(uint32_t c);
107
106
 
@@ -7,9 +7,16 @@
7
7
  * resulting codepoint (literal or from a numeric reference) must be an XML 1.0
8
8
  * Char; control characters / surrogates / out-of-range are rejected. The input
9
9
  * is already valid UTF-8 (§2.1).
10
+ *
11
+ * All input reads go through the bounded reader (core mkr_span_t) and the
12
+ * strict core decoder (mkr_utf8_decode1) - an out-of-bounds read is
13
+ * structurally impossible, not a per-site convention; all output goes through
14
+ * the bounded writer (mkr_spanbuf_t). The lint (raw_scan_call /
15
+ * raw_cursor_member) keeps it that way.
10
16
  */
11
17
  #include "mkr_xml.h"
12
18
  #include "mkr_xml_node.h"
19
+ #include "../core/mkr_core.h" /* mkr_span_t + mkr_utf8_decode1 */
13
20
 
14
21
  #include <string.h>
15
22
 
@@ -24,73 +31,25 @@ mkr_xml_is_char(uint32_t c)
24
31
  || (c >= 0x10000u && c <= 0x10FFFFu);
25
32
  }
26
33
 
27
- /* Decode one codepoint from UTF-8. STRICT (self-contained, not trusting the
28
- * caller): rejects truncation, bad continuation bytes, overlong encodings,
29
- * surrogates and out-of-range values. Returns the byte length (1-4) or 0 on any
30
- * violation - fail closed, even if some future caller feeds unvalidated bytes. */
31
- static int
32
- is_cont(const char *p, const char *end)
33
- {
34
- return p < end && ((unsigned char)*p & 0xC0u) == 0x80u;
35
- }
36
-
37
- /* forward decl: utf8_decode is defined below but mkr_xml_validate_chars uses it. */
38
- static int utf8_decode(const char *p, const char *end, uint32_t *cp);
39
-
40
34
  /* Validate that [src, src+len) is entirely XML 1.0 Char, with NO entity/reference
41
35
  * recognition (for comment/CDATA/PI content, where '&' and '<' are literal). 0 if
42
- * all valid, -1 on the first malformed UTF-8 or non-Char (caller raises SYNTAX). */
36
+ * all valid, -1 on the first malformed UTF-8 or non-Char (caller raises SYNTAX).
37
+ * The strict decode (truncation / bad continuation / overlong / surrogate /
38
+ * out-of-range -> 0) is the core mkr_utf8_decode1 - self-contained, not trusting
39
+ * the caller, even if some future caller feeds unvalidated bytes. */
43
40
  int
44
41
  mkr_xml_validate_chars(const char *src, uint32_t len)
45
42
  {
46
- const char *p = src, *end = src + len;
47
- while (p < end) {
43
+ mkr_span_t s = mkr_span(src, len);
44
+ while (mkr_span_left(&s) > 0) {
48
45
  uint32_t cp;
49
- int bl = utf8_decode(p, end, &cp);
46
+ int bl = mkr_utf8_decode1_span(&s, &cp);
50
47
  if (bl == 0 || !mkr_xml_is_char(cp)) return -1;
51
- p += bl;
48
+ mkr_span_skip(&s, (size_t)bl);
52
49
  }
53
50
  return 0;
54
51
  }
55
52
 
56
- static int
57
- utf8_decode(const char *p, const char *end, uint32_t *cp)
58
- {
59
- unsigned char c = (unsigned char)p[0];
60
- if (c < 0x80u) { *cp = c; return 1; }
61
- if ((c & 0xE0u) == 0xC0u) {
62
- if (!is_cont(p + 1, end)) return 0;
63
- uint32_t v = ((uint32_t)(c & 0x1Fu) << 6) | ((unsigned char)p[1] & 0x3Fu);
64
- if (v < 0x80u) return 0; /* overlong */
65
- *cp = v; return 2;
66
- }
67
- if ((c & 0xF0u) == 0xE0u) {
68
- if (!is_cont(p + 1, end) || !is_cont(p + 2, end)) return 0;
69
- uint32_t v = ((uint32_t)(c & 0x0Fu) << 12) | (((unsigned char)p[1] & 0x3Fu) << 6)
70
- | ((unsigned char)p[2] & 0x3Fu);
71
- if (v < 0x800u) return 0; /* overlong */
72
- if (v >= 0xD800u && v <= 0xDFFFu) return 0; /* surrogate */
73
- *cp = v; return 3;
74
- }
75
- if ((c & 0xF8u) == 0xF0u) {
76
- if (!is_cont(p + 1, end) || !is_cont(p + 2, end) || !is_cont(p + 3, end)) return 0;
77
- uint32_t v = ((uint32_t)(c & 0x07u) << 18) | (((unsigned char)p[1] & 0x3Fu) << 12)
78
- | (((unsigned char)p[2] & 0x3Fu) << 6) | ((unsigned char)p[3] & 0x3Fu);
79
- if (v < 0x10000u || v > 0x10FFFFu) return 0; /* overlong / out of range */
80
- *cp = v; return 4;
81
- }
82
- return 0;
83
- }
84
-
85
- /* Public, bounds-checked one-codepoint decode for the tokenizer's name scanning
86
- * (returns 0 at end-of-input as well as on any malformed byte). */
87
- int
88
- mkr_xml_utf8_decode(const char *p, const char *end, uint32_t *cp)
89
- {
90
- if (p >= end) return 0;
91
- return utf8_decode(p, end, cp);
92
- }
93
-
94
53
  /* XML 1.0 §2.3 NameStartChar / NameChar (the full Unicode sets, not just ASCII).
95
54
  * Element/attribute QNames and PI targets are validated against these so an
96
55
  * ill-formed name never reaches the DOM (§9.2b). */
@@ -131,9 +90,12 @@ utf8_encode(uint32_t cp, char *out)
131
90
 
132
91
  /* Expand references in [src, src+len) and validate XML Char. Output is never
133
92
  * longer than the input (every &...; reference is >= 4 chars and yields <= 4
134
- * bytes), so a single arena buffer of `len` bytes suffices. Returns the arena
135
- * slice and sets *out_len; returns NULL on an undefined entity / bad reference /
136
- * non-XML-Char (sets *status = MKR_XML_ERR_SYNTAX) or arena OOM/LIMIT. */
93
+ * bytes), so a buffer of `len` bytes suffices - and the bounded arena writer
94
+ * (mkr_xml_arena_spanbuf + core mkr_spanbuf) enforces that bound rather than
95
+ * trusting it (a write past it fails closed instead of overrunning the arena).
96
+ * Returns the arena slice and sets *out_len;
97
+ * returns NULL on an undefined entity / bad reference / non-XML-Char (sets
98
+ * *status = MKR_XML_ERR_SYNTAX) or arena OOM/LIMIT. */
137
99
  const char *
138
100
  mkr_xml_expand(mkr_xml_doc_t *doc, const char *src, uint32_t len,
139
101
  mkr_xml_expand_mode_t mode, uint32_t *out_len, mkr_xml_status_t *status)
@@ -148,46 +110,45 @@ mkr_xml_expand(mkr_xml_doc_t *doc, const char *src, uint32_t len,
148
110
  if (len == 0) { *out_len = 0; return ""; }
149
111
  if (doc == NULL || src == NULL) { *status = MKR_XML_ERR_INTERNAL; return NULL; }
150
112
 
151
- char *buf = mkr_xml_arena_scratch_bytes(doc, len);
152
- if (buf == NULL) { *status = doc->oom; return NULL; } /* doc non-NULL here (guarded above) */
153
-
154
- size_t o = 0;
155
- const char *p = src;
156
- const char *end = src + len;
113
+ /* All output goes through the bounded writer, so no code below can overrun
114
+ * the buffer - a write that would exceed `len` is refused and latches ok=0. */
115
+ mkr_spanbuf_t b = mkr_xml_arena_spanbuf(doc, len);
116
+ if (!b.ok) { *status = doc->oom; return NULL; } /* backing alloc failed */
117
+ mkr_span_t s = mkr_span(src, len);
157
118
 
158
- while (p < end) {
159
- if (*p != '&') {
119
+ while (mkr_span_left(&s) > 0) {
120
+ if (mkr_span_peek(&s) != '&') {
160
121
  uint32_t cp;
161
- int bl = utf8_decode(p, end, &cp);
122
+ int bl = mkr_utf8_decode1_span(&s, &cp);
162
123
  if (bl == 0 || !mkr_xml_is_char(cp)) { *status = MKR_XML_ERR_SYNTAX; return NULL; }
163
124
  /* §3.3.3: in an attribute value a *literal* TAB/LF/CR normalizes to a
164
125
  * space (reference-derived whitespace is preserved - see below). */
165
126
  if (mode == MKR_XML_EXPAND_ATTR
166
127
  && (cp == 0x9u || cp == 0xAu || cp == 0xDu)) {
167
- buf[o++] = ' ';
168
- p += bl;
128
+ mkr_spanbuf_putc(&b, ' ');
129
+ mkr_span_skip(&s, (size_t)bl);
169
130
  continue;
170
131
  }
171
- memcpy(buf + o, p, (size_t)bl);
172
- o += (size_t)bl;
173
- p += bl;
132
+ mkr_spanbuf_write(&b, mkr_span_mark(&s), (size_t)bl);
133
+ mkr_span_skip(&s, (size_t)bl);
174
134
  continue;
175
135
  }
176
136
 
177
137
  /* a reference: '&' ... ';' */
178
- p++; /* past '&' */
179
- if (p < end && *p == '#') { /* numeric character reference */
180
- p++;
138
+ mkr_span_skip(&s, 1); /* past '&' */
139
+ if (mkr_span_peek(&s) == '#') { /* numeric character reference */
140
+ mkr_span_skip(&s, 1);
181
141
  int hex = 0;
182
142
  /* §4.1: the hex marker is a lowercase 'x' only - "&#X58;" is not-wf
183
143
  * (an uppercase 'X' is not a decimal digit either, so it is rejected
184
144
  * as "no digits" below). */
185
- if (p < end && *p == 'x') { hex = 1; p++; }
186
- const char *digits = p;
145
+ if (mkr_span_peek(&s) == 'x') { hex = 1; mkr_span_skip(&s, 1); }
187
146
  uint32_t base = hex ? 16u : 10u;
188
147
  uint32_t cp = 0;
189
- while (p < end && *p != ';') {
190
- unsigned char d = (unsigned char)*p;
148
+ int ndigits = 0;
149
+ for (;;) {
150
+ int d = mkr_span_peek(&s);
151
+ if (d < 0 || d == ';') break;
191
152
  uint32_t dig;
192
153
  if (d >= '0' && d <= '9') dig = (uint32_t)(d - '0');
193
154
  else if (hex && d >= 'a' && d <= 'f') dig = (uint32_t)(d - 'a' + 10);
@@ -197,29 +158,38 @@ mkr_xml_expand(mkr_xml_doc_t *doc, const char *src, uint32_t len,
197
158
  * uint32_t into the valid range and be falsely accepted. */
198
159
  if (cp > (0x10FFFFu - dig) / base) { *status = MKR_XML_ERR_SYNTAX; return NULL; }
199
160
  cp = cp * base + dig;
200
- p++;
161
+ ndigits++;
162
+ mkr_span_skip(&s, 1);
201
163
  }
202
- if (p >= end || p == digits) { *status = MKR_XML_ERR_SYNTAX; return NULL; } /* no ';' / no digits */
203
- p++; /* past ';' */
164
+ if (mkr_span_peek(&s) != ';' || ndigits == 0) { *status = MKR_XML_ERR_SYNTAX; return NULL; } /* no ';' / no digits */
165
+ mkr_span_skip(&s, 1); /* past ';' */
204
166
  if (!mkr_xml_is_char(cp)) { *status = MKR_XML_ERR_SYNTAX; return NULL; }
205
- o += (size_t)utf8_encode(cp, buf + o);
167
+ /* encode into a safe 4-byte local, then hand the exact length to the
168
+ * bounded writer (the encode itself can never overrun `enc`). */
169
+ char enc[4];
170
+ mkr_spanbuf_write(&b, enc, (size_t)utf8_encode(cp, enc));
206
171
  } else { /* named entity */
207
- const char *ns = p;
208
- while (p < end && *p != ';') p++;
209
- if (p >= end) { *status = MKR_XML_ERR_SYNTAX; return NULL; } /* unterminated */
210
- size_t nlen = (size_t)(p - ns);
211
- p++; /* past ';' */
172
+ size_t nlen;
173
+ if (!mkr_span_find(&s, ';', &nlen)) { *status = MKR_XML_ERR_SYNTAX; return NULL; } /* unterminated */
174
+ const char *ns = mkr_span_mark(&s);
175
+ mkr_span_skip(&s, nlen + 1); /* name + ';' */
212
176
  char ch;
213
- if (nlen == 2 && memcmp(ns, "lt", 2) == 0) ch = '<';
214
- else if (nlen == 2 && memcmp(ns, "gt", 2) == 0) ch = '>';
215
- else if (nlen == 3 && memcmp(ns, "amp", 3) == 0) ch = '&';
216
- else if (nlen == 4 && memcmp(ns, "apos", 4) == 0) ch = '\'';
217
- else if (nlen == 4 && memcmp(ns, "quot", 4) == 0) ch = '"';
177
+ if (mkr_bytes_eq(ns, nlen, "lt", 2)) ch = '<';
178
+ else if (mkr_bytes_eq(ns, nlen, "gt", 2)) ch = '>';
179
+ else if (mkr_bytes_eq(ns, nlen, "amp", 3)) ch = '&';
180
+ else if (mkr_bytes_eq(ns, nlen, "apos", 4)) ch = '\'';
181
+ else if (mkr_bytes_eq(ns, nlen, "quot", 4)) ch = '"';
218
182
  else { *status = MKR_XML_ERR_SYNTAX; return NULL; } /* undefined entity */
219
- buf[o++] = ch;
183
+ mkr_spanbuf_putc(&b, ch);
220
184
  }
221
185
  }
222
186
 
223
- *out_len = (uint32_t)o;
224
- return buf;
187
+ /* By construction the buffer was never overrun; finish returns NULL if a
188
+ * write was refused (the "output <= input" invariant broke - our bug). The
189
+ * backing alloc was already checked at init, so a NULL here means a refused
190
+ * write: fail closed rather than return a truncated expansion. */
191
+ const char *out = mkr_spanbuf_finish(&b);
192
+ if (out == NULL) { *status = MKR_XML_ERR_INTERNAL; return NULL; }
193
+ *out_len = (uint32_t)b.pos;
194
+ return out;
225
195
  }
@@ -0,0 +1,169 @@
1
+ /* Element-name index for the XML arena (see mkr_xml_index.h). Open-addressing
2
+ * hash keyed by (local name + namespace URI); each entry holds the
3
+ * document-ordered elements with that name. Two tree passes: count elements to
4
+ * size the table, then fill in document order. Heap-allocated (not the arena),
5
+ * freed on invalidate. OOM fails closed (NULL -> caller walks the tree). */
6
+
7
+ #include "mkr_xml_index.h"
8
+ #include "../core/mkr_core.h"
9
+
10
+ #include <stdlib.h>
11
+ #include <string.h>
12
+
13
+ typedef struct {
14
+ const char *local; /* borrowed from the first element (arena-stable) */
15
+ const char *ns_uri; /* may be NULL (no namespace) */
16
+ uint32_t local_len;
17
+ uint32_t ns_uri_len;
18
+ mkr_xml_node_t **nodes; /* document order */
19
+ size_t count, cap;
20
+ } mkr_xml_index_entry_t;
21
+
22
+ struct mkr_xml_name_index {
23
+ mkr_xml_index_entry_t *buckets;
24
+ size_t cap; /* power of two; 0 only for an empty document */
25
+ };
26
+
27
/* 64-bit FNV-1a of the key: the local-name bytes, one 0xff separator byte,
 * then the namespace-URI bytes. The separator keeps ("ab", "") and ("a", "b")
 * from presenting the same byte stream to the hash. A part of length 0 is
 * never dereferenced, so ns_uri may be NULL when ns_uri_len is 0. */
static uint64_t
key_hash(const char *local, size_t local_len, const char *ns_uri, size_t ns_uri_len)
{
    const uint64_t fnv_offset_basis = 1469598103934665603ULL;
    const uint64_t fnv_prime = 1099511628211ULL;
    uint64_t acc = fnv_offset_basis;
    const unsigned char *bytes;
    size_t left;

    bytes = (const unsigned char *)local;
    for (left = local_len; left > 0; left--) {
        acc = (acc ^ *bytes++) * fnv_prime;
    }

    acc = (acc ^ 0xff) * fnv_prime;

    bytes = (const unsigned char *)ns_uri;
    for (left = ns_uri_len; left > 0; left--) {
        acc = (acc ^ *bytes++) * fnv_prime;
    }

    return acc;
}
37
+
38
+ static int
39
+ key_eq(const mkr_xml_index_entry_t *e, const char *local, size_t local_len,
40
+ const char *ns_uri, size_t ns_uri_len)
41
+ {
42
+ if (e->local_len != local_len || e->ns_uri_len != ns_uri_len) return 0;
43
+ if (local_len && memcmp(e->local, local, local_len) != 0) return 0;
44
+ if (ns_uri_len && memcmp(e->ns_uri, ns_uri, ns_uri_len) != 0) return 0;
45
+ return 1;
46
+ }
47
+
48
+
49
+ /* Find the entry for the key, or the empty slot to create it (open addressing,
50
+ * power-of-two mask). The table is sized so it never fills (load < 0.5). */
51
+ static mkr_xml_index_entry_t *
52
+ slot_for(mkr_xml_name_index_t *idx, const char *local, size_t local_len,
53
+ const char *ns_uri, size_t ns_uri_len)
54
+ {
55
+ size_t mask = idx->cap - 1;
56
+ size_t i = (size_t)key_hash(local, local_len, ns_uri, ns_uri_len) & mask;
57
+ for (;;) {
58
+ mkr_xml_index_entry_t *e = &idx->buckets[i];
59
+ if (e->local == NULL && e->count == 0) return e; /* empty */
60
+ if (key_eq(e, local, local_len, ns_uri, ns_uri_len)) return e;
61
+ i = (i + 1) & mask;
62
+ }
63
+ }
64
+
65
+ /* Append +node+ to its key's bucket, creating the entry if new. 0 / -1 (OOM). */
66
+ static int
67
+ index_push(mkr_xml_name_index_t *idx, mkr_xml_node_t *node)
68
+ {
69
+ mkr_xml_index_entry_t *e =
70
+ slot_for(idx, node->local, node->local_len, node->ns_uri, node->ns_uri_len);
71
+ if (e->local == NULL && e->count == 0) { /* fresh entry: borrow the key */
72
+ e->local = node->local; e->local_len = node->local_len;
73
+ e->ns_uri = node->ns_uri; e->ns_uri_len = node->ns_uri_len;
74
+ }
75
+ if (mkr_grow_reserve((void **)&e->nodes, &e->cap, e->count + 1, sizeof(*e->nodes)) != MKR_OK) {
76
+ return -1; /* grows geometrically + overflow-safely internally */
77
+ }
78
+ e->nodes[e->count++] = node;
79
+ return 0;
80
+ }
81
+
82
+ /* Count elements (document order is irrelevant here) to size the table. */
83
+ static size_t
84
+ count_elements(mkr_xml_node_t *root)
85
+ {
86
+ size_t n = 0;
87
+ for (mkr_xml_node_t *cur = root; cur != NULL; cur = mkr_xml_preorder_next(root, cur)) {
88
+ if (cur->type == MKR_XML_NODE_TYPE_ELEMENT) n++;
89
+ }
90
+ return n;
91
+ }
92
+
93
+ static mkr_xml_name_index_t *
94
+ build(mkr_xml_doc_t *doc)
95
+ {
96
+ mkr_xml_node_t *root = doc->doc_node;
97
+ if (root == NULL) return NULL;
98
+
99
+ mkr_xml_name_index_t *idx = (mkr_xml_name_index_t *)mkr_callocarray(1, sizeof(*idx));
100
+ if (idx == NULL) return NULL;
101
+
102
+ size_t n = count_elements(root);
103
+ /* Size for load factor < 0.5 (2n+1 slots). The overflow-checked sizer fails
104
+ * closed - unlike the old next_pow2, which saturated to a too-small table on
105
+ * overflow, where open addressing could never find a free slot. */
106
+ size_t want;
107
+ if (!mkr_size_mul(n, 2, &want) || !mkr_size_add(want, 1, &want)
108
+ || !mkr_pow2_ceil(want, &idx->cap)) { free(idx); return NULL; }
109
+ if (idx->cap < 8) idx->cap = 8; /* small floor */
110
+ idx->buckets = (mkr_xml_index_entry_t *)mkr_callocarray(idx->cap, sizeof(*idx->buckets));
111
+ if (idx->buckets == NULL) { free(idx); return NULL; }
112
+
113
+ /* Fill pass: pre-order (document order), elements only. */
114
+ for (mkr_xml_node_t *cur = root; cur != NULL; cur = mkr_xml_preorder_next(root, cur)) {
115
+ if (cur->type == MKR_XML_NODE_TYPE_ELEMENT) {
116
+ if (index_push(idx, cur) != 0) { mkr_xml_name_index_free(idx); return NULL; }
117
+ }
118
+ }
119
+ return idx;
120
+ }
121
+
122
+ mkr_xml_name_index_t *
123
+ mkr_xml_name_index_get(mkr_xml_doc_t *doc)
124
+ {
125
+ if (doc == NULL) return NULL;
126
+ if (doc->name_index != NULL) return (mkr_xml_name_index_t *)doc->name_index;
127
+ mkr_xml_name_index_t *idx = build(doc);
128
+ doc->name_index = idx; /* NULL on OOM: caller walks, retries next time */
129
+ return idx;
130
+ }
131
+
132
+ void
133
+ mkr_xml_name_index_free(mkr_xml_name_index_t *idx)
134
+ {
135
+ if (idx == NULL) return;
136
+ if (idx->buckets != NULL) {
137
+ for (size_t i = 0; i < idx->cap; i++) free(idx->buckets[i].nodes);
138
+ free(idx->buckets);
139
+ }
140
+ free(idx);
141
+ }
142
+
143
+ void
144
+ mkr_xml_name_index_invalidate(mkr_xml_doc_t *doc)
145
+ {
146
+ if (doc == NULL || doc->name_index == NULL) return;
147
+ mkr_xml_name_index_free((mkr_xml_name_index_t *)doc->name_index);
148
+ doc->name_index = NULL;
149
+ }
150
+
151
+ mkr_xml_node_t *const *
152
+ mkr_xml_name_index_lookup(const mkr_xml_name_index_t *idx,
153
+ const char *local, size_t local_len,
154
+ const char *ns_uri, size_t ns_uri_len, size_t *out_count)
155
+ {
156
+ if (out_count != NULL) *out_count = 0;
157
+ if (idx == NULL || idx->cap == 0 || local == NULL) return NULL;
158
+ size_t mask = idx->cap - 1;
159
+ size_t i = (size_t)key_hash(local, local_len, ns_uri, ns_uri_len) & mask;
160
+ for (;;) {
161
+ const mkr_xml_index_entry_t *e = &idx->buckets[i];
162
+ if (e->local == NULL && e->count == 0) return NULL; /* miss */
163
+ if (key_eq(e, local, local_len, ns_uri, ns_uri_len)) {
164
+ if (out_count != NULL) *out_count = e->count;
165
+ return e->nodes;
166
+ }
167
+ i = (i + 1) & mask;
168
+ }
169
+ }
@@ -0,0 +1,48 @@
1
+ #ifndef MKR_XML_INDEX_H
2
+ #define MKR_XML_INDEX_H
3
+
4
+ #include "mkr_xml_node.h"
5
+ #include <stddef.h>
6
+
7
+ #ifdef __cplusplus
8
+ extern "C" {
9
+ #endif
10
+
11
+ /*
12
+ * Element-name index for the XML arena: maps each (local name + namespace URI)
13
+ * to the document-ordered list of elements bearing it, so a document-rooted
14
+ * descendant name test (//entry, css("entry")) is answered from the bucket
15
+ * instead of walking the whole tree. The HTML side has the analogous tag-id
16
+ * index; XML element names are arbitrary strings, so this is keyed by the name
17
+ * bytes (borrowed from the arena, stable until the next mutation).
18
+ *
19
+ * Lazily built and cached on the document; dropped by
20
+ * mkr_xml_name_index_invalidate from the single XML mutation hook (the same
21
+ * discipline as the HTML attr/text indices). Build OOM fails closed: the getter
22
+ * returns NULL and the caller walks the tree.
23
+ */
24
+ typedef struct mkr_xml_name_index mkr_xml_name_index_t;
25
+
26
+ /* The document's element-name index, built and cached on first call. Returns
27
+ * NULL on OOM (caller falls back to a tree walk). */
28
+ mkr_xml_name_index_t *mkr_xml_name_index_get(mkr_xml_doc_t *doc);
29
+
30
+ /* Drop the cached index after a structural mutation (no-op when unbuilt). */
31
+ void mkr_xml_name_index_invalidate(mkr_xml_doc_t *doc);
32
+
33
+ /* Free an index directly (used by mkr_xml_doc_destroy via the invalidate hook). */
34
+ void mkr_xml_name_index_free(mkr_xml_name_index_t *idx);
35
+
36
+ /* The document-ordered elements with local name +local+ and namespace URI
37
+ * +ns_uri+ (ns_uri == NULL / ns_uri_len == 0 means the no-namespace bucket).
38
+ * Returns the borrowed bucket and sets *out_count, or NULL with *out_count 0. */
39
+ mkr_xml_node_t *const *mkr_xml_name_index_lookup(const mkr_xml_name_index_t *idx,
40
+ const char *local, size_t local_len,
41
+ const char *ns_uri, size_t ns_uri_len,
42
+ size_t *out_count);
43
+
44
+ #ifdef __cplusplus
45
+ }
46
+ #endif
47
+
48
+ #endif /* MKR_XML_INDEX_H */