makiri 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/conformance.yml +22 -0
  3. data/.github/workflows/libfuzzer.yml +83 -0
  4. data/.github/workflows/security.yml +88 -3
  5. data/.github/workflows/valgrind.yml +135 -0
  6. data/CHANGELOG.md +60 -2
  7. data/README.md +81 -77
  8. data/Rakefile +194 -3
  9. data/ext/makiri/bridge/ruby_string.c +119 -66
  10. data/ext/makiri/core/mkr_alloc.c +40 -3
  11. data/ext/makiri/core/mkr_alloc.h +27 -4
  12. data/ext/makiri/core/mkr_buf.c +13 -3
  13. data/ext/makiri/core/mkr_buf.h +80 -5
  14. data/ext/makiri/core/mkr_core.c +143 -0
  15. data/ext/makiri/core/mkr_core.h +10 -1
  16. data/ext/makiri/core/mkr_span.h +186 -0
  17. data/ext/makiri/core/mkr_utf8.c +101 -0
  18. data/ext/makiri/core/mkr_utf8.h +88 -0
  19. data/ext/makiri/extconf.rb +104 -9
  20. data/ext/makiri/fuzz/Makefile +95 -0
  21. data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
  22. data/ext/makiri/fuzz/xml_fuzz.c +24 -0
  23. data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
  24. data/ext/makiri/glue/glue.h +8 -0
  25. data/ext/makiri/glue/ruby_doc.c +20 -24
  26. data/ext/makiri/glue/ruby_html_css.c +58 -12
  27. data/ext/makiri/glue/ruby_html_mutate.c +11 -6
  28. data/ext/makiri/glue/ruby_html_node.c +3 -32
  29. data/ext/makiri/glue/ruby_node.c +39 -0
  30. data/ext/makiri/glue/ruby_xml.c +198 -16
  31. data/ext/makiri/glue/ruby_xml_node.c +46 -59
  32. data/ext/makiri/glue/ruby_xpath.c +4 -4
  33. data/ext/makiri/lexbor_compat/source_loc.c +14 -16
  34. data/ext/makiri/lexbor_compat/utf8_input.c +5 -78
  35. data/ext/makiri/makiri.c +45 -0
  36. data/ext/makiri/xml/mkr_xml.h +2 -3
  37. data/ext/makiri/xml/mkr_xml_chars.c +67 -97
  38. data/ext/makiri/xml/mkr_xml_index.c +169 -0
  39. data/ext/makiri/xml/mkr_xml_index.h +48 -0
  40. data/ext/makiri/xml/mkr_xml_mutate.c +63 -121
  41. data/ext/makiri/xml/mkr_xml_node.c +147 -15
  42. data/ext/makiri/xml/mkr_xml_node.h +71 -6
  43. data/ext/makiri/xml/mkr_xml_tree.c +185 -149
  44. data/ext/makiri/xpath/mkr_css.c +1023 -0
  45. data/ext/makiri/xpath/mkr_css.h +65 -0
  46. data/ext/makiri/xpath/mkr_xpath.c +37 -0
  47. data/ext/makiri/xpath/mkr_xpath.h +13 -0
  48. data/ext/makiri/xpath/mkr_xpath_eval_body.h +373 -90
  49. data/ext/makiri/xpath/mkr_xpath_funcs_body.h +249 -231
  50. data/ext/makiri/xpath/mkr_xpath_internal.h +89 -9
  51. data/ext/makiri/xpath/mkr_xpath_lex.c +94 -124
  52. data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +6 -3
  53. data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
  54. data/ext/makiri/xpath/mkr_xpath_parse.c +79 -90
  55. data/ext/makiri/xpath/mkr_xpath_shared.c +40 -24
  56. data/ext/makiri/xpath/mkr_xpath_value_body.h +50 -24
  57. data/lib/makiri/cdata_section.rb +1 -3
  58. data/lib/makiri/comment.rb +1 -3
  59. data/lib/makiri/document.rb +8 -0
  60. data/lib/makiri/element.rb +1 -3
  61. data/lib/makiri/processing_instruction.rb +1 -3
  62. data/lib/makiri/text.rb +1 -3
  63. data/lib/makiri/version.rb +1 -1
  64. data/lib/makiri/xml/builder.rb +263 -0
  65. data/lib/makiri/xml/node_methods.rb +47 -0
  66. data/lib/makiri.rb +1 -0
  67. data/script/check_alloc_failures.rb +266 -0
  68. data/script/check_c_safety.rb +45 -2
  69. data/script/check_c_safety_allowlist.yml +19 -0
  70. data/script/check_leaks.rb +64 -0
  71. data/script/leaks_harness.rb +64 -0
  72. data/vendor/lexbor/CMakeLists.txt +6 -0
  73. data/vendor/lexbor/README.md +12 -0
  74. data/vendor/lexbor/config.cmake +1 -1
  75. data/vendor/lexbor/source/lexbor/core/base.h +1 -1
  76. data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
  77. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
  78. data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
  79. data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
  80. data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
  81. data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
  82. data/vendor/lexbor/source/lexbor/html/base.h +1 -1
  83. data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
  84. data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
  85. data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
  86. data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
  87. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
  88. data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
  89. data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
  90. data/vendor/lexbor/source/lexbor/url/base.h +1 -1
  91. data/vendor/lexbor/source/lexbor/url/url.c +5 -2
  92. data/vendor/lexbor/source/lexbor/url/url.h +9 -0
  93. data/vendor/lexbor/version +1 -1
  94. metadata +19 -1
@@ -48,19 +48,31 @@ mkr_ruby_str_from_borrowed(mkr_borrowed_text_t text)
48
48
  void
49
49
  mkr_verify_text(VALUE str, const char *what)
50
50
  {
51
+ /* ALLOCATION-FREE by design: this gate runs between a caller taking a
52
+ * borrowed RSTRING pointer and using it, so it must not be a GC point. The
53
+ * former implementation built a throwaway Ruby String (rb_enc_str_new) to
54
+ * ask for its coderange - a Ruby allocation inside every borrow, which both
55
+ * passed the borrowed ptr into an allocating call and opened a GC window
56
+ * under every OTHER borrow already held at multi-borrow call sites. Bytes
57
+ * are validated as UTF-8 regardless of the String's declared encoding,
58
+ * exactly as before. */
51
59
  long len = RSTRING_LEN(str);
52
60
  const char *ptr = RSTRING_PTR(str);
53
61
 
54
- if (len > 0 && memchr(ptr, '\0', (size_t)len) != NULL) {
62
+ mkr_span_t sv = mkr_span(ptr, (size_t)len);
63
+ size_t nul_at;
64
+ if (mkr_span_find(&sv, '\0', &nul_at)) {
55
65
  rb_raise(mkr_eError, "%s must not contain a NUL byte", what);
56
66
  }
57
67
 
58
- /* Validate the bytes as UTF-8 regardless of the String's declared encoding. */
59
- VALUE u = rb_enc_str_new(ptr, len, rb_utf8_encoding());
60
- if (rb_enc_str_coderange(u) == ENC_CODERANGE_BROKEN) {
68
+ /* Cached-coderange fast path (reads flags, never scans, never allocates);
69
+ * NUL is valid UTF-8, so the NUL-byte check above stays either way. */
70
+ if (mkr_ruby_str_known_valid_utf8(str)) {
71
+ return;
72
+ }
73
+ if (!mkr_utf8_valid((const unsigned char *)ptr, (size_t)len)) {
61
74
  rb_raise(mkr_eError, "%s must be valid UTF-8", what);
62
75
  }
63
- RB_GC_GUARD(str);
64
76
  }
65
77
 
66
78
  mkr_ruby_borrowed_text_t
@@ -96,7 +108,6 @@ mkr_ruby_copy_bytes(VALUE in, mkr_owned_bytes_t *out)
96
108
  size_t alloc_len = (v.len > 0) ? v.len : 1;
97
109
  char *buf = mkr_reallocarray(NULL, alloc_len, 1);
98
110
  if (buf == NULL) {
99
- RB_GC_GUARD(v.value);
100
111
  return -1;
101
112
  }
102
113
  if (v.len > 0) {
@@ -145,63 +156,89 @@ mkr_xml_strict_transcode_thunk(VALUE str)
145
156
  /* --- XML 1.0 Appendix F: byte-encoding autodetection (BOM, then declaration) ---
146
157
  *
147
158
  * The leading byte-order mark, or NULL; *bom_len gets its length. UTF-32 BOMs are
148
- * checked before the UTF-16 LE BOM they share a prefix with. */
159
+ * checked before the UTF-16 LE BOM they share a prefix with.
160
+ *
161
+ * *stride / *ascii_off get the interleave geometry of the ASCII column the decl
162
+ * scanner later extracts (default 1/0 for a single-byte stream). It is resolved
163
+ * HERE, at the match, rather than re-derived downstream, because that derivation
164
+ * needs rb_enc_find (it can autoload an encoding = a GC point) and the decl
165
+ * scanner reads a borrowed RSTRING view that must not be held across one - so
166
+ * the scanner is kept allocation-free until its reads are done. Each span read
167
+ * of p still finishes before the rb_enc_find in the return. */
149
168
  static rb_encoding *
150
- mkr_xml_bom_encoding(const unsigned char *p, long len, long *bom_len)
169
+ mkr_xml_bom_encoding(const unsigned char *p, long len, long *bom_len, long *stride, long *ascii_off)
151
170
  {
171
+ mkr_span_t s = mkr_span((const char *)p, (size_t)len);
152
172
  *bom_len = 0;
153
- if (len >= 4 && p[0] == 0x00 && p[1] == 0x00 && p[2] == 0xFE && p[3] == 0xFF) {
154
- *bom_len = 4; return rb_enc_find("UTF-32BE");
155
- }
156
- if (len >= 4 && p[0] == 0xFF && p[1] == 0xFE && p[2] == 0x00 && p[3] == 0x00) {
157
- *bom_len = 4; return rb_enc_find("UTF-32LE");
158
- }
159
- if (len >= 2 && p[0] == 0xFE && p[1] == 0xFF) { *bom_len = 2; return rb_enc_find("UTF-16BE"); }
160
- if (len >= 2 && p[0] == 0xFF && p[1] == 0xFE) { *bom_len = 2; return rb_enc_find("UTF-16LE"); }
161
- if (len >= 3 && p[0] == 0xEF && p[1] == 0xBB && p[2] == 0xBF) { *bom_len = 3; return rb_utf8_encoding(); }
173
+ *stride = 1;
174
+ *ascii_off = 0;
175
+ if (mkr_span_starts(&s, "\x00\x00\xFE\xFF", 4)) { *bom_len = 4; *stride = 4; *ascii_off = 3; return rb_enc_find("UTF-32BE"); }
176
+ if (mkr_span_starts(&s, "\xFF\xFE\x00\x00", 4)) { *bom_len = 4; *stride = 4; *ascii_off = 0; return rb_enc_find("UTF-32LE"); }
177
+ if (mkr_span_starts(&s, "\xFE\xFF", 2)) { *bom_len = 2; *stride = 2; *ascii_off = 1; return rb_enc_find("UTF-16BE"); }
178
+ if (mkr_span_starts(&s, "\xFF\xFE", 2)) { *bom_len = 2; *stride = 2; *ascii_off = 0; return rb_enc_find("UTF-16LE"); }
179
+ if (mkr_span_starts(&s, "\xEF\xBB\xBF", 3)) { *bom_len = 3; return rb_utf8_encoding(); }
162
180
  return NULL;
163
181
  }
164
182
 
165
183
  /* The encoding named in the '<?xml ... encoding="NAME" ?>' declaration, or NULL.
166
184
  * The declaration is ASCII; for a UTF-16/32-detected document its bytes are
167
- * stride-interleaved, so the ASCII column is extracted (per the BOM) before the
168
- * scan, letting a BOM-vs-declaration conflict be caught even in UTF-16. */
169
- static rb_encoding *
170
- mkr_xml_decl_encoding(const unsigned char *p, long len, rb_encoding *bom)
185
+ * stride-interleaved, so the ASCII column is extracted (stride/off resolved by
186
+ * the BOM matcher) before the scan, letting a BOM-vs-declaration conflict be
187
+ * caught even in UTF-16.
188
+ *
189
+ * p is a borrowed RSTRING view, so this stays allocation-free until every read
190
+ * of p is done: the stride/off geometry is passed in (rather than derived here
191
+ * via rb_enc_find, which can autoload = a GC point), and the only rb_enc_find -
192
+ * the final name lookup - runs after the bytes have been copied into head[]. */
193
+ static int
194
+ mkr_decl_ws(int c)
171
195
  {
172
- long stride = 1, off = 0;
173
- if (bom == rb_enc_find("UTF-16LE")) { stride = 2; off = 0; }
174
- else if (bom == rb_enc_find("UTF-16BE")) { stride = 2; off = 1; }
175
- else if (bom == rb_enc_find("UTF-32LE")) { stride = 4; off = 0; }
176
- else if (bom == rb_enc_find("UTF-32BE")) { stride = 4; off = 3; }
196
+ return c == ' ' || c == '\t' || c == '\r' || c == '\n';
197
+ }
177
198
 
199
+ static rb_encoding *
200
+ mkr_xml_decl_encoding(const unsigned char *p, long len, long stride, long off)
201
+ {
202
+ /* Extract the ASCII column (per the BOM stride) through bounded reads into
203
+ * a bounded writer - neither side trusts the loop arithmetic. */
204
+ mkr_span_t in = mkr_span((const char *)p, len < 0 ? 0 : (size_t)len);
178
205
  char head[256];
179
- long hn = 0;
180
- for (long i = off; i < len && hn < (long)sizeof(head); i += stride) head[hn++] = (char)p[i];
206
+ mkr_spanbuf_t hw = mkr_spanbuf(head, sizeof(head));
207
+ for (size_t i = (size_t)off; hw.pos < sizeof(head); i += (size_t)stride) {
208
+ int c = mkr_span_at(&in, i);
209
+ if (c < 0) break;
210
+ mkr_spanbuf_putc(&hw, (char)c);
211
+ }
212
+ mkr_span_t h = mkr_span(head, hw.pos);
213
+ size_t hn = hw.pos;
181
214
 
182
- long i = 0;
183
- while (i < hn && (head[i] == ' ' || head[i] == '\t' || head[i] == '\r' || head[i] == '\n')) i++;
184
- if (i + 5 > hn || memcmp(head + i, "<?xml", 5) != 0) return NULL;
215
+ size_t i = 0;
216
+ while (mkr_decl_ws(mkr_span_at(&h, i))) i++;
217
+ {
218
+ mkr_span_t t = mkr_span_tail(&h, i);
219
+ if (!mkr_span_starts(&t, "<?xml", 5)) return NULL;
220
+ }
185
221
  i += 5;
186
222
  /* find a whitespace-introduced "encoding" before the '?>' */
187
223
  for (; i + 8 <= hn; i++) {
188
- if (head[i] == '?' && i + 1 < hn && head[i + 1] == '>') return NULL; /* end of decl */
189
- int ws_before = (head[i - 1] == ' ' || head[i - 1] == '\t' || head[i - 1] == '\r' || head[i - 1] == '\n');
190
- if (!ws_before || memcmp(head + i, "encoding", 8) != 0) continue;
191
- long j = i + 8;
192
- while (j < hn && (head[j] == ' ' || head[j] == '\t' || head[j] == '\r' || head[j] == '\n')) j++;
193
- if (j >= hn || head[j] != '=') return NULL;
224
+ if (mkr_span_at(&h, i) == '?' && mkr_span_at(&h, i + 1) == '>') return NULL; /* end of decl */
225
+ mkr_span_t t = mkr_span_tail(&h, i);
226
+ if (!mkr_decl_ws(mkr_span_at(&h, i - 1)) || !mkr_span_starts(&t, "encoding", 8)) continue;
227
+ size_t j = i + 8;
228
+ while (mkr_decl_ws(mkr_span_at(&h, j))) j++;
229
+ if (mkr_span_at(&h, j) != '=') return NULL;
230
+ j++;
231
+ while (mkr_decl_ws(mkr_span_at(&h, j))) j++;
232
+ int q = mkr_span_at(&h, j);
233
+ if (q != '"' && q != '\'') return NULL;
194
234
  j++;
195
- while (j < hn && (head[j] == ' ' || head[j] == '\t' || head[j] == '\r' || head[j] == '\n')) j++;
196
- if (j >= hn || (head[j] != '"' && head[j] != '\'')) return NULL;
197
- char q = head[j++];
198
- long ns = j;
199
- while (j < hn && head[j] != q) j++;
235
+ size_t ns = j;
236
+ while (mkr_span_at(&h, j) >= 0 && mkr_span_at(&h, j) != q) j++;
200
237
  if (j >= hn) return NULL;
201
238
  char name[64];
202
- long nl = j - ns;
203
- if (nl <= 0 || nl >= (long)sizeof(name)) return NULL;
204
- memcpy(name, head + ns, (size_t)nl);
239
+ size_t nl = j - ns;
240
+ if (nl == 0 || nl >= sizeof(name)) return NULL;
241
+ memcpy(name, head + ns, nl);
205
242
  name[nl] = '\0';
206
243
  return rb_enc_find(name); /* NULL for an unknown encoding name */
207
244
  }
@@ -229,9 +266,15 @@ mkr_xml_decode_input(VALUE str, size_t max_bytes)
229
266
  * conflict. ASCII-8BIT means "raw bytes, no claimed encoding", so there the
230
267
  * detected encoding decodes the input (a UTF-16/Shift_JIS/BOM'd file read
231
268
  * with File.binread now parses). */
232
- long bom_len = 0;
233
- rb_encoding *bom = mkr_xml_bom_encoding(raw, rawlen, &bom_len);
234
- rb_encoding *decl = mkr_xml_decl_encoding(raw + bom_len, rawlen - bom_len, bom);
269
+ long bom_len = 0, bom_stride = 1, bom_off = 0;
270
+ rb_encoding *bom = mkr_xml_bom_encoding(raw, rawlen, &bom_len, &bom_stride, &bom_off);
271
+ /* rb_enc_find inside the BOM lookup can autoload an encoding (a Ruby
272
+ * allocation = a GC point), so re-borrow the bytes before reading them
273
+ * again - a borrowed RSTRING pointer must not be held across one. The
274
+ * interleave geometry (stride/off) is resolved by the BOM matcher and
275
+ * passed through, keeping the decl scanner itself allocation-free. */
276
+ raw = (const unsigned char *)RSTRING_PTR(str);
277
+ rb_encoding *decl = mkr_xml_decl_encoding(raw + bom_len, rawlen - bom_len, bom_stride, bom_off);
235
278
  int is_binary = (tag == rb_ascii8bit_encoding());
236
279
 
237
280
  if (bom && decl && !mkr_xml_enc_compatible(bom, decl)) {
@@ -273,36 +316,44 @@ mkr_xml_decode_input(VALUE str, size_t max_bytes)
273
316
  rb_raise(mkr_eXmlSyntaxError,
274
317
  "XML input could not be decoded to UTF-8: %s", msg);
275
318
  }
276
- RB_GC_GUARD(in);
277
319
  }
278
320
 
279
321
  const char *ptr = RSTRING_PTR(s);
280
322
  long len = RSTRING_LEN(s);
323
+ long off = 0;
281
324
  /* §4.3.3: a leading BOM is the encoding signature, not document content -
282
325
  * strip a U+FEFF (the transcode above turns any UTF-16/32 BOM into one). */
283
- if (len >= 3 && (unsigned char)ptr[0] == 0xEF && (unsigned char)ptr[1] == 0xBB
284
- && (unsigned char)ptr[2] == 0xBF) {
285
- ptr += 3; len -= 3;
326
+ mkr_span_t sv = mkr_span(ptr, (size_t)len);
327
+ if (mkr_span_starts(&sv, "\xEF\xBB\xBF", 3)) {
328
+ off = 3; len -= 3;
329
+ mkr_span_skip(&sv, 3);
286
330
  }
287
331
 
288
- /* Fail closed on an over-budget input BEFORE the validation copy and the
332
+ /* Fail closed on an over-budget input BEFORE the validation scan and the
289
333
  * caller's GVL-release copy (an input whose UTF-8 length exceeds the arena
290
334
  * budget can never parse). max_bytes == 0 disables the check (__decode). */
291
335
  if (max_bytes != 0 && (size_t)len > max_bytes) {
292
- RB_GC_GUARD(s);
293
336
  rb_raise(mkr_eXmlLimitExceeded, "XML input exceeds the byte budget");
294
337
  }
295
338
 
296
- /* Strict UTF-8 validation: an embedded NUL or any invalid UTF-8 is fatal
297
- * (no U+FFFD repair - unlike the HTML mkr_utf8_sanitize path). */
298
- if (len > 0 && memchr(ptr, '\0', (size_t)len) != NULL) {
339
+ /* Strict UTF-8 validation, allocation-free - no GC point while `ptr` is
340
+ * borrowed (the former rb_enc_str_new copy handed the borrow straight into
341
+ * an allocating call): an embedded NUL or any invalid UTF-8 is fatal (no
342
+ * U+FFFD repair - unlike the HTML mkr_utf8_sanitize path). A whole-string
343
+ * cached coderange covers the BOM-stripped suffix too (the BOM is one
344
+ * complete UTF-8 character). */
345
+ size_t nul_at;
346
+ if (mkr_span_find(&sv, '\0', &nul_at)) {
299
347
  rb_raise(mkr_eXmlSyntaxError, "XML input must not contain a NUL byte");
300
348
  }
301
- VALUE u = rb_enc_str_new(ptr, len, rb_utf8_encoding());
302
- if (rb_enc_str_coderange(u) == ENC_CODERANGE_BROKEN) {
349
+ if (!mkr_ruby_str_known_valid_utf8(s)
350
+ && !mkr_utf8_valid((const unsigned char *)ptr + off, (size_t)len)) {
303
351
  rb_raise(mkr_eXmlSyntaxError, "XML input must be valid UTF-8");
304
352
  }
305
- RB_GC_GUARD(s);
353
+ /* Build the result through the VALUE, not the borrowed ptr (rb_str_subseq
354
+ * allocates, so the ptr must not be what it copies from). */
355
+ VALUE u = rb_str_subseq(s, off, len);
356
+ rb_enc_associate(u, rb_utf8_encoding());
306
357
  return u; /* validated, UTF-8-tagged, BOM-stripped */
307
358
  }
308
359
 
@@ -328,22 +379,26 @@ mkr_ruby_str_known_valid_utf8(VALUE str)
328
379
  const char *
329
380
  mkr_ruby_try_verified_text(VALUE sv, size_t max_bytes, mkr_ruby_borrowed_text_t *out)
330
381
  {
382
+ /* ALLOCATION-FREE, like mkr_verify_text: the returned borrow must not have
383
+ * crossed a Ruby allocation (the former rb_utf8_str_new + valid_encoding?
384
+ * funcall allocated twice with `ptr` already taken). */
331
385
  long len = RSTRING_LEN(sv);
332
386
  if ((size_t)len > max_bytes) {
333
387
  return "string exceeds the maximum length";
334
388
  }
335
389
  const char *ptr = RSTRING_PTR(sv);
336
- if (memchr(ptr, '\0', (size_t)len) != NULL) {
390
+ mkr_span_t view = mkr_span(ptr, (size_t)len);
391
+ size_t nul_at;
392
+ if (mkr_span_find(&view, '\0', &nul_at)) {
337
393
  return "string contains a NUL byte";
338
394
  }
339
- VALUE u = rb_utf8_str_new(ptr, len);
340
- if (!RTEST(rb_funcall(u, rb_intern("valid_encoding?"), 0))) {
395
+ if (!mkr_ruby_str_known_valid_utf8(sv)
396
+ && !mkr_utf8_valid((const unsigned char *)ptr, (size_t)len)) {
341
397
  return "string is not valid UTF-8";
342
398
  }
343
399
  out->value = sv;
344
400
  out->ptr = ptr;
345
401
  out->len = (size_t)len;
346
- RB_GC_GUARD(u);
347
402
  return NULL;
348
403
  }
349
404
 
@@ -368,9 +423,7 @@ mkr_ruby_exception_message(VALUE exc, char *buf, size_t len)
368
423
  }
369
424
  if (!RB_TYPE_P(msg, T_STRING)) {
370
425
  snprintf(buf, len, "%s", "error");
371
- RB_GC_GUARD(msg);
372
426
  return;
373
427
  }
374
428
  snprintf(buf, len, "%s", RSTRING_PTR(msg));
375
- RB_GC_GUARD(msg);
376
429
  }
@@ -1,5 +1,37 @@
1
1
  #include "mkr_alloc.h"
2
2
 
3
+ #ifdef MKR_ALLOC_INJECT
4
+ /* See mkr_alloc.h: the OOM sweep's failure injection. The counter counts
5
+ * ATTEMPTS (every consult), armed or not, so the harness can size its sweep
6
+ * from a disarmed baseline run; the countdown fails exactly one allocation
7
+ * and then disarms itself, modelling a single transient OOM per run. */
8
+ static long long mkr_inject_countdown = 0; /* 0 = disarmed */
9
+ static unsigned long long mkr_inject_attempts = 0;
10
+
11
+ void
12
+ mkr_alloc_inject_arm(long long nth)
13
+ {
14
+ mkr_inject_countdown = (nth > 0) ? nth : 0;
15
+ mkr_inject_attempts = 0;
16
+ }
17
+
18
+ unsigned long long
19
+ mkr_alloc_inject_calls(void)
20
+ {
21
+ return mkr_inject_attempts;
22
+ }
23
+
24
+ int
25
+ mkr_alloc_inject_should_fail(void)
26
+ {
27
+ mkr_inject_attempts++;
28
+ if (mkr_inject_countdown > 0 && --mkr_inject_countdown == 0) {
29
+ return 1; /* fail this one allocation; now disarmed */
30
+ }
31
+ return 0;
32
+ }
33
+ #endif
34
+
3
35
  void *
4
36
  mkr_reallocarray(void *ptr, size_t count, size_t elem)
5
37
  {
@@ -11,6 +43,7 @@ mkr_reallocarray(void *ptr, size_t count, size_t elem)
11
43
  if (!mkr_size_mul(count, elem, &bytes)) {
12
44
  return NULL; /* overflow: leave ptr unchanged */
13
45
  }
46
+ if (MKR_ALLOC_INJECT_FAIL()) return NULL;
14
47
  return realloc(ptr, bytes);
15
48
  }
16
49
 
@@ -20,11 +53,14 @@ mkr_callocarray(size_t count, size_t elem)
20
53
  if (count == 0 || elem == 0) {
21
54
  return NULL;
22
55
  }
23
- size_t bytes;
24
- if (!mkr_size_mul(count, elem, &bytes)) {
56
+ /* 2-arg calloc is itself overflow-safe, but check explicitly so every core
57
+ * allocator fails the SAME way (deterministic NULL) rather than leaving the
58
+ * overflow case to calloc's implementation-defined behaviour. */
59
+ if (count > SIZE_MAX / elem) {
25
60
  return NULL; /* overflow */
26
61
  }
27
- return calloc(count, elem); /* 2-arg calloc is itself overflow-safe */
62
+ if (MKR_ALLOC_INJECT_FAIL()) return NULL;
63
+ return calloc(count, elem);
28
64
  }
29
65
 
30
66
  char *
@@ -34,6 +70,7 @@ mkr_str_alloc(size_t n)
34
70
  if (!mkr_size_add(n, 1, &total)) {
35
71
  return NULL; /* n + 1 overflow */
36
72
  }
73
+ if (MKR_ALLOC_INJECT_FAIL()) return NULL;
37
74
  char *p = malloc(total);
38
75
  if (p == NULL) {
39
76
  return NULL;
@@ -97,10 +97,33 @@ char *mkr_strdup(const char *s);
97
97
  * `cap *= 2; realloc(p, cap * sizeof(T))` pattern in one call. */
98
98
  mkr_status_t mkr_grow_reserve(void **ptr, size_t *cap, size_t need, size_t elem);
99
99
 
100
- /* Self-test of the overflow / allocation / buffer edge cases (incl. paths real
101
- * inputs cannot reach). Returns 0 on success, nonzero on the first failure.
102
- * Wired to a private Ruby method for the spec suite. */
103
- int mkr_core_selftest(void);
100
+ /* --- allocation-failure injection (debug builds only) ------------------- */
101
+ /*
102
+ * The OOM sweep harness (script/check_alloc_failures.rb, `rake oom`) verifies
103
+ * that every fail-closed OOM branch actually fails closed: it arms "the nth
104
+ * core allocation fails", runs a workload, and asserts the result is either a
105
+ * clean exception or byte-identical to the baseline - never truncated.
106
+ *
107
+ * Compiled in ONLY under -DMKR_ALLOC_INJECT (extconf: MAKIRI_ALLOC_INJECT=1);
108
+ * a release build carries no counter and no branch. Covers every core libc
109
+ * allocation site (mkr_alloc.c + mkr_buf.c - the funnel the direct_alloc lint
110
+ * forces all engine allocations through). Ruby's xmalloc family and Lexbor's
111
+ * internal allocations are out of scope (Ruby raises NoMemoryError itself;
112
+ * Lexbor is vendor code). Not thread-safe by design (the harness is
113
+ * single-threaded).
114
+ */
115
+ #ifdef MKR_ALLOC_INJECT
116
+ /* Arm: the nth subsequent core allocation (1-based) fails once, then the
117
+ * injection disarms itself. nth <= 0 disarms. Resets the call counter. */
118
+ void mkr_alloc_inject_arm(long long nth);
119
+ /* Core allocation attempts since the last arm (sizes the sweep). */
120
+ unsigned long long mkr_alloc_inject_calls(void);
121
+ /* Internal: consulted by each core libc allocation site. */
122
+ int mkr_alloc_inject_should_fail(void);
123
+ #define MKR_ALLOC_INJECT_FAIL() (mkr_alloc_inject_should_fail())
124
+ #else
125
+ #define MKR_ALLOC_INJECT_FAIL() 0
126
+ #endif
104
127
 
105
128
  #ifdef __cplusplus
106
129
  }
@@ -30,7 +30,17 @@ mkr_buf_append(mkr_buf_t *b, const void *bytes, size_t n)
30
30
  if (!mkr_grow_capacity(b->cap, need_term, 1, &new_cap)) {
31
31
  return MKR_ERR_OOM;
32
32
  }
33
- char *p = realloc(b->data, new_cap);
33
+ /* Geometric growth can overshoot to ~2x need_term; clamp the ALLOCATION
34
+ * to the same ceiling as the content (limit, plus one byte for the NUL),
35
+ * so cap never runs to ~2x MKR_BUF_HARD_MAX near the limit. Safe: this
36
+ * append already passed need <= limit, so need_term <= limit + 1 and the
37
+ * clamp never drops new_cap below what this append needs. (If limit + 1
38
+ * overflows - only a pathological -DMKR_BUF_HARD_MAX=SIZE_MAX - skip it.) */
39
+ size_t cap_ceiling;
40
+ if (mkr_size_add(limit, 1, &cap_ceiling) && new_cap > cap_ceiling) {
41
+ new_cap = cap_ceiling;
42
+ }
43
+ char *p = MKR_ALLOC_INJECT_FAIL() ? NULL : realloc(b->data, new_cap);
34
44
  if (p == NULL) {
35
45
  return MKR_ERR_OOM;
36
46
  }
@@ -62,7 +72,7 @@ mkr_buf_reserve(mkr_buf_t *b, size_t n)
62
72
  if (need_term <= b->cap) {
63
73
  return MKR_OK; /* already have room */
64
74
  }
65
- char *p = realloc(b->data, need_term);
75
+ char *p = MKR_ALLOC_INJECT_FAIL() ? NULL : realloc(b->data, need_term);
66
76
  if (p == NULL) {
67
77
  return MKR_ERR_OOM;
68
78
  }
@@ -76,7 +86,7 @@ char *
76
86
  mkr_buf_steal(mkr_buf_t *b, size_t *out_len)
77
87
  {
78
88
  if (b->data == NULL) {
79
- char *empty = malloc(1);
89
+ char *empty = MKR_ALLOC_INJECT_FAIL() ? NULL : malloc(1);
80
90
  if (empty == NULL) {
81
91
  return NULL;
82
92
  }
@@ -24,11 +24,15 @@ extern "C" {
24
24
  * think about a bound gets a tight one for free, and a
25
25
  * buffer that genuinely needs to be large must opt in
26
26
  * EXPLICITLY by passing a larger max.
27
- * MKR_BUF_HARD_MAX an absolute ceiling no buffer may exceed, even one
28
- * with an explicit max - the last-resort backstop.
29
- * Tight, content-scaled bounds still belong to the
30
- * caller (e.g. the XML serializer caps itself at a
31
- * multiple of arena_bytes); this stops total runaway.
27
+ * MKR_BUF_HARD_MAX an absolute ceiling on a buffer's CONTENT length, even
28
+ * with an explicit max - the last-resort backstop. The
29
+ * ALLOCATION is bounded by it too: geometric growth is
30
+ * clamped so cap never exceeds HARD_MAX + 1 (the one NUL
31
+ * terminator byte), i.e. it does NOT overshoot to ~2x
32
+ * near the limit. Tight, content-scaled bounds still
33
+ * belong to the caller (e.g. the XML serializer caps
34
+ * itself at a multiple of arena_bytes); this stops total
35
+ * runaway.
32
36
  *
33
37
  * Override either at build time: -DMKR_BUF_DEFAULT_LIMIT=<bytes> / -DMKR_BUF_HARD_MAX=<bytes>. */
34
38
  #ifndef MKR_BUF_DEFAULT_LIMIT
@@ -82,6 +86,77 @@ mkr_buf_free(mkr_buf_t *b)
82
86
  b->len = b->cap = 0;
83
87
  }
84
88
 
89
+ /* ------------------------------------------------------------------------- */
90
+ /* mkr_spanbuf_t - a bounded writer over a borrowed, fixed-capacity buffer */
91
+ /* ------------------------------------------------------------------------- */
92
+ /*
93
+ * "span" = a non-owning view of a fixed-extent contiguous region; a spanbuf is
94
+ * the bounded *writer* over such a span. The complement to mkr_buf_t: where
95
+ * mkr_buf_t GROWS and OWNS its malloc storage, a spanbuf BORROWS a fixed buffer
96
+ * owned elsewhere (e.g. an arena cut) and never grows it.
97
+ *
98
+ * The name leads with "borrowed/fixed" deliberately, for safety: the
99
+ * fixed/bounded property is SELF-ENFORCED (a write past `cap` is refused, so
100
+ * misunderstanding it costs at most a truncation, caught by _finish), but the
101
+ * BORROWED property is the caller's responsibility - the type cannot stop you
102
+ * from free()ing `buf` or holding `finish()`'s pointer past the owner's
103
+ * lifetime, and getting that wrong is a use-after-free / double-free. So:
104
+ * - never free() buf (the owner does, e.g. the arena is freed wholesale);
105
+ * - finish()'s pointer is valid only while the backing storage lives.
106
+ *
107
+ * Bounds safety is BY CONSTRUCTION (the writer owns the cursor + check, not the
108
+ * caller's per-write guard): an over-long write is refused and latches `ok` to
109
+ * false (sticky). The caller's only duty is one check at the end (via _finish or
110
+ * the public `ok` field), failing closed rather than using a truncated buffer.
111
+ * This is the sanctioned way to hand-fill a raw region; see mkr_xml_arena_spanbuf
112
+ * for the arena adapter.
113
+ */
114
+ typedef struct {
115
+ char *buf; /* borrowed; the owner keeps it alive - do NOT free. NULL => never ok. */
116
+ size_t cap; /* capacity in bytes (fixed) */
117
+ size_t pos; /* bytes written so far (always <= cap) */
118
+ bool ok; /* false once a write would overflow, or buf was NULL */
119
+ } mkr_spanbuf_t;
120
+
121
+ /* Wrap [buf, buf+cap) for bounded writing. buf == NULL yields a permanently
122
+ * not-ok writer (every write a no-op), so an upstream allocation failure flows
123
+ * straight through without a separate guard at each call site. */
124
+ static inline mkr_spanbuf_t
125
+ mkr_spanbuf(char *buf, size_t cap)
126
+ {
127
+ return (mkr_spanbuf_t){ .buf = buf, .cap = cap, .pos = 0, .ok = (buf != NULL) };
128
+ }
129
+
130
+ /* Append one byte; refuse (latch ok=false) if it would exceed cap. */
131
+ static inline void
132
+ mkr_spanbuf_putc(mkr_spanbuf_t *b, char c)
133
+ {
134
+ if (!b->ok) return;
135
+ if (b->pos >= b->cap) { b->ok = false; return; }
136
+ b->buf[b->pos++] = c;
137
+ }
138
+
139
+ /* Append n bytes; refuse (latch ok=false) if they would exceed cap. n == 0 is a
140
+ * no-op. pos <= cap is the invariant (a refused write never advances pos), so
141
+ * cap - pos cannot underflow; the pos > cap arm is belt-and-suspenders. */
142
+ static inline void
143
+ mkr_spanbuf_write(mkr_spanbuf_t *b, const void *src, size_t n)
144
+ {
145
+ if (!b->ok || n == 0) return;
146
+ if (b->pos > b->cap || n > b->cap - b->pos) { b->ok = false; return; }
147
+ memcpy(b->buf + b->pos, src, n);
148
+ b->pos += n;
149
+ }
150
+
151
+ /* The filled prefix [buf, buf+pos), or NULL if any write was refused (or buf was
152
+ * NULL); on a non-NULL return the length is `b->pos`. Forces the caller through a
153
+ * single fail-closed check instead of trusting the writes individually. */
154
+ static inline const char *
155
+ mkr_spanbuf_finish(const mkr_spanbuf_t *b)
156
+ {
157
+ return b->ok ? b->buf : NULL;
158
+ }
159
+
85
160
  #ifdef __cplusplus
86
161
  }
87
162
  #endif