makiri 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/conformance.yml +22 -0
  3. data/.github/workflows/libfuzzer.yml +83 -0
  4. data/.github/workflows/release.yml +12 -7
  5. data/.github/workflows/security.yml +88 -3
  6. data/.github/workflows/valgrind.yml +135 -0
  7. data/CHANGELOG.md +152 -15
  8. data/README.md +183 -13
  9. data/Rakefile +294 -7
  10. data/ext/makiri/bridge/bridge.h +28 -0
  11. data/ext/makiri/bridge/ruby_string.c +282 -12
  12. data/ext/makiri/core/mkr_alloc.c +40 -3
  13. data/ext/makiri/core/mkr_alloc.h +28 -5
  14. data/ext/makiri/core/mkr_buf.c +47 -3
  15. data/ext/makiri/core/mkr_buf.h +112 -3
  16. data/ext/makiri/core/mkr_core.c +143 -0
  17. data/ext/makiri/core/mkr_core.h +11 -2
  18. data/ext/makiri/core/mkr_hash.h +1 -1
  19. data/ext/makiri/core/mkr_span.h +186 -0
  20. data/ext/makiri/core/mkr_text.h +8 -8
  21. data/ext/makiri/core/mkr_utf8.c +101 -0
  22. data/ext/makiri/core/mkr_utf8.h +88 -0
  23. data/ext/makiri/extconf.rb +123 -10
  24. data/ext/makiri/fuzz/Makefile +95 -0
  25. data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
  26. data/ext/makiri/fuzz/xml_fuzz.c +24 -0
  27. data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
  28. data/ext/makiri/glue/glue.h +55 -11
  29. data/ext/makiri/glue/ruby_doc.c +129 -59
  30. data/ext/makiri/glue/ruby_html_css.c +292 -0
  31. data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +248 -52
  32. data/ext/makiri/glue/ruby_html_node.c +859 -0
  33. data/ext/makiri/glue/ruby_html_serialize.c +154 -0
  34. data/ext/makiri/glue/ruby_node.c +74 -729
  35. data/ext/makiri/glue/ruby_node_set.c +167 -32
  36. data/ext/makiri/glue/ruby_xml.c +602 -0
  37. data/ext/makiri/glue/ruby_xml_node.c +1373 -0
  38. data/ext/makiri/glue/ruby_xpath.c +63 -30
  39. data/ext/makiri/glue/ruby_xpath.h +19 -0
  40. data/ext/makiri/lexbor_compat/compat.h +42 -9
  41. data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
  42. data/ext/makiri/lexbor_compat/dom_index.c +2 -2
  43. data/ext/makiri/lexbor_compat/post_parse.c +100 -10
  44. data/ext/makiri/lexbor_compat/source_loc.c +15 -13
  45. data/ext/makiri/lexbor_compat/text_index.c +14 -8
  46. data/ext/makiri/lexbor_compat/utf8_input.c +19 -33
  47. data/ext/makiri/makiri.c +184 -6
  48. data/ext/makiri/makiri.h +43 -2
  49. data/ext/makiri/xml/mkr_xml.h +125 -0
  50. data/ext/makiri/xml/mkr_xml_chars.c +195 -0
  51. data/ext/makiri/xml/mkr_xml_index.c +169 -0
  52. data/ext/makiri/xml/mkr_xml_index.h +48 -0
  53. data/ext/makiri/xml/mkr_xml_mutate.c +817 -0
  54. data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
  55. data/ext/makiri/xml/mkr_xml_node.c +399 -0
  56. data/ext/makiri/xml/mkr_xml_node.h +184 -0
  57. data/ext/makiri/xml/mkr_xml_tree.c +1515 -0
  58. data/ext/makiri/xpath/mkr_css.c +1023 -0
  59. data/ext/makiri/xpath/mkr_css.h +65 -0
  60. data/ext/makiri/xpath/mkr_xpath.c +96 -32
  61. data/ext/makiri/xpath/mkr_xpath.h +109 -4
  62. data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
  63. data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
  64. data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +551 -241
  65. data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +318 -276
  66. data/ext/makiri/xpath/mkr_xpath_internal.h +177 -206
  67. data/ext/makiri/xpath/mkr_xpath_lex.c +95 -125
  68. data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
  69. data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +145 -0
  70. data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
  71. data/ext/makiri/xpath/mkr_xpath_parse.c +83 -94
  72. data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
  73. data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
  74. data/ext/makiri/xpath/mkr_xpath_shared.c +609 -0
  75. data/ext/makiri/xpath/mkr_xpath_value_body.h +801 -0
  76. data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
  77. data/lib/makiri/{attribute.rb → attr.rb} +7 -3
  78. data/lib/makiri/cdata_section.rb +19 -0
  79. data/lib/makiri/comment.rb +10 -0
  80. data/lib/makiri/compat_aliases.rb +30 -0
  81. data/lib/makiri/document.rb +9 -73
  82. data/lib/makiri/document_fragment.rb +14 -9
  83. data/lib/makiri/element.rb +4 -4
  84. data/lib/makiri/html/document.rb +106 -0
  85. data/lib/makiri/html/node_methods.rb +19 -0
  86. data/lib/makiri/html.rb +12 -0
  87. data/lib/makiri/node.rb +58 -15
  88. data/lib/makiri/node_set.rb +8 -0
  89. data/lib/makiri/processing_instruction.rb +10 -0
  90. data/lib/makiri/text.rb +1 -1
  91. data/lib/makiri/version.rb +1 -1
  92. data/lib/makiri/xml/builder.rb +263 -0
  93. data/lib/makiri/xml/document.rb +24 -0
  94. data/lib/makiri/xml/node_methods.rb +84 -0
  95. data/lib/makiri/xml.rb +10 -0
  96. data/lib/makiri/xpath_context.rb +1 -1
  97. data/lib/makiri.rb +24 -5
  98. data/script/build_native_gem.rb +2 -2
  99. data/script/check_alloc_failures.rb +266 -0
  100. data/script/check_c_safety.rb +77 -2
  101. data/script/check_c_safety_allowlist.yml +102 -0
  102. data/script/check_leaks.rb +64 -0
  103. data/script/leaks_harness.rb +64 -0
  104. data/vendor/lexbor/CMakeLists.txt +6 -0
  105. data/vendor/lexbor/README.md +12 -0
  106. data/vendor/lexbor/config.cmake +1 -1
  107. data/vendor/lexbor/source/lexbor/core/base.h +1 -1
  108. data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
  109. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
  110. data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
  111. data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
  112. data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
  113. data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
  114. data/vendor/lexbor/source/lexbor/html/base.h +1 -1
  115. data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
  116. data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
  117. data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
  118. data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
  119. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
  120. data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
  121. data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
  122. data/vendor/lexbor/source/lexbor/url/base.h +1 -1
  123. data/vendor/lexbor/source/lexbor/url/url.c +5 -2
  124. data/vendor/lexbor/source/lexbor/url/url.h +9 -0
  125. data/vendor/lexbor/version +1 -1
  126. metadata +53 -9
  127. data/ext/makiri/glue/ruby_css.c +0 -185
  128. data/ext/makiri/glue/ruby_serialize.c +0 -92
  129. data/ext/makiri/xpath/mkr_xpath_value.c +0 -1286
  130. data/lib/makiri/cdata.rb +0 -6
@@ -48,19 +48,31 @@ mkr_ruby_str_from_borrowed(mkr_borrowed_text_t text)
48
48
  void
49
49
  mkr_verify_text(VALUE str, const char *what)
50
50
  {
51
+ /* ALLOCATION-FREE by design: this gate runs between a caller taking a
52
+ * borrowed RSTRING pointer and using it, so it must not be a GC point. The
53
+ * former implementation built a throwaway Ruby String (rb_enc_str_new) to
54
+ * ask for its coderange - a Ruby allocation inside every borrow, which both
55
+ * passed the borrowed ptr into an allocating call and opened a GC window
56
+ * under every OTHER borrow already held at multi-borrow call sites. Bytes
57
+ * are validated as UTF-8 regardless of the String's declared encoding,
58
+ * exactly as before. */
51
59
  long len = RSTRING_LEN(str);
52
60
  const char *ptr = RSTRING_PTR(str);
53
61
 
54
- if (len > 0 && memchr(ptr, '\0', (size_t)len) != NULL) {
62
+ mkr_span_t sv = mkr_span(ptr, (size_t)len);
63
+ size_t nul_at;
64
+ if (mkr_span_find(&sv, '\0', &nul_at)) {
55
65
  rb_raise(mkr_eError, "%s must not contain a NUL byte", what);
56
66
  }
57
67
 
58
- /* Validate the bytes as UTF-8 regardless of the String's declared encoding. */
59
- VALUE u = rb_enc_str_new(ptr, len, rb_utf8_encoding());
60
- if (rb_enc_str_coderange(u) == ENC_CODERANGE_BROKEN) {
68
+ /* Cached-coderange fast path (reads flags, never scans, never allocates);
69
+ * NUL is valid UTF-8, so the memchr above stays either way. */
70
+ if (mkr_ruby_str_known_valid_utf8(str)) {
71
+ return;
72
+ }
73
+ if (!mkr_utf8_valid((const unsigned char *)ptr, (size_t)len)) {
61
74
  rb_raise(mkr_eError, "%s must be valid UTF-8", what);
62
75
  }
63
- RB_GC_GUARD(str);
64
76
  }
65
77
 
66
78
  mkr_ruby_borrowed_text_t
@@ -96,7 +108,6 @@ mkr_ruby_copy_bytes(VALUE in, mkr_owned_bytes_t *out)
96
108
  size_t alloc_len = (v.len > 0) ? v.len : 1;
97
109
  char *buf = mkr_reallocarray(NULL, alloc_len, 1);
98
110
  if (buf == NULL) {
99
- RB_GC_GUARD(v.value);
100
111
  return -1;
101
112
  }
102
113
  if (v.len > 0) {
@@ -108,25 +119,286 @@ mkr_ruby_copy_bytes(VALUE in, mkr_owned_bytes_t *out)
108
119
  return 0;
109
120
  }
110
121
 
122
+ VALUE
123
+ mkr_ruby_to_utf8(VALUE str)
124
+ {
125
+ /* Honour the Ruby String's declared encoding so its content survives:
126
+ *
127
+ * - UTF-8 / US-ASCII / ASCII-8BIT (binary): returned unchanged. These are
128
+ * already UTF-8 bytes (or deliberately raw bytes), and the native parser
129
+ * does the WHATWG invalid-byte replacement for them. The UTF-8 common
130
+ * case costs only this encoding comparison - no transcode, no copy.
131
+ *
132
+ * - any other encoding (Shift_JIS, EUC-JP, ISO-8859-1, Windows-1252, ...):
133
+ * transcoded to UTF-8 with invalid/undef -> U+FFFD, so e.g. Shift_JIS
134
+ * text becomes the right UTF-8 characters instead of being read as raw
135
+ * UTF-8 bytes and mangled. Only non-UTF-8 input pays this. */
136
+ rb_encoding *enc = rb_enc_get(str);
137
+ if (enc == rb_utf8_encoding()
138
+ || enc == rb_usascii_encoding()
139
+ || enc == rb_ascii8bit_encoding()) {
140
+ return str;
141
+ }
142
+ return rb_str_encode(str, rb_enc_from_encoding(rb_utf8_encoding()),
143
+ ECONV_INVALID_REPLACE | ECONV_UNDEF_REPLACE, Qnil);
144
+ }
145
+
146
+ /* rb_str_encode with no replacement flags: an undefined conversion or invalid
147
+ * byte sequence RAISES (Encoding::UndefinedConversionError /
148
+ * Encoding::InvalidByteSequenceError) instead of substituting U+FFFD. Run under
149
+ * rb_protect so we can remap the Ruby Encoding error to Makiri::XML::SyntaxError. */
150
+ static VALUE
151
+ mkr_xml_strict_transcode_thunk(VALUE str)
152
+ {
153
+ return rb_str_encode(str, rb_enc_from_encoding(rb_utf8_encoding()), 0, Qnil);
154
+ }
155
+
156
+ /* --- XML 1.0 Appendix F: byte-encoding autodetection (BOM, then declaration) ---
157
+ *
158
+ * The leading byte-order mark, or NULL; *bom_len gets its length. UTF-32 BOMs are
159
+ * checked before the UTF-16 LE BOM they share a prefix with.
160
+ *
161
+ * *stride / *ascii_off get the interleave geometry of the ASCII column the decl
162
+ * scanner later extracts (default 1/0 for a single-byte stream). It is resolved
163
+ * HERE, at the match, rather than re-derived downstream, because that derivation
164
+ * needs rb_enc_find (it can autoload an encoding = a GC point) and the decl
165
+ * scanner reads a borrowed RSTRING view that must not be held across one - so
166
+ * the scanner is kept allocation-free until its reads are done. Each span read
167
+ * of p still finishes before the rb_enc_find in the return. */
168
+ static rb_encoding *
169
+ mkr_xml_bom_encoding(const unsigned char *p, long len, long *bom_len, long *stride, long *ascii_off)
170
+ {
171
+ mkr_span_t s = mkr_span((const char *)p, (size_t)len);
172
+ *bom_len = 0;
173
+ *stride = 1;
174
+ *ascii_off = 0;
175
+ if (mkr_span_starts(&s, "\x00\x00\xFE\xFF", 4)) { *bom_len = 4; *stride = 4; *ascii_off = 3; return rb_enc_find("UTF-32BE"); }
176
+ if (mkr_span_starts(&s, "\xFF\xFE\x00\x00", 4)) { *bom_len = 4; *stride = 4; *ascii_off = 0; return rb_enc_find("UTF-32LE"); }
177
+ if (mkr_span_starts(&s, "\xFE\xFF", 2)) { *bom_len = 2; *stride = 2; *ascii_off = 1; return rb_enc_find("UTF-16BE"); }
178
+ if (mkr_span_starts(&s, "\xFF\xFE", 2)) { *bom_len = 2; *stride = 2; *ascii_off = 0; return rb_enc_find("UTF-16LE"); }
179
+ if (mkr_span_starts(&s, "\xEF\xBB\xBF", 3)) { *bom_len = 3; return rb_utf8_encoding(); }
180
+ return NULL;
181
+ }
182
+
183
+ /* The encoding named in the '<?xml ... encoding="NAME" ?>' declaration, or NULL.
184
+ * The declaration is ASCII; for a UTF-16/32-detected document its bytes are
185
+ * stride-interleaved, so the ASCII column is extracted (stride/off resolved by
186
+ * the BOM matcher) before the scan, letting a BOM-vs-declaration conflict be
187
+ * caught even in UTF-16.
188
+ *
189
+ * p is a borrowed RSTRING view, so this stays allocation-free until every read
190
+ * of p is done: the stride/off geometry is passed in (rather than derived here
191
+ * via rb_enc_find, which can autoload = a GC point), and the only rb_enc_find -
192
+ * the final name lookup - runs after the bytes have been copied into head[]. */
193
+ static int
194
+ mkr_decl_ws(int c)
195
+ {
196
+ return c == ' ' || c == '\t' || c == '\r' || c == '\n';
197
+ }
198
+
199
+ static rb_encoding *
200
+ mkr_xml_decl_encoding(const unsigned char *p, long len, long stride, long off)
201
+ {
202
+ /* Extract the ASCII column (per the BOM stride) through bounded reads into
203
+ * a bounded writer - neither side trusts the loop arithmetic. */
204
+ mkr_span_t in = mkr_span((const char *)p, len < 0 ? 0 : (size_t)len);
205
+ char head[256];
206
+ mkr_spanbuf_t hw = mkr_spanbuf(head, sizeof(head));
207
+ for (size_t i = (size_t)off; hw.pos < sizeof(head); i += (size_t)stride) {
208
+ int c = mkr_span_at(&in, i);
209
+ if (c < 0) break;
210
+ mkr_spanbuf_putc(&hw, (char)c);
211
+ }
212
+ mkr_span_t h = mkr_span(head, hw.pos);
213
+ size_t hn = hw.pos;
214
+
215
+ size_t i = 0;
216
+ while (mkr_decl_ws(mkr_span_at(&h, i))) i++;
217
+ {
218
+ mkr_span_t t = mkr_span_tail(&h, i);
219
+ if (!mkr_span_starts(&t, "<?xml", 5)) return NULL;
220
+ }
221
+ i += 5;
222
+ /* find a whitespace-introduced "encoding" before the '?>' */
223
+ for (; i + 8 <= hn; i++) {
224
+ if (mkr_span_at(&h, i) == '?' && mkr_span_at(&h, i + 1) == '>') return NULL; /* end of decl */
225
+ mkr_span_t t = mkr_span_tail(&h, i);
226
+ if (!mkr_decl_ws(mkr_span_at(&h, i - 1)) || !mkr_span_starts(&t, "encoding", 8)) continue;
227
+ size_t j = i + 8;
228
+ while (mkr_decl_ws(mkr_span_at(&h, j))) j++;
229
+ if (mkr_span_at(&h, j) != '=') return NULL;
230
+ j++;
231
+ while (mkr_decl_ws(mkr_span_at(&h, j))) j++;
232
+ int q = mkr_span_at(&h, j);
233
+ if (q != '"' && q != '\'') return NULL;
234
+ j++;
235
+ size_t ns = j;
236
+ while (mkr_span_at(&h, j) >= 0 && mkr_span_at(&h, j) != q) j++;
237
+ if (j >= hn) return NULL;
238
+ char name[64];
239
+ size_t nl = j - ns;
240
+ if (nl == 0 || nl >= sizeof(name)) return NULL;
241
+ memcpy(name, head + ns, nl);
242
+ name[nl] = '\0';
243
+ return rb_enc_find(name); /* NULL for an unknown encoding name */
244
+ }
245
+ return NULL;
246
+ }
247
+
248
+ /* Two encodings agree for conflict purposes when identical, or when either is
249
+ * US-ASCII (a subset of UTF-8 and the single-byte encodings). */
250
+ static int
251
+ mkr_xml_enc_compatible(rb_encoding *a, rb_encoding *b)
252
+ {
253
+ return a == b || a == rb_usascii_encoding() || b == rb_usascii_encoding();
254
+ }
255
+
256
+ VALUE
257
+ mkr_xml_decode_input(VALUE str, size_t max_bytes)
258
+ {
259
+ rb_encoding *tag = rb_enc_get(str);
260
+ const unsigned char *raw = (const unsigned char *)RSTRING_PTR(str);
261
+ long rawlen = RSTRING_LEN(str);
262
+
263
+ /* Detect the byte encoding (XML 1.0 Appendix F): a BOM wins, else the
264
+ * declaration. The Ruby String's encoding is authoritative when it is a
265
+ * concrete text encoding; a BOM/declaration that disagrees is a fatal
266
+ * conflict. ASCII-8BIT means "raw bytes, no claimed encoding", so there the
267
+ * detected encoding decodes the input (a UTF-16/Shift_JIS/BOM'd file read
268
+ * with File.binread now parses). */
269
+ long bom_len = 0, bom_stride = 1, bom_off = 0;
270
+ rb_encoding *bom = mkr_xml_bom_encoding(raw, rawlen, &bom_len, &bom_stride, &bom_off);
271
+ /* rb_enc_find inside the BOM lookup can autoload an encoding (a Ruby
272
+ * allocation = a GC point), so re-borrow the bytes before reading them
273
+ * again - a borrowed RSTRING pointer must not be held across one. The
274
+ * interleave geometry (stride/off) is resolved by the BOM matcher and
275
+ * passed through, keeping the decl scanner itself allocation-free. */
276
+ raw = (const unsigned char *)RSTRING_PTR(str);
277
+ rb_encoding *decl = mkr_xml_decl_encoding(raw + bom_len, rawlen - bom_len, bom_stride, bom_off);
278
+ int is_binary = (tag == rb_ascii8bit_encoding());
279
+
280
+ if (bom && decl && !mkr_xml_enc_compatible(bom, decl)) {
281
+ rb_raise(mkr_eXmlSyntaxError,
282
+ "XML encoding conflict: the byte-order mark and the encoding declaration disagree");
283
+ }
284
+ if (!is_binary && bom && !mkr_xml_enc_compatible(bom, tag)) {
285
+ rb_raise(mkr_eXmlSyntaxError,
286
+ "XML encoding conflict: the byte-order mark disagrees with the string's encoding");
287
+ }
288
+ if (!is_binary && decl && !mkr_xml_enc_compatible(decl, tag)) {
289
+ /* A concrete String encoding is authoritative for decoding, so the
290
+ * declaration is not used to transcode - but a declaration that names a
291
+ * different encoding than the String is tagged with (e.g. a Shift_JIS
292
+ * String declaring encoding="UTF-8") is a self-inconsistent document and
293
+ * a fatal error, not a silently-ignored mismatch. */
294
+ rb_raise(mkr_eXmlSyntaxError,
295
+ "XML encoding conflict: the encoding declaration disagrees with the string's encoding");
296
+ }
297
+
298
+ rb_encoding *eff = is_binary ? (bom ? bom : (decl ? decl : rb_utf8_encoding())) : tag;
299
+
300
+ /* Decode to UTF-8 (strict). UTF-8 / US-ASCII / ASCII-8BIT are already UTF-8
301
+ * bytes (validated below); anything else is strict-transcoded, raising rather
302
+ * than substituting U+FFFD. */
303
+ VALUE s;
304
+ if (eff == rb_utf8_encoding() || eff == rb_usascii_encoding() || eff == rb_ascii8bit_encoding()) {
305
+ s = str;
306
+ } else {
307
+ VALUE in = str;
308
+ if (rb_enc_get(str) != eff) { in = rb_str_dup(str); rb_enc_associate(in, eff); }
309
+ int state = 0;
310
+ s = rb_protect(mkr_xml_strict_transcode_thunk, in, &state);
311
+ if (state != 0) {
312
+ VALUE exc = rb_errinfo();
313
+ rb_set_errinfo(Qnil);
314
+ char msg[256];
315
+ mkr_ruby_exception_message(exc, msg, sizeof msg);
316
+ rb_raise(mkr_eXmlSyntaxError,
317
+ "XML input could not be decoded to UTF-8: %s", msg);
318
+ }
319
+ }
320
+
321
+ const char *ptr = RSTRING_PTR(s);
322
+ long len = RSTRING_LEN(s);
323
+ long off = 0;
324
+ /* §4.3.3: a leading BOM is the encoding signature, not document content -
325
+ * strip a U+FEFF (the transcode above turns any UTF-16/32 BOM into one). */
326
+ mkr_span_t sv = mkr_span(ptr, (size_t)len);
327
+ if (mkr_span_starts(&sv, "\xEF\xBB\xBF", 3)) {
328
+ off = 3; len -= 3;
329
+ mkr_span_skip(&sv, 3);
330
+ }
331
+
332
+ /* Fail closed on an over-budget input BEFORE the validation scan and the
333
+ * caller's GVL-release copy (an input whose UTF-8 length exceeds the arena
334
+ * budget can never parse). max_bytes == 0 disables the check (__decode). */
335
+ if (max_bytes != 0 && (size_t)len > max_bytes) {
336
+ rb_raise(mkr_eXmlLimitExceeded, "XML input exceeds the byte budget");
337
+ }
338
+
339
+ /* Strict UTF-8 validation, allocation-free - no GC point while `ptr` is
340
+ * borrowed (the former rb_enc_str_new copy handed the borrow straight into
341
+ * an allocating call): an embedded NUL or any invalid UTF-8 is fatal (no
342
+ * U+FFFD repair - unlike the HTML mkr_utf8_sanitize path). A whole-string
343
+ * cached coderange covers the BOM-stripped suffix too (the BOM is one
344
+ * complete UTF-8 character). */
345
+ size_t nul_at;
346
+ if (mkr_span_find(&sv, '\0', &nul_at)) {
347
+ rb_raise(mkr_eXmlSyntaxError, "XML input must not contain a NUL byte");
348
+ }
349
+ if (!mkr_ruby_str_known_valid_utf8(s)
350
+ && !mkr_utf8_valid((const unsigned char *)ptr + off, (size_t)len)) {
351
+ rb_raise(mkr_eXmlSyntaxError, "XML input must be valid UTF-8");
352
+ }
353
+ /* Build the result through the VALUE, not the borrowed ptr (rb_str_subseq
354
+ * allocates, so the ptr must not be what it copies from). */
355
+ VALUE u = rb_str_subseq(s, off, len);
356
+ rb_enc_associate(u, rb_utf8_encoding());
357
+ return u; /* validated, UTF-8-tagged, BOM-stripped */
358
+ }
359
+
360
+ bool
361
+ mkr_ruby_str_known_valid_utf8(VALUE str)
362
+ {
363
+ if (!RB_TYPE_P(str, T_STRING)) {
364
+ return false;
365
+ }
366
+ /* ENC_CODERANGE reads the *cached* classification from the object's flags;
367
+ * it does NOT scan (rb_enc_str_coderange would, costing as much as our own
368
+ * validator). So this only wins when Ruby already knows the answer. */
369
+ int cr = ENC_CODERANGE(str);
370
+ if (cr == ENC_CODERANGE_7BIT) {
371
+ return true; /* all bytes < 0x80 in an ASCII-compatible encoding */
372
+ }
373
+ if (cr == ENC_CODERANGE_VALID) {
374
+ return rb_enc_get(str) == rb_utf8_encoding(); /* valid AND UTF-8 */
375
+ }
376
+ return false; /* UNKNOWN or BROKEN: let mkr_utf8_sanitize handle it */
377
+ }
378
+
111
379
  const char *
112
380
  mkr_ruby_try_verified_text(VALUE sv, size_t max_bytes, mkr_ruby_borrowed_text_t *out)
113
381
  {
382
+ /* ALLOCATION-FREE, like mkr_verify_text: the returned borrow must not have
383
+ * crossed a Ruby allocation (the former rb_utf8_str_new + valid_encoding?
384
+ * funcall allocated twice with `ptr` already taken). */
114
385
  long len = RSTRING_LEN(sv);
115
386
  if ((size_t)len > max_bytes) {
116
387
  return "string exceeds the maximum length";
117
388
  }
118
389
  const char *ptr = RSTRING_PTR(sv);
119
- if (memchr(ptr, '\0', (size_t)len) != NULL) {
390
+ mkr_span_t view = mkr_span(ptr, (size_t)len);
391
+ size_t nul_at;
392
+ if (mkr_span_find(&view, '\0', &nul_at)) {
120
393
  return "string contains a NUL byte";
121
394
  }
122
- VALUE u = rb_utf8_str_new(ptr, len);
123
- if (!RTEST(rb_funcall(u, rb_intern("valid_encoding?"), 0))) {
395
+ if (!mkr_ruby_str_known_valid_utf8(sv)
396
+ && !mkr_utf8_valid((const unsigned char *)ptr, (size_t)len)) {
124
397
  return "string is not valid UTF-8";
125
398
  }
126
399
  out->value = sv;
127
400
  out->ptr = ptr;
128
401
  out->len = (size_t)len;
129
- RB_GC_GUARD(u);
130
402
  return NULL;
131
403
  }
132
404
 
@@ -151,9 +423,7 @@ mkr_ruby_exception_message(VALUE exc, char *buf, size_t len)
151
423
  }
152
424
  if (!RB_TYPE_P(msg, T_STRING)) {
153
425
  snprintf(buf, len, "%s", "error");
154
- RB_GC_GUARD(msg);
155
426
  return;
156
427
  }
157
428
  snprintf(buf, len, "%s", RSTRING_PTR(msg));
158
- RB_GC_GUARD(msg);
159
429
  }
@@ -1,5 +1,37 @@
1
1
  #include "mkr_alloc.h"
2
2
 
3
+ #ifdef MKR_ALLOC_INJECT
4
+ /* See mkr_alloc.h: the OOM sweep's failure injection. The counter counts
5
+ * ATTEMPTS (every consult), armed or not, so the harness can size its sweep
6
+ * from a disarmed baseline run; the countdown fails exactly one allocation
7
+ * and then disarms itself, modelling a single transient OOM per run. */
8
+ static long long mkr_inject_countdown = 0; /* 0 = disarmed */
9
+ static unsigned long long mkr_inject_attempts = 0;
10
+
11
+ void
12
+ mkr_alloc_inject_arm(long long nth)
13
+ {
14
+ mkr_inject_countdown = (nth > 0) ? nth : 0;
15
+ mkr_inject_attempts = 0;
16
+ }
17
+
18
+ unsigned long long
19
+ mkr_alloc_inject_calls(void)
20
+ {
21
+ return mkr_inject_attempts;
22
+ }
23
+
24
+ int
25
+ mkr_alloc_inject_should_fail(void)
26
+ {
27
+ mkr_inject_attempts++;
28
+ if (mkr_inject_countdown > 0 && --mkr_inject_countdown == 0) {
29
+ return 1; /* fail this one allocation; now disarmed */
30
+ }
31
+ return 0;
32
+ }
33
+ #endif
34
+
3
35
  void *
4
36
  mkr_reallocarray(void *ptr, size_t count, size_t elem)
5
37
  {
@@ -11,6 +43,7 @@ mkr_reallocarray(void *ptr, size_t count, size_t elem)
11
43
  if (!mkr_size_mul(count, elem, &bytes)) {
12
44
  return NULL; /* overflow: leave ptr unchanged */
13
45
  }
46
+ if (MKR_ALLOC_INJECT_FAIL()) return NULL;
14
47
  return realloc(ptr, bytes);
15
48
  }
16
49
 
@@ -20,11 +53,14 @@ mkr_callocarray(size_t count, size_t elem)
20
53
  if (count == 0 || elem == 0) {
21
54
  return NULL;
22
55
  }
23
- size_t bytes;
24
- if (!mkr_size_mul(count, elem, &bytes)) {
56
+ /* 2-arg calloc is itself overflow-safe, but check explicitly so every core
57
+ * allocator fails the SAME way (deterministic NULL) rather than leaving the
58
+ * overflow case to calloc's implementation-defined behaviour. */
59
+ if (count > SIZE_MAX / elem) {
25
60
  return NULL; /* overflow */
26
61
  }
27
- return calloc(count, elem); /* 2-arg calloc is itself overflow-safe */
62
+ if (MKR_ALLOC_INJECT_FAIL()) return NULL;
63
+ return calloc(count, elem);
28
64
  }
29
65
 
30
66
  char *
@@ -34,6 +70,7 @@ mkr_str_alloc(size_t n)
34
70
  if (!mkr_size_add(n, 1, &total)) {
35
71
  return NULL; /* n + 1 overflow */
36
72
  }
73
+ if (MKR_ALLOC_INJECT_FAIL()) return NULL;
37
74
  char *p = malloc(total);
38
75
  if (p == NULL) {
39
76
  return NULL;
@@ -6,7 +6,7 @@
6
6
  * allocators, the foundation every other C layer (glue, xpath engine,
7
7
  * lexbor_compat) builds on, so the ad-hoc `cap *= 2` / `n + 1` /
8
8
  * `malloc(n * sizeof(T))` patterns are written once, here, and fail closed.
9
- * NOTHING in this header touches Ruby exception mapping happens at the glue
9
+ * NOTHING in this header touches Ruby - exception mapping happens at the glue
10
10
  * boundary. (mkr_core.h is a thin umbrella over this + the other core headers.)
11
11
  */
12
12
 
@@ -97,10 +97,33 @@ char *mkr_strdup(const char *s);
97
97
  * `cap *= 2; realloc(p, cap * sizeof(T))` pattern in one call. */
98
98
  mkr_status_t mkr_grow_reserve(void **ptr, size_t *cap, size_t need, size_t elem);
99
99
 
100
- /* Self-test of the overflow / allocation / buffer edge cases (incl. paths real
101
- * inputs cannot reach). Returns 0 on success, nonzero on the first failure.
102
- * Wired to a private Ruby method for the spec suite. */
103
- int mkr_core_selftest(void);
100
+ /* --- allocation-failure injection (debug builds only) ------------------- */
101
+ /*
102
+ * The OOM sweep harness (script/check_alloc_failures.rb, `rake oom`) verifies
103
+ * that every fail-closed OOM branch actually fails closed: it arms "the nth
104
+ * core allocation fails", runs a workload, and asserts the result is either a
105
+ * clean exception or byte-identical to the baseline - never truncated.
106
+ *
107
+ * Compiled in ONLY under -DMKR_ALLOC_INJECT (extconf: MAKIRI_ALLOC_INJECT=1);
108
+ * a release build carries no counter and no branch. Covers every core libc
109
+ * allocation site (mkr_alloc.c + mkr_buf.c - the funnel the direct_alloc lint
110
+ * forces all engine allocations through). Ruby's xmalloc family and Lexbor's
111
+ * internal allocations are out of scope (Ruby raises NoMemoryError itself;
112
+ * Lexbor is vendor code). Not thread-safe by design (the harness is
113
+ * single-threaded).
114
+ */
115
+ #ifdef MKR_ALLOC_INJECT
116
+ /* Arm: the nth subsequent core allocation (1-based) fails once, then the
117
+ * injection disarms itself. nth <= 0 disarms. Resets the call counter. */
118
+ void mkr_alloc_inject_arm(long long nth);
119
+ /* Core allocation attempts since the last arm (sizes the sweep). */
120
+ unsigned long long mkr_alloc_inject_calls(void);
121
+ /* Internal: consulted by each core libc allocation site. */
122
+ int mkr_alloc_inject_should_fail(void);
123
+ #define MKR_ALLOC_INJECT_FAIL() (mkr_alloc_inject_should_fail())
124
+ #else
125
+ #define MKR_ALLOC_INJECT_FAIL() 0
126
+ #endif
104
127
 
105
128
  #ifdef __cplusplus
106
129
  }
@@ -13,7 +13,12 @@ mkr_buf_append(mkr_buf_t *b, const void *bytes, size_t n)
13
13
  if (!mkr_size_add(b->len, n, &need)) {
14
14
  return MKR_ERR_OOM;
15
15
  }
16
- if (b->max != 0 && need > b->max) {
16
+ /* max == 0 is NOT unbounded: it falls back to the conservative default
17
+ * ceiling, so a caller that never set a cap still fails closed. Either way the
18
+ * absolute hard ceiling clamps it, so no buffer can exhaust memory. */
19
+ size_t soft = (b->max != 0) ? b->max : MKR_BUF_DEFAULT_LIMIT;
20
+ size_t limit = (soft < MKR_BUF_HARD_MAX) ? soft : MKR_BUF_HARD_MAX;
21
+ if (need > limit) {
17
22
  return MKR_ERR_LIMIT;
18
23
  }
19
24
  size_t need_term; /* room for the NUL terminator too */
@@ -25,7 +30,17 @@ mkr_buf_append(mkr_buf_t *b, const void *bytes, size_t n)
25
30
  if (!mkr_grow_capacity(b->cap, need_term, 1, &new_cap)) {
26
31
  return MKR_ERR_OOM;
27
32
  }
28
- char *p = realloc(b->data, new_cap);
33
+ /* Geometric growth can overshoot to ~2x need_term; clamp the ALLOCATION
34
+ * to the same ceiling as the content (limit, plus one byte for the NUL),
35
+ * so cap never runs to ~2x MKR_BUF_HARD_MAX near the limit. Safe: this
36
+ * append already passed need <= limit, so need_term <= limit + 1 and the
37
+ * clamp never drops new_cap below what this append needs. (If limit + 1
38
+ * overflows - only a pathological -DMKR_BUF_HARD_MAX=SIZE_MAX - skip it.) */
39
+ size_t cap_ceiling;
40
+ if (mkr_size_add(limit, 1, &cap_ceiling) && new_cap > cap_ceiling) {
41
+ new_cap = cap_ceiling;
42
+ }
43
+ char *p = MKR_ALLOC_INJECT_FAIL() ? NULL : realloc(b->data, new_cap);
29
44
  if (p == NULL) {
30
45
  return MKR_ERR_OOM;
31
46
  }
@@ -38,11 +53,40 @@ mkr_buf_append(mkr_buf_t *b, const void *bytes, size_t n)
38
53
  return MKR_OK;
39
54
  }
40
55
 
56
+ mkr_status_t
57
+ mkr_buf_reserve(mkr_buf_t *b, size_t n)
58
+ {
59
+ /* Pre-allocate capacity for n bytes so a known-size fill does not realloc on
60
+ * every geometric step (the serializer reserves ~the output size up front).
61
+ * Best-effort: never grow past the buffer's own cap, and a later append still
62
+ * fails closed if the real output exceeds it. */
63
+ size_t soft = (b->max != 0) ? b->max : MKR_BUF_DEFAULT_LIMIT;
64
+ size_t limit = (soft < MKR_BUF_HARD_MAX) ? soft : MKR_BUF_HARD_MAX;
65
+ if (n > limit) {
66
+ n = limit;
67
+ }
68
+ size_t need_term; /* room for the NUL terminator too */
69
+ if (!mkr_size_add(n, 1, &need_term)) {
70
+ return MKR_ERR_OOM;
71
+ }
72
+ if (need_term <= b->cap) {
73
+ return MKR_OK; /* already have room */
74
+ }
75
+ char *p = MKR_ALLOC_INJECT_FAIL() ? NULL : realloc(b->data, need_term);
76
+ if (p == NULL) {
77
+ return MKR_ERR_OOM;
78
+ }
79
+ b->data = p;
80
+ b->cap = need_term;
81
+ b->data[b->len] = '\0'; /* keep NUL-terminated */
82
+ return MKR_OK;
83
+ }
84
+
41
85
  char *
42
86
  mkr_buf_steal(mkr_buf_t *b, size_t *out_len)
43
87
  {
44
88
  if (b->data == NULL) {
45
- char *empty = malloc(1);
89
+ char *empty = MKR_ALLOC_INJECT_FAIL() ? NULL : malloc(1);
46
90
  if (empty == NULL) {
47
91
  return NULL;
48
92
  }