makiri 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/conformance.yml +22 -0
- data/.github/workflows/libfuzzer.yml +83 -0
- data/.github/workflows/security.yml +88 -3
- data/.github/workflows/valgrind.yml +135 -0
- data/CHANGELOG.md +60 -2
- data/README.md +81 -77
- data/Rakefile +194 -3
- data/ext/makiri/bridge/ruby_string.c +119 -66
- data/ext/makiri/core/mkr_alloc.c +40 -3
- data/ext/makiri/core/mkr_alloc.h +27 -4
- data/ext/makiri/core/mkr_buf.c +13 -3
- data/ext/makiri/core/mkr_buf.h +80 -5
- data/ext/makiri/core/mkr_core.c +143 -0
- data/ext/makiri/core/mkr_core.h +10 -1
- data/ext/makiri/core/mkr_span.h +186 -0
- data/ext/makiri/core/mkr_utf8.c +101 -0
- data/ext/makiri/core/mkr_utf8.h +88 -0
- data/ext/makiri/extconf.rb +104 -9
- data/ext/makiri/fuzz/Makefile +95 -0
- data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
- data/ext/makiri/fuzz/xml_fuzz.c +24 -0
- data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
- data/ext/makiri/glue/glue.h +8 -0
- data/ext/makiri/glue/ruby_doc.c +20 -24
- data/ext/makiri/glue/ruby_html_css.c +58 -12
- data/ext/makiri/glue/ruby_html_mutate.c +11 -6
- data/ext/makiri/glue/ruby_html_node.c +3 -32
- data/ext/makiri/glue/ruby_node.c +39 -0
- data/ext/makiri/glue/ruby_xml.c +198 -16
- data/ext/makiri/glue/ruby_xml_node.c +46 -59
- data/ext/makiri/glue/ruby_xpath.c +4 -4
- data/ext/makiri/lexbor_compat/source_loc.c +14 -16
- data/ext/makiri/lexbor_compat/utf8_input.c +5 -78
- data/ext/makiri/makiri.c +45 -0
- data/ext/makiri/xml/mkr_xml.h +2 -3
- data/ext/makiri/xml/mkr_xml_chars.c +67 -97
- data/ext/makiri/xml/mkr_xml_index.c +169 -0
- data/ext/makiri/xml/mkr_xml_index.h +48 -0
- data/ext/makiri/xml/mkr_xml_mutate.c +63 -121
- data/ext/makiri/xml/mkr_xml_node.c +147 -15
- data/ext/makiri/xml/mkr_xml_node.h +71 -6
- data/ext/makiri/xml/mkr_xml_tree.c +185 -149
- data/ext/makiri/xpath/mkr_css.c +1023 -0
- data/ext/makiri/xpath/mkr_css.h +65 -0
- data/ext/makiri/xpath/mkr_xpath.c +37 -0
- data/ext/makiri/xpath/mkr_xpath.h +13 -0
- data/ext/makiri/xpath/mkr_xpath_eval_body.h +373 -90
- data/ext/makiri/xpath/mkr_xpath_funcs_body.h +249 -231
- data/ext/makiri/xpath/mkr_xpath_internal.h +89 -9
- data/ext/makiri/xpath/mkr_xpath_lex.c +94 -124
- data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +6 -3
- data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
- data/ext/makiri/xpath/mkr_xpath_parse.c +79 -90
- data/ext/makiri/xpath/mkr_xpath_shared.c +40 -24
- data/ext/makiri/xpath/mkr_xpath_value_body.h +50 -24
- data/lib/makiri/cdata_section.rb +1 -3
- data/lib/makiri/comment.rb +1 -3
- data/lib/makiri/document.rb +8 -0
- data/lib/makiri/element.rb +1 -3
- data/lib/makiri/processing_instruction.rb +1 -3
- data/lib/makiri/text.rb +1 -3
- data/lib/makiri/version.rb +1 -1
- data/lib/makiri/xml/builder.rb +263 -0
- data/lib/makiri/xml/node_methods.rb +47 -0
- data/lib/makiri.rb +1 -0
- data/script/check_alloc_failures.rb +266 -0
- data/script/check_c_safety.rb +45 -2
- data/script/check_c_safety_allowlist.yml +19 -0
- data/script/check_leaks.rb +64 -0
- data/script/leaks_harness.rb +64 -0
- data/vendor/lexbor/CMakeLists.txt +6 -0
- data/vendor/lexbor/README.md +12 -0
- data/vendor/lexbor/config.cmake +1 -1
- data/vendor/lexbor/source/lexbor/core/base.h +1 -1
- data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
- data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
- data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
- data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
- data/vendor/lexbor/source/lexbor/html/base.h +1 -1
- data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
- data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
- data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
- data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
- data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
- data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
- data/vendor/lexbor/source/lexbor/url/base.h +1 -1
- data/vendor/lexbor/source/lexbor/url/url.c +5 -2
- data/vendor/lexbor/source/lexbor/url/url.h +9 -0
- data/vendor/lexbor/version +1 -1
- metadata +19 -1
|
@@ -48,19 +48,31 @@ mkr_ruby_str_from_borrowed(mkr_borrowed_text_t text)
|
|
|
48
48
|
void
|
|
49
49
|
mkr_verify_text(VALUE str, const char *what)
|
|
50
50
|
{
|
|
51
|
+
/* ALLOCATION-FREE by design: this gate runs between a caller taking a
|
|
52
|
+
* borrowed RSTRING pointer and using it, so it must not be a GC point. The
|
|
53
|
+
* former implementation built a throwaway Ruby String (rb_enc_str_new) to
|
|
54
|
+
* ask for its coderange - a Ruby allocation inside every borrow, which both
|
|
55
|
+
* passed the borrowed ptr into an allocating call and opened a GC window
|
|
56
|
+
* under every OTHER borrow already held at multi-borrow call sites. Bytes
|
|
57
|
+
* are validated as UTF-8 regardless of the String's declared encoding,
|
|
58
|
+
* exactly as before. */
|
|
51
59
|
long len = RSTRING_LEN(str);
|
|
52
60
|
const char *ptr = RSTRING_PTR(str);
|
|
53
61
|
|
|
54
|
-
|
|
62
|
+
mkr_span_t sv = mkr_span(ptr, (size_t)len);
|
|
63
|
+
size_t nul_at;
|
|
64
|
+
if (mkr_span_find(&sv, '\0', &nul_at)) {
|
|
55
65
|
rb_raise(mkr_eError, "%s must not contain a NUL byte", what);
|
|
56
66
|
}
|
|
57
67
|
|
|
58
|
-
/*
|
|
59
|
-
|
|
60
|
-
if (
|
|
68
|
+
/* Cached-coderange fast path (reads flags, never scans, never allocates);
|
|
69
|
+
* NUL is valid UTF-8, so the memchr above stays either way. */
|
|
70
|
+
if (mkr_ruby_str_known_valid_utf8(str)) {
|
|
71
|
+
return;
|
|
72
|
+
}
|
|
73
|
+
if (!mkr_utf8_valid((const unsigned char *)ptr, (size_t)len)) {
|
|
61
74
|
rb_raise(mkr_eError, "%s must be valid UTF-8", what);
|
|
62
75
|
}
|
|
63
|
-
RB_GC_GUARD(str);
|
|
64
76
|
}
|
|
65
77
|
|
|
66
78
|
mkr_ruby_borrowed_text_t
|
|
@@ -96,7 +108,6 @@ mkr_ruby_copy_bytes(VALUE in, mkr_owned_bytes_t *out)
|
|
|
96
108
|
size_t alloc_len = (v.len > 0) ? v.len : 1;
|
|
97
109
|
char *buf = mkr_reallocarray(NULL, alloc_len, 1);
|
|
98
110
|
if (buf == NULL) {
|
|
99
|
-
RB_GC_GUARD(v.value);
|
|
100
111
|
return -1;
|
|
101
112
|
}
|
|
102
113
|
if (v.len > 0) {
|
|
@@ -145,63 +156,89 @@ mkr_xml_strict_transcode_thunk(VALUE str)
|
|
|
145
156
|
/* --- XML 1.0 Appendix F: byte-encoding autodetection (BOM, then declaration) ---
|
|
146
157
|
*
|
|
147
158
|
* The leading byte-order mark, or NULL; *bom_len gets its length. UTF-32 BOMs are
|
|
148
|
-
* checked before the UTF-16 LE BOM they share a prefix with.
|
|
159
|
+
* checked before the UTF-16 LE BOM they share a prefix with.
|
|
160
|
+
*
|
|
161
|
+
* *stride / *ascii_off get the interleave geometry of the ASCII column the decl
|
|
162
|
+
* scanner later extracts (default 1/0 for a single-byte stream). It is resolved
|
|
163
|
+
* HERE, at the match, rather than re-derived downstream, because that derivation
|
|
164
|
+
* needs rb_enc_find (it can autoload an encoding = a GC point) and the decl
|
|
165
|
+
* scanner reads a borrowed RSTRING view that must not be held across one - so
|
|
166
|
+
* the scanner is kept allocation-free until its reads are done. Each span read
|
|
167
|
+
* of p still finishes before the rb_enc_find in the return. */
|
|
149
168
|
static rb_encoding *
|
|
150
|
-
mkr_xml_bom_encoding(const unsigned char *p, long len, long *bom_len)
|
|
169
|
+
mkr_xml_bom_encoding(const unsigned char *p, long len, long *bom_len, long *stride, long *ascii_off)
|
|
151
170
|
{
|
|
171
|
+
mkr_span_t s = mkr_span((const char *)p, (size_t)len);
|
|
152
172
|
*bom_len = 0;
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
}
|
|
156
|
-
if (
|
|
157
|
-
|
|
158
|
-
}
|
|
159
|
-
if (
|
|
160
|
-
if (len >= 2 && p[0] == 0xFF && p[1] == 0xFE) { *bom_len = 2; return rb_enc_find("UTF-16LE"); }
|
|
161
|
-
if (len >= 3 && p[0] == 0xEF && p[1] == 0xBB && p[2] == 0xBF) { *bom_len = 3; return rb_utf8_encoding(); }
|
|
173
|
+
*stride = 1;
|
|
174
|
+
*ascii_off = 0;
|
|
175
|
+
if (mkr_span_starts(&s, "\x00\x00\xFE\xFF", 4)) { *bom_len = 4; *stride = 4; *ascii_off = 3; return rb_enc_find("UTF-32BE"); }
|
|
176
|
+
if (mkr_span_starts(&s, "\xFF\xFE\x00\x00", 4)) { *bom_len = 4; *stride = 4; *ascii_off = 0; return rb_enc_find("UTF-32LE"); }
|
|
177
|
+
if (mkr_span_starts(&s, "\xFE\xFF", 2)) { *bom_len = 2; *stride = 2; *ascii_off = 1; return rb_enc_find("UTF-16BE"); }
|
|
178
|
+
if (mkr_span_starts(&s, "\xFF\xFE", 2)) { *bom_len = 2; *stride = 2; *ascii_off = 0; return rb_enc_find("UTF-16LE"); }
|
|
179
|
+
if (mkr_span_starts(&s, "\xEF\xBB\xBF", 3)) { *bom_len = 3; return rb_utf8_encoding(); }
|
|
162
180
|
return NULL;
|
|
163
181
|
}
|
|
164
182
|
|
|
165
183
|
/* The encoding named in the '<?xml ... encoding="NAME" ?>' declaration, or NULL.
|
|
166
184
|
* The declaration is ASCII; for a UTF-16/32-detected document its bytes are
|
|
167
|
-
* stride-interleaved, so the ASCII column is extracted (
|
|
168
|
-
* scan, letting a BOM-vs-declaration conflict be
|
|
169
|
-
|
|
170
|
-
|
|
185
|
+
* stride-interleaved, so the ASCII column is extracted (stride/off resolved by
|
|
186
|
+
* the BOM matcher) before the scan, letting a BOM-vs-declaration conflict be
|
|
187
|
+
* caught even in UTF-16.
|
|
188
|
+
*
|
|
189
|
+
* p is a borrowed RSTRING view, so this stays allocation-free until every read
|
|
190
|
+
* of p is done: the stride/off geometry is passed in (rather than derived here
|
|
191
|
+
* via rb_enc_find, which can autoload = a GC point), and the only rb_enc_find -
|
|
192
|
+
* the final name lookup - runs after the bytes have been copied into head[]. */
|
|
193
|
+
static int
|
|
194
|
+
mkr_decl_ws(int c)
|
|
171
195
|
{
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
else if (bom == rb_enc_find("UTF-16BE")) { stride = 2; off = 1; }
|
|
175
|
-
else if (bom == rb_enc_find("UTF-32LE")) { stride = 4; off = 0; }
|
|
176
|
-
else if (bom == rb_enc_find("UTF-32BE")) { stride = 4; off = 3; }
|
|
196
|
+
return c == ' ' || c == '\t' || c == '\r' || c == '\n';
|
|
197
|
+
}
|
|
177
198
|
|
|
199
|
+
static rb_encoding *
|
|
200
|
+
mkr_xml_decl_encoding(const unsigned char *p, long len, long stride, long off)
|
|
201
|
+
{
|
|
202
|
+
/* Extract the ASCII column (per the BOM stride) through bounded reads into
|
|
203
|
+
* a bounded writer - neither side trusts the loop arithmetic. */
|
|
204
|
+
mkr_span_t in = mkr_span((const char *)p, len < 0 ? 0 : (size_t)len);
|
|
178
205
|
char head[256];
|
|
179
|
-
|
|
180
|
-
for (
|
|
206
|
+
mkr_spanbuf_t hw = mkr_spanbuf(head, sizeof(head));
|
|
207
|
+
for (size_t i = (size_t)off; hw.pos < sizeof(head); i += (size_t)stride) {
|
|
208
|
+
int c = mkr_span_at(&in, i);
|
|
209
|
+
if (c < 0) break;
|
|
210
|
+
mkr_spanbuf_putc(&hw, (char)c);
|
|
211
|
+
}
|
|
212
|
+
mkr_span_t h = mkr_span(head, hw.pos);
|
|
213
|
+
size_t hn = hw.pos;
|
|
181
214
|
|
|
182
|
-
|
|
183
|
-
while (
|
|
184
|
-
|
|
215
|
+
size_t i = 0;
|
|
216
|
+
while (mkr_decl_ws(mkr_span_at(&h, i))) i++;
|
|
217
|
+
{
|
|
218
|
+
mkr_span_t t = mkr_span_tail(&h, i);
|
|
219
|
+
if (!mkr_span_starts(&t, "<?xml", 5)) return NULL;
|
|
220
|
+
}
|
|
185
221
|
i += 5;
|
|
186
222
|
/* find a whitespace-introduced "encoding" before the '?>' */
|
|
187
223
|
for (; i + 8 <= hn; i++) {
|
|
188
|
-
if (
|
|
189
|
-
|
|
190
|
-
if (!
|
|
191
|
-
|
|
192
|
-
while (
|
|
193
|
-
if (
|
|
224
|
+
if (mkr_span_at(&h, i) == '?' && mkr_span_at(&h, i + 1) == '>') return NULL; /* end of decl */
|
|
225
|
+
mkr_span_t t = mkr_span_tail(&h, i);
|
|
226
|
+
if (!mkr_decl_ws(mkr_span_at(&h, i - 1)) || !mkr_span_starts(&t, "encoding", 8)) continue;
|
|
227
|
+
size_t j = i + 8;
|
|
228
|
+
while (mkr_decl_ws(mkr_span_at(&h, j))) j++;
|
|
229
|
+
if (mkr_span_at(&h, j) != '=') return NULL;
|
|
230
|
+
j++;
|
|
231
|
+
while (mkr_decl_ws(mkr_span_at(&h, j))) j++;
|
|
232
|
+
int q = mkr_span_at(&h, j);
|
|
233
|
+
if (q != '"' && q != '\'') return NULL;
|
|
194
234
|
j++;
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
char q = head[j++];
|
|
198
|
-
long ns = j;
|
|
199
|
-
while (j < hn && head[j] != q) j++;
|
|
235
|
+
size_t ns = j;
|
|
236
|
+
while (mkr_span_at(&h, j) >= 0 && mkr_span_at(&h, j) != q) j++;
|
|
200
237
|
if (j >= hn) return NULL;
|
|
201
238
|
char name[64];
|
|
202
|
-
|
|
203
|
-
if (nl
|
|
204
|
-
memcpy(name, head + ns,
|
|
239
|
+
size_t nl = j - ns;
|
|
240
|
+
if (nl == 0 || nl >= sizeof(name)) return NULL;
|
|
241
|
+
memcpy(name, head + ns, nl);
|
|
205
242
|
name[nl] = '\0';
|
|
206
243
|
return rb_enc_find(name); /* NULL for an unknown encoding name */
|
|
207
244
|
}
|
|
@@ -229,9 +266,15 @@ mkr_xml_decode_input(VALUE str, size_t max_bytes)
|
|
|
229
266
|
* conflict. ASCII-8BIT means "raw bytes, no claimed encoding", so there the
|
|
230
267
|
* detected encoding decodes the input (a UTF-16/Shift_JIS/BOM'd file read
|
|
231
268
|
* with File.binread now parses). */
|
|
232
|
-
long bom_len = 0;
|
|
233
|
-
rb_encoding *bom = mkr_xml_bom_encoding(raw, rawlen, &bom_len);
|
|
234
|
-
|
|
269
|
+
long bom_len = 0, bom_stride = 1, bom_off = 0;
|
|
270
|
+
rb_encoding *bom = mkr_xml_bom_encoding(raw, rawlen, &bom_len, &bom_stride, &bom_off);
|
|
271
|
+
/* rb_enc_find inside the BOM lookup can autoload an encoding (a Ruby
|
|
272
|
+
* allocation = a GC point), so re-borrow the bytes before reading them
|
|
273
|
+
* again - a borrowed RSTRING pointer must not be held across one. The
|
|
274
|
+
* interleave geometry (stride/off) is resolved by the BOM matcher and
|
|
275
|
+
* passed through, keeping the decl scanner itself allocation-free. */
|
|
276
|
+
raw = (const unsigned char *)RSTRING_PTR(str);
|
|
277
|
+
rb_encoding *decl = mkr_xml_decl_encoding(raw + bom_len, rawlen - bom_len, bom_stride, bom_off);
|
|
235
278
|
int is_binary = (tag == rb_ascii8bit_encoding());
|
|
236
279
|
|
|
237
280
|
if (bom && decl && !mkr_xml_enc_compatible(bom, decl)) {
|
|
@@ -273,36 +316,44 @@ mkr_xml_decode_input(VALUE str, size_t max_bytes)
|
|
|
273
316
|
rb_raise(mkr_eXmlSyntaxError,
|
|
274
317
|
"XML input could not be decoded to UTF-8: %s", msg);
|
|
275
318
|
}
|
|
276
|
-
RB_GC_GUARD(in);
|
|
277
319
|
}
|
|
278
320
|
|
|
279
321
|
const char *ptr = RSTRING_PTR(s);
|
|
280
322
|
long len = RSTRING_LEN(s);
|
|
323
|
+
long off = 0;
|
|
281
324
|
/* §4.3.3: a leading BOM is the encoding signature, not document content -
|
|
282
325
|
* strip a U+FEFF (the transcode above turns any UTF-16/32 BOM into one). */
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
326
|
+
mkr_span_t sv = mkr_span(ptr, (size_t)len);
|
|
327
|
+
if (mkr_span_starts(&sv, "\xEF\xBB\xBF", 3)) {
|
|
328
|
+
off = 3; len -= 3;
|
|
329
|
+
mkr_span_skip(&sv, 3);
|
|
286
330
|
}
|
|
287
331
|
|
|
288
|
-
/* Fail closed on an over-budget input BEFORE the validation
|
|
332
|
+
/* Fail closed on an over-budget input BEFORE the validation scan and the
|
|
289
333
|
* caller's GVL-release copy (an input whose UTF-8 length exceeds the arena
|
|
290
334
|
* budget can never parse). max_bytes == 0 disables the check (__decode). */
|
|
291
335
|
if (max_bytes != 0 && (size_t)len > max_bytes) {
|
|
292
|
-
RB_GC_GUARD(s);
|
|
293
336
|
rb_raise(mkr_eXmlLimitExceeded, "XML input exceeds the byte budget");
|
|
294
337
|
}
|
|
295
338
|
|
|
296
|
-
/* Strict UTF-8 validation
|
|
297
|
-
* (
|
|
298
|
-
|
|
339
|
+
/* Strict UTF-8 validation, allocation-free - no GC point while `ptr` is
|
|
340
|
+
* borrowed (the former rb_enc_str_new copy handed the borrow straight into
|
|
341
|
+
* an allocating call): an embedded NUL or any invalid UTF-8 is fatal (no
|
|
342
|
+
* U+FFFD repair - unlike the HTML mkr_utf8_sanitize path). A whole-string
|
|
343
|
+
* cached coderange covers the BOM-stripped suffix too (the BOM is one
|
|
344
|
+
* complete UTF-8 character). */
|
|
345
|
+
size_t nul_at;
|
|
346
|
+
if (mkr_span_find(&sv, '\0', &nul_at)) {
|
|
299
347
|
rb_raise(mkr_eXmlSyntaxError, "XML input must not contain a NUL byte");
|
|
300
348
|
}
|
|
301
|
-
|
|
302
|
-
|
|
349
|
+
if (!mkr_ruby_str_known_valid_utf8(s)
|
|
350
|
+
&& !mkr_utf8_valid((const unsigned char *)ptr + off, (size_t)len)) {
|
|
303
351
|
rb_raise(mkr_eXmlSyntaxError, "XML input must be valid UTF-8");
|
|
304
352
|
}
|
|
305
|
-
|
|
353
|
+
/* Build the result through the VALUE, not the borrowed ptr (rb_str_subseq
|
|
354
|
+
* allocates, so the ptr must not be what it copies from). */
|
|
355
|
+
VALUE u = rb_str_subseq(s, off, len);
|
|
356
|
+
rb_enc_associate(u, rb_utf8_encoding());
|
|
306
357
|
return u; /* validated, UTF-8-tagged, BOM-stripped */
|
|
307
358
|
}
|
|
308
359
|
|
|
@@ -328,22 +379,26 @@ mkr_ruby_str_known_valid_utf8(VALUE str)
|
|
|
328
379
|
const char *
|
|
329
380
|
mkr_ruby_try_verified_text(VALUE sv, size_t max_bytes, mkr_ruby_borrowed_text_t *out)
|
|
330
381
|
{
|
|
382
|
+
/* ALLOCATION-FREE, like mkr_verify_text: the returned borrow must not have
|
|
383
|
+
* crossed a Ruby allocation (the former rb_utf8_str_new + valid_encoding?
|
|
384
|
+
* funcall allocated twice with `ptr` already taken). */
|
|
331
385
|
long len = RSTRING_LEN(sv);
|
|
332
386
|
if ((size_t)len > max_bytes) {
|
|
333
387
|
return "string exceeds the maximum length";
|
|
334
388
|
}
|
|
335
389
|
const char *ptr = RSTRING_PTR(sv);
|
|
336
|
-
|
|
390
|
+
mkr_span_t view = mkr_span(ptr, (size_t)len);
|
|
391
|
+
size_t nul_at;
|
|
392
|
+
if (mkr_span_find(&view, '\0', &nul_at)) {
|
|
337
393
|
return "string contains a NUL byte";
|
|
338
394
|
}
|
|
339
|
-
|
|
340
|
-
|
|
395
|
+
if (!mkr_ruby_str_known_valid_utf8(sv)
|
|
396
|
+
&& !mkr_utf8_valid((const unsigned char *)ptr, (size_t)len)) {
|
|
341
397
|
return "string is not valid UTF-8";
|
|
342
398
|
}
|
|
343
399
|
out->value = sv;
|
|
344
400
|
out->ptr = ptr;
|
|
345
401
|
out->len = (size_t)len;
|
|
346
|
-
RB_GC_GUARD(u);
|
|
347
402
|
return NULL;
|
|
348
403
|
}
|
|
349
404
|
|
|
@@ -368,9 +423,7 @@ mkr_ruby_exception_message(VALUE exc, char *buf, size_t len)
|
|
|
368
423
|
}
|
|
369
424
|
if (!RB_TYPE_P(msg, T_STRING)) {
|
|
370
425
|
snprintf(buf, len, "%s", "error");
|
|
371
|
-
RB_GC_GUARD(msg);
|
|
372
426
|
return;
|
|
373
427
|
}
|
|
374
428
|
snprintf(buf, len, "%s", RSTRING_PTR(msg));
|
|
375
|
-
RB_GC_GUARD(msg);
|
|
376
429
|
}
|
data/ext/makiri/core/mkr_alloc.c
CHANGED
|
@@ -1,5 +1,37 @@
|
|
|
1
1
|
#include "mkr_alloc.h"
|
|
2
2
|
|
|
3
|
+
#ifdef MKR_ALLOC_INJECT
|
|
4
|
+
/* See mkr_alloc.h: the OOM sweep's failure injection. The counter counts
|
|
5
|
+
* ATTEMPTS (every consult), armed or not, so the harness can size its sweep
|
|
6
|
+
* from a disarmed baseline run; the countdown fails exactly one allocation
|
|
7
|
+
* and then disarms itself, modelling a single transient OOM per run. */
|
|
8
|
+
static long long mkr_inject_countdown = 0; /* 0 = disarmed */
|
|
9
|
+
static unsigned long long mkr_inject_attempts = 0;
|
|
10
|
+
|
|
11
|
+
void
|
|
12
|
+
mkr_alloc_inject_arm(long long nth)
|
|
13
|
+
{
|
|
14
|
+
mkr_inject_countdown = (nth > 0) ? nth : 0;
|
|
15
|
+
mkr_inject_attempts = 0;
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
unsigned long long
|
|
19
|
+
mkr_alloc_inject_calls(void)
|
|
20
|
+
{
|
|
21
|
+
return mkr_inject_attempts;
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
int
|
|
25
|
+
mkr_alloc_inject_should_fail(void)
|
|
26
|
+
{
|
|
27
|
+
mkr_inject_attempts++;
|
|
28
|
+
if (mkr_inject_countdown > 0 && --mkr_inject_countdown == 0) {
|
|
29
|
+
return 1; /* fail this one allocation; now disarmed */
|
|
30
|
+
}
|
|
31
|
+
return 0;
|
|
32
|
+
}
|
|
33
|
+
#endif
|
|
34
|
+
|
|
3
35
|
void *
|
|
4
36
|
mkr_reallocarray(void *ptr, size_t count, size_t elem)
|
|
5
37
|
{
|
|
@@ -11,6 +43,7 @@ mkr_reallocarray(void *ptr, size_t count, size_t elem)
|
|
|
11
43
|
if (!mkr_size_mul(count, elem, &bytes)) {
|
|
12
44
|
return NULL; /* overflow: leave ptr unchanged */
|
|
13
45
|
}
|
|
46
|
+
if (MKR_ALLOC_INJECT_FAIL()) return NULL;
|
|
14
47
|
return realloc(ptr, bytes);
|
|
15
48
|
}
|
|
16
49
|
|
|
@@ -20,11 +53,14 @@ mkr_callocarray(size_t count, size_t elem)
|
|
|
20
53
|
if (count == 0 || elem == 0) {
|
|
21
54
|
return NULL;
|
|
22
55
|
}
|
|
23
|
-
|
|
24
|
-
|
|
56
|
+
/* 2-arg calloc is itself overflow-safe, but check explicitly so every core
|
|
57
|
+
* allocator fails the SAME way (deterministic NULL) rather than leaving the
|
|
58
|
+
* overflow case to calloc's implementation-defined behaviour. */
|
|
59
|
+
if (count > SIZE_MAX / elem) {
|
|
25
60
|
return NULL; /* overflow */
|
|
26
61
|
}
|
|
27
|
-
|
|
62
|
+
if (MKR_ALLOC_INJECT_FAIL()) return NULL;
|
|
63
|
+
return calloc(count, elem);
|
|
28
64
|
}
|
|
29
65
|
|
|
30
66
|
char *
|
|
@@ -34,6 +70,7 @@ mkr_str_alloc(size_t n)
|
|
|
34
70
|
if (!mkr_size_add(n, 1, &total)) {
|
|
35
71
|
return NULL; /* n + 1 overflow */
|
|
36
72
|
}
|
|
73
|
+
if (MKR_ALLOC_INJECT_FAIL()) return NULL;
|
|
37
74
|
char *p = malloc(total);
|
|
38
75
|
if (p == NULL) {
|
|
39
76
|
return NULL;
|
data/ext/makiri/core/mkr_alloc.h
CHANGED
|
@@ -97,10 +97,33 @@ char *mkr_strdup(const char *s);
|
|
|
97
97
|
* `cap *= 2; realloc(p, cap * sizeof(T))` pattern in one call. */
|
|
98
98
|
mkr_status_t mkr_grow_reserve(void **ptr, size_t *cap, size_t need, size_t elem);
|
|
99
99
|
|
|
100
|
-
/*
|
|
101
|
-
|
|
102
|
-
*
|
|
103
|
-
|
|
100
|
+
/* --- allocation-failure injection (debug builds only) ------------------- */
|
|
101
|
+
/*
|
|
102
|
+
* The OOM sweep harness (script/check_alloc_failures.rb, `rake oom`) verifies
|
|
103
|
+
* that every fail-closed OOM branch actually fails closed: it arms "the nth
|
|
104
|
+
* core allocation fails", runs a workload, and asserts the result is either a
|
|
105
|
+
* clean exception or byte-identical to the baseline - never truncated.
|
|
106
|
+
*
|
|
107
|
+
* Compiled in ONLY under -DMKR_ALLOC_INJECT (extconf: MAKIRI_ALLOC_INJECT=1);
|
|
108
|
+
* a release build carries no counter and no branch. Covers every core libc
|
|
109
|
+
* allocation site (mkr_alloc.c + mkr_buf.c - the funnel the direct_alloc lint
|
|
110
|
+
* forces all engine allocations through). Ruby's xmalloc family and Lexbor's
|
|
111
|
+
* internal allocations are out of scope (Ruby raises NoMemoryError itself;
|
|
112
|
+
* Lexbor is vendor code). Not thread-safe by design (the harness is
|
|
113
|
+
* single-threaded).
|
|
114
|
+
*/
|
|
115
|
+
#ifdef MKR_ALLOC_INJECT
|
|
116
|
+
/* Arm: the nth subsequent core allocation (1-based) fails once, then the
|
|
117
|
+
* injection disarms itself. nth <= 0 disarms. Resets the call counter. */
|
|
118
|
+
void mkr_alloc_inject_arm(long long nth);
|
|
119
|
+
/* Core allocation attempts since the last arm (sizes the sweep). */
|
|
120
|
+
unsigned long long mkr_alloc_inject_calls(void);
|
|
121
|
+
/* Internal: consulted by each core libc allocation site. */
|
|
122
|
+
int mkr_alloc_inject_should_fail(void);
|
|
123
|
+
#define MKR_ALLOC_INJECT_FAIL() (mkr_alloc_inject_should_fail())
|
|
124
|
+
#else
|
|
125
|
+
#define MKR_ALLOC_INJECT_FAIL() 0
|
|
126
|
+
#endif
|
|
104
127
|
|
|
105
128
|
#ifdef __cplusplus
|
|
106
129
|
}
|
data/ext/makiri/core/mkr_buf.c
CHANGED
|
@@ -30,7 +30,17 @@ mkr_buf_append(mkr_buf_t *b, const void *bytes, size_t n)
|
|
|
30
30
|
if (!mkr_grow_capacity(b->cap, need_term, 1, &new_cap)) {
|
|
31
31
|
return MKR_ERR_OOM;
|
|
32
32
|
}
|
|
33
|
-
|
|
33
|
+
/* Geometric growth can overshoot to ~2x need_term; clamp the ALLOCATION
|
|
34
|
+
* to the same ceiling as the content (limit, plus one byte for the NUL),
|
|
35
|
+
* so cap never runs to ~2x MKR_BUF_HARD_MAX near the limit. Safe: this
|
|
36
|
+
* append already passed need <= limit, so need_term <= limit + 1 and the
|
|
37
|
+
* clamp never drops new_cap below what this append needs. (If limit + 1
|
|
38
|
+
* overflows - only a pathological -DMKR_BUF_HARD_MAX=SIZE_MAX - skip it.) */
|
|
39
|
+
size_t cap_ceiling;
|
|
40
|
+
if (mkr_size_add(limit, 1, &cap_ceiling) && new_cap > cap_ceiling) {
|
|
41
|
+
new_cap = cap_ceiling;
|
|
42
|
+
}
|
|
43
|
+
char *p = MKR_ALLOC_INJECT_FAIL() ? NULL : realloc(b->data, new_cap);
|
|
34
44
|
if (p == NULL) {
|
|
35
45
|
return MKR_ERR_OOM;
|
|
36
46
|
}
|
|
@@ -62,7 +72,7 @@ mkr_buf_reserve(mkr_buf_t *b, size_t n)
|
|
|
62
72
|
if (need_term <= b->cap) {
|
|
63
73
|
return MKR_OK; /* already have room */
|
|
64
74
|
}
|
|
65
|
-
char *p = realloc(b->data, need_term);
|
|
75
|
+
char *p = MKR_ALLOC_INJECT_FAIL() ? NULL : realloc(b->data, need_term);
|
|
66
76
|
if (p == NULL) {
|
|
67
77
|
return MKR_ERR_OOM;
|
|
68
78
|
}
|
|
@@ -76,7 +86,7 @@ char *
|
|
|
76
86
|
mkr_buf_steal(mkr_buf_t *b, size_t *out_len)
|
|
77
87
|
{
|
|
78
88
|
if (b->data == NULL) {
|
|
79
|
-
char *empty = malloc(1);
|
|
89
|
+
char *empty = MKR_ALLOC_INJECT_FAIL() ? NULL : malloc(1);
|
|
80
90
|
if (empty == NULL) {
|
|
81
91
|
return NULL;
|
|
82
92
|
}
|
data/ext/makiri/core/mkr_buf.h
CHANGED
|
@@ -24,11 +24,15 @@ extern "C" {
|
|
|
24
24
|
* think about a bound gets a tight one for free, and a
|
|
25
25
|
* buffer that genuinely needs to be large must opt in
|
|
26
26
|
* EXPLICITLY by passing a larger max.
|
|
27
|
-
* MKR_BUF_HARD_MAX an absolute ceiling
|
|
28
|
-
* with an explicit max - the last-resort backstop.
|
|
29
|
-
*
|
|
30
|
-
*
|
|
31
|
-
*
|
|
27
|
+
* MKR_BUF_HARD_MAX an absolute ceiling on a buffer's CONTENT length, even
|
|
28
|
+
* with an explicit max - the last-resort backstop. The
|
|
29
|
+
* ALLOCATION is bounded by it too: geometric growth is
|
|
30
|
+
* clamped so cap never exceeds HARD_MAX + 1 (the one NUL
|
|
31
|
+
* terminator byte), i.e. it does NOT overshoot to ~2x
|
|
32
|
+
* near the limit. Tight, content-scaled bounds still
|
|
33
|
+
* belong to the caller (e.g. the XML serializer caps
|
|
34
|
+
* itself at a multiple of arena_bytes); this stops total
|
|
35
|
+
* runaway.
|
|
32
36
|
*
|
|
33
37
|
* Override either at build time: -DMKR_BUF_DEFAULT_LIMIT=<bytes> / -DMKR_BUF_HARD_MAX=<bytes>. */
|
|
34
38
|
#ifndef MKR_BUF_DEFAULT_LIMIT
|
|
@@ -82,6 +86,77 @@ mkr_buf_free(mkr_buf_t *b)
|
|
|
82
86
|
b->len = b->cap = 0;
|
|
83
87
|
}
|
|
84
88
|
|
|
89
|
+
/* ------------------------------------------------------------------------- */
|
|
90
|
+
/* mkr_spanbuf_t - a bounded writer over a borrowed, fixed-capacity buffer */
|
|
91
|
+
/* ------------------------------------------------------------------------- */
|
|
92
|
+
/*
|
|
93
|
+
* "span" = a non-owning view of a fixed-extent contiguous region; a spanbuf is
|
|
94
|
+
* the bounded *writer* over such a span. The complement to mkr_buf_t: where
|
|
95
|
+
* mkr_buf_t GROWS and OWNS its malloc storage, a spanbuf BORROWS a fixed buffer
|
|
96
|
+
* owned elsewhere (e.g. an arena cut) and never grows it.
|
|
97
|
+
*
|
|
98
|
+
* The name leads with "borrowed/fixed" deliberately, for safety: the
|
|
99
|
+
* fixed/bounded property is SELF-ENFORCED (a write past `cap` is refused, so
|
|
100
|
+
* misunderstanding it costs at most a truncation, caught by _finish), but the
|
|
101
|
+
* BORROWED property is the caller's responsibility - the type cannot stop you
|
|
102
|
+
* from free()ing `buf` or holding `finish()`'s pointer past the owner's
|
|
103
|
+
* lifetime, and getting that wrong is a use-after-free / double-free. So:
|
|
104
|
+
* - never free() buf (the owner does, e.g. the arena is freed wholesale);
|
|
105
|
+
* - finish()'s pointer is valid only while the backing storage lives.
|
|
106
|
+
*
|
|
107
|
+
* Bounds safety is BY CONSTRUCTION (the writer owns the cursor + check, not the
|
|
108
|
+
* caller's per-write guard): an over-long write is refused and latches `ok` to
|
|
109
|
+
* false (sticky). The caller's only duty is one check at the end (via _finish or
|
|
110
|
+
* the public `ok` field), failing closed rather than using a truncated buffer.
|
|
111
|
+
* This is the sanctioned way to hand-fill a raw region; see mkr_xml_arena_spanbuf
|
|
112
|
+
* for the arena adapter.
|
|
113
|
+
*/
|
|
114
|
+
typedef struct {
|
|
115
|
+
char *buf; /* borrowed; the owner keeps it alive - do NOT free. NULL => never ok. */
|
|
116
|
+
size_t cap; /* capacity in bytes (fixed) */
|
|
117
|
+
size_t pos; /* bytes written so far (always <= cap) */
|
|
118
|
+
bool ok; /* false once a write would overflow, or buf was NULL */
|
|
119
|
+
} mkr_spanbuf_t;
|
|
120
|
+
|
|
121
|
+
/* Wrap [buf, buf+cap) for bounded writing. buf == NULL yields a permanently
|
|
122
|
+
* not-ok writer (every write a no-op), so an upstream allocation failure flows
|
|
123
|
+
* straight through without a separate guard at each call site. */
|
|
124
|
+
static inline mkr_spanbuf_t
|
|
125
|
+
mkr_spanbuf(char *buf, size_t cap)
|
|
126
|
+
{
|
|
127
|
+
return (mkr_spanbuf_t){ .buf = buf, .cap = cap, .pos = 0, .ok = (buf != NULL) };
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
/* Append one byte; refuse (latch ok=false) if it would exceed cap. */
|
|
131
|
+
static inline void
|
|
132
|
+
mkr_spanbuf_putc(mkr_spanbuf_t *b, char c)
|
|
133
|
+
{
|
|
134
|
+
if (!b->ok) return;
|
|
135
|
+
if (b->pos >= b->cap) { b->ok = false; return; }
|
|
136
|
+
b->buf[b->pos++] = c;
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
/* Append n bytes; refuse (latch ok=false) if they would exceed cap. n == 0 is a
|
|
140
|
+
* no-op. pos <= cap is the invariant (a refused write never advances pos), so
|
|
141
|
+
* cap - pos cannot underflow; the pos > cap arm is belt-and-suspenders. */
|
|
142
|
+
static inline void
|
|
143
|
+
mkr_spanbuf_write(mkr_spanbuf_t *b, const void *src, size_t n)
|
|
144
|
+
{
|
|
145
|
+
if (!b->ok || n == 0) return;
|
|
146
|
+
if (b->pos > b->cap || n > b->cap - b->pos) { b->ok = false; return; }
|
|
147
|
+
memcpy(b->buf + b->pos, src, n);
|
|
148
|
+
b->pos += n;
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
/* The filled prefix [buf, buf+pos), or NULL if any write was refused (or buf was
|
|
152
|
+
* NULL); on a non-NULL return the length is `b->pos`. Forces the caller through a
|
|
153
|
+
* single fail-closed check instead of trusting the writes individually. */
|
|
154
|
+
static inline const char *
|
|
155
|
+
mkr_spanbuf_finish(const mkr_spanbuf_t *b)
|
|
156
|
+
{
|
|
157
|
+
return b->ok ? b->buf : NULL;
|
|
158
|
+
}
|
|
159
|
+
|
|
85
160
|
#ifdef __cplusplus
|
|
86
161
|
}
|
|
87
162
|
#endif
|