RubyGems - makiri - Versions diffs - 0.2.0 → 0.4.0 - Mend

makiri 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (130) hide show

checksums.yaml +4 -4
data/.github/workflows/conformance.yml +22 -0
data/.github/workflows/libfuzzer.yml +83 -0
data/.github/workflows/release.yml +12 -7
data/.github/workflows/security.yml +88 -3
data/.github/workflows/valgrind.yml +135 -0
data/CHANGELOG.md +152 -15
data/README.md +183 -13
data/Rakefile +294 -7
data/ext/makiri/bridge/bridge.h +28 -0
data/ext/makiri/bridge/ruby_string.c +282 -12
data/ext/makiri/core/mkr_alloc.c +40 -3
data/ext/makiri/core/mkr_alloc.h +28 -5
data/ext/makiri/core/mkr_buf.c +47 -3
data/ext/makiri/core/mkr_buf.h +112 -3
data/ext/makiri/core/mkr_core.c +143 -0
data/ext/makiri/core/mkr_core.h +11 -2
data/ext/makiri/core/mkr_hash.h +1 -1
data/ext/makiri/core/mkr_span.h +186 -0
data/ext/makiri/core/mkr_text.h +8 -8
data/ext/makiri/core/mkr_utf8.c +101 -0
data/ext/makiri/core/mkr_utf8.h +88 -0
data/ext/makiri/extconf.rb +123 -10
data/ext/makiri/fuzz/Makefile +95 -0
data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
data/ext/makiri/fuzz/xml_fuzz.c +24 -0
data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
data/ext/makiri/glue/glue.h +55 -11
data/ext/makiri/glue/ruby_doc.c +129 -59
data/ext/makiri/glue/ruby_html_css.c +292 -0
data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +248 -52
data/ext/makiri/glue/ruby_html_node.c +859 -0
data/ext/makiri/glue/ruby_html_serialize.c +154 -0
data/ext/makiri/glue/ruby_node.c +74 -729
data/ext/makiri/glue/ruby_node_set.c +167 -32
data/ext/makiri/glue/ruby_xml.c +602 -0
data/ext/makiri/glue/ruby_xml_node.c +1373 -0
data/ext/makiri/glue/ruby_xpath.c +63 -30
data/ext/makiri/glue/ruby_xpath.h +19 -0
data/ext/makiri/lexbor_compat/compat.h +42 -9
data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
data/ext/makiri/lexbor_compat/dom_index.c +2 -2
data/ext/makiri/lexbor_compat/post_parse.c +100 -10
data/ext/makiri/lexbor_compat/source_loc.c +15 -13
data/ext/makiri/lexbor_compat/text_index.c +14 -8
data/ext/makiri/lexbor_compat/utf8_input.c +19 -33
data/ext/makiri/makiri.c +184 -6
data/ext/makiri/makiri.h +43 -2
data/ext/makiri/xml/mkr_xml.h +125 -0
data/ext/makiri/xml/mkr_xml_chars.c +195 -0
data/ext/makiri/xml/mkr_xml_index.c +169 -0
data/ext/makiri/xml/mkr_xml_index.h +48 -0
data/ext/makiri/xml/mkr_xml_mutate.c +817 -0
data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
data/ext/makiri/xml/mkr_xml_node.c +399 -0
data/ext/makiri/xml/mkr_xml_node.h +184 -0
data/ext/makiri/xml/mkr_xml_tree.c +1515 -0
data/ext/makiri/xpath/mkr_css.c +1023 -0
data/ext/makiri/xpath/mkr_css.h +65 -0
data/ext/makiri/xpath/mkr_xpath.c +96 -32
data/ext/makiri/xpath/mkr_xpath.h +109 -4
data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +551 -241
data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +318 -276
data/ext/makiri/xpath/mkr_xpath_internal.h +177 -206
data/ext/makiri/xpath/mkr_xpath_lex.c +95 -125
data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +145 -0
data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
data/ext/makiri/xpath/mkr_xpath_parse.c +83 -94
data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
data/ext/makiri/xpath/mkr_xpath_shared.c +609 -0
data/ext/makiri/xpath/mkr_xpath_value_body.h +801 -0
data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
data/lib/makiri/{attribute.rb → attr.rb} +7 -3
data/lib/makiri/cdata_section.rb +19 -0
data/lib/makiri/comment.rb +10 -0
data/lib/makiri/compat_aliases.rb +30 -0
data/lib/makiri/document.rb +9 -73
data/lib/makiri/document_fragment.rb +14 -9
data/lib/makiri/element.rb +4 -4
data/lib/makiri/html/document.rb +106 -0
data/lib/makiri/html/node_methods.rb +19 -0
data/lib/makiri/html.rb +12 -0
data/lib/makiri/node.rb +58 -15
data/lib/makiri/node_set.rb +8 -0
data/lib/makiri/processing_instruction.rb +10 -0
data/lib/makiri/text.rb +1 -1
data/lib/makiri/version.rb +1 -1
data/lib/makiri/xml/builder.rb +263 -0
data/lib/makiri/xml/document.rb +24 -0
data/lib/makiri/xml/node_methods.rb +84 -0
data/lib/makiri/xml.rb +10 -0
data/lib/makiri/xpath_context.rb +1 -1
data/lib/makiri.rb +24 -5
data/script/build_native_gem.rb +2 -2
data/script/check_alloc_failures.rb +266 -0
data/script/check_c_safety.rb +77 -2
data/script/check_c_safety_allowlist.yml +102 -0
data/script/check_leaks.rb +64 -0
data/script/leaks_harness.rb +64 -0
data/vendor/lexbor/CMakeLists.txt +6 -0
data/vendor/lexbor/README.md +12 -0
data/vendor/lexbor/config.cmake +1 -1
data/vendor/lexbor/source/lexbor/core/base.h +1 -1
data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
data/vendor/lexbor/source/lexbor/html/base.h +1 -1
data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
data/vendor/lexbor/source/lexbor/url/base.h +1 -1
data/vendor/lexbor/source/lexbor/url/url.c +5 -2
data/vendor/lexbor/source/lexbor/url/url.h +9 -0
data/vendor/lexbor/version +1 -1
metadata +53 -9
data/ext/makiri/glue/ruby_css.c +0 -185
data/ext/makiri/glue/ruby_serialize.c +0 -92
data/ext/makiri/xpath/mkr_xpath_value.c +0 -1286
data/lib/makiri/cdata.rb +0 -6

data/ext/makiri/bridge/ruby_string.c CHANGED Viewed

@@ -48,19 +48,31 @@ mkr_ruby_str_from_borrowed(mkr_borrowed_text_t text)
 void
 mkr_verify_text(VALUE str, const char *what)
 {
+    /* ALLOCATION-FREE by design: this gate runs between a caller taking a
+     * borrowed RSTRING pointer and using it, so it must not be a GC point. The
+     * former implementation built a throwaway Ruby String (rb_enc_str_new) to
+     * ask for its coderange - a Ruby allocation inside every borrow, which both
+     * passed the borrowed ptr into an allocating call and opened a GC window
+     * under every OTHER borrow already held at multi-borrow call sites. Bytes
+     * are validated as UTF-8 regardless of the String's declared encoding,
+     * exactly as before. */
     long        len = RSTRING_LEN(str);
     const char *ptr = RSTRING_PTR(str);
-    if (len > 0 && memchr(ptr, '\0', (size_t)len) != NULL) {
+    mkr_span_t sv = mkr_span(ptr, (size_t)len);
+    size_t nul_at;
+    if (mkr_span_find(&sv, '\0', &nul_at)) {
         rb_raise(mkr_eError, "%s must not contain a NUL byte", what);
     }
-    /* Validate the bytes as UTF-8 regardless of the String's declared encoding. */
-    VALUE u = rb_enc_str_new(ptr, len, rb_utf8_encoding());
-    if (rb_enc_str_coderange(u) == ENC_CODERANGE_BROKEN) {
+    /* Cached-coderange fast path (reads flags, never scans, never allocates);
+     * NUL is valid UTF-8, so the NUL-byte scan above stays either way. */
+    if (mkr_ruby_str_known_valid_utf8(str)) {
+        return;
+    }
+    if (!mkr_utf8_valid((const unsigned char *)ptr, (size_t)len)) {
         rb_raise(mkr_eError, "%s must be valid UTF-8", what);
     }
-    RB_GC_GUARD(str);
 }
 mkr_ruby_borrowed_text_t
@@ -96,7 +108,6 @@ mkr_ruby_copy_bytes(VALUE in, mkr_owned_bytes_t *out)
     size_t alloc_len = (v.len > 0) ? v.len : 1;
     char *buf = mkr_reallocarray(NULL, alloc_len, 1);
     if (buf == NULL) {
-        RB_GC_GUARD(v.value);
         return -1;
     }
     if (v.len > 0) {
@@ -108,25 +119,286 @@ mkr_ruby_copy_bytes(VALUE in, mkr_owned_bytes_t *out)
     return 0;
 }
+VALUE
+mkr_ruby_to_utf8(VALUE str)
+{
+    /* Honour the Ruby String's declared encoding so its content survives:
+     *
+     *  - UTF-8 / US-ASCII / ASCII-8BIT (binary): returned unchanged. These are
+     *    already UTF-8 bytes (or deliberately raw bytes), and the native parser
+     *    does the WHATWG invalid-byte replacement for them. The UTF-8 common
+     *    case costs only this encoding comparison - no transcode, no copy.
+     *
+     *  - any other encoding (Shift_JIS, EUC-JP, ISO-8859-1, Windows-1252, ...):
+     *    transcoded to UTF-8 with invalid/undef -> U+FFFD, so e.g. Shift_JIS
+     *    text becomes the right UTF-8 characters instead of being read as raw
+     *    UTF-8 bytes and mangled. Only non-UTF-8 input pays this. */
+    rb_encoding *enc = rb_enc_get(str);
+    if (enc == rb_utf8_encoding()
+        || enc == rb_usascii_encoding()
+        || enc == rb_ascii8bit_encoding()) {
+        return str;
+    }
+    return rb_str_encode(str, rb_enc_from_encoding(rb_utf8_encoding()),
+                         ECONV_INVALID_REPLACE | ECONV_UNDEF_REPLACE, Qnil);
+}
+/* rb_str_encode with no replacement flags: an undefined conversion or invalid
+ * byte sequence RAISES (Encoding::UndefinedConversionError /
+ * Encoding::InvalidByteSequenceError) instead of substituting U+FFFD. Run under
+ * rb_protect so we can remap the Ruby Encoding error to Makiri::XML::SyntaxError. */
+static VALUE
+mkr_xml_strict_transcode_thunk(VALUE str)
+{
+    return rb_str_encode(str, rb_enc_from_encoding(rb_utf8_encoding()), 0, Qnil);
+}
+/* --- XML 1.0 Appendix F: byte-encoding autodetection (BOM, then declaration) ---
+ *
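+ * Returns the encoding named by the leading byte-order mark, or NULL when there
+ * is none; *bom_len gets the mark's length (0 when there is none). Both UTF-32
+ * BOMs are checked first; the order matters for UTF-32 LE, whose BOM begins
+ * with the UTF-16 LE BOM.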
+ *
+ * *stride / *ascii_off get the interleave geometry of the ASCII column the decl
+ * scanner later extracts (default 1/0 for a single-byte stream). It is resolved
+ * HERE, at the match, rather than re-derived downstream, because that derivation
+ * needs rb_enc_find (it can autoload an encoding = a GC point) and the decl
+ * scanner reads a borrowed RSTRING view that must not be held across one - so
+ * the scanner is kept allocation-free until its reads are done. Each span read
+ * of p still finishes before the rb_enc_find in the return. */
+static rb_encoding *
+mkr_xml_bom_encoding(const unsigned char *p, long len, long *bom_len, long *stride, long *ascii_off)
+{
+    mkr_span_t s = mkr_span((const char *)p, (size_t)len);
+    *bom_len = 0;
+    *stride = 1;
+    *ascii_off = 0;
+    if (mkr_span_starts(&s, "\x00\x00\xFE\xFF", 4)) { *bom_len = 4; *stride = 4; *ascii_off = 3; return rb_enc_find("UTF-32BE"); }
+    if (mkr_span_starts(&s, "\xFF\xFE\x00\x00", 4)) { *bom_len = 4; *stride = 4; *ascii_off = 0; return rb_enc_find("UTF-32LE"); }
+    if (mkr_span_starts(&s, "\xFE\xFF", 2)) { *bom_len = 2; *stride = 2; *ascii_off = 1; return rb_enc_find("UTF-16BE"); }
+    if (mkr_span_starts(&s, "\xFF\xFE", 2)) { *bom_len = 2; *stride = 2; *ascii_off = 0; return rb_enc_find("UTF-16LE"); }
+    if (mkr_span_starts(&s, "\xEF\xBB\xBF", 3)) { *bom_len = 3; return rb_utf8_encoding(); }
+    return NULL;
+}
+/* mkr_xml_decl_encoding (defined after the mkr_decl_ws helper below) returns the
+ * encoding named in the '<?xml ... encoding="NAME" ?>' declaration, or NULL.
+ * The declaration is ASCII; for a UTF-16/32-detected document its bytes are
+ * stride-interleaved, so the ASCII column is extracted (stride/off resolved by
+ * the BOM matcher) before the scan, letting a BOM-vs-declaration conflict be
+ * caught even in UTF-16.
+ *
+ * p is a borrowed RSTRING view, so this stays allocation-free until every read
+ * of p is done: the stride/off geometry is passed in (rather than derived here
+ * via rb_enc_find, which can autoload = a GC point), and the only rb_enc_find -
+ * the final name lookup - runs after the bytes have been copied into head[]. */
+static int
+mkr_decl_ws(int c)
+{
+    return c == ' ' || c == '\t' || c == '\r' || c == '\n';
+}
+static rb_encoding *
+mkr_xml_decl_encoding(const unsigned char *p, long len, long stride, long off)
+{
+    /* Extract the ASCII column (per the BOM stride) through bounded reads into
+     * a bounded writer - neither side trusts the loop arithmetic. */
+    mkr_span_t in = mkr_span((const char *)p, len < 0 ? 0 : (size_t)len);
+    char head[256];
+    mkr_spanbuf_t hw = mkr_spanbuf(head, sizeof(head));
+    for (size_t i = (size_t)off; hw.pos < sizeof(head); i += (size_t)stride) {
+        int c = mkr_span_at(&in, i);
+        if (c < 0) break;
+        mkr_spanbuf_putc(&hw, (char)c);
+    }
+    mkr_span_t h = mkr_span(head, hw.pos);
+    size_t hn = hw.pos;
+    size_t i = 0;
+    while (mkr_decl_ws(mkr_span_at(&h, i))) i++;
+    {
+        mkr_span_t t = mkr_span_tail(&h, i);
+        if (!mkr_span_starts(&t, "<?xml", 5)) return NULL;
+    }
+    i += 5;
+    /* find a whitespace-introduced "encoding" before the '?>' */
+    for (; i + 8 <= hn; i++) {
+        if (mkr_span_at(&h, i) == '?' && mkr_span_at(&h, i + 1) == '>') return NULL; /* end of decl */
+        mkr_span_t t = mkr_span_tail(&h, i);
+        if (!mkr_decl_ws(mkr_span_at(&h, i - 1)) || !mkr_span_starts(&t, "encoding", 8)) continue;
+        size_t j = i + 8;
+        while (mkr_decl_ws(mkr_span_at(&h, j))) j++;
+        if (mkr_span_at(&h, j) != '=') return NULL;
+        j++;
+        while (mkr_decl_ws(mkr_span_at(&h, j))) j++;
+        int q = mkr_span_at(&h, j);
+        if (q != '"' && q != '\'') return NULL;
+        j++;
+        size_t ns = j;
+        while (mkr_span_at(&h, j) >= 0 && mkr_span_at(&h, j) != q) j++;
+        if (j >= hn) return NULL;
+        char name[64];
+        size_t nl = j - ns;
+        if (nl == 0 || nl >= sizeof(name)) return NULL;
+        memcpy(name, head + ns, nl);
+        name[nl] = '\0';
+        return rb_enc_find(name);   /* NULL for an unknown encoding name */
+    }
+    return NULL;
+}
+/* Two encodings agree for conflict purposes when identical, or when either is
+ * US-ASCII (a subset of UTF-8 and the single-byte encodings). */
+static int
+mkr_xml_enc_compatible(rb_encoding *a, rb_encoding *b)
+{
+    return a == b || a == rb_usascii_encoding() || b == rb_usascii_encoding();
+}
+VALUE
+mkr_xml_decode_input(VALUE str, size_t max_bytes)
+{
+    rb_encoding   *tag    = rb_enc_get(str);
+    const unsigned char *raw = (const unsigned char *)RSTRING_PTR(str);
+    long           rawlen = RSTRING_LEN(str);
+    /* Detect the byte encoding (XML 1.0 Appendix F): a BOM wins, else the
+     * declaration. The Ruby String's encoding is authoritative when it is a
+     * concrete text encoding; a BOM/declaration that disagrees is a fatal
+     * conflict. ASCII-8BIT means "raw bytes, no claimed encoding", so there the
+     * detected encoding decodes the input (a UTF-16/Shift_JIS/BOM'd file read
+     * with File.binread now parses). */
+    long bom_len = 0, bom_stride = 1, bom_off = 0;
+    rb_encoding *bom  = mkr_xml_bom_encoding(raw, rawlen, &bom_len, &bom_stride, &bom_off);
+    /* rb_enc_find inside the BOM lookup can autoload an encoding (a Ruby
+     * allocation = a GC point), so re-borrow the bytes before reading them
+     * again - a borrowed RSTRING pointer must not be held across one. The
+     * interleave geometry (stride/off) is resolved by the BOM matcher and
+     * passed through, keeping the decl scanner itself allocation-free. */
+    raw = (const unsigned char *)RSTRING_PTR(str);
+    rb_encoding *decl = mkr_xml_decl_encoding(raw + bom_len, rawlen - bom_len, bom_stride, bom_off);
+    int is_binary = (tag == rb_ascii8bit_encoding());
+    if (bom && decl && !mkr_xml_enc_compatible(bom, decl)) {
+        rb_raise(mkr_eXmlSyntaxError,
+                 "XML encoding conflict: the byte-order mark and the encoding declaration disagree");
+    }
+    if (!is_binary && bom && !mkr_xml_enc_compatible(bom, tag)) {
+        rb_raise(mkr_eXmlSyntaxError,
+                 "XML encoding conflict: the byte-order mark disagrees with the string's encoding");
+    }
+    if (!is_binary && decl && !mkr_xml_enc_compatible(decl, tag)) {
+        /* A concrete String encoding is authoritative for decoding, so the
+         * declaration is not used to transcode - but a declaration that names a
+         * different encoding than the String is tagged with (e.g. a Shift_JIS
+         * String declaring encoding="UTF-8") is a self-inconsistent document and
+         * a fatal error, not a silently-ignored mismatch. */
+        rb_raise(mkr_eXmlSyntaxError,
+                 "XML encoding conflict: the encoding declaration disagrees with the string's encoding");
+    }
+    rb_encoding *eff = is_binary ? (bom ? bom : (decl ? decl : rb_utf8_encoding())) : tag;
+    /* Decode to UTF-8 (strict). UTF-8 / US-ASCII / ASCII-8BIT are already UTF-8
+     * bytes (validated below); anything else is strict-transcoded, raising rather
+     * than substituting U+FFFD. */
+    VALUE s;
+    if (eff == rb_utf8_encoding() || eff == rb_usascii_encoding() || eff == rb_ascii8bit_encoding()) {
+        s = str;
+    } else {
+        VALUE in = str;
+        if (rb_enc_get(str) != eff) { in = rb_str_dup(str); rb_enc_associate(in, eff); }
+        int state = 0;
+        s = rb_protect(mkr_xml_strict_transcode_thunk, in, &state);
+        if (state != 0) {
+            VALUE exc = rb_errinfo();
+            rb_set_errinfo(Qnil);
+            char msg[256];
+            mkr_ruby_exception_message(exc, msg, sizeof msg);
+            rb_raise(mkr_eXmlSyntaxError,
+                     "XML input could not be decoded to UTF-8: %s", msg);
+        }
+    }
+    const char *ptr = RSTRING_PTR(s);
+    long        len = RSTRING_LEN(s);
+    long        off = 0;
+    /* §4.3.3: a leading BOM is the encoding signature, not document content -
+     * strip a U+FEFF (the transcode above turns any UTF-16/32 BOM into one). */
+    mkr_span_t sv = mkr_span(ptr, (size_t)len);
+    if (mkr_span_starts(&sv, "\xEF\xBB\xBF", 3)) {
+        off = 3; len -= 3;
+        mkr_span_skip(&sv, 3);
+    }
+    /* Fail closed on an over-budget input BEFORE the validation scan and the
+     * caller's GVL-release copy (an input whose UTF-8 length exceeds the arena
+     * budget can never parse). max_bytes == 0 disables the check (__decode). */
+    if (max_bytes != 0 && (size_t)len > max_bytes) {
+        rb_raise(mkr_eXmlLimitExceeded, "XML input exceeds the byte budget");
+    }
+    /* Strict UTF-8 validation, allocation-free - no GC point while `ptr` is
+     * borrowed (the former rb_enc_str_new copy handed the borrow straight into
+     * an allocating call): an embedded NUL or any invalid UTF-8 is fatal (no
+     * U+FFFD repair - unlike the HTML mkr_utf8_sanitize path). A whole-string
+     * cached coderange covers the BOM-stripped suffix too (the BOM is one
+     * complete UTF-8 character). */
+    size_t nul_at;
+    if (mkr_span_find(&sv, '\0', &nul_at)) {
+        rb_raise(mkr_eXmlSyntaxError, "XML input must not contain a NUL byte");
+    }
+    if (!mkr_ruby_str_known_valid_utf8(s)
+        && !mkr_utf8_valid((const unsigned char *)ptr + off, (size_t)len)) {
+        rb_raise(mkr_eXmlSyntaxError, "XML input must be valid UTF-8");
+    }
+    /* Build the result through the VALUE, not the borrowed ptr (rb_str_subseq
+     * allocates, so the ptr must not be what it copies from). */
+    VALUE u = rb_str_subseq(s, off, len);
+    rb_enc_associate(u, rb_utf8_encoding());
+    return u; /* validated, UTF-8-tagged, BOM-stripped */
+}
+bool
+mkr_ruby_str_known_valid_utf8(VALUE str)
+{
+    if (!RB_TYPE_P(str, T_STRING)) {
+        return false;
+    }
+    /* ENC_CODERANGE reads the *cached* classification from the object's flags;
+     * it does NOT scan (rb_enc_str_coderange would, costing as much as our own
+     * validator). So this only wins when Ruby already knows the answer. */
+    int cr = ENC_CODERANGE(str);
+    if (cr == ENC_CODERANGE_7BIT) {
+        return true; /* all bytes < 0x80 in an ASCII-compatible encoding */
+    }
+    if (cr == ENC_CODERANGE_VALID) {
+        return rb_enc_get(str) == rb_utf8_encoding(); /* valid AND UTF-8 */
+    }
+    return false; /* UNKNOWN or BROKEN: let mkr_utf8_sanitize handle it */
+}
 const char *
 mkr_ruby_try_verified_text(VALUE sv, size_t max_bytes, mkr_ruby_borrowed_text_t *out)
 {
+    /* ALLOCATION-FREE, like mkr_verify_text: the returned borrow must not have
+     * crossed a Ruby allocation (the former rb_utf8_str_new + valid_encoding?
+     * funcall allocated twice with `ptr` already taken). */
     long len = RSTRING_LEN(sv);
     if ((size_t)len > max_bytes) {
         return "string exceeds the maximum length";
     }
     const char *ptr = RSTRING_PTR(sv);
-    if (memchr(ptr, '\0', (size_t)len) != NULL) {
+    mkr_span_t view = mkr_span(ptr, (size_t)len);
+    size_t nul_at;
+    if (mkr_span_find(&view, '\0', &nul_at)) {
         return "string contains a NUL byte";
     }
-    VALUE u = rb_utf8_str_new(ptr, len);
-    if (!RTEST(rb_funcall(u, rb_intern("valid_encoding?"), 0))) {
+    if (!mkr_ruby_str_known_valid_utf8(sv)
+        && !mkr_utf8_valid((const unsigned char *)ptr, (size_t)len)) {
         return "string is not valid UTF-8";
     }
     out->value = sv;
     out->ptr   = ptr;
     out->len   = (size_t)len;
-    RB_GC_GUARD(u);
     return NULL;
 }
@@ -151,9 +423,7 @@ mkr_ruby_exception_message(VALUE exc, char *buf, size_t len)
     }
     if (!RB_TYPE_P(msg, T_STRING)) {
         snprintf(buf, len, "%s", "error");
-        RB_GC_GUARD(msg);
         return;
     }
     snprintf(buf, len, "%s", RSTRING_PTR(msg));
-    RB_GC_GUARD(msg);
 }

data/ext/makiri/core/mkr_alloc.c CHANGED Viewed

@@ -1,5 +1,37 @@
 #include "mkr_alloc.h"
+#ifdef MKR_ALLOC_INJECT
+/* See mkr_alloc.h: the OOM sweep's failure injection. The counter counts
+ * ATTEMPTS (every consult), armed or not, so the harness can size its sweep
+ * from a disarmed baseline run; the countdown fails exactly one allocation
+ * and then disarms itself, modelling a single transient OOM per run. */
+static long long          mkr_inject_countdown = 0;  /* 0 = disarmed */
+static unsigned long long mkr_inject_attempts  = 0;
+void
+mkr_alloc_inject_arm(long long nth)
+{
+    mkr_inject_countdown = (nth > 0) ? nth : 0;
+    mkr_inject_attempts  = 0;
+}
+unsigned long long
+mkr_alloc_inject_calls(void)
+{
+    return mkr_inject_attempts;
+}
+int
+mkr_alloc_inject_should_fail(void)
+{
+    mkr_inject_attempts++;
+    if (mkr_inject_countdown > 0 && --mkr_inject_countdown == 0) {
+        return 1; /* fail this one allocation; now disarmed */
+    }
+    return 0;
+}
+#endif
 void *
 mkr_reallocarray(void *ptr, size_t count, size_t elem)
 {
@@ -11,6 +43,7 @@ mkr_reallocarray(void *ptr, size_t count, size_t elem)
     if (!mkr_size_mul(count, elem, &bytes)) {
         return NULL; /* overflow: leave ptr unchanged */
     }
+    if (MKR_ALLOC_INJECT_FAIL()) return NULL;
     return realloc(ptr, bytes);
 }
@@ -20,11 +53,14 @@ mkr_callocarray(size_t count, size_t elem)
     if (count == 0 || elem == 0) {
         return NULL;
     }
-    size_t bytes;
-    if (!mkr_size_mul(count, elem, &bytes)) {
+    /* 2-arg calloc is itself overflow-safe, but check explicitly so every core
+     * allocator fails the SAME way (deterministic NULL) rather than leaving the
+     * overflow case to calloc's implementation-defined behaviour. */
+    if (count > SIZE_MAX / elem) {
         return NULL; /* overflow */
     }
-    return calloc(count, elem); /* 2-arg calloc is itself overflow-safe */
+    if (MKR_ALLOC_INJECT_FAIL()) return NULL;
+    return calloc(count, elem);
 }
 char *
@@ -34,6 +70,7 @@ mkr_str_alloc(size_t n)
     if (!mkr_size_add(n, 1, &total)) {
         return NULL; /* n + 1 overflow */
     }
+    if (MKR_ALLOC_INJECT_FAIL()) return NULL;
     char *p = malloc(total);
     if (p == NULL) {
         return NULL;

data/ext/makiri/core/mkr_alloc.h CHANGED Viewed

@@ -6,7 +6,7 @@
  * allocators, the foundation every other C layer (glue, xpath engine,
  * lexbor_compat) builds on, so the ad-hoc `cap *= 2` / `n + 1` /
  * `malloc(n * sizeof(T))` patterns are written once, here, and fail closed.
- * NOTHING in this header touches Ruby — exception mapping happens at the glue
+ * NOTHING in this header touches Ruby - exception mapping happens at the glue
  * boundary. (mkr_core.h is a thin umbrella over this + the other core headers.)
  */
@@ -97,10 +97,33 @@ char *mkr_strdup(const char *s);
  * `cap *= 2; realloc(p, cap * sizeof(T))` pattern in one call. */
 mkr_status_t mkr_grow_reserve(void **ptr, size_t *cap, size_t need, size_t elem);
-/* Self-test of the overflow / allocation / buffer edge cases (incl. paths real
- * inputs cannot reach). Returns 0 on success, nonzero on the first failure.
- * Wired to a private Ruby method for the spec suite. */
-int mkr_core_selftest(void);
+/* --- allocation-failure injection (debug builds only) ------------------- */
+/*
+ * The OOM sweep harness (script/check_alloc_failures.rb, `rake oom`) verifies
+ * that every fail-closed OOM branch actually fails closed: it arms "the nth
+ * core allocation fails", runs a workload, and asserts the result is either a
+ * clean exception or byte-identical to the baseline - never truncated.
+ *
+ * Compiled in ONLY under -DMKR_ALLOC_INJECT (extconf: MAKIRI_ALLOC_INJECT=1);
+ * a release build carries no counter and no branch. Covers every core libc
+ * allocation site (mkr_alloc.c + mkr_buf.c - the funnel the direct_alloc lint
+ * forces all engine allocations through). Ruby's xmalloc family and Lexbor's
+ * internal allocations are out of scope (Ruby raises NoMemoryError itself;
+ * Lexbor is vendor code). Not thread-safe by design (the harness is
+ * single-threaded).
+ */
+#ifdef MKR_ALLOC_INJECT
+/* Arm: the nth subsequent core allocation (1-based) fails once, then the
+ * injection disarms itself. nth <= 0 disarms. Resets the call counter. */
+void mkr_alloc_inject_arm(long long nth);
+/* Core allocation attempts since the last arm (sizes the sweep). */
+unsigned long long mkr_alloc_inject_calls(void);
+/* Internal: consulted by each core libc allocation site. */
+int mkr_alloc_inject_should_fail(void);
+#define MKR_ALLOC_INJECT_FAIL() (mkr_alloc_inject_should_fail())
+#else
+#define MKR_ALLOC_INJECT_FAIL() 0
+#endif
 #ifdef __cplusplus
 }

data/ext/makiri/core/mkr_buf.c CHANGED Viewed

@@ -13,7 +13,12 @@ mkr_buf_append(mkr_buf_t *b, const void *bytes, size_t n)
     if (!mkr_size_add(b->len, n, &need)) {
         return MKR_ERR_OOM;
     }
-    if (b->max != 0 && need > b->max) {
+    /* max == 0 is NOT unbounded: it falls back to the conservative default
+     * ceiling, so a caller that never set a cap still fails closed. Either way the
+     * absolute hard ceiling clamps it, so no buffer can exhaust memory. */
+    size_t soft  = (b->max != 0) ? b->max : MKR_BUF_DEFAULT_LIMIT;
+    size_t limit = (soft < MKR_BUF_HARD_MAX) ? soft : MKR_BUF_HARD_MAX;
+    if (need > limit) {
         return MKR_ERR_LIMIT;
     }
     size_t need_term; /* room for the NUL terminator too */
@@ -25,7 +30,17 @@ mkr_buf_append(mkr_buf_t *b, const void *bytes, size_t n)
         if (!mkr_grow_capacity(b->cap, need_term, 1, &new_cap)) {
             return MKR_ERR_OOM;
         }
-        char *p = realloc(b->data, new_cap);
+        /* Geometric growth can overshoot to ~2x need_term; clamp the ALLOCATION
+         * to the same ceiling as the content (limit, plus one byte for the NUL),
+         * so cap never runs to ~2x MKR_BUF_HARD_MAX near the limit. Safe: this
+         * append already passed need <= limit, so need_term <= limit + 1 and the
+         * clamp never drops new_cap below what this append needs. (If limit + 1
+         * overflows - only a pathological -DMKR_BUF_HARD_MAX=SIZE_MAX - skip it.) */
+        size_t cap_ceiling;
+        if (mkr_size_add(limit, 1, &cap_ceiling) && new_cap > cap_ceiling) {
+            new_cap = cap_ceiling;
+        }
+        char *p = MKR_ALLOC_INJECT_FAIL() ? NULL : realloc(b->data, new_cap);
         if (p == NULL) {
             return MKR_ERR_OOM;
         }
@@ -38,11 +53,40 @@ mkr_buf_append(mkr_buf_t *b, const void *bytes, size_t n)
     return MKR_OK;
 }
+mkr_status_t
+mkr_buf_reserve(mkr_buf_t *b, size_t n)
+{
+    /* Pre-allocate capacity for n bytes so a known-size fill does not realloc on
+     * every geometric step (the serializer reserves ~the output size up front).
+     * Best-effort: the request is clamped to the buffer's byte limit (max, or the
+     * default ceiling when max is 0, bounded by the hard maximum), and a later
+     * append still fails closed if the real output exceeds that limit. */
+    size_t soft  = (b->max != 0) ? b->max : MKR_BUF_DEFAULT_LIMIT;
+    size_t limit = (soft < MKR_BUF_HARD_MAX) ? soft : MKR_BUF_HARD_MAX;
+    if (n > limit) {
+        n = limit;
+    }
+    size_t need_term; /* room for the NUL terminator too */
+    if (!mkr_size_add(n, 1, &need_term)) {
+        return MKR_ERR_OOM;
+    }
+    if (need_term <= b->cap) {
+        return MKR_OK; /* already have room */
+    }
+    char *p = MKR_ALLOC_INJECT_FAIL() ? NULL : realloc(b->data, need_term);
+    if (p == NULL) {
+        return MKR_ERR_OOM;
+    }
+    b->data = p;
+    b->cap  = need_term;
+    b->data[b->len] = '\0'; /* keep NUL-terminated */
+    return MKR_OK;
+}
 char *
 mkr_buf_steal(mkr_buf_t *b, size_t *out_len)
 {
     if (b->data == NULL) {
-        char *empty = malloc(1);
+        char *empty = MKR_ALLOC_INJECT_FAIL() ? NULL : malloc(1);
         if (empty == NULL) {
             return NULL;
         }