RubyGems - makiri - Versions diffs - 0.2.0 → 0.4.0 - Mend

makiri 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (130) hide show

checksums.yaml +4 -4
data/.github/workflows/conformance.yml +22 -0
data/.github/workflows/libfuzzer.yml +83 -0
data/.github/workflows/release.yml +12 -7
data/.github/workflows/security.yml +88 -3
data/.github/workflows/valgrind.yml +135 -0
data/CHANGELOG.md +152 -15
data/README.md +183 -13
data/Rakefile +294 -7
data/ext/makiri/bridge/bridge.h +28 -0
data/ext/makiri/bridge/ruby_string.c +282 -12
data/ext/makiri/core/mkr_alloc.c +40 -3
data/ext/makiri/core/mkr_alloc.h +28 -5
data/ext/makiri/core/mkr_buf.c +47 -3
data/ext/makiri/core/mkr_buf.h +112 -3
data/ext/makiri/core/mkr_core.c +143 -0
data/ext/makiri/core/mkr_core.h +11 -2
data/ext/makiri/core/mkr_hash.h +1 -1
data/ext/makiri/core/mkr_span.h +186 -0
data/ext/makiri/core/mkr_text.h +8 -8
data/ext/makiri/core/mkr_utf8.c +101 -0
data/ext/makiri/core/mkr_utf8.h +88 -0
data/ext/makiri/extconf.rb +123 -10
data/ext/makiri/fuzz/Makefile +95 -0
data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
data/ext/makiri/fuzz/xml_fuzz.c +24 -0
data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
data/ext/makiri/glue/glue.h +55 -11
data/ext/makiri/glue/ruby_doc.c +129 -59
data/ext/makiri/glue/ruby_html_css.c +292 -0
data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +248 -52
data/ext/makiri/glue/ruby_html_node.c +859 -0
data/ext/makiri/glue/ruby_html_serialize.c +154 -0
data/ext/makiri/glue/ruby_node.c +74 -729
data/ext/makiri/glue/ruby_node_set.c +167 -32
data/ext/makiri/glue/ruby_xml.c +602 -0
data/ext/makiri/glue/ruby_xml_node.c +1373 -0
data/ext/makiri/glue/ruby_xpath.c +63 -30
data/ext/makiri/glue/ruby_xpath.h +19 -0
data/ext/makiri/lexbor_compat/compat.h +42 -9
data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
data/ext/makiri/lexbor_compat/dom_index.c +2 -2
data/ext/makiri/lexbor_compat/post_parse.c +100 -10
data/ext/makiri/lexbor_compat/source_loc.c +15 -13
data/ext/makiri/lexbor_compat/text_index.c +14 -8
data/ext/makiri/lexbor_compat/utf8_input.c +19 -33
data/ext/makiri/makiri.c +184 -6
data/ext/makiri/makiri.h +43 -2
data/ext/makiri/xml/mkr_xml.h +125 -0
data/ext/makiri/xml/mkr_xml_chars.c +195 -0
data/ext/makiri/xml/mkr_xml_index.c +169 -0
data/ext/makiri/xml/mkr_xml_index.h +48 -0
data/ext/makiri/xml/mkr_xml_mutate.c +817 -0
data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
data/ext/makiri/xml/mkr_xml_node.c +399 -0
data/ext/makiri/xml/mkr_xml_node.h +184 -0
data/ext/makiri/xml/mkr_xml_tree.c +1515 -0
data/ext/makiri/xpath/mkr_css.c +1023 -0
data/ext/makiri/xpath/mkr_css.h +65 -0
data/ext/makiri/xpath/mkr_xpath.c +96 -32
data/ext/makiri/xpath/mkr_xpath.h +109 -4
data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +551 -241
data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +318 -276
data/ext/makiri/xpath/mkr_xpath_internal.h +177 -206
data/ext/makiri/xpath/mkr_xpath_lex.c +95 -125
data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +145 -0
data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
data/ext/makiri/xpath/mkr_xpath_parse.c +83 -94
data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
data/ext/makiri/xpath/mkr_xpath_shared.c +609 -0
data/ext/makiri/xpath/mkr_xpath_value_body.h +801 -0
data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
data/lib/makiri/{attribute.rb → attr.rb} +7 -3
data/lib/makiri/cdata_section.rb +19 -0
data/lib/makiri/comment.rb +10 -0
data/lib/makiri/compat_aliases.rb +30 -0
data/lib/makiri/document.rb +9 -73
data/lib/makiri/document_fragment.rb +14 -9
data/lib/makiri/element.rb +4 -4
data/lib/makiri/html/document.rb +106 -0
data/lib/makiri/html/node_methods.rb +19 -0
data/lib/makiri/html.rb +12 -0
data/lib/makiri/node.rb +58 -15
data/lib/makiri/node_set.rb +8 -0
data/lib/makiri/processing_instruction.rb +10 -0
data/lib/makiri/text.rb +1 -1
data/lib/makiri/version.rb +1 -1
data/lib/makiri/xml/builder.rb +263 -0
data/lib/makiri/xml/document.rb +24 -0
data/lib/makiri/xml/node_methods.rb +84 -0
data/lib/makiri/xml.rb +10 -0
data/lib/makiri/xpath_context.rb +1 -1
data/lib/makiri.rb +24 -5
data/script/build_native_gem.rb +2 -2
data/script/check_alloc_failures.rb +266 -0
data/script/check_c_safety.rb +77 -2
data/script/check_c_safety_allowlist.yml +102 -0
data/script/check_leaks.rb +64 -0
data/script/leaks_harness.rb +64 -0
data/vendor/lexbor/CMakeLists.txt +6 -0
data/vendor/lexbor/README.md +12 -0
data/vendor/lexbor/config.cmake +1 -1
data/vendor/lexbor/source/lexbor/core/base.h +1 -1
data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
data/vendor/lexbor/source/lexbor/html/base.h +1 -1
data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
data/vendor/lexbor/source/lexbor/url/base.h +1 -1
data/vendor/lexbor/source/lexbor/url/url.c +5 -2
data/vendor/lexbor/source/lexbor/url/url.h +9 -0
data/vendor/lexbor/version +1 -1
metadata +53 -9
data/ext/makiri/glue/ruby_css.c +0 -185
data/ext/makiri/glue/ruby_serialize.c +0 -92
data/ext/makiri/xpath/mkr_xpath_value.c +0 -1286
data/lib/makiri/cdata.rb +0 -6

data/ext/makiri/core/mkr_utf8.c ADDED Viewed

@@ -0,0 +1,101 @@
+/* mkr_utf8.c - the shared pure-C UTF-8 validator. Ruby-free, allocation-free.
+ * See mkr_utf8.h for the contract and why it lives in core. Moved verbatim from
+ * lexbor_compat/utf8_input.c (whose sanitiser fast path now calls this). */
+#include "mkr_utf8.h"
+#include <string.h>   /* memcpy for the word-at-a-time ASCII scan */
+bool   /* true iff [src, src+len) is entirely well-formed UTF-8; NUL bytes pass, len == 0 passes */
+mkr_utf8_valid(const unsigned char *src, size_t len)
+{
+    const unsigned char *p   = src;
+    const unsigned char *const end = p + len;   /* one past the last input byte */
+    while (p < end) {
+        unsigned char b = *p;
+        if (b < 0x80) {
+            /* ASCII fast path: skip a run of ASCII bytes a word at a time
+             * (any high bit set ends the run), then byte-wise for the tail. */
+            while ((size_t)(end - p) >= sizeof(size_t)) {
+                size_t w;
+                memcpy(&w, p, sizeof(w));   /* copy into a local word; p need not be aligned */
+                if (w & (size_t)0x8080808080808080ULL) {   /* cast narrows the mask where size_t is 32-bit */
+                    break;
+                }
+                p += sizeof(size_t);
+            }
+            while (p < end && *p < 0x80) {
+                p++;
+            }
+            continue;   /* back to the outer loop at the first non-ASCII byte (or at end) */
+        }
+        /* Multi-byte: decide length and validate the (length-dependent) ranges
+         * that exclude overlong forms, surrogates and > U+10FFFF. */
+        size_t n;   /* total byte length of the sequence led by b */
+        if (b >= 0xC2 && b <= 0xDF) {                 /* U+0080..U+07FF   */
+            n = 2;
+            if (end - p < 2 || (p[1] & 0xC0) != 0x80) return false;
+        } else if (b == 0xE0) {                       /* U+0800..U+0FFF   */
+            n = 3;
+            if (end - p < 3 || p[1] < 0xA0 || p[1] > 0xBF
+                || (p[2] & 0xC0) != 0x80) return false;
+        } else if (b >= 0xE1 && b <= 0xEC) {          /* U+1000..U+CFFF   */
+            n = 3;
+            if (end - p < 3 || (p[1] & 0xC0) != 0x80
+                || (p[2] & 0xC0) != 0x80) return false;
+        } else if (b == 0xED) {                       /* U+D000..U+D7FF   */
+            n = 3;                                    /* (excludes surrogates) */
+            if (end - p < 3 || p[1] < 0x80 || p[1] > 0x9F
+                || (p[2] & 0xC0) != 0x80) return false;
+        } else if (b == 0xEE || b == 0xEF) {          /* U+E000..U+FFFF   */
+            n = 3;
+            if (end - p < 3 || (p[1] & 0xC0) != 0x80
+                || (p[2] & 0xC0) != 0x80) return false;
+        } else if (b == 0xF0) {                       /* U+10000..U+3FFFF */
+            n = 4;
+            if (end - p < 4 || p[1] < 0x90 || p[1] > 0xBF
+                || (p[2] & 0xC0) != 0x80 || (p[3] & 0xC0) != 0x80) return false;
+        } else if (b >= 0xF1 && b <= 0xF3) {          /* U+40000..U+FFFFF */
+            n = 4;
+            if (end - p < 4 || (p[1] & 0xC0) != 0x80 || (p[2] & 0xC0) != 0x80
+                || (p[3] & 0xC0) != 0x80) return false;
+        } else if (b == 0xF4) {                       /* U+100000..U+10FFFF */
+            n = 4;
+            if (end - p < 4 || p[1] < 0x80 || p[1] > 0x8F
+                || (p[2] & 0xC0) != 0x80 || (p[3] & 0xC0) != 0x80) return false;
+        } else {                                      /* C0,C1,F5..FF,stray 80..BF */
+            return false;
+        }
+        p += n;   /* sequence accepted: the length test ran first in each branch, so p stays <= end */
+    }
+    return true;   /* every byte consumed without a violation */
+}
+int   /* bytes consumed (1-4) with *cp set, or 0 on any violation; *cp is written only on success */
+mkr_utf8_decode1(const unsigned char *p, size_t len, uint32_t *cp)
+{
+    if (len == 0) return 0;
+    unsigned char b0 = p[0];
+    if (b0 < 0x80u) { *cp = b0; return 1; }
+    int n;   /* sequence length implied by the lead byte */
+    uint32_t c, min;   /* c: code point being assembled; min: smallest value allowed for an n-byte form */
+    if      ((b0 & 0xE0u) == 0xC0u) { n = 2; c = b0 & 0x1Fu; min = 0x80u; }
+    else if ((b0 & 0xF0u) == 0xE0u) { n = 3; c = b0 & 0x0Fu; min = 0x800u; }
+    else if ((b0 & 0xF8u) == 0xF0u) { n = 4; c = b0 & 0x07u; min = 0x10000u; }
+    else return 0;                              /* continuation / 0xF8+ lead */
+    if ((size_t)n > len) return 0;              /* truncated */
+    for (int i = 1; i < n; i++) {
+        unsigned char b = p[i];
+        if ((b & 0xC0u) != 0x80u) return 0;     /* bad continuation byte */
+        c = (c << 6) | (b & 0x3Fu);   /* append the 6 payload bits of this continuation byte */
+    }
+    if (c < min) return 0;                      /* overlong */
+    if (c >= 0xD800u && c <= 0xDFFFu) return 0; /* surrogate */
+    if (c > 0x10FFFFu) return 0;                /* out of Unicode range */
+    *cp = c;   /* all checks passed (C0/C1 and F5..F7 leads match the masks above but fail the overlong / range tests) */
+    return n;
+}

data/ext/makiri/core/mkr_utf8.h ADDED Viewed

@@ -0,0 +1,88 @@
+#ifndef MAKIRI_CORE_MKR_UTF8_H
+#define MAKIRI_CORE_MKR_UTF8_H
+/*
+ * mkr_utf8_valid - the ONE pure-C UTF-8 validator (Ruby-free, allocation-free).
+ *
+ * Validates [src, src+len) against the Unicode "well-formed UTF-8 byte
+ * sequences" table (RFC 3629 / WHATWG): rejects bad continuation bytes,
+ * overlong forms, surrogates (U+D800..U+DFFF), code points above U+10FFFF, and
+ * an incomplete trailing sequence. Validate-only - it never materialises code
+ * points - and rips through ASCII a machine word at a time. NUL bytes are VALID
+ * here (U+0000 is well-formed UTF-8); callers that must reject NUL check it
+ * separately (memchr).
+ *
+ * This lives in core so the Ruby bridge (mkr_verify_text - the strict
+ * programmatic-input gate) and the HTML input sanitiser (lexbor_compat/
+ * utf8_input.c fast path) share a single implementation, and so the bridge's
+ * validation never allocates: a borrowed RSTRING pointer must not be held
+ * across a Ruby allocation (= GC point), so the validator the bridge runs
+ * between taking a borrow and using it has to be allocation-free by
+ * construction. (The former implementation built a throwaway Ruby String and
+ * asked for its coderange - an allocation inside every borrow.)
+ */
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "mkr_span.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+bool mkr_utf8_valid(const unsigned char *src, size_t len);
+/* mkr_utf8_decode1 - decode ONE code point from [p, p+len), strictly: rejects
+ * truncation, bad continuation bytes, overlong forms, surrogates and values
+ * above U+10FFFF. Returns the byte length (1-4) with *cp set, or 0 on any
+ * violation (including len == 0) - fail closed, never read past the bound.
+ * The ONE strict decoder, shared by the XML tokenizer's name/Char scanning and
+ * the XPath lexer (each formerly carried its own equivalent copy). */
+int mkr_utf8_decode1(const unsigned char *p, size_t len, uint32_t *cp);
+/* Span form: decode the code point at the span's cursor WITHOUT consuming it (the
+ * caller mkr_span_skip()s the returned length). 0 at end-of-span or on invalid input. */
+static inline int
+mkr_utf8_decode1_span(const mkr_span_t *s, uint32_t *cp)
+{
+    return mkr_utf8_decode1((const unsigned char *)s->p, mkr_span_left(s), cp);   /* bound = bytes left per mkr_span_left() */
+}
+/* mkr_utf8_count_chars - count Unicode code points in [ptr, ptr+len): every
+ * byte that is NOT a 0x80..0xBF continuation byte starts a new code point.
+ * Length-bounded (does not rely on a NUL terminator); ptr may be NULL when
+ * len == 0. Used where XPath measures string length / offsets in characters. */
+static inline size_t
+mkr_utf8_count_chars(const char *ptr, size_t len)
+{
+    size_t n = 0;   /* code points counted so far */
+    for (size_t i = 0; i < len; ++i) {
+        if (((unsigned char)ptr[i] & 0xC0) != 0x80) ++n;   /* no validation here: any non-continuation byte counts as one character */
+    }
+    return n;
+}
+/* mkr_utf8_advance_chars - byte offset within [ptr, ptr+len) after advancing up
+ * to nchars UTF-8 characters from the start, clamped at len. A character is its
+ * leading byte plus the run of 0x80..0xBF continuation bytes that follow;
+ * advancing stops at len even mid-sequence. Length-bounded (no NUL reliance).
+ * Returns len when nchars exceeds the available character count. */
+static inline size_t
+mkr_utf8_advance_chars(const char *ptr, size_t len, size_t nchars)
+{
+    size_t i = 0;   /* byte offset reached so far */
+    while (nchars > 0 && i < len) {
+        ++i;   /* consume one byte unconditionally, so a stray continuation byte at the cursor is treated as a lead */
+        while (i < len && ((unsigned char)ptr[i] & 0xC0) == 0x80) ++i;   /* then swallow the continuation run behind it */
+        --nchars;
+    }
+    return i;
+}
+#ifdef __cplusplus
+}
+#endif
+#endif /* MAKIRI_CORE_MKR_UTF8_H */

data/ext/makiri/extconf.rb CHANGED Viewed

@@ -12,7 +12,7 @@ require "etc"
 #   1. Build vendored Lexbor (unpatched) via cmake into vendor/lexbor/build,
 #      install headers + a static archive into vendor/lexbor/dist.
 #   2. Compile ext/makiri/**/*.c with rake-compiler, linking against the
-#      static Lexbor archive only — no system libxml2/libxslt.
+#      static Lexbor archive only - no system libxml2/libxslt.
 #
 # Security note: the C extension is built with -D_FORTIFY_SOURCE=2,
 # -fstack-protector-strong, and -Wformat -Wformat-security. -O2 is kept
@@ -28,7 +28,29 @@ abort "Lexbor source not found at #{LEXBOR_SRC}. Did you `git submodule update -
 cmake = find_executable("cmake") or abort "cmake is required to build Lexbor."
-unless File.exist?(File.join(LEXBOR_DST, "lib", "liblexbor_static.a"))
+# Optionally build the vendored Lexbor itself under AddressSanitizer. This is the
+# ONLY way to catch overflows *inside* Lexbor's bump (mraw) arena: a sub-allocation
+# overrunning into the next one stays within one big malloc'd chunk, so the heap
+# allocator's red-zones (and thus a plain ASan build of just our ext) never see it.
+# Lexbor's own mraw is ASan-aware - with -DLEXBOR_BUILD_WITH_ASAN=ON its CMake
+# defines LEXBOR_HAVE_ADDRESS_SANITIZER, and mraw then unpoisons exactly each
+# allocation and re-poisons the gap, so an intra-arena overrun writes into
+# poisoned memory and ASan reports it. Opt-in (slow full rebuild), only meaningful
+# with MAKIRI_SANITIZE=...address...; drive it via `rake sanitize:lexbor`.
+# vendor/lexbor stays vanilla - this is a build flag, not a source patch.
+sanitize = ENV["MAKIRI_SANITIZE"].to_s.strip
+lexbor_asan = !ENV["MAKIRI_SANITIZE_LEXBOR"].to_s.strip.empty? && sanitize.include?("address")
+lexbor_mode = lexbor_asan ? "asan" : "plain"
+lexbor_stamp = File.join(LEXBOR_DST, ".makiri_build_mode")
+# Reuse the cached archive only when it was built in the mode we now want; a mode
+# switch (plain <-> asan) forces a rebuild, so a sanitized Lexbor can never leak
+# into a normal build or vice versa.
+have_archive = File.exist?(File.join(LEXBOR_DST, "lib", "liblexbor_static.a"))
+stamp_ok = have_archive && File.exist?(lexbor_stamp) && File.read(lexbor_stamp).strip == lexbor_mode
+unless stamp_ok
+  FileUtils.rm_rf(LEXBOR_BLD)
+  FileUtils.rm_rf(LEXBOR_DST) if have_archive   # drop a wrong-mode install
   FileUtils.mkdir_p(LEXBOR_BLD)
   Dir.chdir(LEXBOR_BLD) do
     cmd = [
@@ -41,12 +63,15 @@ unless File.exist?(File.join(LEXBOR_DST, "lib", "liblexbor_static.a"))
       "-DCMAKE_BUILD_TYPE=Release",
       "-DCMAKE_POSITION_INDEPENDENT_CODE=ON",
       "-DCMAKE_INSTALL_PREFIX=#{LEXBOR_DST}",
+      *(lexbor_asan ? ["-DLEXBOR_BUILD_WITH_ASAN=ON"] : []),
       LEXBOR_SRC,
     ].shelljoin
+    warn "makiri: building vendored Lexbor (mode=#{lexbor_mode})"
     system(cmd) or abort "cmake configure failed for Lexbor."
     system("#{cmake.shellescape} --build . --target install -- -j#{Etc.respond_to?(:nprocessors) ? Etc.nprocessors : 4}") or
       abort "cmake build/install failed for Lexbor."
   end
+  File.write(lexbor_stamp, lexbor_mode)
 end
 $INCFLAGS << " -I#{File.join(LEXBOR_DST, 'include').shellescape}"
@@ -60,11 +85,35 @@ $LDFLAGS << " #{lexbor_archive.shellescape}"
 # Sanitizer build (opt-in): MAKIRI_SANITIZE=address,undefined rake clean compile
 # Then run the suite under the runtime via `rake sanitize` (which preloads the
 # ASan runtime). Sanitizers replace the heap allocator, so even the vendored
-# (uninstrumented) Lexbor's allocations get red-zoned — heap overflows on
-# Lexbor-owned buffers are still caught. _FORTIFY_SOURCE is dropped here because
-# it conflicts with the sanitizer interceptors.
-sanitize = ENV["MAKIRI_SANITIZE"].to_s.strip
-if sanitize.empty?
+# (uninstrumented) Lexbor's allocations get red-zoned - a heap overflow off the
+# END of a Lexbor malloc is caught. Overflows *inside* Lexbor's mraw arena are
+# NOT caught this way (they stay within one malloc'd chunk); for those, also build
+# Lexbor under ASan via MAKIRI_SANITIZE_LEXBOR=1 (see the Lexbor build above and
+# `rake sanitize:lexbor`). _FORTIFY_SOURCE is dropped here because it conflicts
+# with the sanitizer interceptors.
+# Coverage build (opt-in): MAKIRI_COVERAGE=1 instruments OUR sources with clang
+# source-based coverage (the vendored Lexbor is built separately and is NOT
+# instrumented - we measure only the code we write). Run via `rake coverage`,
+# which sets LLVM_PROFILE_FILE and renders an llvm-cov report. -O0 keeps the
+# region map close to the source; _FORTIFY_SOURCE is dropped (it needs -O2).
+coverage = !ENV["MAKIRI_COVERAGE"].to_s.strip.empty?
+# OOM-injection build (opt-in): MAKIRI_ALLOC_INJECT=1 compiles the core
+# allocation-failure hook (mkr_alloc.h) so `rake oom` can sweep "the nth core
+# allocation fails" over representative workloads and assert every OOM branch
+# fails closed. Debug/test builds only - a normal build carries no hook.
+# Composes with the sanitize/coverage modes below.
+if ENV["MAKIRI_ALLOC_INJECT"].to_s.strip == "1"
+  $CFLAGS << " -DMKR_ALLOC_INJECT=1"
+  warn "makiri: building with allocation-failure injection (MKR_ALLOC_INJECT)"
+end
+if coverage
+  $CFLAGS   << " -O0 -g -fprofile-instr-generate -fcoverage-mapping"
+  $LDFLAGS  << " -fprofile-instr-generate"
+  $DLDFLAGS << " -fprofile-instr-generate"
+  warn "makiri: building with clang source-based coverage"
+elsif sanitize.empty?
   # Security hardening flags. Keep -O2 active so _FORTIFY_SOURCE works.
   $CFLAGS << " -O2"
   $CFLAGS << " -D_FORTIFY_SOURCE=2"
@@ -72,6 +121,20 @@ else
   $CFLAGS << " -O1 -g -fno-omit-frame-pointer -fsanitize=#{sanitize}"
   $LDFLAGS  << " -fsanitize=#{sanitize}"
   $DLDFLAGS << " -fsanitize=#{sanitize}"
+  if sanitize.include?("address")
+    # No ASan *stack* red zones in the ext. CRuby is built with
+    # RUBY_SETJMP = __builtin_setjmp, so rb_raise unwinds via __builtin_longjmp,
+    # which the ASan runtime does not intercept (no __asan_handle_no_return):
+    # any raise crossing an instrumented frame - ours, or Ruby code raising
+    # through rb_protect under the evaluator - leaves that frame's stack poison
+    # behind, and an interceptor (memcpy & co.) in the uninstrumented interpreter
+    # later trips over the stale shadow: a spurious report, which ASan itself
+    # then aborts while rendering (asan_thread.cpp kCurrentStackFrameMagic
+    # CHECK). Heap red zones, UBSan, and the manual arena poisoning in
+    # mkr_xml_node.c are unaffected; only stack-buffer checks are lost.
+    $CFLAGS << (RbConfig::CONFIG["CC"] =~ /clang/ || RbConfig::CONFIG["target_os"] =~ /darwin/ ?
+                  " -mllvm -asan-stack=0" : " --param asan-stack=0")
+  end
   warn "makiri: building with -fsanitize=#{sanitize}"
 end
@@ -115,9 +178,59 @@ elsif RbConfig::CONFIG["target_os"] =~ /linux/
   $LIBRUBYARG_STATIC = ""
 end
-# Recursively pick up C sources under ext/makiri/.
-$srcs = Dir.glob(File.join(EXT_DIR, "**", "*.c")).map { |f| f.sub("#{EXT_DIR}/", "") }
+# Export ONLY Init_makiri from the compiled extension. `-fvisibility=hidden`
+# above hides our own sources' symbols, but the vendored Lexbor static library
+# is built (by Lexbor's own CMake) with default visibility, so without this the
+# linker re-exports ~1700 `lxb_*` / `lexbor_*` symbols into the bundle's dynamic
+# table. Another Lexbor-based extension loaded in the same process (e.g.
+# nokolexbor) would then resolve its own `lxb_*` calls to OUR copy - a different
+# Lexbor version with an incompatible ABI - and segfault. Restricting the export
+# list to Init_makiri keeps Makiri's Lexbor entirely private (Ruby only needs
+# Init_makiri, found via dlsym at require time).
+if RbConfig::CONFIG["target_os"] =~ /darwin/
+  $DLDFLAGS << " -Wl,-exported_symbol,_Init_makiri"
+elsif RbConfig::CONFIG["target_os"] =~ /linux/
+  # Hide every symbol pulled in from static archives (the Lexbor .a); our own
+  # are already hidden by -fvisibility=hidden, leaving just RUBY_FUNC_EXPORTED
+  # Init_makiri in the dynamic symbol table.
+  $DLDFLAGS << " -Wl,--exclude-libs,ALL"
+end
+# Recursively pick up C sources under ext/makiri/, excluding standalone
+# libFuzzer harnesses. Those define LLVMFuzzerTestOneInput and are linked by
+# ext/makiri/fuzz/Makefile, never into the Ruby extension.
+$srcs = Dir.glob(File.join(EXT_DIR, "**", "*.c"))
+           .reject { |f| f.start_with?(File.join(EXT_DIR, "fuzz") + File::SEPARATOR) }
+           .map { |f| f.sub("#{EXT_DIR}/", "") }
 $VPATH ||= []
-$VPATH += Dir.glob(File.join(EXT_DIR, "**/")).map { |d| "$(srcdir)/#{d.sub("#{EXT_DIR}/", "")}".chomp("/") }
+# fuzz/ must be excluded here too: after a `rake fuzz:libfuzzer_build`,
+# fuzz/build/{core,xml,xpath}/ hold sanitizer-instrumented .o files, and a
+# VPATH that includes them lets make resolve the extension's object
+# prerequisites there instead of compiling them - breaking the link (or worse,
+# silently mixing differently-flagged objects).
+$VPATH += Dir.glob(File.join(EXT_DIR, "**/"))
+             .reject { |d| d.start_with?(File.join(EXT_DIR, "fuzz") + File::SEPARATOR) }
+             .map { |d| "$(srcdir)/#{d.sub("#{EXT_DIR}/", "")}".chomp("/") }
 create_makefile("makiri/makiri")
+# mkmf's generated Makefile carries NO header dependencies, so editing a header
+# (e.g. a struct layout in an internal .h) recompiles only the .c files whose
+# own timestamps changed - the rest keep their stale layout and the objects
+# silently disagree (ABI mismatch, runtime breakage). Append the coarsest sound
+# rule instead: every object depends on every project header. A header edit
+# then recompiles everything - a few seconds for this ext, and it can never
+# rot (the list regenerates each configure; a header NEW since the last
+# configure is reachable only from .c files edited to include it, which rebuild
+# on their own timestamp). Ruby/Lexbor headers are deliberately excluded: a
+# Ruby upgrade gets a fresh build dir from rake-compiler, and a Lexbor pin
+# change already requires `rake clean:lexbor` (see CLAUDE.md).
+project_headers = Dir.glob(File.join(EXT_DIR, "**", "*.h"))
+                     .reject { |f| f.start_with?(File.join(EXT_DIR, "fuzz") + File::SEPARATOR) }
+                     .map { |f| "$(srcdir)/#{f.sub("#{EXT_DIR}/", "")}" }
+                     .sort
+File.open("Makefile", "a") do |mk|
+  mk.puts
+  mk.puts "# Project-header dependencies appended by extconf.rb (mkmf emits none)."
+  mk.puts "$(OBJS): #{project_headers.join(" ")}"
+end

data/ext/makiri/fuzz/Makefile ADDED Viewed

@@ -0,0 +1,95 @@
+# ext/makiri/fuzz/Makefile - libFuzzer harness build
+#
+# Usage:
+#   make xml_fuzz      # build the XML parser harness
+#   make xpath_fuzz    # build the XPath compile+eval harness
+#   make clean         # remove build artifacts
+#
+# Requires: clang with libFuzzer support (usually part of the clang distribution)
+#   and the vendored Lexbor static library already built (via `rake compile`).
+CLANG ?= clang
+CXX   := $(if $(filter default,$(origin CXX)),clang++,$(CXX))
+# Paths relative to ext/makiri/fuzz/
+EXT_DIR    = ..
+LEXBOR_SRC = ../../../vendor/lexbor
+LEXBOR_DST = $(LEXBOR_SRC)/dist
+# CFLAGS mirror the security flags from extconf.rb, minus Ruby-specific flags.
+# Sanitizer instrumentation is added at compile time so every TU is covered.
+SANITIZE ?= address,undefined
+CFLAGS = -O2 -g -Wall -Wextra \
+  -fstack-protector-strong \
+  -Wformat -Wformat-security \
+  -fvisibility=hidden \
+  -fno-common \
+  -fsanitize=$(SANITIZE) \
+  -I$(LEXBOR_DST)/include \
+  -I$(EXT_DIR) \
+  -I$(EXT_DIR)/core \
+  -I$(EXT_DIR)/xml \
+  -I$(EXT_DIR)/xpath \
+  -I$(EXT_DIR)/lexbor_compat
+# Linker flags: libFuzzer driver + sanitizers.
+LDFLAGS  = -fsanitize=fuzzer,$(SANITIZE)
+BUILD_DIR = build
+# Ruby-free C sources that the harnesses link against.
+CORE_SRCS = \
+  $(EXT_DIR)/core/mkr_alloc.c \
+  $(EXT_DIR)/core/mkr_utf8.c \
+  $(EXT_DIR)/core/mkr_buf.c \
+  $(EXT_DIR)/core/mkr_core.c
+XML_SRCS = \
+  $(EXT_DIR)/xml/mkr_xml_tree.c \
+  $(EXT_DIR)/xml/mkr_xml_node.c \
+  $(EXT_DIR)/xml/mkr_xml_chars.c \
+  $(EXT_DIR)/xml/mkr_xml_index.c \
+  $(EXT_DIR)/xml/mkr_xml_mutate.c
+XPATH_SRCS = \
+  $(EXT_DIR)/xpath/mkr_xpath.c \
+  $(EXT_DIR)/xpath/mkr_xpath_lex.c \
+  $(EXT_DIR)/xpath/mkr_xpath_parse.c \
+  $(EXT_DIR)/xpath/mkr_xpath_shared.c \
+  $(EXT_DIR)/xpath/mkr_xpath_number.c \
+  $(EXT_DIR)/xpath/mkr_xpath_engine_xml.c \
+  $(EXT_DIR)/xpath/mkr_xpath_engine_html.c
+CORE_OBJS  = $(patsubst $(EXT_DIR)/%.c,$(BUILD_DIR)/%.o,$(CORE_SRCS))
+XML_OBJS   = $(patsubst $(EXT_DIR)/%.c,$(BUILD_DIR)/%.o,$(XML_SRCS))
+XPATH_OBJS = $(patsubst $(EXT_DIR)/%.c,$(BUILD_DIR)/%.o,$(XPATH_SRCS))
+.PHONY: all clean check-libfuzzer xml_fuzz xpath_fuzz  # NOTE(review): xml_fuzz/xpath_fuzz are real link outputs (-o $@); as phony targets they relink on every make - confirm intended
+all: check-libfuzzer xml_fuzz xpath_fuzz
+check-libfuzzer:
+	@mkdir -p $(BUILD_DIR)
+	@$(CXX) $(LDFLAGS) -o $(BUILD_DIR)/check_fuzzer check_fuzzer.cc >/dev/null 2>&1 || \
+	  (echo "libFuzzer runtime not available for $(CXX). Install LLVM clang and run with CLANG=/path/to/clang CXX=/path/to/clang++." >&2; exit 1)
+xml_fuzz: $(BUILD_DIR)/xml_fuzz.o $(CORE_OBJS) $(XML_OBJS)
+	$(CXX) $(LDFLAGS) -o $@ $^
+xpath_fuzz: $(BUILD_DIR)/xpath_fuzz.o $(CORE_OBJS) $(XML_OBJS) $(XPATH_OBJS)
+	$(CXX) $(LDFLAGS) -o $@ $^ $(LEXBOR_DST)/lib/liblexbor_static.a
+$(BUILD_DIR)/%.o: $(EXT_DIR)/%.c
+	@mkdir -p $(dir $@)
+	$(CLANG) $(CFLAGS) -c -o $@ $<
+$(BUILD_DIR)/xml_fuzz.o: xml_fuzz.c
+	@mkdir -p $(dir $@)
+	$(CLANG) $(CFLAGS) -c -o $@ $<
+$(BUILD_DIR)/xpath_fuzz.o: xpath_fuzz.c
+	@mkdir -p $(dir $@)
+	$(CLANG) $(CFLAGS) -c -o $@ $<
+clean:
+	rm -rf $(BUILD_DIR) xml_fuzz xpath_fuzz

data/ext/makiri/fuzz/check_fuzzer.cc ADDED Viewed

@@ -0,0 +1,4 @@
+extern "C" int LLVMFuzzerTestOneInput(const unsigned char *, unsigned long)   /* link probe: the fuzz Makefile's check-libfuzzer target builds this with -fsanitize=fuzzer */
+{
+    return 0;   /* deliberately empty: only a successful link against the libFuzzer runtime matters */
+}

data/ext/makiri/fuzz/xml_fuzz.c ADDED Viewed

@@ -0,0 +1,24 @@
+/* xml_fuzz.c - libFuzzer harness for mkr_xml_parse.
+ *
+ * Coverage-guided fuzzing of the XML tokenizer + tree builder.
+ * Ruby-free; runs directly on the pure-C parser surface.
+ */
+#include <stdint.h>
+#include <stddef.h>
+#include "xml/mkr_xml.h"
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)   /* libFuzzer entry point: one call per generated input */
+{
+    mkr_xml_status_t status;   /* passed by address to mkr_xml_parse; never read in this harness */
+    /* The parser contract says "valid UTF-8, NUL-free", but the fuzzer feeds
+     * raw bytes.  mkr_xml_parse is fail-closed: it validates as it goes and
+     * returns an error status on malformed input, never a partial document.
+     * We pass the bytes through untouched so the fuzzer can reach the invalid-
+     * UTF-8 / unexpected-NUL error paths too. */
+    mkr_xml_doc_t *doc = mkr_xml_parse((const char *) data, size, &status);
+    if (doc) {   /* a document came back, so this harness owns it and must release it */
+        mkr_xml_doc_destroy(doc);
+    }
+    return 0;   /* 0 = input processed (the normal libFuzzer return) */
+}

data/ext/makiri/fuzz/xpath_fuzz.c ADDED Viewed

@@ -0,0 +1,109 @@
+/* xpath_fuzz.c - libFuzzer harness for the XPath compile + eval path.
+ *
+ * Coverage-guided fuzzing of the XPath lexer, parser, and evaluator.
+ * We build a small fixed XML document as the evaluation context, then
+ * treat the fuzzer input as the XPath expression string.
+ *
+ * Ruby-free; runs directly on the pure-C engine surface.
+ */
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+#include "core/mkr_alloc.h"
+#include "xml/mkr_xml.h"
+#include "xpath/mkr_xpath.h"
+#include "xpath/mkr_xpath_internal.h"
+/* A small, fixed XML document that gives the evaluator something to walk.
+ * The expression is the fuzzer input; the document is static so the coverage
+ * signal comes from the engine, not the parser. */
+static const char FIXED_XML[] =
+    "<?xml version='1.0'?>"
+    "<root xmlns='http://example.com/default' xmlns:ns='http://example.com/ns'>"
+    "  <a id='1' ns:attr='x'>text1</a>"
+    "  <b id='2'><c/><c/></b>"
+    "  <ns:d>namespaced</ns:d>"
+    "  <!-- comment -->"
+    "  <?pi target='value'?>"
+    "</root>";
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)   /* libFuzzer entry point: `data` is treated as an XPath expression */
+{
+    /* 1. Parse the fixed document. */
+    mkr_xml_status_t status;
+    mkr_xml_doc_t *doc = mkr_xml_parse(FIXED_XML, sizeof(FIXED_XML) - 1, &status);   /* - 1 drops the literal's trailing NUL */
+    if (!doc) return 0;
+    if (!doc->doc_node) {
+        mkr_xml_doc_destroy(doc);
+        return 0;
+    }
+    /* 2. The fuzzer input is the XPath expression.
+     *    The engine text contract requires no interior NUL and a NUL at
+     *    ptr[len]. libFuzzer hands us exactly `size` bytes with no terminator,
+     *    so we copy the expression prefix into an owned, NUL-terminated heap
+     *    buffer and mint the verified-text token over that copy - this is what
+     *    supplies the NUL-termination + no-interior-NUL the lexer's strtod and
+     *    "%.10s" error path rely on. If the input contains a NUL we truncate to
+     *    the prefix (the lexer hits the terminator and reports a syntax error,
+     *    a path worth exercising). UTF-8 validity is deliberately NOT
+     *    pre-checked: the lexer's strict decoder rejecting invalid UTF-8 is
+     *    itself a path the fuzzer should hit. */
+    size_t expr_len = size;   /* shrinks to the index of the first NUL, if there is one */
+    for (size_t i = 0; i < size; i++) {
+        if (data[i] == '\0') {
+            expr_len = i;
+            break;
+        }
+    }
+    char *expr_copy = mkr_strndup((const char *) data, expr_len);   /* presumably expr_len bytes plus a NUL terminator - confirm in core/mkr_alloc.h */
+    if (!expr_copy) {
+        mkr_xml_doc_destroy(doc);
+        return 0;
+    }
+    /* Empty expression is a quick syntax error; still worth a run. */
+    mkr_verified_text_t expr = { expr_copy, expr_len };
+    /* 3. Compile the expression. */
+    mkr_xpath_limits_t limits;
+    mkr_xpath_limits_init_defaults(&limits);
+    /* Tighten the compile-time budgets so a hostile expression fails fast
+     * rather than burning fuzzer time on pathological ASTs. */
+    limits.max_ast_nodes = 10000;
+    limits.max_expr_bytes = 16 * 1024;
+    mkr_xpath_error_t err = {0};
+    mkr_node_t *ast = mkr_parse(expr, &limits, &err);
+    if (!ast) {
+        mkr_xpath_error_clear(&err);
+        free(expr_copy);   /* NOTE(review): free() needs <stdlib.h>, which this file does not include directly - presumably pulled in via core/mkr_alloc.h; confirm */
+        mkr_xml_doc_destroy(doc);
+        return 0;
+    }
+    /* 4. Evaluate against the fixed document. */
+    mkr_xpath_context_t *ctx = mkr_xpath_context_new(doc->doc_node, doc->doc_node);
+    if (ctx) {
+        mkr_xpath_set_engine_kind(ctx, 1); /* XML engine */
+        mkr_xpath_limits_init_defaults(&limits);   /* NOTE(review): `limits` is refilled here, but nothing visible in this function hands it to ctx - confirm these eval budgets actually take effect */
+        limits.max_eval_ops        = 5 * 1000 * 1000; /* 5M ops - enough for a real query */
+        limits.max_nodeset_size    = 10000;
+        limits.max_string_bytes    = 1024 * 1024;
+        limits.max_recursion_depth = 64;
+        mkr_xpath_value_t out = {0};
+        mkr_xpath_error_t eval_err = {0};
+        if (mkr_xpath_eval_compiled(ctx, ast, &out, &eval_err) == 0) {   /* 0 is handled as success: only the result is cleared */
+            mkr_xpath_value_clear(&out);
+        } else {
+            mkr_xpath_error_clear(&eval_err);
+        }
+        mkr_xpath_context_free(ctx);
+    }
+    mkr_node_free(ast);   /* reached whether or not a context could be created */
+    free(expr_copy);
+    mkr_xml_doc_destroy(doc);
+    return 0;
+}