RubyGems - makiri - Versions diffs - 0.2.0 → 0.4.0 - Mend

makiri 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (130) hide show

checksums.yaml +4 -4
data/.github/workflows/conformance.yml +22 -0
data/.github/workflows/libfuzzer.yml +83 -0
data/.github/workflows/release.yml +12 -7
data/.github/workflows/security.yml +88 -3
data/.github/workflows/valgrind.yml +135 -0
data/CHANGELOG.md +152 -15
data/README.md +183 -13
data/Rakefile +294 -7
data/ext/makiri/bridge/bridge.h +28 -0
data/ext/makiri/bridge/ruby_string.c +282 -12
data/ext/makiri/core/mkr_alloc.c +40 -3
data/ext/makiri/core/mkr_alloc.h +28 -5
data/ext/makiri/core/mkr_buf.c +47 -3
data/ext/makiri/core/mkr_buf.h +112 -3
data/ext/makiri/core/mkr_core.c +143 -0
data/ext/makiri/core/mkr_core.h +11 -2
data/ext/makiri/core/mkr_hash.h +1 -1
data/ext/makiri/core/mkr_span.h +186 -0
data/ext/makiri/core/mkr_text.h +8 -8
data/ext/makiri/core/mkr_utf8.c +101 -0
data/ext/makiri/core/mkr_utf8.h +88 -0
data/ext/makiri/extconf.rb +123 -10
data/ext/makiri/fuzz/Makefile +95 -0
data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
data/ext/makiri/fuzz/xml_fuzz.c +24 -0
data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
data/ext/makiri/glue/glue.h +55 -11
data/ext/makiri/glue/ruby_doc.c +129 -59
data/ext/makiri/glue/ruby_html_css.c +292 -0
data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +248 -52
data/ext/makiri/glue/ruby_html_node.c +859 -0
data/ext/makiri/glue/ruby_html_serialize.c +154 -0
data/ext/makiri/glue/ruby_node.c +74 -729
data/ext/makiri/glue/ruby_node_set.c +167 -32
data/ext/makiri/glue/ruby_xml.c +602 -0
data/ext/makiri/glue/ruby_xml_node.c +1373 -0
data/ext/makiri/glue/ruby_xpath.c +63 -30
data/ext/makiri/glue/ruby_xpath.h +19 -0
data/ext/makiri/lexbor_compat/compat.h +42 -9
data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
data/ext/makiri/lexbor_compat/dom_index.c +2 -2
data/ext/makiri/lexbor_compat/post_parse.c +100 -10
data/ext/makiri/lexbor_compat/source_loc.c +15 -13
data/ext/makiri/lexbor_compat/text_index.c +14 -8
data/ext/makiri/lexbor_compat/utf8_input.c +19 -33
data/ext/makiri/makiri.c +184 -6
data/ext/makiri/makiri.h +43 -2
data/ext/makiri/xml/mkr_xml.h +125 -0
data/ext/makiri/xml/mkr_xml_chars.c +195 -0
data/ext/makiri/xml/mkr_xml_index.c +169 -0
data/ext/makiri/xml/mkr_xml_index.h +48 -0
data/ext/makiri/xml/mkr_xml_mutate.c +817 -0
data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
data/ext/makiri/xml/mkr_xml_node.c +399 -0
data/ext/makiri/xml/mkr_xml_node.h +184 -0
data/ext/makiri/xml/mkr_xml_tree.c +1515 -0
data/ext/makiri/xpath/mkr_css.c +1023 -0
data/ext/makiri/xpath/mkr_css.h +65 -0
data/ext/makiri/xpath/mkr_xpath.c +96 -32
data/ext/makiri/xpath/mkr_xpath.h +109 -4
data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +551 -241
data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +318 -276
data/ext/makiri/xpath/mkr_xpath_internal.h +177 -206
data/ext/makiri/xpath/mkr_xpath_lex.c +95 -125
data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +145 -0
data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
data/ext/makiri/xpath/mkr_xpath_parse.c +83 -94
data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
data/ext/makiri/xpath/mkr_xpath_shared.c +609 -0
data/ext/makiri/xpath/mkr_xpath_value_body.h +801 -0
data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
data/lib/makiri/{attribute.rb → attr.rb} +7 -3
data/lib/makiri/cdata_section.rb +19 -0
data/lib/makiri/comment.rb +10 -0
data/lib/makiri/compat_aliases.rb +30 -0
data/lib/makiri/document.rb +9 -73
data/lib/makiri/document_fragment.rb +14 -9
data/lib/makiri/element.rb +4 -4
data/lib/makiri/html/document.rb +106 -0
data/lib/makiri/html/node_methods.rb +19 -0
data/lib/makiri/html.rb +12 -0
data/lib/makiri/node.rb +58 -15
data/lib/makiri/node_set.rb +8 -0
data/lib/makiri/processing_instruction.rb +10 -0
data/lib/makiri/text.rb +1 -1
data/lib/makiri/version.rb +1 -1
data/lib/makiri/xml/builder.rb +263 -0
data/lib/makiri/xml/document.rb +24 -0
data/lib/makiri/xml/node_methods.rb +84 -0
data/lib/makiri/xml.rb +10 -0
data/lib/makiri/xpath_context.rb +1 -1
data/lib/makiri.rb +24 -5
data/script/build_native_gem.rb +2 -2
data/script/check_alloc_failures.rb +266 -0
data/script/check_c_safety.rb +77 -2
data/script/check_c_safety_allowlist.yml +102 -0
data/script/check_leaks.rb +64 -0
data/script/leaks_harness.rb +64 -0
data/vendor/lexbor/CMakeLists.txt +6 -0
data/vendor/lexbor/README.md +12 -0
data/vendor/lexbor/config.cmake +1 -1
data/vendor/lexbor/source/lexbor/core/base.h +1 -1
data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
data/vendor/lexbor/source/lexbor/html/base.h +1 -1
data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
data/vendor/lexbor/source/lexbor/url/base.h +1 -1
data/vendor/lexbor/source/lexbor/url/url.c +5 -2
data/vendor/lexbor/source/lexbor/url/url.h +9 -0
data/vendor/lexbor/version +1 -1
metadata +53 -9
data/ext/makiri/glue/ruby_css.c +0 -185
data/ext/makiri/glue/ruby_serialize.c +0 -92
data/ext/makiri/xpath/mkr_xpath_value.c +0 -1286
data/lib/makiri/cdata.rb +0 -6

data/ext/makiri/core/mkr_buf.h CHANGED Viewed

@@ -2,7 +2,7 @@
 #define MAKIRI_CORE_MKR_BUF_H
 /*
- * mkr_buf_t — an owned, growable, optionally capped byte buffer, kept
+ * mkr_buf_t - an owned, growable, optionally capped byte buffer, kept
  * NUL-terminated. Built on the fail-closed allocators in mkr_alloc.h.
  * (mkr_core.h is a thin umbrella over mkr_alloc.h + mkr_text.h + this.)
  */
@@ -13,14 +13,46 @@
 extern "C" {
 #endif
+/* Memory safety for buffers lives HERE, at the one buffer primitive, not at each
+ * call site: "max == 0" can no longer mean "unbounded". Two ceilings bound every
+ * mkr_buf so a runaway - a cycle, an unbounded loop, or a caller that forgot to
+ * pass a cap - fails closed with MKR_ERR_LIMIT instead of exhausting memory and
+ * freezing the machine:
+ *
+ *   MKR_BUF_DEFAULT_LIMIT  the cap applied when the caller passes max == 0. A
+ *                          conservative default (100 MiB): code that did not
+ *                          think about a bound gets a tight one for free, and a
+ *                          buffer that genuinely needs to be large must opt in
+ *                          EXPLICITLY by passing a larger max.
+ *   MKR_BUF_HARD_MAX       an absolute ceiling on a buffer's CONTENT length, even
+ *                          with an explicit max - the last-resort backstop. The
+ *                          ALLOCATION is bounded by it too: geometric growth is
+ *                          clamped so cap never exceeds HARD_MAX + 1 (the one NUL
+ *                          terminator byte), i.e. it does NOT overshoot to ~2x
+ *                          near the limit. Tight, content-scaled bounds still
+ *                          belong to the caller (e.g. the XML serializer caps
+ *                          itself at a multiple of arena_bytes); this stops total
+ *                          runaway.
+ *
+ * Override either at build time: -DMKR_BUF_DEFAULT_LIMIT=<bytes> / -DMKR_BUF_HARD_MAX=<bytes>. */
+#ifndef MKR_BUF_DEFAULT_LIMIT
+#define MKR_BUF_DEFAULT_LIMIT ((size_t)100 << 20)   /* 100 MiB (100 * 2^20 bytes): the ceiling applied when max == 0 */
+#endif
+#ifndef MKR_BUF_HARD_MAX
+#define MKR_BUF_HARD_MAX ((size_t)4 << 30)        /* 4 GiB (2^32 bytes). NOTE(review): with a 32-bit size_t this expression evaluates to 0 - confirm such builds override it via -DMKR_BUF_HARD_MAX */
+#endif
 typedef struct {
     char  *data; /* owned; kept NUL-terminated after any append */
     size_t len;  /* bytes used (excluding the terminator) */
     size_t cap;  /* bytes allocated */
-    size_t max;  /* 0 = unbounded; else append past max returns MKR_ERR_LIMIT */
+    size_t max;  /* 0 = the conservative MKR_BUF_DEFAULT_LIMIT; else this value -
+                  * either way clamped by MKR_BUF_HARD_MAX (past it -> ERR_LIMIT) */
 } mkr_buf_t;
-/* Initialise an empty buffer. max == 0 means unbounded. */
+/* Initialise an empty buffer. max == 0 applies the conservative default ceiling
+ * (MKR_BUF_DEFAULT_LIMIT) - it is NOT unbounded; pass an explicit (larger or
+ * smaller) value to opt into a different bound, always under MKR_BUF_HARD_MAX. */
 static inline void
 mkr_buf_init(mkr_buf_t *b, size_t max)
 {
@@ -35,6 +67,12 @@ mkr_buf_init(mkr_buf_t *b, size_t max)
  * failure (the buffer is left intact in every failure case). n == 0 is a no-op. */
 mkr_status_t mkr_buf_append(mkr_buf_t *b, const void *bytes, size_t n);
+/* Pre-allocate capacity for at least n bytes (best-effort: a request beyond the
+ * buffer's max ceiling is clamped to that ceiling rather than refused), so a fill
+ * of known approximate size avoids per-append reallocs. A no-op if the buffer
+ * already has room. Returns MKR_ERR_OOM on overflow / allocation failure (the
+ * buffer is left intact). */
+mkr_status_t mkr_buf_reserve(mkr_buf_t *b, size_t n);
 /* Take ownership of the (NUL-terminated) bytes; the buffer is reset to empty.
  * Returns a freshly owned "" for an empty buffer, or NULL on OOM. */
 char *mkr_buf_steal(mkr_buf_t *b, size_t *out_len);
@@ -48,6 +86,77 @@ mkr_buf_free(mkr_buf_t *b)
     b->len = b->cap = 0;
 }
+/* ------------------------------------------------------------------------- */
+/* mkr_spanbuf_t - a bounded writer over a borrowed, fixed-capacity buffer    */
+/* ------------------------------------------------------------------------- */
+/*
+ * "span" = a non-owning view of a fixed-extent contiguous region; a spanbuf is
+ * the bounded *writer* over such a span. The complement to mkr_buf_t: where
+ * mkr_buf_t GROWS and OWNS its malloc storage, a spanbuf BORROWS a fixed buffer
+ * owned elsewhere (e.g. an arena cut) and never grows it.
+ *
+ * The name leads with "borrowed/fixed" deliberately, for safety: the
+ * fixed/bounded property is SELF-ENFORCED (a write past `cap` is refused, so
+ * misunderstanding it costs at most a truncation, caught by _finish), but the
+ * BORROWED property is the caller's responsibility - the type cannot stop you
+ * from free()ing `buf` or holding `finish()`'s pointer past the owner's
+ * lifetime, and getting that wrong is a use-after-free / double-free. So:
+ *   - never free() buf (the owner does, e.g. the arena is freed wholesale);
+ *   - finish()'s pointer is valid only while the backing storage lives.
+ *
+ * Bounds safety is BY CONSTRUCTION (the writer owns the cursor + check, not the
+ * caller's per-write guard): an over-long write is refused and latches `ok` to
+ * false (sticky). The caller's only duty is one check at the end (via _finish or
+ * the public `ok` field), failing closed rather than using a truncated buffer.
+ * This is the sanctioned way to hand-fill a raw region; see mkr_xml_arena_spanbuf
+ * for the arena adapter.
+ */
+typedef struct {
+    char  *buf;   /* borrowed; the owner keeps it alive - do NOT free. NULL => never ok. */
+    size_t cap;   /* capacity in bytes (fixed: putc/write only read it, never change it) */
+    size_t pos;   /* bytes written so far (always <= cap); advanced only by an accepted write */
+    bool   ok;    /* false once a write would overflow, or buf was NULL; sticky - putc/write never set it back to true */
+} mkr_spanbuf_t;
+/* Wrap [buf, buf+cap) for bounded writing. buf == NULL yields a permanently
+ * not-ok writer (every write a no-op), so an upstream allocation failure flows
+ * straight through without a separate guard at each call site.
+ * The writer is returned by value with pos = 0; cap is recorded exactly as
+ * given (also when buf is NULL), and ok is simply (buf != NULL). Nothing is
+ * allocated or written here. */
+static inline mkr_spanbuf_t
+mkr_spanbuf(char *buf, size_t cap)
+{
+    return (mkr_spanbuf_t){ .buf = buf, .cap = cap, .pos = 0, .ok = (buf != NULL) };
+}
+/* Append one byte; refuse (latch ok=false) if it would exceed cap.
+ * A writer that is already not-ok ignores the call entirely, and a refused
+ * write leaves pos unchanged, so the bytes stored so far are never disturbed. */
+static inline void
+mkr_spanbuf_putc(mkr_spanbuf_t *b, char c)
+{
+    if (!b->ok) return; /* sticky failure (or NULL buf): nothing more is written */
+    if (b->pos >= b->cap) { b->ok = false; return; } /* full: latch instead of overrunning */
+    b->buf[b->pos++] = c; /* pos < cap was just established, so this index is in bounds */
+}
+/* Append n bytes; refuse (latch ok=false) if they would exceed cap. n == 0 is a
+ * no-op. pos <= cap is the invariant (a refused write never advances pos), so
+ * cap - pos cannot underflow; the pos > cap arm is belt-and-suspenders.
+ * The write is all-or-nothing: either all n bytes are copied or none are (no
+ * partial fill). The copy uses memcpy, so src must not overlap the destination
+ * region. src is not inspected when n == 0 or when the writer is not ok. */
+static inline void
+mkr_spanbuf_write(mkr_spanbuf_t *b, const void *src, size_t n)
+{
+    if (!b->ok || n == 0) return; /* latched failure or empty write: touch nothing */
+    if (b->pos > b->cap || n > b->cap - b->pos) { b->ok = false; return; } /* compare against the room left, avoiding a pos + n overflow */
+    memcpy(b->buf + b->pos, src, n);
+    b->pos += n;
+}
+/* The filled prefix [buf, buf+pos), or NULL if any write was refused (or buf was
+ * NULL); on a non-NULL return the length is `b->pos`. Forces the caller through a
+ * single fail-closed check instead of trusting the writes individually.
+ * The writer is not modified and nothing is appended here (no terminator is
+ * written): the returned bytes are exactly what putc/write stored. An ok writer
+ * with pos == 0 still returns buf (an empty prefix), not NULL. */
+static inline const char *
+mkr_spanbuf_finish(const mkr_spanbuf_t *b)
+{
+    return b->ok ? b->buf : NULL;
+}
 #ifdef __cplusplus
 }
 #endif

data/ext/makiri/core/mkr_core.c CHANGED Viewed

@@ -18,6 +18,12 @@ mkr_core_selftest(void)
     if (!mkr_size_mul(4, 5, &out) || out != 20) return 3;
     if (mkr_size_mul(SIZE_MAX / 2 + 1, 2, &out)) return 4;  /* must overflow */
     if (!mkr_size_mul(0, SIZE_MAX, &out) || out != 0) return 5;
+    /* exact-boundary: the largest non-overflowing result MUST succeed (catches an
+     * off-by-one in the `a > SIZE_MAX - b` / `b > SIZE_MAX / a` predicates). */
+    if (!mkr_size_add(SIZE_MAX, 0, &out) || out != SIZE_MAX) return 55;
+    if (!mkr_size_add(SIZE_MAX - 1, 1, &out) || out != SIZE_MAX) return 56;
+    if (!mkr_size_mul(SIZE_MAX, 1, &out) || out != SIZE_MAX) return 57;
+    if (!mkr_size_mul(SIZE_MAX / 2, 2, &out) || out != (SIZE_MAX / 2) * 2) return 58;
     /* grow_capacity: geometric, and overflow on a huge element */
     if (!mkr_grow_capacity(0, 1, 1, &out) || out < 1) return 6;
@@ -39,6 +45,9 @@ mkr_core_selftest(void)
     /* str_alloc / strndup / strdup: copy, terminate, fail closed on NULL+len */
     {
         if (mkr_str_alloc(SIZE_MAX) != NULL) return 26;          /* n + 1 overflow */
+        char *z = mkr_str_alloc(0);                              /* boundary: 0 -> 1-byte "\0" */
+        if (z == NULL || z[0] != '\0') { free(z); return 59; }
+        free(z);
         char *p = mkr_strndup("hello", 3);
         if (p == NULL || memcmp(p, "hel", 4) != 0) { free(p); return 27; } /* "hel\0" */
         free(p);
@@ -97,5 +106,139 @@ mkr_core_selftest(void)
         free(s);
     }
+    /* spanbuf: writes that fit, then one that overruns -> refused + sticky */
+    {
+        char store[4];
+        mkr_spanbuf_t f = mkr_spanbuf(store, sizeof store);
+        mkr_spanbuf_putc(&f, 'a');
+        mkr_spanbuf_write(&f, "bc", 2);                  /* pos == 3, room for 1 */
+        if (!f.ok || f.pos != 3) return 35;
+        mkr_spanbuf_write(&f, "de", 2);                  /* exceeds cap -> refused */
+        if (f.ok) return 36;                              /* must have latched */
+        if (f.pos != 3) return 37;                        /* refused write didn't advance */
+        if (mkr_spanbuf_finish(&f) != NULL) return 38;   /* not-ok -> NULL */
+        if (memcmp(store, "abc", 3) != 0) return 39;      /* no overrun past pos */
+    }
+    /* spanbuf: exact fill is ok; a NULL backing buffer is never ok */
+    {
+        char store[2];
+        mkr_spanbuf_t f = mkr_spanbuf(store, sizeof store);
+        mkr_spanbuf_write(&f, "xy", 2);                  /* exactly cap -> still ok */
+        if (!f.ok || mkr_spanbuf_finish(&f) != store || f.pos != 2) return 40;
+        mkr_spanbuf_putc(&f, 'z');                       /* boundary: at pos==cap -> refused */
+        if (f.ok || f.pos != 2) return 60;
+        mkr_spanbuf_t g = mkr_spanbuf(NULL, 8);         /* alloc-failed backing */
+        mkr_spanbuf_putc(&g, 'z');                       /* no-op, no crash */
+        if (g.ok || mkr_spanbuf_finish(&g) != NULL) return 41;
+    }
+    /* grow_reserve: grows + updates ptr/cap; already-enough is a no-op; an
+     * overflowing (need*elem) request fails closed (OOM) leaving ptr/cap as-is. */
+    {
+        size_t *p = NULL, cap = 0;
+        if (mkr_grow_reserve((void **)&p, &cap, 10, sizeof(*p)) != MKR_OK) { free(p); return 42; }
+        if (p == NULL || cap < 10) { free(p); return 43; }
+        p[0] = 1; p[9] = 2;                                 /* in-bounds after grow */
+        size_t *d0 = p; size_t c0 = cap;
+        if (mkr_grow_reserve((void **)&p, &cap, 5, sizeof(*p)) != MKR_OK) { free(p); return 44; }
+        if (p != d0 || cap != c0) { free(p); return 45; }   /* need < cap -> no-op */
+        if (mkr_grow_reserve((void **)&p, &cap, cap, sizeof(*p)) != MKR_OK) { free(p); return 61; }
+        if (p != d0 || cap != c0) { free(p); return 62; }   /* boundary: need == cap -> no-op */
+        if (mkr_grow_reserve((void **)&p, &cap, SIZE_MAX, sizeof(*p)) != MKR_ERR_OOM) { free(p); return 46; }
+        if (p != d0 || cap != c0) { free(p); return 47; }   /* overflow -> unchanged */
+        free(p);
+    }
+    /* buf_reserve: pre-allocates capacity without touching len; no-op when room
+     * already exists; clamps a request to the buffer's max (never allocates past
+     * it); the buffer stays usable afterwards. */
+    {
+        mkr_buf_t b;
+        mkr_buf_init(&b, 16);                                /* small ceiling */
+        if (mkr_buf_reserve(&b, 8) != MKR_OK) { mkr_buf_free(&b); return 48; }
+        if (b.cap < 9 || b.len != 0) { mkr_buf_free(&b); return 49; } /* reserved (+NUL), still empty */
+        char *d0 = b.data; size_t c0 = b.cap;
+        if (mkr_buf_reserve(&b, 4) != MKR_OK) { mkr_buf_free(&b); return 50; }
+        if (b.data != d0 || b.cap != c0) { mkr_buf_free(&b); return 51; } /* already room -> no-op */
+        if (mkr_buf_reserve(&b, (size_t)1 << 40) != MKR_OK) { mkr_buf_free(&b); return 52; }
+        if (b.cap > 17) { mkr_buf_free(&b); return 53; }     /* clamped to max(16)+NUL, no huge alloc */
+        if (mkr_buf_append(&b, "abc", 3) != MKR_OK || b.len != 3) { mkr_buf_free(&b); return 54; }
+        mkr_buf_free(&b);
+    }
+    /* buf_append: geometric growth is clamped to the content ceiling (max + the
+     * NUL), so a near-limit append never over-allocates to ~2x. Here filling to
+     * max(10) would geometrically reach cap 16; the clamp holds it at max+1(11),
+     * the same property that keeps a real buffer's cap under MKR_BUF_HARD_MAX. */
+    {
+        mkr_buf_t b;
+        mkr_buf_init(&b, 10);
+        if (mkr_buf_append(&b, "0123456789", 10) != MKR_OK) { mkr_buf_free(&b); return 63; }
+        if (b.len != 10 || b.cap > 11) { mkr_buf_free(&b); return 64; }  /* clamped, not the geometric 16 */
+        if (mkr_buf_append(&b, "x", 1) != MKR_ERR_LIMIT) { mkr_buf_free(&b); return 65; } /* still capped */
+        mkr_buf_free(&b);
+    }
+    /* span: bounded reads - in-bounds values, out-of-bounds -1/false/clamp,
+     * never an overrun. Exercises every helper at its boundary. */
+    {
+        mkr_span_t s = mkr_span("abc", 3);
+        if (mkr_span_left(&s) != 3 || mkr_span_peek(&s) != 'a') return 66;
+        if (mkr_span_at(&s, 2) != 'c' || mkr_span_at(&s, 3) != -1) return 67; /* exact bound */
+        if (!mkr_span_starts(&s, "abc", 3) || mkr_span_starts(&s, "abcd", 4)) return 68;
+        if (mkr_span_take(&s) != 'a' || mkr_span_left(&s) != 2) return 69;
+        size_t idx = 99;
+        if (!mkr_span_find(&s, 'c', &idx) || idx != 1) return 70;
+        if (mkr_span_find(&s, 'z', &idx)) return 71;             /* absent -> false */
+        const char *mark = mkr_span_mark(&s);
+        mkr_span_skip(&s, 5);                                     /* clamped to the 2 left */
+        if (mkr_span_left(&s) != 0 || mkr_span_since(&s, mark) != 2) return 72;
+        if (mkr_span_peek(&s) != -1 || mkr_span_take(&s) != -1) return 73; /* empty: every read -1 */
+        if (mkr_span_take(&s) != -1) return 74;                   /* and stays empty (no underflow) */
+        mkr_span_t e = mkr_span(NULL, 8);   /* NULL -> empty span over a VALID address */
+        if (e.p == NULL || e.p != e.end) return 89;               /* normalized, no NULL arithmetic */
+        if (mkr_span_left(&e) != 0 || mkr_span_peek(&e) != -1 || mkr_span_starts(&e, "x", 1)) return 75;
+        if (!mkr_span_starts(&e, NULL, 0)) return 90;             /* zero-length literal: always true */
+        mkr_span_t base = mkr_span("abcdef", 6);
+        mkr_span_t tl = mkr_span_tail(&base, 2);                  /* sub-span, parent unconsumed */
+        if (mkr_span_peek(&tl) != 'c' || mkr_span_left(&tl) != 4) return 91;
+        if (mkr_span_peek(&base) != 'a' || mkr_span_left(&base) != 6) return 92;
+        mkr_span_t tc = mkr_span_tail(&base, 9);                  /* past the end: clamped empty */
+        if (mkr_span_left(&tc) != 0 || mkr_span_peek(&tc) != -1) return 93;
+        if (!mkr_bytes_eq("ab", 2, "ab", 2) || mkr_bytes_eq("ab", 2, "ac", 2)
+            || mkr_bytes_eq("a", 1, "ab", 2) || !mkr_bytes_eq(NULL, 0, "x", 0)) return 76;
+        /* peek/at return bytes as 0..255, never sign-extended negatives */
+        mkr_span_t hb = mkr_span("\xFF", 1);
+        if (mkr_span_peek(&hb) != 0xFF || mkr_span_at(&hb, 0) != 0xFF) return 77;
+    }
+    /* utf8_decode1: strict - valid lengths 1-4; truncation / overlong /
+     * surrogate / out-of-range / bad lead all fail closed with 0. */
+    {
+        uint32_t cp = 0;
+        if (mkr_utf8_decode1((const unsigned char *)"A", 1, &cp) != 1 || cp != 'A') return 78;
+        if (mkr_utf8_decode1((const unsigned char *)"\xC3\xA9", 2, &cp) != 2 || cp != 0xE9u) return 79;
+        if (mkr_utf8_decode1((const unsigned char *)"\xE3\x81\x82", 3, &cp) != 3 || cp != 0x3042u) return 80;
+        if (mkr_utf8_decode1((const unsigned char *)"\xF0\x9F\x98\x80", 4, &cp) != 4 || cp != 0x1F600u) return 81;
+        if (mkr_utf8_decode1((const unsigned char *)"", 0, &cp) != 0) return 82;                 /* empty */
+        if (mkr_utf8_decode1((const unsigned char *)"\xC3", 1, &cp) != 0) return 83;             /* truncated */
+        if (mkr_utf8_decode1((const unsigned char *)"\xC0\xAF", 2, &cp) != 0) return 84;         /* overlong */
+        if (mkr_utf8_decode1((const unsigned char *)"\xED\xA0\x80", 3, &cp) != 0) return 85;     /* surrogate */
+        if (mkr_utf8_decode1((const unsigned char *)"\xF4\x90\x80\x80", 4, &cp) != 0) return 86; /* > U+10FFFF */
+        if (mkr_utf8_decode1((const unsigned char *)"\x80", 1, &cp) != 0) return 87;             /* stray cont. */
+        if (mkr_utf8_decode1((const unsigned char *)"\xC3\x28", 2, &cp) != 0) return 88;         /* bad cont. */
+    }
     return 0;
 }

data/ext/makiri/core/mkr_core.h CHANGED Viewed

@@ -8,14 +8,23 @@
  *   mkr_alloc.h  fail-closed size arithmetic + allocators (the foundation)
  *   mkr_hash.h   pointer hash + power-of-two sizer (pointer-keyed index tables)
  *   mkr_text.h   string-type lattice (owned/borrowed/verified text + bytes)
- *   mkr_buf.h    mkr_buf_t (growable, capped byte buffer)
+ *   mkr_utf8.h   the one pure-C UTF-8 validator + strict 1-cp decoder
+ *   mkr_buf.h    mkr_buf_t (growable, capped byte buffer) + mkr_spanbuf_t
+ *   mkr_span.h   mkr_span_t (bounded reader - the spanbuf's read twin)
  *
- * NOTHING here touches Ruby — exception mapping happens at the glue boundary.
+ * NOTHING here touches Ruby - exception mapping happens at the glue boundary.
  */
 #include "mkr_alloc.h"
 #include "mkr_hash.h"
 #include "mkr_text.h"
+#include "mkr_span.h"
+#include "mkr_utf8.h"
 #include "mkr_buf.h"
+/* Self-test of the overflow / allocation / buffer edge cases (incl. paths real
+ * inputs cannot reach). Returns 0 on success, nonzero on the first failure.
+ * Wired to a private Ruby method for the spec suite. */
+int mkr_core_selftest(void);
 #endif /* MAKIRI_CORE_MKR_CORE_H */

data/ext/makiri/core/mkr_hash.h CHANGED Viewed

@@ -35,7 +35,7 @@ mkr_ptr_hash(const void *p)
 /* Smallest power of two >= n, into *out. Returns false on overflow (no power of
  * two >= n fits in size_t) so the caller fails closed rather than sizing a
- * power-of-two hash table below the element count it must hold — which would
+ * power-of-two hash table below the element count it must hold - which would
  * never find a free slot under linear probing. Shared by the pointer-keyed
  * indexes (attr->owner, text-index). */
 static inline bool

data/ext/makiri/core/mkr_span.h ADDED Viewed

@@ -0,0 +1,186 @@
+#ifndef MAKIRI_CORE_MKR_SPAN_H
+#define MAKIRI_CORE_MKR_SPAN_H
+/*
+ * mkr_span_t - a bounded READER over a borrowed byte region: the read twin of
+ * mkr_spanbuf_t (mkr_buf.h), completing the structural-safety model:
+ *
+ *               allocate            write              read
+ *   structure   checked wrappers    mkr_spanbuf_t      mkr_span_t (this)
+ *   enforced    lint (direct_alloc) (arena's only way) lint (raw_scan_call /
+ *                                                            raw_cursor_member)
+ *
+ * Byte-scanning parsers used to guard every read by convention ("check
+ * p < end, then *p") - correct at every current site, but a single forgotten
+ * check is invisible to the compiler. A span owns the cursor AND the bound:
+ * an out-of-bounds read is not an overrun but a -1 / false / clamp, so the
+ * unchecked-read bug class is structurally impossible inside a converted TU.
+ * The lint (script/check_c_safety.rb) bans the raw scanning primitives in the
+ * converted parser TUs, turning the convention into a machine-enforced rule.
+ *
+ * Every helper is a static inline whose bound check compiles to the same
+ * single compare the hand-written guard used - performance-neutral.
+ *
+ * Like a spanbuf, the BORROW is the caller's responsibility: the span cannot
+ * stop you from outliving the buffer it views (input lifetime is handled at
+ * the bridge: parse entries copy, borrowed slices never cross a GC point).
+ * mkr_span_mark/mkr_span_since exist to CAPTURE slices (pointer arithmetic,
+ * never a dereference); reading a captured slice goes back through a span or
+ * an audited core primitive.
+ */
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+typedef struct {
+    const char *p;    /* cursor (always <= end); advanced only by mkr_span_take / mkr_span_skip */
+    const char *end;  /* one past the last readable byte; used as a bound, never read through */
+} mkr_span_t;
+/* Wrap [ptr, ptr+len) for bounded reading. ptr == NULL yields the empty span
+ * (every read -1 / false), so an absent input flows through without a guard.
+ * The NULL case is normalized to a VALID empty address (not p = end = NULL):
+ * the helpers do pointer subtraction and relational compares, which C defines
+ * only within one object - both `NULL - NULL` and `NULL < NULL` are formally
+ * UB - so the normalization here keeps every helper unconditional AND
+ * well-defined. For a NULL ptr the len argument is ignored (treated as 0).
+ * For a non-NULL ptr, len is trusted: the caller guarantees that
+ * [ptr, ptr+len) is readable for as long as the span is in use. */
+static inline mkr_span_t
+mkr_span(const char *ptr, size_t len)
+{
+    if (ptr == NULL) { ptr = ""; len = 0; } /* point at a real (empty) object so p == end is a valid address */
+    mkr_span_t s;
+    s.p   = ptr;
+    s.end = ptr + len;
+    return s;
+}
+/* Bytes remaining, i.e. end - p. Relies on the span invariant p <= end (the
+ * difference is cast to size_t without a sign check); 0 means end-of-span. */
+static inline size_t
+mkr_span_left(const mkr_span_t *s)
+{
+    return (size_t)(s->end - s->p);
+}
+/* The byte at the cursor as 0..255, or -1 at end-of-span. The cursor is not
+ * moved. The byte is read through unsigned char, so values >= 0x80 come back
+ * positive and can never be confused with the -1 end marker. */
+static inline int
+mkr_span_peek(const mkr_span_t *s)
+{
+    return s->p < s->end ? (unsigned char)*s->p : -1;
+}
+/* The byte at cursor+i (bounded lookahead), or -1 past the end. The cursor is
+ * not moved. The index is compared against the remaining length BEFORE any
+ * address is formed, so an arbitrarily large i is safe; the byte is returned
+ * as 0..255 (read through unsigned char). */
+static inline int
+mkr_span_at(const mkr_span_t *s, size_t i)
+{
+    return i < mkr_span_left(s) ? (unsigned char)s->p[i] : -1;
+}
+/* Consume and return the byte at the cursor, or -1 at end-of-span. The cursor
+ * advances by one only when a byte is actually returned; at end-of-span it
+ * stays put, so repeated calls keep returning -1. Bytes come back as 0..255. */
+static inline int
+mkr_span_take(mkr_span_t *s)
+{
+    return s->p < s->end ? (unsigned char)*s->p++ : -1;
+}
+/* Advance up to n bytes (clamped at the end - never past it). Asking for more
+ * than remains leaves the span empty (p == end); no bytes are read. */
+static inline void
+mkr_span_skip(mkr_span_t *s, size_t n)
+{
+    size_t left = mkr_span_left(s);
+    s->p += (n <= left) ? n : left; /* step by min(n, left) so the cursor never passes end */
+}
+/* True if the remaining input begins with the n-byte literal. n == 0 is true
+ * regardless of lit (even NULL - aligned with mkr_bytes_eq; also keeps the
+ * memcmp away from a possibly-NULL lit, which is formally UB even for 0).
+ * The cursor is not moved. For n > 0, lit must point at n readable bytes; the
+ * length check runs first, so memcmp never reads past the end of the span. */
+static inline bool
+mkr_span_starts(const mkr_span_t *s, const char *lit, size_t n)
+{
+    if (n == 0) return true;
+    return mkr_span_left(s) >= n && memcmp(s->p, lit, n) == 0;
+}
+/* Find byte c in the remaining input; true + its offset from the cursor in
+ * *idx, or false if absent (idx untouched). The search covers exactly the
+ * mkr_span_left(s) remaining bytes and reports the FIRST occurrence (memchr);
+ * the cursor is not moved. idx must be non-NULL - it is written on every hit. */
+static inline bool
+mkr_span_find(const mkr_span_t *s, char c, size_t *idx)
+{
+    const char *hit = (const char *)memchr(s->p, c, mkr_span_left(s));
+    if (hit == NULL) return false;
+    *idx = (size_t)(hit - s->p); /* hit >= p, so the offset is non-negative */
+    return true;
+}
+/* The remaining input from cursor+off onward, as a fresh sub-span (clamped at
+ * the end). For bounded lookahead scans that must not consume the parent.
+ * The parent is copied by value and only the copy is advanced; an off beyond
+ * the remaining length yields an empty sub-span positioned at end. The result
+ * borrows the same bytes as the parent. */
+static inline mkr_span_t
+mkr_span_tail(const mkr_span_t *s, size_t off)
+{
+    mkr_span_t t = *s;
+    mkr_span_skip(&t, off);
+    return t;
+}
+/* The cursor position, for capturing a slice start (an address, NEVER read
+ * through directly - pair with mkr_span_since and hand the slice to a span or
+ * an audited primitive). Returns the current value of s->p; the span is not
+ * modified. */
+static inline const char *
+mkr_span_mark(const mkr_span_t *s)
+{
+    return s->p;
+}
+/* Bytes consumed since `mark` (a prior mkr_span_mark of the SAME span), i.e.
+ * p - mark. Nothing is dereferenced and nothing is checked: the caller must
+ * pass a mark taken from this span at or before the current cursor, because
+ * the difference is cast to size_t as-is. */
+static inline size_t
+mkr_span_since(const mkr_span_t *s, const char *mark)
+{
+    return (size_t)(s->p - mark);
+}
+/* Length-checked slice equality (the audited replacement for an open-coded
+ * memcmp over two captured slices). Zero-length slices are equal regardless
+ * of pointers (a NULL "" never gets dereferenced). Slices of different
+ * lengths are unequal without any byte being read; memcmp runs only when the
+ * lengths match and are non-zero. */
+static inline bool
+mkr_bytes_eq(const void *a, size_t alen, const void *b, size_t blen)
+{
+    return alen == blen && (alen == 0 || memcmp(a, b, alen) == 0);
+}
+/* Substring search: find the first occurrence of [needle, needle+needle_len)
+ * within [hay, hay+hay_len). On a hit returns true and writes the byte offset to
+ * *idx; on a miss returns false and leaves *idx untouched. The audited
+ * replacement for an open-coded substring memcmp (a scan, so it lives in core).
+ * Boundary behavior mirrors mkr_bytes_eq's: lengths are the truth, pointers are
+ * never dereferenced at length 0 -
+ *   - an empty needle (needle_len == 0) matches at offset 0 in ANY haystack,
+ *     including an empty or NULL "" one (the substring analogue of two empty
+ *     slices comparing equal);
+ *   - a needle longer than the haystack never matches, so a NULL/empty haystack
+ *     misses every non-empty needle without a read.
+ * idx must be non-NULL (it is written on every hit, including the empty-needle
+ * case). The scan is a plain left-to-right walk over every offset at which the
+ * needle still fits, testing each candidate with memcmp. */
+static inline bool
+mkr_bytes_find(const void *hay, size_t hay_len,
+               const void *needle, size_t needle_len, size_t *idx)
+{
+    if (needle_len == 0) { *idx = 0; return true; }
+    if (needle_len > hay_len) return false;
+    const char *h = (const char *)hay;
+    const char *n = (const char *)needle;
+    size_t last = hay_len - needle_len; /* highest start offset at which the needle fits; cannot underflow after the length check */
+    for (size_t i = 0; i <= last; ++i) {
+        if (h[i] == n[0] && memcmp(h + i, n, needle_len) == 0) { /* first-byte compare filters candidates before the full memcmp */
+            *idx = i;
+            return true;
+        }
+    }
+    return false;
+}
+#ifdef __cplusplus
+}
+#endif
+#endif /* MAKIRI_CORE_MKR_SPAN_H */

data/ext/makiri/core/mkr_text.h CHANGED Viewed

@@ -16,7 +16,7 @@ extern "C" {
 #endif
 /* ---------------------------------------------------------------- */
-/* mkr_verified_text_t — a string proven to meet the engine text contract */
+/* mkr_verified_text_t - a string proven to meet the engine text contract */
 /* ---------------------------------------------------------------- */
 /* A borrowed byte slice whose contents are guaranteed to satisfy Makiri's
@@ -39,7 +39,7 @@ typedef struct {
 /*
  * Makiri's string types form a small lattice over two axes plus a shape marker.
  * They look alike ({ptr,len}) but C has no subtyping, so each contract is its
- * own type — that distinctness IS the guarantee, and is why there is no single
+ * own type - that distinctness IS the guarantee, and is why there is no single
  * "string" type.
  *
  *   axis 1  ownership : borrowed (we never free) | owned (free via *_clear)
@@ -51,7 +51,7 @@ typedef struct {
  *   shape \ contract        raw (bytes)               valid (text)
  *   ----------------------  ------------------------  -------------------------
  *   ruby-anchored borrowed  mkr_ruby_borrowed_bytes_t mkr_ruby_borrowed_text_t  (bridge.h)
- *   borrowed slice          (none yet — would be      mkr_borrowed_text_t /
+ *   borrowed slice          (none yet - would be      mkr_borrowed_text_t /
  *                            mkr_borrowed_bytes_t)     mkr_verified_text_t (*)
  *   owned                   mkr_owned_bytes_t         mkr_owned_text_t
  *
@@ -65,22 +65,22 @@ typedef struct {
  *   cannot reach the engine's public API. Internally the engine carries the
  *   freely-constructible mkr_borrowed_text_t instead.
  *
- * Conversions — the only sanctioned edges. The points that actually VALIDATE
+ * Conversions - the only sanctioned edges. The points that actually VALIDATE
  * raw bytes are the bridge's checked entry points; everything else only moves
  * already-valid text between shapes (no edge re-validates, and none turns raw
  * bytes into text without one of those checks):
- *   validate raw -> valid : the bridge's checked entry points only —
+ *   validate raw -> valid : the bridge's checked entry points only -
  *                           mkr_ruby_verified_text / mkr_ruby_try_verified_text
  *                           (both validate UTF-8 + no NUL); never a cast.
  *   drop the GC anchor    : mkr_verified_text_from_view (ruby_borrowed_text -> verified_text)
  *   assert valid (no copy) : mkr_borrowed_text (const char*,len -> borrowed_text)
- *                            — caller asserts the bytes already meet the contract
+ *                            - caller asserts the bytes already meet the contract
  *   downgrade to borrow   : mkr_borrowed_text_from_owned (owned_text -> borrowed_text)
  *                           mkr_borrowed_text_from_verified (verified_text -> borrowed_text)
  *   copy into owned       : mkr_owned_text_from_borrowed_copy /
- *                           mkr_owned_text_from_buf_steal — accept only
+ *                           mkr_owned_text_from_buf_steal - accept only
  *                           already-asserted-valid text; they copy, not validate.
- *   take ownership        : mkr_owned_text (char*,len -> owned_text) — caller
+ *   take ownership        : mkr_owned_text (char*,len -> owned_text) - caller
  *                           transfers an already-valid heap buffer it produced
  *                           (substring/concat/format output); asserts validity.
  */