RubyGems - makiri - Versions diffs - 0.3.0 → 0.4.0 - Mend

makiri 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (94) hide show

checksums.yaml +4 -4
data/.github/workflows/conformance.yml +22 -0
data/.github/workflows/libfuzzer.yml +83 -0
data/.github/workflows/security.yml +88 -3
data/.github/workflows/valgrind.yml +135 -0
data/CHANGELOG.md +60 -2
data/README.md +81 -77
data/Rakefile +194 -3
data/ext/makiri/bridge/ruby_string.c +119 -66
data/ext/makiri/core/mkr_alloc.c +40 -3
data/ext/makiri/core/mkr_alloc.h +27 -4
data/ext/makiri/core/mkr_buf.c +13 -3
data/ext/makiri/core/mkr_buf.h +80 -5
data/ext/makiri/core/mkr_core.c +143 -0
data/ext/makiri/core/mkr_core.h +10 -1
data/ext/makiri/core/mkr_span.h +186 -0
data/ext/makiri/core/mkr_utf8.c +101 -0
data/ext/makiri/core/mkr_utf8.h +88 -0
data/ext/makiri/extconf.rb +104 -9
data/ext/makiri/fuzz/Makefile +95 -0
data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
data/ext/makiri/fuzz/xml_fuzz.c +24 -0
data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
data/ext/makiri/glue/glue.h +8 -0
data/ext/makiri/glue/ruby_doc.c +20 -24
data/ext/makiri/glue/ruby_html_css.c +58 -12
data/ext/makiri/glue/ruby_html_mutate.c +11 -6
data/ext/makiri/glue/ruby_html_node.c +3 -32
data/ext/makiri/glue/ruby_node.c +39 -0
data/ext/makiri/glue/ruby_xml.c +198 -16
data/ext/makiri/glue/ruby_xml_node.c +46 -59
data/ext/makiri/glue/ruby_xpath.c +4 -4
data/ext/makiri/lexbor_compat/source_loc.c +14 -16
data/ext/makiri/lexbor_compat/utf8_input.c +5 -78
data/ext/makiri/makiri.c +45 -0
data/ext/makiri/xml/mkr_xml.h +2 -3
data/ext/makiri/xml/mkr_xml_chars.c +67 -97
data/ext/makiri/xml/mkr_xml_index.c +169 -0
data/ext/makiri/xml/mkr_xml_index.h +48 -0
data/ext/makiri/xml/mkr_xml_mutate.c +63 -121
data/ext/makiri/xml/mkr_xml_node.c +147 -15
data/ext/makiri/xml/mkr_xml_node.h +71 -6
data/ext/makiri/xml/mkr_xml_tree.c +185 -149
data/ext/makiri/xpath/mkr_css.c +1023 -0
data/ext/makiri/xpath/mkr_css.h +65 -0
data/ext/makiri/xpath/mkr_xpath.c +37 -0
data/ext/makiri/xpath/mkr_xpath.h +13 -0
data/ext/makiri/xpath/mkr_xpath_eval_body.h +373 -90
data/ext/makiri/xpath/mkr_xpath_funcs_body.h +249 -231
data/ext/makiri/xpath/mkr_xpath_internal.h +89 -9
data/ext/makiri/xpath/mkr_xpath_lex.c +94 -124
data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +6 -3
data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
data/ext/makiri/xpath/mkr_xpath_parse.c +79 -90
data/ext/makiri/xpath/mkr_xpath_shared.c +40 -24
data/ext/makiri/xpath/mkr_xpath_value_body.h +50 -24
data/lib/makiri/cdata_section.rb +1 -3
data/lib/makiri/comment.rb +1 -3
data/lib/makiri/document.rb +8 -0
data/lib/makiri/element.rb +1 -3
data/lib/makiri/processing_instruction.rb +1 -3
data/lib/makiri/text.rb +1 -3
data/lib/makiri/version.rb +1 -1
data/lib/makiri/xml/builder.rb +263 -0
data/lib/makiri/xml/node_methods.rb +47 -0
data/lib/makiri.rb +1 -0
data/script/check_alloc_failures.rb +266 -0
data/script/check_c_safety.rb +45 -2
data/script/check_c_safety_allowlist.yml +19 -0
data/script/check_leaks.rb +64 -0
data/script/leaks_harness.rb +64 -0
data/vendor/lexbor/CMakeLists.txt +6 -0
data/vendor/lexbor/README.md +12 -0
data/vendor/lexbor/config.cmake +1 -1
data/vendor/lexbor/source/lexbor/core/base.h +1 -1
data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
data/vendor/lexbor/source/lexbor/html/base.h +1 -1
data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
data/vendor/lexbor/source/lexbor/url/base.h +1 -1
data/vendor/lexbor/source/lexbor/url/url.c +5 -2
data/vendor/lexbor/source/lexbor/url/url.h +9 -0
data/vendor/lexbor/version +1 -1
metadata +19 -1

data/ext/makiri/core/mkr_core.c CHANGED Viewed

@@ -18,6 +18,12 @@ mkr_core_selftest(void)
     if (!mkr_size_mul(4, 5, &out) || out != 20) return 3;
     if (mkr_size_mul(SIZE_MAX / 2 + 1, 2, &out)) return 4;  /* must overflow */
     if (!mkr_size_mul(0, SIZE_MAX, &out) || out != 0) return 5;
+    /* exact-boundary: the largest non-overflowing result MUST succeed (catches an
+     * off-by-one in the `a > SIZE_MAX - b` / `b > SIZE_MAX / a` predicates). */
+    if (!mkr_size_add(SIZE_MAX, 0, &out) || out != SIZE_MAX) return 55;
+    if (!mkr_size_add(SIZE_MAX - 1, 1, &out) || out != SIZE_MAX) return 56;
+    if (!mkr_size_mul(SIZE_MAX, 1, &out) || out != SIZE_MAX) return 57;
+    if (!mkr_size_mul(SIZE_MAX / 2, 2, &out) || out != (SIZE_MAX / 2) * 2) return 58;
     /* grow_capacity: geometric, and overflow on a huge element */
     if (!mkr_grow_capacity(0, 1, 1, &out) || out < 1) return 6;
@@ -39,6 +45,9 @@ mkr_core_selftest(void)
     /* str_alloc / strndup / strdup: copy, terminate, fail closed on NULL+len */
     {
         if (mkr_str_alloc(SIZE_MAX) != NULL) return 26;          /* n + 1 overflow */
+        char *z = mkr_str_alloc(0);                              /* boundary: 0 -> 1-byte "\0" */
+        if (z == NULL || z[0] != '\0') { free(z); return 59; }
+        free(z);
         char *p = mkr_strndup("hello", 3);
         if (p == NULL || memcmp(p, "hel", 4) != 0) { free(p); return 27; } /* "hel\0" */
         free(p);
@@ -97,5 +106,139 @@ mkr_core_selftest(void)
         free(s);
     }
+    /* spanbuf: writes that fit, then one that overruns -> refused + sticky */
+    {
+        char store[4];
+        mkr_spanbuf_t f = mkr_spanbuf(store, sizeof store);
+        mkr_spanbuf_putc(&f, 'a');
+        mkr_spanbuf_write(&f, "bc", 2);                  /* pos == 3, room for 1 */
+        if (!f.ok || f.pos != 3) return 35;
+        mkr_spanbuf_write(&f, "de", 2);                  /* exceeds cap -> refused */
+        if (f.ok) return 36;                              /* must have latched */
+        if (f.pos != 3) return 37;                        /* refused write didn't advance */
+        if (mkr_spanbuf_finish(&f) != NULL) return 38;   /* not-ok -> NULL */
+        if (memcmp(store, "abc", 3) != 0) return 39;      /* no overrun past pos */
+    }
+    /* spanbuf: exact fill is ok; a NULL backing buffer is never ok */
+    {
+        char store[2];
+        mkr_spanbuf_t f = mkr_spanbuf(store, sizeof store);
+        mkr_spanbuf_write(&f, "xy", 2);                  /* exactly cap -> still ok */
+        if (!f.ok || mkr_spanbuf_finish(&f) != store || f.pos != 2) return 40;
+        mkr_spanbuf_putc(&f, 'z');                       /* boundary: at pos==cap -> refused */
+        if (f.ok || f.pos != 2) return 60;
+        mkr_spanbuf_t g = mkr_spanbuf(NULL, 8);         /* alloc-failed backing */
+        mkr_spanbuf_putc(&g, 'z');                       /* no-op, no crash */
+        if (g.ok || mkr_spanbuf_finish(&g) != NULL) return 41;
+    }
+    /* grow_reserve: grows + updates ptr/cap; already-enough is a no-op; an
+     * overflowing (need*elem) request fails closed (OOM) leaving ptr/cap as-is. */
+    {
+        size_t *p = NULL, cap = 0;
+        if (mkr_grow_reserve((void **)&p, &cap, 10, sizeof(*p)) != MKR_OK) { free(p); return 42; }
+        if (p == NULL || cap < 10) { free(p); return 43; }
+        p[0] = 1; p[9] = 2;                                 /* in-bounds after grow */
+        size_t *d0 = p; size_t c0 = cap;
+        if (mkr_grow_reserve((void **)&p, &cap, 5, sizeof(*p)) != MKR_OK) { free(p); return 44; }
+        if (p != d0 || cap != c0) { free(p); return 45; }   /* need < cap -> no-op */
+        if (mkr_grow_reserve((void **)&p, &cap, cap, sizeof(*p)) != MKR_OK) { free(p); return 61; }
+        if (p != d0 || cap != c0) { free(p); return 62; }   /* boundary: need == cap -> no-op */
+        if (mkr_grow_reserve((void **)&p, &cap, SIZE_MAX, sizeof(*p)) != MKR_ERR_OOM) { free(p); return 46; }
+        if (p != d0 || cap != c0) { free(p); return 47; }   /* overflow -> unchanged */
+        free(p);
+    }
+    /* buf_reserve: pre-allocates capacity without touching len; no-op when room
+     * already exists; clamps a request to the buffer's max (never allocates past
+     * it); the buffer stays usable afterwards. */
+    {
+        mkr_buf_t b;
+        mkr_buf_init(&b, 16);                                /* small ceiling */
+        if (mkr_buf_reserve(&b, 8) != MKR_OK) { mkr_buf_free(&b); return 48; }
+        if (b.cap < 9 || b.len != 0) { mkr_buf_free(&b); return 49; } /* reserved (+NUL), still empty */
+        char *d0 = b.data; size_t c0 = b.cap;
+        if (mkr_buf_reserve(&b, 4) != MKR_OK) { mkr_buf_free(&b); return 50; }
+        if (b.data != d0 || b.cap != c0) { mkr_buf_free(&b); return 51; } /* already room -> no-op */
+        if (mkr_buf_reserve(&b, (size_t)1 << 40) != MKR_OK) { mkr_buf_free(&b); return 52; }
+        if (b.cap > 17) { mkr_buf_free(&b); return 53; }     /* clamped to max(16)+NUL, no huge alloc */
+        if (mkr_buf_append(&b, "abc", 3) != MKR_OK || b.len != 3) { mkr_buf_free(&b); return 54; }
+        mkr_buf_free(&b);
+    }
+    /* buf_append: geometric growth is clamped to the content ceiling (max + the
+     * NUL), so a near-limit append never over-allocates to ~2x. Here filling to
+     * max(10) would geometrically reach cap 16; the clamp holds it at max+1(11),
+     * the same property that keeps a real buffer's cap under MKR_BUF_HARD_MAX. */
+    {
+        mkr_buf_t b;
+        mkr_buf_init(&b, 10);
+        if (mkr_buf_append(&b, "0123456789", 10) != MKR_OK) { mkr_buf_free(&b); return 63; }
+        if (b.len != 10 || b.cap > 11) { mkr_buf_free(&b); return 64; }  /* clamped, not the geometric 16 */
+        if (mkr_buf_append(&b, "x", 1) != MKR_ERR_LIMIT) { mkr_buf_free(&b); return 65; } /* still capped */
+        mkr_buf_free(&b);
+    }
+    /* span: bounded reads - in-bounds values, out-of-bounds -1/false/clamp,
+     * never an overrun. Exercises every helper at its boundary. */
+    {
+        mkr_span_t s = mkr_span("abc", 3);
+        if (mkr_span_left(&s) != 3 || mkr_span_peek(&s) != 'a') return 66;
+        if (mkr_span_at(&s, 2) != 'c' || mkr_span_at(&s, 3) != -1) return 67; /* exact bound */
+        if (!mkr_span_starts(&s, "abc", 3) || mkr_span_starts(&s, "abcd", 4)) return 68;
+        if (mkr_span_take(&s) != 'a' || mkr_span_left(&s) != 2) return 69;
+        size_t idx = 99;
+        if (!mkr_span_find(&s, 'c', &idx) || idx != 1) return 70;
+        if (mkr_span_find(&s, 'z', &idx)) return 71;             /* absent -> false */
+        const char *mark = mkr_span_mark(&s);
+        mkr_span_skip(&s, 5);                                     /* clamped to the 2 left */
+        if (mkr_span_left(&s) != 0 || mkr_span_since(&s, mark) != 2) return 72;
+        if (mkr_span_peek(&s) != -1 || mkr_span_take(&s) != -1) return 73; /* empty: every read -1 */
+        if (mkr_span_take(&s) != -1) return 74;                   /* and stays empty (no underflow) */
+        mkr_span_t e = mkr_span(NULL, 8);   /* NULL -> empty span over a VALID address */
+        if (e.p == NULL || e.p != e.end) return 89;               /* normalized, no NULL arithmetic */
+        if (mkr_span_left(&e) != 0 || mkr_span_peek(&e) != -1 || mkr_span_starts(&e, "x", 1)) return 75;
+        if (!mkr_span_starts(&e, NULL, 0)) return 90;             /* zero-length literal: always true */
+        mkr_span_t base = mkr_span("abcdef", 6);
+        mkr_span_t tl = mkr_span_tail(&base, 2);                  /* sub-span, parent unconsumed */
+        if (mkr_span_peek(&tl) != 'c' || mkr_span_left(&tl) != 4) return 91;
+        if (mkr_span_peek(&base) != 'a' || mkr_span_left(&base) != 6) return 92;
+        mkr_span_t tc = mkr_span_tail(&base, 9);                  /* past the end: clamped empty */
+        if (mkr_span_left(&tc) != 0 || mkr_span_peek(&tc) != -1) return 93;
+        if (!mkr_bytes_eq("ab", 2, "ab", 2) || mkr_bytes_eq("ab", 2, "ac", 2)
+            || mkr_bytes_eq("a", 1, "ab", 2) || !mkr_bytes_eq(NULL, 0, "x", 0)) return 76;
+        /* peek/at return bytes as 0..255, never sign-extended negatives */
+        mkr_span_t hb = mkr_span("\xFF", 1);
+        if (mkr_span_peek(&hb) != 0xFF || mkr_span_at(&hb, 0) != 0xFF) return 77;
+    }
+    /* utf8_decode1: strict - valid lengths 1-4; truncation / overlong /
+     * surrogate / out-of-range / bad lead all fail closed with 0. */
+    {
+        uint32_t cp = 0;
+        if (mkr_utf8_decode1((const unsigned char *)"A", 1, &cp) != 1 || cp != 'A') return 78;
+        if (mkr_utf8_decode1((const unsigned char *)"\xC3\xA9", 2, &cp) != 2 || cp != 0xE9u) return 79;
+        if (mkr_utf8_decode1((const unsigned char *)"\xE3\x81\x82", 3, &cp) != 3 || cp != 0x3042u) return 80;
+        if (mkr_utf8_decode1((const unsigned char *)"\xF0\x9F\x98\x80", 4, &cp) != 4 || cp != 0x1F600u) return 81;
+        if (mkr_utf8_decode1((const unsigned char *)"", 0, &cp) != 0) return 82;                 /* empty */
+        if (mkr_utf8_decode1((const unsigned char *)"\xC3", 1, &cp) != 0) return 83;             /* truncated */
+        if (mkr_utf8_decode1((const unsigned char *)"\xC0\xAF", 2, &cp) != 0) return 84;         /* overlong */
+        if (mkr_utf8_decode1((const unsigned char *)"\xED\xA0\x80", 3, &cp) != 0) return 85;     /* surrogate */
+        if (mkr_utf8_decode1((const unsigned char *)"\xF4\x90\x80\x80", 4, &cp) != 0) return 86; /* > U+10FFFF */
+        if (mkr_utf8_decode1((const unsigned char *)"\x80", 1, &cp) != 0) return 87;             /* stray cont. */
+        if (mkr_utf8_decode1((const unsigned char *)"\xC3\x28", 2, &cp) != 0) return 88;         /* bad cont. */
+    }
     return 0;
 }

data/ext/makiri/core/mkr_core.h CHANGED Viewed

@@ -8,7 +8,9 @@
  *   mkr_alloc.h  fail-closed size arithmetic + allocators (the foundation)
  *   mkr_hash.h   pointer hash + power-of-two sizer (pointer-keyed index tables)
  *   mkr_text.h   string-type lattice (owned/borrowed/verified text + bytes)
- *   mkr_buf.h    mkr_buf_t (growable, capped byte buffer)
+ *   mkr_utf8.h   the one pure-C UTF-8 validator + strict 1-cp decoder
+ *   mkr_buf.h    mkr_buf_t (growable, capped byte buffer) + mkr_spanbuf_t
+ *   mkr_span.h   mkr_span_t (bounded reader - the spanbuf's read twin)
  *
  * NOTHING here touches Ruby - exception mapping happens at the glue boundary.
  */
@@ -16,6 +18,13 @@
 #include "mkr_alloc.h"
 #include "mkr_hash.h"
 #include "mkr_text.h"
+#include "mkr_span.h"
+#include "mkr_utf8.h"
 #include "mkr_buf.h"
+/* Self-test of the overflow / allocation / buffer edge cases (incl. paths real
+ * inputs cannot reach). Returns 0 on success, nonzero on the first failure.
+ * NOTE(review): the checks in mkr_core.c appear to return distinct numeric
+ * codes, so the value should identify the failing check - confirm.
+ * Takes no arguments.
+ * Wired to a private Ruby method for the spec suite. */
+int mkr_core_selftest(void);
 #endif /* MAKIRI_CORE_MKR_CORE_H */

data/ext/makiri/core/mkr_span.h ADDED Viewed

@@ -0,0 +1,186 @@
+#ifndef MAKIRI_CORE_MKR_SPAN_H
+#define MAKIRI_CORE_MKR_SPAN_H
+/*
+ * mkr_span_t - a bounded READER over a borrowed byte region: the read twin of
+ * mkr_spanbuf_t (mkr_buf.h), completing the structural-safety model:
+ *
+ *               allocate            write              read
+ *   structure   checked wrappers    mkr_spanbuf_t      mkr_span_t (this)
+ *   enforced    lint (direct_alloc) (arena's only way) lint (raw_scan_call /
+ *                                                            raw_cursor_member)
+ *
+ * Byte-scanning parsers used to guard every read by convention ("check
+ * p < end, then *p") - correct at every current site, but a single forgotten
+ * check is invisible to the compiler. A span owns the cursor AND the bound:
+ * an out-of-bounds read is not an overrun but a -1 / false / clamp, so the
+ * unchecked-read bug class is structurally impossible inside a converted TU.
+ * The lint (script/check_c_safety.rb) bans the raw scanning primitives in the
+ * converted parser TUs, turning the convention into a machine-enforced rule.
+ *
+ * Every helper is a static inline whose bound check compiles to the same
+ * single compare the hand-written guard used - performance-neutral.
+ *
+ * Like a spanbuf, the BORROW is the caller's responsibility: the span cannot
+ * stop you from outliving the buffer it views (input lifetime is handled at
+ * the bridge: parse entries copy, borrowed slices never cross a GC point).
+ * mkr_span_mark/mkr_span_since exist to CAPTURE slices (pointer arithmetic,
+ * never a dereference); reading a captured slice goes back through a span or
+ * an audited core primitive.
+ */
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+typedef struct {  /* bounded read cursor over a borrowed byte range [p, end) */
+    const char *p;    /* cursor: the next byte to read (always <= end) */
+    const char *end;  /* one past the last readable byte; p == end means empty */
+} mkr_span_t;
+/* Wrap [ptr, ptr+len) for bounded reading. ptr == NULL yields the empty span
+ * (every read -1 / false), so an absent input flows through without a guard.
+ * The NULL case is normalized to a VALID empty address (not p = end = NULL):
+ * the helpers do pointer subtraction and relational compares, which C defines
+ * only within one object - NULL - NULL / NULL < NULL is formally UB - so the
+ * normalization here keeps every helper unconditional AND well-defined.
+ *
+ *   ptr  first byte of the borrowed region, or NULL for an absent input
+ *   len  number of readable bytes at ptr (forced to 0 when ptr is NULL)
+ *
+ * Returns a span whose cursor is ptr and whose bound is ptr + len. The bytes
+ * are viewed in place, not copied. */
+static inline mkr_span_t
+mkr_span(const char *ptr, size_t len)
+{
+    if (ptr == NULL) { ptr = ""; len = 0; }  /* point at a real, empty literal */
+    mkr_span_t s;
+    s.p   = ptr;
+    s.end = ptr + len;  /* one past the last readable byte */
+    return s;
+}
+/* Bytes remaining: the distance from the cursor to the end bound. The span is
+ * not modified. The result is 0 when the cursor has reached the bound. */
+static inline size_t
+mkr_span_left(const mkr_span_t *s)
+{
+    return (size_t)(s->end - s->p);
+}
+/* The byte at the cursor as 0..255, or -1 at end-of-span. The cursor is not
+ * advanced. The unsigned char cast keeps a byte >= 0x80 from coming back as a
+ * negative value where plain char is signed, so -1 is unambiguous. */
+static inline int
+mkr_span_peek(const mkr_span_t *s)
+{
+    return s->p < s->end ? (unsigned char)*s->p : -1;
+}
+/* The byte at cursor+i (bounded lookahead), or -1 past the end. i counts from
+ * the current cursor, not from where the span originally started; the cursor
+ * is not moved. The index is compared against the bytes remaining before the
+ * read, so i == remaining already yields -1. */
+static inline int
+mkr_span_at(const mkr_span_t *s, size_t i)
+{
+    return i < mkr_span_left(s) ? (unsigned char)s->p[i] : -1;
+}
+/* Consume and return the byte at the cursor, or -1 at end-of-span. The cursor
+ * advances by one only when a byte is returned, so calling this again at the
+ * end keeps returning -1 and never moves past the bound. */
+static inline int
+mkr_span_take(mkr_span_t *s)
+{
+    return s->p < s->end ? (unsigned char)*s->p++ : -1;
+}
+/* Advance up to n bytes (clamped at the end - never past it). Asking for more
+ * than remains leaves the cursor exactly at the bound; nothing is returned, so
+ * a caller that needs to know how far it moved compares mkr_span_left before
+ * and after. */
+static inline void
+mkr_span_skip(mkr_span_t *s, size_t n)
+{
+    size_t left = mkr_span_left(s);
+    s->p += (n <= left) ? n : left;  /* step by min(n, left) */
+}
+/* True if the remaining input begins with the n-byte literal. n == 0 is true
+ * regardless of lit (even NULL - aligned with mkr_bytes_eq; also keeps the
+ * memcmp away from a possibly-NULL lit, which is formally UB even for 0).
+ * The span is not consumed. lit is compared as n raw bytes (memcmp), so it
+ * needs no terminator. */
+static inline bool
+mkr_span_starts(const mkr_span_t *s, const char *lit, size_t n)
+{
+    if (n == 0) return true;  /* decided before lit is ever touched */
+    /* the length test short-circuits first, so memcmp never reads past the bound */
+    return mkr_span_left(s) >= n && memcmp(s->p, lit, n) == 0;
+}
+/* Find byte c in the remaining input; true + its offset from the cursor in
+ * *idx, or false if absent (idx untouched). The span is not consumed. The
+ * search is limited to the bytes remaining. idx is written without a NULL
+ * check on a hit, so it must point to valid storage. */
+static inline bool
+mkr_span_find(const mkr_span_t *s, char c, size_t *idx)
+{
+    const char *hit = (const char *)memchr(s->p, c, mkr_span_left(s));
+    if (hit == NULL) return false;
+    *idx = (size_t)(hit - s->p);  /* offset relative to the cursor */
+    return true;
+}
+/* The remaining input from cursor+off onward, as a fresh sub-span (clamped at
+ * the end). For bounded lookahead scans that must not consume the parent.
+ * The skip is applied to a copy of *s, so the parent's cursor stays put; an
+ * off larger than what remains gives an empty sub-span sitting at the bound. */
+static inline mkr_span_t
+mkr_span_tail(const mkr_span_t *s, size_t off)
+{
+    mkr_span_t t = *s;       /* work on a copy: same bound, independent cursor */
+    mkr_span_skip(&t, off);  /* clamped advance */
+    return t;
+}
+/* The cursor position, for capturing a slice start (an address, NEVER read
+ * through directly - pair with mkr_span_since and hand the slice to a span or
+ * an audited primitive). Returns the current cursor pointer; the span itself
+ * is not modified. */
+static inline const char *
+mkr_span_mark(const mkr_span_t *s)
+{
+    return s->p;
+}
+/* Bytes consumed since +mark+ (a prior mkr_span_mark of the SAME span).
+ * This is a plain pointer difference with no check of its own: the caller is
+ * trusted to pass a mark that is not ahead of the current cursor. */
+static inline size_t
+mkr_span_since(const mkr_span_t *s, const char *mark)
+{
+    return (size_t)(s->p - mark);
+}
+/* Length-checked slice equality (the audited replacement for an open-coded
+ * memcmp over two captured slices). Zero-length slices are equal regardless
+ * of pointers (a NULL "" never gets dereferenced).
+ *
+ *   a, alen  first slice and its byte length
+ *   b, blen  second slice and its byte length
+ *
+ * Returns true only when the lengths match and the bytes are identical. The
+ * memcmp is reached only when both lengths are equal and nonzero. */
+static inline bool
+mkr_bytes_eq(const void *a, size_t alen, const void *b, size_t blen)
+{
+    return alen == blen && (alen == 0 || memcmp(a, b, alen) == 0);
+}
+/* Substring search: find the first occurrence of [needle, needle+needle_len)
+ * within [hay, hay+hay_len). On a hit returns true and writes the byte offset to
+ * *idx; on a miss returns false and leaves *idx untouched. The audited
+ * replacement for an open-coded substring memcmp (a scan, so it lives in core).
+ * Boundary behavior mirrors mkr_bytes_eq's: lengths are the truth, pointers are
+ * never dereferenced at length 0 -
+ *   - an empty needle (needle_len == 0) matches at offset 0 in ANY haystack,
+ *     including an empty or NULL "" one (the substring analogue of two empty
+ *     slices comparing equal);
+ *   - a needle longer than the haystack never matches, so a NULL/empty haystack
+ *     misses every non-empty needle without a read.
+ * idx is written without a NULL check on a hit, so it must point to valid
+ * storage. The scan tries every start offset in order, comparing the whole
+ * needle at each candidate. */
+static inline bool
+mkr_bytes_find(const void *hay, size_t hay_len,
+               const void *needle, size_t needle_len, size_t *idx)
+{
+    if (needle_len == 0) { *idx = 0; return true; }  /* empty needle: hit at 0 */
+    if (needle_len > hay_len) return false;          /* cannot fit anywhere */
+    const char *h = (const char *)hay;
+    const char *n = (const char *)needle;
+    size_t last = hay_len - needle_len;  /* highest start where the needle still fits */
+    for (size_t i = 0; i <= last; ++i) {
+        /* first-byte test filters candidates before the full compare */
+        if (h[i] == n[0] && memcmp(h + i, n, needle_len) == 0) {
+            *idx = i;
+            return true;
+        }
+    }
+    return false;
+}
+#ifdef __cplusplus
+}
+#endif
+#endif /* MAKIRI_CORE_MKR_SPAN_H */

data/ext/makiri/core/mkr_utf8.c ADDED Viewed

@@ -0,0 +1,101 @@
+/* mkr_utf8.c - the shared pure-C UTF-8 validator. Ruby-free, allocation-free.
+ * See mkr_utf8.h for the contract and why it lives in core. Moved verbatim from
+ * lexbor_compat/utf8_input.c (whose sanitiser fast path now calls this). */
+#include "mkr_utf8.h"
+#include <string.h>   /* memcpy for the word-at-a-time ASCII scan */
+bool
+mkr_utf8_valid(const unsigned char *src, size_t len)
+{   /* True iff all of [src, src+len) is well-formed UTF-8. One forward pass:
+     * ASCII runs are skipped in bulk, and each multi-byte sequence is checked
+     * against the byte ranges its lead byte allows. The first violation, or a
+     * sequence cut off by the end of the input, returns false. */
+    const unsigned char *p   = src;
+    const unsigned char *const end = p + len;  /* one past the last input byte */
+    while (p < end) {
+        unsigned char b = *p;
+        if (b < 0x80) {
+            /* ASCII fast path: skip a run of ASCII bytes a word at a time
+             * (any high bit set ends the run), then byte-wise for the tail. */
+            while ((size_t)(end - p) >= sizeof(size_t)) {
+                size_t w;
+                memcpy(&w, p, sizeof(w));  /* load a word without assuming p is aligned */
+                /* mask = the high bit of every byte; the cast narrows it when
+                 * size_t is smaller than 64 bits */
+                if (w & (size_t)0x8080808080808080ULL) {
+                    break;  /* some byte in this word is >= 0x80 */
+                }
+                p += sizeof(size_t);
+            }
+            while (p < end && *p < 0x80) {
+                p++;
+            }
+            continue;  /* p is at the end or on a non-ASCII byte: dispatch again */
+        }
+        /* Multi-byte: decide length and validate the (length-dependent) ranges
+         * that exclude overlong forms, surrogates and > U+10FFFF. In every
+         * condition the length test comes first, so p[1..n-1] are only read
+         * when they lie inside the input. */
+        size_t n;
+        if (b >= 0xC2 && b <= 0xDF) {                 /* U+0080..U+07FF   */
+            n = 2;
+            if (end - p < 2 || (p[1] & 0xC0) != 0x80) return false;
+        } else if (b == 0xE0) {                       /* U+0800..U+0FFF   */
+            n = 3;
+            if (end - p < 3 || p[1] < 0xA0 || p[1] > 0xBF
+                || (p[2] & 0xC0) != 0x80) return false;
+        } else if (b >= 0xE1 && b <= 0xEC) {          /* U+1000..U+CFFF   */
+            n = 3;
+            if (end - p < 3 || (p[1] & 0xC0) != 0x80
+                || (p[2] & 0xC0) != 0x80) return false;
+        } else if (b == 0xED) {                       /* U+D000..U+D7FF   */
+            n = 3;                                    /* (excludes surrogates) */
+            if (end - p < 3 || p[1] < 0x80 || p[1] > 0x9F
+                || (p[2] & 0xC0) != 0x80) return false;
+        } else if (b == 0xEE || b == 0xEF) {          /* U+E000..U+FFFF   */
+            n = 3;
+            if (end - p < 3 || (p[1] & 0xC0) != 0x80
+                || (p[2] & 0xC0) != 0x80) return false;
+        } else if (b == 0xF0) {                       /* U+10000..U+3FFFF */
+            n = 4;
+            if (end - p < 4 || p[1] < 0x90 || p[1] > 0xBF
+                || (p[2] & 0xC0) != 0x80 || (p[3] & 0xC0) != 0x80) return false;
+        } else if (b >= 0xF1 && b <= 0xF3) {          /* U+40000..U+FFFFF */
+            n = 4;
+            if (end - p < 4 || (p[1] & 0xC0) != 0x80 || (p[2] & 0xC0) != 0x80
+                || (p[3] & 0xC0) != 0x80) return false;
+        } else if (b == 0xF4) {                       /* U+100000..U+10FFFF */
+            n = 4;
+            if (end - p < 4 || p[1] < 0x80 || p[1] > 0x8F
+                || (p[2] & 0xC0) != 0x80 || (p[3] & 0xC0) != 0x80) return false;
+        } else {                                      /* C0,C1,F5..FF,stray 80..BF */
+            return false;
+        }
+        p += n;  /* sequence accepted: step over all of it */
+    }
+    return true;  /* reached the end without a violation (also the len == 0 case) */
+}
+int
+mkr_utf8_decode1(const unsigned char *p, size_t len, uint32_t *cp)
+{   /* Strictly decode one code point from the start of [p, p+len). Returns
+     * its byte length (1-4) with the value stored in *cp, or 0 on any
+     * violation, in which case *cp is not written. */
+    if (len == 0) return 0;  /* nothing to decode */
+    unsigned char b0 = p[0];
+    if (b0 < 0x80u) { *cp = b0; return 1; }  /* ASCII: the byte is the code point */
+    int n;            /* sequence length implied by the lead byte */
+    uint32_t c, min;  /* c: value built so far; min: smallest value that needs n bytes */
+    if      ((b0 & 0xE0u) == 0xC0u) { n = 2; c = b0 & 0x1Fu; min = 0x80u; }     /* 110xxxxx */
+    else if ((b0 & 0xF0u) == 0xE0u) { n = 3; c = b0 & 0x0Fu; min = 0x800u; }    /* 1110xxxx */
+    else if ((b0 & 0xF8u) == 0xF0u) { n = 4; c = b0 & 0x07u; min = 0x10000u; }  /* 11110xxx */
+    else return 0;                              /* continuation / 0xF8+ lead.
+                                                 * Leads 0xC0, 0xC1 and 0xF5..0xF7
+                                                 * match a mask above and are
+                                                 * rejected later by the overlong /
+                                                 * range checks. */
+    if ((size_t)n > len) return 0;              /* truncated */
+    for (int i = 1; i < n; i++) {
+        unsigned char b = p[i];
+        if ((b & 0xC0u) != 0x80u) return 0;     /* bad continuation byte */
+        c = (c << 6) | (b & 0x3Fu);             /* append its 6 payload bits */
+    }
+    if (c < min) return 0;                      /* overlong */
+    if (c >= 0xD800u && c <= 0xDFFFu) return 0; /* surrogate */
+    if (c > 0x10FFFFu) return 0;                /* out of Unicode range */
+    *cp = c;                                    /* written only on success */
+    return n;
+}

data/ext/makiri/core/mkr_utf8.h ADDED Viewed

@@ -0,0 +1,88 @@
+#ifndef MAKIRI_CORE_MKR_UTF8_H
+#define MAKIRI_CORE_MKR_UTF8_H
+/*
+ * mkr_utf8_valid - the ONE pure-C UTF-8 validator (Ruby-free, allocation-free).
+ *
+ * Validates [src, src+len) against the Unicode "well-formed UTF-8 byte
+ * sequences" table (RFC 3629 / WHATWG): rejects bad continuation bytes,
+ * overlong forms, surrogates (U+D800..U+DFFF), code points above U+10FFFF, and
+ * an incomplete trailing sequence. Validate-only - it never materialises code
+ * points - and rips through ASCII a machine word at a time. NUL bytes are VALID
+ * here (U+0000 is well-formed UTF-8); callers that must reject NUL check it
+ * separately (memchr).
+ *
+ * This lives in core so the Ruby bridge (mkr_verify_text - the strict
+ * programmatic-input gate) and the HTML input sanitiser (lexbor_compat/
+ * utf8_input.c fast path) share a single implementation, and so the bridge's
+ * validation never allocates: a borrowed RSTRING pointer must not be held
+ * across a Ruby allocation (= GC point), so the validator the bridge runs
+ * between taking a borrow and using it has to be allocation-free by
+ * construction. (The former implementation built a throwaway Ruby String and
+ * asked for its coderange - an allocation inside every borrow.)
+ */
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "mkr_span.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+bool mkr_utf8_valid(const unsigned char *src, size_t len);  /* true iff [src, src+len) is well-formed UTF-8 */
+/* mkr_utf8_decode1 - decode ONE code point from [p, p+len), strictly: rejects
+ * truncation, bad continuation bytes, overlong forms, surrogates and values
+ * above U+10FFFF. Returns the byte length (1-4) with *cp set, or 0 on any
+ * violation (including len == 0) - fail closed, never read past the bound.
+ * *cp is written only on success; on a 0 return it keeps its previous value.
+ * The ONE strict decoder, shared by the XML tokenizer's name/Char scanning and
+ * the XPath lexer (each formerly carried its own equivalent copy). */
+int mkr_utf8_decode1(const unsigned char *p, size_t len, uint32_t *cp);
+/* Span form: decode the code point at the span's cursor (without consuming -
+ * the caller mkr_span_skip()s the returned length). 0 at end-of-span.
+ * A thin forwarder: it passes the cursor and the bytes remaining to
+ * mkr_utf8_decode1, so the return value and the handling of *cp are exactly
+ * that function's. */
+static inline int
+mkr_utf8_decode1_span(const mkr_span_t *s, uint32_t *cp)
+{
+    return mkr_utf8_decode1((const unsigned char *)s->p, mkr_span_left(s), cp);
+}
+/* mkr_utf8_count_chars - count Unicode code points in [ptr, ptr+len): every
+ * byte that is NOT a 0x80..0xBF continuation byte starts a new code point.
+ * Length-bounded (does not rely on a NUL terminator); ptr may be NULL when
+ * len == 0. Used where XPath measures string length / offsets in characters.
+ * No validation is done here: the same counting rule is applied to whatever
+ * bytes are given, well-formed or not. */
+static inline size_t
+mkr_utf8_count_chars(const char *ptr, size_t len)
+{
+    size_t n = 0;
+    for (size_t i = 0; i < len; ++i) {
+        if (((unsigned char)ptr[i] & 0xC0) != 0x80) ++n;  /* not 10xxxxxx: a new character starts */
+    }
+    return n;
+}
+/* mkr_utf8_advance_chars - byte offset within [ptr, ptr+len) after advancing up
+ * to nchars UTF-8 characters from the start, clamped at len. A character is its
+ * leading byte plus the run of 0x80..0xBF continuation bytes that follow;
+ * advancing stops at len even mid-sequence. Length-bounded (no NUL reliance).
+ * Returns len when nchars exceeds the available character count.
+ * The first byte of each step is consumed whatever its value, so a stray
+ * continuation byte at the start of a step counts as a character's lead. */
+static inline size_t
+mkr_utf8_advance_chars(const char *ptr, size_t len, size_t nchars)
+{
+    size_t i = 0;
+    while (nchars > 0 && i < len) {
+        ++i;  /* the leading byte of this character */
+        while (i < len && ((unsigned char)ptr[i] & 0xC0) == 0x80) ++i;  /* its continuation bytes */
+        --nchars;
+    }
+    return i;
+}
+#ifdef __cplusplus
+}
+#endif
+#endif /* MAKIRI_CORE_MKR_UTF8_H */