makiri 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/conformance.yml +22 -0
  3. data/.github/workflows/libfuzzer.yml +83 -0
  4. data/.github/workflows/release.yml +12 -7
  5. data/.github/workflows/security.yml +88 -3
  6. data/.github/workflows/valgrind.yml +135 -0
  7. data/CHANGELOG.md +152 -15
  8. data/README.md +183 -13
  9. data/Rakefile +294 -7
  10. data/ext/makiri/bridge/bridge.h +28 -0
  11. data/ext/makiri/bridge/ruby_string.c +282 -12
  12. data/ext/makiri/core/mkr_alloc.c +40 -3
  13. data/ext/makiri/core/mkr_alloc.h +28 -5
  14. data/ext/makiri/core/mkr_buf.c +47 -3
  15. data/ext/makiri/core/mkr_buf.h +112 -3
  16. data/ext/makiri/core/mkr_core.c +143 -0
  17. data/ext/makiri/core/mkr_core.h +11 -2
  18. data/ext/makiri/core/mkr_hash.h +1 -1
  19. data/ext/makiri/core/mkr_span.h +186 -0
  20. data/ext/makiri/core/mkr_text.h +8 -8
  21. data/ext/makiri/core/mkr_utf8.c +101 -0
  22. data/ext/makiri/core/mkr_utf8.h +88 -0
  23. data/ext/makiri/extconf.rb +123 -10
  24. data/ext/makiri/fuzz/Makefile +95 -0
  25. data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
  26. data/ext/makiri/fuzz/xml_fuzz.c +24 -0
  27. data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
  28. data/ext/makiri/glue/glue.h +55 -11
  29. data/ext/makiri/glue/ruby_doc.c +129 -59
  30. data/ext/makiri/glue/ruby_html_css.c +292 -0
  31. data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +248 -52
  32. data/ext/makiri/glue/ruby_html_node.c +859 -0
  33. data/ext/makiri/glue/ruby_html_serialize.c +154 -0
  34. data/ext/makiri/glue/ruby_node.c +74 -729
  35. data/ext/makiri/glue/ruby_node_set.c +167 -32
  36. data/ext/makiri/glue/ruby_xml.c +602 -0
  37. data/ext/makiri/glue/ruby_xml_node.c +1373 -0
  38. data/ext/makiri/glue/ruby_xpath.c +63 -30
  39. data/ext/makiri/glue/ruby_xpath.h +19 -0
  40. data/ext/makiri/lexbor_compat/compat.h +42 -9
  41. data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
  42. data/ext/makiri/lexbor_compat/dom_index.c +2 -2
  43. data/ext/makiri/lexbor_compat/post_parse.c +100 -10
  44. data/ext/makiri/lexbor_compat/source_loc.c +15 -13
  45. data/ext/makiri/lexbor_compat/text_index.c +14 -8
  46. data/ext/makiri/lexbor_compat/utf8_input.c +19 -33
  47. data/ext/makiri/makiri.c +184 -6
  48. data/ext/makiri/makiri.h +43 -2
  49. data/ext/makiri/xml/mkr_xml.h +125 -0
  50. data/ext/makiri/xml/mkr_xml_chars.c +195 -0
  51. data/ext/makiri/xml/mkr_xml_index.c +169 -0
  52. data/ext/makiri/xml/mkr_xml_index.h +48 -0
  53. data/ext/makiri/xml/mkr_xml_mutate.c +817 -0
  54. data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
  55. data/ext/makiri/xml/mkr_xml_node.c +399 -0
  56. data/ext/makiri/xml/mkr_xml_node.h +184 -0
  57. data/ext/makiri/xml/mkr_xml_tree.c +1515 -0
  58. data/ext/makiri/xpath/mkr_css.c +1023 -0
  59. data/ext/makiri/xpath/mkr_css.h +65 -0
  60. data/ext/makiri/xpath/mkr_xpath.c +96 -32
  61. data/ext/makiri/xpath/mkr_xpath.h +109 -4
  62. data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
  63. data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
  64. data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +551 -241
  65. data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +318 -276
  66. data/ext/makiri/xpath/mkr_xpath_internal.h +177 -206
  67. data/ext/makiri/xpath/mkr_xpath_lex.c +95 -125
  68. data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
  69. data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +145 -0
  70. data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
  71. data/ext/makiri/xpath/mkr_xpath_parse.c +83 -94
  72. data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
  73. data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
  74. data/ext/makiri/xpath/mkr_xpath_shared.c +609 -0
  75. data/ext/makiri/xpath/mkr_xpath_value_body.h +801 -0
  76. data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
  77. data/lib/makiri/{attribute.rb → attr.rb} +7 -3
  78. data/lib/makiri/cdata_section.rb +19 -0
  79. data/lib/makiri/comment.rb +10 -0
  80. data/lib/makiri/compat_aliases.rb +30 -0
  81. data/lib/makiri/document.rb +9 -73
  82. data/lib/makiri/document_fragment.rb +14 -9
  83. data/lib/makiri/element.rb +4 -4
  84. data/lib/makiri/html/document.rb +106 -0
  85. data/lib/makiri/html/node_methods.rb +19 -0
  86. data/lib/makiri/html.rb +12 -0
  87. data/lib/makiri/node.rb +58 -15
  88. data/lib/makiri/node_set.rb +8 -0
  89. data/lib/makiri/processing_instruction.rb +10 -0
  90. data/lib/makiri/text.rb +1 -1
  91. data/lib/makiri/version.rb +1 -1
  92. data/lib/makiri/xml/builder.rb +263 -0
  93. data/lib/makiri/xml/document.rb +24 -0
  94. data/lib/makiri/xml/node_methods.rb +84 -0
  95. data/lib/makiri/xml.rb +10 -0
  96. data/lib/makiri/xpath_context.rb +1 -1
  97. data/lib/makiri.rb +24 -5
  98. data/script/build_native_gem.rb +2 -2
  99. data/script/check_alloc_failures.rb +266 -0
  100. data/script/check_c_safety.rb +77 -2
  101. data/script/check_c_safety_allowlist.yml +102 -0
  102. data/script/check_leaks.rb +64 -0
  103. data/script/leaks_harness.rb +64 -0
  104. data/vendor/lexbor/CMakeLists.txt +6 -0
  105. data/vendor/lexbor/README.md +12 -0
  106. data/vendor/lexbor/config.cmake +1 -1
  107. data/vendor/lexbor/source/lexbor/core/base.h +1 -1
  108. data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
  109. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
  110. data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
  111. data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
  112. data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
  113. data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
  114. data/vendor/lexbor/source/lexbor/html/base.h +1 -1
  115. data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
  116. data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
  117. data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
  118. data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
  119. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
  120. data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
  121. data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
  122. data/vendor/lexbor/source/lexbor/url/base.h +1 -1
  123. data/vendor/lexbor/source/lexbor/url/url.c +5 -2
  124. data/vendor/lexbor/source/lexbor/url/url.h +9 -0
  125. data/vendor/lexbor/version +1 -1
  126. metadata +53 -9
  127. data/ext/makiri/glue/ruby_css.c +0 -185
  128. data/ext/makiri/glue/ruby_serialize.c +0 -92
  129. data/ext/makiri/xpath/mkr_xpath_value.c +0 -1286
  130. data/lib/makiri/cdata.rb +0 -6
@@ -2,7 +2,7 @@
2
2
  #define MAKIRI_CORE_MKR_BUF_H
3
3
 
4
4
  /*
5
- * mkr_buf_t an owned, growable, optionally capped byte buffer, kept
5
+ * mkr_buf_t - an owned, growable, optionally capped byte buffer, kept
6
6
  * NUL-terminated. Built on the fail-closed allocators in mkr_alloc.h.
7
7
  * (mkr_core.h is a thin umbrella over mkr_alloc.h + mkr_text.h + this.)
8
8
  */
@@ -13,14 +13,46 @@
13
13
  extern "C" {
14
14
  #endif
15
15
 
16
+ /* Memory safety for buffers lives HERE, at the one buffer primitive, not at each
17
+ * call site: "max == 0" can no longer mean "unbounded". Two ceilings bound every
18
+ * mkr_buf so a runaway - a cycle, an unbounded loop, or a caller that forgot to
19
+ * pass a cap - fails closed with MKR_ERR_LIMIT instead of exhausting memory and
20
+ * freezing the machine:
21
+ *
22
+ * MKR_BUF_DEFAULT_LIMIT the cap applied when the caller passes max == 0. A
23
+ * conservative default (100 MiB): code that did not
24
+ * think about a bound gets a tight one for free, and a
25
+ * buffer that genuinely needs to be large must opt in
26
+ * EXPLICITLY by passing a larger max.
27
+ * MKR_BUF_HARD_MAX an absolute ceiling on a buffer's CONTENT length, even
28
+ * with an explicit max - the last-resort backstop. The
29
+ * ALLOCATION is bounded by it too: geometric growth is
30
+ * clamped so cap never exceeds HARD_MAX + 1 (the one NUL
31
+ * terminator byte), i.e. it does NOT overshoot to ~2x
32
+ * near the limit. Tight, content-scaled bounds still
33
+ * belong to the caller (e.g. the XML serializer caps
34
+ * itself at a multiple of arena_bytes); this stops total
35
+ * runaway.
36
+ *
37
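+ * Override either at build time: -DMKR_BUF_DEFAULT_LIMIT=<bytes> / -DMKR_BUF_HARD_MAX=<bytes>. NOTE: the 4 GiB default below assumes a 64-bit size_t; with a 32-bit size_t, (size_t)4 << 30 wraps to 0, so such targets must override MKR_BUF_HARD_MAX. */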
38
+ #ifndef MKR_BUF_DEFAULT_LIMIT
39
+ #define MKR_BUF_DEFAULT_LIMIT ((size_t)100 << 20) /* 100 MiB */
40
+ #endif
41
+ #ifndef MKR_BUF_HARD_MAX
42
+ #define MKR_BUF_HARD_MAX ((size_t)4 << 30) /* 4 GiB */
43
+ #endif
44
+
16
45
  typedef struct {
17
46
  char *data; /* owned; kept NUL-terminated after any append */
18
47
  size_t len; /* bytes used (excluding the terminator) */
19
48
  size_t cap; /* bytes allocated */
20
- size_t max; /* 0 = unbounded; else append past max returns MKR_ERR_LIMIT */
49
+ size_t max; /* 0 = the conservative MKR_BUF_DEFAULT_LIMIT; else this value -
50
+ * either way clamped by MKR_BUF_HARD_MAX (past it -> ERR_LIMIT) */
21
51
  } mkr_buf_t;
22
52
 
23
- /* Initialise an empty buffer. max == 0 means unbounded. */
53
+ /* Initialise an empty buffer. max == 0 applies the conservative default ceiling
54
+ * (MKR_BUF_DEFAULT_LIMIT) - it is NOT unbounded; pass an explicit (larger or
55
+ * smaller) value to opt into a different bound, always under MKR_BUF_HARD_MAX. */
24
56
  static inline void
25
57
  mkr_buf_init(mkr_buf_t *b, size_t max)
26
58
  {
@@ -35,6 +67,12 @@ mkr_buf_init(mkr_buf_t *b, size_t max)
35
67
  * failure (the buffer is left intact in every failure case). n == 0 is a no-op. */
36
68
  mkr_status_t mkr_buf_append(mkr_buf_t *b, const void *bytes, size_t n);
37
69
 
70
+ /* Pre-allocate capacity for at least n bytes (best-effort, clamped to the
71
+ * buffer's `max` ceiling), so a fill of known approximate size avoids per-append reallocs.
72
+ * A no-op if the buffer already has room. Returns MKR_ERR_OOM on overflow /
73
+ * allocation failure (the buffer is left intact). */
74
+ mkr_status_t mkr_buf_reserve(mkr_buf_t *b, size_t n);
75
+
38
76
  /* Take ownership of the (NUL-terminated) bytes; the buffer is reset to empty.
39
77
  * Returns a freshly owned "" for an empty buffer, or NULL on OOM. */
40
78
  char *mkr_buf_steal(mkr_buf_t *b, size_t *out_len);
@@ -48,6 +86,77 @@ mkr_buf_free(mkr_buf_t *b)
48
86
  b->len = b->cap = 0;
49
87
  }
50
88
 
89
+ /* ------------------------------------------------------------------------- */
90
+ /* mkr_spanbuf_t - a bounded writer over a borrowed, fixed-capacity buffer */
91
+ /* ------------------------------------------------------------------------- */
92
+ /*
93
+ * "span" = a non-owning view of a fixed-extent contiguous region; a spanbuf is
94
+ * the bounded *writer* over such a span. The complement to mkr_buf_t: where
95
+ * mkr_buf_t GROWS and OWNS its malloc storage, a spanbuf BORROWS a fixed buffer
96
+ * owned elsewhere (e.g. an arena cut) and never grows it.
97
+ *
98
+ * The description leads with "borrowed/fixed" deliberately, for safety: the
99
+ * fixed/bounded property is SELF-ENFORCED (a write past `cap` is refused, so
100
+ * misunderstanding it costs at most a truncation, caught by _finish), but the
101
+ * BORROWED property is the caller's responsibility - the type cannot stop you
102
+ * from free()ing `buf` or holding `finish()`'s pointer past the owner's
103
+ * lifetime, and getting that wrong is a use-after-free / double-free. So:
104
+ * - never free() buf (the owner does, e.g. the arena is freed wholesale);
105
+ * - finish()'s pointer is valid only while the backing storage lives.
106
+ *
107
+ * Bounds safety is BY CONSTRUCTION (the writer owns the cursor + check, not the
108
+ * caller's per-write guard): an over-long write is refused and latches `ok` to
109
+ * false (sticky). The caller's only duty is one check at the end (via _finish or
110
+ * the public `ok` field), failing closed rather than using a truncated buffer.
111
+ * This is the sanctioned way to hand-fill a raw region; see mkr_xml_arena_spanbuf
112
+ * for the arena adapter.
113
+ */
114
+ typedef struct {
115
+ char *buf; /* borrowed; the owner keeps it alive - do NOT free. NULL => never ok. */
116
+ size_t cap; /* capacity in bytes (fixed) */
117
+ size_t pos; /* bytes written so far (always <= cap) */
118
+ bool ok; /* false once a write would overflow, or buf was NULL */
119
+ } mkr_spanbuf_t;
120
+
121
+ /* Wrap [buf, buf+cap) for bounded writing. buf == NULL yields a permanently
122
+ * not-ok writer (every write a no-op), so an upstream allocation failure flows
123
+ * straight through without a separate guard at each call site. */
124
+ static inline mkr_spanbuf_t
125
+ mkr_spanbuf(char *buf, size_t cap)
126
+ {
127
+ return (mkr_spanbuf_t){ .buf = buf, .cap = cap, .pos = 0, .ok = (buf != NULL) };
128
+ }
129
+
130
+ /* Append one byte; refuse (latch ok=false) if it would exceed cap. */
131
+ static inline void
132
+ mkr_spanbuf_putc(mkr_spanbuf_t *b, char c)
133
+ {
134
+ if (!b->ok) return;
135
+ if (b->pos >= b->cap) { b->ok = false; return; }
136
+ b->buf[b->pos++] = c;
137
+ }
138
+
139
+ /* Append n bytes; refuse (latch ok=false) if they would exceed cap. n == 0 is a
140
+ * no-op. pos <= cap is the invariant (a refused write never advances pos), so
141
+ * cap - pos cannot underflow; the pos > cap arm is belt-and-suspenders. */
142
+ static inline void
143
+ mkr_spanbuf_write(mkr_spanbuf_t *b, const void *src, size_t n)
144
+ {
145
+ if (!b->ok || n == 0) return;
146
+ if (b->pos > b->cap || n > b->cap - b->pos) { b->ok = false; return; }
147
+ memcpy(b->buf + b->pos, src, n);
148
+ b->pos += n;
149
+ }
150
+
151
+ /* The filled prefix [buf, buf+pos), or NULL if any write was refused (or buf was
152
+ * NULL); on a non-NULL return the length is `b->pos`. Forces the caller through a
153
+ * single fail-closed check instead of trusting the writes individually. */
154
+ static inline const char *
155
+ mkr_spanbuf_finish(const mkr_spanbuf_t *b)
156
+ {
157
+ return b->ok ? b->buf : NULL;
158
+ }
159
+
51
160
  #ifdef __cplusplus
52
161
  }
53
162
  #endif
@@ -18,6 +18,12 @@ mkr_core_selftest(void)
18
18
  if (!mkr_size_mul(4, 5, &out) || out != 20) return 3;
19
19
  if (mkr_size_mul(SIZE_MAX / 2 + 1, 2, &out)) return 4; /* must overflow */
20
20
  if (!mkr_size_mul(0, SIZE_MAX, &out) || out != 0) return 5;
21
+ /* exact-boundary: the largest non-overflowing result MUST succeed (catches an
22
+ * off-by-one in the `a > SIZE_MAX - b` / `b > SIZE_MAX / a` predicates). */
23
+ if (!mkr_size_add(SIZE_MAX, 0, &out) || out != SIZE_MAX) return 55;
24
+ if (!mkr_size_add(SIZE_MAX - 1, 1, &out) || out != SIZE_MAX) return 56;
25
+ if (!mkr_size_mul(SIZE_MAX, 1, &out) || out != SIZE_MAX) return 57;
26
+ if (!mkr_size_mul(SIZE_MAX / 2, 2, &out) || out != (SIZE_MAX / 2) * 2) return 58;
21
27
 
22
28
  /* grow_capacity: geometric, and overflow on a huge element */
23
29
  if (!mkr_grow_capacity(0, 1, 1, &out) || out < 1) return 6;
@@ -39,6 +45,9 @@ mkr_core_selftest(void)
39
45
  /* str_alloc / strndup / strdup: copy, terminate, fail closed on NULL+len */
40
46
  {
41
47
  if (mkr_str_alloc(SIZE_MAX) != NULL) return 26; /* n + 1 overflow */
48
+ char *z = mkr_str_alloc(0); /* boundary: 0 -> 1-byte "\0" */
49
+ if (z == NULL || z[0] != '\0') { free(z); return 59; }
50
+ free(z);
42
51
  char *p = mkr_strndup("hello", 3);
43
52
  if (p == NULL || memcmp(p, "hel", 4) != 0) { free(p); return 27; } /* "hel\0" */
44
53
  free(p);
@@ -97,5 +106,139 @@ mkr_core_selftest(void)
97
106
  free(s);
98
107
  }
99
108
 
109
+ /* spanbuf: writes that fit, then one that overruns -> refused + sticky */
110
+ {
111
+ char store[4];
112
+ mkr_spanbuf_t f = mkr_spanbuf(store, sizeof store);
113
+ mkr_spanbuf_putc(&f, 'a');
114
+ mkr_spanbuf_write(&f, "bc", 2); /* pos == 3, room for 1 */
115
+ if (!f.ok || f.pos != 3) return 35;
116
+ mkr_spanbuf_write(&f, "de", 2); /* exceeds cap -> refused */
117
+ if (f.ok) return 36; /* must have latched */
118
+ if (f.pos != 3) return 37; /* refused write didn't advance */
119
+ if (mkr_spanbuf_finish(&f) != NULL) return 38; /* not-ok -> NULL */
120
+ if (memcmp(store, "abc", 3) != 0) return 39; /* no overrun past pos */
121
+ }
122
+
123
+ /* spanbuf: exact fill is ok; a NULL backing buffer is never ok */
124
+ {
125
+ char store[2];
126
+ mkr_spanbuf_t f = mkr_spanbuf(store, sizeof store);
127
+ mkr_spanbuf_write(&f, "xy", 2); /* exactly cap -> still ok */
128
+ if (!f.ok || mkr_spanbuf_finish(&f) != store || f.pos != 2) return 40;
129
+ mkr_spanbuf_putc(&f, 'z'); /* boundary: at pos==cap -> refused */
130
+ if (f.ok || f.pos != 2) return 60;
131
+
132
+ mkr_spanbuf_t g = mkr_spanbuf(NULL, 8); /* alloc-failed backing */
133
+ mkr_spanbuf_putc(&g, 'z'); /* no-op, no crash */
134
+ if (g.ok || mkr_spanbuf_finish(&g) != NULL) return 41;
135
+ }
136
+
137
+ /* grow_reserve: grows + updates ptr/cap; already-enough is a no-op; an
138
+ * overflowing (need*elem) request fails closed (OOM) leaving ptr/cap as-is. */
139
+ {
140
+ size_t *p = NULL, cap = 0;
141
+ if (mkr_grow_reserve((void **)&p, &cap, 10, sizeof(*p)) != MKR_OK) { free(p); return 42; }
142
+ if (p == NULL || cap < 10) { free(p); return 43; }
143
+ p[0] = 1; p[9] = 2; /* in-bounds after grow */
144
+
145
+ size_t *d0 = p; size_t c0 = cap;
146
+ if (mkr_grow_reserve((void **)&p, &cap, 5, sizeof(*p)) != MKR_OK) { free(p); return 44; }
147
+ if (p != d0 || cap != c0) { free(p); return 45; } /* need < cap -> no-op */
148
+ if (mkr_grow_reserve((void **)&p, &cap, cap, sizeof(*p)) != MKR_OK) { free(p); return 61; }
149
+ if (p != d0 || cap != c0) { free(p); return 62; } /* boundary: need == cap -> no-op */
150
+
151
+ if (mkr_grow_reserve((void **)&p, &cap, SIZE_MAX, sizeof(*p)) != MKR_ERR_OOM) { free(p); return 46; }
152
+ if (p != d0 || cap != c0) { free(p); return 47; } /* overflow -> unchanged */
153
+ free(p);
154
+ }
155
+
156
+ /* buf_reserve: pre-allocates capacity without touching len; no-op when room
157
+ * already exists; clamps a request to the buffer's max (never allocates past
158
+ * it); the buffer stays usable afterwards. */
159
+ {
160
+ mkr_buf_t b;
161
+ mkr_buf_init(&b, 16); /* small ceiling */
162
+ if (mkr_buf_reserve(&b, 8) != MKR_OK) { mkr_buf_free(&b); return 48; }
163
+ if (b.cap < 9 || b.len != 0) { mkr_buf_free(&b); return 49; } /* reserved (+NUL), still empty */
164
+
165
+ char *d0 = b.data; size_t c0 = b.cap;
166
+ if (mkr_buf_reserve(&b, 4) != MKR_OK) { mkr_buf_free(&b); return 50; }
167
+ if (b.data != d0 || b.cap != c0) { mkr_buf_free(&b); return 51; } /* already room -> no-op */
168
+
169
+ if (mkr_buf_reserve(&b, (size_t)1 << 40) != MKR_OK) { mkr_buf_free(&b); return 52; }
170
+ if (b.cap > 17) { mkr_buf_free(&b); return 53; } /* clamped to max(16)+NUL, no huge alloc */
171
+
172
+ if (mkr_buf_append(&b, "abc", 3) != MKR_OK || b.len != 3) { mkr_buf_free(&b); return 54; }
173
+ mkr_buf_free(&b);
174
+ }
175
+
176
+ /* buf_append: geometric growth is clamped to the content ceiling (max + the
177
+ * NUL), so a near-limit append never over-allocates to ~2x. Here filling to
178
+ * max(10) would geometrically reach cap 16; the clamp holds it at max+1(11),
179
+ * the same property that keeps a real buffer's cap under MKR_BUF_HARD_MAX. */
180
+ {
181
+ mkr_buf_t b;
182
+ mkr_buf_init(&b, 10);
183
+ if (mkr_buf_append(&b, "0123456789", 10) != MKR_OK) { mkr_buf_free(&b); return 63; }
184
+ if (b.len != 10 || b.cap > 11) { mkr_buf_free(&b); return 64; } /* clamped, not the geometric 16 */
185
+ if (mkr_buf_append(&b, "x", 1) != MKR_ERR_LIMIT) { mkr_buf_free(&b); return 65; } /* still capped */
186
+ mkr_buf_free(&b);
187
+ }
188
+
189
+ /* span: bounded reads - in-bounds values, out-of-bounds -1/false/clamp,
190
+ * never an overrun. Exercises every helper at its boundary. */
191
+ {
192
+ mkr_span_t s = mkr_span("abc", 3);
193
+ if (mkr_span_left(&s) != 3 || mkr_span_peek(&s) != 'a') return 66;
194
+ if (mkr_span_at(&s, 2) != 'c' || mkr_span_at(&s, 3) != -1) return 67; /* exact bound */
195
+ if (!mkr_span_starts(&s, "abc", 3) || mkr_span_starts(&s, "abcd", 4)) return 68;
196
+ if (mkr_span_take(&s) != 'a' || mkr_span_left(&s) != 2) return 69;
197
+ size_t idx = 99;
198
+ if (!mkr_span_find(&s, 'c', &idx) || idx != 1) return 70;
199
+ if (mkr_span_find(&s, 'z', &idx)) return 71; /* absent -> false */
200
+ const char *mark = mkr_span_mark(&s);
201
+ mkr_span_skip(&s, 5); /* clamped to the 2 left */
202
+ if (mkr_span_left(&s) != 0 || mkr_span_since(&s, mark) != 2) return 72;
203
+ if (mkr_span_peek(&s) != -1 || mkr_span_take(&s) != -1) return 73; /* empty: every read -1 */
204
+ if (mkr_span_take(&s) != -1) return 74; /* and stays empty (no underflow) */
205
+
206
+ mkr_span_t e = mkr_span(NULL, 8); /* NULL -> empty span over a VALID address */
207
+ if (e.p == NULL || e.p != e.end) return 89; /* normalized, no NULL arithmetic */
208
+ if (mkr_span_left(&e) != 0 || mkr_span_peek(&e) != -1 || mkr_span_starts(&e, "x", 1)) return 75;
209
+ if (!mkr_span_starts(&e, NULL, 0)) return 90; /* zero-length literal: always true */
210
+
211
+ mkr_span_t base = mkr_span("abcdef", 6);
212
+ mkr_span_t tl = mkr_span_tail(&base, 2); /* sub-span, parent unconsumed */
213
+ if (mkr_span_peek(&tl) != 'c' || mkr_span_left(&tl) != 4) return 91;
214
+ if (mkr_span_peek(&base) != 'a' || mkr_span_left(&base) != 6) return 92;
215
+ mkr_span_t tc = mkr_span_tail(&base, 9); /* past the end: clamped empty */
216
+ if (mkr_span_left(&tc) != 0 || mkr_span_peek(&tc) != -1) return 93;
217
+
218
+ if (!mkr_bytes_eq("ab", 2, "ab", 2) || mkr_bytes_eq("ab", 2, "ac", 2)
219
+ || mkr_bytes_eq("a", 1, "ab", 2) || !mkr_bytes_eq(NULL, 0, "x", 0)) return 76;
220
+
221
+ /* peek/at return bytes as 0..255, never sign-extended negatives */
222
+ mkr_span_t hb = mkr_span("\xFF", 1);
223
+ if (mkr_span_peek(&hb) != 0xFF || mkr_span_at(&hb, 0) != 0xFF) return 77;
224
+ }
225
+
226
+ /* utf8_decode1: strict - valid lengths 1-4; truncation / overlong /
227
+ * surrogate / out-of-range / bad lead all fail closed with 0. */
228
+ {
229
+ uint32_t cp = 0;
230
+ if (mkr_utf8_decode1((const unsigned char *)"A", 1, &cp) != 1 || cp != 'A') return 78;
231
+ if (mkr_utf8_decode1((const unsigned char *)"\xC3\xA9", 2, &cp) != 2 || cp != 0xE9u) return 79;
232
+ if (mkr_utf8_decode1((const unsigned char *)"\xE3\x81\x82", 3, &cp) != 3 || cp != 0x3042u) return 80;
233
+ if (mkr_utf8_decode1((const unsigned char *)"\xF0\x9F\x98\x80", 4, &cp) != 4 || cp != 0x1F600u) return 81;
234
+ if (mkr_utf8_decode1((const unsigned char *)"", 0, &cp) != 0) return 82; /* empty */
235
+ if (mkr_utf8_decode1((const unsigned char *)"\xC3", 1, &cp) != 0) return 83; /* truncated */
236
+ if (mkr_utf8_decode1((const unsigned char *)"\xC0\xAF", 2, &cp) != 0) return 84; /* overlong */
237
+ if (mkr_utf8_decode1((const unsigned char *)"\xED\xA0\x80", 3, &cp) != 0) return 85; /* surrogate */
238
+ if (mkr_utf8_decode1((const unsigned char *)"\xF4\x90\x80\x80", 4, &cp) != 0) return 86; /* > U+10FFFF */
239
+ if (mkr_utf8_decode1((const unsigned char *)"\x80", 1, &cp) != 0) return 87; /* stray cont. */
240
+ if (mkr_utf8_decode1((const unsigned char *)"\xC3\x28", 2, &cp) != 0) return 88; /* bad cont. */
241
+ }
242
+
100
243
  return 0;
101
244
  }
@@ -8,14 +8,23 @@
8
8
  * mkr_alloc.h fail-closed size arithmetic + allocators (the foundation)
9
9
  * mkr_hash.h pointer hash + power-of-two sizer (pointer-keyed index tables)
10
10
  * mkr_text.h string-type lattice (owned/borrowed/verified text + bytes)
11
- * mkr_buf.h mkr_buf_t (growable, capped byte buffer)
11
+ * mkr_utf8.h the one pure-C UTF-8 validator + strict 1-cp decoder
12
+ * mkr_buf.h mkr_buf_t (growable, capped byte buffer) + mkr_spanbuf_t
13
+ * mkr_span.h mkr_span_t (bounded reader - the spanbuf's read twin)
12
14
  *
13
- * NOTHING here touches Ruby exception mapping happens at the glue boundary.
15
+ * NOTHING here touches Ruby - exception mapping happens at the glue boundary.
14
16
  */
15
17
 
16
18
  #include "mkr_alloc.h"
17
19
  #include "mkr_hash.h"
18
20
  #include "mkr_text.h"
21
+ #include "mkr_span.h"
22
+ #include "mkr_utf8.h"
19
23
  #include "mkr_buf.h"
20
24
 
25
+ /* Self-test of the overflow / allocation / buffer edge cases (incl. paths real
26
+ * inputs cannot reach). Returns 0 on success, nonzero on the first failure.
27
+ * Wired to a private Ruby method for the spec suite. */
28
+ int mkr_core_selftest(void);
29
+
21
30
  #endif /* MAKIRI_CORE_MKR_CORE_H */
@@ -35,7 +35,7 @@ mkr_ptr_hash(const void *p)
35
35
 
36
36
  /* Smallest power of two >= n, into *out. Returns false on overflow (no power of
37
37
  * two >= n fits in size_t) so the caller fails closed rather than sizing a
38
- * power-of-two hash table below the element count it must hold which would
38
+ * power-of-two hash table below the element count it must hold - which would
39
39
  * never find a free slot under linear probing. Shared by the pointer-keyed
40
40
  * indexes (attr->owner, text-index). */
41
41
  static inline bool
@@ -0,0 +1,186 @@
1
+ #ifndef MAKIRI_CORE_MKR_SPAN_H
2
+ #define MAKIRI_CORE_MKR_SPAN_H
3
+
4
+ /*
5
+ * mkr_span_t - a bounded READER over a borrowed byte region: the read twin of
6
+ * mkr_spanbuf_t (mkr_buf.h), completing the structural-safety model:
7
+ *
8
+ *             | allocate            | write              | read
9
+ * structure   | checked wrappers    | mkr_spanbuf_t      | mkr_span_t (this)
10
+ * enforced    | lint (direct_alloc) | (arena's only way) | lint (raw_scan_call /
11
+ *             |                     |                    |       raw_cursor_member)
12
+ *
13
+ * Byte-scanning parsers used to guard every read by convention ("check
14
+ * p < end, then *p") - correct at every current site, but a single forgotten
15
+ * check is invisible to the compiler. A span owns the cursor AND the bound:
16
+ * an out-of-bounds read is not an overrun but a -1 / false / clamp, so the
17
+ * unchecked-read bug class is structurally impossible inside a converted TU.
18
+ * The lint (script/check_c_safety.rb) bans the raw scanning primitives in the
19
+ * converted parser TUs, turning the convention into a machine-enforced rule.
20
+ *
21
+ * Every helper is a static inline whose bound check compiles to the same
22
+ * single compare the hand-written guard used - performance-neutral.
23
+ *
24
+ * Like a spanbuf, the BORROW is the caller's responsibility: the span cannot
25
+ * stop you from outliving the buffer it views (input lifetime is handled at
26
+ * the bridge: parse entries copy, borrowed slices never cross a GC point).
27
+ * mkr_span_mark/mkr_span_since exist to CAPTURE slices (pointer arithmetic,
28
+ * never a dereference); reading a captured slice goes back through a span or
29
+ * an audited core primitive.
30
+ */
31
+
32
+ #include <stdbool.h>
33
+ #include <stddef.h>
34
+ #include <string.h>
35
+
36
+ #ifdef __cplusplus
37
+ extern "C" {
38
+ #endif
39
+
40
+ typedef struct {
41
+ const char *p; /* cursor (always <= end) */
42
+ const char *end; /* one past the last readable byte */
43
+ } mkr_span_t;
44
+
45
+ /* Wrap [ptr, ptr+len) for bounded reading. ptr == NULL yields the empty span
46
+ * (every read -1 / false), so an absent input flows through without a guard.
47
+ * The NULL case is normalized to a VALID empty address (not p = end = NULL):
48
+ * the helpers do pointer subtraction and relational compares, which C defines
49
+ * only within one object - NULL - NULL / NULL < NULL is formally UB - so the
50
+ * normalization here keeps every helper unconditional AND well-defined. */
51
+ static inline mkr_span_t
52
+ mkr_span(const char *ptr, size_t len)
53
+ {
54
+ if (ptr == NULL) { ptr = ""; len = 0; }
55
+ mkr_span_t s;
56
+ s.p = ptr;
57
+ s.end = ptr + len;
58
+ return s;
59
+ }
60
+
61
+ /* Bytes remaining. */
62
+ static inline size_t
63
+ mkr_span_left(const mkr_span_t *s)
64
+ {
65
+ return (size_t)(s->end - s->p);
66
+ }
67
+
68
+ /* The byte at the cursor as 0..255, or -1 at end-of-span. */
69
+ static inline int
70
+ mkr_span_peek(const mkr_span_t *s)
71
+ {
72
+ return s->p < s->end ? (unsigned char)*s->p : -1;
73
+ }
74
+
75
+ /* The byte at cursor+i (bounded lookahead), or -1 past the end. */
76
+ static inline int
77
+ mkr_span_at(const mkr_span_t *s, size_t i)
78
+ {
79
+ return i < mkr_span_left(s) ? (unsigned char)s->p[i] : -1;
80
+ }
81
+
82
+ /* Consume and return the byte at the cursor, or -1 at end-of-span. */
83
+ static inline int
84
+ mkr_span_take(mkr_span_t *s)
85
+ {
86
+ return s->p < s->end ? (unsigned char)*s->p++ : -1;
87
+ }
88
+
89
+ /* Advance up to n bytes (clamped at the end - never past it). */
90
+ static inline void
91
+ mkr_span_skip(mkr_span_t *s, size_t n)
92
+ {
93
+ size_t left = mkr_span_left(s);
94
+ s->p += (n <= left) ? n : left;
95
+ }
96
+
97
+ /* True if the remaining input begins with the n-byte literal. n == 0 is true
98
+ * regardless of lit (even NULL - aligned with mkr_bytes_eq; also keeps the
99
+ * memcmp away from a possibly-NULL lit, which is formally UB even for 0). */
100
+ static inline bool
101
+ mkr_span_starts(const mkr_span_t *s, const char *lit, size_t n)
102
+ {
103
+ if (n == 0) return true;
104
+ return mkr_span_left(s) >= n && memcmp(s->p, lit, n) == 0;
105
+ }
106
+
107
+ /* Find byte c in the remaining input; true + its offset from the cursor in
108
+ * *idx, or false if absent (idx untouched). */
109
+ static inline bool
110
+ mkr_span_find(const mkr_span_t *s, char c, size_t *idx)
111
+ {
112
+ const char *hit = (const char *)memchr(s->p, c, mkr_span_left(s));
113
+ if (hit == NULL) return false;
114
+ *idx = (size_t)(hit - s->p);
115
+ return true;
116
+ }
117
+
118
+ /* The remaining input from cursor+off onward, as a fresh sub-span (clamped at
119
+ * the end). For bounded lookahead scans that must not consume the parent. */
120
+ static inline mkr_span_t
121
+ mkr_span_tail(const mkr_span_t *s, size_t off)
122
+ {
123
+ mkr_span_t t = *s;
124
+ mkr_span_skip(&t, off);
125
+ return t;
126
+ }
127
+
128
+ /* The cursor position, for capturing a slice start (an address, NEVER read
129
+ * through directly - pair with mkr_span_since and hand the slice to a span or
130
+ * an audited primitive). */
131
+ static inline const char *
132
+ mkr_span_mark(const mkr_span_t *s)
133
+ {
134
+ return s->p;
135
+ }
136
+
137
+ /* Bytes consumed since +mark+ (a prior mkr_span_mark of the SAME span). */
138
+ static inline size_t
139
+ mkr_span_since(const mkr_span_t *s, const char *mark)
140
+ {
141
+ return (size_t)(s->p - mark);
142
+ }
143
+
144
+ /* Length-checked slice equality (the audited replacement for an open-coded
145
+ * memcmp over two captured slices). Zero-length slices are equal regardless
146
+ * of pointers (a NULL "" never gets dereferenced). */
147
+ static inline bool
148
+ mkr_bytes_eq(const void *a, size_t alen, const void *b, size_t blen)
149
+ {
150
+ return alen == blen && (alen == 0 || memcmp(a, b, alen) == 0);
151
+ }
152
+
153
+ /* Substring search: find the first occurrence of [needle, needle+needle_len)
154
+ * within [hay, hay+hay_len). On a hit returns true and writes the byte offset to
155
+ * *idx; on a miss returns false and leaves *idx untouched. The audited
156
+ * replacement for an open-coded substring memcmp (a scan, so it lives in core).
157
+ * Boundary behavior mirrors mkr_bytes_eq's: lengths are the truth, pointers are
158
+ * never dereferenced at length 0 -
159
+ * - an empty needle (needle_len == 0) matches at offset 0 in ANY haystack,
160
+ * including an empty or NULL "" one (the substring analogue of two empty
161
+ * slices comparing equal);
162
+ * - a needle longer than the haystack never matches, so a NULL/empty haystack
163
+ * misses every non-empty needle without a read. */
164
+ static inline bool
165
+ mkr_bytes_find(const void *hay, size_t hay_len,
166
+ const void *needle, size_t needle_len, size_t *idx)
167
+ {
168
+ if (needle_len == 0) { *idx = 0; return true; }
169
+ if (needle_len > hay_len) return false;
170
+ const char *h = (const char *)hay;
171
+ const char *n = (const char *)needle;
172
+ size_t last = hay_len - needle_len;
173
+ for (size_t i = 0; i <= last; ++i) {
174
+ if (h[i] == n[0] && memcmp(h + i, n, needle_len) == 0) {
175
+ *idx = i;
176
+ return true;
177
+ }
178
+ }
179
+ return false;
180
+ }
181
+
182
+ #ifdef __cplusplus
183
+ }
184
+ #endif
185
+
186
+ #endif /* MAKIRI_CORE_MKR_SPAN_H */
@@ -16,7 +16,7 @@ extern "C" {
16
16
  #endif
17
17
 
18
18
  /* ---------------------------------------------------------------- */
19
- /* mkr_verified_text_t a string proven to meet the engine text contract */
19
+ /* mkr_verified_text_t - a string proven to meet the engine text contract */
20
20
  /* ---------------------------------------------------------------- */
21
21
 
22
22
  /* A borrowed byte slice whose contents are guaranteed to satisfy Makiri's
@@ -39,7 +39,7 @@ typedef struct {
39
39
  /*
40
40
  * Makiri's string types form a small lattice over two axes plus a shape marker.
41
41
  * They look alike ({ptr,len}) but C has no subtyping, so each contract is its
42
- * own type that distinctness IS the guarantee, and is why there is no single
42
+ * own type - that distinctness IS the guarantee, and is why there is no single
43
43
  * "string" type.
44
44
  *
45
45
  * axis 1 ownership : borrowed (we never free) | owned (free via *_clear)
@@ -51,7 +51,7 @@ typedef struct {
51
51
  * shape \ contract raw (bytes) valid (text)
52
52
  * ---------------------- ------------------------ -------------------------
53
53
  * ruby-anchored borrowed mkr_ruby_borrowed_bytes_t mkr_ruby_borrowed_text_t (bridge.h)
54
- * borrowed slice (none yet would be mkr_borrowed_text_t /
54
+ * borrowed slice (none yet - would be mkr_borrowed_text_t /
55
55
  * mkr_borrowed_bytes_t) mkr_verified_text_t (*)
56
56
  * owned mkr_owned_bytes_t mkr_owned_text_t
57
57
  *
@@ -65,22 +65,22 @@ typedef struct {
65
65
  * cannot reach the engine's public API. Internally the engine carries the
66
66
  * freely-constructible mkr_borrowed_text_t instead.
67
67
  *
68
- * Conversions the only sanctioned edges. The points that actually VALIDATE
68
+ * Conversions - the only sanctioned edges. The points that actually VALIDATE
69
69
  * raw bytes are the bridge's checked entry points; everything else only moves
70
70
  * already-valid text between shapes (no edge re-validates, and none turns raw
71
71
  * bytes into text without one of those checks):
72
- * validate raw -> valid : the bridge's checked entry points only
72
+ * validate raw -> valid : the bridge's checked entry points only -
73
73
  * mkr_ruby_verified_text / mkr_ruby_try_verified_text
74
74
  * (both validate UTF-8 + no NUL); never a cast.
75
75
  * drop the GC anchor : mkr_verified_text_from_view (ruby_borrowed_text -> verified_text)
76
76
  * assert valid (no copy) : mkr_borrowed_text (const char*,len -> borrowed_text)
77
- * caller asserts the bytes already meet the contract
77
+ * - caller asserts the bytes already meet the contract
78
78
  * downgrade to borrow : mkr_borrowed_text_from_owned (owned_text -> borrowed_text)
79
79
  * mkr_borrowed_text_from_verified (verified_text -> borrowed_text)
80
80
  * copy into owned : mkr_owned_text_from_borrowed_copy /
81
- * mkr_owned_text_from_buf_steal accept only
81
+ * mkr_owned_text_from_buf_steal - accept only
82
82
  * already-asserted-valid text; they copy, not validate.
83
- * take ownership : mkr_owned_text (char*,len -> owned_text) caller
83
+ * take ownership : mkr_owned_text (char*,len -> owned_text) - caller
84
84
  * transfers an already-valid heap buffer it produced
85
85
  * (substring/concat/format output); asserts validity.
86
86
  */