makiri 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/conformance.yml +22 -0
  3. data/.github/workflows/libfuzzer.yml +83 -0
  4. data/.github/workflows/security.yml +88 -3
  5. data/.github/workflows/valgrind.yml +135 -0
  6. data/CHANGELOG.md +60 -2
  7. data/README.md +81 -77
  8. data/Rakefile +194 -3
  9. data/ext/makiri/bridge/ruby_string.c +119 -66
  10. data/ext/makiri/core/mkr_alloc.c +40 -3
  11. data/ext/makiri/core/mkr_alloc.h +27 -4
  12. data/ext/makiri/core/mkr_buf.c +13 -3
  13. data/ext/makiri/core/mkr_buf.h +80 -5
  14. data/ext/makiri/core/mkr_core.c +143 -0
  15. data/ext/makiri/core/mkr_core.h +10 -1
  16. data/ext/makiri/core/mkr_span.h +186 -0
  17. data/ext/makiri/core/mkr_utf8.c +101 -0
  18. data/ext/makiri/core/mkr_utf8.h +88 -0
  19. data/ext/makiri/extconf.rb +104 -9
  20. data/ext/makiri/fuzz/Makefile +95 -0
  21. data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
  22. data/ext/makiri/fuzz/xml_fuzz.c +24 -0
  23. data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
  24. data/ext/makiri/glue/glue.h +8 -0
  25. data/ext/makiri/glue/ruby_doc.c +20 -24
  26. data/ext/makiri/glue/ruby_html_css.c +58 -12
  27. data/ext/makiri/glue/ruby_html_mutate.c +11 -6
  28. data/ext/makiri/glue/ruby_html_node.c +3 -32
  29. data/ext/makiri/glue/ruby_node.c +39 -0
  30. data/ext/makiri/glue/ruby_xml.c +198 -16
  31. data/ext/makiri/glue/ruby_xml_node.c +46 -59
  32. data/ext/makiri/glue/ruby_xpath.c +4 -4
  33. data/ext/makiri/lexbor_compat/source_loc.c +14 -16
  34. data/ext/makiri/lexbor_compat/utf8_input.c +5 -78
  35. data/ext/makiri/makiri.c +45 -0
  36. data/ext/makiri/xml/mkr_xml.h +2 -3
  37. data/ext/makiri/xml/mkr_xml_chars.c +67 -97
  38. data/ext/makiri/xml/mkr_xml_index.c +169 -0
  39. data/ext/makiri/xml/mkr_xml_index.h +48 -0
  40. data/ext/makiri/xml/mkr_xml_mutate.c +63 -121
  41. data/ext/makiri/xml/mkr_xml_node.c +147 -15
  42. data/ext/makiri/xml/mkr_xml_node.h +71 -6
  43. data/ext/makiri/xml/mkr_xml_tree.c +185 -149
  44. data/ext/makiri/xpath/mkr_css.c +1023 -0
  45. data/ext/makiri/xpath/mkr_css.h +65 -0
  46. data/ext/makiri/xpath/mkr_xpath.c +37 -0
  47. data/ext/makiri/xpath/mkr_xpath.h +13 -0
  48. data/ext/makiri/xpath/mkr_xpath_eval_body.h +373 -90
  49. data/ext/makiri/xpath/mkr_xpath_funcs_body.h +249 -231
  50. data/ext/makiri/xpath/mkr_xpath_internal.h +89 -9
  51. data/ext/makiri/xpath/mkr_xpath_lex.c +94 -124
  52. data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +6 -3
  53. data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
  54. data/ext/makiri/xpath/mkr_xpath_parse.c +79 -90
  55. data/ext/makiri/xpath/mkr_xpath_shared.c +40 -24
  56. data/ext/makiri/xpath/mkr_xpath_value_body.h +50 -24
  57. data/lib/makiri/cdata_section.rb +1 -3
  58. data/lib/makiri/comment.rb +1 -3
  59. data/lib/makiri/document.rb +8 -0
  60. data/lib/makiri/element.rb +1 -3
  61. data/lib/makiri/processing_instruction.rb +1 -3
  62. data/lib/makiri/text.rb +1 -3
  63. data/lib/makiri/version.rb +1 -1
  64. data/lib/makiri/xml/builder.rb +263 -0
  65. data/lib/makiri/xml/node_methods.rb +47 -0
  66. data/lib/makiri.rb +1 -0
  67. data/script/check_alloc_failures.rb +266 -0
  68. data/script/check_c_safety.rb +45 -2
  69. data/script/check_c_safety_allowlist.yml +19 -0
  70. data/script/check_leaks.rb +64 -0
  71. data/script/leaks_harness.rb +64 -0
  72. data/vendor/lexbor/CMakeLists.txt +6 -0
  73. data/vendor/lexbor/README.md +12 -0
  74. data/vendor/lexbor/config.cmake +1 -1
  75. data/vendor/lexbor/source/lexbor/core/base.h +1 -1
  76. data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
  77. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
  78. data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
  79. data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
  80. data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
  81. data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
  82. data/vendor/lexbor/source/lexbor/html/base.h +1 -1
  83. data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
  84. data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
  85. data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
  86. data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
  87. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
  88. data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
  89. data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
  90. data/vendor/lexbor/source/lexbor/url/base.h +1 -1
  91. data/vendor/lexbor/source/lexbor/url/url.c +5 -2
  92. data/vendor/lexbor/source/lexbor/url/url.h +9 -0
  93. data/vendor/lexbor/version +1 -1
  94. metadata +19 -1
@@ -18,6 +18,12 @@ mkr_core_selftest(void)
18
18
  if (!mkr_size_mul(4, 5, &out) || out != 20) return 3;
19
19
  if (mkr_size_mul(SIZE_MAX / 2 + 1, 2, &out)) return 4; /* must overflow */
20
20
  if (!mkr_size_mul(0, SIZE_MAX, &out) || out != 0) return 5;
21
+ /* exact-boundary: the largest non-overflowing result MUST succeed (catches an
22
+ * off-by-one in the `a > SIZE_MAX - b` / `b > SIZE_MAX / a` predicates). */
23
+ if (!mkr_size_add(SIZE_MAX, 0, &out) || out != SIZE_MAX) return 55;
24
+ if (!mkr_size_add(SIZE_MAX - 1, 1, &out) || out != SIZE_MAX) return 56;
25
+ if (!mkr_size_mul(SIZE_MAX, 1, &out) || out != SIZE_MAX) return 57;
26
+ if (!mkr_size_mul(SIZE_MAX / 2, 2, &out) || out != (SIZE_MAX / 2) * 2) return 58;
21
27
 
22
28
  /* grow_capacity: geometric, and overflow on a huge element */
23
29
  if (!mkr_grow_capacity(0, 1, 1, &out) || out < 1) return 6;
@@ -39,6 +45,9 @@ mkr_core_selftest(void)
39
45
  /* str_alloc / strndup / strdup: copy, terminate, fail closed on NULL+len */
40
46
  {
41
47
  if (mkr_str_alloc(SIZE_MAX) != NULL) return 26; /* n + 1 overflow */
48
+ char *z = mkr_str_alloc(0); /* boundary: 0 -> 1-byte "\0" */
49
+ if (z == NULL || z[0] != '\0') { free(z); return 59; }
50
+ free(z);
42
51
  char *p = mkr_strndup("hello", 3);
43
52
  if (p == NULL || memcmp(p, "hel", 4) != 0) { free(p); return 27; } /* "hel\0" */
44
53
  free(p);
@@ -97,5 +106,139 @@ mkr_core_selftest(void)
97
106
  free(s);
98
107
  }
99
108
 
109
+ /* spanbuf: writes that fit, then one that overruns -> refused + sticky */
110
+ {
111
+ char store[4];
112
+ mkr_spanbuf_t f = mkr_spanbuf(store, sizeof store);
113
+ mkr_spanbuf_putc(&f, 'a');
114
+ mkr_spanbuf_write(&f, "bc", 2); /* pos == 3, room for 1 */
115
+ if (!f.ok || f.pos != 3) return 35;
116
+ mkr_spanbuf_write(&f, "de", 2); /* exceeds cap -> refused */
117
+ if (f.ok) return 36; /* must have latched */
118
+ if (f.pos != 3) return 37; /* refused write didn't advance */
119
+ if (mkr_spanbuf_finish(&f) != NULL) return 38; /* not-ok -> NULL */
120
+ if (memcmp(store, "abc", 3) != 0) return 39; /* no overrun past pos */
121
+ }
122
+
123
+ /* spanbuf: exact fill is ok; a NULL backing buffer is never ok */
124
+ {
125
+ char store[2];
126
+ mkr_spanbuf_t f = mkr_spanbuf(store, sizeof store);
127
+ mkr_spanbuf_write(&f, "xy", 2); /* exactly cap -> still ok */
128
+ if (!f.ok || mkr_spanbuf_finish(&f) != store || f.pos != 2) return 40;
129
+ mkr_spanbuf_putc(&f, 'z'); /* boundary: at pos==cap -> refused */
130
+ if (f.ok || f.pos != 2) return 60;
131
+
132
+ mkr_spanbuf_t g = mkr_spanbuf(NULL, 8); /* alloc-failed backing */
133
+ mkr_spanbuf_putc(&g, 'z'); /* no-op, no crash */
134
+ if (g.ok || mkr_spanbuf_finish(&g) != NULL) return 41;
135
+ }
136
+
137
+ /* grow_reserve: grows + updates ptr/cap; already-enough is a no-op; an
138
+ * overflowing (need*elem) request fails closed (OOM) leaving ptr/cap as-is. */
139
+ {
140
+ size_t *p = NULL, cap = 0;
141
+ if (mkr_grow_reserve((void **)&p, &cap, 10, sizeof(*p)) != MKR_OK) { free(p); return 42; }
142
+ if (p == NULL || cap < 10) { free(p); return 43; }
143
+ p[0] = 1; p[9] = 2; /* in-bounds after grow */
144
+
145
+ size_t *d0 = p; size_t c0 = cap;
146
+ if (mkr_grow_reserve((void **)&p, &cap, 5, sizeof(*p)) != MKR_OK) { free(p); return 44; }
147
+ if (p != d0 || cap != c0) { free(p); return 45; } /* need < cap -> no-op */
148
+ if (mkr_grow_reserve((void **)&p, &cap, cap, sizeof(*p)) != MKR_OK) { free(p); return 61; }
149
+ if (p != d0 || cap != c0) { free(p); return 62; } /* boundary: need == cap -> no-op */
150
+
151
+ if (mkr_grow_reserve((void **)&p, &cap, SIZE_MAX, sizeof(*p)) != MKR_ERR_OOM) { free(p); return 46; }
152
+ if (p != d0 || cap != c0) { free(p); return 47; } /* overflow -> unchanged */
153
+ free(p);
154
+ }
155
+
156
+ /* buf_reserve: pre-allocates capacity without touching len; no-op when room
157
+ * already exists; clamps a request to the buffer's max (never allocates past
158
+ * it); the buffer stays usable afterwards. */
159
+ {
160
+ mkr_buf_t b;
161
+ mkr_buf_init(&b, 16); /* small ceiling */
162
+ if (mkr_buf_reserve(&b, 8) != MKR_OK) { mkr_buf_free(&b); return 48; }
163
+ if (b.cap < 9 || b.len != 0) { mkr_buf_free(&b); return 49; } /* reserved (+NUL), still empty */
164
+
165
+ char *d0 = b.data; size_t c0 = b.cap;
166
+ if (mkr_buf_reserve(&b, 4) != MKR_OK) { mkr_buf_free(&b); return 50; }
167
+ if (b.data != d0 || b.cap != c0) { mkr_buf_free(&b); return 51; } /* already room -> no-op */
168
+
169
+ if (mkr_buf_reserve(&b, (size_t)1 << 40) != MKR_OK) { mkr_buf_free(&b); return 52; }
170
+ if (b.cap > 17) { mkr_buf_free(&b); return 53; } /* clamped to max(16)+NUL, no huge alloc */
171
+
172
+ if (mkr_buf_append(&b, "abc", 3) != MKR_OK || b.len != 3) { mkr_buf_free(&b); return 54; }
173
+ mkr_buf_free(&b);
174
+ }
175
+
176
+ /* buf_append: geometric growth is clamped to the content ceiling (max + the
177
+ * NUL), so a near-limit append never over-allocates to ~2x. Here filling to
178
+ * max(10) would geometrically reach cap 16; the clamp holds it at max+1(11),
179
+ * the same property that keeps a real buffer's cap under MKR_BUF_HARD_MAX. */
180
+ {
181
+ mkr_buf_t b;
182
+ mkr_buf_init(&b, 10);
183
+ if (mkr_buf_append(&b, "0123456789", 10) != MKR_OK) { mkr_buf_free(&b); return 63; }
184
+ if (b.len != 10 || b.cap > 11) { mkr_buf_free(&b); return 64; } /* clamped, not the geometric 16 */
185
+ if (mkr_buf_append(&b, "x", 1) != MKR_ERR_LIMIT) { mkr_buf_free(&b); return 65; } /* still capped */
186
+ mkr_buf_free(&b);
187
+ }
188
+
189
+ /* span: bounded reads - in-bounds values, out-of-bounds -1/false/clamp,
190
+ * never an overrun. Exercises every helper at its boundary. */
191
+ {
192
+ mkr_span_t s = mkr_span("abc", 3);
193
+ if (mkr_span_left(&s) != 3 || mkr_span_peek(&s) != 'a') return 66;
194
+ if (mkr_span_at(&s, 2) != 'c' || mkr_span_at(&s, 3) != -1) return 67; /* exact bound */
195
+ if (!mkr_span_starts(&s, "abc", 3) || mkr_span_starts(&s, "abcd", 4)) return 68;
196
+ if (mkr_span_take(&s) != 'a' || mkr_span_left(&s) != 2) return 69;
197
+ size_t idx = 99;
198
+ if (!mkr_span_find(&s, 'c', &idx) || idx != 1) return 70;
199
+ if (mkr_span_find(&s, 'z', &idx)) return 71; /* absent -> false */
200
+ const char *mark = mkr_span_mark(&s);
201
+ mkr_span_skip(&s, 5); /* clamped to the 2 left */
202
+ if (mkr_span_left(&s) != 0 || mkr_span_since(&s, mark) != 2) return 72;
203
+ if (mkr_span_peek(&s) != -1 || mkr_span_take(&s) != -1) return 73; /* empty: every read -1 */
204
+ if (mkr_span_take(&s) != -1) return 74; /* and stays empty (no underflow) */
205
+
206
+ mkr_span_t e = mkr_span(NULL, 8); /* NULL -> empty span over a VALID address */
207
+ if (e.p == NULL || e.p != e.end) return 89; /* normalized, no NULL arithmetic */
208
+ if (mkr_span_left(&e) != 0 || mkr_span_peek(&e) != -1 || mkr_span_starts(&e, "x", 1)) return 75;
209
+ if (!mkr_span_starts(&e, NULL, 0)) return 90; /* zero-length literal: always true */
210
+
211
+ mkr_span_t base = mkr_span("abcdef", 6);
212
+ mkr_span_t tl = mkr_span_tail(&base, 2); /* sub-span, parent unconsumed */
213
+ if (mkr_span_peek(&tl) != 'c' || mkr_span_left(&tl) != 4) return 91;
214
+ if (mkr_span_peek(&base) != 'a' || mkr_span_left(&base) != 6) return 92;
215
+ mkr_span_t tc = mkr_span_tail(&base, 9); /* past the end: clamped empty */
216
+ if (mkr_span_left(&tc) != 0 || mkr_span_peek(&tc) != -1) return 93;
217
+
218
+ if (!mkr_bytes_eq("ab", 2, "ab", 2) || mkr_bytes_eq("ab", 2, "ac", 2)
219
+ || mkr_bytes_eq("a", 1, "ab", 2) || !mkr_bytes_eq(NULL, 0, "x", 0)) return 76;
220
+
221
+ /* peek/at return bytes as 0..255, never sign-extended negatives */
222
+ mkr_span_t hb = mkr_span("\xFF", 1);
223
+ if (mkr_span_peek(&hb) != 0xFF || mkr_span_at(&hb, 0) != 0xFF) return 77;
224
+ }
225
+
226
+ /* utf8_decode1: strict - valid lengths 1-4; truncation / overlong /
227
+ * surrogate / out-of-range / bad lead all fail closed with 0. */
228
+ {
229
+ uint32_t cp = 0;
230
+ if (mkr_utf8_decode1((const unsigned char *)"A", 1, &cp) != 1 || cp != 'A') return 78;
231
+ if (mkr_utf8_decode1((const unsigned char *)"\xC3\xA9", 2, &cp) != 2 || cp != 0xE9u) return 79;
232
+ if (mkr_utf8_decode1((const unsigned char *)"\xE3\x81\x82", 3, &cp) != 3 || cp != 0x3042u) return 80;
233
+ if (mkr_utf8_decode1((const unsigned char *)"\xF0\x9F\x98\x80", 4, &cp) != 4 || cp != 0x1F600u) return 81;
234
+ if (mkr_utf8_decode1((const unsigned char *)"", 0, &cp) != 0) return 82; /* empty */
235
+ if (mkr_utf8_decode1((const unsigned char *)"\xC3", 1, &cp) != 0) return 83; /* truncated */
236
+ if (mkr_utf8_decode1((const unsigned char *)"\xC0\xAF", 2, &cp) != 0) return 84; /* overlong */
237
+ if (mkr_utf8_decode1((const unsigned char *)"\xED\xA0\x80", 3, &cp) != 0) return 85; /* surrogate */
238
+ if (mkr_utf8_decode1((const unsigned char *)"\xF4\x90\x80\x80", 4, &cp) != 0) return 86; /* > U+10FFFF */
239
+ if (mkr_utf8_decode1((const unsigned char *)"\x80", 1, &cp) != 0) return 87; /* stray cont. */
240
+ if (mkr_utf8_decode1((const unsigned char *)"\xC3\x28", 2, &cp) != 0) return 88; /* bad cont. */
241
+ }
242
+
100
243
  return 0;
101
244
  }
@@ -8,7 +8,9 @@
8
8
  * mkr_alloc.h fail-closed size arithmetic + allocators (the foundation)
9
9
  * mkr_hash.h pointer hash + power-of-two sizer (pointer-keyed index tables)
10
10
  * mkr_text.h string-type lattice (owned/borrowed/verified text + bytes)
11
- * mkr_buf.h mkr_buf_t (growable, capped byte buffer)
11
+ * mkr_utf8.h the one pure-C UTF-8 validator + strict 1-cp decoder
12
+ * mkr_buf.h mkr_buf_t (growable, capped byte buffer) + mkr_spanbuf_t
13
+ * mkr_span.h mkr_span_t (bounded reader - the spanbuf's read twin)
12
14
  *
13
15
  * NOTHING here touches Ruby - exception mapping happens at the glue boundary.
14
16
  */
@@ -16,6 +18,13 @@
16
18
  #include "mkr_alloc.h"
17
19
  #include "mkr_hash.h"
18
20
  #include "mkr_text.h"
21
+ #include "mkr_span.h"
22
+ #include "mkr_utf8.h"
19
23
  #include "mkr_buf.h"
20
24
 
25
+ /* Self-test of the overflow / allocation / buffer edge cases (incl. paths real
26
+ * inputs cannot reach). Returns 0 on success, nonzero on the first failure.
27
+ * Wired to a private Ruby method for the spec suite. */
28
+ int mkr_core_selftest(void);
29
+
21
30
  #endif /* MAKIRI_CORE_MKR_CORE_H */
@@ -0,0 +1,186 @@
1
+ #ifndef MAKIRI_CORE_MKR_SPAN_H
2
+ #define MAKIRI_CORE_MKR_SPAN_H
3
+
4
+ /*
5
+ * mkr_span_t - a bounded READER over a borrowed byte region: the read twin of
6
+ * mkr_spanbuf_t (mkr_buf.h), completing the structural-safety model:
7
+ *
8
+ *              allocate             write               read
9
+ * structure    checked wrappers     mkr_spanbuf_t       mkr_span_t (this)
10
+ * enforced     lint (direct_alloc)  (arena's only way)  lint (raw_scan_call /
11
+ *                                                        raw_cursor_member)
12
+ *
13
+ * Byte-scanning parsers used to guard every read by convention ("check
14
+ * p < end, then *p") - correct at every current site, but a single forgotten
15
+ * check is invisible to the compiler. A span owns the cursor AND the bound:
16
+ * an out-of-bounds read is not an overrun but a -1 / false / clamp, so the
17
+ * unchecked-read bug class is structurally impossible inside a converted TU.
18
+ * The lint (script/check_c_safety.rb) bans the raw scanning primitives in the
19
+ * converted parser TUs, turning the convention into a machine-enforced rule.
20
+ *
21
+ * Every helper is a static inline whose bound check compiles to the same
22
+ * single compare the hand-written guard used - performance-neutral.
23
+ *
24
+ * Like a spanbuf, the BORROW is the caller's responsibility: the span cannot
25
+ * stop you from outliving the buffer it views (input lifetime is handled at
26
+ * the bridge: parse entries copy, borrowed slices never cross a GC point).
27
+ * mkr_span_mark/mkr_span_since exist to CAPTURE slices (pointer arithmetic,
28
+ * never a dereference); reading a captured slice goes back through a span or
29
+ * an audited core primitive.
30
+ */
31
+
32
+ #include <stdbool.h>
33
+ #include <stddef.h>
34
+ #include <string.h>
35
+
36
+ #ifdef __cplusplus
37
+ extern "C" {
38
+ #endif
39
+
40
/* Bounded reader state: a cursor plus the limit it may never pass. Both
 * pointers refer to the same borrowed byte region. */
typedef struct {
    const char *p;   /* read cursor; the helpers keep it at or before end */
    const char *end; /* first address past the readable bytes */
} mkr_span_t;
44
+
45
+ /* Wrap [ptr, ptr+len) for bounded reading. ptr == NULL yields the empty span
46
+ * (every read -1 / false), so an absent input flows through without a guard.
47
+ * The NULL case is normalized to a VALID empty address (not p = end = NULL):
48
+ * the helpers do pointer subtraction and relational compares, which C defines
49
+ * only within one object - NULL - NULL / NULL < NULL is formally UB - so the
50
+ * normalization here keeps every helper unconditional AND well-defined. */
51
+ static inline mkr_span_t
52
+ mkr_span(const char *ptr, size_t len)
53
+ {
54
+ if (ptr == NULL) { ptr = ""; len = 0; }
55
+ mkr_span_t s;
56
+ s.p = ptr;
57
+ s.end = ptr + len;
58
+ return s;
59
+ }
60
+
61
+ /* Bytes remaining. */
62
+ static inline size_t
63
+ mkr_span_left(const mkr_span_t *s)
64
+ {
65
+ return (size_t)(s->end - s->p);
66
+ }
67
+
68
+ /* The byte at the cursor as 0..255, or -1 at end-of-span. */
69
+ static inline int
70
+ mkr_span_peek(const mkr_span_t *s)
71
+ {
72
+ return s->p < s->end ? (unsigned char)*s->p : -1;
73
+ }
74
+
75
+ /* The byte at cursor+i (bounded lookahead), or -1 past the end. */
76
+ static inline int
77
+ mkr_span_at(const mkr_span_t *s, size_t i)
78
+ {
79
+ return i < mkr_span_left(s) ? (unsigned char)s->p[i] : -1;
80
+ }
81
+
82
+ /* Consume and return the byte at the cursor, or -1 at end-of-span. */
83
+ static inline int
84
+ mkr_span_take(mkr_span_t *s)
85
+ {
86
+ return s->p < s->end ? (unsigned char)*s->p++ : -1;
87
+ }
88
+
89
+ /* Advance up to n bytes (clamped at the end - never past it). */
90
+ static inline void
91
+ mkr_span_skip(mkr_span_t *s, size_t n)
92
+ {
93
+ size_t left = mkr_span_left(s);
94
+ s->p += (n <= left) ? n : left;
95
+ }
96
+
97
+ /* True if the remaining input begins with the n-byte literal. n == 0 is true
98
+ * regardless of lit (even NULL - aligned with mkr_bytes_eq; also keeps the
99
+ * memcmp away from a possibly-NULL lit, which is formally UB even for 0). */
100
+ static inline bool
101
+ mkr_span_starts(const mkr_span_t *s, const char *lit, size_t n)
102
+ {
103
+ if (n == 0) return true;
104
+ return mkr_span_left(s) >= n && memcmp(s->p, lit, n) == 0;
105
+ }
106
+
107
+ /* Find byte c in the remaining input; true + its offset from the cursor in
108
+ * *idx, or false if absent (idx untouched). */
109
+ static inline bool
110
+ mkr_span_find(const mkr_span_t *s, char c, size_t *idx)
111
+ {
112
+ const char *hit = (const char *)memchr(s->p, c, mkr_span_left(s));
113
+ if (hit == NULL) return false;
114
+ *idx = (size_t)(hit - s->p);
115
+ return true;
116
+ }
117
+
118
+ /* The remaining input from cursor+off onward, as a fresh sub-span (clamped at
119
+ * the end). For bounded lookahead scans that must not consume the parent. */
120
+ static inline mkr_span_t
121
+ mkr_span_tail(const mkr_span_t *s, size_t off)
122
+ {
123
+ mkr_span_t t = *s;
124
+ mkr_span_skip(&t, off);
125
+ return t;
126
+ }
127
+
128
+ /* The cursor position, for capturing a slice start (an address, NEVER read
129
+ * through directly - pair with mkr_span_since and hand the slice to a span or
130
+ * an audited primitive). */
131
+ static inline const char *
132
+ mkr_span_mark(const mkr_span_t *s)
133
+ {
134
+ return s->p;
135
+ }
136
+
137
+ /* Bytes consumed since +mark+ (a prior mkr_span_mark of the SAME span). */
138
+ static inline size_t
139
+ mkr_span_since(const mkr_span_t *s, const char *mark)
140
+ {
141
+ return (size_t)(s->p - mark);
142
+ }
143
+
144
/* Length-checked equality of two byte slices: the audited replacement for an
 * open-coded memcmp over captured slices. Slices of different length are
 * unequal; two zero-length slices are equal whatever their pointers are, so a
 * NULL "" is never dereferenced. */
static inline bool
mkr_bytes_eq(const void *a, size_t alen, const void *b, size_t blen)
{
    if (alen != blen) {
        return false;
    }
    if (alen == 0) {
        return true; /* both empty: equal without touching a or b */
    }
    return memcmp(a, b, alen) == 0;
}
152
+
153
/* Substring search over two length-delimited byte slices.
 *
 * Looks for the first occurrence of [needle, needle+needle_len) inside
 * [hay, hay+hay_len). Returns true and stores the match's byte offset in *idx
 * on a hit; returns false and leaves *idx alone on a miss.
 *
 * Lengths are authoritative and no pointer is dereferenced at length 0:
 *   - an empty needle matches at offset 0 in any haystack, including an empty
 *     or NULL one;
 *   - a needle longer than the haystack cannot match, so a NULL/empty haystack
 *     misses every non-empty needle without a read. */
static inline bool
mkr_bytes_find(const void *hay, size_t hay_len,
               const void *needle, size_t needle_len, size_t *idx)
{
    if (needle_len == 0) {
        *idx = 0;
        return true;
    }
    if (hay_len < needle_len) {
        return false;
    }

    const char *base = (const char *)hay;
    const char *pat  = (const char *)needle;
    /* Last start position at which the whole needle still fits. */
    const char *stop = base + (hay_len - needle_len);

    for (const char *cur = base; cur <= stop; ++cur) {
        if (*cur != pat[0]) {
            continue; /* cheap first-byte filter before the full compare */
        }
        if (memcmp(cur, pat, needle_len) == 0) {
            *idx = (size_t)(cur - base);
            return true;
        }
    }
    return false;
}
181
+
182
+ #ifdef __cplusplus
183
+ }
184
+ #endif
185
+
186
+ #endif /* MAKIRI_CORE_MKR_SPAN_H */
@@ -0,0 +1,101 @@
1
+ /* mkr_utf8.c - the shared pure-C UTF-8 validator. Ruby-free, allocation-free.
2
+ * See mkr_utf8.h for the contract and why it lives in core. Moved verbatim from
3
+ * lexbor_compat/utf8_input.c (whose sanitiser fast path now calls this). */
4
+ #include "mkr_utf8.h"
5
+
6
+ #include <string.h> /* memcpy for the word-at-a-time ASCII scan */
7
+
8
/*
 * mkr_utf8_valid - true iff [src, src+len) is well-formed UTF-8.
 *
 * Rejects bad continuation bytes, overlong forms, surrogates, code points
 * above U+10FFFF, and a truncated trailing sequence. Bytes below 0x80 -
 * including NUL - are accepted. Never allocates and never writes.
 *
 * src:  start of the bytes to check; may be NULL only when len == 0.
 * len:  number of bytes to check.
 */
bool
mkr_utf8_valid(const unsigned char *src, size_t len)
{
    /* Empty input is trivially well-formed. Returning here also keeps a NULL
     * src away from the `src + len` / `p < end` arithmetic below, which C
     * leaves undefined for a null pointer even when len is 0. */
    if (len == 0) {
        return true;
    }

    const unsigned char *p = src;
    const unsigned char *const end = p + len;

    while (p < end) {
        unsigned char b = *p;

        if (b < 0x80) {
            /* ASCII fast path: skip a run of ASCII bytes a word at a time
             * (any high bit set ends the run), then byte-wise for the tail.
             * memcpy is used for the word load so no aligned or aliased
             * access is assumed. */
            while ((size_t)(end - p) >= sizeof(size_t)) {
                size_t w;
                memcpy(&w, p, sizeof(w));
                if (w & (size_t)0x8080808080808080ULL) {
                    break;
                }
                p += sizeof(size_t);
            }
            while (p < end && *p < 0x80) {
                p++;
            }
            continue;
        }

        /* Multi-byte: decide length and validate the (length-dependent) ranges
         * that exclude overlong forms, surrogates and > U+10FFFF. Each branch
         * checks the remaining length first, so p[1..n-1] is only read when
         * it is in bounds. */
        size_t n;
        if (b >= 0xC2 && b <= 0xDF) {                 /* U+0080..U+07FF */
            n = 2;
            if (end - p < 2 || (p[1] & 0xC0) != 0x80) return false;
        } else if (b == 0xE0) {                       /* U+0800..U+0FFF */
            n = 3;
            if (end - p < 3 || p[1] < 0xA0 || p[1] > 0xBF
                || (p[2] & 0xC0) != 0x80) return false;
        } else if (b >= 0xE1 && b <= 0xEC) {          /* U+1000..U+CFFF */
            n = 3;
            if (end - p < 3 || (p[1] & 0xC0) != 0x80
                || (p[2] & 0xC0) != 0x80) return false;
        } else if (b == 0xED) {                       /* U+D000..U+D7FF */
            n = 3;                                    /* (excludes surrogates) */
            if (end - p < 3 || p[1] < 0x80 || p[1] > 0x9F
                || (p[2] & 0xC0) != 0x80) return false;
        } else if (b == 0xEE || b == 0xEF) {          /* U+E000..U+FFFF */
            n = 3;
            if (end - p < 3 || (p[1] & 0xC0) != 0x80
                || (p[2] & 0xC0) != 0x80) return false;
        } else if (b == 0xF0) {                       /* U+10000..U+3FFFF */
            n = 4;
            if (end - p < 4 || p[1] < 0x90 || p[1] > 0xBF
                || (p[2] & 0xC0) != 0x80 || (p[3] & 0xC0) != 0x80) return false;
        } else if (b >= 0xF1 && b <= 0xF3) {          /* U+40000..U+FFFFF */
            n = 4;
            if (end - p < 4 || (p[1] & 0xC0) != 0x80 || (p[2] & 0xC0) != 0x80
                || (p[3] & 0xC0) != 0x80) return false;
        } else if (b == 0xF4) {                       /* U+100000..U+10FFFF */
            n = 4;
            if (end - p < 4 || p[1] < 0x80 || p[1] > 0x8F
                || (p[2] & 0xC0) != 0x80 || (p[3] & 0xC0) != 0x80) return false;
        } else {                                      /* C0,C1,F5..FF,stray 80..BF */
            return false;
        }
        p += n;
    }
    return true;
}
75
+
76
/*
 * mkr_utf8_decode1 - strictly decode one code point from [p, p+len).
 *
 * Returns the encoded length (1-4) and stores the code point in *cp, or
 * returns 0 without writing *cp when the input is empty, truncated, starts
 * with a non-lead byte, has a bad continuation byte, is an overlong form, is
 * a surrogate, or exceeds U+10FFFF. Reads at most min(len, 4) bytes.
 */
int
mkr_utf8_decode1(const unsigned char *p, size_t len, uint32_t *cp)
{
    if (len == 0) {
        return 0;
    }

    const unsigned char lead = p[0];
    if (lead < 0x80u) {
        *cp = lead;
        return 1;
    }

    /* Classify the lead byte: sequence width, its payload bits, and the
     * smallest code point that legitimately needs that width. */
    size_t   width;
    uint32_t value;
    uint32_t lowest;
    if ((lead & 0xE0u) == 0xC0u) {
        width = 2; value = lead & 0x1Fu; lowest = 0x80u;
    } else if ((lead & 0xF0u) == 0xE0u) {
        width = 3; value = lead & 0x0Fu; lowest = 0x800u;
    } else if ((lead & 0xF8u) == 0xF0u) {
        width = 4; value = lead & 0x07u; lowest = 0x10000u;
    } else {
        return 0;                       /* continuation / 0xF8+ lead */
    }

    if (width > len) {
        return 0;                       /* truncated */
    }

    for (size_t k = 1; k < width; ++k) {
        const unsigned char cont = p[k];
        if ((cont & 0xC0u) != 0x80u) {
            return 0;                   /* bad continuation byte */
        }
        value = (value << 6) | (cont & 0x3Fu);
    }

    if (value < lowest                              /* overlong */
        || (value >= 0xD800u && value <= 0xDFFFu)   /* surrogate */
        || value > 0x10FFFFu) {                     /* out of Unicode range */
        return 0;
    }

    *cp = value;
    return (int)width;
}
@@ -0,0 +1,88 @@
1
+ #ifndef MAKIRI_CORE_MKR_UTF8_H
2
+ #define MAKIRI_CORE_MKR_UTF8_H
3
+
4
+ /*
5
+ * mkr_utf8_valid - the ONE pure-C UTF-8 validator (Ruby-free, allocation-free).
6
+ *
7
+ * Validates [src, src+len) against the Unicode "well-formed UTF-8 byte
8
+ * sequences" table (RFC 3629 / WHATWG): rejects bad continuation bytes,
9
+ * overlong forms, surrogates (U+D800..U+DFFF), code points above U+10FFFF, and
10
+ * an incomplete trailing sequence. Validate-only - it never materialises code
11
+ * points - and rips through ASCII a machine word at a time. NUL bytes are VALID
12
+ * here (U+0000 is well-formed UTF-8); callers that must reject NUL check it
13
+ * separately (memchr).
14
+ *
15
+ * This lives in core so the Ruby bridge (mkr_verify_text - the strict
16
+ * programmatic-input gate) and the HTML input sanitiser (lexbor_compat/
17
+ * utf8_input.c fast path) share a single implementation, and so the bridge's
18
+ * validation never allocates: a borrowed RSTRING pointer must not be held
19
+ * across a Ruby allocation (= GC point), so the validator the bridge runs
20
+ * between taking a borrow and using it has to be allocation-free by
21
+ * construction. (The former implementation built a throwaway Ruby String and
22
+ * asked for its coderange - an allocation inside every borrow.)
23
+ */
24
+
25
+ #include <stdbool.h>
26
+ #include <stddef.h>
27
+ #include <stdint.h>
28
+
29
+ #include "mkr_span.h"
30
+
31
+ #ifdef __cplusplus
32
+ extern "C" {
33
+ #endif
34
+
35
+ bool mkr_utf8_valid(const unsigned char *src, size_t len);
36
+
37
+ /* mkr_utf8_decode1 - decode ONE code point from [p, p+len), strictly: rejects
38
+ * truncation, bad continuation bytes, overlong forms, surrogates and values
39
+ * above U+10FFFF. Returns the byte length (1-4) with *cp set, or 0 on any
40
+ * violation (including len == 0) - fail closed, never read past the bound.
41
+ * The ONE strict decoder, shared by the XML tokenizer's name/Char scanning and
42
+ * the XPath lexer (each formerly carried its own equivalent copy). */
43
+ int mkr_utf8_decode1(const unsigned char *p, size_t len, uint32_t *cp);
44
+
45
+ /* Span form: decode the code point at the span's cursor (without consuming -
46
+ * the caller mkr_span_skip()s the returned length). 0 at end-of-span. */
47
+ static inline int
48
+ mkr_utf8_decode1_span(const mkr_span_t *s, uint32_t *cp)
49
+ {
50
+ return mkr_utf8_decode1((const unsigned char *)s->p, mkr_span_left(s), cp);
51
+ }
52
+
53
/* mkr_utf8_count_chars - number of code points in [ptr, ptr+len), counted as
 * the bytes that are NOT 0x80..0xBF continuation bytes (each such byte starts
 * a code point). Bounded by len, with no reliance on a NUL terminator; ptr may
 * be NULL when len == 0 because no byte is read in that case. */
static inline size_t
mkr_utf8_count_chars(const char *ptr, size_t len)
{
    size_t continuation = 0;
    for (size_t i = 0; i != len; i++) {
        const unsigned char byte = (unsigned char)ptr[i];
        if ((byte & 0xC0) == 0x80) {
            continuation++;
        }
    }
    /* Every byte is either a continuation byte or the start of a code point. */
    return len - continuation;
}
66
+
67
/* mkr_utf8_advance_chars - byte offset reached after stepping over up to
 * nchars characters from the start of [ptr, ptr+len). One character is a
 * leading byte plus any 0x80..0xBF continuation bytes directly after it. The
 * walk is bounded by len (no NUL reliance) and stops there even in the middle
 * of a sequence, so the result is len when nchars exceeds the characters
 * available. */
static inline size_t
mkr_utf8_advance_chars(const char *ptr, size_t len, size_t nchars)
{
    size_t off = 0;
    for (size_t stepped = 0; stepped < nchars && off < len; ++stepped) {
        off++; /* the leading byte */
        while (off < len && ((unsigned char)ptr[off] & 0xC0) == 0x80) {
            off++; /* its continuation bytes */
        }
    }
    return off;
}
83
+
84
+ #ifdef __cplusplus
85
+ }
86
+ #endif
87
+
88
+ #endif /* MAKIRI_CORE_MKR_UTF8_H */