makiri 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/conformance.yml +22 -0
  3. data/.github/workflows/libfuzzer.yml +83 -0
  4. data/.github/workflows/release.yml +12 -7
  5. data/.github/workflows/security.yml +88 -3
  6. data/.github/workflows/valgrind.yml +135 -0
  7. data/CHANGELOG.md +152 -15
  8. data/README.md +183 -13
  9. data/Rakefile +294 -7
  10. data/ext/makiri/bridge/bridge.h +28 -0
  11. data/ext/makiri/bridge/ruby_string.c +282 -12
  12. data/ext/makiri/core/mkr_alloc.c +40 -3
  13. data/ext/makiri/core/mkr_alloc.h +28 -5
  14. data/ext/makiri/core/mkr_buf.c +47 -3
  15. data/ext/makiri/core/mkr_buf.h +112 -3
  16. data/ext/makiri/core/mkr_core.c +143 -0
  17. data/ext/makiri/core/mkr_core.h +11 -2
  18. data/ext/makiri/core/mkr_hash.h +1 -1
  19. data/ext/makiri/core/mkr_span.h +186 -0
  20. data/ext/makiri/core/mkr_text.h +8 -8
  21. data/ext/makiri/core/mkr_utf8.c +101 -0
  22. data/ext/makiri/core/mkr_utf8.h +88 -0
  23. data/ext/makiri/extconf.rb +123 -10
  24. data/ext/makiri/fuzz/Makefile +95 -0
  25. data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
  26. data/ext/makiri/fuzz/xml_fuzz.c +24 -0
  27. data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
  28. data/ext/makiri/glue/glue.h +55 -11
  29. data/ext/makiri/glue/ruby_doc.c +129 -59
  30. data/ext/makiri/glue/ruby_html_css.c +292 -0
  31. data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +248 -52
  32. data/ext/makiri/glue/ruby_html_node.c +859 -0
  33. data/ext/makiri/glue/ruby_html_serialize.c +154 -0
  34. data/ext/makiri/glue/ruby_node.c +74 -729
  35. data/ext/makiri/glue/ruby_node_set.c +167 -32
  36. data/ext/makiri/glue/ruby_xml.c +602 -0
  37. data/ext/makiri/glue/ruby_xml_node.c +1373 -0
  38. data/ext/makiri/glue/ruby_xpath.c +63 -30
  39. data/ext/makiri/glue/ruby_xpath.h +19 -0
  40. data/ext/makiri/lexbor_compat/compat.h +42 -9
  41. data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
  42. data/ext/makiri/lexbor_compat/dom_index.c +2 -2
  43. data/ext/makiri/lexbor_compat/post_parse.c +100 -10
  44. data/ext/makiri/lexbor_compat/source_loc.c +15 -13
  45. data/ext/makiri/lexbor_compat/text_index.c +14 -8
  46. data/ext/makiri/lexbor_compat/utf8_input.c +19 -33
  47. data/ext/makiri/makiri.c +184 -6
  48. data/ext/makiri/makiri.h +43 -2
  49. data/ext/makiri/xml/mkr_xml.h +125 -0
  50. data/ext/makiri/xml/mkr_xml_chars.c +195 -0
  51. data/ext/makiri/xml/mkr_xml_index.c +169 -0
  52. data/ext/makiri/xml/mkr_xml_index.h +48 -0
  53. data/ext/makiri/xml/mkr_xml_mutate.c +817 -0
  54. data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
  55. data/ext/makiri/xml/mkr_xml_node.c +399 -0
  56. data/ext/makiri/xml/mkr_xml_node.h +184 -0
  57. data/ext/makiri/xml/mkr_xml_tree.c +1515 -0
  58. data/ext/makiri/xpath/mkr_css.c +1023 -0
  59. data/ext/makiri/xpath/mkr_css.h +65 -0
  60. data/ext/makiri/xpath/mkr_xpath.c +96 -32
  61. data/ext/makiri/xpath/mkr_xpath.h +109 -4
  62. data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
  63. data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
  64. data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +551 -241
  65. data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +318 -276
  66. data/ext/makiri/xpath/mkr_xpath_internal.h +177 -206
  67. data/ext/makiri/xpath/mkr_xpath_lex.c +95 -125
  68. data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
  69. data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +145 -0
  70. data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
  71. data/ext/makiri/xpath/mkr_xpath_parse.c +83 -94
  72. data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
  73. data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
  74. data/ext/makiri/xpath/mkr_xpath_shared.c +609 -0
  75. data/ext/makiri/xpath/mkr_xpath_value_body.h +801 -0
  76. data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
  77. data/lib/makiri/{attribute.rb → attr.rb} +7 -3
  78. data/lib/makiri/cdata_section.rb +19 -0
  79. data/lib/makiri/comment.rb +10 -0
  80. data/lib/makiri/compat_aliases.rb +30 -0
  81. data/lib/makiri/document.rb +9 -73
  82. data/lib/makiri/document_fragment.rb +14 -9
  83. data/lib/makiri/element.rb +4 -4
  84. data/lib/makiri/html/document.rb +106 -0
  85. data/lib/makiri/html/node_methods.rb +19 -0
  86. data/lib/makiri/html.rb +12 -0
  87. data/lib/makiri/node.rb +58 -15
  88. data/lib/makiri/node_set.rb +8 -0
  89. data/lib/makiri/processing_instruction.rb +10 -0
  90. data/lib/makiri/text.rb +1 -1
  91. data/lib/makiri/version.rb +1 -1
  92. data/lib/makiri/xml/builder.rb +263 -0
  93. data/lib/makiri/xml/document.rb +24 -0
  94. data/lib/makiri/xml/node_methods.rb +84 -0
  95. data/lib/makiri/xml.rb +10 -0
  96. data/lib/makiri/xpath_context.rb +1 -1
  97. data/lib/makiri.rb +24 -5
  98. data/script/build_native_gem.rb +2 -2
  99. data/script/check_alloc_failures.rb +266 -0
  100. data/script/check_c_safety.rb +77 -2
  101. data/script/check_c_safety_allowlist.yml +102 -0
  102. data/script/check_leaks.rb +64 -0
  103. data/script/leaks_harness.rb +64 -0
  104. data/vendor/lexbor/CMakeLists.txt +6 -0
  105. data/vendor/lexbor/README.md +12 -0
  106. data/vendor/lexbor/config.cmake +1 -1
  107. data/vendor/lexbor/source/lexbor/core/base.h +1 -1
  108. data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
  109. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
  110. data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
  111. data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
  112. data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
  113. data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
  114. data/vendor/lexbor/source/lexbor/html/base.h +1 -1
  115. data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
  116. data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
  117. data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
  118. data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
  119. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
  120. data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
  121. data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
  122. data/vendor/lexbor/source/lexbor/url/base.h +1 -1
  123. data/vendor/lexbor/source/lexbor/url/url.c +5 -2
  124. data/vendor/lexbor/source/lexbor/url/url.h +9 -0
  125. data/vendor/lexbor/version +1 -1
  126. metadata +53 -9
  127. data/ext/makiri/glue/ruby_css.c +0 -185
  128. data/ext/makiri/glue/ruby_serialize.c +0 -92
  129. data/ext/makiri/xpath/mkr_xpath_value.c +0 -1286
  130. data/lib/makiri/cdata.rb +0 -6
@@ -0,0 +1,101 @@
1
+ /* mkr_utf8.c - the shared pure-C UTF-8 validator. Ruby-free, allocation-free.
2
+ * See mkr_utf8.h for the contract and why it lives in core. Moved verbatim from
3
+ * lexbor_compat/utf8_input.c (whose sanitiser fast path now calls this). */
4
+ #include "mkr_utf8.h"
5
+
6
+ #include <string.h> /* memcpy for the word-at-a-time ASCII scan */
7
+
8
/*
 * mkr_utf8_valid - report whether [src, src+len) is well-formed UTF-8.
 *
 * src: first byte of the range; may be NULL only when len == 0.
 * len: number of bytes to validate.
 *
 * Returns true when every byte belongs to a well-formed sequence, and false
 * at the first bad lead byte, bad continuation byte, overlong form, surrogate
 * (U+D800..U+DFFF), code point above U+10FFFF, or truncated trailing
 * sequence. NUL bytes are accepted (U+0000 is well-formed). Never allocates
 * and never reads outside [src, src+len).
 */
bool
mkr_utf8_valid(const unsigned char *src, size_t len)
{
    /* An empty range is trivially valid. Returning here also keeps a
     * (NULL, 0) call from forming `NULL + 0` below, which is undefined
     * behaviour in C and is flagged by UBSan's pointer-overflow check. */
    if (len == 0) {
        return true;
    }

    const unsigned char *p = src;
    const unsigned char *const end = p + len;

    while (p < end) {
        unsigned char b = *p;

        if (b < 0x80) {
            /* ASCII fast path: skip a run of ASCII bytes a word at a time
             * (any high bit set ends the run), then byte-wise for the tail.
             * memcpy keeps the word load alignment- and aliasing-safe. */
            while ((size_t)(end - p) >= sizeof(size_t)) {
                size_t w;
                memcpy(&w, p, sizeof(w));
                if (w & (size_t)0x8080808080808080ULL) {
                    break;
                }
                p += sizeof(size_t);
            }
            while (p < end && *p < 0x80) {
                p++;
            }
            continue;
        }

        /* Multi-byte: decide length and validate the (length-dependent)
         * ranges that exclude overlong forms, surrogates and > U+10FFFF.
         * Every branch checks the remaining length before touching p[1..]. */
        size_t n;
        if (b >= 0xC2 && b <= 0xDF) {               /* U+0080..U+07FF */
            n = 2;
            if (end - p < 2 || (p[1] & 0xC0) != 0x80) {
                return false;
            }
        } else if (b == 0xE0) {                     /* U+0800..U+0FFF */
            n = 3;
            if (end - p < 3 || p[1] < 0xA0 || p[1] > 0xBF
                || (p[2] & 0xC0) != 0x80) {
                return false;
            }
        } else if (b >= 0xE1 && b <= 0xEC) {        /* U+1000..U+CFFF */
            n = 3;
            if (end - p < 3 || (p[1] & 0xC0) != 0x80
                || (p[2] & 0xC0) != 0x80) {
                return false;
            }
        } else if (b == 0xED) {                     /* U+D000..U+D7FF */
            n = 3;                                  /* (excludes surrogates) */
            if (end - p < 3 || p[1] < 0x80 || p[1] > 0x9F
                || (p[2] & 0xC0) != 0x80) {
                return false;
            }
        } else if (b == 0xEE || b == 0xEF) {        /* U+E000..U+FFFF */
            n = 3;
            if (end - p < 3 || (p[1] & 0xC0) != 0x80
                || (p[2] & 0xC0) != 0x80) {
                return false;
            }
        } else if (b == 0xF0) {                     /* U+10000..U+3FFFF */
            n = 4;
            if (end - p < 4 || p[1] < 0x90 || p[1] > 0xBF
                || (p[2] & 0xC0) != 0x80 || (p[3] & 0xC0) != 0x80) {
                return false;
            }
        } else if (b >= 0xF1 && b <= 0xF3) {        /* U+40000..U+FFFFF */
            n = 4;
            if (end - p < 4 || (p[1] & 0xC0) != 0x80 || (p[2] & 0xC0) != 0x80
                || (p[3] & 0xC0) != 0x80) {
                return false;
            }
        } else if (b == 0xF4) {                     /* U+100000..U+10FFFF */
            n = 4;
            if (end - p < 4 || p[1] < 0x80 || p[1] > 0x8F
                || (p[2] & 0xC0) != 0x80 || (p[3] & 0xC0) != 0x80) {
                return false;
            }
        } else {                                    /* C0,C1,F5..FF,stray 80..BF */
            return false;
        }
        p += n;
    }
    return true;
}
75
+
76
/*
 * mkr_utf8_decode1 - strictly decode the single code point at the start of
 * [p, p+len).
 *
 * On success stores the code point in *cp and returns the number of bytes it
 * occupies (1-4). Returns 0, leaving *cp untouched, when len is 0, the lead
 * byte is not a valid sequence start, the sequence is truncated, a
 * continuation byte is malformed, or the decoded value is overlong, a
 * surrogate, or above U+10FFFF. Reads at most the sequence length, and never
 * past p+len.
 */
int
mkr_utf8_decode1(const unsigned char *p, size_t len, uint32_t *cp)
{
    if (len == 0) {
        return 0;
    }

    const unsigned char lead = p[0];
    if (lead < 0x80u) {
        *cp = lead;
        return 1;
    }

    /* Classify the lead byte: total sequence length, the payload bits the
     * lead contributes, and the smallest code point that legitimately needs
     * this length (anything lower is an overlong encoding). */
    size_t seq_len;
    uint32_t value;
    uint32_t lowest;
    if ((lead & 0xF8u) == 0xF0u) {
        seq_len = 4;
        value = lead & 0x07u;
        lowest = 0x10000u;
    } else if ((lead & 0xF0u) == 0xE0u) {
        seq_len = 3;
        value = lead & 0x0Fu;
        lowest = 0x800u;
    } else if ((lead & 0xE0u) == 0xC0u) {
        seq_len = 2;
        value = lead & 0x1Fu;
        lowest = 0x80u;
    } else {
        return 0;                       /* continuation / 0xF8+ lead */
    }

    if (seq_len > len) {
        return 0;                       /* truncated */
    }

    for (size_t i = 1; i < seq_len; ++i) {
        const unsigned char cont = p[i];
        if ((cont & 0xC0u) != 0x80u) {
            return 0;                   /* bad continuation byte */
        }
        value = (value << 6) | (cont & 0x3Fu);
    }

    if (value < lowest) {
        return 0;                       /* overlong */
    }
    if (value >= 0xD800u && value <= 0xDFFFu) {
        return 0;                       /* surrogate */
    }
    if (value > 0x10FFFFu) {
        return 0;                       /* out of Unicode range */
    }

    *cp = value;
    return (int)seq_len;
}
@@ -0,0 +1,88 @@
1
+ #ifndef MAKIRI_CORE_MKR_UTF8_H
2
+ #define MAKIRI_CORE_MKR_UTF8_H
3
+
4
+ /*
5
+ * mkr_utf8_valid - the ONE pure-C UTF-8 validator (Ruby-free, allocation-free).
6
+ *
7
+ * Validates [src, src+len) against the Unicode "well-formed UTF-8 byte
8
+ * sequences" table (RFC 3629 / WHATWG): rejects bad continuation bytes,
9
+ * overlong forms, surrogates (U+D800..U+DFFF), code points above U+10FFFF, and
10
+ * an incomplete trailing sequence. Validate-only - it never materialises code
11
+ * points - and rips through ASCII a machine word at a time. NUL bytes are VALID
12
+ * here (U+0000 is well-formed UTF-8); callers that must reject NUL check it
13
+ * separately (memchr).
14
+ *
15
+ * This lives in core so the Ruby bridge (mkr_verify_text - the strict
16
+ * programmatic-input gate) and the HTML input sanitiser (lexbor_compat/
17
+ * utf8_input.c fast path) share a single implementation, and so the bridge's
18
+ * validation never allocates: a borrowed RSTRING pointer must not be held
19
+ * across a Ruby allocation (= GC point), so the validator the bridge runs
20
+ * between taking a borrow and using it has to be allocation-free by
21
+ * construction. (The former implementation built a throwaway Ruby String and
22
+ * asked for its coderange - an allocation inside every borrow.)
23
+ */
24
+
25
+ #include <stdbool.h>
26
+ #include <stddef.h>
27
+ #include <stdint.h>
28
+
29
+ #include "mkr_span.h"
30
+
31
+ #ifdef __cplusplus
32
+ extern "C" {
33
+ #endif
34
+
35
+ bool mkr_utf8_valid(const unsigned char *src, size_t len);
36
+
37
+ /* mkr_utf8_decode1 - decode ONE code point from [p, p+len), strictly: rejects
38
+ * truncation, bad continuation bytes, overlong forms, surrogates and values
39
+ * above U+10FFFF. Returns the byte length (1-4) with *cp set, or 0 on any
40
+ * violation (including len == 0) - fail closed, never read past the bound.
41
+ * The ONE strict decoder, shared by the XML tokenizer's name/Char scanning and
42
+ * the XPath lexer (each formerly carried its own equivalent copy). */
43
+ int mkr_utf8_decode1(const unsigned char *p, size_t len, uint32_t *cp);
44
+
45
+ /* Span form: decode the code point at the span's cursor (without consuming -
46
+ * the caller mkr_span_skip()s the returned length). 0 at end-of-span. */
47
+ static inline int
48
+ mkr_utf8_decode1_span(const mkr_span_t *s, uint32_t *cp)
49
+ {
50
+ return mkr_utf8_decode1((const unsigned char *)s->p, mkr_span_left(s), cp);
51
+ }
52
+
53
/* mkr_utf8_count_chars - number of Unicode code points in [ptr, ptr+len).
 * Every byte outside the 0x80..0xBF continuation range begins a code point,
 * so the count is len minus the number of continuation bytes. The scan is
 * bounded by len (no NUL terminator is needed) and ptr is never dereferenced
 * when len == 0, so it may be NULL in that case. Used where XPath measures
 * string length / offsets in characters. */
static inline size_t
mkr_utf8_count_chars(const char *ptr, size_t len)
{
    size_t continuation_bytes = 0;

    for (size_t left = len; left > 0; --left) {
        const unsigned char byte = (unsigned char)ptr[left - 1];
        if ((byte & 0xC0) == 0x80) {
            continuation_bytes++;
        }
    }
    return len - continuation_bytes;
}
66
+
67
/* mkr_utf8_advance_chars - byte offset reached after stepping over up to
 * nchars UTF-8 characters from the start of [ptr, ptr+len). One character is
 * one byte (whatever its value) plus every 0x80..0xBF continuation byte that
 * immediately follows it. The walk is bounded by len (no NUL reliance) and
 * stops there even in the middle of a sequence, so the result is len when
 * nchars exceeds the number of characters available, and 0 when nchars or
 * len is 0. */
static inline size_t
mkr_utf8_advance_chars(const char *ptr, size_t len, size_t nchars)
{
    size_t offset = 0;

    for (size_t stepped = 0; stepped < nchars && offset < len; ++stepped) {
        /* Consume the leading byte, then any continuation bytes behind it. */
        do {
            ++offset;
        } while (offset < len && ((unsigned char)ptr[offset] & 0xC0) == 0x80);
    }
    return offset;
}
83
+
84
+ #ifdef __cplusplus
85
+ }
86
+ #endif
87
+
88
+ #endif /* MAKIRI_CORE_MKR_UTF8_H */
@@ -12,7 +12,7 @@ require "etc"
12
12
  # 1. Build vendored Lexbor (unpatched) via cmake into vendor/lexbor/build,
13
13
  # install headers + a static archive into vendor/lexbor/dist.
14
14
  # 2. Compile ext/makiri/**/*.c with rake-compiler, linking against the
15
- # static Lexbor archive only no system libxml2/libxslt.
15
+ # static Lexbor archive only - no system libxml2/libxslt.
16
16
  #
17
17
  # Security note: the C extension is built with -D_FORTIFY_SOURCE=2,
18
18
  # -fstack-protector-strong, and -Wformat -Wformat-security. -O2 is kept
@@ -28,7 +28,29 @@ abort "Lexbor source not found at #{LEXBOR_SRC}. Did you `git submodule update -
28
28
 
29
29
  cmake = find_executable("cmake") or abort "cmake is required to build Lexbor."
30
30
 
31
- unless File.exist?(File.join(LEXBOR_DST, "lib", "liblexbor_static.a"))
31
+ # Optionally build the vendored Lexbor itself under AddressSanitizer. This is the
32
+ # ONLY way to catch overflows *inside* Lexbor's bump (mraw) arena: a sub-allocation
33
+ # overrunning into the next one stays within one big malloc'd chunk, so the heap
34
+ # allocator's red-zones (and thus a plain ASan build of just our ext) never see it.
35
+ # Lexbor's own mraw is ASan-aware - with -DLEXBOR_BUILD_WITH_ASAN=ON its CMake
36
+ # defines LEXBOR_HAVE_ADDRESS_SANITIZER, and mraw then unpoisons exactly each
37
+ # allocation and re-poisons the gap, so an intra-arena overrun writes into
38
+ # poisoned memory and ASan reports it. Opt-in (slow full rebuild), only meaningful
39
+ # with MAKIRI_SANITIZE=...address...; drive it via `rake sanitize:lexbor`.
40
+ # vendor/lexbor stays vanilla - this is a build flag, not a source patch.
41
+ sanitize = ENV["MAKIRI_SANITIZE"].to_s.strip
42
+ lexbor_asan = !ENV["MAKIRI_SANITIZE_LEXBOR"].to_s.strip.empty? && sanitize.include?("address")
43
+ lexbor_mode = lexbor_asan ? "asan" : "plain"
44
+ lexbor_stamp = File.join(LEXBOR_DST, ".makiri_build_mode")
45
+
46
+ # Reuse the cached archive only when it was built in the mode we now want; a mode
47
+ # switch (plain <-> asan) forces a rebuild, so a sanitized Lexbor can never leak
48
+ # into a normal build or vice versa.
49
+ have_archive = File.exist?(File.join(LEXBOR_DST, "lib", "liblexbor_static.a"))
50
+ stamp_ok = have_archive && File.exist?(lexbor_stamp) && File.read(lexbor_stamp).strip == lexbor_mode
51
+ unless stamp_ok
52
+ FileUtils.rm_rf(LEXBOR_BLD)
53
+ FileUtils.rm_rf(LEXBOR_DST) if have_archive # drop a wrong-mode install
32
54
  FileUtils.mkdir_p(LEXBOR_BLD)
33
55
  Dir.chdir(LEXBOR_BLD) do
34
56
  cmd = [
@@ -41,12 +63,15 @@ unless File.exist?(File.join(LEXBOR_DST, "lib", "liblexbor_static.a"))
41
63
  "-DCMAKE_BUILD_TYPE=Release",
42
64
  "-DCMAKE_POSITION_INDEPENDENT_CODE=ON",
43
65
  "-DCMAKE_INSTALL_PREFIX=#{LEXBOR_DST}",
66
+ *(lexbor_asan ? ["-DLEXBOR_BUILD_WITH_ASAN=ON"] : []),
44
67
  LEXBOR_SRC,
45
68
  ].shelljoin
69
+ warn "makiri: building vendored Lexbor (mode=#{lexbor_mode})"
46
70
  system(cmd) or abort "cmake configure failed for Lexbor."
47
71
  system("#{cmake.shellescape} --build . --target install -- -j#{Etc.respond_to?(:nprocessors) ? Etc.nprocessors : 4}") or
48
72
  abort "cmake build/install failed for Lexbor."
49
73
  end
74
+ File.write(lexbor_stamp, lexbor_mode)
50
75
  end
51
76
 
52
77
  $INCFLAGS << " -I#{File.join(LEXBOR_DST, 'include').shellescape}"
@@ -60,11 +85,35 @@ $LDFLAGS << " #{lexbor_archive.shellescape}"
60
85
  # Sanitizer build (opt-in): MAKIRI_SANITIZE=address,undefined rake clean compile
61
86
  # Then run the suite under the runtime via `rake sanitize` (which preloads the
62
87
  # ASan runtime). Sanitizers replace the heap allocator, so even the vendored
63
- # (uninstrumented) Lexbor's allocations get red-zoned heap overflows on
64
- # Lexbor-owned buffers are still caught. _FORTIFY_SOURCE is dropped here because
65
- # it conflicts with the sanitizer interceptors.
66
- sanitize = ENV["MAKIRI_SANITIZE"].to_s.strip
67
- if sanitize.empty?
88
+ # (uninstrumented) Lexbor's allocations get red-zoned - a heap overflow off the
89
+ # END of a Lexbor malloc is caught. Overflows *inside* Lexbor's mraw arena are
90
+ # NOT caught this way (they stay within one malloc'd chunk); for those, also build
91
+ # Lexbor under ASan via MAKIRI_SANITIZE_LEXBOR=1 (see the Lexbor build above and
92
+ # `rake sanitize:lexbor`). _FORTIFY_SOURCE is dropped here because it conflicts
93
+ # with the sanitizer interceptors.
94
+ # Coverage build (opt-in): MAKIRI_COVERAGE=1 instruments OUR sources with clang
95
+ # source-based coverage (the vendored Lexbor is built separately and is NOT
96
+ # instrumented - we measure only the code we write). Run via `rake coverage`,
97
+ # which sets LLVM_PROFILE_FILE and renders an llvm-cov report. -O0 keeps the
98
+ # region map close to the source; _FORTIFY_SOURCE is dropped (it needs -O2).
99
+ coverage = !ENV["MAKIRI_COVERAGE"].to_s.strip.empty?
100
+
101
+ # OOM-injection build (opt-in): MAKIRI_ALLOC_INJECT=1 compiles the core
102
+ # allocation-failure hook (mkr_alloc.h) so `rake oom` can sweep "the nth core
103
+ # allocation fails" over representative workloads and assert every OOM branch
104
+ # fails closed. Debug/test builds only - a normal build carries no hook.
105
+ # Composes with the sanitize/coverage modes below.
106
+ if ENV["MAKIRI_ALLOC_INJECT"].to_s.strip == "1"
107
+ $CFLAGS << " -DMKR_ALLOC_INJECT=1"
108
+ warn "makiri: building with allocation-failure injection (MKR_ALLOC_INJECT)"
109
+ end
110
+
111
+ if coverage
112
+ $CFLAGS << " -O0 -g -fprofile-instr-generate -fcoverage-mapping"
113
+ $LDFLAGS << " -fprofile-instr-generate"
114
+ $DLDFLAGS << " -fprofile-instr-generate"
115
+ warn "makiri: building with clang source-based coverage"
116
+ elsif sanitize.empty?
68
117
  # Security hardening flags. Keep -O2 active so _FORTIFY_SOURCE works.
69
118
  $CFLAGS << " -O2"
70
119
  $CFLAGS << " -D_FORTIFY_SOURCE=2"
@@ -72,6 +121,20 @@ else
72
121
  $CFLAGS << " -O1 -g -fno-omit-frame-pointer -fsanitize=#{sanitize}"
73
122
  $LDFLAGS << " -fsanitize=#{sanitize}"
74
123
  $DLDFLAGS << " -fsanitize=#{sanitize}"
124
+ if sanitize.include?("address")
125
+ # No ASan *stack* red zones in the ext. CRuby is built with
126
+ # RUBY_SETJMP = __builtin_setjmp, so rb_raise unwinds via __builtin_longjmp,
127
+ # which the ASan runtime does not intercept (no __asan_handle_no_return):
128
+ # any raise crossing an instrumented frame - ours, or Ruby code raising
129
+ # through rb_protect under the evaluator - leaves that frame's stack poison
130
+ # behind, and an interceptor (memcpy & co.) in the uninstrumented interpreter
131
+ # later trips over the stale shadow: a spurious report, which ASan itself
132
+ # then aborts while rendering (asan_thread.cpp kCurrentStackFrameMagic
133
+ # CHECK). Heap red zones, UBSan, and the manual arena poisoning in
134
+ # mkr_xml_node.c are unaffected; only stack-buffer checks are lost.
135
+ $CFLAGS << (RbConfig::CONFIG["CC"] =~ /clang/ || RbConfig::CONFIG["target_os"] =~ /darwin/ ?
136
+ " -mllvm -asan-stack=0" : " --param asan-stack=0")
137
+ end
75
138
  warn "makiri: building with -fsanitize=#{sanitize}"
76
139
  end
77
140
 
@@ -115,9 +178,59 @@ elsif RbConfig::CONFIG["target_os"] =~ /linux/
115
178
  $LIBRUBYARG_STATIC = ""
116
179
  end
117
180
 
118
- # Recursively pick up C sources under ext/makiri/.
119
- $srcs = Dir.glob(File.join(EXT_DIR, "**", "*.c")).map { |f| f.sub("#{EXT_DIR}/", "") }
181
+ # Export ONLY Init_makiri from the compiled extension. `-fvisibility=hidden`
182
+ # above hides our own sources' symbols, but the vendored Lexbor static library
183
+ # is built (by Lexbor's own CMake) with default visibility, so without this the
184
+ # linker re-exports ~1700 `lxb_*` / `lexbor_*` symbols into the bundle's dynamic
185
+ # table. Another Lexbor-based extension loaded in the same process (e.g.
186
+ # nokolexbor) would then resolve its own `lxb_*` calls to OUR copy - a different
187
+ # Lexbor version with an incompatible ABI - and segfault. Restricting the export
188
+ # list to Init_makiri keeps Makiri's Lexbor entirely private (Ruby only needs
189
+ # Init_makiri, found via dlsym at require time).
190
+ if RbConfig::CONFIG["target_os"] =~ /darwin/
191
+ $DLDFLAGS << " -Wl,-exported_symbol,_Init_makiri"
192
+ elsif RbConfig::CONFIG["target_os"] =~ /linux/
193
+ # Hide every symbol pulled in from static archives (the Lexbor .a); our own
194
+ # are already hidden by -fvisibility=hidden, leaving just RUBY_FUNC_EXPORTED
195
+ # Init_makiri in the dynamic symbol table.
196
+ $DLDFLAGS << " -Wl,--exclude-libs,ALL"
197
+ end
198
+
199
+ # Recursively pick up C sources under ext/makiri/, excluding standalone
200
+ # libFuzzer harnesses. Those define LLVMFuzzerTestOneInput and are linked by
201
+ # ext/makiri/fuzz/Makefile, never into the Ruby extension.
202
+ $srcs = Dir.glob(File.join(EXT_DIR, "**", "*.c"))
203
+ .reject { |f| f.start_with?(File.join(EXT_DIR, "fuzz") + File::SEPARATOR) }
204
+ .map { |f| f.sub("#{EXT_DIR}/", "") }
120
205
  $VPATH ||= []
121
- $VPATH += Dir.glob(File.join(EXT_DIR, "**/")).map { |d| "$(srcdir)/#{d.sub("#{EXT_DIR}/", "")}".chomp("/") }
206
+ # fuzz/ must be excluded here too: after a `rake fuzz:libfuzzer_build`,
207
+ # fuzz/build/{core,xml,xpath}/ hold sanitizer-instrumented .o files, and a
208
+ # VPATH that includes them lets make resolve the extension's object
209
+ # prerequisites there instead of compiling them - breaking the link (or worse,
210
+ # silently mixing differently-flagged objects).
211
+ $VPATH += Dir.glob(File.join(EXT_DIR, "**/"))
212
+ .reject { |d| d.start_with?(File.join(EXT_DIR, "fuzz") + File::SEPARATOR) }
213
+ .map { |d| "$(srcdir)/#{d.sub("#{EXT_DIR}/", "")}".chomp("/") }
122
214
 
123
215
  create_makefile("makiri/makiri")
216
+
217
+ # mkmf's generated Makefile carries NO header dependencies, so editing a header
218
+ # (e.g. a struct layout in an internal .h) recompiles only the .c files whose
219
+ # own timestamps changed - the rest keep their stale layout and the objects
220
+ # silently disagree (ABI mismatch, runtime breakage). Append the coarsest sound
221
+ # rule instead: every object depends on every project header. A header edit
222
+ # then recompiles everything - a few seconds for this ext, and it can never
223
+ # rot (the list regenerates each configure; a header NEW since the last
224
+ # configure is reachable only from .c files edited to include it, which rebuild
225
+ # on their own timestamp). Ruby/Lexbor headers are deliberately excluded: a
226
+ # Ruby upgrade gets a fresh build dir from rake-compiler, and a Lexbor pin
227
+ # change already requires `rake clean:lexbor` (see CLAUDE.md).
228
+ project_headers = Dir.glob(File.join(EXT_DIR, "**", "*.h"))
229
+ .reject { |f| f.start_with?(File.join(EXT_DIR, "fuzz") + File::SEPARATOR) }
230
+ .map { |f| "$(srcdir)/#{f.sub("#{EXT_DIR}/", "")}" }
231
+ .sort
232
+ File.open("Makefile", "a") do |mk|
233
+ mk.puts
234
+ mk.puts "# Project-header dependencies appended by extconf.rb (mkmf emits none)."
235
+ mk.puts "$(OBJS): #{project_headers.join(" ")}"
236
+ end
@@ -0,0 +1,95 @@
1
# ext/makiri/fuzz/Makefile - libFuzzer harness build
#
# Usage:
#   make xml_fuzz     # build the XML parser harness
#   make xpath_fuzz   # build the XPath compile+eval harness
#   make clean        # remove build artifacts
#
# Overridable variables:
#   CLANG     C compiler used for every object (default: clang)
#   CXX       C++ driver used to link against the libFuzzer runtime
#             (default: clang++ unless CXX is set explicitly)
#   SANITIZE  sanitizer list passed to -fsanitize= (default: address,undefined)
#
# Requires: clang with libFuzzer support (usually part of the clang distribution)
# and the vendored Lexbor static library already built (via `rake compile`).

CLANG ?= clang
CXX := $(if $(filter default,$(origin CXX)),clang++,$(CXX))

# Paths relative to ext/makiri/fuzz/
EXT_DIR = ..
LEXBOR_SRC = ../../../vendor/lexbor
LEXBOR_DST = $(LEXBOR_SRC)/dist

# CFLAGS mirror the security flags from extconf.rb, minus Ruby-specific flags.
# Sanitizer instrumentation is added at compile time so every TU is covered.
#
# -fsanitize=fuzzer-no-link is what makes the run coverage-guided: it adds
# libFuzzer's coverage instrumentation to each object without pulling in the
# fuzzer's main(). Passing -fsanitize=fuzzer only at link time (LDFLAGS below)
# links the driver but leaves the code under test uninstrumented, so libFuzzer
# would mutate blindly.
SANITIZE ?= address,undefined
CFLAGS = -O2 -g -Wall -Wextra \
         -fstack-protector-strong \
         -Wformat -Wformat-security \
         -fvisibility=hidden \
         -fno-common \
         -fsanitize=fuzzer-no-link,$(SANITIZE) \
         -I$(LEXBOR_DST)/include \
         -I$(EXT_DIR) \
         -I$(EXT_DIR)/core \
         -I$(EXT_DIR)/xml \
         -I$(EXT_DIR)/xpath \
         -I$(EXT_DIR)/lexbor_compat

# Linker flags: libFuzzer driver + sanitizers.
LDFLAGS = -fsanitize=fuzzer,$(SANITIZE)

BUILD_DIR = build

# Ruby-free C sources that the harnesses link against.
CORE_SRCS = \
	$(EXT_DIR)/core/mkr_alloc.c \
	$(EXT_DIR)/core/mkr_utf8.c \
	$(EXT_DIR)/core/mkr_buf.c \
	$(EXT_DIR)/core/mkr_core.c

XML_SRCS = \
	$(EXT_DIR)/xml/mkr_xml_tree.c \
	$(EXT_DIR)/xml/mkr_xml_node.c \
	$(EXT_DIR)/xml/mkr_xml_chars.c \
	$(EXT_DIR)/xml/mkr_xml_index.c \
	$(EXT_DIR)/xml/mkr_xml_mutate.c

XPATH_SRCS = \
	$(EXT_DIR)/xpath/mkr_xpath.c \
	$(EXT_DIR)/xpath/mkr_xpath_lex.c \
	$(EXT_DIR)/xpath/mkr_xpath_parse.c \
	$(EXT_DIR)/xpath/mkr_xpath_shared.c \
	$(EXT_DIR)/xpath/mkr_xpath_number.c \
	$(EXT_DIR)/xpath/mkr_xpath_engine_xml.c \
	$(EXT_DIR)/xpath/mkr_xpath_engine_html.c

# Objects land under $(BUILD_DIR)/<subdir>/ mirroring the source layout.
CORE_OBJS  = $(patsubst $(EXT_DIR)/%.c,$(BUILD_DIR)/%.o,$(CORE_SRCS))
XML_OBJS   = $(patsubst $(EXT_DIR)/%.c,$(BUILD_DIR)/%.o,$(XML_SRCS))
XPATH_OBJS = $(patsubst $(EXT_DIR)/%.c,$(BUILD_DIR)/%.o,$(XPATH_SRCS))

.PHONY: all clean check-libfuzzer xml_fuzz xpath_fuzz

all: check-libfuzzer xml_fuzz xpath_fuzz

# Probe: link a trivial harness to confirm $(CXX) ships the libFuzzer runtime.
check-libfuzzer:
	@mkdir -p $(BUILD_DIR)
	@$(CXX) $(LDFLAGS) -o $(BUILD_DIR)/check_fuzzer check_fuzzer.cc >/dev/null 2>&1 || \
		(echo "libFuzzer runtime not available for $(CXX). Install LLVM clang and run with CLANG=/path/to/clang CXX=/path/to/clang++." >&2; exit 1)

xml_fuzz: $(BUILD_DIR)/xml_fuzz.o $(CORE_OBJS) $(XML_OBJS)
	$(CXX) $(LDFLAGS) -o $@ $^

# The XPath harness also links the HTML engine, hence the Lexbor archive.
xpath_fuzz: $(BUILD_DIR)/xpath_fuzz.o $(CORE_OBJS) $(XML_OBJS) $(XPATH_OBJS)
	$(CXX) $(LDFLAGS) -o $@ $^ $(LEXBOR_DST)/lib/liblexbor_static.a

$(BUILD_DIR)/%.o: $(EXT_DIR)/%.c
	@mkdir -p $(dir $@)
	$(CLANG) $(CFLAGS) -c -o $@ $<

$(BUILD_DIR)/xml_fuzz.o: xml_fuzz.c
	@mkdir -p $(dir $@)
	$(CLANG) $(CFLAGS) -c -o $@ $<

$(BUILD_DIR)/xpath_fuzz.o: xpath_fuzz.c
	@mkdir -p $(dir $@)
	$(CLANG) $(CFLAGS) -c -o $@ $<

clean:
	rm -rf $(BUILD_DIR) xml_fuzz xpath_fuzz
@@ -0,0 +1,4 @@
1
/* Minimal no-op fuzz target. The fuzz Makefile's check-libfuzzer rule links
 * this file with the fuzzer flags purely to find out whether the toolchain
 * provides a libFuzzer runtime; the input is ignored. */
extern "C" int LLVMFuzzerTestOneInput(const unsigned char *data, unsigned long size)
{
    (void)data;
    (void)size;
    return 0;
}
@@ -0,0 +1,24 @@
1
+ /* xml_fuzz.c - libFuzzer harness for mkr_xml_parse.
2
+ *
3
+ * Coverage-guided fuzzing of the XML tokenizer + tree builder.
4
+ * Ruby-free; runs directly on the pure-C parser surface.
5
+ */
6
+
7
+ #include <stdint.h>
8
+ #include <stddef.h>
9
+ #include "xml/mkr_xml.h"
10
+
11
+ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
12
+ {
13
+ mkr_xml_status_t status;
14
+ /* The parser contract says "valid UTF-8, NUL-free", but the fuzzer feeds
15
+ * raw bytes. mkr_xml_parse is fail-closed: it validates as it goes and
16
+ * returns an error status on malformed input, never a partial document.
17
+ * We pass the bytes through untouched so the fuzzer can reach the invalid-
18
+ * UTF-8 / unexpected-NUL error paths too. */
19
+ mkr_xml_doc_t *doc = mkr_xml_parse((const char *) data, size, &status);
20
+ if (doc) {
21
+ mkr_xml_doc_destroy(doc);
22
+ }
23
+ return 0;
24
+ }
@@ -0,0 +1,109 @@
1
+ /* xpath_fuzz.c - libFuzzer harness for the XPath compile + eval path.
2
+ *
3
+ * Coverage-guided fuzzing of the XPath lexer, parser, and evaluator.
4
+ * We build a small fixed XML document as the evaluation context, then
5
+ * treat the fuzzer input as the XPath expression string.
6
+ *
7
+ * Ruby-free; runs directly on the pure-C engine surface.
8
+ */
9
+
10
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h> /* free */
#include <string.h>
#include "core/mkr_alloc.h"
#include "xml/mkr_xml.h"
#include "xpath/mkr_xpath.h"
#include "xpath/mkr_xpath_internal.h"
17
+
18
+ /* A small, fixed XML document that gives the evaluator something to walk.
19
+ * The expression is the fuzzer input; the document is static so the coverage
20
+ * signal comes from the engine, not the parser. */
21
+ static const char FIXED_XML[] =
22
+ "<?xml version='1.0'?>"
23
+ "<root xmlns='http://example.com/default' xmlns:ns='http://example.com/ns'>"
24
+ " <a id='1' ns:attr='x'>text1</a>"
25
+ " <b id='2'><c/><c/></b>"
26
+ " <ns:d>namespaced</ns:d>"
27
+ " <!-- comment -->"
28
+ " <?pi target='value'?>"
29
+ "</root>";
30
+
31
+ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
32
+ {
33
+ /* 1. Parse the fixed document. */
34
+ mkr_xml_status_t status;
35
+ mkr_xml_doc_t *doc = mkr_xml_parse(FIXED_XML, sizeof(FIXED_XML) - 1, &status);
36
+ if (!doc) return 0;
37
+ if (!doc->doc_node) {
38
+ mkr_xml_doc_destroy(doc);
39
+ return 0;
40
+ }
41
+
42
+ /* 2. The fuzzer input is the XPath expression.
43
+ * The engine text contract requires no interior NUL and a NUL at
44
+ * ptr[len]. libFuzzer hands us exactly `size` bytes with no terminator,
45
+ * so we copy the expression prefix into an owned, NUL-terminated heap
46
+ * buffer and mint the verified-text token over that copy - this is what
47
+ * supplies the NUL-termination + no-interior-NUL the lexer's strtod and
48
+ * "%.10s" error path rely on. If the input contains a NUL we truncate to
49
+ * the prefix (the lexer hits the terminator and reports a syntax error,
50
+ * a path worth exercising). UTF-8 validity is deliberately NOT
51
+ * pre-checked: the lexer's strict decoder rejecting invalid UTF-8 is
52
+ * itself a path the fuzzer should hit. */
53
+ size_t expr_len = size;
54
+ for (size_t i = 0; i < size; i++) {
55
+ if (data[i] == '\0') {
56
+ expr_len = i;
57
+ break;
58
+ }
59
+ }
60
+ char *expr_copy = mkr_strndup((const char *) data, expr_len);
61
+ if (!expr_copy) {
62
+ mkr_xml_doc_destroy(doc);
63
+ return 0;
64
+ }
65
+ /* Empty expression is a quick syntax error; still worth a run. */
66
+ mkr_verified_text_t expr = { expr_copy, expr_len };
67
+
68
+ /* 3. Compile the expression. */
69
+ mkr_xpath_limits_t limits;
70
+ mkr_xpath_limits_init_defaults(&limits);
71
+ /* Tighten the compile-time budgets so a hostile expression fails fast
72
+ * rather than burning fuzzer time on pathological ASTs. */
73
+ limits.max_ast_nodes = 10000;
74
+ limits.max_expr_bytes = 16 * 1024;
75
+
76
+ mkr_xpath_error_t err = {0};
77
+ mkr_node_t *ast = mkr_parse(expr, &limits, &err);
78
+ if (!ast) {
79
+ mkr_xpath_error_clear(&err);
80
+ free(expr_copy);
81
+ mkr_xml_doc_destroy(doc);
82
+ return 0;
83
+ }
84
+
85
+ /* 4. Evaluate against the fixed document. */
86
+ mkr_xpath_context_t *ctx = mkr_xpath_context_new(doc->doc_node, doc->doc_node);
87
+ if (ctx) {
88
+ mkr_xpath_set_engine_kind(ctx, 1); /* XML engine */
89
+ mkr_xpath_limits_init_defaults(&limits);
90
+ limits.max_eval_ops = 5 * 1000 * 1000; /* 5M ops - enough for a real query */
91
+ limits.max_nodeset_size = 10000;
92
+ limits.max_string_bytes = 1024 * 1024;
93
+ limits.max_recursion_depth = 64;
94
+
95
+ mkr_xpath_value_t out = {0};
96
+ mkr_xpath_error_t eval_err = {0};
97
+ if (mkr_xpath_eval_compiled(ctx, ast, &out, &eval_err) == 0) {
98
+ mkr_xpath_value_clear(&out);
99
+ } else {
100
+ mkr_xpath_error_clear(&eval_err);
101
+ }
102
+ mkr_xpath_context_free(ctx);
103
+ }
104
+
105
+ mkr_node_free(ast);
106
+ free(expr_copy);
107
+ mkr_xml_doc_destroy(doc);
108
+ return 0;
109
+ }