makiri 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/conformance.yml +22 -0
- data/.github/workflows/libfuzzer.yml +83 -0
- data/.github/workflows/security.yml +88 -3
- data/.github/workflows/valgrind.yml +135 -0
- data/CHANGELOG.md +60 -2
- data/README.md +81 -77
- data/Rakefile +194 -3
- data/ext/makiri/bridge/ruby_string.c +119 -66
- data/ext/makiri/core/mkr_alloc.c +40 -3
- data/ext/makiri/core/mkr_alloc.h +27 -4
- data/ext/makiri/core/mkr_buf.c +13 -3
- data/ext/makiri/core/mkr_buf.h +80 -5
- data/ext/makiri/core/mkr_core.c +143 -0
- data/ext/makiri/core/mkr_core.h +10 -1
- data/ext/makiri/core/mkr_span.h +186 -0
- data/ext/makiri/core/mkr_utf8.c +101 -0
- data/ext/makiri/core/mkr_utf8.h +88 -0
- data/ext/makiri/extconf.rb +104 -9
- data/ext/makiri/fuzz/Makefile +95 -0
- data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
- data/ext/makiri/fuzz/xml_fuzz.c +24 -0
- data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
- data/ext/makiri/glue/glue.h +8 -0
- data/ext/makiri/glue/ruby_doc.c +20 -24
- data/ext/makiri/glue/ruby_html_css.c +58 -12
- data/ext/makiri/glue/ruby_html_mutate.c +11 -6
- data/ext/makiri/glue/ruby_html_node.c +3 -32
- data/ext/makiri/glue/ruby_node.c +39 -0
- data/ext/makiri/glue/ruby_xml.c +198 -16
- data/ext/makiri/glue/ruby_xml_node.c +46 -59
- data/ext/makiri/glue/ruby_xpath.c +4 -4
- data/ext/makiri/lexbor_compat/source_loc.c +14 -16
- data/ext/makiri/lexbor_compat/utf8_input.c +5 -78
- data/ext/makiri/makiri.c +45 -0
- data/ext/makiri/xml/mkr_xml.h +2 -3
- data/ext/makiri/xml/mkr_xml_chars.c +67 -97
- data/ext/makiri/xml/mkr_xml_index.c +169 -0
- data/ext/makiri/xml/mkr_xml_index.h +48 -0
- data/ext/makiri/xml/mkr_xml_mutate.c +63 -121
- data/ext/makiri/xml/mkr_xml_node.c +147 -15
- data/ext/makiri/xml/mkr_xml_node.h +71 -6
- data/ext/makiri/xml/mkr_xml_tree.c +185 -149
- data/ext/makiri/xpath/mkr_css.c +1023 -0
- data/ext/makiri/xpath/mkr_css.h +65 -0
- data/ext/makiri/xpath/mkr_xpath.c +37 -0
- data/ext/makiri/xpath/mkr_xpath.h +13 -0
- data/ext/makiri/xpath/mkr_xpath_eval_body.h +373 -90
- data/ext/makiri/xpath/mkr_xpath_funcs_body.h +249 -231
- data/ext/makiri/xpath/mkr_xpath_internal.h +89 -9
- data/ext/makiri/xpath/mkr_xpath_lex.c +94 -124
- data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +6 -3
- data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
- data/ext/makiri/xpath/mkr_xpath_parse.c +79 -90
- data/ext/makiri/xpath/mkr_xpath_shared.c +40 -24
- data/ext/makiri/xpath/mkr_xpath_value_body.h +50 -24
- data/lib/makiri/cdata_section.rb +1 -3
- data/lib/makiri/comment.rb +1 -3
- data/lib/makiri/document.rb +8 -0
- data/lib/makiri/element.rb +1 -3
- data/lib/makiri/processing_instruction.rb +1 -3
- data/lib/makiri/text.rb +1 -3
- data/lib/makiri/version.rb +1 -1
- data/lib/makiri/xml/builder.rb +263 -0
- data/lib/makiri/xml/node_methods.rb +47 -0
- data/lib/makiri.rb +1 -0
- data/script/check_alloc_failures.rb +266 -0
- data/script/check_c_safety.rb +45 -2
- data/script/check_c_safety_allowlist.yml +19 -0
- data/script/check_leaks.rb +64 -0
- data/script/leaks_harness.rb +64 -0
- data/vendor/lexbor/CMakeLists.txt +6 -0
- data/vendor/lexbor/README.md +12 -0
- data/vendor/lexbor/config.cmake +1 -1
- data/vendor/lexbor/source/lexbor/core/base.h +1 -1
- data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
- data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
- data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
- data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
- data/vendor/lexbor/source/lexbor/html/base.h +1 -1
- data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
- data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
- data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
- data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
- data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
- data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
- data/vendor/lexbor/source/lexbor/url/base.h +1 -1
- data/vendor/lexbor/source/lexbor/url/url.c +5 -2
- data/vendor/lexbor/source/lexbor/url/url.h +9 -0
- data/vendor/lexbor/version +1 -1
- metadata +19 -1
|
@@ -16,85 +16,12 @@
|
|
|
16
16
|
* via mkr_utf8_sanitize (declared in compat.h).
|
|
17
17
|
*/
|
|
18
18
|
|
|
19
|
-
/*
|
|
20
|
-
*
|
|
21
|
-
*
|
|
22
|
-
*
|
|
23
|
-
* accept set as Lexbor's decoder but validate-only - it never materialises code
|
|
24
|
-
* points, and rips through ASCII (the common case) a machine word at a time, so
|
|
25
|
-
* it is much cheaper than decode-and-discard. NUL bytes are valid here and are
|
|
26
|
-
* left for the HTML tokenizer to handle per the spec.
|
|
27
|
-
*
|
|
28
|
-
* The contract that matters: this returns true *only* for input that
|
|
19
|
+
/* UTF-8 validation: the shared, allocation-free validator lives in core
|
|
20
|
+
* (core/mkr_utf8.c, via mkr_core.h) so this fast path and the Ruby bridge's
|
|
21
|
+
* strict input gate (mkr_verify_text) run ONE implementation. Its contract
|
|
22
|
+
* here is unchanged: it returns true *only* for input that
|
|
29
23
|
* mkr_utf8_replace_invalid would leave byte-identical, so "valid" can safely
|
|
30
|
-
* skip the transcode. */
|
|
31
|
-
static bool
|
|
32
|
-
mkr_utf8_valid(const lxb_char_t *src, size_t len)
|
|
33
|
-
{
|
|
34
|
-
const unsigned char *p = (const unsigned char *)src;
|
|
35
|
-
const unsigned char *const end = p + len;
|
|
36
|
-
|
|
37
|
-
while (p < end) {
|
|
38
|
-
unsigned char b = *p;
|
|
39
|
-
|
|
40
|
-
if (b < 0x80) {
|
|
41
|
-
/* ASCII fast path: skip a run of ASCII bytes a word at a time
|
|
42
|
-
* (any high bit set ends the run), then byte-wise for the tail. */
|
|
43
|
-
while ((size_t)(end - p) >= sizeof(size_t)) {
|
|
44
|
-
size_t w;
|
|
45
|
-
memcpy(&w, p, sizeof(w));
|
|
46
|
-
if (w & (size_t)0x8080808080808080ULL) {
|
|
47
|
-
break;
|
|
48
|
-
}
|
|
49
|
-
p += sizeof(size_t);
|
|
50
|
-
}
|
|
51
|
-
while (p < end && *p < 0x80) {
|
|
52
|
-
p++;
|
|
53
|
-
}
|
|
54
|
-
continue;
|
|
55
|
-
}
|
|
56
|
-
|
|
57
|
-
/* Multi-byte: decide length and validate the (length-dependent) ranges
|
|
58
|
-
* that exclude overlong forms, surrogates and > U+10FFFF. */
|
|
59
|
-
size_t n;
|
|
60
|
-
if (b >= 0xC2 && b <= 0xDF) { /* U+0080..U+07FF */
|
|
61
|
-
n = 2;
|
|
62
|
-
if (end - p < 2 || (p[1] & 0xC0) != 0x80) return false;
|
|
63
|
-
} else if (b == 0xE0) { /* U+0800..U+0FFF */
|
|
64
|
-
n = 3;
|
|
65
|
-
if (end - p < 3 || p[1] < 0xA0 || p[1] > 0xBF
|
|
66
|
-
|| (p[2] & 0xC0) != 0x80) return false;
|
|
67
|
-
} else if (b >= 0xE1 && b <= 0xEC) { /* U+1000..U+CFFF */
|
|
68
|
-
n = 3;
|
|
69
|
-
if (end - p < 3 || (p[1] & 0xC0) != 0x80
|
|
70
|
-
|| (p[2] & 0xC0) != 0x80) return false;
|
|
71
|
-
} else if (b == 0xED) { /* U+D000..U+D7FF */
|
|
72
|
-
n = 3; /* (excludes surrogates) */
|
|
73
|
-
if (end - p < 3 || p[1] < 0x80 || p[1] > 0x9F
|
|
74
|
-
|| (p[2] & 0xC0) != 0x80) return false;
|
|
75
|
-
} else if (b == 0xEE || b == 0xEF) { /* U+E000..U+FFFF */
|
|
76
|
-
n = 3;
|
|
77
|
-
if (end - p < 3 || (p[1] & 0xC0) != 0x80
|
|
78
|
-
|| (p[2] & 0xC0) != 0x80) return false;
|
|
79
|
-
} else if (b == 0xF0) { /* U+10000..U+3FFFF */
|
|
80
|
-
n = 4;
|
|
81
|
-
if (end - p < 4 || p[1] < 0x90 || p[1] > 0xBF
|
|
82
|
-
|| (p[2] & 0xC0) != 0x80 || (p[3] & 0xC0) != 0x80) return false;
|
|
83
|
-
} else if (b >= 0xF1 && b <= 0xF3) { /* U+40000..U+FFFFF */
|
|
84
|
-
n = 4;
|
|
85
|
-
if (end - p < 4 || (p[1] & 0xC0) != 0x80 || (p[2] & 0xC0) != 0x80
|
|
86
|
-
|| (p[3] & 0xC0) != 0x80) return false;
|
|
87
|
-
} else if (b == 0xF4) { /* U+100000..U+10FFFF */
|
|
88
|
-
n = 4;
|
|
89
|
-
if (end - p < 4 || p[1] < 0x80 || p[1] > 0x8F
|
|
90
|
-
|| (p[2] & 0xC0) != 0x80 || (p[3] & 0xC0) != 0x80) return false;
|
|
91
|
-
} else { /* C0,C1,F5..FF,stray 80..BF */
|
|
92
|
-
return false;
|
|
93
|
-
}
|
|
94
|
-
p += n;
|
|
95
|
-
}
|
|
96
|
-
return true;
|
|
97
|
-
}
|
|
24
|
+
* skip the transcode. (lxb_char_t is unsigned char, so the call is direct.) */
|
|
98
25
|
|
|
99
26
|
/* Transcode UTF-8 -> UTF-8 replacing every invalid sequence with U+FFFD
|
|
100
27
|
* (WHATWG byte-stream decoding), into a freshly malloc'd, NUL-terminated
|
data/ext/makiri/makiri.c
CHANGED
|
@@ -79,6 +79,48 @@ mkr_c_selftest(VALUE self)
|
|
|
79
79
|
return Qtrue;
|
|
80
80
|
}
|
|
81
81
|
|
|
82
|
+
/* Makiri.__alloc_inject(n) / __alloc_inject_calls / __alloc_inject? - the OOM
|
|
83
|
+
* sweep harness's controls (script/check_alloc_failures.rb, `rake oom`): arm
|
|
84
|
+
* "the nth core allocation fails once", and read how many core allocations a
|
|
85
|
+
* workload attempted. Compiled to a real hook only under MKR_ALLOC_INJECT
|
|
86
|
+
* (extconf: MAKIRI_ALLOC_INJECT=1); in a normal build __alloc_inject? is
|
|
87
|
+
* false and the others raise, so the harness fails loudly on the wrong build
|
|
88
|
+
* instead of sweeping nothing. Test hooks only. */
|
|
89
|
+
static VALUE
|
|
90
|
+
mkr_s_alloc_inject_p(VALUE self)
|
|
91
|
+
{
|
|
92
|
+
(void)self;
|
|
93
|
+
#ifdef MKR_ALLOC_INJECT
|
|
94
|
+
return Qtrue;
|
|
95
|
+
#else
|
|
96
|
+
return Qfalse;
|
|
97
|
+
#endif
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
static VALUE
|
|
101
|
+
mkr_s_alloc_inject(VALUE self, VALUE nth)
|
|
102
|
+
{
|
|
103
|
+
(void)self;
|
|
104
|
+
#ifdef MKR_ALLOC_INJECT
|
|
105
|
+
mkr_alloc_inject_arm((long long)NUM2LL(nth));
|
|
106
|
+
return Qnil;
|
|
107
|
+
#else
|
|
108
|
+
(void)nth;
|
|
109
|
+
rb_raise(rb_eNotImpError, "rebuild with MAKIRI_ALLOC_INJECT=1 (rake oom does this)");
|
|
110
|
+
#endif
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
static VALUE
|
|
114
|
+
mkr_s_alloc_inject_calls(VALUE self)
|
|
115
|
+
{
|
|
116
|
+
(void)self;
|
|
117
|
+
#ifdef MKR_ALLOC_INJECT
|
|
118
|
+
return ULL2NUM(mkr_alloc_inject_calls());
|
|
119
|
+
#else
|
|
120
|
+
rb_raise(rb_eNotImpError, "rebuild with MAKIRI_ALLOC_INJECT=1 (rake oom does this)");
|
|
121
|
+
#endif
|
|
122
|
+
}
|
|
123
|
+
|
|
82
124
|
/* Makiri::XML.__decode(str) -> validated, UTF-8-tagged String, or raises
|
|
83
125
|
* Makiri::XML::SyntaxError. Internal test hook exercising the strict input
|
|
84
126
|
* decode (§2.1) on its own, until the full Makiri::XML(...) parse pipeline
|
|
@@ -223,5 +265,8 @@ Init_makiri(void)
|
|
|
223
265
|
mkr_init_xml_node();
|
|
224
266
|
|
|
225
267
|
rb_define_singleton_method(mkr_mMakiri, "__c_selftest", mkr_c_selftest, 0);
|
|
268
|
+
rb_define_singleton_method(mkr_mMakiri, "__alloc_inject?", mkr_s_alloc_inject_p, 0);
|
|
269
|
+
rb_define_singleton_method(mkr_mMakiri, "__alloc_inject", mkr_s_alloc_inject, 1);
|
|
270
|
+
rb_define_singleton_method(mkr_mMakiri, "__alloc_inject_calls", mkr_s_alloc_inject_calls, 0);
|
|
226
271
|
rb_define_singleton_method(mkr_mXML, "__decode", mkr_xml_s_decode, 1);
|
|
227
272
|
}
|
data/ext/makiri/xml/mkr_xml.h
CHANGED
|
@@ -99,9 +99,8 @@ int mkr_xml_is_char(uint32_t c);
|
|
|
99
99
|
* recognition (comment/CDATA/PI content, where '&'/'<' are literal). 0 / -1. */
|
|
100
100
|
int mkr_xml_validate_chars(const char *src, uint32_t len);
|
|
101
101
|
|
|
102
|
-
/*
|
|
103
|
-
*
|
|
104
|
-
int mkr_xml_utf8_decode(const char *p, const char *end, uint32_t *cp);
|
|
102
|
+
/* XML 1.0 §2.3 NameStartChar / NameChar (§9.2b). One-codepoint decoding is the
|
|
103
|
+
* core mkr_utf8_decode1 / mkr_utf8_decode1_span (strict, bounds-checked). */
|
|
105
104
|
int mkr_xml_is_name_start(uint32_t c);
|
|
106
105
|
int mkr_xml_is_name_char(uint32_t c);
|
|
107
106
|
|
|
@@ -7,9 +7,16 @@
|
|
|
7
7
|
* resulting codepoint (literal or from a numeric reference) must be an XML 1.0
|
|
8
8
|
* Char; control characters / surrogates / out-of-range are rejected. The input
|
|
9
9
|
* is already valid UTF-8 (§2.1).
|
|
10
|
+
*
|
|
11
|
+
* All input reads go through the bounded reader (core mkr_span_t) and the
|
|
12
|
+
* strict core decoder (mkr_utf8_decode1) - an out-of-bounds read is
|
|
13
|
+
* structurally impossible, not a per-site convention; all output goes through
|
|
14
|
+
* the bounded writer (mkr_spanbuf_t). The lint (raw_scan_call /
|
|
15
|
+
* raw_cursor_member) keeps it that way.
|
|
10
16
|
*/
|
|
11
17
|
#include "mkr_xml.h"
|
|
12
18
|
#include "mkr_xml_node.h"
|
|
19
|
+
#include "../core/mkr_core.h" /* mkr_span_t + mkr_utf8_decode1 */
|
|
13
20
|
|
|
14
21
|
#include <string.h>
|
|
15
22
|
|
|
@@ -24,73 +31,25 @@ mkr_xml_is_char(uint32_t c)
|
|
|
24
31
|
|| (c >= 0x10000u && c <= 0x10FFFFu);
|
|
25
32
|
}
|
|
26
33
|
|
|
27
|
-
/* Decode one codepoint from UTF-8. STRICT (self-contained, not trusting the
|
|
28
|
-
* caller): rejects truncation, bad continuation bytes, overlong encodings,
|
|
29
|
-
* surrogates and out-of-range values. Returns the byte length (1-4) or 0 on any
|
|
30
|
-
* violation - fail closed, even if some future caller feeds unvalidated bytes. */
|
|
31
|
-
static int
|
|
32
|
-
is_cont(const char *p, const char *end)
|
|
33
|
-
{
|
|
34
|
-
return p < end && ((unsigned char)*p & 0xC0u) == 0x80u;
|
|
35
|
-
}
|
|
36
|
-
|
|
37
|
-
/* forward decl: utf8_decode is defined below but mkr_xml_validate_chars uses it. */
|
|
38
|
-
static int utf8_decode(const char *p, const char *end, uint32_t *cp);
|
|
39
|
-
|
|
40
34
|
/* Validate that [src, src+len) is entirely XML 1.0 Char, with NO entity/reference
|
|
41
35
|
* recognition (for comment/CDATA/PI content, where '&' and '<' are literal). 0 if
|
|
42
|
-
* all valid, -1 on the first malformed UTF-8 or non-Char (caller raises SYNTAX).
|
|
36
|
+
* all valid, -1 on the first malformed UTF-8 or non-Char (caller raises SYNTAX).
|
|
37
|
+
* The strict decode (truncation / bad continuation / overlong / surrogate /
|
|
38
|
+
* out-of-range -> 0) is the core mkr_utf8_decode1 - self-contained, not trusting
|
|
39
|
+
* the caller, even if some future caller feeds unvalidated bytes. */
|
|
43
40
|
int
|
|
44
41
|
mkr_xml_validate_chars(const char *src, uint32_t len)
|
|
45
42
|
{
|
|
46
|
-
|
|
47
|
-
while (
|
|
43
|
+
mkr_span_t s = mkr_span(src, len);
|
|
44
|
+
while (mkr_span_left(&s) > 0) {
|
|
48
45
|
uint32_t cp;
|
|
49
|
-
int bl =
|
|
46
|
+
int bl = mkr_utf8_decode1_span(&s, &cp);
|
|
50
47
|
if (bl == 0 || !mkr_xml_is_char(cp)) return -1;
|
|
51
|
-
|
|
48
|
+
mkr_span_skip(&s, (size_t)bl);
|
|
52
49
|
}
|
|
53
50
|
return 0;
|
|
54
51
|
}
|
|
55
52
|
|
|
56
|
-
static int
|
|
57
|
-
utf8_decode(const char *p, const char *end, uint32_t *cp)
|
|
58
|
-
{
|
|
59
|
-
unsigned char c = (unsigned char)p[0];
|
|
60
|
-
if (c < 0x80u) { *cp = c; return 1; }
|
|
61
|
-
if ((c & 0xE0u) == 0xC0u) {
|
|
62
|
-
if (!is_cont(p + 1, end)) return 0;
|
|
63
|
-
uint32_t v = ((uint32_t)(c & 0x1Fu) << 6) | ((unsigned char)p[1] & 0x3Fu);
|
|
64
|
-
if (v < 0x80u) return 0; /* overlong */
|
|
65
|
-
*cp = v; return 2;
|
|
66
|
-
}
|
|
67
|
-
if ((c & 0xF0u) == 0xE0u) {
|
|
68
|
-
if (!is_cont(p + 1, end) || !is_cont(p + 2, end)) return 0;
|
|
69
|
-
uint32_t v = ((uint32_t)(c & 0x0Fu) << 12) | (((unsigned char)p[1] & 0x3Fu) << 6)
|
|
70
|
-
| ((unsigned char)p[2] & 0x3Fu);
|
|
71
|
-
if (v < 0x800u) return 0; /* overlong */
|
|
72
|
-
if (v >= 0xD800u && v <= 0xDFFFu) return 0; /* surrogate */
|
|
73
|
-
*cp = v; return 3;
|
|
74
|
-
}
|
|
75
|
-
if ((c & 0xF8u) == 0xF0u) {
|
|
76
|
-
if (!is_cont(p + 1, end) || !is_cont(p + 2, end) || !is_cont(p + 3, end)) return 0;
|
|
77
|
-
uint32_t v = ((uint32_t)(c & 0x07u) << 18) | (((unsigned char)p[1] & 0x3Fu) << 12)
|
|
78
|
-
| (((unsigned char)p[2] & 0x3Fu) << 6) | ((unsigned char)p[3] & 0x3Fu);
|
|
79
|
-
if (v < 0x10000u || v > 0x10FFFFu) return 0; /* overlong / out of range */
|
|
80
|
-
*cp = v; return 4;
|
|
81
|
-
}
|
|
82
|
-
return 0;
|
|
83
|
-
}
|
|
84
|
-
|
|
85
|
-
/* Public, bounds-checked one-codepoint decode for the tokenizer's name scanning
|
|
86
|
-
* (returns 0 at end-of-input as well as on any malformed byte). */
|
|
87
|
-
int
|
|
88
|
-
mkr_xml_utf8_decode(const char *p, const char *end, uint32_t *cp)
|
|
89
|
-
{
|
|
90
|
-
if (p >= end) return 0;
|
|
91
|
-
return utf8_decode(p, end, cp);
|
|
92
|
-
}
|
|
93
|
-
|
|
94
53
|
/* XML 1.0 §2.3 NameStartChar / NameChar (the full Unicode sets, not just ASCII).
|
|
95
54
|
* Element/attribute QNames and PI targets are validated against these so an
|
|
96
55
|
* ill-formed name never reaches the DOM (§9.2b). */
|
|
@@ -131,9 +90,12 @@ utf8_encode(uint32_t cp, char *out)
|
|
|
131
90
|
|
|
132
91
|
/* Expand references in [src, src+len) and validate XML Char. Output is never
|
|
133
92
|
* longer than the input (every &...; reference is >= 4 chars and yields <= 4
|
|
134
|
-
* bytes), so a
|
|
135
|
-
*
|
|
136
|
-
*
|
|
93
|
+
* bytes), so a buffer of `len` bytes suffices - and the bounded arena writer
|
|
94
|
+
* (mkr_xml_arena_spanbuf + core mkr_spanbuf) enforces that bound rather than
|
|
95
|
+
* trusting it (a write past it fails closed instead of overrunning the arena).
|
|
96
|
+
* Returns the arena slice and sets *out_len;
|
|
97
|
+
* returns NULL on an undefined entity / bad reference / non-XML-Char (sets
|
|
98
|
+
* *status = MKR_XML_ERR_SYNTAX) or arena OOM/LIMIT. */
|
|
137
99
|
const char *
|
|
138
100
|
mkr_xml_expand(mkr_xml_doc_t *doc, const char *src, uint32_t len,
|
|
139
101
|
mkr_xml_expand_mode_t mode, uint32_t *out_len, mkr_xml_status_t *status)
|
|
@@ -148,46 +110,45 @@ mkr_xml_expand(mkr_xml_doc_t *doc, const char *src, uint32_t len,
|
|
|
148
110
|
if (len == 0) { *out_len = 0; return ""; }
|
|
149
111
|
if (doc == NULL || src == NULL) { *status = MKR_XML_ERR_INTERNAL; return NULL; }
|
|
150
112
|
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
const char *end = src + len;
|
|
113
|
+
/* All output goes through the bounded writer, so no code below can overrun
|
|
114
|
+
* the buffer - a write that would exceed `len` is refused and latches ok=0. */
|
|
115
|
+
mkr_spanbuf_t b = mkr_xml_arena_spanbuf(doc, len);
|
|
116
|
+
if (!b.ok) { *status = doc->oom; return NULL; } /* backing alloc failed */
|
|
117
|
+
mkr_span_t s = mkr_span(src, len);
|
|
157
118
|
|
|
158
|
-
while (
|
|
159
|
-
if (
|
|
119
|
+
while (mkr_span_left(&s) > 0) {
|
|
120
|
+
if (mkr_span_peek(&s) != '&') {
|
|
160
121
|
uint32_t cp;
|
|
161
|
-
int bl =
|
|
122
|
+
int bl = mkr_utf8_decode1_span(&s, &cp);
|
|
162
123
|
if (bl == 0 || !mkr_xml_is_char(cp)) { *status = MKR_XML_ERR_SYNTAX; return NULL; }
|
|
163
124
|
/* §3.3.3: in an attribute value a *literal* TAB/LF/CR normalizes to a
|
|
164
125
|
* space (reference-derived whitespace is preserved - see below). */
|
|
165
126
|
if (mode == MKR_XML_EXPAND_ATTR
|
|
166
127
|
&& (cp == 0x9u || cp == 0xAu || cp == 0xDu)) {
|
|
167
|
-
|
|
168
|
-
|
|
128
|
+
mkr_spanbuf_putc(&b, ' ');
|
|
129
|
+
mkr_span_skip(&s, (size_t)bl);
|
|
169
130
|
continue;
|
|
170
131
|
}
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
p += bl;
|
|
132
|
+
mkr_spanbuf_write(&b, mkr_span_mark(&s), (size_t)bl);
|
|
133
|
+
mkr_span_skip(&s, (size_t)bl);
|
|
174
134
|
continue;
|
|
175
135
|
}
|
|
176
136
|
|
|
177
137
|
/* a reference: '&' ... ';' */
|
|
178
|
-
|
|
179
|
-
if (
|
|
180
|
-
|
|
138
|
+
mkr_span_skip(&s, 1); /* past '&' */
|
|
139
|
+
if (mkr_span_peek(&s) == '#') { /* numeric character reference */
|
|
140
|
+
mkr_span_skip(&s, 1);
|
|
181
141
|
int hex = 0;
|
|
182
142
|
/* §4.1: the hex marker is a lowercase 'x' only - "X" is not-wf
|
|
183
143
|
* (an uppercase 'X' is not a decimal digit either, so it is rejected
|
|
184
144
|
* as "no digits" below). */
|
|
185
|
-
if (
|
|
186
|
-
const char *digits = p;
|
|
145
|
+
if (mkr_span_peek(&s) == 'x') { hex = 1; mkr_span_skip(&s, 1); }
|
|
187
146
|
uint32_t base = hex ? 16u : 10u;
|
|
188
147
|
uint32_t cp = 0;
|
|
189
|
-
|
|
190
|
-
|
|
148
|
+
int ndigits = 0;
|
|
149
|
+
for (;;) {
|
|
150
|
+
int d = mkr_span_peek(&s);
|
|
151
|
+
if (d < 0 || d == ';') break;
|
|
191
152
|
uint32_t dig;
|
|
192
153
|
if (d >= '0' && d <= '9') dig = (uint32_t)(d - '0');
|
|
193
154
|
else if (hex && d >= 'a' && d <= 'f') dig = (uint32_t)(d - 'a' + 10);
|
|
@@ -197,29 +158,38 @@ mkr_xml_expand(mkr_xml_doc_t *doc, const char *src, uint32_t len,
|
|
|
197
158
|
* uint32_t into the valid range and be falsely accepted. */
|
|
198
159
|
if (cp > (0x10FFFFu - dig) / base) { *status = MKR_XML_ERR_SYNTAX; return NULL; }
|
|
199
160
|
cp = cp * base + dig;
|
|
200
|
-
|
|
161
|
+
ndigits++;
|
|
162
|
+
mkr_span_skip(&s, 1);
|
|
201
163
|
}
|
|
202
|
-
if (
|
|
203
|
-
|
|
164
|
+
if (mkr_span_peek(&s) != ';' || ndigits == 0) { *status = MKR_XML_ERR_SYNTAX; return NULL; } /* no ';' / no digits */
|
|
165
|
+
mkr_span_skip(&s, 1); /* past ';' */
|
|
204
166
|
if (!mkr_xml_is_char(cp)) { *status = MKR_XML_ERR_SYNTAX; return NULL; }
|
|
205
|
-
|
|
167
|
+
/* encode into a safe 4-byte local, then hand the exact length to the
|
|
168
|
+
* bounded writer (the encode itself can never overrun `enc`). */
|
|
169
|
+
char enc[4];
|
|
170
|
+
mkr_spanbuf_write(&b, enc, (size_t)utf8_encode(cp, enc));
|
|
206
171
|
} else { /* named entity */
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
p++; /* past ';' */
|
|
172
|
+
size_t nlen;
|
|
173
|
+
if (!mkr_span_find(&s, ';', &nlen)) { *status = MKR_XML_ERR_SYNTAX; return NULL; } /* unterminated */
|
|
174
|
+
const char *ns = mkr_span_mark(&s);
|
|
175
|
+
mkr_span_skip(&s, nlen + 1); /* name + ';' */
|
|
212
176
|
char ch;
|
|
213
|
-
if (
|
|
214
|
-
else if (
|
|
215
|
-
else if (
|
|
216
|
-
else if (
|
|
217
|
-
else if (
|
|
177
|
+
if (mkr_bytes_eq(ns, nlen, "lt", 2)) ch = '<';
|
|
178
|
+
else if (mkr_bytes_eq(ns, nlen, "gt", 2)) ch = '>';
|
|
179
|
+
else if (mkr_bytes_eq(ns, nlen, "amp", 3)) ch = '&';
|
|
180
|
+
else if (mkr_bytes_eq(ns, nlen, "apos", 4)) ch = '\'';
|
|
181
|
+
else if (mkr_bytes_eq(ns, nlen, "quot", 4)) ch = '"';
|
|
218
182
|
else { *status = MKR_XML_ERR_SYNTAX; return NULL; } /* undefined entity */
|
|
219
|
-
|
|
183
|
+
mkr_spanbuf_putc(&b, ch);
|
|
220
184
|
}
|
|
221
185
|
}
|
|
222
186
|
|
|
223
|
-
|
|
224
|
-
|
|
187
|
+
/* By construction the buffer was never overrun; finish returns NULL if a
|
|
188
|
+
* write was refused (the "output <= input" invariant broke - our bug). The
|
|
189
|
+
* backing alloc was already checked at init, so a NULL here means a refused
|
|
190
|
+
* write: fail closed rather than return a truncated expansion. */
|
|
191
|
+
const char *out = mkr_spanbuf_finish(&b);
|
|
192
|
+
if (out == NULL) { *status = MKR_XML_ERR_INTERNAL; return NULL; }
|
|
193
|
+
*out_len = (uint32_t)b.pos;
|
|
194
|
+
return out;
|
|
225
195
|
}
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
/* Element-name index for the XML arena (see mkr_xml_index.h). Open-addressing
|
|
2
|
+
* hash keyed by (local name + namespace URI); each entry holds the
|
|
3
|
+
* document-ordered elements with that name. Two tree passes: count elements to
|
|
4
|
+
* size the table, then fill in document order. Heap-allocated (not the arena),
|
|
5
|
+
* freed on invalidate. OOM fails closed (NULL -> caller walks the tree). */
|
|
6
|
+
|
|
7
|
+
#include "mkr_xml_index.h"
|
|
8
|
+
#include "../core/mkr_core.h"
|
|
9
|
+
|
|
10
|
+
#include <stdlib.h>
|
|
11
|
+
#include <string.h>
|
|
12
|
+
|
|
13
|
+
typedef struct {
|
|
14
|
+
const char *local; /* borrowed from the first element (arena-stable) */
|
|
15
|
+
const char *ns_uri; /* may be NULL (no namespace) */
|
|
16
|
+
uint32_t local_len;
|
|
17
|
+
uint32_t ns_uri_len;
|
|
18
|
+
mkr_xml_node_t **nodes; /* document order */
|
|
19
|
+
size_t count, cap;
|
|
20
|
+
} mkr_xml_index_entry_t;
|
|
21
|
+
|
|
22
|
+
struct mkr_xml_name_index {
|
|
23
|
+
mkr_xml_index_entry_t *buckets;
|
|
24
|
+
size_t cap; /* power of two; 0 only for an empty document */
|
|
25
|
+
};
|
|
26
|
+
|
|
27
|
+
/* 64-bit FNV-1a parameters. The offset basis is 14695981039346656037
 * (0xcbf29ce484222325); the earlier literal 1469598103934665603 was that
 * value with its final digit missing. */
#define KEY_HASH_FNV64_OFFSET_BASIS 0xcbf29ce484222325ULL
#define KEY_HASH_FNV64_PRIME        0x100000001b3ULL

/* Fold +len+ bytes of +bytes+ into the running FNV-1a state +h+: xor in the
 * byte, then multiply by the prime. A zero +len+ never reads +bytes+, so a
 * NULL pointer is fine for an empty run. */
static uint64_t
key_hash_fold(uint64_t h, const char *bytes, size_t len)
{
    for (size_t i = 0; i < len; i++) {
        h ^= (unsigned char)bytes[i];
        h *= KEY_HASH_FNV64_PRIME;
    }
    return h;
}

/* FNV-1a over the local name, then a separator step, then the namespace URI.
 *
 * local / local_len    name bytes (not NUL-terminated; may be NULL when len is 0)
 * ns_uri / ns_uri_len  namespace bytes (may be NULL when len is 0)
 *
 * Returns the 64-bit hash. The value is only used to pick a slot in the
 * in-memory table, and both insertion and lookup go through this function. */
static uint64_t
key_hash(const char *local, size_t local_len, const char *ns_uri, size_t ns_uri_len)
{
    uint64_t h = key_hash_fold(KEY_HASH_FNV64_OFFSET_BASIS, local, local_len);

    /* separator so "ab"+"" != "a"+"b" */
    h ^= 0xff;
    h *= KEY_HASH_FNV64_PRIME;

    return key_hash_fold(h, ns_uri, ns_uri_len);
}
|
|
37
|
+
|
|
38
|
+
static int
|
|
39
|
+
key_eq(const mkr_xml_index_entry_t *e, const char *local, size_t local_len,
|
|
40
|
+
const char *ns_uri, size_t ns_uri_len)
|
|
41
|
+
{
|
|
42
|
+
if (e->local_len != local_len || e->ns_uri_len != ns_uri_len) return 0;
|
|
43
|
+
if (local_len && memcmp(e->local, local, local_len) != 0) return 0;
|
|
44
|
+
if (ns_uri_len && memcmp(e->ns_uri, ns_uri, ns_uri_len) != 0) return 0;
|
|
45
|
+
return 1;
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
/* Find the entry for the key, or the empty slot to create it (open addressing,
|
|
50
|
+
* power-of-two mask). The table is sized so it never fills (load < 0.5). */
|
|
51
|
+
static mkr_xml_index_entry_t *
|
|
52
|
+
slot_for(mkr_xml_name_index_t *idx, const char *local, size_t local_len,
|
|
53
|
+
const char *ns_uri, size_t ns_uri_len)
|
|
54
|
+
{
|
|
55
|
+
size_t mask = idx->cap - 1;
|
|
56
|
+
size_t i = (size_t)key_hash(local, local_len, ns_uri, ns_uri_len) & mask;
|
|
57
|
+
for (;;) {
|
|
58
|
+
mkr_xml_index_entry_t *e = &idx->buckets[i];
|
|
59
|
+
if (e->local == NULL && e->count == 0) return e; /* empty */
|
|
60
|
+
if (key_eq(e, local, local_len, ns_uri, ns_uri_len)) return e;
|
|
61
|
+
i = (i + 1) & mask;
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
/* Append +node+ to its key's bucket, creating the entry if new. 0 / -1 (OOM). */
|
|
66
|
+
static int
|
|
67
|
+
index_push(mkr_xml_name_index_t *idx, mkr_xml_node_t *node)
|
|
68
|
+
{
|
|
69
|
+
mkr_xml_index_entry_t *e =
|
|
70
|
+
slot_for(idx, node->local, node->local_len, node->ns_uri, node->ns_uri_len);
|
|
71
|
+
if (e->local == NULL && e->count == 0) { /* fresh entry: borrow the key */
|
|
72
|
+
e->local = node->local; e->local_len = node->local_len;
|
|
73
|
+
e->ns_uri = node->ns_uri; e->ns_uri_len = node->ns_uri_len;
|
|
74
|
+
}
|
|
75
|
+
if (mkr_grow_reserve((void **)&e->nodes, &e->cap, e->count + 1, sizeof(*e->nodes)) != MKR_OK) {
|
|
76
|
+
return -1; /* grows geometrically + overflow-safely internally */
|
|
77
|
+
}
|
|
78
|
+
e->nodes[e->count++] = node;
|
|
79
|
+
return 0;
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
/* Count elements (document order is irrelevant here) to size the table. */
|
|
83
|
+
static size_t
|
|
84
|
+
count_elements(mkr_xml_node_t *root)
|
|
85
|
+
{
|
|
86
|
+
size_t n = 0;
|
|
87
|
+
for (mkr_xml_node_t *cur = root; cur != NULL; cur = mkr_xml_preorder_next(root, cur)) {
|
|
88
|
+
if (cur->type == MKR_XML_NODE_TYPE_ELEMENT) n++;
|
|
89
|
+
}
|
|
90
|
+
return n;
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
static mkr_xml_name_index_t *
|
|
94
|
+
build(mkr_xml_doc_t *doc)
|
|
95
|
+
{
|
|
96
|
+
mkr_xml_node_t *root = doc->doc_node;
|
|
97
|
+
if (root == NULL) return NULL;
|
|
98
|
+
|
|
99
|
+
mkr_xml_name_index_t *idx = (mkr_xml_name_index_t *)mkr_callocarray(1, sizeof(*idx));
|
|
100
|
+
if (idx == NULL) return NULL;
|
|
101
|
+
|
|
102
|
+
size_t n = count_elements(root);
|
|
103
|
+
/* Size for load factor < 0.5 (2n+1 slots). The overflow-checked sizer fails
|
|
104
|
+
* closed - unlike the old next_pow2, which saturated to a too-small table on
|
|
105
|
+
* overflow, where open addressing could never find a free slot. */
|
|
106
|
+
size_t want;
|
|
107
|
+
if (!mkr_size_mul(n, 2, &want) || !mkr_size_add(want, 1, &want)
|
|
108
|
+
|| !mkr_pow2_ceil(want, &idx->cap)) { free(idx); return NULL; }
|
|
109
|
+
if (idx->cap < 8) idx->cap = 8; /* small floor */
|
|
110
|
+
idx->buckets = (mkr_xml_index_entry_t *)mkr_callocarray(idx->cap, sizeof(*idx->buckets));
|
|
111
|
+
if (idx->buckets == NULL) { free(idx); return NULL; }
|
|
112
|
+
|
|
113
|
+
/* Fill pass: pre-order (document order), elements only. */
|
|
114
|
+
for (mkr_xml_node_t *cur = root; cur != NULL; cur = mkr_xml_preorder_next(root, cur)) {
|
|
115
|
+
if (cur->type == MKR_XML_NODE_TYPE_ELEMENT) {
|
|
116
|
+
if (index_push(idx, cur) != 0) { mkr_xml_name_index_free(idx); return NULL; }
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
return idx;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
mkr_xml_name_index_t *
|
|
123
|
+
mkr_xml_name_index_get(mkr_xml_doc_t *doc)
|
|
124
|
+
{
|
|
125
|
+
if (doc == NULL) return NULL;
|
|
126
|
+
if (doc->name_index != NULL) return (mkr_xml_name_index_t *)doc->name_index;
|
|
127
|
+
mkr_xml_name_index_t *idx = build(doc);
|
|
128
|
+
doc->name_index = idx; /* NULL on OOM: caller walks, retries next time */
|
|
129
|
+
return idx;
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
void
|
|
133
|
+
mkr_xml_name_index_free(mkr_xml_name_index_t *idx)
|
|
134
|
+
{
|
|
135
|
+
if (idx == NULL) return;
|
|
136
|
+
if (idx->buckets != NULL) {
|
|
137
|
+
for (size_t i = 0; i < idx->cap; i++) free(idx->buckets[i].nodes);
|
|
138
|
+
free(idx->buckets);
|
|
139
|
+
}
|
|
140
|
+
free(idx);
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
void
|
|
144
|
+
mkr_xml_name_index_invalidate(mkr_xml_doc_t *doc)
|
|
145
|
+
{
|
|
146
|
+
if (doc == NULL || doc->name_index == NULL) return;
|
|
147
|
+
mkr_xml_name_index_free((mkr_xml_name_index_t *)doc->name_index);
|
|
148
|
+
doc->name_index = NULL;
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
mkr_xml_node_t *const *
|
|
152
|
+
mkr_xml_name_index_lookup(const mkr_xml_name_index_t *idx,
|
|
153
|
+
const char *local, size_t local_len,
|
|
154
|
+
const char *ns_uri, size_t ns_uri_len, size_t *out_count)
|
|
155
|
+
{
|
|
156
|
+
if (out_count != NULL) *out_count = 0;
|
|
157
|
+
if (idx == NULL || idx->cap == 0 || local == NULL) return NULL;
|
|
158
|
+
size_t mask = idx->cap - 1;
|
|
159
|
+
size_t i = (size_t)key_hash(local, local_len, ns_uri, ns_uri_len) & mask;
|
|
160
|
+
for (;;) {
|
|
161
|
+
const mkr_xml_index_entry_t *e = &idx->buckets[i];
|
|
162
|
+
if (e->local == NULL && e->count == 0) return NULL; /* miss */
|
|
163
|
+
if (key_eq(e, local, local_len, ns_uri, ns_uri_len)) {
|
|
164
|
+
if (out_count != NULL) *out_count = e->count;
|
|
165
|
+
return e->nodes;
|
|
166
|
+
}
|
|
167
|
+
i = (i + 1) & mask;
|
|
168
|
+
}
|
|
169
|
+
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
#ifndef MKR_XML_INDEX_H
|
|
2
|
+
#define MKR_XML_INDEX_H
|
|
3
|
+
|
|
4
|
+
#include "mkr_xml_node.h"
|
|
5
|
+
#include <stddef.h>
|
|
6
|
+
|
|
7
|
+
#ifdef __cplusplus
|
|
8
|
+
extern "C" {
|
|
9
|
+
#endif
|
|
10
|
+
|
|
11
|
+
/*
|
|
12
|
+
* Element-name index for the XML arena: maps each (local name + namespace URI)
|
|
13
|
+
* to the document-ordered list of elements bearing it, so a document-rooted
|
|
14
|
+
* descendant name test (//entry, css("entry")) is answered from the bucket
|
|
15
|
+
* instead of walking the whole tree. The HTML side has the analogous tag-id
|
|
16
|
+
* index; XML element names are arbitrary strings, so this is keyed by the name
|
|
17
|
+
* bytes (borrowed from the arena, stable until the next mutation).
|
|
18
|
+
*
|
|
19
|
+
* Lazily built and cached on the document; dropped by
|
|
20
|
+
* mkr_xml_name_index_invalidate from the single XML mutation hook (the same
|
|
21
|
+
* discipline as the HTML attr/text indices). Build OOM fails closed: the getter
|
|
22
|
+
* returns NULL and the caller walks the tree.
|
|
23
|
+
*/
|
|
24
|
+
typedef struct mkr_xml_name_index mkr_xml_name_index_t;
|
|
25
|
+
|
|
26
|
+
/* The document's element-name index, built and cached on first call. Returns
|
|
27
|
+
* NULL on OOM (caller falls back to a tree walk). */
|
|
28
|
+
mkr_xml_name_index_t *mkr_xml_name_index_get(mkr_xml_doc_t *doc);
|
|
29
|
+
|
|
30
|
+
/* Drop the cached index after a structural mutation (no-op when unbuilt). */
|
|
31
|
+
void mkr_xml_name_index_invalidate(mkr_xml_doc_t *doc);
|
|
32
|
+
|
|
33
|
+
/* Free an index directly (used by mkr_xml_doc_destroy via the invalidate hook). */
|
|
34
|
+
void mkr_xml_name_index_free(mkr_xml_name_index_t *idx);
|
|
35
|
+
|
|
36
|
+
/* The document-ordered elements with local name +local+ and namespace URI
|
|
37
|
+
* +ns_uri+ (ns_uri == NULL / ns_uri_len == 0 means the no-namespace bucket).
|
|
38
|
+
* Returns the borrowed bucket and sets *out_count, or NULL with *out_count 0. */
|
|
39
|
+
mkr_xml_node_t *const *mkr_xml_name_index_lookup(const mkr_xml_name_index_t *idx,
|
|
40
|
+
const char *local, size_t local_len,
|
|
41
|
+
const char *ns_uri, size_t ns_uri_len,
|
|
42
|
+
size_t *out_count);
|
|
43
|
+
|
|
44
|
+
#ifdef __cplusplus
|
|
45
|
+
}
|
|
46
|
+
#endif
|
|
47
|
+
|
|
48
|
+
#endif /* MKR_XML_INDEX_H */
|