makiri 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/release.yml +12 -7
- data/CHANGELOG.md +93 -14
- data/README.md +173 -7
- data/Rakefile +103 -7
- data/ext/makiri/bridge/bridge.h +28 -0
- data/ext/makiri/bridge/ruby_string.c +217 -0
- data/ext/makiri/core/mkr_alloc.h +1 -1
- data/ext/makiri/core/mkr_buf.c +35 -1
- data/ext/makiri/core/mkr_buf.h +37 -3
- data/ext/makiri/core/mkr_core.h +1 -1
- data/ext/makiri/core/mkr_hash.h +1 -1
- data/ext/makiri/core/mkr_text.h +8 -8
- data/ext/makiri/extconf.rb +20 -2
- data/ext/makiri/glue/glue.h +47 -11
- data/ext/makiri/glue/ruby_doc.c +117 -43
- data/ext/makiri/glue/ruby_html_css.c +246 -0
- data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +242 -51
- data/ext/makiri/glue/ruby_html_node.c +888 -0
- data/ext/makiri/glue/ruby_html_serialize.c +154 -0
- data/ext/makiri/glue/ruby_node.c +54 -748
- data/ext/makiri/glue/ruby_node_set.c +167 -32
- data/ext/makiri/glue/ruby_xml.c +420 -0
- data/ext/makiri/glue/ruby_xml_node.c +1386 -0
- data/ext/makiri/glue/ruby_xpath.c +59 -26
- data/ext/makiri/glue/ruby_xpath.h +19 -0
- data/ext/makiri/lexbor_compat/compat.h +42 -9
- data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
- data/ext/makiri/lexbor_compat/dom_index.c +2 -2
- data/ext/makiri/lexbor_compat/post_parse.c +100 -10
- data/ext/makiri/lexbor_compat/source_loc.c +13 -9
- data/ext/makiri/lexbor_compat/text_index.c +14 -8
- data/ext/makiri/lexbor_compat/utf8_input.c +85 -26
- data/ext/makiri/makiri.c +139 -6
- data/ext/makiri/makiri.h +43 -2
- data/ext/makiri/xml/mkr_xml.h +126 -0
- data/ext/makiri/xml/mkr_xml_chars.c +225 -0
- data/ext/makiri/xml/mkr_xml_mutate.c +875 -0
- data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
- data/ext/makiri/xml/mkr_xml_node.c +267 -0
- data/ext/makiri/xml/mkr_xml_node.h +119 -0
- data/ext/makiri/xml/mkr_xml_tree.c +1479 -0
- data/ext/makiri/xpath/mkr_xpath.c +59 -32
- data/ext/makiri/xpath/mkr_xpath.h +96 -4
- data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
- data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
- data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +202 -175
- data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +110 -86
- data/ext/makiri/xpath/mkr_xpath_internal.h +91 -200
- data/ext/makiri/xpath/mkr_xpath_lex.c +2 -2
- data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
- data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +142 -0
- data/ext/makiri/xpath/mkr_xpath_parse.c +5 -5
- data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
- data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
- data/ext/makiri/xpath/mkr_xpath_shared.c +593 -0
- data/ext/makiri/xpath/{mkr_xpath_value.c → mkr_xpath_value_body.h} +145 -656
- data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
- data/lib/makiri/{attribute.rb → attr.rb} +7 -3
- data/lib/makiri/cdata_section.rb +21 -0
- data/lib/makiri/comment.rb +12 -0
- data/lib/makiri/compat_aliases.rb +30 -0
- data/lib/makiri/document.rb +4 -76
- data/lib/makiri/document_fragment.rb +14 -9
- data/lib/makiri/element.rb +5 -3
- data/lib/makiri/html/document.rb +106 -0
- data/lib/makiri/html/node_methods.rb +19 -0
- data/lib/makiri/html.rb +12 -0
- data/lib/makiri/node.rb +58 -15
- data/lib/makiri/node_set.rb +8 -0
- data/lib/makiri/processing_instruction.rb +12 -0
- data/lib/makiri/text.rb +2 -0
- data/lib/makiri/version.rb +1 -1
- data/lib/makiri/xml/document.rb +24 -0
- data/lib/makiri/xml/node_methods.rb +37 -0
- data/lib/makiri/xml.rb +10 -0
- data/lib/makiri/xpath_context.rb +1 -1
- data/lib/makiri.rb +23 -5
- data/script/build_native_gem.rb +2 -2
- data/script/check_c_safety.rb +32 -0
- data/script/check_c_safety_allowlist.yml +83 -0
- metadata +35 -9
- data/ext/makiri/glue/ruby_css.c +0 -185
- data/ext/makiri/glue/ruby_serialize.c +0 -92
- data/lib/makiri/cdata.rb +0 -6
|
@@ -108,6 +108,223 @@ mkr_ruby_copy_bytes(VALUE in, mkr_owned_bytes_t *out)
|
|
|
108
108
|
return 0;
|
|
109
109
|
}
|
|
110
110
|
|
|
111
|
+
VALUE
|
|
112
|
+
mkr_ruby_to_utf8(VALUE str)
|
|
113
|
+
{
|
|
114
|
+
/* Honour the Ruby String's declared encoding so its content survives:
|
|
115
|
+
*
|
|
116
|
+
* - UTF-8 / US-ASCII / ASCII-8BIT (binary): returned unchanged. These are
|
|
117
|
+
* already UTF-8 bytes (or deliberately raw bytes), and the native parser
|
|
118
|
+
* does the WHATWG invalid-byte replacement for them. The UTF-8 common
|
|
119
|
+
* case costs only this encoding comparison - no transcode, no copy.
|
|
120
|
+
*
|
|
121
|
+
* - any other encoding (Shift_JIS, EUC-JP, ISO-8859-1, Windows-1252, ...):
|
|
122
|
+
* transcoded to UTF-8 with invalid/undef -> U+FFFD, so e.g. Shift_JIS
|
|
123
|
+
* text becomes the right UTF-8 characters instead of being read as raw
|
|
124
|
+
* UTF-8 bytes and mangled. Only non-UTF-8 input pays this. */
|
|
125
|
+
rb_encoding *enc = rb_enc_get(str);
|
|
126
|
+
if (enc == rb_utf8_encoding()
|
|
127
|
+
|| enc == rb_usascii_encoding()
|
|
128
|
+
|| enc == rb_ascii8bit_encoding()) {
|
|
129
|
+
return str;
|
|
130
|
+
}
|
|
131
|
+
return rb_str_encode(str, rb_enc_from_encoding(rb_utf8_encoding()),
|
|
132
|
+
ECONV_INVALID_REPLACE | ECONV_UNDEF_REPLACE, Qnil);
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
/* rb_str_encode with no replacement flags: an undefined conversion or invalid
|
|
136
|
+
* byte sequence RAISES (Encoding::UndefinedConversionError /
|
|
137
|
+
* Encoding::InvalidByteSequenceError) instead of substituting U+FFFD. Run under
|
|
138
|
+
* rb_protect so we can remap the Ruby Encoding error to Makiri::XML::SyntaxError. */
|
|
139
|
+
static VALUE
|
|
140
|
+
mkr_xml_strict_transcode_thunk(VALUE str)
|
|
141
|
+
{
|
|
142
|
+
return rb_str_encode(str, rb_enc_from_encoding(rb_utf8_encoding()), 0, Qnil);
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
/* --- XML 1.0 Appendix F: byte-encoding autodetection (BOM, then declaration) ---
|
|
146
|
+
*
|
|
147
|
+
* The leading byte-order mark, or NULL; *bom_len gets its length. UTF-32 BOMs are
|
|
148
|
+
* checked before the UTF-16 LE BOM they share a prefix with. */
|
|
149
|
+
static rb_encoding *
|
|
150
|
+
mkr_xml_bom_encoding(const unsigned char *p, long len, long *bom_len)
|
|
151
|
+
{
|
|
152
|
+
*bom_len = 0;
|
|
153
|
+
if (len >= 4 && p[0] == 0x00 && p[1] == 0x00 && p[2] == 0xFE && p[3] == 0xFF) {
|
|
154
|
+
*bom_len = 4; return rb_enc_find("UTF-32BE");
|
|
155
|
+
}
|
|
156
|
+
if (len >= 4 && p[0] == 0xFF && p[1] == 0xFE && p[2] == 0x00 && p[3] == 0x00) {
|
|
157
|
+
*bom_len = 4; return rb_enc_find("UTF-32LE");
|
|
158
|
+
}
|
|
159
|
+
if (len >= 2 && p[0] == 0xFE && p[1] == 0xFF) { *bom_len = 2; return rb_enc_find("UTF-16BE"); }
|
|
160
|
+
if (len >= 2 && p[0] == 0xFF && p[1] == 0xFE) { *bom_len = 2; return rb_enc_find("UTF-16LE"); }
|
|
161
|
+
if (len >= 3 && p[0] == 0xEF && p[1] == 0xBB && p[2] == 0xBF) { *bom_len = 3; return rb_utf8_encoding(); }
|
|
162
|
+
return NULL;
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
/* The encoding named in the '<?xml ... encoding="NAME" ?>' declaration, or NULL.
|
|
166
|
+
* The declaration is ASCII; for a UTF-16/32-detected document its bytes are
|
|
167
|
+
* stride-interleaved, so the ASCII column is extracted (per the BOM) before the
|
|
168
|
+
* scan, letting a BOM-vs-declaration conflict be caught even in UTF-16. */
|
|
169
|
+
static rb_encoding *
|
|
170
|
+
mkr_xml_decl_encoding(const unsigned char *p, long len, rb_encoding *bom)
|
|
171
|
+
{
|
|
172
|
+
long stride = 1, off = 0;
|
|
173
|
+
if (bom == rb_enc_find("UTF-16LE")) { stride = 2; off = 0; }
|
|
174
|
+
else if (bom == rb_enc_find("UTF-16BE")) { stride = 2; off = 1; }
|
|
175
|
+
else if (bom == rb_enc_find("UTF-32LE")) { stride = 4; off = 0; }
|
|
176
|
+
else if (bom == rb_enc_find("UTF-32BE")) { stride = 4; off = 3; }
|
|
177
|
+
|
|
178
|
+
char head[256];
|
|
179
|
+
long hn = 0;
|
|
180
|
+
for (long i = off; i < len && hn < (long)sizeof(head); i += stride) head[hn++] = (char)p[i];
|
|
181
|
+
|
|
182
|
+
long i = 0;
|
|
183
|
+
while (i < hn && (head[i] == ' ' || head[i] == '\t' || head[i] == '\r' || head[i] == '\n')) i++;
|
|
184
|
+
if (i + 5 > hn || memcmp(head + i, "<?xml", 5) != 0) return NULL;
|
|
185
|
+
i += 5;
|
|
186
|
+
/* find a whitespace-introduced "encoding" before the '?>' */
|
|
187
|
+
for (; i + 8 <= hn; i++) {
|
|
188
|
+
if (head[i] == '?' && i + 1 < hn && head[i + 1] == '>') return NULL; /* end of decl */
|
|
189
|
+
int ws_before = (head[i - 1] == ' ' || head[i - 1] == '\t' || head[i - 1] == '\r' || head[i - 1] == '\n');
|
|
190
|
+
if (!ws_before || memcmp(head + i, "encoding", 8) != 0) continue;
|
|
191
|
+
long j = i + 8;
|
|
192
|
+
while (j < hn && (head[j] == ' ' || head[j] == '\t' || head[j] == '\r' || head[j] == '\n')) j++;
|
|
193
|
+
if (j >= hn || head[j] != '=') return NULL;
|
|
194
|
+
j++;
|
|
195
|
+
while (j < hn && (head[j] == ' ' || head[j] == '\t' || head[j] == '\r' || head[j] == '\n')) j++;
|
|
196
|
+
if (j >= hn || (head[j] != '"' && head[j] != '\'')) return NULL;
|
|
197
|
+
char q = head[j++];
|
|
198
|
+
long ns = j;
|
|
199
|
+
while (j < hn && head[j] != q) j++;
|
|
200
|
+
if (j >= hn) return NULL;
|
|
201
|
+
char name[64];
|
|
202
|
+
long nl = j - ns;
|
|
203
|
+
if (nl <= 0 || nl >= (long)sizeof(name)) return NULL;
|
|
204
|
+
memcpy(name, head + ns, (size_t)nl);
|
|
205
|
+
name[nl] = '\0';
|
|
206
|
+
return rb_enc_find(name); /* NULL for an unknown encoding name */
|
|
207
|
+
}
|
|
208
|
+
return NULL;
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
/* Two encodings agree for conflict purposes when identical, or when either is
|
|
212
|
+
* US-ASCII (a subset of UTF-8 and the single-byte encodings). */
|
|
213
|
+
static int
|
|
214
|
+
mkr_xml_enc_compatible(rb_encoding *a, rb_encoding *b)
|
|
215
|
+
{
|
|
216
|
+
return a == b || a == rb_usascii_encoding() || b == rb_usascii_encoding();
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
VALUE
|
|
220
|
+
mkr_xml_decode_input(VALUE str, size_t max_bytes)
|
|
221
|
+
{
|
|
222
|
+
rb_encoding *tag = rb_enc_get(str);
|
|
223
|
+
const unsigned char *raw = (const unsigned char *)RSTRING_PTR(str);
|
|
224
|
+
long rawlen = RSTRING_LEN(str);
|
|
225
|
+
|
|
226
|
+
/* Detect the byte encoding (XML 1.0 Appendix F): a BOM wins, else the
|
|
227
|
+
* declaration. The Ruby String's encoding is authoritative when it is a
|
|
228
|
+
* concrete text encoding; a BOM/declaration that disagrees is a fatal
|
|
229
|
+
* conflict. ASCII-8BIT means "raw bytes, no claimed encoding", so there the
|
|
230
|
+
* detected encoding decodes the input (a UTF-16/Shift_JIS/BOM'd file read
|
|
231
|
+
* with File.binread now parses). */
|
|
232
|
+
long bom_len = 0;
|
|
233
|
+
rb_encoding *bom = mkr_xml_bom_encoding(raw, rawlen, &bom_len);
|
|
234
|
+
rb_encoding *decl = mkr_xml_decl_encoding(raw + bom_len, rawlen - bom_len, bom);
|
|
235
|
+
int is_binary = (tag == rb_ascii8bit_encoding());
|
|
236
|
+
|
|
237
|
+
if (bom && decl && !mkr_xml_enc_compatible(bom, decl)) {
|
|
238
|
+
rb_raise(mkr_eXmlSyntaxError,
|
|
239
|
+
"XML encoding conflict: the byte-order mark and the encoding declaration disagree");
|
|
240
|
+
}
|
|
241
|
+
if (!is_binary && bom && !mkr_xml_enc_compatible(bom, tag)) {
|
|
242
|
+
rb_raise(mkr_eXmlSyntaxError,
|
|
243
|
+
"XML encoding conflict: the byte-order mark disagrees with the string's encoding");
|
|
244
|
+
}
|
|
245
|
+
if (!is_binary && decl && !mkr_xml_enc_compatible(decl, tag)) {
|
|
246
|
+
/* A concrete String encoding is authoritative for decoding, so the
|
|
247
|
+
* declaration is not used to transcode - but a declaration that names a
|
|
248
|
+
* different encoding than the String is tagged with (e.g. a Shift_JIS
|
|
249
|
+
* String declaring encoding="UTF-8") is a self-inconsistent document and
|
|
250
|
+
* a fatal error, not a silently-ignored mismatch. */
|
|
251
|
+
rb_raise(mkr_eXmlSyntaxError,
|
|
252
|
+
"XML encoding conflict: the encoding declaration disagrees with the string's encoding");
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
rb_encoding *eff = is_binary ? (bom ? bom : (decl ? decl : rb_utf8_encoding())) : tag;
|
|
256
|
+
|
|
257
|
+
/* Decode to UTF-8 (strict). UTF-8 / US-ASCII / ASCII-8BIT are already UTF-8
|
|
258
|
+
* bytes (validated below); anything else is strict-transcoded, raising rather
|
|
259
|
+
* than substituting U+FFFD. */
|
|
260
|
+
VALUE s;
|
|
261
|
+
if (eff == rb_utf8_encoding() || eff == rb_usascii_encoding() || eff == rb_ascii8bit_encoding()) {
|
|
262
|
+
s = str;
|
|
263
|
+
} else {
|
|
264
|
+
VALUE in = str;
|
|
265
|
+
if (rb_enc_get(str) != eff) { in = rb_str_dup(str); rb_enc_associate(in, eff); }
|
|
266
|
+
int state = 0;
|
|
267
|
+
s = rb_protect(mkr_xml_strict_transcode_thunk, in, &state);
|
|
268
|
+
if (state != 0) {
|
|
269
|
+
VALUE exc = rb_errinfo();
|
|
270
|
+
rb_set_errinfo(Qnil);
|
|
271
|
+
char msg[256];
|
|
272
|
+
mkr_ruby_exception_message(exc, msg, sizeof msg);
|
|
273
|
+
rb_raise(mkr_eXmlSyntaxError,
|
|
274
|
+
"XML input could not be decoded to UTF-8: %s", msg);
|
|
275
|
+
}
|
|
276
|
+
RB_GC_GUARD(in);
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
const char *ptr = RSTRING_PTR(s);
|
|
280
|
+
long len = RSTRING_LEN(s);
|
|
281
|
+
/* §4.3.3: a leading BOM is the encoding signature, not document content -
|
|
282
|
+
* strip a U+FEFF (the transcode above turns any UTF-16/32 BOM into one). */
|
|
283
|
+
if (len >= 3 && (unsigned char)ptr[0] == 0xEF && (unsigned char)ptr[1] == 0xBB
|
|
284
|
+
&& (unsigned char)ptr[2] == 0xBF) {
|
|
285
|
+
ptr += 3; len -= 3;
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
/* Fail closed on an over-budget input BEFORE the validation copy and the
|
|
289
|
+
* caller's GVL-release copy (an input whose UTF-8 length exceeds the arena
|
|
290
|
+
* budget can never parse). max_bytes == 0 disables the check (__decode). */
|
|
291
|
+
if (max_bytes != 0 && (size_t)len > max_bytes) {
|
|
292
|
+
RB_GC_GUARD(s);
|
|
293
|
+
rb_raise(mkr_eXmlLimitExceeded, "XML input exceeds the byte budget");
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
/* Strict UTF-8 validation: an embedded NUL or any invalid UTF-8 is fatal
|
|
297
|
+
* (no U+FFFD repair - unlike the HTML mkr_utf8_sanitize path). */
|
|
298
|
+
if (len > 0 && memchr(ptr, '\0', (size_t)len) != NULL) {
|
|
299
|
+
rb_raise(mkr_eXmlSyntaxError, "XML input must not contain a NUL byte");
|
|
300
|
+
}
|
|
301
|
+
VALUE u = rb_enc_str_new(ptr, len, rb_utf8_encoding());
|
|
302
|
+
if (rb_enc_str_coderange(u) == ENC_CODERANGE_BROKEN) {
|
|
303
|
+
rb_raise(mkr_eXmlSyntaxError, "XML input must be valid UTF-8");
|
|
304
|
+
}
|
|
305
|
+
RB_GC_GUARD(s);
|
|
306
|
+
return u; /* validated, UTF-8-tagged, BOM-stripped */
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
bool
|
|
310
|
+
mkr_ruby_str_known_valid_utf8(VALUE str)
|
|
311
|
+
{
|
|
312
|
+
if (!RB_TYPE_P(str, T_STRING)) {
|
|
313
|
+
return false;
|
|
314
|
+
}
|
|
315
|
+
/* ENC_CODERANGE reads the *cached* classification from the object's flags;
|
|
316
|
+
* it does NOT scan (rb_enc_str_coderange would, costing as much as our own
|
|
317
|
+
* validator). So this only wins when Ruby already knows the answer. */
|
|
318
|
+
int cr = ENC_CODERANGE(str);
|
|
319
|
+
if (cr == ENC_CODERANGE_7BIT) {
|
|
320
|
+
return true; /* all bytes < 0x80 in an ASCII-compatible encoding */
|
|
321
|
+
}
|
|
322
|
+
if (cr == ENC_CODERANGE_VALID) {
|
|
323
|
+
return rb_enc_get(str) == rb_utf8_encoding(); /* valid AND UTF-8 */
|
|
324
|
+
}
|
|
325
|
+
return false; /* UNKNOWN or BROKEN: let mkr_utf8_sanitize handle it */
|
|
326
|
+
}
|
|
327
|
+
|
|
111
328
|
const char *
|
|
112
329
|
mkr_ruby_try_verified_text(VALUE sv, size_t max_bytes, mkr_ruby_borrowed_text_t *out)
|
|
113
330
|
{
|
data/ext/makiri/core/mkr_alloc.h
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
* allocators, the foundation every other C layer (glue, xpath engine,
|
|
7
7
|
* lexbor_compat) builds on, so the ad-hoc `cap *= 2` / `n + 1` /
|
|
8
8
|
* `malloc(n * sizeof(T))` patterns are written once, here, and fail closed.
|
|
9
|
-
* NOTHING in this header touches Ruby
|
|
9
|
+
* NOTHING in this header touches Ruby - exception mapping happens at the glue
|
|
10
10
|
* boundary. (mkr_core.h is a thin umbrella over this + the other core headers.)
|
|
11
11
|
*/
|
|
12
12
|
|
data/ext/makiri/core/mkr_buf.c
CHANGED
|
@@ -13,7 +13,12 @@ mkr_buf_append(mkr_buf_t *b, const void *bytes, size_t n)
|
|
|
13
13
|
if (!mkr_size_add(b->len, n, &need)) {
|
|
14
14
|
return MKR_ERR_OOM;
|
|
15
15
|
}
|
|
16
|
-
|
|
16
|
+
/* max == 0 is NOT unbounded: it falls back to the conservative default
|
|
17
|
+
* ceiling, so a caller that never set a cap still fails closed. Either way the
|
|
18
|
+
* absolute hard ceiling clamps it, so no buffer can exhaust memory. */
|
|
19
|
+
size_t soft = (b->max != 0) ? b->max : MKR_BUF_DEFAULT_LIMIT;
|
|
20
|
+
size_t limit = (soft < MKR_BUF_HARD_MAX) ? soft : MKR_BUF_HARD_MAX;
|
|
21
|
+
if (need > limit) {
|
|
17
22
|
return MKR_ERR_LIMIT;
|
|
18
23
|
}
|
|
19
24
|
size_t need_term; /* room for the NUL terminator too */
|
|
@@ -38,6 +43,35 @@ mkr_buf_append(mkr_buf_t *b, const void *bytes, size_t n)
|
|
|
38
43
|
return MKR_OK;
|
|
39
44
|
}
|
|
40
45
|
|
|
46
|
+
mkr_status_t
|
|
47
|
+
mkr_buf_reserve(mkr_buf_t *b, size_t n)
|
|
48
|
+
{
|
|
49
|
+
/* Pre-allocate capacity for n bytes so a known-size fill does not realloc on
|
|
50
|
+
* every geometric step (the serializer reserves ~the output size up front).
|
|
51
|
+
* Best-effort: never grow past the buffer's own cap, and a later append still
|
|
52
|
+
* fails closed if the real output exceeds it. */
|
|
53
|
+
size_t soft = (b->max != 0) ? b->max : MKR_BUF_DEFAULT_LIMIT;
|
|
54
|
+
size_t limit = (soft < MKR_BUF_HARD_MAX) ? soft : MKR_BUF_HARD_MAX;
|
|
55
|
+
if (n > limit) {
|
|
56
|
+
n = limit;
|
|
57
|
+
}
|
|
58
|
+
size_t need_term; /* room for the NUL terminator too */
|
|
59
|
+
if (!mkr_size_add(n, 1, &need_term)) {
|
|
60
|
+
return MKR_ERR_OOM;
|
|
61
|
+
}
|
|
62
|
+
if (need_term <= b->cap) {
|
|
63
|
+
return MKR_OK; /* already have room */
|
|
64
|
+
}
|
|
65
|
+
char *p = realloc(b->data, need_term);
|
|
66
|
+
if (p == NULL) {
|
|
67
|
+
return MKR_ERR_OOM;
|
|
68
|
+
}
|
|
69
|
+
b->data = p;
|
|
70
|
+
b->cap = need_term;
|
|
71
|
+
b->data[b->len] = '\0'; /* keep NUL-terminated */
|
|
72
|
+
return MKR_OK;
|
|
73
|
+
}
|
|
74
|
+
|
|
41
75
|
char *
|
|
42
76
|
mkr_buf_steal(mkr_buf_t *b, size_t *out_len)
|
|
43
77
|
{
|
data/ext/makiri/core/mkr_buf.h
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
#define MAKIRI_CORE_MKR_BUF_H
|
|
3
3
|
|
|
4
4
|
/*
|
|
5
|
-
* mkr_buf_t
|
|
5
|
+
* mkr_buf_t - an owned, growable, optionally capped byte buffer, kept
|
|
6
6
|
* NUL-terminated. Built on the fail-closed allocators in mkr_alloc.h.
|
|
7
7
|
* (mkr_core.h is a thin umbrella over mkr_alloc.h + mkr_text.h + this.)
|
|
8
8
|
*/
|
|
@@ -13,14 +13,42 @@
|
|
|
13
13
|
extern "C" {
|
|
14
14
|
#endif
|
|
15
15
|
|
|
16
|
+
/* Memory safety for buffers lives HERE, at the one buffer primitive, not at each
|
|
17
|
+
* call site: "max == 0" can no longer mean "unbounded". Two ceilings bound every
|
|
18
|
+
* mkr_buf so a runaway - a cycle, an unbounded loop, or a caller that forgot to
|
|
19
|
+
* pass a cap - fails closed with MKR_ERR_LIMIT instead of exhausting memory and
|
|
20
|
+
* freezing the machine:
|
|
21
|
+
*
|
|
22
|
+
* MKR_BUF_DEFAULT_LIMIT the cap applied when the caller passes max == 0. A
|
|
23
|
+
* conservative default (100 MiB): code that did not
|
|
24
|
+
* think about a bound gets a tight one for free, and a
|
|
25
|
+
* buffer that genuinely needs to be large must opt in
|
|
26
|
+
* EXPLICITLY by passing a larger max.
|
|
27
|
+
* MKR_BUF_HARD_MAX an absolute ceiling no buffer may exceed, even one
|
|
28
|
+
* with an explicit max - the last-resort backstop.
|
|
29
|
+
* Tight, content-scaled bounds still belong to the
|
|
30
|
+
* caller (e.g. the XML serializer caps itself at a
|
|
31
|
+
* multiple of arena_bytes); this stops total runaway.
|
|
32
|
+
*
|
|
33
|
+
* Override either at build time: -DMKR_BUF_DEFAULT_LIMIT=<bytes> / -DMKR_BUF_HARD_MAX=<bytes>. */
|
|
34
|
+
/* Ceiling applied to a buffer whose max is 0 (the caller gave no bound). */
#ifndef MKR_BUF_DEFAULT_LIMIT
#define MKR_BUF_DEFAULT_LIMIT ((size_t)100 << 20)   /* 100 MiB */
#endif
/* Absolute ceiling for every buffer: 4 GiB where size_t can represent it.
 * With a 32-bit size_t, (size_t)4 << 30 wraps to 0, which would clamp every
 * buffer's limit to 0 and make any non-empty append fail with MKR_ERR_LIMIT;
 * on such targets fall back to the largest size_t value instead. */
#ifndef MKR_BUF_HARD_MAX
#define MKR_BUF_HARD_MAX \
    ((sizeof(size_t) > 4) ? ((size_t)4 << 30) : (size_t)-1)
#endif
|
|
40
|
+
|
|
16
41
|
typedef struct {
|
|
17
42
|
char *data; /* owned; kept NUL-terminated after any append */
|
|
18
43
|
size_t len; /* bytes used (excluding the terminator) */
|
|
19
44
|
size_t cap; /* bytes allocated */
|
|
20
|
-
size_t max; /* 0 =
|
|
45
|
+
size_t max; /* 0 = the conservative MKR_BUF_DEFAULT_LIMIT; else this value -
|
|
46
|
+
* either way clamped by MKR_BUF_HARD_MAX (past it -> ERR_LIMIT) */
|
|
21
47
|
} mkr_buf_t;
|
|
22
48
|
|
|
23
|
-
/* Initialise an empty buffer. max == 0
|
|
49
|
+
/* Initialise an empty buffer. max == 0 applies the conservative default ceiling
|
|
50
|
+
* (MKR_BUF_DEFAULT_LIMIT) - it is NOT unbounded; pass an explicit (larger or
|
|
51
|
+
* smaller) value to opt into a different bound, always under MKR_BUF_HARD_MAX. */
|
|
24
52
|
static inline void
|
|
25
53
|
mkr_buf_init(mkr_buf_t *b, size_t max)
|
|
26
54
|
{
|
|
@@ -35,6 +63,12 @@ mkr_buf_init(mkr_buf_t *b, size_t max)
|
|
|
35
63
|
* failure (the buffer is left intact in every failure case). n == 0 is a no-op. */
|
|
36
64
|
mkr_status_t mkr_buf_append(mkr_buf_t *b, const void *bytes, size_t n);
|
|
37
65
|
|
|
66
|
+
/* Pre-allocate capacity for at least n bytes (best-effort, clamped to the
|
|
67
|
+
* buffer's cap), so a fill of known approximate size avoids per-append reallocs.
|
|
68
|
+
* A no-op if the buffer already has room. Returns MKR_ERR_OOM on overflow /
|
|
69
|
+
* allocation failure (the buffer is left intact). */
|
|
70
|
+
mkr_status_t mkr_buf_reserve(mkr_buf_t *b, size_t n);
|
|
71
|
+
|
|
38
72
|
/* Take ownership of the (NUL-terminated) bytes; the buffer is reset to empty.
|
|
39
73
|
* Returns a freshly owned "" for an empty buffer, or NULL on OOM. */
|
|
40
74
|
char *mkr_buf_steal(mkr_buf_t *b, size_t *out_len);
|
data/ext/makiri/core/mkr_core.h
CHANGED
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* mkr_text.h string-type lattice (owned/borrowed/verified text + bytes)
|
|
11
11
|
* mkr_buf.h mkr_buf_t (growable, capped byte buffer)
|
|
12
12
|
*
|
|
13
|
-
* NOTHING here touches Ruby
|
|
13
|
+
* NOTHING here touches Ruby - exception mapping happens at the glue boundary.
|
|
14
14
|
*/
|
|
15
15
|
|
|
16
16
|
#include "mkr_alloc.h"
|
data/ext/makiri/core/mkr_hash.h
CHANGED
|
@@ -35,7 +35,7 @@ mkr_ptr_hash(const void *p)
|
|
|
35
35
|
|
|
36
36
|
/* Smallest power of two >= n, into *out. Returns false on overflow (no power of
|
|
37
37
|
* two >= n fits in size_t) so the caller fails closed rather than sizing a
|
|
38
|
-
* power-of-two hash table below the element count it must hold
|
|
38
|
+
* power-of-two hash table below the element count it must hold - which would
|
|
39
39
|
* never find a free slot under linear probing. Shared by the pointer-keyed
|
|
40
40
|
* indexes (attr->owner, text-index). */
|
|
41
41
|
static inline bool
|
data/ext/makiri/core/mkr_text.h
CHANGED
|
@@ -16,7 +16,7 @@ extern "C" {
|
|
|
16
16
|
#endif
|
|
17
17
|
|
|
18
18
|
/* ---------------------------------------------------------------- */
|
|
19
|
-
/* mkr_verified_text_t
|
|
19
|
+
/* mkr_verified_text_t - a string proven to meet the engine text contract */
|
|
20
20
|
/* ---------------------------------------------------------------- */
|
|
21
21
|
|
|
22
22
|
/* A borrowed byte slice whose contents are guaranteed to satisfy Makiri's
|
|
@@ -39,7 +39,7 @@ typedef struct {
|
|
|
39
39
|
/*
|
|
40
40
|
* Makiri's string types form a small lattice over two axes plus a shape marker.
|
|
41
41
|
* They look alike ({ptr,len}) but C has no subtyping, so each contract is its
|
|
42
|
-
* own type
|
|
42
|
+
* own type - that distinctness IS the guarantee, and is why there is no single
|
|
43
43
|
* "string" type.
|
|
44
44
|
*
|
|
45
45
|
* axis 1 ownership : borrowed (we never free) | owned (free via *_clear)
|
|
@@ -51,7 +51,7 @@ typedef struct {
|
|
|
51
51
|
* shape \ contract raw (bytes) valid (text)
|
|
52
52
|
* ---------------------- ------------------------ -------------------------
|
|
53
53
|
* ruby-anchored borrowed mkr_ruby_borrowed_bytes_t mkr_ruby_borrowed_text_t (bridge.h)
|
|
54
|
-
* borrowed slice (none yet
|
|
54
|
+
* borrowed slice (none yet - would be mkr_borrowed_text_t /
|
|
55
55
|
* mkr_borrowed_bytes_t) mkr_verified_text_t (*)
|
|
56
56
|
* owned mkr_owned_bytes_t mkr_owned_text_t
|
|
57
57
|
*
|
|
@@ -65,22 +65,22 @@ typedef struct {
|
|
|
65
65
|
* cannot reach the engine's public API. Internally the engine carries the
|
|
66
66
|
* freely-constructible mkr_borrowed_text_t instead.
|
|
67
67
|
*
|
|
68
|
-
* Conversions
|
|
68
|
+
* Conversions - the only sanctioned edges. The points that actually VALIDATE
|
|
69
69
|
* raw bytes are the bridge's checked entry points; everything else only moves
|
|
70
70
|
* already-valid text between shapes (no edge re-validates, and none turns raw
|
|
71
71
|
* bytes into text without one of those checks):
|
|
72
|
-
* validate raw -> valid : the bridge's checked entry points only
|
|
72
|
+
* validate raw -> valid : the bridge's checked entry points only -
|
|
73
73
|
* mkr_ruby_verified_text / mkr_ruby_try_verified_text
|
|
74
74
|
* (both validate UTF-8 + no NUL); never a cast.
|
|
75
75
|
* drop the GC anchor : mkr_verified_text_from_view (ruby_borrowed_text -> verified_text)
|
|
76
76
|
* assert valid (no copy) : mkr_borrowed_text (const char*,len -> borrowed_text)
|
|
77
|
-
*
|
|
77
|
+
* - caller asserts the bytes already meet the contract
|
|
78
78
|
* downgrade to borrow : mkr_borrowed_text_from_owned (owned_text -> borrowed_text)
|
|
79
79
|
* mkr_borrowed_text_from_verified (verified_text -> borrowed_text)
|
|
80
80
|
* copy into owned : mkr_owned_text_from_borrowed_copy /
|
|
81
|
-
* mkr_owned_text_from_buf_steal
|
|
81
|
+
* mkr_owned_text_from_buf_steal - accept only
|
|
82
82
|
* already-asserted-valid text; they copy, not validate.
|
|
83
|
-
* take ownership : mkr_owned_text (char*,len -> owned_text)
|
|
83
|
+
* take ownership : mkr_owned_text (char*,len -> owned_text) - caller
|
|
84
84
|
* transfers an already-valid heap buffer it produced
|
|
85
85
|
* (substring/concat/format output); asserts validity.
|
|
86
86
|
*/
|
data/ext/makiri/extconf.rb
CHANGED
|
@@ -12,7 +12,7 @@ require "etc"
|
|
|
12
12
|
# 1. Build vendored Lexbor (unpatched) via cmake into vendor/lexbor/build,
|
|
13
13
|
# install headers + a static archive into vendor/lexbor/dist.
|
|
14
14
|
# 2. Compile ext/makiri/**/*.c with rake-compiler, linking against the
|
|
15
|
-
# static Lexbor archive only
|
|
15
|
+
# static Lexbor archive only - no system libxml2/libxslt.
|
|
16
16
|
#
|
|
17
17
|
# Security note: the C extension is built with -D_FORTIFY_SOURCE=2,
|
|
18
18
|
# -fstack-protector-strong, and -Wformat -Wformat-security. -O2 is kept
|
|
@@ -60,7 +60,7 @@ $LDFLAGS << " #{lexbor_archive.shellescape}"
|
|
|
60
60
|
# Sanitizer build (opt-in): MAKIRI_SANITIZE=address,undefined rake clean compile
|
|
61
61
|
# Then run the suite under the runtime via `rake sanitize` (which preloads the
|
|
62
62
|
# ASan runtime). Sanitizers replace the heap allocator, so even the vendored
|
|
63
|
-
# (uninstrumented) Lexbor's allocations get red-zoned
|
|
63
|
+
# (uninstrumented) Lexbor's allocations get red-zoned - heap overflows on
|
|
64
64
|
# Lexbor-owned buffers are still caught. _FORTIFY_SOURCE is dropped here because
|
|
65
65
|
# it conflicts with the sanitizer interceptors.
|
|
66
66
|
sanitize = ENV["MAKIRI_SANITIZE"].to_s.strip
|
|
@@ -115,6 +115,24 @@ elsif RbConfig::CONFIG["target_os"] =~ /linux/
|
|
|
115
115
|
$LIBRUBYARG_STATIC = ""
|
|
116
116
|
end
|
|
117
117
|
|
|
118
|
+
# Export ONLY Init_makiri from the compiled extension. `-fvisibility=hidden`
|
|
119
|
+
# above hides our own sources' symbols, but the vendored Lexbor static library
|
|
120
|
+
# is built (by Lexbor's own CMake) with default visibility, so without this the
|
|
121
|
+
# linker re-exports ~1700 `lxb_*` / `lexbor_*` symbols into the bundle's dynamic
|
|
122
|
+
# table. Another Lexbor-based extension loaded in the same process (e.g.
|
|
123
|
+
# nokolexbor) would then resolve its own `lxb_*` calls to OUR copy - a different
|
|
124
|
+
# Lexbor version with an incompatible ABI - and segfault. Restricting the export
|
|
125
|
+
# list to Init_makiri keeps Makiri's Lexbor entirely private (Ruby only needs
|
|
126
|
+
# Init_makiri, found via dlsym at require time).
|
|
127
|
+
# Restrict the extension's dynamic symbol table, per target platform.
case RbConfig::CONFIG["target_os"]
when /darwin/
  # Export exactly one symbol from the bundle.
  $DLDFLAGS << " -Wl,-exported_symbol,_Init_makiri"
when /linux/
  # Hide every symbol pulled in from static archives (the Lexbor .a); our own
  # are already hidden by -fvisibility=hidden, leaving just RUBY_FUNC_EXPORTED
  # Init_makiri in the dynamic symbol table.
  $DLDFLAGS << " -Wl,--exclude-libs,ALL"
end
|
|
135
|
+
|
|
118
136
|
# Recursively pick up C sources under ext/makiri/.
|
|
119
137
|
$srcs = Dir.glob(File.join(EXT_DIR, "**", "*.c")).map { |f| f.sub("#{EXT_DIR}/", "") }
|
|
120
138
|
$VPATH ||= []
|
data/ext/makiri/glue/glue.h
CHANGED
|
@@ -8,11 +8,24 @@
|
|
|
8
8
|
extern "C" {
|
|
9
9
|
#endif
|
|
10
10
|
|
|
11
|
+
/* A DOM node pointer of UNKNOWN representation - an HTML lxb_dom_node_t or an XML
|
|
12
|
+
* mkr_xml_node_t - as stored in a node wrapper or a NodeSet. It is an INCOMPLETE
|
|
13
|
+
* type on purpose: it cannot be dereferenced, and (unlike void*) it does not
|
|
14
|
+
* implicitly convert to a typed pointer, so reading a stored node AS a specific
|
|
15
|
+
* representation requires an explicit cast that the kind-checked accessors
|
|
16
|
+
* (mkr_html_node_unwrap / mkr_xml_node_unwrap) justify by the wrapper's TypedData type
|
|
17
|
+
* (or, for a NodeSet, by doc_is_xml). The stored pointer is only ever
|
|
18
|
+
* pointer-compared or cast through one of those accessors. */
|
|
19
|
+
typedef struct mkr_raw_node mkr_raw_node_t;
|
|
20
|
+
|
|
11
21
|
/* Wrapper for any DOM node except Document. The node memory is owned by the
|
|
12
|
-
* document's Lexbor arena; we keep only the
|
|
13
|
-
* reference to the Ruby Document so the arena
|
|
22
|
+
* document's arena (an HTML Lexbor arena or the XML node arena); we keep only the
|
|
23
|
+
* pointer plus a keepalive VALUE reference to the Ruby Document so the arena
|
|
24
|
+
* outlives the wrapper. The pointer is representation-opaque (mkr_raw_node_t):
|
|
25
|
+
* read it only through mkr_html_node_unwrap / mkr_xml_node_unwrap, which check the
|
|
26
|
+
* wrapper's representation (distinct TypedData types) before casting. */
|
|
14
27
|
typedef struct {
|
|
15
|
-
|
|
28
|
+
mkr_raw_node_t *node;
|
|
16
29
|
VALUE document;
|
|
17
30
|
} mkr_node_data_t;
|
|
18
31
|
|
|
@@ -31,14 +44,35 @@ extern const rb_data_type_t mkr_node_type;
|
|
|
31
44
|
extern const rb_data_type_t mkr_doc_type;
|
|
32
45
|
extern const rb_data_type_t mkr_node_set_type;
|
|
33
46
|
|
|
34
|
-
/* Node bridge (glue/ruby_node.c).
|
|
47
|
+
/* Node bridge (glue/ruby_node.c). mkr_wrap_html_node returns the Document VALUE
|
|
35
48
|
* for the document node, Qnil for NULL, otherwise a freshly-wrapped Node. */
|
|
36
|
-
VALUE
|
|
37
|
-
lxb_dom_node_t *mkr_node_unwrap(VALUE rb_node);
|
|
49
|
+
VALUE mkr_wrap_html_node(lxb_dom_node_t *node, VALUE document);
|
|
38
50
|
VALUE mkr_node_document(VALUE rb_node);
|
|
39
51
|
|
|
52
|
+
/* HTML and XML nodes are wrapped under DISTINCT TypedData types (both deriving
|
|
53
|
+
* from the shared base mkr_node_type), so a representation-specific accessor
|
|
54
|
+
* rejects the wrong kind via Ruby's type machinery. See ruby_node.c.
|
|
55
|
+
* mkr_html_node_unwrap -> lxb_dom_node_t* ; raises on an XML node/Document.
|
|
56
|
+
* mkr_xml_node_unwrap-> mkr_xml_node_t* ; raises on an HTML node/Document (ruby_xml_node.c).
|
|
57
|
+
* mkr_node_raw -> void* ; kind-agnostic raw pointer for identity, or for a
|
|
58
|
+
* site where the kind is already guaranteed. Deref needs an
|
|
59
|
+
* explicit cast - never treat it as a typed pointer blindly.
|
|
60
|
+
* mkr_node_id -> uintptr_t ; node identity for ==/eql?/hash/pointer_id. */
|
|
61
|
+
extern const rb_data_type_t mkr_html_node_type;
|
|
62
|
+
extern const rb_data_type_t mkr_xml_node_type;
|
|
63
|
+
lxb_dom_node_t *mkr_html_node_unwrap(VALUE rb_node);
|
|
64
|
+
void *mkr_node_raw(VALUE rb_node);
|
|
65
|
+
uintptr_t mkr_node_id(VALUE rb_node);
|
|
66
|
+
|
|
67
|
+
/* XML node bridge (glue/ruby_xml_node.c): wrap a custom XML node into the right
|
|
68
|
+
* Makiri::XML::* leaf (Qnil for NULL, the Document VALUE for the document node). */
|
|
69
|
+
struct mkr_xml_node;
|
|
70
|
+
VALUE mkr_wrap_xml_node(struct mkr_xml_node *node, VALUE document);
|
|
71
|
+
/* XML node-pointer accessor; raises TypeError on an HTML node/Document. */
|
|
72
|
+
struct mkr_xml_node *mkr_xml_node_unwrap(VALUE rb_node);
|
|
73
|
+
|
|
40
74
|
/* Document bridge (glue/ruby_doc.c). */
|
|
41
|
-
lxb_dom_document_t *
|
|
75
|
+
lxb_dom_document_t *mkr_html_doc_unwrap(VALUE rb_doc);
|
|
42
76
|
mkr_parsed_t *mkr_doc_parsed(VALUE rb_doc);
|
|
43
77
|
VALUE mkr_wrap_document(mkr_parsed_t *parsed); /* GC takes ownership */
|
|
44
78
|
|
|
@@ -46,7 +80,7 @@ VALUE mkr_wrap_document(mkr_parsed_t *parsed); /* GC takes ownersh
|
|
|
46
80
|
* inner_html=/outer_html= so the UTF-8 sanitisation and import+template-fixup
|
|
47
81
|
* are not duplicated.
|
|
48
82
|
*
|
|
49
|
-
* mkr_sanitize_html_input: decode rb_html for the fragment parser
|
|
83
|
+
* mkr_sanitize_html_input: decode rb_html for the fragment parser - *out / *out_len
|
|
50
84
|
* are the bytes to parse, *owned a malloc'd buffer to free afterwards (NULL when
|
|
51
85
|
* the input is used in place). Returns 0, or -1 on OOM (nothing allocated), so
|
|
52
86
|
* the caller can release its parser before raising. See mkr_utf8_sanitize.
|
|
@@ -54,7 +88,7 @@ VALUE mkr_wrap_document(mkr_parsed_t *parsed); /* GC takes ownersh
|
|
|
54
88
|
* mkr_import_fragment_children: deep-import each child of `root` into `doc`, hand
|
|
55
89
|
* it to `emit`, and fix up any <template> contents (which import_node omits).
|
|
56
90
|
*
|
|
57
|
-
* mkr_emit_append / mkr_emit_before: emit callbacks
|
|
91
|
+
* mkr_emit_append / mkr_emit_before: emit callbacks - append as last child of
|
|
58
92
|
* `u`, or insert before the reference node `u`. */
|
|
59
93
|
int mkr_sanitize_html_input(VALUE html, const lxb_char_t **out, size_t *out_len,
|
|
60
94
|
lxb_char_t **owned);
|
|
@@ -69,9 +103,11 @@ void mkr_emit_before(lxb_dom_node_t *imported, void *u);
|
|
|
69
103
|
* mkr_init_node. */
|
|
70
104
|
VALUE mkr_node_clone_node(int argc, VALUE *argv, VALUE self);
|
|
71
105
|
|
|
72
|
-
/* NodeSet bridge (glue/ruby_node_set.c).
|
|
106
|
+
/* NodeSet bridge (glue/ruby_node_set.c). mkr_raw_node_t (above): callers cast
|
|
107
|
+
* their typed node to it when pushing (forgetting the type is the safe, store
|
|
108
|
+
* direction); the single typed read-back lives in mkr_node_set_wrap. */
|
|
73
109
|
VALUE mkr_node_set_new(VALUE document);
|
|
74
|
-
void mkr_node_set_push(VALUE rb_set,
|
|
110
|
+
void mkr_node_set_push(VALUE rb_set, mkr_raw_node_t *node);
|
|
75
111
|
|
|
76
112
|
#ifdef __cplusplus
|
|
77
113
|
}
|