makiri 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/release.yml +12 -7
  3. data/CHANGELOG.md +93 -14
  4. data/README.md +173 -7
  5. data/Rakefile +103 -7
  6. data/ext/makiri/bridge/bridge.h +28 -0
  7. data/ext/makiri/bridge/ruby_string.c +217 -0
  8. data/ext/makiri/core/mkr_alloc.h +1 -1
  9. data/ext/makiri/core/mkr_buf.c +35 -1
  10. data/ext/makiri/core/mkr_buf.h +37 -3
  11. data/ext/makiri/core/mkr_core.h +1 -1
  12. data/ext/makiri/core/mkr_hash.h +1 -1
  13. data/ext/makiri/core/mkr_text.h +8 -8
  14. data/ext/makiri/extconf.rb +20 -2
  15. data/ext/makiri/glue/glue.h +47 -11
  16. data/ext/makiri/glue/ruby_doc.c +117 -43
  17. data/ext/makiri/glue/ruby_html_css.c +246 -0
  18. data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +242 -51
  19. data/ext/makiri/glue/ruby_html_node.c +888 -0
  20. data/ext/makiri/glue/ruby_html_serialize.c +154 -0
  21. data/ext/makiri/glue/ruby_node.c +54 -748
  22. data/ext/makiri/glue/ruby_node_set.c +167 -32
  23. data/ext/makiri/glue/ruby_xml.c +420 -0
  24. data/ext/makiri/glue/ruby_xml_node.c +1386 -0
  25. data/ext/makiri/glue/ruby_xpath.c +59 -26
  26. data/ext/makiri/glue/ruby_xpath.h +19 -0
  27. data/ext/makiri/lexbor_compat/compat.h +42 -9
  28. data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
  29. data/ext/makiri/lexbor_compat/dom_index.c +2 -2
  30. data/ext/makiri/lexbor_compat/post_parse.c +100 -10
  31. data/ext/makiri/lexbor_compat/source_loc.c +13 -9
  32. data/ext/makiri/lexbor_compat/text_index.c +14 -8
  33. data/ext/makiri/lexbor_compat/utf8_input.c +85 -26
  34. data/ext/makiri/makiri.c +139 -6
  35. data/ext/makiri/makiri.h +43 -2
  36. data/ext/makiri/xml/mkr_xml.h +126 -0
  37. data/ext/makiri/xml/mkr_xml_chars.c +225 -0
  38. data/ext/makiri/xml/mkr_xml_mutate.c +875 -0
  39. data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
  40. data/ext/makiri/xml/mkr_xml_node.c +267 -0
  41. data/ext/makiri/xml/mkr_xml_node.h +119 -0
  42. data/ext/makiri/xml/mkr_xml_tree.c +1479 -0
  43. data/ext/makiri/xpath/mkr_xpath.c +59 -32
  44. data/ext/makiri/xpath/mkr_xpath.h +96 -4
  45. data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
  46. data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
  47. data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +202 -175
  48. data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +110 -86
  49. data/ext/makiri/xpath/mkr_xpath_internal.h +91 -200
  50. data/ext/makiri/xpath/mkr_xpath_lex.c +2 -2
  51. data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
  52. data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +142 -0
  53. data/ext/makiri/xpath/mkr_xpath_parse.c +5 -5
  54. data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
  55. data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
  56. data/ext/makiri/xpath/mkr_xpath_shared.c +593 -0
  57. data/ext/makiri/xpath/{mkr_xpath_value.c → mkr_xpath_value_body.h} +145 -656
  58. data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
  59. data/lib/makiri/{attribute.rb → attr.rb} +7 -3
  60. data/lib/makiri/cdata_section.rb +21 -0
  61. data/lib/makiri/comment.rb +12 -0
  62. data/lib/makiri/compat_aliases.rb +30 -0
  63. data/lib/makiri/document.rb +4 -76
  64. data/lib/makiri/document_fragment.rb +14 -9
  65. data/lib/makiri/element.rb +5 -3
  66. data/lib/makiri/html/document.rb +106 -0
  67. data/lib/makiri/html/node_methods.rb +19 -0
  68. data/lib/makiri/html.rb +12 -0
  69. data/lib/makiri/node.rb +58 -15
  70. data/lib/makiri/node_set.rb +8 -0
  71. data/lib/makiri/processing_instruction.rb +12 -0
  72. data/lib/makiri/text.rb +2 -0
  73. data/lib/makiri/version.rb +1 -1
  74. data/lib/makiri/xml/document.rb +24 -0
  75. data/lib/makiri/xml/node_methods.rb +37 -0
  76. data/lib/makiri/xml.rb +10 -0
  77. data/lib/makiri/xpath_context.rb +1 -1
  78. data/lib/makiri.rb +23 -5
  79. data/script/build_native_gem.rb +2 -2
  80. data/script/check_c_safety.rb +32 -0
  81. data/script/check_c_safety_allowlist.yml +83 -0
  82. metadata +35 -9
  83. data/ext/makiri/glue/ruby_css.c +0 -185
  84. data/ext/makiri/glue/ruby_serialize.c +0 -92
  85. data/lib/makiri/cdata.rb +0 -6
@@ -108,6 +108,223 @@ mkr_ruby_copy_bytes(VALUE in, mkr_owned_bytes_t *out)
108
108
  return 0;
109
109
  }
110
110
 
111
+ VALUE
112
+ mkr_ruby_to_utf8(VALUE str)
113
+ {
114
+ /* Honour the Ruby String's declared encoding so its content survives:
115
+ *
116
+ * - UTF-8 / US-ASCII / ASCII-8BIT (binary): returned unchanged. These are
117
+ * already UTF-8 bytes (or deliberately raw bytes), and the native parser
118
+ * does the WHATWG invalid-byte replacement for them. The UTF-8 common
119
+ * case costs only this encoding comparison - no transcode, no copy.
120
+ *
121
+ * - any other encoding (Shift_JIS, EUC-JP, ISO-8859-1, Windows-1252, ...):
122
+ * transcoded to UTF-8 with invalid/undef -> U+FFFD, so e.g. Shift_JIS
123
+ * text becomes the right UTF-8 characters instead of being read as raw
124
+ * UTF-8 bytes and mangled. Only non-UTF-8 input pays this. */
125
+ rb_encoding *enc = rb_enc_get(str);
126
+ if (enc == rb_utf8_encoding()
127
+ || enc == rb_usascii_encoding()
128
+ || enc == rb_ascii8bit_encoding()) {
129
+ return str;
130
+ }
131
+ return rb_str_encode(str, rb_enc_from_encoding(rb_utf8_encoding()),
132
+ ECONV_INVALID_REPLACE | ECONV_UNDEF_REPLACE, Qnil);
133
+ }
134
+
135
+ /* rb_str_encode with no replacement flags: an undefined conversion or invalid
136
+ * byte sequence RAISES (Encoding::UndefinedConversionError /
137
+ * Encoding::InvalidByteSequenceError) instead of substituting U+FFFD. Run under
138
+ * rb_protect so we can remap the Ruby Encoding error to Makiri::XML::SyntaxError. */
139
+ static VALUE
140
+ mkr_xml_strict_transcode_thunk(VALUE str)
141
+ {
142
+ return rb_str_encode(str, rb_enc_from_encoding(rb_utf8_encoding()), 0, Qnil);
143
+ }
144
+
145
+ /* --- XML 1.0 Appendix F: byte-encoding autodetection (BOM, then declaration) ---
146
+ *
147
+ * The leading byte-order mark, or NULL; *bom_len gets its length. UTF-32 BOMs are
148
+ * checked before the UTF-16 LE BOM they share a prefix with. */
149
+ static rb_encoding *
150
+ mkr_xml_bom_encoding(const unsigned char *p, long len, long *bom_len)
151
+ {
152
+ *bom_len = 0;
153
+ if (len >= 4 && p[0] == 0x00 && p[1] == 0x00 && p[2] == 0xFE && p[3] == 0xFF) {
154
+ *bom_len = 4; return rb_enc_find("UTF-32BE");
155
+ }
156
+ if (len >= 4 && p[0] == 0xFF && p[1] == 0xFE && p[2] == 0x00 && p[3] == 0x00) {
157
+ *bom_len = 4; return rb_enc_find("UTF-32LE");
158
+ }
159
+ if (len >= 2 && p[0] == 0xFE && p[1] == 0xFF) { *bom_len = 2; return rb_enc_find("UTF-16BE"); }
160
+ if (len >= 2 && p[0] == 0xFF && p[1] == 0xFE) { *bom_len = 2; return rb_enc_find("UTF-16LE"); }
161
+ if (len >= 3 && p[0] == 0xEF && p[1] == 0xBB && p[2] == 0xBF) { *bom_len = 3; return rb_utf8_encoding(); }
162
+ return NULL;
163
+ }
164
+
165
+ /* The encoding named in the '<?xml ... encoding="NAME" ?>' declaration, or NULL.
166
+ * The declaration is ASCII; for a UTF-16/32-detected document its bytes are
167
+ * stride-interleaved, so the ASCII column is extracted (per the BOM) before the
168
+ * scan, letting a BOM-vs-declaration conflict be caught even in UTF-16. */
169
+ static rb_encoding *
170
+ mkr_xml_decl_encoding(const unsigned char *p, long len, rb_encoding *bom)
171
+ {
172
+ long stride = 1, off = 0;
173
+ if (bom == rb_enc_find("UTF-16LE")) { stride = 2; off = 0; }
174
+ else if (bom == rb_enc_find("UTF-16BE")) { stride = 2; off = 1; }
175
+ else if (bom == rb_enc_find("UTF-32LE")) { stride = 4; off = 0; }
176
+ else if (bom == rb_enc_find("UTF-32BE")) { stride = 4; off = 3; }
177
+
178
+ char head[256];
179
+ long hn = 0;
180
+ for (long i = off; i < len && hn < (long)sizeof(head); i += stride) head[hn++] = (char)p[i];
181
+
182
+ long i = 0;
183
+ while (i < hn && (head[i] == ' ' || head[i] == '\t' || head[i] == '\r' || head[i] == '\n')) i++;
184
+ if (i + 5 > hn || memcmp(head + i, "<?xml", 5) != 0) return NULL;
185
+ i += 5;
186
+ /* find a whitespace-introduced "encoding" before the '?>' */
187
+ for (; i + 8 <= hn; i++) {
188
+ if (head[i] == '?' && i + 1 < hn && head[i + 1] == '>') return NULL; /* end of decl */
189
+ int ws_before = (head[i - 1] == ' ' || head[i - 1] == '\t' || head[i - 1] == '\r' || head[i - 1] == '\n');
190
+ if (!ws_before || memcmp(head + i, "encoding", 8) != 0) continue;
191
+ long j = i + 8;
192
+ while (j < hn && (head[j] == ' ' || head[j] == '\t' || head[j] == '\r' || head[j] == '\n')) j++;
193
+ if (j >= hn || head[j] != '=') return NULL;
194
+ j++;
195
+ while (j < hn && (head[j] == ' ' || head[j] == '\t' || head[j] == '\r' || head[j] == '\n')) j++;
196
+ if (j >= hn || (head[j] != '"' && head[j] != '\'')) return NULL;
197
+ char q = head[j++];
198
+ long ns = j;
199
+ while (j < hn && head[j] != q) j++;
200
+ if (j >= hn) return NULL;
201
+ char name[64];
202
+ long nl = j - ns;
203
+ if (nl <= 0 || nl >= (long)sizeof(name)) return NULL;
204
+ memcpy(name, head + ns, (size_t)nl);
205
+ name[nl] = '\0';
206
+ return rb_enc_find(name); /* NULL for an unknown encoding name */
207
+ }
208
+ return NULL;
209
+ }
210
+
211
+ /* Two encodings agree for conflict purposes when identical, or when either is
212
+ * US-ASCII (a subset of UTF-8 and the single-byte encodings). */
213
+ static int
214
+ mkr_xml_enc_compatible(rb_encoding *a, rb_encoding *b)
215
+ {
216
+ return a == b || a == rb_usascii_encoding() || b == rb_usascii_encoding();
217
+ }
218
+
219
+ VALUE
220
+ mkr_xml_decode_input(VALUE str, size_t max_bytes)
221
+ {
222
+ rb_encoding *tag = rb_enc_get(str);
223
+ const unsigned char *raw = (const unsigned char *)RSTRING_PTR(str);
224
+ long rawlen = RSTRING_LEN(str);
225
+
226
+ /* Detect the byte encoding (XML 1.0 Appendix F): a BOM wins, else the
227
+ * declaration. The Ruby String's encoding is authoritative when it is a
228
+ * concrete text encoding; a BOM/declaration that disagrees is a fatal
229
+ * conflict. ASCII-8BIT means "raw bytes, no claimed encoding", so there the
230
+ * detected encoding decodes the input (a UTF-16/Shift_JIS/BOM'd file read
231
+ * with File.binread now parses). */
232
+ long bom_len = 0;
233
+ rb_encoding *bom = mkr_xml_bom_encoding(raw, rawlen, &bom_len);
234
+ rb_encoding *decl = mkr_xml_decl_encoding(raw + bom_len, rawlen - bom_len, bom);
235
+ int is_binary = (tag == rb_ascii8bit_encoding());
236
+
237
+ if (bom && decl && !mkr_xml_enc_compatible(bom, decl)) {
238
+ rb_raise(mkr_eXmlSyntaxError,
239
+ "XML encoding conflict: the byte-order mark and the encoding declaration disagree");
240
+ }
241
+ if (!is_binary && bom && !mkr_xml_enc_compatible(bom, tag)) {
242
+ rb_raise(mkr_eXmlSyntaxError,
243
+ "XML encoding conflict: the byte-order mark disagrees with the string's encoding");
244
+ }
245
+ if (!is_binary && decl && !mkr_xml_enc_compatible(decl, tag)) {
246
+ /* A concrete String encoding is authoritative for decoding, so the
247
+ * declaration is not used to transcode - but a declaration that names a
248
+ * different encoding than the String is tagged with (e.g. a Shift_JIS
249
+ * String declaring encoding="UTF-8") is a self-inconsistent document and
250
+ * a fatal error, not a silently-ignored mismatch. */
251
+ rb_raise(mkr_eXmlSyntaxError,
252
+ "XML encoding conflict: the encoding declaration disagrees with the string's encoding");
253
+ }
254
+
255
+ rb_encoding *eff = is_binary ? (bom ? bom : (decl ? decl : rb_utf8_encoding())) : tag;
256
+
257
+ /* Decode to UTF-8 (strict). UTF-8 / US-ASCII / ASCII-8BIT are already UTF-8
258
+ * bytes (validated below); anything else is strict-transcoded, raising rather
259
+ * than substituting U+FFFD. */
260
+ VALUE s;
261
+ if (eff == rb_utf8_encoding() || eff == rb_usascii_encoding() || eff == rb_ascii8bit_encoding()) {
262
+ s = str;
263
+ } else {
264
+ VALUE in = str;
265
+ if (rb_enc_get(str) != eff) { in = rb_str_dup(str); rb_enc_associate(in, eff); }
266
+ int state = 0;
267
+ s = rb_protect(mkr_xml_strict_transcode_thunk, in, &state);
268
+ if (state != 0) {
269
+ VALUE exc = rb_errinfo();
270
+ rb_set_errinfo(Qnil);
271
+ char msg[256];
272
+ mkr_ruby_exception_message(exc, msg, sizeof msg);
273
+ rb_raise(mkr_eXmlSyntaxError,
274
+ "XML input could not be decoded to UTF-8: %s", msg);
275
+ }
276
+ RB_GC_GUARD(in);
277
+ }
278
+
279
+ const char *ptr = RSTRING_PTR(s);
280
+ long len = RSTRING_LEN(s);
281
+ /* §4.3.3: a leading BOM is the encoding signature, not document content -
282
+ * strip a U+FEFF (the transcode above turns any UTF-16/32 BOM into one). */
283
+ if (len >= 3 && (unsigned char)ptr[0] == 0xEF && (unsigned char)ptr[1] == 0xBB
284
+ && (unsigned char)ptr[2] == 0xBF) {
285
+ ptr += 3; len -= 3;
286
+ }
287
+
288
+ /* Fail closed on an over-budget input BEFORE the validation copy and the
289
+ * caller's GVL-release copy (an input whose UTF-8 length exceeds the arena
290
+ * budget can never parse). max_bytes == 0 disables the check (__decode). */
291
+ if (max_bytes != 0 && (size_t)len > max_bytes) {
292
+ RB_GC_GUARD(s);
293
+ rb_raise(mkr_eXmlLimitExceeded, "XML input exceeds the byte budget");
294
+ }
295
+
296
+ /* Strict UTF-8 validation: an embedded NUL or any invalid UTF-8 is fatal
297
+ * (no U+FFFD repair - unlike the HTML mkr_utf8_sanitize path). */
298
+ if (len > 0 && memchr(ptr, '\0', (size_t)len) != NULL) {
299
+ rb_raise(mkr_eXmlSyntaxError, "XML input must not contain a NUL byte");
300
+ }
301
+ VALUE u = rb_enc_str_new(ptr, len, rb_utf8_encoding());
302
+ if (rb_enc_str_coderange(u) == ENC_CODERANGE_BROKEN) {
303
+ rb_raise(mkr_eXmlSyntaxError, "XML input must be valid UTF-8");
304
+ }
305
+ RB_GC_GUARD(s);
306
+ return u; /* validated, UTF-8-tagged, BOM-stripped */
307
+ }
308
+
309
+ bool
310
+ mkr_ruby_str_known_valid_utf8(VALUE str)
311
+ {
312
+ if (!RB_TYPE_P(str, T_STRING)) {
313
+ return false;
314
+ }
315
+ /* ENC_CODERANGE reads the *cached* classification from the object's flags;
316
+ * it does NOT scan (rb_enc_str_coderange would, costing as much as our own
317
+ * validator). So this only wins when Ruby already knows the answer. */
318
+ int cr = ENC_CODERANGE(str);
319
+ if (cr == ENC_CODERANGE_7BIT) {
320
+ return true; /* all bytes < 0x80 in an ASCII-compatible encoding */
321
+ }
322
+ if (cr == ENC_CODERANGE_VALID) {
323
+ return rb_enc_get(str) == rb_utf8_encoding(); /* valid AND UTF-8 */
324
+ }
325
+ return false; /* UNKNOWN or BROKEN: let mkr_utf8_sanitize handle it */
326
+ }
327
+
111
328
  const char *
112
329
  mkr_ruby_try_verified_text(VALUE sv, size_t max_bytes, mkr_ruby_borrowed_text_t *out)
113
330
  {
@@ -6,7 +6,7 @@
6
6
  * allocators, the foundation every other C layer (glue, xpath engine,
7
7
  * lexbor_compat) builds on, so the ad-hoc `cap *= 2` / `n + 1` /
8
8
  * `malloc(n * sizeof(T))` patterns are written once, here, and fail closed.
9
- * NOTHING in this header touches Ruby — exception mapping happens at the glue
9
+ * NOTHING in this header touches Ruby - exception mapping happens at the glue
10
10
  * boundary. (mkr_core.h is a thin umbrella over this + the other core headers.)
11
11
  */
12
12
 
@@ -13,7 +13,12 @@ mkr_buf_append(mkr_buf_t *b, const void *bytes, size_t n)
13
13
  if (!mkr_size_add(b->len, n, &need)) {
14
14
  return MKR_ERR_OOM;
15
15
  }
16
- if (b->max != 0 && need > b->max) {
16
+ /* max == 0 is NOT unbounded: it falls back to the conservative default
17
+ * ceiling, so a caller that never set a cap still fails closed. Either way the
18
+ * absolute hard ceiling clamps it, so no buffer can exhaust memory. */
19
+ size_t soft = (b->max != 0) ? b->max : MKR_BUF_DEFAULT_LIMIT;
20
+ size_t limit = (soft < MKR_BUF_HARD_MAX) ? soft : MKR_BUF_HARD_MAX;
21
+ if (need > limit) {
17
22
  return MKR_ERR_LIMIT;
18
23
  }
19
24
  size_t need_term; /* room for the NUL terminator too */
@@ -38,6 +43,35 @@ mkr_buf_append(mkr_buf_t *b, const void *bytes, size_t n)
38
43
  return MKR_OK;
39
44
  }
40
45
 
46
+ mkr_status_t
47
+ mkr_buf_reserve(mkr_buf_t *b, size_t n)
48
+ {
49
+ /* Pre-allocate capacity for n bytes so a known-size fill does not realloc on
50
+ * every geometric step (the serializer reserves ~the output size up front).
51
+ * Best-effort: never grow past the buffer's own cap, and a later append still
52
+ * fails closed if the real output exceeds it. */
53
+ size_t soft = (b->max != 0) ? b->max : MKR_BUF_DEFAULT_LIMIT;
54
+ size_t limit = (soft < MKR_BUF_HARD_MAX) ? soft : MKR_BUF_HARD_MAX;
55
+ if (n > limit) {
56
+ n = limit;
57
+ }
58
+ size_t need_term; /* room for the NUL terminator too */
59
+ if (!mkr_size_add(n, 1, &need_term)) {
60
+ return MKR_ERR_OOM;
61
+ }
62
+ if (need_term <= b->cap) {
63
+ return MKR_OK; /* already have room */
64
+ }
65
+ char *p = realloc(b->data, need_term);
66
+ if (p == NULL) {
67
+ return MKR_ERR_OOM;
68
+ }
69
+ b->data = p;
70
+ b->cap = need_term;
71
+ b->data[b->len] = '\0'; /* keep NUL-terminated */
72
+ return MKR_OK;
73
+ }
74
+
41
75
  char *
42
76
  mkr_buf_steal(mkr_buf_t *b, size_t *out_len)
43
77
  {
@@ -2,7 +2,7 @@
2
2
  #define MAKIRI_CORE_MKR_BUF_H
3
3
 
4
4
  /*
5
- * mkr_buf_t — an owned, growable, optionally capped byte buffer, kept
5
+ * mkr_buf_t - an owned, growable, optionally capped byte buffer, kept
6
6
  * NUL-terminated. Built on the fail-closed allocators in mkr_alloc.h.
7
7
  * (mkr_core.h is a thin umbrella over mkr_alloc.h + mkr_text.h + this.)
8
8
  */
@@ -13,14 +13,42 @@
13
13
  extern "C" {
14
14
  #endif
15
15
 
16
+ /* Memory safety for buffers lives HERE, at the one buffer primitive, not at each
17
+ * call site: "max == 0" can no longer mean "unbounded". Two ceilings bound every
18
+ * mkr_buf so a runaway - a cycle, an unbounded loop, or a caller that forgot to
19
+ * pass a cap - fails closed with MKR_ERR_LIMIT instead of exhausting memory and
20
+ * freezing the machine:
21
+ *
22
+ * MKR_BUF_DEFAULT_LIMIT the cap applied when the caller passes max == 0. A
23
+ * conservative default (100 MiB): code that did not
24
+ * think about a bound gets a tight one for free, and a
25
+ * buffer that genuinely needs to be large must opt in
26
+ * EXPLICITLY by passing a larger max.
27
+ * MKR_BUF_HARD_MAX an absolute ceiling no buffer may exceed, even one
28
+ * with an explicit max - the last-resort backstop.
29
+ * Tight, content-scaled bounds still belong to the
30
+ * caller (e.g. the XML serializer caps itself at a
31
+ * multiple of arena_bytes); this stops total runaway.
32
+ *
33
+ * Override either at build time: -DMKR_BUF_DEFAULT_LIMIT=<bytes> / -DMKR_BUF_HARD_MAX=<bytes>. */
34
+ #ifndef MKR_BUF_DEFAULT_LIMIT
35
+ #define MKR_BUF_DEFAULT_LIMIT ((size_t)100 << 20) /* 100 MiB */
36
+ #endif
37
+ #ifndef MKR_BUF_HARD_MAX
38
+ #define MKR_BUF_HARD_MAX (((size_t)-1) > 0xFFFFFFFFu ? (size_t)(4ULL << 30) : ((size_t)-1) / 2) /* 4 GiB; half the address space where size_t is 32-bit, since (size_t)4 << 30 would wrap to 0 there and make every append fail */
39
+ #endif
40
+
16
41
  typedef struct {
17
42
  char *data; /* owned; kept NUL-terminated after any append */
18
43
  size_t len; /* bytes used (excluding the terminator) */
19
44
  size_t cap; /* bytes allocated */
20
- size_t max; /* 0 = unbounded; else append past max returns MKR_ERR_LIMIT */
45
+ size_t max; /* 0 = the conservative MKR_BUF_DEFAULT_LIMIT; else this value -
46
+ * either way clamped by MKR_BUF_HARD_MAX (past it -> ERR_LIMIT) */
21
47
  } mkr_buf_t;
22
48
 
23
- /* Initialise an empty buffer. max == 0 means unbounded. */
49
+ /* Initialise an empty buffer. max == 0 applies the conservative default ceiling
50
+ * (MKR_BUF_DEFAULT_LIMIT) - it is NOT unbounded; pass an explicit (larger or
51
+ * smaller) value to opt into a different bound, always under MKR_BUF_HARD_MAX. */
24
52
  static inline void
25
53
  mkr_buf_init(mkr_buf_t *b, size_t max)
26
54
  {
@@ -35,6 +63,12 @@ mkr_buf_init(mkr_buf_t *b, size_t max)
35
63
  * failure (the buffer is left intact in every failure case). n == 0 is a no-op. */
36
64
  mkr_status_t mkr_buf_append(mkr_buf_t *b, const void *bytes, size_t n);
37
65
 
66
+ /* Pre-allocate capacity for at least n bytes (best-effort, clamped to the
67
+ * buffer's cap), so a fill of known approximate size avoids per-append reallocs.
68
+ * A no-op if the buffer already has room. Returns MKR_ERR_OOM on overflow /
69
+ * allocation failure (the buffer is left intact). */
70
+ mkr_status_t mkr_buf_reserve(mkr_buf_t *b, size_t n);
71
+
38
72
  /* Take ownership of the (NUL-terminated) bytes; the buffer is reset to empty.
39
73
  * Returns a freshly owned "" for an empty buffer, or NULL on OOM. */
40
74
  char *mkr_buf_steal(mkr_buf_t *b, size_t *out_len);
@@ -10,7 +10,7 @@
10
10
  * mkr_text.h string-type lattice (owned/borrowed/verified text + bytes)
11
11
  * mkr_buf.h mkr_buf_t (growable, capped byte buffer)
12
12
  *
13
- * NOTHING here touches Ruby — exception mapping happens at the glue boundary.
13
+ * NOTHING here touches Ruby - exception mapping happens at the glue boundary.
14
14
  */
15
15
 
16
16
  #include "mkr_alloc.h"
@@ -35,7 +35,7 @@ mkr_ptr_hash(const void *p)
35
35
 
36
36
  /* Smallest power of two >= n, into *out. Returns false on overflow (no power of
37
37
  * two >= n fits in size_t) so the caller fails closed rather than sizing a
38
- * power-of-two hash table below the element count it must hold — which would
38
+ * power-of-two hash table below the element count it must hold - which would
39
39
  * never find a free slot under linear probing. Shared by the pointer-keyed
40
40
  * indexes (attr->owner, text-index). */
41
41
  static inline bool
@@ -16,7 +16,7 @@ extern "C" {
16
16
  #endif
17
17
 
18
18
  /* ---------------------------------------------------------------- */
19
- /* mkr_verified_text_t — a string proven to meet the engine text contract */
19
+ /* mkr_verified_text_t - a string proven to meet the engine text contract */
20
20
  /* ---------------------------------------------------------------- */
21
21
 
22
22
  /* A borrowed byte slice whose contents are guaranteed to satisfy Makiri's
@@ -39,7 +39,7 @@ typedef struct {
39
39
  /*
40
40
  * Makiri's string types form a small lattice over two axes plus a shape marker.
41
41
  * They look alike ({ptr,len}) but C has no subtyping, so each contract is its
42
- * own type — that distinctness IS the guarantee, and is why there is no single
42
+ * own type - that distinctness IS the guarantee, and is why there is no single
43
43
  * "string" type.
44
44
  *
45
45
  * axis 1 ownership : borrowed (we never free) | owned (free via *_clear)
@@ -51,7 +51,7 @@ typedef struct {
51
51
  * shape \ contract raw (bytes) valid (text)
52
52
  * ---------------------- ------------------------ -------------------------
53
53
  * ruby-anchored borrowed mkr_ruby_borrowed_bytes_t mkr_ruby_borrowed_text_t (bridge.h)
54
- * borrowed slice (none yet — would be mkr_borrowed_text_t /
54
+ * borrowed slice (none yet - would be mkr_borrowed_text_t /
55
55
  * mkr_borrowed_bytes_t) mkr_verified_text_t (*)
56
56
  * owned mkr_owned_bytes_t mkr_owned_text_t
57
57
  *
@@ -65,22 +65,22 @@ typedef struct {
65
65
  * cannot reach the engine's public API. Internally the engine carries the
66
66
  * freely-constructible mkr_borrowed_text_t instead.
67
67
  *
68
- * Conversions — the only sanctioned edges. The points that actually VALIDATE
68
+ * Conversions - the only sanctioned edges. The points that actually VALIDATE
69
69
  * raw bytes are the bridge's checked entry points; everything else only moves
70
70
  * already-valid text between shapes (no edge re-validates, and none turns raw
71
71
  * bytes into text without one of those checks):
72
- * validate raw -> valid : the bridge's checked entry points only —
72
+ * validate raw -> valid : the bridge's checked entry points only -
73
73
  * mkr_ruby_verified_text / mkr_ruby_try_verified_text
74
74
  * (both validate UTF-8 + no NUL); never a cast.
75
75
  * drop the GC anchor : mkr_verified_text_from_view (ruby_borrowed_text -> verified_text)
76
76
  * assert valid (no copy) : mkr_borrowed_text (const char*,len -> borrowed_text)
77
- * — caller asserts the bytes already meet the contract
77
+ * - caller asserts the bytes already meet the contract
78
78
  * downgrade to borrow : mkr_borrowed_text_from_owned (owned_text -> borrowed_text)
79
79
  * mkr_borrowed_text_from_verified (verified_text -> borrowed_text)
80
80
  * copy into owned : mkr_owned_text_from_borrowed_copy /
81
- * mkr_owned_text_from_buf_steal — accept only
81
+ * mkr_owned_text_from_buf_steal - accept only
82
82
  * already-asserted-valid text; they copy, not validate.
83
- * take ownership : mkr_owned_text (char*,len -> owned_text) — caller
83
+ * take ownership : mkr_owned_text (char*,len -> owned_text) - caller
84
84
  * transfers an already-valid heap buffer it produced
85
85
  * (substring/concat/format output); asserts validity.
86
86
  */
@@ -12,7 +12,7 @@ require "etc"
12
12
  # 1. Build vendored Lexbor (unpatched) via cmake into vendor/lexbor/build,
13
13
  # install headers + a static archive into vendor/lexbor/dist.
14
14
  # 2. Compile ext/makiri/**/*.c with rake-compiler, linking against the
15
- # static Lexbor archive only — no system libxml2/libxslt.
15
+ # static Lexbor archive only - no system libxml2/libxslt.
16
16
  #
17
17
  # Security note: the C extension is built with -D_FORTIFY_SOURCE=2,
18
18
  # -fstack-protector-strong, and -Wformat -Wformat-security. -O2 is kept
@@ -60,7 +60,7 @@ $LDFLAGS << " #{lexbor_archive.shellescape}"
60
60
  # Sanitizer build (opt-in): MAKIRI_SANITIZE=address,undefined rake clean compile
61
61
  # Then run the suite under the runtime via `rake sanitize` (which preloads the
62
62
  # ASan runtime). Sanitizers replace the heap allocator, so even the vendored
63
- # (uninstrumented) Lexbor's allocations get red-zoned — heap overflows on
63
+ # (uninstrumented) Lexbor's allocations get red-zoned - heap overflows on
64
64
  # Lexbor-owned buffers are still caught. _FORTIFY_SOURCE is dropped here because
65
65
  # it conflicts with the sanitizer interceptors.
66
66
  sanitize = ENV["MAKIRI_SANITIZE"].to_s.strip
@@ -115,6 +115,24 @@ elsif RbConfig::CONFIG["target_os"] =~ /linux/
115
115
  $LIBRUBYARG_STATIC = ""
116
116
  end
117
117
 
118
+ # Export ONLY Init_makiri from the compiled extension. `-fvisibility=hidden`
119
+ # above hides our own sources' symbols, but the vendored Lexbor static library
120
+ # is built (by Lexbor's own CMake) with default visibility, so without this the
121
+ # linker re-exports ~1700 `lxb_*` / `lexbor_*` symbols into the bundle's dynamic
122
+ # table. Another Lexbor-based extension loaded in the same process (e.g.
123
+ # nokolexbor) would then resolve its own `lxb_*` calls to OUR copy - a different
124
+ # Lexbor version with an incompatible ABI - and segfault. Restricting the export
125
+ # list to Init_makiri keeps Makiri's Lexbor entirely private (Ruby only needs
126
+ # Init_makiri, found via dlsym at require time).
127
+ if RbConfig::CONFIG["target_os"] =~ /darwin/
128
+ $DLDFLAGS << " -Wl,-exported_symbol,_Init_makiri"
129
+ elsif RbConfig::CONFIG["target_os"] =~ /linux/
130
+ # Hide every symbol pulled in from static archives (the Lexbor .a); our own
131
+ # are already hidden by -fvisibility=hidden, leaving just RUBY_FUNC_EXPORTED
132
+ # Init_makiri in the dynamic symbol table.
133
+ $DLDFLAGS << " -Wl,--exclude-libs,ALL"
134
+ end
135
+
118
136
  # Recursively pick up C sources under ext/makiri/.
119
137
  $srcs = Dir.glob(File.join(EXT_DIR, "**", "*.c")).map { |f| f.sub("#{EXT_DIR}/", "") }
120
138
  $VPATH ||= []
@@ -8,11 +8,24 @@
8
8
  extern "C" {
9
9
  #endif
10
10
 
11
+ /* A DOM node pointer of UNKNOWN representation - an HTML lxb_dom_node_t or an XML
12
+ * mkr_xml_node_t - as stored in a node wrapper or a NodeSet. It is an INCOMPLETE
13
+ * type on purpose: it cannot be dereferenced, and (unlike void*) it does not
14
+ * implicitly convert to a typed pointer, so reading a stored node AS a specific
15
+ * representation requires an explicit cast that the kind-checked accessors
16
+ * (mkr_html_node_unwrap / mkr_xml_node_unwrap) justify by the wrapper's TypedData type
17
+ * (or, for a NodeSet, by doc_is_xml). The stored pointer is only ever
18
+ * pointer-compared or cast through one of those accessors. */
19
+ typedef struct mkr_raw_node mkr_raw_node_t;
20
+
11
21
  /* Wrapper for any DOM node except Document. The node memory is owned by the
12
- * document's Lexbor arena; we keep only the pointer plus a keepalive VALUE
13
- * reference to the Ruby Document so the arena outlives the wrapper. */
22
+ * document's arena (an HTML Lexbor arena or the XML node arena); we keep only the
23
+ * pointer plus a keepalive VALUE reference to the Ruby Document so the arena
24
+ * outlives the wrapper. The pointer is representation-opaque (mkr_raw_node_t):
25
+ * read it only through mkr_html_node_unwrap / mkr_xml_node_unwrap, which check the
26
+ * wrapper's representation (distinct TypedData types) before casting. */
14
27
  typedef struct {
15
- lxb_dom_node_t *node;
28
+ mkr_raw_node_t *node;
16
29
  VALUE document;
17
30
  } mkr_node_data_t;
18
31
 
@@ -31,14 +44,35 @@ extern const rb_data_type_t mkr_node_type;
31
44
  extern const rb_data_type_t mkr_doc_type;
32
45
  extern const rb_data_type_t mkr_node_set_type;
33
46
 
34
- /* Node bridge (glue/ruby_node.c). mkr_wrap_node returns the Document VALUE
47
+ /* Node bridge (glue/ruby_node.c). mkr_wrap_html_node returns the Document VALUE
35
48
  * for the document node, Qnil for NULL, otherwise a freshly-wrapped Node. */
36
- VALUE mkr_wrap_node(lxb_dom_node_t *node, VALUE document);
37
- lxb_dom_node_t *mkr_node_unwrap(VALUE rb_node);
49
+ VALUE mkr_wrap_html_node(lxb_dom_node_t *node, VALUE document);
38
50
  VALUE mkr_node_document(VALUE rb_node);
39
51
 
52
+ /* HTML and XML nodes are wrapped under DISTINCT TypedData types (both deriving
53
+ * from the shared base mkr_node_type), so a representation-specific accessor
54
+ * rejects the wrong kind via Ruby's type machinery. See ruby_node.c.
55
+ * mkr_html_node_unwrap -> lxb_dom_node_t* ; raises on an XML node/Document.
56
+ * mkr_xml_node_unwrap -> mkr_xml_node_t* ; raises on an HTML node/Document (ruby_xml_node.c).
57
+ * mkr_node_raw -> void* ; kind-agnostic raw pointer for identity, or for a
58
+ * site where the kind is already guaranteed. Deref needs an
59
+ * explicit cast - never treat it as a typed pointer blindly.
60
+ * mkr_node_id -> uintptr_t ; node identity for ==/eql?/hash/pointer_id. */
61
+ extern const rb_data_type_t mkr_html_node_type;
62
+ extern const rb_data_type_t mkr_xml_node_type;
63
+ lxb_dom_node_t *mkr_html_node_unwrap(VALUE rb_node);
64
+ void *mkr_node_raw(VALUE rb_node);
65
+ uintptr_t mkr_node_id(VALUE rb_node);
66
+
67
+ /* XML node bridge (glue/ruby_xml_node.c): wrap a custom XML node into the right
68
+ * Makiri::XML::* leaf (Qnil for NULL, the Document VALUE for the document node). */
69
+ struct mkr_xml_node;
70
+ VALUE mkr_wrap_xml_node(struct mkr_xml_node *node, VALUE document);
71
+ /* XML node-pointer accessor; raises TypeError on an HTML node/Document. */
72
+ struct mkr_xml_node *mkr_xml_node_unwrap(VALUE rb_node);
73
+
40
74
  /* Document bridge (glue/ruby_doc.c). */
41
- lxb_dom_document_t *mkr_doc_unwrap(VALUE rb_doc);
75
+ lxb_dom_document_t *mkr_html_doc_unwrap(VALUE rb_doc);
42
76
  mkr_parsed_t *mkr_doc_parsed(VALUE rb_doc);
43
77
  VALUE mkr_wrap_document(mkr_parsed_t *parsed); /* GC takes ownership */
44
78
 
@@ -46,7 +80,7 @@ VALUE mkr_wrap_document(mkr_parsed_t *parsed); /* GC takes ownersh
46
80
  * inner_html=/outer_html= so the UTF-8 sanitisation and import+template-fixup
47
81
  * are not duplicated.
48
82
  *
49
- * mkr_sanitize_html_input: decode rb_html for the fragment parser — *out / *out_len
83
+ * mkr_sanitize_html_input: decode rb_html for the fragment parser - *out / *out_len
50
84
  * are the bytes to parse, *owned a malloc'd buffer to free afterwards (NULL when
51
85
  * the input is used in place). Returns 0, or -1 on OOM (nothing allocated), so
52
86
  * the caller can release its parser before raising. See mkr_utf8_sanitize.
@@ -54,7 +88,7 @@ VALUE mkr_wrap_document(mkr_parsed_t *parsed); /* GC takes ownersh
54
88
  * mkr_import_fragment_children: deep-import each child of `root` into `doc`, hand
55
89
  * it to `emit`, and fix up any <template> contents (which import_node omits).
56
90
  *
57
- * mkr_emit_append / mkr_emit_before: emit callbacks — append as last child of
91
+ * mkr_emit_append / mkr_emit_before: emit callbacks - append as last child of
58
92
  * `u`, or insert before the reference node `u`. */
59
93
  int mkr_sanitize_html_input(VALUE html, const lxb_char_t **out, size_t *out_len,
60
94
  lxb_char_t **owned);
@@ -69,9 +103,11 @@ void mkr_emit_before(lxb_dom_node_t *imported, void *u);
69
103
  * mkr_init_node. */
70
104
  VALUE mkr_node_clone_node(int argc, VALUE *argv, VALUE self);
71
105
 
72
- /* NodeSet bridge (glue/ruby_node_set.c). */
106
+ /* NodeSet bridge (glue/ruby_node_set.c). mkr_raw_node_t (above): callers cast
107
+ * their typed node to it when pushing (forgetting the type is the safe, store
108
+ * direction); the single typed read-back lives in mkr_node_set_wrap. */
73
109
  VALUE mkr_node_set_new(VALUE document);
74
- void mkr_node_set_push(VALUE rb_set, lxb_dom_node_t *node);
110
+ void mkr_node_set_push(VALUE rb_set, mkr_raw_node_t *node);
75
111
 
76
112
  #ifdef __cplusplus
77
113
  }