makiri 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/conformance.yml +22 -0
  3. data/.github/workflows/libfuzzer.yml +83 -0
  4. data/.github/workflows/release.yml +12 -7
  5. data/.github/workflows/security.yml +88 -3
  6. data/.github/workflows/valgrind.yml +135 -0
  7. data/CHANGELOG.md +152 -15
  8. data/README.md +183 -13
  9. data/Rakefile +294 -7
  10. data/ext/makiri/bridge/bridge.h +28 -0
  11. data/ext/makiri/bridge/ruby_string.c +282 -12
  12. data/ext/makiri/core/mkr_alloc.c +40 -3
  13. data/ext/makiri/core/mkr_alloc.h +28 -5
  14. data/ext/makiri/core/mkr_buf.c +47 -3
  15. data/ext/makiri/core/mkr_buf.h +112 -3
  16. data/ext/makiri/core/mkr_core.c +143 -0
  17. data/ext/makiri/core/mkr_core.h +11 -2
  18. data/ext/makiri/core/mkr_hash.h +1 -1
  19. data/ext/makiri/core/mkr_span.h +186 -0
  20. data/ext/makiri/core/mkr_text.h +8 -8
  21. data/ext/makiri/core/mkr_utf8.c +101 -0
  22. data/ext/makiri/core/mkr_utf8.h +88 -0
  23. data/ext/makiri/extconf.rb +123 -10
  24. data/ext/makiri/fuzz/Makefile +95 -0
  25. data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
  26. data/ext/makiri/fuzz/xml_fuzz.c +24 -0
  27. data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
  28. data/ext/makiri/glue/glue.h +55 -11
  29. data/ext/makiri/glue/ruby_doc.c +129 -59
  30. data/ext/makiri/glue/ruby_html_css.c +292 -0
  31. data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +248 -52
  32. data/ext/makiri/glue/ruby_html_node.c +859 -0
  33. data/ext/makiri/glue/ruby_html_serialize.c +154 -0
  34. data/ext/makiri/glue/ruby_node.c +74 -729
  35. data/ext/makiri/glue/ruby_node_set.c +167 -32
  36. data/ext/makiri/glue/ruby_xml.c +602 -0
  37. data/ext/makiri/glue/ruby_xml_node.c +1373 -0
  38. data/ext/makiri/glue/ruby_xpath.c +63 -30
  39. data/ext/makiri/glue/ruby_xpath.h +19 -0
  40. data/ext/makiri/lexbor_compat/compat.h +42 -9
  41. data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
  42. data/ext/makiri/lexbor_compat/dom_index.c +2 -2
  43. data/ext/makiri/lexbor_compat/post_parse.c +100 -10
  44. data/ext/makiri/lexbor_compat/source_loc.c +15 -13
  45. data/ext/makiri/lexbor_compat/text_index.c +14 -8
  46. data/ext/makiri/lexbor_compat/utf8_input.c +19 -33
  47. data/ext/makiri/makiri.c +184 -6
  48. data/ext/makiri/makiri.h +43 -2
  49. data/ext/makiri/xml/mkr_xml.h +125 -0
  50. data/ext/makiri/xml/mkr_xml_chars.c +195 -0
  51. data/ext/makiri/xml/mkr_xml_index.c +169 -0
  52. data/ext/makiri/xml/mkr_xml_index.h +48 -0
  53. data/ext/makiri/xml/mkr_xml_mutate.c +817 -0
  54. data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
  55. data/ext/makiri/xml/mkr_xml_node.c +399 -0
  56. data/ext/makiri/xml/mkr_xml_node.h +184 -0
  57. data/ext/makiri/xml/mkr_xml_tree.c +1515 -0
  58. data/ext/makiri/xpath/mkr_css.c +1023 -0
  59. data/ext/makiri/xpath/mkr_css.h +65 -0
  60. data/ext/makiri/xpath/mkr_xpath.c +96 -32
  61. data/ext/makiri/xpath/mkr_xpath.h +109 -4
  62. data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
  63. data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
  64. data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +551 -241
  65. data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +318 -276
  66. data/ext/makiri/xpath/mkr_xpath_internal.h +177 -206
  67. data/ext/makiri/xpath/mkr_xpath_lex.c +95 -125
  68. data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
  69. data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +145 -0
  70. data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
  71. data/ext/makiri/xpath/mkr_xpath_parse.c +83 -94
  72. data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
  73. data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
  74. data/ext/makiri/xpath/mkr_xpath_shared.c +609 -0
  75. data/ext/makiri/xpath/mkr_xpath_value_body.h +801 -0
  76. data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
  77. data/lib/makiri/{attribute.rb → attr.rb} +7 -3
  78. data/lib/makiri/cdata_section.rb +19 -0
  79. data/lib/makiri/comment.rb +10 -0
  80. data/lib/makiri/compat_aliases.rb +30 -0
  81. data/lib/makiri/document.rb +9 -73
  82. data/lib/makiri/document_fragment.rb +14 -9
  83. data/lib/makiri/element.rb +4 -4
  84. data/lib/makiri/html/document.rb +106 -0
  85. data/lib/makiri/html/node_methods.rb +19 -0
  86. data/lib/makiri/html.rb +12 -0
  87. data/lib/makiri/node.rb +58 -15
  88. data/lib/makiri/node_set.rb +8 -0
  89. data/lib/makiri/processing_instruction.rb +10 -0
  90. data/lib/makiri/text.rb +1 -1
  91. data/lib/makiri/version.rb +1 -1
  92. data/lib/makiri/xml/builder.rb +263 -0
  93. data/lib/makiri/xml/document.rb +24 -0
  94. data/lib/makiri/xml/node_methods.rb +84 -0
  95. data/lib/makiri/xml.rb +10 -0
  96. data/lib/makiri/xpath_context.rb +1 -1
  97. data/lib/makiri.rb +24 -5
  98. data/script/build_native_gem.rb +2 -2
  99. data/script/check_alloc_failures.rb +266 -0
  100. data/script/check_c_safety.rb +77 -2
  101. data/script/check_c_safety_allowlist.yml +102 -0
  102. data/script/check_leaks.rb +64 -0
  103. data/script/leaks_harness.rb +64 -0
  104. data/vendor/lexbor/CMakeLists.txt +6 -0
  105. data/vendor/lexbor/README.md +12 -0
  106. data/vendor/lexbor/config.cmake +1 -1
  107. data/vendor/lexbor/source/lexbor/core/base.h +1 -1
  108. data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
  109. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
  110. data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
  111. data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
  112. data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
  113. data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
  114. data/vendor/lexbor/source/lexbor/html/base.h +1 -1
  115. data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
  116. data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
  117. data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
  118. data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
  119. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
  120. data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
  121. data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
  122. data/vendor/lexbor/source/lexbor/url/base.h +1 -1
  123. data/vendor/lexbor/source/lexbor/url/url.c +5 -2
  124. data/vendor/lexbor/source/lexbor/url/url.h +9 -0
  125. data/vendor/lexbor/version +1 -1
  126. metadata +53 -9
  127. data/ext/makiri/glue/ruby_css.c +0 -185
  128. data/ext/makiri/glue/ruby_serialize.c +0 -92
  129. data/ext/makiri/xpath/mkr_xpath_value.c +0 -1286
  130. data/lib/makiri/cdata.rb +0 -6
@@ -0,0 +1,292 @@
1
+ #include "glue.h"
2
+
3
+ #include <lexbor/css/css.h>
4
+ #include <lexbor/selectors/selectors.h>
5
+
6
+ /*
7
+ * CSS selector queries, delegated to Lexbor's lxb_selectors engine.
8
+ *
9
+ * Node#css(selector) -> NodeSet (descendants matching, document order)
10
+ * Node#at_css(selector) -> first matching descendant, or nil
11
+ *
12
+ * The Lexbor CSS engine (selector parser + its arena + the traversal engine) is
13
+ * built once and reused for every query. CSS evaluation always holds the GVL (it
14
+ * never releases it), so all queries are serialized and a single process-global
15
+ * engine is safe with no locking. Creating and tearing the engine down per call
16
+ * - four create/init/destroy triples - dominated a cheap query like
17
+ * at_css('#id') (the match is found almost immediately, so setup IS the cost);
18
+ * reusing it closes the gap to nokolexbor, which caches the same objects in
19
+ * thread-local storage. Between calls only the parsed selector list's arena is
20
+ * reset (lxb_css_memory_clean) and the parser is returned to its CLEAN stage
21
+ * (lxb_css_parser_clean) - both preserve the memory/selectors objects set once;
22
+ * the selectors parse-state is auto-cleaned by the parser and the traversal
23
+ * engine self-cleans after each find/match. A malformed selector raises
24
+ * Makiri::CSS::SyntaxError.
25
+ */
26
+
27
+ /* Process-global CSS engine, created lazily and kept for the process lifetime
28
+ * (one small allocation). parser->memory / parser->selectors are set once so the
29
+ * parser reuses the same selector arena + parse state across calls. */
30
+ static lxb_css_memory_t *g_css_mem;
31
+ static lxb_css_parser_t *g_css_parser;
32
+ static lxb_css_selectors_t *g_css_sel;
33
+ static lxb_selectors_t *g_selectors;
34
+ static int g_css_ready;
35
+
36
+ /* Build the shared engine on first use; raises Makiri::Error on init failure
37
+ * (leaving the globals unset, so a later call retries). */
38
+ static void
39
+ mkr_css_engine_init(void)
40
+ {
41
+ if (g_css_ready) {
42
+ return;
43
+ }
44
+
45
+ lxb_css_memory_t *mem = lxb_css_memory_create();
46
+ lxb_css_parser_t *parser = lxb_css_parser_create();
47
+ lxb_css_selectors_t *css_sel = lxb_css_selectors_create();
48
+ lxb_selectors_t *selectors = lxb_selectors_create();
49
+
50
+ int ok = (mem != NULL && parser != NULL && css_sel != NULL && selectors != NULL)
51
+ && (lxb_css_memory_init(mem, 128) == LXB_STATUS_OK)
52
+ && (lxb_css_parser_init(parser, NULL) == LXB_STATUS_OK)
53
+ && (lxb_css_selectors_init(css_sel) == LXB_STATUS_OK)
54
+ && (lxb_selectors_init(selectors) == LXB_STATUS_OK);
55
+
56
+ if (!ok) {
57
+ if (selectors != NULL) lxb_selectors_destroy(selectors, true);
58
+ if (parser != NULL) lxb_css_parser_destroy(parser, true);
59
+ if (mem != NULL) lxb_css_memory_destroy(mem, true);
60
+ if (css_sel != NULL) lxb_css_selectors_destroy(css_sel, true);
61
+ rb_raise(mkr_eError, "failed to initialise CSS selector engine");
62
+ }
63
+
64
+ lxb_css_parser_memory_set(parser, mem);
65
+ lxb_css_parser_selectors_set(parser, css_sel);
66
+
67
+ g_css_mem = mem;
68
+ g_css_parser = parser;
69
+ g_css_sel = css_sel;
70
+ g_selectors = selectors;
71
+ g_css_ready = 1;
72
+ }
73
+
74
+ /* find: the callback runs inside Lexbor's traversal frames, so it must NOT
75
+ * raise - a longjmp would abort lxb_selectors_find mid-walk AND skip the
76
+ * post-run reset in mkr_with_compiled_selector, leaving the process-global
77
+ * parser/memory/selectors dirty for every later query. So matches are collected
78
+ * into a plain C array here (no Ruby at all); the node cap and a growth failure
79
+ * each latch a flag and STOP, and the caller raises only after the traversal
80
+ * has unwound normally and the engine was reset. */
81
+ typedef struct {
82
+ lxb_dom_node_t **nodes; /* malloc-backed; the caller owns + frees it */
83
+ size_t count;
84
+ size_t cap;
85
+ lxb_dom_node_t *root; /* excluded from results: css is descendant-only */
86
+ int overflow;
87
+ int oom;
88
+ } mkr_css_ctx_t;
89
+
90
+ static lxb_status_t
91
+ mkr_css_find_cb(lxb_dom_node_t *node, lxb_css_selector_specificity_t spec,
92
+ void *ctx_)
93
+ {
94
+ (void)spec;
95
+ mkr_css_ctx_t *c = (mkr_css_ctx_t *)ctx_;
96
+
97
+ if (node == c->root) {
98
+ return LXB_STATUS_OK; /* descendant-only, like Nokogiri's node.css */
99
+ }
100
+ if (c->count >= MKR_NODE_SET_MAX) {
101
+ c->overflow = 1;
102
+ return LXB_STATUS_STOP; /* fail closed without raising mid-traversal */
103
+ }
104
+ if (mkr_grow_reserve((void **)&c->nodes, &c->cap, c->count + 1,
105
+ sizeof(*c->nodes)) != MKR_OK) {
106
+ c->oom = 1;
107
+ return LXB_STATUS_STOP; /* ditto: report OOM after the walk unwinds */
108
+ }
109
+ c->nodes[c->count++] = node;
110
+ return LXB_STATUS_OK;
111
+ }
112
+
113
+ /* Move the collected matches into the NodeSet under rb_ensure, so the C array
114
+ * cannot leak even if a push raises (NoMemoryError). */
115
+ typedef struct {
116
+ VALUE set;
117
+ mkr_css_ctx_t *ctx;
118
+ } mkr_css_fill_t;
119
+
120
+ static VALUE
121
+ mkr_css_fill_set(VALUE p)
122
+ {
123
+ mkr_css_fill_t *f = (mkr_css_fill_t *)p;
124
+ for (size_t i = 0; i < f->ctx->count; i++) {
125
+ mkr_node_set_push(f->set, (mkr_raw_node_t *)f->ctx->nodes[i]);
126
+ }
127
+ return Qnil;
128
+ }
129
+
130
+ static VALUE
131
+ mkr_css_free_nodes(VALUE p)
132
+ {
133
+ free(((mkr_css_ctx_t *)p)->nodes);
134
+ return Qnil;
135
+ }
136
+
137
+ /* at_css: capture the first matching descendant and stop. Avoids materialising a
138
+ * NodeSet (and a Ruby #first dispatch) for the one node the caller wants. */
139
+ typedef struct {
140
+ lxb_dom_node_t *root; /* excluded: descendant-only */
141
+ lxb_dom_node_t *found;
142
+ } mkr_css_first_ctx_t;
143
+
144
+ static lxb_status_t
145
+ mkr_css_first_cb(lxb_dom_node_t *node, lxb_css_selector_specificity_t spec,
146
+ void *ctx_)
147
+ {
148
+ (void)spec;
149
+ mkr_css_first_ctx_t *c = (mkr_css_first_ctx_t *)ctx_;
150
+
151
+ if (node == c->root) {
152
+ return LXB_STATUS_OK; /* descendant-only */
153
+ }
154
+ c->found = node;
155
+ return LXB_STATUS_STOP;
156
+ }
157
+
158
+ /* Callback for matches?: signals that the node matched the selector. */
159
+ static lxb_status_t
160
+ mkr_css_match_cb(lxb_dom_node_t *node, lxb_css_selector_specificity_t spec,
161
+ void *ctx_)
162
+ {
163
+ (void)node; (void)spec;
164
+ *(int *)ctx_ = 1;
165
+ return LXB_STATUS_STOP;
166
+ }
167
+
168
+ /* Parse +rb_selector+ with the shared engine, hand the parsed list to +run+
169
+ * (the actual find / match against +node+), then reset the engine for the next
170
+ * call. Raises Makiri::CSS::SyntaxError on a bad selector; any result-specific
171
+ * limits are checked by the caller after return. */
172
+ static void
173
+ mkr_with_compiled_selector(VALUE rb_selector, lxb_dom_node_t *node,
174
+ lxb_status_t (*run)(lxb_selectors_t *, lxb_dom_node_t *,
175
+ lxb_css_selector_list_t *, void *),
176
+ void *u)
177
+ {
178
+ mkr_ruby_borrowed_text_t sv = mkr_ruby_verified_text(rb_selector, "CSS selector");
179
+
180
+ mkr_css_engine_init(); /* raises on init failure */
181
+
182
+ lxb_css_selector_list_t *list =
183
+ lxb_css_selectors_parse(g_css_parser, (const lxb_char_t *)sv.ptr, sv.len);
184
+
185
+ int syntax_error = (list == NULL || g_css_parser->status != LXB_STATUS_OK);
186
+ if (!syntax_error) {
187
+ (void)run(g_selectors, node, list, u);
188
+ }
189
+
190
+ /* Reset the shared engine for the next query: drop the parsed list's arena
191
+ * allocations and return the parser to its CLEAN stage. Both preserve the
192
+ * memory/selectors objects we set once; the traversal engine self-cleans
193
+ * after find/match. */
194
+ lxb_css_memory_clean(g_css_mem);
195
+ lxb_css_parser_clean(g_css_parser);
196
+
197
+ if (syntax_error) {
198
+ rb_raise(mkr_eCSSSyntaxError, "invalid CSS selector: %" PRIsVALUE, sv.value);
199
+ }
200
+ RB_GC_GUARD(sv.value);
201
+ }
202
+
203
+ /* find: collect descendants matching the selector (MATCH_FIRST dedups a node
204
+ * that matches several selectors in a comma list). */
205
+ static lxb_status_t
206
+ mkr_run_find(lxb_selectors_t *selectors, lxb_dom_node_t *root,
207
+ lxb_css_selector_list_t *list, void *u)
208
+ {
209
+ lxb_selectors_opt_set(selectors, LXB_SELECTORS_OPT_MATCH_FIRST);
210
+ return lxb_selectors_find(selectors, root, list, mkr_css_find_cb, u);
211
+ }
212
+
213
+ /* find_first: stop at the first matching descendant (for at_css). */
214
+ static lxb_status_t
215
+ mkr_run_find_first(lxb_selectors_t *selectors, lxb_dom_node_t *root,
216
+ lxb_css_selector_list_t *list, void *u)
217
+ {
218
+ lxb_selectors_opt_set(selectors, LXB_SELECTORS_OPT_MATCH_FIRST);
219
+ return lxb_selectors_find(selectors, root, list, mkr_css_first_cb, u);
220
+ }
221
+
222
+ /* match_node: does THIS node match? */
223
+ static lxb_status_t
224
+ mkr_run_match(lxb_selectors_t *selectors, lxb_dom_node_t *node,
225
+ lxb_css_selector_list_t *list, void *u)
226
+ {
227
+ return lxb_selectors_match_node(selectors, node, list, mkr_css_match_cb, u);
228
+ }
229
+
230
+ /* Node#css: collect every matching descendant into a NodeSet (document order).
231
+ * Raises Makiri::CSS::SyntaxError on a bad selector, Makiri::Error on an
232
+ * over-large result. */
233
+ static VALUE
234
+ mkr_node_css(VALUE self, VALUE rb_selector)
235
+ {
236
+ lxb_dom_node_t *root = mkr_html_node_unwrap(self);
237
+ VALUE document = mkr_node_document(self);
238
+
239
+ /* A syntax error raises inside mkr_with_compiled_selector BEFORE the find
240
+ * runs, so ctx.nodes is still NULL there - nothing leaks on that path. */
241
+ mkr_css_ctx_t ctx = { .nodes = NULL, .count = 0, .cap = 0,
242
+ .root = root, .overflow = 0, .oom = 0 };
243
+ mkr_with_compiled_selector(rb_selector, root, mkr_run_find, &ctx);
244
+
245
+ if (ctx.overflow || ctx.oom) {
246
+ free(ctx.nodes);
247
+ if (ctx.overflow) {
248
+ rb_raise(mkr_eError, "CSS result set exceeded the node limit (%u)",
249
+ MKR_NODE_SET_MAX);
250
+ }
251
+ rb_raise(mkr_eError, "out of memory collecting CSS results");
252
+ }
253
+
254
+ VALUE set = mkr_node_set_new(document);
255
+ mkr_css_fill_t fill = { set, &ctx };
256
+ rb_ensure(mkr_css_fill_set, (VALUE)&fill, mkr_css_free_nodes, (VALUE)&ctx);
257
+ return set;
258
+ }
259
+
260
+ /* Node#at_css: the first matching descendant, or nil. */
261
+ static VALUE
262
+ mkr_node_at_css(VALUE self, VALUE rb_selector)
263
+ {
264
+ lxb_dom_node_t *root = mkr_html_node_unwrap(self);
265
+
266
+ mkr_css_first_ctx_t ctx = { .root = root, .found = NULL };
267
+ mkr_with_compiled_selector(rb_selector, root, mkr_run_find_first, &ctx);
268
+
269
+ return ctx.found != NULL
270
+ ? mkr_wrap_html_node(ctx.found, mkr_node_document(self))
271
+ : Qnil;
272
+ }
273
+
274
+ /* Node#matches?(selector): does THIS node match the CSS selector? (Like
275
+ * Nokogiri - tested against the node itself, not its descendants.) A malformed
276
+ * selector raises Makiri::CSS::SyntaxError. */
277
+ static VALUE
278
+ mkr_node_matches(VALUE self, VALUE rb_selector)
279
+ {
280
+ lxb_dom_node_t *node = mkr_html_node_unwrap(self);
281
+ int matched = 0;
282
+ mkr_with_compiled_selector(rb_selector, node, mkr_run_match, &matched);
283
+ return matched ? Qtrue : Qfalse;
284
+ }
285
+
286
+ void
287
+ mkr_init_css(void)
288
+ {
289
+ rb_define_method(mkr_mHtmlNodeMethods, "css", mkr_node_css, 1);
290
+ rb_define_method(mkr_mHtmlNodeMethods, "at_css", mkr_node_at_css, 1);
291
+ rb_define_method(mkr_mHtmlNodeMethods, "matches?", mkr_node_matches, 1);
292
+ }