makiri 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -4
- data/Rakefile +7 -0
- data/ext/makiri/glue/ruby_html_css.c +99 -10
- data/ext/makiri/xpath/mkr_xpath.c +12 -0
- data/ext/makiri/xpath/mkr_xpath_eval_body.h +15 -6
- data/lib/makiri/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ee7e950514b33a2fa483cb6f4f4f48d5b80b8b6cc2205dbf720c2f47f790821d
|
|
4
|
+
data.tar.gz: 5c6739b1eabec71dbc34b8ce410c4ada7053112dd88d04d278840070178b4aa4
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 51af947ded032f4e09d3c622374cf09bbbc36f92f3f2006c98bd2f6d4884b51de83ded0a6faf8174be27cece0986d317ea2cd553e6837286d7c1d0a743c3261d
|
|
7
|
+
data.tar.gz: da5184494302a3c43cd8014972cf43e6c684460af1a53ef219a42bdfac9238e98a792ec2976b442462661ea63328330195408dd31d9b180f8dd421122676de13
|
data/CHANGELOG.md
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
## [0.5.1] - 2026-06-22
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
|
|
5
|
+
### Changed
|
|
6
|
+
|
|
7
|
+
* Faster CSS queries that reuse the same selector: compiled selectors are now
|
|
8
|
+
cached and reused across queries instead of being re-parsed each time.
|
|
7
9
|
|
|
8
10
|
## [0.5.0] - 2026-06-14
|
|
9
11
|
|
|
@@ -362,7 +364,8 @@ libxml2 / libxslt dependency at any layer**.
|
|
|
362
364
|
domxpath, CSS differential vs `Nokogiri::HTML5`). GitHub Actions CI across
|
|
363
365
|
Ruby 3.2–4.0 × Ubuntu/macOS plus a sanitizer job.
|
|
364
366
|
|
|
365
|
-
[Unreleased]: https://github.com/takahashim/makiri/compare/v0.5.0...HEAD
|
|
367
|
+
[Unreleased]: https://github.com/takahashim/makiri/compare/v0.5.1...HEAD
|
|
368
|
+
[0.5.1]: https://github.com/takahashim/makiri/compare/v0.5.0...v0.5.1
|
|
366
369
|
[0.5.0]: https://github.com/takahashim/makiri/compare/v0.4.0...v0.5.0
|
|
367
370
|
[0.4.0]: https://github.com/takahashim/makiri/compare/v0.3.0...v0.4.0
|
|
368
371
|
[0.3.0]: https://github.com/takahashim/makiri/compare/v0.2.0...v0.3.0
|
data/Rakefile
CHANGED
|
@@ -303,6 +303,13 @@ task "bench:xml" => :compile do
|
|
|
303
303
|
end
|
|
304
304
|
end
|
|
305
305
|
|
|
306
|
+
desc "Run the CSS selector-query benchmark on a note.com-style SPA page (set BENCH_CARDS)"
|
|
307
|
+
task "bench:css" => :compile do
|
|
308
|
+
Bundler.with_unbundled_env do
|
|
309
|
+
sh "#{FileUtils::RUBY} -Ilib bench/bench_css_query.rb"
|
|
310
|
+
end
|
|
311
|
+
end
|
|
312
|
+
|
|
306
313
|
namespace :conformance do
|
|
307
314
|
desc "WHATWG HTML5 parsing conformance: run html5lib-tests through Makiri"
|
|
308
315
|
task html5: :compile do
|
|
@@ -33,6 +33,34 @@ static lxb_css_selectors_t *g_css_sel;
|
|
|
33
33
|
static lxb_selectors_t *g_selectors;
|
|
34
34
|
static int g_css_ready;
|
|
35
35
|
|
|
36
|
+
/* Compiled-selector cache: a Ruby Hash mapping a selector String to the parsed
|
|
37
|
+
* lxb_css_selector_list_t (stored as an Integer pointer). Parsing a selector is
|
|
38
|
+
* the dominant cost when the same selectors are queried over and over (a
|
|
39
|
+
* querySelector-heavy SPA fires tens of thousands of identical queries), so the
|
|
40
|
+
* compiled list is reused instead of re-parsed. The parsed lists live in the
|
|
41
|
+
* shared g_css_mem arena, which is therefore NOT cleaned per call (the original
|
|
42
|
+
* code reset it after every query); to bound growth, when the cache fills we drop
|
|
43
|
+
* every compiled list at once (clean the arena + clear the Hash) and start over. */
|
|
44
|
+
static VALUE g_css_cache;
|
|
45
|
+
#define MKR_CSS_CACHE_CAP 256
|
|
46
|
+
|
|
47
|
+
/* Adaptive caching. Holding many distinct compiled lists in the shared arena
|
|
48
|
+
* makes each new parse slower (a bigger arena = more allocator work), so a flood
|
|
49
|
+
* of one-off selectors -- e.g. getElementById on unique React `useId` ids, which
|
|
50
|
+
* are never requeried -- turned the cache into a net loss vs the original
|
|
51
|
+
* parse+clean (measured ~22% slower per call). Track the cache hit rate over a
|
|
52
|
+
* window and, when it is low, BYPASS the cache: parse + clean per call, exactly
|
|
53
|
+
* as before, so the arena stays small and the worst case is merely "as fast as no
|
|
54
|
+
* cache". Periodically drop back into caching to re-test, so a workload that
|
|
55
|
+
* starts repeating selectors regains the cache. */
|
|
56
|
+
static size_t g_css_win; /* lookups in the current window */
|
|
57
|
+
static size_t g_css_win_hits; /* cache hits in the current window */
|
|
58
|
+
static int g_css_bypass; /* 1 = parse+clean, no caching */
|
|
59
|
+
static size_t g_css_bypass_runs; /* consecutive bypass windows */
|
|
60
|
+
#define MKR_CSS_WIN 1024 /* re-evaluate the hit rate every N lookups */
|
|
61
|
+
#define MKR_CSS_MIN_HIT_PCT 15 /* below this hit rate, bypass the cache */
|
|
62
|
+
#define MKR_CSS_RETEST_GAP 32 /* re-test caching every N bypass windows */
|
|
63
|
+
|
|
36
64
|
/* Build the shared engine on first use; raises Makiri::Error on init failure
|
|
37
65
|
* (leaving the globals unset, so a later call retries). */
|
|
38
66
|
static void
|
|
@@ -179,22 +207,83 @@ mkr_with_compiled_selector(VALUE rb_selector, lxb_dom_node_t *node,
|
|
|
179
207
|
|
|
180
208
|
mkr_css_engine_init(); /* raises on init failure */
|
|
181
209
|
|
|
182
|
-
|
|
183
|
-
|
|
210
|
+
if (g_css_cache == 0) {
|
|
211
|
+
g_css_cache = rb_hash_new();
|
|
212
|
+
rb_gc_register_address(&g_css_cache);
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
/* Adaptive window: every MKR_CSS_WIN lookups, decide whether caching is
|
|
216
|
+
* paying off. A low hit rate (mostly one-off selectors) means the cache is
|
|
217
|
+
* only growing the arena and slowing parses, so bypass it; periodically
|
|
218
|
+
* re-test so a workload that becomes selector-repetitive regains the cache. */
|
|
219
|
+
if (++g_css_win > MKR_CSS_WIN) {
|
|
220
|
+
if (!g_css_bypass) {
|
|
221
|
+
if (g_css_win_hits * 100 < (size_t)MKR_CSS_WIN * MKR_CSS_MIN_HIT_PCT) {
|
|
222
|
+
g_css_bypass = 1;
|
|
223
|
+
g_css_bypass_runs = 0;
|
|
224
|
+
lxb_css_memory_clean(g_css_mem); /* drop the cached lists' arena */
|
|
225
|
+
rb_hash_clear(g_css_cache);
|
|
226
|
+
}
|
|
227
|
+
} else if (++g_css_bypass_runs >= MKR_CSS_RETEST_GAP) {
|
|
228
|
+
g_css_bypass = 0; /* re-test caching over the next window */
|
|
229
|
+
}
|
|
230
|
+
g_css_win = 1;
|
|
231
|
+
g_css_win_hits = 0;
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
if (!g_css_bypass) {
|
|
235
|
+
/* Reuse the compiled selector when we have already parsed this string. */
|
|
236
|
+
VALUE cached = rb_hash_lookup(g_css_cache, sv.value);
|
|
237
|
+
if (!NIL_P(cached)) {
|
|
238
|
+
lxb_css_selector_list_t *list =
|
|
239
|
+
(lxb_css_selector_list_t *)(intptr_t)NUM2LL(cached);
|
|
240
|
+
g_css_win_hits++;
|
|
241
|
+
(void)run(g_selectors, node, list, u);
|
|
242
|
+
/* The traversal engine self-cleans; the cached list + arena persist. */
|
|
243
|
+
RB_GC_GUARD(sv.value);
|
|
244
|
+
return;
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
/* Cache miss. Bound the cache before parsing: when full, drop every
|
|
248
|
+
* compiled list at once by cleaning the shared arena, then start fresh —
|
|
249
|
+
* so the new list is parsed into the now-empty arena (cleaning AFTER
|
|
250
|
+
* parsing would invalidate it). */
|
|
251
|
+
if (RHASH_SIZE(g_css_cache) >= MKR_CSS_CACHE_CAP) {
|
|
252
|
+
lxb_css_memory_clean(g_css_mem);
|
|
253
|
+
rb_hash_clear(g_css_cache);
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
lxb_css_selector_list_t *list =
|
|
257
|
+
lxb_css_selectors_parse(g_css_parser, (const lxb_char_t *)sv.ptr, sv.len);
|
|
258
|
+
int syntax_error = (list == NULL || g_css_parser->status != LXB_STATUS_OK);
|
|
259
|
+
|
|
260
|
+
/* Return the parser to its CLEAN stage; do NOT clean the memory arena —
|
|
261
|
+
* the freshly parsed list lives there and we keep it. */
|
|
262
|
+
lxb_css_parser_clean(g_css_parser);
|
|
263
|
+
|
|
264
|
+
if (syntax_error) {
|
|
265
|
+
rb_raise(mkr_eCSSSyntaxError, "invalid CSS selector: %" PRIsVALUE, sv.value);
|
|
266
|
+
}
|
|
184
267
|
|
|
185
|
-
|
|
186
|
-
|
|
268
|
+
/* Cache the compiled list (the Hash dups + freezes the String key), then
|
|
269
|
+
* run. Store the pointer as a Ruby Integer; the list outlives the call. */
|
|
270
|
+
rb_hash_aset(g_css_cache, sv.value, LL2NUM((long long)(intptr_t)list));
|
|
187
271
|
(void)run(g_selectors, node, list, u);
|
|
272
|
+
RB_GC_GUARD(sv.value);
|
|
273
|
+
return;
|
|
188
274
|
}
|
|
189
275
|
|
|
190
|
-
/*
|
|
191
|
-
*
|
|
192
|
-
|
|
193
|
-
|
|
276
|
+
/* Bypass mode: parse + clean per call (the original behavior), so the arena
|
|
277
|
+
* stays small and a one-off-selector flood is no slower than no cache. */
|
|
278
|
+
lxb_css_selector_list_t *l0 =
|
|
279
|
+
lxb_css_selectors_parse(g_css_parser, (const lxb_char_t *)sv.ptr, sv.len);
|
|
280
|
+
int err0 = (l0 == NULL || g_css_parser->status != LXB_STATUS_OK);
|
|
281
|
+
if (!err0) {
|
|
282
|
+
(void)run(g_selectors, node, l0, u);
|
|
283
|
+
}
|
|
194
284
|
lxb_css_memory_clean(g_css_mem);
|
|
195
285
|
lxb_css_parser_clean(g_css_parser);
|
|
196
|
-
|
|
197
|
-
if (syntax_error) {
|
|
286
|
+
if (err0) {
|
|
198
287
|
rb_raise(mkr_eCSSSyntaxError, "invalid CSS selector: %" PRIsVALUE, sv.value);
|
|
199
288
|
}
|
|
200
289
|
RB_GC_GUARD(sv.value);
|
|
@@ -258,6 +258,18 @@ mkr_limit_ast_node(mkr_xpath_limits_t *L, mkr_xpath_error_t *err)
|
|
|
258
258
|
return 0;
|
|
259
259
|
}
|
|
260
260
|
|
|
261
|
+
/*
|
|
262
|
+
* THE evaluator progress gate. This is the single primitive that bounds runtime
|
|
263
|
+
* work: every loop in the engine whose trip count is input-derived charges ONE
|
|
264
|
+
* tick per iteration through here (the axis walk per visited node, the M*N
|
|
265
|
+
* compare per pair, the index-bucket scan per element, eval_node per AST node).
|
|
266
|
+
* One uniform rule - "input-bounded loop => one eval_op per step" - so checking
|
|
267
|
+
* the DoS bound is local: confirm each such loop calls this. Kept deliberately
|
|
268
|
+
* uniform (no bulk/up-front variant): a bulk charge would only suit
|
|
269
|
+
* run-to-completion loops and would wrongly reject an early-exiting query if
|
|
270
|
+
* misapplied, trading one foot-gun-free rule for a conditional one. Overrun is
|
|
271
|
+
* fail-closed (MKR_XPATH_ERR_LIMIT), never a truncated result.
|
|
272
|
+
*/
|
|
261
273
|
int
|
|
262
274
|
mkr_limit_eval_op(mkr_xpath_limits_t *L, mkr_xpath_error_t *err)
|
|
263
275
|
{
|
|
@@ -1666,22 +1666,30 @@ eval_node(mkr_xpath_context_t *ctx, const mkr_node_t *n,
|
|
|
1666
1666
|
mkr_val_t *out, mkr_xpath_error_t *err)
|
|
1667
1667
|
{
|
|
1668
1668
|
/* Budget + recursion bookkeeping. Every AST node visit counts as one
|
|
1669
|
-
* eval op; recursion depth tracks how deep we are in expression
|
|
1670
|
-
*
|
|
1669
|
+
* eval op; recursion depth tracks how deep we are in expression nodes
|
|
1670
|
+
* so we abort cleanly on pathological inputs.
|
|
1671
|
+
*
|
|
1672
|
+
* eval_node is the engine's ONLY recursive function, so "AST-driven
|
|
1673
|
+
* recursion is bounded" reduces entirely to this one enter/leave pair:
|
|
1674
|
+
* charge one op + one recursion level on entry, release the level at the
|
|
1675
|
+
* SINGLE exit below. Keeping eval_node single-exit is what makes that
|
|
1676
|
+
* balance locally verifiable - there is no path from the enter here to a
|
|
1677
|
+
* return that skips the mkr_limit_recurse_leave. Do not add an early
|
|
1678
|
+
* `return` between here and the leave; set `rc` and fall through. */
|
|
1671
1679
|
mkr_xpath_limits_t *L = mkr_ctx_limits(ctx);
|
|
1672
1680
|
if (mkr_limit_eval_op(L, err) != 0) return -1;
|
|
1673
1681
|
if (mkr_limit_recurse_enter(L, err) != 0) return -1;
|
|
1674
1682
|
|
|
1683
|
+
int rc;
|
|
1684
|
+
|
|
1675
1685
|
/* Hoisting fast path: a CI subtree that's already been computed in
|
|
1676
1686
|
* this evaluate is returned as a clone. The clone keeps ownership
|
|
1677
1687
|
* semantics clean - mkr_val_clear on either copy is safe. */
|
|
1678
1688
|
if (n->is_context_independent && n->memoized) {
|
|
1679
|
-
|
|
1680
|
-
|
|
1681
|
-
return rc;
|
|
1689
|
+
rc = mkr_val_clone(&n->memo_value, out, err);
|
|
1690
|
+
goto done; /* single exit: the recurse_leave below still runs */
|
|
1682
1691
|
}
|
|
1683
1692
|
|
|
1684
|
-
int rc;
|
|
1685
1693
|
switch (n->kind) {
|
|
1686
1694
|
case MKR_NK_LITERAL_STR: {
|
|
1687
1695
|
mkr_owned_text_t text;
|
|
@@ -1772,6 +1780,7 @@ eval_node(mkr_xpath_context_t *ctx, const mkr_node_t *n,
|
|
|
1772
1780
|
}
|
|
1773
1781
|
}
|
|
1774
1782
|
|
|
1783
|
+
done:
|
|
1775
1784
|
mkr_limit_recurse_leave(L);
|
|
1776
1785
|
return rc;
|
|
1777
1786
|
}
|
data/lib/makiri/version.rb
CHANGED