makiri 0.5.0 → 0.5.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 27ac120b94ab835caee9bbb50a1cee71b19e339dde2384496db9608e58b3269b
4
- data.tar.gz: 27b8ea683abe8854e6c68269413d4858e0f2fedfdd04f04d8fa91130b9b05ac1
3
+ metadata.gz: ee7e950514b33a2fa483cb6f4f4f48d5b80b8b6cc2205dbf720c2f47f790821d
4
+ data.tar.gz: 5c6739b1eabec71dbc34b8ce410c4ada7053112dd88d04d278840070178b4aa4
5
5
  SHA512:
6
- metadata.gz: 84754fb994af236692bdbc281cb0cba89a8cd6d7c75e2caa4e16ebe9b1efa6c4cbd270409be2957e461db4909bbabc32296ed44e185ccaa8985a0c285f25846c
7
- data.tar.gz: c3fba2792720ad30d1bee90343e4ae7877bd9871195620ce667281fffeb36e994b3e715a5f456327aaa4e8de1e80e3c24c7e6a898739c660f4d1c8d5ffa51c60
6
+ metadata.gz: 51af947ded032f4e09d3c622374cf09bbbc36f92f3f2006c98bd2f6d4884b51de83ded0a6faf8174be27cece0986d317ea2cd553e6837286d7c1d0a743c3261d
7
+ data.tar.gz: da5184494302a3c43cd8014972cf43e6c684460af1a53ef219a42bdfac9238e98a792ec2976b442462661ea63328330195408dd31d9b180f8dd421122676de13
data/CHANGELOG.md CHANGED
@@ -1,9 +1,11 @@
1
1
  # Changelog
2
2
 
3
- All notable changes to this project will be documented in this file.
3
+ ## [0.5.1] - 2026-06-22
4
4
 
5
- The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
- and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
5
+ ### Changed
6
+
7
+ * Faster CSS queries that reuse the same selector: compiled selectors are now
8
+ cached and reused across queries instead of being re-parsed each time.
7
9
 
8
10
  ## [0.5.0] - 2026-06-14
9
11
 
@@ -362,7 +364,8 @@ libxml2 / libxslt dependency at any layer**.
362
364
  domxpath, CSS differential vs `Nokogiri::HTML5`). GitHub Actions CI across
363
365
  Ruby 3.2–4.0 × Ubuntu/macOS plus a sanitizer job.
364
366
 
365
- [Unreleased]: https://github.com/takahashim/makiri/compare/v0.5.0...HEAD
367
+ [Unreleased]: https://github.com/takahashim/makiri/compare/v0.5.1...HEAD
368
+ [0.5.1]: https://github.com/takahashim/makiri/compare/v0.5.0...v0.5.1
366
369
  [0.5.0]: https://github.com/takahashim/makiri/compare/v0.4.0...v0.5.0
367
370
  [0.4.0]: https://github.com/takahashim/makiri/compare/v0.3.0...v0.4.0
368
371
  [0.3.0]: https://github.com/takahashim/makiri/compare/v0.2.0...v0.3.0
data/Rakefile CHANGED
@@ -303,6 +303,13 @@ task "bench:xml" => :compile do
303
303
  end
304
304
  end
305
305
 
306
+ desc "Run the CSS selector-query benchmark on a note.com-style SPA page (set BENCH_CARDS)"
307
+ task "bench:css" => :compile do
308
+ Bundler.with_unbundled_env do
309
+ sh "#{FileUtils::RUBY} -Ilib bench/bench_css_query.rb"
310
+ end
311
+ end
312
+
306
313
  namespace :conformance do
307
314
  desc "WHATWG HTML5 parsing conformance: run html5lib-tests through Makiri"
308
315
  task html5: :compile do
@@ -33,6 +33,34 @@ static lxb_css_selectors_t *g_css_sel;
33
33
  static lxb_selectors_t *g_selectors;
34
34
  static int g_css_ready;
35
35
 
36
+ /* Compiled-selector cache: a Ruby Hash mapping a selector String to the parsed
37
+ * lxb_css_selector_list_t (its address kept as a Ruby Integer). Parsing a selector is
38
+ * the dominant cost when the same selectors are queried over and over (a
39
+ * querySelector-heavy SPA fires tens of thousands of identical queries), so the
40
+ * compiled list is reused instead of re-parsed. The parsed lists live in the
41
+ * shared g_css_mem arena, which is therefore NOT cleaned per call (the original
42
+ * code reset it after every query); to bound growth, when the cache fills we drop
43
+ * every compiled list at once (clean the arena + clear the Hash) and start over. */
44
+ static VALUE g_css_cache;
45
+ #define MKR_CSS_CACHE_CAP 256
46
+
47
+ /* Adaptive caching. Holding many distinct compiled lists in the shared arena
48
+ * makes each new parse slower (a bigger arena = more allocator work), so a flood
49
+ * of one-off selectors -- e.g. getElementById on unique React `useId` ids, which
50
+ * are never requeried -- turned the cache into a net loss vs the original
51
+ * parse+clean (measured ~22% slower per call). Track the cache hit rate over a
52
+ * window and, when it is low, BYPASS the cache: parse + clean per call, exactly
53
+ * as before, so the arena stays small and the worst case is merely "as fast as no
54
+ * cache". Periodically drop back into caching to re-test, so a workload that
55
+ * starts repeating selectors regains the cache. */
56
+ static size_t g_css_win; /* lookups in the current window */
57
+ static size_t g_css_win_hits; /* cache hits in the current window */
58
+ static int g_css_bypass; /* 1 = parse+clean, no caching */
59
+ static size_t g_css_bypass_runs; /* consecutive bypass windows */
60
+ #define MKR_CSS_WIN 1024 /* re-evaluate the hit rate every N lookups */
61
+ #define MKR_CSS_MIN_HIT_PCT 15 /* below this hit rate, bypass the cache */
62
+ #define MKR_CSS_RETEST_GAP 32 /* re-test caching every N bypass windows */
63
+
36
64
  /* Build the shared engine on first use; raises Makiri::Error on init failure
37
65
  * (leaving the globals unset, so a later call retries). */
38
66
  static void
@@ -179,22 +207,83 @@ mkr_with_compiled_selector(VALUE rb_selector, lxb_dom_node_t *node,
179
207
 
180
208
  mkr_css_engine_init(); /* raises on init failure */
181
209
 
182
- lxb_css_selector_list_t *list =
183
- lxb_css_selectors_parse(g_css_parser, (const lxb_char_t *)sv.ptr, sv.len);
210
+ if (g_css_cache == 0) {
211
+ g_css_cache = rb_hash_new();
212
+ rb_gc_register_address(&g_css_cache);
213
+ }
214
+
215
+ /* Adaptive window: every MKR_CSS_WIN lookups, decide whether caching is
216
+ * paying off. A low hit rate (mostly one-off selectors) means the cache is
217
+ * only growing the arena and slowing parses, so bypass it; periodically
218
+ * re-test so a workload that becomes selector-repetitive regains the cache. */
219
+ if (++g_css_win > MKR_CSS_WIN) {
220
+ if (!g_css_bypass) {
221
+ if (g_css_win_hits * 100 < (size_t)MKR_CSS_WIN * MKR_CSS_MIN_HIT_PCT) {
222
+ g_css_bypass = 1;
223
+ g_css_bypass_runs = 0;
224
+ lxb_css_memory_clean(g_css_mem); /* drop the cached lists' arena */
225
+ rb_hash_clear(g_css_cache);
226
+ }
227
+ } else if (++g_css_bypass_runs >= MKR_CSS_RETEST_GAP) {
228
+ g_css_bypass = 0; /* re-test caching over the next window */
229
+ }
230
+ g_css_win = 1;
231
+ g_css_win_hits = 0;
232
+ }
233
+
234
+ if (!g_css_bypass) {
235
+ /* Reuse the compiled selector when we have already parsed this string. */
236
+ VALUE cached = rb_hash_lookup(g_css_cache, sv.value);
237
+ if (!NIL_P(cached)) {
238
+ lxb_css_selector_list_t *list =
239
+ (lxb_css_selector_list_t *)(intptr_t)NUM2LL(cached);
240
+ g_css_win_hits++;
241
+ (void)run(g_selectors, node, list, u);
242
+ /* The traversal engine self-cleans; the cached list + arena persist. */
243
+ RB_GC_GUARD(sv.value);
244
+ return;
245
+ }
246
+
247
+ /* Cache miss. Bound the cache before parsing: when full, drop every
248
+ * compiled list at once by cleaning the shared arena, then start fresh —
249
+ * so the new list is parsed into the now-empty arena (cleaning AFTER
250
+ * parsing would invalidate it). */
251
+ if (RHASH_SIZE(g_css_cache) >= MKR_CSS_CACHE_CAP) {
252
+ lxb_css_memory_clean(g_css_mem);
253
+ rb_hash_clear(g_css_cache);
254
+ }
255
+
256
+ lxb_css_selector_list_t *list =
257
+ lxb_css_selectors_parse(g_css_parser, (const lxb_char_t *)sv.ptr, sv.len);
258
+ int syntax_error = (list == NULL || g_css_parser->status != LXB_STATUS_OK);
259
+
260
+ /* Return the parser to its CLEAN stage; do NOT clean the memory arena —
261
+ * the freshly parsed list lives there and we keep it. */
262
+ lxb_css_parser_clean(g_css_parser);
263
+
264
+ if (syntax_error) {
265
+ rb_raise(mkr_eCSSSyntaxError, "invalid CSS selector: %" PRIsVALUE, sv.value);
266
+ }
184
267
 
185
- int syntax_error = (list == NULL || g_css_parser->status != LXB_STATUS_OK);
186
- if (!syntax_error) {
268
+ /* Cache the compiled list (the Hash dups + freezes the String key), then
269
+ * run. Store the pointer as a Ruby Integer; the list outlives the call. */
270
+ rb_hash_aset(g_css_cache, sv.value, LL2NUM((long long)(intptr_t)list));
187
271
  (void)run(g_selectors, node, list, u);
272
+ RB_GC_GUARD(sv.value);
273
+ return;
188
274
  }
189
275
 
190
- /* Reset the shared engine for the next query: drop the parsed list's arena
191
- * allocations and return the parser to its CLEAN stage. Both preserve the
192
- * memory/selectors objects we set once; the traversal engine self-cleans
193
- * after find/match. */
276
+ /* Bypass mode: parse + clean per call (the original behavior), so the arena
277
+ * stays small and a one-off-selector flood is no slower than no cache. */
278
+ lxb_css_selector_list_t *l0 =
279
+ lxb_css_selectors_parse(g_css_parser, (const lxb_char_t *)sv.ptr, sv.len);
280
+ int err0 = (l0 == NULL || g_css_parser->status != LXB_STATUS_OK);
281
+ if (!err0) {
282
+ (void)run(g_selectors, node, l0, u);
283
+ }
194
284
  lxb_css_memory_clean(g_css_mem);
195
285
  lxb_css_parser_clean(g_css_parser);
196
-
197
- if (syntax_error) {
286
+ if (err0) {
198
287
  rb_raise(mkr_eCSSSyntaxError, "invalid CSS selector: %" PRIsVALUE, sv.value);
199
288
  }
200
289
  RB_GC_GUARD(sv.value);
@@ -258,6 +258,18 @@ mkr_limit_ast_node(mkr_xpath_limits_t *L, mkr_xpath_error_t *err)
258
258
  return 0;
259
259
  }
260
260
 
261
+ /*
262
+ * THE evaluator progress gate. This is the single primitive that bounds runtime
263
+ * work: every loop in the engine whose trip count is input-derived charges ONE
264
+ * tick per iteration through here (the axis walk per visited node, the M*N
265
+ * compare per pair, the index-bucket scan per element, eval_node per AST node).
266
+ * One uniform rule - "input-bounded loop => one eval_op per step" - so checking
267
+ * the DoS bound is local: confirm each such loop calls this. Kept deliberately
268
+ * uniform (no bulk/up-front variant): a bulk charge would only suit
269
+ * run-to-completion loops and would wrongly reject an early-exiting query if
270
+ * misapplied, trading one foot-gun-free rule for a conditional one. Overrun is
271
+ * fail-closed (MKR_XPATH_ERR_LIMIT), never a truncated result.
272
+ */
261
273
  int
262
274
  mkr_limit_eval_op(mkr_xpath_limits_t *L, mkr_xpath_error_t *err)
263
275
  {
@@ -1666,22 +1666,30 @@ eval_node(mkr_xpath_context_t *ctx, const mkr_node_t *n,
1666
1666
  mkr_val_t *out, mkr_xpath_error_t *err)
1667
1667
  {
1668
1668
  /* Budget + recursion bookkeeping. Every AST node visit counts as one
1669
- * eval op; recursion depth tracks how deep we are in expression
1670
- * nodes so we abort cleanly on pathological inputs. */
1669
+ * eval op; recursion depth tracks how deep we are in expression nodes
1670
+ * so we abort cleanly on pathological inputs.
1671
+ *
1672
+ * eval_node is the engine's ONLY recursive function, so "AST-driven
1673
+ * recursion is bounded" reduces entirely to this one enter/leave pair:
1674
+ * charge one op + one recursion level on entry, release the level at the
1675
+ * SINGLE exit below. Keeping eval_node single-exit is what makes that
1676
+ * balance locally verifiable - there is no path from the enter here to a
1677
+ * return that skips the mkr_limit_recurse_leave. Do not add an early
1678
+ * `return` between here and the leave; set `rc` and fall through. */
1671
1679
  mkr_xpath_limits_t *L = mkr_ctx_limits(ctx);
1672
1680
  if (mkr_limit_eval_op(L, err) != 0) return -1;
1673
1681
  if (mkr_limit_recurse_enter(L, err) != 0) return -1;
1674
1682
 
1683
+ int rc;
1684
+
1675
1685
  /* Hoisting fast path: a CI subtree that's already been computed in
1676
1686
  * this evaluate is returned as a clone. The clone keeps ownership
1677
1687
  * semantics clean - mkr_val_clear on either copy is safe. */
1678
1688
  if (n->is_context_independent && n->memoized) {
1679
- int rc = mkr_val_clone(&n->memo_value, out, err);
1680
- mkr_limit_recurse_leave(L);
1681
- return rc;
1689
+ rc = mkr_val_clone(&n->memo_value, out, err);
1690
+ goto done; /* single exit: the recurse_leave below still runs */
1682
1691
  }
1683
1692
 
1684
- int rc;
1685
1693
  switch (n->kind) {
1686
1694
  case MKR_NK_LITERAL_STR: {
1687
1695
  mkr_owned_text_t text;
@@ -1772,6 +1780,7 @@ eval_node(mkr_xpath_context_t *ctx, const mkr_node_t *n,
1772
1780
  }
1773
1781
  }
1774
1782
 
1783
+ done:
1775
1784
  mkr_limit_recurse_leave(L);
1776
1785
  return rc;
1777
1786
  }
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Makiri
4
- VERSION = "0.5.0"
4
+ VERSION = "0.5.1"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: makiri
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.5.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - takahashim