RubyGems - makiri - Versions diffs - 0.5.0 → 0.5.1 - Mend

makiri 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

checksums.yaml +4 -4
data/CHANGELOG.md +7 -4
data/Rakefile +7 -0
data/ext/makiri/glue/ruby_html_css.c +99 -10
data/ext/makiri/xpath/mkr_xpath.c +12 -0
data/ext/makiri/xpath/mkr_xpath_eval_body.h +15 -6
data/lib/makiri/version.rb +1 -1
metadata +1 -1

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 27ac120b94ab835caee9bbb50a1cee71b19e339dde2384496db9608e58b3269b
-  data.tar.gz: 27b8ea683abe8854e6c68269413d4858e0f2fedfdd04f04d8fa91130b9b05ac1
+  metadata.gz: ee7e950514b33a2fa483cb6f4f4f48d5b80b8b6cc2205dbf720c2f47f790821d
+  data.tar.gz: 5c6739b1eabec71dbc34b8ce410c4ada7053112dd88d04d278840070178b4aa4
 SHA512:
-  metadata.gz: 84754fb994af236692bdbc281cb0cba89a8cd6d7c75e2caa4e16ebe9b1efa6c4cbd270409be2957e461db4909bbabc32296ed44e185ccaa8985a0c285f25846c
-  data.tar.gz: c3fba2792720ad30d1bee90343e4ae7877bd9871195620ce667281fffeb36e994b3e715a5f456327aaa4e8de1e80e3c24c7e6a898739c660f4d1c8d5ffa51c60
+  metadata.gz: 51af947ded032f4e09d3c622374cf09bbbc36f92f3f2006c98bd2f6d4884b51de83ded0a6faf8174be27cece0986d317ea2cd553e6837286d7c1d0a743c3261d
+  data.tar.gz: da5184494302a3c43cd8014972cf43e6c684460af1a53ef219a42bdfac9238e98a792ec2976b442462661ea63328330195408dd31d9b180f8dd421122676de13

data/CHANGELOG.md CHANGED

@@ -1,9 +1,11 @@
 # Changelog
-All notable changes to this project will be documented in this file.
+## [0.5.1] - 2026-06-22
-The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
-and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+### Changed
+* Faster CSS queries that reuse the same selector: compiled selectors are now
+  cached and reused across queries instead of being re-parsed each time.
 ## [0.5.0] - 2026-06-14
@@ -362,7 +364,8 @@ libxml2 / libxslt dependency at any layer**.
   domxpath, CSS differential vs `Nokogiri::HTML5`). GitHub Actions CI across
   Ruby 3.2–4.0 × Ubuntu/macOS plus a sanitizer job.
-[Unreleased]: https://github.com/takahashim/makiri/compare/v0.5.0...HEAD
+[Unreleased]: https://github.com/takahashim/makiri/compare/v0.5.1...HEAD
+[0.5.1]: https://github.com/takahashim/makiri/compare/v0.5.0...v0.5.1
 [0.5.0]: https://github.com/takahashim/makiri/compare/v0.4.0...v0.5.0
 [0.4.0]: https://github.com/takahashim/makiri/compare/v0.3.0...v0.4.0
 [0.3.0]: https://github.com/takahashim/makiri/compare/v0.2.0...v0.3.0

data/Rakefile CHANGED

@@ -303,6 +303,13 @@ task "bench:xml" => :compile do
   end
 end
+desc "Run the CSS selector-query benchmark on a note.com-style SPA page (set BENCH_CARDS)"
+task "bench:css" => :compile do
+  Bundler.with_unbundled_env do
+    sh "#{FileUtils::RUBY} -Ilib bench/bench_css_query.rb"
+  end
+end
 namespace :conformance do
   desc "WHATWG HTML5 parsing conformance: run html5lib-tests through Makiri"
   task html5: :compile do

data/ext/makiri/glue/ruby_html_css.c CHANGED

@@ -33,6 +33,34 @@ static lxb_css_selectors_t *g_css_sel;
 static lxb_selectors_t     *g_selectors;
 static int                  g_css_ready;
+/* Compiled-selector cache: a Ruby Hash mapping a selector String to the parsed
+ * lxb_css_selector_list_t (stored as an Integer pointer). Parsing a selector is
+ * the dominant cost when the same selectors are queried over and over (a
+ * querySelector-heavy SPA fires tens of thousands of identical queries), so the
+ * compiled list is reused instead of re-parsed. The parsed lists live in the
+ * shared g_css_mem arena, which is therefore NOT cleaned per call (the original
+ * code reset it after every query); to bound growth, when the cache fills we drop
+ * every compiled list at once (clean the arena + clear the Hash) and start over. */
+static VALUE                g_css_cache;
+#define MKR_CSS_CACHE_CAP   256
+/* Adaptive caching. Holding many distinct compiled lists in the shared arena
+ * makes each new parse slower (a bigger arena = more allocator work), so a flood
+ * of one-off selectors -- e.g. getElementById on unique React `useId` ids, which
+ * are never requeried -- turned the cache into a net loss vs the original
+ * parse+clean (measured ~22% slower per call). Track the cache hit rate over a
+ * window and, when it is low, BYPASS the cache: parse + clean per call, exactly
+ * as before, so the arena stays small and the worst case is merely "as fast as no
+ * cache". Periodically drop back into caching to re-test, so a workload that
+ * starts repeating selectors regains the cache. */
+static size_t               g_css_win;        /* lookups in the current window */
+static size_t               g_css_win_hits;   /* cache hits in the current window */
+static int                  g_css_bypass;     /* 1 = parse+clean, no caching */
+static size_t               g_css_bypass_runs; /* consecutive bypass windows */
+#define MKR_CSS_WIN          1024  /* re-evaluate the hit rate every N lookups */
+#define MKR_CSS_MIN_HIT_PCT  15    /* below this hit rate, bypass the cache */
+#define MKR_CSS_RETEST_GAP   32    /* re-test caching every N bypass windows */
 /* Build the shared engine on first use; raises Makiri::Error on init failure
  * (leaving the globals unset, so a later call retries). */
 static void
@@ -179,22 +207,83 @@ mkr_with_compiled_selector(VALUE rb_selector, lxb_dom_node_t *node,
     mkr_css_engine_init(); /* raises on init failure */
-    lxb_css_selector_list_t *list =
-        lxb_css_selectors_parse(g_css_parser, (const lxb_char_t *)sv.ptr, sv.len);
+    if (g_css_cache == 0) {
+        g_css_cache = rb_hash_new();
+        rb_gc_register_address(&g_css_cache);
+    }
+    /* Adaptive window: every MKR_CSS_WIN lookups, decide whether caching is
+     * paying off. A low hit rate (mostly one-off selectors) means the cache is
+     * only growing the arena and slowing parses, so bypass it; periodically
+     * re-test so a workload that becomes selector-repetitive regains the cache. */
+    if (++g_css_win > MKR_CSS_WIN) {
+        if (!g_css_bypass) {
+            if (g_css_win_hits * 100 < (size_t)MKR_CSS_WIN * MKR_CSS_MIN_HIT_PCT) {
+                g_css_bypass = 1;
+                g_css_bypass_runs = 0;
+                lxb_css_memory_clean(g_css_mem); /* drop the cached lists' arena */
+                rb_hash_clear(g_css_cache);
+            }
+        } else if (++g_css_bypass_runs >= MKR_CSS_RETEST_GAP) {
+            g_css_bypass = 0; /* re-test caching over the next window */
+        }
+        g_css_win = 1;
+        g_css_win_hits = 0;
+    }
+    if (!g_css_bypass) {
+        /* Reuse the compiled selector when we have already parsed this string. */
+        VALUE cached = rb_hash_lookup(g_css_cache, sv.value);
+        if (!NIL_P(cached)) {
+            lxb_css_selector_list_t *list =
+                (lxb_css_selector_list_t *)(intptr_t)NUM2LL(cached);
+            g_css_win_hits++;
+            (void)run(g_selectors, node, list, u);
+            /* The traversal engine self-cleans; the cached list + arena persist. */
+            RB_GC_GUARD(sv.value);
+            return;
+        }
+        /* Cache miss. Bound the cache before parsing: when full, drop every
+         * compiled list at once by cleaning the shared arena, then start fresh —
+         * so the new list is parsed into the now-empty arena (cleaning AFTER
+         * parsing would invalidate it). */
+        if (RHASH_SIZE(g_css_cache) >= MKR_CSS_CACHE_CAP) {
+            lxb_css_memory_clean(g_css_mem);
+            rb_hash_clear(g_css_cache);
+        }
+        lxb_css_selector_list_t *list =
+            lxb_css_selectors_parse(g_css_parser, (const lxb_char_t *)sv.ptr, sv.len);
+        int syntax_error = (list == NULL || g_css_parser->status != LXB_STATUS_OK);
+        /* Return the parser to its CLEAN stage; do NOT clean the memory arena —
+         * the freshly parsed list lives there and we keep it. */
+        lxb_css_parser_clean(g_css_parser);
+        if (syntax_error) {
+            rb_raise(mkr_eCSSSyntaxError, "invalid CSS selector: %" PRIsVALUE, sv.value);
+        }
-    int syntax_error = (list == NULL || g_css_parser->status != LXB_STATUS_OK);
-    if (!syntax_error) {
+        /* Cache the compiled list (the Hash dups + freezes the String key), then
+         * run. Store the pointer as a Ruby Integer; the list outlives the call. */
+        rb_hash_aset(g_css_cache, sv.value, LL2NUM((long long)(intptr_t)list));
         (void)run(g_selectors, node, list, u);
+        RB_GC_GUARD(sv.value);
+        return;
     }
-    /* Reset the shared engine for the next query: drop the parsed list's arena
-     * allocations and return the parser to its CLEAN stage. Both preserve the
-     * memory/selectors objects we set once; the traversal engine self-cleans
-     * after find/match. */
+    /* Bypass mode: parse + clean per call (the original behavior), so the arena
+     * stays small and a one-off-selector flood is no slower than no cache. */
+    lxb_css_selector_list_t *l0 =
+        lxb_css_selectors_parse(g_css_parser, (const lxb_char_t *)sv.ptr, sv.len);
+    int err0 = (l0 == NULL || g_css_parser->status != LXB_STATUS_OK);
+    if (!err0) {
+        (void)run(g_selectors, node, l0, u);
+    }
     lxb_css_memory_clean(g_css_mem);
     lxb_css_parser_clean(g_css_parser);
-    if (syntax_error) {
+    if (err0) {
         rb_raise(mkr_eCSSSyntaxError, "invalid CSS selector: %" PRIsVALUE, sv.value);
     }
     RB_GC_GUARD(sv.value);

data/ext/makiri/xpath/mkr_xpath.c CHANGED

@@ -258,6 +258,18 @@ mkr_limit_ast_node(mkr_xpath_limits_t *L, mkr_xpath_error_t *err)
   return 0;
 }
+/*
+ * THE evaluator progress gate. This is the single primitive that bounds runtime
+ * work: every loop in the engine whose trip count is input-derived charges ONE
+ * tick per iteration through here (the axis walk per visited node, the M*N
+ * compare per pair, the index-bucket scan per element, eval_node per AST node).
+ * One uniform rule - "input-bounded loop => one eval_op per step" - so checking
+ * the DoS bound is local: confirm each such loop calls this. Kept deliberately
+ * uniform (no bulk/up-front variant): a bulk charge would only suit
+ * run-to-completion loops and would wrongly reject an early-exiting query if
+ * misapplied, trading one foot-gun-free rule for a conditional one. Overrun is
+ * fail-closed (MKR_XPATH_ERR_LIMIT), never a truncated result.
+ */
 int
 mkr_limit_eval_op(mkr_xpath_limits_t *L, mkr_xpath_error_t *err)
 {

data/ext/makiri/xpath/mkr_xpath_eval_body.h CHANGED

@@ -1666,22 +1666,30 @@ eval_node(mkr_xpath_context_t *ctx, const mkr_node_t *n,
           mkr_val_t *out, mkr_xpath_error_t *err)
 {
   /* Budget + recursion bookkeeping. Every AST node visit counts as one
-   * eval op; recursion depth tracks how deep we are in expression
-   * nodes so we abort cleanly on pathological inputs. */
+   * eval op; recursion depth tracks how deep we are in expression nodes
+   * so we abort cleanly on pathological inputs.
+   *
+   * eval_node is the engine's ONLY recursive function, so "AST-driven
+   * recursion is bounded" reduces entirely to this one enter/leave pair:
+   * charge one op + one recursion level on entry, release the level at the
+   * SINGLE exit below. Keeping eval_node single-exit is what makes that
+   * balance locally verifiable - there is no path from the enter here to a
+   * return that skips the mkr_limit_recurse_leave. Do not add an early
+   * `return` between here and the leave; set `rc` and fall through. */
   mkr_xpath_limits_t *L = mkr_ctx_limits(ctx);
   if (mkr_limit_eval_op(L, err) != 0) return -1;
   if (mkr_limit_recurse_enter(L, err) != 0) return -1;
+  int rc;
   /* Hoisting fast path: a CI subtree that's already been computed in
    * this evaluate is returned as a clone. The clone keeps ownership
    * semantics clean - mkr_val_clear on either copy is safe. */
   if (n->is_context_independent && n->memoized) {
-    int rc = mkr_val_clone(&n->memo_value, out, err);
-    mkr_limit_recurse_leave(L);
-    return rc;
+    rc = mkr_val_clone(&n->memo_value, out, err);
+    goto done; /* single exit: the recurse_leave below still runs */
   }
-  int rc;
   switch (n->kind) {
   case MKR_NK_LITERAL_STR: {
     mkr_owned_text_t text;
@@ -1772,6 +1780,7 @@ eval_node(mkr_xpath_context_t *ctx, const mkr_node_t *n,
     }
   }
+done:
   mkr_limit_recurse_leave(L);
   return rc;
 }

data/lib/makiri/version.rb CHANGED

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Makiri
-  VERSION = "0.5.0"
+  VERSION = "0.5.1"
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: makiri
 version: !ruby/object:Gem::Version
-  version: 0.5.0
+  version: 0.5.1
 platform: ruby
 authors:
 - takahashim