RubyGems - makiri - Versions diffs - 0.4.0 → 0.5.0 - Mend

makiri 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51)

checksums.yaml +4 -4
data/.github/workflows/valgrind.yml +49 -46
data/CHANGELOG.md +68 -1
data/README.md +14 -0
data/Rakefile +13 -0
data/ext/makiri/bridge/ruby_string.c +80 -54
data/ext/makiri/core/mkr_alloc.h +1 -1
data/ext/makiri/core/mkr_utf8.c +1 -1
data/ext/makiri/core/mkr_utf8.h +1 -1
data/ext/makiri/{lexbor_compat → dom_adapter}/compat.h +4 -4
data/ext/makiri/{lexbor_compat → dom_adapter}/compat_internal.h +1 -1
data/ext/makiri/dom_adapter/cross_import.c +434 -0
data/ext/makiri/dom_adapter/cross_import.h +35 -0
data/ext/makiri/{lexbor_compat → dom_adapter}/text_index.c +1 -1
data/ext/makiri/fuzz/Makefile +1 -1
data/ext/makiri/glue/cross_import.h +30 -0
data/ext/makiri/glue/glue.h +1 -1
data/ext/makiri/glue/ruby_doc.c +11 -3
data/ext/makiri/glue/ruby_html_mutate.c +6 -0
data/ext/makiri/glue/ruby_html_node.c +1 -1
data/ext/makiri/glue/ruby_lexbor_css.c +462 -0
data/ext/makiri/glue/ruby_node.c +14 -0
data/ext/makiri/glue/ruby_xml.c +31 -2
data/ext/makiri/glue/ruby_xml_node.c +87 -2
data/ext/makiri/glue/ruby_xpath.c +16 -1
data/ext/makiri/makiri.c +3 -0
data/ext/makiri/makiri.h +5 -0
data/ext/makiri/xml/mkr_xml.h +5 -0
data/ext/makiri/xml/mkr_xml_chars.c +22 -0
data/ext/makiri/xml/mkr_xml_mutate.c +160 -50
data/ext/makiri/xml/mkr_xml_mutate.h +24 -0
data/ext/makiri/xml/mkr_xml_tree.c +63 -27
data/ext/makiri/xpath/mkr_xpath.c +28 -0
data/ext/makiri/xpath/mkr_xpath.h +5 -1
data/ext/makiri/xpath/mkr_xpath_eval_body.h +11 -1
data/lib/makiri/html/document.rb +11 -12
data/lib/makiri/html/node_methods.rb +0 -1
data/lib/makiri/node_set.rb +14 -9
data/lib/makiri/processing_instruction.rb +8 -0
data/lib/makiri/version.rb +1 -1
data/lib/makiri/xml/builder.rb +29 -21
data/lib/makiri/xpath_context.rb +12 -4
data/script/check_c_safety.rb +1 -1
data/script/check_c_safety_allowlist.yml +8 -5
data/script/leaks_harness.rb +7 -0
data/suppressions/ruby.supp +140 -0
metadata +13 -8
/data/ext/makiri/{lexbor_compat → dom_adapter}/dom_index.c +0 -0
/data/ext/makiri/{lexbor_compat → dom_adapter}/post_parse.c +0 -0
/data/ext/makiri/{lexbor_compat → dom_adapter}/source_loc.c +0 -0
/data/ext/makiri/{lexbor_compat → dom_adapter}/utf8_input.c +0 -0

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 30e3037756fec29474a8fb0c62e38d06a3337bba9c3ad844e6bdcfc02cff5026
-  data.tar.gz: 5b2f2a2887019261a359a64c35bddd36eb076a8c6a4f145c1a2ec1d84f679be6
+  metadata.gz: 27ac120b94ab835caee9bbb50a1cee71b19e339dde2384496db9608e58b3269b
+  data.tar.gz: 27b8ea683abe8854e6c68269413d4858e0f2fedfdd04f04d8fa91130b9b05ac1
 SHA512:
-  metadata.gz: 2b6bf4ed94ae428e23bcb3af8ec71febda16c83c5187ffb8cadd3698327681d951ead4b68d7df78328e8eeeec82a09de310d862cd110323c2874ac1fb0adf62c
-  data.tar.gz: 9ebd0a7562d7ff5541ead1e61f4dae1719257b29ff8cf4414e5ca2c23e2c708849c360b26471fc132bebd0ddd9cd5866ec11ab44aec4ac4044cdba10958a1038
+  metadata.gz: 84754fb994af236692bdbc281cb0cba89a8cd6d7c75e2caa4e16ebe9b1efa6c4cbd270409be2957e461db4909bbabc32296ed44e185ccaa8985a0c285f25846c
+  data.tar.gz: c3fba2792720ad30d1bee90343e4ae7877bd9871195620ce667281fffeb36e994b3e715a5f456327aaa4e8de1e80e3c24c7e6a898739c660f4d1c8d5ffa51c60

data/.github/workflows/valgrind.yml CHANGED Viewed

@@ -1,9 +1,18 @@
 name: Valgrind + GC.compact
 on:
-  # Nightly: these jobs are heavy (Valgrind is ~10-50x slower, GC.stress is ~10x
-  # slower) and check structural properties that do not vary by day-to-day code
-  # churn, so run them on a schedule rather than on every push/PR.
+  # Valgrind memcheck ALSO runs on push to main: it is the only check without a
+  # frequency threshold (any "definitely lost" / uninitialised-value use fails,
+  # unlike the PR-level macOS leak gate, which only flags stacks repeated >=30x),
+  # so a leak on a rarely-hit error path slips past the PR gates and would
+  # otherwise surface only on the next nightly. Running it post-merge catches such
+  # regressions within ~30 min without adding ~20 min to every PR. (It is gated to
+  # main only, not pull_request, to keep PR latency low.)
+  #
+  # The GC.stress job stays nightly-only (see its `if:` below): it is heavy and
+  # checks structural properties that do not vary by day-to-day churn.
+  push:
+    branches: [main, master]
   schedule:
     - cron: "0 2 * * *"
   workflow_dispatch:
@@ -61,32 +70,44 @@ jobs:
       - name: Run spec suite under Valgrind (ruby_memcheck)
         run: bundle exec rake spec:valgrind
-  # GC.auto_compact + GC.stress run of the full spec suite.  This structurally
-  # tests the borrowed-pointer discipline under the condition that Ruby Strings
-  # actually move (compaction) and that every allocation triggers a full GC
-  # cycle (stress).  Failures here are typically use-after-move or stale
+  # GC.auto_compact + GC.stress over the GC-sensitive examples.  This
+  # structurally tests the borrowed-pointer discipline under the condition that
+  # Ruby Strings actually move (compaction) and that every allocation triggers a
+  # full GC cycle (stress).  Failures here are typically use-after-move or stale
   # pointer bugs in the C extension or bridge layer.
   #
-  # THREADING is deliberately OFF here.  The :threading suite (spec/threading_spec.rb)
-  # is 8 threads x tens of iterations, and forcing the job-level GC.stress onto it
-  # means a full GC per allocation across every thread - which made this job run
-  # for 30+ minutes without finishing.  It also adds little: that suite already
-  # runs in ci.yml (ubuntu/3.4), and its GC-sensitive examples opt into GC.stress
-  # themselves via their own `around` hook, so cross-thread interactions are
-  # covered there.  This job's unique value is the *single-threaded* full suite
-  # under stress+compaction, which catches use-after-move across every code path.
+  # Scope: only the examples tagged `:gc_compact` (the `memory safety` blocks in
+  # css/xpath/serialize/mutation/source_location/xpath_handler/api_compat2 +
+  # attribute's lazy-index example).  Those are the examples written to exercise
+  # the borrowed-pointer paths.  `GC_COMPACT_STRESS=1` makes spec_helper set
+  # `GC.auto_compact = true` process-wide and wrap every example in `GC.stress`,
+  # so each allocation inside a tagged example triggers a *compacting* GC - the
+  # strongest form of the use-after-move test.  The high-volume churn loops
+  # (parse/drop cycles) scale their iteration count down under stress
+  # (`gc_churn_iters` / `GC_COMPACT_ITERS`) because each stressed iteration is
+  # orders of magnitude heavier; `GC_COMPACT_ITERS` below tunes the total runtime
+  # (~6-9 min on CI at 200).  An earlier version forced GC.stress onto the
+  # *entire* suite (~800 examples): it ran 1h40m+ and never finished, while
+  # testing borrowed-pointer discipline on hundreds of examples that have none.
+  # The rest of the suite still runs in ci.yml.
+  #
+  # THREADING is deliberately OFF here.  The :threading suite is 8 threads x tens
+  # of iterations; it runs in ci.yml and its GC-sensitive examples opt into
+  # GC.stress themselves, so cross-thread interactions are covered there.
   gc-compact-stress:
-    # Temporarily disabled, too long
-    if: false
+    # Nightly / on-demand only - not on push (the valgrind job is the post-merge
+    # gate; GC.stress is heavy and structural, so it does not need per-push runs).
+    if: github.event_name != 'push'
     name: GC.auto_compact + GC.stress (Ruby ${{ matrix.ruby }})
     runs-on: ubuntu-latest
-    timeout-minutes: 360
+    timeout-minutes: 30
     env:
-      # As in the Valgrind job: GC.stress (a full GC per allocation) makes the
-      # 300-iteration PBT sweep run for hours, and these jobs check memory
-      # discipline rather than the property space, so trim the iteration count.
-      PBT_COUNT: "15"
-      CSS_PBT_COUNT: "15"
+      GC_COMPACT_STRESS: "1"
+      # Per-iteration cost under per-allocation compacting GC is ~1000x normal, so
+      # the churn loops run this many iterations (vs their normal 200-1000). Tunes
+      # the job's runtime; raise for more coverage, lower if it approaches the
+      # timeout.
+      GC_COMPACT_ITERS: "200"
     strategy:
       fail-fast: false
       matrix:
@@ -110,26 +131,8 @@ jobs:
       - name: Compile the extension
         run: bundle exec rake compile
-      # GC.stress is scoped to each example via an around hook rather than set
-      # process-wide: under a global GC.stress, even requiring the 88 spec files
-      # runs a full GC per allocation, so loading alone took tens of minutes and
-      # the job never reached the first example.  auto_compact stays global so
-      # objects actually move during those stressed examples (the point of the
-      # job), while loading/collection runs at normal speed.
-      - name: Run spec suite under GC.auto_compact + GC.stress
-        run: |
-          bundle exec ruby -Ilib -e '
-            GC.auto_compact = true
-            require "rspec/core"
-            RSpec.configure do |c|
-              c.around(:each) do |example|
-                GC.stress = true
-                begin
-                  example.run
-                ensure
-                  GC.stress = false
-                end
-              end
-            end
-            exit RSpec::Core::Runner.run(ARGV)
-          ' spec
+      # GC_COMPACT_STRESS=1 (set in env above) makes spec_helper enable
+      # auto_compact globally and wrap each example in GC.stress; --tag gc_compact
+      # limits the run to the borrowed-pointer examples.
+      - name: Run GC-sensitive examples under GC.auto_compact + GC.stress
+        run: bundle exec rspec --tag gc_compact spec

data/CHANGELOG.md CHANGED Viewed

@@ -5,6 +5,72 @@ All notable changes to this project will be documented in this file.
 The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [0.5.0] - 2026-06-14
+### Fixed
+* Use-after-free when an XPath custom-function handler mutated the same
+  `XPathContext` (`register_*` / `node=`) mid-`evaluate`: such re-entrant context
+  mutation is now refused instead of invalidating the running evaluation's state.
+* `Node#name=` now invalidates the element-name index, so a later `//tag` query
+  reflects the rename instead of seeing a stale bucket.
+* XML processing-instruction targets now follow XML 1.0 §2.6: a PITarget is a
+  `Name`, not an NCName, so a colon is permitted (`<?a:b ...?>` parses, and
+  `create_processing_instruction("a:b", ...)` succeeds). Only the reserved `xml`
+  (any case) is still rejected. Previously a colon in a PI target was rejected as
+  not-well-formed, which was stricter than the spec (a PI target is not subject to
+  namespace processing).
+* Memory leaks of the internal XPath evaluation context on error / edge paths: a
+  `Makiri::XML` `#css` / `#xpath` / `#at_xpath` whose selector or expression failed
+  the text-input contract leaked the context (it is now verified BEFORE the context
+  is allocated), and a context could leak if building the Ruby result raised (it is
+  now freed before conversion).
+### Added
+* `ProcessingInstruction#target` on the XML node (the PI's target name).
+* Cross-kind `Document#import_node(node, deep = false)`. `import_node` now
+  translates a subtree across representations: `Makiri::XML::Document#import_node`
+  (newly added) imports an HTML (Lexbor) node by translating it to the XML node
+  representation, and `Makiri::HTML::Document#import_node` likewise translates an
+  XML node to HTML. Same-representation imports keep working (HTML to HTML via
+  Lexbor, XML to XML via the arena deep/shallow copy). The result is a detached
+  copy owned by the target document; the source is untouched. Elements (with
+  attributes), text, comment, and processing-instruction nodes translate both
+  ways, and an HTML `<template>`'s contents (which HTML keeps in a separate
+  fragment) are carried across rather than silently dropped; an XML CDATA section
+  has no HTML counterpart, so translating one into an HTML document fails closed
+  (`Makiri::Error`). Namespaces are preserved across the translation: HTML->XML
+  synthesizes the xmlns declarations needed to reproduce each node's namespace
+  (so e.g. an inline `<svg>` stays in the SVG namespace and HTML elements in the
+  XHTML namespace), and XML->HTML maps the namespace URI back to a Lexbor
+  namespace id, interning any URI (not only the ones Lexbor knows by default) so
+  custom namespaces survive too. An HTML-namespaced `<template>`'s content is
+  placed in its content fragment (HTMLTemplateElement.content), like a parsed
+  template. The other node-argument mutators
+  (`add_child`/`before`/`after`/`replace`/`fragment`) still reject a foreign-kind
+  node; `import_node` is the one sanctioned crossing point.
+* `set_attribute_ns(namespace, qualified_name, value)` and
+  `remove_attribute_ns(namespace, local_name)` on `Makiri::XML` elements - the DOM
+  setAttributeNS / removeAttributeNS, keyed on the (explicit namespace, local name)
+  pair so two attributes with the same qualified name in different namespaces
+  coexist (a null/"" namespace is the null namespace).
+* `Makiri::Lexbor::CSS.parse_stylesheet(text)`, a thin binding over Lexbor's
+  CSS stylesheet parser that returns the parsed rules as plain Ruby primitives
+  (`{type: :style, selectors: [{text:, specificity: [a,b,c]}, ...],
+  declarations: [{name:, value:, important:}, ...]}` and nested
+  `{type: :media, condition:, rules: [...]}`, in source order). Selector
+  specificity and value normalization come from Lexbor; `css-syntax-3` error
+  recovery means a broken stylesheet yields its valid rules instead of raising.
+  Hosts the new `Makiri::Lexbor::*` namespace (the unabstracted lexbor-native
+  surface, distinct from the Nokogiri-compatible `Makiri::*`).
 ## [0.4.0] - 2026-06-12
 ### Added
@@ -296,7 +362,8 @@ libxml2 / libxslt dependency at any layer**.
   domxpath, CSS differential vs `Nokogiri::HTML5`). GitHub Actions CI across
   Ruby 3.2–4.0 × Ubuntu/macOS plus a sanitizer job.
-[Unreleased]: https://github.com/takahashim/makiri/compare/v0.4.0...HEAD
+[Unreleased]: https://github.com/takahashim/makiri/compare/v0.5.0...HEAD
+[0.5.0]: https://github.com/takahashim/makiri/compare/v0.4.0...v0.5.0
 [0.4.0]: https://github.com/takahashim/makiri/compare/v0.3.0...v0.4.0
 [0.3.0]: https://github.com/takahashim/makiri/compare/v0.2.0...v0.3.0
 [0.2.0]: https://github.com/takahashim/makiri/compare/v0.1.0...v0.2.0

data/README.md CHANGED Viewed

@@ -141,6 +141,14 @@ XML subtrees can be built with `Document#create_element` and related node factor
 then inserted with `#add_child`, `#before`, `#after`, or `#replace`;
 namespaces are resolved at insertion time, and cross-document nodes are deep-copied.
+`Document#import_node(node, deep = false)` brings a node into a document as a
+detached copy, and works **across representations**: importing a `Makiri::HTML`
+node into a `Makiri::XML::Document` (or vice versa) translates the subtree between
+the two node representations, preserving namespaces (e.g. an inline `<svg>` keeps
+the SVG namespace, HTML elements the XHTML namespace; custom namespaces are
+preserved across both directions). An XML CDATA section has no HTML counterpart,
+so importing one into an HTML document raises.
 ```ruby
 doc   = Makiri::XML(%(<feed xmlns="urn:a" xmlns:dc="urn:dc"/>))
 entry = doc.create_element("entry")
@@ -226,6 +234,12 @@ Detailed, test-backed notes live in `spec/conformance/README.md`.
     markup string straight to `#add_child` is unsupported (parse it into a fragment
     first). (`#to_xml` serialization is supported; HTML serialization - `to_html`
     / `inner_html` / `outer_html` - is not.)
+* A colon in a processing-instruction target is well-formed (`<?a:b ...?>` parses).
+  * XML 1.0 §2.6: a `PITarget` is a `Name`, not an NCName, and Namespaces in XML
+    1.0's normative conformance section constrains only element/attribute names
+    (QNames), never PI targets. Nokogiri/libxml2 rejects it (`colons are forbidden
+    from PI names`); Makiri follows the normative text. Only the reserved `xml`
+    (any case) target is rejected.
 * Otherwise the parsed tree is byte-identical to `Nokogiri::XML`'s (verified by
   the property-based differential), including namespaces, prolog/epilog comments
   and PIs, and adjacent-CDATA coalescing.

data/Rakefile CHANGED Viewed

@@ -59,6 +59,19 @@ task default: %i[compile spec]
 # *our* code: a real uninit/invalid access in mkr_*/Lexbor still has a makiri frame
 # and is still reported.
 #
+# BUT the binary-touch filter is too coarse for one residual class: when a GC
+# cycle fires *inside* one of our allocations (or marks through our mark
+# callback), CRuby's conservative collector legitimately reads uninitialised
+# words (machine-stack scan reading stale frames, incremental mark/sweep reading
+# not-yet-written RVALUE flags) while a makiri frame sits on the stack - so ~190
+# of these pure-Ruby-GC false positives pass the filter. The gem's bundled
+# ruby.supp only covers `each_location*` under Addr8, not the Cond/Value8 reads
+# we hit. `suppressions/ruby.supp` (auto-loaded by ruby_memcheck: it globs
+# `<dir>/<ruby-version>.supp`, and the bare `ruby.supp` matches every version)
+# suppresses exactly those GC-driver-anchored uninit reads, plus the VM
+# method-cache id_table the interpreter never frees before exit. A real uninit
+# read in our code does not descend from a GC driver, so it still fails.
+#
 # Guarded: ruby_memcheck lives in the optional :valgrind bundler group, so a
 # normal `bundle exec rake` (without that group) must not fail to load.
 begin

data/ext/makiri/bridge/ruby_string.c CHANGED Viewed

@@ -45,33 +45,61 @@ mkr_ruby_str_from_borrowed(mkr_borrowed_text_t text)
     return rb_utf8_str_new(text.ptr, (long)text.len);
 }
-void
-mkr_verify_text(VALUE str, const char *what)
-{
-    /* ALLOCATION-FREE by design: this gate runs between a caller taking a
-     * borrowed RSTRING pointer and using it, so it must not be a GC point. The
-     * former implementation built a throwaway Ruby String (rb_enc_str_new) to
-     * ask for its coderange - a Ruby allocation inside every borrow, which both
-     * passed the borrowed ptr into an allocating call and opened a GC window
-     * under every OTHER borrow already held at multi-borrow call sites. Bytes
-     * are validated as UTF-8 regardless of the String's declared encoding,
-     * exactly as before. */
-    long        len = RSTRING_LEN(str);
-    const char *ptr = RSTRING_PTR(str);
+/* The shared core of Makiri's strict text contract: no NUL byte, valid UTF-8.
+ * Returns the specific violation (or MKR_TEXT_OK); each caller maps the verdict
+ * to its own error surface (Makiri::Error, XML::SyntaxError, or a reason string).
+ *
+ * ALLOCATION-FREE BY DESIGN, which every caller relies on: it runs between a
+ * caller taking a borrowed RSTRING pointer and using it, so it must not be a GC
+ * point. (The former per-caller implementations each built a throwaway Ruby
+ * String (rb_enc_str_new) to read its coderange - a Ruby allocation inside every
+ * borrow, which both passed the borrowed ptr into an allocating call and opened a
+ * GC window under every OTHER borrow already held at multi-borrow call sites.)
+ *
+ * +coderange_str+ is the String consulted for its CACHED coderange (no scan, no
+ * alloc); +ptr+/+len+ are the bytes validated. They may differ: the XML path
+ * passes the whole decoded String for the coderange but a BOM-stripped suffix as
+ * the bytes (the BOM is one complete UTF-8 char, so a whole-string VALID
+ * coderange still proves the suffix valid). Bytes are validated as UTF-8
+ * regardless of the String's declared encoding. */
+typedef enum {
+    MKR_TEXT_OK = 0,
+    MKR_TEXT_HAS_NUL,
+    MKR_TEXT_INVALID_UTF8,
+} mkr_text_verdict_t;
-    mkr_span_t sv = mkr_span(ptr, (size_t)len);
+static mkr_text_verdict_t
+mkr_text_check(VALUE coderange_str, const char *ptr, size_t len)
+{
+    mkr_span_t sv = mkr_span(ptr, len);
     size_t nul_at;
     if (mkr_span_find(&sv, '\0', &nul_at)) {
-        rb_raise(mkr_eError, "%s must not contain a NUL byte", what);
+        return MKR_TEXT_HAS_NUL;
     }
     /* Cached-coderange fast path (reads flags, never scans, never allocates);
-     * NUL is valid UTF-8, so the memchr above stays either way. */
-    if (mkr_ruby_str_known_valid_utf8(str)) {
-        return;
+     * NUL is valid UTF-8, so the find above stays either way. */
+    if (mkr_ruby_str_known_valid_utf8(coderange_str)) {
+        return MKR_TEXT_OK;
+    }
+    if (!mkr_utf8_valid((const unsigned char *)ptr, len)) {
+        return MKR_TEXT_INVALID_UTF8;
     }
-    if (!mkr_utf8_valid((const unsigned char *)ptr, (size_t)len)) {
-        rb_raise(mkr_eError, "%s must be valid UTF-8", what);
+    return MKR_TEXT_OK;
+}
+void
+mkr_verify_text(VALUE str, const char *what)
+{
+    const char *ptr = RSTRING_PTR(str);
+    size_t      len = (size_t)RSTRING_LEN(str);
+    switch (mkr_text_check(str, ptr, len)) {
+        case MKR_TEXT_HAS_NUL:
+            rb_raise(mkr_eError, "%s must not contain a NUL byte", what);
+        case MKR_TEXT_INVALID_UTF8:
+            rb_raise(mkr_eError, "%s must be valid UTF-8", what);
+        case MKR_TEXT_OK:
+            break;
     }
 }
@@ -180,6 +208,12 @@ mkr_xml_bom_encoding(const unsigned char *p, long len, long *bom_len, long *stri
     return NULL;
 }
+static int
+mkr_decl_ws(int c)
+{
+    return c == ' ' || c == '\t' || c == '\r' || c == '\n';
+}
 /* The encoding named in the '<?xml ... encoding="NAME" ?>' declaration, or NULL.
  * The declaration is ASCII; for a UTF-16/32-detected document its bytes are
  * stride-interleaved, so the ASCII column is extracted (stride/off resolved by
@@ -190,12 +224,6 @@ mkr_xml_bom_encoding(const unsigned char *p, long len, long *bom_len, long *stri
  * of p is done: the stride/off geometry is passed in (rather than derived here
  * via rb_enc_find, which can autoload = a GC point), and the only rb_enc_find -
  * the final name lookup - runs after the bytes have been copied into head[]. */
-static int
-mkr_decl_ws(int c)
-{
-    return c == ' ' || c == '\t' || c == '\r' || c == '\n';
-}
 static rb_encoding *
 mkr_xml_decl_encoding(const unsigned char *p, long len, long stride, long off)
 {
@@ -336,19 +364,19 @@ mkr_xml_decode_input(VALUE str, size_t max_bytes)
         rb_raise(mkr_eXmlLimitExceeded, "XML input exceeds the byte budget");
     }
-    /* Strict UTF-8 validation, allocation-free - no GC point while `ptr` is
-     * borrowed (the former rb_enc_str_new copy handed the borrow straight into
-     * an allocating call): an embedded NUL or any invalid UTF-8 is fatal (no
-     * U+FFFD repair - unlike the HTML mkr_utf8_sanitize path). A whole-string
-     * cached coderange covers the BOM-stripped suffix too (the BOM is one
-     * complete UTF-8 character). */
-    size_t nul_at;
-    if (mkr_span_find(&sv, '\0', &nul_at)) {
-        rb_raise(mkr_eXmlSyntaxError, "XML input must not contain a NUL byte");
-    }
-    if (!mkr_ruby_str_known_valid_utf8(s)
-        && !mkr_utf8_valid((const unsigned char *)ptr + off, (size_t)len)) {
-        rb_raise(mkr_eXmlSyntaxError, "XML input must be valid UTF-8");
+    /* Strict UTF-8 validation via the shared, allocation-free core - no GC point
+     * while `ptr` is borrowed: an embedded NUL or any invalid UTF-8 is fatal (no
+     * U+FFFD repair - unlike the HTML mkr_utf8_sanitize path). The whole-string
+     * `s` is consulted for the cached coderange (it covers the BOM-stripped
+     * suffix too - the BOM is one complete UTF-8 character), while the validated
+     * bytes are the stripped suffix `ptr + off`. */
+    switch (mkr_text_check(s, ptr + off, (size_t)len)) {
+        case MKR_TEXT_HAS_NUL:
+            rb_raise(mkr_eXmlSyntaxError, "XML input must not contain a NUL byte");
+        case MKR_TEXT_INVALID_UTF8:
+            rb_raise(mkr_eXmlSyntaxError, "XML input must be valid UTF-8");
+        case MKR_TEXT_OK:
+            break;
     }
     /* Build the result through the VALUE, not the borrowed ptr (rb_str_subseq
      * allocates, so the ptr must not be what it copies from). */
@@ -379,26 +407,24 @@ mkr_ruby_str_known_valid_utf8(VALUE str)
 const char *
 mkr_ruby_try_verified_text(VALUE sv, size_t max_bytes, mkr_ruby_borrowed_text_t *out)
 {
-    /* ALLOCATION-FREE, like mkr_verify_text: the returned borrow must not have
-     * crossed a Ruby allocation (the former rb_utf8_str_new + valid_encoding?
-     * funcall allocated twice with `ptr` already taken). */
-    long len = RSTRING_LEN(sv);
-    if ((size_t)len > max_bytes) {
+    /* ALLOCATION-FREE, like mkr_verify_text (see mkr_text_check): the returned
+     * borrow must not have crossed a Ruby allocation. */
+    size_t len = (size_t)RSTRING_LEN(sv);
+    if (len > max_bytes) {
         return "string exceeds the maximum length";
     }
     const char *ptr = RSTRING_PTR(sv);
-    mkr_span_t view = mkr_span(ptr, (size_t)len);
-    size_t nul_at;
-    if (mkr_span_find(&view, '\0', &nul_at)) {
-        return "string contains a NUL byte";
-    }
-    if (!mkr_ruby_str_known_valid_utf8(sv)
-        && !mkr_utf8_valid((const unsigned char *)ptr, (size_t)len)) {
-        return "string is not valid UTF-8";
+    switch (mkr_text_check(sv, ptr, len)) {
+        case MKR_TEXT_HAS_NUL:
+            return "string contains a NUL byte";
+        case MKR_TEXT_INVALID_UTF8:
+            return "string is not valid UTF-8";
+        case MKR_TEXT_OK:
+            break;
     }
     out->value = sv;
     out->ptr   = ptr;
-    out->len   = (size_t)len;
+    out->len   = len;
     return NULL;
 }

data/ext/makiri/core/mkr_alloc.h CHANGED Viewed

@@ -4,7 +4,7 @@
 /*
  * Fail-closed memory primitives: overflow-checked size arithmetic and
  * allocators, the foundation every other C layer (glue, xpath engine,
- * lexbor_compat) builds on, so the ad-hoc `cap *= 2` / `n + 1` /
+ * dom_adapter) builds on, so the ad-hoc `cap *= 2` / `n + 1` /
  * `malloc(n * sizeof(T))` patterns are written once, here, and fail closed.
  * NOTHING in this header touches Ruby - exception mapping happens at the glue
  * boundary. (mkr_core.h is a thin umbrella over this + the other core headers.)

data/ext/makiri/core/mkr_utf8.c CHANGED Viewed

@@ -1,6 +1,6 @@
 /* mkr_utf8.c - the shared pure-C UTF-8 validator. Ruby-free, allocation-free.
  * See mkr_utf8.h for the contract and why it lives in core. Moved verbatim from
- * lexbor_compat/utf8_input.c (whose sanitiser fast path now calls this). */
+ * dom_adapter/utf8_input.c (whose sanitiser fast path now calls this). */
 #include "mkr_utf8.h"
 #include <string.h>   /* memcpy for the word-at-a-time ASCII scan */

data/ext/makiri/core/mkr_utf8.h CHANGED Viewed

@@ -13,7 +13,7 @@
  * separately (memchr).
  *
  * This lives in core so the Ruby bridge (mkr_verify_text - the strict
- * programmatic-input gate) and the HTML input sanitiser (lexbor_compat/
+ * programmatic-input gate) and the HTML input sanitiser (dom_adapter/
  * utf8_input.c fast path) share a single implementation, and so the bridge's
  * validation never allocates: a borrowed RSTRING pointer must not be held
  * across a Ruby allocation (= GC point), so the validator the bridge runs

data/ext/makiri/{lexbor_compat → dom_adapter}/compat.h RENAMED Viewed

@@ -67,7 +67,7 @@ int mkr_utf8_sanitize(const lxb_char_t *src, size_t len,
 void mkr_parsed_destroy(mkr_parsed_t *p);
-/* ---- attribute -> owner element index (lexbor_compat/dom_index.c) ----
+/* ---- attribute -> owner element index (dom_adapter/dom_index.c) ----
  *
  * Lexbor sets neither lxb_dom_attr_t::owner nor attr->node.parent, so an
  * attribute node has no usable back-pointer to its element. We build our own
@@ -94,7 +94,7 @@ int mkr_parsed_dom_index_build(mkr_parsed_t *p);
  * removing a subtree). */
 void mkr_parsed_dom_index_invalidate(mkr_parsed_t *p);
-/* ---- element index: tag id -> elements (lexbor_compat/dom_index.c) ----
+/* ---- element index: tag id -> elements (dom_adapter/dom_index.c) ----
  *
  * Co-built with the attr->owner index in the same document walk (same object,
  * same lazy build, same invalidation). Groups every element by tag id in
@@ -119,7 +119,7 @@ lxb_dom_node_t *const *mkr_element_index_tag(const void *idx, lxb_tag_id_t tag_i
  * (fail safe) for a NULL index. */
 int mkr_element_index_has_foreign(const void *idx);
-/* ---- source location (lexbor_compat/source_loc.c) ----
+/* ---- source location (dom_adapter/source_loc.c) ----
  *
  * mkr_parse_html drives Lexbor's low-level parser pipeline and chains the
  * tokenizer's token-done callback so we can record the byte offset of every
@@ -156,7 +156,7 @@ size_t mkr_parsed_node_line(mkr_parsed_t *p, lxb_dom_node_t *node);
  * mkr_parsed_destroy; exposed so post_parse.c need not see the index layout. */
 void mkr_dom_index_free(void *idx);
-/* ---- text-extraction index (lexbor_compat/text_index.c) ----
+/* ---- text-extraction index (dom_adapter/text_index.c) ----
  *
  * Maps a node to the contiguous run of document-order TEXT/CDATA byte slices
  * its subtree owns, so Node#text / XPath string-value can serve a pre-sized

data/ext/makiri/{lexbor_compat → dom_adapter}/compat_internal.h RENAMED Viewed

@@ -2,7 +2,7 @@
 #define MAKIRI_COMPAT_INTERNAL_H
 /* Low-level helpers shared across the extension's C translation units (the
- * lexbor_compat layer and the Ruby↔C glue) but not part of the compat public
+ * dom_adapter layer and the Ruby↔C glue) but not part of the compat public
  * API in compat.h. */
 #include <lexbor/dom/dom.h>