makiri 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/valgrind.yml +49 -46
  3. data/CHANGELOG.md +68 -1
  4. data/README.md +14 -0
  5. data/Rakefile +13 -0
  6. data/ext/makiri/bridge/ruby_string.c +80 -54
  7. data/ext/makiri/core/mkr_alloc.h +1 -1
  8. data/ext/makiri/core/mkr_utf8.c +1 -1
  9. data/ext/makiri/core/mkr_utf8.h +1 -1
  10. data/ext/makiri/{lexbor_compat → dom_adapter}/compat.h +4 -4
  11. data/ext/makiri/{lexbor_compat → dom_adapter}/compat_internal.h +1 -1
  12. data/ext/makiri/dom_adapter/cross_import.c +434 -0
  13. data/ext/makiri/dom_adapter/cross_import.h +35 -0
  14. data/ext/makiri/{lexbor_compat → dom_adapter}/text_index.c +1 -1
  15. data/ext/makiri/fuzz/Makefile +1 -1
  16. data/ext/makiri/glue/cross_import.h +30 -0
  17. data/ext/makiri/glue/glue.h +1 -1
  18. data/ext/makiri/glue/ruby_doc.c +11 -3
  19. data/ext/makiri/glue/ruby_html_mutate.c +6 -0
  20. data/ext/makiri/glue/ruby_html_node.c +1 -1
  21. data/ext/makiri/glue/ruby_lexbor_css.c +462 -0
  22. data/ext/makiri/glue/ruby_node.c +14 -0
  23. data/ext/makiri/glue/ruby_xml.c +31 -2
  24. data/ext/makiri/glue/ruby_xml_node.c +87 -2
  25. data/ext/makiri/glue/ruby_xpath.c +16 -1
  26. data/ext/makiri/makiri.c +3 -0
  27. data/ext/makiri/makiri.h +5 -0
  28. data/ext/makiri/xml/mkr_xml.h +5 -0
  29. data/ext/makiri/xml/mkr_xml_chars.c +22 -0
  30. data/ext/makiri/xml/mkr_xml_mutate.c +160 -50
  31. data/ext/makiri/xml/mkr_xml_mutate.h +24 -0
  32. data/ext/makiri/xml/mkr_xml_tree.c +63 -27
  33. data/ext/makiri/xpath/mkr_xpath.c +28 -0
  34. data/ext/makiri/xpath/mkr_xpath.h +5 -1
  35. data/ext/makiri/xpath/mkr_xpath_eval_body.h +11 -1
  36. data/lib/makiri/html/document.rb +11 -12
  37. data/lib/makiri/html/node_methods.rb +0 -1
  38. data/lib/makiri/node_set.rb +14 -9
  39. data/lib/makiri/processing_instruction.rb +8 -0
  40. data/lib/makiri/version.rb +1 -1
  41. data/lib/makiri/xml/builder.rb +29 -21
  42. data/lib/makiri/xpath_context.rb +12 -4
  43. data/script/check_c_safety.rb +1 -1
  44. data/script/check_c_safety_allowlist.yml +8 -5
  45. data/script/leaks_harness.rb +7 -0
  46. data/suppressions/ruby.supp +140 -0
  47. metadata +13 -8
  48. /data/ext/makiri/{lexbor_compat → dom_adapter}/dom_index.c +0 -0
  49. /data/ext/makiri/{lexbor_compat → dom_adapter}/post_parse.c +0 -0
  50. /data/ext/makiri/{lexbor_compat → dom_adapter}/source_loc.c +0 -0
  51. /data/ext/makiri/{lexbor_compat → dom_adapter}/utf8_input.c +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 30e3037756fec29474a8fb0c62e38d06a3337bba9c3ad844e6bdcfc02cff5026
4
- data.tar.gz: 5b2f2a2887019261a359a64c35bddd36eb076a8c6a4f145c1a2ec1d84f679be6
3
+ metadata.gz: 27ac120b94ab835caee9bbb50a1cee71b19e339dde2384496db9608e58b3269b
4
+ data.tar.gz: 27b8ea683abe8854e6c68269413d4858e0f2fedfdd04f04d8fa91130b9b05ac1
5
5
  SHA512:
6
- metadata.gz: 2b6bf4ed94ae428e23bcb3af8ec71febda16c83c5187ffb8cadd3698327681d951ead4b68d7df78328e8eeeec82a09de310d862cd110323c2874ac1fb0adf62c
7
- data.tar.gz: 9ebd0a7562d7ff5541ead1e61f4dae1719257b29ff8cf4414e5ca2c23e2c708849c360b26471fc132bebd0ddd9cd5866ec11ab44aec4ac4044cdba10958a1038
6
+ metadata.gz: 84754fb994af236692bdbc281cb0cba89a8cd6d7c75e2caa4e16ebe9b1efa6c4cbd270409be2957e461db4909bbabc32296ed44e185ccaa8985a0c285f25846c
7
+ data.tar.gz: c3fba2792720ad30d1bee90343e4ae7877bd9871195620ce667281fffeb36e994b3e715a5f456327aaa4e8de1e80e3c24c7e6a898739c660f4d1c8d5ffa51c60
data/.github/workflows/valgrind.yml CHANGED
@@ -1,9 +1,18 @@
1
1
  name: Valgrind + GC.compact
2
2
 
3
3
  on:
4
- # Nightly: these jobs are heavy (Valgrind is ~10-50x slower, GC.stress is ~10x
5
- # slower) and check structural properties that do not vary by day-to-day code
6
- # churn, so run them on a schedule rather than on every push/PR.
4
+ # Valgrind memcheck ALSO runs on push to main: it is the only check without a
5
+ # frequency threshold (any "definitely lost" / uninitialised-value use fails,
6
+ # unlike the PR-level macOS leak gate, which only flags stacks repeated >=30x),
7
+ # so a leak on a rarely-hit error path slips past the PR gates and would
8
+ # otherwise surface only on the next nightly. Running it post-merge catches such
9
+ # regressions within ~30 min without adding ~20 min to every PR. (It is gated to
10
+ # main only, not pull_request, to keep PR latency low.)
11
+ #
12
+ # The GC.stress job stays nightly-only (see its `if:` below): it is heavy and
13
+ # checks structural properties that do not vary by day-to-day churn.
14
+ push:
15
+ branches: [main, master]
7
16
  schedule:
8
17
  - cron: "0 2 * * *"
9
18
  workflow_dispatch:
@@ -61,32 +70,44 @@ jobs:
61
70
  - name: Run spec suite under Valgrind (ruby_memcheck)
62
71
  run: bundle exec rake spec:valgrind
63
72
 
64
- # GC.auto_compact + GC.stress run of the full spec suite. This structurally
65
- # tests the borrowed-pointer discipline under the condition that Ruby Strings
66
- # actually move (compaction) and that every allocation triggers a full GC
67
- # cycle (stress). Failures here are typically use-after-move or stale
73
+ # GC.auto_compact + GC.stress over the GC-sensitive examples. This
74
+ # structurally tests the borrowed-pointer discipline under the condition that
75
+ # Ruby Strings actually move (compaction) and that every allocation triggers a
76
+ # full GC cycle (stress). Failures here are typically use-after-move or stale
68
77
  # pointer bugs in the C extension or bridge layer.
69
78
  #
70
- # THREADING is deliberately OFF here. The :threading suite (spec/threading_spec.rb)
71
- # is 8 threads x tens of iterations, and forcing the job-level GC.stress onto it
72
- # means a full GC per allocation across every thread - which made this job run
73
- # for 30+ minutes without finishing. It also adds little: that suite already
74
- # runs in ci.yml (ubuntu/3.4), and its GC-sensitive examples opt into GC.stress
75
- # themselves via their own `around` hook, so cross-thread interactions are
76
- # covered there. This job's unique value is the *single-threaded* full suite
77
- # under stress+compaction, which catches use-after-move across every code path.
79
+ # Scope: only the examples tagged `:gc_compact` (the `memory safety` blocks in
80
+ # css/xpath/serialize/mutation/source_location/xpath_handler/api_compat2 +
81
+ # attribute's lazy-index example). Those are the examples written to exercise
82
+ # the borrowed-pointer paths. `GC_COMPACT_STRESS=1` makes spec_helper set
83
+ # `GC.auto_compact = true` process-wide and wrap every example in `GC.stress`,
84
+ # so each allocation inside a tagged example triggers a *compacting* GC - the
85
+ # strongest form of the use-after-move test. The high-volume churn loops
86
+ # (parse/drop cycles) scale their iteration count down under stress
87
+ # (`gc_churn_iters` / `GC_COMPACT_ITERS`) because each stressed iteration is
88
+ # orders of magnitude heavier; `GC_COMPACT_ITERS` below tunes the total runtime
89
+ # (~6-9 min on CI at 200). An earlier version forced GC.stress onto the
90
+ # *entire* suite (~800 examples): it ran 1h40m+ and never finished, while
91
+ # testing borrowed-pointer discipline on hundreds of examples that have none.
92
+ # The rest of the suite still runs in ci.yml.
93
+ #
94
+ # THREADING is deliberately OFF here. The :threading suite is 8 threads x tens
95
+ # of iterations; it runs in ci.yml and its GC-sensitive examples opt into
96
+ # GC.stress themselves, so cross-thread interactions are covered there.
78
97
  gc-compact-stress:
79
- # Temporarily disabled, too long
80
- if: false
98
+ # Nightly / on-demand only - not on push (the valgrind job is the post-merge
99
+ # gate; GC.stress is heavy and structural, so it does not need per-push runs).
100
+ if: github.event_name != 'push'
81
101
  name: GC.auto_compact + GC.stress (Ruby ${{ matrix.ruby }})
82
102
  runs-on: ubuntu-latest
83
- timeout-minutes: 360
103
+ timeout-minutes: 30
84
104
  env:
85
- # As in the Valgrind job: GC.stress (a full GC per allocation) makes the
86
- # 300-iteration PBT sweep run for hours, and these jobs check memory
87
- # discipline rather than the property space, so trim the iteration count.
88
- PBT_COUNT: "15"
89
- CSS_PBT_COUNT: "15"
105
+ GC_COMPACT_STRESS: "1"
106
+ # Per-iteration cost under per-allocation compacting GC is ~1000x normal, so
107
+ # the churn loops run this many iterations (vs their normal 200-1000). Tunes
108
+ # the job's runtime; raise for more coverage, lower if it approaches the
109
+ # timeout.
110
+ GC_COMPACT_ITERS: "200"
90
111
  strategy:
91
112
  fail-fast: false
92
113
  matrix:
@@ -110,26 +131,8 @@ jobs:
110
131
  - name: Compile the extension
111
132
  run: bundle exec rake compile
112
133
 
113
- # GC.stress is scoped to each example via an around hook rather than set
114
- # process-wide: under a global GC.stress, even requiring the 88 spec files
115
- # runs a full GC per allocation, so loading alone took tens of minutes and
116
- # the job never reached the first example. auto_compact stays global so
117
- # objects actually move during those stressed examples (the point of the
118
- # job), while loading/collection runs at normal speed.
119
- - name: Run spec suite under GC.auto_compact + GC.stress
120
- run: |
121
- bundle exec ruby -Ilib -e '
122
- GC.auto_compact = true
123
- require "rspec/core"
124
- RSpec.configure do |c|
125
- c.around(:each) do |example|
126
- GC.stress = true
127
- begin
128
- example.run
129
- ensure
130
- GC.stress = false
131
- end
132
- end
133
- end
134
- exit RSpec::Core::Runner.run(ARGV)
135
- ' spec
134
+ # GC_COMPACT_STRESS=1 (set in env above) makes spec_helper enable
135
+ # auto_compact globally and wrap each example in GC.stress; --tag gc_compact
136
+ # limits the run to the borrowed-pointer examples.
137
+ - name: Run GC-sensitive examples under GC.auto_compact + GC.stress
138
+ run: bundle exec rspec --tag gc_compact spec
data/CHANGELOG.md CHANGED
@@ -5,6 +5,72 @@ All notable changes to this project will be documented in this file.
5
5
  The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.5.0] - 2026-06-14
9
+
10
+ ### Fixed
11
+
12
+ * Use-after-free when an XPath custom-function handler mutated the same
13
+ `XPathContext` (`register_*` / `node=`) mid-`evaluate`: such re-entrant context
14
+ mutation is now refused instead of invalidating the running evaluation's state.
15
+
16
+ * `Node#name=` now invalidates the element-name index, so a later `//tag` query
17
+ reflects the rename instead of seeing a stale bucket.
18
+
19
+ * XML processing-instruction targets now follow XML 1.0 §2.6: a PITarget is a
20
+ `Name`, not an NCName, so a colon is permitted (`<?a:b ...?>` parses, and
21
+ `create_processing_instruction("a:b", ...)` succeeds). Only the reserved `xml`
22
+ (any case) is still rejected. Previously a colon in a PI target was rejected as
23
+ not-well-formed, which was stricter than the spec (a PI target is not subject to
24
+ namespace processing).
25
+
26
+ * Memory leaks of the internal XPath evaluation context on error / edge paths: a
27
+ `Makiri::XML` `#css` / `#xpath` / `#at_xpath` whose selector or expression failed
28
+ the text-input contract leaked the context (it is now verified BEFORE the context
29
+ is allocated), and a context could leak if building the Ruby result raised (it is
30
+ now freed before conversion).
31
+
32
+ ### Added
33
+
34
+ * `ProcessingInstruction#target` on the XML node (the PI's target name).
35
+
36
+ * Cross-kind `Document#import_node(node, deep = false)`. `import_node` now
37
+ translates a subtree across representations: `Makiri::XML::Document#import_node`
38
+ (newly added) imports an HTML (Lexbor) node by translating it to the XML node
39
+ representation, and `Makiri::HTML::Document#import_node` likewise translates an
40
+ XML node to HTML. Same-representation imports keep working (HTML to HTML via
41
+ Lexbor, XML to XML via the arena deep/shallow copy). The result is a detached
42
+ copy owned by the target document; the source is untouched. Elements (with
43
+ attributes), text, comment, and processing-instruction nodes translate both
44
+ ways, and an HTML `<template>`'s contents (which HTML keeps in a separate
45
+ fragment) are carried across rather than silently dropped; an XML CDATA section
46
+ has no HTML counterpart, so translating one into an HTML document fails closed
47
+ (`Makiri::Error`). Namespaces are preserved across the translation: HTML->XML
48
+ synthesizes the xmlns declarations needed to reproduce each node's namespace
49
+ (so e.g. an inline `<svg>` stays in the SVG namespace and HTML elements in the
50
+ XHTML namespace), and XML->HTML maps the namespace URI back to a Lexbor
51
+ namespace id, interning any URI (not only the ones Lexbor knows by default) so
52
+ custom namespaces survive too. An HTML-namespaced `<template>`'s content is
53
+ placed in its content fragment (HTMLTemplateElement.content), like a parsed
54
+ template. The other node-argument mutators
55
+ (`add_child`/`before`/`after`/`replace`/`fragment`) still reject a foreign-kind
56
+ node; `import_node` is the one sanctioned crossing point.
57
+
58
+ * `set_attribute_ns(namespace, qualified_name, value)` and
59
+ `remove_attribute_ns(namespace, local_name)` on `Makiri::XML` elements - the DOM
60
+ setAttributeNS / removeAttributeNS, keyed on the (explicit namespace, local name)
61
+ pair so two attributes with the same qualified name in different namespaces
62
+ coexist (a null/"" namespace is the null namespace).
63
+
64
+ * `Makiri::Lexbor::CSS.parse_stylesheet(text)`, a thin binding over Lexbor's
65
+ CSS stylesheet parser that returns the parsed rules as plain Ruby primitives
66
+ (`{type: :style, selectors: [{text:, specificity: [a,b,c]}, ...],
67
+ declarations: [{name:, value:, important:}, ...]}` and nested
68
+ `{type: :media, condition:, rules: [...]}`, in source order). Selector
69
+ specificity and value normalization come from Lexbor; `css-syntax-3` error
70
+ recovery means a broken stylesheet yields its valid rules instead of raising.
71
+ Hosts the new `Makiri::Lexbor::*` namespace (the unabstracted lexbor-native
72
+ surface, distinct from the Nokogiri-compatible `Makiri::*`).
73
+
8
74
  ## [0.4.0] - 2026-06-12
9
75
 
10
76
  ### Added
@@ -296,7 +362,8 @@ libxml2 / libxslt dependency at any layer**.
296
362
  domxpath, CSS differential vs `Nokogiri::HTML5`). GitHub Actions CI across
297
363
  Ruby 3.2–4.0 × Ubuntu/macOS plus a sanitizer job.
298
364
 
299
- [Unreleased]: https://github.com/takahashim/makiri/compare/v0.4.0...HEAD
365
+ [Unreleased]: https://github.com/takahashim/makiri/compare/v0.5.0...HEAD
366
+ [0.5.0]: https://github.com/takahashim/makiri/compare/v0.4.0...v0.5.0
300
367
  [0.4.0]: https://github.com/takahashim/makiri/compare/v0.3.0...v0.4.0
301
368
  [0.3.0]: https://github.com/takahashim/makiri/compare/v0.2.0...v0.3.0
302
369
  [0.2.0]: https://github.com/takahashim/makiri/compare/v0.1.0...v0.2.0
data/README.md CHANGED
@@ -141,6 +141,14 @@ XML subtrees can be built with `Document#create_element` and related node factor
141
141
  then inserted with `#add_child`, `#before`, `#after`, or `#replace`;
142
142
  namespaces are resolved at insertion time, and cross-document nodes are deep-copied.
143
143
 
144
+ `Document#import_node(node, deep = false)` brings a node into a document as a
145
+ detached copy, and works **across representations**: importing a `Makiri::HTML`
146
+ node into a `Makiri::XML::Document` (or vice versa) translates the subtree between
147
+ the two node representations, preserving namespaces (e.g. an inline `<svg>` keeps
148
+ the SVG namespace, HTML elements the XHTML namespace; custom namespaces are
149
+ preserved across both directions). An XML CDATA section has no HTML counterpart,
150
+ so importing one into an HTML document raises.
151
+
144
152
  ```ruby
145
153
  doc = Makiri::XML(%(<feed xmlns="urn:a" xmlns:dc="urn:dc"/>))
146
154
  entry = doc.create_element("entry")
@@ -226,6 +234,12 @@ Detailed, test-backed notes live in `spec/conformance/README.md`.
226
234
  markup string straight to `#add_child` is unsupported (parse it into a fragment
227
235
  first). (`#to_xml` serialization is supported; HTML serialization - `to_html`
228
236
  / `inner_html` / `outer_html` - is not.)
237
+ * A colon in a processing-instruction target is well-formed (`<?a:b ...?>` parses).
238
+ * XML 1.0 §2.6: a `PITarget` is a `Name`, not an NCName, and Namespaces in XML
239
+ 1.0's normative conformance section constrains only element/attribute names
240
+ (QNames), never PI targets. Nokogiri/libxml2 rejects it (`colons are forbidden
241
+ from PI names`); Makiri follows the normative text. Only the reserved `xml`
242
+ (any case) target is rejected.
229
243
  * Otherwise the parsed tree is byte-identical to `Nokogiri::XML`'s (verified by
230
244
  the property-based differential), including namespaces, prolog/epilog comments
231
245
  and PIs, and adjacent-CDATA coalescing.
data/Rakefile CHANGED
@@ -59,6 +59,19 @@ task default: %i[compile spec]
59
59
  # *our* code: a real uninit/invalid access in mkr_*/Lexbor still has a makiri frame
60
60
  # and is still reported.
61
61
  #
62
+ # BUT the binary-touch filter is too coarse for one residual class: when a GC
63
+ # cycle fires *inside* one of our allocations (or marks through our mark
64
+ # callback), CRuby's conservative collector legitimately reads uninitialised
65
+ # words (machine-stack scan reading stale frames, incremental mark/sweep reading
66
+ # not-yet-written RVALUE flags) while a makiri frame sits on the stack - so ~190
67
+ # of these pure-Ruby-GC false positives pass the filter. The gem's bundled
68
+ # ruby.supp only covers `each_location*` under Addr8, not the Cond/Value8 reads
69
+ # we hit. `suppressions/ruby.supp` (auto-loaded by ruby_memcheck: it globs
70
+ # `<dir>/<ruby-version>.supp`, and the bare `ruby.supp` matches every version)
71
+ # suppresses exactly those GC-driver-anchored uninit reads, plus the VM
72
+ # method-cache id_table the interpreter never frees before exit. A real uninit
73
+ # read in our code does not descend from a GC driver, so it still fails.
74
+ #
62
75
  # Guarded: ruby_memcheck lives in the optional :valgrind bundler group, so a
63
76
  # normal `bundle exec rake` (without that group) must not fail to load.
64
77
  begin
data/ext/makiri/bridge/ruby_string.c CHANGED
@@ -45,33 +45,61 @@ mkr_ruby_str_from_borrowed(mkr_borrowed_text_t text)
45
45
  return rb_utf8_str_new(text.ptr, (long)text.len);
46
46
  }
47
47
 
48
- void
49
- mkr_verify_text(VALUE str, const char *what)
50
- {
51
- /* ALLOCATION-FREE by design: this gate runs between a caller taking a
52
- * borrowed RSTRING pointer and using it, so it must not be a GC point. The
53
- * former implementation built a throwaway Ruby String (rb_enc_str_new) to
54
- * ask for its coderange - a Ruby allocation inside every borrow, which both
55
- * passed the borrowed ptr into an allocating call and opened a GC window
56
- * under every OTHER borrow already held at multi-borrow call sites. Bytes
57
- * are validated as UTF-8 regardless of the String's declared encoding,
58
- * exactly as before. */
59
- long len = RSTRING_LEN(str);
60
- const char *ptr = RSTRING_PTR(str);
48
+ /* The shared core of Makiri's strict text contract: no NUL byte, valid UTF-8.
49
+ * Returns the specific violation (or MKR_TEXT_OK); each caller maps the verdict
50
+ * to its own error surface (Makiri::Error, XML::SyntaxError, or a reason string).
51
+ *
52
+ * ALLOCATION-FREE BY DESIGN, which every caller relies on: it runs between a
53
+ * caller taking a borrowed RSTRING pointer and using it, so it must not be a GC
54
+ * point. (The former per-caller implementations each built a throwaway Ruby
55
+ * String (rb_enc_str_new) to read its coderange - a Ruby allocation inside every
56
+ * borrow, which both passed the borrowed ptr into an allocating call and opened a
57
+ * GC window under every OTHER borrow already held at multi-borrow call sites.)
58
+ *
59
+ * +coderange_str+ is the String consulted for its CACHED coderange (no scan, no
60
+ * alloc); +ptr+/+len+ are the bytes validated. They may differ: the XML path
61
+ * passes the whole decoded String for the coderange but a BOM-stripped suffix as
62
+ * the bytes (the BOM is one complete UTF-8 char, so a whole-string VALID
63
+ * coderange still proves the suffix valid). Bytes are validated as UTF-8
64
+ * regardless of the String's declared encoding. */
65
+ typedef enum {
66
+ MKR_TEXT_OK = 0,
67
+ MKR_TEXT_HAS_NUL,
68
+ MKR_TEXT_INVALID_UTF8,
69
+ } mkr_text_verdict_t;
61
70
 
62
- mkr_span_t sv = mkr_span(ptr, (size_t)len);
71
+ static mkr_text_verdict_t
72
+ mkr_text_check(VALUE coderange_str, const char *ptr, size_t len)
73
+ {
74
+ mkr_span_t sv = mkr_span(ptr, len);
63
75
  size_t nul_at;
64
76
  if (mkr_span_find(&sv, '\0', &nul_at)) {
65
- rb_raise(mkr_eError, "%s must not contain a NUL byte", what);
77
+ return MKR_TEXT_HAS_NUL;
66
78
  }
67
-
68
79
  /* Cached-coderange fast path (reads flags, never scans, never allocates);
69
- * NUL is valid UTF-8, so the memchr above stays either way. */
70
- if (mkr_ruby_str_known_valid_utf8(str)) {
71
- return;
80
+ * NUL is valid UTF-8, so the find above stays either way. */
81
+ if (mkr_ruby_str_known_valid_utf8(coderange_str)) {
82
+ return MKR_TEXT_OK;
83
+ }
84
+ if (!mkr_utf8_valid((const unsigned char *)ptr, len)) {
85
+ return MKR_TEXT_INVALID_UTF8;
72
86
  }
73
- if (!mkr_utf8_valid((const unsigned char *)ptr, (size_t)len)) {
74
- rb_raise(mkr_eError, "%s must be valid UTF-8", what);
87
+ return MKR_TEXT_OK;
88
+ }
89
+
90
+ void
91
+ mkr_verify_text(VALUE str, const char *what)
92
+ {
93
+ const char *ptr = RSTRING_PTR(str);
94
+ size_t len = (size_t)RSTRING_LEN(str);
95
+
96
+ switch (mkr_text_check(str, ptr, len)) {
97
+ case MKR_TEXT_HAS_NUL:
98
+ rb_raise(mkr_eError, "%s must not contain a NUL byte", what);
99
+ case MKR_TEXT_INVALID_UTF8:
100
+ rb_raise(mkr_eError, "%s must be valid UTF-8", what);
101
+ case MKR_TEXT_OK:
102
+ break;
75
103
  }
76
104
  }
77
105
 
@@ -180,6 +208,12 @@ mkr_xml_bom_encoding(const unsigned char *p, long len, long *bom_len, long *stri
180
208
  return NULL;
181
209
  }
182
210
 
211
+ static int
212
+ mkr_decl_ws(int c)
213
+ {
214
+ return c == ' ' || c == '\t' || c == '\r' || c == '\n';
215
+ }
216
+
183
217
  /* The encoding named in the '<?xml ... encoding="NAME" ?>' declaration, or NULL.
184
218
  * The declaration is ASCII; for a UTF-16/32-detected document its bytes are
185
219
  * stride-interleaved, so the ASCII column is extracted (stride/off resolved by
@@ -190,12 +224,6 @@ mkr_xml_bom_encoding(const unsigned char *p, long len, long *bom_len, long *stri
190
224
  * of p is done: the stride/off geometry is passed in (rather than derived here
191
225
  * via rb_enc_find, which can autoload = a GC point), and the only rb_enc_find -
192
226
  * the final name lookup - runs after the bytes have been copied into head[]. */
193
- static int
194
- mkr_decl_ws(int c)
195
- {
196
- return c == ' ' || c == '\t' || c == '\r' || c == '\n';
197
- }
198
-
199
227
  static rb_encoding *
200
228
  mkr_xml_decl_encoding(const unsigned char *p, long len, long stride, long off)
201
229
  {
@@ -336,19 +364,19 @@ mkr_xml_decode_input(VALUE str, size_t max_bytes)
336
364
  rb_raise(mkr_eXmlLimitExceeded, "XML input exceeds the byte budget");
337
365
  }
338
366
 
339
- /* Strict UTF-8 validation, allocation-free - no GC point while `ptr` is
340
- * borrowed (the former rb_enc_str_new copy handed the borrow straight into
341
- * an allocating call): an embedded NUL or any invalid UTF-8 is fatal (no
342
- * U+FFFD repair - unlike the HTML mkr_utf8_sanitize path). A whole-string
343
- * cached coderange covers the BOM-stripped suffix too (the BOM is one
344
- * complete UTF-8 character). */
345
- size_t nul_at;
346
- if (mkr_span_find(&sv, '\0', &nul_at)) {
347
- rb_raise(mkr_eXmlSyntaxError, "XML input must not contain a NUL byte");
348
- }
349
- if (!mkr_ruby_str_known_valid_utf8(s)
350
- && !mkr_utf8_valid((const unsigned char *)ptr + off, (size_t)len)) {
351
- rb_raise(mkr_eXmlSyntaxError, "XML input must be valid UTF-8");
367
+ /* Strict UTF-8 validation via the shared, allocation-free core - no GC point
368
+ * while `ptr` is borrowed: an embedded NUL or any invalid UTF-8 is fatal (no
369
+ * U+FFFD repair - unlike the HTML mkr_utf8_sanitize path). The whole-string
370
+ * `s` is consulted for the cached coderange (it covers the BOM-stripped
371
+ * suffix too - the BOM is one complete UTF-8 character), while the validated
372
+ * bytes are the stripped suffix `ptr + off`. */
373
+ switch (mkr_text_check(s, ptr + off, (size_t)len)) {
374
+ case MKR_TEXT_HAS_NUL:
375
+ rb_raise(mkr_eXmlSyntaxError, "XML input must not contain a NUL byte");
376
+ case MKR_TEXT_INVALID_UTF8:
377
+ rb_raise(mkr_eXmlSyntaxError, "XML input must be valid UTF-8");
378
+ case MKR_TEXT_OK:
379
+ break;
352
380
  }
353
381
  /* Build the result through the VALUE, not the borrowed ptr (rb_str_subseq
354
382
  * allocates, so the ptr must not be what it copies from). */
@@ -379,26 +407,24 @@ mkr_ruby_str_known_valid_utf8(VALUE str)
379
407
  const char *
380
408
  mkr_ruby_try_verified_text(VALUE sv, size_t max_bytes, mkr_ruby_borrowed_text_t *out)
381
409
  {
382
- /* ALLOCATION-FREE, like mkr_verify_text: the returned borrow must not have
383
- * crossed a Ruby allocation (the former rb_utf8_str_new + valid_encoding?
384
- * funcall allocated twice with `ptr` already taken). */
385
- long len = RSTRING_LEN(sv);
386
- if ((size_t)len > max_bytes) {
410
+ /* ALLOCATION-FREE, like mkr_verify_text (see mkr_text_check): the returned
411
+ * borrow must not have crossed a Ruby allocation. */
412
+ size_t len = (size_t)RSTRING_LEN(sv);
413
+ if (len > max_bytes) {
387
414
  return "string exceeds the maximum length";
388
415
  }
389
416
  const char *ptr = RSTRING_PTR(sv);
390
- mkr_span_t view = mkr_span(ptr, (size_t)len);
391
- size_t nul_at;
392
- if (mkr_span_find(&view, '\0', &nul_at)) {
393
- return "string contains a NUL byte";
394
- }
395
- if (!mkr_ruby_str_known_valid_utf8(sv)
396
- && !mkr_utf8_valid((const unsigned char *)ptr, (size_t)len)) {
397
- return "string is not valid UTF-8";
417
+ switch (mkr_text_check(sv, ptr, len)) {
418
+ case MKR_TEXT_HAS_NUL:
419
+ return "string contains a NUL byte";
420
+ case MKR_TEXT_INVALID_UTF8:
421
+ return "string is not valid UTF-8";
422
+ case MKR_TEXT_OK:
423
+ break;
398
424
  }
399
425
  out->value = sv;
400
426
  out->ptr = ptr;
401
- out->len = (size_t)len;
427
+ out->len = len;
402
428
  return NULL;
403
429
  }
404
430
 
data/ext/makiri/core/mkr_alloc.h CHANGED
@@ -4,7 +4,7 @@
4
4
  /*
5
5
  * Fail-closed memory primitives: overflow-checked size arithmetic and
6
6
  * allocators, the foundation every other C layer (glue, xpath engine,
7
- * lexbor_compat) builds on, so the ad-hoc `cap *= 2` / `n + 1` /
7
+ * dom_adapter) builds on, so the ad-hoc `cap *= 2` / `n + 1` /
8
8
  * `malloc(n * sizeof(T))` patterns are written once, here, and fail closed.
9
9
  * NOTHING in this header touches Ruby - exception mapping happens at the glue
10
10
  * boundary. (mkr_core.h is a thin umbrella over this + the other core headers.)
data/ext/makiri/core/mkr_utf8.c CHANGED
@@ -1,6 +1,6 @@
1
1
  /* mkr_utf8.c - the shared pure-C UTF-8 validator. Ruby-free, allocation-free.
2
2
  * See mkr_utf8.h for the contract and why it lives in core. Moved verbatim from
3
- * lexbor_compat/utf8_input.c (whose sanitiser fast path now calls this). */
3
+ * dom_adapter/utf8_input.c (whose sanitiser fast path now calls this). */
4
4
  #include "mkr_utf8.h"
5
5
 
6
6
  #include <string.h> /* memcpy for the word-at-a-time ASCII scan */
data/ext/makiri/core/mkr_utf8.h CHANGED
@@ -13,7 +13,7 @@
13
13
  * separately (memchr).
14
14
  *
15
15
  * This lives in core so the Ruby bridge (mkr_verify_text - the strict
16
- * programmatic-input gate) and the HTML input sanitiser (lexbor_compat/
16
+ * programmatic-input gate) and the HTML input sanitiser (dom_adapter/
17
17
  * utf8_input.c fast path) share a single implementation, and so the bridge's
18
18
  * validation never allocates: a borrowed RSTRING pointer must not be held
19
19
  * across a Ruby allocation (= GC point), so the validator the bridge runs
data/ext/makiri/{lexbor_compat → dom_adapter}/compat.h CHANGED
@@ -67,7 +67,7 @@ int mkr_utf8_sanitize(const lxb_char_t *src, size_t len,
67
67
 
68
68
  void mkr_parsed_destroy(mkr_parsed_t *p);
69
69
 
70
- /* ---- attribute -> owner element index (lexbor_compat/dom_index.c) ----
70
+ /* ---- attribute -> owner element index (dom_adapter/dom_index.c) ----
71
71
  *
72
72
  * Lexbor sets neither lxb_dom_attr_t::owner nor attr->node.parent, so an
73
73
  * attribute node has no usable back-pointer to its element. We build our own
@@ -94,7 +94,7 @@ int mkr_parsed_dom_index_build(mkr_parsed_t *p);
94
94
  * removing a subtree). */
95
95
  void mkr_parsed_dom_index_invalidate(mkr_parsed_t *p);
96
96
 
97
- /* ---- element index: tag id -> elements (lexbor_compat/dom_index.c) ----
97
+ /* ---- element index: tag id -> elements (dom_adapter/dom_index.c) ----
98
98
  *
99
99
  * Co-built with the attr->owner index in the same document walk (same object,
100
100
  * same lazy build, same invalidation). Groups every element by tag id in
@@ -119,7 +119,7 @@ lxb_dom_node_t *const *mkr_element_index_tag(const void *idx, lxb_tag_id_t tag_i
119
119
  * (fail safe) for a NULL index. */
120
120
  int mkr_element_index_has_foreign(const void *idx);
121
121
 
122
- /* ---- source location (lexbor_compat/source_loc.c) ----
122
+ /* ---- source location (dom_adapter/source_loc.c) ----
123
123
  *
124
124
  * mkr_parse_html drives Lexbor's low-level parser pipeline and chains the
125
125
  * tokenizer's token-done callback so we can record the byte offset of every
@@ -156,7 +156,7 @@ size_t mkr_parsed_node_line(mkr_parsed_t *p, lxb_dom_node_t *node);
156
156
  * mkr_parsed_destroy; exposed so post_parse.c need not see the index layout. */
157
157
  void mkr_dom_index_free(void *idx);
158
158
 
159
- /* ---- text-extraction index (lexbor_compat/text_index.c) ----
159
+ /* ---- text-extraction index (dom_adapter/text_index.c) ----
160
160
  *
161
161
  * Maps a node to the contiguous run of document-order TEXT/CDATA byte slices
162
162
  * its subtree owns, so Node#text / XPath string-value can serve a pre-sized
data/ext/makiri/{lexbor_compat → dom_adapter}/compat_internal.h CHANGED
@@ -2,7 +2,7 @@
2
2
  #define MAKIRI_COMPAT_INTERNAL_H
3
3
 
4
4
  /* Low-level helpers shared across the extension's C translation units (the
5
- * lexbor_compat layer and the Ruby↔C glue) but not part of the compat public
5
+ * dom_adapter layer and the Ruby↔C glue) but not part of the compat public
6
6
  * API in compat.h. */
7
7
 
8
8
  #include <lexbor/dom/dom.h>