makiri 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/conformance.yml +22 -0
  3. data/.github/workflows/libfuzzer.yml +83 -0
  4. data/.github/workflows/release.yml +12 -7
  5. data/.github/workflows/security.yml +88 -3
  6. data/.github/workflows/valgrind.yml +135 -0
  7. data/CHANGELOG.md +152 -15
  8. data/README.md +183 -13
  9. data/Rakefile +294 -7
  10. data/ext/makiri/bridge/bridge.h +28 -0
  11. data/ext/makiri/bridge/ruby_string.c +282 -12
  12. data/ext/makiri/core/mkr_alloc.c +40 -3
  13. data/ext/makiri/core/mkr_alloc.h +28 -5
  14. data/ext/makiri/core/mkr_buf.c +47 -3
  15. data/ext/makiri/core/mkr_buf.h +112 -3
  16. data/ext/makiri/core/mkr_core.c +143 -0
  17. data/ext/makiri/core/mkr_core.h +11 -2
  18. data/ext/makiri/core/mkr_hash.h +1 -1
  19. data/ext/makiri/core/mkr_span.h +186 -0
  20. data/ext/makiri/core/mkr_text.h +8 -8
  21. data/ext/makiri/core/mkr_utf8.c +101 -0
  22. data/ext/makiri/core/mkr_utf8.h +88 -0
  23. data/ext/makiri/extconf.rb +123 -10
  24. data/ext/makiri/fuzz/Makefile +95 -0
  25. data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
  26. data/ext/makiri/fuzz/xml_fuzz.c +24 -0
  27. data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
  28. data/ext/makiri/glue/glue.h +55 -11
  29. data/ext/makiri/glue/ruby_doc.c +129 -59
  30. data/ext/makiri/glue/ruby_html_css.c +292 -0
  31. data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +248 -52
  32. data/ext/makiri/glue/ruby_html_node.c +859 -0
  33. data/ext/makiri/glue/ruby_html_serialize.c +154 -0
  34. data/ext/makiri/glue/ruby_node.c +74 -729
  35. data/ext/makiri/glue/ruby_node_set.c +167 -32
  36. data/ext/makiri/glue/ruby_xml.c +602 -0
  37. data/ext/makiri/glue/ruby_xml_node.c +1373 -0
  38. data/ext/makiri/glue/ruby_xpath.c +63 -30
  39. data/ext/makiri/glue/ruby_xpath.h +19 -0
  40. data/ext/makiri/lexbor_compat/compat.h +42 -9
  41. data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
  42. data/ext/makiri/lexbor_compat/dom_index.c +2 -2
  43. data/ext/makiri/lexbor_compat/post_parse.c +100 -10
  44. data/ext/makiri/lexbor_compat/source_loc.c +15 -13
  45. data/ext/makiri/lexbor_compat/text_index.c +14 -8
  46. data/ext/makiri/lexbor_compat/utf8_input.c +19 -33
  47. data/ext/makiri/makiri.c +184 -6
  48. data/ext/makiri/makiri.h +43 -2
  49. data/ext/makiri/xml/mkr_xml.h +125 -0
  50. data/ext/makiri/xml/mkr_xml_chars.c +195 -0
  51. data/ext/makiri/xml/mkr_xml_index.c +169 -0
  52. data/ext/makiri/xml/mkr_xml_index.h +48 -0
  53. data/ext/makiri/xml/mkr_xml_mutate.c +817 -0
  54. data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
  55. data/ext/makiri/xml/mkr_xml_node.c +399 -0
  56. data/ext/makiri/xml/mkr_xml_node.h +184 -0
  57. data/ext/makiri/xml/mkr_xml_tree.c +1515 -0
  58. data/ext/makiri/xpath/mkr_css.c +1023 -0
  59. data/ext/makiri/xpath/mkr_css.h +65 -0
  60. data/ext/makiri/xpath/mkr_xpath.c +96 -32
  61. data/ext/makiri/xpath/mkr_xpath.h +109 -4
  62. data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
  63. data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
  64. data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +551 -241
  65. data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +318 -276
  66. data/ext/makiri/xpath/mkr_xpath_internal.h +177 -206
  67. data/ext/makiri/xpath/mkr_xpath_lex.c +95 -125
  68. data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
  69. data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +145 -0
  70. data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
  71. data/ext/makiri/xpath/mkr_xpath_parse.c +83 -94
  72. data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
  73. data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
  74. data/ext/makiri/xpath/mkr_xpath_shared.c +609 -0
  75. data/ext/makiri/xpath/mkr_xpath_value_body.h +801 -0
  76. data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
  77. data/lib/makiri/{attribute.rb → attr.rb} +7 -3
  78. data/lib/makiri/cdata_section.rb +19 -0
  79. data/lib/makiri/comment.rb +10 -0
  80. data/lib/makiri/compat_aliases.rb +30 -0
  81. data/lib/makiri/document.rb +9 -73
  82. data/lib/makiri/document_fragment.rb +14 -9
  83. data/lib/makiri/element.rb +4 -4
  84. data/lib/makiri/html/document.rb +106 -0
  85. data/lib/makiri/html/node_methods.rb +19 -0
  86. data/lib/makiri/html.rb +12 -0
  87. data/lib/makiri/node.rb +58 -15
  88. data/lib/makiri/node_set.rb +8 -0
  89. data/lib/makiri/processing_instruction.rb +10 -0
  90. data/lib/makiri/text.rb +1 -1
  91. data/lib/makiri/version.rb +1 -1
  92. data/lib/makiri/xml/builder.rb +263 -0
  93. data/lib/makiri/xml/document.rb +24 -0
  94. data/lib/makiri/xml/node_methods.rb +84 -0
  95. data/lib/makiri/xml.rb +10 -0
  96. data/lib/makiri/xpath_context.rb +1 -1
  97. data/lib/makiri.rb +24 -5
  98. data/script/build_native_gem.rb +2 -2
  99. data/script/check_alloc_failures.rb +266 -0
  100. data/script/check_c_safety.rb +77 -2
  101. data/script/check_c_safety_allowlist.yml +102 -0
  102. data/script/check_leaks.rb +64 -0
  103. data/script/leaks_harness.rb +64 -0
  104. data/vendor/lexbor/CMakeLists.txt +6 -0
  105. data/vendor/lexbor/README.md +12 -0
  106. data/vendor/lexbor/config.cmake +1 -1
  107. data/vendor/lexbor/source/lexbor/core/base.h +1 -1
  108. data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
  109. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
  110. data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
  111. data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
  112. data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
  113. data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
  114. data/vendor/lexbor/source/lexbor/html/base.h +1 -1
  115. data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
  116. data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
  117. data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
  118. data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
  119. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
  120. data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
  121. data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
  122. data/vendor/lexbor/source/lexbor/url/base.h +1 -1
  123. data/vendor/lexbor/source/lexbor/url/url.c +5 -2
  124. data/vendor/lexbor/source/lexbor/url/url.h +9 -0
  125. data/vendor/lexbor/version +1 -1
  126. metadata +53 -9
  127. data/ext/makiri/glue/ruby_css.c +0 -185
  128. data/ext/makiri/glue/ruby_serialize.c +0 -92
  129. data/ext/makiri/xpath/mkr_xpath_value.c +0 -1286
  130. data/lib/makiri/cdata.rb +0 -6
data/README.md CHANGED
@@ -1,7 +1,9 @@
1
1
  # Makiri
2
2
 
3
- Standards-oriented HTML5 parsing, CSS selector querying, and XPath 1.0
4
- querying for Ruby, powered by Lexbor and a native XPath engine.
3
+ Makiri is a Ruby library for parsing and querying HTML and XML documents.
4
+
5
+ It uses [Lexbor](https://lexbor.com/) for HTML parsing and CSS selector matching, and includes a built-in native XPath 1.0 engine and XML 1.0 parser.
6
+ Makiri does not depend on libxml2.
5
7
 
6
8
  > [!WARNING]
7
9
  > Status: early release. APIs and behavior may change before v1.0.
@@ -13,13 +15,17 @@ XPath 1.0 evaluation in its own native engine, with no libxml2 dependency.
13
15
 
14
16
  * HTML5 parsing via [Lexbor](https://lexbor.com)
15
17
  * Makiri uses Lexbor as the parsing backend and provides a Ruby-facing DOM/query layer.
16
- * Lexbor-specific behavior is isolated in a thin compatibility layer
17
- (`ext/makiri/lexbor_compat/`).
18
18
  * CSS selector support via Lexbor
19
19
  * Supports Lexbor-backed standard CSS selector querying, including `:is`/`:where`/`:has`
20
20
  * Native XPath 1.0 engine
21
21
  * XPath is parsed and evaluated by Makiri's own engine, written from scratch.
22
22
  * Makiri does not depend on libxml2 for parsing, DOM representation, or XPath evaluation.
23
+ * Native XML 1.0 parser
24
+ * A strict, non-validating, fail-closed parser with its own node arena (not
25
+ Lexbor's HTML DOM), queried through the same native XPath engine, with
26
+ in-place tree edits (attributes, content, rename, remove).
27
+ * Conformance is checked against the W3C XML Conformance Test Suite, an XPath
28
+ differential, and property-based testing vs Nokogiri (see below).
23
29
  * Bounded, fail-closed execution
24
30
  * XPath evaluation is bounded by per-evaluation limits on work, memory, and recursion.
25
31
  * Ownership and borrowing are kept explicit across layers, with owned/borrowed
@@ -46,7 +52,7 @@ HTML
46
52
  doc.css("a").map { |a| a["href"] } # => ["/a", "/b"]
47
53
  doc.at_css("p.lead").text # => "Hello"
48
54
 
49
- # XPath 1.0 (native engine — no libxml2)
55
+ # XPath 1.0 (native engine - no libxml2)
50
56
  doc.xpath("//a").length # => 2
51
57
  doc.xpath("count(//a)") # => 2.0
52
58
  doc.at_xpath('//*[@id="main"]/p').text # => "Hello"
@@ -72,16 +78,120 @@ ctx.register_variable("cls", "lead")
72
78
  ctx.evaluate('//p[@class=$cls]').first.text # => "Hello"
73
79
  ```
74
80
 
81
+ ### XML (with in-place editing)
82
+
83
+ ```ruby
84
+ doc = Makiri::XML(<<~XML)
85
+ <feed xmlns="http://www.w3.org/2005/Atom">
86
+ <entry><title>Hello</title></entry>
87
+ <entry><title>World</title></entry>
88
+ </feed>
89
+ XML
90
+
91
+ # Namespace matching is strict, so a default namespace needs a registered prefix.
92
+ ns = { "a" => "http://www.w3.org/2005/Atom" }
93
+ doc.xpath("//entry").length # => 0 (default namespace)
94
+ doc.xpath("//a:entry", ns).length # => 2
95
+ doc.at_xpath("//a:entry/a:title", ns).text # => "Hello"
96
+
97
+ # Or reuse a context (caches registrations + compiled expressions):
98
+ ctx = Makiri::XPathContext.new(doc.root)
99
+ ctx.register_namespace("a", "http://www.w3.org/2005/Atom")
100
+ ctx.evaluate("//a:entry").length # => 2
101
+
102
+ el = doc.at_xpath("//a:entry", ns)
103
+ el.local_name # => "entry"
104
+ el.namespace_uri # => "http://www.w3.org/2005/Atom"
105
+
106
+ # CSS selectors work too (lowered to the native XPath engine): a bare type
107
+ # selector binds to the document's default namespace, so this just works.
108
+ doc.css("entry").length # => 2
109
+ doc.css("feed > entry").map { |e| e.at_css("title").text } # => ["Hello", "World"]
110
+
111
+ # Serialize back to XML
112
+ doc.to_xml # => "<?xml version=\"1.0\"?>\n<feed ...>...</feed>\n"
113
+ doc.at_xpath("//a:entry", ns).to_xml # => "<entry><title>Hello</title></entry>" (no declaration)
114
+ doc.to_xml(pretty: true) # indented, element-only content
115
+
116
+ # DOCTYPE is recognized but the DTD is not processed (no entities, no I/O):
117
+ dtd = Makiri::XML(%(<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0//EN" "x.dtd"><html/>))
118
+ .internal_subset
119
+ dtd.name # => "html"
120
+ dtd.external_id # => "-//W3C//DTD XHTML 1.0//EN" (alias: #public_id)
121
+ dtd.system_id # => "x.dtd"
122
+ ```
123
+
124
+ The tree supports in-place mutation.
125
+
126
+ ```ruby
127
+ doc = Makiri::XML(%(<feed xmlns:dc="urn:dc"><entry id="1">Hi</entry><draft/></feed>))
128
+ e = doc.at_xpath("//entry")
129
+
130
+ e["id"] = "9" # add or replace an attribute (value escaped on output)
131
+ e["dc:k"] = "v" # a prefixed name resolves against the in-scope xmlns
132
+ e.content = "Bye" # replace an element's children with text
133
+ e.name = "post" # rename in place (identity + namespace re-resolved)
134
+ e.delete("id") # remove an attribute
135
+ doc.at_xpath("//draft").remove
136
+
137
+ doc.root.to_xml # => "<feed xmlns:dc=\"urn:dc\"><post dc:k=\"v\">Bye</post></feed>"
138
+ ```
139
+
140
+ XML subtrees can be built with `Document#create_element` and related node factory methods,
141
+ then inserted with `#add_child`, `#before`, `#after`, or `#replace`;
142
+ namespaces are resolved at insertion time, and cross-document nodes are deep-copied.
143
+
144
+ ```ruby
145
+ doc = Makiri::XML(%(<feed xmlns="urn:a" xmlns:dc="urn:dc"/>))
146
+ entry = doc.create_element("entry")
147
+ entry["dc:id"] = "42" # prefixed attr resolves on insertion
148
+ entry.add_child(doc.create_element("title", "Hello"))
149
+ doc.root.add_child(entry)
150
+
151
+ doc.to_xml # => "...<entry dc:id=\"42\"><title>Hello</title></entry>..."
152
+ ```
153
+
154
+ `Makiri::XML::Builder` is the Nokogiri-compatible DSL over those factories.
155
+
156
+ ```ruby
157
+ builder = Makiri::XML::Builder.new do |xml|
158
+ xml.feed("xmlns" => "http://www.w3.org/2005/Atom", "xmlns:dc" => "urn:dc") do
159
+ xml.title("Example Feed")
160
+ xml.entry("dc:id" => "1") do
161
+ xml.title("First")
162
+ xml.summary { xml.cdata("raw <b>html</b>") }
163
+ end
164
+ end
165
+ end
166
+
167
+ builder.to_xml # the whole document (with XML declaration)
168
+ builder.doc # the Makiri::XML::Document being built
169
+ ```
170
+
171
+ XML parsing is bounded by an arena memory limit, 256 MiB by default,
172
+ which can be raised with `max_bytes:` for unusually large documents.
173
+
174
+ ```ruby
175
+ Makiri::XML(huge_xml, max_bytes: 512 * 1024 * 1024) # also Makiri::XML::Document.parse(..., max_bytes:)
176
+ ```
177
+
75
178
  ## Non-goals (v1.0)
76
179
 
77
- * XML parsing (HTML only).
180
+ * Passing a raw markup string straight to an insertion method
181
+ (`node.add_child("<x/>")`); parse it into a fragment first
182
+ (`Document#fragment` / `DocumentFragment.parse`). (Building XML from scratch
183
+ (`XML::Document.new` + `#root=`), the node factories - `Document#create_element`
184
+ etc. - fragments, node insertion (`#add_child` / `#before` / `#after` /
185
+ `#replace`), and `#to_xml` serialization ARE supported.)
78
186
  * XSLT, DTD / Schema / RelaxNG validation, XPointer, XInclude.
79
187
  * Streaming / SAX parsing.
80
188
  * Drop-in replacement for every Nokogiri method. Makiri covers the common
81
189
  HTML-scraping and manipulation surface. Deliberately not provided:
82
- - XML/XHTML serialization variants (`to_xml`, `to_xhtml`, `write_xml_to`)
190
+ - XHTML serialization variants (`to_xhtml`, `write_xml_to`); `#to_xml` is supported
83
191
  - XML/DTD construction (`create_internal_subset`, `external_subset`)
84
- - namespace introspection beyond `namespace-uri()` (`namespace_definitions`, `add_namespace`, `collect_namespaces`)
192
+ - namespace *mutation* (`add_namespace_definition`); read introspection
193
+ (`#namespace`, `#namespace_definitions`, `#namespaces`, `#collect_namespaces`)
194
+ is supported on `Makiri::XML` nodes
85
195
  - Nokogiri internals (`decorate`, `slop!`, `validate`).
86
196
 
87
197
  ## Differences from Nokogiri
@@ -103,16 +213,67 @@ Detailed, test-backed notes live in `spec/conformance/README.md`.
103
213
  * `namespace-uri()` of an HTML element returns the XHTML URI (DOM-correct, as browsers report)
104
214
  * `Nokogiri::HTML5` returns `""`.
105
215
 
216
+ ### XML
217
+
218
+ * `Makiri::XML` is **XML 1.0 only and non-validating**.
219
+ * A `version="1.1"` declaration is rejected; Nokogiri parses XML 1.1.
220
+ * The DTD is recognized but not processed: DTD-defined entities are not
221
+ expanded and DTD default attributes are not applied (Nokogiri/libxml2 can do
222
+ both). External entities/subsets are never fetched (no I/O).
223
+ * Mutation supports in-place edits, the node factories, fragments
224
+ (`Document#fragment` / `DocumentFragment.parse`), node insertion, and building
225
+ a document from scratch (`XML::Document.new` + `#root=`); only handing a raw
226
+ markup string straight to `#add_child` is unsupported (parse it into a fragment
227
+ first). (`#to_xml` serialization is supported; HTML serialization - `to_html`
228
+ / `inner_html` / `outer_html` - is not.)
229
+ * Otherwise the parsed tree is byte-identical to `Nokogiri::XML`'s (verified by
230
+ the property-based differential), including namespaces, prolog/epilog comments
231
+ and PIs, and adjacent-CDATA coalescing.
232
+
106
233
  ### CSS
107
234
 
108
- * jQuery/Nokogiri CSS extensions are not supported (`:contains`, `:gt`, `:lt`, `:eq`, `:first`, …)
109
- * Makiri uses Lexbor's standards-only selector engine.
110
- Use XPath (`xpath("//p[contains(., 'x')]")`) or Enumerable (`css('li')[1]`).
235
+ * Most jQuery/Nokogiri CSS extensions are not supported (`:gt`, `:lt`, `:eq`, `:first`, ...)
236
+ * Makiri uses Lexbor's selector engine, which is standards-based apart from one
237
+ text-containment extension. Use XPath (`xpath("//p[contains(., 'x')]")`) or
238
+ Enumerable (`css('li')[1]`) for the rest.
111
239
  Standard Level-4 selectors (`:is` / `:where` / `:has`) are supported, some of which Nokogiri rejects.
240
+ * `:lexbor-contains("text")` **is** supported (on both HTML and XML) - Lexbor's
241
+ spelling of the jQuery `:contains()` substring filter, matching an element
242
+ whose text contains the string; append ` i` (`:lexbor-contains("text" i)`)
243
+ for an ASCII case-insensitive match. (Nokogiri's name `:contains` is not an
244
+ alias.) Like Lexbor's matcher, it tests the element's **immediate child text
245
+ nodes** (not the deep string-value), so HTML and XML agree; on XML it lowers
246
+ to XPath `child::text()[contains(., "text")]`.
247
+ * Untyped `:*-of-type` (`:first-of-type`, `:nth-of-type(an+b)`, ... with no type
248
+ selector) is supported and correct on both HTML and XML - the "type" is the
249
+ element's own expanded name.
250
+ * Nokogiri (XML and HTML5) mistranslates these to first-/only-child
251
+ (`//*[position()=1]` / `//*[last()=1]`), so it under-matches; Makiri matches
252
+ Lexbor's HTML matcher.
112
253
  * Type selectors are ASCII case-insensitive (CSS-correct for HTML; `LI` matches `<li>`)
113
254
  * `Nokogiri::HTML5` is case-sensitive there.
114
- * Class/ID selectors are matched case-insensitively regardless of quirks mode (a Lexbor behaviour)
115
- * In a no-quirks document browsers and `Nokogiri::HTML5` match them case-sensitively.
255
+
256
+ ## Conformance
257
+
258
+ The XPath engine and XML parser are original code, so their correctness is checked by
259
+ differential and standards harnesses in `spec/conformance/`.
260
+ The HTML XPath and CSS suites are differentials against **`Nokogiri::HTML5`**
261
+ (Gumbo / WHATWG, never libxml2's non-conformant HTML4 parser): both sides parse
262
+ HTML5, so the DOM is isomorphic and results are compared node-for-node. HTML
263
+ parsing itself is checked against the WHATWG html5lib-tests corpus, and
264
+ XPath-over-HTML semantics additionally against browsers via a WPT port.
265
+ See also [`spec/conformance/README.md`](spec/conformance/README.md).
266
+
267
+ | Suite | Input | Oracle | `rake` task |
268
+ |---|---|---|---|
269
+ | HTML parsing | HTML | WHATWG html5lib-tests (expected-tree corpus) | `conformance:html5` |
270
+ | XPath 1.0 | HTML | `Nokogiri::HTML5` (libxml2 XPath) — differential | `conformance:xpath` |
271
+ | XPath over HTML | HTML | browsers (WPT `domxpath`, hand-ported; runs under `rake spec`) | — |
272
+ | CSS selectors | HTML | `Nokogiri::HTML5#css` — differential | `conformance:css` |
273
+ | Well-formedness | XML | W3C XML Conformance Test Suite | `conformance:xmlconf` |
274
+ | XPath 1.0 | XML | `Nokogiri::XML` — differential | `conformance:xpath_xml` |
275
+ | Parsed tree (property-based) | XML | `Nokogiri::XML` — differential | `conformance:xml_pbt` |
276
+ | CSS selectors | XML | `Nokogiri::XML` — differential | `conformance:css_xml` |
116
277
 
117
278
  ## Requirements
118
279
 
@@ -129,6 +290,15 @@ bundle exec rake compile
129
290
  bundle exec rake spec
130
291
  ```
131
292
 
293
+ ### Vendored Lexbor version
294
+
295
+ `vendor/lexbor` is pinned to `3a2d595` (`v3.0.0-25`), an untagged `master`
296
+ commit, for fixes that v3.0.0 lacks: two upstreamed CSS-selector fixes (class/ID
297
+ case-sensitivity in quirks mode, and prefix-less type-selector namespacing), a
298
+ heap-overflow fix in the `:lexbor-contains()` parser, and other post-v3.0.0
299
+ bugfixes. Lexbor stays vanilla; we return to a release tag once one ships after
300
+ v3.0.0. See `CLAUDE.md` for details.
301
+
132
302
  ## License
133
303
 
134
304
  Apache License 2.0. See [LICENSE](LICENSE) and [NOTICE](NOTICE).
data/Rakefile CHANGED
@@ -4,9 +4,28 @@ require "bundler/gem_tasks"
4
4
  require "rspec/core/rake_task"
5
5
  require "rake/extensiontask"
6
6
  require "shellwords"
7
+ require "tmpdir"
7
8
 
8
9
  GEMSPEC = Gem::Specification.load("makiri.gemspec")
9
10
 
11
+ # Replace bundler/gem_tasks' `release` (which builds a source-only gem and
12
+ # `gem push`es it from the dev machine) with a tag push: it hands the build,
13
+ # GitHub Release, and the approval-gated RubyGems publish off to CI
14
+ # (.github/workflows/release.yml). Nothing is pushed to RubyGems locally.
15
+ Rake::Task["release"].clear
16
+ desc "Tag v#{GEMSPEC.version} and push it; CI builds, releases, and publishes"
17
+ task release: %w[release:guard_clean release:source_control_push] do
18
+ puts <<~MSG
19
+
20
+ Pushed tag v#{GEMSPEC.version}. GitHub Actions (release.yml) will now:
21
+ 1. build the source gem + precompiled native gems,
22
+ 2. create the GitHub Release and attach them, then
23
+ 3. publish to RubyGems via OIDC - after the `rubygems` environment approval.
24
+ Approve the pending deployment in the Actions run to publish; nothing is
25
+ pushed to RubyGems from this machine.
26
+ MSG
27
+ end
28
+
10
29
  Rake::ExtensionTask.new("makiri", GEMSPEC) do |ext|
11
30
  ext.lib_dir = "lib/makiri"
12
31
  ext.ext_dir = "ext/makiri"
@@ -17,6 +36,59 @@ RSpec::Core::RakeTask.new(:spec)
17
36
 
18
37
  task default: %i[compile spec]
19
38
 
39
+ # `rake spec:valgrind` - run the spec suite under Valgrind memcheck via
40
+ # ruby_memcheck (Linux CI; see .github/workflows/valgrind.yml). The gem ships
41
+ # Ruby's own Valgrind suppression files (matched by Ruby version) and filters
42
+ # the report down to errors whose stack touches our extension, so we no longer
43
+ # have to fetch ruby.supp from ruby/ruby (that path was removed upstream).
44
+ #
45
+ # We keep this job's historical contract: catch *use of uninitialised values*
46
+ # and *invalid reads/writes* (incl. intra-arena overflows) - NOT leaks (leak
47
+ # detection stays with `rake leaks`). So we override ruby_memcheck's defaults,
48
+ # which disable undef-value errors and turn on full leak-check.
49
+ #
50
+ # `filter_all_errors: true` is essential: by default ruby_memcheck only applies
51
+ # its "stack must touch the makiri binary" filter to *leak*-kind errors
52
+ # (`ValgrindError#should_filter? = filter_all_errors? || kind_leak?`), so every
53
+ # uninitialised-value report is surfaced regardless of where it comes from. Ruby's
54
+ # conservative GC (machine-context scan, RVALUE flag aging, free-at-exit teardown)
55
+ # legitimately reads uninitialised words, and the bundled ruby.supp does not cover
56
+ # the free-at-exit / subprocess stacks the `:isolated` specs spin up under
57
+ # `--trace-children=yes` - which buried the run in ~3500 pure-Ruby false positives.
58
+ # Filtering all error kinds by the same binary-touch rule keeps the gate scoped to
59
+ # *our* code: a real uninit/invalid access in mkr_*/Lexbor still has a makiri frame
60
+ # and is still reported.
61
+ #
62
+ # Guarded: ruby_memcheck lives in the optional :valgrind bundler group, so a
63
+ # normal `bundle exec rake` (without that group) must not fail to load.
64
+ begin
65
+ require "ruby_memcheck"
66
+ require "ruby_memcheck/rspec/rake_task"
67
+
68
+ RubyMemcheck.config(
69
+ binary_name: "makiri",
70
+ filter_all_errors: true, # apply the binary-touch filter to ALL error kinds,
71
+ # not just leaks (see note above) - drops Ruby's own
72
+ # GC uninitialised-value noise, keeps mkr_* reports
73
+ valgrind_options: [
74
+ "--num-callers=50",
75
+ "--error-limit=no",
76
+ "--trace-children=yes", # spec processes may fork
77
+ "--undef-value-errors=yes", # the point of this job (ruby_memcheck defaults to =no)
78
+ "--track-origins=yes", # report where an uninitialised value came from
79
+ "--leak-check=no", # leaks are `rake leaks`' job, not this one
80
+ ],
81
+ )
82
+
83
+ namespace :spec do
84
+ desc "Run the spec suite under Valgrind memcheck (ruby_memcheck; needs the " \
85
+ ":valgrind bundler group and the valgrind binary)"
86
+ RubyMemcheck::RSpec::RakeTask.new(valgrind: :compile)
87
+ end
88
+ rescue LoadError
89
+ # ruby_memcheck not installed (optional :valgrind group absent) - skip the task.
90
+ end
91
+
20
92
  namespace :security do
21
93
  desc "Run mechanical C safety lint over ext/makiri"
22
94
  task :clint do
@@ -26,7 +98,7 @@ end
26
98
 
27
99
  # `rake clean` (from rake-compiler) removes the ext build dir under tmp/,
28
100
  # including the generated Makefile. The next `rake compile` re-runs extconf,
29
- # so newly-added .c files are picked up — without this, a stale Makefile omits
101
+ # so newly-added .c files are picked up - without this, a stale Makefile omits
30
102
  # new sources and macOS's -undefined dynamic_lookup turns the missing symbols
31
103
  # into runtime NULL calls. The vendored Lexbor build is deliberately NOT wiped
32
104
  # here (it is slow to rebuild and rarely changes); use `rake clean:lexbor` for
@@ -63,6 +135,28 @@ def asan_runtime_path
63
135
  nil
64
136
  end
65
137
 
138
+ def libfuzzer_available?
139
+ cxx = ENV["CXX"].to_s.empty? ? "clang++" : ENV["CXX"]
140
+ Dir.mktmpdir("makiri-libfuzzer-check") do |dir|
141
+ src = File.join(dir, "check.cc")
142
+ exe = File.join(dir, "check")
143
+ File.write(src, "extern \"C\" int LLVMFuzzerTestOneInput(const unsigned char*, unsigned long){return 0;}\n")
144
+ return system(cxx, "-fsanitize=fuzzer,address,undefined", src, "-o", exe,
145
+ out: File::NULL, err: File::NULL)
146
+ end
147
+ end
148
+
149
+ # The compiled extension, and whether it carries sanitizer instrumentation, so
150
+ # `fuzz:sanitize SKIP_BUILD=1` can refuse to run a plain (non-ASan) build.
151
+ def ext_bundle_path
152
+ Dir["lib/makiri/makiri.{bundle,so}"].first
153
+ end
154
+
155
+ def ext_sanitized?
156
+ bundle = ext_bundle_path or return false
157
+ !(`nm "#{bundle}" 2>/dev/null` =~ /asan|ubsan/i).nil?
158
+ end
159
+
66
160
  desc "Build the extension with sanitizers (MAKIRI_SANITIZE, default " \
67
161
  "address,undefined) and run the spec suite under them"
68
162
  task :sanitize do
@@ -87,11 +181,99 @@ task :sanitize do
87
181
  sh(env, "#{FileUtils::RUBY} -S rspec")
88
182
  end
89
183
 
184
+ desc "Measure C coverage of OUR sources (clang source-based) over the spec suite. " \
185
+ "Prints an llvm-cov region+branch report (excludes vendored Lexbor) and writes " \
186
+ "a line-level detail file to tmp/coverage/show.txt."
187
+ task :coverage do
188
+ require "fileutils"
189
+ dir = File.expand_path("tmp/coverage")
190
+ FileUtils.rm_rf(dir)
191
+ FileUtils.mkdir_p(dir)
192
+
193
+ # Instrument only our sources (Lexbor is built separately, uninstrumented).
194
+ sh({ "MAKIRI_COVERAGE" => "1" }, "#{FileUtils::RUBY} -S rake clean compile")
195
+ # %p -> PID, so any forked spec process gets its own raw profile.
196
+ sh({ "LLVM_PROFILE_FILE" => File.join(dir, "makiri-%p.profraw") }, "#{FileUtils::RUBY} -S rspec")
197
+
198
+ profdata = File.join(dir, "makiri.profdata")
199
+ bundle = "lib/makiri/makiri.bundle"
200
+ ignore = "(vendor/lexbor|/usr/|/Library/|ruby/|rubygems)"
201
+ sh "xcrun llvm-profdata merge -sparse #{dir}/*.profraw -o #{profdata}"
202
+ sh "xcrun llvm-cov report #{bundle} -instr-profile=#{profdata} " \
203
+ "-ignore-filename-regex='#{ignore}' -show-branch-summary"
204
+ show = File.join(dir, "show.txt")
205
+ sh "xcrun llvm-cov show #{bundle} -instr-profile=#{profdata} " \
206
+ "-ignore-filename-regex='#{ignore}' -show-branches=count -show-line-counts-or-regions > #{show}"
207
+ puts "\ncoverage line/branch detail: #{show}"
208
+ puts "(coverage build left in place; run `rake clean compile` to restore a normal build)"
209
+ end
210
+
211
+ desc "Like :sanitize but also builds the vendored Lexbor under ASan, so overflows " \
212
+ "INSIDE Lexbor's mraw arena are caught (slow: full Lexbor rebuild). Runs the " \
213
+ "spec suite, or FUZZ_ARGS via the fuzzer when set."
214
+ task "sanitize:lexbor" do
215
+ sanitize = ENV["MAKIRI_SANITIZE"] || "address,undefined"
216
+ sanitize.include?("address") or
217
+ abort "sanitize:lexbor needs an address build (MAKIRI_SANITIZE must include 'address')"
218
+
219
+ # MAKIRI_SANITIZE_LEXBOR makes extconf build Lexbor with -DLEXBOR_BUILD_WITH_ASAN
220
+ # (enabling its mraw poisoning); the build-mode stamp auto-rebuilds Lexbor on the
221
+ # plain<->asan switch, so no manual clean:lexbor is needed before or after.
222
+ build_env = { "MAKIRI_SANITIZE" => sanitize, "MAKIRI_SANITIZE_LEXBOR" => "1" }
223
+ sh(build_env, "#{FileUtils::RUBY} -S rake clean compile")
224
+
225
+ env = {
226
+ "ASAN_OPTIONS" => "detect_leaks=0:detect_container_overflow=0:" \
227
+ "detect_odr_violation=0:abort_on_error=1:halt_on_error=1",
228
+ "UBSAN_OPTIONS" => "print_stacktrace=1:halt_on_error=1",
229
+ }
230
+ runtime = asan_runtime_path or
231
+ abort "sanitize:lexbor: could not locate the ASan runtime for #{RbConfig::CONFIG['CC']}"
232
+ preload = RbConfig::CONFIG["target_os"] =~ /darwin/ ? "DYLD_INSERT_LIBRARIES" : "LD_PRELOAD"
233
+ env[preload] = runtime
234
+ puts "sanitize:lexbor: preloading #{runtime} via #{preload}"
235
+
236
+ if ENV["FUZZ_ARGS"]
237
+ sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}")
238
+ else
239
+ sh(env, "#{FileUtils::RUBY} -S rspec")
240
+ end
241
+ end
242
+
90
243
  desc "Run the robustness fuzzer (override options via FUZZ_ARGS)"
91
244
  task fuzz: :compile do
92
245
  sh "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}"
93
246
  end
94
247
 
248
+ desc "Fuzz the XML parser (hostile/mutated documents; override via FUZZ_ARGS)"
249
+ task "fuzz:xml": :compile do
250
+ sh "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb --target xml #{ENV['FUZZ_ARGS']}"
251
+ end
252
+
253
+ desc "Fuzz the XML mutation surface (random edit sequences + invariants; override via FUZZ_ARGS)"
254
+ task "fuzz:mutate": :compile do
255
+ sh "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb --target mutate #{ENV['FUZZ_ARGS']}"
256
+ end
257
+
258
+ desc "Malloc-leak gate (macOS `leaks`): fails on per-call leak stacks through the ext"
259
+ task leaks: :compile do
260
+ # ASan runs with detect_leaks=0 (Ruby/Lexbor are uninstrumented), so plain
261
+ # leaks are otherwise never machine-checked; see script/check_leaks.rb.
262
+ sh "#{FileUtils::RUBY} script/check_leaks.rb"
263
+ end
264
+
265
+ desc "OOM-injection gate: rebuild with MAKIRI_ALLOC_INJECT=1 and sweep every core " \
266
+ "allocation site, verifying each failure fails closed (clean raise or " \
267
+ "baseline-identical result, never truncated output)"
268
+ task :oom do
269
+ # The hook is compiled in only under MAKIRI_ALLOC_INJECT=1 (zero overhead in
270
+ # a normal build), so this needs its own rebuild; see
271
+ # script/check_alloc_failures.rb for the protocol and the property gated.
272
+ sh({ "MAKIRI_ALLOC_INJECT" => "1" }, "#{FileUtils::RUBY} -S rake clean compile")
273
+ sh "#{FileUtils::RUBY} -Ilib script/check_alloc_failures.rb"
274
+ puts "(injection build left in place; run `rake clean compile` to restore a normal build)"
275
+ end
276
+
95
277
  desc "Run the performance benchmark (Makiri vs Nokogiri reference)"
96
278
  task bench: :compile do
97
279
  # Run outside the bundle so the bench-only gems (nokogiri, benchmark-ips)
@@ -101,6 +283,13 @@ task bench: :compile do
101
283
  end
102
284
  end
103
285
 
286
+ desc "Run the XML reader benchmark (Makiri::XML vs Nokogiri::XML reference)"
287
+ task "bench:xml" => :compile do
288
+ Bundler.with_unbundled_env do
289
+ sh "#{FileUtils::RUBY} -Ilib bench/bench_xml.rb"
290
+ end
291
+ end
292
+
104
293
  namespace :conformance do
105
294
  desc "WHATWG HTML5 parsing conformance: run html5lib-tests through Makiri"
106
295
  task html5: :compile do
@@ -116,22 +305,77 @@ namespace :conformance do
116
305
  end
117
306
  end
118
307
 
308
+ desc "XML XPath 1.0 differential conformance: Makiri::XML vs Nokogiri::XML"
309
+ task xpath_xml: :compile do
310
+ Bundler.with_unbundled_env do
311
+ sh "#{FileUtils::RUBY} -Ilib spec/conformance/xml_xpath_diff.rb #{ENV['XPATH_ARGS']}"
312
+ end
313
+ end
314
+
315
+ desc "W3C XML Conformance Test Suite: well-formedness through Makiri::XML"
316
+ task xmlconf: :compile do
317
+ # Nokogiri (bench-only) parses the manifests, so run outside the bundle.
318
+ Bundler.with_unbundled_env do
319
+ sh "#{FileUtils::RUBY} -Ilib spec/conformance/xmlconf_runner.rb #{ENV['XMLCONF_ARGS']}"
320
+ end
321
+ end
322
+
323
+ desc "Property-based XML differential: generated documents, Makiri vs Nokogiri tree"
324
+ task xml_pbt: :compile do
325
+ Bundler.with_unbundled_env do
326
+ sh "#{FileUtils::RUBY} -Ilib spec/conformance/xml_pbt_diff.rb #{ENV['PBT_ARGS']}"
327
+ end
328
+ end
329
+
119
330
  desc "CSS Selectors differential conformance vs Nokogiri::HTML5"
120
331
  task css: :compile do
121
332
  Bundler.with_unbundled_env do
122
333
  sh "#{FileUtils::RUBY} -Ilib spec/conformance/css_diff.rb #{ENV['CSS_ARGS']}"
123
334
  end
124
335
  end
336
+
337
+ desc "XML CSS-selector differential conformance: Makiri::XML vs Nokogiri::XML"
338
+ task css_xml: :compile do
339
+ Bundler.with_unbundled_env do
340
+ sh "#{FileUtils::RUBY} -Ilib spec/conformance/xml_css_diff.rb #{ENV['CSS_XML_ARGS']}"
341
+ end
342
+ end
343
+
344
+ desc "XML Builder differential conformance: Makiri::XML::Builder vs Nokogiri::XML::Builder"
345
+ task builder: :compile do
346
+ Bundler.with_unbundled_env do
347
+ sh "#{FileUtils::RUBY} -Ilib spec/conformance/builder_diff.rb #{ENV['BUILDER_ARGS']}"
348
+ end
349
+ end
125
350
  end
126
351
 
127
- desc "Run all conformance suites (html5lib-tests + XPath & CSS differentials)"
128
- task conformance: %w[conformance:html5 conformance:xpath conformance:css]
352
+ desc "Run all conformance suites"
353
+ task conformance: %w[conformance:html5 conformance:xpath conformance:css
354
+ conformance:xmlconf conformance:xpath_xml conformance:css_xml
355
+ conformance:builder]
129
356
 
130
357
  namespace :fuzz do
131
- desc "Run the fuzzer under AddressSanitizer (rebuilds the ext; --isolated)"
358
+ # Run the fuzzer under the sanitizer. Toggles (all via env):
359
+ # ISOLATED=1 run the surfaces isolated (fork-per-query): the complete net, it
360
+ # also survives + attributes a genuine segfault and catches a hang
361
+ # via the per-query timeout, at much lower throughput. The default
362
+ # is NON-isolated (one process): far higher throughput; ASan still
363
+ # aborts on a memory error (halt_on_error). NOTE(review): FAST=1 is never read below.
364
+ # SKIP_BUILD=1 reuse the current build instead of rebuilding (refuses to run
365
+ # if it is not a sanitizer build, so you never fuzz a plain ext).
366
+ # FUZZ_TIME=N seconds per surface (default 90).
367
+ # FUZZ_ARGS=... run a single custom invocation instead of the default surfaces.
368
+ desc "Run the fuzzer under AddressSanitizer (FAST=1 non-isolated, SKIP_BUILD=1 reuse build)"
132
369
  task :sanitize do
133
370
  sanitize = ENV["MAKIRI_SANITIZE"] || "address,undefined"
134
- sh({ "MAKIRI_SANITIZE" => sanitize }, "#{FileUtils::RUBY} -S rake clean compile")
371
+ if %w[1 true yes].include?(ENV["SKIP_BUILD"].to_s.downcase)
372
+ ext_sanitized? or
373
+ abort "fuzz:sanitize: SKIP_BUILD set but lib/makiri is not a sanitizer build; " \
374
+ "drop SKIP_BUILD to rebuild with MAKIRI_SANITIZE"
375
+ puts "fuzz:sanitize: reusing the existing sanitizer build (SKIP_BUILD)"
376
+ else
377
+ sh({ "MAKIRI_SANITIZE" => sanitize }, "#{FileUtils::RUBY} -S rake clean compile")
378
+ end
135
379
 
136
380
  env = {
137
381
  "ASAN_OPTIONS" => "detect_leaks=0:detect_container_overflow=0:" \
@@ -144,7 +388,50 @@ namespace :fuzz do
144
388
  preload = RbConfig::CONFIG["target_os"] =~ /darwin/ ? "DYLD_INSERT_LIBRARIES" : "LD_PRELOAD"
145
389
  env[preload] = runtime
146
390
  end
147
- args = ENV["FUZZ_ARGS"] || "--isolated --time 120"
148
- sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{args}")
391
+
392
+ if ENV["FUZZ_ARGS"]
393
+ sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}")
394
+ else
395
+ iso = %w[1 true yes].include?(ENV["ISOLATED"].to_s.downcase) ? "--isolated" : ""
396
+ secs = ENV["FUZZ_TIME"] || "90"
397
+ # Cover every surface under the sanitizer: the query engine (XPath/CSS over
398
+ # parsed fixtures), the XML parser (hostile documents), the XML mutation
399
+ # surface (random edit sequences + invariants), and the `xmlcss` target.
400
+ ["", "--target xml", "--target mutate", "--target xmlcss"].each do |surface|
401
+ sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{surface} #{iso} --time #{secs}".squeeze(" ").strip)
402
+ end
403
+ end
149
404
  end
405
+
406
+ # Coverage-guided libFuzzer harnesses for the pure-C surfaces (XML parser and
407
+ # XPath compile+eval). These are Ruby-free standalone binaries, so they run
408
+ # directly under clang's libFuzzer driver without the Ruby interpreter.
409
+ # They complement the Ruby-based robustness fuzzer by providing coverage
410
+ # feedback and 2-3 orders of magnitude faster execution for the C core.
411
+ desc "Build the libFuzzer harnesses (requires clang with libFuzzer support)"
412
+ task :libfuzzer_build => :compile do
413
+ libfuzzer_available? or
414
+ abort "fuzz:libfuzzer_build: #{ENV['CXX'] || 'clang++'} cannot link libFuzzer. " \
415
+ "Install an LLVM clang with libFuzzer support and run with " \
416
+ "CLANG=/path/to/clang CXX=/path/to/clang++."
417
+ Dir.chdir("ext/makiri/fuzz") do
418
+ sh "make clean"
419
+ sh "make all"
420
+ end
421
+ end
422
+
423
+ desc "Run the libFuzzer coverage-guided harnesses (default: 60s per target)"
424
+ task :libfuzzer => :libfuzzer_build do
425
+ time = ENV["FUZZ_TIME"] || "60"
426
+ Dir.chdir("ext/makiri/fuzz") do
427
+ sh "mkdir -p corpus/xml corpus/xpath"
428
+ sh "./xml_fuzz -max_total_time=#{time} -max_len=4096 corpus/xml"
429
+ sh "./xpath_fuzz -max_total_time=#{time} -max_len=4096 corpus/xpath"
430
+ end
431
+ end
432
+ end
433
+
434
+ desc "Show code statistics"
435
+ task :stats do
436
+ sh "tokei lib ext spec script --exclude tmp --exclude vendor --exclude docs"
150
437
  end
@@ -46,6 +46,34 @@ mkr_ruby_borrowed_bytes_t mkr_ruby_bytes_view(VALUE in);
46
46
  * for an empty input), suitable for use while the GVL is released. */
47
47
  int mkr_ruby_copy_bytes(VALUE in, mkr_owned_bytes_t *out);
48
48
 
49
+ /* Return a UTF-8 Ruby String for `str`, honouring its declared encoding: UTF-8 /
50
+ * US-ASCII / ASCII-8BIT are returned unchanged (the parser handles their bytes
51
+ * directly); any other encoding is transcoded to UTF-8 (invalid/undef -> U+FFFD)
52
+ * so its content is preserved rather than read as raw UTF-8. The UTF-8 common
53
+ * case is a single encoding comparison. */
54
+ VALUE mkr_ruby_to_utf8(VALUE str);
55
+
56
+ /* STRICT decode for XML (§2.1): like mkr_ruby_to_utf8 it honours the String's
57
+ * declared encoding (UTF-8 / US-ASCII / ASCII-8BIT pass through; any other
58
+ * encoding is transcoded to UTF-8) - but FAIL-CLOSED, never lenient: a non-UTF-8
59
+ * byte that can't be converted, invalid UTF-8, or an embedded NUL all raise
60
+ * Makiri::XML::SyntaxError (no U+FFFD replacement). Returns a validated,
61
+ * UTF-8-tagged Ruby String. (The HTML replace path mkr_ruby_to_utf8 itself is
62
+ * NOT reused for the conversion - only its encoding-judgment rule is shared.)
63
+ *
64
+ * +max_bytes+ bounds the decoded UTF-8 length: an input that already exceeds the
65
+ * parser's arena byte budget is rejected here with Makiri::XML::LimitExceeded,
66
+ * before the validation copy and the caller's GVL-release copy (so a hostile
67
+ * oversized document is not copied twice for a doomed parse). 0 disables the
68
+ * check (decode-only callers that build no arena). */
69
+ VALUE mkr_xml_decode_input(VALUE str, size_t max_bytes);
70
+
71
+ /* True if `str` is *already known* to be valid UTF-8 - pure ASCII, or valid in
72
+ * the UTF-8 encoding - from its cached coderange, WITHOUT forcing a scan. Lets
73
+ * the parse skip mkr_utf8_sanitize's validation pass for input Ruby has already
74
+ * classified (an unknown/broken coderange returns false: sanitize handles it). */
75
+ bool mkr_ruby_str_known_valid_utf8(VALUE str);
76
+
49
77
  /* Validate a Ruby String for use as an XPath engine string: valid UTF-8,
50
78
  * no interior NUL, and at most +max_bytes+. Returns NULL on success and fills
51
79
  * +out+; otherwise returns a static reason string. +sv+ must be a String. */