makiri 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/release.yml +12 -7
  3. data/CHANGELOG.md +93 -14
  4. data/README.md +173 -7
  5. data/Rakefile +103 -7
  6. data/ext/makiri/bridge/bridge.h +28 -0
  7. data/ext/makiri/bridge/ruby_string.c +217 -0
  8. data/ext/makiri/core/mkr_alloc.h +1 -1
  9. data/ext/makiri/core/mkr_buf.c +35 -1
  10. data/ext/makiri/core/mkr_buf.h +37 -3
  11. data/ext/makiri/core/mkr_core.h +1 -1
  12. data/ext/makiri/core/mkr_hash.h +1 -1
  13. data/ext/makiri/core/mkr_text.h +8 -8
  14. data/ext/makiri/extconf.rb +20 -2
  15. data/ext/makiri/glue/glue.h +47 -11
  16. data/ext/makiri/glue/ruby_doc.c +117 -43
  17. data/ext/makiri/glue/ruby_html_css.c +246 -0
  18. data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +242 -51
  19. data/ext/makiri/glue/ruby_html_node.c +888 -0
  20. data/ext/makiri/glue/ruby_html_serialize.c +154 -0
  21. data/ext/makiri/glue/ruby_node.c +54 -748
  22. data/ext/makiri/glue/ruby_node_set.c +167 -32
  23. data/ext/makiri/glue/ruby_xml.c +420 -0
  24. data/ext/makiri/glue/ruby_xml_node.c +1386 -0
  25. data/ext/makiri/glue/ruby_xpath.c +59 -26
  26. data/ext/makiri/glue/ruby_xpath.h +19 -0
  27. data/ext/makiri/lexbor_compat/compat.h +42 -9
  28. data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
  29. data/ext/makiri/lexbor_compat/dom_index.c +2 -2
  30. data/ext/makiri/lexbor_compat/post_parse.c +100 -10
  31. data/ext/makiri/lexbor_compat/source_loc.c +13 -9
  32. data/ext/makiri/lexbor_compat/text_index.c +14 -8
  33. data/ext/makiri/lexbor_compat/utf8_input.c +85 -26
  34. data/ext/makiri/makiri.c +139 -6
  35. data/ext/makiri/makiri.h +43 -2
  36. data/ext/makiri/xml/mkr_xml.h +126 -0
  37. data/ext/makiri/xml/mkr_xml_chars.c +225 -0
  38. data/ext/makiri/xml/mkr_xml_mutate.c +875 -0
  39. data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
  40. data/ext/makiri/xml/mkr_xml_node.c +267 -0
  41. data/ext/makiri/xml/mkr_xml_node.h +119 -0
  42. data/ext/makiri/xml/mkr_xml_tree.c +1479 -0
  43. data/ext/makiri/xpath/mkr_xpath.c +59 -32
  44. data/ext/makiri/xpath/mkr_xpath.h +96 -4
  45. data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
  46. data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
  47. data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +202 -175
  48. data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +110 -86
  49. data/ext/makiri/xpath/mkr_xpath_internal.h +91 -200
  50. data/ext/makiri/xpath/mkr_xpath_lex.c +2 -2
  51. data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
  52. data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +142 -0
  53. data/ext/makiri/xpath/mkr_xpath_parse.c +5 -5
  54. data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
  55. data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
  56. data/ext/makiri/xpath/mkr_xpath_shared.c +593 -0
  57. data/ext/makiri/xpath/{mkr_xpath_value.c → mkr_xpath_value_body.h} +145 -656
  58. data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
  59. data/lib/makiri/{attribute.rb → attr.rb} +7 -3
  60. data/lib/makiri/cdata_section.rb +21 -0
  61. data/lib/makiri/comment.rb +12 -0
  62. data/lib/makiri/compat_aliases.rb +30 -0
  63. data/lib/makiri/document.rb +4 -76
  64. data/lib/makiri/document_fragment.rb +14 -9
  65. data/lib/makiri/element.rb +5 -3
  66. data/lib/makiri/html/document.rb +106 -0
  67. data/lib/makiri/html/node_methods.rb +19 -0
  68. data/lib/makiri/html.rb +12 -0
  69. data/lib/makiri/node.rb +58 -15
  70. data/lib/makiri/node_set.rb +8 -0
  71. data/lib/makiri/processing_instruction.rb +12 -0
  72. data/lib/makiri/text.rb +2 -0
  73. data/lib/makiri/version.rb +1 -1
  74. data/lib/makiri/xml/document.rb +24 -0
  75. data/lib/makiri/xml/node_methods.rb +37 -0
  76. data/lib/makiri/xml.rb +10 -0
  77. data/lib/makiri/xpath_context.rb +1 -1
  78. data/lib/makiri.rb +23 -5
  79. data/script/build_native_gem.rb +2 -2
  80. data/script/check_c_safety.rb +32 -0
  81. data/script/check_c_safety_allowlist.yml +83 -0
  82. metadata +35 -9
  83. data/ext/makiri/glue/ruby_css.c +0 -185
  84. data/ext/makiri/glue/ruby_serialize.c +0 -92
  85. data/lib/makiri/cdata.rb +0 -6
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f88832bca79aadf7ea686b37739a5b600d9ff4a2075f28e9d59a885a66afab80
4
- data.tar.gz: 6f835ef9f2bee6318e9ef5ff179dac48b7e99b2f019fb85f86c67eb94fced1e9
3
+ metadata.gz: b0cf63c9d861e721a52064dccc929db0a8f823d485f69854f07d90b805913db0
4
+ data.tar.gz: 989e0d0b1430b202147cd4f0fec411d0377114f34ae380217b683b6b63d031e6
5
5
  SHA512:
6
- metadata.gz: 4171815b57b086979c4638b44cd9316562e7293b16c40302a62e78b7a93c30f0a1e0dd4f09764574fe20c5b6e948699dee64dad6d45136e582da22b4ac5fc74d
7
- data.tar.gz: ee5dda37c9dc8722d7313b96981312b0d1b1599dde764eeb108ba8397655586cfd793d6ce1a3f9473ae0cfc0e542d6e50f87c93dc355d9237432de00d088bb29
6
+ metadata.gz: 13598e1f45341c8fed3924da8bbe913cf55ef5c1b9256193db18ae3cf2bb9ae0f4816370d8ca569139b33e7194aced65656c1314989b882ea612a44e3750e84b
7
+ data.tar.gz: 71c9da99e6f26fb8a034efba1d0642b37cdcd3ddf212c6f977ad3c868b104162ca86f0c721606024286a06c858326508e41c8624687c03fa3d7a205461926faf
@@ -196,17 +196,22 @@ jobs:
196
196
  $pre --verify-tag || \
197
197
  gh release upload "${GITHUB_REF_NAME}" dist/*.gem --repo "${GITHUB_REPOSITORY}" --clobber
198
198
 
199
- # --- optional: publish to RubyGems (manual, opt-in, never on a tag push) ----
200
- # Auth is RubyGems Trusted Publishing (OIDC): no stored API key, short-lived
201
- # token, MFA-compatible. Configure a matching Trusted Publisher on RubyGems.org
202
- # for this gem: owner=takahashim, repo=makiri, workflow=release.yml, and set its
203
- # Environment to "rubygems" (matching `environment:` below).
199
+ # --- publish to RubyGems, behind the `rubygems` environment approval gate ---
200
+ # Held until the `rubygems` environment's Required-reviewers rule is approved,
201
+ # so a tag push releases on GitHub immediately but the RubyGems push waits.
202
+ #
203
+ # Auth is RubyGems Trusted Publishing (OIDC): no stored API key. Configure a
204
+ # matching Trusted Publisher on RubyGems.org (owner=takahashim, repo=makiri,
205
+ # workflow=release.yml, Environment=rubygems) so the token is only accepted
206
+ # through this gated environment.
204
207
  publish:
205
208
  name: Publish to RubyGems
206
209
  needs: [source-gem, native-gem]
207
- if: github.event_name == 'workflow_dispatch' && inputs.publish_to_rubygems
210
+ if: >-
211
+ startsWith(github.ref, 'refs/tags/') ||
212
+ (github.event_name == 'workflow_dispatch' && inputs.publish_to_rubygems)
208
213
  runs-on: ubuntu-latest
209
- environment: rubygems # add a Required-reviewers protection rule for an approval gate
214
+ environment: rubygems
210
215
  permissions:
211
216
  contents: read
212
217
  id-token: write # OIDC identity token for Trusted Publishing
data/CHANGELOG.md CHANGED
@@ -7,28 +7,106 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.3.0] - 2026-06-06
11
+
12
+ ### Added
13
+
14
+ * **Native XML 1.0 reader + in-place editor** - `Makiri::XML::Document.parse(source)`
15
+ / `Makiri::XML(source)`. No libxml2: a strict, fail-closed parser builds its own
16
+ node arena (case- and namespace-preserving), queried by the native XPath engine.
17
+ * Strict & secure: fail-closed decode (bad UTF-8 / NUL -> `XML::SyntaxError`),
18
+ duplicate attributes rejected, XML 1.0 only; verified against the W3C XML
19
+ Conformance Test Suite.
20
+ * Encoding autodetected (BOM / `<?xml encoding?>`); a contradicting String
21
+ encoding is a fatal error, not a silent mis-decode.
22
+ * DoS-bounded by a single arena byte ceiling (default 256 MiB; raise per parse
23
+ with `max_bytes:`).
24
+ * `<!DOCTYPE>` recognized but **not processed** (`#internal_subset` ->
25
+ `XML::DocumentType`); zero entity/DTD I/O, so **XXE and billion-laughs are
26
+ structurally impossible**. Kept off the tree, as in libxml2.
27
+ * Read API mirrors Nokogiri: `#xpath` / `#at_xpath` (`{prefix => uri}`),
28
+ name/namespace readers, `#text`, `#[]`, traversal, and namespace introspection
29
+ (`Makiri::XML::Namespace`); `XPathContext` works over XML nodes too.
30
+ * Prolog/epilog comments & PIs kept on the document node; adjacent same-type
31
+ character data coalesced - byte-identical to Nokogiri (property-based diff).
32
+ * `#to_xml` / `#to_s` (`pretty:` / `indent:` / `encoding:`) and `#canonicalize`
33
+ (Inclusive C14N 1.0, byte-identical to libxml2); buffers fail closed.
34
+ * Unsupported surface raises `NotImplementedError`: `#css` / `#at_css` and HTML
35
+ serialization.
36
+ * Tree mutation - fully fail-closed, detach-never-destroy:
37
+ * in-place: `#[]=` / `#delete`, `#content=`, `#name=`, `#remove` / `#unlink`;
38
+ * factories: `Document#create_{element,text_node,comment,cdata,processing_instruction}`
39
+ (+ Nokogiri-style `.new` constructors);
40
+ * insertion: `#add_child` / `<<`, `#before` / `#after`, `#replace` - namespaces
41
+ resolved at the insertion point; a cross-document insert deep-copies;
42
+ * fragments: `XML::DocumentFragment.parse` / `XML::Document#fragment`;
43
+ * from scratch: `XML::Document.new` + `#root=`.
44
+ * `XML::Element#element_children` and `Node#clone_node` for XML nodes (also enabling
45
+ `Node#dup` / `#clone`); a clone keeps name case, namespace and the CDATA type.
46
+ * `Node` includes `Enumerable` over its child nodes (`each` / `map` / `select` / ...).
47
+ * `Node#<=>` + `Comparable` - sort by document position (`nil` across documents or
48
+ for attributes).
49
+ * `NodeSet.new(document_or_node, list = [])` - foreign / cross-representation nodes
50
+ are rejected.
51
+ * `NodeSet#[]` accepts a `Range` or `start, length` (like `Array#[]`).
52
+ * `Node` / `NodeSet` / `Document` `#dup` / `#clone` now return real independent
53
+ copies (`#dup(0)` shallow; `#clone(freeze:)` honoured).
54
+ * A **frozen node is genuinely immutable** - every mutator raises `FrozenError`.
55
+
56
+ ### Changed
57
+
58
+ * CSS queries reuse one shared Lexbor engine (GVL-safe) and `at_css` wraps the match
59
+ directly: `at_css('#id')` ~5x faster than nokolexbor (was ~1.16x slower).
60
+ * HTML serialization pre-reserves its buffer - `to_html` now at parity with nokolexbor.
61
+ * Node-class names are the WHATWG DOM interface names (`CDATASection`, `Attr`,
62
+ `DocumentType`, ...), with the Nokogiri spellings (`CDATA`, `DTD`) kept as aliases;
63
+ added `Node#cdata?`.
64
+ * Text-index range table uses `uint32` bounds (24 -> 16 B/entry; ~27% less retained
65
+ index, byte-identical text).
66
+ * Parsing **honours the input String's encoding** - Shift_JIS / EUC-JP / ... are now
67
+ transcoded to UTF-8 instead of mangled.
68
+ * Parsing skips its UTF-8 validation scan when the String's coderange already proves
69
+ it valid.
70
+ * Faster HTML parse/serialize: `memchr` line table + validate-only UTF-8 scan (~7%),
71
+ and a single-copy serializer buffer (~1.2-1.3x).
72
+
73
+ ### Fixed
74
+
75
+ * **Hardened the HTML/XML representation boundary.** HTML (Lexbor) and XML (arena)
76
+ nodes are now distinct TypedData types, so the wrong representation raises
77
+ `TypeError` instead of corrupting memory:
78
+ * `Node#==` / `XPathContext#node=` with an XML `Document` no longer aborts the
79
+ process;
80
+ * `NodeSet#|` / `+` / `&` / `-` across different documents raise `Makiri::Error`
81
+ (was a silent mis-wrap);
82
+ * HTML-only APIs (`import_node`, `add_child` / `before` / `after` / `replace`,
83
+ `fragment(context:)`) reject an XML node argument (was a segfault).
84
+ * The bundle exported the entire vendored Lexbor symbol table (~1700 `lxb_*`); now
85
+ only `Init_makiri` is exported, so loading alongside another Lexbor gem (e.g.
86
+ nokolexbor) no longer segfaults. (Precompiled gems: rebuild required.)
87
+
10
88
  ## [0.2.0] - 2026-06-04
11
89
 
12
90
  ### Added
13
91
 
14
- * `Element#tag_name` (DOM `tagName`) — the qualified name uppercased for an
92
+ * `Element#tag_name` (DOM `tagName`) - the qualified name uppercased for an
15
93
  HTML element in an HTML document (`"DIV"`), keeping the original case for
16
94
  SVG/MathML; `nil` for non-elements. Complements `#name`, which stays the
17
95
  lowercase qualified name.
18
- * `ProcessingInstruction#target` (DOM `target`) — a PI's target name; `nil` for
96
+ * `ProcessingInstruction#target` (DOM `target`) - a PI's target name; `nil` for
19
97
  other node kinds. Its data is read via `#content`/`#text`.
20
98
  * `Document#create_processing_instruction(target, data)` (DOM
21
99
  `createProcessingInstruction`) and `Document#create_document_fragment` (DOM
22
- `createDocumentFragment`, an empty fragment to build up programmatically —
100
+ `createDocumentFragment`, an empty fragment to build up programmatically -
23
101
  unlike `#fragment` / `DocumentFragment.parse`, which parse HTML). Both produce
24
102
  a detached node owned by the document; PI creation fails closed when the data
25
103
  contains the `?>` terminator (matching the DOM constraint). (DOM
26
104
  `createCDATASection` is intentionally not provided: per WHATWG DOM it throws on
27
105
  an HTML document, which is the only kind Makiri produces.)
28
- * `Node#{namespace_uri, prefix, local_name}` — the WHATWG DOM per-node
106
+ * `Node#{namespace_uri, prefix, local_name}` - the WHATWG DOM per-node
29
107
  namespace accessors on `Element` and `Attribute` (`nil` on other node kinds).
30
108
  `namespace_uri` resolves an element's namespace from its node (so an HTML
31
- element is the XHTML namespace `http://www.w3.org/1999/xhtml`, not `nil` — the
109
+ element is the XHTML namespace `http://www.w3.org/1999/xhtml`, not `nil` - the
32
110
  DOM-faithful value browsers and `namespace-uri()` return; SVG/MathML get their
33
111
  own URI), and agrees byte-for-byte with the `namespace-uri()` XPath function.
34
112
  For attributes it is `nil` unless prefixed, where it returns the parser-assigned
@@ -36,21 +114,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
36
114
  segment of the qualified name (`nil` for the usual unprefixed HTML5 case), and
37
115
  `local_name` is the name without that prefix. Previously a node's namespace was
38
116
  reachable only through XPath (`namespace-uri()`/`local-name()`).
39
- * `Node#clone_node(deep = false)` — a copy of the node, owned by the same
117
+ * `Node#clone_node(deep = false)` - a copy of the node, owned by the same
40
118
  document and detached from any parent (the DOM `cloneNode`, whose `deep`
41
- defaults to `false` — a missing/`nil`/`false` argument is a shallow clone; a
119
+ defaults to `false` - a missing/`nil`/`false` argument is a shallow clone; a
42
120
  truthy one copies the subtree). Built on the same `import_node` +
43
121
  `<template>`-content fixup the fragment parser uses, so a deep-cloned
44
122
  `<template>` keeps its contents. Fails closed: a failed import raises rather
45
123
  than returning a partial node.
46
- * `Document#import_node(node, deep = false)` — a copy of `node` owned by the
124
+ * `Document#import_node(node, deep = false)` - a copy of `node` owned by the
47
125
  receiver document (the DOM `importNode`, whose `deep` likewise defaults to
48
126
  `false`). Unlike `Node#clone_node`, the copy is owned by the target rather
49
127
  than the node's own document, so it is the way to bring a node across
50
128
  documents (Makiri never moves a node between arenas); the source is left
51
129
  untouched. Same import + `<template>`-content fixup as `clone_node`, and fails
52
130
  closed on a failed import.
53
- * `Node#pointer_id` — the underlying `lxb_dom_node_t` pointer as an Integer,
131
+ * `Node#pointer_id` - the underlying `lxb_dom_node_t` pointer as an Integer,
54
132
  matching `Nokogiri::XML::Node#pointer_id`. Shares the value `#hash`/`#eql?`
55
133
  are built on, so it is a stable, Nokogiri-compatible identity key for
56
134
  consumers (e.g. wrapper caches) that key nodes by pointer. Stable for a
@@ -73,19 +151,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
73
151
  ## [0.1.0] - 2026-06-02
74
152
 
75
153
  First public release. An HTML5 parser, a native XPath 1.0 query engine, and CSS
76
- selectors for Ruby — built on vendored [Lexbor](https://lexbor.com/) with **no
154
+ selectors for Ruby - built on vendored [Lexbor](https://lexbor.com/) with **no
77
155
  libxml2 / libxslt dependency at any layer**.
78
156
 
79
157
  ### Added
80
158
 
81
159
  **Parsing & DOM**
82
160
 
83
- * `Makiri::HTML` / `Makiri.parse` — HTML5 parsing via vendored, unpatched Lexbor,
161
+ * `Makiri::HTML` / `Makiri.parse` - HTML5 parsing via vendored, unpatched Lexbor,
84
162
  with browser-compatible UTF-8 decoding (invalid bytes → U+FFFD; parsing never
85
163
  fails on bad bytes). Read-only navigation and attribute/text readers across
86
164
  `Document`, `Element`, `Attribute`, `Text`, `CData`, `Comment`,
87
165
  `ProcessingInstruction`, `DocumentType`, and `DocumentFragment`.
88
- * `Node#line` — 1-based source line of an element, reconstructed from the
166
+ * `Node#line` - 1-based source line of an element, reconstructed from the
89
167
  tokenizer without patching Lexbor (nil when the location is unknown).
90
168
  * `Element#attribute_nodes` and `Attribute#{name,value,parent,element}`, backed
91
169
  by a lazily-built attribute→owner index in the Lexbor compat layer.
@@ -138,7 +216,7 @@ libxml2 / libxslt dependency at any layer**.
138
216
  * UTF-8 text-input contract: HTML and fragment parsing are lenient (invalid
139
217
  bytes → U+FFFD, never reject), while strings passed to the XPath / CSS /
140
218
  DOM-mutation APIs must be valid UTF-8 with no NUL byte, otherwise they raise
141
- `Makiri::Error` — never silently truncated, repaired, or reinterpreted.
219
+ `Makiri::Error` - never silently truncated, repaired, or reinterpreted.
142
220
  * Thread-safe by construction: parsing releases the GVL (concurrent parse scales
143
221
  ~2× on 8 cores), while XPath evaluation holds the GVL so sharing a document or
144
222
  context across threads cannot corrupt memory. Fail-closed string caps and
@@ -161,6 +239,7 @@ libxml2 / libxslt dependency at any layer**.
161
239
  domxpath, CSS differential vs `Nokogiri::HTML5`). GitHub Actions CI across
162
240
  Ruby 3.2–4.0 × Ubuntu/macOS plus a sanitizer job.
163
241
 
164
- [Unreleased]: https://github.com/takahashim/makiri/compare/v0.2.0...HEAD
242
+ [Unreleased]: https://github.com/takahashim/makiri/compare/v0.3.0...HEAD
243
+ [0.3.0]: https://github.com/takahashim/makiri/compare/v0.2.0...v0.3.0
165
244
  [0.2.0]: https://github.com/takahashim/makiri/compare/v0.1.0...v0.2.0
166
245
  [0.1.0]: https://github.com/takahashim/makiri/releases/tag/v0.1.0
data/README.md CHANGED
@@ -1,7 +1,8 @@
1
1
  # Makiri
2
2
 
3
- Standards-oriented HTML5 parsing, CSS selector querying, and XPath 1.0
4
- querying for Ruby, powered by Lexbor and a native XPath engine.
3
+ Standards-oriented HTML5/XML parsing, CSS selector querying, XPath 1.0 querying,
4
+ and a native XML 1.0 reader/editor for Ruby, powered by Lexbor and a native XPath
5
+ engine - with no libxml2 dependency.
5
6
 
6
7
  > [!WARNING]
7
8
  > Status: early release. APIs and behavior may change before v1.0.
@@ -20,6 +21,12 @@ XPath 1.0 evaluation in its own native engine, with no libxml2 dependency.
20
21
  * Native XPath 1.0 engine
21
22
  * XPath is parsed and evaluated by Makiri's own engine, written from scratch.
22
23
  * Makiri does not depend on libxml2 for parsing, DOM representation, or XPath evaluation.
24
+ * Native XML 1.0 reader + in-place editor (`Makiri::XML`)
25
+ * A strict, non-validating, fail-closed parser with its own node arena (not
26
+ Lexbor's HTML DOM), queried through the same native XPath engine, with
27
+ in-place tree edits (attributes, content, rename, remove).
28
+ * Conformance is held by the W3C XML Conformance Test Suite, an XPath
29
+ differential, and property-based testing vs Nokogiri (see below).
23
30
  * Bounded, fail-closed execution
24
31
  * XPath evaluation is bounded by per-evaluation limits on work, memory, and recursion.
25
32
  * Ownership and borrowing are kept explicit across layers, with owned/borrowed
@@ -46,7 +53,7 @@ HTML
46
53
  doc.css("a").map { |a| a["href"] } # => ["/a", "/b"]
47
54
  doc.at_css("p.lead").text # => "Hello"
48
55
 
49
- # XPath 1.0 (native engine — no libxml2)
56
+ # XPath 1.0 (native engine - no libxml2)
50
57
  doc.xpath("//a").length # => 2
51
58
  doc.xpath("count(//a)") # => 2.0
52
59
  doc.at_xpath('//*[@id="main"]/p').text # => "Hello"
@@ -72,16 +79,158 @@ ctx.register_variable("cls", "lead")
72
79
  ctx.evaluate('//p[@class=$cls]').first.text # => "Hello"
73
80
  ```
74
81
 
82
+ ### XML (with in-place editing)
83
+
84
+ `Makiri::XML(source)` parses **XML 1.0** with a native, strict,
85
+ well-formedness-checking parser (no libxml2) and queries it through the same
86
+ native XPath 1.0 engine. `source` is a String or any object responding to
87
+ `#read` (an `IO` / `File` / `StringIO`); read a non-UTF-8 file in binary mode
88
+ (`File.binread`) so its encoding is autodetected. Element-name case and namespaces are preserved. It is
89
+ **fail-closed**: malformed input, a duplicate attribute, or a
90
+ non-`1.0` version declaration raises `Makiri::XML::SyntaxError`, and operations
91
+ XML does not support raise `NotImplementedError` rather than returning a wrong
92
+ result. The tree supports in-place edits and building new subtrees (see below).
93
+ A `<!DOCTYPE ...>` is recognized but its **DTD is not processed** (no
94
+ entity/element declarations are loaded, no external subset is fetched) - so a
95
+ DTD-defined entity reference stays an undefined-entity error and **XXE /
96
+ billion-laughs are structurally impossible**. The doctype's name and identifiers
97
+ are still readable:
98
+
99
+ ```ruby
100
+ doc = Makiri::XML(<<~XML)
101
+ <feed xmlns="http://www.w3.org/2005/Atom">
102
+ <entry><title>Hello</title></entry>
103
+ <entry><title>World</title></entry>
104
+ </feed>
105
+ XML
106
+
107
+ # Namespace matching is strict, so a default namespace needs a registered prefix.
108
+ ns = { "a" => "http://www.w3.org/2005/Atom" }
109
+ doc.xpath("//entry").length # => 0 (default namespace)
110
+ doc.xpath("//a:entry", ns).length # => 2
111
+ doc.at_xpath("//a:entry/a:title", ns).text # => "Hello"
112
+
113
+ # Or reuse a context (caches registrations + compiled expressions):
114
+ ctx = Makiri::XPathContext.new(doc.root)
115
+ ctx.register_namespace("a", "http://www.w3.org/2005/Atom")
116
+ ctx.evaluate("//a:entry").length # => 2
117
+
118
+ el = doc.at_xpath("//a:entry", ns)
119
+ el.local_name # => "entry"
120
+ el.namespace_uri # => "http://www.w3.org/2005/Atom"
121
+
122
+ doc.css("entry") # raises NotImplementedError (use #xpath)
123
+
124
+ # Serialize back to XML
125
+ doc.to_xml # => "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<feed ...>...</feed>\n"
126
+ doc.at_xpath("//a:entry", ns).to_xml # => "<entry><title>Hello</title></entry>" (no declaration)
127
+ doc.to_xml(pretty: true) # indented, element-only content
128
+
129
+ # DOCTYPE is recognized but the DTD is not processed (no entities, no I/O):
130
+ dtd = Makiri::XML(%(<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0//EN" "x.dtd"><html/>))
131
+ .internal_subset
132
+ dtd.name # => "html"
133
+ dtd.external_id # => "-//W3C//DTD XHTML 1.0//EN" (alias: #public_id)
134
+ dtd.system_id # => "x.dtd"
135
+ ```
136
+
137
+ Comments and processing instructions in the prolog/epilog are document-node
138
+ children (reachable via `//comment()` / `//processing-instruction()` and
139
+ `#children`), and adjacent CDATA is coalesced - matching libxml2 and the XPath
140
+ data model. `#to_xml` / `#to_s` serialize the tree back to XML (`pretty: true`,
141
+ or `indent: n`, for indented element-only content; `encoding: "Shift_JIS"` to
142
+ transcode, with a hex character reference for anything the encoding can't hold);
143
+ a `Document#to_xml` adds the declaration and the DOCTYPE. `#canonicalize` emits
144
+ Inclusive Canonical XML 1.0 (for XML signatures; `comments: true` to keep
145
+ comments), byte-identical to libxml2. CSS is intentionally unavailable for XML
146
+ (Lexbor's selector engine lower-cases names, which breaks XML case/namespace
147
+ matching) - use XPath.
148
+
149
+ The tree supports in-place mutation - every edit validates its input (names as
150
+ XML 1.0 QNames, values as XML Char) so the tree stays serializable to
151
+ well-formed XML, and a removed node is detached, never freed, so a live wrapper
152
+ that aliases it stays usable:
153
+
154
+ ```ruby
155
+ doc = Makiri::XML(%(<feed xmlns:dc="urn:dc"><entry id="1">Hi</entry><draft/></feed>))
156
+ e = doc.at_xpath("//entry")
157
+
158
+ e["id"] = "9" # add or replace an attribute (value escaped on output)
159
+ e["dc:k"] = "v" # a prefixed name resolves against the in-scope xmlns
160
+ e.content = "Bye" # replace an element's children with text
161
+ e.name = "post" # rename in place (identity + namespace re-resolved)
162
+ e.delete("id") # remove an attribute
163
+ doc.at_xpath("//draft").remove
164
+
165
+ doc.root.to_xml # => "<feed xmlns:dc=\"urn:dc\"><post dc:k=\"v\">Bye</post></feed>"
166
+ ```
167
+
168
+ New subtrees can be built too - `Document#create_element` (and
169
+ `#create_text_node` / `#create_comment` / `#create_cdata` /
170
+ `#create_processing_instruction`) make detached nodes, and `#add_child` / `<<`,
171
+ `#add_previous_sibling` / `#before`, `#add_next_sibling` / `#after`, `#replace`
172
+ link them. A node's namespace is resolved against its position **at insertion**
173
+ (a prefixed name binds to the in-scope `xmlns`, an unprefixed element to the
174
+ default namespace), so the same tree results whether you set names before or
175
+ after attaching; an unbound prefix in the live tree fails closed. A node from
176
+ another document is **deep-copied** into the target (the source is untouched):
177
+
178
+ ```ruby
179
+ doc = Makiri::XML(%(<feed xmlns="urn:a" xmlns:dc="urn:dc"/>))
180
+ entry = doc.create_element("entry")
181
+ entry["dc:id"] = "42" # prefixed attr resolves on insertion
182
+ entry.add_child(doc.create_element("title", "Hello"))
183
+ doc.root.add_child(entry)
184
+
185
+ doc.to_xml # => "...<entry dc:id=\"42\"><title>Hello</title></entry>..."
186
+ ```
187
+
188
+ Supported edits: `#[]=`, `#delete` / `#remove_attribute`, `#content=`, `#name=`,
189
+ `#remove` / `#unlink`, the factories above, and `#add_child` / `<<` /
190
+ `#before` / `#after` / `#replace`. Insertion takes a `Makiri::XML` node or a
191
+ `DocumentFragment` (its children are spliced in); a fragment is parsed by
192
+ `Document#fragment(str)` (bound to the document) or `DocumentFragment.parse(str)`
193
+ (standalone). A raw string handed straight to `#add_child` is **not** accepted -
194
+ parse it into a fragment first. A whole document can also be built from scratch
195
+ with `XML::Document.new` + `#root=` and the factories.
196
+
197
+ The character encoding is autodetected (XML 1.0 Appendix F): a byte-order mark or
198
+ the `<?xml encoding="..."?>` declaration selects it, so raw bytes (`File.binread`)
199
+ in UTF-16, Shift_JIS, etc. parse correctly and a leading BOM is stripped. A
200
+ concrete String encoding stays authoritative - a BOM or declaration that
201
+ contradicts it is a fatal error, not a silent mis-decode.
202
+
203
+ Parsing is DoS-bounded by a single arena memory ceiling (default 256 MiB,
204
+ counting node structs and text), which fits every standard document. Raise it
205
+ per parse for an unusually large one:
206
+
207
+ ```ruby
208
+ Makiri::XML(huge_xml, max_bytes: 512 * 1024 * 1024) # also Makiri::XML::Document.parse(..., max_bytes:)
209
+ ```
210
+
211
+ Conformance is held by a regression net: the **W3C XML Conformance Test Suite**
212
+ (`rake conformance:xmlconf`, 100% of the in-scope non-validating XML-1.0 tests),
213
+ an XPath 1.0 differential vs Nokogiri/libxml2 (`rake conformance:xpath_xml`), and
214
+ property-based testing that requires Makiri's tree to be byte-identical to
215
+ Nokogiri's over generated documents (`rake conformance:xml_pbt`).
216
+
75
217
  ## Non-goals (v1.0)
76
218
 
77
- * XML parsing (HTML only).
219
+ * Passing a raw markup string straight to an insertion method
220
+ (`node.add_child("<x/>")`); parse it into a fragment first
221
+ (`Document#fragment` / `DocumentFragment.parse`). (Building XML from scratch
222
+ (`XML::Document.new` + `#root=`), the node factories - `Document#create_element`
223
+ etc. - fragments, node insertion (`#add_child` / `#before` / `#after` /
224
+ `#replace`), and `#to_xml` serialization ARE supported.)
78
225
  * XSLT, DTD / Schema / RelaxNG validation, XPointer, XInclude.
79
226
  * Streaming / SAX parsing.
80
227
  * Drop-in replacement for every Nokogiri method. Makiri covers the common
81
228
  HTML-scraping and manipulation surface. Deliberately not provided:
82
- - XML/XHTML serialization variants (`to_xml`, `to_xhtml`, `write_xml_to`)
229
+ - XHTML serialization variants (`to_xhtml`, `write_xml_to`); `#to_xml` is supported
83
230
  - XML/DTD construction (`create_internal_subset`, `external_subset`)
84
- - namespace introspection beyond `namespace-uri()` (`namespace_definitions`, `add_namespace`, `collect_namespaces`)
231
+ - namespace *mutation* (`add_namespace_definition`); read introspection
232
+ (`#namespace`, `#namespace_definitions`, `#namespaces`, `#collect_namespaces`)
233
+ is supported on `Makiri::XML` nodes
85
234
  - Nokogiri internals (`decorate`, `slop!`, `validate`).
86
235
 
87
236
  ## Differences from Nokogiri
@@ -103,9 +252,26 @@ Detailed, test-backed notes live in `spec/conformance/README.md`.
103
252
  * `namespace-uri()` of an HTML element returns the XHTML URI (DOM-correct, as browsers report)
104
253
  * `Nokogiri::HTML5` returns `""`.
105
254
 
255
+ ### XML
256
+
257
+ * `Makiri::XML` is **XML 1.0 only and non-validating**.
258
+ * A `version="1.1"` declaration is rejected; Nokogiri parses XML 1.1.
259
+ * The DTD is recognized but not processed: DTD-defined entities are not
260
+ expanded and DTD default attributes are not applied (Nokogiri/libxml2 can do
261
+ both). External entities/subsets are never fetched (no I/O).
262
+ * Mutation supports in-place edits, the node factories, fragments
263
+ (`Document#fragment` / `DocumentFragment.parse`), node insertion, and building
264
+ a document from scratch (`XML::Document.new` + `#root=`); only handing a raw
265
+ markup string straight to `#add_child` is unsupported (parse it into a fragment
266
+ first). (`#to_xml` serialization is supported; HTML serialization - `to_html`
267
+ / `inner_html` / `outer_html` - is not.)
268
+ * Otherwise the parsed tree is byte-identical to `Nokogiri::XML`'s (verified by
269
+ the property-based differential), including namespaces, prolog/epilog comments
270
+ and PIs, and adjacent-CDATA coalescing.
271
+
106
272
  ### CSS
107
273
 
108
- * jQuery/Nokogiri CSS extensions are not supported (`:contains`, `:gt`, `:lt`, `:eq`, `:first`, …)
274
+ * jQuery/Nokogiri CSS extensions are not supported (`:contains`, `:gt`, `:lt`, `:eq`, `:first`, ...)
109
275
  * Makiri uses Lexbor's standards-only selector engine.
110
276
  Use XPath (`xpath("//p[contains(., 'x')]")`) or Enumerable (`css('li')[1]`).
111
277
  Standard Level-4 selectors (`:is` / `:where` / `:has`) are supported, some of which Nokogiri rejects.
data/Rakefile CHANGED
@@ -7,6 +7,24 @@ require "shellwords"
7
7
 
8
8
  GEMSPEC = Gem::Specification.load("makiri.gemspec")
9
9
 
10
+ # Replace bundler/gem_tasks' `release` (which builds a source-only gem and
11
+ # `gem push`es it from the dev machine) with a tag push: it hands the build,
12
+ # GitHub Release, and the approval-gated RubyGems publish off to CI
13
+ # (.github/workflows/release.yml). Nothing is pushed to RubyGems locally.
14
+ Rake::Task["release"].clear
15
+ desc "Tag v#{GEMSPEC.version} and push it; CI builds, releases, and publishes"
16
+ task release: %w[release:guard_clean release:source_control_push] do
17
+ puts <<~MSG
18
+
19
+ Pushed tag v#{GEMSPEC.version}. GitHub Actions (release.yml) will now:
20
+ 1. build the source gem + precompiled native gems,
21
+ 2. create the GitHub Release and attach them, then
22
+ 3. publish to RubyGems via OIDC - after the `rubygems` environment approval.
23
+ Approve the pending deployment in the Actions run to publish; nothing is
24
+ pushed to RubyGems from this machine.
25
+ MSG
26
+ end
27
+
10
28
  Rake::ExtensionTask.new("makiri", GEMSPEC) do |ext|
11
29
  ext.lib_dir = "lib/makiri"
12
30
  ext.ext_dir = "ext/makiri"
@@ -26,7 +44,7 @@ end
26
44
 
27
45
  # `rake clean` (from rake-compiler) removes the ext build dir under tmp/,
28
46
  # including the generated Makefile. The next `rake compile` re-runs extconf,
29
- # so newly-added .c files are picked up without this, a stale Makefile omits
47
+ # so newly-added .c files are picked up - without this, a stale Makefile omits
30
48
  # new sources and macOS's -undefined dynamic_lookup turns the missing symbols
31
49
  # into runtime NULL calls. The vendored Lexbor build is deliberately NOT wiped
32
50
  # here (it is slow to rebuild and rarely changes); use `rake clean:lexbor` for
@@ -63,6 +81,17 @@ def asan_runtime_path
63
81
  nil
64
82
  end
65
83
 
84
+ # The compiled extension, and whether it carries sanitizer instrumentation, so
85
+ # `fuzz:sanitize SKIP_BUILD=1` can refuse to run a plain (non-ASan) build.
86
+ def ext_bundle_path
87
+ Dir["lib/makiri/makiri.{bundle,so}"].first
88
+ end
89
+
90
+ def ext_sanitized?
91
+ bundle = ext_bundle_path or return false
92
+ !(`nm "#{bundle}" 2>/dev/null` =~ /asan|ubsan/i).nil?
93
+ end
94
+
66
95
  desc "Build the extension with sanitizers (MAKIRI_SANITIZE, default " \
67
96
  "address,undefined) and run the spec suite under them"
68
97
  task :sanitize do
@@ -92,6 +121,16 @@ task fuzz: :compile do
92
121
  sh "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}"
93
122
  end
94
123
 
124
+ desc "Fuzz the XML parser (hostile/mutated documents; override via FUZZ_ARGS)"
125
+ task "fuzz:xml": :compile do
126
+ sh "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb --target xml #{ENV['FUZZ_ARGS']}"
127
+ end
128
+
129
+ desc "Fuzz the XML mutation surface (random edit sequences + invariants; override via FUZZ_ARGS)"
130
+ task "fuzz:mutate": :compile do
131
+ sh "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb --target mutate #{ENV['FUZZ_ARGS']}"
132
+ end
133
+
95
134
  desc "Run the performance benchmark (Makiri vs Nokogiri reference)"
96
135
  task bench: :compile do
97
136
  # Run outside the bundle so the bench-only gems (nokogiri, benchmark-ips)
@@ -101,6 +140,13 @@ task bench: :compile do
101
140
  end
102
141
  end
103
142
 
143
+ desc "Run the XML reader benchmark (Makiri::XML vs Nokogiri::XML reference)"
144
+ task "bench:xml" => :compile do
145
+ Bundler.with_unbundled_env do
146
+ sh "#{FileUtils::RUBY} -Ilib bench/bench_xml.rb"
147
+ end
148
+ end
149
+
104
150
  namespace :conformance do
105
151
  desc "WHATWG HTML5 parsing conformance: run html5lib-tests through Makiri"
106
152
  task html5: :compile do
@@ -116,6 +162,28 @@ namespace :conformance do
116
162
  end
117
163
  end
118
164
 
165
+ desc "XML XPath 1.0 differential conformance: Makiri::XML vs Nokogiri::XML"
166
+ task xpath_xml: :compile do
167
+ Bundler.with_unbundled_env do
168
+ sh "#{FileUtils::RUBY} -Ilib spec/conformance/xml_xpath_diff.rb #{ENV['XPATH_ARGS']}"
169
+ end
170
+ end
171
+
172
+ desc "W3C XML Conformance Test Suite: well-formedness through Makiri::XML"
173
+ task xmlconf: :compile do
174
+ # Nokogiri (bench-only) parses the manifests, so run outside the bundle.
175
+ Bundler.with_unbundled_env do
176
+ sh "#{FileUtils::RUBY} -Ilib spec/conformance/xmlconf_runner.rb #{ENV['XMLCONF_ARGS']}"
177
+ end
178
+ end
179
+
180
+ desc "Property-based XML differential: generated documents, Makiri vs Nokogiri tree"
181
+ task xml_pbt: :compile do
182
+ Bundler.with_unbundled_env do
183
+ sh "#{FileUtils::RUBY} -Ilib spec/conformance/xml_pbt_diff.rb #{ENV['PBT_ARGS']}"
184
+ end
185
+ end
186
+
119
187
  desc "CSS Selectors differential conformance vs Nokogiri::HTML5"
120
188
  task css: :compile do
121
189
  Bundler.with_unbundled_env do
@@ -124,14 +192,31 @@ namespace :conformance do
124
192
  end
125
193
  end
126
194
 
127
- desc "Run all conformance suites (html5lib-tests + XPath & CSS differentials)"
128
- task conformance: %w[conformance:html5 conformance:xpath conformance:css]
195
+ desc "Run all conformance suites"
196
+ task conformance: %w[conformance:html5 conformance:xpath conformance:css conformance:xmlconf conformance:xpath_xml]
129
197
 
130
198
  namespace :fuzz do
131
- desc "Run the fuzzer under AddressSanitizer (rebuilds the ext; --isolated)"
199
+ # Run the fuzzer under the sanitizer. Toggles (all via env):
200
+ # FAST=1 run the surfaces NON-isolated (one process, no fork-per-query).
201
+ # Far higher throughput; ASan still aborts on a memory error
202
+ # (halt_on_error). The default (isolated) is the complete net:
203
+ # it also survives + attributes a genuine segfault and catches a
204
+ # hang via the per-query timeout, at much lower throughput.
205
+ # SKIP_BUILD=1 reuse the current build instead of rebuilding (refuses to run
206
+ # if it is not a sanitizer build, so you never fuzz a plain ext).
207
+ # FUZZ_TIME=N seconds per surface (default 90).
208
+ # FUZZ_ARGS=... run a single custom invocation instead of the three surfaces.
209
+ desc "Run the fuzzer under AddressSanitizer (FAST=1 non-isolated, SKIP_BUILD=1 reuse build)"
132
210
  task :sanitize do
133
211
  sanitize = ENV["MAKIRI_SANITIZE"] || "address,undefined"
134
- sh({ "MAKIRI_SANITIZE" => sanitize }, "#{FileUtils::RUBY} -S rake clean compile")
212
+ if %w[1 true yes].include?(ENV["SKIP_BUILD"].to_s.downcase)
213
+ ext_sanitized? or
214
+ abort "fuzz:sanitize: SKIP_BUILD set but lib/makiri is not a sanitizer build; " \
215
+ "drop SKIP_BUILD to rebuild with MAKIRI_SANITIZE"
216
+ puts "fuzz:sanitize: reusing the existing sanitizer build (SKIP_BUILD)"
217
+ else
218
+ sh({ "MAKIRI_SANITIZE" => sanitize }, "#{FileUtils::RUBY} -S rake clean compile")
219
+ end
135
220
 
136
221
  env = {
137
222
  "ASAN_OPTIONS" => "detect_leaks=0:detect_container_overflow=0:" \
@@ -144,7 +229,18 @@ namespace :fuzz do
144
229
  preload = RbConfig::CONFIG["target_os"] =~ /darwin/ ? "DYLD_INSERT_LIBRARIES" : "LD_PRELOAD"
145
230
  env[preload] = runtime
146
231
  end
147
- args = ENV["FUZZ_ARGS"] || "--isolated --time 120"
148
- sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{args}")
232
+
233
+ if ENV["FUZZ_ARGS"]
234
+ sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}")
235
+ else
236
+ iso = %w[1 true yes].include?(ENV["FAST"].to_s.downcase) ? "" : "--isolated"
237
+ secs = ENV["FUZZ_TIME"] || "90"
238
+ # Cover every surface under the sanitizer: the query engine (XPath/CSS over
239
+ # parsed fixtures), the XML parser (hostile documents), and the XML mutation
240
+ # surface (random edit sequences + invariants).
241
+ ["", "--target xml", "--target mutate"].each do |surface|
242
+ sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{surface} #{iso} --time #{secs}".squeeze(" ").strip)
243
+ end
244
+ end
149
245
  end
150
246
  end
@@ -46,6 +46,34 @@ mkr_ruby_borrowed_bytes_t mkr_ruby_bytes_view(VALUE in);
46
46
  * for an empty input), suitable for use while the GVL is released. */
47
47
  int mkr_ruby_copy_bytes(VALUE in, mkr_owned_bytes_t *out);
48
48
 
49
+ /* Return a UTF-8 Ruby String for `str`, honouring its declared encoding: UTF-8 /
50
+ * US-ASCII / ASCII-8BIT are returned unchanged (the parser handles their bytes
51
+ * directly); any other encoding is transcoded to UTF-8 (invalid/undef -> U+FFFD)
52
+ * so its content is preserved rather than read as raw UTF-8. The UTF-8 common
53
+ * case is a single encoding comparison. */
54
+ VALUE mkr_ruby_to_utf8(VALUE str);
55
+
56
+ /* STRICT decode for XML (§2.1): like mkr_ruby_to_utf8 it honours the String's
57
+ * declared encoding (UTF-8 / US-ASCII / ASCII-8BIT pass through; any other
58
+ * encoding is transcoded to UTF-8) - but FAIL-CLOSED, never lenient: a non-UTF-8
59
+ * byte that can't be converted, invalid UTF-8, or an embedded NUL all raise
60
+ * Makiri::XML::SyntaxError (no U+FFFD replacement). Returns a validated,
61
+ * UTF-8-tagged Ruby String. (The HTML replace path mkr_ruby_to_utf8 itself is
62
+ * NOT reused for the conversion - only its encoding-judgment rule is shared.)
63
+ *
64
+ * +max_bytes+ bounds the decoded UTF-8 length: an input that already exceeds the
65
+ * parser's arena byte budget is rejected here with Makiri::XML::LimitExceeded,
66
+ * before the validation copy and the caller's GVL-release copy (so a hostile
67
+ * oversized document is not copied twice for a doomed parse). 0 disables the
68
+ * check (decode-only callers that build no arena). */
69
+ VALUE mkr_xml_decode_input(VALUE str, size_t max_bytes);
70
+
71
+ /* True if `str` is *already known* to be valid UTF-8 - pure ASCII, or valid in
72
+ * the UTF-8 encoding - from its cached coderange, WITHOUT forcing a scan. Lets
73
+ * the parse skip mkr_utf8_sanitize's validation pass for input Ruby has already
74
+ * classified (an unknown/broken coderange returns false: sanitize handles it). */
75
+ bool mkr_ruby_str_known_valid_utf8(VALUE str);
76
+
49
77
  /* Validate a Ruby String for use as an XPath engine string: valid UTF-8,
50
78
  * no interior NUL, and at most +max_bytes+. Returns NULL on success and fills
51
79
  * +out+; otherwise returns a static reason string. +sv+ must be a String. */