makiri 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/release.yml +12 -7
- data/CHANGELOG.md +93 -14
- data/README.md +173 -7
- data/Rakefile +103 -7
- data/ext/makiri/bridge/bridge.h +28 -0
- data/ext/makiri/bridge/ruby_string.c +217 -0
- data/ext/makiri/core/mkr_alloc.h +1 -1
- data/ext/makiri/core/mkr_buf.c +35 -1
- data/ext/makiri/core/mkr_buf.h +37 -3
- data/ext/makiri/core/mkr_core.h +1 -1
- data/ext/makiri/core/mkr_hash.h +1 -1
- data/ext/makiri/core/mkr_text.h +8 -8
- data/ext/makiri/extconf.rb +20 -2
- data/ext/makiri/glue/glue.h +47 -11
- data/ext/makiri/glue/ruby_doc.c +117 -43
- data/ext/makiri/glue/ruby_html_css.c +246 -0
- data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +242 -51
- data/ext/makiri/glue/ruby_html_node.c +888 -0
- data/ext/makiri/glue/ruby_html_serialize.c +154 -0
- data/ext/makiri/glue/ruby_node.c +54 -748
- data/ext/makiri/glue/ruby_node_set.c +167 -32
- data/ext/makiri/glue/ruby_xml.c +420 -0
- data/ext/makiri/glue/ruby_xml_node.c +1386 -0
- data/ext/makiri/glue/ruby_xpath.c +59 -26
- data/ext/makiri/glue/ruby_xpath.h +19 -0
- data/ext/makiri/lexbor_compat/compat.h +42 -9
- data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
- data/ext/makiri/lexbor_compat/dom_index.c +2 -2
- data/ext/makiri/lexbor_compat/post_parse.c +100 -10
- data/ext/makiri/lexbor_compat/source_loc.c +13 -9
- data/ext/makiri/lexbor_compat/text_index.c +14 -8
- data/ext/makiri/lexbor_compat/utf8_input.c +85 -26
- data/ext/makiri/makiri.c +139 -6
- data/ext/makiri/makiri.h +43 -2
- data/ext/makiri/xml/mkr_xml.h +126 -0
- data/ext/makiri/xml/mkr_xml_chars.c +225 -0
- data/ext/makiri/xml/mkr_xml_mutate.c +875 -0
- data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
- data/ext/makiri/xml/mkr_xml_node.c +267 -0
- data/ext/makiri/xml/mkr_xml_node.h +119 -0
- data/ext/makiri/xml/mkr_xml_tree.c +1479 -0
- data/ext/makiri/xpath/mkr_xpath.c +59 -32
- data/ext/makiri/xpath/mkr_xpath.h +96 -4
- data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
- data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
- data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +202 -175
- data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +110 -86
- data/ext/makiri/xpath/mkr_xpath_internal.h +91 -200
- data/ext/makiri/xpath/mkr_xpath_lex.c +2 -2
- data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
- data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +142 -0
- data/ext/makiri/xpath/mkr_xpath_parse.c +5 -5
- data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
- data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
- data/ext/makiri/xpath/mkr_xpath_shared.c +593 -0
- data/ext/makiri/xpath/{mkr_xpath_value.c → mkr_xpath_value_body.h} +145 -656
- data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
- data/lib/makiri/{attribute.rb → attr.rb} +7 -3
- data/lib/makiri/cdata_section.rb +21 -0
- data/lib/makiri/comment.rb +12 -0
- data/lib/makiri/compat_aliases.rb +30 -0
- data/lib/makiri/document.rb +4 -76
- data/lib/makiri/document_fragment.rb +14 -9
- data/lib/makiri/element.rb +5 -3
- data/lib/makiri/html/document.rb +106 -0
- data/lib/makiri/html/node_methods.rb +19 -0
- data/lib/makiri/html.rb +12 -0
- data/lib/makiri/node.rb +58 -15
- data/lib/makiri/node_set.rb +8 -0
- data/lib/makiri/processing_instruction.rb +12 -0
- data/lib/makiri/text.rb +2 -0
- data/lib/makiri/version.rb +1 -1
- data/lib/makiri/xml/document.rb +24 -0
- data/lib/makiri/xml/node_methods.rb +37 -0
- data/lib/makiri/xml.rb +10 -0
- data/lib/makiri/xpath_context.rb +1 -1
- data/lib/makiri.rb +23 -5
- data/script/build_native_gem.rb +2 -2
- data/script/check_c_safety.rb +32 -0
- data/script/check_c_safety_allowlist.yml +83 -0
- metadata +35 -9
- data/ext/makiri/glue/ruby_css.c +0 -185
- data/ext/makiri/glue/ruby_serialize.c +0 -92
- data/lib/makiri/cdata.rb +0 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: b0cf63c9d861e721a52064dccc929db0a8f823d485f69854f07d90b805913db0
|
|
4
|
+
data.tar.gz: 989e0d0b1430b202147cd4f0fec411d0377114f34ae380217b683b6b63d031e6
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 13598e1f45341c8fed3924da8bbe913cf55ef5c1b9256193db18ae3cf2bb9ae0f4816370d8ca569139b33e7194aced65656c1314989b882ea612a44e3750e84b
|
|
7
|
+
data.tar.gz: 71c9da99e6f26fb8a034efba1d0642b37cdcd3ddf212c6f977ad3c868b104162ca86f0c721606024286a06c858326508e41c8624687c03fa3d7a205461926faf
|
|
@@ -196,17 +196,22 @@ jobs:
|
|
|
196
196
|
$pre --verify-tag || \
|
|
197
197
|
gh release upload "${GITHUB_REF_NAME}" dist/*.gem --repo "${GITHUB_REPOSITORY}" --clobber
|
|
198
198
|
|
|
199
|
-
# ---
|
|
200
|
-
#
|
|
201
|
-
#
|
|
202
|
-
#
|
|
203
|
-
#
|
|
199
|
+
# --- publish to RubyGems, behind the `rubygems` environment approval gate ---
|
|
200
|
+
# Held until the `rubygems` environment's Required-reviewers rule is approved,
|
|
201
|
+
# so a tag push releases on GitHub immediately but the RubyGems push waits.
|
|
202
|
+
#
|
|
203
|
+
# Auth is RubyGems Trusted Publishing (OIDC): no stored API key. Configure a
|
|
204
|
+
# matching Trusted Publisher on RubyGems.org (owner=takahashim, repo=makiri,
|
|
205
|
+
# workflow=release.yml, Environment=rubygems) so the token is only accepted
|
|
206
|
+
# through this gated environment.
|
|
204
207
|
publish:
|
|
205
208
|
name: Publish to RubyGems
|
|
206
209
|
needs: [source-gem, native-gem]
|
|
207
|
-
if:
|
|
210
|
+
if: >-
|
|
211
|
+
startsWith(github.ref, 'refs/tags/') ||
|
|
212
|
+
(github.event_name == 'workflow_dispatch' && inputs.publish_to_rubygems)
|
|
208
213
|
runs-on: ubuntu-latest
|
|
209
|
-
environment: rubygems
|
|
214
|
+
environment: rubygems
|
|
210
215
|
permissions:
|
|
211
216
|
contents: read
|
|
212
217
|
id-token: write # OIDC identity token for Trusted Publishing
|
data/CHANGELOG.md
CHANGED
|
@@ -7,28 +7,106 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.3.0] - 2026-06-06
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
* **Native XML 1.0 reader + in-place editor** - `Makiri::XML::Document.parse(source)`
|
|
15
|
+
/ `Makiri::XML(source)`. No libxml2: a strict, fail-closed parser builds its own
|
|
16
|
+
node arena (case- and namespace-preserving), queried by the native XPath engine.
|
|
17
|
+
* Strict & secure: fail-closed decode (bad UTF-8 / NUL -> `XML::SyntaxError`),
|
|
18
|
+
duplicate attributes rejected, XML 1.0 only; verified against the W3C XML
|
|
19
|
+
Conformance Test Suite.
|
|
20
|
+
* Encoding autodetected (BOM / `<?xml encoding?>`); a contradicting String
|
|
21
|
+
encoding is a fatal error, not a silent mis-decode.
|
|
22
|
+
* DoS-bounded by a single arena byte ceiling (default 256 MiB; raise per parse
|
|
23
|
+
with `max_bytes:`).
|
|
24
|
+
* `<!DOCTYPE>` recognized but **not processed** (`#internal_subset` ->
|
|
25
|
+
`XML::DocumentType`); zero entity/DTD I/O, so **XXE and billion-laughs are
|
|
26
|
+
structurally impossible**. Kept off the tree, as in libxml2.
|
|
27
|
+
* Read API mirrors Nokogiri: `#xpath` / `#at_xpath` (`{prefix => uri}`),
|
|
28
|
+
name/namespace readers, `#text`, `#[]`, traversal, and namespace introspection
|
|
29
|
+
(`Makiri::XML::Namespace`); `XPathContext` works over XML nodes too.
|
|
30
|
+
* Prolog/epilog comments & PIs kept on the document node; adjacent same-type
|
|
31
|
+
character data coalesced - byte-identical to Nokogiri (property-based diff).
|
|
32
|
+
* `#to_xml` / `#to_s` (`pretty:` / `indent:` / `encoding:`) and `#canonicalize`
|
|
33
|
+
(Inclusive C14N 1.0, byte-identical to libxml2); buffers fail closed.
|
|
34
|
+
* Unsupported surface raises `NotImplementedError`: `#css` / `#at_css` and HTML
|
|
35
|
+
serialization.
|
|
36
|
+
* Tree mutation - fully fail-closed, detach-never-destroy:
|
|
37
|
+
* in-place: `#[]=` / `#delete`, `#content=`, `#name=`, `#remove` / `#unlink`;
|
|
38
|
+
* factories: `Document#create_{element,text_node,comment,cdata,processing_instruction}`
|
|
39
|
+
(+ Nokogiri-style `.new` constructors);
|
|
40
|
+
* insertion: `#add_child` / `<<`, `#before` / `#after`, `#replace` - namespaces
|
|
41
|
+
resolved at the insertion point; a cross-document insert deep-copies;
|
|
42
|
+
* fragments: `XML::DocumentFragment.parse` / `XML::Document#fragment`;
|
|
43
|
+
* from scratch: `XML::Document.new` + `#root=`.
|
|
44
|
+
* `XML::Element#element_children` and `Node#clone_node` for XML nodes (also enabling
|
|
45
|
+
`Node#dup` / `#clone`); a clone keeps name case, namespace and the CDATA type.
|
|
46
|
+
* `Node` includes `Enumerable` over its child nodes (`each` / `map` / `select` / ...).
|
|
47
|
+
* `Node#<=>` + `Comparable` - sort by document position (`nil` across documents or
|
|
48
|
+
for attributes).
|
|
49
|
+
* `NodeSet.new(document_or_node, list = [])` - foreign / cross-representation nodes
|
|
50
|
+
are rejected.
|
|
51
|
+
* `NodeSet#[]` accepts a `Range` or `start, length` (like `Array#[]`).
|
|
52
|
+
* `Node` / `NodeSet` / `Document` `#dup` / `#clone` now return real independent
|
|
53
|
+
copies (`#dup(0)` shallow; `#clone(freeze:)` honoured).
|
|
54
|
+
* A **frozen node is genuinely immutable** - every mutator raises `FrozenError`.
|
|
55
|
+
|
|
56
|
+
### Changed
|
|
57
|
+
|
|
58
|
+
* CSS queries reuse one shared Lexbor engine (GVL-safe) and `at_css` wraps the match
|
|
59
|
+
directly: `at_css('#id')` ~5x faster than nokolexbor (was ~1.16x slower).
|
|
60
|
+
* HTML serialization pre-reserves its buffer - `to_html` now at parity with nokolexbor.
|
|
61
|
+
* Node-class names are the WHATWG DOM interface names (`CDATASection`, `Attr`,
|
|
62
|
+
`DocumentType`, ...), with the Nokogiri spellings (`CDATA`, `DTD`) kept as aliases;
|
|
63
|
+
added `Node#cdata?`.
|
|
64
|
+
* Text-index range table uses `uint32` bounds (24 -> 16 B/entry; ~27% less retained
|
|
65
|
+
index, byte-identical text).
|
|
66
|
+
* Parsing **honours the input String's encoding** - Shift_JIS / EUC-JP / ... are now
|
|
67
|
+
transcoded to UTF-8 instead of mangled.
|
|
68
|
+
* Parsing skips its UTF-8 validation scan when the String's coderange already proves
|
|
69
|
+
it valid.
|
|
70
|
+
* Faster HTML parse/serialize: `memchr` line table + validate-only UTF-8 scan (~7%),
|
|
71
|
+
and a single-copy serializer buffer (~1.2-1.3x).
|
|
72
|
+
|
|
73
|
+
### Fixed
|
|
74
|
+
|
|
75
|
+
* **Hardened the HTML/XML representation boundary.** HTML (Lexbor) and XML (arena)
|
|
76
|
+
nodes are now distinct TypedData types, so the wrong representation raises
|
|
77
|
+
`TypeError` instead of corrupting memory:
|
|
78
|
+
* `Node#==` / `XPathContext#node=` with an XML `Document` no longer aborts the
|
|
79
|
+
process;
|
|
80
|
+
* `NodeSet#|` / `+` / `&` / `-` across different documents raise `Makiri::Error`
|
|
81
|
+
(was a silent mis-wrap);
|
|
82
|
+
* HTML-only APIs (`import_node`, `add_child` / `before` / `after` / `replace`,
|
|
83
|
+
`fragment(context:)`) reject an XML node argument (was a segfault).
|
|
84
|
+
* The bundle exported the entire vendored Lexbor symbol table (~1700 `lxb_*`); now
|
|
85
|
+
only `Init_makiri` is exported, so loading alongside another Lexbor gem (e.g.
|
|
86
|
+
nokolexbor) no longer segfaults. (Precompiled gems: rebuild required.)
|
|
87
|
+
|
|
10
88
|
## [0.2.0] - 2026-06-04
|
|
11
89
|
|
|
12
90
|
### Added
|
|
13
91
|
|
|
14
|
-
* `Element#tag_name` (DOM `tagName`)
|
|
92
|
+
* `Element#tag_name` (DOM `tagName`) - the qualified name uppercased for an
|
|
15
93
|
HTML element in an HTML document (`"DIV"`), keeping the original case for
|
|
16
94
|
SVG/MathML; `nil` for non-elements. Complements `#name`, which stays the
|
|
17
95
|
lowercase qualified name.
|
|
18
|
-
* `ProcessingInstruction#target` (DOM `target`)
|
|
96
|
+
* `ProcessingInstruction#target` (DOM `target`) - a PI's target name; `nil` for
|
|
19
97
|
other node kinds. Its data is read via `#content`/`#text`.
|
|
20
98
|
* `Document#create_processing_instruction(target, data)` (DOM
|
|
21
99
|
`createProcessingInstruction`) and `Document#create_document_fragment` (DOM
|
|
22
|
-
`createDocumentFragment`, an empty fragment to build up programmatically
|
|
100
|
+
`createDocumentFragment`, an empty fragment to build up programmatically -
|
|
23
101
|
unlike `#fragment` / `DocumentFragment.parse`, which parse HTML). Both produce
|
|
24
102
|
a detached node owned by the document; PI creation fails closed when the data
|
|
25
103
|
contains the `?>` terminator (matching the DOM constraint). (DOM
|
|
26
104
|
`createCDATASection` is intentionally not provided: per WHATWG DOM it throws on
|
|
27
105
|
an HTML document, which is the only kind Makiri produces.)
|
|
28
|
-
* `Node#{namespace_uri, prefix, local_name}`
|
|
106
|
+
* `Node#{namespace_uri, prefix, local_name}` - the WHATWG DOM per-node
|
|
29
107
|
namespace accessors on `Element` and `Attribute` (`nil` on other node kinds).
|
|
30
108
|
`namespace_uri` resolves an element's namespace from its node (so an HTML
|
|
31
|
-
element is the XHTML namespace `http://www.w3.org/1999/xhtml`, not `nil`
|
|
109
|
+
element is the XHTML namespace `http://www.w3.org/1999/xhtml`, not `nil` - the
|
|
32
110
|
DOM-faithful value browsers and `namespace-uri()` return; SVG/MathML get their
|
|
33
111
|
own URI), and agrees byte-for-byte with the `namespace-uri()` XPath function.
|
|
34
112
|
For attributes it is `nil` unless prefixed, where it returns the parser-assigned
|
|
@@ -36,21 +114,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
36
114
|
segment of the qualified name (`nil` for the usual unprefixed HTML5 case), and
|
|
37
115
|
`local_name` is the name without that prefix. Previously a node's namespace was
|
|
38
116
|
reachable only through XPath (`namespace-uri()`/`local-name()`).
|
|
39
|
-
* `Node#clone_node(deep = false)`
|
|
117
|
+
* `Node#clone_node(deep = false)` - a copy of the node, owned by the same
|
|
40
118
|
document and detached from any parent (the DOM `cloneNode`, whose `deep`
|
|
41
|
-
defaults to `false`
|
|
119
|
+
defaults to `false` - a missing/`nil`/`false` argument is a shallow clone; a
|
|
42
120
|
truthy one copies the subtree). Built on the same `import_node` +
|
|
43
121
|
`<template>`-content fixup the fragment parser uses, so a deep-cloned
|
|
44
122
|
`<template>` keeps its contents. Fails closed: a failed import raises rather
|
|
45
123
|
than returning a partial node.
|
|
46
|
-
* `Document#import_node(node, deep = false)`
|
|
124
|
+
* `Document#import_node(node, deep = false)` - a copy of `node` owned by the
|
|
47
125
|
receiver document (the DOM `importNode`, whose `deep` likewise defaults to
|
|
48
126
|
`false`). Unlike `Node#clone_node`, the copy is owned by the target rather
|
|
49
127
|
than the node's own document, so it is the way to bring a node across
|
|
50
128
|
documents (Makiri never moves a node between arenas); the source is left
|
|
51
129
|
untouched. Same import + `<template>`-content fixup as `clone_node`, and fails
|
|
52
130
|
closed on a failed import.
|
|
53
|
-
* `Node#pointer_id`
|
|
131
|
+
* `Node#pointer_id` - the underlying `lxb_dom_node_t` pointer as an Integer,
|
|
54
132
|
matching `Nokogiri::XML::Node#pointer_id`. Shares the value `#hash`/`#eql?`
|
|
55
133
|
are built on, so it is a stable, Nokogiri-compatible identity key for
|
|
56
134
|
consumers (e.g. wrapper caches) that key nodes by pointer. Stable for a
|
|
@@ -73,19 +151,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
73
151
|
## [0.1.0] - 2026-06-02
|
|
74
152
|
|
|
75
153
|
First public release. An HTML5 parser, a native XPath 1.0 query engine, and CSS
|
|
76
|
-
selectors for Ruby
|
|
154
|
+
selectors for Ruby - built on vendored [Lexbor](https://lexbor.com/) with **no
|
|
77
155
|
libxml2 / libxslt dependency at any layer**.
|
|
78
156
|
|
|
79
157
|
### Added
|
|
80
158
|
|
|
81
159
|
**Parsing & DOM**
|
|
82
160
|
|
|
83
|
-
* `Makiri::HTML` / `Makiri.parse`
|
|
161
|
+
* `Makiri::HTML` / `Makiri.parse` - HTML5 parsing via vendored, unpatched Lexbor,
|
|
84
162
|
with browser-compatible UTF-8 decoding (invalid bytes → U+FFFD; parsing never
|
|
85
163
|
fails on bad bytes). Read-only navigation and attribute/text readers across
|
|
86
164
|
`Document`, `Element`, `Attribute`, `Text`, `CData`, `Comment`,
|
|
87
165
|
`ProcessingInstruction`, `DocumentType`, and `DocumentFragment`.
|
|
88
|
-
* `Node#line`
|
|
166
|
+
* `Node#line` - 1-based source line of an element, reconstructed from the
|
|
89
167
|
tokenizer without patching Lexbor (nil when the location is unknown).
|
|
90
168
|
* `Element#attribute_nodes` and `Attribute#{name,value,parent,element}`, backed
|
|
91
169
|
by a lazily-built attribute→owner index in the Lexbor compat layer.
|
|
@@ -138,7 +216,7 @@ libxml2 / libxslt dependency at any layer**.
|
|
|
138
216
|
* UTF-8 text-input contract: HTML and fragment parsing are lenient (invalid
|
|
139
217
|
bytes → U+FFFD, never reject), while strings passed to the XPath / CSS /
|
|
140
218
|
DOM-mutation APIs must be valid UTF-8 with no NUL byte, otherwise they raise
|
|
141
|
-
`Makiri::Error`
|
|
219
|
+
`Makiri::Error` - never silently truncated, repaired, or reinterpreted.
|
|
142
220
|
* Thread-safe by construction: parsing releases the GVL (concurrent parse scales
|
|
143
221
|
~2× on 8 cores), while XPath evaluation holds the GVL so sharing a document or
|
|
144
222
|
context across threads cannot corrupt memory. Fail-closed string caps and
|
|
@@ -161,6 +239,7 @@ libxml2 / libxslt dependency at any layer**.
|
|
|
161
239
|
domxpath, CSS differential vs `Nokogiri::HTML5`). GitHub Actions CI across
|
|
162
240
|
Ruby 3.2–4.0 × Ubuntu/macOS plus a sanitizer job.
|
|
163
241
|
|
|
164
|
-
[Unreleased]: https://github.com/takahashim/makiri/compare/v0.
|
|
242
|
+
[Unreleased]: https://github.com/takahashim/makiri/compare/v0.3.0...HEAD
|
|
243
|
+
[0.3.0]: https://github.com/takahashim/makiri/compare/v0.2.0...v0.3.0
|
|
165
244
|
[0.2.0]: https://github.com/takahashim/makiri/compare/v0.1.0...v0.2.0
|
|
166
245
|
[0.1.0]: https://github.com/takahashim/makiri/releases/tag/v0.1.0
|
data/README.md
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
# Makiri
|
|
2
2
|
|
|
3
|
-
Standards-oriented HTML5 parsing, CSS selector querying,
|
|
4
|
-
|
|
3
|
+
Standards-oriented HTML5/XML parsing, CSS selector querying, XPath 1.0 querying,
|
|
4
|
+
and a native XML 1.0 reader/editor for Ruby, powered by Lexbor and a native XPath
|
|
5
|
+
engine - with no libxml2 dependency.
|
|
5
6
|
|
|
6
7
|
> [!WARNING]
|
|
7
8
|
> Status: early release. APIs and behavior may change before v1.0.
|
|
@@ -20,6 +21,12 @@ XPath 1.0 evaluation in its own native engine, with no libxml2 dependency.
|
|
|
20
21
|
* Native XPath 1.0 engine
|
|
21
22
|
* XPath is parsed and evaluated by Makiri's own engine, written from scratch.
|
|
22
23
|
* Makiri does not depend on libxml2 for parsing, DOM representation, or XPath evaluation.
|
|
24
|
+
* Native XML 1.0 reader + in-place editor (`Makiri::XML`)
|
|
25
|
+
* A strict, non-validating, fail-closed parser with its own node arena (not
|
|
26
|
+
Lexbor's HTML DOM), queried through the same native XPath engine, with
|
|
27
|
+
in-place tree edits (attributes, content, rename, remove).
|
|
28
|
+
* Conformance is verified by the W3C XML Conformance Test Suite, an XPath
|
|
29
|
+
differential, and property-based testing vs Nokogiri (see below).
|
|
23
30
|
* Bounded, fail-closed execution
|
|
24
31
|
* XPath evaluation is bounded by per-evaluation limits on work, memory, and recursion.
|
|
25
32
|
* Ownership and borrowing are kept explicit across layers, with owned/borrowed
|
|
@@ -46,7 +53,7 @@ HTML
|
|
|
46
53
|
doc.css("a").map { |a| a["href"] } # => ["/a", "/b"]
|
|
47
54
|
doc.at_css("p.lead").text # => "Hello"
|
|
48
55
|
|
|
49
|
-
# XPath 1.0 (native engine
|
|
56
|
+
# XPath 1.0 (native engine - no libxml2)
|
|
50
57
|
doc.xpath("//a").length # => 2
|
|
51
58
|
doc.xpath("count(//a)") # => 2.0
|
|
52
59
|
doc.at_xpath('//*[@id="main"]/p').text # => "Hello"
|
|
@@ -72,16 +79,158 @@ ctx.register_variable("cls", "lead")
|
|
|
72
79
|
ctx.evaluate('//p[@class=$cls]').first.text # => "Hello"
|
|
73
80
|
```
|
|
74
81
|
|
|
82
|
+
### XML (with in-place editing)
|
|
83
|
+
|
|
84
|
+
`Makiri::XML(source)` parses **XML 1.0** with a native, strict,
|
|
85
|
+
well-formedness-checking parser (no libxml2) and queries it through the same
|
|
86
|
+
native XPath 1.0 engine. `source` is a String or any object responding to
|
|
87
|
+
`#read` (an `IO` / `File` / `StringIO`); read a non-UTF-8 file in binary mode
|
|
88
|
+
(`File.binread`) so its encoding is autodetected. Element-name case and namespaces are preserved. It is
|
|
89
|
+
**fail-closed**: malformed input, a duplicate attribute, or a
|
|
90
|
+
non-`1.0` version declaration raises `Makiri::XML::SyntaxError`, and operations
|
|
91
|
+
XML does not support raise `NotImplementedError` rather than returning a wrong
|
|
92
|
+
result. The tree supports in-place edits and building new subtrees (see below).
|
|
93
|
+
A `<!DOCTYPE ...>` is recognized but its **DTD is not processed** (no
|
|
94
|
+
entity/element declarations are loaded, no external subset is fetched) - so a
|
|
95
|
+
DTD-defined entity reference stays an undefined-entity error and **XXE /
|
|
96
|
+
billion-laughs are structurally impossible**. The doctype's name and identifiers
|
|
97
|
+
are still readable:
|
|
98
|
+
|
|
99
|
+
```ruby
|
|
100
|
+
doc = Makiri::XML(<<~XML)
|
|
101
|
+
<feed xmlns="http://www.w3.org/2005/Atom">
|
|
102
|
+
<entry><title>Hello</title></entry>
|
|
103
|
+
<entry><title>World</title></entry>
|
|
104
|
+
</feed>
|
|
105
|
+
XML
|
|
106
|
+
|
|
107
|
+
# Namespace matching is strict, so a default namespace needs a registered prefix.
|
|
108
|
+
ns = { "a" => "http://www.w3.org/2005/Atom" }
|
|
109
|
+
doc.xpath("//entry").length # => 0 (default namespace)
|
|
110
|
+
doc.xpath("//a:entry", ns).length # => 2
|
|
111
|
+
doc.at_xpath("//a:entry/a:title", ns).text # => "Hello"
|
|
112
|
+
|
|
113
|
+
# Or reuse a context (caches registrations + compiled expressions):
|
|
114
|
+
ctx = Makiri::XPathContext.new(doc.root)
|
|
115
|
+
ctx.register_namespace("a", "http://www.w3.org/2005/Atom")
|
|
116
|
+
ctx.evaluate("//a:entry").length # => 2
|
|
117
|
+
|
|
118
|
+
el = doc.at_xpath("//a:entry", ns)
|
|
119
|
+
el.local_name # => "entry"
|
|
120
|
+
el.namespace_uri # => "http://www.w3.org/2005/Atom"
|
|
121
|
+
|
|
122
|
+
doc.css("entry") # raises NotImplementedError (use #xpath)
|
|
123
|
+
|
|
124
|
+
# Serialize back to XML
|
|
125
|
+
doc.to_xml # => "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<feed ...>...</feed>\n"
|
|
126
|
+
doc.at_xpath("//a:entry", ns).to_xml # => "<entry><title>Hello</title></entry>" (no declaration)
|
|
127
|
+
doc.to_xml(pretty: true) # indented, element-only content
|
|
128
|
+
|
|
129
|
+
# DOCTYPE is recognized but the DTD is not processed (no entities, no I/O):
|
|
130
|
+
dtd = Makiri::XML(%(<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0//EN" "x.dtd"><html/>))
|
|
131
|
+
.internal_subset
|
|
132
|
+
dtd.name # => "html"
|
|
133
|
+
dtd.external_id # => "-//W3C//DTD XHTML 1.0//EN" (alias: #public_id)
|
|
134
|
+
dtd.system_id # => "x.dtd"
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
Comments and processing instructions in the prolog/epilog are document-node
|
|
138
|
+
children (reachable via `//comment()` / `//processing-instruction()` and
|
|
139
|
+
`#children`), and adjacent CDATA is coalesced - matching libxml2 and the XPath
|
|
140
|
+
data model. `#to_xml` / `#to_s` serialize the tree back to XML (`pretty: true`,
|
|
141
|
+
or `indent: n`, for indented element-only content; `encoding: "Shift_JIS"` to
|
|
142
|
+
transcode, with a hex character reference for anything the encoding can't hold);
|
|
143
|
+
a `Document#to_xml` adds the declaration and the DOCTYPE. `#canonicalize` emits
|
|
144
|
+
Inclusive Canonical XML 1.0 (for XML signatures; `comments: true` to keep
|
|
145
|
+
comments), byte-identical to libxml2. CSS is intentionally unavailable for XML
|
|
146
|
+
(Lexbor's selector engine lower-cases names, which breaks XML case/namespace
|
|
147
|
+
matching) - use XPath.
|
|
148
|
+
|
|
149
|
+
The tree supports in-place mutation - every edit validates its input (names as
|
|
150
|
+
XML 1.0 QNames, values as XML Char) so the tree stays serializable to
|
|
151
|
+
well-formed XML, and a removed node is detached, never freed, so a live wrapper
|
|
152
|
+
that aliases it stays usable:
|
|
153
|
+
|
|
154
|
+
```ruby
|
|
155
|
+
doc = Makiri::XML(%(<feed xmlns:dc="urn:dc"><entry id="1">Hi</entry><draft/></feed>))
|
|
156
|
+
e = doc.at_xpath("//entry")
|
|
157
|
+
|
|
158
|
+
e["id"] = "9" # add or replace an attribute (value escaped on output)
|
|
159
|
+
e["dc:k"] = "v" # a prefixed name resolves against the in-scope xmlns
|
|
160
|
+
e.content = "Bye" # replace an element's children with text
|
|
161
|
+
e.name = "post" # rename in place (identity + namespace re-resolved)
|
|
162
|
+
e.delete("id") # remove an attribute
|
|
163
|
+
doc.at_xpath("//draft").remove
|
|
164
|
+
|
|
165
|
+
doc.root.to_xml # => "<feed xmlns:dc=\"urn:dc\"><post dc:k=\"v\">Bye</post></feed>"
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
New subtrees can be built too - `Document#create_element` (and
|
|
169
|
+
`#create_text_node` / `#create_comment` / `#create_cdata` /
|
|
170
|
+
`#create_processing_instruction`) make detached nodes, and `#add_child` / `<<`,
|
|
171
|
+
`#add_previous_sibling` / `#before`, `#add_next_sibling` / `#after`, `#replace`
|
|
172
|
+
link them. A node's namespace is resolved against its position **at insertion**
|
|
173
|
+
(a prefixed name binds to the in-scope `xmlns`, an unprefixed element to the
|
|
174
|
+
default namespace), so the same tree results whether you set names before or
|
|
175
|
+
after attaching; an unbound prefix in the live tree fails closed. A node from
|
|
176
|
+
another document is **deep-copied** into the target (the source is untouched):
|
|
177
|
+
|
|
178
|
+
```ruby
|
|
179
|
+
doc = Makiri::XML(%(<feed xmlns="urn:a" xmlns:dc="urn:dc"/>))
|
|
180
|
+
entry = doc.create_element("entry")
|
|
181
|
+
entry["dc:id"] = "42" # prefixed attr resolves on insertion
|
|
182
|
+
entry.add_child(doc.create_element("title", "Hello"))
|
|
183
|
+
doc.root.add_child(entry)
|
|
184
|
+
|
|
185
|
+
doc.to_xml # => "...<entry dc:id=\"42\"><title>Hello</title></entry>..."
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
Supported edits: `#[]=`, `#delete` / `#remove_attribute`, `#content=`, `#name=`,
|
|
189
|
+
`#remove` / `#unlink`, the factories above, and `#add_child` / `<<` /
|
|
190
|
+
`#before` / `#after` / `#replace`. Insertion takes a `Makiri::XML` node or a
|
|
191
|
+
`DocumentFragment` (its children are spliced in); a fragment is parsed by
|
|
192
|
+
`Document#fragment(str)` (bound to the document) or `DocumentFragment.parse(str)`
|
|
193
|
+
(standalone). A raw string handed straight to `#add_child` is **not** accepted -
|
|
194
|
+
parse it into a fragment first. A whole document can also be built from scratch
|
|
195
|
+
with `XML::Document.new` + `#root=` and the factories.
|
|
196
|
+
|
|
197
|
+
The character encoding is autodetected (XML 1.0 Appendix F): a byte-order mark or
|
|
198
|
+
the `<?xml encoding="..."?>` declaration selects it, so raw bytes (`File.binread`)
|
|
199
|
+
in UTF-16, Shift_JIS, etc. parse correctly and a leading BOM is stripped. A
|
|
200
|
+
concrete String encoding stays authoritative - a BOM or declaration that
|
|
201
|
+
contradicts it is a fatal error, not a silent mis-decode.
|
|
202
|
+
|
|
203
|
+
Parsing is DoS-bounded by a single arena memory ceiling (default 256 MiB,
|
|
204
|
+
counting node structs and text), which fits every standard document. Raise it
|
|
205
|
+
per parse for an unusually large one:
|
|
206
|
+
|
|
207
|
+
```ruby
|
|
208
|
+
Makiri::XML(huge_xml, max_bytes: 512 * 1024 * 1024) # also Makiri::XML::Document.parse(..., max_bytes:)
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
Conformance is verified by a regression net: the **W3C XML Conformance Test Suite**
|
|
212
|
+
(`rake conformance:xmlconf`, 100% of the in-scope non-validating XML-1.0 tests),
|
|
213
|
+
an XPath 1.0 differential vs Nokogiri/libxml2 (`rake conformance:xpath_xml`), and
|
|
214
|
+
property-based testing that requires Makiri's tree to be byte-identical to
|
|
215
|
+
Nokogiri's over generated documents (`rake conformance:xml_pbt`).
|
|
216
|
+
|
|
75
217
|
## Non-goals (v1.0)
|
|
76
218
|
|
|
77
|
-
*
|
|
219
|
+
* Passing a raw markup string straight to an insertion method
|
|
220
|
+
(`node.add_child("<x/>")`); parse it into a fragment first
|
|
221
|
+
(`Document#fragment` / `DocumentFragment.parse`). (Building XML from scratch
|
|
222
|
+
(`XML::Document.new` + `#root=`), the node factories - `Document#create_element`
|
|
223
|
+
etc. - fragments, node insertion (`#add_child` / `#before` / `#after` /
|
|
224
|
+
`#replace`), and `#to_xml` serialization ARE supported.)
|
|
78
225
|
* XSLT, DTD / Schema / RelaxNG validation, XPointer, XInclude.
|
|
79
226
|
* Streaming / SAX parsing.
|
|
80
227
|
* Drop-in replacement for every Nokogiri method. Makiri covers the common
|
|
81
228
|
HTML-scraping and manipulation surface. Deliberately not provided:
|
|
82
|
-
-
|
|
229
|
+
- XHTML serialization variants (`to_xhtml`, `write_xml_to`); `#to_xml` is supported
|
|
83
230
|
- XML/DTD construction (`create_internal_subset`, `external_subset`)
|
|
84
|
-
- namespace
|
|
231
|
+
- namespace *mutation* (`add_namespace_definition`); read introspection
|
|
232
|
+
(`#namespace`, `#namespace_definitions`, `#namespaces`, `#collect_namespaces`)
|
|
233
|
+
is supported on `Makiri::XML` nodes
|
|
85
234
|
- Nokogiri internals (`decorate`, `slop!`, `validate`).
|
|
86
235
|
|
|
87
236
|
## Differences from Nokogiri
|
|
@@ -103,9 +252,26 @@ Detailed, test-backed notes live in `spec/conformance/README.md`.
|
|
|
103
252
|
* `namespace-uri()` of an HTML element returns the XHTML URI (DOM-correct, as browsers report)
|
|
104
253
|
* `Nokogiri::HTML5` returns `""`.
|
|
105
254
|
|
|
255
|
+
### XML
|
|
256
|
+
|
|
257
|
+
* `Makiri::XML` is **XML 1.0 only and non-validating**.
|
|
258
|
+
* A `version="1.1"` declaration is rejected; Nokogiri parses XML 1.1.
|
|
259
|
+
* The DTD is recognized but not processed: DTD-defined entities are not
|
|
260
|
+
expanded and DTD default attributes are not applied (Nokogiri/libxml2 can do
|
|
261
|
+
both). External entities/subsets are never fetched (no I/O).
|
|
262
|
+
* Mutation supports in-place edits, the node factories, fragments
|
|
263
|
+
(`Document#fragment` / `DocumentFragment.parse`), node insertion, and building
|
|
264
|
+
a document from scratch (`XML::Document.new` + `#root=`); only handing a raw
|
|
265
|
+
markup string straight to `#add_child` is unsupported (parse it into a fragment
|
|
266
|
+
first). (`#to_xml` serialization is supported; HTML serialization - `to_html`
|
|
267
|
+
/ `inner_html` / `outer_html` - is not.)
|
|
268
|
+
* Otherwise the parsed tree is byte-identical to `Nokogiri::XML`'s (verified by
|
|
269
|
+
the property-based differential), including namespaces, prolog/epilog comments
|
|
270
|
+
and PIs, and adjacent-CDATA coalescing.
|
|
271
|
+
|
|
106
272
|
### CSS
|
|
107
273
|
|
|
108
|
-
* jQuery/Nokogiri CSS extensions are not supported (`:contains`, `:gt`, `:lt`, `:eq`, `:first`,
|
|
274
|
+
* jQuery/Nokogiri CSS extensions are not supported (`:contains`, `:gt`, `:lt`, `:eq`, `:first`, ...)
|
|
109
275
|
* Makiri uses Lexbor's standards-only selector engine.
|
|
110
276
|
Use XPath (`xpath("//p[contains(., 'x')]")`) or indexing (`css('li')[1]`).
|
|
111
277
|
Standard Level-4 selectors (`:is` / `:where` / `:has`) are supported, some of which Nokogiri rejects.
|
data/Rakefile
CHANGED
|
@@ -7,6 +7,24 @@ require "shellwords"
|
|
|
7
7
|
|
|
8
8
|
GEMSPEC = Gem::Specification.load("makiri.gemspec")
|
|
9
9
|
|
|
10
|
+
# Replace bundler/gem_tasks' `release` (which builds a source-only gem and
|
|
11
|
+
# `gem push`es it from the dev machine) with a tag push: it hands the build,
|
|
12
|
+
# GitHub Release, and the approval-gated RubyGems publish off to CI
|
|
13
|
+
# (.github/workflows/release.yml). Nothing is pushed to RubyGems locally.
|
|
14
|
+
Rake::Task["release"].clear
|
|
15
|
+
desc "Tag v#{GEMSPEC.version} and push it; CI builds, releases, and publishes"
|
|
16
|
+
task release: %w[release:guard_clean release:source_control_push] do
|
|
17
|
+
puts <<~MSG
|
|
18
|
+
|
|
19
|
+
Pushed tag v#{GEMSPEC.version}. GitHub Actions (release.yml) will now:
|
|
20
|
+
1. build the source gem + precompiled native gems,
|
|
21
|
+
2. create the GitHub Release and attach them, then
|
|
22
|
+
3. publish to RubyGems via OIDC - after the `rubygems` environment approval.
|
|
23
|
+
Approve the pending deployment in the Actions run to publish; nothing is
|
|
24
|
+
pushed to RubyGems from this machine.
|
|
25
|
+
MSG
|
|
26
|
+
end
|
|
27
|
+
|
|
10
28
|
Rake::ExtensionTask.new("makiri", GEMSPEC) do |ext|
|
|
11
29
|
ext.lib_dir = "lib/makiri"
|
|
12
30
|
ext.ext_dir = "ext/makiri"
|
|
@@ -26,7 +44,7 @@ end
|
|
|
26
44
|
|
|
27
45
|
# `rake clean` (from rake-compiler) removes the ext build dir under tmp/,
|
|
28
46
|
# including the generated Makefile. The next `rake compile` re-runs extconf,
|
|
29
|
-
# so newly-added .c files are picked up
|
|
47
|
+
# so newly-added .c files are picked up - without this, a stale Makefile omits
|
|
30
48
|
# new sources and macOS's -undefined dynamic_lookup turns the missing symbols
|
|
31
49
|
# into runtime NULL calls. The vendored Lexbor build is deliberately NOT wiped
|
|
32
50
|
# here (it is slow to rebuild and rarely changes); use `rake clean:lexbor` for
|
|
@@ -63,6 +81,17 @@ def asan_runtime_path
|
|
|
63
81
|
nil
|
|
64
82
|
end
|
|
65
83
|
|
|
84
|
+
# The compiled extension, and whether it carries sanitizer instrumentation, so
|
|
85
|
+
# `fuzz:sanitize SKIP_BUILD=1` can refuse to run a plain (non-ASan) build.
|
|
86
|
+
def ext_bundle_path
|
|
87
|
+
Dir["lib/makiri/makiri.{bundle,so}"].first
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
def ext_sanitized?
|
|
91
|
+
bundle = ext_bundle_path or return false
|
|
92
|
+
!(`nm "#{bundle}" 2>/dev/null` =~ /asan|ubsan/i).nil?
|
|
93
|
+
end
|
|
94
|
+
|
|
66
95
|
desc "Build the extension with sanitizers (MAKIRI_SANITIZE, default " \
|
|
67
96
|
"address,undefined) and run the spec suite under them"
|
|
68
97
|
task :sanitize do
|
|
@@ -92,6 +121,16 @@ task fuzz: :compile do
|
|
|
92
121
|
sh "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}"
|
|
93
122
|
end
|
|
94
123
|
|
|
124
|
+
desc "Fuzz the XML parser (hostile/mutated documents; override via FUZZ_ARGS)"
|
|
125
|
+
task "fuzz:xml": :compile do
|
|
126
|
+
sh "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb --target xml #{ENV['FUZZ_ARGS']}"
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
desc "Fuzz the XML mutation surface (random edit sequences + invariants; override via FUZZ_ARGS)"
|
|
130
|
+
task "fuzz:mutate": :compile do
|
|
131
|
+
sh "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb --target mutate #{ENV['FUZZ_ARGS']}"
|
|
132
|
+
end
|
|
133
|
+
|
|
95
134
|
desc "Run the performance benchmark (Makiri vs Nokogiri reference)"
|
|
96
135
|
task bench: :compile do
|
|
97
136
|
# Run outside the bundle so the bench-only gems (nokogiri, benchmark-ips)
|
|
@@ -101,6 +140,13 @@ task bench: :compile do
|
|
|
101
140
|
end
|
|
102
141
|
end
|
|
103
142
|
|
|
143
|
+
desc "Run the XML reader benchmark (Makiri::XML vs Nokogiri::XML reference)"
|
|
144
|
+
task "bench:xml" => :compile do
|
|
145
|
+
Bundler.with_unbundled_env do
|
|
146
|
+
sh "#{FileUtils::RUBY} -Ilib bench/bench_xml.rb"
|
|
147
|
+
end
|
|
148
|
+
end
|
|
149
|
+
|
|
104
150
|
namespace :conformance do
|
|
105
151
|
desc "WHATWG HTML5 parsing conformance: run html5lib-tests through Makiri"
|
|
106
152
|
task html5: :compile do
|
|
@@ -116,6 +162,28 @@ namespace :conformance do
|
|
|
116
162
|
end
|
|
117
163
|
end
|
|
118
164
|
|
|
165
|
+
desc "XML XPath 1.0 differential conformance: Makiri::XML vs Nokogiri::XML"
|
|
166
|
+
task xpath_xml: :compile do
|
|
167
|
+
Bundler.with_unbundled_env do
|
|
168
|
+
sh "#{FileUtils::RUBY} -Ilib spec/conformance/xml_xpath_diff.rb #{ENV['XPATH_ARGS']}"
|
|
169
|
+
end
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
desc "W3C XML Conformance Test Suite: well-formedness through Makiri::XML"
|
|
173
|
+
task xmlconf: :compile do
|
|
174
|
+
# Nokogiri (bench-only) parses the manifests, so run outside the bundle.
|
|
175
|
+
Bundler.with_unbundled_env do
|
|
176
|
+
sh "#{FileUtils::RUBY} -Ilib spec/conformance/xmlconf_runner.rb #{ENV['XMLCONF_ARGS']}"
|
|
177
|
+
end
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
desc "Property-based XML differential: generated documents, Makiri vs Nokogiri tree"
|
|
181
|
+
task xml_pbt: :compile do
|
|
182
|
+
Bundler.with_unbundled_env do
|
|
183
|
+
sh "#{FileUtils::RUBY} -Ilib spec/conformance/xml_pbt_diff.rb #{ENV['PBT_ARGS']}"
|
|
184
|
+
end
|
|
185
|
+
end
|
|
186
|
+
|
|
119
187
|
desc "CSS Selectors differential conformance vs Nokogiri::HTML5"
|
|
120
188
|
task css: :compile do
|
|
121
189
|
Bundler.with_unbundled_env do
|
|
@@ -124,14 +192,31 @@ namespace :conformance do
|
|
|
124
192
|
end
|
|
125
193
|
end
|
|
126
194
|
|
|
127
|
-
desc "Run all conformance suites
|
|
128
|
-
task conformance: %w[conformance:html5 conformance:xpath conformance:css]
|
|
195
|
+
desc "Run all conformance suites"
|
|
196
|
+
task conformance: %w[conformance:html5 conformance:xpath conformance:css conformance:xmlconf conformance:xpath_xml]
|
|
129
197
|
|
|
130
198
|
namespace :fuzz do
|
|
131
|
-
|
|
199
|
+
# Run the fuzzer under the sanitizer. Toggles (all via env):
|
|
200
|
+
# FAST=1 run the surfaces NON-isolated (one process, no fork-per-query).
|
|
201
|
+
# Far higher throughput; ASan still aborts on a memory error
|
|
202
|
+
# (halt_on_error). The default (isolated) is the complete net:
|
|
203
|
+
# it also survives + attributes a genuine segfault and catches a
|
|
204
|
+
# hang via the per-query timeout, at much lower throughput.
|
|
205
|
+
# SKIP_BUILD=1 reuse the current build instead of rebuilding (refuses to run
|
|
206
|
+
# if it is not a sanitizer build, so you never fuzz a plain ext).
|
|
207
|
+
# FUZZ_TIME=N seconds per surface (default 90).
|
|
208
|
+
# FUZZ_ARGS=... run a single custom invocation instead of the three surfaces.
|
|
209
|
+
desc "Run the fuzzer under AddressSanitizer (FAST=1 non-isolated, SKIP_BUILD=1 reuse build)"
|
|
132
210
|
task :sanitize do
|
|
133
211
|
sanitize = ENV["MAKIRI_SANITIZE"] || "address,undefined"
|
|
134
|
-
|
|
212
|
+
if %w[1 true yes].include?(ENV["SKIP_BUILD"].to_s.downcase)
|
|
213
|
+
ext_sanitized? or
|
|
214
|
+
abort "fuzz:sanitize: SKIP_BUILD set but lib/makiri is not a sanitizer build; " \
|
|
215
|
+
"drop SKIP_BUILD to rebuild with MAKIRI_SANITIZE"
|
|
216
|
+
puts "fuzz:sanitize: reusing the existing sanitizer build (SKIP_BUILD)"
|
|
217
|
+
else
|
|
218
|
+
sh({ "MAKIRI_SANITIZE" => sanitize }, "#{FileUtils::RUBY} -S rake clean compile")
|
|
219
|
+
end
|
|
135
220
|
|
|
136
221
|
env = {
|
|
137
222
|
"ASAN_OPTIONS" => "detect_leaks=0:detect_container_overflow=0:" \
|
|
@@ -144,7 +229,18 @@ namespace :fuzz do
|
|
|
144
229
|
preload = RbConfig::CONFIG["target_os"] =~ /darwin/ ? "DYLD_INSERT_LIBRARIES" : "LD_PRELOAD"
|
|
145
230
|
env[preload] = runtime
|
|
146
231
|
end
|
|
147
|
-
|
|
148
|
-
|
|
232
|
+
|
|
233
|
+
if ENV["FUZZ_ARGS"]
|
|
234
|
+
sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}")
|
|
235
|
+
else
|
|
236
|
+
iso = %w[1 true yes].include?(ENV["FAST"].to_s.downcase) ? "" : "--isolated"
|
|
237
|
+
secs = ENV["FUZZ_TIME"] || "90"
|
|
238
|
+
# Cover every surface under the sanitizer: the query engine (XPath/CSS over
|
|
239
|
+
# parsed fixtures), the XML parser (hostile documents), and the XML mutation
|
|
240
|
+
# surface (random edit sequences + invariants).
|
|
241
|
+
["", "--target xml", "--target mutate"].each do |surface|
|
|
242
|
+
sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{surface} #{iso} --time #{secs}".squeeze(" ").strip)
|
|
243
|
+
end
|
|
244
|
+
end
|
|
149
245
|
end
|
|
150
246
|
end
|
data/ext/makiri/bridge/bridge.h
CHANGED
|
@@ -46,6 +46,34 @@ mkr_ruby_borrowed_bytes_t mkr_ruby_bytes_view(VALUE in);
|
|
|
46
46
|
* for an empty input), suitable for use while the GVL is released. */
|
|
47
47
|
int mkr_ruby_copy_bytes(VALUE in, mkr_owned_bytes_t *out);
|
|
48
48
|
|
|
49
|
+
/* Return a UTF-8 Ruby String for `str`, honouring its declared encoding: UTF-8 /
|
|
50
|
+
* US-ASCII / ASCII-8BIT are returned unchanged (the parser handles their bytes
|
|
51
|
+
* directly); any other encoding is transcoded to UTF-8 (invalid/undef -> U+FFFD)
|
|
52
|
+
* so its content is preserved rather than read as raw UTF-8. The UTF-8 common
|
|
53
|
+
* case is a single encoding comparison. */
|
|
54
|
+
VALUE mkr_ruby_to_utf8(VALUE str);
|
|
55
|
+
|
|
56
|
+
/* STRICT decode for XML (§2.1): like mkr_ruby_to_utf8 it honours the String's
|
|
57
|
+
* declared encoding (UTF-8 / US-ASCII / ASCII-8BIT pass through; any other
|
|
58
|
+
* encoding is transcoded to UTF-8) - but FAIL-CLOSED, never lenient: a non-UTF-8
|
|
59
|
+
* byte that can't be converted, invalid UTF-8, or an embedded NUL all raise
|
|
60
|
+
* Makiri::XML::SyntaxError (no U+FFFD replacement). Returns a validated,
|
|
61
|
+
* UTF-8-tagged Ruby String. (The HTML replace path mkr_ruby_to_utf8 itself is
|
|
62
|
+
* NOT reused for the conversion - only its encoding-judgment rule is shared.)
|
|
63
|
+
*
|
|
64
|
+
* +max_bytes+ bounds the decoded UTF-8 length: an input that already exceeds the
|
|
65
|
+
* parser's arena byte budget is rejected here with Makiri::XML::LimitExceeded,
|
|
66
|
+
* before the validation copy and the caller's GVL-release copy (so a hostile
|
|
67
|
+
* oversized document is not copied twice for a doomed parse). 0 disables the
|
|
68
|
+
* check (decode-only callers that build no arena). */
|
|
69
|
+
VALUE mkr_xml_decode_input(VALUE str, size_t max_bytes);
|
|
70
|
+
|
|
71
|
+
/* True if `str` is *already known* to be valid UTF-8 - pure ASCII, or valid in
|
|
72
|
+
* the UTF-8 encoding - from its cached coderange, WITHOUT forcing a scan. Lets
|
|
73
|
+
* the parse skip mkr_utf8_sanitize's validation pass for input Ruby has already
|
|
74
|
+
* classified (an unknown/broken coderange returns false: sanitize handles it). */
|
|
75
|
+
bool mkr_ruby_str_known_valid_utf8(VALUE str);
|
|
76
|
+
|
|
49
77
|
/* Validate a Ruby String for use as an XPath engine string: valid UTF-8,
|
|
50
78
|
* no interior NUL, and at most +max_bytes+. Returns NULL on success and fills
|
|
51
79
|
* +out+; otherwise returns a static reason string. +sv+ must be a String. */
|