makiri 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/conformance.yml +22 -0
- data/.github/workflows/libfuzzer.yml +83 -0
- data/.github/workflows/security.yml +88 -3
- data/.github/workflows/valgrind.yml +135 -0
- data/CHANGELOG.md +60 -2
- data/README.md +81 -77
- data/Rakefile +194 -3
- data/ext/makiri/bridge/ruby_string.c +119 -66
- data/ext/makiri/core/mkr_alloc.c +40 -3
- data/ext/makiri/core/mkr_alloc.h +27 -4
- data/ext/makiri/core/mkr_buf.c +13 -3
- data/ext/makiri/core/mkr_buf.h +80 -5
- data/ext/makiri/core/mkr_core.c +143 -0
- data/ext/makiri/core/mkr_core.h +10 -1
- data/ext/makiri/core/mkr_span.h +186 -0
- data/ext/makiri/core/mkr_utf8.c +101 -0
- data/ext/makiri/core/mkr_utf8.h +88 -0
- data/ext/makiri/extconf.rb +104 -9
- data/ext/makiri/fuzz/Makefile +95 -0
- data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
- data/ext/makiri/fuzz/xml_fuzz.c +24 -0
- data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
- data/ext/makiri/glue/glue.h +8 -0
- data/ext/makiri/glue/ruby_doc.c +20 -24
- data/ext/makiri/glue/ruby_html_css.c +58 -12
- data/ext/makiri/glue/ruby_html_mutate.c +11 -6
- data/ext/makiri/glue/ruby_html_node.c +3 -32
- data/ext/makiri/glue/ruby_node.c +39 -0
- data/ext/makiri/glue/ruby_xml.c +198 -16
- data/ext/makiri/glue/ruby_xml_node.c +46 -59
- data/ext/makiri/glue/ruby_xpath.c +4 -4
- data/ext/makiri/lexbor_compat/source_loc.c +14 -16
- data/ext/makiri/lexbor_compat/utf8_input.c +5 -78
- data/ext/makiri/makiri.c +45 -0
- data/ext/makiri/xml/mkr_xml.h +2 -3
- data/ext/makiri/xml/mkr_xml_chars.c +67 -97
- data/ext/makiri/xml/mkr_xml_index.c +169 -0
- data/ext/makiri/xml/mkr_xml_index.h +48 -0
- data/ext/makiri/xml/mkr_xml_mutate.c +63 -121
- data/ext/makiri/xml/mkr_xml_node.c +147 -15
- data/ext/makiri/xml/mkr_xml_node.h +71 -6
- data/ext/makiri/xml/mkr_xml_tree.c +185 -149
- data/ext/makiri/xpath/mkr_css.c +1023 -0
- data/ext/makiri/xpath/mkr_css.h +65 -0
- data/ext/makiri/xpath/mkr_xpath.c +37 -0
- data/ext/makiri/xpath/mkr_xpath.h +13 -0
- data/ext/makiri/xpath/mkr_xpath_eval_body.h +373 -90
- data/ext/makiri/xpath/mkr_xpath_funcs_body.h +249 -231
- data/ext/makiri/xpath/mkr_xpath_internal.h +89 -9
- data/ext/makiri/xpath/mkr_xpath_lex.c +94 -124
- data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +6 -3
- data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
- data/ext/makiri/xpath/mkr_xpath_parse.c +79 -90
- data/ext/makiri/xpath/mkr_xpath_shared.c +40 -24
- data/ext/makiri/xpath/mkr_xpath_value_body.h +50 -24
- data/lib/makiri/cdata_section.rb +1 -3
- data/lib/makiri/comment.rb +1 -3
- data/lib/makiri/document.rb +8 -0
- data/lib/makiri/element.rb +1 -3
- data/lib/makiri/processing_instruction.rb +1 -3
- data/lib/makiri/text.rb +1 -3
- data/lib/makiri/version.rb +1 -1
- data/lib/makiri/xml/builder.rb +263 -0
- data/lib/makiri/xml/node_methods.rb +47 -0
- data/lib/makiri.rb +1 -0
- data/script/check_alloc_failures.rb +266 -0
- data/script/check_c_safety.rb +45 -2
- data/script/check_c_safety_allowlist.yml +19 -0
- data/script/check_leaks.rb +64 -0
- data/script/leaks_harness.rb +64 -0
- data/vendor/lexbor/CMakeLists.txt +6 -0
- data/vendor/lexbor/README.md +12 -0
- data/vendor/lexbor/config.cmake +1 -1
- data/vendor/lexbor/source/lexbor/core/base.h +1 -1
- data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
- data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
- data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
- data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
- data/vendor/lexbor/source/lexbor/html/base.h +1 -1
- data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
- data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
- data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
- data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
- data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
- data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
- data/vendor/lexbor/source/lexbor/url/base.h +1 -1
- data/vendor/lexbor/source/lexbor/url/url.c +5 -2
- data/vendor/lexbor/source/lexbor/url/url.h +9 -0
- data/vendor/lexbor/version +1 -1
- metadata +19 -1
data/README.md
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
# Makiri
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
3
|
+
Makiri is a Ruby library for parsing and querying HTML and XML documents.
|
|
4
|
+
|
|
5
|
+
It uses [Lexbor](https://lexbor.com/) for HTML parsing and CSS selector matching, and includes a built-in native XPath 1.0 engine and XML 1.0 parser.
|
|
6
|
+
Makiri does not depend on libxml2.
|
|
6
7
|
|
|
7
8
|
> [!WARNING]
|
|
8
9
|
> Status: early release. APIs and behavior may change before v1.0.
|
|
@@ -14,14 +15,12 @@ XPath 1.0 evaluation in its own native engine, with no libxml2 dependency.
|
|
|
14
15
|
|
|
15
16
|
* HTML5 parsing via [Lexbor](https://lexbor.com)
|
|
16
17
|
* Makiri uses Lexbor as the parsing backend and provides a Ruby-facing DOM/query layer.
|
|
17
|
-
* Lexbor-specific behavior is isolated in a thin compatibility layer
|
|
18
|
-
(`ext/makiri/lexbor_compat/`).
|
|
19
18
|
* CSS selector support via Lexbor
|
|
20
19
|
* Supports Lexbor-backed standard CSS selector querying, including `:is`/`:where`/`:has`
|
|
21
20
|
* Native XPath 1.0 engine
|
|
22
21
|
* XPath is parsed and evaluated by Makiri's own engine, written from scratch.
|
|
23
22
|
* Makiri does not depend on libxml2 for parsing, DOM representation, or XPath evaluation.
|
|
24
|
-
* Native XML 1.0
|
|
23
|
+
* Native XML 1.0 parser
|
|
25
24
|
* A strict, non-validating, fail-closed parser with its own node arena (not
|
|
26
25
|
Lexbor's HTML DOM), queried through the same native XPath engine, with
|
|
27
26
|
in-place tree edits (attributes, content, rename, remove).
|
|
@@ -81,21 +80,6 @@ ctx.evaluate('//p[@class=$cls]').first.text # => "Hello"
|
|
|
81
80
|
|
|
82
81
|
### XML (with in-place editing)
|
|
83
82
|
|
|
84
|
-
`Makiri::XML(source)` parses **XML 1.0** with a native, strict,
|
|
85
|
-
well-formedness-checking parser (no libxml2) and queries it through the same
|
|
86
|
-
native XPath 1.0 engine. `source` is a String or any object responding to
|
|
87
|
-
`#read` (an `IO` / `File` / `StringIO`); read a non-UTF-8 file in binary mode
|
|
88
|
-
(`File.binread`) so its encoding is autodetected. Element-name case and namespaces are preserved. It is
|
|
89
|
-
**fail-closed**: malformed input, a duplicate attribute, or a
|
|
90
|
-
non-`1.0` version declaration raises `Makiri::XML::SyntaxError`, and operations
|
|
91
|
-
XML does not support raise `NotImplementedError` rather than returning a wrong
|
|
92
|
-
result. The tree supports in-place edits and building new subtrees (see below).
|
|
93
|
-
A `<!DOCTYPE ...>` is recognized but its **DTD is not processed** (no
|
|
94
|
-
entity/element declarations are loaded, no external subset is fetched) - so a
|
|
95
|
-
DTD-defined entity reference stays an undefined-entity error and **XXE /
|
|
96
|
-
billion-laughs are structurally impossible**. The doctype's name and identifiers
|
|
97
|
-
are still readable:
|
|
98
|
-
|
|
99
83
|
```ruby
|
|
100
84
|
doc = Makiri::XML(<<~XML)
|
|
101
85
|
<feed xmlns="http://www.w3.org/2005/Atom">
|
|
@@ -119,10 +103,13 @@ el = doc.at_xpath("//a:entry", ns)
|
|
|
119
103
|
el.local_name # => "entry"
|
|
120
104
|
el.namespace_uri # => "http://www.w3.org/2005/Atom"
|
|
121
105
|
|
|
122
|
-
|
|
106
|
+
# CSS selectors work too (lowered to the native XPath engine): a bare type
|
|
107
|
+
# selector binds to the document's default namespace, so this just works.
|
|
108
|
+
doc.css("entry").length # => 2
|
|
109
|
+
doc.css("feed > entry").map { |e| e.at_css("title").text } # => ["Hello", "World"]
|
|
123
110
|
|
|
124
111
|
# Serialize back to XML
|
|
125
|
-
doc.to_xml # => "<?xml version=\"1.0\"
|
|
112
|
+
doc.to_xml # => "<?xml version=\"1.0\"?>\n<feed ...>...</feed>\n"
|
|
126
113
|
doc.at_xpath("//a:entry", ns).to_xml # => "<entry><title>Hello</title></entry>" (no declaration)
|
|
127
114
|
doc.to_xml(pretty: true) # indented, element-only content
|
|
128
115
|
|
|
@@ -134,22 +121,7 @@ dtd.external_id # => "-//W3C//DTD XHTML 1.0//EN" (alias: #public_id)
|
|
|
134
121
|
dtd.system_id # => "x.dtd"
|
|
135
122
|
```
|
|
136
123
|
|
|
137
|
-
|
|
138
|
-
children (reachable via `//comment()` / `//processing-instruction()` and
|
|
139
|
-
`#children`), and adjacent CDATA is coalesced - matching libxml2 and the XPath
|
|
140
|
-
data model. `#to_xml` / `#to_s` serialize the tree back to XML (`pretty: true`,
|
|
141
|
-
or `indent: n`, for indented element-only content; `encoding: "Shift_JIS"` to
|
|
142
|
-
transcode, with a hex character reference for anything the encoding can't hold);
|
|
143
|
-
a `Document#to_xml` adds the declaration and the DOCTYPE. `#canonicalize` emits
|
|
144
|
-
Inclusive Canonical XML 1.0 (for XML signatures; `comments: true` to keep
|
|
145
|
-
comments), byte-identical to libxml2. CSS is intentionally unavailable for XML
|
|
146
|
-
(Lexbor's selector engine lower-cases names, which breaks XML case/namespace
|
|
147
|
-
matching) - use XPath.
|
|
148
|
-
|
|
149
|
-
The tree supports in-place mutation - every edit validates its input (names as
|
|
150
|
-
XML 1.0 QNames, values as XML Char) so the tree stays serializable to
|
|
151
|
-
well-formed XML, and a removed node is detached, never freed, so a live wrapper
|
|
152
|
-
that aliases it stays usable:
|
|
124
|
+
The tree supports in-place mutation.
|
|
153
125
|
|
|
154
126
|
```ruby
|
|
155
127
|
doc = Makiri::XML(%(<feed xmlns:dc="urn:dc"><entry id="1">Hi</entry><draft/></feed>))
|
|
@@ -165,15 +137,9 @@ doc.at_xpath("//draft").remove
|
|
|
165
137
|
doc.root.to_xml # => "<feed xmlns:dc=\"urn:dc\"><post dc:k=\"v\">Bye</post></feed>"
|
|
166
138
|
```
|
|
167
139
|
|
|
168
|
-
|
|
169
|
-
`#
|
|
170
|
-
|
|
171
|
-
`#add_previous_sibling` / `#before`, `#add_next_sibling` / `#after`, `#replace`
|
|
172
|
-
link them. A node's namespace is resolved against its position **at insertion**
|
|
173
|
-
(a prefixed name binds to the in-scope `xmlns`, an unprefixed element to the
|
|
174
|
-
default namespace), so the same tree results whether you set names before or
|
|
175
|
-
after attaching; an unbound prefix in the live tree fails closed. A node from
|
|
176
|
-
another document is **deep-copied** into the target (the source is untouched):
|
|
140
|
+
XML subtrees can be built with `Document#create_element` and related node factory methods,
|
|
141
|
+
then inserted with `#add_child`, `#before`, `#after`, or `#replace`;
|
|
142
|
+
namespaces are resolved at insertion time, and cross-document nodes are deep-copied.
|
|
177
143
|
|
|
178
144
|
```ruby
|
|
179
145
|
doc = Makiri::XML(%(<feed xmlns="urn:a" xmlns:dc="urn:dc"/>))
|
|
@@ -185,34 +151,29 @@ doc.root.add_child(entry)
|
|
|
185
151
|
doc.to_xml # => "...<entry dc:id=\"42\"><title>Hello</title></entry>..."
|
|
186
152
|
```
|
|
187
153
|
|
|
188
|
-
|
|
189
|
-
`#remove` / `#unlink`, the factories above, and `#add_child` / `<<` /
|
|
190
|
-
`#before` / `#after` / `#replace`. Insertion takes a `Makiri::XML` node or a
|
|
191
|
-
`DocumentFragment` (its children are spliced in); a fragment is parsed by
|
|
192
|
-
`Document#fragment(str)` (bound to the document) or `DocumentFragment.parse(str)`
|
|
193
|
-
(standalone). A raw string handed straight to `#add_child` is **not** accepted -
|
|
194
|
-
parse it into a fragment first. A whole document can also be built from scratch
|
|
195
|
-
with `XML::Document.new` + `#root=` and the factories.
|
|
196
|
-
|
|
197
|
-
The character encoding is autodetected (XML 1.0 Appendix F): a byte-order mark or
|
|
198
|
-
the `<?xml encoding="..."?>` declaration selects it, so raw bytes (`File.binread`)
|
|
199
|
-
in UTF-16, Shift_JIS, etc. parse correctly and a leading BOM is stripped. A
|
|
200
|
-
concrete String encoding stays authoritative - a BOM or declaration that
|
|
201
|
-
contradicts it is a fatal error, not a silent mis-decode.
|
|
202
|
-
|
|
203
|
-
Parsing is DoS-bounded by a single arena memory ceiling (default 256 MiB,
|
|
204
|
-
counting node structs and text), which fits every standard document. Raise it
|
|
205
|
-
per parse for an unusually large one:
|
|
154
|
+
`Makiri::XML::Builder` is the Nokogiri-compatible DSL over those factories.
|
|
206
155
|
|
|
207
156
|
```ruby
|
|
208
|
-
|
|
157
|
+
builder = Makiri::XML::Builder.new do |xml|
|
|
158
|
+
xml.feed("xmlns" => "http://www.w3.org/2005/Atom", "xmlns:dc" => "urn:dc") do
|
|
159
|
+
xml.title("Example Feed")
|
|
160
|
+
xml.entry("dc:id" => "1") do
|
|
161
|
+
xml.title("First")
|
|
162
|
+
xml.summary { xml.cdata("raw <b>html</b>") }
|
|
163
|
+
end
|
|
164
|
+
end
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
builder.to_xml # the whole document (with XML declaration)
|
|
168
|
+
builder.doc # the Makiri::XML::Document being built
|
|
209
169
|
```
|
|
210
170
|
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
171
|
+
XML parsing is bounded by an arena memory limit, 256 MiB by default,
|
|
172
|
+
which can be raised for unusually large documents with `max_bytes:`.
|
|
173
|
+
|
|
174
|
+
```ruby
|
|
175
|
+
Makiri::XML(huge_xml, max_bytes: 512 * 1024 * 1024) # also Makiri::XML::Document.parse(..., max_bytes:)
|
|
176
|
+
```
|
|
216
177
|
|
|
217
178
|
## Non-goals (v1.0)
|
|
218
179
|
|
|
@@ -271,14 +232,48 @@ Detailed, test-backed notes live in `spec/conformance/README.md`.
|
|
|
271
232
|
|
|
272
233
|
### CSS
|
|
273
234
|
|
|
274
|
-
* jQuery/Nokogiri CSS extensions are not supported (`:
|
|
275
|
-
* Makiri uses Lexbor's standards-
|
|
276
|
-
Use XPath (`xpath("//p[contains(., 'x')]")`) or
|
|
235
|
+
* Most jQuery/Nokogiri CSS extensions are not supported (`:gt`, `:lt`, `:eq`, `:first`, ...)
|
|
236
|
+
* Makiri uses Lexbor's selector engine, which is standards-based apart from one
|
|
237
|
+
text-containment extension. Use XPath (`xpath("//p[contains(., 'x')]")`) or
|
|
238
|
+
Enumerable (`css('li')[1]`) for the rest.
|
|
277
239
|
Standard Level-4 selectors (`:is` / `:where` / `:has`) are supported, some of which Nokogiri rejects.
|
|
240
|
+
* `:lexbor-contains("text")` **is** supported (on both HTML and XML) - Lexbor's
|
|
241
|
+
spelling of the jQuery `:contains()` substring filter, matching an element
|
|
242
|
+
whose text contains the string; append ` i` (`:lexbor-contains("text" i)`)
|
|
243
|
+
for an ASCII case-insensitive match. (Nokogiri's name `:contains` is not an
|
|
244
|
+
alias.) Like Lexbor's matcher, it tests the element's **immediate child text
|
|
245
|
+
nodes** (not the deep string-value), so HTML and XML agree; on XML it lowers
|
|
246
|
+
to XPath `child::text()[contains(., "text")]`.
|
|
247
|
+
* Untyped `:*-of-type` (`:first-of-type`, `:nth-of-type(an+b)`, ... with no type
|
|
248
|
+
selector) is supported and correct on both HTML and XML - the "type" is the
|
|
249
|
+
element's own expanded name.
|
|
250
|
+
* Nokogiri (XML and HTML5) mistranslates these to first-/only-child
|
|
251
|
+
(`//*[position()=1]` / `//*[last()=1]`), so it under-matches; Makiri matches
|
|
252
|
+
Lexbor's HTML matcher.
|
|
278
253
|
* Type selectors are ASCII case-insensitive (CSS-correct for HTML; `LI` matches `<li>`)
|
|
279
254
|
* `Nokogiri::HTML5` is case-sensitive there.
|
|
280
|
-
|
|
281
|
-
|
|
255
|
+
|
|
256
|
+
## Conformance
|
|
257
|
+
|
|
258
|
+
The XPath engine and XML parser are original code, so their correctness is checked by
|
|
259
|
+
differential and standards harnesses in `spec/conformance/`.
|
|
260
|
+
The HTML XPath and CSS suites are differentials against **`Nokogiri::HTML5`**
|
|
261
|
+
(Gumbo / WHATWG, never libxml2's non-conformant HTML4 parser): both sides parse
|
|
262
|
+
HTML5, so the DOM is isomorphic and results are compared node-for-node. HTML
|
|
263
|
+
parsing itself is checked against the WHATWG html5lib-tests corpus, and
|
|
264
|
+
XPath-over-HTML semantics additionally against browsers via a WPT port.
|
|
265
|
+
See also [`spec/conformance/README.md`](spec/conformance/README.md).
|
|
266
|
+
|
|
267
|
+
| Suite | Input | Oracle | `rake` task |
|
|
268
|
+
|---|---|---|---|
|
|
269
|
+
| HTML parsing | HTML | WHATWG html5lib-tests (expected-tree corpus) | `conformance:html5` |
|
|
270
|
+
| XPath 1.0 | HTML | `Nokogiri::HTML5` (libxml2 XPath) — differential | `conformance:xpath` |
|
|
271
|
+
| XPath over HTML | HTML | browsers (WPT `domxpath`, hand-ported; runs under `rake spec`) | — |
|
|
272
|
+
| CSS selectors | HTML | `Nokogiri::HTML5#css` — differential | `conformance:css` |
|
|
273
|
+
| Well-formedness | XML | W3C XML Conformance Test Suite | `conformance:xmlconf` |
|
|
274
|
+
| XPath 1.0 | XML | `Nokogiri::XML` — differential | `conformance:xpath_xml` |
|
|
275
|
+
| Parsed tree (property-based) | XML | `Nokogiri::XML` — differential | `conformance:xml_pbt` |
|
|
276
|
+
| CSS selectors | XML | `Nokogiri::XML` — differential | `conformance:css_xml` |
|
|
282
277
|
|
|
283
278
|
## Requirements
|
|
284
279
|
|
|
@@ -295,6 +290,15 @@ bundle exec rake compile
|
|
|
295
290
|
bundle exec rake spec
|
|
296
291
|
```
|
|
297
292
|
|
|
293
|
+
### Vendored Lexbor version
|
|
294
|
+
|
|
295
|
+
`vendor/lexbor` is pinned to `3a2d595` (`v3.0.0-25`), an untagged `master`
|
|
296
|
+
commit, for fixes that v3.0.0 lacks: two upstreamed CSS-selector fixes (class/ID
|
|
297
|
+
case-sensitivity in quirks mode, and prefix-less type-selector namespacing), a
|
|
298
|
+
heap-overflow fix in the `:lexbor-contains()` parser, and other post-v3.0.0
|
|
299
|
+
bugfixes. Lexbor stays vanilla; we return to a release tag once one ships after
|
|
300
|
+
v3.0.0. See `CLAUDE.md` for details.
|
|
301
|
+
|
|
298
302
|
## License
|
|
299
303
|
|
|
300
304
|
Apache License 2.0. See [LICENSE](LICENSE) and [NOTICE](NOTICE).
|
data/Rakefile
CHANGED
|
@@ -4,6 +4,7 @@ require "bundler/gem_tasks"
|
|
|
4
4
|
require "rspec/core/rake_task"
|
|
5
5
|
require "rake/extensiontask"
|
|
6
6
|
require "shellwords"
|
|
7
|
+
require "tmpdir"
|
|
7
8
|
|
|
8
9
|
GEMSPEC = Gem::Specification.load("makiri.gemspec")
|
|
9
10
|
|
|
@@ -35,6 +36,59 @@ RSpec::Core::RakeTask.new(:spec)
|
|
|
35
36
|
|
|
36
37
|
task default: %i[compile spec]
|
|
37
38
|
|
|
39
|
+
# `rake spec:valgrind` - run the spec suite under Valgrind memcheck via
|
|
40
|
+
# ruby_memcheck (Linux CI; see .github/workflows/valgrind.yml). The gem ships
|
|
41
|
+
# Ruby's own Valgrind suppression files (matched by Ruby version) and filters
|
|
42
|
+
# the report down to errors whose stack touches our extension, so we no longer
|
|
43
|
+
# have to fetch ruby.supp from ruby/ruby (that path was removed upstream).
|
|
44
|
+
#
|
|
45
|
+
# We keep this job's historical contract: catch *use of uninitialised values*
|
|
46
|
+
# and *invalid reads/writes* (incl. intra-arena overflows) - NOT leaks (leak
|
|
47
|
+
# detection stays with `rake leaks`). So we override ruby_memcheck's defaults,
|
|
48
|
+
# which disable undef-value errors and turn on full leak-check.
|
|
49
|
+
#
|
|
50
|
+
# `filter_all_errors: true` is essential: by default ruby_memcheck only applies
|
|
51
|
+
# its "stack must touch the makiri binary" filter to *leak*-kind errors
|
|
52
|
+
# (`ValgrindError#should_filter? = filter_all_errors? || kind_leak?`), so every
|
|
53
|
+
# uninitialised-value report is surfaced regardless of where it comes from. Ruby's
|
|
54
|
+
# conservative GC (machine-context scan, RVALUE flag aging, free-at-exit teardown)
|
|
55
|
+
# legitimately reads uninitialised words, and the bundled ruby.supp does not cover
|
|
56
|
+
# the free-at-exit / subprocess stacks the `:isolated` specs spin up under
|
|
57
|
+
# `--trace-children=yes` - which buried the run in ~3500 pure-Ruby false positives.
|
|
58
|
+
# Filtering all error kinds by the same binary-touch rule keeps the gate scoped to
|
|
59
|
+
# *our* code: a real uninit/invalid access in mkr_*/Lexbor still has a makiri frame
|
|
60
|
+
# and is still reported.
|
|
61
|
+
#
|
|
62
|
+
# Guarded: ruby_memcheck lives in the optional :valgrind bundler group, so a
|
|
63
|
+
# normal `bundle exec rake` (without that group) must not fail to load.
|
|
64
|
+
begin
|
|
65
|
+
require "ruby_memcheck"
|
|
66
|
+
require "ruby_memcheck/rspec/rake_task"
|
|
67
|
+
|
|
68
|
+
RubyMemcheck.config(
|
|
69
|
+
binary_name: "makiri",
|
|
70
|
+
filter_all_errors: true, # apply the binary-touch filter to ALL error kinds,
|
|
71
|
+
# not just leaks (see note above) - drops Ruby's own
|
|
72
|
+
# GC uninitialised-value noise, keeps mkr_* reports
|
|
73
|
+
valgrind_options: [
|
|
74
|
+
"--num-callers=50",
|
|
75
|
+
"--error-limit=no",
|
|
76
|
+
"--trace-children=yes", # spec processes may fork
|
|
77
|
+
"--undef-value-errors=yes", # the point of this job (ruby_memcheck defaults to =no)
|
|
78
|
+
"--track-origins=yes", # report where an uninitialised value came from
|
|
79
|
+
"--leak-check=no", # leaks are `rake leaks`' job, not this one
|
|
80
|
+
],
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
namespace :spec do
|
|
84
|
+
desc "Run the spec suite under Valgrind memcheck (ruby_memcheck; needs the " \
|
|
85
|
+
":valgrind bundler group and the valgrind binary)"
|
|
86
|
+
RubyMemcheck::RSpec::RakeTask.new(valgrind: :compile)
|
|
87
|
+
end
|
|
88
|
+
rescue LoadError
|
|
89
|
+
# ruby_memcheck not installed (optional :valgrind group absent) - skip the task.
|
|
90
|
+
end
|
|
91
|
+
|
|
38
92
|
namespace :security do
|
|
39
93
|
desc "Run mechanical C safety lint over ext/makiri"
|
|
40
94
|
task :clint do
|
|
@@ -81,6 +135,17 @@ def asan_runtime_path
|
|
|
81
135
|
nil
|
|
82
136
|
end
|
|
83
137
|
|
|
138
|
+
def libfuzzer_available?
|
|
139
|
+
cxx = ENV["CXX"].to_s.empty? ? "clang++" : ENV["CXX"]
|
|
140
|
+
Dir.mktmpdir("makiri-libfuzzer-check") do |dir|
|
|
141
|
+
src = File.join(dir, "check.cc")
|
|
142
|
+
exe = File.join(dir, "check")
|
|
143
|
+
File.write(src, "extern \"C\" int LLVMFuzzerTestOneInput(const unsigned char*, unsigned long){return 0;}\n")
|
|
144
|
+
return system(cxx, "-fsanitize=fuzzer,address,undefined", src, "-o", exe,
|
|
145
|
+
out: File::NULL, err: File::NULL)
|
|
146
|
+
end
|
|
147
|
+
end
|
|
148
|
+
|
|
84
149
|
# The compiled extension, and whether it carries sanitizer instrumentation, so
|
|
85
150
|
# `fuzz:sanitize SKIP_BUILD=1` can refuse to run a plain (non-ASan) build.
|
|
86
151
|
def ext_bundle_path
|
|
@@ -116,6 +181,65 @@ task :sanitize do
|
|
|
116
181
|
sh(env, "#{FileUtils::RUBY} -S rspec")
|
|
117
182
|
end
|
|
118
183
|
|
|
184
|
+
desc "Measure C coverage of OUR sources (clang source-based) over the spec suite. " \
|
|
185
|
+
"Prints an llvm-cov region+branch report (excludes vendored Lexbor) and writes " \
|
|
186
|
+
"a line-level detail file to tmp/coverage/show.txt."
|
|
187
|
+
task :coverage do
|
|
188
|
+
require "fileutils"
|
|
189
|
+
dir = File.expand_path("tmp/coverage")
|
|
190
|
+
FileUtils.rm_rf(dir)
|
|
191
|
+
FileUtils.mkdir_p(dir)
|
|
192
|
+
|
|
193
|
+
# Instrument only our sources (Lexbor is built separately, uninstrumented).
|
|
194
|
+
sh({ "MAKIRI_COVERAGE" => "1" }, "#{FileUtils::RUBY} -S rake clean compile")
|
|
195
|
+
# %p -> PID, so any forked spec process gets its own raw profile.
|
|
196
|
+
sh({ "LLVM_PROFILE_FILE" => File.join(dir, "makiri-%p.profraw") }, "#{FileUtils::RUBY} -S rspec")
|
|
197
|
+
|
|
198
|
+
profdata = File.join(dir, "makiri.profdata")
|
|
199
|
+
bundle = "lib/makiri/makiri.bundle"
|
|
200
|
+
ignore = "(vendor/lexbor|/usr/|/Library/|ruby/|rubygems)"
|
|
201
|
+
sh "xcrun llvm-profdata merge -sparse #{dir}/*.profraw -o #{profdata}"
|
|
202
|
+
sh "xcrun llvm-cov report #{bundle} -instr-profile=#{profdata} " \
|
|
203
|
+
"-ignore-filename-regex='#{ignore}' -show-branch-summary"
|
|
204
|
+
show = File.join(dir, "show.txt")
|
|
205
|
+
sh "xcrun llvm-cov show #{bundle} -instr-profile=#{profdata} " \
|
|
206
|
+
"-ignore-filename-regex='#{ignore}' -show-branches=count -show-line-counts-or-regions > #{show}"
|
|
207
|
+
puts "\ncoverage line/branch detail: #{show}"
|
|
208
|
+
puts "(coverage build left in place; run `rake clean compile` to restore a normal build)"
|
|
209
|
+
end
|
|
210
|
+
|
|
211
|
+
desc "Like :sanitize but also builds the vendored Lexbor under ASan, so overflows " \
|
|
212
|
+
"INSIDE Lexbor's mraw arena are caught (slow: full Lexbor rebuild). Runs the " \
|
|
213
|
+
"spec suite, or FUZZ_ARGS via the fuzzer when set."
|
|
214
|
+
task "sanitize:lexbor" do
|
|
215
|
+
sanitize = ENV["MAKIRI_SANITIZE"] || "address,undefined"
|
|
216
|
+
sanitize.include?("address") or
|
|
217
|
+
abort "sanitize:lexbor needs an address build (MAKIRI_SANITIZE must include 'address')"
|
|
218
|
+
|
|
219
|
+
# MAKIRI_SANITIZE_LEXBOR makes extconf build Lexbor with -DLEXBOR_BUILD_WITH_ASAN
|
|
220
|
+
# (enabling its mraw poisoning); the build-mode stamp auto-rebuilds Lexbor on the
|
|
221
|
+
# plain<->asan switch, so no manual clean:lexbor is needed before or after.
|
|
222
|
+
build_env = { "MAKIRI_SANITIZE" => sanitize, "MAKIRI_SANITIZE_LEXBOR" => "1" }
|
|
223
|
+
sh(build_env, "#{FileUtils::RUBY} -S rake clean compile")
|
|
224
|
+
|
|
225
|
+
env = {
|
|
226
|
+
"ASAN_OPTIONS" => "detect_leaks=0:detect_container_overflow=0:" \
|
|
227
|
+
"detect_odr_violation=0:abort_on_error=1:halt_on_error=1",
|
|
228
|
+
"UBSAN_OPTIONS" => "print_stacktrace=1:halt_on_error=1",
|
|
229
|
+
}
|
|
230
|
+
runtime = asan_runtime_path or
|
|
231
|
+
abort "sanitize:lexbor: could not locate the ASan runtime for #{RbConfig::CONFIG['CC']}"
|
|
232
|
+
preload = RbConfig::CONFIG["target_os"] =~ /darwin/ ? "DYLD_INSERT_LIBRARIES" : "LD_PRELOAD"
|
|
233
|
+
env[preload] = runtime
|
|
234
|
+
puts "sanitize:lexbor: preloading #{runtime} via #{preload}"
|
|
235
|
+
|
|
236
|
+
if ENV["FUZZ_ARGS"]
|
|
237
|
+
sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}")
|
|
238
|
+
else
|
|
239
|
+
sh(env, "#{FileUtils::RUBY} -S rspec")
|
|
240
|
+
end
|
|
241
|
+
end
|
|
242
|
+
|
|
119
243
|
desc "Run the robustness fuzzer (override options via FUZZ_ARGS)"
|
|
120
244
|
task fuzz: :compile do
|
|
121
245
|
sh "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}"
|
|
@@ -131,6 +255,25 @@ task "fuzz:mutate": :compile do
|
|
|
131
255
|
sh "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb --target mutate #{ENV['FUZZ_ARGS']}"
|
|
132
256
|
end
|
|
133
257
|
|
|
258
|
+
desc "Malloc-leak gate (macOS `leaks`): fails on per-call leak stacks through the ext"
|
|
259
|
+
task leaks: :compile do
|
|
260
|
+
# ASan runs with detect_leaks=0 (Ruby/Lexbor are uninstrumented), so plain
|
|
261
|
+
# leaks are otherwise never machine-checked; see script/check_leaks.rb.
|
|
262
|
+
sh "#{FileUtils::RUBY} script/check_leaks.rb"
|
|
263
|
+
end
|
|
264
|
+
|
|
265
|
+
desc "OOM-injection gate: rebuild with MAKIRI_ALLOC_INJECT=1 and sweep every core " \
|
|
266
|
+
"allocation site, verifying each failure fails closed (clean raise or " \
|
|
267
|
+
"baseline-identical result, never truncated output)"
|
|
268
|
+
task :oom do
|
|
269
|
+
# The hook is compiled in only under MAKIRI_ALLOC_INJECT=1 (zero overhead in
|
|
270
|
+
# a normal build), so this needs its own rebuild; see
|
|
271
|
+
# script/check_alloc_failures.rb for the protocol and the property gated.
|
|
272
|
+
sh({ "MAKIRI_ALLOC_INJECT" => "1" }, "#{FileUtils::RUBY} -S rake clean compile")
|
|
273
|
+
sh "#{FileUtils::RUBY} -Ilib script/check_alloc_failures.rb"
|
|
274
|
+
puts "(injection build left in place; run `rake clean compile` to restore a normal build)"
|
|
275
|
+
end
|
|
276
|
+
|
|
134
277
|
desc "Run the performance benchmark (Makiri vs Nokogiri reference)"
|
|
135
278
|
task bench: :compile do
|
|
136
279
|
# Run outside the bundle so the bench-only gems (nokogiri, benchmark-ips)
|
|
@@ -190,10 +333,26 @@ namespace :conformance do
|
|
|
190
333
|
sh "#{FileUtils::RUBY} -Ilib spec/conformance/css_diff.rb #{ENV['CSS_ARGS']}"
|
|
191
334
|
end
|
|
192
335
|
end
|
|
336
|
+
|
|
337
|
+
desc "XML CSS-selector differential conformance: Makiri::XML vs Nokogiri::XML"
|
|
338
|
+
task css_xml: :compile do
|
|
339
|
+
Bundler.with_unbundled_env do
|
|
340
|
+
sh "#{FileUtils::RUBY} -Ilib spec/conformance/xml_css_diff.rb #{ENV['CSS_XML_ARGS']}"
|
|
341
|
+
end
|
|
342
|
+
end
|
|
343
|
+
|
|
344
|
+
desc "XML Builder differential conformance: Makiri::XML::Builder vs Nokogiri::XML::Builder"
|
|
345
|
+
task builder: :compile do
|
|
346
|
+
Bundler.with_unbundled_env do
|
|
347
|
+
sh "#{FileUtils::RUBY} -Ilib spec/conformance/builder_diff.rb #{ENV['BUILDER_ARGS']}"
|
|
348
|
+
end
|
|
349
|
+
end
|
|
193
350
|
end
|
|
194
351
|
|
|
195
352
|
desc "Run all conformance suites"
|
|
196
|
-
task conformance: %w[conformance:html5 conformance:xpath conformance:css
|
|
353
|
+
task conformance: %w[conformance:html5 conformance:xpath conformance:css
|
|
354
|
+
conformance:xmlconf conformance:xpath_xml conformance:css_xml
|
|
355
|
+
conformance:builder]
|
|
197
356
|
|
|
198
357
|
namespace :fuzz do
|
|
199
358
|
# Run the fuzzer under the sanitizer. Toggles (all via env):
|
|
@@ -233,14 +392,46 @@ namespace :fuzz do
|
|
|
233
392
|
if ENV["FUZZ_ARGS"]
|
|
234
393
|
sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}")
|
|
235
394
|
else
|
|
236
|
-
iso = %w[1 true yes].include?(ENV["
|
|
395
|
+
iso = %w[1 true yes].include?(ENV["ISOLATED"].to_s.downcase) ? "--isolated" : ""
|
|
237
396
|
secs = ENV["FUZZ_TIME"] || "90"
|
|
238
397
|
# Cover every surface under the sanitizer: the query engine (XPath/CSS over
|
|
239
398
|
# parsed fixtures), the XML parser (hostile documents), and the XML mutation
|
|
240
399
|
# surface (random edit sequences + invariants).
|
|
241
|
-
["", "--target xml", "--target mutate"].each do |surface|
|
|
400
|
+
["", "--target xml", "--target mutate", "--target xmlcss"].each do |surface|
|
|
242
401
|
sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{surface} #{iso} --time #{secs}".squeeze(" ").strip)
|
|
243
402
|
end
|
|
244
403
|
end
|
|
245
404
|
end
|
|
405
|
+
|
|
406
|
+
# Coverage-guided libFuzzer harnesses for the pure-C surfaces (XML parser and
|
|
407
|
+
# XPath compile+eval). These are Ruby-free standalone binaries, so they run
|
|
408
|
+
# directly under clang's libFuzzer driver without the Ruby interpreter.
|
|
409
|
+
# They complement the Ruby-based robustness fuzzer by providing coverage
|
|
410
|
+
# feedback and 2-3 orders of magnitude faster execution for the C core.
|
|
411
|
+
desc "Build the libFuzzer harnesses (requires clang with libFuzzer support)"
|
|
412
|
+
task :libfuzzer_build => :compile do
|
|
413
|
+
libfuzzer_available? or
|
|
414
|
+
abort "fuzz:libfuzzer_build: #{ENV['CXX'] || 'clang++'} cannot link libFuzzer. " \
|
|
415
|
+
"Install an LLVM clang with libFuzzer support and run with " \
|
|
416
|
+
"CLANG=/path/to/clang CXX=/path/to/clang++."
|
|
417
|
+
Dir.chdir("ext/makiri/fuzz") do
|
|
418
|
+
sh "make clean"
|
|
419
|
+
sh "make all"
|
|
420
|
+
end
|
|
421
|
+
end
|
|
422
|
+
|
|
423
|
+
desc "Run the libFuzzer coverage-guided harnesses (default: 60s per target)"
|
|
424
|
+
task :libfuzzer => :libfuzzer_build do
|
|
425
|
+
time = ENV["FUZZ_TIME"] || "60"
|
|
426
|
+
Dir.chdir("ext/makiri/fuzz") do
|
|
427
|
+
sh "mkdir -p corpus/xml corpus/xpath"
|
|
428
|
+
sh "./xml_fuzz -max_total_time=#{time} -max_len=4096 corpus/xml"
|
|
429
|
+
sh "./xpath_fuzz -max_total_time=#{time} -max_len=4096 corpus/xpath"
|
|
430
|
+
end
|
|
431
|
+
end
|
|
432
|
+
end
|
|
433
|
+
|
|
434
|
+
desc "Show code statistics"
|
|
435
|
+
task :stats do
|
|
436
|
+
sh "tokei lib ext spec script --exclude tmp --exclude vendor --exclude docs"
|
|
246
437
|
end
|