makiri 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/conformance.yml +22 -0
- data/.github/workflows/libfuzzer.yml +83 -0
- data/.github/workflows/release.yml +12 -7
- data/.github/workflows/security.yml +88 -3
- data/.github/workflows/valgrind.yml +135 -0
- data/CHANGELOG.md +152 -15
- data/README.md +183 -13
- data/Rakefile +294 -7
- data/ext/makiri/bridge/bridge.h +28 -0
- data/ext/makiri/bridge/ruby_string.c +282 -12
- data/ext/makiri/core/mkr_alloc.c +40 -3
- data/ext/makiri/core/mkr_alloc.h +28 -5
- data/ext/makiri/core/mkr_buf.c +47 -3
- data/ext/makiri/core/mkr_buf.h +112 -3
- data/ext/makiri/core/mkr_core.c +143 -0
- data/ext/makiri/core/mkr_core.h +11 -2
- data/ext/makiri/core/mkr_hash.h +1 -1
- data/ext/makiri/core/mkr_span.h +186 -0
- data/ext/makiri/core/mkr_text.h +8 -8
- data/ext/makiri/core/mkr_utf8.c +101 -0
- data/ext/makiri/core/mkr_utf8.h +88 -0
- data/ext/makiri/extconf.rb +123 -10
- data/ext/makiri/fuzz/Makefile +95 -0
- data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
- data/ext/makiri/fuzz/xml_fuzz.c +24 -0
- data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
- data/ext/makiri/glue/glue.h +55 -11
- data/ext/makiri/glue/ruby_doc.c +129 -59
- data/ext/makiri/glue/ruby_html_css.c +292 -0
- data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +248 -52
- data/ext/makiri/glue/ruby_html_node.c +859 -0
- data/ext/makiri/glue/ruby_html_serialize.c +154 -0
- data/ext/makiri/glue/ruby_node.c +74 -729
- data/ext/makiri/glue/ruby_node_set.c +167 -32
- data/ext/makiri/glue/ruby_xml.c +602 -0
- data/ext/makiri/glue/ruby_xml_node.c +1373 -0
- data/ext/makiri/glue/ruby_xpath.c +63 -30
- data/ext/makiri/glue/ruby_xpath.h +19 -0
- data/ext/makiri/lexbor_compat/compat.h +42 -9
- data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
- data/ext/makiri/lexbor_compat/dom_index.c +2 -2
- data/ext/makiri/lexbor_compat/post_parse.c +100 -10
- data/ext/makiri/lexbor_compat/source_loc.c +15 -13
- data/ext/makiri/lexbor_compat/text_index.c +14 -8
- data/ext/makiri/lexbor_compat/utf8_input.c +19 -33
- data/ext/makiri/makiri.c +184 -6
- data/ext/makiri/makiri.h +43 -2
- data/ext/makiri/xml/mkr_xml.h +125 -0
- data/ext/makiri/xml/mkr_xml_chars.c +195 -0
- data/ext/makiri/xml/mkr_xml_index.c +169 -0
- data/ext/makiri/xml/mkr_xml_index.h +48 -0
- data/ext/makiri/xml/mkr_xml_mutate.c +817 -0
- data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
- data/ext/makiri/xml/mkr_xml_node.c +399 -0
- data/ext/makiri/xml/mkr_xml_node.h +184 -0
- data/ext/makiri/xml/mkr_xml_tree.c +1515 -0
- data/ext/makiri/xpath/mkr_css.c +1023 -0
- data/ext/makiri/xpath/mkr_css.h +65 -0
- data/ext/makiri/xpath/mkr_xpath.c +96 -32
- data/ext/makiri/xpath/mkr_xpath.h +109 -4
- data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
- data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
- data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +551 -241
- data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +318 -276
- data/ext/makiri/xpath/mkr_xpath_internal.h +177 -206
- data/ext/makiri/xpath/mkr_xpath_lex.c +95 -125
- data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
- data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +145 -0
- data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
- data/ext/makiri/xpath/mkr_xpath_parse.c +83 -94
- data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
- data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
- data/ext/makiri/xpath/mkr_xpath_shared.c +609 -0
- data/ext/makiri/xpath/mkr_xpath_value_body.h +801 -0
- data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
- data/lib/makiri/{attribute.rb → attr.rb} +7 -3
- data/lib/makiri/cdata_section.rb +19 -0
- data/lib/makiri/comment.rb +10 -0
- data/lib/makiri/compat_aliases.rb +30 -0
- data/lib/makiri/document.rb +9 -73
- data/lib/makiri/document_fragment.rb +14 -9
- data/lib/makiri/element.rb +4 -4
- data/lib/makiri/html/document.rb +106 -0
- data/lib/makiri/html/node_methods.rb +19 -0
- data/lib/makiri/html.rb +12 -0
- data/lib/makiri/node.rb +58 -15
- data/lib/makiri/node_set.rb +8 -0
- data/lib/makiri/processing_instruction.rb +10 -0
- data/lib/makiri/text.rb +1 -1
- data/lib/makiri/version.rb +1 -1
- data/lib/makiri/xml/builder.rb +263 -0
- data/lib/makiri/xml/document.rb +24 -0
- data/lib/makiri/xml/node_methods.rb +84 -0
- data/lib/makiri/xml.rb +10 -0
- data/lib/makiri/xpath_context.rb +1 -1
- data/lib/makiri.rb +24 -5
- data/script/build_native_gem.rb +2 -2
- data/script/check_alloc_failures.rb +266 -0
- data/script/check_c_safety.rb +77 -2
- data/script/check_c_safety_allowlist.yml +102 -0
- data/script/check_leaks.rb +64 -0
- data/script/leaks_harness.rb +64 -0
- data/vendor/lexbor/CMakeLists.txt +6 -0
- data/vendor/lexbor/README.md +12 -0
- data/vendor/lexbor/config.cmake +1 -1
- data/vendor/lexbor/source/lexbor/core/base.h +1 -1
- data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
- data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
- data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
- data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
- data/vendor/lexbor/source/lexbor/html/base.h +1 -1
- data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
- data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
- data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
- data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
- data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
- data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
- data/vendor/lexbor/source/lexbor/url/base.h +1 -1
- data/vendor/lexbor/source/lexbor/url/url.c +5 -2
- data/vendor/lexbor/source/lexbor/url/url.h +9 -0
- data/vendor/lexbor/version +1 -1
- metadata +53 -9
- data/ext/makiri/glue/ruby_css.c +0 -185
- data/ext/makiri/glue/ruby_serialize.c +0 -92
- data/ext/makiri/xpath/mkr_xpath_value.c +0 -1286
- data/lib/makiri/cdata.rb +0 -6
data/README.md
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
# Makiri
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
3
|
+
Makiri is a Ruby library for parsing and querying HTML and XML documents.
|
|
4
|
+
|
|
5
|
+
It uses [Lexbor](https://lexbor.com/) for HTML parsing and CSS selector matching, and includes a built-in native XPath 1.0 engine and XML 1.0 parser.
|
|
6
|
+
Makiri does not depend on libxml2.
|
|
5
7
|
|
|
6
8
|
> [!WARNING]
|
|
7
9
|
> Status: early release. APIs and behavior may change before v1.0.
|
|
@@ -13,13 +15,17 @@ XPath 1.0 evaluation in its own native engine, with no libxml2 dependency.
|
|
|
13
15
|
|
|
14
16
|
* HTML5 parsing via [Lexbor](https://lexbor.com)
|
|
15
17
|
* Makiri uses Lexbor as the parsing backend and provides a Ruby-facing DOM/query layer.
|
|
16
|
-
* Lexbor-specific behavior is isolated in a thin compatibility layer
|
|
17
|
-
(`ext/makiri/lexbor_compat/`).
|
|
18
18
|
* CSS selector support via Lexbor
|
|
19
19
|
* Supports Lexbor-backed standard CSS selector querying, including `:is`/`:where`/`:has`
|
|
20
20
|
* Native XPath 1.0 engine
|
|
21
21
|
* XPath is parsed and evaluated by Makiri's own engine, written from scratch.
|
|
22
22
|
* Makiri does not depend on libxml2 for parsing, DOM representation, or XPath evaluation.
|
|
23
|
+
* Native XML 1.0 parser
|
|
24
|
+
* A strict, non-validating, fail-closed parser with its own node arena (not
|
|
25
|
+
Lexbor's HTML DOM), queried through the same native XPath engine, with
|
|
26
|
+
in-place tree edits (attributes, content, rename, remove).
|
|
27
|
+
* Conformance is verified by the W3C XML Conformance Test Suite, an XPath
|
|
28
|
+
differential, and property-based testing vs Nokogiri (see below).
|
|
23
29
|
* Bounded, fail-closed execution
|
|
24
30
|
* XPath evaluation is bounded by per-evaluation limits on work, memory, and recursion.
|
|
25
31
|
* Ownership and borrowing are kept explicit across layers, with owned/borrowed
|
|
@@ -46,7 +52,7 @@ HTML
|
|
|
46
52
|
doc.css("a").map { |a| a["href"] } # => ["/a", "/b"]
|
|
47
53
|
doc.at_css("p.lead").text # => "Hello"
|
|
48
54
|
|
|
49
|
-
# XPath 1.0 (native engine
|
|
55
|
+
# XPath 1.0 (native engine - no libxml2)
|
|
50
56
|
doc.xpath("//a").length # => 2
|
|
51
57
|
doc.xpath("count(//a)") # => 2.0
|
|
52
58
|
doc.at_xpath('//*[@id="main"]/p').text # => "Hello"
|
|
@@ -72,16 +78,120 @@ ctx.register_variable("cls", "lead")
|
|
|
72
78
|
ctx.evaluate('//p[@class=$cls]').first.text # => "Hello"
|
|
73
79
|
```
|
|
74
80
|
|
|
81
|
+
### XML (with in-place editing)
|
|
82
|
+
|
|
83
|
+
```ruby
|
|
84
|
+
doc = Makiri::XML(<<~XML)
|
|
85
|
+
<feed xmlns="http://www.w3.org/2005/Atom">
|
|
86
|
+
<entry><title>Hello</title></entry>
|
|
87
|
+
<entry><title>World</title></entry>
|
|
88
|
+
</feed>
|
|
89
|
+
XML
|
|
90
|
+
|
|
91
|
+
# Namespace matching is strict, so a default namespace needs a registered prefix.
|
|
92
|
+
ns = { "a" => "http://www.w3.org/2005/Atom" }
|
|
93
|
+
doc.xpath("//entry").length # => 0 (default namespace)
|
|
94
|
+
doc.xpath("//a:entry", ns).length # => 2
|
|
95
|
+
doc.at_xpath("//a:entry/a:title", ns).text # => "Hello"
|
|
96
|
+
|
|
97
|
+
# Or reuse a context (caches registrations + compiled expressions):
|
|
98
|
+
ctx = Makiri::XPathContext.new(doc.root)
|
|
99
|
+
ctx.register_namespace("a", "http://www.w3.org/2005/Atom")
|
|
100
|
+
ctx.evaluate("//a:entry").length # => 2
|
|
101
|
+
|
|
102
|
+
el = doc.at_xpath("//a:entry", ns)
|
|
103
|
+
el.local_name # => "entry"
|
|
104
|
+
el.namespace_uri # => "http://www.w3.org/2005/Atom"
|
|
105
|
+
|
|
106
|
+
# CSS selectors work too (lowered to the native XPath engine): a bare type
|
|
107
|
+
# selector binds to the document's default namespace, so this just works.
|
|
108
|
+
doc.css("entry").length # => 2
|
|
109
|
+
doc.css("feed > entry").map { |e| e.at_css("title").text } # => ["Hello", "World"]
|
|
110
|
+
|
|
111
|
+
# Serialize back to XML
|
|
112
|
+
doc.to_xml # => "<?xml version=\"1.0\"?>\n<feed ...>...</feed>\n"
|
|
113
|
+
doc.at_xpath("//a:entry", ns).to_xml # => "<entry><title>Hello</title></entry>" (no declaration)
|
|
114
|
+
doc.to_xml(pretty: true) # indented, element-only content
|
|
115
|
+
|
|
116
|
+
# DOCTYPE is recognized but the DTD is not processed (no entities, no I/O):
|
|
117
|
+
dtd = Makiri::XML(%(<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0//EN" "x.dtd"><html/>))
|
|
118
|
+
.internal_subset
|
|
119
|
+
dtd.name # => "html"
|
|
120
|
+
dtd.external_id # => "-//W3C//DTD XHTML 1.0//EN" (alias: #public_id)
|
|
121
|
+
dtd.system_id # => "x.dtd"
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
The tree supports in-place mutation.
|
|
125
|
+
|
|
126
|
+
```ruby
|
|
127
|
+
doc = Makiri::XML(%(<feed xmlns:dc="urn:dc"><entry id="1">Hi</entry><draft/></feed>))
|
|
128
|
+
e = doc.at_xpath("//entry")
|
|
129
|
+
|
|
130
|
+
e["id"] = "9" # add or replace an attribute (value escaped on output)
|
|
131
|
+
e["dc:k"] = "v" # a prefixed name resolves against the in-scope xmlns
|
|
132
|
+
e.content = "Bye" # replace an element's children with text
|
|
133
|
+
e.name = "post" # rename in place (identity + namespace re-resolved)
|
|
134
|
+
e.delete("id") # remove an attribute
|
|
135
|
+
doc.at_xpath("//draft").remove
|
|
136
|
+
|
|
137
|
+
doc.root.to_xml # => "<feed xmlns:dc=\"urn:dc\"><post dc:k=\"v\">Bye</post></feed>"
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
XML subtrees can be built with `Document#create_element` and related node factory methods,
|
|
141
|
+
then inserted with `#add_child`, `#before`, `#after`, or `#replace`;
|
|
142
|
+
namespaces are resolved at insertion time, and cross-document nodes are deep-copied.
|
|
143
|
+
|
|
144
|
+
```ruby
|
|
145
|
+
doc = Makiri::XML(%(<feed xmlns="urn:a" xmlns:dc="urn:dc"/>))
|
|
146
|
+
entry = doc.create_element("entry")
|
|
147
|
+
entry["dc:id"] = "42" # prefixed attr resolves on insertion
|
|
148
|
+
entry.add_child(doc.create_element("title", "Hello"))
|
|
149
|
+
doc.root.add_child(entry)
|
|
150
|
+
|
|
151
|
+
doc.to_xml # => "...<entry dc:id=\"42\"><title>Hello</title></entry>..."
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
`Makiri::XML::Builder` is the Nokogiri-compatible DSL over those factories.
|
|
155
|
+
|
|
156
|
+
```ruby
|
|
157
|
+
builder = Makiri::XML::Builder.new do |xml|
|
|
158
|
+
xml.feed("xmlns" => "http://www.w3.org/2005/Atom", "xmlns:dc" => "urn:dc") do
|
|
159
|
+
xml.title("Example Feed")
|
|
160
|
+
xml.entry("dc:id" => "1") do
|
|
161
|
+
xml.title("First")
|
|
162
|
+
xml.summary { xml.cdata("raw <b>html</b>") }
|
|
163
|
+
end
|
|
164
|
+
end
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
builder.to_xml # the whole document (with XML declaration)
|
|
168
|
+
builder.doc # the Makiri::XML::Document being built
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
XML parsing is bounded by an arena memory limit, 256 MiB by default,
|
|
172
|
+
and callers can raise it for unusually large documents with `max_bytes:`.
|
|
173
|
+
|
|
174
|
+
```ruby
|
|
175
|
+
Makiri::XML(huge_xml, max_bytes: 512 * 1024 * 1024) # also Makiri::XML::Document.parse(..., max_bytes:)
|
|
176
|
+
```
|
|
177
|
+
|
|
75
178
|
## Non-goals (v1.0)
|
|
76
179
|
|
|
77
|
-
*
|
|
180
|
+
* Passing a raw markup string straight to an insertion method
|
|
181
|
+
(`node.add_child("<x/>")`); parse it into a fragment first
|
|
182
|
+
(`Document#fragment` / `DocumentFragment.parse`). (Building XML from scratch
|
|
183
|
+
(`XML::Document.new` + `#root=`), the node factories - `Document#create_element`
|
|
184
|
+
etc. - fragments, node insertion (`#add_child` / `#before` / `#after` /
|
|
185
|
+
`#replace`), and `#to_xml` serialization ARE supported.)
|
|
78
186
|
* XSLT, DTD / Schema / RelaxNG validation, XPointer, XInclude.
|
|
79
187
|
* Streaming / SAX parsing.
|
|
80
188
|
* Drop-in replacement for every Nokogiri method. Makiri covers the common
|
|
81
189
|
HTML-scraping and manipulation surface. Deliberately not provided:
|
|
82
|
-
-
|
|
190
|
+
- XHTML serialization variants (`to_xhtml`, `write_xml_to`); `#to_xml` is supported
|
|
83
191
|
- XML/DTD construction (`create_internal_subset`, `external_subset`)
|
|
84
|
-
- namespace
|
|
192
|
+
- namespace *mutation* (`add_namespace_definition`); read introspection
|
|
193
|
+
(`#namespace`, `#namespace_definitions`, `#namespaces`, `#collect_namespaces`)
|
|
194
|
+
is supported on `Makiri::XML` nodes
|
|
85
195
|
- Nokogiri internals (`decorate`, `slop!`, `validate`).
|
|
86
196
|
|
|
87
197
|
## Differences from Nokogiri
|
|
@@ -103,16 +213,67 @@ Detailed, test-backed notes live in `spec/conformance/README.md`.
|
|
|
103
213
|
* `namespace-uri()` of an HTML element returns the XHTML URI (DOM-correct, as browsers report)
|
|
104
214
|
* `Nokogiri::HTML5` returns `""`.
|
|
105
215
|
|
|
216
|
+
### XML
|
|
217
|
+
|
|
218
|
+
* `Makiri::XML` is **XML 1.0 only and non-validating**.
|
|
219
|
+
* A `version="1.1"` declaration is rejected; Nokogiri parses XML 1.1.
|
|
220
|
+
* The DTD is recognized but not processed: DTD-defined entities are not
|
|
221
|
+
expanded and DTD default attributes are not applied (Nokogiri/libxml2 can do
|
|
222
|
+
both). External entities/subsets are never fetched (no I/O).
|
|
223
|
+
* Mutation supports in-place edits, the node factories, fragments
|
|
224
|
+
(`Document#fragment` / `DocumentFragment.parse`), node insertion, and building
|
|
225
|
+
a document from scratch (`XML::Document.new` + `#root=`); only handing a raw
|
|
226
|
+
markup string straight to `#add_child` is unsupported (parse it into a fragment
|
|
227
|
+
first). (`#to_xml` serialization is supported; HTML serialization - `to_html`
|
|
228
|
+
/ `inner_html` / `outer_html` - is not.)
|
|
229
|
+
* Otherwise the parsed tree is byte-identical to `Nokogiri::XML`'s (verified by
|
|
230
|
+
the property-based differential), including namespaces, prolog/epilog comments
|
|
231
|
+
and PIs, and adjacent-CDATA coalescing.
|
|
232
|
+
|
|
106
233
|
### CSS
|
|
107
234
|
|
|
108
|
-
* jQuery/Nokogiri CSS extensions are not supported (`:
|
|
109
|
-
* Makiri uses Lexbor's standards-
|
|
110
|
-
Use XPath (`xpath("//p[contains(., 'x')]")`) or
|
|
235
|
+
* Most jQuery/Nokogiri CSS extensions are not supported (`:gt`, `:lt`, `:eq`, `:first`, ...)
|
|
236
|
+
* Makiri uses Lexbor's selector engine, which is standards-based apart from one
|
|
237
|
+
text-containment extension. Use XPath (`xpath("//p[contains(., 'x')]")`) or
|
|
238
|
+
Enumerable (`css('li')[1]`) for the rest.
|
|
111
239
|
Standard Level-4 selectors (`:is` / `:where` / `:has`) are supported, some of which Nokogiri rejects.
|
|
240
|
+
* `:lexbor-contains("text")` **is** supported (on both HTML and XML) - Lexbor's
|
|
241
|
+
spelling of the jQuery `:contains()` substring filter, matching an element
|
|
242
|
+
whose text contains the string; append ` i` (`:lexbor-contains("text" i)`)
|
|
243
|
+
for an ASCII case-insensitive match. (Nokogiri's name `:contains` is not an
|
|
244
|
+
alias.) Like Lexbor's matcher, it tests the element's **immediate child text
|
|
245
|
+
nodes** (not the deep string-value), so HTML and XML agree; on XML it lowers
|
|
246
|
+
to XPath `child::text()[contains(., "text")]`.
|
|
247
|
+
* Untyped `:*-of-type` (`:first-of-type`, `:nth-of-type(an+b)`, ... with no type
|
|
248
|
+
selector) is supported and correct on both HTML and XML - the "type" is the
|
|
249
|
+
element's own expanded name.
|
|
250
|
+
* Nokogiri (XML and HTML5) mistranslates these to first-/only-child
|
|
251
|
+
(`//*[position()=1]` / `//*[last()=1]`), so it under-matches; Makiri matches
|
|
252
|
+
Lexbor's HTML matcher.
|
|
112
253
|
* Type selectors are ASCII case-insensitive (CSS-correct for HTML; `LI` matches `<li>`)
|
|
113
254
|
* `Nokogiri::HTML5` is case-sensitive there.
|
|
114
|
-
|
|
115
|
-
|
|
255
|
+
|
|
256
|
+
## Conformance
|
|
257
|
+
|
|
258
|
+
The XPath engine and XML parser are original code, so their correctness is verified by
|
|
259
|
+
differential and standards harnesses in `spec/conformance/`.
|
|
260
|
+
The HTML XPath and CSS suites are differentials against **`Nokogiri::HTML5`**
|
|
261
|
+
(Gumbo / WHATWG, never libxml2's non-conformant HTML4 parser): both sides parse
|
|
262
|
+
HTML5, so the DOM is isomorphic and results are compared node-for-node. HTML
|
|
263
|
+
parsing itself is checked against the WHATWG html5lib-tests corpus, and
|
|
264
|
+
XPath-over-HTML semantics additionally against browsers via a WPT port.
|
|
265
|
+
See also [`spec/conformance/README.md`](spec/conformance/README.md).
|
|
266
|
+
|
|
267
|
+
| Suite | Input | Oracle | `rake` task |
|
|
268
|
+
|---|---|---|---|
|
|
269
|
+
| HTML parsing | HTML | WHATWG html5lib-tests (expected-tree corpus) | `conformance:html5` |
|
|
270
|
+
| XPath 1.0 | HTML | `Nokogiri::HTML5` (libxml2 XPath) — differential | `conformance:xpath` |
|
|
271
|
+
| XPath over HTML | HTML | browsers (WPT `domxpath`, hand-ported; runs under `rake spec`) | — |
|
|
272
|
+
| CSS selectors | HTML | `Nokogiri::HTML5#css` — differential | `conformance:css` |
|
|
273
|
+
| Well-formedness | XML | W3C XML Conformance Test Suite | `conformance:xmlconf` |
|
|
274
|
+
| XPath 1.0 | XML | `Nokogiri::XML` — differential | `conformance:xpath_xml` |
|
|
275
|
+
| Parsed tree (property-based) | XML | `Nokogiri::XML` — differential | `conformance:xml_pbt` |
|
|
276
|
+
| CSS selectors | XML | `Nokogiri::XML` — differential | `conformance:css_xml` |
|
|
116
277
|
|
|
117
278
|
## Requirements
|
|
118
279
|
|
|
@@ -129,6 +290,15 @@ bundle exec rake compile
|
|
|
129
290
|
bundle exec rake spec
|
|
130
291
|
```
|
|
131
292
|
|
|
293
|
+
### Vendored Lexbor version
|
|
294
|
+
|
|
295
|
+
`vendor/lexbor` is pinned to `3a2d595` (`v3.0.0-25`), an untagged `master`
|
|
296
|
+
commit, for fixes that v3.0.0 lacks: two upstreamed CSS-selector fixes (class/ID
|
|
297
|
+
case-sensitivity in quirks mode, and prefix-less type-selector namespacing), a
|
|
298
|
+
heap-overflow fix in the `:lexbor-contains()` parser, and other post-v3.0.0
|
|
299
|
+
bugfixes. Lexbor stays vanilla; we return to a release tag once one ships after
|
|
300
|
+
v3.0.0. See `CLAUDE.md` for details.
|
|
301
|
+
|
|
132
302
|
## License
|
|
133
303
|
|
|
134
304
|
Apache License 2.0. See [LICENSE](LICENSE) and [NOTICE](NOTICE).
|
data/Rakefile
CHANGED
|
@@ -4,9 +4,28 @@ require "bundler/gem_tasks"
|
|
|
4
4
|
require "rspec/core/rake_task"
|
|
5
5
|
require "rake/extensiontask"
|
|
6
6
|
require "shellwords"
|
|
7
|
+
require "tmpdir"
|
|
7
8
|
|
|
8
9
|
GEMSPEC = Gem::Specification.load("makiri.gemspec")
|
|
9
10
|
|
|
11
|
+
# Replace bundler/gem_tasks' `release` (which builds a source-only gem and
|
|
12
|
+
# `gem push`es it from the dev machine) with a tag push: it hands the build,
|
|
13
|
+
# GitHub Release, and the approval-gated RubyGems publish off to CI
|
|
14
|
+
# (.github/workflows/release.yml). Nothing is pushed to RubyGems locally.
|
|
15
|
+
Rake::Task["release"].clear
|
|
16
|
+
desc "Tag v#{GEMSPEC.version} and push it; CI builds, releases, and publishes"
|
|
17
|
+
task release: %w[release:guard_clean release:source_control_push] do
|
|
18
|
+
puts <<~MSG
|
|
19
|
+
|
|
20
|
+
Pushed tag v#{GEMSPEC.version}. GitHub Actions (release.yml) will now:
|
|
21
|
+
1. build the source gem + precompiled native gems,
|
|
22
|
+
2. create the GitHub Release and attach them, then
|
|
23
|
+
3. publish to RubyGems via OIDC - after the `rubygems` environment approval.
|
|
24
|
+
Approve the pending deployment in the Actions run to publish; nothing is
|
|
25
|
+
pushed to RubyGems from this machine.
|
|
26
|
+
MSG
|
|
27
|
+
end
|
|
28
|
+
|
|
10
29
|
Rake::ExtensionTask.new("makiri", GEMSPEC) do |ext|
|
|
11
30
|
ext.lib_dir = "lib/makiri"
|
|
12
31
|
ext.ext_dir = "ext/makiri"
|
|
@@ -17,6 +36,59 @@ RSpec::Core::RakeTask.new(:spec)
|
|
|
17
36
|
|
|
18
37
|
task default: %i[compile spec]
|
|
19
38
|
|
|
39
|
+
# `rake spec:valgrind` - run the spec suite under Valgrind memcheck via
|
|
40
|
+
# ruby_memcheck (Linux CI; see .github/workflows/valgrind.yml). The gem ships
|
|
41
|
+
# Ruby's own Valgrind suppression files (matched by Ruby version) and filters
|
|
42
|
+
# the report down to errors whose stack touches our extension, so we no longer
|
|
43
|
+
# have to fetch ruby.supp from ruby/ruby (that path was removed upstream).
|
|
44
|
+
#
|
|
45
|
+
# We keep this job's historical contract: catch *use of uninitialised values*
|
|
46
|
+
# and *invalid reads/writes* (incl. intra-arena overflows) - NOT leaks (leak
|
|
47
|
+
# detection stays with `rake leaks`). So we override ruby_memcheck's defaults,
|
|
48
|
+
# which disable undef-value errors and turn on full leak-check.
|
|
49
|
+
#
|
|
50
|
+
# `filter_all_errors: true` is essential: by default ruby_memcheck only applies
|
|
51
|
+
# its "stack must touch the makiri binary" filter to *leak*-kind errors
|
|
52
|
+
# (`ValgrindError#should_filter? = filter_all_errors? || kind_leak?`), so every
|
|
53
|
+
# uninitialised-value report is surfaced regardless of where it comes from. Ruby's
|
|
54
|
+
# conservative GC (machine-context scan, RVALUE flag aging, free-at-exit teardown)
|
|
55
|
+
# legitimately reads uninitialised words, and the bundled ruby.supp does not cover
|
|
56
|
+
# the free-at-exit / subprocess stacks the `:isolated` specs spin up under
|
|
57
|
+
# `--trace-children=yes` - which buried the run in ~3500 pure-Ruby false positives.
|
|
58
|
+
# Filtering all error kinds by the same binary-touch rule keeps the gate scoped to
|
|
59
|
+
# *our* code: a real uninit/invalid access in mkr_*/Lexbor still has a makiri frame
|
|
60
|
+
# and is still reported.
|
|
61
|
+
#
|
|
62
|
+
# Guarded: ruby_memcheck lives in the optional :valgrind bundler group, so a
|
|
63
|
+
# normal `bundle exec rake` (without that group) must not fail to load.
|
|
64
|
+
begin
|
|
65
|
+
require "ruby_memcheck"
|
|
66
|
+
require "ruby_memcheck/rspec/rake_task"
|
|
67
|
+
|
|
68
|
+
RubyMemcheck.config(
|
|
69
|
+
binary_name: "makiri",
|
|
70
|
+
filter_all_errors: true, # apply the binary-touch filter to ALL error kinds,
|
|
71
|
+
# not just leaks (see note above) - drops Ruby's own
|
|
72
|
+
# GC uninitialised-value noise, keeps mkr_* reports
|
|
73
|
+
valgrind_options: [
|
|
74
|
+
"--num-callers=50",
|
|
75
|
+
"--error-limit=no",
|
|
76
|
+
"--trace-children=yes", # spec processes may fork
|
|
77
|
+
"--undef-value-errors=yes", # the point of this job (ruby_memcheck defaults to =no)
|
|
78
|
+
"--track-origins=yes", # report where an uninitialised value came from
|
|
79
|
+
"--leak-check=no", # leaks are `rake leaks`' job, not this one
|
|
80
|
+
],
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
namespace :spec do
|
|
84
|
+
desc "Run the spec suite under Valgrind memcheck (ruby_memcheck; needs the " \
|
|
85
|
+
":valgrind bundler group and the valgrind binary)"
|
|
86
|
+
RubyMemcheck::RSpec::RakeTask.new(valgrind: :compile)
|
|
87
|
+
end
|
|
88
|
+
rescue LoadError
|
|
89
|
+
# ruby_memcheck not installed (optional :valgrind group absent) - skip the task.
|
|
90
|
+
end
|
|
91
|
+
|
|
20
92
|
namespace :security do
|
|
21
93
|
desc "Run mechanical C safety lint over ext/makiri"
|
|
22
94
|
task :clint do
|
|
@@ -26,7 +98,7 @@ end
|
|
|
26
98
|
|
|
27
99
|
# `rake clean` (from rake-compiler) removes the ext build dir under tmp/,
|
|
28
100
|
# including the generated Makefile. The next `rake compile` re-runs extconf,
|
|
29
|
-
# so newly-added .c files are picked up
|
|
101
|
+
# so newly-added .c files are picked up - without this, a stale Makefile omits
|
|
30
102
|
# new sources and macOS's -undefined dynamic_lookup turns the missing symbols
|
|
31
103
|
# into runtime NULL calls. The vendored Lexbor build is deliberately NOT wiped
|
|
32
104
|
# here (it is slow to rebuild and rarely changes); use `rake clean:lexbor` for
|
|
@@ -63,6 +135,28 @@ def asan_runtime_path
|
|
|
63
135
|
nil
|
|
64
136
|
end
|
|
65
137
|
|
|
138
|
+
def libfuzzer_available?
|
|
139
|
+
cxx = ENV["CXX"].to_s.empty? ? "clang++" : ENV["CXX"]
|
|
140
|
+
Dir.mktmpdir("makiri-libfuzzer-check") do |dir|
|
|
141
|
+
src = File.join(dir, "check.cc")
|
|
142
|
+
exe = File.join(dir, "check")
|
|
143
|
+
File.write(src, "extern \"C\" int LLVMFuzzerTestOneInput(const unsigned char*, unsigned long){return 0;}\n")
|
|
144
|
+
return system(cxx, "-fsanitize=fuzzer,address,undefined", src, "-o", exe,
|
|
145
|
+
out: File::NULL, err: File::NULL)
|
|
146
|
+
end
|
|
147
|
+
end
|
|
148
|
+
|
|
149
|
+
# The compiled extension, and whether it carries sanitizer instrumentation, so
|
|
150
|
+
# `fuzz:sanitize SKIP_BUILD=1` can refuse to run a plain (non-ASan) build.
|
|
151
|
+
def ext_bundle_path
|
|
152
|
+
Dir["lib/makiri/makiri.{bundle,so}"].first
|
|
153
|
+
end
|
|
154
|
+
|
|
155
|
+
def ext_sanitized?
|
|
156
|
+
bundle = ext_bundle_path or return false
|
|
157
|
+
!(`nm "#{bundle}" 2>/dev/null` =~ /asan|ubsan/i).nil?
|
|
158
|
+
end
|
|
159
|
+
|
|
66
160
|
desc "Build the extension with sanitizers (MAKIRI_SANITIZE, default " \
|
|
67
161
|
"address,undefined) and run the spec suite under them"
|
|
68
162
|
task :sanitize do
|
|
@@ -87,11 +181,99 @@ task :sanitize do
|
|
|
87
181
|
sh(env, "#{FileUtils::RUBY} -S rspec")
|
|
88
182
|
end
|
|
89
183
|
|
|
184
|
+
desc "Measure C coverage of OUR sources (clang source-based) over the spec suite. " \
|
|
185
|
+
"Prints an llvm-cov region+branch report (excludes vendored Lexbor) and writes " \
|
|
186
|
+
"a line-level detail file to tmp/coverage/show.txt."
|
|
187
|
+
task :coverage do
|
|
188
|
+
require "fileutils"
|
|
189
|
+
dir = File.expand_path("tmp/coverage")
|
|
190
|
+
FileUtils.rm_rf(dir)
|
|
191
|
+
FileUtils.mkdir_p(dir)
|
|
192
|
+
|
|
193
|
+
# Instrument only our sources (Lexbor is built separately, uninstrumented).
|
|
194
|
+
sh({ "MAKIRI_COVERAGE" => "1" }, "#{FileUtils::RUBY} -S rake clean compile")
|
|
195
|
+
# %p -> PID, so any forked spec process gets its own raw profile.
|
|
196
|
+
sh({ "LLVM_PROFILE_FILE" => File.join(dir, "makiri-%p.profraw") }, "#{FileUtils::RUBY} -S rspec")
|
|
197
|
+
|
|
198
|
+
profdata = File.join(dir, "makiri.profdata")
|
|
199
|
+
bundle = "lib/makiri/makiri.bundle"
|
|
200
|
+
ignore = "(vendor/lexbor|/usr/|/Library/|ruby/|rubygems)"
|
|
201
|
+
sh "xcrun llvm-profdata merge -sparse #{dir}/*.profraw -o #{profdata}"
|
|
202
|
+
sh "xcrun llvm-cov report #{bundle} -instr-profile=#{profdata} " \
|
|
203
|
+
"-ignore-filename-regex='#{ignore}' -show-branch-summary"
|
|
204
|
+
show = File.join(dir, "show.txt")
|
|
205
|
+
sh "xcrun llvm-cov show #{bundle} -instr-profile=#{profdata} " \
|
|
206
|
+
"-ignore-filename-regex='#{ignore}' -show-branches=count -show-line-counts-or-regions > #{show}"
|
|
207
|
+
puts "\ncoverage line/branch detail: #{show}"
|
|
208
|
+
puts "(coverage build left in place; run `rake clean compile` to restore a normal build)"
|
|
209
|
+
end
|
|
210
|
+
|
|
211
|
+
desc "Like :sanitize but also builds the vendored Lexbor under ASan, so overflows " \
|
|
212
|
+
"INSIDE Lexbor's mraw arena are caught (slow: full Lexbor rebuild). Runs the " \
|
|
213
|
+
"spec suite, or FUZZ_ARGS via the fuzzer when set."
|
|
214
|
+
task "sanitize:lexbor" do
|
|
215
|
+
sanitize = ENV["MAKIRI_SANITIZE"] || "address,undefined"
|
|
216
|
+
sanitize.include?("address") or
|
|
217
|
+
abort "sanitize:lexbor needs an address build (MAKIRI_SANITIZE must include 'address')"
|
|
218
|
+
|
|
219
|
+
# MAKIRI_SANITIZE_LEXBOR makes extconf build Lexbor with -DLEXBOR_BUILD_WITH_ASAN
|
|
220
|
+
# (enabling its mraw poisoning); the build-mode stamp auto-rebuilds Lexbor on the
|
|
221
|
+
# plain<->asan switch, so no manual clean:lexbor is needed before or after.
|
|
222
|
+
build_env = { "MAKIRI_SANITIZE" => sanitize, "MAKIRI_SANITIZE_LEXBOR" => "1" }
|
|
223
|
+
sh(build_env, "#{FileUtils::RUBY} -S rake clean compile")
|
|
224
|
+
|
|
225
|
+
env = {
|
|
226
|
+
"ASAN_OPTIONS" => "detect_leaks=0:detect_container_overflow=0:" \
|
|
227
|
+
"detect_odr_violation=0:abort_on_error=1:halt_on_error=1",
|
|
228
|
+
"UBSAN_OPTIONS" => "print_stacktrace=1:halt_on_error=1",
|
|
229
|
+
}
|
|
230
|
+
runtime = asan_runtime_path or
|
|
231
|
+
abort "sanitize:lexbor: could not locate the ASan runtime for #{RbConfig::CONFIG['CC']}"
|
|
232
|
+
preload = RbConfig::CONFIG["target_os"] =~ /darwin/ ? "DYLD_INSERT_LIBRARIES" : "LD_PRELOAD"
|
|
233
|
+
env[preload] = runtime
|
|
234
|
+
puts "sanitize:lexbor: preloading #{runtime} via #{preload}"
|
|
235
|
+
|
|
236
|
+
if ENV["FUZZ_ARGS"]
|
|
237
|
+
sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}")
|
|
238
|
+
else
|
|
239
|
+
sh(env, "#{FileUtils::RUBY} -S rspec")
|
|
240
|
+
end
|
|
241
|
+
end
|
|
242
|
+
|
|
90
243
|
desc "Run the robustness fuzzer (override options via FUZZ_ARGS)"
|
|
91
244
|
task fuzz: :compile do
|
|
92
245
|
sh "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}"
|
|
93
246
|
end
|
|
94
247
|
|
|
248
|
+
desc "Fuzz the XML parser (hostile/mutated documents; override via FUZZ_ARGS)"
|
|
249
|
+
task "fuzz:xml": :compile do
|
|
250
|
+
sh "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb --target xml #{ENV['FUZZ_ARGS']}"
|
|
251
|
+
end
|
|
252
|
+
|
|
253
|
+
desc "Fuzz the XML mutation surface (random edit sequences + invariants; override via FUZZ_ARGS)"
|
|
254
|
+
task "fuzz:mutate": :compile do
|
|
255
|
+
sh "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb --target mutate #{ENV['FUZZ_ARGS']}"
|
|
256
|
+
end
|
|
257
|
+
|
|
258
|
+
desc "Malloc-leak gate (macOS `leaks`): fails on per-call leak stacks through the ext"
|
|
259
|
+
task leaks: :compile do
|
|
260
|
+
# ASan runs with detect_leaks=0 (Ruby/Lexbor are uninstrumented), so plain
|
|
261
|
+
# leaks are otherwise never machine-checked; see script/check_leaks.rb.
|
|
262
|
+
sh "#{FileUtils::RUBY} script/check_leaks.rb"
|
|
263
|
+
end
|
|
264
|
+
|
|
265
|
+
desc "OOM-injection gate: rebuild with MAKIRI_ALLOC_INJECT=1 and sweep every core " \
|
|
266
|
+
"allocation site, verifying each failure fails closed (clean raise or " \
|
|
267
|
+
"baseline-identical result, never truncated output)"
|
|
268
|
+
task :oom do
|
|
269
|
+
# The hook is compiled in only under MAKIRI_ALLOC_INJECT=1 (zero overhead in
|
|
270
|
+
# a normal build), so this needs its own rebuild; see
|
|
271
|
+
# script/check_alloc_failures.rb for the protocol and the property gated.
|
|
272
|
+
sh({ "MAKIRI_ALLOC_INJECT" => "1" }, "#{FileUtils::RUBY} -S rake clean compile")
|
|
273
|
+
sh "#{FileUtils::RUBY} -Ilib script/check_alloc_failures.rb"
|
|
274
|
+
puts "(injection build left in place; run `rake clean compile` to restore a normal build)"
|
|
275
|
+
end
|
|
276
|
+
|
|
95
277
|
desc "Run the performance benchmark (Makiri vs Nokogiri reference)"
|
|
96
278
|
task bench: :compile do
|
|
97
279
|
# Run outside the bundle so the bench-only gems (nokogiri, benchmark-ips)
|
|
@@ -101,6 +283,13 @@ task bench: :compile do
|
|
|
101
283
|
end
|
|
102
284
|
end
|
|
103
285
|
|
|
286
|
+
desc "Run the XML reader benchmark (Makiri::XML vs Nokogiri::XML reference)"
|
|
287
|
+
task "bench:xml" => :compile do
|
|
288
|
+
Bundler.with_unbundled_env do
|
|
289
|
+
sh "#{FileUtils::RUBY} -Ilib bench/bench_xml.rb"
|
|
290
|
+
end
|
|
291
|
+
end
|
|
292
|
+
|
|
104
293
|
namespace :conformance do
|
|
105
294
|
desc "WHATWG HTML5 parsing conformance: run html5lib-tests through Makiri"
|
|
106
295
|
task html5: :compile do
|
|
@@ -116,22 +305,77 @@ namespace :conformance do
|
|
|
116
305
|
end
|
|
117
306
|
end
|
|
118
307
|
|
|
308
|
+
desc "XML XPath 1.0 differential conformance: Makiri::XML vs Nokogiri::XML"
|
|
309
|
+
task xpath_xml: :compile do
|
|
310
|
+
Bundler.with_unbundled_env do
|
|
311
|
+
sh "#{FileUtils::RUBY} -Ilib spec/conformance/xml_xpath_diff.rb #{ENV['XPATH_ARGS']}"
|
|
312
|
+
end
|
|
313
|
+
end
|
|
314
|
+
|
|
315
|
+
desc "W3C XML Conformance Test Suite: well-formedness through Makiri::XML"
|
|
316
|
+
task xmlconf: :compile do
|
|
317
|
+
# Nokogiri (bench-only) parses the manifests, so run outside the bundle.
|
|
318
|
+
Bundler.with_unbundled_env do
|
|
319
|
+
sh "#{FileUtils::RUBY} -Ilib spec/conformance/xmlconf_runner.rb #{ENV['XMLCONF_ARGS']}"
|
|
320
|
+
end
|
|
321
|
+
end
|
|
322
|
+
|
|
323
|
+
desc "Property-based XML differential: generated documents, Makiri vs Nokogiri tree"
|
|
324
|
+
task xml_pbt: :compile do
|
|
325
|
+
Bundler.with_unbundled_env do
|
|
326
|
+
sh "#{FileUtils::RUBY} -Ilib spec/conformance/xml_pbt_diff.rb #{ENV['PBT_ARGS']}"
|
|
327
|
+
end
|
|
328
|
+
end
|
|
329
|
+
|
|
119
330
|
desc "CSS Selectors differential conformance vs Nokogiri::HTML5"
|
|
120
331
|
task css: :compile do
|
|
121
332
|
Bundler.with_unbundled_env do
|
|
122
333
|
sh "#{FileUtils::RUBY} -Ilib spec/conformance/css_diff.rb #{ENV['CSS_ARGS']}"
|
|
123
334
|
end
|
|
124
335
|
end
|
|
336
|
+
|
|
337
|
+
desc "XML CSS-selector differential conformance: Makiri::XML vs Nokogiri::XML"
|
|
338
|
+
task css_xml: :compile do
|
|
339
|
+
Bundler.with_unbundled_env do
|
|
340
|
+
sh "#{FileUtils::RUBY} -Ilib spec/conformance/xml_css_diff.rb #{ENV['CSS_XML_ARGS']}"
|
|
341
|
+
end
|
|
342
|
+
end
|
|
343
|
+
|
|
344
|
+
desc "XML Builder differential conformance: Makiri::XML::Builder vs Nokogiri::XML::Builder"
|
|
345
|
+
task builder: :compile do
|
|
346
|
+
Bundler.with_unbundled_env do
|
|
347
|
+
sh "#{FileUtils::RUBY} -Ilib spec/conformance/builder_diff.rb #{ENV['BUILDER_ARGS']}"
|
|
348
|
+
end
|
|
349
|
+
end
|
|
125
350
|
end
|
|
126
351
|
|
|
127
|
-
desc "Run all conformance suites
|
|
128
|
-
task conformance: %w[conformance:html5 conformance:xpath conformance:css]
|
|
352
|
+
desc "Run all conformance suites"
|
|
353
|
+
task conformance: %w[conformance:html5 conformance:xpath conformance:css
|
|
354
|
+
conformance:xmlconf conformance:xpath_xml conformance:css_xml
|
|
355
|
+
conformance:builder]
|
|
129
356
|
|
|
130
357
|
namespace :fuzz do
|
|
131
|
-
|
|
358
|
+
# Run the fuzzer under the sanitizer. Toggles (all via env):
|
|
359
|
+
# FAST=1 run the surfaces NON-isolated (one process, no fork-per-query).
|
|
360
|
+
# Far higher throughput; ASan still aborts on a memory error
|
|
361
|
+
# (halt_on_error). The default (isolated) is the complete net:
|
|
362
|
+
# it also survives + attributes a genuine segfault and catches a
|
|
363
|
+
# hang via the per-query timeout, at much lower throughput.
|
|
364
|
+
# SKIP_BUILD=1 reuse the current build instead of rebuilding (refuses to run
|
|
365
|
+
# if it is not a sanitizer build, so you never fuzz a plain ext).
|
|
366
|
+
# FUZZ_TIME=N seconds per surface (default 90).
|
|
367
|
+
# FUZZ_ARGS=... run a single custom invocation instead of the four surfaces.
|
|
368
|
+
desc "Run the fuzzer under AddressSanitizer (FAST=1 non-isolated, SKIP_BUILD=1 reuse build)"
|
|
132
369
|
task :sanitize do
|
|
133
370
|
sanitize = ENV["MAKIRI_SANITIZE"] || "address,undefined"
|
|
134
|
-
|
|
371
|
+
if %w[1 true yes].include?(ENV["SKIP_BUILD"].to_s.downcase)
|
|
372
|
+
ext_sanitized? or
|
|
373
|
+
abort "fuzz:sanitize: SKIP_BUILD set but lib/makiri is not a sanitizer build; " \
|
|
374
|
+
"drop SKIP_BUILD to rebuild with MAKIRI_SANITIZE"
|
|
375
|
+
puts "fuzz:sanitize: reusing the existing sanitizer build (SKIP_BUILD)"
|
|
376
|
+
else
|
|
377
|
+
sh({ "MAKIRI_SANITIZE" => sanitize }, "#{FileUtils::RUBY} -S rake clean compile")
|
|
378
|
+
end
|
|
135
379
|
|
|
136
380
|
env = {
|
|
137
381
|
"ASAN_OPTIONS" => "detect_leaks=0:detect_container_overflow=0:" \
|
|
@@ -144,7 +388,50 @@ namespace :fuzz do
|
|
|
144
388
|
preload = RbConfig::CONFIG["target_os"] =~ /darwin/ ? "DYLD_INSERT_LIBRARIES" : "LD_PRELOAD"
|
|
145
389
|
env[preload] = runtime
|
|
146
390
|
end
|
|
147
|
-
|
|
148
|
-
|
|
391
|
+
|
|
392
|
+
if ENV["FUZZ_ARGS"]
|
|
393
|
+
sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}")
|
|
394
|
+
else
|
|
395
|
+
iso = %w[1 true yes].include?(ENV["ISOLATED"].to_s.downcase) ? "--isolated" : ""
|
|
396
|
+
secs = ENV["FUZZ_TIME"] || "90"
|
|
397
|
+
# Cover every surface under the sanitizer: the query engine (XPath/CSS over
|
|
398
|
+
# parsed fixtures), the XML parser (hostile documents), and the XML mutation
|
|
399
|
+
# surface (random edit sequences + invariants), plus the `--target xmlcss` surface.
|
|
400
|
+
["", "--target xml", "--target mutate", "--target xmlcss"].each do |surface|
|
|
401
|
+
sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{surface} #{iso} --time #{secs}".squeeze(" ").strip)
|
|
402
|
+
end
|
|
403
|
+
end
|
|
149
404
|
end
|
|
405
|
+
|
|
406
|
+
# Coverage-guided libFuzzer harnesses for the pure-C surfaces (XML parser and
|
|
407
|
+
# XPath compile+eval). These are Ruby-free standalone binaries, so they run
|
|
408
|
+
# directly under clang's libFuzzer driver without the Ruby interpreter.
|
|
409
|
+
# They complement the Ruby-based robustness fuzzer by providing coverage
|
|
410
|
+
# feedback and 2-3 orders of magnitude faster execution for the C core.
|
|
411
|
+
desc "Build the libFuzzer harnesses (requires clang with libFuzzer support)"
|
|
412
|
+
task :libfuzzer_build => :compile do
|
|
413
|
+
libfuzzer_available? or
|
|
414
|
+
abort "fuzz:libfuzzer_build: #{ENV['CXX'] || 'clang++'} cannot link libFuzzer. " \
|
|
415
|
+
"Install an LLVM clang with libFuzzer support and run with " \
|
|
416
|
+
"CLANG=/path/to/clang CXX=/path/to/clang++."
|
|
417
|
+
Dir.chdir("ext/makiri/fuzz") do
|
|
418
|
+
sh "make clean"
|
|
419
|
+
sh "make all"
|
|
420
|
+
end
|
|
421
|
+
end
|
|
422
|
+
|
|
423
|
+
desc "Run the libFuzzer coverage-guided harnesses (default: 60s per target)"
|
|
424
|
+
task :libfuzzer => :libfuzzer_build do
|
|
425
|
+
time = ENV["FUZZ_TIME"] || "60"
|
|
426
|
+
Dir.chdir("ext/makiri/fuzz") do
|
|
427
|
+
sh "mkdir -p corpus/xml corpus/xpath"
|
|
428
|
+
sh "./xml_fuzz -max_total_time=#{time} -max_len=4096 corpus/xml"
|
|
429
|
+
sh "./xpath_fuzz -max_total_time=#{time} -max_len=4096 corpus/xpath"
|
|
430
|
+
end
|
|
431
|
+
end
|
|
432
|
+
end
|
|
433
|
+
|
|
434
|
+
desc "Show code statistics"
|
|
435
|
+
task :stats do
|
|
436
|
+
sh "tokei lib ext spec script --exclude tmp --exclude vendor --exclude docs"
|
|
150
437
|
end
|
data/ext/makiri/bridge/bridge.h
CHANGED
|
@@ -46,6 +46,34 @@ mkr_ruby_borrowed_bytes_t mkr_ruby_bytes_view(VALUE in);
|
|
|
46
46
|
* for an empty input), suitable for use while the GVL is released. */
|
|
47
47
|
int mkr_ruby_copy_bytes(VALUE in, mkr_owned_bytes_t *out);
|
|
48
48
|
|
|
49
|
+
/* Return a UTF-8 Ruby String for `str`, honouring its declared encoding: UTF-8 /
|
|
50
|
+
* US-ASCII / ASCII-8BIT are returned unchanged (the parser handles their bytes
|
|
51
|
+
* directly); any other encoding is transcoded to UTF-8 (invalid/undef -> U+FFFD)
|
|
52
|
+
* so its content is preserved rather than read as raw UTF-8. The UTF-8 common
|
|
53
|
+
* case is a single encoding comparison. */
|
|
54
|
+
VALUE mkr_ruby_to_utf8(VALUE str);
|
|
55
|
+
|
|
56
|
+
/* STRICT decode for XML (§2.1): like mkr_ruby_to_utf8 it honours the String's
|
|
57
|
+
* declared encoding (UTF-8 / US-ASCII / ASCII-8BIT pass through; any other
|
|
58
|
+
* encoding is transcoded to UTF-8) - but FAIL-CLOSED, never lenient: a non-UTF-8
|
|
59
|
+
* byte that can't be converted, invalid UTF-8, or an embedded NUL all raise
|
|
60
|
+
* Makiri::XML::SyntaxError (no U+FFFD replacement). Returns a validated,
|
|
61
|
+
* UTF-8-tagged Ruby String. (The HTML replace path mkr_ruby_to_utf8 itself is
|
|
62
|
+
* NOT reused for the conversion - only its encoding-judgment rule is shared.)
|
|
63
|
+
*
|
|
64
|
+
* +max_bytes+ bounds the decoded UTF-8 length: an input that already exceeds the
|
|
65
|
+
* parser's arena byte budget is rejected here with Makiri::XML::LimitExceeded,
|
|
66
|
+
* before the validation copy and the caller's GVL-release copy (so a hostile
|
|
67
|
+
* oversized document is not copied twice for a doomed parse). 0 disables the
|
|
68
|
+
* check (decode-only callers that build no arena). */
|
|
69
|
+
VALUE mkr_xml_decode_input(VALUE str, size_t max_bytes);
|
|
70
|
+
|
|
71
|
+
/* True if `str` is *already known* to be valid UTF-8 - pure ASCII, or valid in
|
|
72
|
+
* the UTF-8 encoding - from its cached coderange, WITHOUT forcing a scan. Lets
|
|
73
|
+
* the parse skip mkr_utf8_sanitize's validation pass for input Ruby has already
|
|
74
|
+
* classified (an unknown/broken coderange returns false: sanitize handles it). */
|
|
75
|
+
bool mkr_ruby_str_known_valid_utf8(VALUE str);
|
|
76
|
+
|
|
49
77
|
/* Validate a Ruby String for use as an XPath engine string: valid UTF-8,
|
|
50
78
|
* no interior NUL, and at most +max_bytes+. Returns NULL on success and fills
|
|
51
79
|
* +out+; otherwise returns a static reason string. +sv+ must be a String. */
|