makiri 0.3.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/conformance.yml +22 -0
- data/.github/workflows/libfuzzer.yml +83 -0
- data/.github/workflows/security.yml +88 -3
- data/.github/workflows/valgrind.yml +138 -0
- data/CHANGELOG.md +127 -2
- data/README.md +95 -77
- data/Rakefile +207 -3
- data/ext/makiri/bridge/ruby_string.c +159 -80
- data/ext/makiri/core/mkr_alloc.c +40 -3
- data/ext/makiri/core/mkr_alloc.h +28 -5
- data/ext/makiri/core/mkr_buf.c +13 -3
- data/ext/makiri/core/mkr_buf.h +80 -5
- data/ext/makiri/core/mkr_core.c +143 -0
- data/ext/makiri/core/mkr_core.h +10 -1
- data/ext/makiri/core/mkr_span.h +186 -0
- data/ext/makiri/core/mkr_utf8.c +101 -0
- data/ext/makiri/core/mkr_utf8.h +88 -0
- data/ext/makiri/{lexbor_compat → dom_adapter}/compat.h +4 -4
- data/ext/makiri/{lexbor_compat → dom_adapter}/compat_internal.h +1 -1
- data/ext/makiri/dom_adapter/cross_import.c +434 -0
- data/ext/makiri/dom_adapter/cross_import.h +35 -0
- data/ext/makiri/{lexbor_compat → dom_adapter}/source_loc.c +14 -16
- data/ext/makiri/{lexbor_compat → dom_adapter}/text_index.c +1 -1
- data/ext/makiri/{lexbor_compat → dom_adapter}/utf8_input.c +5 -78
- data/ext/makiri/extconf.rb +104 -9
- data/ext/makiri/fuzz/Makefile +95 -0
- data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
- data/ext/makiri/fuzz/xml_fuzz.c +24 -0
- data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
- data/ext/makiri/glue/cross_import.h +30 -0
- data/ext/makiri/glue/glue.h +9 -1
- data/ext/makiri/glue/ruby_doc.c +31 -27
- data/ext/makiri/glue/ruby_html_css.c +58 -12
- data/ext/makiri/glue/ruby_html_mutate.c +17 -6
- data/ext/makiri/glue/ruby_html_node.c +4 -33
- data/ext/makiri/glue/ruby_lexbor_css.c +462 -0
- data/ext/makiri/glue/ruby_node.c +53 -0
- data/ext/makiri/glue/ruby_xml.c +228 -17
- data/ext/makiri/glue/ruby_xml_node.c +133 -61
- data/ext/makiri/glue/ruby_xpath.c +20 -5
- data/ext/makiri/makiri.c +48 -0
- data/ext/makiri/makiri.h +5 -0
- data/ext/makiri/xml/mkr_xml.h +7 -3
- data/ext/makiri/xml/mkr_xml_chars.c +89 -97
- data/ext/makiri/xml/mkr_xml_index.c +169 -0
- data/ext/makiri/xml/mkr_xml_index.h +48 -0
- data/ext/makiri/xml/mkr_xml_mutate.c +220 -168
- data/ext/makiri/xml/mkr_xml_mutate.h +24 -0
- data/ext/makiri/xml/mkr_xml_node.c +147 -15
- data/ext/makiri/xml/mkr_xml_node.h +71 -6
- data/ext/makiri/xml/mkr_xml_tree.c +246 -174
- data/ext/makiri/xpath/mkr_css.c +1023 -0
- data/ext/makiri/xpath/mkr_css.h +65 -0
- data/ext/makiri/xpath/mkr_xpath.c +65 -0
- data/ext/makiri/xpath/mkr_xpath.h +18 -1
- data/ext/makiri/xpath/mkr_xpath_eval_body.h +383 -90
- data/ext/makiri/xpath/mkr_xpath_funcs_body.h +249 -231
- data/ext/makiri/xpath/mkr_xpath_internal.h +89 -9
- data/ext/makiri/xpath/mkr_xpath_lex.c +94 -124
- data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +6 -3
- data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
- data/ext/makiri/xpath/mkr_xpath_parse.c +79 -90
- data/ext/makiri/xpath/mkr_xpath_shared.c +40 -24
- data/ext/makiri/xpath/mkr_xpath_value_body.h +50 -24
- data/lib/makiri/cdata_section.rb +1 -3
- data/lib/makiri/comment.rb +1 -3
- data/lib/makiri/document.rb +8 -0
- data/lib/makiri/element.rb +1 -3
- data/lib/makiri/html/document.rb +11 -12
- data/lib/makiri/html/node_methods.rb +0 -1
- data/lib/makiri/node_set.rb +14 -9
- data/lib/makiri/processing_instruction.rb +8 -2
- data/lib/makiri/text.rb +1 -3
- data/lib/makiri/version.rb +1 -1
- data/lib/makiri/xml/builder.rb +271 -0
- data/lib/makiri/xml/node_methods.rb +47 -0
- data/lib/makiri/xpath_context.rb +12 -4
- data/lib/makiri.rb +1 -0
- data/script/check_alloc_failures.rb +266 -0
- data/script/check_c_safety.rb +45 -2
- data/script/check_c_safety_allowlist.yml +27 -5
- data/script/check_leaks.rb +64 -0
- data/script/leaks_harness.rb +71 -0
- data/suppressions/ruby.supp +140 -0
- data/vendor/lexbor/CMakeLists.txt +6 -0
- data/vendor/lexbor/README.md +12 -0
- data/vendor/lexbor/config.cmake +1 -1
- data/vendor/lexbor/source/lexbor/core/base.h +1 -1
- data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
- data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
- data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
- data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
- data/vendor/lexbor/source/lexbor/html/base.h +1 -1
- data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
- data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
- data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
- data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
- data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
- data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
- data/vendor/lexbor/source/lexbor/url/base.h +1 -1
- data/vendor/lexbor/source/lexbor/url/url.c +5 -2
- data/vendor/lexbor/source/lexbor/url/url.h +9 -0
- data/vendor/lexbor/version +1 -1
- metadata +31 -8
- /data/ext/makiri/{lexbor_compat → dom_adapter}/dom_index.c +0 -0
- /data/ext/makiri/{lexbor_compat → dom_adapter}/post_parse.c +0 -0
data/README.md
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
# Makiri
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
3
|
+
Makiri is a Ruby library for parsing and querying HTML and XML documents.
|
|
4
|
+
|
|
5
|
+
It uses [Lexbor](https://lexbor.com/) for HTML parsing and CSS selector matching, and includes a built-in native XPath 1.0 engine and XML 1.0 parser.
|
|
6
|
+
Makiri does not depend on libxml2.
|
|
6
7
|
|
|
7
8
|
> [!WARNING]
|
|
8
9
|
> Status: early release. APIs and behavior may change before v1.0.
|
|
@@ -14,14 +15,12 @@ XPath 1.0 evaluation in its own native engine, with no libxml2 dependency.
|
|
|
14
15
|
|
|
15
16
|
* HTML5 parsing via [Lexbor](https://lexbor.com)
|
|
16
17
|
* Makiri uses Lexbor as the parsing backend and provides a Ruby-facing DOM/query layer.
|
|
17
|
-
* Lexbor-specific behavior is isolated in a thin compatibility layer
|
|
18
|
-
(`ext/makiri/lexbor_compat/`).
|
|
19
18
|
* CSS selector support via Lexbor
|
|
20
19
|
* Supports Lexbor-backed standard CSS selector querying, including `:is`/`:where`/`:has`
|
|
21
20
|
* Native XPath 1.0 engine
|
|
22
21
|
* XPath is parsed and evaluated by Makiri's own engine, written from scratch.
|
|
23
22
|
* Makiri does not depend on libxml2 for parsing, DOM representation, or XPath evaluation.
|
|
24
|
-
* Native XML 1.0
|
|
23
|
+
* Native XML 1.0 parser
|
|
25
24
|
* A strict, non-validating, fail-closed parser with its own node arena (not
|
|
26
25
|
Lexbor's HTML DOM), queried through the same native XPath engine, with
|
|
27
26
|
in-place tree edits (attributes, content, rename, remove).
|
|
@@ -81,21 +80,6 @@ ctx.evaluate('//p[@class=$cls]').first.text # => "Hello"
|
|
|
81
80
|
|
|
82
81
|
### XML (with in-place editing)
|
|
83
82
|
|
|
84
|
-
`Makiri::XML(source)` parses **XML 1.0** with a native, strict,
|
|
85
|
-
well-formedness-checking parser (no libxml2) and queries it through the same
|
|
86
|
-
native XPath 1.0 engine. `source` is a String or any object responding to
|
|
87
|
-
`#read` (an `IO` / `File` / `StringIO`); read a non-UTF-8 file in binary mode
|
|
88
|
-
(`File.binread`) so its encoding is autodetected. Element-name case and namespaces are preserved. It is
|
|
89
|
-
**fail-closed**: malformed input, a duplicate attribute, or a
|
|
90
|
-
non-`1.0` version declaration raises `Makiri::XML::SyntaxError`, and operations
|
|
91
|
-
XML does not support raise `NotImplementedError` rather than returning a wrong
|
|
92
|
-
result. The tree supports in-place edits and building new subtrees (see below).
|
|
93
|
-
A `<!DOCTYPE ...>` is recognized but its **DTD is not processed** (no
|
|
94
|
-
entity/element declarations are loaded, no external subset is fetched) - so a
|
|
95
|
-
DTD-defined entity reference stays an undefined-entity error and **XXE /
|
|
96
|
-
billion-laughs are structurally impossible**. The doctype's name and identifiers
|
|
97
|
-
are still readable:
|
|
98
|
-
|
|
99
83
|
```ruby
|
|
100
84
|
doc = Makiri::XML(<<~XML)
|
|
101
85
|
<feed xmlns="http://www.w3.org/2005/Atom">
|
|
@@ -119,10 +103,13 @@ el = doc.at_xpath("//a:entry", ns)
|
|
|
119
103
|
el.local_name # => "entry"
|
|
120
104
|
el.namespace_uri # => "http://www.w3.org/2005/Atom"
|
|
121
105
|
|
|
122
|
-
|
|
106
|
+
# CSS selectors work too (lowered to the native XPath engine): a bare type
|
|
107
|
+
# selector binds to the document's default namespace, so this just works.
|
|
108
|
+
doc.css("entry").length # => 2
|
|
109
|
+
doc.css("feed > entry").map { |e| e.at_css("title").text } # => ["Hello", "World"]
|
|
123
110
|
|
|
124
111
|
# Serialize back to XML
|
|
125
|
-
doc.to_xml # => "<?xml version=\"1.0\"
|
|
112
|
+
doc.to_xml # => "<?xml version=\"1.0\"?>\n<feed ...>...</feed>\n"
|
|
126
113
|
doc.at_xpath("//a:entry", ns).to_xml # => "<entry><title>Hello</title></entry>" (no declaration)
|
|
127
114
|
doc.to_xml(pretty: true) # indented, element-only content
|
|
128
115
|
|
|
@@ -134,22 +121,7 @@ dtd.external_id # => "-//W3C//DTD XHTML 1.0//EN" (alias: #public_id)
|
|
|
134
121
|
dtd.system_id # => "x.dtd"
|
|
135
122
|
```
|
|
136
123
|
|
|
137
|
-
|
|
138
|
-
children (reachable via `//comment()` / `//processing-instruction()` and
|
|
139
|
-
`#children`), and adjacent CDATA is coalesced - matching libxml2 and the XPath
|
|
140
|
-
data model. `#to_xml` / `#to_s` serialize the tree back to XML (`pretty: true`,
|
|
141
|
-
or `indent: n`, for indented element-only content; `encoding: "Shift_JIS"` to
|
|
142
|
-
transcode, with a hex character reference for anything the encoding can't hold);
|
|
143
|
-
a `Document#to_xml` adds the declaration and the DOCTYPE. `#canonicalize` emits
|
|
144
|
-
Inclusive Canonical XML 1.0 (for XML signatures; `comments: true` to keep
|
|
145
|
-
comments), byte-identical to libxml2. CSS is intentionally unavailable for XML
|
|
146
|
-
(Lexbor's selector engine lower-cases names, which breaks XML case/namespace
|
|
147
|
-
matching) - use XPath.
|
|
148
|
-
|
|
149
|
-
The tree supports in-place mutation - every edit validates its input (names as
|
|
150
|
-
XML 1.0 QNames, values as XML Char) so the tree stays serializable to
|
|
151
|
-
well-formed XML, and a removed node is detached, never freed, so a live wrapper
|
|
152
|
-
that aliases it stays usable:
|
|
124
|
+
The tree supports in-place mutation.
|
|
153
125
|
|
|
154
126
|
```ruby
|
|
155
127
|
doc = Makiri::XML(%(<feed xmlns:dc="urn:dc"><entry id="1">Hi</entry><draft/></feed>))
|
|
@@ -165,15 +137,17 @@ doc.at_xpath("//draft").remove
|
|
|
165
137
|
doc.root.to_xml # => "<feed xmlns:dc=\"urn:dc\"><post dc:k=\"v\">Bye</post></feed>"
|
|
166
138
|
```
|
|
167
139
|
|
|
168
|
-
|
|
169
|
-
`#
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
140
|
+
XML subtrees can be built with `Document#create_element` and related node factory methods,
|
|
141
|
+
then inserted with `#add_child`, `#before`, `#after`, or `#replace`;
|
|
142
|
+
namespaces are resolved at insertion time, and cross-document nodes are deep-copied.
|
|
143
|
+
|
|
144
|
+
`Document#import_node(node, deep = false)` brings a node into a document as a
|
|
145
|
+
detached copy, and works **across representations**: importing a `Makiri::HTML`
|
|
146
|
+
node into a `Makiri::XML::Document` (or vice versa) translates the subtree between
|
|
147
|
+
the two node representations, preserving namespaces (e.g. an inline `<svg>` keeps
|
|
148
|
+
the SVG namespace, HTML elements the XHTML namespace; custom namespaces are
|
|
149
|
+
preserved across both directions). An XML CDATA section has no HTML counterpart,
|
|
150
|
+
so importing one into an HTML document raises.
|
|
177
151
|
|
|
178
152
|
```ruby
|
|
179
153
|
doc = Makiri::XML(%(<feed xmlns="urn:a" xmlns:dc="urn:dc"/>))
|
|
@@ -185,34 +159,29 @@ doc.root.add_child(entry)
|
|
|
185
159
|
doc.to_xml # => "...<entry dc:id=\"42\"><title>Hello</title></entry>..."
|
|
186
160
|
```
|
|
187
161
|
|
|
188
|
-
|
|
189
|
-
`#remove` / `#unlink`, the factories above, and `#add_child` / `<<` /
|
|
190
|
-
`#before` / `#after` / `#replace`. Insertion takes a `Makiri::XML` node or a
|
|
191
|
-
`DocumentFragment` (its children are spliced in); a fragment is parsed by
|
|
192
|
-
`Document#fragment(str)` (bound to the document) or `DocumentFragment.parse(str)`
|
|
193
|
-
(standalone). A raw string handed straight to `#add_child` is **not** accepted -
|
|
194
|
-
parse it into a fragment first. A whole document can also be built from scratch
|
|
195
|
-
with `XML::Document.new` + `#root=` and the factories.
|
|
196
|
-
|
|
197
|
-
The character encoding is autodetected (XML 1.0 Appendix F): a byte-order mark or
|
|
198
|
-
the `<?xml encoding="..."?>` declaration selects it, so raw bytes (`File.binread`)
|
|
199
|
-
in UTF-16, Shift_JIS, etc. parse correctly and a leading BOM is stripped. A
|
|
200
|
-
concrete String encoding stays authoritative - a BOM or declaration that
|
|
201
|
-
contradicts it is a fatal error, not a silent mis-decode.
|
|
202
|
-
|
|
203
|
-
Parsing is DoS-bounded by a single arena memory ceiling (default 256 MiB,
|
|
204
|
-
counting node structs and text), which fits every standard document. Raise it
|
|
205
|
-
per parse for an unusually large one:
|
|
162
|
+
`Makiri::XML::Builder` is the Nokogiri-compatible DSL over those factories.
|
|
206
163
|
|
|
207
164
|
```ruby
|
|
208
|
-
|
|
165
|
+
builder = Makiri::XML::Builder.new do |xml|
|
|
166
|
+
xml.feed("xmlns" => "http://www.w3.org/2005/Atom", "xmlns:dc" => "urn:dc") do
|
|
167
|
+
xml.title("Example Feed")
|
|
168
|
+
xml.entry("dc:id" => "1") do
|
|
169
|
+
xml.title("First")
|
|
170
|
+
xml.summary { xml.cdata("raw <b>html</b>") }
|
|
171
|
+
end
|
|
172
|
+
end
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
builder.to_xml # the whole document (with XML declaration)
|
|
176
|
+
builder.doc # the Makiri::XML::Document being built
|
|
209
177
|
```
|
|
210
178
|
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
179
|
+
XML parsing is bounded by an arena memory limit, 256 MiB by default,
|
|
180
|
+
and unusually large documents can raise it with `max_bytes:`.
|
|
181
|
+
|
|
182
|
+
```ruby
|
|
183
|
+
Makiri::XML(huge_xml, max_bytes: 512 * 1024 * 1024) # also Makiri::XML::Document.parse(..., max_bytes:)
|
|
184
|
+
```
|
|
216
185
|
|
|
217
186
|
## Non-goals (v1.0)
|
|
218
187
|
|
|
@@ -265,20 +234,60 @@ Detailed, test-backed notes live in `spec/conformance/README.md`.
|
|
|
265
234
|
markup string straight to `#add_child` is unsupported (parse it into a fragment
|
|
266
235
|
first). (`#to_xml` serialization is supported; HTML serialization - `to_html`
|
|
267
236
|
/ `inner_html` / `outer_html` - is not.)
|
|
237
|
+
* A colon in a processing-instruction target is well-formed (`<?a:b ...?>` parses).
|
|
238
|
+
* XML 1.0 §2.6: a `PITarget` is a `Name`, not an NCName, and Namespaces in XML
|
|
239
|
+
1.0's normative conformance section constrains only element/attribute names
|
|
240
|
+
(QNames), never PI targets. Nokogiri/libxml2 rejects it (`colons are forbidden
|
|
241
|
+
from PI names`); Makiri follows the normative text. Only the reserved `xml`
|
|
242
|
+
(any case) target is rejected.
|
|
268
243
|
* Otherwise the parsed tree is byte-identical to `Nokogiri::XML`'s (verified by
|
|
269
244
|
the property-based differential), including namespaces, prolog/epilog comments
|
|
270
245
|
and PIs, and adjacent-CDATA coalescing.
|
|
271
246
|
|
|
272
247
|
### CSS
|
|
273
248
|
|
|
274
|
-
* jQuery/Nokogiri CSS extensions are not supported (`:
|
|
275
|
-
* Makiri uses Lexbor's standards-
|
|
276
|
-
Use XPath (`xpath("//p[contains(., 'x')]")`) or
|
|
249
|
+
* Most jQuery/Nokogiri CSS extensions are not supported (`:gt`, `:lt`, `:eq`, `:first`, ...)
|
|
250
|
+
* Makiri uses Lexbor's selector engine, which is standards-based apart from one
|
|
251
|
+
text-containment extension. Use XPath (`xpath("//p[contains(., 'x')]")`) or
|
|
252
|
+
Enumerable (`css('li')[1]`) for the rest.
|
|
277
253
|
Standard Level-4 selectors (`:is` / `:where` / `:has`) are supported, some of which Nokogiri rejects.
|
|
254
|
+
* `:lexbor-contains("text")` **is** supported (on both HTML and XML) - Lexbor's
|
|
255
|
+
spelling of the jQuery `:contains()` substring filter, matching an element
|
|
256
|
+
whose text contains the string; append ` i` (`:lexbor-contains("text" i)`)
|
|
257
|
+
for an ASCII case-insensitive match. (Nokogiri's name `:contains` is not an
|
|
258
|
+
alias.) Like Lexbor's matcher, it tests the element's **immediate child text
|
|
259
|
+
nodes** (not the deep string-value), so HTML and XML agree; on XML it lowers
|
|
260
|
+
to XPath `child::text()[contains(., "text")]`.
|
|
261
|
+
* Untyped `:*-of-type` (`:first-of-type`, `:nth-of-type(an+b)`, ... with no type
|
|
262
|
+
selector) is supported and correct on both HTML and XML - the "type" is the
|
|
263
|
+
element's own expanded name.
|
|
264
|
+
* Nokogiri (XML and HTML5) mistranslates these to first-/only-child
|
|
265
|
+
(`//*[position()=1]` / `//*[last()=1]`), so it under-matches; Makiri matches
|
|
266
|
+
Lexbor's HTML matcher.
|
|
278
267
|
* Type selectors are ASCII case-insensitive (CSS-correct for HTML; `LI` matches `<li>`)
|
|
279
268
|
* `Nokogiri::HTML5` is case-sensitive there.
|
|
280
|
-
|
|
281
|
-
|
|
269
|
+
|
|
270
|
+
## Conformance
|
|
271
|
+
|
|
272
|
+
The XPath engine and XML parser are original code, so their correctness is verified by
|
|
273
|
+
differential and standards harnesses in `spec/conformance/`.
|
|
274
|
+
The HTML XPath and CSS suites are differentials against **`Nokogiri::HTML5`**
|
|
275
|
+
(Gumbo / WHATWG, never libxml2's non-conformant HTML4 parser): both sides parse
|
|
276
|
+
HTML5, so the DOM is isomorphic and results are compared node-for-node. HTML
|
|
277
|
+
parsing itself is checked against the WHATWG html5lib-tests corpus, and
|
|
278
|
+
XPath-over-HTML semantics additionally against browsers via a WPT port.
|
|
279
|
+
See also [`spec/conformance/README.md`](spec/conformance/README.md).
|
|
280
|
+
|
|
281
|
+
| Suite | Input | Oracle | `rake` task |
|
|
282
|
+
|---|---|---|---|
|
|
283
|
+
| HTML parsing | HTML | WHATWG html5lib-tests (expected-tree corpus) | `conformance:html5` |
|
|
284
|
+
| XPath 1.0 | HTML | `Nokogiri::HTML5` (libxml2 XPath) — differential | `conformance:xpath` |
|
|
285
|
+
| XPath over HTML | HTML | browsers (WPT `domxpath`, hand-ported; runs under `rake spec`) | — |
|
|
286
|
+
| CSS selectors | HTML | `Nokogiri::HTML5#css` — differential | `conformance:css` |
|
|
287
|
+
| Well-formedness | XML | W3C XML Conformance Test Suite | `conformance:xmlconf` |
|
|
288
|
+
| XPath 1.0 | XML | `Nokogiri::XML` — differential | `conformance:xpath_xml` |
|
|
289
|
+
| Parsed tree (property-based) | XML | `Nokogiri::XML` — differential | `conformance:xml_pbt` |
|
|
290
|
+
| CSS selectors | XML | `Nokogiri::XML` — differential | `conformance:css_xml` |
|
|
282
291
|
|
|
283
292
|
## Requirements
|
|
284
293
|
|
|
@@ -295,6 +304,15 @@ bundle exec rake compile
|
|
|
295
304
|
bundle exec rake spec
|
|
296
305
|
```
|
|
297
306
|
|
|
307
|
+
### Vendored Lexbor version
|
|
308
|
+
|
|
309
|
+
`vendor/lexbor` is pinned to `3a2d595` (`v3.0.0-25`), an untagged `master`
|
|
310
|
+
commit, for fixes that v3.0.0 lacks: two upstreamed CSS-selector fixes (class/ID
|
|
311
|
+
case-sensitivity in quirks mode, and prefix-less type-selector namespacing), a
|
|
312
|
+
heap-overflow fix in the `:lexbor-contains()` parser, and other post-v3.0.0
|
|
313
|
+
bugfixes. Lexbor stays vanilla; we return to a release tag once one ships after
|
|
314
|
+
v3.0.0. See `CLAUDE.md` for details.
|
|
315
|
+
|
|
298
316
|
## License
|
|
299
317
|
|
|
300
318
|
Apache License 2.0. See [LICENSE](LICENSE) and [NOTICE](NOTICE).
|
data/Rakefile
CHANGED
|
@@ -4,6 +4,7 @@ require "bundler/gem_tasks"
|
|
|
4
4
|
require "rspec/core/rake_task"
|
|
5
5
|
require "rake/extensiontask"
|
|
6
6
|
require "shellwords"
|
|
7
|
+
require "tmpdir"
|
|
7
8
|
|
|
8
9
|
GEMSPEC = Gem::Specification.load("makiri.gemspec")
|
|
9
10
|
|
|
@@ -35,6 +36,72 @@ RSpec::Core::RakeTask.new(:spec)
|
|
|
35
36
|
|
|
36
37
|
task default: %i[compile spec]
|
|
37
38
|
|
|
39
|
+
# `rake spec:valgrind` - run the spec suite under Valgrind memcheck via
|
|
40
|
+
# ruby_memcheck (Linux CI; see .github/workflows/valgrind.yml). The gem ships
|
|
41
|
+
# Ruby's own Valgrind suppression files (matched by Ruby version) and filters
|
|
42
|
+
# the report down to errors whose stack touches our extension, so we no longer
|
|
43
|
+
# have to fetch ruby.supp from ruby/ruby (that path was removed upstream).
|
|
44
|
+
#
|
|
45
|
+
# We keep this job's historical contract: catch *use of uninitialised values*
|
|
46
|
+
# and *invalid reads/writes* (incl. intra-arena overflows) - NOT leaks (leak
|
|
47
|
+
# detection stays with `rake leaks`). So we override ruby_memcheck's defaults,
|
|
48
|
+
# which disable undef-value errors and turn on full leak-check.
|
|
49
|
+
#
|
|
50
|
+
# `filter_all_errors: true` is essential: by default ruby_memcheck only applies
|
|
51
|
+
# its "stack must touch the makiri binary" filter to *leak*-kind errors
|
|
52
|
+
# (`ValgrindError#should_filter? = filter_all_errors? || kind_leak?`), so every
|
|
53
|
+
# uninitialised-value report is surfaced regardless of where it comes from. Ruby's
|
|
54
|
+
# conservative GC (machine-context scan, RVALUE flag aging, free-at-exit teardown)
|
|
55
|
+
# legitimately reads uninitialised words, and the bundled ruby.supp does not cover
|
|
56
|
+
# the free-at-exit / subprocess stacks the `:isolated` specs spin up under
|
|
57
|
+
# `--trace-children=yes` - which buried the run in ~3500 pure-Ruby false positives.
|
|
58
|
+
# Filtering all error kinds by the same binary-touch rule keeps the gate scoped to
|
|
59
|
+
# *our* code: a real uninit/invalid access in mkr_*/Lexbor still has a makiri frame
|
|
60
|
+
# and is still reported.
|
|
61
|
+
#
|
|
62
|
+
# BUT the binary-touch filter is too coarse for one residual class: when a GC
|
|
63
|
+
# cycle fires *inside* one of our allocations (or marks through our mark
|
|
64
|
+
# callback), CRuby's conservative collector legitimately reads uninitialised
|
|
65
|
+
# words (machine-stack scan reading stale frames, incremental mark/sweep reading
|
|
66
|
+
# not-yet-written RVALUE flags) while a makiri frame sits on the stack - so ~190
|
|
67
|
+
# of these pure-Ruby-GC false positives pass the filter. The gem's bundled
|
|
68
|
+
# ruby.supp only covers `each_location*` under Addr8, not the Cond/Value8 reads
|
|
69
|
+
# we hit. `suppressions/ruby.supp` (auto-loaded by ruby_memcheck: it globs
|
|
70
|
+
# `<dir>/<ruby-version>.supp`, and the bare `ruby.supp` matches every version)
|
|
71
|
+
# suppresses exactly those GC-driver-anchored uninit reads, plus the VM
|
|
72
|
+
# method-cache id_table the interpreter never frees before exit. A real uninit
|
|
73
|
+
# read in our code does not descend from a GC driver, so it still fails.
|
|
74
|
+
#
|
|
75
|
+
# Guarded: ruby_memcheck lives in the optional :valgrind bundler group, so a
|
|
76
|
+
# normal `bundle exec rake` (without that group) must not fail to load.
|
|
77
|
+
begin
|
|
78
|
+
require "ruby_memcheck"
|
|
79
|
+
require "ruby_memcheck/rspec/rake_task"
|
|
80
|
+
|
|
81
|
+
RubyMemcheck.config(
|
|
82
|
+
binary_name: "makiri",
|
|
83
|
+
filter_all_errors: true, # apply the binary-touch filter to ALL error kinds,
|
|
84
|
+
# not just leaks (see note above) - drops Ruby's own
|
|
85
|
+
# GC uninitialised-value noise, keeps mkr_* reports
|
|
86
|
+
valgrind_options: [
|
|
87
|
+
"--num-callers=50",
|
|
88
|
+
"--error-limit=no",
|
|
89
|
+
"--trace-children=yes", # spec processes may fork
|
|
90
|
+
"--undef-value-errors=yes", # the point of this job (ruby_memcheck defaults to =no)
|
|
91
|
+
"--track-origins=yes", # report where an uninitialised value came from
|
|
92
|
+
"--leak-check=no", # leaks are `rake leaks`' job, not this one
|
|
93
|
+
],
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
namespace :spec do
|
|
97
|
+
desc "Run the spec suite under Valgrind memcheck (ruby_memcheck; needs the " \
|
|
98
|
+
":valgrind bundler group and the valgrind binary)"
|
|
99
|
+
RubyMemcheck::RSpec::RakeTask.new(valgrind: :compile)
|
|
100
|
+
end
|
|
101
|
+
rescue LoadError
|
|
102
|
+
# ruby_memcheck not installed (optional :valgrind group absent) - skip the task.
|
|
103
|
+
end
|
|
104
|
+
|
|
38
105
|
namespace :security do
|
|
39
106
|
desc "Run mechanical C safety lint over ext/makiri"
|
|
40
107
|
task :clint do
|
|
@@ -81,6 +148,17 @@ def asan_runtime_path
|
|
|
81
148
|
nil
|
|
82
149
|
end
|
|
83
150
|
|
|
151
|
+
def libfuzzer_available?
|
|
152
|
+
cxx = ENV["CXX"].to_s.empty? ? "clang++" : ENV["CXX"]
|
|
153
|
+
Dir.mktmpdir("makiri-libfuzzer-check") do |dir|
|
|
154
|
+
src = File.join(dir, "check.cc")
|
|
155
|
+
exe = File.join(dir, "check")
|
|
156
|
+
File.write(src, "extern \"C\" int LLVMFuzzerTestOneInput(const unsigned char*, unsigned long){return 0;}\n")
|
|
157
|
+
return system(cxx, "-fsanitize=fuzzer,address,undefined", src, "-o", exe,
|
|
158
|
+
out: File::NULL, err: File::NULL)
|
|
159
|
+
end
|
|
160
|
+
end
|
|
161
|
+
|
|
84
162
|
# The compiled extension, and whether it carries sanitizer instrumentation, so
|
|
85
163
|
# `fuzz:sanitize SKIP_BUILD=1` can refuse to run a plain (non-ASan) build.
|
|
86
164
|
def ext_bundle_path
|
|
@@ -116,6 +194,65 @@ task :sanitize do
|
|
|
116
194
|
sh(env, "#{FileUtils::RUBY} -S rspec")
|
|
117
195
|
end
|
|
118
196
|
|
|
197
|
+
desc "Measure C coverage of OUR sources (clang source-based) over the spec suite. " \
|
|
198
|
+
"Prints an llvm-cov region+branch report (excludes vendored Lexbor) and writes " \
|
|
199
|
+
"a line-level detail file to tmp/coverage/show.txt."
|
|
200
|
+
task :coverage do
|
|
201
|
+
require "fileutils"
|
|
202
|
+
dir = File.expand_path("tmp/coverage")
|
|
203
|
+
FileUtils.rm_rf(dir)
|
|
204
|
+
FileUtils.mkdir_p(dir)
|
|
205
|
+
|
|
206
|
+
# Instrument only our sources (Lexbor is built separately, uninstrumented).
|
|
207
|
+
sh({ "MAKIRI_COVERAGE" => "1" }, "#{FileUtils::RUBY} -S rake clean compile")
|
|
208
|
+
# %p -> PID, so any forked spec process gets its own raw profile.
|
|
209
|
+
sh({ "LLVM_PROFILE_FILE" => File.join(dir, "makiri-%p.profraw") }, "#{FileUtils::RUBY} -S rspec")
|
|
210
|
+
|
|
211
|
+
profdata = File.join(dir, "makiri.profdata")
|
|
212
|
+
bundle = "lib/makiri/makiri.bundle"
|
|
213
|
+
ignore = "(vendor/lexbor|/usr/|/Library/|ruby/|rubygems)"
|
|
214
|
+
sh "xcrun llvm-profdata merge -sparse #{dir}/*.profraw -o #{profdata}"
|
|
215
|
+
sh "xcrun llvm-cov report #{bundle} -instr-profile=#{profdata} " \
|
|
216
|
+
"-ignore-filename-regex='#{ignore}' -show-branch-summary"
|
|
217
|
+
show = File.join(dir, "show.txt")
|
|
218
|
+
sh "xcrun llvm-cov show #{bundle} -instr-profile=#{profdata} " \
|
|
219
|
+
"-ignore-filename-regex='#{ignore}' -show-branches=count -show-line-counts-or-regions > #{show}"
|
|
220
|
+
puts "\ncoverage line/branch detail: #{show}"
|
|
221
|
+
puts "(coverage build left in place; run `rake clean compile` to restore a normal build)"
|
|
222
|
+
end
|
|
223
|
+
|
|
224
|
+
desc "Like :sanitize but also builds the vendored Lexbor under ASan, so overflows " \
|
|
225
|
+
"INSIDE Lexbor's mraw arena are caught (slow: full Lexbor rebuild). Runs the " \
|
|
226
|
+
"spec suite, or FUZZ_ARGS via the fuzzer when set."
|
|
227
|
+
task "sanitize:lexbor" do
|
|
228
|
+
sanitize = ENV["MAKIRI_SANITIZE"] || "address,undefined"
|
|
229
|
+
sanitize.include?("address") or
|
|
230
|
+
abort "sanitize:lexbor needs an address build (MAKIRI_SANITIZE must include 'address')"
|
|
231
|
+
|
|
232
|
+
# MAKIRI_SANITIZE_LEXBOR makes extconf build Lexbor with -DLEXBOR_BUILD_WITH_ASAN
|
|
233
|
+
# (enabling its mraw poisoning); the build-mode stamp auto-rebuilds Lexbor on the
|
|
234
|
+
# plain<->asan switch, so no manual clean:lexbor is needed before or after.
|
|
235
|
+
build_env = { "MAKIRI_SANITIZE" => sanitize, "MAKIRI_SANITIZE_LEXBOR" => "1" }
|
|
236
|
+
sh(build_env, "#{FileUtils::RUBY} -S rake clean compile")
|
|
237
|
+
|
|
238
|
+
env = {
|
|
239
|
+
"ASAN_OPTIONS" => "detect_leaks=0:detect_container_overflow=0:" \
|
|
240
|
+
"detect_odr_violation=0:abort_on_error=1:halt_on_error=1",
|
|
241
|
+
"UBSAN_OPTIONS" => "print_stacktrace=1:halt_on_error=1",
|
|
242
|
+
}
|
|
243
|
+
runtime = asan_runtime_path or
|
|
244
|
+
abort "sanitize:lexbor: could not locate the ASan runtime for #{RbConfig::CONFIG['CC']}"
|
|
245
|
+
preload = RbConfig::CONFIG["target_os"] =~ /darwin/ ? "DYLD_INSERT_LIBRARIES" : "LD_PRELOAD"
|
|
246
|
+
env[preload] = runtime
|
|
247
|
+
puts "sanitize:lexbor: preloading #{runtime} via #{preload}"
|
|
248
|
+
|
|
249
|
+
if ENV["FUZZ_ARGS"]
|
|
250
|
+
sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}")
|
|
251
|
+
else
|
|
252
|
+
sh(env, "#{FileUtils::RUBY} -S rspec")
|
|
253
|
+
end
|
|
254
|
+
end
|
|
255
|
+
|
|
119
256
|
desc "Run the robustness fuzzer (override options via FUZZ_ARGS)"
|
|
120
257
|
task fuzz: :compile do
|
|
121
258
|
sh "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}"
|
|
@@ -131,6 +268,25 @@ task "fuzz:mutate": :compile do
|
|
|
131
268
|
sh "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb --target mutate #{ENV['FUZZ_ARGS']}"
|
|
132
269
|
end
|
|
133
270
|
|
|
271
|
+
desc "Malloc-leak gate (macOS `leaks`): fails on per-call leak stacks through the ext"
|
|
272
|
+
task leaks: :compile do
|
|
273
|
+
# ASan runs with detect_leaks=0 (Ruby/Lexbor are uninstrumented), so plain
|
|
274
|
+
# leaks are otherwise never machine-checked; see script/check_leaks.rb.
|
|
275
|
+
sh "#{FileUtils::RUBY} script/check_leaks.rb"
|
|
276
|
+
end
|
|
277
|
+
|
|
278
|
+
desc "OOM-injection gate: rebuild with MAKIRI_ALLOC_INJECT=1 and sweep every core " \
|
|
279
|
+
"allocation site, verifying each failure fails closed (clean raise or " \
|
|
280
|
+
"baseline-identical result, never truncated output)"
|
|
281
|
+
task :oom do
|
|
282
|
+
# The hook is compiled in only under MAKIRI_ALLOC_INJECT=1 (zero overhead in
|
|
283
|
+
# a normal build), so this needs its own rebuild; see
|
|
284
|
+
# script/check_alloc_failures.rb for the protocol and the property gated.
|
|
285
|
+
sh({ "MAKIRI_ALLOC_INJECT" => "1" }, "#{FileUtils::RUBY} -S rake clean compile")
|
|
286
|
+
sh "#{FileUtils::RUBY} -Ilib script/check_alloc_failures.rb"
|
|
287
|
+
puts "(injection build left in place; run `rake clean compile` to restore a normal build)"
|
|
288
|
+
end
|
|
289
|
+
|
|
134
290
|
desc "Run the performance benchmark (Makiri vs Nokogiri reference)"
|
|
135
291
|
task bench: :compile do
|
|
136
292
|
# Run outside the bundle so the bench-only gems (nokogiri, benchmark-ips)
|
|
@@ -190,10 +346,26 @@ namespace :conformance do
|
|
|
190
346
|
sh "#{FileUtils::RUBY} -Ilib spec/conformance/css_diff.rb #{ENV['CSS_ARGS']}"
|
|
191
347
|
end
|
|
192
348
|
end
|
|
349
|
+
|
|
350
|
+
desc "XML CSS-selector differential conformance: Makiri::XML vs Nokogiri::XML"
|
|
351
|
+
task css_xml: :compile do
|
|
352
|
+
Bundler.with_unbundled_env do
|
|
353
|
+
sh "#{FileUtils::RUBY} -Ilib spec/conformance/xml_css_diff.rb #{ENV['CSS_XML_ARGS']}"
|
|
354
|
+
end
|
|
355
|
+
end
|
|
356
|
+
|
|
357
|
+
desc "XML Builder differential conformance: Makiri::XML::Builder vs Nokogiri::XML::Builder"
|
|
358
|
+
task builder: :compile do
|
|
359
|
+
Bundler.with_unbundled_env do
|
|
360
|
+
sh "#{FileUtils::RUBY} -Ilib spec/conformance/builder_diff.rb #{ENV['BUILDER_ARGS']}"
|
|
361
|
+
end
|
|
362
|
+
end
|
|
193
363
|
end
|
|
194
364
|
|
|
195
365
|
desc "Run all conformance suites"
|
|
196
|
-
task conformance: %w[conformance:html5 conformance:xpath conformance:css
|
|
366
|
+
task conformance: %w[conformance:html5 conformance:xpath conformance:css
|
|
367
|
+
conformance:xmlconf conformance:xpath_xml conformance:css_xml
|
|
368
|
+
conformance:builder]
|
|
197
369
|
|
|
198
370
|
namespace :fuzz do
|
|
199
371
|
# Run the fuzzer under the sanitizer. Toggles (all via env):
|
|
@@ -233,14 +405,46 @@ namespace :fuzz do
|
|
|
233
405
|
if ENV["FUZZ_ARGS"]
|
|
234
406
|
sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}")
|
|
235
407
|
else
|
|
236
|
-
iso = %w[1 true yes].include?(ENV["
|
|
408
|
+
iso = %w[1 true yes].include?(ENV["ISOLATED"].to_s.downcase) ? "--isolated" : ""
|
|
237
409
|
secs = ENV["FUZZ_TIME"] || "90"
|
|
238
410
|
# Cover every surface under the sanitizer: the query engine (XPath/CSS over
|
|
239
411
|
# parsed fixtures), the XML parser (hostile documents), and the XML mutation
|
|
240
412
|
# surface (random edit sequences + invariants).
|
|
241
|
-
["", "--target xml", "--target mutate"].each do |surface|
|
|
413
|
+
["", "--target xml", "--target mutate", "--target xmlcss"].each do |surface|
|
|
242
414
|
sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{surface} #{iso} --time #{secs}".squeeze(" ").strip)
|
|
243
415
|
end
|
|
244
416
|
end
|
|
245
417
|
end
|
|
418
|
+
|
|
419
|
+
# Coverage-guided libFuzzer harnesses for the pure-C surfaces (XML parser and
|
|
420
|
+
# XPath compile+eval). These are Ruby-free standalone binaries, so they run
|
|
421
|
+
# directly under clang's libFuzzer driver without the Ruby interpreter.
|
|
422
|
+
# They complement the Ruby-based robustness fuzzer by providing coverage
|
|
423
|
+
# feedback and 2-3 orders of magnitude faster execution for the C core.
|
|
424
|
+
desc "Build the libFuzzer harnesses (requires clang with libFuzzer support)"
|
|
425
|
+
task :libfuzzer_build => :compile do
|
|
426
|
+
libfuzzer_available? or
|
|
427
|
+
abort "fuzz:libfuzzer_build: #{ENV['CXX'] || 'clang++'} cannot link libFuzzer. " \
|
|
428
|
+
"Install an LLVM clang with libFuzzer support and run with " \
|
|
429
|
+
"CLANG=/path/to/clang CXX=/path/to/clang++."
|
|
430
|
+
Dir.chdir("ext/makiri/fuzz") do
|
|
431
|
+
sh "make clean"
|
|
432
|
+
sh "make all"
|
|
433
|
+
end
|
|
434
|
+
end
|
|
435
|
+
|
|
436
|
+
desc "Run the libFuzzer coverage-guided harnesses (default: 60s per target)"
|
|
437
|
+
task :libfuzzer => :libfuzzer_build do
|
|
438
|
+
time = ENV["FUZZ_TIME"] || "60"
|
|
439
|
+
Dir.chdir("ext/makiri/fuzz") do
|
|
440
|
+
sh "mkdir -p corpus/xml corpus/xpath"
|
|
441
|
+
sh "./xml_fuzz -max_total_time=#{time} -max_len=4096 corpus/xml"
|
|
442
|
+
sh "./xpath_fuzz -max_total_time=#{time} -max_len=4096 corpus/xpath"
|
|
443
|
+
end
|
|
444
|
+
end
|
|
445
|
+
end
|
|
446
|
+
|
|
447
|
+
desc "Show code statistics"
|
|
448
|
+
task :stats do
|
|
449
|
+
sh "tokei lib ext spec script --exclude tmp --exclude vendor --exclude docs"
|
|
246
450
|
end
|