makiri 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/conformance.yml +22 -0
  3. data/.github/workflows/libfuzzer.yml +83 -0
  4. data/.github/workflows/security.yml +88 -3
  5. data/.github/workflows/valgrind.yml +135 -0
  6. data/CHANGELOG.md +60 -2
  7. data/README.md +81 -77
  8. data/Rakefile +194 -3
  9. data/ext/makiri/bridge/ruby_string.c +119 -66
  10. data/ext/makiri/core/mkr_alloc.c +40 -3
  11. data/ext/makiri/core/mkr_alloc.h +27 -4
  12. data/ext/makiri/core/mkr_buf.c +13 -3
  13. data/ext/makiri/core/mkr_buf.h +80 -5
  14. data/ext/makiri/core/mkr_core.c +143 -0
  15. data/ext/makiri/core/mkr_core.h +10 -1
  16. data/ext/makiri/core/mkr_span.h +186 -0
  17. data/ext/makiri/core/mkr_utf8.c +101 -0
  18. data/ext/makiri/core/mkr_utf8.h +88 -0
  19. data/ext/makiri/extconf.rb +104 -9
  20. data/ext/makiri/fuzz/Makefile +95 -0
  21. data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
  22. data/ext/makiri/fuzz/xml_fuzz.c +24 -0
  23. data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
  24. data/ext/makiri/glue/glue.h +8 -0
  25. data/ext/makiri/glue/ruby_doc.c +20 -24
  26. data/ext/makiri/glue/ruby_html_css.c +58 -12
  27. data/ext/makiri/glue/ruby_html_mutate.c +11 -6
  28. data/ext/makiri/glue/ruby_html_node.c +3 -32
  29. data/ext/makiri/glue/ruby_node.c +39 -0
  30. data/ext/makiri/glue/ruby_xml.c +198 -16
  31. data/ext/makiri/glue/ruby_xml_node.c +46 -59
  32. data/ext/makiri/glue/ruby_xpath.c +4 -4
  33. data/ext/makiri/lexbor_compat/source_loc.c +14 -16
  34. data/ext/makiri/lexbor_compat/utf8_input.c +5 -78
  35. data/ext/makiri/makiri.c +45 -0
  36. data/ext/makiri/xml/mkr_xml.h +2 -3
  37. data/ext/makiri/xml/mkr_xml_chars.c +67 -97
  38. data/ext/makiri/xml/mkr_xml_index.c +169 -0
  39. data/ext/makiri/xml/mkr_xml_index.h +48 -0
  40. data/ext/makiri/xml/mkr_xml_mutate.c +63 -121
  41. data/ext/makiri/xml/mkr_xml_node.c +147 -15
  42. data/ext/makiri/xml/mkr_xml_node.h +71 -6
  43. data/ext/makiri/xml/mkr_xml_tree.c +185 -149
  44. data/ext/makiri/xpath/mkr_css.c +1023 -0
  45. data/ext/makiri/xpath/mkr_css.h +65 -0
  46. data/ext/makiri/xpath/mkr_xpath.c +37 -0
  47. data/ext/makiri/xpath/mkr_xpath.h +13 -0
  48. data/ext/makiri/xpath/mkr_xpath_eval_body.h +373 -90
  49. data/ext/makiri/xpath/mkr_xpath_funcs_body.h +249 -231
  50. data/ext/makiri/xpath/mkr_xpath_internal.h +89 -9
  51. data/ext/makiri/xpath/mkr_xpath_lex.c +94 -124
  52. data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +6 -3
  53. data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
  54. data/ext/makiri/xpath/mkr_xpath_parse.c +79 -90
  55. data/ext/makiri/xpath/mkr_xpath_shared.c +40 -24
  56. data/ext/makiri/xpath/mkr_xpath_value_body.h +50 -24
  57. data/lib/makiri/cdata_section.rb +1 -3
  58. data/lib/makiri/comment.rb +1 -3
  59. data/lib/makiri/document.rb +8 -0
  60. data/lib/makiri/element.rb +1 -3
  61. data/lib/makiri/processing_instruction.rb +1 -3
  62. data/lib/makiri/text.rb +1 -3
  63. data/lib/makiri/version.rb +1 -1
  64. data/lib/makiri/xml/builder.rb +263 -0
  65. data/lib/makiri/xml/node_methods.rb +47 -0
  66. data/lib/makiri.rb +1 -0
  67. data/script/check_alloc_failures.rb +266 -0
  68. data/script/check_c_safety.rb +45 -2
  69. data/script/check_c_safety_allowlist.yml +19 -0
  70. data/script/check_leaks.rb +64 -0
  71. data/script/leaks_harness.rb +64 -0
  72. data/vendor/lexbor/CMakeLists.txt +6 -0
  73. data/vendor/lexbor/README.md +12 -0
  74. data/vendor/lexbor/config.cmake +1 -1
  75. data/vendor/lexbor/source/lexbor/core/base.h +1 -1
  76. data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
  77. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
  78. data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
  79. data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
  80. data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
  81. data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
  82. data/vendor/lexbor/source/lexbor/html/base.h +1 -1
  83. data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
  84. data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
  85. data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
  86. data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
  87. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
  88. data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
  89. data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
  90. data/vendor/lexbor/source/lexbor/url/base.h +1 -1
  91. data/vendor/lexbor/source/lexbor/url/url.c +5 -2
  92. data/vendor/lexbor/source/lexbor/url/url.h +9 -0
  93. data/vendor/lexbor/version +1 -1
  94. metadata +19 -1
data/README.md CHANGED
@@ -1,8 +1,9 @@
1
1
  # Makiri
2
2
 
3
- Standards-oriented HTML5/XML parsing, CSS selector querying, XPath 1.0 querying,
4
- and a native XML 1.0 reader/editor for Ruby, powered by Lexbor and a native XPath
5
- engine - with no libxml2 dependency.
3
+ Makiri is a Ruby library for parsing and querying HTML and XML documents.
4
+
5
+ It uses [Lexbor](https://lexbor.com/) for HTML parsing and CSS selector matching, and includes a built-in native XPath 1.0 engine and XML 1.0 parser.
6
+ Makiri does not depend on libxml2.
6
7
 
7
8
  > [!WARNING]
8
9
  > Status: early release. APIs and behavior may change before v1.0.
@@ -14,14 +15,12 @@ XPath 1.0 evaluation in its own native engine, with no libxml2 dependency.
14
15
 
15
16
  * HTML5 parsing via [Lexbor](https://lexbor.com)
16
17
  * Makiri uses Lexbor as the parsing backend and provides a Ruby-facing DOM/query layer.
17
- * Lexbor-specific behavior is isolated in a thin compatibility layer
18
- (`ext/makiri/lexbor_compat/`).
19
18
  * CSS selector support via Lexbor
20
19
  * Supports Lexbor-backed standard CSS selector querying, including `:is`/`:where`/`:has`
21
20
  * Native XPath 1.0 engine
22
21
  * XPath is parsed and evaluated by Makiri's own engine, written from scratch.
23
22
  * Makiri does not depend on libxml2 for parsing, DOM representation, or XPath evaluation.
24
- * Native XML 1.0 reader + in-place editor (`Makiri::XML`)
23
+ * Native XML 1.0 parser
25
24
  * A strict, non-validating, fail-closed parser with its own node arena (not
26
25
  Lexbor's HTML DOM), queried through the same native XPath engine, with
27
26
  in-place tree edits (attributes, content, rename, remove).
@@ -81,21 +80,6 @@ ctx.evaluate('//p[@class=$cls]').first.text # => "Hello"
81
80
 
82
81
  ### XML (with in-place editing)
83
82
 
84
- `Makiri::XML(source)` parses **XML 1.0** with a native, strict,
85
- well-formedness-checking parser (no libxml2) and queries it through the same
86
- native XPath 1.0 engine. `source` is a String or any object responding to
87
- `#read` (an `IO` / `File` / `StringIO`); read a non-UTF-8 file in binary mode
88
- (`File.binread`) so its encoding is autodetected. Element-name case and namespaces are preserved. It is
89
- **fail-closed**: malformed input, a duplicate attribute, or a
90
- non-`1.0` version declaration raises `Makiri::XML::SyntaxError`, and operations
91
- XML does not support raise `NotImplementedError` rather than returning a wrong
92
- result. The tree supports in-place edits and building new subtrees (see below).
93
- A `<!DOCTYPE ...>` is recognized but its **DTD is not processed** (no
94
- entity/element declarations are loaded, no external subset is fetched) - so a
95
- DTD-defined entity reference stays an undefined-entity error and **XXE /
96
- billion-laughs are structurally impossible**. The doctype's name and identifiers
97
- are still readable:
98
-
99
83
  ```ruby
100
84
  doc = Makiri::XML(<<~XML)
101
85
  <feed xmlns="http://www.w3.org/2005/Atom">
@@ -119,10 +103,13 @@ el = doc.at_xpath("//a:entry", ns)
119
103
  el.local_name # => "entry"
120
104
  el.namespace_uri # => "http://www.w3.org/2005/Atom"
121
105
 
122
- doc.css("entry") # raises NotImplementedError (use #xpath)
106
+ # CSS selectors work too (lowered to the native XPath engine): a bare type
107
+ # selector binds to the document's default namespace, so this just works.
108
+ doc.css("entry").length # => 2
109
+ doc.css("feed > entry").map { |e| e.at_css("title").text } # => ["Hello", "World"]
123
110
 
124
111
  # Serialize back to XML
125
- doc.to_xml # => "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<feed ...>...</feed>\n"
112
+ doc.to_xml # => "<?xml version=\"1.0\"?>\n<feed ...>...</feed>\n"
126
113
  doc.at_xpath("//a:entry", ns).to_xml # => "<entry><title>Hello</title></entry>" (no declaration)
127
114
  doc.to_xml(pretty: true) # indented, element-only content
128
115
 
@@ -134,22 +121,7 @@ dtd.external_id # => "-//W3C//DTD XHTML 1.0//EN" (alias: #public_id)
134
121
  dtd.system_id # => "x.dtd"
135
122
  ```
136
123
 
137
- Comments and processing instructions in the prolog/epilog are document-node
138
- children (reachable via `//comment()` / `//processing-instruction()` and
139
- `#children`), and adjacent CDATA is coalesced - matching libxml2 and the XPath
140
- data model. `#to_xml` / `#to_s` serialize the tree back to XML (`pretty: true`,
141
- or `indent: n`, for indented element-only content; `encoding: "Shift_JIS"` to
142
- transcode, with a hex character reference for anything the encoding can't hold);
143
- a `Document#to_xml` adds the declaration and the DOCTYPE. `#canonicalize` emits
144
- Inclusive Canonical XML 1.0 (for XML signatures; `comments: true` to keep
145
- comments), byte-identical to libxml2. CSS is intentionally unavailable for XML
146
- (Lexbor's selector engine lower-cases names, which breaks XML case/namespace
147
- matching) - use XPath.
148
-
149
- The tree supports in-place mutation - every edit validates its input (names as
150
- XML 1.0 QNames, values as XML Char) so the tree stays serializable to
151
- well-formed XML, and a removed node is detached, never freed, so a live wrapper
152
- that aliases it stays usable:
124
+ The tree supports in-place mutation.
153
125
 
154
126
  ```ruby
155
127
  doc = Makiri::XML(%(<feed xmlns:dc="urn:dc"><entry id="1">Hi</entry><draft/></feed>))
@@ -165,15 +137,9 @@ doc.at_xpath("//draft").remove
165
137
  doc.root.to_xml # => "<feed xmlns:dc=\"urn:dc\"><post dc:k=\"v\">Bye</post></feed>"
166
138
  ```
167
139
 
168
- New subtrees can be built too - `Document#create_element` (and
169
- `#create_text_node` / `#create_comment` / `#create_cdata` /
170
- `#create_processing_instruction`) make detached nodes, and `#add_child` / `<<`,
171
- `#add_previous_sibling` / `#before`, `#add_next_sibling` / `#after`, `#replace`
172
- link them. A node's namespace is resolved against its position **at insertion**
173
- (a prefixed name binds to the in-scope `xmlns`, an unprefixed element to the
174
- default namespace), so the same tree results whether you set names before or
175
- after attaching; an unbound prefix in the live tree fails closed. A node from
176
- another document is **deep-copied** into the target (the source is untouched):
140
+ XML subtrees can be built with `Document#create_element` and related node factory methods,
141
+ then inserted with `#add_child`, `#before`, `#after`, or `#replace`;
142
+ namespaces are resolved at insertion time, and cross-document nodes are deep-copied.
177
143
 
178
144
  ```ruby
179
145
  doc = Makiri::XML(%(<feed xmlns="urn:a" xmlns:dc="urn:dc"/>))
@@ -185,34 +151,29 @@ doc.root.add_child(entry)
185
151
  doc.to_xml # => "...<entry dc:id=\"42\"><title>Hello</title></entry>..."
186
152
  ```
187
153
 
188
- Supported edits: `#[]=`, `#delete` / `#remove_attribute`, `#content=`, `#name=`,
189
- `#remove` / `#unlink`, the factories above, and `#add_child` / `<<` /
190
- `#before` / `#after` / `#replace`. Insertion takes a `Makiri::XML` node or a
191
- `DocumentFragment` (its children are spliced in); a fragment is parsed by
192
- `Document#fragment(str)` (bound to the document) or `DocumentFragment.parse(str)`
193
- (standalone). A raw string handed straight to `#add_child` is **not** accepted -
194
- parse it into a fragment first. A whole document can also be built from scratch
195
- with `XML::Document.new` + `#root=` and the factories.
196
-
197
- The character encoding is autodetected (XML 1.0 Appendix F): a byte-order mark or
198
- the `<?xml encoding="..."?>` declaration selects it, so raw bytes (`File.binread`)
199
- in UTF-16, Shift_JIS, etc. parse correctly and a leading BOM is stripped. A
200
- concrete String encoding stays authoritative - a BOM or declaration that
201
- contradicts it is a fatal error, not a silent mis-decode.
202
-
203
- Parsing is DoS-bounded by a single arena memory ceiling (default 256 MiB,
204
- counting node structs and text), which fits every standard document. Raise it
205
- per parse for an unusually large one:
154
+ `Makiri::XML::Builder` is the Nokogiri-compatible DSL over those factories.
206
155
 
207
156
  ```ruby
208
- Makiri::XML(huge_xml, max_bytes: 512 * 1024 * 1024) # also Makiri::XML::Document.parse(..., max_bytes:)
157
+ builder = Makiri::XML::Builder.new do |xml|
158
+ xml.feed("xmlns" => "http://www.w3.org/2005/Atom", "xmlns:dc" => "urn:dc") do
159
+ xml.title("Example Feed")
160
+ xml.entry("dc:id" => "1") do
161
+ xml.title("First")
162
+ xml.summary { xml.cdata("raw <b>html</b>") }
163
+ end
164
+ end
165
+ end
166
+
167
+ builder.to_xml # the whole document (with XML declaration)
168
+ builder.doc # the Makiri::XML::Document being built
209
169
  ```
210
170
 
211
- Conformance is held by a regression net: the **W3C XML Conformance Test Suite**
212
- (`rake conformance:xmlconf`, 100% of the in-scope non-validating XML-1.0 tests),
213
- an XPath 1.0 differential vs Nokogiri/libxml2 (`rake conformance:xpath_xml`), and
214
- property-based testing that requires Makiri's tree to be byte-identical to
215
- Nokogiri's over generated documents (`rake conformance:xml_pbt`).
171
+ XML parsing is bounded by an arena memory limit, 256 MiB by default,
172
+ and unusually large documents can raise it with `max_bytes:`.
173
+
174
+ ```ruby
175
+ Makiri::XML(huge_xml, max_bytes: 512 * 1024 * 1024) # also Makiri::XML::Document.parse(..., max_bytes:)
176
+ ```
216
177
 
217
178
  ## Non-goals (v1.0)
218
179
 
@@ -271,14 +232,48 @@ Detailed, test-backed notes live in `spec/conformance/README.md`.
271
232
 
272
233
  ### CSS
273
234
 
274
- * jQuery/Nokogiri CSS extensions are not supported (`:contains`, `:gt`, `:lt`, `:eq`, `:first`, ...)
275
- * Makiri uses Lexbor's standards-only selector engine.
276
- Use XPath (`xpath("//p[contains(., 'x')]")`) or Enumerable (`css('li')[1]`).
235
+ * Most jQuery/Nokogiri CSS extensions are not supported (`:gt`, `:lt`, `:eq`, `:first`, ...)
236
+ * Makiri uses Lexbor's selector engine, which is standards-based apart from one
237
+ text-containment extension. Use XPath (`xpath("//p[contains(., 'x')]")`) or
238
+ Enumerable (`css('li')[1]`) for the rest.
277
239
  Standard Level-4 selectors (`:is` / `:where` / `:has`) are supported, including some that Nokogiri rejects.
240
+ * `:lexbor-contains("text")` **is** supported (on both HTML and XML) - Lexbor's
241
+ spelling of the jQuery `:contains()` substring filter, matching an element
242
+ whose text contains the string; append ` i` (`:lexbor-contains("text" i)`)
243
+ for an ASCII case-insensitive match. (Nokogiri's name `:contains` is not an
244
+ alias.) Like Lexbor's matcher, it tests the element's **immediate child text
245
+ nodes** (not the deep string-value), so HTML and XML agree; on XML it lowers
246
+ to XPath `child::text()[contains(., "text")]`.
247
+ * Untyped `:*-of-type` (`:first-of-type`, `:nth-of-type(an+b)`, ... with no type
248
+ selector) is supported and correct on both HTML and XML - the "type" is the
249
+ element's own expanded name.
250
+ * Nokogiri (XML and HTML5) mistranslates these to first-/only-child
251
+ (`//*[position()=1]` / `//*[last()=1]`), so it under-matches; Makiri matches
252
+ Lexbor's HTML matcher.
278
253
  * Type selectors are ASCII case-insensitive (CSS-correct for HTML; `LI` matches `<li>`)
279
254
  * `Nokogiri::HTML5` is case-sensitive there.
280
- * Class/ID selectors are matched case-insensitively regardless of quirks mode (a Lexbor behaviour)
281
- * In a no-quirks document browsers and `Nokogiri::HTML5` match them case-sensitively.
255
+
256
+ ## Conformance
257
+
258
+ The XPath engine and XML parser are original code, so their correctness is verified by
259
+ differential and standards harnesses in `spec/conformance/`.
260
+ The HTML XPath and CSS suites are differentials against **`Nokogiri::HTML5`**
261
+ (Gumbo / WHATWG, never libxml2's non-conformant HTML4 parser): both sides parse
262
+ HTML5, so the DOM is isomorphic and results are compared node-for-node. HTML
263
+ parsing itself is checked against the WHATWG html5lib-tests corpus, and
264
+ XPath-over-HTML semantics additionally against browsers via a WPT port.
265
+ See also [`spec/conformance/README.md`](spec/conformance/README.md).
266
+
267
+ | Suite | Input | Oracle | `rake` task |
268
+ |---|---|---|---|
269
+ | HTML parsing | HTML | WHATWG html5lib-tests (expected-tree corpus) | `conformance:html5` |
270
+ | XPath 1.0 | HTML | `Nokogiri::HTML5` (libxml2 XPath) — differential | `conformance:xpath` |
271
+ | XPath over HTML | HTML | browsers (WPT `domxpath`, hand-ported; runs under `rake spec`) | — |
272
+ | CSS selectors | HTML | `Nokogiri::HTML5#css` — differential | `conformance:css` |
273
+ | Well-formedness | XML | W3C XML Conformance Test Suite | `conformance:xmlconf` |
274
+ | XPath 1.0 | XML | `Nokogiri::XML` — differential | `conformance:xpath_xml` |
275
+ | Parsed tree (property-based) | XML | `Nokogiri::XML` — differential | `conformance:xml_pbt` |
276
+ | CSS selectors | XML | `Nokogiri::XML` — differential | `conformance:css_xml` |
282
277
 
283
278
  ## Requirements
284
279
 
@@ -295,6 +290,15 @@ bundle exec rake compile
295
290
  bundle exec rake spec
296
291
  ```
297
292
 
293
+ ### Vendored Lexbor version
294
+
295
+ `vendor/lexbor` is pinned to `3a2d595` (`v3.0.0-25`), an untagged `master`
296
+ commit, for fixes that v3.0.0 lacks: two upstreamed CSS-selector fixes (class/ID
297
+ case-sensitivity in quirks mode, and prefix-less type-selector namespacing), a
298
+ heap-overflow fix in the `:lexbor-contains()` parser, and other post-v3.0.0
299
+ bugfixes. Lexbor stays vanilla; we return to a release tag once one ships after
300
+ v3.0.0. See `CLAUDE.md` for details.
301
+
298
302
  ## License
299
303
 
300
304
  Apache License 2.0. See [LICENSE](LICENSE) and [NOTICE](NOTICE).
data/Rakefile CHANGED
@@ -4,6 +4,7 @@ require "bundler/gem_tasks"
4
4
  require "rspec/core/rake_task"
5
5
  require "rake/extensiontask"
6
6
  require "shellwords"
7
+ require "tmpdir"
7
8
 
8
9
  GEMSPEC = Gem::Specification.load("makiri.gemspec")
9
10
 
@@ -35,6 +36,59 @@ RSpec::Core::RakeTask.new(:spec)
35
36
 
36
37
  task default: %i[compile spec]
37
38
 
39
+ # `rake spec:valgrind` - run the spec suite under Valgrind memcheck via
40
+ # ruby_memcheck (Linux CI; see .github/workflows/valgrind.yml). The gem ships
41
+ # Ruby's own Valgrind suppression files (matched by Ruby version) and filters
42
+ # the report down to errors whose stack touches our extension, so we no longer
43
+ # have to fetch ruby.supp from ruby/ruby (that path was removed upstream).
44
+ #
45
+ # We keep this job's historical contract: catch *use of uninitialised values*
46
+ # and *invalid reads/writes* (incl. intra-arena overflows) - NOT leaks (leak
47
+ # detection stays with `rake leaks`). So we override ruby_memcheck's defaults,
48
+ # which disable undef-value errors and turn on full leak-check.
49
+ #
50
+ # `filter_all_errors: true` is essential: by default ruby_memcheck only applies
51
+ # its "stack must touch the makiri binary" filter to *leak*-kind errors
52
+ # (`ValgrindError#should_filter? = filter_all_errors? || kind_leak?`), so every
53
+ # uninitialised-value report is surfaced regardless of where it comes from. Ruby's
54
+ # conservative GC (machine-context scan, RVALUE flag aging, free-at-exit teardown)
55
+ # legitimately reads uninitialised words, and the bundled ruby.supp does not cover
56
+ # the free-at-exit / subprocess stacks the `:isolated` specs spin up under
57
+ # `--trace-children=yes` - which buried the run in ~3500 pure-Ruby false positives.
58
+ # Filtering all error kinds by the same binary-touch rule keeps the gate scoped to
59
+ # *our* code: a real uninit/invalid access in mkr_*/Lexbor still has a makiri frame
60
+ # and is still reported.
61
+ #
62
+ # Guarded: ruby_memcheck lives in the optional :valgrind bundler group, so a
63
+ # normal `bundle exec rake` (without that group) must not fail to load.
64
+ begin
65
+ require "ruby_memcheck"
66
+ require "ruby_memcheck/rspec/rake_task"
67
+
68
+ RubyMemcheck.config(
69
+ binary_name: "makiri",
70
+ filter_all_errors: true, # apply the binary-touch filter to ALL error kinds,
71
+ # not just leaks (see note above) - drops Ruby's own
72
+ # GC uninitialised-value noise, keeps mkr_* reports
73
+ valgrind_options: [
74
+ "--num-callers=50",
75
+ "--error-limit=no",
76
+ "--trace-children=yes", # spec processes may fork
77
+ "--undef-value-errors=yes", # the point of this job (ruby_memcheck defaults to =no)
78
+ "--track-origins=yes", # report where an uninitialised value came from
79
+ "--leak-check=no", # leaks are `rake leaks`' job, not this one
80
+ ],
81
+ )
82
+
83
+ namespace :spec do
84
+ desc "Run the spec suite under Valgrind memcheck (ruby_memcheck; needs the " \
85
+ ":valgrind bundler group and the valgrind binary)"
86
+ RubyMemcheck::RSpec::RakeTask.new(valgrind: :compile)
87
+ end
88
+ rescue LoadError
89
+ # ruby_memcheck not installed (optional :valgrind group absent) - skip the task.
90
+ end
91
+
38
92
  namespace :security do
39
93
  desc "Run mechanical C safety lint over ext/makiri"
40
94
  task :clint do
@@ -81,6 +135,17 @@ def asan_runtime_path
81
135
  nil
82
136
  end
83
137
 
138
+ def libfuzzer_available?
139
+ cxx = ENV["CXX"].to_s.empty? ? "clang++" : ENV["CXX"]
140
+ Dir.mktmpdir("makiri-libfuzzer-check") do |dir|
141
+ src = File.join(dir, "check.cc")
142
+ exe = File.join(dir, "check")
143
+ File.write(src, "extern \"C\" int LLVMFuzzerTestOneInput(const unsigned char*, unsigned long){return 0;}\n")
144
+ return system(cxx, "-fsanitize=fuzzer,address,undefined", src, "-o", exe,
145
+ out: File::NULL, err: File::NULL)
146
+ end
147
+ end
148
+
84
149
  # The compiled extension, and whether it carries sanitizer instrumentation, so
85
150
  # `fuzz:sanitize SKIP_BUILD=1` can refuse to run a plain (non-ASan) build.
86
151
  def ext_bundle_path
@@ -116,6 +181,65 @@ task :sanitize do
116
181
  sh(env, "#{FileUtils::RUBY} -S rspec")
117
182
  end
118
183
 
184
+ desc "Measure C coverage of OUR sources (clang source-based) over the spec suite. " \
185
+ "Prints an llvm-cov region+branch report (excludes vendored Lexbor) and writes " \
186
+ "a line-level detail file to tmp/coverage/show.txt."
187
+ task :coverage do
188
+ require "fileutils"
189
+ dir = File.expand_path("tmp/coverage")
190
+ FileUtils.rm_rf(dir)
191
+ FileUtils.mkdir_p(dir)
192
+
193
+ # Instrument only our sources (Lexbor is built separately, uninstrumented).
194
+ sh({ "MAKIRI_COVERAGE" => "1" }, "#{FileUtils::RUBY} -S rake clean compile")
195
+ # %p -> PID, so any forked spec process gets its own raw profile.
196
+ sh({ "LLVM_PROFILE_FILE" => File.join(dir, "makiri-%p.profraw") }, "#{FileUtils::RUBY} -S rspec")
197
+
198
+ profdata = File.join(dir, "makiri.profdata")
199
+ bundle = "lib/makiri/makiri.bundle"
200
+ ignore = "(vendor/lexbor|/usr/|/Library/|ruby/|rubygems)"
201
+ sh "xcrun llvm-profdata merge -sparse #{dir}/*.profraw -o #{profdata}"
202
+ sh "xcrun llvm-cov report #{bundle} -instr-profile=#{profdata} " \
203
+ "-ignore-filename-regex='#{ignore}' -show-branch-summary"
204
+ show = File.join(dir, "show.txt")
205
+ sh "xcrun llvm-cov show #{bundle} -instr-profile=#{profdata} " \
206
+ "-ignore-filename-regex='#{ignore}' -show-branches=count -show-line-counts-or-regions > #{show}"
207
+ puts "\ncoverage line/branch detail: #{show}"
208
+ puts "(coverage build left in place; run `rake clean compile` to restore a normal build)"
209
+ end
210
+
211
+ desc "Like :sanitize but also builds the vendored Lexbor under ASan, so overflows " \
212
+ "INSIDE Lexbor's mraw arena are caught (slow: full Lexbor rebuild). Runs the " \
213
+ "spec suite, or FUZZ_ARGS via the fuzzer when set."
214
+ task "sanitize:lexbor" do
215
+ sanitize = ENV["MAKIRI_SANITIZE"] || "address,undefined"
216
+ sanitize.include?("address") or
217
+ abort "sanitize:lexbor needs an address build (MAKIRI_SANITIZE must include 'address')"
218
+
219
+ # MAKIRI_SANITIZE_LEXBOR makes extconf build Lexbor with -DLEXBOR_BUILD_WITH_ASAN
220
+ # (enabling its mraw poisoning); the build-mode stamp auto-rebuilds Lexbor on the
221
+ # plain<->asan switch, so no manual clean:lexbor is needed before or after.
222
+ build_env = { "MAKIRI_SANITIZE" => sanitize, "MAKIRI_SANITIZE_LEXBOR" => "1" }
223
+ sh(build_env, "#{FileUtils::RUBY} -S rake clean compile")
224
+
225
+ env = {
226
+ "ASAN_OPTIONS" => "detect_leaks=0:detect_container_overflow=0:" \
227
+ "detect_odr_violation=0:abort_on_error=1:halt_on_error=1",
228
+ "UBSAN_OPTIONS" => "print_stacktrace=1:halt_on_error=1",
229
+ }
230
+ runtime = asan_runtime_path or
231
+ abort "sanitize:lexbor: could not locate the ASan runtime for #{RbConfig::CONFIG['CC']}"
232
+ preload = RbConfig::CONFIG["target_os"] =~ /darwin/ ? "DYLD_INSERT_LIBRARIES" : "LD_PRELOAD"
233
+ env[preload] = runtime
234
+ puts "sanitize:lexbor: preloading #{runtime} via #{preload}"
235
+
236
+ if ENV["FUZZ_ARGS"]
237
+ sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}")
238
+ else
239
+ sh(env, "#{FileUtils::RUBY} -S rspec")
240
+ end
241
+ end
242
+
119
243
  desc "Run the robustness fuzzer (override options via FUZZ_ARGS)"
120
244
  task fuzz: :compile do
121
245
  sh "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}"
@@ -131,6 +255,25 @@ task "fuzz:mutate": :compile do
131
255
  sh "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb --target mutate #{ENV['FUZZ_ARGS']}"
132
256
  end
133
257
 
258
+ desc "Malloc-leak gate (macOS `leaks`): fails on per-call leak stacks through the ext"
259
+ task leaks: :compile do
260
+ # ASan runs with detect_leaks=0 (Ruby/Lexbor are uninstrumented), so plain
261
+ # leaks are otherwise never machine-checked; see script/check_leaks.rb.
262
+ sh "#{FileUtils::RUBY} script/check_leaks.rb"
263
+ end
264
+
265
+ desc "OOM-injection gate: rebuild with MAKIRI_ALLOC_INJECT=1 and sweep every core " \
266
+ "allocation site, verifying each failure fails closed (clean raise or " \
267
+ "baseline-identical result, never truncated output)"
268
+ task :oom do
269
+ # The hook is compiled in only under MAKIRI_ALLOC_INJECT=1 (zero overhead in
270
+ # a normal build), so this needs its own rebuild; see
271
+ # script/check_alloc_failures.rb for the protocol and the property gated.
272
+ sh({ "MAKIRI_ALLOC_INJECT" => "1" }, "#{FileUtils::RUBY} -S rake clean compile")
273
+ sh "#{FileUtils::RUBY} -Ilib script/check_alloc_failures.rb"
274
+ puts "(injection build left in place; run `rake clean compile` to restore a normal build)"
275
+ end
276
+
134
277
  desc "Run the performance benchmark (Makiri vs Nokogiri reference)"
135
278
  task bench: :compile do
136
279
  # Run outside the bundle so the bench-only gems (nokogiri, benchmark-ips)
@@ -190,10 +333,26 @@ namespace :conformance do
190
333
  sh "#{FileUtils::RUBY} -Ilib spec/conformance/css_diff.rb #{ENV['CSS_ARGS']}"
191
334
  end
192
335
  end
336
+
337
+ desc "XML CSS-selector differential conformance: Makiri::XML vs Nokogiri::XML"
338
+ task css_xml: :compile do
339
+ Bundler.with_unbundled_env do
340
+ sh "#{FileUtils::RUBY} -Ilib spec/conformance/xml_css_diff.rb #{ENV['CSS_XML_ARGS']}"
341
+ end
342
+ end
343
+
344
+ desc "XML Builder differential conformance: Makiri::XML::Builder vs Nokogiri::XML::Builder"
345
+ task builder: :compile do
346
+ Bundler.with_unbundled_env do
347
+ sh "#{FileUtils::RUBY} -Ilib spec/conformance/builder_diff.rb #{ENV['BUILDER_ARGS']}"
348
+ end
349
+ end
193
350
  end
194
351
 
195
352
  desc "Run all conformance suites"
196
- task conformance: %w[conformance:html5 conformance:xpath conformance:css conformance:xmlconf conformance:xpath_xml]
353
+ task conformance: %w[conformance:html5 conformance:xpath conformance:css
354
+ conformance:xmlconf conformance:xpath_xml conformance:css_xml
355
+ conformance:builder]
197
356
 
198
357
  namespace :fuzz do
199
358
  # Run the fuzzer under the sanitizer. Toggles (all via env):
@@ -233,14 +392,46 @@ namespace :fuzz do
233
392
  if ENV["FUZZ_ARGS"]
234
393
  sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}")
235
394
  else
236
- iso = %w[1 true yes].include?(ENV["FAST"].to_s.downcase) ? "" : "--isolated"
395
+ iso = %w[1 true yes].include?(ENV["ISOLATED"].to_s.downcase) ? "--isolated" : ""
237
396
  secs = ENV["FUZZ_TIME"] || "90"
238
397
  # Cover every surface under the sanitizer: the query engine (XPath/CSS over
239
398
  # parsed fixtures), the XML parser (hostile documents), and the XML mutation
240
399
  # surface (random edit sequences + invariants).
241
- ["", "--target xml", "--target mutate"].each do |surface|
400
+ ["", "--target xml", "--target mutate", "--target xmlcss"].each do |surface|
242
401
  sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{surface} #{iso} --time #{secs}".squeeze(" ").strip)
243
402
  end
244
403
  end
245
404
  end
405
+
406
+ # Coverage-guided libFuzzer harnesses for the pure-C surfaces (XML parser and
407
+ # XPath compile+eval). These are Ruby-free standalone binaries, so they run
408
+ # directly under clang's libFuzzer driver without the Ruby interpreter.
409
+ # They complement the Ruby-based robustness fuzzer by providing coverage
410
+ # feedback and 2-3 orders of magnitude faster execution for the C core.
411
+ desc "Build the libFuzzer harnesses (requires clang with libFuzzer support)"
412
+ task :libfuzzer_build => :compile do
413
+ libfuzzer_available? or
414
+ abort "fuzz:libfuzzer_build: #{ENV['CXX'] || 'clang++'} cannot link libFuzzer. " \
415
+ "Install an LLVM clang with libFuzzer support and run with " \
416
+ "CLANG=/path/to/clang CXX=/path/to/clang++."
417
+ Dir.chdir("ext/makiri/fuzz") do
418
+ sh "make clean"
419
+ sh "make all"
420
+ end
421
+ end
422
+
423
+ desc "Run the libFuzzer coverage-guided harnesses (default: 60s per target)"
424
+ task :libfuzzer => :libfuzzer_build do
425
+ time = ENV["FUZZ_TIME"] || "60"
426
+ Dir.chdir("ext/makiri/fuzz") do
427
+ sh "mkdir -p corpus/xml corpus/xpath"
428
+ sh "./xml_fuzz -max_total_time=#{time} -max_len=4096 corpus/xml"
429
+ sh "./xpath_fuzz -max_total_time=#{time} -max_len=4096 corpus/xpath"
430
+ end
431
+ end
432
+ end
433
+
434
+ desc "Show code statistics"
435
+ task :stats do
436
+ sh "tokei lib ext spec script --exclude tmp --exclude vendor --exclude docs"
246
437
  end