makiri 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/conformance.yml +22 -0
  3. data/.github/workflows/libfuzzer.yml +83 -0
  4. data/.github/workflows/security.yml +88 -3
  5. data/.github/workflows/valgrind.yml +138 -0
  6. data/CHANGELOG.md +127 -2
  7. data/README.md +95 -77
  8. data/Rakefile +207 -3
  9. data/ext/makiri/bridge/ruby_string.c +159 -80
  10. data/ext/makiri/core/mkr_alloc.c +40 -3
  11. data/ext/makiri/core/mkr_alloc.h +28 -5
  12. data/ext/makiri/core/mkr_buf.c +13 -3
  13. data/ext/makiri/core/mkr_buf.h +80 -5
  14. data/ext/makiri/core/mkr_core.c +143 -0
  15. data/ext/makiri/core/mkr_core.h +10 -1
  16. data/ext/makiri/core/mkr_span.h +186 -0
  17. data/ext/makiri/core/mkr_utf8.c +101 -0
  18. data/ext/makiri/core/mkr_utf8.h +88 -0
  19. data/ext/makiri/{lexbor_compat → dom_adapter}/compat.h +4 -4
  20. data/ext/makiri/{lexbor_compat → dom_adapter}/compat_internal.h +1 -1
  21. data/ext/makiri/dom_adapter/cross_import.c +434 -0
  22. data/ext/makiri/dom_adapter/cross_import.h +35 -0
  23. data/ext/makiri/{lexbor_compat → dom_adapter}/source_loc.c +14 -16
  24. data/ext/makiri/{lexbor_compat → dom_adapter}/text_index.c +1 -1
  25. data/ext/makiri/{lexbor_compat → dom_adapter}/utf8_input.c +5 -78
  26. data/ext/makiri/extconf.rb +104 -9
  27. data/ext/makiri/fuzz/Makefile +95 -0
  28. data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
  29. data/ext/makiri/fuzz/xml_fuzz.c +24 -0
  30. data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
  31. data/ext/makiri/glue/cross_import.h +30 -0
  32. data/ext/makiri/glue/glue.h +9 -1
  33. data/ext/makiri/glue/ruby_doc.c +31 -27
  34. data/ext/makiri/glue/ruby_html_css.c +58 -12
  35. data/ext/makiri/glue/ruby_html_mutate.c +17 -6
  36. data/ext/makiri/glue/ruby_html_node.c +4 -33
  37. data/ext/makiri/glue/ruby_lexbor_css.c +462 -0
  38. data/ext/makiri/glue/ruby_node.c +53 -0
  39. data/ext/makiri/glue/ruby_xml.c +228 -17
  40. data/ext/makiri/glue/ruby_xml_node.c +133 -61
  41. data/ext/makiri/glue/ruby_xpath.c +20 -5
  42. data/ext/makiri/makiri.c +48 -0
  43. data/ext/makiri/makiri.h +5 -0
  44. data/ext/makiri/xml/mkr_xml.h +7 -3
  45. data/ext/makiri/xml/mkr_xml_chars.c +89 -97
  46. data/ext/makiri/xml/mkr_xml_index.c +169 -0
  47. data/ext/makiri/xml/mkr_xml_index.h +48 -0
  48. data/ext/makiri/xml/mkr_xml_mutate.c +220 -168
  49. data/ext/makiri/xml/mkr_xml_mutate.h +24 -0
  50. data/ext/makiri/xml/mkr_xml_node.c +147 -15
  51. data/ext/makiri/xml/mkr_xml_node.h +71 -6
  52. data/ext/makiri/xml/mkr_xml_tree.c +246 -174
  53. data/ext/makiri/xpath/mkr_css.c +1023 -0
  54. data/ext/makiri/xpath/mkr_css.h +65 -0
  55. data/ext/makiri/xpath/mkr_xpath.c +65 -0
  56. data/ext/makiri/xpath/mkr_xpath.h +18 -1
  57. data/ext/makiri/xpath/mkr_xpath_eval_body.h +383 -90
  58. data/ext/makiri/xpath/mkr_xpath_funcs_body.h +249 -231
  59. data/ext/makiri/xpath/mkr_xpath_internal.h +89 -9
  60. data/ext/makiri/xpath/mkr_xpath_lex.c +94 -124
  61. data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +6 -3
  62. data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
  63. data/ext/makiri/xpath/mkr_xpath_parse.c +79 -90
  64. data/ext/makiri/xpath/mkr_xpath_shared.c +40 -24
  65. data/ext/makiri/xpath/mkr_xpath_value_body.h +50 -24
  66. data/lib/makiri/cdata_section.rb +1 -3
  67. data/lib/makiri/comment.rb +1 -3
  68. data/lib/makiri/document.rb +8 -0
  69. data/lib/makiri/element.rb +1 -3
  70. data/lib/makiri/html/document.rb +11 -12
  71. data/lib/makiri/html/node_methods.rb +0 -1
  72. data/lib/makiri/node_set.rb +14 -9
  73. data/lib/makiri/processing_instruction.rb +8 -2
  74. data/lib/makiri/text.rb +1 -3
  75. data/lib/makiri/version.rb +1 -1
  76. data/lib/makiri/xml/builder.rb +271 -0
  77. data/lib/makiri/xml/node_methods.rb +47 -0
  78. data/lib/makiri/xpath_context.rb +12 -4
  79. data/lib/makiri.rb +1 -0
  80. data/script/check_alloc_failures.rb +266 -0
  81. data/script/check_c_safety.rb +45 -2
  82. data/script/check_c_safety_allowlist.yml +27 -5
  83. data/script/check_leaks.rb +64 -0
  84. data/script/leaks_harness.rb +71 -0
  85. data/suppressions/ruby.supp +140 -0
  86. data/vendor/lexbor/CMakeLists.txt +6 -0
  87. data/vendor/lexbor/README.md +12 -0
  88. data/vendor/lexbor/config.cmake +1 -1
  89. data/vendor/lexbor/source/lexbor/core/base.h +1 -1
  90. data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
  91. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
  92. data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
  93. data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
  94. data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
  95. data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
  96. data/vendor/lexbor/source/lexbor/html/base.h +1 -1
  97. data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
  98. data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
  99. data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
  100. data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
  101. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
  102. data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
  103. data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
  104. data/vendor/lexbor/source/lexbor/url/base.h +1 -1
  105. data/vendor/lexbor/source/lexbor/url/url.c +5 -2
  106. data/vendor/lexbor/source/lexbor/url/url.h +9 -0
  107. data/vendor/lexbor/version +1 -1
  108. metadata +31 -8
  109. /data/ext/makiri/{lexbor_compat → dom_adapter}/dom_index.c +0 -0
  110. /data/ext/makiri/{lexbor_compat → dom_adapter}/post_parse.c +0 -0
data/README.md CHANGED
@@ -1,8 +1,9 @@
1
1
  # Makiri
2
2
 
3
- Standards-oriented HTML5/XML parsing, CSS selector querying, XPath 1.0 querying,
4
- and a native XML 1.0 reader/editor for Ruby, powered by Lexbor and a native XPath
5
- engine - with no libxml2 dependency.
3
+ Makiri is a Ruby library for parsing and querying HTML and XML documents.
4
+
5
+ It uses [Lexbor](https://lexbor.com/) for HTML parsing and CSS selector matching, and includes a built-in native XPath 1.0 engine and XML 1.0 parser.
6
+ Makiri does not depend on libxml2.
6
7
 
7
8
  > [!WARNING]
8
9
  > Status: early release. APIs and behavior may change before v1.0.
@@ -14,14 +15,12 @@ XPath 1.0 evaluation in its own native engine, with no libxml2 dependency.
14
15
 
15
16
  * HTML5 parsing via [Lexbor](https://lexbor.com)
16
17
  * Makiri uses Lexbor as the parsing backend and provides a Ruby-facing DOM/query layer.
17
- * Lexbor-specific behavior is isolated in a thin compatibility layer
18
- (`ext/makiri/lexbor_compat/`).
19
18
  * CSS selector support via Lexbor
20
19
  * Supports Lexbor-backed standard CSS selector querying, including `:is`/`:where`/`:has`
21
20
  * Native XPath 1.0 engine
22
21
  * XPath is parsed and evaluated by Makiri's own engine, written from scratch.
23
22
  * Makiri does not depend on libxml2 for parsing, DOM representation, or XPath evaluation.
24
- * Native XML 1.0 reader + in-place editor (`Makiri::XML`)
23
+ * Native XML 1.0 parser
25
24
  * A strict, non-validating, fail-closed parser with its own node arena (not
26
25
  Lexbor's HTML DOM), queried through the same native XPath engine, with
27
26
  in-place tree edits (attributes, content, rename, remove).
@@ -81,21 +80,6 @@ ctx.evaluate('//p[@class=$cls]').first.text # => "Hello"
81
80
 
82
81
  ### XML (with in-place editing)
83
82
 
84
- `Makiri::XML(source)` parses **XML 1.0** with a native, strict,
85
- well-formedness-checking parser (no libxml2) and queries it through the same
86
- native XPath 1.0 engine. `source` is a String or any object responding to
87
- `#read` (an `IO` / `File` / `StringIO`); read a non-UTF-8 file in binary mode
88
- (`File.binread`) so its encoding is autodetected. Element-name case and namespaces are preserved. It is
89
- **fail-closed**: malformed input, a duplicate attribute, or a
90
- non-`1.0` version declaration raises `Makiri::XML::SyntaxError`, and operations
91
- XML does not support raise `NotImplementedError` rather than returning a wrong
92
- result. The tree supports in-place edits and building new subtrees (see below).
93
- A `<!DOCTYPE ...>` is recognized but its **DTD is not processed** (no
94
- entity/element declarations are loaded, no external subset is fetched) - so a
95
- DTD-defined entity reference stays an undefined-entity error and **XXE /
96
- billion-laughs are structurally impossible**. The doctype's name and identifiers
97
- are still readable:
98
-
99
83
  ```ruby
100
84
  doc = Makiri::XML(<<~XML)
101
85
  <feed xmlns="http://www.w3.org/2005/Atom">
@@ -119,10 +103,13 @@ el = doc.at_xpath("//a:entry", ns)
119
103
  el.local_name # => "entry"
120
104
  el.namespace_uri # => "http://www.w3.org/2005/Atom"
121
105
 
122
- doc.css("entry") # raises NotImplementedError (use #xpath)
106
+ # CSS selectors work too (lowered to the native XPath engine): a bare type
107
+ # selector binds to the document's default namespace, so this just works.
108
+ doc.css("entry").length # => 2
109
+ doc.css("feed > entry").map { |e| e.at_css("title").text } # => ["Hello", "World"]
123
110
 
124
111
  # Serialize back to XML
125
- doc.to_xml # => "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<feed ...>...</feed>\n"
112
+ doc.to_xml # => "<?xml version=\"1.0\"?>\n<feed ...>...</feed>\n"
126
113
  doc.at_xpath("//a:entry", ns).to_xml # => "<entry><title>Hello</title></entry>" (no declaration)
127
114
  doc.to_xml(pretty: true) # indented, element-only content
128
115
 
@@ -134,22 +121,7 @@ dtd.external_id # => "-//W3C//DTD XHTML 1.0//EN" (alias: #public_id)
134
121
  dtd.system_id # => "x.dtd"
135
122
  ```
136
123
 
137
- Comments and processing instructions in the prolog/epilog are document-node
138
- children (reachable via `//comment()` / `//processing-instruction()` and
139
- `#children`), and adjacent CDATA is coalesced - matching libxml2 and the XPath
140
- data model. `#to_xml` / `#to_s` serialize the tree back to XML (`pretty: true`,
141
- or `indent: n`, for indented element-only content; `encoding: "Shift_JIS"` to
142
- transcode, with a hex character reference for anything the encoding can't hold);
143
- a `Document#to_xml` adds the declaration and the DOCTYPE. `#canonicalize` emits
144
- Inclusive Canonical XML 1.0 (for XML signatures; `comments: true` to keep
145
- comments), byte-identical to libxml2. CSS is intentionally unavailable for XML
146
- (Lexbor's selector engine lower-cases names, which breaks XML case/namespace
147
- matching) - use XPath.
148
-
149
- The tree supports in-place mutation - every edit validates its input (names as
150
- XML 1.0 QNames, values as XML Char) so the tree stays serializable to
151
- well-formed XML, and a removed node is detached, never freed, so a live wrapper
152
- that aliases it stays usable:
124
+ The tree supports in-place mutation.
153
125
 
154
126
  ```ruby
155
127
  doc = Makiri::XML(%(<feed xmlns:dc="urn:dc"><entry id="1">Hi</entry><draft/></feed>))
@@ -165,15 +137,17 @@ doc.at_xpath("//draft").remove
165
137
  doc.root.to_xml # => "<feed xmlns:dc=\"urn:dc\"><post dc:k=\"v\">Bye</post></feed>"
166
138
  ```
167
139
 
168
- New subtrees can be built too - `Document#create_element` (and
169
- `#create_text_node` / `#create_comment` / `#create_cdata` /
170
- `#create_processing_instruction`) make detached nodes, and `#add_child` / `<<`,
171
- `#add_previous_sibling` / `#before`, `#add_next_sibling` / `#after`, `#replace`
172
- link them. A node's namespace is resolved against its position **at insertion**
173
- (a prefixed name binds to the in-scope `xmlns`, an unprefixed element to the
174
- default namespace), so the same tree results whether you set names before or
175
- after attaching; an unbound prefix in the live tree fails closed. A node from
176
- another document is **deep-copied** into the target (the source is untouched):
140
+ XML subtrees can be built with `Document#create_element` and related node factory methods,
141
+ then inserted with `#add_child`, `#before`, `#after`, or `#replace`;
142
+ namespaces are resolved at insertion time, and cross-document nodes are deep-copied.
143
+
144
+ `Document#import_node(node, deep = false)` brings a node into a document as a
145
+ detached copy, and works **across representations**: importing a `Makiri::HTML`
146
+ node into a `Makiri::XML::Document` (or vice versa) translates the subtree between
147
+ the two node representations, preserving namespaces (e.g. an inline `<svg>` keeps
148
+ the SVG namespace, HTML elements the XHTML namespace; custom namespaces are
149
+ preserved across both directions). An XML CDATA section has no HTML counterpart,
150
+ so importing one into an HTML document raises.
177
151
 
178
152
  ```ruby
179
153
  doc = Makiri::XML(%(<feed xmlns="urn:a" xmlns:dc="urn:dc"/>))
@@ -185,34 +159,29 @@ doc.root.add_child(entry)
185
159
  doc.to_xml # => "...<entry dc:id=\"42\"><title>Hello</title></entry>..."
186
160
  ```
187
161
 
188
- Supported edits: `#[]=`, `#delete` / `#remove_attribute`, `#content=`, `#name=`,
189
- `#remove` / `#unlink`, the factories above, and `#add_child` / `<<` /
190
- `#before` / `#after` / `#replace`. Insertion takes a `Makiri::XML` node or a
191
- `DocumentFragment` (its children are spliced in); a fragment is parsed by
192
- `Document#fragment(str)` (bound to the document) or `DocumentFragment.parse(str)`
193
- (standalone). A raw string handed straight to `#add_child` is **not** accepted -
194
- parse it into a fragment first. A whole document can also be built from scratch
195
- with `XML::Document.new` + `#root=` and the factories.
196
-
197
- The character encoding is autodetected (XML 1.0 Appendix F): a byte-order mark or
198
- the `<?xml encoding="..."?>` declaration selects it, so raw bytes (`File.binread`)
199
- in UTF-16, Shift_JIS, etc. parse correctly and a leading BOM is stripped. A
200
- concrete String encoding stays authoritative - a BOM or declaration that
201
- contradicts it is a fatal error, not a silent mis-decode.
202
-
203
- Parsing is DoS-bounded by a single arena memory ceiling (default 256 MiB,
204
- counting node structs and text), which fits every standard document. Raise it
205
- per parse for an unusually large one:
162
+ `Makiri::XML::Builder` is the Nokogiri-compatible DSL over those factories.
206
163
 
207
164
  ```ruby
208
- Makiri::XML(huge_xml, max_bytes: 512 * 1024 * 1024) # also Makiri::XML::Document.parse(..., max_bytes:)
165
+ builder = Makiri::XML::Builder.new do |xml|
166
+ xml.feed("xmlns" => "http://www.w3.org/2005/Atom", "xmlns:dc" => "urn:dc") do
167
+ xml.title("Example Feed")
168
+ xml.entry("dc:id" => "1") do
169
+ xml.title("First")
170
+ xml.summary { xml.cdata("raw <b>html</b>") }
171
+ end
172
+ end
173
+ end
174
+
175
+ builder.to_xml # the whole document (with XML declaration)
176
+ builder.doc # the Makiri::XML::Document being built
209
177
  ```
210
178
 
211
- Conformance is held by a regression net: the **W3C XML Conformance Test Suite**
212
- (`rake conformance:xmlconf`, 100% of the in-scope non-validating XML-1.0 tests),
213
- an XPath 1.0 differential vs Nokogiri/libxml2 (`rake conformance:xpath_xml`), and
214
- property-based testing that requires Makiri's tree to be byte-identical to
215
- Nokogiri's over generated documents (`rake conformance:xml_pbt`).
179
+ XML parsing is bounded by an arena memory limit, 256 MiB by default,
180
+ and callers can raise it for unusually large documents with `max_bytes:`.
181
+
182
+ ```ruby
183
+ Makiri::XML(huge_xml, max_bytes: 512 * 1024 * 1024) # also Makiri::XML::Document.parse(..., max_bytes:)
184
+ ```
216
185
 
217
186
  ## Non-goals (v1.0)
218
187
 
@@ -265,20 +234,60 @@ Detailed, test-backed notes live in `spec/conformance/README.md`.
265
234
  markup string straight to `#add_child` is unsupported (parse it into a fragment
266
235
  first). (`#to_xml` serialization is supported; HTML serialization - `to_html`
267
236
  / `inner_html` / `outer_html` - is not.)
237
+ * A colon in a processing-instruction target is well-formed (`<?a:b ...?>` parses).
238
+ * XML 1.0 §2.6: a `PITarget` is a `Name`, not an NCName, and Namespaces in XML
239
+ 1.0's normative conformance section constrains only element/attribute names
240
+ (QNames), never PI targets. Nokogiri/libxml2 rejects it (`colons are forbidden
241
+ from PI names`); Makiri follows the normative text. Only the reserved `xml`
242
+ (any case) target is rejected.
268
243
  * Otherwise the parsed tree is byte-identical to `Nokogiri::XML`'s (verified by
269
244
  the property-based differential), including namespaces, prolog/epilog comments
270
245
  and PIs, and adjacent-CDATA coalescing.
271
246
 
272
247
  ### CSS
273
248
 
274
- * jQuery/Nokogiri CSS extensions are not supported (`:contains`, `:gt`, `:lt`, `:eq`, `:first`, ...)
275
- * Makiri uses Lexbor's standards-only selector engine.
276
- Use XPath (`xpath("//p[contains(., 'x')]")`) or Enumerable (`css('li')[1]`).
249
+ * Most jQuery/Nokogiri CSS extensions are not supported (`:gt`, `:lt`, `:eq`, `:first`, ...)
250
+ * Makiri uses Lexbor's selector engine, which is standards-based apart from one
251
+ text-containment extension. Use XPath (`xpath("//p[contains(., 'x')]")`) or
252
+ Enumerable (`css('li')[1]`) for the rest.
277
253
  Standard Level-4 selectors (`:is` / `:where` / `:has`) are supported, some of which Nokogiri rejects.
254
+ * `:lexbor-contains("text")` **is** supported (on both HTML and XML) - Lexbor's
255
+ spelling of the jQuery `:contains()` substring filter, matching an element
256
+ whose text contains the string; append ` i` (`:lexbor-contains("text" i)`)
257
+ for an ASCII case-insensitive match. (Nokogiri's name `:contains` is not an
258
+ alias.) Like Lexbor's matcher, it tests the element's **immediate child text
259
+ nodes** (not the deep string-value), so HTML and XML agree; on XML it lowers
260
+ to XPath `child::text()[contains(., "text")]`.
261
+ * Untyped `:*-of-type` (`:first-of-type`, `:nth-of-type(an+b)`, ... with no type
262
+ selector) is supported and correct on both HTML and XML - the "type" is the
263
+ element's own expanded name.
264
+ * Nokogiri (XML and HTML5) mistranslates these to first-/only-child
265
+ (`//*[position()=1]` / `//*[last()=1]`), so it under-matches; Makiri matches
266
+ Lexbor's HTML matcher.
278
267
  * Type selectors are ASCII case-insensitive (CSS-correct for HTML; `LI` matches `<li>`)
279
268
  * `Nokogiri::HTML5` is case-sensitive there.
280
- * Class/ID selectors are matched case-insensitively regardless of quirks mode (a Lexbor behaviour)
281
- * In a no-quirks document browsers and `Nokogiri::HTML5` match them case-sensitively.
269
+
270
+ ## Conformance
271
+
272
+ The XPath engine and XML parser are original code, so their correctness is verified by
273
+ differential and standards harnesses in `spec/conformance/`.
274
+ The HTML XPath and CSS suites are differentials against **`Nokogiri::HTML5`**
275
+ (Gumbo / WHATWG, never libxml2's non-conformant HTML4 parser): both sides parse
276
+ HTML5, so the DOM is isomorphic and results are compared node-for-node. HTML
277
+ parsing itself is checked against the WHATWG html5lib-tests corpus, and
278
+ XPath-over-HTML semantics additionally against browsers via a WPT port.
279
+ See also [`spec/conformance/README.md`](spec/conformance/README.md).
280
+
281
+ | Suite | Input | Oracle | `rake` task |
282
+ |---|---|---|---|
283
+ | HTML parsing | HTML | WHATWG html5lib-tests (expected-tree corpus) | `conformance:html5` |
284
+ | XPath 1.0 | HTML | `Nokogiri::HTML5` (libxml2 XPath) — differential | `conformance:xpath` |
285
+ | XPath over HTML | HTML | browsers (WPT `domxpath`, hand-ported; runs under `rake spec`) | — |
286
+ | CSS selectors | HTML | `Nokogiri::HTML5#css` — differential | `conformance:css` |
287
+ | Well-formedness | XML | W3C XML Conformance Test Suite | `conformance:xmlconf` |
288
+ | XPath 1.0 | XML | `Nokogiri::XML` — differential | `conformance:xpath_xml` |
289
+ | Parsed tree (property-based) | XML | `Nokogiri::XML` — differential | `conformance:xml_pbt` |
290
+ | CSS selectors | XML | `Nokogiri::XML` — differential | `conformance:css_xml` |
282
291
 
283
292
  ## Requirements
284
293
 
@@ -295,6 +304,15 @@ bundle exec rake compile
295
304
  bundle exec rake spec
296
305
  ```
297
306
 
307
+ ### Vendored Lexbor version
308
+
309
+ `vendor/lexbor` is pinned to `3a2d595` (`v3.0.0-25`), an untagged `master`
310
+ commit, for fixes that v3.0.0 lacks: two upstreamed CSS-selector fixes (class/ID
311
+ case-sensitivity in quirks mode, and prefix-less type-selector namespacing), a
312
+ heap-overflow fix in the `:lexbor-contains()` parser, and other post-v3.0.0
313
+ bugfixes. Lexbor stays vanilla; we return to a release tag once one ships after
314
+ v3.0.0. See `CLAUDE.md` for details.
315
+
298
316
  ## License
299
317
 
300
318
  Apache License 2.0. See [LICENSE](LICENSE) and [NOTICE](NOTICE).
data/Rakefile CHANGED
@@ -4,6 +4,7 @@ require "bundler/gem_tasks"
4
4
  require "rspec/core/rake_task"
5
5
  require "rake/extensiontask"
6
6
  require "shellwords"
7
+ require "tmpdir"
7
8
 
8
9
  GEMSPEC = Gem::Specification.load("makiri.gemspec")
9
10
 
@@ -35,6 +36,72 @@ RSpec::Core::RakeTask.new(:spec)
35
36
 
36
37
  task default: %i[compile spec]
37
38
 
39
+ # `rake spec:valgrind` - run the spec suite under Valgrind memcheck via
40
+ # ruby_memcheck (Linux CI; see .github/workflows/valgrind.yml). The gem ships
41
+ # Ruby's own Valgrind suppression files (matched by Ruby version) and filters
42
+ # the report down to errors whose stack touches our extension, so we no longer
43
+ # have to fetch ruby.supp from ruby/ruby (that path was removed upstream).
44
+ #
45
+ # We keep this job's historical contract: catch *use of uninitialised values*
46
+ # and *invalid reads/writes* (incl. intra-arena overflows) - NOT leaks (leak
47
+ # detection stays with `rake leaks`). So we override ruby_memcheck's defaults,
48
+ # which disable undef-value errors and turn on full leak-check.
49
+ #
50
+ # `filter_all_errors: true` is essential: by default ruby_memcheck only applies
51
+ # its "stack must touch the makiri binary" filter to *leak*-kind errors
52
+ # (`ValgrindError#should_filter? = filter_all_errors? || kind_leak?`), so every
53
+ # uninitialised-value report is surfaced regardless of where it comes from. Ruby's
54
+ # conservative GC (machine-context scan, RVALUE flag aging, free-at-exit teardown)
55
+ # legitimately reads uninitialised words, and the bundled ruby.supp does not cover
56
+ # the free-at-exit / subprocess stacks the `:isolated` specs spin up under
57
+ # `--trace-children=yes` - which buried the run in ~3500 pure-Ruby false positives.
58
+ # Filtering all error kinds by the same binary-touch rule keeps the gate scoped to
59
+ # *our* code: a real uninit/invalid access in mkr_*/Lexbor still has a makiri frame
60
+ # and is still reported.
61
+ #
62
+ # BUT the binary-touch filter is too coarse for one residual class: when a GC
63
+ # cycle fires *inside* one of our allocations (or marks through our mark
64
+ # callback), CRuby's conservative collector legitimately reads uninitialised
65
+ # words (machine-stack scan reading stale frames, incremental mark/sweep reading
66
+ # not-yet-written RVALUE flags) while a makiri frame sits on the stack - so ~190
67
+ # of these pure-Ruby-GC false positives pass the filter. The gem's bundled
68
+ # ruby.supp only covers `each_location*` under Addr8, not the Cond/Value8 reads
69
+ # we hit. `suppressions/ruby.supp` (auto-loaded by ruby_memcheck: it globs
70
+ # `<dir>/<ruby-version>.supp`, and the bare `ruby.supp` matches every version)
71
+ # suppresses exactly those GC-driver-anchored uninit reads, plus the VM
72
+ # method-cache id_table the interpreter never frees before exit. A real uninit
73
+ # read in our code does not descend from a GC driver, so it still fails.
74
+ #
75
+ # Guarded: ruby_memcheck lives in the optional :valgrind bundler group, so a
76
+ # normal `bundle exec rake` (without that group) must not fail to load.
77
+ begin
78
+ require "ruby_memcheck"
79
+ require "ruby_memcheck/rspec/rake_task"
80
+
81
+ RubyMemcheck.config(
82
+ binary_name: "makiri",
83
+ filter_all_errors: true, # apply the binary-touch filter to ALL error kinds,
84
+ # not just leaks (see note above) - drops Ruby's own
85
+ # GC uninitialised-value noise, keeps mkr_* reports
86
+ valgrind_options: [
87
+ "--num-callers=50",
88
+ "--error-limit=no",
89
+ "--trace-children=yes", # spec processes may fork
90
+ "--undef-value-errors=yes", # the point of this job (ruby_memcheck defaults to =no)
91
+ "--track-origins=yes", # report where an uninitialised value came from
92
+ "--leak-check=no", # leaks are `rake leaks`' job, not this one
93
+ ],
94
+ )
95
+
96
+ namespace :spec do
97
+ desc "Run the spec suite under Valgrind memcheck (ruby_memcheck; needs the " \
98
+ ":valgrind bundler group and the valgrind binary)"
99
+ RubyMemcheck::RSpec::RakeTask.new(valgrind: :compile)
100
+ end
101
+ rescue LoadError
102
+ # ruby_memcheck not installed (optional :valgrind group absent) - skip the task.
103
+ end
104
+
38
105
  namespace :security do
39
106
  desc "Run mechanical C safety lint over ext/makiri"
40
107
  task :clint do
@@ -81,6 +148,17 @@ def asan_runtime_path
81
148
  nil
82
149
  end
83
150
 
151
+ def libfuzzer_available?
152
+ cxx = ENV["CXX"].to_s.empty? ? "clang++" : ENV["CXX"]
153
+ Dir.mktmpdir("makiri-libfuzzer-check") do |dir|
154
+ src = File.join(dir, "check.cc")
155
+ exe = File.join(dir, "check")
156
+ File.write(src, "extern \"C\" int LLVMFuzzerTestOneInput(const unsigned char*, unsigned long){return 0;}\n")
157
+ return system(cxx, "-fsanitize=fuzzer,address,undefined", src, "-o", exe,
158
+ out: File::NULL, err: File::NULL)
159
+ end
160
+ end
161
+
84
162
  # The compiled extension, and whether it carries sanitizer instrumentation, so
85
163
  # `fuzz:sanitize SKIP_BUILD=1` can refuse to run a plain (non-ASan) build.
86
164
  def ext_bundle_path
@@ -116,6 +194,65 @@ task :sanitize do
116
194
  sh(env, "#{FileUtils::RUBY} -S rspec")
117
195
  end
118
196
 
197
+ desc "Measure C coverage of OUR sources (clang source-based) over the spec suite. " \
198
+ "Prints an llvm-cov region+branch report (excludes vendored Lexbor) and writes " \
199
+ "a line-level detail file to tmp/coverage/show.txt."
200
+ task :coverage do
201
+ require "fileutils"
202
+ dir = File.expand_path("tmp/coverage")
203
+ FileUtils.rm_rf(dir)
204
+ FileUtils.mkdir_p(dir)
205
+
206
+ # Instrument only our sources (Lexbor is built separately, uninstrumented).
207
+ sh({ "MAKIRI_COVERAGE" => "1" }, "#{FileUtils::RUBY} -S rake clean compile")
208
+ # %p -> PID, so any forked spec process gets its own raw profile.
209
+ sh({ "LLVM_PROFILE_FILE" => File.join(dir, "makiri-%p.profraw") }, "#{FileUtils::RUBY} -S rspec")
210
+
211
+ profdata = File.join(dir, "makiri.profdata")
212
+ bundle = "lib/makiri/makiri.bundle"
213
+ ignore = "(vendor/lexbor|/usr/|/Library/|ruby/|rubygems)"
214
+ sh "xcrun llvm-profdata merge -sparse #{dir}/*.profraw -o #{profdata}"
215
+ sh "xcrun llvm-cov report #{bundle} -instr-profile=#{profdata} " \
216
+ "-ignore-filename-regex='#{ignore}' -show-branch-summary"
217
+ show = File.join(dir, "show.txt")
218
+ sh "xcrun llvm-cov show #{bundle} -instr-profile=#{profdata} " \
219
+ "-ignore-filename-regex='#{ignore}' -show-branches=count -show-line-counts-or-regions > #{show}"
220
+ puts "\ncoverage line/branch detail: #{show}"
221
+ puts "(coverage build left in place; run `rake clean compile` to restore a normal build)"
222
+ end
223
+
224
+ desc "Like :sanitize but also builds the vendored Lexbor under ASan, so overflows " \
225
+ "INSIDE Lexbor's mraw arena are caught (slow: full Lexbor rebuild). Runs the " \
226
+ "spec suite, or FUZZ_ARGS via the fuzzer when set."
227
+ task "sanitize:lexbor" do
228
+ sanitize = ENV["MAKIRI_SANITIZE"] || "address,undefined"
229
+ sanitize.include?("address") or
230
+ abort "sanitize:lexbor needs an address build (MAKIRI_SANITIZE must include 'address')"
231
+
232
+ # MAKIRI_SANITIZE_LEXBOR makes extconf build Lexbor with -DLEXBOR_BUILD_WITH_ASAN
233
+ # (enabling its mraw poisoning); the build-mode stamp auto-rebuilds Lexbor on the
234
+ # plain<->asan switch, so no manual clean:lexbor is needed before or after.
235
+ build_env = { "MAKIRI_SANITIZE" => sanitize, "MAKIRI_SANITIZE_LEXBOR" => "1" }
236
+ sh(build_env, "#{FileUtils::RUBY} -S rake clean compile")
237
+
238
+ env = {
239
+ "ASAN_OPTIONS" => "detect_leaks=0:detect_container_overflow=0:" \
240
+ "detect_odr_violation=0:abort_on_error=1:halt_on_error=1",
241
+ "UBSAN_OPTIONS" => "print_stacktrace=1:halt_on_error=1",
242
+ }
243
+ runtime = asan_runtime_path or
244
+ abort "sanitize:lexbor: could not locate the ASan runtime for #{RbConfig::CONFIG['CC']}"
245
+ preload = RbConfig::CONFIG["target_os"] =~ /darwin/ ? "DYLD_INSERT_LIBRARIES" : "LD_PRELOAD"
246
+ env[preload] = runtime
247
+ puts "sanitize:lexbor: preloading #{runtime} via #{preload}"
248
+
249
+ if ENV["FUZZ_ARGS"]
250
+ sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}")
251
+ else
252
+ sh(env, "#{FileUtils::RUBY} -S rspec")
253
+ end
254
+ end
255
+
119
256
  desc "Run the robustness fuzzer (override options via FUZZ_ARGS)"
120
257
  task fuzz: :compile do
121
258
  sh "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}"
@@ -131,6 +268,25 @@ task "fuzz:mutate": :compile do
131
268
  sh "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb --target mutate #{ENV['FUZZ_ARGS']}"
132
269
  end
133
270
 
271
+ desc "Malloc-leak gate (macOS `leaks`): fails on per-call leak stacks through the ext"
272
+ task leaks: :compile do
273
+ # ASan runs with detect_leaks=0 (Ruby/Lexbor are uninstrumented), so plain
274
+ # leaks are otherwise never machine-checked; see script/check_leaks.rb.
275
+ sh "#{FileUtils::RUBY} script/check_leaks.rb"
276
+ end
277
+
278
+ desc "OOM-injection gate: rebuild with MAKIRI_ALLOC_INJECT=1 and sweep every core " \
279
+ "allocation site, verifying each failure fails closed (clean raise or " \
280
+ "baseline-identical result, never truncated output)"
281
+ task :oom do
282
+ # The hook is compiled in only under MAKIRI_ALLOC_INJECT=1 (zero overhead in
283
+ # a normal build), so this needs its own rebuild; see
284
+ # script/check_alloc_failures.rb for the protocol and the property gated.
285
+ sh({ "MAKIRI_ALLOC_INJECT" => "1" }, "#{FileUtils::RUBY} -S rake clean compile")
286
+ sh "#{FileUtils::RUBY} -Ilib script/check_alloc_failures.rb"
287
+ puts "(injection build left in place; run `rake clean compile` to restore a normal build)"
288
+ end
289
+
134
290
  desc "Run the performance benchmark (Makiri vs Nokogiri reference)"
135
291
  task bench: :compile do
136
292
  # Run outside the bundle so the bench-only gems (nokogiri, benchmark-ips)
@@ -190,10 +346,26 @@ namespace :conformance do
190
346
  sh "#{FileUtils::RUBY} -Ilib spec/conformance/css_diff.rb #{ENV['CSS_ARGS']}"
191
347
  end
192
348
  end
349
+
350
+ desc "XML CSS-selector differential conformance: Makiri::XML vs Nokogiri::XML"
351
+ task css_xml: :compile do
352
+ Bundler.with_unbundled_env do
353
+ sh "#{FileUtils::RUBY} -Ilib spec/conformance/xml_css_diff.rb #{ENV['CSS_XML_ARGS']}"
354
+ end
355
+ end
356
+
357
+ desc "XML Builder differential conformance: Makiri::XML::Builder vs Nokogiri::XML::Builder"
358
+ task builder: :compile do
359
+ Bundler.with_unbundled_env do
360
+ sh "#{FileUtils::RUBY} -Ilib spec/conformance/builder_diff.rb #{ENV['BUILDER_ARGS']}"
361
+ end
362
+ end
193
363
  end
194
364
 
195
365
  desc "Run all conformance suites"
196
- task conformance: %w[conformance:html5 conformance:xpath conformance:css conformance:xmlconf conformance:xpath_xml]
366
+ task conformance: %w[conformance:html5 conformance:xpath conformance:css
367
+ conformance:xmlconf conformance:xpath_xml conformance:css_xml
368
+ conformance:builder]
197
369
 
198
370
  namespace :fuzz do
199
371
  # Run the fuzzer under the sanitizer. Toggles (all via env):
@@ -233,14 +405,46 @@ namespace :fuzz do
233
405
  if ENV["FUZZ_ARGS"]
234
406
  sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{ENV['FUZZ_ARGS']}")
235
407
  else
236
- iso = %w[1 true yes].include?(ENV["FAST"].to_s.downcase) ? "" : "--isolated"
408
+ iso = %w[1 true yes].include?(ENV["ISOLATED"].to_s.downcase) ? "--isolated" : ""
237
409
  secs = ENV["FUZZ_TIME"] || "90"
238
410
  # Cover every surface under the sanitizer: the query engine (XPath/CSS over
239
411
  # parsed fixtures), the XML parser (hostile documents), and the XML mutation
240
412
  # surface (random edit sequences + invariants).
241
- ["", "--target xml", "--target mutate"].each do |surface|
413
+ ["", "--target xml", "--target mutate", "--target xmlcss"].each do |surface|
242
414
  sh(env, "#{FileUtils::RUBY} -Ilib spec/fuzz/run.rb #{surface} #{iso} --time #{secs}".squeeze(" ").strip)
243
415
  end
244
416
  end
245
417
  end
418
+
419
+ # Coverage-guided libFuzzer harnesses for the pure-C surfaces (XML parser and
420
+ # XPath compile+eval). These are Ruby-free standalone binaries, so they run
421
+ # directly under clang's libFuzzer driver without the Ruby interpreter.
422
+ # They complement the Ruby-based robustness fuzzer by providing coverage
423
+ # feedback and 2-3 orders of magnitude faster execution for the C core.
424
+ desc "Build the libFuzzer harnesses (requires clang with libFuzzer support)"
425
+ task :libfuzzer_build => :compile do
426
+ libfuzzer_available? or
427
+ abort "fuzz:libfuzzer_build: #{ENV['CXX'] || 'clang++'} cannot link libFuzzer. " \
428
+ "Install an LLVM clang with libFuzzer support and run with " \
429
+ "CLANG=/path/to/clang CXX=/path/to/clang++."
430
+ Dir.chdir("ext/makiri/fuzz") do
431
+ sh "make clean"
432
+ sh "make all"
433
+ end
434
+ end
435
+
436
+ desc "Run the libFuzzer coverage-guided harnesses (default: 60s per target)"
437
+ task :libfuzzer => :libfuzzer_build do
438
+ time = ENV["FUZZ_TIME"] || "60"
439
+ Dir.chdir("ext/makiri/fuzz") do
440
+ sh "mkdir -p corpus/xml corpus/xpath"
441
+ sh "./xml_fuzz -max_total_time=#{time} -max_len=4096 corpus/xml"
442
+ sh "./xpath_fuzz -max_total_time=#{time} -max_len=4096 corpus/xpath"
443
+ end
444
+ end
445
+ end
446
+
447
+ desc "Show code statistics"
448
+ task :stats do
449
+ sh "tokei lib ext spec script --exclude tmp --exclude vendor --exclude docs"
246
450
  end