makiri 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/conformance.yml +22 -0
  3. data/.github/workflows/libfuzzer.yml +83 -0
  4. data/.github/workflows/release.yml +12 -7
  5. data/.github/workflows/security.yml +88 -3
  6. data/.github/workflows/valgrind.yml +135 -0
  7. data/CHANGELOG.md +152 -15
  8. data/README.md +183 -13
  9. data/Rakefile +294 -7
  10. data/ext/makiri/bridge/bridge.h +28 -0
  11. data/ext/makiri/bridge/ruby_string.c +282 -12
  12. data/ext/makiri/core/mkr_alloc.c +40 -3
  13. data/ext/makiri/core/mkr_alloc.h +28 -5
  14. data/ext/makiri/core/mkr_buf.c +47 -3
  15. data/ext/makiri/core/mkr_buf.h +112 -3
  16. data/ext/makiri/core/mkr_core.c +143 -0
  17. data/ext/makiri/core/mkr_core.h +11 -2
  18. data/ext/makiri/core/mkr_hash.h +1 -1
  19. data/ext/makiri/core/mkr_span.h +186 -0
  20. data/ext/makiri/core/mkr_text.h +8 -8
  21. data/ext/makiri/core/mkr_utf8.c +101 -0
  22. data/ext/makiri/core/mkr_utf8.h +88 -0
  23. data/ext/makiri/extconf.rb +123 -10
  24. data/ext/makiri/fuzz/Makefile +95 -0
  25. data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
  26. data/ext/makiri/fuzz/xml_fuzz.c +24 -0
  27. data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
  28. data/ext/makiri/glue/glue.h +55 -11
  29. data/ext/makiri/glue/ruby_doc.c +129 -59
  30. data/ext/makiri/glue/ruby_html_css.c +292 -0
  31. data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +248 -52
  32. data/ext/makiri/glue/ruby_html_node.c +859 -0
  33. data/ext/makiri/glue/ruby_html_serialize.c +154 -0
  34. data/ext/makiri/glue/ruby_node.c +74 -729
  35. data/ext/makiri/glue/ruby_node_set.c +167 -32
  36. data/ext/makiri/glue/ruby_xml.c +602 -0
  37. data/ext/makiri/glue/ruby_xml_node.c +1373 -0
  38. data/ext/makiri/glue/ruby_xpath.c +63 -30
  39. data/ext/makiri/glue/ruby_xpath.h +19 -0
  40. data/ext/makiri/lexbor_compat/compat.h +42 -9
  41. data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
  42. data/ext/makiri/lexbor_compat/dom_index.c +2 -2
  43. data/ext/makiri/lexbor_compat/post_parse.c +100 -10
  44. data/ext/makiri/lexbor_compat/source_loc.c +15 -13
  45. data/ext/makiri/lexbor_compat/text_index.c +14 -8
  46. data/ext/makiri/lexbor_compat/utf8_input.c +19 -33
  47. data/ext/makiri/makiri.c +184 -6
  48. data/ext/makiri/makiri.h +43 -2
  49. data/ext/makiri/xml/mkr_xml.h +125 -0
  50. data/ext/makiri/xml/mkr_xml_chars.c +195 -0
  51. data/ext/makiri/xml/mkr_xml_index.c +169 -0
  52. data/ext/makiri/xml/mkr_xml_index.h +48 -0
  53. data/ext/makiri/xml/mkr_xml_mutate.c +817 -0
  54. data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
  55. data/ext/makiri/xml/mkr_xml_node.c +399 -0
  56. data/ext/makiri/xml/mkr_xml_node.h +184 -0
  57. data/ext/makiri/xml/mkr_xml_tree.c +1515 -0
  58. data/ext/makiri/xpath/mkr_css.c +1023 -0
  59. data/ext/makiri/xpath/mkr_css.h +65 -0
  60. data/ext/makiri/xpath/mkr_xpath.c +96 -32
  61. data/ext/makiri/xpath/mkr_xpath.h +109 -4
  62. data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
  63. data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
  64. data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +551 -241
  65. data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +318 -276
  66. data/ext/makiri/xpath/mkr_xpath_internal.h +177 -206
  67. data/ext/makiri/xpath/mkr_xpath_lex.c +95 -125
  68. data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
  69. data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +145 -0
  70. data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
  71. data/ext/makiri/xpath/mkr_xpath_parse.c +83 -94
  72. data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
  73. data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
  74. data/ext/makiri/xpath/mkr_xpath_shared.c +609 -0
  75. data/ext/makiri/xpath/mkr_xpath_value_body.h +801 -0
  76. data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
  77. data/lib/makiri/{attribute.rb → attr.rb} +7 -3
  78. data/lib/makiri/cdata_section.rb +19 -0
  79. data/lib/makiri/comment.rb +10 -0
  80. data/lib/makiri/compat_aliases.rb +30 -0
  81. data/lib/makiri/document.rb +9 -73
  82. data/lib/makiri/document_fragment.rb +14 -9
  83. data/lib/makiri/element.rb +4 -4
  84. data/lib/makiri/html/document.rb +106 -0
  85. data/lib/makiri/html/node_methods.rb +19 -0
  86. data/lib/makiri/html.rb +12 -0
  87. data/lib/makiri/node.rb +58 -15
  88. data/lib/makiri/node_set.rb +8 -0
  89. data/lib/makiri/processing_instruction.rb +10 -0
  90. data/lib/makiri/text.rb +1 -1
  91. data/lib/makiri/version.rb +1 -1
  92. data/lib/makiri/xml/builder.rb +263 -0
  93. data/lib/makiri/xml/document.rb +24 -0
  94. data/lib/makiri/xml/node_methods.rb +84 -0
  95. data/lib/makiri/xml.rb +10 -0
  96. data/lib/makiri/xpath_context.rb +1 -1
  97. data/lib/makiri.rb +24 -5
  98. data/script/build_native_gem.rb +2 -2
  99. data/script/check_alloc_failures.rb +266 -0
  100. data/script/check_c_safety.rb +77 -2
  101. data/script/check_c_safety_allowlist.yml +102 -0
  102. data/script/check_leaks.rb +64 -0
  103. data/script/leaks_harness.rb +64 -0
  104. data/vendor/lexbor/CMakeLists.txt +6 -0
  105. data/vendor/lexbor/README.md +12 -0
  106. data/vendor/lexbor/config.cmake +1 -1
  107. data/vendor/lexbor/source/lexbor/core/base.h +1 -1
  108. data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
  109. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
  110. data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
  111. data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
  112. data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
  113. data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
  114. data/vendor/lexbor/source/lexbor/html/base.h +1 -1
  115. data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
  116. data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
  117. data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
  118. data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
  119. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
  120. data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
  121. data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
  122. data/vendor/lexbor/source/lexbor/url/base.h +1 -1
  123. data/vendor/lexbor/source/lexbor/url/url.c +5 -2
  124. data/vendor/lexbor/source/lexbor/url/url.h +9 -0
  125. data/vendor/lexbor/version +1 -1
  126. metadata +53 -9
  127. data/ext/makiri/glue/ruby_css.c +0 -185
  128. data/ext/makiri/glue/ruby_serialize.c +0 -92
  129. data/ext/makiri/xpath/mkr_xpath_value.c +0 -1286
  130. data/lib/makiri/cdata.rb +0 -6
@@ -0,0 +1,266 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Allocation-failure injection sweep for the C extension (run via `rake oom`).
4
+ #
5
+ # The sanitizers and the leak gate prove the happy path is memory-safe; neither
6
+ # proves the OOM branches are CORRECT. Makiri's contract is fail-closed: when a
7
+ # core C allocation fails, a call must either raise a clean Makiri::Error /
8
+ # NoMemoryError or complete with the exact same result as the unfailed run -
9
+ # never a truncated/partial result (the property the XPath node-set caps and
10
+ # the "build OOM -> walk fallback" designs exist for). This gate machine-checks
11
+ # that contract: with the ext built under MAKIRI_ALLOC_INJECT=1, every core
12
+ # allocation site routes through a hook that can be armed to fail the nth
13
+ # attempt once. For each representative workload we record a failure-free
14
+ # BASELINE result and the total number of allocation attempts, then re-run the
15
+ # workload once per allocation site with exactly that site failing, and verify
16
+ # each run either raised cleanly or returned a baseline-identical value.
17
+ #
18
+ # A segfault/abort kills this process; the caller (rake/CI) sees the nonzero
19
+ # exit, which is the verdict too.
20
+ #
21
+ # bundle exec rake oom # rebuild + sweep
22
+ # bundle exec ruby -Ilib script/check_alloc_failures.rb # sweep current build
23
+
24
+ require "makiri"
25
+
26
+ unless Makiri.send(:__alloc_inject?)
27
+ abort "check_alloc_failures: extension built without the injection hook - " \
28
+ "rebuild with MAKIRI_ALLOC_INJECT=1 (`rake oom` does this)"
29
+ end
30
+
31
+ # Each scenario runs one workload END-TO-END and returns a canonical String, so
32
+ # an injected run's result can be compared (==) against the baseline. Fixtures
33
+ # are built INSIDE the lambda (unless reuse is the point) so the sweep covers
34
+ # their parse/build allocations too.
35
+ SCENARIOS = {
36
+ # XML parse covering the syntax surface: declaration, DOCTYPE (SYSTEM id +
37
+ # internal subset), default + prefixed namespaces, prefixed attributes,
38
+ # references, comment, CDATA, PI, nesting, CRLF normalization in an attr.
39
+ "xml_parse" => lambda do
40
+ src = <<~XML
41
+ <?xml version="1.0" encoding="UTF-8"?>
42
+ <!DOCTYPE root SYSTEM "urn:example:dtd" [<!ENTITY local "subset">]>
43
+ <root xmlns="urn:d" xmlns:p="urn:p" p:pa="pv" mixed="A&amp;&#x41; x\r\ny">
44
+ <!-- a comment -->
45
+ <p:branch><leaf depth="2">text &amp; &#x41; refs</leaf></p:branch>
46
+ <![CDATA[raw < cdata & bytes]]>
47
+ <?pi-target some data?>
48
+ <empty/>
49
+ </root>
50
+ XML
51
+ Makiri::XML::Document.parse(src).to_xml
52
+ end,
53
+
54
+ # Fragment parse + serialize + splice into a host document.
55
+ "xml_fragment" => lambda do
56
+ doc = Makiri::XML::Document.parse("<r xmlns='urn:d'><keep>k</keep></r>")
57
+ frag = doc.fragment("<a xmlns:p='u'><p:b>t</p:b></a>text")
58
+ out = frag.to_xml
59
+ doc.root.add_child(frag)
60
+ out + doc.to_xml
61
+ end,
62
+
63
+ # XPath battery: predicates, functions, a union, axes, plus an XPathContext
64
+ # evaluation with a registered namespace and variable.
65
+ "xml_xpath" => lambda do
66
+ doc = Makiri::XML::Document.parse(<<~XML)
67
+ <root xmlns:p="urn:p">
68
+ <a v="1">alpha</a>
69
+ <a v="2">beta</a>
70
+ <b n="3"> gamma delta </b>
71
+ <p:c><a v="9">nested</a></p:c>
72
+ </root>
73
+ XML
74
+ canon = lambda do |r|
75
+ r.is_a?(Makiri::NodeSet) ? r.map(&:to_xml).join("|") : r.inspect
76
+ end
77
+ exprs = [
78
+ "//a[@v='2']",
79
+ "//a[position()=2]",
80
+ "//a[last()]",
81
+ "count(//a)",
82
+ "sum(//b/@n)",
83
+ "concat(string(//a[1]), '-', substring('abcdef', 2, 3))",
84
+ "translate('abc', 'abc', 'xyz')",
85
+ "normalize-space(//b)",
86
+ "contains(//a[1], 'alp')",
87
+ "starts-with(//b, ' g')",
88
+ "//a | //b",
89
+ "//a[1]/ancestor::root",
90
+ "//a[1]/following-sibling::b",
91
+ "//root/descendant-or-self::a",
92
+ ]
93
+ parts = exprs.map { |e| canon.call(doc.xpath(e)) }
94
+ ctx = Makiri::XPathContext.new(doc)
95
+ ctx.register_namespace("p", "urn:p")
96
+ ctx.register_variable("want", "9")
97
+ parts << canon.call(ctx.evaluate("//p:c/a[@v=$want]"))
98
+ parts.join("\n")
99
+ end,
100
+
101
+ # Same battery shape over an HTML5-parsed document.
102
+ "html_xpath" => lambda do
103
+ doc = Makiri::HTML::Document.parse(<<~HTML)
104
+ <html><body>
105
+ <div id="top"><p class="x">one</p><p class="y">two</p></div>
106
+ <ul><li data-n="1">a</li><li data-n="2"> b c </li></ul>
107
+ </body></html>
108
+ HTML
109
+ canon = lambda do |r|
110
+ r.is_a?(Makiri::NodeSet) ? r.map(&:to_html).join("|") : r.inspect
111
+ end
112
+ exprs = [
113
+ "//p[@class='y']",
114
+ "//li[position()=2]",
115
+ "//li[last()]",
116
+ "count(//p)",
117
+ "sum(//li/@data-n)",
118
+ "concat(string(//p[1]), '+', substring(//p[2], 1, 2))",
119
+ "translate('one', 'one', 'uno')",
120
+ "normalize-space(//li[2])",
121
+ "contains(//p[1], 'on')",
122
+ "starts-with(//p[2], 'tw')",
123
+ "//p | //li",
124
+ "//p[1]/ancestor::div",
125
+ "//p[1]/following-sibling::p",
126
+ "//div/descendant-or-self::p",
127
+ ]
128
+ parts = exprs.map { |e| canon.call(doc.xpath(e)) }
129
+ ctx = Makiri::XPathContext.new(doc)
130
+ ctx.register_variable("cls", "x")
131
+ parts << canon.call(ctx.evaluate("//p[@class=$cls]"))
132
+ parts.join("\n")
133
+ end,
134
+
135
+ # The mutation surface: create_*, insertion on every side, rename, content,
136
+ # attributes, replace, remove, and a fragment splice.
137
+ "xml_mutate" => lambda do
138
+ doc = Makiri::XML::Document.parse("<root><old>x</old><gone/></root>")
139
+ el = doc.create_element("made")
140
+ el.add_child(doc.create_text_node("inner"))
141
+ el["k"] = "v"
142
+ doc.root.add_child(el)
143
+ el.name = "renamed"
144
+ el.content = "rewritten"
145
+ el.add_previous_sibling(doc.create_element("before"))
146
+ el.add_next_sibling(doc.create_element("after"))
147
+ doc.root.at_xpath("old").replace(doc.create_element("new"))
148
+ doc.root.at_xpath("gone").remove
149
+ doc.root.add_child(doc.fragment("<f1/>tail<f2 a='b'/>"))
150
+ doc.to_xml
151
+ end,
152
+
153
+ # Serialization over a non-trivial tree (~200 elements built inside the
154
+ # lambda, so the parse is swept too), both tree and deep serializers.
155
+ "html_serialize" => lambda do
156
+ body = (1..50).map { |i|
157
+ "<div id='d#{i}' class='row'><p>cell #{i}</p><span>tail &amp; #{i}</span></div>"
158
+ }.join
159
+ doc = Makiri::HTML::Document.parse("<html><body>#{body}</body></html>")
160
+ doc.to_html + doc.at_css("body").inner_html
161
+ end,
162
+
163
+ # Full-document text extraction (exercises the text-index build, and its
164
+ # fail-closed OOM -> walk fallback).
165
+ "html_text" => lambda do
166
+ body = (1..80).map { |i| "<p>para #{i} <em>em#{i}</em> tail</p>" }.join
167
+ doc = Makiri::HTML::Document.parse("<html><body>#{body}</body></html>")
168
+ doc.text
169
+ end,
170
+
171
+ # CSS: a comma list with combinators through the reused engine, plus the
172
+ # at_css first-match path.
173
+ "css" => lambda do
174
+ doc = Makiri::HTML::Document.parse(<<~HTML)
175
+ <html><body>
176
+ <p class="c">one</p><p>skip</p><p class="c">two</p>
177
+ <div><span>in</span></div><span>out</span>
178
+ <section id="x">target</section>
179
+ </body></html>
180
+ HTML
181
+ doc.css("p.c, div > span").map { |n| n.name }.join(",") +
182
+ doc.at_css("#x")&.name.to_s
183
+ end,
184
+
185
+ # The Builder DSL (pure Ruby over create_*/add_child, so this sweeps the
186
+ # construction factories).
187
+ "xml_builder" => lambda do
188
+ Makiri::XML::Builder.new do |xml|
189
+ xml.feed("xmlns" => "urn:a", "xmlns:dc" => "urn:dc") do
190
+ xml.entry do
191
+ xml.title("Hello")
192
+ xml["dc"].id_("42")
193
+ xml.cdata("a < b")
194
+ xml.comment(" note ")
195
+ end
196
+ end
197
+ end.to_xml
198
+ end,
199
+ }.freeze
200
+
201
+ ALLOWED = [Makiri::Error, NoMemoryError].freeze
202
+ TRUNCATE = 120
203
+
204
+ def disarm = Makiri.send(:__alloc_inject, 0)
205
+
206
+ failures_total = 0
207
+
208
+ SCENARIOS.each do |name, work|
209
+ # Warm twice with injection off: process-global engines (CSS) and lazy
210
+ # builds settle, so the counted run below is representative and stable.
211
+ disarm
212
+ 2.times { work.call }
213
+
214
+ # Counted baseline run: __alloc_inject(0) also resets the counter, so the
215
+ # call-count reading taken right after is exactly this run's allocation-attempt total.
216
+ disarm
217
+ baseline = work.call
218
+ total = Makiri.send(:__alloc_inject_calls)
219
+
220
+ ok_raised = 0
221
+ ok_identical = 0
222
+ failures = []
223
+
224
+ (1..total).each do |n|
225
+ Makiri.send(:__alloc_inject, n)
226
+ begin
227
+ result = work.call
228
+ if result == baseline
229
+ ok_identical += 1
230
+ else
231
+ failures << [n, "truncated/wrong result",
232
+ "baseline=#{baseline.to_s[0, TRUNCATE].inspect} " \
233
+ "got=#{result.to_s[0, TRUNCATE].inspect}"]
234
+ end
235
+ rescue *ALLOWED
236
+ ok_raised += 1
237
+ rescue Exception => e # rubocop:disable Lint/RescueException -- the wrong class IS the finding
238
+ failures << [n, "wrong exception class",
239
+ "#{e.class}: #{e.message.to_s[0, TRUNCATE]}"]
240
+ ensure
241
+ disarm
242
+ end
243
+ end
244
+
245
+ failures_total += failures.size
246
+ puts format("%-16s allocations=%-5d raised=%-5d identical=%-5d failed=%d",
247
+ name, total, ok_raised, ok_identical, failures.size)
248
+ failures.each do |n, kind, detail|
249
+ puts " n=#{n} #{kind}: #{detail}"
250
+ end
251
+ if total.zero?
252
+ # A scenario that never reaches a core allocation sweeps nothing - that is
253
+ # a broken scenario, not a pass.
254
+ failures_total += 1
255
+ puts " scenario performed ZERO core allocations - workload not reaching the C core"
256
+ end
257
+ end
258
+
259
+ if failures_total.zero?
260
+ puts "check_alloc_failures: OK - every injected allocation failure failed closed " \
261
+ "(clean raise or baseline-identical result)"
262
+ else
263
+ puts "check_alloc_failures: FAILED - #{failures_total} injected failure(s) " \
264
+ "produced a wrong exception or a non-baseline result"
265
+ exit 1
266
+ end
@@ -8,9 +8,26 @@ require "yaml"
8
8
  ROOT = Pathname.new(__dir__).join("..").expand_path
9
9
  ALLOWLIST_PATH = ROOT.join("script/check_c_safety_allowlist.yml")
10
10
 
11
- Rule = Struct.new(:id, :message, :regex, keyword_init: true)
11
+ # A rule may carry `paths` (an array of path globs): it then applies ONLY to
12
+ # matching files. Used for the parser-TU reader discipline, where the ban is
13
+ # meaningful only in TUs whose input reads must go through mkr_span_t.
14
+ Rule = Struct.new(:id, :message, :regex, :paths, keyword_init: true)
12
15
  Finding = Struct.new(:path, :line, :rule, :text, keyword_init: true)
13
16
 
17
+ # Byte-scanning parser TUs: every input read goes through the bounded reader
18
+ # (core/mkr_span.h) - see its header comment. The rules below turn that from a
19
+ # convention into a machine-enforced invariant for these files.
20
+ PARSER_TUS = %w[
21
+ ext/makiri/xml/mkr_xml_tree.c
22
+ ext/makiri/xml/mkr_xml_chars.c
23
+ ext/makiri/xml/mkr_xml_node.c
24
+ ext/makiri/xpath/mkr_xpath_lex.c
25
+ ext/makiri/xpath/mkr_xpath_funcs_body.h
26
+ ext/makiri/xpath/mkr_xpath_value_body.h
27
+ ext/makiri/bridge/ruby_string.c
28
+ ext/makiri/lexbor_compat/source_loc.c
29
+ ].freeze
30
+
14
31
  RULES = [
15
32
  Rule.new(
16
33
  id: "string_value_cstr",
@@ -60,7 +77,64 @@ RULES = [
60
77
  Rule.new(
61
78
  id: "verified_text_forge",
62
79
  message: "mkr_verified_text_t must be minted only by mkr_verified_text_from_view (the validated boundary)",
63
- regex: /\(\s*mkr_verified_text_t\s*\)\s*\{/
80
+ # Both forge shapes: the compound-literal cast AND the declaration
81
+ # initializer (`mkr_verified_text_t x = {...}`), which the cast-only regex
82
+ # used to miss - that gap let a fuzz harness mint over a non-NUL-terminated
83
+ # buffer unnoticed.
84
+ regex: /\(\s*mkr_verified_text_t\s*\)\s*\{|\bmkr_verified_text_t\s+\w+\s*=\s*\{/
85
+ ),
86
+ # --- HTML/XML representation boundary (see docs/html_xml_boundary_hardening) ---
87
+ # These symbols assume one DOM representation; using them outside their
88
+ # representation-correct / kind-checked home is how shared glue (XPath, NodeSet,
89
+ # node identity) silently treats an XML node as HTML (or vice versa) - an
90
+ # assert-abort or memory type-confusion. Each is allowlisted only in the files
91
+ # that legitimately own it; anywhere else trips the lint.
92
+ Rule.new(
93
+ id: "html_doc_unwrap_boundary",
94
+ message: "mkr_html_doc_unwrap is HTML-only; shared/XML code must use the kind-aware mkr_node_unwrap",
95
+ regex: /\bmkr_html_doc_unwrap\s*\(/
96
+ ),
97
+ Rule.new(
98
+ id: "parsed_html_doc_boundary",
99
+ message: "mkr_parsed_html_doc (asserts kind==HTML) may only be used in a kind-checked / HTML-only site",
100
+ regex: /\bmkr_parsed_html_doc\s*\(/
101
+ ),
102
+ Rule.new(
103
+ id: "parsed_xml_doc_boundary",
104
+ message: "mkr_parsed_xml_doc may only be used in a kind-checked / XML-representation site",
105
+ regex: /\bmkr_parsed_xml_doc\s*\(/
106
+ ),
107
+ Rule.new(
108
+ id: "owner_document_boundary",
109
+ message: "owner_document is an HTML-only lxb field; shared code must compare documents via mkr_node_document",
110
+ regex: /\bowner_document\b/
111
+ ),
112
+ Rule.new(
113
+ id: "node_raw_boundary",
114
+ message: "mkr_node_raw is the kind-agnostic raw pointer (identity / kind-guaranteed only); " \
115
+ "to dereference a node use mkr_html_node or mkr_xml_node_unwrap (kind-checked)",
116
+ regex: /\bmkr_node_raw\s*\(/
117
+ ),
118
+ # --- parser-TU reader discipline (see core/mkr_span.h) ---
119
+ # In the byte-scanning parser TUs every input read must go through the
120
+ # bounded reader: a raw libc scan reintroduces the "forgot the bounds check"
121
+ # class the span made structurally impossible. memcpy stays allowed (an
122
+ # explicit-length copy, not a scan).
123
+ Rule.new(
124
+ id: "raw_scan_call",
125
+ message: "parser TUs must read input through mkr_span_* / mkr_bytes_eq / mkr_utf8_* " \
126
+ "(core), not raw libc scanning",
127
+ regex: /\b(?:memchr|memcmp|strchr|strrchr|strstr|strn?cmp|strcspn|strspn|strpbrk|strtod|strtol|strtoull?|sscanf)\s*\(/,
128
+ paths: PARSER_TUS
129
+ ),
130
+ # The span's own cursor/bound fields are private to core/mkr_span.h: touching
131
+ # `.p` / `.end` in a parser TU is how a hand-rolled (uncovered) cursor starts.
132
+ Rule.new(
133
+ id: "raw_cursor_member",
134
+ message: "parser TUs must not access a span's .p/.end (or keep a raw cursor struct); " \
135
+ "use the mkr_span_* helpers (mark/since for slice capture)",
136
+ regex: /(?:->|\.)\s*(?:p|end)\b/,
137
+ paths: PARSER_TUS
64
138
  ),
65
139
  ].freeze
66
140
 
@@ -131,6 +205,7 @@ def scan_findings(ignores)
131
205
  next [] unless code_line?(line)
132
206
 
133
207
  RULES.filter_map do |rule|
208
+ next if rule.paths && rule.paths.none? { |pat| path_matches?(pat, rel) }
134
209
  next unless line.match?(rule.regex)
135
210
  next if rule_ignored?(rel, rule.id, ignores)
136
211
 
@@ -10,3 +10,105 @@ ignore_paths:
10
10
  - path: ext/makiri/bridge/text_token.c
11
11
  rule: verified_text_forge
12
12
  reason: mkr_verified_text_from_view, the sole sanctioned mint of mkr_verified_text_t; its input view is already validated by the bridge string helpers.
13
+ # --- HTML/XML representation boundary: each per-rep symbol is exempt only in
14
+ # the files that own it (declaration, definition, or a kind-checked use), so
15
+ # a new use anywhere else — especially shared glue — trips the lint ---
16
+ # mkr_html_doc_unwrap (HTML-only document unwrap)
17
+ - path: ext/makiri/glue/glue.h
18
+ rule: html_doc_unwrap_boundary
19
+ reason: declaration of the HTML-only document unwrap.
20
+ - path: ext/makiri/glue/ruby_doc.c
21
+ rule: html_doc_unwrap_boundary
22
+ reason: definition + HTML Document methods (parse/serialize/title/compat_mode) — all HTML-only.
23
+ - path: ext/makiri/glue/ruby_node.c
24
+ rule: html_doc_unwrap_boundary
25
+ reason: the kind-aware mkr_node_raw calls it only on its HTML branch (after a MKR_DOC_XML check).
26
+ - path: ext/makiri/glue/ruby_html_node.c
27
+ rule: html_doc_unwrap_boundary
28
+ reason: mkr_html_node_unwrap resolves an HTML Document (after rejecting an XML one at the type boundary).
29
+ - path: ext/makiri/glue/ruby_html_mutate.c
30
+ rule: html_doc_unwrap_boundary
31
+ reason: HTML tree/fragment mutation; XML mutation has its own arena path.
32
+ - path: ext/makiri/glue/ruby_xpath.c
33
+ rule: html_doc_unwrap_boundary
34
+ reason: the HTML branch of mkr_xpath_context_for, entered only after the XML kind returns early.
35
+ # mkr_parsed_html_doc (asserts kind == HTML)
36
+ - path: ext/makiri/lexbor_compat/compat.h
37
+ rule: parsed_html_doc_boundary
38
+ reason: declaration of the HTML parsed-document accessor.
39
+ - path: ext/makiri/lexbor_compat/post_parse.c
40
+ rule: parsed_html_doc_boundary
41
+ reason: definition (the kind assert lives here) + HTML post-parse pipeline.
42
+ - path: ext/makiri/glue/ruby_doc.c
43
+ rule: parsed_html_doc_boundary
44
+ reason: mkr_html_doc_unwrap is defined here over the HTML parsed document.
45
+ # mkr_parsed_xml_doc (XML arena accessor) — kept out of the pure-HTML glue files
46
+ - path: ext/makiri/lexbor_compat/compat.h
47
+ rule: parsed_xml_doc_boundary
48
+ reason: declaration of the XML parsed-document accessor.
49
+ - path: ext/makiri/lexbor_compat/post_parse.c
50
+ rule: parsed_xml_doc_boundary
51
+ reason: definition + XML document wrapping/teardown.
52
+ - path: ext/makiri/glue/ruby_xml.c
53
+ rule: parsed_xml_doc_boundary
54
+ reason: XML parse entry / Document construction.
55
+ - path: ext/makiri/glue/ruby_xml_node.c
56
+ rule: parsed_xml_doc_boundary
57
+ reason: XML node read + mutation surface over the arena.
58
+ - path: ext/makiri/glue/ruby_doc.c
59
+ rule: parsed_xml_doc_boundary
60
+ reason: shared Document glue, used only on kind-checked XML branches.
61
+ - path: ext/makiri/glue/ruby_xpath.c
62
+ rule: parsed_xml_doc_boundary
63
+ reason: the XML branch of mkr_xpath_context_for (kind-checked).
64
+ - path: ext/makiri/glue/ruby_node.c
65
+ rule: parsed_xml_doc_boundary
66
+ reason: the kind-aware mkr_node_raw resolves an XML Document only after a MKR_DOC_XML check.
67
+ # owner_document (HTML-only lxb_dom_node_t field)
68
+ - path: ext/makiri/lexbor_compat/post_parse.c
69
+ rule: owner_document_boundary
70
+ reason: mkr_lxb_document_bytes resolves a Lexbor node's owner document to size its mraw pools (HTML-only; the XML serializer uses arena_bytes instead).
71
+ - path: ext/makiri/glue/ruby_html_node.c
72
+ rule: owner_document_boundary
73
+ reason: HTML node readers operate on lxb_dom_node_t.
74
+ - path: ext/makiri/glue/ruby_doc.c
75
+ rule: owner_document_boundary
76
+ reason: HTML Document operations.
77
+ - path: ext/makiri/glue/ruby_html_mutate.c
78
+ rule: owner_document_boundary
79
+ reason: HTML tree mutation over Lexbor nodes.
80
+ - path: ext/makiri/xpath/mkr_xpath_node_access_html.h
81
+ rule: owner_document_boundary
82
+ reason: the HTML monomorphization of the engine's node-access layer.
83
+ # mkr_node_raw (kind-agnostic void* raw pointer; never dereferenced as a typed node)
84
+ - path: ext/makiri/glue/glue.h
85
+ rule: node_raw_boundary
86
+ reason: declaration of the kind-agnostic accessor.
87
+ - path: ext/makiri/glue/ruby_node.c
88
+ rule: node_raw_boundary
89
+ reason: defines mkr_node_raw / mkr_node_id (node identity — the pointer is only compared, never dereferenced).
90
+ - path: ext/makiri/glue/ruby_xpath.c
91
+ rule: node_raw_boundary
92
+ reason: the XPath context node / handler-result node, where same-document (hence the representation) is verified before the engine takes the raw pointer.
93
+ - path: ext/makiri/glue/ruby_node_set.c
94
+ rule: node_raw_boundary
95
+ reason: NodeSet.new stores representation-opaque pointers; mkr_node_raw takes a seed node's pointer only after same-document validation guarantees its kind matches the set.
96
+
97
+ # --- parser-TU reader discipline (raw_scan_call / raw_cursor_member) ---
98
+ allowlist:
99
+ - path: ext/makiri/xpath/mkr_xpath_funcs_body.h
100
+ rule: raw_scan_call
101
+ max: 3
102
+ reason: "mkr_lookup_function's three strcmp over compile-time function-name tables: both sides are NUL-terminated (table literals / owned AST text), and the lookup signature takes bare const char* from the evaluator."
103
+
104
+ # --- verified_text_forge: the two Ruby-free test entry points. Neither can use
105
+ # mkr_verified_text_from_view (bridge = Ruby boundary), so each mints the
106
+ # token itself and must supply the contract by construction. ---
107
+ - path: ext/makiri/xpath/mkr_xpath_xml_selftest.c
108
+ rule: verified_text_forge
109
+ max: 2
110
+ reason: "Selftest expressions are compile-time string literals: NUL-terminated, NUL-free, valid UTF-8 by construction."
111
+ - path: ext/makiri/fuzz/xpath_fuzz.c
112
+ rule: verified_text_forge
113
+ max: 1
114
+ reason: "The libFuzzer harness mints over an owned mkr_strndup copy, which supplies NUL-termination and no-interior-NUL; UTF-8 validity is deliberately left to the lexer's strict decoder (its rejection path is fuzz-target behavior)."
@@ -0,0 +1,64 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Malloc-leak gate for the C extension (macOS only; run via `rake leaks`).
4
+ #
5
+ # ASan runs everywhere with detect_leaks=0 (Ruby and Lexbor are uninstrumented,
6
+ # so LeakSanitizer drowns in their noise) - which means plain leaks were never
7
+ # machine-checked. This gate fills that hole with macOS's `leaks` tool: it runs
8
+ # script/leaks_harness.rb (the public surface in a loop, INCLUDING rescued
9
+ # failure paths) under MallocStackLogging, then scans the leak report for
10
+ # allocation stacks that (a) pass through makiri.bundle and (b) repeat in
11
+ # proportion to the loop count - i.e. a leak per call, which is what a missing
12
+ # free on some path looks like. One-time Init allocations and Ruby's own
13
+ # at-exit/unscannable noise stay at 1-2 instances and are ignored.
14
+ #
15
+ # Found (and since fixed) by this harness: the transient fragment document
16
+ # leaked by every inner_html=/outer_html=, and the partially-built step leaked
17
+ # by every failing XPath parse.
18
+ #
19
+ # ruby script/check_leaks.rb # threshold = max(ITERATIONS / 4, 10)
20
+ # LEAKS_ITERATIONS=200 rake leaks # more iterations, sharper signal
21
+
22
+ require "rbconfig"
23
+ require "tempfile"
24
+
25
+ abort "check_leaks: the `leaks` tool is macOS-only" unless RUBY_PLATFORM.include?("darwin")
26
+
27
+ iterations = Integer(ENV.fetch("LEAKS_ITERATIONS", "120"))
28
+ threshold = [iterations / 4, 10].max
29
+ harness = File.expand_path("leaks_harness.rb", __dir__)
30
+ lib = File.expand_path("../lib", __dir__)
31
+
32
+ out = Tempfile.new("makiri-leaks") # Tempfile.new (unlike block-less Tempfile.create) deletes the report file at exit
33
+ ok = system({ "MallocStackLogging" => "1", "LEAKS_ITERATIONS" => iterations.to_s },
34
+ "leaks", "--atExit", "--",
35
+ RbConfig.ruby, "-I#{lib}", harness,
36
+ out: out.path, err: out.path)
37
+ report = File.read(out.path)
38
+ # `leaks` exits non-zero whenever ANY leak exists (Ruby itself always reports
39
+ # some at-exit noise), so the exit status is not the verdict - the scan below
40
+ # is. But the harness itself must have completed.
41
+ abort "check_leaks: harness did not complete:\n#{report[-2000..] || report}" unless report.include?("leaks harness done") # [-2000..] is nil for a short report: fall back to all of it
42
+ abort "check_leaks: no leak report produced (leaks tool failed?)" unless report.include?("STACK OF") || ok
43
+
44
+ offenders = report.split(/\n(?=STACK OF )/).filter_map do |stanza|
45
+ next unless stanza.include?("makiri.bundle")
46
+
47
+ instances = stanza[/STACK OF (\d+) INSTANCES?/, 1].to_i
48
+ next if instances < threshold
49
+
50
+ frames = stanza.lines.grep(/makiri\.bundle/).first(4).map(&:strip)
51
+ [instances, frames]
52
+ end
53
+
54
+ if offenders.empty?
55
+ puts "check_leaks: OK - no repeated (>= #{threshold}x) leak stacks through makiri.bundle " \
56
+ "(#{iterations} iterations)"
57
+ else
58
+ puts "check_leaks: FAILED - #{offenders.size} repeated leak stack(s) through makiri.bundle:"
59
+ offenders.each do |instances, frames|
60
+ puts " #{instances}x:"
61
+ frames.each { |f| puts " #{f}" }
62
+ end
63
+ exit 1
64
+ end
@@ -0,0 +1,64 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Exercise script for the malloc-leak gate (script/check_leaks.rb runs this
4
+ # under macOS `leaks --atExit`). It loops the public surface - parsing, queries,
5
+ # serialization, mutation, fragments, the Builder, XPathContext - and, crucially,
6
+ # RESCUED FAILURE paths (raises that skip cleanup are where leaks hide; both
7
+ # leaks this gate was built on lived there or in a transient-document path).
8
+ #
9
+ # A per-call leak shows up as a leak stack with ~ITERATIONS instances, which the
10
+ # driver flags; one-time/Init allocations stay at 1-2 instances and pass.
11
+
12
+ require "makiri"
13
+
14
+ ITERATIONS = Integer(ENV.fetch("LEAKS_ITERATIONS", "120"))
15
+
16
+ HTML = "<div id=m class='a b'><ul><li class=item>x</li><li>y<svg><path/></svg></li></ul><p>t&amp;</p></div>"
17
+ XML = %(<r xmlns:p="urn:p" xmlns="urn:d"><a id="1">t</a><p:b/><!--c--><![CDATA[z]]></r>)
18
+
19
+ handler = Class.new { def my_fn(nodes) = nodes.length.to_s }.new
20
+
21
+ ITERATIONS.times do |i|
22
+ # --- HTML: parse / query / serialize / mutate / fragments ---
23
+ d = Makiri::HTML(HTML)
24
+ d.css("li.item"); d.at_css("#m"); d.at_css("li").matches?("li")
25
+ begin d.css("li[") rescue Makiri::CSS::SyntaxError; end # selector syntax error (engine reset path)
26
+ d.xpath("//li"); d.at_xpath("//p"); d.xpath("count(//li)")
27
+ begin d.xpath("//li[") rescue Makiri::XPath::SyntaxError; end # parse failure (partial-AST/step cleanup)
28
+ begin d.xpath("//li", handler) rescue nil; end
29
+ d.xpath("//*[local-name()='path']")
30
+ d.to_html; d.at_css("ul").inner_html; d.to_html(pretty: true); d.text
31
+ e = d.at_css("li"); e["k#{i}"] = "v"; e.name = "li2"; e.content = "c"
32
+ e.set_attribute_ns("urn:x", "x:y", "1"); e.remove_attribute_ns("urn:x", "y")
33
+ d.at_css("ul").inner_html = "<li>new</li>" # transient fragment document path
34
+ frag = d.fragment("<b>f</b>"); d.at_css("p") << frag
35
+ Makiri::DocumentFragment.parse("<tr><td>1</td></tr>", context: "tbody")
36
+ d.at_css("p").clone_node(true); d.dup
37
+ begin d.fragment("x", context: "no-such-tag") rescue ArgumentError; end
38
+
39
+ # --- XML: parse (ok + failures) / query / serialize / mutate / Builder ---
40
+ x = Makiri::XML(XML)
41
+ begin Makiri::XML("<r>\xC3</r>".b) rescue Makiri::XML::SyntaxError; end
42
+ begin Makiri::XML("<r><a></r>") rescue Makiri::XML::SyntaxError; end
43
+ begin Makiri::XML("<r/>", max_bytes: 1) rescue Makiri::XML::LimitExceeded; end
44
+ begin Makiri::XML(%(<?xml version="1.1"?><r/>)) rescue Makiri::XML::SyntaxError; end
45
+ x.xpath("//d:a", "d" => "urn:d"); x.at_xpath("//p:b", "p" => "urn:p")
46
+ begin x.xpath("//unbound:a") rescue Makiri::Error; end
47
+ x.css("a"); x.at_css("p|b", "p" => "urn:p")
48
+ begin x.css("a[") rescue Makiri::CSS::SyntaxError; end
49
+ x.to_xml; x.to_xml(pretty: true); x.root.canonicalize
50
+ el = x.create_element("n", "t", "k" => "v"); x.root.add_child(el)
51
+ begin x.root.add_child(x.create_element("zz:q")) rescue Makiri::Error; end
52
+ x.root.children.first.replace(x.create_element("rep")); x.fragment("<f1/><f2/>")
53
+ Makiri::XML::Builder.new { |b| b.root("xmlns:d" => "urn:d") { b.item("a"); b["d"].q } }.to_xml
54
+
55
+ # --- XPathContext: AST cache / registrations / failing evaluate ---
56
+ ctx = Makiri::XPathContext.new(x)
57
+ ctx.register_namespace("d", "urn:d"); ctx.register_variable("v", "1")
58
+ ctx.evaluate("//d:a[@id=$v]"); ctx.evaluate("//d:a[@id=$v]")
59
+ begin ctx.evaluate("//(") rescue Makiri::XPath::SyntaxError; end
60
+ end
61
+
62
+ GC.start
63
+ GC.start
64
+ puts "leaks harness done (#{ITERATIONS} iterations)"
@@ -203,6 +203,12 @@ ENDFOREACH()
203
203
  ## First, need to add target for shared and static library
204
204
  IF(LEXBOR_BUILD_SHARED)
205
205
  add_library(${LEXBOR_LIB_NAME} SHARED ${LEXBOR_SOURCES})
206
+ if(UNIX) # give the shared library a version and a major-version SOVERSION
207
+ set_target_properties(${LEXBOR_LIB_NAME} PROPERTIES SOVERSION ${PROJECT_VERSION_MAJOR} VERSION ${PROJECT_VERSION})
208
+ endif()
209
+ if ((WIN32) AND (CMAKE_VERSION VERSION_GREATER_EQUAL "3.27")) # DLL_NAME_WITH_SOVERSION is only set on CMake 3.27 or newer
210
+ set_target_properties(${LEXBOR_LIB_NAME} PROPERTIES DLL_NAME_WITH_SOVERSION 1) # NOTE(review): SOVERSION is only set in the UNIX branch above - confirm this has any effect on WIN32
211
+ endif()
206
212
  target_include_directories(${LEXBOR_LIB_NAME} PUBLIC $<BUILD_INTERFACE:${LEXBOR_SOURCE}>
207
213
  $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
208
214
  target_compile_definitions(${LEXBOR_LIB_NAME} PRIVATE "LEXBOR_BUILDING")
@@ -39,6 +39,7 @@ https://lexbor.com/modules/.
39
39
  - **[SerpApi](https://serpapi.com/)** — uses Lexbor in production for HTML parsing at scale
40
40
  - **[Selectolax](https://github.com/rushter/selectolax)** — popular Python library for fast web scraping
41
41
  - **[Nokolexbor](https://github.com/serpapi/nokolexbor)** — high-performance Nokogiri alternative for Ruby
42
+ - **[Nordstjernen](https://github.com/nordstjernen-web/nordstjernen)** — Web browser written entirely in C
42
43
 
43
44
  [More bindings](#external-bindings-and-wrappers) available for Elixir, Crystal, D, Julia, Erlang.
44
45
 
@@ -215,6 +216,7 @@ The `liblexbor-html` library already contains all the pointers to the required d
215
216
 
216
217
  ## External Bindings and Wrappers
217
218
 
219
+ * [Elixir](https://github.com/dashbitco/lazy_html) Fast HTML parsing and querying. Default HTML engine in Phoenix LiveViewTest (since LiveView 1.1).
218
220
  * [Elixir](https://git.pleroma.social/pleroma/elixir-libraries/fast_html) binding for the HTML module (since 2.0 version)
219
221
  * [Erlang](https://hex.pm/packages/lexbor_erl) Fast HTML5 Parser with CSS selectors and DOM manipulation (since 2.6.0 version)
220
222
  * [Crystal](https://github.com/kostya/lexbor) Fast HTML5 Parser with CSS selectors for Crystal language
@@ -227,6 +229,16 @@ The `liblexbor-html` library already contains all the pointers to the required d
227
229
 
228
230
  You can create a binding or wrapper for the `lexbor` and place the link here!
229
231
 
232
+ ## AI Policy
233
+
234
+ Lexbor draws a clear line between its core library and the surrounding ecosystem.
235
+
236
+ **Core library** — all source code that compiles into the distributable binary — is written entirely by humans. This is an engineering choice, not an ideological one. Lexbor is a performance-critical, standards-compliant C library where every line of code must reflect a decision its author fully understands and can defend. We use AI tools in other areas of the project and see their value clearly; the core is simply not the right place for them.
237
+
238
+ AI-assisted tools are welcome and actively used for bindings, WASM builds, benchmarks, documentation, examples, tests, and other supporting work. This section is one such example — it was drafted with AI and reviewed by a human. The em dashes are a giveaway.
239
+
240
+ **For contributors:** pull requests targeting core library code are expected to be human-authored. The contributor — not an AI model — performed the reasoning, made the design choices, and wrote the code. The code you submit is yours, and you can speak to the intent and correctness of every line. We are not interested in policing workflows, but if a pull request reads like AI did the thinking, it will likely be rejected.
241
+
230
242
  ## Documentation
231
243
 
232
244
  Available on [lexbor.com](https://lexbor.com) in [Documentation](https://lexbor.com/documentation/) section.
@@ -167,7 +167,7 @@ MACRO(INCLUDE_MODULE_CONFIG pname module module_dir)
167
167
 
168
168
  IF(EXISTS "${conf_path}")
169
169
  set(CURRENT_LIB_NAME "${PROJECT_NAME}-${module}")
170
- set(CURRENT_LIB_NAME_STATIC "${PROJECT_NAME}-${module}-static")
170
+ set(CURRENT_LIB_NAME_STATIC "${PROJECT_NAME}-${module}_static")
171
171
 
172
172
  include("${conf_path}")
173
173
  ENDIF()