makiri 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/conformance.yml +22 -0
- data/.github/workflows/libfuzzer.yml +83 -0
- data/.github/workflows/release.yml +12 -7
- data/.github/workflows/security.yml +88 -3
- data/.github/workflows/valgrind.yml +135 -0
- data/CHANGELOG.md +152 -15
- data/README.md +183 -13
- data/Rakefile +294 -7
- data/ext/makiri/bridge/bridge.h +28 -0
- data/ext/makiri/bridge/ruby_string.c +282 -12
- data/ext/makiri/core/mkr_alloc.c +40 -3
- data/ext/makiri/core/mkr_alloc.h +28 -5
- data/ext/makiri/core/mkr_buf.c +47 -3
- data/ext/makiri/core/mkr_buf.h +112 -3
- data/ext/makiri/core/mkr_core.c +143 -0
- data/ext/makiri/core/mkr_core.h +11 -2
- data/ext/makiri/core/mkr_hash.h +1 -1
- data/ext/makiri/core/mkr_span.h +186 -0
- data/ext/makiri/core/mkr_text.h +8 -8
- data/ext/makiri/core/mkr_utf8.c +101 -0
- data/ext/makiri/core/mkr_utf8.h +88 -0
- data/ext/makiri/extconf.rb +123 -10
- data/ext/makiri/fuzz/Makefile +95 -0
- data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
- data/ext/makiri/fuzz/xml_fuzz.c +24 -0
- data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
- data/ext/makiri/glue/glue.h +55 -11
- data/ext/makiri/glue/ruby_doc.c +129 -59
- data/ext/makiri/glue/ruby_html_css.c +292 -0
- data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +248 -52
- data/ext/makiri/glue/ruby_html_node.c +859 -0
- data/ext/makiri/glue/ruby_html_serialize.c +154 -0
- data/ext/makiri/glue/ruby_node.c +74 -729
- data/ext/makiri/glue/ruby_node_set.c +167 -32
- data/ext/makiri/glue/ruby_xml.c +602 -0
- data/ext/makiri/glue/ruby_xml_node.c +1373 -0
- data/ext/makiri/glue/ruby_xpath.c +63 -30
- data/ext/makiri/glue/ruby_xpath.h +19 -0
- data/ext/makiri/lexbor_compat/compat.h +42 -9
- data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
- data/ext/makiri/lexbor_compat/dom_index.c +2 -2
- data/ext/makiri/lexbor_compat/post_parse.c +100 -10
- data/ext/makiri/lexbor_compat/source_loc.c +15 -13
- data/ext/makiri/lexbor_compat/text_index.c +14 -8
- data/ext/makiri/lexbor_compat/utf8_input.c +19 -33
- data/ext/makiri/makiri.c +184 -6
- data/ext/makiri/makiri.h +43 -2
- data/ext/makiri/xml/mkr_xml.h +125 -0
- data/ext/makiri/xml/mkr_xml_chars.c +195 -0
- data/ext/makiri/xml/mkr_xml_index.c +169 -0
- data/ext/makiri/xml/mkr_xml_index.h +48 -0
- data/ext/makiri/xml/mkr_xml_mutate.c +817 -0
- data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
- data/ext/makiri/xml/mkr_xml_node.c +399 -0
- data/ext/makiri/xml/mkr_xml_node.h +184 -0
- data/ext/makiri/xml/mkr_xml_tree.c +1515 -0
- data/ext/makiri/xpath/mkr_css.c +1023 -0
- data/ext/makiri/xpath/mkr_css.h +65 -0
- data/ext/makiri/xpath/mkr_xpath.c +96 -32
- data/ext/makiri/xpath/mkr_xpath.h +109 -4
- data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
- data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
- data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +551 -241
- data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +318 -276
- data/ext/makiri/xpath/mkr_xpath_internal.h +177 -206
- data/ext/makiri/xpath/mkr_xpath_lex.c +95 -125
- data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
- data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +145 -0
- data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
- data/ext/makiri/xpath/mkr_xpath_parse.c +83 -94
- data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
- data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
- data/ext/makiri/xpath/mkr_xpath_shared.c +609 -0
- data/ext/makiri/xpath/mkr_xpath_value_body.h +801 -0
- data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
- data/lib/makiri/{attribute.rb → attr.rb} +7 -3
- data/lib/makiri/cdata_section.rb +19 -0
- data/lib/makiri/comment.rb +10 -0
- data/lib/makiri/compat_aliases.rb +30 -0
- data/lib/makiri/document.rb +9 -73
- data/lib/makiri/document_fragment.rb +14 -9
- data/lib/makiri/element.rb +4 -4
- data/lib/makiri/html/document.rb +106 -0
- data/lib/makiri/html/node_methods.rb +19 -0
- data/lib/makiri/html.rb +12 -0
- data/lib/makiri/node.rb +58 -15
- data/lib/makiri/node_set.rb +8 -0
- data/lib/makiri/processing_instruction.rb +10 -0
- data/lib/makiri/text.rb +1 -1
- data/lib/makiri/version.rb +1 -1
- data/lib/makiri/xml/builder.rb +263 -0
- data/lib/makiri/xml/document.rb +24 -0
- data/lib/makiri/xml/node_methods.rb +84 -0
- data/lib/makiri/xml.rb +10 -0
- data/lib/makiri/xpath_context.rb +1 -1
- data/lib/makiri.rb +24 -5
- data/script/build_native_gem.rb +2 -2
- data/script/check_alloc_failures.rb +266 -0
- data/script/check_c_safety.rb +77 -2
- data/script/check_c_safety_allowlist.yml +102 -0
- data/script/check_leaks.rb +64 -0
- data/script/leaks_harness.rb +64 -0
- data/vendor/lexbor/CMakeLists.txt +6 -0
- data/vendor/lexbor/README.md +12 -0
- data/vendor/lexbor/config.cmake +1 -1
- data/vendor/lexbor/source/lexbor/core/base.h +1 -1
- data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
- data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
- data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
- data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
- data/vendor/lexbor/source/lexbor/html/base.h +1 -1
- data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
- data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
- data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
- data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
- data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
- data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
- data/vendor/lexbor/source/lexbor/url/base.h +1 -1
- data/vendor/lexbor/source/lexbor/url/url.c +5 -2
- data/vendor/lexbor/source/lexbor/url/url.h +9 -0
- data/vendor/lexbor/version +1 -1
- metadata +53 -9
- data/ext/makiri/glue/ruby_css.c +0 -185
- data/ext/makiri/glue/ruby_serialize.c +0 -92
- data/ext/makiri/xpath/mkr_xpath_value.c +0 -1286
- data/lib/makiri/cdata.rb +0 -6
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Allocation-failure injection sweep for the C extension (run via `rake oom`).
|
|
4
|
+
#
|
|
5
|
+
# The sanitizers and the leak gate prove the happy path is memory-safe; neither
|
|
6
|
+
# proves the OOM branches are CORRECT. Makiri's contract is fail-closed: when a
|
|
7
|
+
# core C allocation fails, a call must either raise a clean Makiri::Error /
|
|
8
|
+
# NoMemoryError or complete with the exact same result as the unfailed run -
|
|
9
|
+
# never a truncated/partial result (the property the XPath node-set caps and
|
|
10
|
+
# the "build OOM -> walk fallback" designs exist for). This gate machine-checks
|
|
11
|
+
# that contract: with the ext built under MAKIRI_ALLOC_INJECT=1, every core
|
|
12
|
+
# allocation site routes through a hook that can be armed to fail the nth
|
|
13
|
+
# attempt once. For each representative workload we record a failure-free
|
|
14
|
+
# BASELINE result and the total number of allocation attempts, then re-run the
|
|
15
|
+
# workload once per allocation site with exactly that site failing, and verify
|
|
16
|
+
# each run either raised cleanly or returned a baseline-identical value.
|
|
17
|
+
#
|
|
18
|
+
# A segfault/abort kills this process; the caller (rake/CI) sees the nonzero
|
|
19
|
+
# exit, which is the verdict too.
|
|
20
|
+
#
|
|
21
|
+
# bundle exec rake oom # rebuild + sweep
|
|
22
|
+
# bundle exec ruby -Ilib script/check_alloc_failures.rb # sweep current build
|
|
23
|
+
|
|
24
|
+
require "makiri"
|
|
25
|
+
|
|
26
|
+
unless Makiri.send(:__alloc_inject?)
|
|
27
|
+
abort "check_alloc_failures: extension built without the injection hook - " \
|
|
28
|
+
"rebuild with MAKIRI_ALLOC_INJECT=1 (`rake oom` does this)"
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
# Each scenario runs one workload END-TO-END and returns a canonical String, so
|
|
32
|
+
# an injected run's result can be compared (==) against the baseline. Fixtures
|
|
33
|
+
# are built INSIDE the lambda (unless reuse is the point) so the sweep covers
|
|
34
|
+
# their parse/build allocations too.
|
|
35
|
+
SCENARIOS = {
|
|
36
|
+
# XML parse covering the syntax surface: declaration, DOCTYPE (SYSTEM id +
|
|
37
|
+
# internal subset), default + prefixed namespaces, prefixed attributes,
|
|
38
|
+
# references, comment, CDATA, PI, nesting, CRLF normalization in an attr.
|
|
39
|
+
"xml_parse" => lambda do
|
|
40
|
+
src = <<~XML
|
|
41
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
42
|
+
<!DOCTYPE root SYSTEM "urn:example:dtd" [<!ENTITY local "subset">]>
|
|
43
|
+
<root xmlns="urn:d" xmlns:p="urn:p" p:pa="pv" mixed="A&A x\r\ny">
|
|
44
|
+
<!-- a comment -->
|
|
45
|
+
<p:branch><leaf depth="2">text & A refs</leaf></p:branch>
|
|
46
|
+
<![CDATA[raw < cdata & bytes]]>
|
|
47
|
+
<?pi-target some data?>
|
|
48
|
+
<empty/>
|
|
49
|
+
</root>
|
|
50
|
+
XML
|
|
51
|
+
Makiri::XML::Document.parse(src).to_xml
|
|
52
|
+
end,
|
|
53
|
+
|
|
54
|
+
# Fragment parse + serialize + splice into a host document.
|
|
55
|
+
"xml_fragment" => lambda do
|
|
56
|
+
doc = Makiri::XML::Document.parse("<r xmlns='urn:d'><keep>k</keep></r>")
|
|
57
|
+
frag = doc.fragment("<a xmlns:p='u'><p:b>t</p:b></a>text")
|
|
58
|
+
out = frag.to_xml
|
|
59
|
+
doc.root.add_child(frag)
|
|
60
|
+
out + doc.to_xml
|
|
61
|
+
end,
|
|
62
|
+
|
|
63
|
+
# XPath battery: predicates, functions, a union, axes, plus an XPathContext
|
|
64
|
+
# evaluation with a registered namespace and variable.
|
|
65
|
+
"xml_xpath" => lambda do
|
|
66
|
+
doc = Makiri::XML::Document.parse(<<~XML)
|
|
67
|
+
<root xmlns:p="urn:p">
|
|
68
|
+
<a v="1">alpha</a>
|
|
69
|
+
<a v="2">beta</a>
|
|
70
|
+
<b n="3"> gamma delta </b>
|
|
71
|
+
<p:c><a v="9">nested</a></p:c>
|
|
72
|
+
</root>
|
|
73
|
+
XML
|
|
74
|
+
canon = lambda do |r|
|
|
75
|
+
r.is_a?(Makiri::NodeSet) ? r.map(&:to_xml).join("|") : r.inspect
|
|
76
|
+
end
|
|
77
|
+
exprs = [
|
|
78
|
+
"//a[@v='2']",
|
|
79
|
+
"//a[position()=2]",
|
|
80
|
+
"//a[last()]",
|
|
81
|
+
"count(//a)",
|
|
82
|
+
"sum(//b/@n)",
|
|
83
|
+
"concat(string(//a[1]), '-', substring('abcdef', 2, 3))",
|
|
84
|
+
"translate('abc', 'abc', 'xyz')",
|
|
85
|
+
"normalize-space(//b)",
|
|
86
|
+
"contains(//a[1], 'alp')",
|
|
87
|
+
"starts-with(//b, ' g')",
|
|
88
|
+
"//a | //b",
|
|
89
|
+
"//a[1]/ancestor::root",
|
|
90
|
+
"//a[1]/following-sibling::b",
|
|
91
|
+
"//root/descendant-or-self::a",
|
|
92
|
+
]
|
|
93
|
+
parts = exprs.map { |e| canon.call(doc.xpath(e)) }
|
|
94
|
+
ctx = Makiri::XPathContext.new(doc)
|
|
95
|
+
ctx.register_namespace("p", "urn:p")
|
|
96
|
+
ctx.register_variable("want", "9")
|
|
97
|
+
parts << canon.call(ctx.evaluate("//p:c/a[@v=$want]"))
|
|
98
|
+
parts.join("\n")
|
|
99
|
+
end,
|
|
100
|
+
|
|
101
|
+
# Same battery shape over an HTML5-parsed document.
|
|
102
|
+
"html_xpath" => lambda do
|
|
103
|
+
doc = Makiri::HTML::Document.parse(<<~HTML)
|
|
104
|
+
<html><body>
|
|
105
|
+
<div id="top"><p class="x">one</p><p class="y">two</p></div>
|
|
106
|
+
<ul><li data-n="1">a</li><li data-n="2"> b c </li></ul>
|
|
107
|
+
</body></html>
|
|
108
|
+
HTML
|
|
109
|
+
canon = lambda do |r|
|
|
110
|
+
r.is_a?(Makiri::NodeSet) ? r.map(&:to_html).join("|") : r.inspect
|
|
111
|
+
end
|
|
112
|
+
exprs = [
|
|
113
|
+
"//p[@class='y']",
|
|
114
|
+
"//li[position()=2]",
|
|
115
|
+
"//li[last()]",
|
|
116
|
+
"count(//p)",
|
|
117
|
+
"sum(//li/@data-n)",
|
|
118
|
+
"concat(string(//p[1]), '+', substring(//p[2], 1, 2))",
|
|
119
|
+
"translate('one', 'one', 'uno')",
|
|
120
|
+
"normalize-space(//li[2])",
|
|
121
|
+
"contains(//p[1], 'on')",
|
|
122
|
+
"starts-with(//p[2], 'tw')",
|
|
123
|
+
"//p | //li",
|
|
124
|
+
"//p[1]/ancestor::div",
|
|
125
|
+
"//p[1]/following-sibling::p",
|
|
126
|
+
"//div/descendant-or-self::p",
|
|
127
|
+
]
|
|
128
|
+
parts = exprs.map { |e| canon.call(doc.xpath(e)) }
|
|
129
|
+
ctx = Makiri::XPathContext.new(doc)
|
|
130
|
+
ctx.register_variable("cls", "x")
|
|
131
|
+
parts << canon.call(ctx.evaluate("//p[@class=$cls]"))
|
|
132
|
+
parts.join("\n")
|
|
133
|
+
end,
|
|
134
|
+
|
|
135
|
+
# The mutation surface: create_*, insertion on every side, rename, content,
|
|
136
|
+
# attributes, replace, remove, and a fragment splice.
|
|
137
|
+
"xml_mutate" => lambda do
|
|
138
|
+
doc = Makiri::XML::Document.parse("<root><old>x</old><gone/></root>")
|
|
139
|
+
el = doc.create_element("made")
|
|
140
|
+
el.add_child(doc.create_text_node("inner"))
|
|
141
|
+
el["k"] = "v"
|
|
142
|
+
doc.root.add_child(el)
|
|
143
|
+
el.name = "renamed"
|
|
144
|
+
el.content = "rewritten"
|
|
145
|
+
el.add_previous_sibling(doc.create_element("before"))
|
|
146
|
+
el.add_next_sibling(doc.create_element("after"))
|
|
147
|
+
doc.root.at_xpath("old").replace(doc.create_element("new"))
|
|
148
|
+
doc.root.at_xpath("gone").remove
|
|
149
|
+
doc.root.add_child(doc.fragment("<f1/>tail<f2 a='b'/>"))
|
|
150
|
+
doc.to_xml
|
|
151
|
+
end,
|
|
152
|
+
|
|
153
|
+
# Serialization over a non-trivial tree (~200 elements built inside the
|
|
154
|
+
# lambda, so the parse is swept too), both tree and deep serializers.
|
|
155
|
+
"html_serialize" => lambda do
|
|
156
|
+
body = (1..50).map { |i|
|
|
157
|
+
"<div id='d#{i}' class='row'><p>cell #{i}</p><span>tail & #{i}</span></div>"
|
|
158
|
+
}.join
|
|
159
|
+
doc = Makiri::HTML::Document.parse("<html><body>#{body}</body></html>")
|
|
160
|
+
doc.to_html + doc.at_css("body").inner_html
|
|
161
|
+
end,
|
|
162
|
+
|
|
163
|
+
# Full-document text extraction (exercises the text-index build, and its
|
|
164
|
+
# fail-closed OOM -> walk fallback).
|
|
165
|
+
"html_text" => lambda do
|
|
166
|
+
body = (1..80).map { |i| "<p>para #{i} <em>em#{i}</em> tail</p>" }.join
|
|
167
|
+
doc = Makiri::HTML::Document.parse("<html><body>#{body}</body></html>")
|
|
168
|
+
doc.text
|
|
169
|
+
end,
|
|
170
|
+
|
|
171
|
+
# CSS: a comma list with combinators through the reused engine, plus the
|
|
172
|
+
# at_css first-match path.
|
|
173
|
+
"css" => lambda do
|
|
174
|
+
doc = Makiri::HTML::Document.parse(<<~HTML)
|
|
175
|
+
<html><body>
|
|
176
|
+
<p class="c">one</p><p>skip</p><p class="c">two</p>
|
|
177
|
+
<div><span>in</span></div><span>out</span>
|
|
178
|
+
<section id="x">target</section>
|
|
179
|
+
</body></html>
|
|
180
|
+
HTML
|
|
181
|
+
doc.css("p.c, div > span").map { |n| n.name }.join(",") +
|
|
182
|
+
doc.at_css("#x")&.name.to_s
|
|
183
|
+
end,
|
|
184
|
+
|
|
185
|
+
# The Builder DSL (pure Ruby over create_*/add_child, so this sweeps the
|
|
186
|
+
# construction factories).
|
|
187
|
+
"xml_builder" => lambda do
|
|
188
|
+
Makiri::XML::Builder.new do |xml|
|
|
189
|
+
xml.feed("xmlns" => "urn:a", "xmlns:dc" => "urn:dc") do
|
|
190
|
+
xml.entry do
|
|
191
|
+
xml.title("Hello")
|
|
192
|
+
xml["dc"].id_("42")
|
|
193
|
+
xml.cdata("a < b")
|
|
194
|
+
xml.comment(" note ")
|
|
195
|
+
end
|
|
196
|
+
end
|
|
197
|
+
end.to_xml
|
|
198
|
+
end,
|
|
199
|
+
}.freeze
|
|
200
|
+
|
|
201
|
+
ALLOWED = [Makiri::Error, NoMemoryError].freeze
|
|
202
|
+
TRUNCATE = 120
|
|
203
|
+
|
|
204
|
+
def disarm = Makiri.send(:__alloc_inject, 0)
|
|
205
|
+
|
|
206
|
+
failures_total = 0
|
|
207
|
+
|
|
208
|
+
SCENARIOS.each do |name, work|
|
|
209
|
+
# Warm twice with injection off: process-global engines (CSS) and lazy
|
|
210
|
+
# builds settle, so the counted run below is representative and stable.
|
|
211
|
+
disarm
|
|
212
|
+
2.times { work.call }
|
|
213
|
+
|
|
214
|
+
# Counted baseline run: __alloc_inject(0) also resets the counter, so the
|
|
215
|
+
# `__alloc_inject_calls` reading taken right after is exactly this run's allocation-attempt total.
|
|
216
|
+
disarm
|
|
217
|
+
baseline = work.call
|
|
218
|
+
total = Makiri.send(:__alloc_inject_calls)
|
|
219
|
+
|
|
220
|
+
ok_raised = 0
|
|
221
|
+
ok_identical = 0
|
|
222
|
+
failures = []
|
|
223
|
+
|
|
224
|
+
(1..total).each do |n|
|
|
225
|
+
Makiri.send(:__alloc_inject, n)
|
|
226
|
+
begin
|
|
227
|
+
result = work.call
|
|
228
|
+
if result == baseline
|
|
229
|
+
ok_identical += 1
|
|
230
|
+
else
|
|
231
|
+
failures << [n, "truncated/wrong result",
|
|
232
|
+
"baseline=#{baseline.to_s[0, TRUNCATE].inspect} " \
|
|
233
|
+
"got=#{result.to_s[0, TRUNCATE].inspect}"]
|
|
234
|
+
end
|
|
235
|
+
rescue *ALLOWED
|
|
236
|
+
ok_raised += 1
|
|
237
|
+
rescue Exception => e # rubocop:disable Lint/RescueException -- the wrong class IS the finding
|
|
238
|
+
failures << [n, "wrong exception class",
|
|
239
|
+
"#{e.class}: #{e.message.to_s[0, TRUNCATE]}"]
|
|
240
|
+
ensure
|
|
241
|
+
disarm
|
|
242
|
+
end
|
|
243
|
+
end
|
|
244
|
+
|
|
245
|
+
failures_total += failures.size
|
|
246
|
+
puts format("%-16s allocations=%-5d raised=%-5d identical=%-5d failed=%d",
|
|
247
|
+
name, total, ok_raised, ok_identical, failures.size)
|
|
248
|
+
failures.each do |n, kind, detail|
|
|
249
|
+
puts " n=#{n} #{kind}: #{detail}"
|
|
250
|
+
end
|
|
251
|
+
if total.zero?
|
|
252
|
+
# A scenario that never reaches a core allocation sweeps nothing - that is
|
|
253
|
+
# a broken scenario, not a pass.
|
|
254
|
+
failures_total += 1
|
|
255
|
+
puts " scenario performed ZERO core allocations - workload not reaching the C core"
|
|
256
|
+
end
|
|
257
|
+
end
|
|
258
|
+
|
|
259
|
+
if failures_total.zero?
|
|
260
|
+
puts "check_alloc_failures: OK - every injected allocation failure failed closed " \
|
|
261
|
+
"(clean raise or baseline-identical result)"
|
|
262
|
+
else
|
|
263
|
+
puts "check_alloc_failures: FAILED - #{failures_total} injected failure(s) " \
|
|
264
|
+
"produced a wrong exception or a non-baseline result"
|
|
265
|
+
exit 1
|
|
266
|
+
end
|
data/script/check_c_safety.rb
CHANGED
|
@@ -8,9 +8,26 @@ require "yaml"
|
|
|
8
8
|
ROOT = Pathname.new(__dir__).join("..").expand_path
|
|
9
9
|
ALLOWLIST_PATH = ROOT.join("script/check_c_safety_allowlist.yml")
|
|
10
10
|
|
|
11
|
-
|
|
11
|
+
# A rule may carry `paths` (an array of path globs): it then applies ONLY to
|
|
12
|
+
# matching files. Used for the parser-TU reader discipline, where the ban is
|
|
13
|
+
# meaningful only in TUs whose input reads must go through mkr_span_t.
|
|
14
|
+
Rule = Struct.new(:id, :message, :regex, :paths, keyword_init: true)
|
|
12
15
|
Finding = Struct.new(:path, :line, :rule, :text, keyword_init: true)
|
|
13
16
|
|
|
17
|
+
# Byte-scanning parser TUs: every input read goes through the bounded reader
|
|
18
|
+
# (core/mkr_span.h) - see its header comment. The rules below turn that from a
|
|
19
|
+
# convention into a machine-enforced invariant for these files.
|
|
20
|
+
PARSER_TUS = %w[
|
|
21
|
+
ext/makiri/xml/mkr_xml_tree.c
|
|
22
|
+
ext/makiri/xml/mkr_xml_chars.c
|
|
23
|
+
ext/makiri/xml/mkr_xml_node.c
|
|
24
|
+
ext/makiri/xpath/mkr_xpath_lex.c
|
|
25
|
+
ext/makiri/xpath/mkr_xpath_funcs_body.h
|
|
26
|
+
ext/makiri/xpath/mkr_xpath_value_body.h
|
|
27
|
+
ext/makiri/bridge/ruby_string.c
|
|
28
|
+
ext/makiri/lexbor_compat/source_loc.c
|
|
29
|
+
].freeze
|
|
30
|
+
|
|
14
31
|
RULES = [
|
|
15
32
|
Rule.new(
|
|
16
33
|
id: "string_value_cstr",
|
|
@@ -60,7 +77,64 @@ RULES = [
|
|
|
60
77
|
Rule.new(
|
|
61
78
|
id: "verified_text_forge",
|
|
62
79
|
message: "mkr_verified_text_t must be minted only by mkr_verified_text_from_view (the validated boundary)",
|
|
63
|
-
|
|
80
|
+
# Both forge shapes: the compound-literal cast AND the declaration
|
|
81
|
+
# initializer (`mkr_verified_text_t x = {...}`), which the cast-only regex
|
|
82
|
+
# used to miss - that gap let a fuzz harness mint over a non-NUL-terminated
|
|
83
|
+
# buffer unnoticed.
|
|
84
|
+
regex: /\(\s*mkr_verified_text_t\s*\)\s*\{|\bmkr_verified_text_t\s+\w+\s*=\s*\{/
|
|
85
|
+
),
|
|
86
|
+
# --- HTML/XML representation boundary (see docs/html_xml_boundary_hardening) ---
|
|
87
|
+
# These symbols assume one DOM representation; using them outside their
|
|
88
|
+
# representation-correct / kind-checked home is how shared glue (XPath, NodeSet,
|
|
89
|
+
# node identity) silently treats an XML node as HTML (or vice versa) - an
|
|
90
|
+
# assert-abort or memory type-confusion. Each is allowlisted only in the files
|
|
91
|
+
# that legitimately own it; anywhere else trips the lint.
|
|
92
|
+
Rule.new(
|
|
93
|
+
id: "html_doc_unwrap_boundary",
|
|
94
|
+
message: "mkr_html_doc_unwrap is HTML-only; shared/XML code must use the kind-aware mkr_node_unwrap",
|
|
95
|
+
regex: /\bmkr_html_doc_unwrap\s*\(/
|
|
96
|
+
),
|
|
97
|
+
Rule.new(
|
|
98
|
+
id: "parsed_html_doc_boundary",
|
|
99
|
+
message: "mkr_parsed_html_doc (asserts kind==HTML) may only be used in a kind-checked / HTML-only site",
|
|
100
|
+
regex: /\bmkr_parsed_html_doc\s*\(/
|
|
101
|
+
),
|
|
102
|
+
Rule.new(
|
|
103
|
+
id: "parsed_xml_doc_boundary",
|
|
104
|
+
message: "mkr_parsed_xml_doc may only be used in a kind-checked / XML-representation site",
|
|
105
|
+
regex: /\bmkr_parsed_xml_doc\s*\(/
|
|
106
|
+
),
|
|
107
|
+
Rule.new(
|
|
108
|
+
id: "owner_document_boundary",
|
|
109
|
+
message: "owner_document is an HTML-only lxb field; shared code must compare documents via mkr_node_document",
|
|
110
|
+
regex: /\bowner_document\b/
|
|
111
|
+
),
|
|
112
|
+
Rule.new(
|
|
113
|
+
id: "node_raw_boundary",
|
|
114
|
+
message: "mkr_node_raw is the kind-agnostic raw pointer (identity / kind-guaranteed only); " \
|
|
115
|
+
"to dereference a node use mkr_html_node or mkr_xml_node_unwrap (kind-checked)",
|
|
116
|
+
regex: /\bmkr_node_raw\s*\(/
|
|
117
|
+
),
|
|
118
|
+
# --- parser-TU reader discipline (see core/mkr_span.h) ---
|
|
119
|
+
# In the byte-scanning parser TUs every input read must go through the
|
|
120
|
+
# bounded reader: a raw libc scan reintroduces the "forgot the bounds check"
|
|
121
|
+
# class the span made structurally impossible. memcpy stays allowed (an
|
|
122
|
+
# explicit-length copy, not a scan).
|
|
123
|
+
Rule.new(
|
|
124
|
+
id: "raw_scan_call",
|
|
125
|
+
message: "parser TUs must read input through mkr_span_* / mkr_bytes_eq / mkr_utf8_* " \
|
|
126
|
+
"(core), not raw libc scanning",
|
|
127
|
+
regex: /\b(?:memchr|memcmp|strchr|strrchr|strstr|strn?cmp|strcspn|strspn|strpbrk|strtod|strtol|strtoull?|sscanf)\s*\(/,
|
|
128
|
+
paths: PARSER_TUS
|
|
129
|
+
),
|
|
130
|
+
# The span's own cursor/bound fields are private to core/mkr_span.h: touching
|
|
131
|
+
# `.p` / `.end` in a parser TU is how a hand-rolled (uncovered) cursor starts.
|
|
132
|
+
Rule.new(
|
|
133
|
+
id: "raw_cursor_member",
|
|
134
|
+
message: "parser TUs must not access a span's .p/.end (or keep a raw cursor struct); " \
|
|
135
|
+
"use the mkr_span_* helpers (mark/since for slice capture)",
|
|
136
|
+
regex: /(?:->|\.)\s*(?:p|end)\b/,
|
|
137
|
+
paths: PARSER_TUS
|
|
64
138
|
),
|
|
65
139
|
].freeze
|
|
66
140
|
|
|
@@ -131,6 +205,7 @@ def scan_findings(ignores)
|
|
|
131
205
|
next [] unless code_line?(line)
|
|
132
206
|
|
|
133
207
|
RULES.filter_map do |rule|
|
|
208
|
+
next if rule.paths && rule.paths.none? { |pat| path_matches?(pat, rel) }
|
|
134
209
|
next unless line.match?(rule.regex)
|
|
135
210
|
next if rule_ignored?(rel, rule.id, ignores)
|
|
136
211
|
|
|
@@ -10,3 +10,105 @@ ignore_paths:
|
|
|
10
10
|
- path: ext/makiri/bridge/text_token.c
|
|
11
11
|
rule: verified_text_forge
|
|
12
12
|
reason: mkr_verified_text_from_view, the sole sanctioned mint of mkr_verified_text_t; its input view is already validated by the bridge string helpers.
|
|
13
|
+
# --- HTML/XML representation boundary: each per-rep symbol is exempt only in
|
|
14
|
+
# the files that own it (declaration, definition, or a kind-checked use), so
|
|
15
|
+
# a new use anywhere else — especially shared glue — trips the lint ---
|
|
16
|
+
# mkr_html_doc_unwrap (HTML-only document unwrap)
|
|
17
|
+
- path: ext/makiri/glue/glue.h
|
|
18
|
+
rule: html_doc_unwrap_boundary
|
|
19
|
+
reason: declaration of the HTML-only document unwrap.
|
|
20
|
+
- path: ext/makiri/glue/ruby_doc.c
|
|
21
|
+
rule: html_doc_unwrap_boundary
|
|
22
|
+
reason: definition + HTML Document methods (parse/serialize/title/compat_mode) — all HTML-only.
|
|
23
|
+
- path: ext/makiri/glue/ruby_node.c
|
|
24
|
+
rule: html_doc_unwrap_boundary
|
|
25
|
+
reason: the kind-aware mkr_node_raw calls it only on its HTML branch (after a MKR_DOC_XML check).
|
|
26
|
+
- path: ext/makiri/glue/ruby_html_node.c
|
|
27
|
+
rule: html_doc_unwrap_boundary
|
|
28
|
+
reason: mkr_html_node_unwrap resolves an HTML Document (after rejecting an XML one at the type boundary).
|
|
29
|
+
- path: ext/makiri/glue/ruby_html_mutate.c
|
|
30
|
+
rule: html_doc_unwrap_boundary
|
|
31
|
+
reason: HTML tree/fragment mutation; XML mutation has its own arena path.
|
|
32
|
+
- path: ext/makiri/glue/ruby_xpath.c
|
|
33
|
+
rule: html_doc_unwrap_boundary
|
|
34
|
+
reason: the HTML branch of mkr_xpath_context_for, entered only after the XML kind returns early.
|
|
35
|
+
# mkr_parsed_html_doc (asserts kind == HTML)
|
|
36
|
+
- path: ext/makiri/lexbor_compat/compat.h
|
|
37
|
+
rule: parsed_html_doc_boundary
|
|
38
|
+
reason: declaration of the HTML parsed-document accessor.
|
|
39
|
+
- path: ext/makiri/lexbor_compat/post_parse.c
|
|
40
|
+
rule: parsed_html_doc_boundary
|
|
41
|
+
reason: definition (the kind assert lives here) + HTML post-parse pipeline.
|
|
42
|
+
- path: ext/makiri/glue/ruby_doc.c
|
|
43
|
+
rule: parsed_html_doc_boundary
|
|
44
|
+
reason: mkr_html_doc_unwrap is defined here over the HTML parsed document.
|
|
45
|
+
# mkr_parsed_xml_doc (XML arena accessor) — kept out of the pure-HTML glue files
|
|
46
|
+
- path: ext/makiri/lexbor_compat/compat.h
|
|
47
|
+
rule: parsed_xml_doc_boundary
|
|
48
|
+
reason: declaration of the XML parsed-document accessor.
|
|
49
|
+
- path: ext/makiri/lexbor_compat/post_parse.c
|
|
50
|
+
rule: parsed_xml_doc_boundary
|
|
51
|
+
reason: definition + XML document wrapping/teardown.
|
|
52
|
+
- path: ext/makiri/glue/ruby_xml.c
|
|
53
|
+
rule: parsed_xml_doc_boundary
|
|
54
|
+
reason: XML parse entry / Document construction.
|
|
55
|
+
- path: ext/makiri/glue/ruby_xml_node.c
|
|
56
|
+
rule: parsed_xml_doc_boundary
|
|
57
|
+
reason: XML node read + mutation surface over the arena.
|
|
58
|
+
- path: ext/makiri/glue/ruby_doc.c
|
|
59
|
+
rule: parsed_xml_doc_boundary
|
|
60
|
+
reason: shared Document glue, used only on kind-checked XML branches.
|
|
61
|
+
- path: ext/makiri/glue/ruby_xpath.c
|
|
62
|
+
rule: parsed_xml_doc_boundary
|
|
63
|
+
reason: the XML branch of mkr_xpath_context_for (kind-checked).
|
|
64
|
+
- path: ext/makiri/glue/ruby_node.c
|
|
65
|
+
rule: parsed_xml_doc_boundary
|
|
66
|
+
reason: the kind-aware mkr_node_raw resolves an XML Document only after a MKR_DOC_XML check.
|
|
67
|
+
# owner_document (HTML-only lxb_dom_node_t field)
|
|
68
|
+
- path: ext/makiri/lexbor_compat/post_parse.c
|
|
69
|
+
rule: owner_document_boundary
|
|
70
|
+
reason: mkr_lxb_document_bytes resolves a Lexbor node's owner document to size its mraw pools (HTML-only; the XML serializer uses arena_bytes instead).
|
|
71
|
+
- path: ext/makiri/glue/ruby_html_node.c
|
|
72
|
+
rule: owner_document_boundary
|
|
73
|
+
reason: HTML node readers operate on lxb_dom_node_t.
|
|
74
|
+
- path: ext/makiri/glue/ruby_doc.c
|
|
75
|
+
rule: owner_document_boundary
|
|
76
|
+
reason: HTML Document operations.
|
|
77
|
+
- path: ext/makiri/glue/ruby_html_mutate.c
|
|
78
|
+
rule: owner_document_boundary
|
|
79
|
+
reason: HTML tree mutation over Lexbor nodes.
|
|
80
|
+
- path: ext/makiri/xpath/mkr_xpath_node_access_html.h
|
|
81
|
+
rule: owner_document_boundary
|
|
82
|
+
reason: the HTML monomorphization of the engine's node-access layer.
|
|
83
|
+
# mkr_node_raw (kind-agnostic void* raw pointer; never dereferenced as a typed node)
|
|
84
|
+
- path: ext/makiri/glue/glue.h
|
|
85
|
+
rule: node_raw_boundary
|
|
86
|
+
reason: declaration of the kind-agnostic accessor.
|
|
87
|
+
- path: ext/makiri/glue/ruby_node.c
|
|
88
|
+
rule: node_raw_boundary
|
|
89
|
+
reason: defines mkr_node_raw / mkr_node_id (node identity — the pointer is only compared, never dereferenced).
|
|
90
|
+
- path: ext/makiri/glue/ruby_xpath.c
|
|
91
|
+
rule: node_raw_boundary
|
|
92
|
+
reason: the XPath context node / handler-result node, where same-document (hence the representation) is verified before the engine takes the raw pointer.
|
|
93
|
+
- path: ext/makiri/glue/ruby_node_set.c
|
|
94
|
+
rule: node_raw_boundary
|
|
95
|
+
reason: NodeSet.new stores representation-opaque pointers; mkr_node_raw takes a seed node's pointer only after same-document validation guarantees its kind matches the set.
|
|
96
|
+
|
|
97
|
+
# --- parser-TU reader discipline (raw_scan_call / raw_cursor_member) ---
|
|
98
|
+
allowlist:
|
|
99
|
+
- path: ext/makiri/xpath/mkr_xpath_funcs_body.h
|
|
100
|
+
rule: raw_scan_call
|
|
101
|
+
max: 3
|
|
102
|
+
reason: "mkr_lookup_function's three strcmp over compile-time function-name tables: both sides are NUL-terminated (table literals / owned AST text), and the lookup signature takes bare const char* from the evaluator."
|
|
103
|
+
|
|
104
|
+
# --- verified_text_forge: the two Ruby-free test entry points. Neither can use
|
|
105
|
+
# mkr_verified_text_from_view (bridge = Ruby boundary), so each mints the
|
|
106
|
+
# token itself and must supply the contract by construction. ---
|
|
107
|
+
- path: ext/makiri/xpath/mkr_xpath_xml_selftest.c
|
|
108
|
+
rule: verified_text_forge
|
|
109
|
+
max: 2
|
|
110
|
+
reason: "Selftest expressions are compile-time string literals: NUL-terminated, NUL-free, valid UTF-8 by construction."
|
|
111
|
+
- path: ext/makiri/fuzz/xpath_fuzz.c
|
|
112
|
+
rule: verified_text_forge
|
|
113
|
+
max: 1
|
|
114
|
+
reason: "The libFuzzer harness mints over an owned mkr_strndup copy, which supplies NUL-termination and no-interior-NUL; UTF-8 validity is deliberately left to the lexer's strict decoder (its rejection path is fuzz-target behavior)."
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Malloc-leak gate for the C extension (macOS only; run via `rake leaks`).
|
|
4
|
+
#
|
|
5
|
+
# ASan runs everywhere with detect_leaks=0 (Ruby and Lexbor are uninstrumented,
|
|
6
|
+
# so LeakSanitizer drowns in their noise) - which means plain leaks were never
|
|
7
|
+
# machine-checked. This gate fills that hole with macOS's `leaks` tool: it runs
|
|
8
|
+
# script/leaks_harness.rb (the public surface in a loop, INCLUDING rescued
|
|
9
|
+
# failure paths) under MallocStackLogging, then scans the leak report for
|
|
10
|
+
# allocation stacks that (a) pass through makiri.bundle and (b) repeat in
|
|
11
|
+
# proportion to the loop count - i.e. a leak per call, which is what a missing
|
|
12
|
+
# free on some path looks like. One-time Init allocations and Ruby's own
|
|
13
|
+
# at-exit/unscannable noise stay at 1-2 instances and are ignored.
|
|
14
|
+
#
|
|
15
|
+
# Found (and since fixed) by this harness: the transient fragment document
|
|
16
|
+
# leaked by every inner_html=/outer_html=, and the partially-built step leaked
|
|
17
|
+
# by every failing XPath parse.
|
|
18
|
+
#
|
|
19
|
+
# ruby script/check_leaks.rb # threshold = max(ITERATIONS / 4, 10)
|
|
20
|
+
# LEAKS_ITERATIONS=200 rake leaks # more iterations, sharper signal
|
|
21
|
+
|
|
22
|
+
require "rbconfig"
|
|
23
|
+
require "tempfile"
|
|
24
|
+
|
|
25
|
+
abort "check_leaks: the `leaks` tool is macOS-only" unless RUBY_PLATFORM.include?("darwin")
|
|
26
|
+
|
|
27
|
+
iterations = Integer(ENV.fetch("LEAKS_ITERATIONS", "120"))
|
|
28
|
+
threshold = [iterations / 4, 10].max
|
|
29
|
+
harness = File.expand_path("leaks_harness.rb", __dir__)
|
|
30
|
+
lib = File.expand_path("../lib", __dir__)
|
|
31
|
+
|
|
32
|
+
out = Tempfile.create("makiri-leaks")
|
|
33
|
+
ok = system({ "MallocStackLogging" => "1", "LEAKS_ITERATIONS" => iterations.to_s },
|
|
34
|
+
"leaks", "--atExit", "--",
|
|
35
|
+
RbConfig.ruby, "-I#{lib}", harness,
|
|
36
|
+
out: out.path, err: out.path)
|
|
37
|
+
report = File.read(out.path)
|
|
38
|
+
# `leaks` exits non-zero whenever ANY leak exists (Ruby itself always reports
|
|
39
|
+
# some at-exit noise), so the exit status is not the verdict - the scan below
|
|
40
|
+
# is. But the harness itself must have completed.
|
|
41
|
+
# String#[] returns nil when the range start is out of bounds, so fall back to the whole report when it is shorter than 2000 characters.
abort "check_leaks: harness did not complete:\n#{report[-2000..] || report}" unless report.include?("leaks harness done")
|
|
42
|
+
abort "check_leaks: no leak report produced (leaks tool failed?)" unless report.include?("STACK OF") || ok
|
|
43
|
+
|
|
44
|
+
offenders = report.split(/\n(?=STACK OF )/).filter_map do |stanza|
|
|
45
|
+
next unless stanza.include?("makiri.bundle")
|
|
46
|
+
|
|
47
|
+
instances = stanza[/STACK OF (\d+) INSTANCES?/, 1].to_i
|
|
48
|
+
next if instances < threshold
|
|
49
|
+
|
|
50
|
+
frames = stanza.lines.grep(/makiri\.bundle/).first(4).map(&:strip)
|
|
51
|
+
[instances, frames]
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
if offenders.empty?
|
|
55
|
+
puts "check_leaks: OK - no repeated (>= #{threshold}x) leak stacks through makiri.bundle " \
|
|
56
|
+
"(#{iterations} iterations)"
|
|
57
|
+
else
|
|
58
|
+
puts "check_leaks: FAILED - #{offenders.size} repeated leak stack(s) through makiri.bundle:"
|
|
59
|
+
offenders.each do |instances, frames|
|
|
60
|
+
puts " #{instances}x:"
|
|
61
|
+
frames.each { |f| puts " #{f}" }
|
|
62
|
+
end
|
|
63
|
+
exit 1
|
|
64
|
+
end
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Exercise script for the malloc-leak gate (script/check_leaks.rb runs this
|
|
4
|
+
# under macOS `leaks --atExit`). It loops the public surface - parsing, queries,
|
|
5
|
+
# serialization, mutation, fragments, the Builder, XPathContext - and, crucially,
|
|
6
|
+
# RESCUED FAILURE paths (raises that skip cleanup are where leaks hide; both
|
|
7
|
+
# leaks this gate was built on lived there or in a transient-document path).
|
|
8
|
+
#
|
|
9
|
+
# A per-call leak shows up as a leak stack with ~ITERATIONS instances, which the
|
|
10
|
+
# driver flags; one-time/Init allocations stay at 1-2 instances and pass.
|
|
11
|
+
|
|
12
|
+
require "makiri"
|
|
13
|
+
|
|
14
|
+
ITERATIONS = Integer(ENV.fetch("LEAKS_ITERATIONS", "120"))
|
|
15
|
+
|
|
16
|
+
HTML = "<div id=m class='a b'><ul><li class=item>x</li><li>y<svg><path/></svg></li></ul><p>t&</p></div>"
|
|
17
|
+
XML = %(<r xmlns:p="urn:p" xmlns="urn:d"><a id="1">t</a><p:b/><!--c--><![CDATA[z]]></r>)
|
|
18
|
+
|
|
19
|
+
handler = Class.new { def my_fn(nodes) = nodes.length.to_s }.new
|
|
20
|
+
|
|
21
|
+
ITERATIONS.times do |i|
|
|
22
|
+
# --- HTML: parse / query / serialize / mutate / fragments ---
|
|
23
|
+
d = Makiri::HTML(HTML)
|
|
24
|
+
d.css("li.item"); d.at_css("#m"); d.at_css("li").matches?("li")
|
|
25
|
+
begin d.css("li[") rescue Makiri::CSS::SyntaxError; end # selector syntax error (engine reset path)
|
|
26
|
+
d.xpath("//li"); d.at_xpath("//p"); d.xpath("count(//li)")
|
|
27
|
+
begin d.xpath("//li[") rescue Makiri::XPath::SyntaxError; end # parse failure (partial-AST/step cleanup)
|
|
28
|
+
begin d.xpath("//li", handler) rescue nil; end
|
|
29
|
+
d.xpath("//*[local-name()='path']")
|
|
30
|
+
d.to_html; d.at_css("ul").inner_html; d.to_html(pretty: true); d.text
|
|
31
|
+
e = d.at_css("li"); e["k#{i}"] = "v"; e.name = "li2"; e.content = "c"
|
|
32
|
+
e.set_attribute_ns("urn:x", "x:y", "1"); e.remove_attribute_ns("urn:x", "y")
|
|
33
|
+
d.at_css("ul").inner_html = "<li>new</li>" # transient fragment document path
|
|
34
|
+
frag = d.fragment("<b>f</b>"); d.at_css("p") << frag
|
|
35
|
+
Makiri::DocumentFragment.parse("<tr><td>1</td></tr>", context: "tbody")
|
|
36
|
+
d.at_css("p").clone_node(true); d.dup
|
|
37
|
+
begin d.fragment("x", context: "no-such-tag") rescue ArgumentError; end
|
|
38
|
+
|
|
39
|
+
# --- XML: parse (ok + failures) / query / serialize / mutate / Builder ---
|
|
40
|
+
x = Makiri::XML(XML)
|
|
41
|
+
begin Makiri::XML("<r>\xC3</r>".b) rescue Makiri::XML::SyntaxError; end
|
|
42
|
+
begin Makiri::XML("<r><a></r>") rescue Makiri::XML::SyntaxError; end
|
|
43
|
+
begin Makiri::XML("<r/>", max_bytes: 1) rescue Makiri::XML::LimitExceeded; end
|
|
44
|
+
begin Makiri::XML(%(<?xml version="1.1"?><r/>)) rescue Makiri::XML::SyntaxError; end
|
|
45
|
+
x.xpath("//d:a", "d" => "urn:d"); x.at_xpath("//p:b", "p" => "urn:p")
|
|
46
|
+
begin x.xpath("//unbound:a") rescue Makiri::Error; end
|
|
47
|
+
x.css("a"); x.at_css("p|b", "p" => "urn:p")
|
|
48
|
+
begin x.css("a[") rescue Makiri::CSS::SyntaxError; end
|
|
49
|
+
x.to_xml; x.to_xml(pretty: true); x.root.canonicalize
|
|
50
|
+
el = x.create_element("n", "t", "k" => "v"); x.root.add_child(el)
|
|
51
|
+
begin x.root.add_child(x.create_element("zz:q")) rescue Makiri::Error; end
|
|
52
|
+
x.root.children.first.replace(x.create_element("rep")); x.fragment("<f1/><f2/>")
|
|
53
|
+
Makiri::XML::Builder.new { |b| b.root("xmlns:d" => "urn:d") { b.item("a"); b["d"].q } }.to_xml
|
|
54
|
+
|
|
55
|
+
# --- XPathContext: AST cache / registrations / failing evaluate ---
|
|
56
|
+
ctx = Makiri::XPathContext.new(x)
|
|
57
|
+
ctx.register_namespace("d", "urn:d"); ctx.register_variable("v", "1")
|
|
58
|
+
ctx.evaluate("//d:a[@id=$v]"); ctx.evaluate("//d:a[@id=$v]")
|
|
59
|
+
begin ctx.evaluate("//(") rescue Makiri::XPath::SyntaxError; end
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
GC.start
|
|
63
|
+
GC.start
|
|
64
|
+
puts "leaks harness done (#{ITERATIONS} iterations)"
|
|
@@ -203,6 +203,12 @@ ENDFOREACH()
|
|
|
203
203
|
## First, need to add target for shared and static library
|
|
204
204
|
IF(LEXBOR_BUILD_SHARED)
|
|
205
205
|
add_library(${LEXBOR_LIB_NAME} SHARED ${LEXBOR_SOURCES})
|
|
206
|
+
if(UNIX)
|
|
207
|
+
set_target_properties(${LEXBOR_LIB_NAME} PROPERTIES SOVERSION ${PROJECT_VERSION_MAJOR} VERSION ${PROJECT_VERSION})
|
|
208
|
+
endif()
|
|
209
|
+
if ((WIN32) AND (CMAKE_VERSION VERSION_GREATER_EQUAL "3.27"))
|
|
210
|
+
set_target_properties(${LEXBOR_LIB_NAME} PROPERTIES DLL_NAME_WITH_SOVERSION 1)
|
|
211
|
+
endif()
|
|
206
212
|
target_include_directories(${LEXBOR_LIB_NAME} PUBLIC $<BUILD_INTERFACE:${LEXBOR_SOURCE}>
|
|
207
213
|
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
|
|
208
214
|
target_compile_definitions(${LEXBOR_LIB_NAME} PRIVATE "LEXBOR_BUILDING")
|
data/vendor/lexbor/README.md
CHANGED
|
@@ -39,6 +39,7 @@ https://lexbor.com/modules/.
|
|
|
39
39
|
- **[SerpApi](https://serpapi.com/)** — uses Lexbor in production for HTML parsing at scale
|
|
40
40
|
- **[Selectolax](https://github.com/rushter/selectolax)** — popular Python library for fast web scraping
|
|
41
41
|
- **[Nokolexbor](https://github.com/serpapi/nokolexbor)** — high-performance Nokogiri alternative for Ruby
|
|
42
|
+
- **[Nordstjernen](https://github.com/nordstjernen-web/nordstjernen)** — web browser written entirely in C
|
|
42
43
|
|
|
43
44
|
[More bindings](#external-bindings-and-wrappers) available for Elixir, Crystal, D, Julia, Erlang.
|
|
44
45
|
|
|
@@ -215,6 +216,7 @@ The `liblexbor-html` library already contains all the pointers to the required d
|
|
|
215
216
|
|
|
216
217
|
## External Bindings and Wrappers
|
|
217
218
|
|
|
219
|
+
* [Elixir](https://github.com/dashbitco/lazy_html) Fast HTML parsing and querying. Default HTML engine in Phoenix LiveViewTest (since LiveView 1.1).
|
|
218
220
|
* [Elixir](https://git.pleroma.social/pleroma/elixir-libraries/fast_html) binding for the HTML module (since 2.0 version)
|
|
219
221
|
* [Erlang](https://hex.pm/packages/lexbor_erl) Fast HTML5 Parser with CSS selectors and DOM manipulation (since 2.6.0 version)
|
|
220
222
|
* [Crystal](https://github.com/kostya/lexbor) Fast HTML5 Parser with CSS selectors for Crystal language
|
|
@@ -227,6 +229,16 @@ The `liblexbor-html` library already contains all the pointers to the required d
|
|
|
227
229
|
|
|
228
230
|
You can create a binding or wrapper for the `lexbor` and place the link here!
|
|
229
231
|
|
|
232
|
+
## AI Policy
|
|
233
|
+
|
|
234
|
+
Lexbor draws a clear line between its core library and the surrounding ecosystem.
|
|
235
|
+
|
|
236
|
+
**Core library** — all source code that compiles into the distributable binary — is written entirely by humans. This is an engineering choice, not an ideological one. Lexbor is a performance-critical, standards-compliant C library where every line of code must reflect a decision its author fully understands and can defend. We use AI tools in other areas of the project and see their value clearly; the core is simply not the right place for them.
|
|
237
|
+
|
|
238
|
+
AI-assisted tools are welcome and actively used for bindings, WASM builds, benchmarks, documentation, examples, tests, and other supporting work. This section is one such example — it was drafted with AI and reviewed by a human. The em dashes are a giveaway.
|
|
239
|
+
|
|
240
|
+
**For contributors:** pull requests targeting core library code are expected to be human-authored. The contributor — not an AI model — performed the reasoning, made the design choices, and wrote the code. The code you submit is yours, and you can speak to the intent and correctness of every line. We are not interested in policing workflows, but if a pull request reads like AI did the thinking, it will likely be rejected.
|
|
241
|
+
|
|
230
242
|
## Documentation
|
|
231
243
|
|
|
232
244
|
Available on [lexbor.com](https://lexbor.com) in [Documentation](https://lexbor.com/documentation/) section.
|
data/vendor/lexbor/config.cmake
CHANGED
|
@@ -167,7 +167,7 @@ MACRO(INCLUDE_MODULE_CONFIG pname module module_dir)
|
|
|
167
167
|
|
|
168
168
|
IF(EXISTS "${conf_path}")
|
|
169
169
|
set(CURRENT_LIB_NAME "${PROJECT_NAME}-${module}")
|
|
170
|
-
set(CURRENT_LIB_NAME_STATIC "${PROJECT_NAME}-${module}
|
|
170
|
+
set(CURRENT_LIB_NAME_STATIC "${PROJECT_NAME}-${module}_static")
|
|
171
171
|
|
|
172
172
|
include("${conf_path}")
|
|
173
173
|
ENDIF()
|