makiri 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/valgrind.yml +49 -46
- data/CHANGELOG.md +68 -1
- data/README.md +14 -0
- data/Rakefile +13 -0
- data/ext/makiri/bridge/ruby_string.c +80 -54
- data/ext/makiri/core/mkr_alloc.h +1 -1
- data/ext/makiri/core/mkr_utf8.c +1 -1
- data/ext/makiri/core/mkr_utf8.h +1 -1
- data/ext/makiri/{lexbor_compat → dom_adapter}/compat.h +4 -4
- data/ext/makiri/{lexbor_compat → dom_adapter}/compat_internal.h +1 -1
- data/ext/makiri/dom_adapter/cross_import.c +434 -0
- data/ext/makiri/dom_adapter/cross_import.h +35 -0
- data/ext/makiri/{lexbor_compat → dom_adapter}/text_index.c +1 -1
- data/ext/makiri/fuzz/Makefile +1 -1
- data/ext/makiri/glue/cross_import.h +30 -0
- data/ext/makiri/glue/glue.h +1 -1
- data/ext/makiri/glue/ruby_doc.c +11 -3
- data/ext/makiri/glue/ruby_html_mutate.c +6 -0
- data/ext/makiri/glue/ruby_html_node.c +1 -1
- data/ext/makiri/glue/ruby_lexbor_css.c +462 -0
- data/ext/makiri/glue/ruby_node.c +14 -0
- data/ext/makiri/glue/ruby_xml.c +31 -2
- data/ext/makiri/glue/ruby_xml_node.c +87 -2
- data/ext/makiri/glue/ruby_xpath.c +16 -1
- data/ext/makiri/makiri.c +3 -0
- data/ext/makiri/makiri.h +5 -0
- data/ext/makiri/xml/mkr_xml.h +5 -0
- data/ext/makiri/xml/mkr_xml_chars.c +22 -0
- data/ext/makiri/xml/mkr_xml_mutate.c +160 -50
- data/ext/makiri/xml/mkr_xml_mutate.h +24 -0
- data/ext/makiri/xml/mkr_xml_tree.c +63 -27
- data/ext/makiri/xpath/mkr_xpath.c +28 -0
- data/ext/makiri/xpath/mkr_xpath.h +5 -1
- data/ext/makiri/xpath/mkr_xpath_eval_body.h +11 -1
- data/lib/makiri/html/document.rb +11 -12
- data/lib/makiri/html/node_methods.rb +0 -1
- data/lib/makiri/node_set.rb +14 -9
- data/lib/makiri/processing_instruction.rb +8 -0
- data/lib/makiri/version.rb +1 -1
- data/lib/makiri/xml/builder.rb +29 -21
- data/lib/makiri/xpath_context.rb +12 -4
- data/script/check_c_safety.rb +1 -1
- data/script/check_c_safety_allowlist.yml +8 -5
- data/script/leaks_harness.rb +7 -0
- data/suppressions/ruby.supp +140 -0
- metadata +13 -8
- /data/ext/makiri/{lexbor_compat → dom_adapter}/dom_index.c +0 -0
- /data/ext/makiri/{lexbor_compat → dom_adapter}/post_parse.c +0 -0
- /data/ext/makiri/{lexbor_compat → dom_adapter}/source_loc.c +0 -0
- /data/ext/makiri/{lexbor_compat → dom_adapter}/utf8_input.c +0 -0
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 27ac120b94ab835caee9bbb50a1cee71b19e339dde2384496db9608e58b3269b
|
|
4
|
+
data.tar.gz: 27b8ea683abe8854e6c68269413d4858e0f2fedfdd04f04d8fa91130b9b05ac1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 84754fb994af236692bdbc281cb0cba89a8cd6d7c75e2caa4e16ebe9b1efa6c4cbd270409be2957e461db4909bbabc32296ed44e185ccaa8985a0c285f25846c
|
|
7
|
+
data.tar.gz: c3fba2792720ad30d1bee90343e4ae7877bd9871195620ce667281fffeb36e994b3e715a5f456327aaa4e8de1e80e3c24c7e6a898739c660f4d1c8d5ffa51c60
|
|
@@ -1,9 +1,18 @@
|
|
|
1
1
|
name: Valgrind + GC.compact
|
|
2
2
|
|
|
3
3
|
on:
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
#
|
|
4
|
+
# Valgrind memcheck ALSO runs on push to main: it is the only check without a
|
|
5
|
+
# frequency threshold (any "definitely lost" / uninitialised-value use fails,
|
|
6
|
+
# unlike the PR-level macOS leak gate, which only flags stacks repeated >=30x),
|
|
7
|
+
# so a leak on a rarely-hit error path slips past the PR gates and would
|
|
8
|
+
# otherwise surface only on the next nightly. Running it post-merge catches such
|
|
9
|
+
# regressions within ~30 min without adding ~20 min to every PR. (It is gated to
|
|
10
|
+
# main/master pushes only, not pull_request, to keep PR latency low.)
|
|
11
|
+
#
|
|
12
|
+
# The GC.stress job stays nightly-only (see its `if:` below): it is heavy and
|
|
13
|
+
# checks structural properties that do not vary with day-to-day churn.
|
|
14
|
+
push:
|
|
15
|
+
branches: [main, master]
|
|
7
16
|
schedule:
|
|
8
17
|
- cron: "0 2 * * *"
|
|
9
18
|
workflow_dispatch:
|
|
@@ -61,32 +70,44 @@ jobs:
|
|
|
61
70
|
- name: Run spec suite under Valgrind (ruby_memcheck)
|
|
62
71
|
run: bundle exec rake spec:valgrind
|
|
63
72
|
|
|
64
|
-
# GC.auto_compact + GC.stress
|
|
65
|
-
# tests the borrowed-pointer discipline under the condition that
|
|
66
|
-
# actually move (compaction) and that every allocation triggers a
|
|
67
|
-
# cycle (stress). Failures here are typically use-after-move or stale
|
|
73
|
+
# GC.auto_compact + GC.stress over the GC-sensitive examples. This
|
|
74
|
+
# structurally tests the borrowed-pointer discipline under the condition that
|
|
75
|
+
# Ruby Strings actually move (compaction) and that every allocation triggers a
|
|
76
|
+
# full GC cycle (stress). Failures here are typically use-after-move or stale
|
|
68
77
|
# pointer bugs in the C extension or bridge layer.
|
|
69
78
|
#
|
|
70
|
-
#
|
|
71
|
-
#
|
|
72
|
-
#
|
|
73
|
-
#
|
|
74
|
-
#
|
|
75
|
-
#
|
|
76
|
-
#
|
|
77
|
-
#
|
|
79
|
+
# Scope: only the examples tagged `:gc_compact` (the `memory safety` blocks in
|
|
80
|
+
# css/xpath/serialize/mutation/source_location/xpath_handler/api_compat2 +
|
|
81
|
+
# attribute's lazy-index example). Those are the examples written to exercise
|
|
82
|
+
# the borrowed-pointer paths. `GC_COMPACT_STRESS=1` makes spec_helper set
|
|
83
|
+
# `GC.auto_compact = true` process-wide and wrap every example in `GC.stress`,
|
|
84
|
+
# so each allocation inside a tagged example triggers a *compacting* GC - the
|
|
85
|
+
# strongest form of the use-after-move test. The high-volume churn loops
|
|
86
|
+
# (parse/drop cycles) scale their iteration count down under stress
|
|
87
|
+
# (`gc_churn_iters` / `GC_COMPACT_ITERS`) because each stressed iteration is
|
|
88
|
+
# orders of magnitude heavier; `GC_COMPACT_ITERS` below tunes the total runtime
|
|
89
|
+
# (~6-9 min on CI at 200). An earlier version forced GC.stress onto the
|
|
90
|
+
# *entire* suite (~800 examples): it ran 1h40m+ and never finished, while
|
|
91
|
+
# testing borrowed-pointer discipline on hundreds of examples that have none.
|
|
92
|
+
# The rest of the suite still runs in ci.yml.
|
|
93
|
+
#
|
|
94
|
+
# THREADING is deliberately OFF here. The :threading suite is 8 threads x tens
|
|
95
|
+
# of iterations; it runs in ci.yml and its GC-sensitive examples opt into
|
|
96
|
+
# GC.stress themselves, so cross-thread interactions are covered there.
|
|
78
97
|
gc-compact-stress:
|
|
79
|
-
#
|
|
80
|
-
|
|
98
|
+
# Nightly / on-demand only - not on push (the valgrind job is the post-merge
|
|
99
|
+
# gate; GC.stress is heavy and structural, so it does not need per-push runs).
|
|
100
|
+
if: github.event_name != 'push'
|
|
81
101
|
name: GC.auto_compact + GC.stress (Ruby ${{ matrix.ruby }})
|
|
82
102
|
runs-on: ubuntu-latest
|
|
83
|
-
timeout-minutes:
|
|
103
|
+
timeout-minutes: 30
|
|
84
104
|
env:
|
|
85
|
-
|
|
86
|
-
#
|
|
87
|
-
#
|
|
88
|
-
|
|
89
|
-
|
|
105
|
+
GC_COMPACT_STRESS: "1"
|
|
106
|
+
# Per-iteration cost under per-allocation compacting GC is ~1000x normal, so
|
|
107
|
+
# the churn loops run this many iterations (vs their normal 200-1000). Tunes
|
|
108
|
+
# the job's runtime; raise for more coverage, lower if it approaches the
|
|
109
|
+
# timeout.
|
|
110
|
+
GC_COMPACT_ITERS: "200"
|
|
90
111
|
strategy:
|
|
91
112
|
fail-fast: false
|
|
92
113
|
matrix:
|
|
@@ -110,26 +131,8 @@ jobs:
|
|
|
110
131
|
- name: Compile the extension
|
|
111
132
|
run: bundle exec rake compile
|
|
112
133
|
|
|
113
|
-
#
|
|
114
|
-
#
|
|
115
|
-
#
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
# job), while loading/collection runs at normal speed.
|
|
119
|
-
- name: Run spec suite under GC.auto_compact + GC.stress
|
|
120
|
-
run: |
|
|
121
|
-
bundle exec ruby -Ilib -e '
|
|
122
|
-
GC.auto_compact = true
|
|
123
|
-
require "rspec/core"
|
|
124
|
-
RSpec.configure do |c|
|
|
125
|
-
c.around(:each) do |example|
|
|
126
|
-
GC.stress = true
|
|
127
|
-
begin
|
|
128
|
-
example.run
|
|
129
|
-
ensure
|
|
130
|
-
GC.stress = false
|
|
131
|
-
end
|
|
132
|
-
end
|
|
133
|
-
end
|
|
134
|
-
exit RSpec::Core::Runner.run(ARGV)
|
|
135
|
-
' spec
|
|
134
|
+
# GC_COMPACT_STRESS=1 (set in env above) makes spec_helper enable
|
|
135
|
+
# auto_compact globally and wrap each example in GC.stress; --tag gc_compact
|
|
136
|
+
# limits the run to the borrowed-pointer examples.
|
|
137
|
+
- name: Run GC-sensitive examples under GC.auto_compact + GC.stress
|
|
138
|
+
run: bundle exec rspec --tag gc_compact spec
|
data/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,72 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.5.0] - 2026-06-14
|
|
9
|
+
|
|
10
|
+
### Fixed
|
|
11
|
+
|
|
12
|
+
* Use-after-free when an XPath custom-function handler mutated the same
|
|
13
|
+
`XPathContext` (`register_*` / `node=`) mid-`evaluate`: such re-entrant context
|
|
14
|
+
mutation is now refused instead of invalidating the running evaluation's state.
|
|
15
|
+
|
|
16
|
+
* `Node#name=` now invalidates the element-name index, so a later `//tag` query
|
|
17
|
+
reflects the rename instead of seeing a stale bucket.
|
|
18
|
+
|
|
19
|
+
* XML processing-instruction targets now follow XML 1.0 §2.6: a PITarget is a
|
|
20
|
+
`Name`, not an NCName, so a colon is permitted (`<?a:b ...?>` parses, and
|
|
21
|
+
`create_processing_instruction("a:b", ...)` succeeds). Only the reserved `xml`
|
|
22
|
+
(any case) is still rejected. Previously a colon in a PI target was rejected as
|
|
23
|
+
not-well-formed, which was stricter than the spec (a PI target is not subject to
|
|
24
|
+
namespace processing).
|
|
25
|
+
|
|
26
|
+
* Memory leaks of the internal XPath evaluation context on error / edge paths: a
|
|
27
|
+
`Makiri::XML` `#css` / `#xpath` / `#at_xpath` whose selector or expression failed
|
|
28
|
+
the text-input contract leaked the context (it is now verified BEFORE the context
|
|
29
|
+
is allocated), and a context could leak if building the Ruby result raised (it is
|
|
30
|
+
now freed before conversion).
|
|
31
|
+
|
|
32
|
+
### Added
|
|
33
|
+
|
|
34
|
+
* `ProcessingInstruction#target` on the XML node (the PI's target name).
|
|
35
|
+
|
|
36
|
+
* Cross-kind `Document#import_node(node, deep = false)`. `import_node` now
|
|
37
|
+
translates a subtree across representations: `Makiri::XML::Document#import_node`
|
|
38
|
+
(newly added) imports an HTML (Lexbor) node by translating it to the XML node
|
|
39
|
+
representation, and `Makiri::HTML::Document#import_node` likewise translates an
|
|
40
|
+
XML node to HTML. Same-representation imports keep working (HTML to HTML via
|
|
41
|
+
Lexbor, XML to XML via the arena deep/shallow copy). The result is a detached
|
|
42
|
+
copy owned by the target document; the source is untouched. Elements (with
|
|
43
|
+
attributes), text, comment, and processing-instruction nodes translate both
|
|
44
|
+
ways, and an HTML `<template>`'s contents (which HTML keeps in a separate
|
|
45
|
+
fragment) are carried across rather than silently dropped; an XML CDATA section
|
|
46
|
+
has no HTML counterpart, so translating one into an HTML document fails closed
|
|
47
|
+
(`Makiri::Error`). Namespaces are preserved across the translation: HTML->XML
|
|
48
|
+
synthesizes the xmlns declarations needed to reproduce each node's namespace
|
|
49
|
+
(so e.g. an inline `<svg>` stays in the SVG namespace and HTML elements in the
|
|
50
|
+
XHTML namespace), and XML->HTML maps the namespace URI back to a Lexbor
|
|
51
|
+
namespace id, interning any URI (not only the ones Lexbor knows by default) so
|
|
52
|
+
custom namespaces survive too. An HTML-namespaced `<template>`'s content is
|
|
53
|
+
placed in its content fragment (HTMLTemplateElement.content), like a parsed
|
|
54
|
+
template. The other node-argument mutators
|
|
55
|
+
(`add_child`/`before`/`after`/`replace`/`fragment`) still reject a foreign-kind
|
|
56
|
+
node; `import_node` is the one sanctioned crossing point.
|
|
57
|
+
|
|
58
|
+
* `set_attribute_ns(namespace, qualified_name, value)` and
|
|
59
|
+
`remove_attribute_ns(namespace, local_name)` on `Makiri::XML` elements - the DOM
|
|
60
|
+
setAttributeNS / removeAttributeNS, keyed on the (explicit namespace, local name)
|
|
61
|
+
pair so two attributes with the same qualified name in different namespaces
|
|
62
|
+
coexist (a null/"" namespace is the null namespace).
|
|
63
|
+
|
|
64
|
+
* `Makiri::Lexbor::CSS.parse_stylesheet(text)`, a thin binding over Lexbor's
|
|
65
|
+
CSS stylesheet parser that returns the parsed rules as plain Ruby primitives
|
|
66
|
+
(`{type: :style, selectors: [{text:, specificity: [a,b,c]}, ...],
|
|
67
|
+
declarations: [{name:, value:, important:}, ...]}` and nested
|
|
68
|
+
`{type: :media, condition:, rules: [...]}`, in source order). Selector
|
|
69
|
+
specificity and value normalization come from Lexbor; `css-syntax-3` error
|
|
70
|
+
recovery means a broken stylesheet yields its valid rules instead of raising.
|
|
71
|
+
Hosts the new `Makiri::Lexbor::*` namespace (the unabstracted lexbor-native
|
|
72
|
+
surface, distinct from the Nokogiri-compatible `Makiri::*`).
|
|
73
|
+
|
|
8
74
|
## [0.4.0] - 2026-06-12
|
|
9
75
|
|
|
10
76
|
### Added
|
|
@@ -296,7 +362,8 @@ libxml2 / libxslt dependency at any layer**.
|
|
|
296
362
|
domxpath, CSS differential vs `Nokogiri::HTML5`). GitHub Actions CI across
|
|
297
363
|
Ruby 3.2–4.0 × Ubuntu/macOS plus a sanitizer job.
|
|
298
364
|
|
|
299
|
-
[Unreleased]: https://github.com/takahashim/makiri/compare/v0.
|
|
365
|
+
[Unreleased]: https://github.com/takahashim/makiri/compare/v0.5.0...HEAD
|
|
366
|
+
[0.5.0]: https://github.com/takahashim/makiri/compare/v0.4.0...v0.5.0
|
|
300
367
|
[0.4.0]: https://github.com/takahashim/makiri/compare/v0.3.0...v0.4.0
|
|
301
368
|
[0.3.0]: https://github.com/takahashim/makiri/compare/v0.2.0...v0.3.0
|
|
302
369
|
[0.2.0]: https://github.com/takahashim/makiri/compare/v0.1.0...v0.2.0
|
data/README.md
CHANGED
|
@@ -141,6 +141,14 @@ XML subtrees can be built with `Document#create_element` and related node factor
|
|
|
141
141
|
then inserted with `#add_child`, `#before`, `#after`, or `#replace`;
|
|
142
142
|
namespaces are resolved at insertion time, and cross-document nodes are deep-copied.
|
|
143
143
|
|
|
144
|
+
`Document#import_node(node, deep = false)` brings a node into a document as a
|
|
145
|
+
detached copy, and works **across representations**: importing a `Makiri::HTML`
|
|
146
|
+
node into a `Makiri::XML::Document` (or vice versa) translates the subtree between
|
|
147
|
+
the two node representations, preserving namespaces (e.g. an inline `<svg>` keeps
|
|
148
|
+
the SVG namespace, HTML elements the XHTML namespace; custom namespaces are
|
|
149
|
+
preserved across both directions). An XML CDATA section has no HTML counterpart,
|
|
150
|
+
so importing one into an HTML document raises.
|
|
151
|
+
|
|
144
152
|
```ruby
|
|
145
153
|
doc = Makiri::XML(%(<feed xmlns="urn:a" xmlns:dc="urn:dc"/>))
|
|
146
154
|
entry = doc.create_element("entry")
|
|
@@ -226,6 +234,12 @@ Detailed, test-backed notes live in `spec/conformance/README.md`.
|
|
|
226
234
|
markup string straight to `#add_child` is unsupported (parse it into a fragment
|
|
227
235
|
first). (`#to_xml` serialization is supported; HTML serialization - `to_html`
|
|
228
236
|
/ `inner_html` / `outer_html` - is not.)
|
|
237
|
+
* A colon in a processing-instruction target is well-formed (`<?a:b ...?>` parses).
|
|
238
|
+
* XML 1.0 §2.6: a `PITarget` is a `Name`, not an NCName, and Namespaces in XML
|
|
239
|
+
1.0's normative conformance section constrains only element/attribute names
|
|
240
|
+
(QNames), never PI targets. Nokogiri/libxml2 rejects it (`colons are forbidden
|
|
241
|
+
from PI names`); Makiri follows the normative text. Only the reserved `xml`
|
|
242
|
+
(any case) target is rejected.
|
|
229
243
|
* Otherwise the parsed tree is byte-identical to `Nokogiri::XML`'s (verified by
|
|
230
244
|
the property-based differential), including namespaces, prolog/epilog comments
|
|
231
245
|
and PIs, and adjacent-CDATA coalescing.
|
data/Rakefile
CHANGED
|
@@ -59,6 +59,19 @@ task default: %i[compile spec]
|
|
|
59
59
|
# *our* code: a real uninit/invalid access in mkr_*/Lexbor still has a makiri frame
|
|
60
60
|
# and is still reported.
|
|
61
61
|
#
|
|
62
|
+
# BUT the binary-touch filter is too coarse for one residual class: when a GC
|
|
63
|
+
# cycle fires *inside* one of our allocations (or marks through our mark
|
|
64
|
+
# callback), CRuby's conservative collector legitimately reads uninitialised
|
|
65
|
+
# words (machine-stack scan reading stale frames, incremental mark/sweep reading
|
|
66
|
+
# not-yet-written RVALUE flags) while a makiri frame sits on the stack - so ~190
|
|
67
|
+
# of these pure-Ruby-GC false positives pass the filter. The gem's bundled
|
|
68
|
+
# ruby.supp only covers `each_location*` under Addr8, not the Cond/Value8 reads
|
|
69
|
+
# we hit. `suppressions/ruby.supp` (auto-loaded by ruby_memcheck: it globs
|
|
70
|
+
# `<dir>/<ruby-version>.supp`, and the bare `ruby.supp` matches every version)
|
|
71
|
+
# suppresses exactly those GC-driver-anchored uninit reads, plus the VM
|
|
72
|
+
# method-cache id_table the interpreter never frees before exit. A real uninit
|
|
73
|
+
# read in our code does not descend from a GC driver, so it still fails.
|
|
74
|
+
#
|
|
62
75
|
# Guarded: ruby_memcheck lives in the optional :valgrind bundler group, so a
|
|
63
76
|
# normal `bundle exec rake` (without that group) must not fail to load.
|
|
64
77
|
begin
|
|
@@ -45,33 +45,61 @@ mkr_ruby_str_from_borrowed(mkr_borrowed_text_t text)
|
|
|
45
45
|
return rb_utf8_str_new(text.ptr, (long)text.len);
|
|
46
46
|
}
|
|
47
47
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
48
|
+
/* The shared core of Makiri's strict text contract: no NUL byte, valid UTF-8.
|
|
49
|
+
* Returns the specific violation (or MKR_TEXT_OK); each caller maps the verdict
|
|
50
|
+
* to its own error surface (Makiri::Error, XML::SyntaxError, or a reason string).
|
|
51
|
+
*
|
|
52
|
+
* ALLOCATION-FREE BY DESIGN, which every caller relies on: it runs between a
|
|
53
|
+
* caller taking a borrowed RSTRING pointer and using it, so it must not be a GC
|
|
54
|
+
* point. (The former per-caller implementations each built a throwaway Ruby
|
|
55
|
+
* String (rb_enc_str_new) to read its coderange - a Ruby allocation inside every
|
|
56
|
+
* borrow, which both passed the borrowed ptr into an allocating call and opened a
|
|
57
|
+
* GC window under every OTHER borrow already held at multi-borrow call sites.)
|
|
58
|
+
*
|
|
59
|
+
* +coderange_str+ is the String consulted for its CACHED coderange (no scan, no
|
|
60
|
+
* alloc); +ptr+/+len+ are the bytes validated. They may differ: the XML path
|
|
61
|
+
* passes the whole decoded String for the coderange but a BOM-stripped suffix as
|
|
62
|
+
* the bytes (the BOM is one complete UTF-8 char, so a whole-string VALID
|
|
63
|
+
* coderange still proves the suffix valid). Bytes are validated as UTF-8
|
|
64
|
+
* regardless of the String's declared encoding. */
|
|
65
|
+
typedef enum {
|
|
66
|
+
MKR_TEXT_OK = 0,
|
|
67
|
+
MKR_TEXT_HAS_NUL,
|
|
68
|
+
MKR_TEXT_INVALID_UTF8,
|
|
69
|
+
} mkr_text_verdict_t;
|
|
61
70
|
|
|
62
|
-
|
|
71
|
+
static mkr_text_verdict_t
|
|
72
|
+
mkr_text_check(VALUE coderange_str, const char *ptr, size_t len)
|
|
73
|
+
{
|
|
74
|
+
mkr_span_t sv = mkr_span(ptr, len);
|
|
63
75
|
size_t nul_at;
|
|
64
76
|
if (mkr_span_find(&sv, '\0', &nul_at)) {
|
|
65
|
-
|
|
77
|
+
return MKR_TEXT_HAS_NUL;
|
|
66
78
|
}
|
|
67
|
-
|
|
68
79
|
/* Cached-coderange fast path (reads flags, never scans, never allocates);
|
|
69
|
-
* NUL is valid UTF-8, so the
|
|
70
|
-
if (mkr_ruby_str_known_valid_utf8(
|
|
71
|
-
return;
|
|
80
|
+
* NUL is valid UTF-8, so the find above stays either way. */
|
|
81
|
+
if (mkr_ruby_str_known_valid_utf8(coderange_str)) {
|
|
82
|
+
return MKR_TEXT_OK;
|
|
83
|
+
}
|
|
84
|
+
if (!mkr_utf8_valid((const unsigned char *)ptr, len)) {
|
|
85
|
+
return MKR_TEXT_INVALID_UTF8;
|
|
72
86
|
}
|
|
73
|
-
|
|
74
|
-
|
|
87
|
+
return MKR_TEXT_OK;
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
void
|
|
91
|
+
mkr_verify_text(VALUE str, const char *what)
|
|
92
|
+
{
|
|
93
|
+
const char *ptr = RSTRING_PTR(str);
|
|
94
|
+
size_t len = (size_t)RSTRING_LEN(str);
|
|
95
|
+
|
|
96
|
+
switch (mkr_text_check(str, ptr, len)) {
|
|
97
|
+
case MKR_TEXT_HAS_NUL:
|
|
98
|
+
rb_raise(mkr_eError, "%s must not contain a NUL byte", what);
|
|
99
|
+
case MKR_TEXT_INVALID_UTF8:
|
|
100
|
+
rb_raise(mkr_eError, "%s must be valid UTF-8", what);
|
|
101
|
+
case MKR_TEXT_OK:
|
|
102
|
+
break;
|
|
75
103
|
}
|
|
76
104
|
}
|
|
77
105
|
|
|
@@ -180,6 +208,12 @@ mkr_xml_bom_encoding(const unsigned char *p, long len, long *bom_len, long *stri
|
|
|
180
208
|
return NULL;
|
|
181
209
|
}
|
|
182
210
|
|
|
211
|
+
static int
|
|
212
|
+
mkr_decl_ws(int c)
|
|
213
|
+
{
|
|
214
|
+
return c == ' ' || c == '\t' || c == '\r' || c == '\n';
|
|
215
|
+
}
|
|
216
|
+
|
|
183
217
|
/* The encoding named in the '<?xml ... encoding="NAME" ?>' declaration, or NULL.
|
|
184
218
|
* The declaration is ASCII; for a UTF-16/32-detected document its bytes are
|
|
185
219
|
* stride-interleaved, so the ASCII column is extracted (stride/off resolved by
|
|
@@ -190,12 +224,6 @@ mkr_xml_bom_encoding(const unsigned char *p, long len, long *bom_len, long *stri
|
|
|
190
224
|
* of p is done: the stride/off geometry is passed in (rather than derived here
|
|
191
225
|
* via rb_enc_find, which can autoload = a GC point), and the only rb_enc_find -
|
|
192
226
|
* the final name lookup - runs after the bytes have been copied into head[]. */
|
|
193
|
-
static int
|
|
194
|
-
mkr_decl_ws(int c)
|
|
195
|
-
{
|
|
196
|
-
return c == ' ' || c == '\t' || c == '\r' || c == '\n';
|
|
197
|
-
}
|
|
198
|
-
|
|
199
227
|
static rb_encoding *
|
|
200
228
|
mkr_xml_decl_encoding(const unsigned char *p, long len, long stride, long off)
|
|
201
229
|
{
|
|
@@ -336,19 +364,19 @@ mkr_xml_decode_input(VALUE str, size_t max_bytes)
|
|
|
336
364
|
rb_raise(mkr_eXmlLimitExceeded, "XML input exceeds the byte budget");
|
|
337
365
|
}
|
|
338
366
|
|
|
339
|
-
/* Strict UTF-8 validation, allocation-free - no GC point
|
|
340
|
-
* borrowed
|
|
341
|
-
*
|
|
342
|
-
*
|
|
343
|
-
*
|
|
344
|
-
*
|
|
345
|
-
size_t
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
367
|
+
/* Strict UTF-8 validation via the shared, allocation-free core - no GC point
|
|
368
|
+
* while `ptr` is borrowed: an embedded NUL or any invalid UTF-8 is fatal (no
|
|
369
|
+
* U+FFFD repair - unlike the HTML mkr_utf8_sanitize path). The whole-string
|
|
370
|
+
* `s` is consulted for the cached coderange (it covers the BOM-stripped
|
|
371
|
+
* suffix too - the BOM is one complete UTF-8 character), while the validated
|
|
372
|
+
* bytes are the stripped suffix `ptr + off`. */
|
|
373
|
+
switch (mkr_text_check(s, ptr + off, (size_t)len)) {
|
|
374
|
+
case MKR_TEXT_HAS_NUL:
|
|
375
|
+
rb_raise(mkr_eXmlSyntaxError, "XML input must not contain a NUL byte");
|
|
376
|
+
case MKR_TEXT_INVALID_UTF8:
|
|
377
|
+
rb_raise(mkr_eXmlSyntaxError, "XML input must be valid UTF-8");
|
|
378
|
+
case MKR_TEXT_OK:
|
|
379
|
+
break;
|
|
352
380
|
}
|
|
353
381
|
/* Build the result through the VALUE, not the borrowed ptr (rb_str_subseq
|
|
354
382
|
* allocates, so the ptr must not be what it copies from). */
|
|
@@ -379,26 +407,24 @@ mkr_ruby_str_known_valid_utf8(VALUE str)
|
|
|
379
407
|
const char *
|
|
380
408
|
mkr_ruby_try_verified_text(VALUE sv, size_t max_bytes, mkr_ruby_borrowed_text_t *out)
|
|
381
409
|
{
|
|
382
|
-
/* ALLOCATION-FREE, like mkr_verify_text: the returned
|
|
383
|
-
* crossed a Ruby allocation
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
if ((size_t)len > max_bytes) {
|
|
410
|
+
/* ALLOCATION-FREE, like mkr_verify_text (see mkr_text_check): the returned
|
|
411
|
+
* borrow must not have crossed a Ruby allocation. */
|
|
412
|
+
size_t len = (size_t)RSTRING_LEN(sv);
|
|
413
|
+
if (len > max_bytes) {
|
|
387
414
|
return "string exceeds the maximum length";
|
|
388
415
|
}
|
|
389
416
|
const char *ptr = RSTRING_PTR(sv);
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
return "string is not valid UTF-8";
|
|
417
|
+
switch (mkr_text_check(sv, ptr, len)) {
|
|
418
|
+
case MKR_TEXT_HAS_NUL:
|
|
419
|
+
return "string contains a NUL byte";
|
|
420
|
+
case MKR_TEXT_INVALID_UTF8:
|
|
421
|
+
return "string is not valid UTF-8";
|
|
422
|
+
case MKR_TEXT_OK:
|
|
423
|
+
break;
|
|
398
424
|
}
|
|
399
425
|
out->value = sv;
|
|
400
426
|
out->ptr = ptr;
|
|
401
|
-
out->len =
|
|
427
|
+
out->len = len;
|
|
402
428
|
return NULL;
|
|
403
429
|
}
|
|
404
430
|
|
data/ext/makiri/core/mkr_alloc.h
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
/*
|
|
5
5
|
* Fail-closed memory primitives: overflow-checked size arithmetic and
|
|
6
6
|
* allocators, the foundation every other C layer (glue, xpath engine,
|
|
7
|
-
*
|
|
7
|
+
* dom_adapter) builds on, so the ad-hoc `cap *= 2` / `n + 1` /
|
|
8
8
|
* `malloc(n * sizeof(T))` patterns are written once, here, and fail closed.
|
|
9
9
|
* NOTHING in this header touches Ruby - exception mapping happens at the glue
|
|
10
10
|
* boundary. (mkr_core.h is a thin umbrella over this + the other core headers.)
|
data/ext/makiri/core/mkr_utf8.c
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
/* mkr_utf8.c - the shared pure-C UTF-8 validator. Ruby-free, allocation-free.
|
|
2
2
|
* See mkr_utf8.h for the contract and why it lives in core. Moved verbatim from
|
|
3
|
-
*
|
|
3
|
+
* dom_adapter/utf8_input.c (whose sanitiser fast path now calls this). */
|
|
4
4
|
#include "mkr_utf8.h"
|
|
5
5
|
|
|
6
6
|
#include <string.h> /* memcpy for the word-at-a-time ASCII scan */
|
data/ext/makiri/core/mkr_utf8.h
CHANGED
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
* separately (memchr).
|
|
14
14
|
*
|
|
15
15
|
* This lives in core so the Ruby bridge (mkr_verify_text - the strict
|
|
16
|
-
* programmatic-input gate) and the HTML input sanitiser (
|
|
16
|
+
* programmatic-input gate) and the HTML input sanitiser (dom_adapter/
|
|
17
17
|
* utf8_input.c fast path) share a single implementation, and so the bridge's
|
|
18
18
|
* validation never allocates: a borrowed RSTRING pointer must not be held
|
|
19
19
|
* across a Ruby allocation (= GC point), so the validator the bridge runs
|
|
@@ -67,7 +67,7 @@ int mkr_utf8_sanitize(const lxb_char_t *src, size_t len,
|
|
|
67
67
|
|
|
68
68
|
void mkr_parsed_destroy(mkr_parsed_t *p);
|
|
69
69
|
|
|
70
|
-
/* ---- attribute -> owner element index (
|
|
70
|
+
/* ---- attribute -> owner element index (dom_adapter/dom_index.c) ----
|
|
71
71
|
*
|
|
72
72
|
* Lexbor sets neither lxb_dom_attr_t::owner nor attr->node.parent, so an
|
|
73
73
|
* attribute node has no usable back-pointer to its element. We build our own
|
|
@@ -94,7 +94,7 @@ int mkr_parsed_dom_index_build(mkr_parsed_t *p);
|
|
|
94
94
|
* removing a subtree). */
|
|
95
95
|
void mkr_parsed_dom_index_invalidate(mkr_parsed_t *p);
|
|
96
96
|
|
|
97
|
-
/* ---- element index: tag id -> elements (
|
|
97
|
+
/* ---- element index: tag id -> elements (dom_adapter/dom_index.c) ----
|
|
98
98
|
*
|
|
99
99
|
* Co-built with the attr->owner index in the same document walk (same object,
|
|
100
100
|
* same lazy build, same invalidation). Groups every element by tag id in
|
|
@@ -119,7 +119,7 @@ lxb_dom_node_t *const *mkr_element_index_tag(const void *idx, lxb_tag_id_t tag_i
|
|
|
119
119
|
* (fail safe) for a NULL index. */
|
|
120
120
|
int mkr_element_index_has_foreign(const void *idx);
|
|
121
121
|
|
|
122
|
-
/* ---- source location (
|
|
122
|
+
/* ---- source location (dom_adapter/source_loc.c) ----
|
|
123
123
|
*
|
|
124
124
|
* mkr_parse_html drives Lexbor's low-level parser pipeline and chains the
|
|
125
125
|
* tokenizer's token-done callback so we can record the byte offset of every
|
|
@@ -156,7 +156,7 @@ size_t mkr_parsed_node_line(mkr_parsed_t *p, lxb_dom_node_t *node);
|
|
|
156
156
|
* mkr_parsed_destroy; exposed so post_parse.c need not see the index layout. */
|
|
157
157
|
void mkr_dom_index_free(void *idx);
|
|
158
158
|
|
|
159
|
-
/* ---- text-extraction index (
|
|
159
|
+
/* ---- text-extraction index (dom_adapter/text_index.c) ----
|
|
160
160
|
*
|
|
161
161
|
* Maps a node to the contiguous run of document-order TEXT/CDATA byte slices
|
|
162
162
|
* its subtree owns, so Node#text / XPath string-value can serve a pre-sized
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
#define MAKIRI_COMPAT_INTERNAL_H
|
|
3
3
|
|
|
4
4
|
/* Low-level helpers shared across the extension's C translation units (the
|
|
5
|
-
*
|
|
5
|
+
* dom_adapter layer and the Ruby↔C glue) but not part of the compat public
|
|
6
6
|
* API in compat.h. */
|
|
7
7
|
|
|
8
8
|
#include <lexbor/dom/dom.h>
|