makiri 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/release.yml +12 -7
- data/CHANGELOG.md +93 -14
- data/README.md +173 -7
- data/Rakefile +103 -7
- data/ext/makiri/bridge/bridge.h +28 -0
- data/ext/makiri/bridge/ruby_string.c +217 -0
- data/ext/makiri/core/mkr_alloc.h +1 -1
- data/ext/makiri/core/mkr_buf.c +35 -1
- data/ext/makiri/core/mkr_buf.h +37 -3
- data/ext/makiri/core/mkr_core.h +1 -1
- data/ext/makiri/core/mkr_hash.h +1 -1
- data/ext/makiri/core/mkr_text.h +8 -8
- data/ext/makiri/extconf.rb +20 -2
- data/ext/makiri/glue/glue.h +47 -11
- data/ext/makiri/glue/ruby_doc.c +117 -43
- data/ext/makiri/glue/ruby_html_css.c +246 -0
- data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +242 -51
- data/ext/makiri/glue/ruby_html_node.c +888 -0
- data/ext/makiri/glue/ruby_html_serialize.c +154 -0
- data/ext/makiri/glue/ruby_node.c +54 -748
- data/ext/makiri/glue/ruby_node_set.c +167 -32
- data/ext/makiri/glue/ruby_xml.c +420 -0
- data/ext/makiri/glue/ruby_xml_node.c +1386 -0
- data/ext/makiri/glue/ruby_xpath.c +59 -26
- data/ext/makiri/glue/ruby_xpath.h +19 -0
- data/ext/makiri/lexbor_compat/compat.h +42 -9
- data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
- data/ext/makiri/lexbor_compat/dom_index.c +2 -2
- data/ext/makiri/lexbor_compat/post_parse.c +100 -10
- data/ext/makiri/lexbor_compat/source_loc.c +13 -9
- data/ext/makiri/lexbor_compat/text_index.c +14 -8
- data/ext/makiri/lexbor_compat/utf8_input.c +85 -26
- data/ext/makiri/makiri.c +139 -6
- data/ext/makiri/makiri.h +43 -2
- data/ext/makiri/xml/mkr_xml.h +126 -0
- data/ext/makiri/xml/mkr_xml_chars.c +225 -0
- data/ext/makiri/xml/mkr_xml_mutate.c +875 -0
- data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
- data/ext/makiri/xml/mkr_xml_node.c +267 -0
- data/ext/makiri/xml/mkr_xml_node.h +119 -0
- data/ext/makiri/xml/mkr_xml_tree.c +1479 -0
- data/ext/makiri/xpath/mkr_xpath.c +59 -32
- data/ext/makiri/xpath/mkr_xpath.h +96 -4
- data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
- data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
- data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +202 -175
- data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +110 -86
- data/ext/makiri/xpath/mkr_xpath_internal.h +91 -200
- data/ext/makiri/xpath/mkr_xpath_lex.c +2 -2
- data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
- data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +142 -0
- data/ext/makiri/xpath/mkr_xpath_parse.c +5 -5
- data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
- data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
- data/ext/makiri/xpath/mkr_xpath_shared.c +593 -0
- data/ext/makiri/xpath/{mkr_xpath_value.c → mkr_xpath_value_body.h} +145 -656
- data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
- data/lib/makiri/{attribute.rb → attr.rb} +7 -3
- data/lib/makiri/cdata_section.rb +21 -0
- data/lib/makiri/comment.rb +12 -0
- data/lib/makiri/compat_aliases.rb +30 -0
- data/lib/makiri/document.rb +4 -76
- data/lib/makiri/document_fragment.rb +14 -9
- data/lib/makiri/element.rb +5 -3
- data/lib/makiri/html/document.rb +106 -0
- data/lib/makiri/html/node_methods.rb +19 -0
- data/lib/makiri/html.rb +12 -0
- data/lib/makiri/node.rb +58 -15
- data/lib/makiri/node_set.rb +8 -0
- data/lib/makiri/processing_instruction.rb +12 -0
- data/lib/makiri/text.rb +2 -0
- data/lib/makiri/version.rb +1 -1
- data/lib/makiri/xml/document.rb +24 -0
- data/lib/makiri/xml/node_methods.rb +37 -0
- data/lib/makiri/xml.rb +10 -0
- data/lib/makiri/xpath_context.rb +1 -1
- data/lib/makiri.rb +23 -5
- data/script/build_native_gem.rb +2 -2
- data/script/check_c_safety.rb +32 -0
- data/script/check_c_safety_allowlist.yml +83 -0
- metadata +35 -9
- data/ext/makiri/glue/ruby_css.c +0 -185
- data/ext/makiri/glue/ruby_serialize.c +0 -92
- data/lib/makiri/cdata.rb +0 -6
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
#include "glue.h"
|
|
2
|
+
|
|
3
|
+
#include <lexbor/html/serialize.h>
|
|
4
|
+
|
|
5
|
+
/*
|
|
6
|
+
* HTML serialization, delegated to Lexbor's serializer.
|
|
7
|
+
*
|
|
8
|
+
* Node#to_html / #to_s / #outer_html -> the node and its subtree (outer)
|
|
9
|
+
* Node#inner_html -> the node's children only (inner)
|
|
10
|
+
*
|
|
11
|
+
* Lexbor's serializer streams the output in many small chunks (one per tag /
|
|
12
|
+
* attribute / text piece). We collect them into a single growing C buffer
|
|
13
|
+
* (mkr_buf) and copy that into a Ruby String once at the end, instead of
|
|
14
|
+
* rb_str_cat per chunk - the per-chunk Ruby-string growth (a capacity check +
|
|
15
|
+
* coderange bookkeeping on each of thousands of appends) was the dominant cost.
|
|
16
|
+
* The buffer is **pre-reserved** to roughly the output size up front, so the
|
|
17
|
+
* per-chunk appends do not realloc on every geometric step - growing in cheap,
|
|
18
|
+
* GC-untracked C memory and paying one final copy (vs serializing straight into
|
|
19
|
+
* a growing Ruby String, where the intermediate growth pressures the GC and
|
|
20
|
+
* measured slower). Lexbor emits UTF-8, which is the string's encoding.
|
|
21
|
+
*
|
|
22
|
+
* Mutating setters (inner_html=, outer_html=) arrive with the v0.2 mutation
|
|
23
|
+
* API and are not defined here.
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
static lxb_status_t
|
|
27
|
+
mkr_serialize_cb(const lxb_char_t *data, size_t len, void *ctx)
|
|
28
|
+
{
|
|
29
|
+
return mkr_buf_append((mkr_buf_t *)ctx, data, len) == MKR_OK
|
|
30
|
+
? LXB_STATUS_OK
|
|
31
|
+
: LXB_STATUS_ERROR_MEMORY_ALLOCATION;
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
/* Copy the collected bytes into one UTF-8 Ruby String, always freeing the
|
|
35
|
+
* buffer; raises if the serializer (or an append) failed. */
|
|
36
|
+
static VALUE
|
|
37
|
+
mkr_serialized_str(mkr_buf_t *buf, lxb_status_t st)
|
|
38
|
+
{
|
|
39
|
+
if (st != LXB_STATUS_OK) {
|
|
40
|
+
mkr_buf_free(buf);
|
|
41
|
+
rb_raise(mkr_eError, "HTML serialization failed");
|
|
42
|
+
}
|
|
43
|
+
VALUE str = rb_utf8_str_new(buf->len ? buf->data : "", (long)buf->len);
|
|
44
|
+
mkr_buf_free(buf);
|
|
45
|
+
return str;
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
/* The serializer's buffer cap + initial reservation, both derived from the
|
|
49
|
+
* document's live bytes in one walk. The cap is the Lexbor analogue of the XML
|
|
50
|
+
* serializer's arena_bytes cap (mkr_xml_serialize_cap): 32x the live bytes
|
|
51
|
+
* (covering escaping + maximal pretty indentation) plus a 64 KiB floor, clamped
|
|
52
|
+
* to MKR_BUF_HARD_MAX - tight for a small document yet scaling with a large one
|
|
53
|
+
* so a legitimate parse round-trips through to_html (HTML parsing is itself
|
|
54
|
+
* byte-uncapped); a pathologically deep pretty-print exceeds it and fails closed
|
|
55
|
+
* (MKR_ERR_LIMIT) rather than growing without bound. The HTML tree cannot cycle
|
|
56
|
+
* (mutation guards + Lexbor's insert checks), so the cap is never reached in
|
|
57
|
+
* normal operation. The reservation is ~live/4: the serialized output is a
|
|
58
|
+
* fraction of the arena (96-byte node structs dwarf their markup), so this
|
|
59
|
+
* pre-sizes close to the real output without a wasteful over-allocation, leaving
|
|
60
|
+
* the geometric growth to cover any underestimate. */
|
|
61
|
+
static void
|
|
62
|
+
mkr_html_serialize_sizes(lxb_dom_node_t *node, size_t *cap, size_t *reserve)
|
|
63
|
+
{
|
|
64
|
+
size_t live = mkr_lxb_document_bytes(node);
|
|
65
|
+
|
|
66
|
+
size_t c = 65536; /* floor for a small subtree */
|
|
67
|
+
if (live > 0) {
|
|
68
|
+
c = (live <= (SIZE_MAX - c) / 32) ? c + live * 32 : SIZE_MAX;
|
|
69
|
+
}
|
|
70
|
+
if (c > MKR_BUF_HARD_MAX) {
|
|
71
|
+
c = MKR_BUF_HARD_MAX;
|
|
72
|
+
}
|
|
73
|
+
*cap = c;
|
|
74
|
+
|
|
75
|
+
size_t r = live / 4;
|
|
76
|
+
if (r < 4096) {
|
|
77
|
+
r = 4096;
|
|
78
|
+
}
|
|
79
|
+
*reserve = r;
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
/* Serialize +node+ into a fresh UTF-8 String. +deep+ selects the children-only
|
|
83
|
+
* (inner) serializer over the tree (outer) one; +pretty+ selects indented
|
|
84
|
+
* output. Raises Makiri::Error on serializer failure or if the output exceeds
|
|
85
|
+
* the content-scaled cap. */
|
|
86
|
+
static VALUE
|
|
87
|
+
mkr_html_serialize(lxb_dom_node_t *node, int deep, int pretty)
|
|
88
|
+
{
|
|
89
|
+
size_t cap, reserve;
|
|
90
|
+
mkr_html_serialize_sizes(node, &cap, &reserve);
|
|
91
|
+
|
|
92
|
+
mkr_buf_t buf;
|
|
93
|
+
mkr_buf_init(&buf, cap);
|
|
94
|
+
(void)mkr_buf_reserve(&buf, reserve); /* best-effort pre-size */
|
|
95
|
+
|
|
96
|
+
lxb_status_t st;
|
|
97
|
+
if (deep) {
|
|
98
|
+
st = pretty
|
|
99
|
+
? lxb_html_serialize_pretty_deep_cb(node, LXB_HTML_SERIALIZE_OPT_UNDEF,
|
|
100
|
+
0, mkr_serialize_cb, &buf)
|
|
101
|
+
: lxb_html_serialize_deep_cb(node, mkr_serialize_cb, &buf);
|
|
102
|
+
} else {
|
|
103
|
+
st = pretty
|
|
104
|
+
? lxb_html_serialize_pretty_tree_cb(node, LXB_HTML_SERIALIZE_OPT_UNDEF,
|
|
105
|
+
0, mkr_serialize_cb, &buf)
|
|
106
|
+
: lxb_html_serialize_tree_cb(node, mkr_serialize_cb, &buf);
|
|
107
|
+
}
|
|
108
|
+
return mkr_serialized_str(&buf, st);
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
/* Read the optional `pretty:` keyword. */
|
|
112
|
+
static int
|
|
113
|
+
mkr_serialize_pretty_opt(int argc, VALUE *argv)
|
|
114
|
+
{
|
|
115
|
+
VALUE opts = Qnil;
|
|
116
|
+
rb_scan_args(argc, argv, "0:", &opts);
|
|
117
|
+
if (NIL_P(opts)) {
|
|
118
|
+
return 0;
|
|
119
|
+
}
|
|
120
|
+
return RTEST(rb_hash_aref(opts, ID2SYM(rb_intern("pretty"))));
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
/* Outer HTML: the node itself plus its descendants.
|
|
124
|
+
* Pass `pretty: true` for indented output. */
|
|
125
|
+
static VALUE
|
|
126
|
+
mkr_node_to_html(int argc, VALUE *argv, VALUE self)
|
|
127
|
+
{
|
|
128
|
+
int pretty = mkr_serialize_pretty_opt(argc, argv);
|
|
129
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
130
|
+
|
|
131
|
+
/* A document fragment has no tag of its own; "outer" == its children, so the
|
|
132
|
+
* deep (children) serializer is the right one (the tree serializer rejects a
|
|
133
|
+
* fragment node). */
|
|
134
|
+
int deep = (node->type == LXB_DOM_NODE_TYPE_DOCUMENT_FRAGMENT);
|
|
135
|
+
return mkr_html_serialize(node, deep, pretty);
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
/* Inner HTML: the node's children, without the node's own tag. */
|
|
139
|
+
static VALUE
|
|
140
|
+
mkr_node_inner_html(int argc, VALUE *argv, VALUE self)
|
|
141
|
+
{
|
|
142
|
+
int pretty = mkr_serialize_pretty_opt(argc, argv);
|
|
143
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
144
|
+
return mkr_html_serialize(node, 1 /* deep */, pretty);
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
void
|
|
148
|
+
mkr_init_serialize(void)
|
|
149
|
+
{
|
|
150
|
+
rb_define_method(mkr_mHtmlNodeMethods, "to_html", mkr_node_to_html, -1);
|
|
151
|
+
rb_define_method(mkr_mHtmlNodeMethods, "to_s", mkr_node_to_html, -1);
|
|
152
|
+
rb_define_method(mkr_mHtmlNodeMethods, "outer_html", mkr_node_to_html, -1);
|
|
153
|
+
rb_define_method(mkr_mHtmlNodeMethods, "inner_html", mkr_node_inner_html, -1);
|
|
154
|
+
}
|