makiri 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/release.yml +12 -7
  3. data/CHANGELOG.md +93 -14
  4. data/README.md +173 -7
  5. data/Rakefile +103 -7
  6. data/ext/makiri/bridge/bridge.h +28 -0
  7. data/ext/makiri/bridge/ruby_string.c +217 -0
  8. data/ext/makiri/core/mkr_alloc.h +1 -1
  9. data/ext/makiri/core/mkr_buf.c +35 -1
  10. data/ext/makiri/core/mkr_buf.h +37 -3
  11. data/ext/makiri/core/mkr_core.h +1 -1
  12. data/ext/makiri/core/mkr_hash.h +1 -1
  13. data/ext/makiri/core/mkr_text.h +8 -8
  14. data/ext/makiri/extconf.rb +20 -2
  15. data/ext/makiri/glue/glue.h +47 -11
  16. data/ext/makiri/glue/ruby_doc.c +117 -43
  17. data/ext/makiri/glue/ruby_html_css.c +246 -0
  18. data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +242 -51
  19. data/ext/makiri/glue/ruby_html_node.c +888 -0
  20. data/ext/makiri/glue/ruby_html_serialize.c +154 -0
  21. data/ext/makiri/glue/ruby_node.c +54 -748
  22. data/ext/makiri/glue/ruby_node_set.c +167 -32
  23. data/ext/makiri/glue/ruby_xml.c +420 -0
  24. data/ext/makiri/glue/ruby_xml_node.c +1386 -0
  25. data/ext/makiri/glue/ruby_xpath.c +59 -26
  26. data/ext/makiri/glue/ruby_xpath.h +19 -0
  27. data/ext/makiri/lexbor_compat/compat.h +42 -9
  28. data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
  29. data/ext/makiri/lexbor_compat/dom_index.c +2 -2
  30. data/ext/makiri/lexbor_compat/post_parse.c +100 -10
  31. data/ext/makiri/lexbor_compat/source_loc.c +13 -9
  32. data/ext/makiri/lexbor_compat/text_index.c +14 -8
  33. data/ext/makiri/lexbor_compat/utf8_input.c +85 -26
  34. data/ext/makiri/makiri.c +139 -6
  35. data/ext/makiri/makiri.h +43 -2
  36. data/ext/makiri/xml/mkr_xml.h +126 -0
  37. data/ext/makiri/xml/mkr_xml_chars.c +225 -0
  38. data/ext/makiri/xml/mkr_xml_mutate.c +875 -0
  39. data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
  40. data/ext/makiri/xml/mkr_xml_node.c +267 -0
  41. data/ext/makiri/xml/mkr_xml_node.h +119 -0
  42. data/ext/makiri/xml/mkr_xml_tree.c +1479 -0
  43. data/ext/makiri/xpath/mkr_xpath.c +59 -32
  44. data/ext/makiri/xpath/mkr_xpath.h +96 -4
  45. data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
  46. data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
  47. data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +202 -175
  48. data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +110 -86
  49. data/ext/makiri/xpath/mkr_xpath_internal.h +91 -200
  50. data/ext/makiri/xpath/mkr_xpath_lex.c +2 -2
  51. data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
  52. data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +142 -0
  53. data/ext/makiri/xpath/mkr_xpath_parse.c +5 -5
  54. data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
  55. data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
  56. data/ext/makiri/xpath/mkr_xpath_shared.c +593 -0
  57. data/ext/makiri/xpath/{mkr_xpath_value.c → mkr_xpath_value_body.h} +145 -656
  58. data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
  59. data/lib/makiri/{attribute.rb → attr.rb} +7 -3
  60. data/lib/makiri/cdata_section.rb +21 -0
  61. data/lib/makiri/comment.rb +12 -0
  62. data/lib/makiri/compat_aliases.rb +30 -0
  63. data/lib/makiri/document.rb +4 -76
  64. data/lib/makiri/document_fragment.rb +14 -9
  65. data/lib/makiri/element.rb +5 -3
  66. data/lib/makiri/html/document.rb +106 -0
  67. data/lib/makiri/html/node_methods.rb +19 -0
  68. data/lib/makiri/html.rb +12 -0
  69. data/lib/makiri/node.rb +58 -15
  70. data/lib/makiri/node_set.rb +8 -0
  71. data/lib/makiri/processing_instruction.rb +12 -0
  72. data/lib/makiri/text.rb +2 -0
  73. data/lib/makiri/version.rb +1 -1
  74. data/lib/makiri/xml/document.rb +24 -0
  75. data/lib/makiri/xml/node_methods.rb +37 -0
  76. data/lib/makiri/xml.rb +10 -0
  77. data/lib/makiri/xpath_context.rb +1 -1
  78. data/lib/makiri.rb +23 -5
  79. data/script/build_native_gem.rb +2 -2
  80. data/script/check_c_safety.rb +32 -0
  81. data/script/check_c_safety_allowlist.yml +83 -0
  82. metadata +35 -9
  83. data/ext/makiri/glue/ruby_css.c +0 -185
  84. data/ext/makiri/glue/ruby_serialize.c +0 -92
  85. data/lib/makiri/cdata.rb +0 -6
@@ -0,0 +1,154 @@
1
+ #include "glue.h"
2
+
3
+ #include <lexbor/html/serialize.h>
4
+
5
+ /*
6
+ * HTML serialization, delegated to Lexbor's serializer.
7
+ *
8
+ * Node#to_html / #to_s / #outer_html -> the node and its subtree (outer)
9
+ * Node#inner_html -> the node's children only (inner)
10
+ *
11
+ * Lexbor's serializer streams the output in many small chunks (one per tag /
12
+ * attribute / text piece). We collect them into a single growing C buffer
13
+ * (mkr_buf) and copy that into a Ruby String once at the end, instead of
14
+ * rb_str_cat per chunk - the per-chunk Ruby-string growth (a capacity check +
15
+ * coderange bookkeeping on each of thousands of appends) was the dominant cost.
16
+ * The buffer is **pre-reserved** to roughly the output size up front, so the
17
+ * per-chunk appends do not realloc on every geometric step - growing in cheap,
18
+ * GC-untracked C memory and paying one final copy (vs serializing straight into
19
+ * a growing Ruby String, where the intermediate growth pressures the GC and
20
+ * measured slower). Lexbor emits UTF-8, which is the string's encoding.
21
+ *
22
+ * Mutating setters (inner_html=, outer_html=) arrive with the v0.2 mutation
23
+ * API and are not defined here.
24
+ */
25
+
26
+ static lxb_status_t
27
+ mkr_serialize_cb(const lxb_char_t *data, size_t len, void *ctx)
28
+ {
29
+ return mkr_buf_append((mkr_buf_t *)ctx, data, len) == MKR_OK
30
+ ? LXB_STATUS_OK
31
+ : LXB_STATUS_ERROR_MEMORY_ALLOCATION;
32
+ }
33
+
34
+ /* Copy the collected bytes into one UTF-8 Ruby String, always freeing the
35
+ * buffer; raises if the serializer (or an append) failed. */
36
+ static VALUE
37
+ mkr_serialized_str(mkr_buf_t *buf, lxb_status_t st)
38
+ {
39
+ if (st != LXB_STATUS_OK) {
40
+ mkr_buf_free(buf);
41
+ rb_raise(mkr_eError, "HTML serialization failed");
42
+ }
43
+ VALUE str = rb_utf8_str_new(buf->len ? buf->data : "", (long)buf->len);
44
+ mkr_buf_free(buf);
45
+ return str;
46
+ }
47
+
48
+ /* The serializer's buffer cap + initial reservation, both derived from the
49
+ * document's live bytes in one walk. The cap is the Lexbor analogue of the XML
50
+ * serializer's arena_bytes cap (mkr_xml_serialize_cap): 32x the live bytes
51
+ * (covering escaping + maximal pretty indentation) plus a 64 KiB floor, clamped
52
+ * to MKR_BUF_HARD_MAX - tight for a small document yet scaling with a large one
53
+ * so a legitimate parse round-trips through to_html (HTML parsing is itself
54
+ * byte-uncapped); a pathologically deep pretty-print exceeds it and fails closed
55
+ * (MKR_ERR_LIMIT) rather than growing without bound. The HTML tree cannot cycle
56
+ * (mutation guards + Lexbor's insert checks), so the cap is never reached in
57
+ * normal operation. The reservation is ~live/4: the serialized output is a
58
+ * fraction of the arena (96-byte node structs dwarf their markup), so this
59
+ * pre-sizes close to the real output without a wasteful over-allocation, leaving
60
+ * the geometric growth to cover any underestimate. */
61
+ static void
62
+ mkr_html_serialize_sizes(lxb_dom_node_t *node, size_t *cap, size_t *reserve)
63
+ {
64
+ size_t live = mkr_lxb_document_bytes(node);
65
+
66
+ size_t c = 65536; /* floor for a small subtree */
67
+ if (live > 0) {
68
+ c = (live <= (SIZE_MAX - c) / 32) ? c + live * 32 : SIZE_MAX;
69
+ }
70
+ if (c > MKR_BUF_HARD_MAX) {
71
+ c = MKR_BUF_HARD_MAX;
72
+ }
73
+ *cap = c;
74
+
75
+ size_t r = live / 4;
76
+ if (r < 4096) {
77
+ r = 4096;
78
+ }
79
+ *reserve = r;
80
+ }
81
+
82
+ /* Serialize +node+ into a fresh UTF-8 String. +deep+ selects the children-only
83
+ * (inner) serializer over the tree (outer) one; +pretty+ selects indented
84
+ * output. Raises Makiri::Error on serializer failure or if the output exceeds
85
+ * the content-scaled cap. */
86
+ static VALUE
87
+ mkr_html_serialize(lxb_dom_node_t *node, int deep, int pretty)
88
+ {
89
+ size_t cap, reserve;
90
+ mkr_html_serialize_sizes(node, &cap, &reserve);
91
+
92
+ mkr_buf_t buf;
93
+ mkr_buf_init(&buf, cap);
94
+ (void)mkr_buf_reserve(&buf, reserve); /* best-effort pre-size */
95
+
96
+ lxb_status_t st;
97
+ if (deep) {
98
+ st = pretty
99
+ ? lxb_html_serialize_pretty_deep_cb(node, LXB_HTML_SERIALIZE_OPT_UNDEF,
100
+ 0, mkr_serialize_cb, &buf)
101
+ : lxb_html_serialize_deep_cb(node, mkr_serialize_cb, &buf);
102
+ } else {
103
+ st = pretty
104
+ ? lxb_html_serialize_pretty_tree_cb(node, LXB_HTML_SERIALIZE_OPT_UNDEF,
105
+ 0, mkr_serialize_cb, &buf)
106
+ : lxb_html_serialize_tree_cb(node, mkr_serialize_cb, &buf);
107
+ }
108
+ return mkr_serialized_str(&buf, st);
109
+ }
110
+
111
+ /* Read the optional `pretty:` keyword. */
112
+ static int
113
+ mkr_serialize_pretty_opt(int argc, VALUE *argv)
114
+ {
115
+ VALUE opts = Qnil;
116
+ rb_scan_args(argc, argv, "0:", &opts);
117
+ if (NIL_P(opts)) {
118
+ return 0;
119
+ }
120
+ return RTEST(rb_hash_aref(opts, ID2SYM(rb_intern("pretty"))));
121
+ }
122
+
123
+ /* Outer HTML: the node itself plus its descendants.
124
+ * Pass `pretty: true` for indented output. */
125
+ static VALUE
126
+ mkr_node_to_html(int argc, VALUE *argv, VALUE self)
127
+ {
128
+ int pretty = mkr_serialize_pretty_opt(argc, argv);
129
+ lxb_dom_node_t *node = mkr_html_node_unwrap(self);
130
+
131
+ /* A document fragment has no tag of its own; "outer" == its children, so the
132
+ * deep (children) serializer is the right one (the tree serializer rejects a
133
+ * fragment node). */
134
+ int deep = (node->type == LXB_DOM_NODE_TYPE_DOCUMENT_FRAGMENT);
135
+ return mkr_html_serialize(node, deep, pretty);
136
+ }
137
+
138
+ /* Inner HTML: the node's children, without the node's own tag. */
139
+ static VALUE
140
+ mkr_node_inner_html(int argc, VALUE *argv, VALUE self)
141
+ {
142
+ int pretty = mkr_serialize_pretty_opt(argc, argv);
143
+ lxb_dom_node_t *node = mkr_html_node_unwrap(self);
144
+ return mkr_html_serialize(node, 1 /* deep */, pretty);
145
+ }
146
+
147
+ void
148
+ mkr_init_serialize(void)
149
+ {
150
+ rb_define_method(mkr_mHtmlNodeMethods, "to_html", mkr_node_to_html, -1);
151
+ rb_define_method(mkr_mHtmlNodeMethods, "to_s", mkr_node_to_html, -1);
152
+ rb_define_method(mkr_mHtmlNodeMethods, "outer_html", mkr_node_to_html, -1);
153
+ rb_define_method(mkr_mHtmlNodeMethods, "inner_html", mkr_node_inner_html, -1);
154
+ }