makiri 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/release.yml +12 -7
  3. data/CHANGELOG.md +93 -14
  4. data/README.md +173 -7
  5. data/Rakefile +103 -7
  6. data/ext/makiri/bridge/bridge.h +28 -0
  7. data/ext/makiri/bridge/ruby_string.c +217 -0
  8. data/ext/makiri/core/mkr_alloc.h +1 -1
  9. data/ext/makiri/core/mkr_buf.c +35 -1
  10. data/ext/makiri/core/mkr_buf.h +37 -3
  11. data/ext/makiri/core/mkr_core.h +1 -1
  12. data/ext/makiri/core/mkr_hash.h +1 -1
  13. data/ext/makiri/core/mkr_text.h +8 -8
  14. data/ext/makiri/extconf.rb +20 -2
  15. data/ext/makiri/glue/glue.h +47 -11
  16. data/ext/makiri/glue/ruby_doc.c +117 -43
  17. data/ext/makiri/glue/ruby_html_css.c +246 -0
  18. data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +242 -51
  19. data/ext/makiri/glue/ruby_html_node.c +888 -0
  20. data/ext/makiri/glue/ruby_html_serialize.c +154 -0
  21. data/ext/makiri/glue/ruby_node.c +54 -748
  22. data/ext/makiri/glue/ruby_node_set.c +167 -32
  23. data/ext/makiri/glue/ruby_xml.c +420 -0
  24. data/ext/makiri/glue/ruby_xml_node.c +1386 -0
  25. data/ext/makiri/glue/ruby_xpath.c +59 -26
  26. data/ext/makiri/glue/ruby_xpath.h +19 -0
  27. data/ext/makiri/lexbor_compat/compat.h +42 -9
  28. data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
  29. data/ext/makiri/lexbor_compat/dom_index.c +2 -2
  30. data/ext/makiri/lexbor_compat/post_parse.c +100 -10
  31. data/ext/makiri/lexbor_compat/source_loc.c +13 -9
  32. data/ext/makiri/lexbor_compat/text_index.c +14 -8
  33. data/ext/makiri/lexbor_compat/utf8_input.c +85 -26
  34. data/ext/makiri/makiri.c +139 -6
  35. data/ext/makiri/makiri.h +43 -2
  36. data/ext/makiri/xml/mkr_xml.h +126 -0
  37. data/ext/makiri/xml/mkr_xml_chars.c +225 -0
  38. data/ext/makiri/xml/mkr_xml_mutate.c +875 -0
  39. data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
  40. data/ext/makiri/xml/mkr_xml_node.c +267 -0
  41. data/ext/makiri/xml/mkr_xml_node.h +119 -0
  42. data/ext/makiri/xml/mkr_xml_tree.c +1479 -0
  43. data/ext/makiri/xpath/mkr_xpath.c +59 -32
  44. data/ext/makiri/xpath/mkr_xpath.h +96 -4
  45. data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
  46. data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
  47. data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +202 -175
  48. data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +110 -86
  49. data/ext/makiri/xpath/mkr_xpath_internal.h +91 -200
  50. data/ext/makiri/xpath/mkr_xpath_lex.c +2 -2
  51. data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
  52. data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +142 -0
  53. data/ext/makiri/xpath/mkr_xpath_parse.c +5 -5
  54. data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
  55. data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
  56. data/ext/makiri/xpath/mkr_xpath_shared.c +593 -0
  57. data/ext/makiri/xpath/{mkr_xpath_value.c → mkr_xpath_value_body.h} +145 -656
  58. data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
  59. data/lib/makiri/{attribute.rb → attr.rb} +7 -3
  60. data/lib/makiri/cdata_section.rb +21 -0
  61. data/lib/makiri/comment.rb +12 -0
  62. data/lib/makiri/compat_aliases.rb +30 -0
  63. data/lib/makiri/document.rb +4 -76
  64. data/lib/makiri/document_fragment.rb +14 -9
  65. data/lib/makiri/element.rb +5 -3
  66. data/lib/makiri/html/document.rb +106 -0
  67. data/lib/makiri/html/node_methods.rb +19 -0
  68. data/lib/makiri/html.rb +12 -0
  69. data/lib/makiri/node.rb +58 -15
  70. data/lib/makiri/node_set.rb +8 -0
  71. data/lib/makiri/processing_instruction.rb +12 -0
  72. data/lib/makiri/text.rb +2 -0
  73. data/lib/makiri/version.rb +1 -1
  74. data/lib/makiri/xml/document.rb +24 -0
  75. data/lib/makiri/xml/node_methods.rb +37 -0
  76. data/lib/makiri/xml.rb +10 -0
  77. data/lib/makiri/xpath_context.rb +1 -1
  78. data/lib/makiri.rb +23 -5
  79. data/script/build_native_gem.rb +2 -2
  80. data/script/check_c_safety.rb +32 -0
  81. data/script/check_c_safety_allowlist.yml +83 -0
  82. metadata +35 -9
  83. data/ext/makiri/glue/ruby_css.c +0 -185
  84. data/ext/makiri/glue/ruby_serialize.c +0 -92
  85. data/lib/makiri/cdata.rb +0 -6
@@ -0,0 +1,420 @@
1
+ /* ruby_xml.c - Ruby boundary for the native XML reader (Phase 1).
2
+ *
3
+ * Makiri::XML::Document.parse(source) (and the Makiri::XML(source) convenience
4
+ * that delegates to it): strict-decode the input (§2.1), then run the Ruby-free
5
+ * parser with the GVL released, and return a
6
+ * Makiri::XML::Document. The document is held in a kind=MKR_DOC_XML mkr_parsed_t
7
+ * (the common document handle, §2.3) and wrapped by mkr_wrap_document, which GC
8
+ * frees via mkr_parsed_destroy (the XML branch whole-arena-frees).
9
+ */
10
+ #include "../makiri.h"
11
+ #include "../core/mkr_core.h"
12
+ #include "../xml/mkr_xml.h"
13
+ #include "../xml/mkr_xml_node.h"
14
+ #include "glue.h" /* mkr_wrap_document, mkr_parsed_* (via compat.h) */
15
+ #include "ruby_xpath.h" /* mkr_xpath_value_to_ruby / mkr_xpath_raise (shared) */
16
+ #include "../xpath/mkr_xpath.h"
17
+
18
+ #include <ruby/thread.h>
19
+
20
+ /* ---- GVL-released parse ---- */
21
+ typedef struct {
22
+ const char *src;
23
+ size_t len;
24
+ mkr_xml_limits_t limits; /* per-parse budget overrides (0 fields = default) */
25
+ mkr_xml_doc_t *result;
26
+ mkr_xml_status_t status;
27
+ } mkr_xml_parse_nogvl_t;
28
+
29
+ static void *
30
+ mkr_xml_parse_nogvl(void *p)
31
+ {
32
+ mkr_xml_parse_nogvl_t *a = (mkr_xml_parse_nogvl_t *)p;
33
+ a->result = mkr_xml_parse_ex(a->src, a->len, &a->limits, &a->status);
34
+ return NULL;
35
+ }
36
+
37
+ /* Read the optional per-parse budget overrides from the keyword hash. Today only
38
+ * max_bytes (the arena memory ceiling) is configurable; rb_get_kwargs rejects any
39
+ * other keyword, and new budgets join the table here as they become runtime-
40
+ * configurable. max_bytes must be a positive Integer (0 / negative / non-Integer
41
+ * raise), and 0 in the struct means "use the compile-time default". */
42
+ static mkr_xml_limits_t
43
+ mkr_xml_parse_limits(VALUE rb_opts)
44
+ {
45
+ mkr_xml_limits_t limits = { 0 };
46
+ if (NIL_P(rb_opts)) return limits;
47
+
48
+ static ID kw_ids[1];
49
+ if (kw_ids[0] == 0) kw_ids[0] = rb_intern_const("max_bytes");
50
+ VALUE kw_vals[1];
51
+ rb_get_kwargs(rb_opts, kw_ids, 0, 1, kw_vals); /* unknown keyword -> ArgumentError */
52
+
53
+ if (kw_vals[0] != Qundef) {
54
+ if (!RB_INTEGER_TYPE_P(kw_vals[0])) {
55
+ rb_raise(rb_eTypeError, "max_bytes must be an Integer");
56
+ }
57
+ /* Reject <= 0 BEFORE the unsigned conversion: NUM2SIZET wraps a negative
58
+ * Integer into a huge size_t (an accidental budget bypass), so guard the
59
+ * sign first. A too-large positive still raises RangeError in NUM2SIZET. */
60
+ if (RTEST(rb_funcall(kw_vals[0], rb_intern("<="), 1, INT2FIX(0)))) {
61
+ rb_raise(rb_eArgError, "max_bytes must be positive");
62
+ }
63
+ limits.max_bytes = NUM2SIZET(kw_vals[0]);
64
+ }
65
+ return limits;
66
+ }
67
+
68
+ /* call-seq: Makiri::XML::Document.parse(source, max_bytes: nil) -> Makiri::XML::Document
69
+ * Makiri::XML(source, max_bytes: nil) -> Makiri::XML::Document
70
+ * +source+ is a String or any object responding to +#read+ (an IO / File /
71
+ * StringIO); +max_bytes+ overrides the default arena memory ceiling for this
72
+ * parse. Read a non-UTF-8 file in binary mode (File.binread / "rb") so the
73
+ * encoding is autodetected from its BOM / declaration. */
74
+ static VALUE
75
+ mkr_xml_s_parse(int argc, VALUE *argv, VALUE self)
76
+ {
77
+ (void)self;
78
+ VALUE rb_source, rb_opts;
79
+ rb_scan_args(argc, argv, "1:", &rb_source, &rb_opts);
80
+ mkr_xml_limits_t limits = mkr_xml_parse_limits(rb_opts); /* validates; may raise */
81
+ size_t budget = limits.max_bytes ? limits.max_bytes : (size_t)MKR_XML_MAX_BYTES;
82
+
83
+ /* Read an IO/File-like source (an object responding to #read), like the HTML
84
+ * entry; a String passes straight through. */
85
+ if (rb_respond_to(rb_source, rb_intern("read"))) {
86
+ rb_source = rb_funcall(rb_source, rb_intern("read"), 0);
87
+ }
88
+
89
+ /* Strict decode under the GVL: invalid UTF-8 / undecodable byte / NUL all
90
+ * raise Makiri::XML::SyntaxError here (no U+FFFD repair). Passing the budget
91
+ * lets decode reject an over-budget input (LimitExceeded) before its
92
+ * validation copy and the GVL-release copy below - so a hostile oversized
93
+ * document is not materialised twice for a doomed parse. */
94
+ VALUE decoded = mkr_xml_decode_input(rb_String(rb_source), budget);
95
+
96
+ /* Build an empty XML handle and wrap it first (doc == NULL) so a failure
97
+ * mid-parse frees cleanly via GC (mkr_parsed_destroy -> the XML branch ->
98
+ * mkr_xml_doc_destroy(NULL), a no-op). */
99
+ mkr_parsed_t *parsed = mkr_parsed_new_xml(NULL);
100
+ if (parsed == NULL) {
101
+ rb_raise(mkr_eError, "out of memory allocating XML document");
102
+ }
103
+ VALUE obj = mkr_wrap_document(parsed); /* GC owns +parsed+ from here */
104
+
105
+ /* Copy the decoded bytes so the parse can run with the GVL released without
106
+ * racing GC/compaction on the String's backing store. */
107
+ mkr_owned_bytes_t source = {0};
108
+ if (mkr_ruby_copy_bytes(decoded, &source) != 0) {
109
+ rb_raise(mkr_eError, "out of memory copying XML source");
110
+ }
111
+ RB_GC_GUARD(decoded);
112
+
113
+ mkr_xml_parse_nogvl_t args = { source.ptr, source.len, limits, NULL, MKR_XML_OK };
114
+ rb_thread_call_without_gvl(mkr_xml_parse_nogvl, &args, NULL, NULL);
115
+ mkr_owned_bytes_clear(&source);
116
+
117
+ if (args.result == NULL) {
118
+ switch (args.status) {
119
+ case MKR_XML_ERR_SYNTAX: rb_raise(mkr_eXmlSyntaxError, "malformed XML"); break;
120
+ case MKR_XML_ERR_LIMIT: rb_raise(mkr_eXmlLimitExceeded, "XML document budget exceeded"); break;
121
+ case MKR_XML_ERR_VERSION: rb_raise(mkr_eXmlSyntaxError,
122
+ "unsupported XML version (only XML 1.0 is supported)"); break;
123
+ default: rb_raise(mkr_eError, "failed to parse XML document"); break;
124
+ }
125
+ }
126
+ mkr_parsed_set_xml_doc(parsed, args.result);
127
+ RB_GC_GUARD(obj);
128
+ return obj;
129
+ }
130
+
131
+ /* XPath value -> Ruby and error -> exception are shared with the HTML query glue
132
+ * (mkr_xpath_value_to_ruby / mkr_xpath_raise, ruby_xpath.h): both query entry
133
+ * points run the same engine and return the same public value/error types, so
134
+ * the conversion lives in one place. mkr_node_set_new/push are kind-aware, so the
135
+ * shared value converter wraps these results as Makiri::XML::* nodes. */
136
+
137
+ /* Resolve the (document VALUE, context node) an XPath query runs against: for a
138
+ * Makiri::XML::Document the context is the document node, for a node it is that
139
+ * node (and its owning Document). */
140
+ static mkr_xml_node_t *
141
+ mkr_xml_query_context(VALUE self, VALUE *out_document)
142
+ {
143
+ /* mkr_xml_node_unwrap is kind-checked (raises on a non-XML node) and resolves
144
+ * an XML Document to its arena document node; mkr_node_document gives the
145
+ * keepalive Document for either. */
146
+ *out_document = mkr_node_document(self);
147
+ return mkr_xml_node_unwrap(self);
148
+ }
149
+
150
+ /* Register a {prefix => uri} Ruby Hash onto +ctx+ for a single query. On any bad
151
+ * entry (non-string-coercible, invalid UTF-8 / embedded NUL, or OOM) the context
152
+ * is freed and an exception is raised - never a partial registration. RSS/Atom
153
+ * live in a default namespace, so a prefix is the strict-mode way to select them
154
+ * (e.g. xpath("//a:entry", "a" => "http://www.w3.org/2005/Atom")). */
155
+ static void
156
+ mkr_xml_register_query_namespaces(mkr_xpath_context_t *ctx, VALUE rb_ns)
157
+ {
158
+ if (NIL_P(rb_ns)) return;
159
+ if (!RB_TYPE_P(rb_ns, T_HASH)) {
160
+ mkr_xpath_context_free(ctx);
161
+ rb_raise(rb_eTypeError, "namespaces must be a Hash of prefix => uri");
162
+ }
163
+ size_t cap = mkr_ctx_limits(ctx)->max_string_bytes;
164
+ VALUE keys = rb_funcall(rb_ns, rb_intern("keys"), 0);
165
+ for (long i = 0; i < RARRAY_LEN(keys); i++) {
166
+ VALUE k = rb_ary_entry(keys, i);
167
+ VALUE ks = rb_obj_as_string(k);
168
+ VALUE vs = rb_obj_as_string(rb_hash_aref(rb_ns, k));
169
+ mkr_ruby_borrowed_text_t pv, uv;
170
+ const char *bad = mkr_ruby_try_verified_text(ks, cap, &pv);
171
+ if (bad == NULL) bad = mkr_ruby_try_verified_text(vs, cap, &uv);
172
+ if (bad != NULL) {
173
+ mkr_xpath_context_free(ctx);
174
+ rb_raise(mkr_eError, "invalid namespace mapping: %s", bad);
175
+ }
176
+ int rc = mkr_xpath_register_ns(ctx, mkr_verified_text_from_view(pv),
177
+ mkr_verified_text_from_view(uv));
178
+ RB_GC_GUARD(ks);
179
+ RB_GC_GUARD(vs);
180
+ if (rc != 0) {
181
+ mkr_xpath_context_free(ctx);
182
+ rb_raise(mkr_eError, "failed to register namespace");
183
+ }
184
+ }
185
+ RB_GC_GUARD(keys);
186
+ }
187
+
188
+ /* Makiri::XML::{Document,*}#xpath(expr, namespaces = nil) / #at_xpath(...):
189
+ * evaluate +expr+ over the XML engine instance, rooted at +self+'s context node,
190
+ * and return a NodeSet (node-set) or scalar. +namespaces+ is an optional
191
+ * {prefix => uri} Hash registered for this query (RSS/Atom default-namespace
192
+ * docs need a prefix under strict matching). Phase 1: no custom-function
193
+ * handler. Makiri::XPathContext is the alternative when many queries share one
194
+ * namespace set (it caches the registrations and the compiled ASTs). */
195
+ static VALUE
196
+ mkr_xml_doc_xpath_run(VALUE self, VALUE rb_expr, VALUE rb_ns, int first_only)
197
+ {
198
+ VALUE document = Qnil;
199
+ mkr_xml_node_t *context = mkr_xml_query_context(self, &document);
200
+ if (context == NULL) {
201
+ return first_only ? Qnil : mkr_node_set_new(document);
202
+ }
203
+ mkr_xml_doc_t *xdoc = mkr_parsed_xml_doc(mkr_doc_parsed(document));
204
+
205
+ /* The document node is the "document" (the engine's XML namespace services
206
+ * ignore it) and the "/" root for absolute paths; +context+ is the relative
207
+ * context node (the document node for a Document, else the node itself). */
208
+ mkr_xpath_context_t *ctx =
209
+ mkr_xpath_context_new((void *)xdoc->doc_node, (void *)context);
210
+ if (ctx == NULL) {
211
+ rb_raise(mkr_eError, "failed to allocate XPath context");
212
+ }
213
+ mkr_xpath_set_engine_kind(ctx, 1);
214
+ mkr_xml_register_query_namespaces(ctx, rb_ns); /* frees ctx + raises on error */
215
+
216
+ /* Mint the borrowed expression view AFTER namespace registration: that step
217
+ * allocates Ruby objects (and may run GC), and the borrowed bytes must not
218
+ * be held live across it. mkr_parse below runs pure C. */
219
+ mkr_ruby_borrowed_text_t ev = mkr_ruby_verified_text(rb_expr, "XPath expression");
220
+ mkr_xpath_error_t error = {0};
221
+ mkr_xpath_limits_t *limits = mkr_ctx_limits(ctx);
222
+ limits->ast_nodes = 0;
223
+ mkr_node_t *ast = mkr_parse(mkr_verified_text_from_view(ev), limits, &error);
224
+ RB_GC_GUARD(ev.value);
225
+ if (ast == NULL) {
226
+ mkr_xpath_context_free(ctx);
227
+ mkr_xpath_raise(&error);
228
+ }
229
+
230
+ mkr_xpath_value_t value = {0};
231
+ int rc = first_only ? mkr_xpath_eval_compiled_first(ctx, ast, &value, &error)
232
+ : mkr_xpath_eval_compiled(ctx, ast, &value, &error);
233
+ mkr_node_free(ast);
234
+ if (rc != 0) {
235
+ mkr_xpath_context_free(ctx);
236
+ mkr_xpath_raise(&error);
237
+ }
238
+ VALUE result = mkr_xpath_value_to_ruby(&value, document); /* converts AND clears value */
239
+ mkr_xpath_context_free(ctx);
240
+
241
+ if (first_only && rb_obj_is_kind_of(result, mkr_cNodeSet)) {
242
+ return rb_funcall(result, rb_intern("first"), 0);
243
+ }
244
+ return result;
245
+ }
246
+
247
+ static VALUE
248
+ mkr_xml_doc_xpath(int argc, VALUE *argv, VALUE self)
249
+ {
250
+ VALUE expr, ns;
251
+ rb_scan_args(argc, argv, "11", &expr, &ns);
252
+ return mkr_xml_doc_xpath_run(self, expr, ns, 0);
253
+ }
254
+
255
+ static VALUE
256
+ mkr_xml_doc_at_xpath(int argc, VALUE *argv, VALUE self)
257
+ {
258
+ VALUE expr, ns;
259
+ rb_scan_args(argc, argv, "11", &expr, &ns);
260
+ return mkr_xml_doc_xpath_run(self, expr, ns, 1);
261
+ }
262
+
263
+ /* The document's root element. */
264
+ static VALUE
265
+ mkr_xml_doc_root(VALUE self)
266
+ {
267
+ mkr_xml_doc_t *xdoc = mkr_parsed_xml_doc(mkr_doc_parsed(self));
268
+ return (xdoc == NULL) ? Qnil : mkr_wrap_xml_node(xdoc->root, self);
269
+ }
270
+
271
+ /* The document's DOCTYPE as a Makiri::XML::DocumentType (aliased
272
+ * Makiri::XML::DTD), or nil if the document had no
273
+ * `<!DOCTYPE ...>`. Mirrors Nokogiri's Document#internal_subset. The DTD's name
274
+ * and external/system identifiers are read; the DTD body is NOT parsed (no
275
+ * entity/element declarations are loaded - &name; stays an undefined-entity
276
+ * error and no external subset is fetched). The doctype node is kept off the
277
+ * tree, so XPath never sees it (XPath 1.0 has no doctype node type). */
278
+ static VALUE
279
+ mkr_xml_doc_internal_subset(VALUE self)
280
+ {
281
+ mkr_xml_doc_t *xdoc = mkr_parsed_xml_doc(mkr_doc_parsed(self));
282
+ return (xdoc == NULL || xdoc->doctype == NULL)
283
+ ? Qnil
284
+ : mkr_wrap_xml_node(xdoc->doctype, self);
285
+ }
286
+
287
+ /* Map a fragment-parse status to its Ruby exception (never returns on error). */
288
+ NORETURN(static void mkr_xml_raise_fragment_status(mkr_xml_status_t st));
289
+ static void
290
+ mkr_xml_raise_fragment_status(mkr_xml_status_t st)
291
+ {
292
+ switch (st) {
293
+ case MKR_XML_ERR_SYNTAX: rb_raise(mkr_eXmlSyntaxError, "malformed XML fragment");
294
+ case MKR_XML_ERR_LIMIT: rb_raise(mkr_eXmlLimitExceeded, "XML fragment budget exceeded");
295
+ case MKR_XML_ERR_VERSION: rb_raise(mkr_eXmlSyntaxError,
296
+ "unsupported XML version (only XML 1.0 is supported)");
297
+ default: rb_raise(mkr_eError, "failed to parse XML fragment");
298
+ }
299
+ }
300
+
301
+ /* Strict-decode +rb_source+ and parse it as a fragment into +xdoc+ (when
302
+ * +inherit_doc_ns+, names resolve against the document's root namespaces). The
303
+ * parse runs under the GVL: a fragment is small, and an existing document's arena
304
+ * must never be mutated with the GVL released. Returns the DOCUMENT_FRAGMENT node;
305
+ * raises on a decode/parse failure. */
306
+ static mkr_xml_node_t *
307
+ mkr_xml_fragment_into(mkr_xml_doc_t *xdoc, VALUE rb_source, int inherit_doc_ns)
308
+ {
309
+ VALUE decoded = mkr_xml_decode_input(rb_String(rb_source), xdoc->max_bytes);
310
+ mkr_owned_bytes_t src = { 0 };
311
+ if (mkr_ruby_copy_bytes(decoded, &src) != 0) {
312
+ rb_raise(mkr_eError, "out of memory copying XML fragment source");
313
+ }
314
+ RB_GC_GUARD(decoded);
315
+
316
+ mkr_xml_status_t st = MKR_XML_OK;
317
+ mkr_xml_node_t *frag = mkr_xml_parse_fragment(xdoc, src.ptr, src.len, inherit_doc_ns, &st);
318
+ mkr_owned_bytes_clear(&src);
319
+ if (frag == NULL) {
320
+ mkr_xml_raise_fragment_status(st);
321
+ }
322
+ return frag;
323
+ }
324
+
325
+ /* A fresh, empty XML Document VALUE: a backing arena holding a DOCUMENT node and
326
+ * no root element. Used by Document.new and as DocumentFragment.parse's backing
327
+ * document. Raises on OOM (with +parsed+ already GC-owned, so it frees cleanly). */
328
+ static VALUE
329
+ mkr_xml_new_empty_document(void)
330
+ {
331
+ mkr_parsed_t *parsed = mkr_parsed_new_xml(NULL);
332
+ if (parsed == NULL) {
333
+ rb_raise(mkr_eError, "out of memory allocating XML document");
334
+ }
335
+ VALUE doc_obj = mkr_wrap_document(parsed); /* GC owns +parsed+ from here */
336
+ mkr_xml_doc_t *xdoc = mkr_xml_doc_new();
337
+ if (xdoc == NULL) {
338
+ rb_raise(mkr_eError, "out of memory allocating XML document");
339
+ }
340
+ mkr_parsed_set_xml_doc(parsed, xdoc); /* GC now frees +xdoc+ via +parsed+ */
341
+ xdoc->doc_node = mkr_xml_arena_node(xdoc, MKR_XML_NODE_TYPE_DOCUMENT);
342
+ if (xdoc->doc_node == NULL) {
343
+ rb_raise(mkr_eError, "out of memory allocating XML document");
344
+ }
345
+ return doc_obj;
346
+ }
347
+
348
+ /* call-seq: Makiri::XML::Document.new -> Document
349
+ * A new, empty XML document (no root element) to build up programmatically with
350
+ * #create_element etc. and #add_child / #root=, like Nokogiri. Any arguments
351
+ * (Nokogiri accepts a version / encoding) are accepted and ignored. */
352
+ static VALUE
353
+ mkr_xml_document_s_new(int argc, VALUE *argv, VALUE klass)
354
+ {
355
+ (void)argc; (void)argv; (void)klass;
356
+ return mkr_xml_new_empty_document();
357
+ }
358
+
359
+ /* call-seq: Makiri::XML::DocumentFragment.parse(source) -> DocumentFragment
360
+ * Parse +source+ into a standalone fragment with its own (empty) backing
361
+ * document. The fragment is self-contained: a prefixed name must declare its
362
+ * namespace within the fragment itself (use Document#fragment to parse against an
363
+ * existing document's in-scope namespaces). */
364
+ static VALUE
365
+ mkr_xml_fragment_s_parse(VALUE klass, VALUE rb_source)
366
+ {
367
+ (void)klass;
368
+ VALUE doc_obj = mkr_xml_new_empty_document();
369
+ mkr_xml_doc_t *xdoc = mkr_parsed_xml_doc(mkr_doc_parsed(doc_obj));
370
+ mkr_xml_node_t *frag = mkr_xml_fragment_into(xdoc, rb_source, 0);
371
+ VALUE result = mkr_wrap_xml_node(frag, doc_obj);
372
+ RB_GC_GUARD(doc_obj);
373
+ return result;
374
+ }
375
+
376
+ /* call-seq: doc.fragment(source) -> DocumentFragment
377
+ * Parse +source+ into a fragment bound to this document, resolving names against
378
+ * the document's in-scope (root) namespaces, so the fragment's nodes can be
379
+ * spliced in with Node#add_child and friends. */
380
+ static VALUE
381
+ mkr_xml_doc_fragment(VALUE self, VALUE rb_source)
382
+ {
383
+ mkr_xml_doc_t *xdoc = mkr_parsed_xml_doc(mkr_doc_parsed(self));
384
+ if (xdoc == NULL) {
385
+ rb_raise(mkr_eError, "the document has no arena");
386
+ }
387
+ mkr_xml_node_t *frag = mkr_xml_fragment_into(xdoc, rb_source, 1);
388
+ VALUE result = mkr_wrap_xml_node(frag, self);
389
+ RB_GC_GUARD(self);
390
+ return result;
391
+ }
392
+
393
+ void
394
+ mkr_init_xml(void)
395
+ {
396
+ /* XML::Document is a Makiri::Document leaf (§12): is_a?(Makiri::Document) is
397
+ * true, but it carries no HTML readers (those are on Makiri::HTML, which it
398
+ * does not include) - the read-only XML surface is structural. */
399
+ mkr_cXmlDocument = rb_define_class_under(mkr_mXML, "Document", mkr_cDocument);
400
+ rb_undef_alloc_func(mkr_cXmlDocument); /* created only from C, never .new */
401
+ rb_include_module(mkr_cXmlDocument, mkr_mXmlNodeMethods);
402
+
403
+ rb_define_method(mkr_cXmlDocument, "root", mkr_xml_doc_root, 0);
404
+ rb_define_method(mkr_cXmlDocument, "internal_subset", mkr_xml_doc_internal_subset, 0);
405
+ rb_define_method(mkr_cXmlDocument, "fragment", mkr_xml_doc_fragment, 1);
406
+ rb_define_singleton_method(mkr_cXmlDocument, "new", mkr_xml_document_s_new, -1);
407
+ rb_define_singleton_method(mkr_cXmlDocumentFragment, "parse", mkr_xml_fragment_s_parse, 1);
408
+
409
+ /* xpath / at_xpath work on the document and on any XML node (rooted at that
410
+ * node), so they live on the shared XML node behavior module + the document. */
411
+ rb_define_method(mkr_cXmlDocument, "xpath", mkr_xml_doc_xpath, -1);
412
+ rb_define_method(mkr_cXmlDocument, "at_xpath", mkr_xml_doc_at_xpath, -1);
413
+ rb_define_method(mkr_mXmlNodeMethods, "xpath", mkr_xml_doc_xpath, -1);
414
+ rb_define_method(mkr_mXmlNodeMethods, "at_xpath", mkr_xml_doc_at_xpath, -1);
415
+
416
+ /* The native XML parser, exposed as XML::Document.parse, mirroring HTML
417
+ * (HTML::Document.parse). The Makiri::XML(source) convenience delegates to it
418
+ * in Ruby (lib/makiri.rb), as Makiri.HTML does for HTML::Document.parse. */
419
+ rb_define_singleton_method(mkr_cXmlDocument, "parse", mkr_xml_s_parse, -1);
420
+ }