makiri 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/release.yml +12 -7
- data/CHANGELOG.md +93 -14
- data/README.md +173 -7
- data/Rakefile +103 -7
- data/ext/makiri/bridge/bridge.h +28 -0
- data/ext/makiri/bridge/ruby_string.c +217 -0
- data/ext/makiri/core/mkr_alloc.h +1 -1
- data/ext/makiri/core/mkr_buf.c +35 -1
- data/ext/makiri/core/mkr_buf.h +37 -3
- data/ext/makiri/core/mkr_core.h +1 -1
- data/ext/makiri/core/mkr_hash.h +1 -1
- data/ext/makiri/core/mkr_text.h +8 -8
- data/ext/makiri/extconf.rb +20 -2
- data/ext/makiri/glue/glue.h +47 -11
- data/ext/makiri/glue/ruby_doc.c +117 -43
- data/ext/makiri/glue/ruby_html_css.c +246 -0
- data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +242 -51
- data/ext/makiri/glue/ruby_html_node.c +888 -0
- data/ext/makiri/glue/ruby_html_serialize.c +154 -0
- data/ext/makiri/glue/ruby_node.c +54 -748
- data/ext/makiri/glue/ruby_node_set.c +167 -32
- data/ext/makiri/glue/ruby_xml.c +420 -0
- data/ext/makiri/glue/ruby_xml_node.c +1386 -0
- data/ext/makiri/glue/ruby_xpath.c +59 -26
- data/ext/makiri/glue/ruby_xpath.h +19 -0
- data/ext/makiri/lexbor_compat/compat.h +42 -9
- data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
- data/ext/makiri/lexbor_compat/dom_index.c +2 -2
- data/ext/makiri/lexbor_compat/post_parse.c +100 -10
- data/ext/makiri/lexbor_compat/source_loc.c +13 -9
- data/ext/makiri/lexbor_compat/text_index.c +14 -8
- data/ext/makiri/lexbor_compat/utf8_input.c +85 -26
- data/ext/makiri/makiri.c +139 -6
- data/ext/makiri/makiri.h +43 -2
- data/ext/makiri/xml/mkr_xml.h +126 -0
- data/ext/makiri/xml/mkr_xml_chars.c +225 -0
- data/ext/makiri/xml/mkr_xml_mutate.c +875 -0
- data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
- data/ext/makiri/xml/mkr_xml_node.c +267 -0
- data/ext/makiri/xml/mkr_xml_node.h +119 -0
- data/ext/makiri/xml/mkr_xml_tree.c +1479 -0
- data/ext/makiri/xpath/mkr_xpath.c +59 -32
- data/ext/makiri/xpath/mkr_xpath.h +96 -4
- data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
- data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
- data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +202 -175
- data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +110 -86
- data/ext/makiri/xpath/mkr_xpath_internal.h +91 -200
- data/ext/makiri/xpath/mkr_xpath_lex.c +2 -2
- data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
- data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +142 -0
- data/ext/makiri/xpath/mkr_xpath_parse.c +5 -5
- data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
- data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
- data/ext/makiri/xpath/mkr_xpath_shared.c +593 -0
- data/ext/makiri/xpath/{mkr_xpath_value.c → mkr_xpath_value_body.h} +145 -656
- data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
- data/lib/makiri/{attribute.rb → attr.rb} +7 -3
- data/lib/makiri/cdata_section.rb +21 -0
- data/lib/makiri/comment.rb +12 -0
- data/lib/makiri/compat_aliases.rb +30 -0
- data/lib/makiri/document.rb +4 -76
- data/lib/makiri/document_fragment.rb +14 -9
- data/lib/makiri/element.rb +5 -3
- data/lib/makiri/html/document.rb +106 -0
- data/lib/makiri/html/node_methods.rb +19 -0
- data/lib/makiri/html.rb +12 -0
- data/lib/makiri/node.rb +58 -15
- data/lib/makiri/node_set.rb +8 -0
- data/lib/makiri/processing_instruction.rb +12 -0
- data/lib/makiri/text.rb +2 -0
- data/lib/makiri/version.rb +1 -1
- data/lib/makiri/xml/document.rb +24 -0
- data/lib/makiri/xml/node_methods.rb +37 -0
- data/lib/makiri/xml.rb +10 -0
- data/lib/makiri/xpath_context.rb +1 -1
- data/lib/makiri.rb +23 -5
- data/script/build_native_gem.rb +2 -2
- data/script/check_c_safety.rb +32 -0
- data/script/check_c_safety_allowlist.yml +83 -0
- metadata +35 -9
- data/ext/makiri/glue/ruby_css.c +0 -185
- data/ext/makiri/glue/ruby_serialize.c +0 -92
- data/lib/makiri/cdata.rb +0 -6
|
@@ -0,0 +1,420 @@
|
|
|
1
|
+
/* ruby_xml.c - Ruby boundary for the native XML reader (Phase 1).
|
|
2
|
+
*
|
|
3
|
+
* Makiri::XML::Document.parse(source) (and the Makiri::XML(source) convenience
|
|
4
|
+
* that delegates to it): strict-decode the input (§2.1), then run the Ruby-free
|
|
5
|
+
* parser with the GVL released, and return a
|
|
6
|
+
* Makiri::XML::Document. The document is held in a kind=MKR_DOC_XML mkr_parsed_t
|
|
7
|
+
* (the common document handle, §2.3) and wrapped by mkr_wrap_document, which GC
|
|
8
|
+
* frees via mkr_parsed_destroy (the XML branch whole-arena-frees).
|
|
9
|
+
*/
|
|
10
|
+
#include "../makiri.h"
|
|
11
|
+
#include "../core/mkr_core.h"
|
|
12
|
+
#include "../xml/mkr_xml.h"
|
|
13
|
+
#include "../xml/mkr_xml_node.h"
|
|
14
|
+
#include "glue.h" /* mkr_wrap_document, mkr_parsed_* (via compat.h) */
|
|
15
|
+
#include "ruby_xpath.h" /* mkr_xpath_value_to_ruby / mkr_xpath_raise (shared) */
|
|
16
|
+
#include "../xpath/mkr_xpath.h"
|
|
17
|
+
|
|
18
|
+
#include <ruby/thread.h>
|
|
19
|
+
|
|
20
|
+
/* ---- GVL-released parse ---- */
|
|
21
|
+
typedef struct {
|
|
22
|
+
const char *src;
|
|
23
|
+
size_t len;
|
|
24
|
+
mkr_xml_limits_t limits; /* per-parse budget overrides (0 fields = default) */
|
|
25
|
+
mkr_xml_doc_t *result;
|
|
26
|
+
mkr_xml_status_t status;
|
|
27
|
+
} mkr_xml_parse_nogvl_t;
|
|
28
|
+
|
|
29
|
+
static void *
|
|
30
|
+
mkr_xml_parse_nogvl(void *p)
|
|
31
|
+
{
|
|
32
|
+
mkr_xml_parse_nogvl_t *a = (mkr_xml_parse_nogvl_t *)p;
|
|
33
|
+
a->result = mkr_xml_parse_ex(a->src, a->len, &a->limits, &a->status);
|
|
34
|
+
return NULL;
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
/* Read the optional per-parse budget overrides from the keyword hash. Today only
|
|
38
|
+
* max_bytes (the arena memory ceiling) is configurable; rb_get_kwargs rejects any
|
|
39
|
+
* other keyword, and new budgets join the table here as they become runtime-
|
|
40
|
+
* configurable. max_bytes must be a positive Integer (0 / negative / non-Integer
|
|
41
|
+
* raise), and 0 in the struct means "use the compile-time default". */
|
|
42
|
+
static mkr_xml_limits_t
|
|
43
|
+
mkr_xml_parse_limits(VALUE rb_opts)
|
|
44
|
+
{
|
|
45
|
+
mkr_xml_limits_t limits = { 0 };
|
|
46
|
+
if (NIL_P(rb_opts)) return limits;
|
|
47
|
+
|
|
48
|
+
static ID kw_ids[1];
|
|
49
|
+
if (kw_ids[0] == 0) kw_ids[0] = rb_intern_const("max_bytes");
|
|
50
|
+
VALUE kw_vals[1];
|
|
51
|
+
rb_get_kwargs(rb_opts, kw_ids, 0, 1, kw_vals); /* unknown keyword -> ArgumentError */
|
|
52
|
+
|
|
53
|
+
if (kw_vals[0] != Qundef) {
|
|
54
|
+
if (!RB_INTEGER_TYPE_P(kw_vals[0])) {
|
|
55
|
+
rb_raise(rb_eTypeError, "max_bytes must be an Integer");
|
|
56
|
+
}
|
|
57
|
+
/* Reject <= 0 BEFORE the unsigned conversion: NUM2SIZET wraps a negative
|
|
58
|
+
* Integer into a huge size_t (an accidental budget bypass), so guard the
|
|
59
|
+
* sign first. A too-large positive still raises RangeError in NUM2SIZET. */
|
|
60
|
+
if (RTEST(rb_funcall(kw_vals[0], rb_intern("<="), 1, INT2FIX(0)))) {
|
|
61
|
+
rb_raise(rb_eArgError, "max_bytes must be positive");
|
|
62
|
+
}
|
|
63
|
+
limits.max_bytes = NUM2SIZET(kw_vals[0]);
|
|
64
|
+
}
|
|
65
|
+
return limits;
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
/* call-seq: Makiri::XML::Document.parse(source, max_bytes: nil) -> Makiri::XML::Document
|
|
69
|
+
* Makiri::XML(source, max_bytes: nil) -> Makiri::XML::Document
|
|
70
|
+
* +source+ is a String or any object responding to +#read+ (an IO / File /
|
|
71
|
+
* StringIO); +max_bytes+ overrides the default arena memory ceiling for this
|
|
72
|
+
* parse. Read a non-UTF-8 file in binary mode (File.binread / "rb") so the
|
|
73
|
+
* encoding is autodetected from its BOM / declaration. */
|
|
74
|
+
static VALUE
|
|
75
|
+
mkr_xml_s_parse(int argc, VALUE *argv, VALUE self)
|
|
76
|
+
{
|
|
77
|
+
(void)self;
|
|
78
|
+
VALUE rb_source, rb_opts;
|
|
79
|
+
rb_scan_args(argc, argv, "1:", &rb_source, &rb_opts);
|
|
80
|
+
mkr_xml_limits_t limits = mkr_xml_parse_limits(rb_opts); /* validates; may raise */
|
|
81
|
+
size_t budget = limits.max_bytes ? limits.max_bytes : (size_t)MKR_XML_MAX_BYTES;
|
|
82
|
+
|
|
83
|
+
/* Read an IO/File-like source (an object responding to #read), like the HTML
|
|
84
|
+
* entry; a String passes straight through. */
|
|
85
|
+
if (rb_respond_to(rb_source, rb_intern("read"))) {
|
|
86
|
+
rb_source = rb_funcall(rb_source, rb_intern("read"), 0);
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
/* Strict decode under the GVL: invalid UTF-8 / undecodable byte / NUL all
|
|
90
|
+
* raise Makiri::XML::SyntaxError here (no U+FFFD repair). Passing the budget
|
|
91
|
+
* lets decode reject an over-budget input (LimitExceeded) before its
|
|
92
|
+
* validation copy and the GVL-release copy below - so a hostile oversized
|
|
93
|
+
* document is not materialised twice for a doomed parse. */
|
|
94
|
+
VALUE decoded = mkr_xml_decode_input(rb_String(rb_source), budget);
|
|
95
|
+
|
|
96
|
+
/* Build an empty XML handle and wrap it first (doc == NULL) so a failure
|
|
97
|
+
* mid-parse frees cleanly via GC (mkr_parsed_destroy -> the XML branch ->
|
|
98
|
+
* mkr_xml_doc_destroy(NULL), a no-op). */
|
|
99
|
+
mkr_parsed_t *parsed = mkr_parsed_new_xml(NULL);
|
|
100
|
+
if (parsed == NULL) {
|
|
101
|
+
rb_raise(mkr_eError, "out of memory allocating XML document");
|
|
102
|
+
}
|
|
103
|
+
VALUE obj = mkr_wrap_document(parsed); /* GC owns +parsed+ from here */
|
|
104
|
+
|
|
105
|
+
/* Copy the decoded bytes so the parse can run with the GVL released without
|
|
106
|
+
* racing GC/compaction on the String's backing store. */
|
|
107
|
+
mkr_owned_bytes_t source = {0};
|
|
108
|
+
if (mkr_ruby_copy_bytes(decoded, &source) != 0) {
|
|
109
|
+
rb_raise(mkr_eError, "out of memory copying XML source");
|
|
110
|
+
}
|
|
111
|
+
RB_GC_GUARD(decoded);
|
|
112
|
+
|
|
113
|
+
mkr_xml_parse_nogvl_t args = { source.ptr, source.len, limits, NULL, MKR_XML_OK };
|
|
114
|
+
rb_thread_call_without_gvl(mkr_xml_parse_nogvl, &args, NULL, NULL);
|
|
115
|
+
mkr_owned_bytes_clear(&source);
|
|
116
|
+
|
|
117
|
+
if (args.result == NULL) {
|
|
118
|
+
switch (args.status) {
|
|
119
|
+
case MKR_XML_ERR_SYNTAX: rb_raise(mkr_eXmlSyntaxError, "malformed XML"); break;
|
|
120
|
+
case MKR_XML_ERR_LIMIT: rb_raise(mkr_eXmlLimitExceeded, "XML document budget exceeded"); break;
|
|
121
|
+
case MKR_XML_ERR_VERSION: rb_raise(mkr_eXmlSyntaxError,
|
|
122
|
+
"unsupported XML version (only XML 1.0 is supported)"); break;
|
|
123
|
+
default: rb_raise(mkr_eError, "failed to parse XML document"); break;
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
mkr_parsed_set_xml_doc(parsed, args.result);
|
|
127
|
+
RB_GC_GUARD(obj);
|
|
128
|
+
return obj;
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
/* XPath value -> Ruby and error -> exception are shared with the HTML query glue
|
|
132
|
+
* (mkr_xpath_value_to_ruby / mkr_xpath_raise, ruby_xpath.h): both query entry
|
|
133
|
+
* points run the same engine and return the same public value/error types, so
|
|
134
|
+
* the conversion lives in one place. mkr_node_set_new/push are kind-aware, so the
|
|
135
|
+
* shared value converter wraps these results as Makiri::XML::* nodes. */
|
|
136
|
+
|
|
137
|
+
/* Resolve the (document VALUE, context node) an XPath query runs against: for a
|
|
138
|
+
* Makiri::XML::Document the context is the document node, for a node it is that
|
|
139
|
+
* node (and its owning Document). */
|
|
140
|
+
static mkr_xml_node_t *
|
|
141
|
+
mkr_xml_query_context(VALUE self, VALUE *out_document)
|
|
142
|
+
{
|
|
143
|
+
/* mkr_xml_node_unwrap is kind-checked (raises on a non-XML node) and resolves
|
|
144
|
+
* an XML Document to its arena document node; mkr_node_document gives the
|
|
145
|
+
* keepalive Document for either. */
|
|
146
|
+
*out_document = mkr_node_document(self);
|
|
147
|
+
return mkr_xml_node_unwrap(self);
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
/* Register a {prefix => uri} Ruby Hash onto +ctx+ for a single query. On any bad
|
|
151
|
+
* entry (non-string-coercible, invalid UTF-8 / embedded NUL, or OOM) the context
|
|
152
|
+
* is freed and an exception is raised - never a partial registration. RSS/Atom
|
|
153
|
+
* live in a default namespace, so a prefix is the strict-mode way to select them
|
|
154
|
+
* (e.g. xpath("//a:entry", "a" => "http://www.w3.org/2005/Atom")). */
|
|
155
|
+
static void
|
|
156
|
+
mkr_xml_register_query_namespaces(mkr_xpath_context_t *ctx, VALUE rb_ns)
|
|
157
|
+
{
|
|
158
|
+
if (NIL_P(rb_ns)) return;
|
|
159
|
+
if (!RB_TYPE_P(rb_ns, T_HASH)) {
|
|
160
|
+
mkr_xpath_context_free(ctx);
|
|
161
|
+
rb_raise(rb_eTypeError, "namespaces must be a Hash of prefix => uri");
|
|
162
|
+
}
|
|
163
|
+
size_t cap = mkr_ctx_limits(ctx)->max_string_bytes;
|
|
164
|
+
VALUE keys = rb_funcall(rb_ns, rb_intern("keys"), 0);
|
|
165
|
+
for (long i = 0; i < RARRAY_LEN(keys); i++) {
|
|
166
|
+
VALUE k = rb_ary_entry(keys, i);
|
|
167
|
+
VALUE ks = rb_obj_as_string(k);
|
|
168
|
+
VALUE vs = rb_obj_as_string(rb_hash_aref(rb_ns, k));
|
|
169
|
+
mkr_ruby_borrowed_text_t pv, uv;
|
|
170
|
+
const char *bad = mkr_ruby_try_verified_text(ks, cap, &pv);
|
|
171
|
+
if (bad == NULL) bad = mkr_ruby_try_verified_text(vs, cap, &uv);
|
|
172
|
+
if (bad != NULL) {
|
|
173
|
+
mkr_xpath_context_free(ctx);
|
|
174
|
+
rb_raise(mkr_eError, "invalid namespace mapping: %s", bad);
|
|
175
|
+
}
|
|
176
|
+
int rc = mkr_xpath_register_ns(ctx, mkr_verified_text_from_view(pv),
|
|
177
|
+
mkr_verified_text_from_view(uv));
|
|
178
|
+
RB_GC_GUARD(ks);
|
|
179
|
+
RB_GC_GUARD(vs);
|
|
180
|
+
if (rc != 0) {
|
|
181
|
+
mkr_xpath_context_free(ctx);
|
|
182
|
+
rb_raise(mkr_eError, "failed to register namespace");
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
RB_GC_GUARD(keys);
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
/* Makiri::XML::{Document,*}#xpath(expr, namespaces = nil) / #at_xpath(...):
|
|
189
|
+
* evaluate +expr+ over the XML engine instance, rooted at +self+'s context node,
|
|
190
|
+
* and return a NodeSet (node-set) or scalar. +namespaces+ is an optional
|
|
191
|
+
* {prefix => uri} Hash registered for this query (RSS/Atom default-namespace
|
|
192
|
+
* docs need a prefix under strict matching). Phase 1: no custom-function
|
|
193
|
+
* handler. Makiri::XPathContext is the alternative when many queries share one
|
|
194
|
+
* namespace set (it caches the registrations and the compiled ASTs). */
|
|
195
|
+
static VALUE
|
|
196
|
+
mkr_xml_doc_xpath_run(VALUE self, VALUE rb_expr, VALUE rb_ns, int first_only)
|
|
197
|
+
{
|
|
198
|
+
VALUE document = Qnil;
|
|
199
|
+
mkr_xml_node_t *context = mkr_xml_query_context(self, &document);
|
|
200
|
+
if (context == NULL) {
|
|
201
|
+
return first_only ? Qnil : mkr_node_set_new(document);
|
|
202
|
+
}
|
|
203
|
+
mkr_xml_doc_t *xdoc = mkr_parsed_xml_doc(mkr_doc_parsed(document));
|
|
204
|
+
|
|
205
|
+
/* The document node is the "document" (the engine's XML namespace services
|
|
206
|
+
* ignore it) and the "/" root for absolute paths; +context+ is the relative
|
|
207
|
+
* context node (the document node for a Document, else the node itself). */
|
|
208
|
+
mkr_xpath_context_t *ctx =
|
|
209
|
+
mkr_xpath_context_new((void *)xdoc->doc_node, (void *)context);
|
|
210
|
+
if (ctx == NULL) {
|
|
211
|
+
rb_raise(mkr_eError, "failed to allocate XPath context");
|
|
212
|
+
}
|
|
213
|
+
mkr_xpath_set_engine_kind(ctx, 1);
|
|
214
|
+
mkr_xml_register_query_namespaces(ctx, rb_ns); /* frees ctx + raises on error */
|
|
215
|
+
|
|
216
|
+
/* Mint the borrowed expression view AFTER namespace registration: that step
|
|
217
|
+
* allocates Ruby objects (and may run GC), and the borrowed bytes must not
|
|
218
|
+
* be held live across it. mkr_parse below runs pure C. */
|
|
219
|
+
mkr_ruby_borrowed_text_t ev = mkr_ruby_verified_text(rb_expr, "XPath expression");
|
|
220
|
+
mkr_xpath_error_t error = {0};
|
|
221
|
+
mkr_xpath_limits_t *limits = mkr_ctx_limits(ctx);
|
|
222
|
+
limits->ast_nodes = 0;
|
|
223
|
+
mkr_node_t *ast = mkr_parse(mkr_verified_text_from_view(ev), limits, &error);
|
|
224
|
+
RB_GC_GUARD(ev.value);
|
|
225
|
+
if (ast == NULL) {
|
|
226
|
+
mkr_xpath_context_free(ctx);
|
|
227
|
+
mkr_xpath_raise(&error);
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
mkr_xpath_value_t value = {0};
|
|
231
|
+
int rc = first_only ? mkr_xpath_eval_compiled_first(ctx, ast, &value, &error)
|
|
232
|
+
: mkr_xpath_eval_compiled(ctx, ast, &value, &error);
|
|
233
|
+
mkr_node_free(ast);
|
|
234
|
+
if (rc != 0) {
|
|
235
|
+
mkr_xpath_context_free(ctx);
|
|
236
|
+
mkr_xpath_raise(&error);
|
|
237
|
+
}
|
|
238
|
+
VALUE result = mkr_xpath_value_to_ruby(&value, document); /* converts AND clears value */
|
|
239
|
+
mkr_xpath_context_free(ctx);
|
|
240
|
+
|
|
241
|
+
if (first_only && rb_obj_is_kind_of(result, mkr_cNodeSet)) {
|
|
242
|
+
return rb_funcall(result, rb_intern("first"), 0);
|
|
243
|
+
}
|
|
244
|
+
return result;
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
static VALUE
|
|
248
|
+
mkr_xml_doc_xpath(int argc, VALUE *argv, VALUE self)
|
|
249
|
+
{
|
|
250
|
+
VALUE expr, ns;
|
|
251
|
+
rb_scan_args(argc, argv, "11", &expr, &ns);
|
|
252
|
+
return mkr_xml_doc_xpath_run(self, expr, ns, 0);
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
static VALUE
|
|
256
|
+
mkr_xml_doc_at_xpath(int argc, VALUE *argv, VALUE self)
|
|
257
|
+
{
|
|
258
|
+
VALUE expr, ns;
|
|
259
|
+
rb_scan_args(argc, argv, "11", &expr, &ns);
|
|
260
|
+
return mkr_xml_doc_xpath_run(self, expr, ns, 1);
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
/* The document's root element. */
|
|
264
|
+
static VALUE
|
|
265
|
+
mkr_xml_doc_root(VALUE self)
|
|
266
|
+
{
|
|
267
|
+
mkr_xml_doc_t *xdoc = mkr_parsed_xml_doc(mkr_doc_parsed(self));
|
|
268
|
+
return (xdoc == NULL) ? Qnil : mkr_wrap_xml_node(xdoc->root, self);
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
/* The document's DOCTYPE as a Makiri::XML::DocumentType (aliased
|
|
272
|
+
* Makiri::XML::DTD), or nil if the document had no
|
|
273
|
+
* `<!DOCTYPE ...>`. Mirrors Nokogiri's Document#internal_subset. The DTD's name
|
|
274
|
+
* and external/system identifiers are read; the DTD body is NOT parsed (no
|
|
275
|
+
* entity/element declarations are loaded - &name; stays an undefined-entity
|
|
276
|
+
* error and no external subset is fetched). The doctype node is kept off the
|
|
277
|
+
* tree, so XPath never sees it (XPath 1.0 has no doctype node type). */
|
|
278
|
+
static VALUE
|
|
279
|
+
mkr_xml_doc_internal_subset(VALUE self)
|
|
280
|
+
{
|
|
281
|
+
mkr_xml_doc_t *xdoc = mkr_parsed_xml_doc(mkr_doc_parsed(self));
|
|
282
|
+
return (xdoc == NULL || xdoc->doctype == NULL)
|
|
283
|
+
? Qnil
|
|
284
|
+
: mkr_wrap_xml_node(xdoc->doctype, self);
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
/* Map a fragment-parse status to its Ruby exception (never returns on error). */
|
|
288
|
+
NORETURN(static void mkr_xml_raise_fragment_status(mkr_xml_status_t st));
|
|
289
|
+
static void
|
|
290
|
+
mkr_xml_raise_fragment_status(mkr_xml_status_t st)
|
|
291
|
+
{
|
|
292
|
+
switch (st) {
|
|
293
|
+
case MKR_XML_ERR_SYNTAX: rb_raise(mkr_eXmlSyntaxError, "malformed XML fragment");
|
|
294
|
+
case MKR_XML_ERR_LIMIT: rb_raise(mkr_eXmlLimitExceeded, "XML fragment budget exceeded");
|
|
295
|
+
case MKR_XML_ERR_VERSION: rb_raise(mkr_eXmlSyntaxError,
|
|
296
|
+
"unsupported XML version (only XML 1.0 is supported)");
|
|
297
|
+
default: rb_raise(mkr_eError, "failed to parse XML fragment");
|
|
298
|
+
}
|
|
299
|
+
}
|
|
300
|
+
|
|
301
|
+
/* Strict-decode +rb_source+ and parse it as a fragment into +xdoc+ (when
|
|
302
|
+
* +inherit_doc_ns+, names resolve against the document's root namespaces). The
|
|
303
|
+
* parse runs under the GVL: a fragment is small, and an existing document's arena
|
|
304
|
+
* must never be mutated with the GVL released. Returns the DOCUMENT_FRAGMENT node;
|
|
305
|
+
* raises on a decode/parse failure. */
|
|
306
|
+
static mkr_xml_node_t *
|
|
307
|
+
mkr_xml_fragment_into(mkr_xml_doc_t *xdoc, VALUE rb_source, int inherit_doc_ns)
|
|
308
|
+
{
|
|
309
|
+
VALUE decoded = mkr_xml_decode_input(rb_String(rb_source), xdoc->max_bytes);
|
|
310
|
+
mkr_owned_bytes_t src = { 0 };
|
|
311
|
+
if (mkr_ruby_copy_bytes(decoded, &src) != 0) {
|
|
312
|
+
rb_raise(mkr_eError, "out of memory copying XML fragment source");
|
|
313
|
+
}
|
|
314
|
+
RB_GC_GUARD(decoded);
|
|
315
|
+
|
|
316
|
+
mkr_xml_status_t st = MKR_XML_OK;
|
|
317
|
+
mkr_xml_node_t *frag = mkr_xml_parse_fragment(xdoc, src.ptr, src.len, inherit_doc_ns, &st);
|
|
318
|
+
mkr_owned_bytes_clear(&src);
|
|
319
|
+
if (frag == NULL) {
|
|
320
|
+
mkr_xml_raise_fragment_status(st);
|
|
321
|
+
}
|
|
322
|
+
return frag;
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
/* A fresh, empty XML Document VALUE: a backing arena holding a DOCUMENT node and
|
|
326
|
+
* no root element. Used by Document.new and as DocumentFragment.parse's backing
|
|
327
|
+
* document. Raises on OOM (with +parsed+ already GC-owned, so it frees cleanly). */
|
|
328
|
+
static VALUE
|
|
329
|
+
mkr_xml_new_empty_document(void)
|
|
330
|
+
{
|
|
331
|
+
mkr_parsed_t *parsed = mkr_parsed_new_xml(NULL);
|
|
332
|
+
if (parsed == NULL) {
|
|
333
|
+
rb_raise(mkr_eError, "out of memory allocating XML document");
|
|
334
|
+
}
|
|
335
|
+
VALUE doc_obj = mkr_wrap_document(parsed); /* GC owns +parsed+ from here */
|
|
336
|
+
mkr_xml_doc_t *xdoc = mkr_xml_doc_new();
|
|
337
|
+
if (xdoc == NULL) {
|
|
338
|
+
rb_raise(mkr_eError, "out of memory allocating XML document");
|
|
339
|
+
}
|
|
340
|
+
mkr_parsed_set_xml_doc(parsed, xdoc); /* GC now frees +xdoc+ via +parsed+ */
|
|
341
|
+
xdoc->doc_node = mkr_xml_arena_node(xdoc, MKR_XML_NODE_TYPE_DOCUMENT);
|
|
342
|
+
if (xdoc->doc_node == NULL) {
|
|
343
|
+
rb_raise(mkr_eError, "out of memory allocating XML document");
|
|
344
|
+
}
|
|
345
|
+
return doc_obj;
|
|
346
|
+
}
|
|
347
|
+
|
|
348
|
+
/* call-seq: Makiri::XML::Document.new -> Document
|
|
349
|
+
* A new, empty XML document (no root element) to build up programmatically with
|
|
350
|
+
* #create_element etc. and #add_child / #root=, like Nokogiri. Any arguments
|
|
351
|
+
* (Nokogiri accepts a version / encoding) are accepted and ignored. */
|
|
352
|
+
static VALUE
|
|
353
|
+
mkr_xml_document_s_new(int argc, VALUE *argv, VALUE klass)
|
|
354
|
+
{
|
|
355
|
+
(void)argc; (void)argv; (void)klass;
|
|
356
|
+
return mkr_xml_new_empty_document();
|
|
357
|
+
}
|
|
358
|
+
|
|
359
|
+
/* call-seq: Makiri::XML::DocumentFragment.parse(source) -> DocumentFragment
|
|
360
|
+
* Parse +source+ into a standalone fragment with its own (empty) backing
|
|
361
|
+
* document. The fragment is self-contained: a prefixed name must declare its
|
|
362
|
+
* namespace within the fragment itself (use Document#fragment to parse against an
|
|
363
|
+
* existing document's in-scope namespaces). */
|
|
364
|
+
static VALUE
|
|
365
|
+
mkr_xml_fragment_s_parse(VALUE klass, VALUE rb_source)
|
|
366
|
+
{
|
|
367
|
+
(void)klass;
|
|
368
|
+
VALUE doc_obj = mkr_xml_new_empty_document();
|
|
369
|
+
mkr_xml_doc_t *xdoc = mkr_parsed_xml_doc(mkr_doc_parsed(doc_obj));
|
|
370
|
+
mkr_xml_node_t *frag = mkr_xml_fragment_into(xdoc, rb_source, 0);
|
|
371
|
+
VALUE result = mkr_wrap_xml_node(frag, doc_obj);
|
|
372
|
+
RB_GC_GUARD(doc_obj);
|
|
373
|
+
return result;
|
|
374
|
+
}
|
|
375
|
+
|
|
376
|
+
/* call-seq: doc.fragment(source) -> DocumentFragment
|
|
377
|
+
* Parse +source+ into a fragment bound to this document, resolving names against
|
|
378
|
+
* the document's in-scope (root) namespaces, so the fragment's nodes can be
|
|
379
|
+
* spliced in with Node#add_child and friends. */
|
|
380
|
+
static VALUE
|
|
381
|
+
mkr_xml_doc_fragment(VALUE self, VALUE rb_source)
|
|
382
|
+
{
|
|
383
|
+
mkr_xml_doc_t *xdoc = mkr_parsed_xml_doc(mkr_doc_parsed(self));
|
|
384
|
+
if (xdoc == NULL) {
|
|
385
|
+
rb_raise(mkr_eError, "the document has no arena");
|
|
386
|
+
}
|
|
387
|
+
mkr_xml_node_t *frag = mkr_xml_fragment_into(xdoc, rb_source, 1);
|
|
388
|
+
VALUE result = mkr_wrap_xml_node(frag, self);
|
|
389
|
+
RB_GC_GUARD(self);
|
|
390
|
+
return result;
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
void
|
|
394
|
+
mkr_init_xml(void)
|
|
395
|
+
{
|
|
396
|
+
/* XML::Document is a Makiri::Document leaf (§12): is_a?(Makiri::Document) is
|
|
397
|
+
* true, but it carries no HTML readers (those are on Makiri::HTML, which it
|
|
398
|
+
* does not include) - the read-only XML surface is structural. */
|
|
399
|
+
mkr_cXmlDocument = rb_define_class_under(mkr_mXML, "Document", mkr_cDocument);
|
|
400
|
+
rb_undef_alloc_func(mkr_cXmlDocument); /* created only from C, never .new */
|
|
401
|
+
rb_include_module(mkr_cXmlDocument, mkr_mXmlNodeMethods);
|
|
402
|
+
|
|
403
|
+
rb_define_method(mkr_cXmlDocument, "root", mkr_xml_doc_root, 0);
|
|
404
|
+
rb_define_method(mkr_cXmlDocument, "internal_subset", mkr_xml_doc_internal_subset, 0);
|
|
405
|
+
rb_define_method(mkr_cXmlDocument, "fragment", mkr_xml_doc_fragment, 1);
|
|
406
|
+
rb_define_singleton_method(mkr_cXmlDocument, "new", mkr_xml_document_s_new, -1);
|
|
407
|
+
rb_define_singleton_method(mkr_cXmlDocumentFragment, "parse", mkr_xml_fragment_s_parse, 1);
|
|
408
|
+
|
|
409
|
+
/* xpath / at_xpath work on the document and on any XML node (rooted at that
|
|
410
|
+
* node), so they live on the shared XML node behavior module + the document. */
|
|
411
|
+
rb_define_method(mkr_cXmlDocument, "xpath", mkr_xml_doc_xpath, -1);
|
|
412
|
+
rb_define_method(mkr_cXmlDocument, "at_xpath", mkr_xml_doc_at_xpath, -1);
|
|
413
|
+
rb_define_method(mkr_mXmlNodeMethods, "xpath", mkr_xml_doc_xpath, -1);
|
|
414
|
+
rb_define_method(mkr_mXmlNodeMethods, "at_xpath", mkr_xml_doc_at_xpath, -1);
|
|
415
|
+
|
|
416
|
+
/* The native XML parser, exposed as XML::Document.parse, mirroring HTML
|
|
417
|
+
* (HTML::Document.parse). The Makiri::XML(source) convenience delegates to it
|
|
418
|
+
* in Ruby (lib/makiri.rb), as Makiri.HTML does for HTML::Document.parse. */
|
|
419
|
+
rb_define_singleton_method(mkr_cXmlDocument, "parse", mkr_xml_s_parse, -1);
|
|
420
|
+
}
|