makiri 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/release.yml +12 -7
- data/CHANGELOG.md +93 -14
- data/README.md +173 -7
- data/Rakefile +103 -7
- data/ext/makiri/bridge/bridge.h +28 -0
- data/ext/makiri/bridge/ruby_string.c +217 -0
- data/ext/makiri/core/mkr_alloc.h +1 -1
- data/ext/makiri/core/mkr_buf.c +35 -1
- data/ext/makiri/core/mkr_buf.h +37 -3
- data/ext/makiri/core/mkr_core.h +1 -1
- data/ext/makiri/core/mkr_hash.h +1 -1
- data/ext/makiri/core/mkr_text.h +8 -8
- data/ext/makiri/extconf.rb +20 -2
- data/ext/makiri/glue/glue.h +47 -11
- data/ext/makiri/glue/ruby_doc.c +117 -43
- data/ext/makiri/glue/ruby_html_css.c +246 -0
- data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +242 -51
- data/ext/makiri/glue/ruby_html_node.c +888 -0
- data/ext/makiri/glue/ruby_html_serialize.c +154 -0
- data/ext/makiri/glue/ruby_node.c +54 -748
- data/ext/makiri/glue/ruby_node_set.c +167 -32
- data/ext/makiri/glue/ruby_xml.c +420 -0
- data/ext/makiri/glue/ruby_xml_node.c +1386 -0
- data/ext/makiri/glue/ruby_xpath.c +59 -26
- data/ext/makiri/glue/ruby_xpath.h +19 -0
- data/ext/makiri/lexbor_compat/compat.h +42 -9
- data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
- data/ext/makiri/lexbor_compat/dom_index.c +2 -2
- data/ext/makiri/lexbor_compat/post_parse.c +100 -10
- data/ext/makiri/lexbor_compat/source_loc.c +13 -9
- data/ext/makiri/lexbor_compat/text_index.c +14 -8
- data/ext/makiri/lexbor_compat/utf8_input.c +85 -26
- data/ext/makiri/makiri.c +139 -6
- data/ext/makiri/makiri.h +43 -2
- data/ext/makiri/xml/mkr_xml.h +126 -0
- data/ext/makiri/xml/mkr_xml_chars.c +225 -0
- data/ext/makiri/xml/mkr_xml_mutate.c +875 -0
- data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
- data/ext/makiri/xml/mkr_xml_node.c +267 -0
- data/ext/makiri/xml/mkr_xml_node.h +119 -0
- data/ext/makiri/xml/mkr_xml_tree.c +1479 -0
- data/ext/makiri/xpath/mkr_xpath.c +59 -32
- data/ext/makiri/xpath/mkr_xpath.h +96 -4
- data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
- data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
- data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +202 -175
- data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +110 -86
- data/ext/makiri/xpath/mkr_xpath_internal.h +91 -200
- data/ext/makiri/xpath/mkr_xpath_lex.c +2 -2
- data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
- data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +142 -0
- data/ext/makiri/xpath/mkr_xpath_parse.c +5 -5
- data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
- data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
- data/ext/makiri/xpath/mkr_xpath_shared.c +593 -0
- data/ext/makiri/xpath/{mkr_xpath_value.c → mkr_xpath_value_body.h} +145 -656
- data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
- data/lib/makiri/{attribute.rb → attr.rb} +7 -3
- data/lib/makiri/cdata_section.rb +21 -0
- data/lib/makiri/comment.rb +12 -0
- data/lib/makiri/compat_aliases.rb +30 -0
- data/lib/makiri/document.rb +4 -76
- data/lib/makiri/document_fragment.rb +14 -9
- data/lib/makiri/element.rb +5 -3
- data/lib/makiri/html/document.rb +106 -0
- data/lib/makiri/html/node_methods.rb +19 -0
- data/lib/makiri/html.rb +12 -0
- data/lib/makiri/node.rb +58 -15
- data/lib/makiri/node_set.rb +8 -0
- data/lib/makiri/processing_instruction.rb +12 -0
- data/lib/makiri/text.rb +2 -0
- data/lib/makiri/version.rb +1 -1
- data/lib/makiri/xml/document.rb +24 -0
- data/lib/makiri/xml/node_methods.rb +37 -0
- data/lib/makiri/xml.rb +10 -0
- data/lib/makiri/xpath_context.rb +1 -1
- data/lib/makiri.rb +23 -5
- data/script/build_native_gem.rb +2 -2
- data/script/check_c_safety.rb +32 -0
- data/script/check_c_safety_allowlist.yml +83 -0
- metadata +35 -9
- data/ext/makiri/glue/ruby_css.c +0 -185
- data/ext/makiri/glue/ruby_serialize.c +0 -92
- data/lib/makiri/cdata.rb +0 -6
data/ext/makiri/glue/ruby_doc.c
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
#include "glue.h"
|
|
2
2
|
#include "../lexbor_compat/compat_internal.h" /* mkr_dom_preorder_next */
|
|
3
3
|
#include "../core/mkr_core.h"
|
|
4
|
+
#include "../xml/mkr_xml.h" /* mkr_xml_doc_memsize for an XML-backed document */
|
|
4
5
|
|
|
5
6
|
#include <lexbor/html/parser.h>
|
|
6
7
|
#include <ruby/thread.h>
|
|
@@ -32,23 +33,47 @@ mkr_doc_free(void *ptr)
|
|
|
32
33
|
static size_t
|
|
33
34
|
mkr_doc_memsize(const void *ptr)
|
|
34
35
|
{
|
|
35
|
-
|
|
36
|
-
(
|
|
37
|
-
|
|
36
|
+
const mkr_doc_data_t *d = (const mkr_doc_data_t *)ptr;
|
|
37
|
+
size_t total = sizeof(mkr_doc_data_t);
|
|
38
|
+
/* The Lexbor (HTML) arena size is not cheaply queryable; report the wrapper
|
|
39
|
+
* only. The XML arena tracks its own byte total, so include it. */
|
|
40
|
+
if (d->parsed != NULL && mkr_parsed_kind(d->parsed) == MKR_DOC_XML) {
|
|
41
|
+
total += mkr_xml_doc_memsize(mkr_parsed_xml_doc(d->parsed));
|
|
42
|
+
}
|
|
43
|
+
return total;
|
|
38
44
|
}
|
|
39
45
|
|
|
46
|
+
/* Like nodes, HTML and XML Documents share the mkr_doc_data_t layout and GC
|
|
47
|
+
* functions but are wrapped under DISTINCT TypedData types (both deriving from
|
|
48
|
+
* the shared base mkr_doc_type), so mkr_html_doc_unwrap - which reinterprets the
|
|
49
|
+
* parsed document as a Lexbor lxb_html_document_t - RAISES TypeError on an XML
|
|
50
|
+
* Document via Ruby's type machinery, instead of relying on the (NDEBUG-erased)
|
|
51
|
+
* assert in mkr_parsed_html_doc. mkr_doc_type (base) is kept for the kind-agnostic
|
|
52
|
+
* accessors (mkr_doc_parsed, #errors) that legitimately accept either. */
|
|
40
53
|
const rb_data_type_t mkr_doc_type = {
|
|
41
54
|
"Makiri::Document",
|
|
42
55
|
{ mkr_doc_mark, mkr_doc_free, mkr_doc_memsize, },
|
|
43
56
|
0, 0, RUBY_TYPED_FREE_IMMEDIATELY,
|
|
44
57
|
};
|
|
58
|
+
static const rb_data_type_t mkr_html_doc_type = {
|
|
59
|
+
"Makiri::HTML::Document",
|
|
60
|
+
{ mkr_doc_mark, mkr_doc_free, mkr_doc_memsize, },
|
|
61
|
+
&mkr_doc_type, 0, RUBY_TYPED_FREE_IMMEDIATELY,
|
|
62
|
+
};
|
|
63
|
+
static const rb_data_type_t mkr_xml_doc_type = {
|
|
64
|
+
"Makiri::XML::Document",
|
|
65
|
+
{ mkr_doc_mark, mkr_doc_free, mkr_doc_memsize, },
|
|
66
|
+
&mkr_doc_type, 0, RUBY_TYPED_FREE_IMMEDIATELY,
|
|
67
|
+
};
|
|
45
68
|
|
|
46
69
|
lxb_dom_document_t *
|
|
47
|
-
|
|
70
|
+
mkr_html_doc_unwrap(VALUE rb_doc)
|
|
48
71
|
{
|
|
49
72
|
mkr_doc_data_t *d;
|
|
50
|
-
|
|
51
|
-
|
|
73
|
+
/* mkr_html_doc_type rejects an XML Document at the type boundary (its type
|
|
74
|
+
* chain does not include mkr_html_doc_type). */
|
|
75
|
+
TypedData_Get_Struct(rb_doc, mkr_doc_data_t, &mkr_html_doc_type, d);
|
|
76
|
+
return (lxb_dom_document_t *)mkr_parsed_html_doc(d->parsed);
|
|
52
77
|
}
|
|
53
78
|
|
|
54
79
|
mkr_parsed_t *
|
|
@@ -59,13 +84,19 @@ mkr_doc_parsed(VALUE rb_doc)
|
|
|
59
84
|
return d->parsed;
|
|
60
85
|
}
|
|
61
86
|
|
|
62
|
-
/* Wrap an owned mkr_parsed_t as a
|
|
63
|
-
*
|
|
87
|
+
/* Wrap an owned mkr_parsed_t as a Document. GC takes ownership of +parsed+
|
|
88
|
+
* (freed in dfree). The Ruby leaf class is chosen by kind: a Lexbor-backed
|
|
89
|
+
* handle becomes Makiri::Document (HTML), an arena-backed one
|
|
90
|
+
* Makiri::XML::Document (§2.3). Used to back a parsed document or a standalone
|
|
91
|
+
* DocumentFragment. */
|
|
64
92
|
VALUE
|
|
65
93
|
mkr_wrap_document(mkr_parsed_t *parsed)
|
|
66
94
|
{
|
|
95
|
+
int is_xml = (mkr_parsed_kind(parsed) == MKR_DOC_XML);
|
|
96
|
+
VALUE klass = is_xml ? mkr_cXmlDocument : mkr_cHtmlDocument;
|
|
67
97
|
mkr_doc_data_t *d;
|
|
68
|
-
VALUE obj = TypedData_Make_Struct(
|
|
98
|
+
VALUE obj = TypedData_Make_Struct(klass, mkr_doc_data_t,
|
|
99
|
+
is_xml ? &mkr_xml_doc_type : &mkr_html_doc_type, d);
|
|
69
100
|
d->parsed = parsed;
|
|
70
101
|
d->errors = rb_ary_new();
|
|
71
102
|
return obj;
|
|
@@ -95,7 +126,7 @@ mkr_resolve_fragment_context(lxb_dom_document_t *doc, VALUE context,
|
|
|
95
126
|
}
|
|
96
127
|
|
|
97
128
|
if (rb_obj_is_kind_of(context, mkr_cNode)) {
|
|
98
|
-
lxb_dom_node_t *cn =
|
|
129
|
+
lxb_dom_node_t *cn = mkr_html_node_unwrap(context); /* reject an XML node before lxb use */
|
|
99
130
|
if (cn->type != LXB_DOM_NODE_TYPE_ELEMENT) {
|
|
100
131
|
rb_raise(rb_eArgError, "fragment context node must be an element");
|
|
101
132
|
}
|
|
@@ -207,7 +238,39 @@ mkr_sanitize_html_input(VALUE html, const lxb_char_t **out, size_t *out_len,
|
|
|
207
238
|
/* Browser-compatible decoding: invalid UTF-8 -> U+FFFD; valid input is used
|
|
208
239
|
* in place (no copy, *owned == NULL). Returns -1 on OOM (nothing allocated)
|
|
209
240
|
* so the caller can release its parser before raising. */
|
|
210
|
-
|
|
241
|
+
VALUE u8 = mkr_ruby_to_utf8(html); /* honour the input encoding (-> UTF-8) */
|
|
242
|
+
mkr_ruby_borrowed_bytes_t hv = mkr_ruby_bytes_view(u8);
|
|
243
|
+
|
|
244
|
+
if (u8 != html) {
|
|
245
|
+
/* Transcoded to UTF-8: a fresh String that nothing keeps alive past this
|
|
246
|
+
* return, so we must NOT borrow its bytes. It is already valid UTF-8, so
|
|
247
|
+
* copy it into an owned buffer (the caller frees *owned) - no sanitise. */
|
|
248
|
+
size_t n = (hv.len > 0) ? hv.len : 1;
|
|
249
|
+
char *buf = mkr_reallocarray(NULL, n, 1);
|
|
250
|
+
if (buf == NULL) {
|
|
251
|
+
RB_GC_GUARD(hv.value);
|
|
252
|
+
return -1;
|
|
253
|
+
}
|
|
254
|
+
if (hv.len > 0) {
|
|
255
|
+
memcpy(buf, hv.ptr, hv.len);
|
|
256
|
+
}
|
|
257
|
+
*owned = (lxb_char_t *)buf;
|
|
258
|
+
*out = (const lxb_char_t *)buf;
|
|
259
|
+
*out_len = hv.len;
|
|
260
|
+
RB_GC_GUARD(hv.value);
|
|
261
|
+
return 0;
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
/* Not transcoded (UTF-8 / US-ASCII / binary): input Ruby already knows is
|
|
265
|
+
* valid UTF-8 is borrowed in place (the caller keeps `html` alive);
|
|
266
|
+
* otherwise sanitise as before. */
|
|
267
|
+
if (mkr_ruby_str_known_valid_utf8(html)) {
|
|
268
|
+
*owned = NULL;
|
|
269
|
+
*out = (const lxb_char_t *)hv.ptr;
|
|
270
|
+
*out_len = hv.len;
|
|
271
|
+
RB_GC_GUARD(hv.value);
|
|
272
|
+
return 0;
|
|
273
|
+
}
|
|
211
274
|
lxb_char_t *clean = NULL;
|
|
212
275
|
size_t clean_len = 0;
|
|
213
276
|
if (mkr_utf8_sanitize((const lxb_char_t *)hv.ptr, hv.len, &clean, &clean_len) != 0) {
|
|
@@ -249,7 +312,7 @@ mkr_import_fragment_children(lxb_dom_document_t *doc, lxb_dom_node_t *root,
|
|
|
249
312
|
}
|
|
250
313
|
|
|
251
314
|
/* Node#clone_node(deep = false): a shallow (or deep, with deep truthy) copy of
|
|
252
|
-
* this node, owned by the same document and detached from any parent
|
|
315
|
+
* this node, owned by the same document and detached from any parent - the DOM
|
|
253
316
|
* cloneNode, whose `deep` defaults to false (a missing/nil/false argument =>
|
|
254
317
|
* shallow). Built on the same import_node + <template>-content fixup the
|
|
255
318
|
* fragment parser uses, so a deep-cloned <template> carries its contents (which
|
|
@@ -262,7 +325,7 @@ mkr_node_clone_node(int argc, VALUE *argv, VALUE self)
|
|
|
262
325
|
rb_scan_args(argc, argv, "01", &deep_v);
|
|
263
326
|
bool deep = RTEST(deep_v);
|
|
264
327
|
|
|
265
|
-
lxb_dom_node_t *node =
|
|
328
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
266
329
|
lxb_dom_document_t *doc = node->owner_document;
|
|
267
330
|
|
|
268
331
|
lxb_dom_node_t *clone = lxb_dom_document_import_node(doc, node, deep);
|
|
@@ -272,11 +335,11 @@ mkr_node_clone_node(int argc, VALUE *argv, VALUE self)
|
|
|
272
335
|
if (deep) {
|
|
273
336
|
mkr_fixup_template_content(doc, node, clone);
|
|
274
337
|
}
|
|
275
|
-
return
|
|
338
|
+
return mkr_wrap_html_node(clone, mkr_node_document(self));
|
|
276
339
|
}
|
|
277
340
|
|
|
278
341
|
/* Document#import_node(node, deep = false): a shallow (or deep, with deep
|
|
279
|
-
* truthy) copy of +node+ owned by THIS document
|
|
342
|
+
* truthy) copy of +node+ owned by THIS document - the DOM importNode, whose
|
|
280
343
|
* `deep` defaults to false (a missing/nil/false argument => shallow). Unlike
|
|
281
344
|
* Node#clone_node, the copy is owned by the receiver rather than the node's own
|
|
282
345
|
* document, so it is the way to bring a node across documents (Makiri never
|
|
@@ -290,8 +353,8 @@ mkr_doc_import_node(int argc, VALUE *argv, VALUE self)
|
|
|
290
353
|
rb_scan_args(argc, argv, "11", &node_v, &deep_v);
|
|
291
354
|
bool deep = RTEST(deep_v);
|
|
292
355
|
|
|
293
|
-
lxb_dom_node_t *src =
|
|
294
|
-
lxb_dom_document_t *doc =
|
|
356
|
+
lxb_dom_node_t *src = mkr_html_node_unwrap(node_v); /* reject an XML node before lxb use */
|
|
357
|
+
lxb_dom_document_t *doc = mkr_html_doc_unwrap(self);
|
|
295
358
|
|
|
296
359
|
lxb_dom_node_t *imp = lxb_dom_document_import_node(doc, src, deep);
|
|
297
360
|
if (imp == NULL) {
|
|
@@ -300,7 +363,7 @@ mkr_doc_import_node(int argc, VALUE *argv, VALUE self)
|
|
|
300
363
|
if (deep) {
|
|
301
364
|
mkr_fixup_template_content(doc, src, imp);
|
|
302
365
|
}
|
|
303
|
-
return
|
|
366
|
+
return mkr_wrap_html_node(imp, self);
|
|
304
367
|
}
|
|
305
368
|
|
|
306
369
|
/* Parse +rb_html+ as a fragment in the given (tag id, namespace) context and
|
|
@@ -315,7 +378,7 @@ mkr_build_fragment_ctx(VALUE document, VALUE rb_html,
|
|
|
315
378
|
lxb_tag_id_t ctx_tag, lxb_ns_id_t ctx_ns)
|
|
316
379
|
{
|
|
317
380
|
VALUE html = rb_String(rb_html);
|
|
318
|
-
lxb_dom_document_t *doc =
|
|
381
|
+
lxb_dom_document_t *doc = mkr_html_doc_unwrap(document);
|
|
319
382
|
|
|
320
383
|
lxb_dom_document_fragment_t *frag = lxb_dom_document_fragment_interface_create(doc);
|
|
321
384
|
if (frag == NULL) {
|
|
@@ -349,7 +412,7 @@ mkr_build_fragment_ctx(VALUE document, VALUE rb_html,
|
|
|
349
412
|
|
|
350
413
|
lxb_html_parser_destroy(parser);
|
|
351
414
|
RB_GC_GUARD(html);
|
|
352
|
-
return
|
|
415
|
+
return mkr_wrap_html_node(frag_node, document);
|
|
353
416
|
}
|
|
354
417
|
|
|
355
418
|
/* document.fragment(html, context: ...) -> DocumentFragment bound to this
|
|
@@ -363,7 +426,7 @@ mkr_doc_fragment(int argc, VALUE *argv, VALUE self)
|
|
|
363
426
|
: rb_hash_aref(opts, ID2SYM(rb_intern("context")));
|
|
364
427
|
lxb_tag_id_t tag;
|
|
365
428
|
lxb_ns_id_t ns;
|
|
366
|
-
mkr_resolve_fragment_context(
|
|
429
|
+
mkr_resolve_fragment_context(mkr_html_doc_unwrap(self), context, &tag, &ns);
|
|
367
430
|
return mkr_build_fragment_ctx(self, html, tag, ns);
|
|
368
431
|
}
|
|
369
432
|
|
|
@@ -379,14 +442,14 @@ mkr_frag_s_parse(int argc, VALUE *argv, VALUE klass)
|
|
|
379
442
|
: rb_hash_aref(opts, ID2SYM(rb_intern("context")));
|
|
380
443
|
|
|
381
444
|
static const lxb_char_t shell[] = "<html><body></body></html>";
|
|
382
|
-
mkr_parsed_t *parsed = mkr_parse_html(shell, sizeof(shell) - 1);
|
|
445
|
+
mkr_parsed_t *parsed = mkr_parse_html(shell, sizeof(shell) - 1, true);
|
|
383
446
|
if (parsed == NULL) {
|
|
384
447
|
rb_raise(mkr_eError, "failed to create fragment document");
|
|
385
448
|
}
|
|
386
449
|
VALUE document = mkr_wrap_document(parsed); /* GC now owns parsed */
|
|
387
450
|
lxb_tag_id_t tag;
|
|
388
451
|
lxb_ns_id_t ns;
|
|
389
|
-
mkr_resolve_fragment_context(
|
|
452
|
+
mkr_resolve_fragment_context(mkr_html_doc_unwrap(document), context, &tag, &ns);
|
|
390
453
|
return mkr_build_fragment_ctx(document, html, tag, ns);
|
|
391
454
|
}
|
|
392
455
|
|
|
@@ -396,7 +459,7 @@ mkr_frag_s_parse(int argc, VALUE *argv, VALUE klass)
|
|
|
396
459
|
static VALUE
|
|
397
460
|
mkr_node_parse(VALUE self, VALUE rb_html)
|
|
398
461
|
{
|
|
399
|
-
lxb_dom_node_t *node =
|
|
462
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
400
463
|
if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
|
|
401
464
|
rb_raise(rb_eArgError, "Node#parse requires an element context");
|
|
402
465
|
}
|
|
@@ -415,6 +478,7 @@ mkr_node_parse(VALUE self, VALUE rb_html)
|
|
|
415
478
|
typedef struct {
|
|
416
479
|
const lxb_char_t *src;
|
|
417
480
|
size_t len;
|
|
481
|
+
bool assume_valid;
|
|
418
482
|
mkr_parsed_t *result;
|
|
419
483
|
} mkr_parse_nogvl_t;
|
|
420
484
|
|
|
@@ -425,7 +489,7 @@ static void *
|
|
|
425
489
|
mkr_parse_nogvl(void *p)
|
|
426
490
|
{
|
|
427
491
|
mkr_parse_nogvl_t *a = (mkr_parse_nogvl_t *)p;
|
|
428
|
-
a->result = mkr_parse_html(a->src, a->len);
|
|
492
|
+
a->result = mkr_parse_html(a->src, a->len, a->assume_valid);
|
|
429
493
|
return NULL;
|
|
430
494
|
}
|
|
431
495
|
|
|
@@ -440,11 +504,16 @@ static VALUE
|
|
|
440
504
|
mkr_doc_s_parse(VALUE klass, VALUE rb_source)
|
|
441
505
|
{
|
|
442
506
|
StringValue(rb_source);
|
|
507
|
+
/* Honour the input's encoding: UTF-8/US-ASCII/binary pass through (no
|
|
508
|
+
* degradation), anything else is transcoded to UTF-8 so its content is
|
|
509
|
+
* preserved rather than read as raw UTF-8 bytes. */
|
|
510
|
+
rb_source = mkr_ruby_to_utf8(rb_source);
|
|
443
511
|
|
|
444
512
|
/* Allocate the wrapper first (with parsed == NULL) so that if parsing
|
|
445
|
-
* fails the GC-managed object frees cleanly.
|
|
513
|
+
* fails the GC-managed object frees cleanly. This is the HTML parse entry
|
|
514
|
+
* (defined on Makiri::HTML::Document), so the result is always HTML. */
|
|
446
515
|
mkr_doc_data_t *d;
|
|
447
|
-
VALUE obj = TypedData_Make_Struct(klass, mkr_doc_data_t, &
|
|
516
|
+
VALUE obj = TypedData_Make_Struct(klass, mkr_doc_data_t, &mkr_html_doc_type, d);
|
|
448
517
|
d->parsed = NULL;
|
|
449
518
|
d->errors = rb_ary_new();
|
|
450
519
|
|
|
@@ -457,9 +526,14 @@ mkr_doc_s_parse(VALUE klass, VALUE rb_source)
|
|
|
457
526
|
if (mkr_ruby_copy_bytes(rb_source, &source) != 0) {
|
|
458
527
|
rb_raise(mkr_eError, "out of memory copying source");
|
|
459
528
|
}
|
|
529
|
+
/* Read the coderange (no scan) before releasing the GVL; the copy is
|
|
530
|
+
* byte-identical, so a source Ruby already knows is valid UTF-8 lets the
|
|
531
|
+
* parse skip its sanitisation scan. */
|
|
532
|
+
bool assume_valid = mkr_ruby_str_known_valid_utf8(rb_source);
|
|
460
533
|
RB_GC_GUARD(rb_source);
|
|
461
534
|
|
|
462
|
-
mkr_parse_nogvl_t args = { (const lxb_char_t *)source.ptr, source.len,
|
|
535
|
+
mkr_parse_nogvl_t args = { (const lxb_char_t *)source.ptr, source.len,
|
|
536
|
+
assume_valid, NULL };
|
|
463
537
|
rb_thread_call_without_gvl(mkr_parse_nogvl, &args, NULL, NULL);
|
|
464
538
|
mkr_owned_bytes_clear(&source);
|
|
465
539
|
|
|
@@ -479,8 +553,8 @@ mkr_doc_s_parse(VALUE klass, VALUE rb_source)
|
|
|
479
553
|
static VALUE
|
|
480
554
|
mkr_doc_root(VALUE self)
|
|
481
555
|
{
|
|
482
|
-
lxb_dom_document_t *doc =
|
|
483
|
-
return
|
|
556
|
+
lxb_dom_document_t *doc = mkr_html_doc_unwrap(self);
|
|
557
|
+
return mkr_wrap_html_node(lxb_dom_document_root(doc), self);
|
|
484
558
|
}
|
|
485
559
|
|
|
486
560
|
/* Get the document <title>, or "" if absent. */
|
|
@@ -489,7 +563,7 @@ mkr_doc_title(VALUE self)
|
|
|
489
563
|
{
|
|
490
564
|
size_t len = 0;
|
|
491
565
|
const lxb_char_t *str =
|
|
492
|
-
lxb_html_document_title((lxb_html_document_t *)
|
|
566
|
+
lxb_html_document_title((lxb_html_document_t *)mkr_html_doc_unwrap(self), &len);
|
|
493
567
|
return (str == NULL) ? rb_utf8_str_new("", 0)
|
|
494
568
|
: rb_utf8_str_new((const char *)str, len);
|
|
495
569
|
}
|
|
@@ -500,10 +574,10 @@ mkr_doc_title(VALUE self)
|
|
|
500
574
|
static VALUE
|
|
501
575
|
mkr_doc_internal_subset(VALUE self)
|
|
502
576
|
{
|
|
503
|
-
lxb_dom_node_t *doc = (lxb_dom_node_t *)
|
|
577
|
+
lxb_dom_node_t *doc = (lxb_dom_node_t *)mkr_html_doc_unwrap(self);
|
|
504
578
|
for (lxb_dom_node_t *c = doc->first_child; c != NULL; c = c->next) {
|
|
505
579
|
if (c->type == LXB_DOM_NODE_TYPE_DOCUMENT_TYPE) {
|
|
506
|
-
return
|
|
580
|
+
return mkr_wrap_html_node(c, self);
|
|
507
581
|
}
|
|
508
582
|
}
|
|
509
583
|
return Qnil;
|
|
@@ -515,7 +589,7 @@ mkr_doc_internal_subset(VALUE self)
|
|
|
515
589
|
static VALUE
|
|
516
590
|
mkr_doc_quirks_mode(VALUE self)
|
|
517
591
|
{
|
|
518
|
-
return INT2NUM((int)
|
|
592
|
+
return INT2NUM((int)mkr_html_doc_unwrap(self)->compat_mode);
|
|
519
593
|
}
|
|
520
594
|
|
|
521
595
|
/* Parse warnings. Reserved; currently always empty. */
|
|
@@ -530,18 +604,18 @@ mkr_doc_errors(VALUE self)
|
|
|
530
604
|
void
|
|
531
605
|
mkr_init_document(void)
|
|
532
606
|
{
|
|
533
|
-
rb_define_singleton_method(
|
|
534
|
-
rb_define_method(
|
|
535
|
-
rb_define_method(
|
|
536
|
-
rb_define_method(
|
|
537
|
-
rb_define_method(
|
|
538
|
-
rb_define_method(
|
|
539
|
-
rb_define_method(
|
|
540
|
-
rb_define_method(
|
|
607
|
+
rb_define_singleton_method(mkr_cHtmlDocument, "_parse", mkr_doc_s_parse, 1);
|
|
608
|
+
rb_define_method(mkr_cHtmlDocument, "root", mkr_doc_root, 0);
|
|
609
|
+
rb_define_method(mkr_cHtmlDocument, "title", mkr_doc_title, 0);
|
|
610
|
+
rb_define_method(mkr_cHtmlDocument, "errors", mkr_doc_errors, 0);
|
|
611
|
+
rb_define_method(mkr_cHtmlDocument, "internal_subset", mkr_doc_internal_subset, 0);
|
|
612
|
+
rb_define_method(mkr_cHtmlDocument, "quirks_mode", mkr_doc_quirks_mode, 0);
|
|
613
|
+
rb_define_method(mkr_cHtmlDocument, "fragment", mkr_doc_fragment, -1);
|
|
614
|
+
rb_define_method(mkr_cHtmlDocument, "import_node", mkr_doc_import_node, -1);
|
|
541
615
|
|
|
542
616
|
rb_define_singleton_method(mkr_cDocumentFragment, "parse", mkr_frag_s_parse, -1);
|
|
543
617
|
|
|
544
618
|
/* Node#parse(html): fragment-parse in this element's context (Nokogiri
|
|
545
619
|
* compatible). Defined here, next to the fragment machinery it reuses. */
|
|
546
|
-
rb_define_method(
|
|
620
|
+
rb_define_method(mkr_mHtmlNodeMethods, "parse", mkr_node_parse, 1);
|
|
547
621
|
}
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
#include "glue.h"
|
|
2
|
+
|
|
3
|
+
#include <lexbor/css/css.h>
|
|
4
|
+
#include <lexbor/selectors/selectors.h>
|
|
5
|
+
|
|
6
|
+
/*
|
|
7
|
+
* CSS selector queries, delegated to Lexbor's lxb_selectors engine.
|
|
8
|
+
*
|
|
9
|
+
* Node#css(selector) -> NodeSet (descendants matching, document order)
|
|
10
|
+
* Node#at_css(selector) -> first matching descendant, or nil
|
|
11
|
+
*
|
|
12
|
+
* The Lexbor CSS engine (selector parser + its arena + the traversal engine) is
|
|
13
|
+
* built once and reused for every query. CSS evaluation always holds the GVL (it
|
|
14
|
+
* never releases it), so all queries are serialized and a single process-global
|
|
15
|
+
* engine is safe with no locking. Creating and tearing the engine down per call
|
|
16
|
+
* - four create/init/destroy triples - dominated a cheap query like
|
|
17
|
+
* at_css('#id') (the match is found almost immediately, so setup IS the cost);
|
|
18
|
+
* reusing it closes the gap to nokolexbor, which caches the same objects in
|
|
19
|
+
* thread-local storage. Between calls only the parsed selector list's arena is
|
|
20
|
+
* reset (lxb_css_memory_clean) and the parser is returned to its CLEAN stage
|
|
21
|
+
* (lxb_css_parser_clean) - both preserve the memory/selectors objects set once;
|
|
22
|
+
* the selectors parse-state is auto-cleaned by the parser and the traversal
|
|
23
|
+
* engine self-cleans after each find/match. A malformed selector raises
|
|
24
|
+
* Makiri::CSS::SyntaxError.
|
|
25
|
+
*/
|
|
26
|
+
|
|
27
|
+
/* Process-global CSS engine, created lazily and kept for the process lifetime
|
|
28
|
+
* (one small allocation). parser->memory / parser->selectors are set once so the
|
|
29
|
+
* parser reuses the same selector arena + parse state across calls. */
|
|
30
|
+
static lxb_css_memory_t *g_css_mem;
|
|
31
|
+
static lxb_css_parser_t *g_css_parser;
|
|
32
|
+
static lxb_css_selectors_t *g_css_sel;
|
|
33
|
+
static lxb_selectors_t *g_selectors;
|
|
34
|
+
static int g_css_ready;
|
|
35
|
+
|
|
36
|
+
/* Build the shared engine on first use; raises Makiri::Error on init failure
|
|
37
|
+
* (leaving the globals unset, so a later call retries). */
|
|
38
|
+
static void
|
|
39
|
+
mkr_css_engine_init(void)
|
|
40
|
+
{
|
|
41
|
+
if (g_css_ready) {
|
|
42
|
+
return;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
lxb_css_memory_t *mem = lxb_css_memory_create();
|
|
46
|
+
lxb_css_parser_t *parser = lxb_css_parser_create();
|
|
47
|
+
lxb_css_selectors_t *css_sel = lxb_css_selectors_create();
|
|
48
|
+
lxb_selectors_t *selectors = lxb_selectors_create();
|
|
49
|
+
|
|
50
|
+
int ok = (mem != NULL && parser != NULL && css_sel != NULL && selectors != NULL)
|
|
51
|
+
&& (lxb_css_memory_init(mem, 128) == LXB_STATUS_OK)
|
|
52
|
+
&& (lxb_css_parser_init(parser, NULL) == LXB_STATUS_OK)
|
|
53
|
+
&& (lxb_css_selectors_init(css_sel) == LXB_STATUS_OK)
|
|
54
|
+
&& (lxb_selectors_init(selectors) == LXB_STATUS_OK);
|
|
55
|
+
|
|
56
|
+
if (!ok) {
|
|
57
|
+
if (selectors != NULL) lxb_selectors_destroy(selectors, true);
|
|
58
|
+
if (parser != NULL) lxb_css_parser_destroy(parser, true);
|
|
59
|
+
if (mem != NULL) lxb_css_memory_destroy(mem, true);
|
|
60
|
+
if (css_sel != NULL) lxb_css_selectors_destroy(css_sel, true);
|
|
61
|
+
rb_raise(mkr_eError, "failed to initialise CSS selector engine");
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
lxb_css_parser_memory_set(parser, mem);
|
|
65
|
+
lxb_css_parser_selectors_set(parser, css_sel);
|
|
66
|
+
|
|
67
|
+
g_css_mem = mem;
|
|
68
|
+
g_css_parser = parser;
|
|
69
|
+
g_css_sel = css_sel;
|
|
70
|
+
g_selectors = selectors;
|
|
71
|
+
g_css_ready = 1;
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
typedef struct {
|
|
75
|
+
VALUE set;
|
|
76
|
+
lxb_dom_node_t *root; /* excluded from results: css is descendant-only */
|
|
77
|
+
size_t count;
|
|
78
|
+
int overflow;
|
|
79
|
+
} mkr_css_ctx_t;
|
|
80
|
+
|
|
81
|
+
static lxb_status_t
|
|
82
|
+
mkr_css_find_cb(lxb_dom_node_t *node, lxb_css_selector_specificity_t spec,
|
|
83
|
+
void *ctx_)
|
|
84
|
+
{
|
|
85
|
+
(void)spec;
|
|
86
|
+
mkr_css_ctx_t *c = (mkr_css_ctx_t *)ctx_;
|
|
87
|
+
|
|
88
|
+
if (node == c->root) {
|
|
89
|
+
return LXB_STATUS_OK; /* descendant-only, like Nokogiri's node.css */
|
|
90
|
+
}
|
|
91
|
+
if (c->count >= MKR_NODE_SET_MAX) {
|
|
92
|
+
c->overflow = 1;
|
|
93
|
+
return LXB_STATUS_STOP; /* fail closed without raising mid-traversal */
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
mkr_node_set_push(c->set, (mkr_raw_node_t *)node);
|
|
97
|
+
c->count++;
|
|
98
|
+
return LXB_STATUS_OK;
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
/* at_css: capture the first matching descendant and stop. Avoids materialising a
|
|
102
|
+
* NodeSet (and a Ruby #first dispatch) for the one node the caller wants. */
|
|
103
|
+
typedef struct {
|
|
104
|
+
lxb_dom_node_t *root; /* excluded: descendant-only */
|
|
105
|
+
lxb_dom_node_t *found;
|
|
106
|
+
} mkr_css_first_ctx_t;
|
|
107
|
+
|
|
108
|
+
static lxb_status_t
|
|
109
|
+
mkr_css_first_cb(lxb_dom_node_t *node, lxb_css_selector_specificity_t spec,
|
|
110
|
+
void *ctx_)
|
|
111
|
+
{
|
|
112
|
+
(void)spec;
|
|
113
|
+
mkr_css_first_ctx_t *c = (mkr_css_first_ctx_t *)ctx_;
|
|
114
|
+
|
|
115
|
+
if (node == c->root) {
|
|
116
|
+
return LXB_STATUS_OK; /* descendant-only */
|
|
117
|
+
}
|
|
118
|
+
c->found = node;
|
|
119
|
+
return LXB_STATUS_STOP;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
/* Callback for matches?: signals that the node matched the selector. */
|
|
123
|
+
static lxb_status_t
|
|
124
|
+
mkr_css_match_cb(lxb_dom_node_t *node, lxb_css_selector_specificity_t spec,
|
|
125
|
+
void *ctx_)
|
|
126
|
+
{
|
|
127
|
+
(void)node; (void)spec;
|
|
128
|
+
*(int *)ctx_ = 1;
|
|
129
|
+
return LXB_STATUS_STOP;
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
/* Parse +rb_selector+ with the shared engine, hand the parsed list to +run+
|
|
133
|
+
* (the actual find / match against +node+), then reset the engine for the next
|
|
134
|
+
* call. Raises Makiri::CSS::SyntaxError on a bad selector; any result-specific
|
|
135
|
+
* limits are checked by the caller after return. */
|
|
136
|
+
static void
|
|
137
|
+
mkr_with_compiled_selector(VALUE rb_selector, lxb_dom_node_t *node,
|
|
138
|
+
lxb_status_t (*run)(lxb_selectors_t *, lxb_dom_node_t *,
|
|
139
|
+
lxb_css_selector_list_t *, void *),
|
|
140
|
+
void *u)
|
|
141
|
+
{
|
|
142
|
+
mkr_ruby_borrowed_text_t sv = mkr_ruby_verified_text(rb_selector, "CSS selector");
|
|
143
|
+
|
|
144
|
+
mkr_css_engine_init(); /* raises on init failure */
|
|
145
|
+
|
|
146
|
+
lxb_css_selector_list_t *list =
|
|
147
|
+
lxb_css_selectors_parse(g_css_parser, (const lxb_char_t *)sv.ptr, sv.len);
|
|
148
|
+
|
|
149
|
+
int syntax_error = (list == NULL || g_css_parser->status != LXB_STATUS_OK);
|
|
150
|
+
if (!syntax_error) {
|
|
151
|
+
(void)run(g_selectors, node, list, u);
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
/* Reset the shared engine for the next query: drop the parsed list's arena
|
|
155
|
+
* allocations and return the parser to its CLEAN stage. Both preserve the
|
|
156
|
+
* memory/selectors objects we set once; the traversal engine self-cleans
|
|
157
|
+
* after find/match. */
|
|
158
|
+
lxb_css_memory_clean(g_css_mem);
|
|
159
|
+
lxb_css_parser_clean(g_css_parser);
|
|
160
|
+
|
|
161
|
+
if (syntax_error) {
|
|
162
|
+
rb_raise(mkr_eCSSSyntaxError, "invalid CSS selector: %" PRIsVALUE, sv.value);
|
|
163
|
+
}
|
|
164
|
+
RB_GC_GUARD(sv.value);
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
/* find: collect descendants matching the selector (MATCH_FIRST dedups a node
|
|
168
|
+
* that matches several selectors in a comma list). */
|
|
169
|
+
static lxb_status_t
|
|
170
|
+
mkr_run_find(lxb_selectors_t *selectors, lxb_dom_node_t *root,
|
|
171
|
+
lxb_css_selector_list_t *list, void *u)
|
|
172
|
+
{
|
|
173
|
+
lxb_selectors_opt_set(selectors, LXB_SELECTORS_OPT_MATCH_FIRST);
|
|
174
|
+
return lxb_selectors_find(selectors, root, list, mkr_css_find_cb, u);
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
/* find_first: stop at the first matching descendant (for at_css). */
|
|
178
|
+
static lxb_status_t
|
|
179
|
+
mkr_run_find_first(lxb_selectors_t *selectors, lxb_dom_node_t *root,
|
|
180
|
+
lxb_css_selector_list_t *list, void *u)
|
|
181
|
+
{
|
|
182
|
+
lxb_selectors_opt_set(selectors, LXB_SELECTORS_OPT_MATCH_FIRST);
|
|
183
|
+
return lxb_selectors_find(selectors, root, list, mkr_css_first_cb, u);
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
/* match_node: does THIS node match? */
|
|
187
|
+
static lxb_status_t
|
|
188
|
+
mkr_run_match(lxb_selectors_t *selectors, lxb_dom_node_t *node,
|
|
189
|
+
lxb_css_selector_list_t *list, void *u)
|
|
190
|
+
{
|
|
191
|
+
return lxb_selectors_match_node(selectors, node, list, mkr_css_match_cb, u);
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
/* Node#css: collect every matching descendant into a NodeSet (document order).
|
|
195
|
+
* Raises Makiri::CSS::SyntaxError on a bad selector, Makiri::Error on an
|
|
196
|
+
* over-large result. */
|
|
197
|
+
static VALUE
|
|
198
|
+
mkr_node_css(VALUE self, VALUE rb_selector)
|
|
199
|
+
{
|
|
200
|
+
lxb_dom_node_t *root = mkr_html_node_unwrap(self);
|
|
201
|
+
VALUE document = mkr_node_document(self);
|
|
202
|
+
VALUE set = mkr_node_set_new(document);
|
|
203
|
+
|
|
204
|
+
mkr_css_ctx_t ctx = { .set = set, .root = root, .count = 0, .overflow = 0 };
|
|
205
|
+
mkr_with_compiled_selector(rb_selector, root, mkr_run_find, &ctx);
|
|
206
|
+
|
|
207
|
+
if (ctx.overflow) {
|
|
208
|
+
rb_raise(mkr_eError, "CSS result set exceeded the node limit (%u)",
|
|
209
|
+
MKR_NODE_SET_MAX);
|
|
210
|
+
}
|
|
211
|
+
return set;
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
/* Node#at_css: the first matching descendant, or nil. */
|
|
215
|
+
static VALUE
|
|
216
|
+
mkr_node_at_css(VALUE self, VALUE rb_selector)
|
|
217
|
+
{
|
|
218
|
+
lxb_dom_node_t *root = mkr_html_node_unwrap(self);
|
|
219
|
+
|
|
220
|
+
mkr_css_first_ctx_t ctx = { .root = root, .found = NULL };
|
|
221
|
+
mkr_with_compiled_selector(rb_selector, root, mkr_run_find_first, &ctx);
|
|
222
|
+
|
|
223
|
+
return ctx.found != NULL
|
|
224
|
+
? mkr_wrap_html_node(ctx.found, mkr_node_document(self))
|
|
225
|
+
: Qnil;
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
/* Node#matches?(selector): does THIS node match the CSS selector? (Like
|
|
229
|
+
* Nokogiri - tested against the node itself, not its descendants.) A malformed
|
|
230
|
+
* selector raises Makiri::CSS::SyntaxError. */
|
|
231
|
+
static VALUE
|
|
232
|
+
mkr_node_matches(VALUE self, VALUE rb_selector)
|
|
233
|
+
{
|
|
234
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
235
|
+
int matched = 0;
|
|
236
|
+
mkr_with_compiled_selector(rb_selector, node, mkr_run_match, &matched);
|
|
237
|
+
return matched ? Qtrue : Qfalse;
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
void
|
|
241
|
+
mkr_init_css(void)
|
|
242
|
+
{
|
|
243
|
+
rb_define_method(mkr_mHtmlNodeMethods, "css", mkr_node_css, 1);
|
|
244
|
+
rb_define_method(mkr_mHtmlNodeMethods, "at_css", mkr_node_at_css, 1);
|
|
245
|
+
rb_define_method(mkr_mHtmlNodeMethods, "matches?", mkr_node_matches, 1);
|
|
246
|
+
}
|