makiri 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/conformance.yml +22 -0
- data/.github/workflows/libfuzzer.yml +83 -0
- data/.github/workflows/release.yml +12 -7
- data/.github/workflows/security.yml +88 -3
- data/.github/workflows/valgrind.yml +135 -0
- data/CHANGELOG.md +152 -15
- data/README.md +183 -13
- data/Rakefile +294 -7
- data/ext/makiri/bridge/bridge.h +28 -0
- data/ext/makiri/bridge/ruby_string.c +282 -12
- data/ext/makiri/core/mkr_alloc.c +40 -3
- data/ext/makiri/core/mkr_alloc.h +28 -5
- data/ext/makiri/core/mkr_buf.c +47 -3
- data/ext/makiri/core/mkr_buf.h +112 -3
- data/ext/makiri/core/mkr_core.c +143 -0
- data/ext/makiri/core/mkr_core.h +11 -2
- data/ext/makiri/core/mkr_hash.h +1 -1
- data/ext/makiri/core/mkr_span.h +186 -0
- data/ext/makiri/core/mkr_text.h +8 -8
- data/ext/makiri/core/mkr_utf8.c +101 -0
- data/ext/makiri/core/mkr_utf8.h +88 -0
- data/ext/makiri/extconf.rb +123 -10
- data/ext/makiri/fuzz/Makefile +95 -0
- data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
- data/ext/makiri/fuzz/xml_fuzz.c +24 -0
- data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
- data/ext/makiri/glue/glue.h +55 -11
- data/ext/makiri/glue/ruby_doc.c +129 -59
- data/ext/makiri/glue/ruby_html_css.c +292 -0
- data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +248 -52
- data/ext/makiri/glue/ruby_html_node.c +859 -0
- data/ext/makiri/glue/ruby_html_serialize.c +154 -0
- data/ext/makiri/glue/ruby_node.c +74 -729
- data/ext/makiri/glue/ruby_node_set.c +167 -32
- data/ext/makiri/glue/ruby_xml.c +602 -0
- data/ext/makiri/glue/ruby_xml_node.c +1373 -0
- data/ext/makiri/glue/ruby_xpath.c +63 -30
- data/ext/makiri/glue/ruby_xpath.h +19 -0
- data/ext/makiri/lexbor_compat/compat.h +42 -9
- data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
- data/ext/makiri/lexbor_compat/dom_index.c +2 -2
- data/ext/makiri/lexbor_compat/post_parse.c +100 -10
- data/ext/makiri/lexbor_compat/source_loc.c +15 -13
- data/ext/makiri/lexbor_compat/text_index.c +14 -8
- data/ext/makiri/lexbor_compat/utf8_input.c +19 -33
- data/ext/makiri/makiri.c +184 -6
- data/ext/makiri/makiri.h +43 -2
- data/ext/makiri/xml/mkr_xml.h +125 -0
- data/ext/makiri/xml/mkr_xml_chars.c +195 -0
- data/ext/makiri/xml/mkr_xml_index.c +169 -0
- data/ext/makiri/xml/mkr_xml_index.h +48 -0
- data/ext/makiri/xml/mkr_xml_mutate.c +817 -0
- data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
- data/ext/makiri/xml/mkr_xml_node.c +399 -0
- data/ext/makiri/xml/mkr_xml_node.h +184 -0
- data/ext/makiri/xml/mkr_xml_tree.c +1515 -0
- data/ext/makiri/xpath/mkr_css.c +1023 -0
- data/ext/makiri/xpath/mkr_css.h +65 -0
- data/ext/makiri/xpath/mkr_xpath.c +96 -32
- data/ext/makiri/xpath/mkr_xpath.h +109 -4
- data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
- data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
- data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +551 -241
- data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +318 -276
- data/ext/makiri/xpath/mkr_xpath_internal.h +177 -206
- data/ext/makiri/xpath/mkr_xpath_lex.c +95 -125
- data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
- data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +145 -0
- data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
- data/ext/makiri/xpath/mkr_xpath_parse.c +83 -94
- data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
- data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
- data/ext/makiri/xpath/mkr_xpath_shared.c +609 -0
- data/ext/makiri/xpath/mkr_xpath_value_body.h +801 -0
- data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
- data/lib/makiri/{attribute.rb → attr.rb} +7 -3
- data/lib/makiri/cdata_section.rb +19 -0
- data/lib/makiri/comment.rb +10 -0
- data/lib/makiri/compat_aliases.rb +30 -0
- data/lib/makiri/document.rb +9 -73
- data/lib/makiri/document_fragment.rb +14 -9
- data/lib/makiri/element.rb +4 -4
- data/lib/makiri/html/document.rb +106 -0
- data/lib/makiri/html/node_methods.rb +19 -0
- data/lib/makiri/html.rb +12 -0
- data/lib/makiri/node.rb +58 -15
- data/lib/makiri/node_set.rb +8 -0
- data/lib/makiri/processing_instruction.rb +10 -0
- data/lib/makiri/text.rb +1 -1
- data/lib/makiri/version.rb +1 -1
- data/lib/makiri/xml/builder.rb +263 -0
- data/lib/makiri/xml/document.rb +24 -0
- data/lib/makiri/xml/node_methods.rb +84 -0
- data/lib/makiri/xml.rb +10 -0
- data/lib/makiri/xpath_context.rb +1 -1
- data/lib/makiri.rb +24 -5
- data/script/build_native_gem.rb +2 -2
- data/script/check_alloc_failures.rb +266 -0
- data/script/check_c_safety.rb +77 -2
- data/script/check_c_safety_allowlist.yml +102 -0
- data/script/check_leaks.rb +64 -0
- data/script/leaks_harness.rb +64 -0
- data/vendor/lexbor/CMakeLists.txt +6 -0
- data/vendor/lexbor/README.md +12 -0
- data/vendor/lexbor/config.cmake +1 -1
- data/vendor/lexbor/source/lexbor/core/base.h +1 -1
- data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
- data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
- data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
- data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
- data/vendor/lexbor/source/lexbor/html/base.h +1 -1
- data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
- data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
- data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
- data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
- data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
- data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
- data/vendor/lexbor/source/lexbor/url/base.h +1 -1
- data/vendor/lexbor/source/lexbor/url/url.c +5 -2
- data/vendor/lexbor/source/lexbor/url/url.h +9 -0
- data/vendor/lexbor/version +1 -1
- metadata +53 -9
- data/ext/makiri/glue/ruby_css.c +0 -185
- data/ext/makiri/glue/ruby_serialize.c +0 -92
- data/ext/makiri/xpath/mkr_xpath_value.c +0 -1286
- data/lib/makiri/cdata.rb +0 -6
data/ext/makiri/glue/glue.h
CHANGED
|
@@ -8,11 +8,24 @@
|
|
|
8
8
|
extern "C" {
|
|
9
9
|
#endif
|
|
10
10
|
|
|
11
|
+
/* A DOM node pointer of UNKNOWN representation - an HTML lxb_dom_node_t or an XML
|
|
12
|
+
* mkr_xml_node_t - as stored in a node wrapper or a NodeSet. It is an INCOMPLETE
|
|
13
|
+
* type on purpose: it cannot be dereferenced, and (unlike void*) it does not
|
|
14
|
+
* implicitly convert to a typed pointer, so reading a stored node AS a specific
|
|
15
|
+
* representation requires an explicit cast that the kind-checked accessors
|
|
16
|
+
* (mkr_html_node_unwrap / mkr_xml_node_unwrap) justify by the wrapper's TypedData type
|
|
17
|
+
* (or, for a NodeSet, by doc_is_xml). The stored pointer is only ever
|
|
18
|
+
* pointer-compared or cast through one of those accessors. */
|
|
19
|
+
typedef struct mkr_raw_node mkr_raw_node_t;
|
|
20
|
+
|
|
11
21
|
/* Wrapper for any DOM node except Document. The node memory is owned by the
|
|
12
|
-
* document's Lexbor arena; we keep only the
|
|
13
|
-
* reference to the Ruby Document so the arena
|
|
22
|
+
* document's arena (an HTML Lexbor arena or the XML node arena); we keep only the
|
|
23
|
+
* pointer plus a keepalive VALUE reference to the Ruby Document so the arena
|
|
24
|
+
* outlives the wrapper. The pointer is representation-opaque (mkr_raw_node_t):
|
|
25
|
+
* read it only through mkr_html_node_unwrap / mkr_xml_node_unwrap, which check the
|
|
26
|
+
* wrapper's representation (distinct TypedData types) before casting. */
|
|
14
27
|
typedef struct {
|
|
15
|
-
|
|
28
|
+
mkr_raw_node_t *node;
|
|
16
29
|
VALUE document;
|
|
17
30
|
} mkr_node_data_t;
|
|
18
31
|
|
|
@@ -31,14 +44,43 @@ extern const rb_data_type_t mkr_node_type;
|
|
|
31
44
|
extern const rb_data_type_t mkr_doc_type;
|
|
32
45
|
extern const rb_data_type_t mkr_node_set_type;
|
|
33
46
|
|
|
34
|
-
/* Node bridge (glue/ruby_node.c).
|
|
47
|
+
/* Node bridge (glue/ruby_node.c). mkr_wrap_html_node returns the Document VALUE
|
|
35
48
|
* for the document node, Qnil for NULL, otherwise a freshly-wrapped Node. */
|
|
36
|
-
VALUE
|
|
37
|
-
lxb_dom_node_t *mkr_node_unwrap(VALUE rb_node);
|
|
49
|
+
VALUE mkr_wrap_html_node(lxb_dom_node_t *node, VALUE document);
|
|
38
50
|
VALUE mkr_node_document(VALUE rb_node);
|
|
39
51
|
|
|
52
|
+
/* HTML and XML nodes are wrapped under DISTINCT TypedData types (both deriving
|
|
53
|
+
* from the shared base mkr_node_type), so a representation-specific accessor
|
|
54
|
+
* rejects the wrong kind via Ruby's type machinery. See ruby_node.c.
|
|
55
|
+
* mkr_html_node_unwrap -> lxb_dom_node_t* ; raises on an XML node/Document.
|
|
56
|
+
* mkr_xml_node_unwrap -> mkr_xml_node_t* ; raises on an HTML node/Document (ruby_xml_node.c).
|
|
57
|
+
* mkr_node_raw -> void* ; kind-agnostic raw pointer for identity, or for a
|
|
58
|
+
* site where the kind is already guaranteed. Deref needs an
|
|
59
|
+
* explicit cast - never treat it as a typed pointer blindly.
|
|
60
|
+
* mkr_node_id -> uintptr_t ; node identity for ==/eql?/hash/pointer_id. */
|
|
61
|
+
extern const rb_data_type_t mkr_html_node_type;
|
|
62
|
+
extern const rb_data_type_t mkr_xml_node_type;
|
|
63
|
+
lxb_dom_node_t *mkr_html_node_unwrap(VALUE rb_node);
|
|
64
|
+
void *mkr_node_raw(VALUE rb_node);
|
|
65
|
+
uintptr_t mkr_node_id(VALUE rb_node);
|
|
66
|
+
|
|
67
|
+
/* Representation-neutral identity methods (ruby_node.c): depend only on
|
|
68
|
+
* mkr_node_id, so the HTML and XML NodeMethods modules bind ==/eql? to
|
|
69
|
+
* mkr_node_equals, hash to mkr_node_hash, and pointer_id to mkr_node_pointer_id -
|
|
70
|
+
* one implementation, not one per representation. */
|
|
71
|
+
VALUE mkr_node_equals(VALUE self, VALUE other);
|
|
72
|
+
VALUE mkr_node_pointer_id(VALUE self);
|
|
73
|
+
VALUE mkr_node_hash(VALUE self);
|
|
74
|
+
|
|
75
|
+
/* XML node bridge (glue/ruby_xml_node.c): wrap a custom XML node into the right
|
|
76
|
+
* Makiri::XML::* leaf (Qnil for NULL, the Document VALUE for the document node). */
|
|
77
|
+
struct mkr_xml_node;
|
|
78
|
+
VALUE mkr_wrap_xml_node(struct mkr_xml_node *node, VALUE document);
|
|
79
|
+
/* XML node-pointer accessor; raises TypeError on an HTML node/Document. */
|
|
80
|
+
struct mkr_xml_node *mkr_xml_node_unwrap(VALUE rb_node);
|
|
81
|
+
|
|
40
82
|
/* Document bridge (glue/ruby_doc.c). */
|
|
41
|
-
lxb_dom_document_t *
|
|
83
|
+
lxb_dom_document_t *mkr_html_doc_unwrap(VALUE rb_doc);
|
|
42
84
|
mkr_parsed_t *mkr_doc_parsed(VALUE rb_doc);
|
|
43
85
|
VALUE mkr_wrap_document(mkr_parsed_t *parsed); /* GC takes ownership */
|
|
44
86
|
|
|
@@ -46,7 +88,7 @@ VALUE mkr_wrap_document(mkr_parsed_t *parsed); /* GC takes ownersh
|
|
|
46
88
|
* inner_html=/outer_html= so the UTF-8 sanitisation and import+template-fixup
|
|
47
89
|
* are not duplicated.
|
|
48
90
|
*
|
|
49
|
-
* mkr_sanitize_html_input: decode rb_html for the fragment parser
|
|
91
|
+
* mkr_sanitize_html_input: decode rb_html for the fragment parser - *out / *out_len
|
|
50
92
|
* are the bytes to parse, *owned a malloc'd buffer to free afterwards (NULL when
|
|
51
93
|
* the input is used in place). Returns 0, or -1 on OOM (nothing allocated), so
|
|
52
94
|
* the caller can release its parser before raising. See mkr_utf8_sanitize.
|
|
@@ -54,7 +96,7 @@ VALUE mkr_wrap_document(mkr_parsed_t *parsed); /* GC takes ownersh
|
|
|
54
96
|
* mkr_import_fragment_children: deep-import each child of `root` into `doc`, hand
|
|
55
97
|
* it to `emit`, and fix up any <template> contents (which import_node omits).
|
|
56
98
|
*
|
|
57
|
-
* mkr_emit_append / mkr_emit_before: emit callbacks
|
|
99
|
+
* mkr_emit_append / mkr_emit_before: emit callbacks - append as last child of
|
|
58
100
|
* `u`, or insert before the reference node `u`. */
|
|
59
101
|
int mkr_sanitize_html_input(VALUE html, const lxb_char_t **out, size_t *out_len,
|
|
60
102
|
lxb_char_t **owned);
|
|
@@ -69,9 +111,11 @@ void mkr_emit_before(lxb_dom_node_t *imported, void *u);
|
|
|
69
111
|
* mkr_init_node. */
|
|
70
112
|
VALUE mkr_node_clone_node(int argc, VALUE *argv, VALUE self);
|
|
71
113
|
|
|
72
|
-
/* NodeSet bridge (glue/ruby_node_set.c).
|
|
114
|
+
/* NodeSet bridge (glue/ruby_node_set.c). mkr_raw_node_t (above): callers cast
|
|
115
|
+
* their typed node to it when pushing (forgetting the type on store is the safe
|
|
116
|
+
* direction); the single typed read-back lives in mkr_node_set_wrap. */
|
|
73
117
|
VALUE mkr_node_set_new(VALUE document);
|
|
74
|
-
void mkr_node_set_push(VALUE rb_set,
|
|
118
|
+
void mkr_node_set_push(VALUE rb_set, mkr_raw_node_t *node);
|
|
75
119
|
|
|
76
120
|
#ifdef __cplusplus
|
|
77
121
|
}
|
data/ext/makiri/glue/ruby_doc.c
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
#include "glue.h"
|
|
2
2
|
#include "../lexbor_compat/compat_internal.h" /* mkr_dom_preorder_next */
|
|
3
3
|
#include "../core/mkr_core.h"
|
|
4
|
+
#include "../xml/mkr_xml.h" /* mkr_xml_doc_memsize for an XML-backed document */
|
|
4
5
|
|
|
5
6
|
#include <lexbor/html/parser.h>
|
|
6
7
|
#include <ruby/thread.h>
|
|
@@ -32,23 +33,47 @@ mkr_doc_free(void *ptr)
|
|
|
32
33
|
static size_t
|
|
33
34
|
mkr_doc_memsize(const void *ptr)
|
|
34
35
|
{
|
|
35
|
-
|
|
36
|
-
(
|
|
37
|
-
|
|
36
|
+
const mkr_doc_data_t *d = (const mkr_doc_data_t *)ptr;
|
|
37
|
+
size_t total = sizeof(mkr_doc_data_t);
|
|
38
|
+
/* The Lexbor (HTML) arena size is not cheaply queryable; report the wrapper
|
|
39
|
+
* only. The XML arena tracks its own byte total, so include it. */
|
|
40
|
+
if (d->parsed != NULL && mkr_parsed_kind(d->parsed) == MKR_DOC_XML) {
|
|
41
|
+
total += mkr_xml_doc_memsize(mkr_parsed_xml_doc(d->parsed));
|
|
42
|
+
}
|
|
43
|
+
return total;
|
|
38
44
|
}
|
|
39
45
|
|
|
46
|
+
/* Like nodes, HTML and XML Documents share the mkr_doc_data_t layout and GC
|
|
47
|
+
* functions but are wrapped under DISTINCT TypedData types (both deriving from
|
|
48
|
+
* the shared base mkr_doc_type), so mkr_html_doc_unwrap - which reinterprets the
|
|
49
|
+
* parsed document as a Lexbor lxb_html_document_t - RAISES TypeError on an XML
|
|
50
|
+
* Document via Ruby's type machinery, instead of relying on the (NDEBUG-erased)
|
|
51
|
+
* assert in mkr_parsed_html_doc. mkr_doc_type (base) is kept for the kind-agnostic
|
|
52
|
+
* accessors (mkr_doc_parsed, #errors) that legitimately accept either. */
|
|
40
53
|
const rb_data_type_t mkr_doc_type = {
|
|
41
54
|
"Makiri::Document",
|
|
42
55
|
{ mkr_doc_mark, mkr_doc_free, mkr_doc_memsize, },
|
|
43
56
|
0, 0, RUBY_TYPED_FREE_IMMEDIATELY,
|
|
44
57
|
};
|
|
58
|
+
static const rb_data_type_t mkr_html_doc_type = {
|
|
59
|
+
"Makiri::HTML::Document",
|
|
60
|
+
{ mkr_doc_mark, mkr_doc_free, mkr_doc_memsize, },
|
|
61
|
+
&mkr_doc_type, 0, RUBY_TYPED_FREE_IMMEDIATELY,
|
|
62
|
+
};
|
|
63
|
+
static const rb_data_type_t mkr_xml_doc_type = {
|
|
64
|
+
"Makiri::XML::Document",
|
|
65
|
+
{ mkr_doc_mark, mkr_doc_free, mkr_doc_memsize, },
|
|
66
|
+
&mkr_doc_type, 0, RUBY_TYPED_FREE_IMMEDIATELY,
|
|
67
|
+
};
|
|
45
68
|
|
|
46
69
|
lxb_dom_document_t *
|
|
47
|
-
|
|
70
|
+
mkr_html_doc_unwrap(VALUE rb_doc)
|
|
48
71
|
{
|
|
49
72
|
mkr_doc_data_t *d;
|
|
50
|
-
|
|
51
|
-
|
|
73
|
+
/* mkr_html_doc_type rejects an XML Document at the type boundary (its type
|
|
74
|
+
* chain does not include mkr_html_doc_type). */
|
|
75
|
+
TypedData_Get_Struct(rb_doc, mkr_doc_data_t, &mkr_html_doc_type, d);
|
|
76
|
+
return (lxb_dom_document_t *)mkr_parsed_html_doc(d->parsed);
|
|
52
77
|
}
|
|
53
78
|
|
|
54
79
|
mkr_parsed_t *
|
|
@@ -59,13 +84,19 @@ mkr_doc_parsed(VALUE rb_doc)
|
|
|
59
84
|
return d->parsed;
|
|
60
85
|
}
|
|
61
86
|
|
|
62
|
-
/* Wrap an owned mkr_parsed_t as a
|
|
63
|
-
*
|
|
87
|
+
/* Wrap an owned mkr_parsed_t as a Document. GC takes ownership of +parsed+
|
|
88
|
+
* (freed in dfree). The Ruby leaf class is chosen by kind: a Lexbor-backed
|
|
89
|
+
* handle becomes Makiri::HTML::Document, an arena-backed one
|
|
90
|
+
* Makiri::XML::Document (§2.3). Used to back a parsed document or a standalone
|
|
91
|
+
* DocumentFragment. */
|
|
64
92
|
VALUE
|
|
65
93
|
mkr_wrap_document(mkr_parsed_t *parsed)
|
|
66
94
|
{
|
|
95
|
+
int is_xml = (mkr_parsed_kind(parsed) == MKR_DOC_XML);
|
|
96
|
+
VALUE klass = is_xml ? mkr_cXmlDocument : mkr_cHtmlDocument;
|
|
67
97
|
mkr_doc_data_t *d;
|
|
68
|
-
VALUE obj = TypedData_Make_Struct(
|
|
98
|
+
VALUE obj = TypedData_Make_Struct(klass, mkr_doc_data_t,
|
|
99
|
+
is_xml ? &mkr_xml_doc_type : &mkr_html_doc_type, d);
|
|
69
100
|
d->parsed = parsed;
|
|
70
101
|
d->errors = rb_ary_new();
|
|
71
102
|
return obj;
|
|
@@ -95,7 +126,7 @@ mkr_resolve_fragment_context(lxb_dom_document_t *doc, VALUE context,
|
|
|
95
126
|
}
|
|
96
127
|
|
|
97
128
|
if (rb_obj_is_kind_of(context, mkr_cNode)) {
|
|
98
|
-
lxb_dom_node_t *cn =
|
|
129
|
+
lxb_dom_node_t *cn = mkr_html_node_unwrap(context); /* reject an XML node before lxb use */
|
|
99
130
|
if (cn->type != LXB_DOM_NODE_TYPE_ELEMENT) {
|
|
100
131
|
rb_raise(rb_eArgError, "fragment context node must be an element");
|
|
101
132
|
}
|
|
@@ -109,10 +140,10 @@ mkr_resolve_fragment_context(lxb_dom_document_t *doc, VALUE context,
|
|
|
109
140
|
mkr_ruby_borrowed_text_t cv = mkr_ruby_verified_text(context, "fragment context element");
|
|
110
141
|
const lxb_char_t *p = (const lxb_char_t *)cv.ptr;
|
|
111
142
|
size_t n = cv.len;
|
|
112
|
-
if (
|
|
143
|
+
if (mkr_bytes_eq(p, n, "svg", 3)) {
|
|
113
144
|
*out_tag = LXB_TAG_SVG; *out_ns = LXB_NS_SVG; return;
|
|
114
145
|
}
|
|
115
|
-
if (
|
|
146
|
+
if (mkr_bytes_eq(p, n, "math", 4)) {
|
|
116
147
|
*out_tag = LXB_TAG_MATH; *out_ns = LXB_NS_MATH; return;
|
|
117
148
|
}
|
|
118
149
|
lxb_tag_id_t tid = lxb_tag_id_by_name(doc->tags, p, n);
|
|
@@ -207,17 +238,45 @@ mkr_sanitize_html_input(VALUE html, const lxb_char_t **out, size_t *out_len,
|
|
|
207
238
|
/* Browser-compatible decoding: invalid UTF-8 -> U+FFFD; valid input is used
|
|
208
239
|
* in place (no copy, *owned == NULL). Returns -1 on OOM (nothing allocated)
|
|
209
240
|
* so the caller can release its parser before raising. */
|
|
210
|
-
|
|
241
|
+
VALUE u8 = mkr_ruby_to_utf8(html); /* honour the input encoding (-> UTF-8) */
|
|
242
|
+
mkr_ruby_borrowed_bytes_t hv = mkr_ruby_bytes_view(u8);
|
|
243
|
+
|
|
244
|
+
if (u8 != html) {
|
|
245
|
+
/* Transcoded to UTF-8: a fresh String that nothing keeps alive past this
|
|
246
|
+
* return, so we must NOT borrow its bytes. It is already valid UTF-8, so
|
|
247
|
+
* copy it into an owned buffer (the caller frees *owned) - no sanitise. */
|
|
248
|
+
size_t n = (hv.len > 0) ? hv.len : 1;
|
|
249
|
+
char *buf = mkr_reallocarray(NULL, n, 1);
|
|
250
|
+
if (buf == NULL) {
|
|
251
|
+
return -1;
|
|
252
|
+
}
|
|
253
|
+
if (hv.len > 0) {
|
|
254
|
+
memcpy(buf, hv.ptr, hv.len);
|
|
255
|
+
}
|
|
256
|
+
*owned = (lxb_char_t *)buf;
|
|
257
|
+
*out = (const lxb_char_t *)buf;
|
|
258
|
+
*out_len = hv.len;
|
|
259
|
+
RB_GC_GUARD(hv.value);
|
|
260
|
+
return 0;
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
/* Not transcoded (UTF-8 / US-ASCII / binary): input Ruby already knows is
|
|
264
|
+
* valid UTF-8 is borrowed in place (the caller keeps `html` alive);
|
|
265
|
+
* otherwise sanitise as before. */
|
|
266
|
+
if (mkr_ruby_str_known_valid_utf8(html)) {
|
|
267
|
+
*owned = NULL;
|
|
268
|
+
*out = (const lxb_char_t *)hv.ptr;
|
|
269
|
+
*out_len = hv.len;
|
|
270
|
+
return 0;
|
|
271
|
+
}
|
|
211
272
|
lxb_char_t *clean = NULL;
|
|
212
273
|
size_t clean_len = 0;
|
|
213
274
|
if (mkr_utf8_sanitize((const lxb_char_t *)hv.ptr, hv.len, &clean, &clean_len) != 0) {
|
|
214
|
-
RB_GC_GUARD(hv.value);
|
|
215
275
|
return -1;
|
|
216
276
|
}
|
|
217
277
|
*owned = clean;
|
|
218
278
|
*out = (clean != NULL) ? clean : (const lxb_char_t *)hv.ptr;
|
|
219
279
|
*out_len = (clean != NULL) ? clean_len : hv.len;
|
|
220
|
-
RB_GC_GUARD(hv.value);
|
|
221
280
|
return 0;
|
|
222
281
|
}
|
|
223
282
|
|
|
@@ -249,7 +308,7 @@ mkr_import_fragment_children(lxb_dom_document_t *doc, lxb_dom_node_t *root,
|
|
|
249
308
|
}
|
|
250
309
|
|
|
251
310
|
/* Node#clone_node(deep = false): a shallow (or deep, with deep truthy) copy of
|
|
252
|
-
* this node, owned by the same document and detached from any parent
|
|
311
|
+
* this node, owned by the same document and detached from any parent - the DOM
|
|
253
312
|
* cloneNode, whose `deep` defaults to false (a missing/nil/false argument =>
|
|
254
313
|
* shallow). Built on the same import_node + <template>-content fixup the
|
|
255
314
|
* fragment parser uses, so a deep-cloned <template> carries its contents (which
|
|
@@ -262,7 +321,7 @@ mkr_node_clone_node(int argc, VALUE *argv, VALUE self)
|
|
|
262
321
|
rb_scan_args(argc, argv, "01", &deep_v);
|
|
263
322
|
bool deep = RTEST(deep_v);
|
|
264
323
|
|
|
265
|
-
lxb_dom_node_t *node =
|
|
324
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
266
325
|
lxb_dom_document_t *doc = node->owner_document;
|
|
267
326
|
|
|
268
327
|
lxb_dom_node_t *clone = lxb_dom_document_import_node(doc, node, deep);
|
|
@@ -272,11 +331,11 @@ mkr_node_clone_node(int argc, VALUE *argv, VALUE self)
|
|
|
272
331
|
if (deep) {
|
|
273
332
|
mkr_fixup_template_content(doc, node, clone);
|
|
274
333
|
}
|
|
275
|
-
return
|
|
334
|
+
return mkr_wrap_html_node(clone, mkr_node_document(self));
|
|
276
335
|
}
|
|
277
336
|
|
|
278
337
|
/* Document#import_node(node, deep = false): a shallow (or deep, with deep
|
|
279
|
-
* truthy) copy of +node+ owned by THIS document
|
|
338
|
+
* truthy) copy of +node+ owned by THIS document - the DOM importNode, whose
|
|
280
339
|
* `deep` defaults to false (a missing/nil/false argument => shallow). Unlike
|
|
281
340
|
* Node#clone_node, the copy is owned by the receiver rather than the node's own
|
|
282
341
|
* document, so it is the way to bring a node across documents (Makiri never
|
|
@@ -290,8 +349,8 @@ mkr_doc_import_node(int argc, VALUE *argv, VALUE self)
|
|
|
290
349
|
rb_scan_args(argc, argv, "11", &node_v, &deep_v);
|
|
291
350
|
bool deep = RTEST(deep_v);
|
|
292
351
|
|
|
293
|
-
lxb_dom_node_t *src =
|
|
294
|
-
lxb_dom_document_t *doc =
|
|
352
|
+
lxb_dom_node_t *src = mkr_html_node_unwrap(node_v); /* reject an XML node before lxb use */
|
|
353
|
+
lxb_dom_document_t *doc = mkr_html_doc_unwrap(self);
|
|
295
354
|
|
|
296
355
|
lxb_dom_node_t *imp = lxb_dom_document_import_node(doc, src, deep);
|
|
297
356
|
if (imp == NULL) {
|
|
@@ -300,7 +359,7 @@ mkr_doc_import_node(int argc, VALUE *argv, VALUE self)
|
|
|
300
359
|
if (deep) {
|
|
301
360
|
mkr_fixup_template_content(doc, src, imp);
|
|
302
361
|
}
|
|
303
|
-
return
|
|
362
|
+
return mkr_wrap_html_node(imp, self);
|
|
304
363
|
}
|
|
305
364
|
|
|
306
365
|
/* Parse +rb_html+ as a fragment in the given (tag id, namespace) context and
|
|
@@ -315,7 +374,7 @@ mkr_build_fragment_ctx(VALUE document, VALUE rb_html,
|
|
|
315
374
|
lxb_tag_id_t ctx_tag, lxb_ns_id_t ctx_ns)
|
|
316
375
|
{
|
|
317
376
|
VALUE html = rb_String(rb_html);
|
|
318
|
-
lxb_dom_document_t *doc =
|
|
377
|
+
lxb_dom_document_t *doc = mkr_html_doc_unwrap(document);
|
|
319
378
|
|
|
320
379
|
lxb_dom_document_fragment_t *frag = lxb_dom_document_fragment_interface_create(doc);
|
|
321
380
|
if (frag == NULL) {
|
|
@@ -349,7 +408,7 @@ mkr_build_fragment_ctx(VALUE document, VALUE rb_html,
|
|
|
349
408
|
|
|
350
409
|
lxb_html_parser_destroy(parser);
|
|
351
410
|
RB_GC_GUARD(html);
|
|
352
|
-
return
|
|
411
|
+
return mkr_wrap_html_node(frag_node, document);
|
|
353
412
|
}
|
|
354
413
|
|
|
355
414
|
/* document.fragment(html, context: ...) -> DocumentFragment bound to this
|
|
@@ -363,7 +422,7 @@ mkr_doc_fragment(int argc, VALUE *argv, VALUE self)
|
|
|
363
422
|
: rb_hash_aref(opts, ID2SYM(rb_intern("context")));
|
|
364
423
|
lxb_tag_id_t tag;
|
|
365
424
|
lxb_ns_id_t ns;
|
|
366
|
-
mkr_resolve_fragment_context(
|
|
425
|
+
mkr_resolve_fragment_context(mkr_html_doc_unwrap(self), context, &tag, &ns);
|
|
367
426
|
return mkr_build_fragment_ctx(self, html, tag, ns);
|
|
368
427
|
}
|
|
369
428
|
|
|
@@ -379,14 +438,14 @@ mkr_frag_s_parse(int argc, VALUE *argv, VALUE klass)
|
|
|
379
438
|
: rb_hash_aref(opts, ID2SYM(rb_intern("context")));
|
|
380
439
|
|
|
381
440
|
static const lxb_char_t shell[] = "<html><body></body></html>";
|
|
382
|
-
mkr_parsed_t *parsed = mkr_parse_html(shell, sizeof(shell) - 1);
|
|
441
|
+
mkr_parsed_t *parsed = mkr_parse_html(shell, sizeof(shell) - 1, true);
|
|
383
442
|
if (parsed == NULL) {
|
|
384
443
|
rb_raise(mkr_eError, "failed to create fragment document");
|
|
385
444
|
}
|
|
386
445
|
VALUE document = mkr_wrap_document(parsed); /* GC now owns parsed */
|
|
387
446
|
lxb_tag_id_t tag;
|
|
388
447
|
lxb_ns_id_t ns;
|
|
389
|
-
mkr_resolve_fragment_context(
|
|
448
|
+
mkr_resolve_fragment_context(mkr_html_doc_unwrap(document), context, &tag, &ns);
|
|
390
449
|
return mkr_build_fragment_ctx(document, html, tag, ns);
|
|
391
450
|
}
|
|
392
451
|
|
|
@@ -396,7 +455,7 @@ mkr_frag_s_parse(int argc, VALUE *argv, VALUE klass)
|
|
|
396
455
|
static VALUE
|
|
397
456
|
mkr_node_parse(VALUE self, VALUE rb_html)
|
|
398
457
|
{
|
|
399
|
-
lxb_dom_node_t *node =
|
|
458
|
+
lxb_dom_node_t *node = mkr_html_node_unwrap(self);
|
|
400
459
|
if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
|
|
401
460
|
rb_raise(rb_eArgError, "Node#parse requires an element context");
|
|
402
461
|
}
|
|
@@ -415,6 +474,7 @@ mkr_node_parse(VALUE self, VALUE rb_html)
|
|
|
415
474
|
typedef struct {
|
|
416
475
|
const lxb_char_t *src;
|
|
417
476
|
size_t len;
|
|
477
|
+
bool assume_valid;
|
|
418
478
|
mkr_parsed_t *result;
|
|
419
479
|
} mkr_parse_nogvl_t;
|
|
420
480
|
|
|
@@ -425,7 +485,7 @@ static void *
|
|
|
425
485
|
mkr_parse_nogvl(void *p)
|
|
426
486
|
{
|
|
427
487
|
mkr_parse_nogvl_t *a = (mkr_parse_nogvl_t *)p;
|
|
428
|
-
a->result = mkr_parse_html(a->src, a->len);
|
|
488
|
+
a->result = mkr_parse_html(a->src, a->len, a->assume_valid);
|
|
429
489
|
return NULL;
|
|
430
490
|
}
|
|
431
491
|
|
|
@@ -440,26 +500,36 @@ static VALUE
|
|
|
440
500
|
mkr_doc_s_parse(VALUE klass, VALUE rb_source)
|
|
441
501
|
{
|
|
442
502
|
StringValue(rb_source);
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
*
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
*
|
|
453
|
-
*
|
|
454
|
-
*
|
|
455
|
-
*
|
|
503
|
+
/* Honour the input's encoding: UTF-8/US-ASCII/binary pass through (no
|
|
504
|
+
* degradation), anything else is transcoded to UTF-8 so its content is
|
|
505
|
+
* preserved rather than read as raw UTF-8 bytes. */
|
|
506
|
+
rb_source = mkr_ruby_to_utf8(rb_source);
|
|
507
|
+
|
|
508
|
+
/* Copy the source into a C buffer up front - BEFORE allocating the wrapper
|
|
509
|
+
* (a Ruby allocation, and thus a GC point) - so no GC can run between
|
|
510
|
+
* obtaining rb_source (possibly a fresh transcoded String) and copying its
|
|
511
|
+
* bytes, and the parse can then run with the GVL released without racing
|
|
512
|
+
* GC/compaction on the Ruby String's backing store. The source is not
|
|
513
|
+
* retained past the parse (Lexbor copies what it needs into the arena and
|
|
514
|
+
* the line table is built up front), so the buffer is freed immediately
|
|
515
|
+
* after. The coderange is read first (no scan): a source Ruby already knows
|
|
516
|
+
* is valid UTF-8 lets the parse skip its sanitisation scan. */
|
|
517
|
+
bool assume_valid = mkr_ruby_str_known_valid_utf8(rb_source);
|
|
456
518
|
mkr_owned_bytes_t source = {0};
|
|
457
519
|
if (mkr_ruby_copy_bytes(rb_source, &source) != 0) {
|
|
458
520
|
rb_raise(mkr_eError, "out of memory copying source");
|
|
459
521
|
}
|
|
460
|
-
RB_GC_GUARD(rb_source);
|
|
461
522
|
|
|
462
|
-
|
|
523
|
+
/* Allocate the wrapper (with parsed == NULL) so that if parsing fails the
|
|
524
|
+
* GC-managed object frees cleanly. This is the HTML parse entry (defined on
|
|
525
|
+
* Makiri::HTML::Document), so the result is always HTML. */
|
|
526
|
+
mkr_doc_data_t *d;
|
|
527
|
+
VALUE obj = TypedData_Make_Struct(klass, mkr_doc_data_t, &mkr_html_doc_type, d);
|
|
528
|
+
d->parsed = NULL;
|
|
529
|
+
d->errors = rb_ary_new();
|
|
530
|
+
|
|
531
|
+
mkr_parse_nogvl_t args = { (const lxb_char_t *)source.ptr, source.len,
|
|
532
|
+
assume_valid, NULL };
|
|
463
533
|
rb_thread_call_without_gvl(mkr_parse_nogvl, &args, NULL, NULL);
|
|
464
534
|
mkr_owned_bytes_clear(&source);
|
|
465
535
|
|
|
@@ -479,8 +549,8 @@ mkr_doc_s_parse(VALUE klass, VALUE rb_source)
|
|
|
479
549
|
static VALUE
|
|
480
550
|
mkr_doc_root(VALUE self)
|
|
481
551
|
{
|
|
482
|
-
lxb_dom_document_t *doc =
|
|
483
|
-
return
|
|
552
|
+
lxb_dom_document_t *doc = mkr_html_doc_unwrap(self);
|
|
553
|
+
return mkr_wrap_html_node(lxb_dom_document_root(doc), self);
|
|
484
554
|
}
|
|
485
555
|
|
|
486
556
|
/* Get the document <title>, or "" if absent. */
|
|
@@ -489,7 +559,7 @@ mkr_doc_title(VALUE self)
|
|
|
489
559
|
{
|
|
490
560
|
size_t len = 0;
|
|
491
561
|
const lxb_char_t *str =
|
|
492
|
-
lxb_html_document_title((lxb_html_document_t *)
|
|
562
|
+
lxb_html_document_title((lxb_html_document_t *)mkr_html_doc_unwrap(self), &len);
|
|
493
563
|
return (str == NULL) ? rb_utf8_str_new("", 0)
|
|
494
564
|
: rb_utf8_str_new((const char *)str, len);
|
|
495
565
|
}
|
|
@@ -500,10 +570,10 @@ mkr_doc_title(VALUE self)
|
|
|
500
570
|
static VALUE
|
|
501
571
|
mkr_doc_internal_subset(VALUE self)
|
|
502
572
|
{
|
|
503
|
-
lxb_dom_node_t *doc = (lxb_dom_node_t *)
|
|
573
|
+
lxb_dom_node_t *doc = (lxb_dom_node_t *)mkr_html_doc_unwrap(self);
|
|
504
574
|
for (lxb_dom_node_t *c = doc->first_child; c != NULL; c = c->next) {
|
|
505
575
|
if (c->type == LXB_DOM_NODE_TYPE_DOCUMENT_TYPE) {
|
|
506
|
-
return
|
|
576
|
+
return mkr_wrap_html_node(c, self);
|
|
507
577
|
}
|
|
508
578
|
}
|
|
509
579
|
return Qnil;
|
|
@@ -515,7 +585,7 @@ mkr_doc_internal_subset(VALUE self)
|
|
|
515
585
|
static VALUE
|
|
516
586
|
mkr_doc_quirks_mode(VALUE self)
|
|
517
587
|
{
|
|
518
|
-
return INT2NUM((int)
|
|
588
|
+
return INT2NUM((int)mkr_html_doc_unwrap(self)->compat_mode);
|
|
519
589
|
}
|
|
520
590
|
|
|
521
591
|
/* Parse warnings. Reserved; currently always empty. */
|
|
@@ -530,18 +600,18 @@ mkr_doc_errors(VALUE self)
|
|
|
530
600
|
void
|
|
531
601
|
mkr_init_document(void)
|
|
532
602
|
{
|
|
533
|
-
rb_define_singleton_method(
|
|
534
|
-
rb_define_method(
|
|
535
|
-
rb_define_method(
|
|
536
|
-
rb_define_method(
|
|
537
|
-
rb_define_method(
|
|
538
|
-
rb_define_method(
|
|
539
|
-
rb_define_method(
|
|
540
|
-
rb_define_method(
|
|
603
|
+
rb_define_singleton_method(mkr_cHtmlDocument, "_parse", mkr_doc_s_parse, 1);
|
|
604
|
+
rb_define_method(mkr_cHtmlDocument, "root", mkr_doc_root, 0);
|
|
605
|
+
rb_define_method(mkr_cHtmlDocument, "title", mkr_doc_title, 0);
|
|
606
|
+
rb_define_method(mkr_cHtmlDocument, "errors", mkr_doc_errors, 0);
|
|
607
|
+
rb_define_method(mkr_cHtmlDocument, "internal_subset", mkr_doc_internal_subset, 0);
|
|
608
|
+
rb_define_method(mkr_cHtmlDocument, "quirks_mode", mkr_doc_quirks_mode, 0);
|
|
609
|
+
rb_define_method(mkr_cHtmlDocument, "fragment", mkr_doc_fragment, -1);
|
|
610
|
+
rb_define_method(mkr_cHtmlDocument, "import_node", mkr_doc_import_node, -1);
|
|
541
611
|
|
|
542
612
|
rb_define_singleton_method(mkr_cDocumentFragment, "parse", mkr_frag_s_parse, -1);
|
|
543
613
|
|
|
544
614
|
/* Node#parse(html): fragment-parse in this element's context (Nokogiri
|
|
545
615
|
* compatible). Defined here, next to the fragment machinery it reuses. */
|
|
546
|
-
rb_define_method(
|
|
616
|
+
rb_define_method(mkr_mHtmlNodeMethods, "parse", mkr_node_parse, 1);
|
|
547
617
|
}
|