makiri 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/conformance.yml +22 -0
  3. data/.github/workflows/libfuzzer.yml +83 -0
  4. data/.github/workflows/release.yml +12 -7
  5. data/.github/workflows/security.yml +88 -3
  6. data/.github/workflows/valgrind.yml +135 -0
  7. data/CHANGELOG.md +152 -15
  8. data/README.md +183 -13
  9. data/Rakefile +294 -7
  10. data/ext/makiri/bridge/bridge.h +28 -0
  11. data/ext/makiri/bridge/ruby_string.c +282 -12
  12. data/ext/makiri/core/mkr_alloc.c +40 -3
  13. data/ext/makiri/core/mkr_alloc.h +28 -5
  14. data/ext/makiri/core/mkr_buf.c +47 -3
  15. data/ext/makiri/core/mkr_buf.h +112 -3
  16. data/ext/makiri/core/mkr_core.c +143 -0
  17. data/ext/makiri/core/mkr_core.h +11 -2
  18. data/ext/makiri/core/mkr_hash.h +1 -1
  19. data/ext/makiri/core/mkr_span.h +186 -0
  20. data/ext/makiri/core/mkr_text.h +8 -8
  21. data/ext/makiri/core/mkr_utf8.c +101 -0
  22. data/ext/makiri/core/mkr_utf8.h +88 -0
  23. data/ext/makiri/extconf.rb +123 -10
  24. data/ext/makiri/fuzz/Makefile +95 -0
  25. data/ext/makiri/fuzz/check_fuzzer.cc +4 -0
  26. data/ext/makiri/fuzz/xml_fuzz.c +24 -0
  27. data/ext/makiri/fuzz/xpath_fuzz.c +109 -0
  28. data/ext/makiri/glue/glue.h +55 -11
  29. data/ext/makiri/glue/ruby_doc.c +129 -59
  30. data/ext/makiri/glue/ruby_html_css.c +292 -0
  31. data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +248 -52
  32. data/ext/makiri/glue/ruby_html_node.c +859 -0
  33. data/ext/makiri/glue/ruby_html_serialize.c +154 -0
  34. data/ext/makiri/glue/ruby_node.c +74 -729
  35. data/ext/makiri/glue/ruby_node_set.c +167 -32
  36. data/ext/makiri/glue/ruby_xml.c +602 -0
  37. data/ext/makiri/glue/ruby_xml_node.c +1373 -0
  38. data/ext/makiri/glue/ruby_xpath.c +63 -30
  39. data/ext/makiri/glue/ruby_xpath.h +19 -0
  40. data/ext/makiri/lexbor_compat/compat.h +42 -9
  41. data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
  42. data/ext/makiri/lexbor_compat/dom_index.c +2 -2
  43. data/ext/makiri/lexbor_compat/post_parse.c +100 -10
  44. data/ext/makiri/lexbor_compat/source_loc.c +15 -13
  45. data/ext/makiri/lexbor_compat/text_index.c +14 -8
  46. data/ext/makiri/lexbor_compat/utf8_input.c +19 -33
  47. data/ext/makiri/makiri.c +184 -6
  48. data/ext/makiri/makiri.h +43 -2
  49. data/ext/makiri/xml/mkr_xml.h +125 -0
  50. data/ext/makiri/xml/mkr_xml_chars.c +195 -0
  51. data/ext/makiri/xml/mkr_xml_index.c +169 -0
  52. data/ext/makiri/xml/mkr_xml_index.h +48 -0
  53. data/ext/makiri/xml/mkr_xml_mutate.c +817 -0
  54. data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
  55. data/ext/makiri/xml/mkr_xml_node.c +399 -0
  56. data/ext/makiri/xml/mkr_xml_node.h +184 -0
  57. data/ext/makiri/xml/mkr_xml_tree.c +1515 -0
  58. data/ext/makiri/xpath/mkr_css.c +1023 -0
  59. data/ext/makiri/xpath/mkr_css.h +65 -0
  60. data/ext/makiri/xpath/mkr_xpath.c +96 -32
  61. data/ext/makiri/xpath/mkr_xpath.h +109 -4
  62. data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
  63. data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
  64. data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +551 -241
  65. data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +318 -276
  66. data/ext/makiri/xpath/mkr_xpath_internal.h +177 -206
  67. data/ext/makiri/xpath/mkr_xpath_lex.c +95 -125
  68. data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
  69. data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +145 -0
  70. data/ext/makiri/xpath/mkr_xpath_number.c +109 -0
  71. data/ext/makiri/xpath/mkr_xpath_parse.c +83 -94
  72. data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
  73. data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
  74. data/ext/makiri/xpath/mkr_xpath_shared.c +609 -0
  75. data/ext/makiri/xpath/mkr_xpath_value_body.h +801 -0
  76. data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
  77. data/lib/makiri/{attribute.rb → attr.rb} +7 -3
  78. data/lib/makiri/cdata_section.rb +19 -0
  79. data/lib/makiri/comment.rb +10 -0
  80. data/lib/makiri/compat_aliases.rb +30 -0
  81. data/lib/makiri/document.rb +9 -73
  82. data/lib/makiri/document_fragment.rb +14 -9
  83. data/lib/makiri/element.rb +4 -4
  84. data/lib/makiri/html/document.rb +106 -0
  85. data/lib/makiri/html/node_methods.rb +19 -0
  86. data/lib/makiri/html.rb +12 -0
  87. data/lib/makiri/node.rb +58 -15
  88. data/lib/makiri/node_set.rb +8 -0
  89. data/lib/makiri/processing_instruction.rb +10 -0
  90. data/lib/makiri/text.rb +1 -1
  91. data/lib/makiri/version.rb +1 -1
  92. data/lib/makiri/xml/builder.rb +263 -0
  93. data/lib/makiri/xml/document.rb +24 -0
  94. data/lib/makiri/xml/node_methods.rb +84 -0
  95. data/lib/makiri/xml.rb +10 -0
  96. data/lib/makiri/xpath_context.rb +1 -1
  97. data/lib/makiri.rb +24 -5
  98. data/script/build_native_gem.rb +2 -2
  99. data/script/check_alloc_failures.rb +266 -0
  100. data/script/check_c_safety.rb +77 -2
  101. data/script/check_c_safety_allowlist.yml +102 -0
  102. data/script/check_leaks.rb +64 -0
  103. data/script/leaks_harness.rb +64 -0
  104. data/vendor/lexbor/CMakeLists.txt +6 -0
  105. data/vendor/lexbor/README.md +12 -0
  106. data/vendor/lexbor/config.cmake +1 -1
  107. data/vendor/lexbor/source/lexbor/core/base.h +1 -1
  108. data/vendor/lexbor/source/lexbor/core/config.cmake +9 -1
  109. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +2 -3
  110. data/vendor/lexbor/source/lexbor/css/selectors/state.c +3 -0
  111. data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +21 -0
  112. data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +5 -0
  113. data/vendor/lexbor/source/lexbor/encoding/decode.c +33 -4
  114. data/vendor/lexbor/source/lexbor/html/base.h +1 -1
  115. data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +4 -0
  116. data/vendor/lexbor/source/lexbor/html/serialize.c +545 -41
  117. data/vendor/lexbor/source/lexbor/html/serialize.h +2 -1
  118. data/vendor/lexbor/source/lexbor/html/tokenizer.h +2 -2
  119. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1 -1
  120. data/vendor/lexbor/source/lexbor/html/tree.c +6 -6
  121. data/vendor/lexbor/source/lexbor/selectors/selectors.c +12 -3
  122. data/vendor/lexbor/source/lexbor/url/base.h +1 -1
  123. data/vendor/lexbor/source/lexbor/url/url.c +5 -2
  124. data/vendor/lexbor/source/lexbor/url/url.h +9 -0
  125. data/vendor/lexbor/version +1 -1
  126. metadata +53 -9
  127. data/ext/makiri/glue/ruby_css.c +0 -185
  128. data/ext/makiri/glue/ruby_serialize.c +0 -92
  129. data/ext/makiri/xpath/mkr_xpath_value.c +0 -1286
  130. data/lib/makiri/cdata.rb +0 -6
@@ -8,11 +8,24 @@
8
8
  extern "C" {
9
9
  #endif
10
10
 
11
+ /* A DOM node pointer of UNKNOWN representation - an HTML lxb_dom_node_t or an XML
12
+ * mkr_xml_node_t - as stored in a node wrapper or a NodeSet. It is an INCOMPLETE
13
+ * type on purpose: it cannot be dereferenced, and (unlike void*) it does not
14
+ * implicitly convert to a typed pointer, so reading a stored node AS a specific
15
+ * representation requires an explicit cast that the kind-checked accessors
16
+ * (mkr_html_node_unwrap / mkr_xml_node_unwrap) justify by the wrapper's TypedData type
17
+ * (or, for a NodeSet, by doc_is_xml). The stored pointer is only ever
18
+ * pointer-compared or cast through one of those accessors. */
19
+ typedef struct mkr_raw_node mkr_raw_node_t;
20
+
11
21
  /* Wrapper for any DOM node except Document. The node memory is owned by the
12
- * document's Lexbor arena; we keep only the pointer plus a keepalive VALUE
13
- * reference to the Ruby Document so the arena outlives the wrapper. */
22
+ * document's arena (an HTML Lexbor arena or the XML node arena); we keep only the
23
+ * pointer plus a keepalive VALUE reference to the Ruby Document so the arena
24
+ * outlives the wrapper. The pointer is representation-opaque (mkr_raw_node_t):
25
+ * read it only through mkr_html_node_unwrap / mkr_xml_node_unwrap, which check the
26
+ * wrapper's representation (distinct TypedData types) before casting. */
14
27
  typedef struct {
15
- lxb_dom_node_t *node;
28
+ mkr_raw_node_t *node;
16
29
  VALUE document;
17
30
  } mkr_node_data_t;
18
31
 
@@ -31,14 +44,43 @@ extern const rb_data_type_t mkr_node_type;
31
44
  extern const rb_data_type_t mkr_doc_type;
32
45
  extern const rb_data_type_t mkr_node_set_type;
33
46
 
34
- /* Node bridge (glue/ruby_node.c). mkr_wrap_node returns the Document VALUE
47
+ /* Node bridge (glue/ruby_node.c). mkr_wrap_html_node returns the Document VALUE
35
48
  * for the document node, Qnil for NULL, otherwise a freshly-wrapped Node. */
36
- VALUE mkr_wrap_node(lxb_dom_node_t *node, VALUE document);
37
- lxb_dom_node_t *mkr_node_unwrap(VALUE rb_node);
49
+ VALUE mkr_wrap_html_node(lxb_dom_node_t *node, VALUE document);
38
50
  VALUE mkr_node_document(VALUE rb_node);
39
51
 
52
+ /* HTML and XML nodes are wrapped under DISTINCT TypedData types (both deriving
53
+ * from the shared base mkr_node_type), so a representation-specific accessor
54
+ * rejects the wrong kind via Ruby's type machinery. See ruby_node.c.
55
+ * mkr_html_node_unwrap -> lxb_dom_node_t* ; raises on an XML node/Document.
56
+ * mkr_xml_node_unwrap -> mkr_xml_node_t* ; raises on an HTML node/Document (ruby_xml_node.c).
57
+ * mkr_node_raw -> void* ; kind-agnostic raw pointer for identity, or for a
58
+ * site where the kind is already guaranteed. Deref needs an
59
+ * explicit cast - never treat it as a typed pointer blindly.
60
+ * mkr_node_id -> uintptr_t ; node identity for ==/eql?/hash/pointer_id. */
61
+ extern const rb_data_type_t mkr_html_node_type;
62
+ extern const rb_data_type_t mkr_xml_node_type;
63
+ lxb_dom_node_t *mkr_html_node_unwrap(VALUE rb_node);
64
+ void *mkr_node_raw(VALUE rb_node);
65
+ uintptr_t mkr_node_id(VALUE rb_node);
66
+
67
+ /* Representation-neutral identity methods (ruby_node.c): depend only on
68
+ * mkr_node_id, so the HTML and XML NodeMethods modules bind ==/eql? to
69
+ * mkr_node_equals, hash to mkr_node_hash, and pointer_id to mkr_node_pointer_id -
70
+ * one implementation, not one per representation. */
71
+ VALUE mkr_node_equals(VALUE self, VALUE other);
72
+ VALUE mkr_node_pointer_id(VALUE self);
73
+ VALUE mkr_node_hash(VALUE self);
74
+
75
+ /* XML node bridge (glue/ruby_xml_node.c): wrap a custom XML node into the right
76
+ * Makiri::XML::* leaf (Qnil for NULL, the Document VALUE for the document node). */
77
+ struct mkr_xml_node;
78
+ VALUE mkr_wrap_xml_node(struct mkr_xml_node *node, VALUE document);
79
+ /* XML node-pointer accessor; raises TypeError on an HTML node/Document. */
80
+ struct mkr_xml_node *mkr_xml_node_unwrap(VALUE rb_node);
81
+
40
82
  /* Document bridge (glue/ruby_doc.c). */
41
- lxb_dom_document_t *mkr_doc_unwrap(VALUE rb_doc);
83
+ lxb_dom_document_t *mkr_html_doc_unwrap(VALUE rb_doc);
42
84
  mkr_parsed_t *mkr_doc_parsed(VALUE rb_doc);
43
85
  VALUE mkr_wrap_document(mkr_parsed_t *parsed); /* GC takes ownership */
44
86
 
@@ -46,7 +88,7 @@ VALUE mkr_wrap_document(mkr_parsed_t *parsed); /* GC takes ownersh
46
88
  * inner_html=/outer_html= so the UTF-8 sanitisation and import+template-fixup
47
89
  * are not duplicated.
48
90
  *
49
- * mkr_sanitize_html_input: decode rb_html for the fragment parser *out / *out_len
91
+ * mkr_sanitize_html_input: decode rb_html for the fragment parser - *out / *out_len
50
92
  * are the bytes to parse, *owned a malloc'd buffer to free afterwards (NULL when
51
93
  * the input is used in place). Returns 0, or -1 on OOM (nothing allocated), so
52
94
  * the caller can release its parser before raising. See mkr_utf8_sanitize.
@@ -54,7 +96,7 @@ VALUE mkr_wrap_document(mkr_parsed_t *parsed); /* GC takes ownersh
54
96
  * mkr_import_fragment_children: deep-import each child of `root` into `doc`, hand
55
97
  * it to `emit`, and fix up any <template> contents (which import_node omits).
56
98
  *
57
- * mkr_emit_append / mkr_emit_before: emit callbacks append as last child of
99
+ * mkr_emit_append / mkr_emit_before: emit callbacks - append as last child of
58
100
  * `u`, or insert before the reference node `u`. */
59
101
  int mkr_sanitize_html_input(VALUE html, const lxb_char_t **out, size_t *out_len,
60
102
  lxb_char_t **owned);
@@ -69,9 +111,11 @@ void mkr_emit_before(lxb_dom_node_t *imported, void *u);
69
111
  * mkr_init_node. */
70
112
  VALUE mkr_node_clone_node(int argc, VALUE *argv, VALUE self);
71
113
 
72
- /* NodeSet bridge (glue/ruby_node_set.c). */
114
+ /* NodeSet bridge (glue/ruby_node_set.c). mkr_raw_node_t (above): callers cast
115
+ * their typed node to it when pushing (forgetting the type is the safe, store
116
+ * direction); the single typed read-back lives in mkr_node_set_wrap. */
73
117
  VALUE mkr_node_set_new(VALUE document);
74
- void mkr_node_set_push(VALUE rb_set, lxb_dom_node_t *node);
118
+ void mkr_node_set_push(VALUE rb_set, mkr_raw_node_t *node);
75
119
 
76
120
  #ifdef __cplusplus
77
121
  }
@@ -1,6 +1,7 @@
1
1
  #include "glue.h"
2
2
  #include "../lexbor_compat/compat_internal.h" /* mkr_dom_preorder_next */
3
3
  #include "../core/mkr_core.h"
4
+ #include "../xml/mkr_xml.h" /* mkr_xml_doc_memsize for an XML-backed document */
4
5
 
5
6
  #include <lexbor/html/parser.h>
6
7
  #include <ruby/thread.h>
@@ -32,23 +33,47 @@ mkr_doc_free(void *ptr)
32
33
  static size_t
33
34
  mkr_doc_memsize(const void *ptr)
34
35
  {
35
- /* The DOM arena size is not cheaply queryable; report the wrapper only. */
36
- (void)ptr;
37
- return sizeof(mkr_doc_data_t);
36
+ const mkr_doc_data_t *d = (const mkr_doc_data_t *)ptr;
37
+ size_t total = sizeof(mkr_doc_data_t);
38
+ /* The Lexbor (HTML) arena size is not cheaply queryable; report the wrapper
39
+ * only. The XML arena tracks its own byte total, so include it. */
40
+ if (d->parsed != NULL && mkr_parsed_kind(d->parsed) == MKR_DOC_XML) {
41
+ total += mkr_xml_doc_memsize(mkr_parsed_xml_doc(d->parsed));
42
+ }
43
+ return total;
38
44
  }
39
45
 
46
+ /* Like nodes, HTML and XML Documents share the mkr_doc_data_t layout and GC
47
+ * functions but are wrapped under DISTINCT TypedData types (both deriving from
48
+ * the shared base mkr_doc_type), so mkr_html_doc_unwrap - which reinterprets the
49
+ * parsed document as a Lexbor lxb_html_document_t - RAISES TypeError on an XML
50
+ * Document via Ruby's type machinery, instead of relying on the (NDEBUG-erased)
51
+ * assert in mkr_parsed_html_doc. mkr_doc_type (base) is kept for the kind-agnostic
52
+ * accessors (mkr_doc_parsed, #errors) that legitimately accept either. */
40
53
  const rb_data_type_t mkr_doc_type = {
41
54
  "Makiri::Document",
42
55
  { mkr_doc_mark, mkr_doc_free, mkr_doc_memsize, },
43
56
  0, 0, RUBY_TYPED_FREE_IMMEDIATELY,
44
57
  };
58
+ static const rb_data_type_t mkr_html_doc_type = {
59
+ "Makiri::HTML::Document",
60
+ { mkr_doc_mark, mkr_doc_free, mkr_doc_memsize, },
61
+ &mkr_doc_type, 0, RUBY_TYPED_FREE_IMMEDIATELY,
62
+ };
63
+ static const rb_data_type_t mkr_xml_doc_type = {
64
+ "Makiri::XML::Document",
65
+ { mkr_doc_mark, mkr_doc_free, mkr_doc_memsize, },
66
+ &mkr_doc_type, 0, RUBY_TYPED_FREE_IMMEDIATELY,
67
+ };
45
68
 
46
69
  lxb_dom_document_t *
47
- mkr_doc_unwrap(VALUE rb_doc)
70
+ mkr_html_doc_unwrap(VALUE rb_doc)
48
71
  {
49
72
  mkr_doc_data_t *d;
50
- TypedData_Get_Struct(rb_doc, mkr_doc_data_t, &mkr_doc_type, d);
51
- return (lxb_dom_document_t *)d->parsed->doc;
73
+ /* mkr_html_doc_type rejects an XML Document at the type boundary (its type
74
+ * chain does not include mkr_html_doc_type). */
75
+ TypedData_Get_Struct(rb_doc, mkr_doc_data_t, &mkr_html_doc_type, d);
76
+ return (lxb_dom_document_t *)mkr_parsed_html_doc(d->parsed);
52
77
  }
53
78
 
54
79
  mkr_parsed_t *
@@ -59,13 +84,19 @@ mkr_doc_parsed(VALUE rb_doc)
59
84
  return d->parsed;
60
85
  }
61
86
 
62
- /* Wrap an owned mkr_parsed_t as a Makiri::Document. GC takes ownership of
63
- * +parsed+ (freed in dfree). Used to back a standalone DocumentFragment. */
87
+ /* Wrap an owned mkr_parsed_t as a Document. GC takes ownership of +parsed+
88
+ * (freed in dfree). The Ruby leaf class is chosen by kind: a Lexbor-backed
89
+ * handle becomes Makiri::Document (HTML), an arena-backed one
90
+ * Makiri::XML::Document (§2.3). Used to back a parsed document or a standalone
91
+ * DocumentFragment. */
64
92
  VALUE
65
93
  mkr_wrap_document(mkr_parsed_t *parsed)
66
94
  {
95
+ int is_xml = (mkr_parsed_kind(parsed) == MKR_DOC_XML);
96
+ VALUE klass = is_xml ? mkr_cXmlDocument : mkr_cHtmlDocument;
67
97
  mkr_doc_data_t *d;
68
- VALUE obj = TypedData_Make_Struct(mkr_cDocument, mkr_doc_data_t, &mkr_doc_type, d);
98
+ VALUE obj = TypedData_Make_Struct(klass, mkr_doc_data_t,
99
+ is_xml ? &mkr_xml_doc_type : &mkr_html_doc_type, d);
69
100
  d->parsed = parsed;
70
101
  d->errors = rb_ary_new();
71
102
  return obj;
@@ -95,7 +126,7 @@ mkr_resolve_fragment_context(lxb_dom_document_t *doc, VALUE context,
95
126
  }
96
127
 
97
128
  if (rb_obj_is_kind_of(context, mkr_cNode)) {
98
- lxb_dom_node_t *cn = mkr_node_unwrap(context);
129
+ lxb_dom_node_t *cn = mkr_html_node_unwrap(context); /* reject an XML node before lxb use */
99
130
  if (cn->type != LXB_DOM_NODE_TYPE_ELEMENT) {
100
131
  rb_raise(rb_eArgError, "fragment context node must be an element");
101
132
  }
@@ -109,10 +140,10 @@ mkr_resolve_fragment_context(lxb_dom_document_t *doc, VALUE context,
109
140
  mkr_ruby_borrowed_text_t cv = mkr_ruby_verified_text(context, "fragment context element");
110
141
  const lxb_char_t *p = (const lxb_char_t *)cv.ptr;
111
142
  size_t n = cv.len;
112
- if (n == 3 && memcmp(p, "svg", 3) == 0) {
143
+ if (mkr_bytes_eq(p, n, "svg", 3)) {
113
144
  *out_tag = LXB_TAG_SVG; *out_ns = LXB_NS_SVG; return;
114
145
  }
115
- if (n == 4 && memcmp(p, "math", 4) == 0) {
146
+ if (mkr_bytes_eq(p, n, "math", 4)) {
116
147
  *out_tag = LXB_TAG_MATH; *out_ns = LXB_NS_MATH; return;
117
148
  }
118
149
  lxb_tag_id_t tid = lxb_tag_id_by_name(doc->tags, p, n);
@@ -207,17 +238,45 @@ mkr_sanitize_html_input(VALUE html, const lxb_char_t **out, size_t *out_len,
207
238
  /* Browser-compatible decoding: invalid UTF-8 -> U+FFFD; valid input is used
208
239
  * in place (no copy, *owned == NULL). Returns -1 on OOM (nothing allocated)
209
240
  * so the caller can release its parser before raising. */
210
- mkr_ruby_borrowed_bytes_t hv = mkr_ruby_bytes_view(html);
241
+ VALUE u8 = mkr_ruby_to_utf8(html); /* honour the input encoding (-> UTF-8) */
242
+ mkr_ruby_borrowed_bytes_t hv = mkr_ruby_bytes_view(u8);
243
+
244
+ if (u8 != html) {
245
+ /* Transcoded to UTF-8: a fresh String that nothing keeps alive past this
246
+ * return, so we must NOT borrow its bytes. It is already valid UTF-8, so
247
+ * copy it into an owned buffer (the caller frees *owned) - no sanitise. */
248
+ size_t n = (hv.len > 0) ? hv.len : 1;
249
+ char *buf = mkr_reallocarray(NULL, n, 1);
250
+ if (buf == NULL) {
251
+ return -1;
252
+ }
253
+ if (hv.len > 0) {
254
+ memcpy(buf, hv.ptr, hv.len);
255
+ }
256
+ *owned = (lxb_char_t *)buf;
257
+ *out = (const lxb_char_t *)buf;
258
+ *out_len = hv.len;
259
+ RB_GC_GUARD(hv.value);
260
+ return 0;
261
+ }
262
+
263
+ /* Not transcoded (UTF-8 / US-ASCII / binary): input Ruby already knows is
264
+ * valid UTF-8 is borrowed in place (the caller keeps `html` alive);
265
+ * otherwise sanitise as before. */
266
+ if (mkr_ruby_str_known_valid_utf8(html)) {
267
+ *owned = NULL;
268
+ *out = (const lxb_char_t *)hv.ptr;
269
+ *out_len = hv.len;
270
+ return 0;
271
+ }
211
272
  lxb_char_t *clean = NULL;
212
273
  size_t clean_len = 0;
213
274
  if (mkr_utf8_sanitize((const lxb_char_t *)hv.ptr, hv.len, &clean, &clean_len) != 0) {
214
- RB_GC_GUARD(hv.value);
215
275
  return -1;
216
276
  }
217
277
  *owned = clean;
218
278
  *out = (clean != NULL) ? clean : (const lxb_char_t *)hv.ptr;
219
279
  *out_len = (clean != NULL) ? clean_len : hv.len;
220
- RB_GC_GUARD(hv.value);
221
280
  return 0;
222
281
  }
223
282
 
@@ -249,7 +308,7 @@ mkr_import_fragment_children(lxb_dom_document_t *doc, lxb_dom_node_t *root,
249
308
  }
250
309
 
251
310
  /* Node#clone_node(deep = false): a shallow (or deep, with deep truthy) copy of
252
- * this node, owned by the same document and detached from any parent the DOM
311
+ * this node, owned by the same document and detached from any parent - the DOM
253
312
  * cloneNode, whose `deep` defaults to false (a missing/nil/false argument =>
254
313
  * shallow). Built on the same import_node + <template>-content fixup the
255
314
  * fragment parser uses, so a deep-cloned <template> carries its contents (which
@@ -262,7 +321,7 @@ mkr_node_clone_node(int argc, VALUE *argv, VALUE self)
262
321
  rb_scan_args(argc, argv, "01", &deep_v);
263
322
  bool deep = RTEST(deep_v);
264
323
 
265
- lxb_dom_node_t *node = mkr_node_unwrap(self);
324
+ lxb_dom_node_t *node = mkr_html_node_unwrap(self);
266
325
  lxb_dom_document_t *doc = node->owner_document;
267
326
 
268
327
  lxb_dom_node_t *clone = lxb_dom_document_import_node(doc, node, deep);
@@ -272,11 +331,11 @@ mkr_node_clone_node(int argc, VALUE *argv, VALUE self)
272
331
  if (deep) {
273
332
  mkr_fixup_template_content(doc, node, clone);
274
333
  }
275
- return mkr_wrap_node(clone, mkr_node_document(self));
334
+ return mkr_wrap_html_node(clone, mkr_node_document(self));
276
335
  }
277
336
 
278
337
  /* Document#import_node(node, deep = false): a shallow (or deep, with deep
279
- * truthy) copy of +node+ owned by THIS document the DOM importNode, whose
338
+ * truthy) copy of +node+ owned by THIS document - the DOM importNode, whose
280
339
  * `deep` defaults to false (a missing/nil/false argument => shallow). Unlike
281
340
  * Node#clone_node, the copy is owned by the receiver rather than the node's own
282
341
  * document, so it is the way to bring a node across documents (Makiri never
@@ -290,8 +349,8 @@ mkr_doc_import_node(int argc, VALUE *argv, VALUE self)
290
349
  rb_scan_args(argc, argv, "11", &node_v, &deep_v);
291
350
  bool deep = RTEST(deep_v);
292
351
 
293
- lxb_dom_node_t *src = mkr_node_unwrap(node_v);
294
- lxb_dom_document_t *doc = mkr_doc_unwrap(self);
352
+ lxb_dom_node_t *src = mkr_html_node_unwrap(node_v); /* reject an XML node before lxb use */
353
+ lxb_dom_document_t *doc = mkr_html_doc_unwrap(self);
295
354
 
296
355
  lxb_dom_node_t *imp = lxb_dom_document_import_node(doc, src, deep);
297
356
  if (imp == NULL) {
@@ -300,7 +359,7 @@ mkr_doc_import_node(int argc, VALUE *argv, VALUE self)
300
359
  if (deep) {
301
360
  mkr_fixup_template_content(doc, src, imp);
302
361
  }
303
- return mkr_wrap_node(imp, self);
362
+ return mkr_wrap_html_node(imp, self);
304
363
  }
305
364
 
306
365
  /* Parse +rb_html+ as a fragment in the given (tag id, namespace) context and
@@ -315,7 +374,7 @@ mkr_build_fragment_ctx(VALUE document, VALUE rb_html,
315
374
  lxb_tag_id_t ctx_tag, lxb_ns_id_t ctx_ns)
316
375
  {
317
376
  VALUE html = rb_String(rb_html);
318
- lxb_dom_document_t *doc = mkr_doc_unwrap(document);
377
+ lxb_dom_document_t *doc = mkr_html_doc_unwrap(document);
319
378
 
320
379
  lxb_dom_document_fragment_t *frag = lxb_dom_document_fragment_interface_create(doc);
321
380
  if (frag == NULL) {
@@ -349,7 +408,7 @@ mkr_build_fragment_ctx(VALUE document, VALUE rb_html,
349
408
 
350
409
  lxb_html_parser_destroy(parser);
351
410
  RB_GC_GUARD(html);
352
- return mkr_wrap_node(frag_node, document);
411
+ return mkr_wrap_html_node(frag_node, document);
353
412
  }
354
413
 
355
414
  /* document.fragment(html, context: ...) -> DocumentFragment bound to this
@@ -363,7 +422,7 @@ mkr_doc_fragment(int argc, VALUE *argv, VALUE self)
363
422
  : rb_hash_aref(opts, ID2SYM(rb_intern("context")));
364
423
  lxb_tag_id_t tag;
365
424
  lxb_ns_id_t ns;
366
- mkr_resolve_fragment_context(mkr_doc_unwrap(self), context, &tag, &ns);
425
+ mkr_resolve_fragment_context(mkr_html_doc_unwrap(self), context, &tag, &ns);
367
426
  return mkr_build_fragment_ctx(self, html, tag, ns);
368
427
  }
369
428
 
@@ -379,14 +438,14 @@ mkr_frag_s_parse(int argc, VALUE *argv, VALUE klass)
379
438
  : rb_hash_aref(opts, ID2SYM(rb_intern("context")));
380
439
 
381
440
  static const lxb_char_t shell[] = "<html><body></body></html>";
382
- mkr_parsed_t *parsed = mkr_parse_html(shell, sizeof(shell) - 1);
441
+ mkr_parsed_t *parsed = mkr_parse_html(shell, sizeof(shell) - 1, true);
383
442
  if (parsed == NULL) {
384
443
  rb_raise(mkr_eError, "failed to create fragment document");
385
444
  }
386
445
  VALUE document = mkr_wrap_document(parsed); /* GC now owns parsed */
387
446
  lxb_tag_id_t tag;
388
447
  lxb_ns_id_t ns;
389
- mkr_resolve_fragment_context(mkr_doc_unwrap(document), context, &tag, &ns);
448
+ mkr_resolve_fragment_context(mkr_html_doc_unwrap(document), context, &tag, &ns);
390
449
  return mkr_build_fragment_ctx(document, html, tag, ns);
391
450
  }
392
451
 
@@ -396,7 +455,7 @@ mkr_frag_s_parse(int argc, VALUE *argv, VALUE klass)
396
455
  static VALUE
397
456
  mkr_node_parse(VALUE self, VALUE rb_html)
398
457
  {
399
- lxb_dom_node_t *node = mkr_node_unwrap(self);
458
+ lxb_dom_node_t *node = mkr_html_node_unwrap(self);
400
459
  if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
401
460
  rb_raise(rb_eArgError, "Node#parse requires an element context");
402
461
  }
@@ -415,6 +474,7 @@ mkr_node_parse(VALUE self, VALUE rb_html)
415
474
  typedef struct {
416
475
  const lxb_char_t *src;
417
476
  size_t len;
477
+ bool assume_valid;
418
478
  mkr_parsed_t *result;
419
479
  } mkr_parse_nogvl_t;
420
480
 
@@ -425,7 +485,7 @@ static void *
425
485
  mkr_parse_nogvl(void *p)
426
486
  {
427
487
  mkr_parse_nogvl_t *a = (mkr_parse_nogvl_t *)p;
428
- a->result = mkr_parse_html(a->src, a->len);
488
+ a->result = mkr_parse_html(a->src, a->len, a->assume_valid);
429
489
  return NULL;
430
490
  }
431
491
 
@@ -440,26 +500,36 @@ static VALUE
440
500
  mkr_doc_s_parse(VALUE klass, VALUE rb_source)
441
501
  {
442
502
  StringValue(rb_source);
443
-
444
- /* Allocate the wrapper first (with parsed == NULL) so that if parsing
445
- * fails the GC-managed object frees cleanly. */
446
- mkr_doc_data_t *d;
447
- VALUE obj = TypedData_Make_Struct(klass, mkr_doc_data_t, &mkr_doc_type, d);
448
- d->parsed = NULL;
449
- d->errors = rb_ary_new();
450
-
451
- /* Copy the source into a C buffer so the parse can run with the GVL
452
- * released without racing GC/compaction on the Ruby String's backing
453
- * store. The source is not retained past the parse (Lexbor copies what it
454
- * needs into the arena and the line table is built up front), so the
455
- * buffer is freed immediately after. */
503
+ /* Honour the input's encoding: UTF-8/US-ASCII/binary pass through (no
504
+ * degradation), anything else is transcoded to UTF-8 so its content is
505
+ * preserved rather than read as raw UTF-8 bytes. */
506
+ rb_source = mkr_ruby_to_utf8(rb_source);
507
+
508
+ /* Copy the source into a C buffer up front - BEFORE allocating the wrapper
509
+ * (a Ruby allocation, and thus a GC point) - so no GC can run between
510
+ * obtaining rb_source (possibly a fresh transcoded String) and copying its
511
+ * bytes, and the parse can then run with the GVL released without racing
512
+ * GC/compaction on the Ruby String's backing store. The source is not
513
+ * retained past the parse (Lexbor copies what it needs into the arena and
514
+ * the line table is built up front), so the buffer is freed immediately
515
+ * after. The coderange is read first (no scan): a source Ruby already knows
516
+ * is valid UTF-8 lets the parse skip its sanitisation scan. */
517
+ bool assume_valid = mkr_ruby_str_known_valid_utf8(rb_source);
456
518
  mkr_owned_bytes_t source = {0};
457
519
  if (mkr_ruby_copy_bytes(rb_source, &source) != 0) {
458
520
  rb_raise(mkr_eError, "out of memory copying source");
459
521
  }
460
- RB_GC_GUARD(rb_source);
461
522
 
462
- mkr_parse_nogvl_t args = { (const lxb_char_t *)source.ptr, source.len, NULL };
523
+ /* Allocate the wrapper (with parsed == NULL) so that if parsing fails the
524
+ * GC-managed object frees cleanly. This is the HTML parse entry (defined on
525
+ * Makiri::HTML::Document), so the result is always HTML. */
526
+ mkr_doc_data_t *d;
527
+ VALUE obj = TypedData_Make_Struct(klass, mkr_doc_data_t, &mkr_html_doc_type, d);
528
+ d->parsed = NULL;
529
+ d->errors = rb_ary_new();
530
+
531
+ mkr_parse_nogvl_t args = { (const lxb_char_t *)source.ptr, source.len,
532
+ assume_valid, NULL };
463
533
  rb_thread_call_without_gvl(mkr_parse_nogvl, &args, NULL, NULL);
464
534
  mkr_owned_bytes_clear(&source);
465
535
 
@@ -479,8 +549,8 @@ mkr_doc_s_parse(VALUE klass, VALUE rb_source)
479
549
  static VALUE
480
550
  mkr_doc_root(VALUE self)
481
551
  {
482
- lxb_dom_document_t *doc = mkr_doc_unwrap(self);
483
- return mkr_wrap_node(lxb_dom_document_root(doc), self);
552
+ lxb_dom_document_t *doc = mkr_html_doc_unwrap(self);
553
+ return mkr_wrap_html_node(lxb_dom_document_root(doc), self);
484
554
  }
485
555
 
486
556
  /* Get the document <title>, or "" if absent. */
@@ -489,7 +559,7 @@ mkr_doc_title(VALUE self)
489
559
  {
490
560
  size_t len = 0;
491
561
  const lxb_char_t *str =
492
- lxb_html_document_title((lxb_html_document_t *)mkr_doc_unwrap(self), &len);
562
+ lxb_html_document_title((lxb_html_document_t *)mkr_html_doc_unwrap(self), &len);
493
563
  return (str == NULL) ? rb_utf8_str_new("", 0)
494
564
  : rb_utf8_str_new((const char *)str, len);
495
565
  }
@@ -500,10 +570,10 @@ mkr_doc_title(VALUE self)
500
570
  static VALUE
501
571
  mkr_doc_internal_subset(VALUE self)
502
572
  {
503
- lxb_dom_node_t *doc = (lxb_dom_node_t *)mkr_doc_unwrap(self);
573
+ lxb_dom_node_t *doc = (lxb_dom_node_t *)mkr_html_doc_unwrap(self);
504
574
  for (lxb_dom_node_t *c = doc->first_child; c != NULL; c = c->next) {
505
575
  if (c->type == LXB_DOM_NODE_TYPE_DOCUMENT_TYPE) {
506
- return mkr_wrap_node(c, self);
576
+ return mkr_wrap_html_node(c, self);
507
577
  }
508
578
  }
509
579
  return Qnil;
@@ -515,7 +585,7 @@ mkr_doc_internal_subset(VALUE self)
515
585
  static VALUE
516
586
  mkr_doc_quirks_mode(VALUE self)
517
587
  {
518
- return INT2NUM((int)mkr_doc_unwrap(self)->compat_mode);
588
+ return INT2NUM((int)mkr_html_doc_unwrap(self)->compat_mode);
519
589
  }
520
590
 
521
591
  /* Parse warnings. Reserved; currently always empty. */
@@ -530,18 +600,18 @@ mkr_doc_errors(VALUE self)
530
600
  void
531
601
  mkr_init_document(void)
532
602
  {
533
- rb_define_singleton_method(mkr_cDocument, "_parse", mkr_doc_s_parse, 1);
534
- rb_define_method(mkr_cDocument, "root", mkr_doc_root, 0);
535
- rb_define_method(mkr_cDocument, "title", mkr_doc_title, 0);
536
- rb_define_method(mkr_cDocument, "errors", mkr_doc_errors, 0);
537
- rb_define_method(mkr_cDocument, "internal_subset", mkr_doc_internal_subset, 0);
538
- rb_define_method(mkr_cDocument, "quirks_mode", mkr_doc_quirks_mode, 0);
539
- rb_define_method(mkr_cDocument, "fragment", mkr_doc_fragment, -1);
540
- rb_define_method(mkr_cDocument, "import_node", mkr_doc_import_node, -1);
603
+ rb_define_singleton_method(mkr_cHtmlDocument, "_parse", mkr_doc_s_parse, 1);
604
+ rb_define_method(mkr_cHtmlDocument, "root", mkr_doc_root, 0);
605
+ rb_define_method(mkr_cHtmlDocument, "title", mkr_doc_title, 0);
606
+ rb_define_method(mkr_cHtmlDocument, "errors", mkr_doc_errors, 0);
607
+ rb_define_method(mkr_cHtmlDocument, "internal_subset", mkr_doc_internal_subset, 0);
608
+ rb_define_method(mkr_cHtmlDocument, "quirks_mode", mkr_doc_quirks_mode, 0);
609
+ rb_define_method(mkr_cHtmlDocument, "fragment", mkr_doc_fragment, -1);
610
+ rb_define_method(mkr_cHtmlDocument, "import_node", mkr_doc_import_node, -1);
541
611
 
542
612
  rb_define_singleton_method(mkr_cDocumentFragment, "parse", mkr_frag_s_parse, -1);
543
613
 
544
614
  /* Node#parse(html): fragment-parse in this element's context (Nokogiri
545
615
  * compatible). Defined here, next to the fragment machinery it reuses. */
546
- rb_define_method(mkr_cNode, "parse", mkr_node_parse, 1);
616
+ rb_define_method(mkr_mHtmlNodeMethods, "parse", mkr_node_parse, 1);
547
617
  }