makiri 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/release.yml +12 -7
  3. data/CHANGELOG.md +93 -14
  4. data/README.md +173 -7
  5. data/Rakefile +103 -7
  6. data/ext/makiri/bridge/bridge.h +28 -0
  7. data/ext/makiri/bridge/ruby_string.c +217 -0
  8. data/ext/makiri/core/mkr_alloc.h +1 -1
  9. data/ext/makiri/core/mkr_buf.c +35 -1
  10. data/ext/makiri/core/mkr_buf.h +37 -3
  11. data/ext/makiri/core/mkr_core.h +1 -1
  12. data/ext/makiri/core/mkr_hash.h +1 -1
  13. data/ext/makiri/core/mkr_text.h +8 -8
  14. data/ext/makiri/extconf.rb +20 -2
  15. data/ext/makiri/glue/glue.h +47 -11
  16. data/ext/makiri/glue/ruby_doc.c +117 -43
  17. data/ext/makiri/glue/ruby_html_css.c +246 -0
  18. data/ext/makiri/glue/{ruby_mutate.c → ruby_html_mutate.c} +242 -51
  19. data/ext/makiri/glue/ruby_html_node.c +888 -0
  20. data/ext/makiri/glue/ruby_html_serialize.c +154 -0
  21. data/ext/makiri/glue/ruby_node.c +54 -748
  22. data/ext/makiri/glue/ruby_node_set.c +167 -32
  23. data/ext/makiri/glue/ruby_xml.c +420 -0
  24. data/ext/makiri/glue/ruby_xml_node.c +1386 -0
  25. data/ext/makiri/glue/ruby_xpath.c +59 -26
  26. data/ext/makiri/glue/ruby_xpath.h +19 -0
  27. data/ext/makiri/lexbor_compat/compat.h +42 -9
  28. data/ext/makiri/lexbor_compat/compat_internal.h +1 -1
  29. data/ext/makiri/lexbor_compat/dom_index.c +2 -2
  30. data/ext/makiri/lexbor_compat/post_parse.c +100 -10
  31. data/ext/makiri/lexbor_compat/source_loc.c +13 -9
  32. data/ext/makiri/lexbor_compat/text_index.c +14 -8
  33. data/ext/makiri/lexbor_compat/utf8_input.c +85 -26
  34. data/ext/makiri/makiri.c +139 -6
  35. data/ext/makiri/makiri.h +43 -2
  36. data/ext/makiri/xml/mkr_xml.h +126 -0
  37. data/ext/makiri/xml/mkr_xml_chars.c +225 -0
  38. data/ext/makiri/xml/mkr_xml_mutate.c +875 -0
  39. data/ext/makiri/xml/mkr_xml_mutate.h +139 -0
  40. data/ext/makiri/xml/mkr_xml_node.c +267 -0
  41. data/ext/makiri/xml/mkr_xml_node.h +119 -0
  42. data/ext/makiri/xml/mkr_xml_tree.c +1479 -0
  43. data/ext/makiri/xpath/mkr_xpath.c +59 -32
  44. data/ext/makiri/xpath/mkr_xpath.h +96 -4
  45. data/ext/makiri/xpath/mkr_xpath_engine_html.c +17 -0
  46. data/ext/makiri/xpath/mkr_xpath_engine_xml.c +12 -0
  47. data/ext/makiri/xpath/{mkr_xpath_eval.c → mkr_xpath_eval_body.h} +202 -175
  48. data/ext/makiri/xpath/{mkr_xpath_funcs.c → mkr_xpath_funcs_body.h} +110 -86
  49. data/ext/makiri/xpath/mkr_xpath_internal.h +91 -200
  50. data/ext/makiri/xpath/mkr_xpath_lex.c +2 -2
  51. data/ext/makiri/xpath/mkr_xpath_node_access_html.h +138 -0
  52. data/ext/makiri/xpath/mkr_xpath_node_access_xml.h +142 -0
  53. data/ext/makiri/xpath/mkr_xpath_parse.c +5 -5
  54. data/ext/makiri/xpath/mkr_xpath_prelude_html.h +30 -0
  55. data/ext/makiri/xpath/mkr_xpath_prelude_xml.h +28 -0
  56. data/ext/makiri/xpath/mkr_xpath_shared.c +593 -0
  57. data/ext/makiri/xpath/{mkr_xpath_value.c → mkr_xpath_value_body.h} +145 -656
  58. data/ext/makiri/xpath/mkr_xpath_xml_selftest.c +76 -0
  59. data/lib/makiri/{attribute.rb → attr.rb} +7 -3
  60. data/lib/makiri/cdata_section.rb +21 -0
  61. data/lib/makiri/comment.rb +12 -0
  62. data/lib/makiri/compat_aliases.rb +30 -0
  63. data/lib/makiri/document.rb +4 -76
  64. data/lib/makiri/document_fragment.rb +14 -9
  65. data/lib/makiri/element.rb +5 -3
  66. data/lib/makiri/html/document.rb +106 -0
  67. data/lib/makiri/html/node_methods.rb +19 -0
  68. data/lib/makiri/html.rb +12 -0
  69. data/lib/makiri/node.rb +58 -15
  70. data/lib/makiri/node_set.rb +8 -0
  71. data/lib/makiri/processing_instruction.rb +12 -0
  72. data/lib/makiri/text.rb +2 -0
  73. data/lib/makiri/version.rb +1 -1
  74. data/lib/makiri/xml/document.rb +24 -0
  75. data/lib/makiri/xml/node_methods.rb +37 -0
  76. data/lib/makiri/xml.rb +10 -0
  77. data/lib/makiri/xpath_context.rb +1 -1
  78. data/lib/makiri.rb +23 -5
  79. data/script/build_native_gem.rb +2 -2
  80. data/script/check_c_safety.rb +32 -0
  81. data/script/check_c_safety_allowlist.yml +83 -0
  82. metadata +35 -9
  83. data/ext/makiri/glue/ruby_css.c +0 -185
  84. data/ext/makiri/glue/ruby_serialize.c +0 -92
  85. data/lib/makiri/cdata.rb +0 -6
@@ -1,6 +1,7 @@
1
1
  #include "glue.h"
2
2
  #include "../lexbor_compat/compat_internal.h" /* mkr_dom_preorder_next */
3
3
  #include "../core/mkr_core.h"
4
+ #include "../xml/mkr_xml.h" /* mkr_xml_doc_memsize for an XML-backed document */
4
5
 
5
6
  #include <lexbor/html/parser.h>
6
7
  #include <ruby/thread.h>
@@ -32,23 +33,47 @@ mkr_doc_free(void *ptr)
32
33
  static size_t
33
34
  mkr_doc_memsize(const void *ptr)
34
35
  {
35
- /* The DOM arena size is not cheaply queryable; report the wrapper only. */
36
- (void)ptr;
37
- return sizeof(mkr_doc_data_t);
36
+ const mkr_doc_data_t *d = (const mkr_doc_data_t *)ptr;
37
+ size_t total = sizeof(mkr_doc_data_t);
38
+ /* The Lexbor (HTML) arena size is not cheaply queryable; report the wrapper
39
+ * only. The XML arena tracks its own byte total, so include it. */
40
+ if (d->parsed != NULL && mkr_parsed_kind(d->parsed) == MKR_DOC_XML) {
41
+ total += mkr_xml_doc_memsize(mkr_parsed_xml_doc(d->parsed));
42
+ }
43
+ return total;
38
44
  }
39
45
 
46
+ /* Like nodes, HTML and XML Documents share the mkr_doc_data_t layout and GC
47
+ * functions but are wrapped under DISTINCT TypedData types (both deriving from
48
+ * the shared base mkr_doc_type), so mkr_html_doc_unwrap - which reinterprets the
49
+ * parsed document as a Lexbor lxb_html_document_t - RAISES TypeError on an XML
50
+ * Document via Ruby's type machinery, instead of relying on the (NDEBUG-erased)
51
+ * assert in mkr_parsed_html_doc. mkr_doc_type (base) is kept for the kind-agnostic
52
+ * accessors (mkr_doc_parsed, #errors) that legitimately accept either. */
40
53
  const rb_data_type_t mkr_doc_type = {
41
54
  "Makiri::Document",
42
55
  { mkr_doc_mark, mkr_doc_free, mkr_doc_memsize, },
43
56
  0, 0, RUBY_TYPED_FREE_IMMEDIATELY,
44
57
  };
58
+ static const rb_data_type_t mkr_html_doc_type = {
59
+ "Makiri::HTML::Document",
60
+ { mkr_doc_mark, mkr_doc_free, mkr_doc_memsize, },
61
+ &mkr_doc_type, 0, RUBY_TYPED_FREE_IMMEDIATELY,
62
+ };
63
+ static const rb_data_type_t mkr_xml_doc_type = {
64
+ "Makiri::XML::Document",
65
+ { mkr_doc_mark, mkr_doc_free, mkr_doc_memsize, },
66
+ &mkr_doc_type, 0, RUBY_TYPED_FREE_IMMEDIATELY,
67
+ };
45
68
 
46
69
  lxb_dom_document_t *
47
- mkr_doc_unwrap(VALUE rb_doc)
70
+ mkr_html_doc_unwrap(VALUE rb_doc)
48
71
  {
49
72
  mkr_doc_data_t *d;
50
- TypedData_Get_Struct(rb_doc, mkr_doc_data_t, &mkr_doc_type, d);
51
- return (lxb_dom_document_t *)d->parsed->doc;
73
+ /* mkr_html_doc_type rejects an XML Document at the type boundary (its type
74
+ * chain does not include mkr_html_doc_type). */
75
+ TypedData_Get_Struct(rb_doc, mkr_doc_data_t, &mkr_html_doc_type, d);
76
+ return (lxb_dom_document_t *)mkr_parsed_html_doc(d->parsed);
52
77
  }
53
78
 
54
79
  mkr_parsed_t *
@@ -59,13 +84,19 @@ mkr_doc_parsed(VALUE rb_doc)
59
84
  return d->parsed;
60
85
  }
61
86
 
62
- /* Wrap an owned mkr_parsed_t as a Makiri::Document. GC takes ownership of
63
- * +parsed+ (freed in dfree). Used to back a standalone DocumentFragment. */
87
+ /* Wrap an owned mkr_parsed_t as a Document. GC takes ownership of +parsed+
88
+ * (freed in dfree). The Ruby leaf class is chosen by kind: a Lexbor-backed
89
+ * handle becomes Makiri::Document (HTML), an arena-backed one
90
+ * Makiri::XML::Document (§2.3). Used to back a parsed document or a standalone
91
+ * DocumentFragment. */
64
92
  VALUE
65
93
  mkr_wrap_document(mkr_parsed_t *parsed)
66
94
  {
95
+ int is_xml = (mkr_parsed_kind(parsed) == MKR_DOC_XML);
96
+ VALUE klass = is_xml ? mkr_cXmlDocument : mkr_cHtmlDocument;
67
97
  mkr_doc_data_t *d;
68
- VALUE obj = TypedData_Make_Struct(mkr_cDocument, mkr_doc_data_t, &mkr_doc_type, d);
98
+ VALUE obj = TypedData_Make_Struct(klass, mkr_doc_data_t,
99
+ is_xml ? &mkr_xml_doc_type : &mkr_html_doc_type, d);
69
100
  d->parsed = parsed;
70
101
  d->errors = rb_ary_new();
71
102
  return obj;
@@ -95,7 +126,7 @@ mkr_resolve_fragment_context(lxb_dom_document_t *doc, VALUE context,
95
126
  }
96
127
 
97
128
  if (rb_obj_is_kind_of(context, mkr_cNode)) {
98
- lxb_dom_node_t *cn = mkr_node_unwrap(context);
129
+ lxb_dom_node_t *cn = mkr_html_node_unwrap(context); /* reject an XML node before lxb use */
99
130
  if (cn->type != LXB_DOM_NODE_TYPE_ELEMENT) {
100
131
  rb_raise(rb_eArgError, "fragment context node must be an element");
101
132
  }
@@ -207,7 +238,39 @@ mkr_sanitize_html_input(VALUE html, const lxb_char_t **out, size_t *out_len,
207
238
  /* Browser-compatible decoding: invalid UTF-8 -> U+FFFD; valid input is used
208
239
  * in place (no copy, *owned == NULL). Returns -1 on OOM (nothing allocated)
209
240
  * so the caller can release its parser before raising. */
210
- mkr_ruby_borrowed_bytes_t hv = mkr_ruby_bytes_view(html);
241
+ VALUE u8 = mkr_ruby_to_utf8(html); /* honour the input encoding (-> UTF-8) */
242
+ mkr_ruby_borrowed_bytes_t hv = mkr_ruby_bytes_view(u8);
243
+
244
+ if (u8 != html) {
245
+ /* Transcoded to UTF-8: a fresh String that nothing keeps alive past this
246
+ * return, so we must NOT borrow its bytes. It is already valid UTF-8, so
247
+ * copy it into an owned buffer (the caller frees *owned) - no sanitise. */
248
+ size_t n = (hv.len > 0) ? hv.len : 1;
249
+ char *buf = mkr_reallocarray(NULL, n, 1);
250
+ if (buf == NULL) {
251
+ RB_GC_GUARD(hv.value);
252
+ return -1;
253
+ }
254
+ if (hv.len > 0) {
255
+ memcpy(buf, hv.ptr, hv.len);
256
+ }
257
+ *owned = (lxb_char_t *)buf;
258
+ *out = (const lxb_char_t *)buf;
259
+ *out_len = hv.len;
260
+ RB_GC_GUARD(hv.value);
261
+ return 0;
262
+ }
263
+
264
+ /* Not transcoded (UTF-8 / US-ASCII / binary): input Ruby already knows is
265
+ * valid UTF-8 is borrowed in place (the caller keeps `html` alive);
266
+ * otherwise sanitise as before. */
267
+ if (mkr_ruby_str_known_valid_utf8(html)) {
268
+ *owned = NULL;
269
+ *out = (const lxb_char_t *)hv.ptr;
270
+ *out_len = hv.len;
271
+ RB_GC_GUARD(hv.value);
272
+ return 0;
273
+ }
211
274
  lxb_char_t *clean = NULL;
212
275
  size_t clean_len = 0;
213
276
  if (mkr_utf8_sanitize((const lxb_char_t *)hv.ptr, hv.len, &clean, &clean_len) != 0) {
@@ -249,7 +312,7 @@ mkr_import_fragment_children(lxb_dom_document_t *doc, lxb_dom_node_t *root,
249
312
  }
250
313
 
251
314
  /* Node#clone_node(deep = false): a shallow (or deep, with deep truthy) copy of
252
- * this node, owned by the same document and detached from any parent — the DOM
315
+ * this node, owned by the same document and detached from any parent - the DOM
253
316
  * cloneNode, whose `deep` defaults to false (a missing/nil/false argument =>
254
317
  * shallow). Built on the same import_node + <template>-content fixup the
255
318
  * fragment parser uses, so a deep-cloned <template> carries its contents (which
@@ -262,7 +325,7 @@ mkr_node_clone_node(int argc, VALUE *argv, VALUE self)
262
325
  rb_scan_args(argc, argv, "01", &deep_v);
263
326
  bool deep = RTEST(deep_v);
264
327
 
265
- lxb_dom_node_t *node = mkr_node_unwrap(self);
328
+ lxb_dom_node_t *node = mkr_html_node_unwrap(self);
266
329
  lxb_dom_document_t *doc = node->owner_document;
267
330
 
268
331
  lxb_dom_node_t *clone = lxb_dom_document_import_node(doc, node, deep);
@@ -272,11 +335,11 @@ mkr_node_clone_node(int argc, VALUE *argv, VALUE self)
272
335
  if (deep) {
273
336
  mkr_fixup_template_content(doc, node, clone);
274
337
  }
275
- return mkr_wrap_node(clone, mkr_node_document(self));
338
+ return mkr_wrap_html_node(clone, mkr_node_document(self));
276
339
  }
277
340
 
278
341
  /* Document#import_node(node, deep = false): a shallow (or deep, with deep
279
- * truthy) copy of +node+ owned by THIS document — the DOM importNode, whose
342
+ * truthy) copy of +node+ owned by THIS document - the DOM importNode, whose
280
343
  * `deep` defaults to false (a missing/nil/false argument => shallow). Unlike
281
344
  * Node#clone_node, the copy is owned by the receiver rather than the node's own
282
345
  * document, so it is the way to bring a node across documents (Makiri never
@@ -290,8 +353,8 @@ mkr_doc_import_node(int argc, VALUE *argv, VALUE self)
290
353
  rb_scan_args(argc, argv, "11", &node_v, &deep_v);
291
354
  bool deep = RTEST(deep_v);
292
355
 
293
- lxb_dom_node_t *src = mkr_node_unwrap(node_v);
294
- lxb_dom_document_t *doc = mkr_doc_unwrap(self);
356
+ lxb_dom_node_t *src = mkr_html_node_unwrap(node_v); /* reject an XML node before lxb use */
357
+ lxb_dom_document_t *doc = mkr_html_doc_unwrap(self);
295
358
 
296
359
  lxb_dom_node_t *imp = lxb_dom_document_import_node(doc, src, deep);
297
360
  if (imp == NULL) {
@@ -300,7 +363,7 @@ mkr_doc_import_node(int argc, VALUE *argv, VALUE self)
300
363
  if (deep) {
301
364
  mkr_fixup_template_content(doc, src, imp);
302
365
  }
303
- return mkr_wrap_node(imp, self);
366
+ return mkr_wrap_html_node(imp, self);
304
367
  }
305
368
 
306
369
  /* Parse +rb_html+ as a fragment in the given (tag id, namespace) context and
@@ -315,7 +378,7 @@ mkr_build_fragment_ctx(VALUE document, VALUE rb_html,
315
378
  lxb_tag_id_t ctx_tag, lxb_ns_id_t ctx_ns)
316
379
  {
317
380
  VALUE html = rb_String(rb_html);
318
- lxb_dom_document_t *doc = mkr_doc_unwrap(document);
381
+ lxb_dom_document_t *doc = mkr_html_doc_unwrap(document);
319
382
 
320
383
  lxb_dom_document_fragment_t *frag = lxb_dom_document_fragment_interface_create(doc);
321
384
  if (frag == NULL) {
@@ -349,7 +412,7 @@ mkr_build_fragment_ctx(VALUE document, VALUE rb_html,
349
412
 
350
413
  lxb_html_parser_destroy(parser);
351
414
  RB_GC_GUARD(html);
352
- return mkr_wrap_node(frag_node, document);
415
+ return mkr_wrap_html_node(frag_node, document);
353
416
  }
354
417
 
355
418
  /* document.fragment(html, context: ...) -> DocumentFragment bound to this
@@ -363,7 +426,7 @@ mkr_doc_fragment(int argc, VALUE *argv, VALUE self)
363
426
  : rb_hash_aref(opts, ID2SYM(rb_intern("context")));
364
427
  lxb_tag_id_t tag;
365
428
  lxb_ns_id_t ns;
366
- mkr_resolve_fragment_context(mkr_doc_unwrap(self), context, &tag, &ns);
429
+ mkr_resolve_fragment_context(mkr_html_doc_unwrap(self), context, &tag, &ns);
367
430
  return mkr_build_fragment_ctx(self, html, tag, ns);
368
431
  }
369
432
 
@@ -379,14 +442,14 @@ mkr_frag_s_parse(int argc, VALUE *argv, VALUE klass)
379
442
  : rb_hash_aref(opts, ID2SYM(rb_intern("context")));
380
443
 
381
444
  static const lxb_char_t shell[] = "<html><body></body></html>";
382
- mkr_parsed_t *parsed = mkr_parse_html(shell, sizeof(shell) - 1);
445
+ mkr_parsed_t *parsed = mkr_parse_html(shell, sizeof(shell) - 1, true);
383
446
  if (parsed == NULL) {
384
447
  rb_raise(mkr_eError, "failed to create fragment document");
385
448
  }
386
449
  VALUE document = mkr_wrap_document(parsed); /* GC now owns parsed */
387
450
  lxb_tag_id_t tag;
388
451
  lxb_ns_id_t ns;
389
- mkr_resolve_fragment_context(mkr_doc_unwrap(document), context, &tag, &ns);
452
+ mkr_resolve_fragment_context(mkr_html_doc_unwrap(document), context, &tag, &ns);
390
453
  return mkr_build_fragment_ctx(document, html, tag, ns);
391
454
  }
392
455
 
@@ -396,7 +459,7 @@ mkr_frag_s_parse(int argc, VALUE *argv, VALUE klass)
396
459
  static VALUE
397
460
  mkr_node_parse(VALUE self, VALUE rb_html)
398
461
  {
399
- lxb_dom_node_t *node = mkr_node_unwrap(self);
462
+ lxb_dom_node_t *node = mkr_html_node_unwrap(self);
400
463
  if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
401
464
  rb_raise(rb_eArgError, "Node#parse requires an element context");
402
465
  }
@@ -415,6 +478,7 @@ mkr_node_parse(VALUE self, VALUE rb_html)
415
478
  typedef struct {
416
479
  const lxb_char_t *src;
417
480
  size_t len;
481
+ bool assume_valid;
418
482
  mkr_parsed_t *result;
419
483
  } mkr_parse_nogvl_t;
420
484
 
@@ -425,7 +489,7 @@ static void *
425
489
  mkr_parse_nogvl(void *p)
426
490
  {
427
491
  mkr_parse_nogvl_t *a = (mkr_parse_nogvl_t *)p;
428
- a->result = mkr_parse_html(a->src, a->len);
492
+ a->result = mkr_parse_html(a->src, a->len, a->assume_valid);
429
493
  return NULL;
430
494
  }
431
495
 
@@ -440,11 +504,16 @@ static VALUE
440
504
  mkr_doc_s_parse(VALUE klass, VALUE rb_source)
441
505
  {
442
506
  StringValue(rb_source);
507
+ /* Honour the input's encoding: UTF-8/US-ASCII/binary pass through (no
508
+ * degradation), anything else is transcoded to UTF-8 so its content is
509
+ * preserved rather than read as raw UTF-8 bytes. */
510
+ rb_source = mkr_ruby_to_utf8(rb_source);
443
511
 
444
512
  /* Allocate the wrapper first (with parsed == NULL) so that if parsing
445
- * fails the GC-managed object frees cleanly. */
513
+ * fails the GC-managed object frees cleanly. This is the HTML parse entry
514
+ * (defined on Makiri::HTML::Document), so the result is always HTML. */
446
515
  mkr_doc_data_t *d;
447
- VALUE obj = TypedData_Make_Struct(klass, mkr_doc_data_t, &mkr_doc_type, d);
516
+ VALUE obj = TypedData_Make_Struct(klass, mkr_doc_data_t, &mkr_html_doc_type, d);
448
517
  d->parsed = NULL;
449
518
  d->errors = rb_ary_new();
450
519
 
@@ -457,9 +526,14 @@ mkr_doc_s_parse(VALUE klass, VALUE rb_source)
457
526
  if (mkr_ruby_copy_bytes(rb_source, &source) != 0) {
458
527
  rb_raise(mkr_eError, "out of memory copying source");
459
528
  }
529
+ /* Read the coderange (no scan) before releasing the GVL; the copy is
530
+ * byte-identical, so a source Ruby already knows is valid UTF-8 lets the
531
+ * parse skip its sanitisation scan. */
532
+ bool assume_valid = mkr_ruby_str_known_valid_utf8(rb_source);
460
533
  RB_GC_GUARD(rb_source);
461
534
 
462
- mkr_parse_nogvl_t args = { (const lxb_char_t *)source.ptr, source.len, NULL };
535
+ mkr_parse_nogvl_t args = { (const lxb_char_t *)source.ptr, source.len,
536
+ assume_valid, NULL };
463
537
  rb_thread_call_without_gvl(mkr_parse_nogvl, &args, NULL, NULL);
464
538
  mkr_owned_bytes_clear(&source);
465
539
 
@@ -479,8 +553,8 @@ mkr_doc_s_parse(VALUE klass, VALUE rb_source)
479
553
  static VALUE
480
554
  mkr_doc_root(VALUE self)
481
555
  {
482
- lxb_dom_document_t *doc = mkr_doc_unwrap(self);
483
- return mkr_wrap_node(lxb_dom_document_root(doc), self);
556
+ lxb_dom_document_t *doc = mkr_html_doc_unwrap(self);
557
+ return mkr_wrap_html_node(lxb_dom_document_root(doc), self);
484
558
  }
485
559
 
486
560
  /* Get the document <title>, or "" if absent. */
@@ -489,7 +563,7 @@ mkr_doc_title(VALUE self)
489
563
  {
490
564
  size_t len = 0;
491
565
  const lxb_char_t *str =
492
- lxb_html_document_title((lxb_html_document_t *)mkr_doc_unwrap(self), &len);
566
+ lxb_html_document_title((lxb_html_document_t *)mkr_html_doc_unwrap(self), &len);
493
567
  return (str == NULL) ? rb_utf8_str_new("", 0)
494
568
  : rb_utf8_str_new((const char *)str, len);
495
569
  }
@@ -500,10 +574,10 @@ mkr_doc_title(VALUE self)
500
574
  static VALUE
501
575
  mkr_doc_internal_subset(VALUE self)
502
576
  {
503
- lxb_dom_node_t *doc = (lxb_dom_node_t *)mkr_doc_unwrap(self);
577
+ lxb_dom_node_t *doc = (lxb_dom_node_t *)mkr_html_doc_unwrap(self);
504
578
  for (lxb_dom_node_t *c = doc->first_child; c != NULL; c = c->next) {
505
579
  if (c->type == LXB_DOM_NODE_TYPE_DOCUMENT_TYPE) {
506
- return mkr_wrap_node(c, self);
580
+ return mkr_wrap_html_node(c, self);
507
581
  }
508
582
  }
509
583
  return Qnil;
@@ -515,7 +589,7 @@ mkr_doc_internal_subset(VALUE self)
515
589
  static VALUE
516
590
  mkr_doc_quirks_mode(VALUE self)
517
591
  {
518
- return INT2NUM((int)mkr_doc_unwrap(self)->compat_mode);
592
+ return INT2NUM((int)mkr_html_doc_unwrap(self)->compat_mode);
519
593
  }
520
594
 
521
595
  /* Parse warnings. Reserved; currently always empty. */
@@ -530,18 +604,18 @@ mkr_doc_errors(VALUE self)
530
604
  void
531
605
  mkr_init_document(void)
532
606
  {
533
- rb_define_singleton_method(mkr_cDocument, "_parse", mkr_doc_s_parse, 1);
534
- rb_define_method(mkr_cDocument, "root", mkr_doc_root, 0);
535
- rb_define_method(mkr_cDocument, "title", mkr_doc_title, 0);
536
- rb_define_method(mkr_cDocument, "errors", mkr_doc_errors, 0);
537
- rb_define_method(mkr_cDocument, "internal_subset", mkr_doc_internal_subset, 0);
538
- rb_define_method(mkr_cDocument, "quirks_mode", mkr_doc_quirks_mode, 0);
539
- rb_define_method(mkr_cDocument, "fragment", mkr_doc_fragment, -1);
540
- rb_define_method(mkr_cDocument, "import_node", mkr_doc_import_node, -1);
607
+ rb_define_singleton_method(mkr_cHtmlDocument, "_parse", mkr_doc_s_parse, 1);
608
+ rb_define_method(mkr_cHtmlDocument, "root", mkr_doc_root, 0);
609
+ rb_define_method(mkr_cHtmlDocument, "title", mkr_doc_title, 0);
610
+ rb_define_method(mkr_cHtmlDocument, "errors", mkr_doc_errors, 0);
611
+ rb_define_method(mkr_cHtmlDocument, "internal_subset", mkr_doc_internal_subset, 0);
612
+ rb_define_method(mkr_cHtmlDocument, "quirks_mode", mkr_doc_quirks_mode, 0);
613
+ rb_define_method(mkr_cHtmlDocument, "fragment", mkr_doc_fragment, -1);
614
+ rb_define_method(mkr_cHtmlDocument, "import_node", mkr_doc_import_node, -1);
541
615
 
542
616
  rb_define_singleton_method(mkr_cDocumentFragment, "parse", mkr_frag_s_parse, -1);
543
617
 
544
618
  /* Node#parse(html): fragment-parse in this element's context (Nokogiri
545
619
  * compatible). Defined here, next to the fragment machinery it reuses. */
546
- rb_define_method(mkr_cNode, "parse", mkr_node_parse, 1);
620
+ rb_define_method(mkr_mHtmlNodeMethods, "parse", mkr_node_parse, 1);
547
621
  }
@@ -0,0 +1,246 @@
1
+ #include "glue.h"
2
+
3
+ #include <lexbor/css/css.h>
4
+ #include <lexbor/selectors/selectors.h>
5
+
6
+ /*
7
+ * CSS selector queries, delegated to Lexbor's lxb_selectors engine.
8
+ *
9
+ * Node#css(selector) -> NodeSet (descendants matching, document order)
10
+ * Node#at_css(selector) -> first matching descendant, or nil
11
+ *
12
+ * The Lexbor CSS engine (selector parser + its arena + the traversal engine) is
13
+ * built once and reused for every query. CSS evaluation always holds the GVL (it
14
+ * never releases it), so all queries are serialized and a single process-global
15
+ * engine is safe with no locking. Creating and tearing the engine down per call
16
+ * - four create/init/destroy triples - dominated a cheap query like
17
+ * at_css('#id') (the match is found almost immediately, so setup IS the cost);
18
+ * reusing it closes the gap to nokolexbor, which caches the same objects in
19
+ * thread-local storage. Between calls only the parsed selector list's arena is
20
+ * reset (lxb_css_memory_clean) and the parser is returned to its CLEAN stage
21
+ * (lxb_css_parser_clean) - both preserve the memory/selectors objects set once;
22
+ * the selectors parse-state is auto-cleaned by the parser and the traversal
23
+ * engine self-cleans after each find/match. A malformed selector raises
24
+ * Makiri::CSS::SyntaxError.
25
+ */
26
+
27
+ /* Process-global CSS engine, created lazily and kept for the process lifetime
28
+ * (one small allocation). parser->memory / parser->selectors are set once so the
29
+ * parser reuses the same selector arena + parse state across calls. */
30
+ static lxb_css_memory_t *g_css_mem;
31
+ static lxb_css_parser_t *g_css_parser;
32
+ static lxb_css_selectors_t *g_css_sel;
33
+ static lxb_selectors_t *g_selectors;
34
+ static int g_css_ready;
35
+
36
+ /* Build the shared engine on first use; raises Makiri::Error on init failure
37
+ * (leaving the globals unset, so a later call retries). */
38
+ static void
39
+ mkr_css_engine_init(void)
40
+ {
41
+ if (g_css_ready) {
42
+ return;
43
+ }
44
+
45
+ lxb_css_memory_t *mem = lxb_css_memory_create();
46
+ lxb_css_parser_t *parser = lxb_css_parser_create();
47
+ lxb_css_selectors_t *css_sel = lxb_css_selectors_create();
48
+ lxb_selectors_t *selectors = lxb_selectors_create();
49
+
50
+ int ok = (mem != NULL && parser != NULL && css_sel != NULL && selectors != NULL)
51
+ && (lxb_css_memory_init(mem, 128) == LXB_STATUS_OK)
52
+ && (lxb_css_parser_init(parser, NULL) == LXB_STATUS_OK)
53
+ && (lxb_css_selectors_init(css_sel) == LXB_STATUS_OK)
54
+ && (lxb_selectors_init(selectors) == LXB_STATUS_OK);
55
+
56
+ if (!ok) {
57
+ if (selectors != NULL) lxb_selectors_destroy(selectors, true);
58
+ if (parser != NULL) lxb_css_parser_destroy(parser, true);
59
+ if (mem != NULL) lxb_css_memory_destroy(mem, true);
60
+ if (css_sel != NULL) lxb_css_selectors_destroy(css_sel, true);
61
+ rb_raise(mkr_eError, "failed to initialise CSS selector engine");
62
+ }
63
+
64
+ lxb_css_parser_memory_set(parser, mem);
65
+ lxb_css_parser_selectors_set(parser, css_sel);
66
+
67
+ g_css_mem = mem;
68
+ g_css_parser = parser;
69
+ g_css_sel = css_sel;
70
+ g_selectors = selectors;
71
+ g_css_ready = 1;
72
+ }
73
+
74
+ typedef struct {
75
+ VALUE set;
76
+ lxb_dom_node_t *root; /* excluded from results: css is descendant-only */
77
+ size_t count;
78
+ int overflow;
79
+ } mkr_css_ctx_t;
80
+
81
+ static lxb_status_t
82
+ mkr_css_find_cb(lxb_dom_node_t *node, lxb_css_selector_specificity_t spec,
83
+ void *ctx_)
84
+ {
85
+ (void)spec;
86
+ mkr_css_ctx_t *c = (mkr_css_ctx_t *)ctx_;
87
+
88
+ if (node == c->root) {
89
+ return LXB_STATUS_OK; /* descendant-only, like Nokogiri's node.css */
90
+ }
91
+ if (c->count >= MKR_NODE_SET_MAX) {
92
+ c->overflow = 1;
93
+ return LXB_STATUS_STOP; /* fail closed without raising mid-traversal */
94
+ }
95
+
96
+ mkr_node_set_push(c->set, (mkr_raw_node_t *)node);
97
+ c->count++;
98
+ return LXB_STATUS_OK;
99
+ }
100
+
101
+ /* at_css: capture the first matching descendant and stop. Avoids materialising a
102
+ * NodeSet (and a Ruby #first dispatch) for the one node the caller wants. */
103
+ typedef struct {
104
+ lxb_dom_node_t *root; /* excluded: descendant-only */
105
+ lxb_dom_node_t *found;
106
+ } mkr_css_first_ctx_t;
107
+
108
+ static lxb_status_t
109
+ mkr_css_first_cb(lxb_dom_node_t *node, lxb_css_selector_specificity_t spec,
110
+ void *ctx_)
111
+ {
112
+ (void)spec;
113
+ mkr_css_first_ctx_t *c = (mkr_css_first_ctx_t *)ctx_;
114
+
115
+ if (node == c->root) {
116
+ return LXB_STATUS_OK; /* descendant-only */
117
+ }
118
+ c->found = node;
119
+ return LXB_STATUS_STOP;
120
+ }
121
+
122
+ /* Callback for matches?: signals that the node matched the selector. */
123
+ static lxb_status_t
124
+ mkr_css_match_cb(lxb_dom_node_t *node, lxb_css_selector_specificity_t spec,
125
+ void *ctx_)
126
+ {
127
+ (void)node; (void)spec;
128
+ *(int *)ctx_ = 1;
129
+ return LXB_STATUS_STOP;
130
+ }
131
+
132
+ /* Parse +rb_selector+ with the shared engine, hand the parsed list to +run+
133
+ * (the actual find / match against +node+), then reset the engine for the next
134
+ * call. Raises Makiri::CSS::SyntaxError on a bad selector; any result-specific
135
+ * limits are checked by the caller after return. */
136
+ static void
137
+ mkr_with_compiled_selector(VALUE rb_selector, lxb_dom_node_t *node,
138
+ lxb_status_t (*run)(lxb_selectors_t *, lxb_dom_node_t *,
139
+ lxb_css_selector_list_t *, void *),
140
+ void *u)
141
+ {
142
+ mkr_ruby_borrowed_text_t sv = mkr_ruby_verified_text(rb_selector, "CSS selector");
143
+
144
+ mkr_css_engine_init(); /* raises on init failure */
145
+
146
+ lxb_css_selector_list_t *list =
147
+ lxb_css_selectors_parse(g_css_parser, (const lxb_char_t *)sv.ptr, sv.len);
148
+
149
+ int syntax_error = (list == NULL || g_css_parser->status != LXB_STATUS_OK);
150
+ if (!syntax_error) {
151
+ (void)run(g_selectors, node, list, u);
152
+ }
153
+
154
+ /* Reset the shared engine for the next query: drop the parsed list's arena
155
+ * allocations and return the parser to its CLEAN stage. Both preserve the
156
+ * memory/selectors objects we set once; the traversal engine self-cleans
157
+ * after find/match. */
158
+ lxb_css_memory_clean(g_css_mem);
159
+ lxb_css_parser_clean(g_css_parser);
160
+
161
+ if (syntax_error) {
162
+ rb_raise(mkr_eCSSSyntaxError, "invalid CSS selector: %" PRIsVALUE, sv.value);
163
+ }
164
+ RB_GC_GUARD(sv.value);
165
+ }
166
+
167
+ /* find: collect descendants matching the selector (MATCH_FIRST dedups a node
168
+ * that matches several selectors in a comma list). */
169
+ static lxb_status_t
170
+ mkr_run_find(lxb_selectors_t *selectors, lxb_dom_node_t *root,
171
+ lxb_css_selector_list_t *list, void *u)
172
+ {
173
+ lxb_selectors_opt_set(selectors, LXB_SELECTORS_OPT_MATCH_FIRST);
174
+ return lxb_selectors_find(selectors, root, list, mkr_css_find_cb, u);
175
+ }
176
+
177
+ /* find_first: stop at the first matching descendant (for at_css). */
178
+ static lxb_status_t
179
+ mkr_run_find_first(lxb_selectors_t *selectors, lxb_dom_node_t *root,
180
+ lxb_css_selector_list_t *list, void *u)
181
+ {
182
+ lxb_selectors_opt_set(selectors, LXB_SELECTORS_OPT_MATCH_FIRST);
183
+ return lxb_selectors_find(selectors, root, list, mkr_css_first_cb, u);
184
+ }
185
+
186
+ /* match_node: does THIS node match? */
187
+ static lxb_status_t
188
+ mkr_run_match(lxb_selectors_t *selectors, lxb_dom_node_t *node,
189
+ lxb_css_selector_list_t *list, void *u)
190
+ {
191
+ return lxb_selectors_match_node(selectors, node, list, mkr_css_match_cb, u);
192
+ }
193
+
194
+ /* Node#css: collect every matching descendant into a NodeSet (document order).
195
+ * Raises Makiri::CSS::SyntaxError on a bad selector, Makiri::Error on an
196
+ * over-large result. */
197
+ static VALUE
198
+ mkr_node_css(VALUE self, VALUE rb_selector)
199
+ {
200
+ lxb_dom_node_t *root = mkr_html_node_unwrap(self);
201
+ VALUE document = mkr_node_document(self);
202
+ VALUE set = mkr_node_set_new(document);
203
+
204
+ mkr_css_ctx_t ctx = { .set = set, .root = root, .count = 0, .overflow = 0 };
205
+ mkr_with_compiled_selector(rb_selector, root, mkr_run_find, &ctx);
206
+
207
+ if (ctx.overflow) {
208
+ rb_raise(mkr_eError, "CSS result set exceeded the node limit (%u)",
209
+ MKR_NODE_SET_MAX);
210
+ }
211
+ return set;
212
+ }
213
+
214
+ /* Node#at_css: the first matching descendant, or nil. */
215
+ static VALUE
216
+ mkr_node_at_css(VALUE self, VALUE rb_selector)
217
+ {
218
+ lxb_dom_node_t *root = mkr_html_node_unwrap(self);
219
+
220
+ mkr_css_first_ctx_t ctx = { .root = root, .found = NULL };
221
+ mkr_with_compiled_selector(rb_selector, root, mkr_run_find_first, &ctx);
222
+
223
+ return ctx.found != NULL
224
+ ? mkr_wrap_html_node(ctx.found, mkr_node_document(self))
225
+ : Qnil;
226
+ }
227
+
228
+ /* Node#matches?(selector): does THIS node match the CSS selector? (Like
229
+ * Nokogiri - tested against the node itself, not its descendants.) A malformed
230
+ * selector raises Makiri::CSS::SyntaxError. */
231
+ static VALUE
232
+ mkr_node_matches(VALUE self, VALUE rb_selector)
233
+ {
234
+ lxb_dom_node_t *node = mkr_html_node_unwrap(self);
235
+ int matched = 0;
236
+ mkr_with_compiled_selector(rb_selector, node, mkr_run_match, &matched);
237
+ return matched ? Qtrue : Qfalse;
238
+ }
239
+
240
+ void
241
+ mkr_init_css(void)
242
+ {
243
+ rb_define_method(mkr_mHtmlNodeMethods, "css", mkr_node_css, 1);
244
+ rb_define_method(mkr_mHtmlNodeMethods, "at_css", mkr_node_at_css, 1);
245
+ rb_define_method(mkr_mHtmlNodeMethods, "matches?", mkr_node_matches, 1);
246
+ }