nokogiri 1.16.8-arm64-darwin → 1.17.0-arm64-darwin
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +11 -21
- data/README.md +4 -0
- data/dependencies.yml +6 -6
- data/ext/nokogiri/extconf.rb +191 -137
- data/ext/nokogiri/gumbo.c +69 -53
- data/ext/nokogiri/html4_document.c +10 -4
- data/ext/nokogiri/html4_element_description.c +18 -18
- data/ext/nokogiri/html4_sax_parser.c +40 -0
- data/ext/nokogiri/html4_sax_parser_context.c +48 -58
- data/ext/nokogiri/html4_sax_push_parser.c +25 -24
- data/ext/nokogiri/include/libexslt/exsltconfig.h +3 -3
- data/ext/nokogiri/include/libxml2/libxml/HTMLparser.h +12 -19
- data/ext/nokogiri/include/libxml2/libxml/c14n.h +1 -12
- data/ext/nokogiri/include/libxml2/libxml/debugXML.h +1 -1
- data/ext/nokogiri/include/libxml2/libxml/encoding.h +9 -0
- data/ext/nokogiri/include/libxml2/libxml/entities.h +12 -1
- data/ext/nokogiri/include/libxml2/libxml/hash.h +19 -0
- data/ext/nokogiri/include/libxml2/libxml/list.h +2 -2
- data/ext/nokogiri/include/libxml2/libxml/nanohttp.h +17 -0
- data/ext/nokogiri/include/libxml2/libxml/parser.h +60 -54
- data/ext/nokogiri/include/libxml2/libxml/parserInternals.h +9 -1
- data/ext/nokogiri/include/libxml2/libxml/pattern.h +6 -0
- data/ext/nokogiri/include/libxml2/libxml/tree.h +32 -12
- data/ext/nokogiri/include/libxml2/libxml/uri.h +11 -0
- data/ext/nokogiri/include/libxml2/libxml/valid.h +29 -2
- data/ext/nokogiri/include/libxml2/libxml/xinclude.h +7 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlIO.h +21 -4
- data/ext/nokogiri/include/libxml2/libxml/xmlerror.h +14 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlexports.h +111 -15
- data/ext/nokogiri/include/libxml2/libxml/xmlmemory.h +8 -45
- data/ext/nokogiri/include/libxml2/libxml/xmlreader.h +2 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlsave.h +5 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlunicode.h +165 -1
- data/ext/nokogiri/include/libxml2/libxml/xmlversion.h +7 -171
- data/ext/nokogiri/include/libxml2/libxml/xmlwriter.h +1 -0
- data/ext/nokogiri/include/libxml2/libxml/xpath.h +4 -0
- data/ext/nokogiri/include/libxslt/xsltInternals.h +3 -0
- data/ext/nokogiri/include/libxslt/xsltconfig.h +4 -37
- data/ext/nokogiri/libxml2_polyfill.c +114 -0
- data/ext/nokogiri/nokogiri.c +9 -2
- data/ext/nokogiri/nokogiri.h +18 -33
- data/ext/nokogiri/xml_attr.c +1 -1
- data/ext/nokogiri/xml_cdata.c +2 -10
- data/ext/nokogiri/xml_comment.c +3 -8
- data/ext/nokogiri/xml_document.c +163 -156
- data/ext/nokogiri/xml_document_fragment.c +10 -25
- data/ext/nokogiri/xml_dtd.c +1 -1
- data/ext/nokogiri/xml_element_content.c +9 -9
- data/ext/nokogiri/xml_encoding_handler.c +4 -4
- data/ext/nokogiri/xml_namespace.c +6 -6
- data/ext/nokogiri/xml_node.c +130 -104
- data/ext/nokogiri/xml_node_set.c +46 -44
- data/ext/nokogiri/xml_reader.c +54 -58
- data/ext/nokogiri/xml_relax_ng.c +35 -56
- data/ext/nokogiri/xml_sax_parser.c +156 -88
- data/ext/nokogiri/xml_sax_parser_context.c +213 -131
- data/ext/nokogiri/xml_sax_push_parser.c +68 -49
- data/ext/nokogiri/xml_schema.c +50 -85
- data/ext/nokogiri/xml_syntax_error.c +19 -11
- data/ext/nokogiri/xml_text.c +2 -4
- data/ext/nokogiri/xml_xpath_context.c +2 -2
- data/ext/nokogiri/xslt_stylesheet.c +8 -8
- data/lib/nokogiri/3.0/nokogiri.bundle +0 -0
- data/lib/nokogiri/3.1/nokogiri.bundle +0 -0
- data/lib/nokogiri/3.2/nokogiri.bundle +0 -0
- data/lib/nokogiri/3.3/nokogiri.bundle +0 -0
- data/lib/nokogiri/class_resolver.rb +1 -1
- data/lib/nokogiri/css/node.rb +6 -2
- data/lib/nokogiri/css/parser.rb +6 -4
- data/lib/nokogiri/css/parser.y +2 -2
- data/lib/nokogiri/css/parser_extras.rb +6 -66
- data/lib/nokogiri/css/selector_cache.rb +38 -0
- data/lib/nokogiri/css/tokenizer.rb +4 -4
- data/lib/nokogiri/css/tokenizer.rex +9 -8
- data/lib/nokogiri/css/xpath_visitor.rb +42 -6
- data/lib/nokogiri/css.rb +86 -20
- data/lib/nokogiri/decorators/slop.rb +3 -5
- data/lib/nokogiri/encoding_handler.rb +2 -2
- data/lib/nokogiri/html4/document.rb +44 -23
- data/lib/nokogiri/html4/document_fragment.rb +124 -12
- data/lib/nokogiri/html4/encoding_reader.rb +1 -1
- data/lib/nokogiri/html4/sax/parser.rb +23 -38
- data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
- data/lib/nokogiri/html4.rb +9 -14
- data/lib/nokogiri/html5/builder.rb +40 -0
- data/lib/nokogiri/html5/document.rb +61 -30
- data/lib/nokogiri/html5/document_fragment.rb +130 -20
- data/lib/nokogiri/html5/node.rb +4 -4
- data/lib/nokogiri/html5.rb +114 -72
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/xml/builder.rb +8 -1
- data/lib/nokogiri/xml/document.rb +70 -26
- data/lib/nokogiri/xml/document_fragment.rb +84 -13
- data/lib/nokogiri/xml/node.rb +82 -11
- data/lib/nokogiri/xml/node_set.rb +9 -7
- data/lib/nokogiri/xml/parse_options.rb +1 -1
- data/lib/nokogiri/xml/pp/node.rb +6 -1
- data/lib/nokogiri/xml/reader.rb +46 -13
- data/lib/nokogiri/xml/relax_ng.rb +57 -20
- data/lib/nokogiri/xml/sax/document.rb +174 -83
- data/lib/nokogiri/xml/sax/parser.rb +115 -41
- data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
- data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
- data/lib/nokogiri/xml/sax.rb +48 -0
- data/lib/nokogiri/xml/schema.rb +112 -45
- data/lib/nokogiri/xml/searchable.rb +6 -8
- data/lib/nokogiri/xml/syntax_error.rb +22 -0
- data/lib/nokogiri/xml.rb +13 -24
- data/lib/nokogiri/xslt.rb +3 -9
- data/lib/xsd/xmlparser/nokogiri.rb +3 -4
- metadata +8 -4
- data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
data/ext/nokogiri/gumbo.c
CHANGED
@@ -37,30 +37,6 @@ VALUE cNokogiriHtml5Document;
|
|
37
37
|
static ID internal_subset;
|
38
38
|
static ID parent;
|
39
39
|
|
40
|
-
/* Backwards compatibility to Ruby 2.1.0 */
|
41
|
-
#if RUBY_API_VERSION_CODE < 20200
|
42
|
-
#define ONIG_ESCAPE_UCHAR_COLLISION 1
|
43
|
-
#include <ruby/encoding.h>
|
44
|
-
|
45
|
-
static VALUE
|
46
|
-
rb_utf8_str_new(const char *str, long length)
|
47
|
-
{
|
48
|
-
return rb_enc_str_new(str, length, rb_utf8_encoding());
|
49
|
-
}
|
50
|
-
|
51
|
-
static VALUE
|
52
|
-
rb_utf8_str_new_cstr(const char *str)
|
53
|
-
{
|
54
|
-
return rb_enc_str_new_cstr(str, rb_utf8_encoding());
|
55
|
-
}
|
56
|
-
|
57
|
-
static VALUE
|
58
|
-
rb_utf8_str_new_static(const char *str, long length)
|
59
|
-
{
|
60
|
-
return rb_enc_str_new(str, length, rb_utf8_encoding());
|
61
|
-
}
|
62
|
-
#endif
|
63
|
-
|
64
40
|
#include <nokogiri.h>
|
65
41
|
#include <libxml/tree.h>
|
66
42
|
#include <libxml/HTMLtree.h>
|
@@ -94,7 +70,7 @@ perform_parse(const GumboOptions *options, VALUE input)
|
|
94
70
|
GumboOutput *output = gumbo_parse_with_options(
|
95
71
|
options,
|
96
72
|
RSTRING_PTR(input),
|
97
|
-
RSTRING_LEN(input)
|
73
|
+
(size_t)RSTRING_LEN(input)
|
98
74
|
);
|
99
75
|
|
100
76
|
const char *status_string = gumbo_status_to_string(output->status);
|
@@ -260,7 +236,7 @@ static void
|
|
260
236
|
add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)
|
261
237
|
{
|
262
238
|
const char *input_str = RSTRING_PTR(input);
|
263
|
-
size_t input_len = RSTRING_LEN(input);
|
239
|
+
size_t input_len = (size_t)RSTRING_LEN(input);
|
264
240
|
|
265
241
|
// Add parse errors to rdoc.
|
266
242
|
if (output->errors.length) {
|
@@ -272,11 +248,11 @@ add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)
|
|
272
248
|
GumboSourcePosition position = gumbo_error_position(err);
|
273
249
|
char *msg;
|
274
250
|
size_t size = gumbo_caret_diagnostic_to_string(err, input_str, input_len, &msg);
|
275
|
-
VALUE err_str = rb_utf8_str_new(msg, size);
|
251
|
+
VALUE err_str = rb_utf8_str_new(msg, (int)size);
|
276
252
|
free(msg);
|
277
253
|
VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);
|
278
254
|
const char *error_code = gumbo_error_code(err);
|
279
|
-
VALUE str1 = error_code ? rb_utf8_str_new_static(error_code, strlen(error_code)) : Qnil;
|
255
|
+
VALUE str1 = error_code ? rb_utf8_str_new_static(error_code, (int)strlen(error_code)) : Qnil;
|
280
256
|
rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
|
281
257
|
rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
|
282
258
|
rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
|
@@ -316,18 +292,58 @@ parse_cleanup(VALUE parse_args)
|
|
316
292
|
return Qnil;
|
317
293
|
}
|
318
294
|
|
295
|
+
// Scan the keyword arguments for options common to the document and fragment
|
296
|
+
// parse.
|
297
|
+
static GumboOptions
|
298
|
+
common_options(VALUE kwargs)
|
299
|
+
{
|
300
|
+
// The order of the keywords determines the order of the values below.
|
301
|
+
// If this order is changed, then setting the options below must change as
|
302
|
+
// well.
|
303
|
+
ID keywords[] = {
|
304
|
+
// Required keywords.
|
305
|
+
rb_intern_const("max_attributes"),
|
306
|
+
rb_intern_const("max_errors"),
|
307
|
+
rb_intern_const("max_tree_depth"),
|
308
|
+
|
309
|
+
// Optional keywords.
|
310
|
+
rb_intern_const("parse_noscript_content_as_text"),
|
311
|
+
};
|
312
|
+
VALUE values[sizeof keywords / sizeof keywords[0]];
|
313
|
+
|
314
|
+
// Extract the values corresponding to the required keywords. Raise an error
|
315
|
+
// if required arguments are missing.
|
316
|
+
rb_get_kwargs(kwargs, keywords, 3, 1, values);
|
317
|
+
|
318
|
+
GumboOptions options = kGumboDefaultOptions;
|
319
|
+
options.max_attributes = NUM2INT(values[0]);
|
320
|
+
options.max_errors = NUM2INT(values[1]);
|
321
|
+
|
322
|
+
// handle negative values
|
323
|
+
int depth = NUM2INT(values[2]);
|
324
|
+
options.max_tree_depth = depth < 0 ? UINT_MAX : (unsigned int)depth;
|
325
|
+
|
326
|
+
options.parse_noscript_content_as_text = values[3] != Qundef && RTEST(values[3]);
|
327
|
+
|
328
|
+
return options;
|
329
|
+
}
|
330
|
+
|
319
331
|
static VALUE parse_continue(VALUE parse_args);
|
320
332
|
|
321
333
|
/*
|
322
334
|
* @!visibility protected
|
323
335
|
*/
|
324
336
|
static VALUE
|
325
|
-
|
337
|
+
noko_gumbo_s_parse(int argc, VALUE *argv, VALUE _self)
|
326
338
|
{
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
339
|
+
VALUE input, url, klass, kwargs;
|
340
|
+
|
341
|
+
rb_scan_args(argc, argv, "3:", &input, &url, &klass, &kwargs);
|
342
|
+
if (NIL_P(kwargs)) {
|
343
|
+
kwargs = rb_hash_new();
|
344
|
+
}
|
345
|
+
|
346
|
+
GumboOptions options = common_options(kwargs);
|
331
347
|
|
332
348
|
GumboOutput *output = perform_parse(&options, input);
|
333
349
|
ParseArgs args = {
|
@@ -383,7 +399,7 @@ lookup_namespace(VALUE node, bool require_known_ns)
|
|
383
399
|
Check_Type(ns, T_STRING);
|
384
400
|
|
385
401
|
const char *href_ptr = RSTRING_PTR(ns);
|
386
|
-
size_t href_len = RSTRING_LEN(ns);
|
402
|
+
size_t href_len = (size_t)RSTRING_LEN(ns);
|
387
403
|
#define NAMESPACE_P(uri) (href_len == sizeof uri - 1 && !memcmp(href_ptr, uri, href_len))
|
388
404
|
if (NAMESPACE_P("http://www.w3.org/1999/xhtml")) {
|
389
405
|
return GUMBO_NAMESPACE_HTML;
|
@@ -415,16 +431,12 @@ static VALUE fragment_continue(VALUE parse_args);
|
|
415
431
|
* @!visibility protected
|
416
432
|
*/
|
417
433
|
static VALUE
|
418
|
-
|
419
|
-
VALUE self,
|
420
|
-
VALUE doc_fragment,
|
421
|
-
VALUE tags,
|
422
|
-
VALUE ctx,
|
423
|
-
VALUE max_attributes,
|
424
|
-
VALUE max_errors,
|
425
|
-
VALUE max_depth
|
426
|
-
)
|
434
|
+
noko_gumbo_s_fragment(int argc, VALUE *argv, VALUE _self)
|
427
435
|
{
|
436
|
+
VALUE doc_fragment;
|
437
|
+
VALUE tags;
|
438
|
+
VALUE ctx;
|
439
|
+
VALUE kwargs;
|
428
440
|
ID name = rb_intern_const("name");
|
429
441
|
const char *ctx_tag;
|
430
442
|
GumboNamespaceEnum ctx_ns;
|
@@ -432,13 +444,20 @@ fragment(
|
|
432
444
|
bool form = false;
|
433
445
|
const char *encoding = NULL;
|
434
446
|
|
447
|
+
rb_scan_args(argc, argv, "3:", &doc_fragment, &tags, &ctx, &kwargs);
|
448
|
+
if (NIL_P(kwargs)) {
|
449
|
+
kwargs = rb_hash_new();
|
450
|
+
}
|
451
|
+
|
452
|
+
GumboOptions options = common_options(kwargs);
|
453
|
+
|
435
454
|
if (NIL_P(ctx)) {
|
436
455
|
ctx_tag = "body";
|
437
456
|
ctx_ns = GUMBO_NAMESPACE_HTML;
|
438
457
|
} else if (TYPE(ctx) == T_STRING) {
|
439
458
|
ctx_tag = StringValueCStr(ctx);
|
440
459
|
ctx_ns = GUMBO_NAMESPACE_HTML;
|
441
|
-
size_t len = RSTRING_LEN(ctx);
|
460
|
+
size_t len = (size_t)RSTRING_LEN(ctx);
|
442
461
|
const char *colon = memchr(ctx_tag, ':', len);
|
443
462
|
if (colon) {
|
444
463
|
switch (colon - ctx_tag) {
|
@@ -519,7 +538,7 @@ error:
|
|
519
538
|
VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
|
520
539
|
VALUE dtd = rb_funcall(doc, internal_subset, 0);
|
521
540
|
VALUE doc_quirks_mode = rb_iv_get(doc, "@quirks_mode");
|
522
|
-
if (NIL_P(ctx) || NIL_P(doc_quirks_mode)) {
|
541
|
+
if (NIL_P(ctx) || (TYPE(ctx) == T_STRING) || NIL_P(doc_quirks_mode)) {
|
523
542
|
quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
|
524
543
|
} else if (NIL_P(dtd)) {
|
525
544
|
quirks_mode = GUMBO_DOCTYPE_QUIRKS;
|
@@ -535,18 +554,15 @@ error:
|
|
535
554
|
}
|
536
555
|
|
537
556
|
// Perform a fragment parse.
|
538
|
-
int depth = NUM2INT(max_depth);
|
539
|
-
GumboOptions options = kGumboDefaultOptions;
|
540
|
-
options.max_attributes = NUM2INT(max_attributes);
|
541
|
-
options.max_errors = NUM2INT(max_errors);
|
542
|
-
// Add one to account for the HTML element.
|
543
|
-
options.max_tree_depth = depth < 0 ? -1 : (depth + 1);
|
544
557
|
options.fragment_context = ctx_tag;
|
545
558
|
options.fragment_namespace = ctx_ns;
|
546
559
|
options.fragment_encoding = encoding;
|
547
560
|
options.quirks_mode = quirks_mode;
|
548
561
|
options.fragment_context_has_form_ancestor = form;
|
549
562
|
|
563
|
+
// Add one to the max tree depth to account for the HTML element.
|
564
|
+
if (options.max_tree_depth < UINT_MAX) { options.max_tree_depth++; }
|
565
|
+
|
550
566
|
GumboOutput *output = perform_parse(&options, tags);
|
551
567
|
ParseArgs args = {
|
552
568
|
.output = output,
|
@@ -587,8 +603,8 @@ noko_init_gumbo(void)
|
|
587
603
|
parent = rb_intern_const("parent");
|
588
604
|
|
589
605
|
// Define Nokogumbo module with parse and fragment methods.
|
590
|
-
rb_define_singleton_method(mNokogiriGumbo, "parse",
|
591
|
-
rb_define_singleton_method(mNokogiriGumbo, "fragment",
|
606
|
+
rb_define_singleton_method(mNokogiriGumbo, "parse", noko_gumbo_s_parse, -1);
|
607
|
+
rb_define_singleton_method(mNokogiriGumbo, "fragment", noko_gumbo_s_fragment, -1);
|
592
608
|
}
|
593
609
|
|
594
610
|
// vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
|
@@ -7,9 +7,9 @@ static ID id_to_s;
|
|
7
7
|
|
8
8
|
/*
|
9
9
|
* call-seq:
|
10
|
-
* new
|
10
|
+
* new(uri=nil, external_id=nil) → HTML4::Document
|
11
11
|
*
|
12
|
-
* Create a new document
|
12
|
+
* Create a new empty document with base URI +uri+ and external ID +external_id+.
|
13
13
|
*/
|
14
14
|
static VALUE
|
15
15
|
rb_html_document_s_new(int argc, VALUE *argv, VALUE klass)
|
@@ -46,7 +46,7 @@ rb_html_document_s_read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_enco
|
|
46
46
|
const char *c_encoding = NIL_P(rb_encoding) ? NULL : StringValueCStr(rb_encoding);
|
47
47
|
int options = NUM2INT(rb_options);
|
48
48
|
|
49
|
-
xmlSetStructuredErrorFunc((void *)rb_error_list,
|
49
|
+
xmlSetStructuredErrorFunc((void *)rb_error_list, noko__error_array_pusher);
|
50
50
|
|
51
51
|
c_doc = htmlReadIO(noko_io_read, noko_io_close, (void *)rb_io, c_url, c_encoding, options);
|
52
52
|
|
@@ -106,7 +106,7 @@ rb_html_document_s_read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE r
|
|
106
106
|
int html_len = (int)RSTRING_LEN(rb_html);
|
107
107
|
int options = NUM2INT(rb_options);
|
108
108
|
|
109
|
-
xmlSetStructuredErrorFunc((void *)rb_error_list,
|
109
|
+
xmlSetStructuredErrorFunc((void *)rb_error_list, noko__error_array_pusher);
|
110
110
|
|
111
111
|
c_doc = htmlReadMemory(c_buffer, html_len, c_url, c_encoding, options);
|
112
112
|
|
@@ -151,6 +151,12 @@ rb_html_document_type(VALUE self)
|
|
151
151
|
void
|
152
152
|
noko_init_html_document(void)
|
153
153
|
{
|
154
|
+
/* this is here so that rdoc doesn't ignore this file. */
|
155
|
+
/*
|
156
|
+
mNokogiri = rb_define_module("Nokogiri");
|
157
|
+
mNokogiriHtml4 = rb_define_module_under(mNokogiri, "HTML4");
|
158
|
+
*/
|
159
|
+
|
154
160
|
assert(cNokogiriXmlDocument);
|
155
161
|
cNokogiriHtml4Document = rb_define_class_under(mNokogiriHtml4, "Document", cNokogiriXmlDocument);
|
156
162
|
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#include <nokogiri.h>
|
2
2
|
|
3
|
-
static const rb_data_type_t
|
4
|
-
.wrap_struct_name = "
|
3
|
+
static const rb_data_type_t html_elem_desc_type = {
|
4
|
+
.wrap_struct_name = "htmlElemDesc",
|
5
5
|
.flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED,
|
6
6
|
};
|
7
7
|
|
@@ -20,7 +20,7 @@ required_attributes(VALUE self)
|
|
20
20
|
VALUE list;
|
21
21
|
int i;
|
22
22
|
|
23
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
23
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
24
24
|
|
25
25
|
list = rb_ary_new();
|
26
26
|
|
@@ -46,7 +46,7 @@ deprecated_attributes(VALUE self)
|
|
46
46
|
VALUE list;
|
47
47
|
int i;
|
48
48
|
|
49
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
49
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
50
50
|
|
51
51
|
list = rb_ary_new();
|
52
52
|
|
@@ -72,7 +72,7 @@ optional_attributes(VALUE self)
|
|
72
72
|
VALUE list;
|
73
73
|
int i;
|
74
74
|
|
75
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
75
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
76
76
|
|
77
77
|
list = rb_ary_new();
|
78
78
|
|
@@ -95,7 +95,7 @@ static VALUE
|
|
95
95
|
default_sub_element(VALUE self)
|
96
96
|
{
|
97
97
|
const htmlElemDesc *description;
|
98
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
98
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
99
99
|
|
100
100
|
if (description->defaultsubelt) {
|
101
101
|
return NOKOGIRI_STR_NEW2(description->defaultsubelt);
|
@@ -117,7 +117,7 @@ sub_elements(VALUE self)
|
|
117
117
|
VALUE list;
|
118
118
|
int i;
|
119
119
|
|
120
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
120
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
121
121
|
|
122
122
|
list = rb_ary_new();
|
123
123
|
|
@@ -140,7 +140,7 @@ static VALUE
|
|
140
140
|
description(VALUE self)
|
141
141
|
{
|
142
142
|
const htmlElemDesc *description;
|
143
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
143
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
144
144
|
|
145
145
|
return NOKOGIRI_STR_NEW2(description->desc);
|
146
146
|
}
|
@@ -155,7 +155,7 @@ static VALUE
|
|
155
155
|
inline_eh(VALUE self)
|
156
156
|
{
|
157
157
|
const htmlElemDesc *description;
|
158
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
158
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
159
159
|
|
160
160
|
if (description->isinline) { return Qtrue; }
|
161
161
|
return Qfalse;
|
@@ -171,7 +171,7 @@ static VALUE
|
|
171
171
|
deprecated_eh(VALUE self)
|
172
172
|
{
|
173
173
|
const htmlElemDesc *description;
|
174
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
174
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
175
175
|
|
176
176
|
if (description->depr) { return Qtrue; }
|
177
177
|
return Qfalse;
|
@@ -187,7 +187,7 @@ static VALUE
|
|
187
187
|
empty_eh(VALUE self)
|
188
188
|
{
|
189
189
|
const htmlElemDesc *description;
|
190
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
190
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
191
191
|
|
192
192
|
if (description->empty) { return Qtrue; }
|
193
193
|
return Qfalse;
|
@@ -203,7 +203,7 @@ static VALUE
|
|
203
203
|
save_end_tag_eh(VALUE self)
|
204
204
|
{
|
205
205
|
const htmlElemDesc *description;
|
206
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
206
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
207
207
|
|
208
208
|
if (description->saveEndTag) { return Qtrue; }
|
209
209
|
return Qfalse;
|
@@ -219,7 +219,7 @@ static VALUE
|
|
219
219
|
implied_end_tag_eh(VALUE self)
|
220
220
|
{
|
221
221
|
const htmlElemDesc *description;
|
222
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
222
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
223
223
|
|
224
224
|
if (description->endTag) { return Qtrue; }
|
225
225
|
return Qfalse;
|
@@ -235,7 +235,7 @@ static VALUE
|
|
235
235
|
implied_start_tag_eh(VALUE self)
|
236
236
|
{
|
237
237
|
const htmlElemDesc *description;
|
238
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
238
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
239
239
|
|
240
240
|
if (description->startTag) { return Qtrue; }
|
241
241
|
return Qfalse;
|
@@ -245,13 +245,13 @@ implied_start_tag_eh(VALUE self)
|
|
245
245
|
* call-seq:
|
246
246
|
* name
|
247
247
|
*
|
248
|
-
* Get the tag name for this
|
248
|
+
* Get the tag name for this ElementDescription
|
249
249
|
*/
|
250
250
|
static VALUE
|
251
251
|
name(VALUE self)
|
252
252
|
{
|
253
253
|
const htmlElemDesc *description;
|
254
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
254
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
255
255
|
|
256
256
|
if (NULL == description->name) { return Qnil; }
|
257
257
|
return NOKOGIRI_STR_NEW2(description->name);
|
@@ -261,7 +261,7 @@ name(VALUE self)
|
|
261
261
|
* call-seq:
|
262
262
|
* [](tag_name)
|
263
263
|
*
|
264
|
-
* Get
|
264
|
+
* Get ElementDescription for +tag_name+
|
265
265
|
*/
|
266
266
|
static VALUE
|
267
267
|
get_description(VALUE klass, VALUE tag_name)
|
@@ -271,7 +271,7 @@ get_description(VALUE klass, VALUE tag_name)
|
|
271
271
|
);
|
272
272
|
|
273
273
|
if (NULL == description) { return Qnil; }
|
274
|
-
return TypedData_Wrap_Struct(klass, &
|
274
|
+
return TypedData_Wrap_Struct(klass, &html_elem_desc_type, DISCARD_CONST_QUAL(void *, description));
|
275
275
|
}
|
276
276
|
|
277
277
|
void
|
@@ -0,0 +1,40 @@
|
|
1
|
+
#include <nokogiri.h>
|
2
|
+
|
3
|
+
VALUE cNokogiriHtml4SaxParser;
|
4
|
+
|
5
|
+
static ID id_start_document;
|
6
|
+
|
7
|
+
static void
|
8
|
+
noko_html4_sax_parser_start_document(void *ctx)
|
9
|
+
{
|
10
|
+
xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr)ctx;
|
11
|
+
VALUE self = (VALUE)ctxt->_private;
|
12
|
+
VALUE doc = rb_iv_get(self, "@document");
|
13
|
+
|
14
|
+
xmlSAX2StartDocument(ctx);
|
15
|
+
|
16
|
+
rb_funcall(doc, id_start_document, 0);
|
17
|
+
}
|
18
|
+
|
19
|
+
static VALUE
|
20
|
+
noko_html4_sax_parser_initialize(VALUE self)
|
21
|
+
{
|
22
|
+
xmlSAXHandlerPtr handler = noko_xml_sax_parser_unwrap(self);
|
23
|
+
|
24
|
+
rb_call_super(0, NULL);
|
25
|
+
|
26
|
+
handler->startDocument = noko_html4_sax_parser_start_document;
|
27
|
+
|
28
|
+
return self;
|
29
|
+
}
|
30
|
+
|
31
|
+
void
|
32
|
+
noko_init_html4_sax_parser(void)
|
33
|
+
{
|
34
|
+
cNokogiriHtml4SaxParser = rb_define_class_under(mNokogiriHtml4Sax, "Parser", cNokogiriXmlSaxParser);
|
35
|
+
|
36
|
+
rb_define_private_method(cNokogiriHtml4SaxParser, "initialize_native",
|
37
|
+
noko_html4_sax_parser_initialize, 0);
|
38
|
+
|
39
|
+
id_start_document = rb_intern("start_document");
|
40
|
+
}
|
@@ -2,96 +2,83 @@
|
|
2
2
|
|
3
3
|
VALUE cNokogiriHtml4SaxParserContext ;
|
4
4
|
|
5
|
+
/* :nodoc: */
|
5
6
|
static VALUE
|
6
|
-
|
7
|
+
noko_html4_sax_parser_context_s_native_memory(VALUE rb_class, VALUE rb_input, VALUE rb_encoding)
|
7
8
|
{
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
if (!(int)RSTRING_LEN(data)) {
|
13
|
-
rb_raise(rb_eRuntimeError, "data cannot be empty");
|
9
|
+
Check_Type(rb_input, T_STRING);
|
10
|
+
if (!(int)RSTRING_LEN(rb_input)) {
|
11
|
+
rb_raise(rb_eRuntimeError, "input string cannot be empty");
|
14
12
|
}
|
15
13
|
|
16
|
-
|
17
|
-
|
18
|
-
if (ctxt->sax) {
|
19
|
-
xmlFree(ctxt->sax);
|
20
|
-
ctxt->sax = NULL;
|
14
|
+
if (!NIL_P(rb_encoding) && !rb_obj_is_kind_of(rb_encoding, rb_cEncoding)) {
|
15
|
+
rb_raise(rb_eTypeError, "argument must be an Encoding object");
|
21
16
|
}
|
22
17
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
|
28
|
-
rb_raise(rb_eRuntimeError, "Unsupported encoding %s",
|
29
|
-
StringValueCStr(encoding));
|
30
|
-
}
|
31
|
-
}
|
18
|
+
htmlParserCtxtPtr c_context =
|
19
|
+
htmlCreateMemoryParserCtxt(StringValuePtr(rb_input), (int)RSTRING_LEN(rb_input));
|
20
|
+
if (!c_context) {
|
21
|
+
rb_raise(rb_eRuntimeError, "failed to create xml sax parser context");
|
32
22
|
}
|
33
23
|
|
34
|
-
|
35
|
-
}
|
24
|
+
noko_xml_sax_parser_context_set_encoding(c_context, rb_encoding);
|
36
25
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
htmlParserCtxtPtr ctxt = htmlCreateFileParserCtxt(
|
41
|
-
StringValueCStr(filename),
|
42
|
-
StringValueCStr(encoding)
|
43
|
-
);
|
44
|
-
|
45
|
-
if (ctxt->sax) {
|
46
|
-
xmlFree(ctxt->sax);
|
47
|
-
ctxt->sax = NULL;
|
26
|
+
if (c_context->sax) {
|
27
|
+
xmlFree(c_context->sax);
|
28
|
+
c_context->sax = NULL;
|
48
29
|
}
|
49
30
|
|
50
|
-
return noko_xml_sax_parser_context_wrap(
|
31
|
+
return noko_xml_sax_parser_context_wrap(rb_class, c_context);
|
51
32
|
}
|
52
33
|
|
34
|
+
/* :nodoc: */
|
53
35
|
static VALUE
|
54
|
-
|
36
|
+
noko_html4_sax_parser_context_s_native_file(VALUE rb_class, VALUE rb_filename, VALUE rb_encoding)
|
55
37
|
{
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
}
|
38
|
+
if (!NIL_P(rb_encoding) && !rb_obj_is_kind_of(rb_encoding, rb_cEncoding)) {
|
39
|
+
rb_raise(rb_eTypeError, "argument must be an Encoding object");
|
40
|
+
}
|
60
41
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
42
|
+
htmlParserCtxtPtr c_context = htmlCreateFileParserCtxt(StringValueCStr(rb_filename), NULL);
|
43
|
+
if (!c_context) {
|
44
|
+
rb_raise(rb_eRuntimeError, "failed to create xml sax parser context");
|
45
|
+
}
|
65
46
|
|
66
|
-
|
67
|
-
|
47
|
+
noko_xml_sax_parser_context_set_encoding(c_context, rb_encoding);
|
48
|
+
|
49
|
+
if (c_context->sax) {
|
50
|
+
xmlFree(c_context->sax);
|
51
|
+
c_context->sax = NULL;
|
68
52
|
}
|
69
53
|
|
70
|
-
|
71
|
-
return Qnil;
|
54
|
+
return noko_xml_sax_parser_context_wrap(rb_class, c_context);
|
72
55
|
}
|
73
56
|
|
74
57
|
static VALUE
|
75
|
-
|
58
|
+
noko_html4_sax_parser_context__parse_with(VALUE rb_context, VALUE rb_sax_parser)
|
76
59
|
{
|
77
60
|
htmlParserCtxtPtr ctxt;
|
78
61
|
htmlSAXHandlerPtr sax;
|
79
62
|
|
80
|
-
if (!rb_obj_is_kind_of(
|
63
|
+
if (!rb_obj_is_kind_of(rb_sax_parser, cNokogiriXmlSaxParser)) {
|
81
64
|
rb_raise(rb_eArgError, "argument must be a Nokogiri::XML::SAX::Parser");
|
82
65
|
}
|
83
66
|
|
84
|
-
ctxt = noko_xml_sax_parser_context_unwrap(
|
85
|
-
sax =
|
67
|
+
ctxt = noko_xml_sax_parser_context_unwrap(rb_context);
|
68
|
+
sax = noko_xml_sax_parser_unwrap(rb_sax_parser);
|
86
69
|
|
87
70
|
ctxt->sax = sax;
|
88
|
-
ctxt->userData =
|
71
|
+
ctxt->userData = ctxt; /* so we can use libxml2/SAX2.c handlers if we want to */
|
72
|
+
ctxt->_private = (void *)rb_sax_parser;
|
89
73
|
|
90
74
|
xmlSetStructuredErrorFunc(NULL, NULL);
|
91
75
|
|
92
|
-
|
76
|
+
/* although we're calling back into Ruby here, we don't need to worry about exceptions, because we
|
77
|
+
* don't have any cleanup to do. The only memory we need to free is handled by
|
78
|
+
* xml_sax_parser_context_type_free */
|
79
|
+
htmlParseDocument(ctxt);
|
93
80
|
|
94
|
-
return
|
81
|
+
return Qnil;
|
95
82
|
}
|
96
83
|
|
97
84
|
void
|
@@ -101,8 +88,11 @@ noko_init_html_sax_parser_context(void)
|
|
101
88
|
cNokogiriHtml4SaxParserContext = rb_define_class_under(mNokogiriHtml4Sax, "ParserContext",
|
102
89
|
cNokogiriXmlSaxParserContext);
|
103
90
|
|
104
|
-
rb_define_singleton_method(cNokogiriHtml4SaxParserContext, "
|
105
|
-
|
91
|
+
rb_define_singleton_method(cNokogiriHtml4SaxParserContext, "native_memory",
|
92
|
+
noko_html4_sax_parser_context_s_native_memory, 2);
|
93
|
+
rb_define_singleton_method(cNokogiriHtml4SaxParserContext, "native_file",
|
94
|
+
noko_html4_sax_parser_context_s_native_file, 2);
|
106
95
|
|
107
|
-
rb_define_method(cNokogiriHtml4SaxParserContext, "parse_with",
|
96
|
+
rb_define_method(cNokogiriHtml4SaxParserContext, "parse_with",
|
97
|
+
noko_html4_sax_parser_context__parse_with, 1);
|
108
98
|
}
|