nokogiri 1.16.7 → 1.18.1
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +14 -22
- data/LICENSE-DEPENDENCIES.md +6 -6
- data/README.md +8 -5
- data/dependencies.yml +6 -6
- data/ext/nokogiri/extconf.rb +188 -142
- data/ext/nokogiri/gumbo.c +69 -53
- data/ext/nokogiri/html4_document.c +10 -4
- data/ext/nokogiri/html4_element_description.c +18 -18
- data/ext/nokogiri/html4_sax_parser.c +40 -0
- data/ext/nokogiri/html4_sax_parser_context.c +48 -58
- data/ext/nokogiri/html4_sax_push_parser.c +25 -24
- data/ext/nokogiri/libxml2_polyfill.c +114 -0
- data/ext/nokogiri/nokogiri.c +9 -2
- data/ext/nokogiri/nokogiri.h +18 -33
- data/ext/nokogiri/xml_attr.c +1 -1
- data/ext/nokogiri/xml_cdata.c +2 -10
- data/ext/nokogiri/xml_comment.c +3 -8
- data/ext/nokogiri/xml_document.c +163 -156
- data/ext/nokogiri/xml_document_fragment.c +10 -25
- data/ext/nokogiri/xml_dtd.c +1 -1
- data/ext/nokogiri/xml_element_content.c +9 -9
- data/ext/nokogiri/xml_encoding_handler.c +4 -4
- data/ext/nokogiri/xml_namespace.c +6 -6
- data/ext/nokogiri/xml_node.c +141 -104
- data/ext/nokogiri/xml_node_set.c +46 -44
- data/ext/nokogiri/xml_reader.c +54 -58
- data/ext/nokogiri/xml_relax_ng.c +35 -56
- data/ext/nokogiri/xml_sax_parser.c +156 -88
- data/ext/nokogiri/xml_sax_parser_context.c +219 -131
- data/ext/nokogiri/xml_sax_push_parser.c +68 -49
- data/ext/nokogiri/xml_schema.c +50 -85
- data/ext/nokogiri/xml_syntax_error.c +19 -11
- data/ext/nokogiri/xml_text.c +2 -4
- data/ext/nokogiri/xml_xpath_context.c +103 -100
- data/ext/nokogiri/xslt_stylesheet.c +8 -8
- data/gumbo-parser/src/ascii.c +2 -2
- data/gumbo-parser/src/error.c +76 -48
- data/gumbo-parser/src/error.h +5 -1
- data/gumbo-parser/src/nokogiri_gumbo.h +11 -2
- data/gumbo-parser/src/parser.c +63 -25
- data/gumbo-parser/src/tokenizer.c +6 -6
- data/lib/nokogiri/class_resolver.rb +1 -1
- data/lib/nokogiri/css/node.rb +6 -2
- data/lib/nokogiri/css/parser.rb +6 -4
- data/lib/nokogiri/css/parser.y +2 -2
- data/lib/nokogiri/css/parser_extras.rb +6 -66
- data/lib/nokogiri/css/selector_cache.rb +38 -0
- data/lib/nokogiri/css/tokenizer.rb +4 -4
- data/lib/nokogiri/css/tokenizer.rex +9 -8
- data/lib/nokogiri/css/xpath_visitor.rb +42 -6
- data/lib/nokogiri/css.rb +86 -20
- data/lib/nokogiri/decorators/slop.rb +3 -5
- data/lib/nokogiri/encoding_handler.rb +2 -2
- data/lib/nokogiri/html4/document.rb +44 -23
- data/lib/nokogiri/html4/document_fragment.rb +124 -12
- data/lib/nokogiri/html4/encoding_reader.rb +1 -1
- data/lib/nokogiri/html4/sax/parser.rb +23 -38
- data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
- data/lib/nokogiri/html4.rb +9 -14
- data/lib/nokogiri/html5/builder.rb +40 -0
- data/lib/nokogiri/html5/document.rb +61 -30
- data/lib/nokogiri/html5/document_fragment.rb +130 -20
- data/lib/nokogiri/html5/node.rb +4 -4
- data/lib/nokogiri/html5.rb +114 -72
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/xml/builder.rb +8 -1
- data/lib/nokogiri/xml/document.rb +70 -26
- data/lib/nokogiri/xml/document_fragment.rb +84 -13
- data/lib/nokogiri/xml/node.rb +82 -11
- data/lib/nokogiri/xml/node_set.rb +9 -7
- data/lib/nokogiri/xml/parse_options.rb +1 -1
- data/lib/nokogiri/xml/pp/node.rb +6 -1
- data/lib/nokogiri/xml/reader.rb +46 -13
- data/lib/nokogiri/xml/relax_ng.rb +57 -20
- data/lib/nokogiri/xml/sax/document.rb +174 -83
- data/lib/nokogiri/xml/sax/parser.rb +115 -41
- data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
- data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
- data/lib/nokogiri/xml/sax.rb +48 -0
- data/lib/nokogiri/xml/schema.rb +112 -45
- data/lib/nokogiri/xml/searchable.rb +38 -42
- data/lib/nokogiri/xml/syntax_error.rb +22 -0
- data/lib/nokogiri/xml/xpath_context.rb +14 -3
- data/lib/nokogiri/xml.rb +13 -24
- data/lib/nokogiri/xslt.rb +3 -9
- data/lib/xsd/xmlparser/nokogiri.rb +3 -4
- data/patches/libxml2/0019-xpath-Use-separate-static-hash-table-for-standard-fu.patch +244 -0
- data/ports/archives/libxml2-2.13.5.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
- metadata +13 -14
- data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
- data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +0 -25
- data/ports/archives/libxml2-2.12.9.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.39.tar.xz +0 -0
data/ext/nokogiri/gumbo.c
CHANGED
@@ -37,30 +37,6 @@ VALUE cNokogiriHtml5Document;
|
|
37
37
|
static ID internal_subset;
|
38
38
|
static ID parent;
|
39
39
|
|
40
|
-
/* Backwards compatibility to Ruby 2.1.0 */
|
41
|
-
#if RUBY_API_VERSION_CODE < 20200
|
42
|
-
#define ONIG_ESCAPE_UCHAR_COLLISION 1
|
43
|
-
#include <ruby/encoding.h>
|
44
|
-
|
45
|
-
static VALUE
|
46
|
-
rb_utf8_str_new(const char *str, long length)
|
47
|
-
{
|
48
|
-
return rb_enc_str_new(str, length, rb_utf8_encoding());
|
49
|
-
}
|
50
|
-
|
51
|
-
static VALUE
|
52
|
-
rb_utf8_str_new_cstr(const char *str)
|
53
|
-
{
|
54
|
-
return rb_enc_str_new_cstr(str, rb_utf8_encoding());
|
55
|
-
}
|
56
|
-
|
57
|
-
static VALUE
|
58
|
-
rb_utf8_str_new_static(const char *str, long length)
|
59
|
-
{
|
60
|
-
return rb_enc_str_new(str, length, rb_utf8_encoding());
|
61
|
-
}
|
62
|
-
#endif
|
63
|
-
|
64
40
|
#include <nokogiri.h>
|
65
41
|
#include <libxml/tree.h>
|
66
42
|
#include <libxml/HTMLtree.h>
|
@@ -94,7 +70,7 @@ perform_parse(const GumboOptions *options, VALUE input)
|
|
94
70
|
GumboOutput *output = gumbo_parse_with_options(
|
95
71
|
options,
|
96
72
|
RSTRING_PTR(input),
|
97
|
-
RSTRING_LEN(input)
|
73
|
+
(size_t)RSTRING_LEN(input)
|
98
74
|
);
|
99
75
|
|
100
76
|
const char *status_string = gumbo_status_to_string(output->status);
|
@@ -260,7 +236,7 @@ static void
|
|
260
236
|
add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)
|
261
237
|
{
|
262
238
|
const char *input_str = RSTRING_PTR(input);
|
263
|
-
size_t input_len = RSTRING_LEN(input);
|
239
|
+
size_t input_len = (size_t)RSTRING_LEN(input);
|
264
240
|
|
265
241
|
// Add parse errors to rdoc.
|
266
242
|
if (output->errors.length) {
|
@@ -272,11 +248,11 @@ add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)
|
|
272
248
|
GumboSourcePosition position = gumbo_error_position(err);
|
273
249
|
char *msg;
|
274
250
|
size_t size = gumbo_caret_diagnostic_to_string(err, input_str, input_len, &msg);
|
275
|
-
VALUE err_str = rb_utf8_str_new(msg, size);
|
251
|
+
VALUE err_str = rb_utf8_str_new(msg, (int)size);
|
276
252
|
free(msg);
|
277
253
|
VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);
|
278
254
|
const char *error_code = gumbo_error_code(err);
|
279
|
-
VALUE str1 = error_code ? rb_utf8_str_new_static(error_code, strlen(error_code)) : Qnil;
|
255
|
+
VALUE str1 = error_code ? rb_utf8_str_new_static(error_code, (int)strlen(error_code)) : Qnil;
|
280
256
|
rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
|
281
257
|
rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
|
282
258
|
rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
|
@@ -316,18 +292,58 @@ parse_cleanup(VALUE parse_args)
|
|
316
292
|
return Qnil;
|
317
293
|
}
|
318
294
|
|
295
|
+
// Scan the keyword arguments for options common to the document and fragment
|
296
|
+
// parse.
|
297
|
+
static GumboOptions
|
298
|
+
common_options(VALUE kwargs)
|
299
|
+
{
|
300
|
+
// The order of the keywords determines the order of the values below.
|
301
|
+
// If this order is changed, then setting the options below must change as
|
302
|
+
// well.
|
303
|
+
ID keywords[] = {
|
304
|
+
// Required keywords.
|
305
|
+
rb_intern_const("max_attributes"),
|
306
|
+
rb_intern_const("max_errors"),
|
307
|
+
rb_intern_const("max_tree_depth"),
|
308
|
+
|
309
|
+
// Optional keywords.
|
310
|
+
rb_intern_const("parse_noscript_content_as_text"),
|
311
|
+
};
|
312
|
+
VALUE values[sizeof keywords / sizeof keywords[0]];
|
313
|
+
|
314
|
+
// Extract the values corresponding to the required keywords. Raise an error
|
315
|
+
// if required arguments are missing.
|
316
|
+
rb_get_kwargs(kwargs, keywords, 3, 1, values);
|
317
|
+
|
318
|
+
GumboOptions options = kGumboDefaultOptions;
|
319
|
+
options.max_attributes = NUM2INT(values[0]);
|
320
|
+
options.max_errors = NUM2INT(values[1]);
|
321
|
+
|
322
|
+
// handle negative values
|
323
|
+
int depth = NUM2INT(values[2]);
|
324
|
+
options.max_tree_depth = depth < 0 ? UINT_MAX : (unsigned int)depth;
|
325
|
+
|
326
|
+
options.parse_noscript_content_as_text = values[3] != Qundef && RTEST(values[3]);
|
327
|
+
|
328
|
+
return options;
|
329
|
+
}
|
330
|
+
|
319
331
|
static VALUE parse_continue(VALUE parse_args);
|
320
332
|
|
321
333
|
/*
|
322
334
|
* @!visibility protected
|
323
335
|
*/
|
324
336
|
static VALUE
|
325
|
-
|
337
|
+
noko_gumbo_s_parse(int argc, VALUE *argv, VALUE _self)
|
326
338
|
{
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
339
|
+
VALUE input, url, klass, kwargs;
|
340
|
+
|
341
|
+
rb_scan_args(argc, argv, "3:", &input, &url, &klass, &kwargs);
|
342
|
+
if (NIL_P(kwargs)) {
|
343
|
+
kwargs = rb_hash_new();
|
344
|
+
}
|
345
|
+
|
346
|
+
GumboOptions options = common_options(kwargs);
|
331
347
|
|
332
348
|
GumboOutput *output = perform_parse(&options, input);
|
333
349
|
ParseArgs args = {
|
@@ -383,7 +399,7 @@ lookup_namespace(VALUE node, bool require_known_ns)
|
|
383
399
|
Check_Type(ns, T_STRING);
|
384
400
|
|
385
401
|
const char *href_ptr = RSTRING_PTR(ns);
|
386
|
-
size_t href_len = RSTRING_LEN(ns);
|
402
|
+
size_t href_len = (size_t)RSTRING_LEN(ns);
|
387
403
|
#define NAMESPACE_P(uri) (href_len == sizeof uri - 1 && !memcmp(href_ptr, uri, href_len))
|
388
404
|
if (NAMESPACE_P("http://www.w3.org/1999/xhtml")) {
|
389
405
|
return GUMBO_NAMESPACE_HTML;
|
@@ -415,16 +431,12 @@ static VALUE fragment_continue(VALUE parse_args);
|
|
415
431
|
* @!visibility protected
|
416
432
|
*/
|
417
433
|
static VALUE
|
418
|
-
|
419
|
-
VALUE self,
|
420
|
-
VALUE doc_fragment,
|
421
|
-
VALUE tags,
|
422
|
-
VALUE ctx,
|
423
|
-
VALUE max_attributes,
|
424
|
-
VALUE max_errors,
|
425
|
-
VALUE max_depth
|
426
|
-
)
|
434
|
+
noko_gumbo_s_fragment(int argc, VALUE *argv, VALUE _self)
|
427
435
|
{
|
436
|
+
VALUE doc_fragment;
|
437
|
+
VALUE tags;
|
438
|
+
VALUE ctx;
|
439
|
+
VALUE kwargs;
|
428
440
|
ID name = rb_intern_const("name");
|
429
441
|
const char *ctx_tag;
|
430
442
|
GumboNamespaceEnum ctx_ns;
|
@@ -432,13 +444,20 @@ fragment(
|
|
432
444
|
bool form = false;
|
433
445
|
const char *encoding = NULL;
|
434
446
|
|
447
|
+
rb_scan_args(argc, argv, "3:", &doc_fragment, &tags, &ctx, &kwargs);
|
448
|
+
if (NIL_P(kwargs)) {
|
449
|
+
kwargs = rb_hash_new();
|
450
|
+
}
|
451
|
+
|
452
|
+
GumboOptions options = common_options(kwargs);
|
453
|
+
|
435
454
|
if (NIL_P(ctx)) {
|
436
455
|
ctx_tag = "body";
|
437
456
|
ctx_ns = GUMBO_NAMESPACE_HTML;
|
438
457
|
} else if (TYPE(ctx) == T_STRING) {
|
439
458
|
ctx_tag = StringValueCStr(ctx);
|
440
459
|
ctx_ns = GUMBO_NAMESPACE_HTML;
|
441
|
-
size_t len = RSTRING_LEN(ctx);
|
460
|
+
size_t len = (size_t)RSTRING_LEN(ctx);
|
442
461
|
const char *colon = memchr(ctx_tag, ':', len);
|
443
462
|
if (colon) {
|
444
463
|
switch (colon - ctx_tag) {
|
@@ -519,7 +538,7 @@ error:
|
|
519
538
|
VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
|
520
539
|
VALUE dtd = rb_funcall(doc, internal_subset, 0);
|
521
540
|
VALUE doc_quirks_mode = rb_iv_get(doc, "@quirks_mode");
|
522
|
-
if (NIL_P(ctx) || NIL_P(doc_quirks_mode)) {
|
541
|
+
if (NIL_P(ctx) || (TYPE(ctx) == T_STRING) || NIL_P(doc_quirks_mode)) {
|
523
542
|
quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
|
524
543
|
} else if (NIL_P(dtd)) {
|
525
544
|
quirks_mode = GUMBO_DOCTYPE_QUIRKS;
|
@@ -535,18 +554,15 @@ error:
|
|
535
554
|
}
|
536
555
|
|
537
556
|
// Perform a fragment parse.
|
538
|
-
int depth = NUM2INT(max_depth);
|
539
|
-
GumboOptions options = kGumboDefaultOptions;
|
540
|
-
options.max_attributes = NUM2INT(max_attributes);
|
541
|
-
options.max_errors = NUM2INT(max_errors);
|
542
|
-
// Add one to account for the HTML element.
|
543
|
-
options.max_tree_depth = depth < 0 ? -1 : (depth + 1);
|
544
557
|
options.fragment_context = ctx_tag;
|
545
558
|
options.fragment_namespace = ctx_ns;
|
546
559
|
options.fragment_encoding = encoding;
|
547
560
|
options.quirks_mode = quirks_mode;
|
548
561
|
options.fragment_context_has_form_ancestor = form;
|
549
562
|
|
563
|
+
// Add one to the max tree depth to account for the HTML element.
|
564
|
+
if (options.max_tree_depth < UINT_MAX) { options.max_tree_depth++; }
|
565
|
+
|
550
566
|
GumboOutput *output = perform_parse(&options, tags);
|
551
567
|
ParseArgs args = {
|
552
568
|
.output = output,
|
@@ -587,8 +603,8 @@ noko_init_gumbo(void)
|
|
587
603
|
parent = rb_intern_const("parent");
|
588
604
|
|
589
605
|
// Define Nokogumbo module with parse and fragment methods.
|
590
|
-
rb_define_singleton_method(mNokogiriGumbo, "parse",
|
591
|
-
rb_define_singleton_method(mNokogiriGumbo, "fragment",
|
606
|
+
rb_define_singleton_method(mNokogiriGumbo, "parse", noko_gumbo_s_parse, -1);
|
607
|
+
rb_define_singleton_method(mNokogiriGumbo, "fragment", noko_gumbo_s_fragment, -1);
|
592
608
|
}
|
593
609
|
|
594
610
|
// vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
|
@@ -7,9 +7,9 @@ static ID id_to_s;
|
|
7
7
|
|
8
8
|
/*
|
9
9
|
* call-seq:
|
10
|
-
* new
|
10
|
+
* new(uri=nil, external_id=nil) → HTML4::Document
|
11
11
|
*
|
12
|
-
* Create a new document
|
12
|
+
* Create a new empty document with base URI +uri+ and external ID +external_id+.
|
13
13
|
*/
|
14
14
|
static VALUE
|
15
15
|
rb_html_document_s_new(int argc, VALUE *argv, VALUE klass)
|
@@ -46,7 +46,7 @@ rb_html_document_s_read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_enco
|
|
46
46
|
const char *c_encoding = NIL_P(rb_encoding) ? NULL : StringValueCStr(rb_encoding);
|
47
47
|
int options = NUM2INT(rb_options);
|
48
48
|
|
49
|
-
xmlSetStructuredErrorFunc((void *)rb_error_list,
|
49
|
+
xmlSetStructuredErrorFunc((void *)rb_error_list, noko__error_array_pusher);
|
50
50
|
|
51
51
|
c_doc = htmlReadIO(noko_io_read, noko_io_close, (void *)rb_io, c_url, c_encoding, options);
|
52
52
|
|
@@ -106,7 +106,7 @@ rb_html_document_s_read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE r
|
|
106
106
|
int html_len = (int)RSTRING_LEN(rb_html);
|
107
107
|
int options = NUM2INT(rb_options);
|
108
108
|
|
109
|
-
xmlSetStructuredErrorFunc((void *)rb_error_list,
|
109
|
+
xmlSetStructuredErrorFunc((void *)rb_error_list, noko__error_array_pusher);
|
110
110
|
|
111
111
|
c_doc = htmlReadMemory(c_buffer, html_len, c_url, c_encoding, options);
|
112
112
|
|
@@ -151,6 +151,12 @@ rb_html_document_type(VALUE self)
|
|
151
151
|
void
|
152
152
|
noko_init_html_document(void)
|
153
153
|
{
|
154
|
+
/* this is here so that rdoc doesn't ignore this file. */
|
155
|
+
/*
|
156
|
+
mNokogiri = rb_define_module("Nokogiri");
|
157
|
+
mNokogiriHtml4 = rb_define_module_under(mNokogiri, "HTML4");
|
158
|
+
*/
|
159
|
+
|
154
160
|
assert(cNokogiriXmlDocument);
|
155
161
|
cNokogiriHtml4Document = rb_define_class_under(mNokogiriHtml4, "Document", cNokogiriXmlDocument);
|
156
162
|
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#include <nokogiri.h>
|
2
2
|
|
3
|
-
static const rb_data_type_t
|
4
|
-
.wrap_struct_name = "
|
3
|
+
static const rb_data_type_t html_elem_desc_type = {
|
4
|
+
.wrap_struct_name = "htmlElemDesc",
|
5
5
|
.flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED,
|
6
6
|
};
|
7
7
|
|
@@ -20,7 +20,7 @@ required_attributes(VALUE self)
|
|
20
20
|
VALUE list;
|
21
21
|
int i;
|
22
22
|
|
23
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
23
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
24
24
|
|
25
25
|
list = rb_ary_new();
|
26
26
|
|
@@ -46,7 +46,7 @@ deprecated_attributes(VALUE self)
|
|
46
46
|
VALUE list;
|
47
47
|
int i;
|
48
48
|
|
49
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
49
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
50
50
|
|
51
51
|
list = rb_ary_new();
|
52
52
|
|
@@ -72,7 +72,7 @@ optional_attributes(VALUE self)
|
|
72
72
|
VALUE list;
|
73
73
|
int i;
|
74
74
|
|
75
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
75
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
76
76
|
|
77
77
|
list = rb_ary_new();
|
78
78
|
|
@@ -95,7 +95,7 @@ static VALUE
|
|
95
95
|
default_sub_element(VALUE self)
|
96
96
|
{
|
97
97
|
const htmlElemDesc *description;
|
98
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
98
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
99
99
|
|
100
100
|
if (description->defaultsubelt) {
|
101
101
|
return NOKOGIRI_STR_NEW2(description->defaultsubelt);
|
@@ -117,7 +117,7 @@ sub_elements(VALUE self)
|
|
117
117
|
VALUE list;
|
118
118
|
int i;
|
119
119
|
|
120
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
120
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
121
121
|
|
122
122
|
list = rb_ary_new();
|
123
123
|
|
@@ -140,7 +140,7 @@ static VALUE
|
|
140
140
|
description(VALUE self)
|
141
141
|
{
|
142
142
|
const htmlElemDesc *description;
|
143
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
143
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
144
144
|
|
145
145
|
return NOKOGIRI_STR_NEW2(description->desc);
|
146
146
|
}
|
@@ -155,7 +155,7 @@ static VALUE
|
|
155
155
|
inline_eh(VALUE self)
|
156
156
|
{
|
157
157
|
const htmlElemDesc *description;
|
158
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
158
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
159
159
|
|
160
160
|
if (description->isinline) { return Qtrue; }
|
161
161
|
return Qfalse;
|
@@ -171,7 +171,7 @@ static VALUE
|
|
171
171
|
deprecated_eh(VALUE self)
|
172
172
|
{
|
173
173
|
const htmlElemDesc *description;
|
174
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
174
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
175
175
|
|
176
176
|
if (description->depr) { return Qtrue; }
|
177
177
|
return Qfalse;
|
@@ -187,7 +187,7 @@ static VALUE
|
|
187
187
|
empty_eh(VALUE self)
|
188
188
|
{
|
189
189
|
const htmlElemDesc *description;
|
190
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
190
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
191
191
|
|
192
192
|
if (description->empty) { return Qtrue; }
|
193
193
|
return Qfalse;
|
@@ -203,7 +203,7 @@ static VALUE
|
|
203
203
|
save_end_tag_eh(VALUE self)
|
204
204
|
{
|
205
205
|
const htmlElemDesc *description;
|
206
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
206
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
207
207
|
|
208
208
|
if (description->saveEndTag) { return Qtrue; }
|
209
209
|
return Qfalse;
|
@@ -219,7 +219,7 @@ static VALUE
|
|
219
219
|
implied_end_tag_eh(VALUE self)
|
220
220
|
{
|
221
221
|
const htmlElemDesc *description;
|
222
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
222
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
223
223
|
|
224
224
|
if (description->endTag) { return Qtrue; }
|
225
225
|
return Qfalse;
|
@@ -235,7 +235,7 @@ static VALUE
|
|
235
235
|
implied_start_tag_eh(VALUE self)
|
236
236
|
{
|
237
237
|
const htmlElemDesc *description;
|
238
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
238
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
239
239
|
|
240
240
|
if (description->startTag) { return Qtrue; }
|
241
241
|
return Qfalse;
|
@@ -245,13 +245,13 @@ implied_start_tag_eh(VALUE self)
|
|
245
245
|
* call-seq:
|
246
246
|
* name
|
247
247
|
*
|
248
|
-
* Get the tag name for this
|
248
|
+
* Get the tag name for this ElementDescription
|
249
249
|
*/
|
250
250
|
static VALUE
|
251
251
|
name(VALUE self)
|
252
252
|
{
|
253
253
|
const htmlElemDesc *description;
|
254
|
-
TypedData_Get_Struct(self, htmlElemDesc, &
|
254
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
|
255
255
|
|
256
256
|
if (NULL == description->name) { return Qnil; }
|
257
257
|
return NOKOGIRI_STR_NEW2(description->name);
|
@@ -261,7 +261,7 @@ name(VALUE self)
|
|
261
261
|
* call-seq:
|
262
262
|
* [](tag_name)
|
263
263
|
*
|
264
|
-
* Get
|
264
|
+
* Get ElementDescription for +tag_name+
|
265
265
|
*/
|
266
266
|
static VALUE
|
267
267
|
get_description(VALUE klass, VALUE tag_name)
|
@@ -271,7 +271,7 @@ get_description(VALUE klass, VALUE tag_name)
|
|
271
271
|
);
|
272
272
|
|
273
273
|
if (NULL == description) { return Qnil; }
|
274
|
-
return TypedData_Wrap_Struct(klass, &
|
274
|
+
return TypedData_Wrap_Struct(klass, &html_elem_desc_type, DISCARD_CONST_QUAL(void *, description));
|
275
275
|
}
|
276
276
|
|
277
277
|
void
|
@@ -0,0 +1,40 @@
|
|
1
|
+
#include <nokogiri.h>
|
2
|
+
|
3
|
+
VALUE cNokogiriHtml4SaxParser;
|
4
|
+
|
5
|
+
static ID id_start_document;
|
6
|
+
|
7
|
+
static void
|
8
|
+
noko_html4_sax_parser_start_document(void *ctx)
|
9
|
+
{
|
10
|
+
xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr)ctx;
|
11
|
+
VALUE self = (VALUE)ctxt->_private;
|
12
|
+
VALUE doc = rb_iv_get(self, "@document");
|
13
|
+
|
14
|
+
xmlSAX2StartDocument(ctx);
|
15
|
+
|
16
|
+
rb_funcall(doc, id_start_document, 0);
|
17
|
+
}
|
18
|
+
|
19
|
+
static VALUE
|
20
|
+
noko_html4_sax_parser_initialize(VALUE self)
|
21
|
+
{
|
22
|
+
xmlSAXHandlerPtr handler = noko_xml_sax_parser_unwrap(self);
|
23
|
+
|
24
|
+
rb_call_super(0, NULL);
|
25
|
+
|
26
|
+
handler->startDocument = noko_html4_sax_parser_start_document;
|
27
|
+
|
28
|
+
return self;
|
29
|
+
}
|
30
|
+
|
31
|
+
void
|
32
|
+
noko_init_html4_sax_parser(void)
|
33
|
+
{
|
34
|
+
cNokogiriHtml4SaxParser = rb_define_class_under(mNokogiriHtml4Sax, "Parser", cNokogiriXmlSaxParser);
|
35
|
+
|
36
|
+
rb_define_private_method(cNokogiriHtml4SaxParser, "initialize_native",
|
37
|
+
noko_html4_sax_parser_initialize, 0);
|
38
|
+
|
39
|
+
id_start_document = rb_intern("start_document");
|
40
|
+
}
|
@@ -2,96 +2,83 @@
|
|
2
2
|
|
3
3
|
VALUE cNokogiriHtml4SaxParserContext ;
|
4
4
|
|
5
|
+
/* :nodoc: */
|
5
6
|
static VALUE
|
6
|
-
|
7
|
+
noko_html4_sax_parser_context_s_native_memory(VALUE rb_class, VALUE rb_input, VALUE rb_encoding)
|
7
8
|
{
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
if (!(int)RSTRING_LEN(data)) {
|
13
|
-
rb_raise(rb_eRuntimeError, "data cannot be empty");
|
9
|
+
Check_Type(rb_input, T_STRING);
|
10
|
+
if (!(int)RSTRING_LEN(rb_input)) {
|
11
|
+
rb_raise(rb_eRuntimeError, "input string cannot be empty");
|
14
12
|
}
|
15
13
|
|
16
|
-
|
17
|
-
|
18
|
-
if (ctxt->sax) {
|
19
|
-
xmlFree(ctxt->sax);
|
20
|
-
ctxt->sax = NULL;
|
14
|
+
if (!NIL_P(rb_encoding) && !rb_obj_is_kind_of(rb_encoding, rb_cEncoding)) {
|
15
|
+
rb_raise(rb_eTypeError, "argument must be an Encoding object");
|
21
16
|
}
|
22
17
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
|
28
|
-
rb_raise(rb_eRuntimeError, "Unsupported encoding %s",
|
29
|
-
StringValueCStr(encoding));
|
30
|
-
}
|
31
|
-
}
|
18
|
+
htmlParserCtxtPtr c_context =
|
19
|
+
htmlCreateMemoryParserCtxt(StringValuePtr(rb_input), (int)RSTRING_LEN(rb_input));
|
20
|
+
if (!c_context) {
|
21
|
+
rb_raise(rb_eRuntimeError, "failed to create xml sax parser context");
|
32
22
|
}
|
33
23
|
|
34
|
-
|
35
|
-
}
|
24
|
+
noko_xml_sax_parser_context_set_encoding(c_context, rb_encoding);
|
36
25
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
htmlParserCtxtPtr ctxt = htmlCreateFileParserCtxt(
|
41
|
-
StringValueCStr(filename),
|
42
|
-
StringValueCStr(encoding)
|
43
|
-
);
|
44
|
-
|
45
|
-
if (ctxt->sax) {
|
46
|
-
xmlFree(ctxt->sax);
|
47
|
-
ctxt->sax = NULL;
|
26
|
+
if (c_context->sax) {
|
27
|
+
xmlFree(c_context->sax);
|
28
|
+
c_context->sax = NULL;
|
48
29
|
}
|
49
30
|
|
50
|
-
return noko_xml_sax_parser_context_wrap(
|
31
|
+
return noko_xml_sax_parser_context_wrap(rb_class, c_context);
|
51
32
|
}
|
52
33
|
|
34
|
+
/* :nodoc: */
|
53
35
|
static VALUE
|
54
|
-
|
36
|
+
noko_html4_sax_parser_context_s_native_file(VALUE rb_class, VALUE rb_filename, VALUE rb_encoding)
|
55
37
|
{
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
}
|
38
|
+
if (!NIL_P(rb_encoding) && !rb_obj_is_kind_of(rb_encoding, rb_cEncoding)) {
|
39
|
+
rb_raise(rb_eTypeError, "argument must be an Encoding object");
|
40
|
+
}
|
60
41
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
42
|
+
htmlParserCtxtPtr c_context = htmlCreateFileParserCtxt(StringValueCStr(rb_filename), NULL);
|
43
|
+
if (!c_context) {
|
44
|
+
rb_raise(rb_eRuntimeError, "failed to create xml sax parser context");
|
45
|
+
}
|
65
46
|
|
66
|
-
|
67
|
-
|
47
|
+
noko_xml_sax_parser_context_set_encoding(c_context, rb_encoding);
|
48
|
+
|
49
|
+
if (c_context->sax) {
|
50
|
+
xmlFree(c_context->sax);
|
51
|
+
c_context->sax = NULL;
|
68
52
|
}
|
69
53
|
|
70
|
-
|
71
|
-
return Qnil;
|
54
|
+
return noko_xml_sax_parser_context_wrap(rb_class, c_context);
|
72
55
|
}
|
73
56
|
|
74
57
|
static VALUE
|
75
|
-
|
58
|
+
noko_html4_sax_parser_context__parse_with(VALUE rb_context, VALUE rb_sax_parser)
|
76
59
|
{
|
77
60
|
htmlParserCtxtPtr ctxt;
|
78
61
|
htmlSAXHandlerPtr sax;
|
79
62
|
|
80
|
-
if (!rb_obj_is_kind_of(
|
63
|
+
if (!rb_obj_is_kind_of(rb_sax_parser, cNokogiriXmlSaxParser)) {
|
81
64
|
rb_raise(rb_eArgError, "argument must be a Nokogiri::XML::SAX::Parser");
|
82
65
|
}
|
83
66
|
|
84
|
-
ctxt = noko_xml_sax_parser_context_unwrap(
|
85
|
-
sax =
|
67
|
+
ctxt = noko_xml_sax_parser_context_unwrap(rb_context);
|
68
|
+
sax = noko_xml_sax_parser_unwrap(rb_sax_parser);
|
86
69
|
|
87
70
|
ctxt->sax = sax;
|
88
|
-
ctxt->userData =
|
71
|
+
ctxt->userData = ctxt; /* so we can use libxml2/SAX2.c handlers if we want to */
|
72
|
+
ctxt->_private = (void *)rb_sax_parser;
|
89
73
|
|
90
74
|
xmlSetStructuredErrorFunc(NULL, NULL);
|
91
75
|
|
92
|
-
|
76
|
+
/* although we're calling back into Ruby here, we don't need to worry about exceptions, because we
|
77
|
+
* don't have any cleanup to do. The only memory we need to free is handled by
|
78
|
+
* xml_sax_parser_context_type_free */
|
79
|
+
htmlParseDocument(ctxt);
|
93
80
|
|
94
|
-
return
|
81
|
+
return Qnil;
|
95
82
|
}
|
96
83
|
|
97
84
|
void
|
@@ -101,8 +88,11 @@ noko_init_html_sax_parser_context(void)
|
|
101
88
|
cNokogiriHtml4SaxParserContext = rb_define_class_under(mNokogiriHtml4Sax, "ParserContext",
|
102
89
|
cNokogiriXmlSaxParserContext);
|
103
90
|
|
104
|
-
rb_define_singleton_method(cNokogiriHtml4SaxParserContext, "
|
105
|
-
|
91
|
+
rb_define_singleton_method(cNokogiriHtml4SaxParserContext, "native_memory",
|
92
|
+
noko_html4_sax_parser_context_s_native_memory, 2);
|
93
|
+
rb_define_singleton_method(cNokogiriHtml4SaxParserContext, "native_file",
|
94
|
+
noko_html4_sax_parser_context_s_native_file, 2);
|
106
95
|
|
107
|
-
rb_define_method(cNokogiriHtml4SaxParserContext, "parse_with",
|
96
|
+
rb_define_method(cNokogiriHtml4SaxParserContext, "parse_with",
|
97
|
+
noko_html4_sax_parser_context__parse_with, 1);
|
108
98
|
}
|