nokogiri 1.13.8 → 1.15.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +40 -0
- data/LICENSE-DEPENDENCIES.md +830 -509
- data/LICENSE.md +1 -1
- data/README.md +18 -11
- data/dependencies.yml +33 -15
- data/ext/nokogiri/extconf.rb +164 -46
- data/ext/nokogiri/gumbo.c +20 -10
- data/ext/nokogiri/html4_document.c +3 -4
- data/ext/nokogiri/html4_element_description.c +20 -15
- data/ext/nokogiri/html4_entity_lookup.c +2 -2
- data/ext/nokogiri/html4_sax_parser_context.c +11 -22
- data/ext/nokogiri/html4_sax_push_parser.c +3 -3
- data/ext/nokogiri/nokogiri.c +84 -75
- data/ext/nokogiri/nokogiri.h +31 -16
- data/ext/nokogiri/test_global_handlers.c +1 -1
- data/ext/nokogiri/xml_attr.c +2 -2
- data/ext/nokogiri/xml_attribute_decl.c +2 -2
- data/ext/nokogiri/xml_cdata.c +32 -18
- data/ext/nokogiri/xml_comment.c +2 -2
- data/ext/nokogiri/xml_document.c +127 -34
- data/ext/nokogiri/xml_document_fragment.c +2 -2
- data/ext/nokogiri/xml_dtd.c +2 -2
- data/ext/nokogiri/xml_element_content.c +34 -31
- data/ext/nokogiri/xml_element_decl.c +7 -7
- data/ext/nokogiri/xml_encoding_handler.c +15 -7
- data/ext/nokogiri/xml_entity_decl.c +1 -1
- data/ext/nokogiri/xml_entity_reference.c +2 -2
- data/ext/nokogiri/xml_namespace.c +79 -14
- data/ext/nokogiri/xml_node.c +300 -34
- data/ext/nokogiri/xml_node_set.c +125 -107
- data/ext/nokogiri/xml_processing_instruction.c +2 -2
- data/ext/nokogiri/xml_reader.c +81 -48
- data/ext/nokogiri/xml_relax_ng.c +66 -81
- data/ext/nokogiri/xml_sax_parser.c +45 -20
- data/ext/nokogiri/xml_sax_parser_context.c +46 -30
- data/ext/nokogiri/xml_sax_push_parser.c +30 -11
- data/ext/nokogiri/xml_schema.c +95 -117
- data/ext/nokogiri/xml_syntax_error.c +1 -1
- data/ext/nokogiri/xml_text.c +28 -14
- data/ext/nokogiri/xml_xpath_context.c +216 -136
- data/ext/nokogiri/xslt_stylesheet.c +118 -64
- data/gumbo-parser/Makefile +10 -0
- data/gumbo-parser/src/attribute.h +1 -1
- data/gumbo-parser/src/error.c +10 -6
- data/gumbo-parser/src/error.h +1 -1
- data/gumbo-parser/src/foreign_attrs.c +15 -16
- data/gumbo-parser/src/foreign_attrs.gperf +1 -1
- data/gumbo-parser/src/{gumbo.h → nokogiri_gumbo.h} +1 -0
- data/gumbo-parser/src/parser.c +21 -5
- data/gumbo-parser/src/replacement.h +1 -1
- data/gumbo-parser/src/string_buffer.h +1 -1
- data/gumbo-parser/src/string_piece.c +1 -1
- data/gumbo-parser/src/svg_attrs.c +2 -2
- data/gumbo-parser/src/svg_tags.c +2 -2
- data/gumbo-parser/src/tag.c +2 -1
- data/gumbo-parser/src/tag_lookup.c +7 -7
- data/gumbo-parser/src/tag_lookup.gperf +1 -0
- data/gumbo-parser/src/tag_lookup.h +1 -1
- data/gumbo-parser/src/token_buffer.h +1 -1
- data/gumbo-parser/src/tokenizer.c +1 -1
- data/gumbo-parser/src/tokenizer.h +1 -1
- data/gumbo-parser/src/utf8.c +1 -1
- data/gumbo-parser/src/utf8.h +1 -1
- data/gumbo-parser/src/util.c +1 -3
- data/gumbo-parser/src/util.h +4 -0
- data/gumbo-parser/src/vector.h +1 -1
- data/lib/nokogiri/css/node.rb +2 -2
- data/lib/nokogiri/css/xpath_visitor.rb +7 -5
- data/lib/nokogiri/css.rb +6 -0
- data/lib/nokogiri/decorators/slop.rb +1 -1
- data/lib/nokogiri/encoding_handler.rb +57 -0
- data/lib/nokogiri/extension.rb +4 -3
- data/lib/nokogiri/html4/document.rb +2 -121
- data/lib/nokogiri/html4/document_fragment.rb +1 -1
- data/lib/nokogiri/html4/element_description_defaults.rb +1827 -365
- data/lib/nokogiri/html4/encoding_reader.rb +121 -0
- data/lib/nokogiri/html4.rb +1 -0
- data/lib/nokogiri/html5/document.rb +113 -36
- data/lib/nokogiri/html5/document_fragment.rb +10 -3
- data/lib/nokogiri/html5/node.rb +8 -5
- data/lib/nokogiri/html5.rb +130 -216
- data/lib/nokogiri/jruby/dependencies.rb +1 -19
- data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/version/info.rb +11 -10
- data/lib/nokogiri/xml/attr.rb +49 -0
- data/lib/nokogiri/xml/attribute_decl.rb +4 -2
- data/lib/nokogiri/xml/builder.rb +1 -1
- data/lib/nokogiri/xml/document.rb +102 -55
- data/lib/nokogiri/xml/document_fragment.rb +50 -7
- data/lib/nokogiri/xml/element_content.rb +10 -2
- data/lib/nokogiri/xml/element_decl.rb +4 -2
- data/lib/nokogiri/xml/entity_decl.rb +4 -2
- data/lib/nokogiri/xml/namespace.rb +42 -0
- data/lib/nokogiri/xml/node/save_options.rb +14 -4
- data/lib/nokogiri/xml/node.rb +212 -48
- data/lib/nokogiri/xml/node_set.rb +88 -9
- data/lib/nokogiri/xml/parse_options.rb +129 -50
- data/lib/nokogiri/xml/pp/node.rb +28 -15
- data/lib/nokogiri/xml/processing_instruction.rb +2 -1
- data/lib/nokogiri/xml/sax/document.rb +1 -1
- data/lib/nokogiri/xml/sax/parser.rb +2 -3
- data/lib/nokogiri/xml/searchable.rb +18 -10
- data/lib/nokogiri/xslt.rb +74 -4
- data/lib/nokogiri.rb +15 -15
- data/lib/xsd/xmlparser/nokogiri.rb +4 -2
- data/patches/libxml2/0010-update-config.guess-and-config.sub-for-libxml2.patch +224 -0
- data/patches/libxml2/0011-rip-out-libxml2-s-libc_single_threaded-support.patch +30 -0
- data/patches/libxslt/0001-update-config.guess-and-config.sub-for-libxslt.patch +224 -0
- data/ports/archives/libxml2-2.11.7.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.39.tar.xz +0 -0
- metadata +19 -242
- data/patches/libxml2/0004-use-glibc-strlen.patch +0 -53
- data/patches/libxml2/0005-avoid-isnan-isinf.patch +0 -81
- data/patches/libxml2/0006-update-automake-files-for-arm64.patch +0 -3040
- data/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch +0 -61
- data/patches/libxslt/0001-update-automake-files-for-arm64.patch +0 -3037
- data/ports/archives/libxml2-2.9.14.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.35.tar.xz +0 -0
data/ext/nokogiri/gumbo.c
CHANGED
@@ -23,13 +23,13 @@
|
|
23
23
|
//
|
24
24
|
// Processing starts by calling gumbo_parse_with_options. The resulting document tree
|
25
25
|
// is then walked, a parallel libxml2 tree is constructed, and the final document is
|
26
|
-
// then wrapped using
|
26
|
+
// then wrapped using noko_xml_document_wrap. This approach reduces memory and CPU
|
27
27
|
// requirements as Ruby objects are only built when necessary.
|
28
28
|
//
|
29
29
|
|
30
30
|
#include <nokogiri.h>
|
31
31
|
|
32
|
-
#include "gumbo.h"
|
32
|
+
#include "nokogiri_gumbo.h"
|
33
33
|
|
34
34
|
VALUE cNokogiriHtml5Document;
|
35
35
|
|
@@ -281,12 +281,12 @@ add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)
|
|
281
281
|
rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
|
282
282
|
rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
|
283
283
|
rb_iv_set(syntax_error, "@file", url);
|
284
|
-
rb_iv_set(syntax_error, "@line",
|
284
|
+
rb_iv_set(syntax_error, "@line", SIZET2NUM(position.line));
|
285
285
|
rb_iv_set(syntax_error, "@str1", str1);
|
286
286
|
rb_iv_set(syntax_error, "@str2", Qnil);
|
287
287
|
rb_iv_set(syntax_error, "@str3", Qnil);
|
288
288
|
rb_iv_set(syntax_error, "@int1", INT2NUM(0));
|
289
|
-
rb_iv_set(syntax_error, "@column",
|
289
|
+
rb_iv_set(syntax_error, "@column", SIZET2NUM(position.column));
|
290
290
|
rb_ary_push(rerrors, syntax_error);
|
291
291
|
}
|
292
292
|
rb_iv_set(rdoc, "@errors", rerrors);
|
@@ -297,6 +297,7 @@ typedef struct {
|
|
297
297
|
GumboOutput *output;
|
298
298
|
VALUE input;
|
299
299
|
VALUE url_or_frag;
|
300
|
+
VALUE klass;
|
300
301
|
xmlDocPtr doc;
|
301
302
|
} ParseArgs;
|
302
303
|
|
@@ -321,7 +322,7 @@ static VALUE parse_continue(VALUE parse_args);
|
|
321
322
|
* @!visibility protected
|
322
323
|
*/
|
323
324
|
static VALUE
|
324
|
-
parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth)
|
325
|
+
parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth, VALUE klass)
|
325
326
|
{
|
326
327
|
GumboOptions options = kGumboDefaultOptions;
|
327
328
|
options.max_attributes = NUM2INT(max_attributes);
|
@@ -333,6 +334,7 @@ parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors
|
|
333
334
|
.output = output,
|
334
335
|
.input = input,
|
335
336
|
.url_or_frag = url,
|
337
|
+
.klass = klass,
|
336
338
|
.doc = NULL,
|
337
339
|
};
|
338
340
|
|
@@ -357,7 +359,9 @@ parse_continue(VALUE parse_args)
|
|
357
359
|
}
|
358
360
|
args->doc = doc; // Make sure doc gets cleaned up if an error is thrown.
|
359
361
|
build_tree(doc, (xmlNodePtr)doc, output->document);
|
360
|
-
VALUE rdoc =
|
362
|
+
VALUE rdoc = noko_xml_document_wrap(args->klass, doc);
|
363
|
+
rb_iv_set(rdoc, "@url", args->url_or_frag);
|
364
|
+
rb_iv_set(rdoc, "@quirks_mode", INT2NUM(output->document->v.document.doc_type_quirks_mode));
|
361
365
|
args->doc = NULL; // The Ruby runtime now owns doc so don't delete it.
|
362
366
|
add_errors(output, rdoc, args->input, args->url_or_frag);
|
363
367
|
return rdoc;
|
@@ -498,9 +502,11 @@ error:
|
|
498
502
|
}
|
499
503
|
|
500
504
|
// Encoding.
|
501
|
-
if (
|
505
|
+
if (ctx_ns == GUMBO_NAMESPACE_MATHML
|
506
|
+
&& RSTRING_LEN(tag_name) == 14
|
502
507
|
&& !st_strcasecmp(ctx_tag, "annotation-xml")) {
|
503
508
|
VALUE enc = rb_funcall(ctx, rb_intern_const("[]"),
|
509
|
+
1,
|
504
510
|
rb_utf8_str_new_static("encoding", 8));
|
505
511
|
if (RTEST(enc)) {
|
506
512
|
Check_Type(enc, T_STRING);
|
@@ -512,8 +518,11 @@ error:
|
|
512
518
|
// Quirks mode.
|
513
519
|
VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
|
514
520
|
VALUE dtd = rb_funcall(doc, internal_subset, 0);
|
515
|
-
|
521
|
+
VALUE doc_quirks_mode = rb_iv_get(doc, "@quirks_mode");
|
522
|
+
if (NIL_P(ctx) || NIL_P(doc_quirks_mode)) {
|
516
523
|
quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
|
524
|
+
} else if (NIL_P(dtd)) {
|
525
|
+
quirks_mode = GUMBO_DOCTYPE_QUIRKS;
|
517
526
|
} else {
|
518
527
|
VALUE dtd_name = rb_funcall(dtd, name, 0);
|
519
528
|
VALUE pubid = rb_funcall(dtd, rb_intern_const("external_id"), 0);
|
@@ -560,13 +569,14 @@ fragment_continue(VALUE parse_args)
|
|
560
569
|
args->doc = NULL; // The Ruby runtime owns doc so make sure we don't delete it.
|
561
570
|
xmlNodePtr xml_frag = extract_xml_node(doc_fragment);
|
562
571
|
build_tree(xml_doc, xml_frag, output->root);
|
572
|
+
rb_iv_set(doc_fragment, "@quirks_mode", INT2NUM(output->document->v.document.doc_type_quirks_mode));
|
563
573
|
add_errors(output, doc_fragment, args->input, rb_utf8_str_new_static("#fragment", 9));
|
564
574
|
return Qnil;
|
565
575
|
}
|
566
576
|
|
567
577
|
// Initialize the Nokogumbo class and fetch constants we will use later.
|
568
578
|
void
|
569
|
-
noko_init_gumbo()
|
579
|
+
noko_init_gumbo(void)
|
570
580
|
{
|
571
581
|
// Class constants.
|
572
582
|
cNokogiriHtml5Document = rb_define_class_under(mNokogiriHtml5, "Document", cNokogiriHtml4Document);
|
@@ -577,7 +587,7 @@ noko_init_gumbo()
|
|
577
587
|
parent = rb_intern_const("parent");
|
578
588
|
|
579
589
|
// Define Nokogumbo module with parse and fragment methods.
|
580
|
-
rb_define_singleton_method(mNokogiriGumbo, "parse", parse, 5);
|
590
|
+
rb_define_singleton_method(mNokogiriGumbo, "parse", parse, 6);
|
581
591
|
rb_define_singleton_method(mNokogiriGumbo, "fragment", fragment, 6);
|
582
592
|
}
|
583
593
|
|
data/ext/nokogiri/html4_document.c
CHANGED
@@ -144,13 +144,12 @@ rb_html_document_s_read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE r
|
|
144
144
|
static VALUE
|
145
145
|
rb_html_document_type(VALUE self)
|
146
146
|
{
|
147
|
-
htmlDocPtr doc;
|
148
|
-
|
149
|
-
return INT2NUM((long)doc->type);
|
147
|
+
htmlDocPtr doc = noko_xml_document_unwrap(self);
|
148
|
+
return INT2NUM(doc->type);
|
150
149
|
}
|
151
150
|
|
152
151
|
void
|
153
|
-
noko_init_html_document()
|
152
|
+
noko_init_html_document(void)
|
154
153
|
{
|
155
154
|
assert(cNokogiriXmlDocument);
|
156
155
|
cNokogiriHtml4Document = rb_define_class_under(mNokogiriHtml4, "Document", cNokogiriXmlDocument);
|
data/ext/nokogiri/html4_element_description.c
CHANGED
@@ -1,5 +1,10 @@
|
|
1
1
|
#include <nokogiri.h>
|
2
2
|
|
3
|
+
static const rb_data_type_t html4_element_description_type = {
|
4
|
+
.wrap_struct_name = "Nokogiri::HTML4::ElementDescription",
|
5
|
+
.flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED,
|
6
|
+
};
|
7
|
+
|
3
8
|
VALUE cNokogiriHtml4ElementDescription ;
|
4
9
|
|
5
10
|
/*
|
@@ -15,7 +20,7 @@ required_attributes(VALUE self)
|
|
15
20
|
VALUE list;
|
16
21
|
int i;
|
17
22
|
|
18
|
-
|
23
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
|
19
24
|
|
20
25
|
list = rb_ary_new();
|
21
26
|
|
@@ -41,7 +46,7 @@ deprecated_attributes(VALUE self)
|
|
41
46
|
VALUE list;
|
42
47
|
int i;
|
43
48
|
|
44
|
-
|
49
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
|
45
50
|
|
46
51
|
list = rb_ary_new();
|
47
52
|
|
@@ -67,7 +72,7 @@ optional_attributes(VALUE self)
|
|
67
72
|
VALUE list;
|
68
73
|
int i;
|
69
74
|
|
70
|
-
|
75
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
|
71
76
|
|
72
77
|
list = rb_ary_new();
|
73
78
|
|
@@ -90,7 +95,7 @@ static VALUE
|
|
90
95
|
default_sub_element(VALUE self)
|
91
96
|
{
|
92
97
|
const htmlElemDesc *description;
|
93
|
-
|
98
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
|
94
99
|
|
95
100
|
if (description->defaultsubelt) {
|
96
101
|
return NOKOGIRI_STR_NEW2(description->defaultsubelt);
|
@@ -112,7 +117,7 @@ sub_elements(VALUE self)
|
|
112
117
|
VALUE list;
|
113
118
|
int i;
|
114
119
|
|
115
|
-
|
120
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
|
116
121
|
|
117
122
|
list = rb_ary_new();
|
118
123
|
|
@@ -135,7 +140,7 @@ static VALUE
|
|
135
140
|
description(VALUE self)
|
136
141
|
{
|
137
142
|
const htmlElemDesc *description;
|
138
|
-
|
143
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
|
139
144
|
|
140
145
|
return NOKOGIRI_STR_NEW2(description->desc);
|
141
146
|
}
|
@@ -150,7 +155,7 @@ static VALUE
|
|
150
155
|
inline_eh(VALUE self)
|
151
156
|
{
|
152
157
|
const htmlElemDesc *description;
|
153
|
-
|
158
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
|
154
159
|
|
155
160
|
if (description->isinline) { return Qtrue; }
|
156
161
|
return Qfalse;
|
@@ -166,7 +171,7 @@ static VALUE
|
|
166
171
|
deprecated_eh(VALUE self)
|
167
172
|
{
|
168
173
|
const htmlElemDesc *description;
|
169
|
-
|
174
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
|
170
175
|
|
171
176
|
if (description->depr) { return Qtrue; }
|
172
177
|
return Qfalse;
|
@@ -182,7 +187,7 @@ static VALUE
|
|
182
187
|
empty_eh(VALUE self)
|
183
188
|
{
|
184
189
|
const htmlElemDesc *description;
|
185
|
-
|
190
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
|
186
191
|
|
187
192
|
if (description->empty) { return Qtrue; }
|
188
193
|
return Qfalse;
|
@@ -198,7 +203,7 @@ static VALUE
|
|
198
203
|
save_end_tag_eh(VALUE self)
|
199
204
|
{
|
200
205
|
const htmlElemDesc *description;
|
201
|
-
|
206
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
|
202
207
|
|
203
208
|
if (description->saveEndTag) { return Qtrue; }
|
204
209
|
return Qfalse;
|
@@ -214,7 +219,7 @@ static VALUE
|
|
214
219
|
implied_end_tag_eh(VALUE self)
|
215
220
|
{
|
216
221
|
const htmlElemDesc *description;
|
217
|
-
|
222
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
|
218
223
|
|
219
224
|
if (description->endTag) { return Qtrue; }
|
220
225
|
return Qfalse;
|
@@ -230,7 +235,7 @@ static VALUE
|
|
230
235
|
implied_start_tag_eh(VALUE self)
|
231
236
|
{
|
232
237
|
const htmlElemDesc *description;
|
233
|
-
|
238
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
|
234
239
|
|
235
240
|
if (description->startTag) { return Qtrue; }
|
236
241
|
return Qfalse;
|
@@ -246,7 +251,7 @@ static VALUE
|
|
246
251
|
name(VALUE self)
|
247
252
|
{
|
248
253
|
const htmlElemDesc *description;
|
249
|
-
|
254
|
+
TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
|
250
255
|
|
251
256
|
if (NULL == description->name) { return Qnil; }
|
252
257
|
return NOKOGIRI_STR_NEW2(description->name);
|
@@ -266,11 +271,11 @@ get_description(VALUE klass, VALUE tag_name)
|
|
266
271
|
);
|
267
272
|
|
268
273
|
if (NULL == description) { return Qnil; }
|
269
|
-
return
|
274
|
+
return TypedData_Wrap_Struct(klass, &html4_element_description_type, DISCARD_CONST_QUAL(void *, description));
|
270
275
|
}
|
271
276
|
|
272
277
|
void
|
273
|
-
noko_init_html_element_description()
|
278
|
+
noko_init_html_element_description(void)
|
274
279
|
{
|
275
280
|
cNokogiriHtml4ElementDescription = rb_define_class_under(mNokogiriHtml4, "ElementDescription", rb_cObject);
|
276
281
|
|
data/ext/nokogiri/html4_entity_lookup.c
CHANGED
@@ -20,7 +20,7 @@ get(VALUE _, VALUE rb_entity_name)
|
|
20
20
|
return Qnil;
|
21
21
|
}
|
22
22
|
|
23
|
-
rb_constructor_args[0] =
|
23
|
+
rb_constructor_args[0] = UINT2NUM(c_entity_desc->value);
|
24
24
|
rb_constructor_args[1] = NOKOGIRI_STR_NEW2(c_entity_desc->name);
|
25
25
|
rb_constructor_args[2] = NOKOGIRI_STR_NEW2(c_entity_desc->desc);
|
26
26
|
|
@@ -29,7 +29,7 @@ get(VALUE _, VALUE rb_entity_name)
|
|
29
29
|
}
|
30
30
|
|
31
31
|
void
|
32
|
-
noko_init_html_entity_lookup()
|
32
|
+
noko_init_html_entity_lookup(void)
|
33
33
|
{
|
34
34
|
cNokogiriHtml4EntityLookup = rb_define_class_under(mNokogiriHtml4, "EntityLookup", rb_cObject);
|
35
35
|
|
data/ext/nokogiri/html4_sax_parser_context.c
CHANGED
@@ -2,18 +2,6 @@
|
|
2
2
|
|
3
3
|
VALUE cNokogiriHtml4SaxParserContext ;
|
4
4
|
|
5
|
-
static void
|
6
|
-
deallocate(xmlParserCtxtPtr ctxt)
|
7
|
-
{
|
8
|
-
NOKOGIRI_DEBUG_START(ctxt);
|
9
|
-
|
10
|
-
ctxt->sax = NULL;
|
11
|
-
|
12
|
-
htmlFreeParserCtxt(ctxt);
|
13
|
-
|
14
|
-
NOKOGIRI_DEBUG_END(ctxt);
|
15
|
-
}
|
16
|
-
|
17
5
|
static VALUE
|
18
6
|
parse_memory(VALUE klass, VALUE data, VALUE encoding)
|
19
7
|
{
|
@@ -43,7 +31,7 @@ parse_memory(VALUE klass, VALUE data, VALUE encoding)
|
|
43
31
|
}
|
44
32
|
}
|
45
33
|
|
46
|
-
return
|
34
|
+
return noko_xml_sax_parser_context_wrap(klass, ctxt);
|
47
35
|
}
|
48
36
|
|
49
37
|
static VALUE
|
@@ -53,7 +41,13 @@ parse_file(VALUE klass, VALUE filename, VALUE encoding)
|
|
53
41
|
StringValueCStr(filename),
|
54
42
|
StringValueCStr(encoding)
|
55
43
|
);
|
56
|
-
|
44
|
+
|
45
|
+
if (ctxt->sax) {
|
46
|
+
xmlFree(ctxt->sax);
|
47
|
+
ctxt->sax = NULL;
|
48
|
+
}
|
49
|
+
|
50
|
+
return noko_xml_sax_parser_context_wrap(klass, ctxt);
|
57
51
|
}
|
58
52
|
|
59
53
|
static VALUE
|
@@ -87,13 +81,8 @@ parse_with(VALUE self, VALUE sax_handler)
|
|
87
81
|
rb_raise(rb_eArgError, "argument must be a Nokogiri::XML::SAX::Parser");
|
88
82
|
}
|
89
83
|
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
/* Free the sax handler since we'll assign our own */
|
94
|
-
if (ctxt->sax && ctxt->sax != (xmlSAXHandlerPtr)&xmlDefaultSAXHandler) {
|
95
|
-
xmlFree(ctxt->sax);
|
96
|
-
}
|
84
|
+
ctxt = noko_xml_sax_parser_context_unwrap(self);
|
85
|
+
sax = noko_sax_handler_unwrap(sax_handler);
|
97
86
|
|
98
87
|
ctxt->sax = sax;
|
99
88
|
ctxt->userData = (void *)NOKOGIRI_SAX_TUPLE_NEW(ctxt, sax_handler);
|
@@ -106,7 +95,7 @@ parse_with(VALUE self, VALUE sax_handler)
|
|
106
95
|
}
|
107
96
|
|
108
97
|
void
|
109
|
-
noko_init_html_sax_parser_context()
|
98
|
+
noko_init_html_sax_parser_context(void)
|
110
99
|
{
|
111
100
|
assert(cNokogiriXmlSaxParserContext);
|
112
101
|
cNokogiriHtml4SaxParserContext = rb_define_class_under(mNokogiriHtml4Sax, "ParserContext",
|
data/ext/nokogiri/html4_sax_push_parser.c
CHANGED
@@ -17,7 +17,7 @@ native_write(VALUE self, VALUE _chunk, VALUE _last_chunk)
|
|
17
17
|
int status = 0;
|
18
18
|
libxmlStructuredErrorHandlerState handler_state;
|
19
19
|
|
20
|
-
|
20
|
+
ctx = noko_xml_sax_push_parser_unwrap(self);
|
21
21
|
|
22
22
|
if (Qnil != _chunk) {
|
23
23
|
chunk = StringValuePtr(_chunk);
|
@@ -54,7 +54,7 @@ initialize_native(VALUE self, VALUE _xml_sax, VALUE _filename,
|
|
54
54
|
htmlParserCtxtPtr ctx;
|
55
55
|
xmlCharEncoding enc = XML_CHAR_ENCODING_NONE;
|
56
56
|
|
57
|
-
|
57
|
+
sax = noko_sax_handler_unwrap(_xml_sax);
|
58
58
|
|
59
59
|
if (_filename != Qnil) { filename = StringValueCStr(_filename); }
|
60
60
|
|
@@ -85,7 +85,7 @@ initialize_native(VALUE self, VALUE _xml_sax, VALUE _filename,
|
|
85
85
|
}
|
86
86
|
|
87
87
|
void
|
88
|
-
noko_init_html_sax_push_parser()
|
88
|
+
noko_init_html_sax_push_parser(void)
|
89
89
|
{
|
90
90
|
assert(cNokogiriXmlSaxPushParser);
|
91
91
|
cNokogiriHtml4SaxPushParser = rb_define_class_under(mNokogiriHtml4Sax, "PushParser", cNokogiriXmlSaxPushParser);
|
data/ext/nokogiri/nokogiri.c
CHANGED
@@ -49,34 +49,11 @@ void noko_init_html_sax_push_parser(void);
|
|
49
49
|
void noko_init_gumbo(void);
|
50
50
|
void noko_init_test_global_handlers(void);
|
51
51
|
|
52
|
-
static ID id_read, id_write;
|
53
|
-
|
54
|
-
|
55
|
-
#ifndef HAVE_VASPRINTF
|
56
|
-
/*
|
57
|
-
* Thank you Geoffroy Couprie for this implementation of vasprintf!
|
58
|
-
*/
|
59
|
-
int
|
60
|
-
vasprintf(char **strp, const char *fmt, va_list ap)
|
61
|
-
{
|
62
|
-
/* Mingw32/64 have a broken vsnprintf implementation that fails when
|
63
|
-
* using a zero-byte limit in order to retrieve the required size for malloc.
|
64
|
-
* So we use a one byte buffer instead.
|
65
|
-
*/
|
66
|
-
char tmp[1];
|
67
|
-
int len = vsnprintf(tmp, 1, fmt, ap) + 1;
|
68
|
-
char *res = (char *)malloc((unsigned int)len);
|
69
|
-
if (res == NULL) {
|
70
|
-
return -1;
|
71
|
-
}
|
72
|
-
*strp = res;
|
73
|
-
return vsnprintf(res, (unsigned int)len, fmt, ap);
|
74
|
-
}
|
75
|
-
#endif
|
52
|
+
static ID id_read, id_write, id_external_encoding;
|
76
53
|
|
77
54
|
|
78
55
|
static VALUE
|
79
|
-
|
56
|
+
noko_io_read_check(VALUE val)
|
80
57
|
{
|
81
58
|
VALUE *args = (VALUE *)val;
|
82
59
|
return rb_funcall(args[0], id_read, 1, args[1]);
|
@@ -84,75 +61,126 @@ read_check(VALUE val)
|
|
84
61
|
|
85
62
|
|
86
63
|
static VALUE
|
87
|
-
|
64
|
+
noko_io_read_failed(VALUE arg, VALUE exc)
|
88
65
|
{
|
89
66
|
return Qundef;
|
90
67
|
}
|
91
68
|
|
92
69
|
|
93
70
|
int
|
94
|
-
noko_io_read(void *
|
71
|
+
noko_io_read(void *io, char *c_buffer, int c_buffer_len)
|
95
72
|
{
|
96
|
-
VALUE
|
97
|
-
|
73
|
+
VALUE rb_io = (VALUE)io;
|
74
|
+
VALUE rb_read_string, rb_args[2];
|
75
|
+
size_t n_bytes_read, safe_len;
|
98
76
|
|
99
|
-
|
100
|
-
|
77
|
+
rb_args[0] = rb_io;
|
78
|
+
rb_args[1] = INT2NUM(c_buffer_len);
|
101
79
|
|
102
|
-
|
80
|
+
rb_read_string = rb_rescue(noko_io_read_check, (VALUE)rb_args, noko_io_read_failed, 0);
|
103
81
|
|
104
|
-
if (NIL_P(
|
105
|
-
if (
|
106
|
-
if (TYPE(
|
82
|
+
if (NIL_P(rb_read_string)) { return 0; }
|
83
|
+
if (rb_read_string == Qundef) { return -1; }
|
84
|
+
if (TYPE(rb_read_string) != T_STRING) { return -1; }
|
107
85
|
|
108
|
-
|
109
|
-
safe_len =
|
110
|
-
memcpy(
|
86
|
+
n_bytes_read = (size_t)RSTRING_LEN(rb_read_string);
|
87
|
+
safe_len = (n_bytes_read > (size_t)c_buffer_len) ? (size_t)c_buffer_len : n_bytes_read;
|
88
|
+
memcpy(c_buffer, StringValuePtr(rb_read_string), safe_len);
|
111
89
|
|
112
90
|
return (int)safe_len;
|
113
91
|
}
|
114
92
|
|
115
93
|
|
116
94
|
static VALUE
|
117
|
-
|
95
|
+
noko_io_write_check(VALUE rb_args)
|
118
96
|
{
|
119
|
-
VALUE
|
120
|
-
|
97
|
+
VALUE rb_io = ((VALUE *)rb_args)[0];
|
98
|
+
VALUE rb_output = ((VALUE *)rb_args)[1];
|
99
|
+
return rb_funcall(rb_io, id_write, 1, rb_output);
|
121
100
|
}
|
122
101
|
|
123
102
|
|
124
103
|
static VALUE
|
125
|
-
|
104
|
+
noko_io_write_failed(VALUE arg, VALUE exc)
|
126
105
|
{
|
127
106
|
return Qundef;
|
128
107
|
}
|
129
108
|
|
130
109
|
|
131
110
|
int
|
132
|
-
noko_io_write(void *
|
111
|
+
noko_io_write(void *io, char *c_buffer, int c_buffer_len)
|
133
112
|
{
|
134
|
-
VALUE
|
113
|
+
VALUE rb_args[2], rb_n_bytes_written;
|
114
|
+
VALUE rb_io = (VALUE)io;
|
115
|
+
VALUE rb_enc = Qnil;
|
116
|
+
rb_encoding *io_encoding;
|
135
117
|
|
136
|
-
|
137
|
-
|
118
|
+
if (rb_respond_to(rb_io, id_external_encoding)) {
|
119
|
+
rb_enc = rb_funcall(rb_io, id_external_encoding, 0);
|
120
|
+
}
|
121
|
+
io_encoding = RB_NIL_P(rb_enc) ? rb_ascii8bit_encoding() : rb_to_encoding(rb_enc);
|
138
122
|
|
139
|
-
|
123
|
+
rb_args[0] = rb_io;
|
124
|
+
rb_args[1] = rb_enc_str_new(c_buffer, (long)c_buffer_len, io_encoding);
|
140
125
|
|
141
|
-
|
126
|
+
rb_n_bytes_written = rb_rescue(noko_io_write_check, (VALUE)rb_args, noko_io_write_failed, 0);
|
127
|
+
if (rb_n_bytes_written == Qundef) { return -1; }
|
142
128
|
|
143
|
-
return NUM2INT(
|
129
|
+
return NUM2INT(rb_n_bytes_written);
|
144
130
|
}
|
145
131
|
|
146
132
|
|
147
133
|
int
|
148
|
-
noko_io_close(void *
|
134
|
+
noko_io_close(void *io)
|
149
135
|
{
|
150
136
|
return 0;
|
151
137
|
}
|
152
138
|
|
153
139
|
|
140
|
+
#if defined(_WIN32) && !defined(NOKOGIRI_PACKAGED_LIBRARIES)
|
141
|
+
# define NOKOGIRI_WINDOWS_DLLS 1
|
142
|
+
#else
|
143
|
+
# define NOKOGIRI_WINDOWS_DLLS 0
|
144
|
+
#endif
|
145
|
+
|
146
|
+
//
|
147
|
+
// | dlls || true | false |
|
148
|
+
// | nlmm || | |
|
149
|
+
// |-----------++---------+---------|
|
150
|
+
// | NULL || default | ruby |
|
151
|
+
// | "random" || default | ruby |
|
152
|
+
// | "ruby" || ruby | ruby |
|
153
|
+
// | "default" || default | default |
|
154
|
+
//
|
155
|
+
// We choose *not* to use Ruby's memory management functions with windows DLLs because of this
|
156
|
+
// issue: https://github.com/sparklemotion/nokogiri/issues/2241
|
157
|
+
//
|
158
|
+
static void
|
159
|
+
set_libxml_memory_management(void)
|
160
|
+
{
|
161
|
+
const char *nlmm = getenv("NOKOGIRI_LIBXML_MEMORY_MANAGEMENT");
|
162
|
+
if (nlmm) {
|
163
|
+
if (strcmp(nlmm, "default") == 0) {
|
164
|
+
goto libxml_uses_default_memory_management;
|
165
|
+
} else if (strcmp(nlmm, "ruby") == 0) {
|
166
|
+
goto libxml_uses_ruby_memory_management;
|
167
|
+
}
|
168
|
+
}
|
169
|
+
if (NOKOGIRI_WINDOWS_DLLS) {
|
170
|
+
libxml_uses_default_memory_management:
|
171
|
+
rb_const_set(mNokogiri, rb_intern("LIBXML_MEMORY_MANAGEMENT"), NOKOGIRI_STR_NEW2("default"));
|
172
|
+
return;
|
173
|
+
} else {
|
174
|
+
libxml_uses_ruby_memory_management:
|
175
|
+
rb_const_set(mNokogiri, rb_intern("LIBXML_MEMORY_MANAGEMENT"), NOKOGIRI_STR_NEW2("ruby"));
|
176
|
+
xmlMemSetup((xmlFreeFunc)ruby_xfree, (xmlMallocFunc)ruby_xmalloc, (xmlReallocFunc)ruby_xrealloc, ruby_strdup);
|
177
|
+
return;
|
178
|
+
}
|
179
|
+
}
|
180
|
+
|
181
|
+
|
154
182
|
void
|
155
|
-
Init_nokogiri()
|
183
|
+
Init_nokogiri(void)
|
156
184
|
{
|
157
185
|
mNokogiri = rb_define_module("Nokogiri");
|
158
186
|
mNokogiriGumbo = rb_define_module_under(mNokogiri, "Gumbo");
|
@@ -164,6 +192,10 @@ Init_nokogiri()
|
|
164
192
|
mNokogiriXmlXpath = rb_define_module_under(mNokogiriXml, "XPath");
|
165
193
|
mNokogiriXslt = rb_define_module_under(mNokogiri, "XSLT");
|
166
194
|
|
195
|
+
set_libxml_memory_management(); /* must be before any function calls that might invoke xmlInitParser() */
|
196
|
+
xmlInitParser();
|
197
|
+
exsltRegisterAll();
|
198
|
+
|
167
199
|
rb_const_set(mNokogiri, rb_intern("LIBXML_COMPILED_VERSION"), NOKOGIRI_STR_NEW2(LIBXML_DOTTED_VERSION));
|
168
200
|
rb_const_set(mNokogiri, rb_intern("LIBXML_LOADED_VERSION"), NOKOGIRI_STR_NEW2(xmlParserVersion));
|
169
201
|
|
@@ -196,30 +228,6 @@ Init_nokogiri()
|
|
196
228
|
rb_const_set(mNokogiri, rb_intern("OTHER_LIBRARY_VERSIONS"), NOKOGIRI_STR_NEW2(NOKOGIRI_OTHER_LIBRARY_VERSIONS));
|
197
229
|
#endif
|
198
230
|
|
199
|
-
#if defined(_WIN32) && !defined(NOKOGIRI_PACKAGED_LIBRARIES)
|
200
|
-
/*
|
201
|
-
* We choose *not* to do use Ruby's memory management functions with windows DLLs because of this
|
202
|
-
* issue in libxml 2.9.12:
|
203
|
-
*
|
204
|
-
* https://github.com/sparklemotion/nokogiri/issues/2241
|
205
|
-
*
|
206
|
-
* If the atexit() issue gets fixed in a future version of libxml2, then we may be able to skip
|
207
|
-
* this config only for the specific libxml2 versions 2.9.12.
|
208
|
-
*
|
209
|
-
* Alternatively, now that Ruby has a generational GC, it might be OK to let libxml2 use its
|
210
|
-
* default memory management functions (recall that this config was introduced to reduce memory
|
211
|
-
* bloat and allow Ruby to GC more often); but we should *really* test with production workloads
|
212
|
-
* before making that kind of a potentially-invasive change.
|
213
|
-
*/
|
214
|
-
rb_const_set(mNokogiri, rb_intern("LIBXML_MEMORY_MANAGEMENT"), NOKOGIRI_STR_NEW2("default"));
|
215
|
-
#else
|
216
|
-
rb_const_set(mNokogiri, rb_intern("LIBXML_MEMORY_MANAGEMENT"), NOKOGIRI_STR_NEW2("ruby"));
|
217
|
-
xmlMemSetup((xmlFreeFunc)ruby_xfree, (xmlMallocFunc)ruby_xmalloc, (xmlReallocFunc)ruby_xrealloc, ruby_strdup);
|
218
|
-
#endif
|
219
|
-
|
220
|
-
xmlInitParser();
|
221
|
-
exsltRegisterAll();
|
222
|
-
|
223
231
|
if (xsltExtModuleFunctionLookup((const xmlChar *)"date-time", EXSLT_DATE_NAMESPACE)) {
|
224
232
|
rb_const_set(mNokogiri, rb_intern("LIBXSLT_DATETIME_ENABLED"), Qtrue);
|
225
233
|
} else {
|
@@ -275,4 +283,5 @@ Init_nokogiri()
|
|
275
283
|
|
276
284
|
id_read = rb_intern("read");
|
277
285
|
id_write = rb_intern("write");
|
286
|
+
id_external_encoding = rb_intern("external_encoding");
|
278
287
|
}
|