nokogiri 1.3.3 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- data/CHANGELOG.ja.rdoc +48 -3
- data/CHANGELOG.rdoc +42 -0
- data/Manifest.txt +44 -29
- data/README.ja.rdoc +0 -2
- data/README.rdoc +4 -7
- data/Rakefile +42 -6
- data/bin/nokogiri +7 -5
- data/ext/nokogiri/extconf.rb +5 -21
- data/ext/nokogiri/html_document.c +14 -50
- data/ext/nokogiri/html_element_description.c +7 -7
- data/ext/nokogiri/html_entity_lookup.c +6 -4
- data/ext/nokogiri/html_sax_parser_context.c +92 -0
- data/ext/nokogiri/html_sax_parser_context.h +11 -0
- data/ext/nokogiri/nokogiri.c +9 -3
- data/ext/nokogiri/nokogiri.h +16 -20
- data/ext/nokogiri/xml_attr.c +1 -1
- data/ext/nokogiri/xml_attribute_decl.c +67 -0
- data/ext/nokogiri/xml_attribute_decl.h +9 -0
- data/ext/nokogiri/xml_cdata.c +6 -5
- data/ext/nokogiri/xml_comment.c +3 -2
- data/ext/nokogiri/xml_document.c +93 -23
- data/ext/nokogiri/xml_document_fragment.c +1 -3
- data/ext/nokogiri/xml_dtd.c +63 -6
- data/ext/nokogiri/xml_element_content.c +123 -0
- data/ext/nokogiri/xml_element_content.h +10 -0
- data/ext/nokogiri/xml_element_decl.c +69 -0
- data/ext/nokogiri/xml_element_decl.h +9 -0
- data/ext/nokogiri/xml_entity_decl.c +97 -0
- data/ext/nokogiri/xml_entity_decl.h +10 -0
- data/ext/nokogiri/xml_entity_reference.c +1 -1
- data/ext/nokogiri/xml_io.c +10 -3
- data/ext/nokogiri/xml_io.h +1 -0
- data/ext/nokogiri/xml_namespace.c +2 -2
- data/ext/nokogiri/xml_node.c +139 -34
- data/ext/nokogiri/xml_node.h +0 -1
- data/ext/nokogiri/xml_node_set.c +23 -16
- data/ext/nokogiri/xml_processing_instruction.c +1 -1
- data/ext/nokogiri/xml_reader.c +78 -50
- data/ext/nokogiri/xml_sax_parser.c +109 -168
- data/ext/nokogiri/xml_sax_parser.h +33 -0
- data/ext/nokogiri/xml_sax_parser_context.c +155 -0
- data/ext/nokogiri/xml_sax_parser_context.h +10 -0
- data/ext/nokogiri/xml_sax_push_parser.c +11 -6
- data/ext/nokogiri/xml_syntax_error.c +63 -12
- data/ext/nokogiri/xml_text.c +4 -3
- data/ext/nokogiri/xml_xpath.c +1 -1
- data/ext/nokogiri/xml_xpath_context.c +12 -25
- data/ext/nokogiri/xslt_stylesheet.c +3 -3
- data/lib/nokogiri.rb +4 -4
- data/lib/nokogiri/css/generated_tokenizer.rb +1 -0
- data/lib/nokogiri/css/node.rb +1 -9
- data/lib/nokogiri/css/xpath_visitor.rb +11 -21
- data/lib/nokogiri/ffi/html/document.rb +0 -9
- data/lib/nokogiri/ffi/html/sax/parser_context.rb +38 -0
- data/lib/nokogiri/ffi/io_callbacks.rb +4 -2
- data/lib/nokogiri/ffi/libxml.rb +44 -10
- data/lib/nokogiri/ffi/structs/common_node.rb +1 -1
- data/lib/nokogiri/ffi/structs/xml_attribute.rb +27 -0
- data/lib/nokogiri/ffi/structs/xml_dtd.rb +3 -1
- data/lib/nokogiri/ffi/structs/xml_element.rb +26 -0
- data/lib/nokogiri/ffi/structs/xml_element_content.rb +17 -0
- data/lib/nokogiri/ffi/structs/xml_entity.rb +32 -0
- data/lib/nokogiri/ffi/structs/xml_enumeration.rb +12 -0
- data/lib/nokogiri/ffi/structs/xml_parser_context.rb +19 -0
- data/lib/nokogiri/ffi/structs/xml_sax_push_parser_context.rb +4 -3
- data/lib/nokogiri/ffi/structs/xml_syntax_error.rb +1 -1
- data/lib/nokogiri/ffi/xml/attribute_decl.rb +27 -0
- data/lib/nokogiri/ffi/xml/comment.rb +2 -2
- data/lib/nokogiri/ffi/xml/document.rb +29 -12
- data/lib/nokogiri/ffi/xml/document_fragment.rb +0 -5
- data/lib/nokogiri/ffi/xml/dtd.rb +14 -3
- data/lib/nokogiri/ffi/xml/element_content.rb +43 -0
- data/lib/nokogiri/ffi/xml/element_decl.rb +19 -0
- data/lib/nokogiri/ffi/xml/entity_decl.rb +27 -0
- data/lib/nokogiri/ffi/xml/node.rb +45 -5
- data/lib/nokogiri/ffi/xml/node_set.rb +1 -1
- data/lib/nokogiri/ffi/xml/reader.rb +45 -24
- data/lib/nokogiri/ffi/xml/sax/parser.rb +27 -34
- data/lib/nokogiri/ffi/xml/sax/parser_context.rb +67 -0
- data/lib/nokogiri/ffi/xml/sax/push_parser.rb +5 -4
- data/lib/nokogiri/ffi/xml/syntax_error.rb +31 -16
- data/lib/nokogiri/ffi/xml/text.rb +2 -2
- data/lib/nokogiri/html.rb +1 -0
- data/lib/nokogiri/html/document.rb +39 -24
- data/lib/nokogiri/html/sax/parser.rb +2 -2
- data/lib/nokogiri/html/sax/parser_context.rb +16 -0
- data/lib/nokogiri/version.rb +1 -1
- data/lib/nokogiri/xml.rb +6 -1
- data/lib/nokogiri/xml/attr.rb +5 -0
- data/lib/nokogiri/xml/attribute_decl.rb +18 -0
- data/lib/nokogiri/xml/builder.rb +121 -13
- data/lib/nokogiri/xml/character_data.rb +7 -0
- data/lib/nokogiri/xml/document.rb +43 -29
- data/lib/nokogiri/xml/document_fragment.rb +26 -6
- data/lib/nokogiri/xml/dtd.rb +5 -5
- data/lib/nokogiri/xml/element_content.rb +36 -0
- data/lib/nokogiri/xml/element_decl.rb +13 -0
- data/lib/nokogiri/xml/entity_decl.rb +15 -0
- data/lib/nokogiri/xml/fragment_handler.rb +22 -11
- data/lib/nokogiri/xml/namespace.rb +6 -0
- data/lib/nokogiri/xml/node.rb +33 -15
- data/lib/nokogiri/xml/node_set.rb +66 -44
- data/lib/nokogiri/xml/pp.rb +2 -0
- data/lib/nokogiri/xml/pp/character_data.rb +18 -0
- data/lib/nokogiri/xml/pp/node.rb +56 -0
- data/lib/nokogiri/xml/reader.rb +8 -0
- data/lib/nokogiri/xml/sax.rb +1 -1
- data/lib/nokogiri/xml/sax/document.rb +18 -1
- data/lib/nokogiri/xml/sax/parser.rb +15 -8
- data/lib/nokogiri/xml/sax/parser_context.rb +16 -0
- data/lib/nokogiri/xml/sax/push_parser.rb +0 -3
- data/lib/nokogiri/xml/syntax_error.rb +4 -0
- data/lib/nokogiri/xslt/stylesheet.rb +1 -1
- data/test/css/test_nthiness.rb +1 -1
- data/test/css/test_parser.rb +1 -1
- data/test/css/test_tokenizer.rb +1 -1
- data/test/css/test_xpath_visitor.rb +1 -1
- data/test/ffi/test_document.rb +1 -1
- data/test/files/shift_jis.html +10 -0
- data/test/files/staff.dtd +10 -0
- data/test/helper.rb +12 -3
- data/test/html/sax/test_parser.rb +1 -1
- data/test/html/sax/test_parser_context.rb +48 -0
- data/test/html/test_builder.rb +8 -2
- data/test/html/test_document.rb +23 -1
- data/test/html/test_document_encoding.rb +15 -1
- data/test/html/test_document_fragment.rb +10 -1
- data/test/html/test_element_description.rb +1 -2
- data/test/html/test_named_characters.rb +1 -1
- data/test/html/test_node.rb +61 -1
- data/test/html/test_node_encoding.rb +27 -0
- data/test/test_convert_xpath.rb +1 -3
- data/test/test_css_cache.rb +1 -1
- data/test/test_gc.rb +1 -1
- data/test/test_memory_leak.rb +1 -1
- data/test/test_nokogiri.rb +3 -3
- data/test/test_reader.rb +29 -1
- data/test/test_xslt_transforms.rb +1 -1
- data/test/xml/node/test_save_options.rb +1 -1
- data/test/xml/node/test_subclass.rb +1 -1
- data/test/xml/sax/test_parser.rb +64 -3
- data/test/xml/sax/test_parser_context.rb +56 -0
- data/test/xml/sax/test_push_parser.rb +11 -1
- data/test/xml/test_attr.rb +1 -1
- data/test/xml/test_attribute_decl.rb +82 -0
- data/test/xml/test_builder.rb +95 -1
- data/test/xml/test_cdata.rb +1 -1
- data/test/xml/test_comment.rb +7 -1
- data/test/xml/test_document.rb +147 -6
- data/test/xml/test_document_encoding.rb +1 -1
- data/test/xml/test_document_fragment.rb +55 -5
- data/test/xml/test_dtd.rb +40 -5
- data/test/xml/test_dtd_encoding.rb +3 -1
- data/test/xml/test_element_content.rb +56 -0
- data/test/xml/test_element_decl.rb +73 -0
- data/test/xml/test_entity_decl.rb +83 -0
- data/test/xml/test_entity_reference.rb +1 -1
- data/test/xml/test_namespace.rb +21 -1
- data/test/xml/test_node.rb +70 -4
- data/test/xml/test_node_attributes.rb +1 -1
- data/test/xml/test_node_encoding.rb +1 -1
- data/test/xml/test_node_set.rb +136 -2
- data/test/xml/test_parse_options.rb +1 -1
- data/test/xml/test_processing_instruction.rb +1 -1
- data/test/xml/test_reader_encoding.rb +1 -1
- data/test/xml/test_relax_ng.rb +1 -1
- data/test/xml/test_schema.rb +1 -1
- data/test/xml/test_syntax_error.rb +27 -0
- data/test/xml/test_text.rb +13 -1
- data/test/xml/test_unparented_node.rb +1 -1
- data/test/xml/test_xpath.rb +1 -1
- metadata +57 -40
- data/ext/nokogiri/html_sax_parser.c +0 -57
- data/ext/nokogiri/html_sax_parser.h +0 -11
- data/lib/action-nokogiri.rb +0 -38
- data/lib/nokogiri/decorators.rb +0 -2
- data/lib/nokogiri/decorators/hpricot.rb +0 -3
- data/lib/nokogiri/decorators/hpricot/node.rb +0 -56
- data/lib/nokogiri/decorators/hpricot/node_set.rb +0 -54
- data/lib/nokogiri/decorators/hpricot/xpath_visitor.rb +0 -30
- data/lib/nokogiri/ffi/html/sax/parser.rb +0 -21
- data/lib/nokogiri/hpricot.rb +0 -92
- data/lib/nokogiri/xml/entity_declaration.rb +0 -11
- data/lib/nokogiri/xml/sax/legacy_handlers.rb +0 -65
- data/test/hpricot/files/basic.xhtml +0 -17
- data/test/hpricot/files/boingboing.html +0 -2266
- data/test/hpricot/files/cy0.html +0 -3653
- data/test/hpricot/files/immob.html +0 -400
- data/test/hpricot/files/pace_application.html +0 -1320
- data/test/hpricot/files/tenderlove.html +0 -16
- data/test/hpricot/files/uswebgen.html +0 -220
- data/test/hpricot/files/utf8.html +0 -1054
- data/test/hpricot/files/week9.html +0 -1723
- data/test/hpricot/files/why.xml +0 -19
- data/test/hpricot/load_files.rb +0 -11
- data/test/hpricot/test_alter.rb +0 -68
- data/test/hpricot/test_builder.rb +0 -20
- data/test/hpricot/test_parser.rb +0 -350
- data/test/hpricot/test_paths.rb +0 -15
- data/test/hpricot/test_preserved.rb +0 -77
- data/test/hpricot/test_xml.rb +0 -30
data/bin/nokogiri
CHANGED
@@ -6,6 +6,8 @@ require 'uri'
|
|
6
6
|
require 'rubygems'
|
7
7
|
require 'nokogiri'
|
8
8
|
|
9
|
+
parse_class = Nokogiri
|
10
|
+
|
9
11
|
opts = OptionParser.new do |opts|
|
10
12
|
opts.banner = "Nokogiri: an HTML, XML, SAX, and Reader parser"
|
11
13
|
opts.define_head "Usage: nokogiri <uri|path> [options]"
|
@@ -16,6 +18,10 @@ opts = OptionParser.new do |opts|
|
|
16
18
|
opts.separator ""
|
17
19
|
opts.separator "Options:"
|
18
20
|
|
21
|
+
opts.on("--type [TYPE]", [:xml, :html]) do |v|
|
22
|
+
parse_class = {:xml => Nokogiri::XML, :html => Nokogiri::HTML}[v]
|
23
|
+
end
|
24
|
+
|
19
25
|
opts.on_tail("-?", "--help", "Show this message") do
|
20
26
|
puts opts
|
21
27
|
exit
|
@@ -36,11 +42,7 @@ if uri.to_s.strip.empty?
|
|
36
42
|
exit 1
|
37
43
|
end
|
38
44
|
|
39
|
-
|
40
|
-
@doc = Nokogiri(open(uri).read)
|
41
|
-
else
|
42
|
-
@doc = Nokogiri(File.read(uri))
|
43
|
-
end
|
45
|
+
@doc = parse_class.parse(open(uri).read)
|
44
46
|
|
45
47
|
puts "Your document is stored in @doc..."
|
46
48
|
IRB.start
|
data/ext/nokogiri/extconf.rb
CHANGED
@@ -129,27 +129,11 @@ unless find_library('exslt', 'exsltFuncRegister', *LIB_DIRS)
|
|
129
129
|
abort "libxslt is missing. try 'port install libxslt' or 'yum install libxslt-devel'"
|
130
130
|
end
|
131
131
|
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
alias :old_link_command :link_command
|
138
|
-
alias :link_command :nokogiri_link_command
|
139
|
-
yield
|
140
|
-
ensure
|
141
|
-
alias :link_command :old_link_command
|
142
|
-
end
|
143
|
-
|
144
|
-
with_custom_link do
|
145
|
-
with_cppflags $INCFLAGS do
|
146
|
-
have_func('xmlRelaxNGSetParserStructuredErrors')
|
147
|
-
have_func('xmlRelaxNGSetParserStructuredErrors')
|
148
|
-
have_func('xmlRelaxNGSetValidStructuredErrors')
|
149
|
-
have_func('xmlSchemaSetValidStructuredErrors')
|
150
|
-
have_func('xmlSchemaSetParserStructuredErrors')
|
151
|
-
end
|
152
|
-
end
|
132
|
+
have_func('xmlRelaxNGSetParserStructuredErrors')
|
133
|
+
have_func('xmlRelaxNGSetParserStructuredErrors')
|
134
|
+
have_func('xmlRelaxNGSetValidStructuredErrors')
|
135
|
+
have_func('xmlSchemaSetValidStructuredErrors')
|
136
|
+
have_func('xmlSchemaSetParserStructuredErrors')
|
153
137
|
|
154
138
|
if ENV['CPUPROFILE']
|
155
139
|
unless find_library('profiler', 'ProfilerEnable', *LIB_DIRS)
|
data/ext/nokogiri/html_document.c
CHANGED
@@ -11,15 +11,15 @@ static VALUE new(int argc, VALUE *argv, VALUE klass)
|
|
11
11
|
VALUE uri, external_id, rest, rb_doc;
|
12
12
|
|
13
13
|
rb_scan_args(argc, argv, "0*", &rest);
|
14
|
-
uri = rb_ary_entry(rest, 0);
|
15
|
-
external_id = rb_ary_entry(rest, 1);
|
14
|
+
uri = rb_ary_entry(rest, (long)0);
|
15
|
+
external_id = rb_ary_entry(rest, (long)1);
|
16
16
|
|
17
17
|
htmlDocPtr doc = htmlNewDoc(
|
18
18
|
RTEST(uri) ? (const xmlChar *)StringValuePtr(uri) : NULL,
|
19
19
|
RTEST(external_id) ? (const xmlChar *)StringValuePtr(external_id) : NULL
|
20
20
|
);
|
21
21
|
rb_doc = Nokogiri_wrap_xml_document(klass, doc);
|
22
|
-
|
22
|
+
rb_obj_call_init(rb_doc, argc, argv);
|
23
23
|
return rb_doc ;
|
24
24
|
}
|
25
25
|
|
@@ -36,8 +36,8 @@ static VALUE read_io( VALUE klass,
|
|
36
36
|
VALUE encoding,
|
37
37
|
VALUE options )
|
38
38
|
{
|
39
|
-
const char * c_url = (url
|
40
|
-
const char * c_enc = (encoding
|
39
|
+
const char * c_url = NIL_P(url) ? NULL : StringValuePtr(url);
|
40
|
+
const char * c_enc = NIL_P(encoding) ? NULL : StringValuePtr(encoding);
|
41
41
|
VALUE error_list = rb_ary_new();
|
42
42
|
|
43
43
|
xmlResetLastError();
|
@@ -49,7 +49,7 @@ static VALUE read_io( VALUE klass,
|
|
49
49
|
(void *)io,
|
50
50
|
c_url,
|
51
51
|
c_enc,
|
52
|
-
NUM2INT(options)
|
52
|
+
(int)NUM2INT(options)
|
53
53
|
);
|
54
54
|
xmlSetStructuredErrorFunc(NULL, NULL);
|
55
55
|
|
@@ -58,9 +58,7 @@ static VALUE read_io( VALUE klass,
|
|
58
58
|
|
59
59
|
xmlErrorPtr error = xmlGetLastError();
|
60
60
|
if(error)
|
61
|
-
|
62
|
-
Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error)
|
63
|
-
);
|
61
|
+
rb_exc_raise(Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error));
|
64
62
|
else
|
65
63
|
rb_raise(rb_eRuntimeError, "Could not parse document");
|
66
64
|
|
@@ -68,7 +66,7 @@ static VALUE read_io( VALUE klass,
|
|
68
66
|
}
|
69
67
|
|
70
68
|
VALUE document = Nokogiri_wrap_xml_document(klass, doc);
|
71
|
-
|
69
|
+
rb_iv_set(document, "@errors", error_list);
|
72
70
|
return document;
|
73
71
|
}
|
74
72
|
|
@@ -86,15 +84,15 @@ static VALUE read_memory( VALUE klass,
|
|
86
84
|
VALUE options )
|
87
85
|
{
|
88
86
|
const char * c_buffer = StringValuePtr(string);
|
89
|
-
const char * c_url = (url
|
90
|
-
const char * c_enc = (encoding
|
87
|
+
const char * c_url = NIL_P(url) ? NULL : StringValuePtr(url);
|
88
|
+
const char * c_enc = NIL_P(encoding) ? NULL : StringValuePtr(encoding);
|
91
89
|
int len = RSTRING_LEN(string);
|
92
90
|
VALUE error_list = rb_ary_new();
|
93
91
|
|
94
92
|
xmlResetLastError();
|
95
93
|
xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);
|
96
94
|
|
97
|
-
htmlDocPtr doc = htmlReadMemory(c_buffer, len, c_url, c_enc, NUM2INT(options));
|
95
|
+
htmlDocPtr doc = htmlReadMemory(c_buffer, len, c_url, c_enc, (int)NUM2INT(options));
|
98
96
|
xmlSetStructuredErrorFunc(NULL, NULL);
|
99
97
|
|
100
98
|
if(doc == NULL) {
|
@@ -102,9 +100,7 @@ static VALUE read_memory( VALUE klass,
|
|
102
100
|
|
103
101
|
xmlErrorPtr error = xmlGetLastError();
|
104
102
|
if(error)
|
105
|
-
|
106
|
-
Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error)
|
107
|
-
);
|
103
|
+
rb_exc_raise(Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error));
|
108
104
|
else
|
109
105
|
rb_raise(rb_eRuntimeError, "Could not parse document");
|
110
106
|
|
@@ -112,7 +108,7 @@ static VALUE read_memory( VALUE klass,
|
|
112
108
|
}
|
113
109
|
|
114
110
|
VALUE document = Nokogiri_wrap_xml_document(klass, doc);
|
115
|
-
|
111
|
+
rb_iv_set(document, "@errors", error_list);
|
116
112
|
return document;
|
117
113
|
}
|
118
114
|
|
@@ -126,37 +122,7 @@ static VALUE type(VALUE self)
|
|
126
122
|
{
|
127
123
|
htmlDocPtr doc;
|
128
124
|
Data_Get_Struct(self, xmlDoc, doc);
|
129
|
-
return INT2NUM((
|
130
|
-
}
|
131
|
-
|
132
|
-
/*
|
133
|
-
* call-seq:
|
134
|
-
* meta_encoding=
|
135
|
-
*
|
136
|
-
* Set the meta tag encoding for this document.
|
137
|
-
*/
|
138
|
-
static VALUE set_meta_encoding(VALUE self, VALUE encoding)
|
139
|
-
{
|
140
|
-
htmlDocPtr doc;
|
141
|
-
Data_Get_Struct(self, xmlDoc, doc);
|
142
|
-
|
143
|
-
htmlSetMetaEncoding(doc, (const xmlChar *)StringValuePtr(encoding));
|
144
|
-
|
145
|
-
return encoding;
|
146
|
-
}
|
147
|
-
|
148
|
-
/*
|
149
|
-
* call-seq:
|
150
|
-
* meta_encoding
|
151
|
-
*
|
152
|
-
* Get the meta tag encoding for this document.
|
153
|
-
*/
|
154
|
-
static VALUE meta_encoding(VALUE self)
|
155
|
-
{
|
156
|
-
htmlDocPtr doc;
|
157
|
-
Data_Get_Struct(self, xmlDoc, doc);
|
158
|
-
|
159
|
-
return NOKOGIRI_STR_NEW2(htmlGetMetaEncoding(doc), doc->encoding);
|
125
|
+
return INT2NUM((long)doc->type);
|
160
126
|
}
|
161
127
|
|
162
128
|
VALUE cNokogiriHtmlDocument ;
|
@@ -176,6 +142,4 @@ void init_html_document()
|
|
176
142
|
rb_define_singleton_method(klass, "new", new, -1);
|
177
143
|
|
178
144
|
rb_define_method(klass, "type", type, 0);
|
179
|
-
rb_define_method(klass, "meta_encoding", meta_encoding, 0);
|
180
|
-
rb_define_method(klass, "meta_encoding=", set_meta_encoding, 1);
|
181
145
|
}
|
data/ext/nokogiri/html_element_description.c
CHANGED
@@ -17,7 +17,7 @@ static VALUE required_attributes(VALUE self)
|
|
17
17
|
|
18
18
|
int i = 0;
|
19
19
|
while(description->attrs_req[i]) {
|
20
|
-
rb_ary_push(list, NOKOGIRI_STR_NEW2(description->attrs_req[i]
|
20
|
+
rb_ary_push(list, NOKOGIRI_STR_NEW2(description->attrs_req[i]));
|
21
21
|
i++;
|
22
22
|
}
|
23
23
|
|
@@ -41,7 +41,7 @@ static VALUE deprecated_attributes(VALUE self)
|
|
41
41
|
|
42
42
|
int i = 0;
|
43
43
|
while(description->attrs_depr[i]) {
|
44
|
-
rb_ary_push(list, NOKOGIRI_STR_NEW2(description->attrs_depr[i]
|
44
|
+
rb_ary_push(list, NOKOGIRI_STR_NEW2(description->attrs_depr[i]));
|
45
45
|
i++;
|
46
46
|
}
|
47
47
|
|
@@ -65,7 +65,7 @@ static VALUE optional_attributes(VALUE self)
|
|
65
65
|
|
66
66
|
int i = 0;
|
67
67
|
while(description->attrs_opt[i]) {
|
68
|
-
rb_ary_push(list, NOKOGIRI_STR_NEW2(description->attrs_opt[i]
|
68
|
+
rb_ary_push(list, NOKOGIRI_STR_NEW2(description->attrs_opt[i]));
|
69
69
|
i++;
|
70
70
|
}
|
71
71
|
|
@@ -83,7 +83,7 @@ static VALUE default_sub_element(VALUE self)
|
|
83
83
|
htmlElemDesc * description;
|
84
84
|
Data_Get_Struct(self, htmlElemDesc, description);
|
85
85
|
|
86
|
-
return NOKOGIRI_STR_NEW2(description->defaultsubelt
|
86
|
+
return NOKOGIRI_STR_NEW2(description->defaultsubelt);
|
87
87
|
}
|
88
88
|
|
89
89
|
/*
|
@@ -103,7 +103,7 @@ static VALUE sub_elements(VALUE self)
|
|
103
103
|
|
104
104
|
int i = 0;
|
105
105
|
while(description->subelts[i]) {
|
106
|
-
rb_ary_push(list, NOKOGIRI_STR_NEW2(description->subelts[i]
|
106
|
+
rb_ary_push(list, NOKOGIRI_STR_NEW2(description->subelts[i]));
|
107
107
|
i++;
|
108
108
|
}
|
109
109
|
|
@@ -121,7 +121,7 @@ static VALUE description(VALUE self)
|
|
121
121
|
htmlElemDesc * description;
|
122
122
|
Data_Get_Struct(self, htmlElemDesc, description);
|
123
123
|
|
124
|
-
return NOKOGIRI_STR_NEW2(description->desc
|
124
|
+
return NOKOGIRI_STR_NEW2(description->desc);
|
125
125
|
}
|
126
126
|
|
127
127
|
/*
|
@@ -226,7 +226,7 @@ static VALUE name(VALUE self)
|
|
226
226
|
Data_Get_Struct(self, htmlElemDesc, description);
|
227
227
|
|
228
228
|
if(NULL == description->name) return Qnil;
|
229
|
-
return NOKOGIRI_STR_NEW2(description->name
|
229
|
+
return NOKOGIRI_STR_NEW2(description->name);
|
230
230
|
}
|
231
231
|
|
232
232
|
/*
|
data/ext/nokogiri/html_entity_lookup.c
CHANGED
@@ -14,10 +14,12 @@ static VALUE get(VALUE self, VALUE key)
|
|
14
14
|
if(NULL == desc) return Qnil;
|
15
15
|
VALUE klass = rb_const_get(mNokogiriHtml, rb_intern("EntityDescription"));
|
16
16
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
17
|
+
VALUE args[3];
|
18
|
+
args[0] = INT2NUM((long)desc->value);
|
19
|
+
args[1] = NOKOGIRI_STR_NEW2(desc->name);
|
20
|
+
args[2] = NOKOGIRI_STR_NEW2(desc->desc);
|
21
|
+
|
22
|
+
return rb_class_new_instance(3, args, klass);
|
21
23
|
}
|
22
24
|
|
23
25
|
void init_html_entity_lookup()
|
data/ext/nokogiri/html_sax_parser_context.c
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
#include <html_sax_parser_context.h>
|
2
|
+
|
3
|
+
VALUE cNokogiriHtmlSaxParserContext ;
|
4
|
+
|
5
|
+
static void deallocate(xmlParserCtxtPtr ctxt)
|
6
|
+
{
|
7
|
+
NOKOGIRI_DEBUG_START(handler);
|
8
|
+
|
9
|
+
ctxt->sax = NULL;
|
10
|
+
|
11
|
+
htmlFreeParserCtxt(ctxt);
|
12
|
+
|
13
|
+
NOKOGIRI_DEBUG_END(handler);
|
14
|
+
}
|
15
|
+
|
16
|
+
static VALUE parse_memory(VALUE klass, VALUE data, VALUE encoding)
|
17
|
+
{
|
18
|
+
if(NIL_P(data)) rb_raise(rb_eArgError, "data cannot be nil");
|
19
|
+
if(!(int)RSTRING_LEN(data))
|
20
|
+
rb_raise(rb_eRuntimeError, "data cannot be empty");
|
21
|
+
|
22
|
+
htmlParserCtxtPtr ctxt = htmlCreateMemoryParserCtxt(
|
23
|
+
StringValuePtr(data),
|
24
|
+
(int)RSTRING_LEN(data)
|
25
|
+
);
|
26
|
+
|
27
|
+
if(RTEST(encoding)) {
|
28
|
+
xmlCharEncoding enc = xmlParseCharEncoding(StringValuePtr(encoding));
|
29
|
+
if(enc != XML_CHAR_ENCODING_ERROR) {
|
30
|
+
xmlSwitchEncoding(ctxt, enc);
|
31
|
+
if(ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
|
32
|
+
rb_raise(rb_eRuntimeError, "Unsupported encoding %s",
|
33
|
+
StringValuePtr(encoding));
|
34
|
+
}
|
35
|
+
}
|
36
|
+
}
|
37
|
+
|
38
|
+
return Data_Wrap_Struct(klass, NULL, deallocate, ctxt);
|
39
|
+
}
|
40
|
+
|
41
|
+
static VALUE parse_file(VALUE klass, VALUE filename, VALUE encoding)
|
42
|
+
{
|
43
|
+
htmlParserCtxtPtr ctxt = htmlCreateFileParserCtxt(
|
44
|
+
StringValuePtr(filename),
|
45
|
+
StringValuePtr(encoding)
|
46
|
+
);
|
47
|
+
return Data_Wrap_Struct(klass, NULL, deallocate, ctxt);
|
48
|
+
}
|
49
|
+
|
50
|
+
static VALUE parse_with(VALUE self, VALUE sax_handler)
|
51
|
+
{
|
52
|
+
if(!rb_obj_is_kind_of(sax_handler, cNokogiriXmlSaxParser))
|
53
|
+
rb_raise(rb_eArgError, "argument must be a Nokogiri::XML::SAX::Parser");
|
54
|
+
|
55
|
+
htmlParserCtxtPtr ctxt;
|
56
|
+
Data_Get_Struct(self, htmlParserCtxt, ctxt);
|
57
|
+
|
58
|
+
htmlSAXHandlerPtr sax;
|
59
|
+
Data_Get_Struct(sax_handler, htmlSAXHandler, sax);
|
60
|
+
|
61
|
+
// Free the sax handler since we'll assign our own
|
62
|
+
if(ctxt->sax && ctxt->sax != (xmlSAXHandlerPtr)&xmlDefaultSAXHandler)
|
63
|
+
xmlFree(ctxt->sax);
|
64
|
+
|
65
|
+
ctxt->sax = sax;
|
66
|
+
ctxt->userData = (void *)NOKOGIRI_SAX_TUPLE_NEW(ctxt, sax_handler);
|
67
|
+
|
68
|
+
htmlParseDocument(ctxt);
|
69
|
+
|
70
|
+
if(NULL != ctxt->myDoc) xmlFreeDoc(ctxt->myDoc);
|
71
|
+
|
72
|
+
NOKOGIRI_SAX_TUPLE_DESTROY(ctxt->userData);
|
73
|
+
return self;
|
74
|
+
}
|
75
|
+
|
76
|
+
void init_html_sax_parser_context()
|
77
|
+
{
|
78
|
+
VALUE nokogiri = rb_define_module("Nokogiri");
|
79
|
+
VALUE xml = rb_define_module_under(nokogiri, "XML");
|
80
|
+
VALUE html = rb_define_module_under(nokogiri, "HTML");
|
81
|
+
VALUE sax = rb_define_module_under(xml, "SAX");
|
82
|
+
VALUE hsax = rb_define_module_under(html, "SAX");
|
83
|
+
VALUE pc = rb_define_class_under(sax, "ParserContext", rb_cObject);
|
84
|
+
VALUE klass = rb_define_class_under(hsax, "ParserContext", pc);
|
85
|
+
|
86
|
+
cNokogiriHtmlSaxParserContext = klass;
|
87
|
+
|
88
|
+
rb_define_singleton_method(klass, "memory", parse_memory, 2);
|
89
|
+
rb_define_singleton_method(klass, "file", parse_file, 2);
|
90
|
+
|
91
|
+
rb_define_method(klass, "parse_with", parse_with, 1);
|
92
|
+
}
|
data/ext/nokogiri/nokogiri.c
CHANGED
@@ -46,11 +46,11 @@ void Init_nokogiri()
|
|
46
46
|
|
47
47
|
rb_const_set( mNokogiri,
|
48
48
|
rb_intern("LIBXML_VERSION"),
|
49
|
-
NOKOGIRI_STR_NEW2(LIBXML_DOTTED_VERSION
|
49
|
+
NOKOGIRI_STR_NEW2(LIBXML_DOTTED_VERSION)
|
50
50
|
);
|
51
51
|
rb_const_set( mNokogiri,
|
52
52
|
rb_intern("LIBXML_PARSER_VERSION"),
|
53
|
-
NOKOGIRI_STR_NEW2(xmlParserVersion
|
53
|
+
NOKOGIRI_STR_NEW2(xmlParserVersion)
|
54
54
|
);
|
55
55
|
|
56
56
|
xmlInitParser();
|
@@ -68,16 +68,22 @@ void Init_nokogiri()
|
|
68
68
|
init_xml_node_set();
|
69
69
|
init_xml_xpath_context();
|
70
70
|
init_xml_xpath();
|
71
|
+
init_xml_sax_parser_context();
|
71
72
|
init_xml_sax_parser();
|
72
73
|
init_xml_sax_push_parser();
|
73
74
|
init_xml_reader();
|
74
75
|
init_xml_dtd();
|
76
|
+
init_xml_element_content();
|
77
|
+
init_xml_attribute_decl();
|
78
|
+
init_xml_element_decl();
|
79
|
+
init_xml_entity_decl();
|
75
80
|
init_xml_namespace();
|
76
|
-
|
81
|
+
init_html_sax_parser_context();
|
77
82
|
init_xslt_stylesheet();
|
78
83
|
init_xml_syntax_error();
|
79
84
|
init_html_entity_lookup();
|
80
85
|
init_html_element_description();
|
81
86
|
init_xml_schema();
|
82
87
|
init_xml_relax_ng();
|
88
|
+
init_nokogiri_io();
|
83
89
|
}
|
data/ext/nokogiri/nokogiri.h
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
|
4
4
|
#include <stdlib.h>
|
5
5
|
#include <assert.h>
|
6
|
-
#include <ruby.h>
|
7
6
|
#include <libxml/parser.h>
|
7
|
+
#include <libxml/parserInternals.h>
|
8
8
|
#include <libxml/xpath.h>
|
9
9
|
#include <libxml/xpathInternals.h>
|
10
10
|
#include <libxml/xmlreader.h>
|
@@ -13,6 +13,7 @@
|
|
13
13
|
#include <libxml/HTMLparser.h>
|
14
14
|
#include <libxml/HTMLtree.h>
|
15
15
|
#include <libxml/relaxng.h>
|
16
|
+
#include <ruby.h>
|
16
17
|
|
17
18
|
#ifdef USE_INCLUDED_VASPRINTF
|
18
19
|
int vasprintf (char **strp, const char *fmt, va_list ap);
|
@@ -42,38 +43,28 @@ int is_2_6_16(void) ;
|
|
42
43
|
|
43
44
|
#include <ruby/encoding.h>
|
44
45
|
|
45
|
-
#define NOKOGIRI_STR_NEW2(str
|
46
|
+
#define NOKOGIRI_STR_NEW2(str) \
|
46
47
|
({ \
|
47
48
|
VALUE _string = rb_str_new2((const char *)str); \
|
48
|
-
|
49
|
-
|
50
|
-
if(_enc == -1) \
|
51
|
-
rb_enc_associate_index(_string, rb_enc_find_index("ASCII")); \
|
52
|
-
else \
|
53
|
-
rb_enc_associate_index(_string, _enc); \
|
54
|
-
} \
|
49
|
+
int _enc = rb_enc_find_index("UTF-8"); \
|
50
|
+
rb_enc_associate_index(_string, _enc); \
|
55
51
|
_string; \
|
56
52
|
})
|
57
53
|
|
58
|
-
#define NOKOGIRI_STR_NEW(str, len
|
54
|
+
#define NOKOGIRI_STR_NEW(str, len) \
|
59
55
|
({ \
|
60
56
|
VALUE _string = rb_str_new((const char *)str, (long)len); \
|
61
|
-
|
62
|
-
|
63
|
-
if(_enc == -1) \
|
64
|
-
rb_enc_associate_index(_string, rb_enc_find_index("ASCII")); \
|
65
|
-
else \
|
66
|
-
rb_enc_associate_index(_string, _enc); \
|
67
|
-
} \
|
57
|
+
int _enc = rb_enc_find_index("UTF-8"); \
|
58
|
+
rb_enc_associate_index(_string, _enc); \
|
68
59
|
_string; \
|
69
60
|
})
|
70
61
|
|
71
62
|
#else
|
72
63
|
|
73
|
-
#define NOKOGIRI_STR_NEW2(str
|
64
|
+
#define NOKOGIRI_STR_NEW2(str) \
|
74
65
|
rb_str_new2((const char *)str)
|
75
66
|
|
76
|
-
#define NOKOGIRI_STR_NEW(str, len
|
67
|
+
#define NOKOGIRI_STR_NEW(str, len) \
|
77
68
|
rb_str_new((const char *)str, (long)len)
|
78
69
|
#endif
|
79
70
|
|
@@ -92,11 +83,16 @@ int is_2_6_16(void) ;
|
|
92
83
|
#include <xml_node_set.h>
|
93
84
|
#include <xml_xpath.h>
|
94
85
|
#include <xml_dtd.h>
|
86
|
+
#include <xml_attribute_decl.h>
|
87
|
+
#include <xml_element_decl.h>
|
88
|
+
#include <xml_entity_decl.h>
|
95
89
|
#include <xml_xpath_context.h>
|
90
|
+
#include <xml_element_content.h>
|
91
|
+
#include <xml_sax_parser_context.h>
|
96
92
|
#include <xml_sax_parser.h>
|
97
93
|
#include <xml_sax_push_parser.h>
|
98
94
|
#include <xml_reader.h>
|
99
|
-
#include <
|
95
|
+
#include <html_sax_parser_context.h>
|
100
96
|
#include <xslt_stylesheet.h>
|
101
97
|
#include <xml_syntax_error.h>
|
102
98
|
#include <xml_schema.h>
|