nokogiri 1.3.3 → 1.4.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (201) hide show
  1. data/CHANGELOG.ja.rdoc +48 -3
  2. data/CHANGELOG.rdoc +42 -0
  3. data/Manifest.txt +44 -29
  4. data/README.ja.rdoc +0 -2
  5. data/README.rdoc +4 -7
  6. data/Rakefile +42 -6
  7. data/bin/nokogiri +7 -5
  8. data/ext/nokogiri/extconf.rb +5 -21
  9. data/ext/nokogiri/html_document.c +14 -50
  10. data/ext/nokogiri/html_element_description.c +7 -7
  11. data/ext/nokogiri/html_entity_lookup.c +6 -4
  12. data/ext/nokogiri/html_sax_parser_context.c +92 -0
  13. data/ext/nokogiri/html_sax_parser_context.h +11 -0
  14. data/ext/nokogiri/nokogiri.c +9 -3
  15. data/ext/nokogiri/nokogiri.h +16 -20
  16. data/ext/nokogiri/xml_attr.c +1 -1
  17. data/ext/nokogiri/xml_attribute_decl.c +67 -0
  18. data/ext/nokogiri/xml_attribute_decl.h +9 -0
  19. data/ext/nokogiri/xml_cdata.c +6 -5
  20. data/ext/nokogiri/xml_comment.c +3 -2
  21. data/ext/nokogiri/xml_document.c +93 -23
  22. data/ext/nokogiri/xml_document_fragment.c +1 -3
  23. data/ext/nokogiri/xml_dtd.c +63 -6
  24. data/ext/nokogiri/xml_element_content.c +123 -0
  25. data/ext/nokogiri/xml_element_content.h +10 -0
  26. data/ext/nokogiri/xml_element_decl.c +69 -0
  27. data/ext/nokogiri/xml_element_decl.h +9 -0
  28. data/ext/nokogiri/xml_entity_decl.c +97 -0
  29. data/ext/nokogiri/xml_entity_decl.h +10 -0
  30. data/ext/nokogiri/xml_entity_reference.c +1 -1
  31. data/ext/nokogiri/xml_io.c +10 -3
  32. data/ext/nokogiri/xml_io.h +1 -0
  33. data/ext/nokogiri/xml_namespace.c +2 -2
  34. data/ext/nokogiri/xml_node.c +139 -34
  35. data/ext/nokogiri/xml_node.h +0 -1
  36. data/ext/nokogiri/xml_node_set.c +23 -16
  37. data/ext/nokogiri/xml_processing_instruction.c +1 -1
  38. data/ext/nokogiri/xml_reader.c +78 -50
  39. data/ext/nokogiri/xml_sax_parser.c +109 -168
  40. data/ext/nokogiri/xml_sax_parser.h +33 -0
  41. data/ext/nokogiri/xml_sax_parser_context.c +155 -0
  42. data/ext/nokogiri/xml_sax_parser_context.h +10 -0
  43. data/ext/nokogiri/xml_sax_push_parser.c +11 -6
  44. data/ext/nokogiri/xml_syntax_error.c +63 -12
  45. data/ext/nokogiri/xml_text.c +4 -3
  46. data/ext/nokogiri/xml_xpath.c +1 -1
  47. data/ext/nokogiri/xml_xpath_context.c +12 -25
  48. data/ext/nokogiri/xslt_stylesheet.c +3 -3
  49. data/lib/nokogiri.rb +4 -4
  50. data/lib/nokogiri/css/generated_tokenizer.rb +1 -0
  51. data/lib/nokogiri/css/node.rb +1 -9
  52. data/lib/nokogiri/css/xpath_visitor.rb +11 -21
  53. data/lib/nokogiri/ffi/html/document.rb +0 -9
  54. data/lib/nokogiri/ffi/html/sax/parser_context.rb +38 -0
  55. data/lib/nokogiri/ffi/io_callbacks.rb +4 -2
  56. data/lib/nokogiri/ffi/libxml.rb +44 -10
  57. data/lib/nokogiri/ffi/structs/common_node.rb +1 -1
  58. data/lib/nokogiri/ffi/structs/xml_attribute.rb +27 -0
  59. data/lib/nokogiri/ffi/structs/xml_dtd.rb +3 -1
  60. data/lib/nokogiri/ffi/structs/xml_element.rb +26 -0
  61. data/lib/nokogiri/ffi/structs/xml_element_content.rb +17 -0
  62. data/lib/nokogiri/ffi/structs/xml_entity.rb +32 -0
  63. data/lib/nokogiri/ffi/structs/xml_enumeration.rb +12 -0
  64. data/lib/nokogiri/ffi/structs/xml_parser_context.rb +19 -0
  65. data/lib/nokogiri/ffi/structs/xml_sax_push_parser_context.rb +4 -3
  66. data/lib/nokogiri/ffi/structs/xml_syntax_error.rb +1 -1
  67. data/lib/nokogiri/ffi/xml/attribute_decl.rb +27 -0
  68. data/lib/nokogiri/ffi/xml/comment.rb +2 -2
  69. data/lib/nokogiri/ffi/xml/document.rb +29 -12
  70. data/lib/nokogiri/ffi/xml/document_fragment.rb +0 -5
  71. data/lib/nokogiri/ffi/xml/dtd.rb +14 -3
  72. data/lib/nokogiri/ffi/xml/element_content.rb +43 -0
  73. data/lib/nokogiri/ffi/xml/element_decl.rb +19 -0
  74. data/lib/nokogiri/ffi/xml/entity_decl.rb +27 -0
  75. data/lib/nokogiri/ffi/xml/node.rb +45 -5
  76. data/lib/nokogiri/ffi/xml/node_set.rb +1 -1
  77. data/lib/nokogiri/ffi/xml/reader.rb +45 -24
  78. data/lib/nokogiri/ffi/xml/sax/parser.rb +27 -34
  79. data/lib/nokogiri/ffi/xml/sax/parser_context.rb +67 -0
  80. data/lib/nokogiri/ffi/xml/sax/push_parser.rb +5 -4
  81. data/lib/nokogiri/ffi/xml/syntax_error.rb +31 -16
  82. data/lib/nokogiri/ffi/xml/text.rb +2 -2
  83. data/lib/nokogiri/html.rb +1 -0
  84. data/lib/nokogiri/html/document.rb +39 -24
  85. data/lib/nokogiri/html/sax/parser.rb +2 -2
  86. data/lib/nokogiri/html/sax/parser_context.rb +16 -0
  87. data/lib/nokogiri/version.rb +1 -1
  88. data/lib/nokogiri/xml.rb +6 -1
  89. data/lib/nokogiri/xml/attr.rb +5 -0
  90. data/lib/nokogiri/xml/attribute_decl.rb +18 -0
  91. data/lib/nokogiri/xml/builder.rb +121 -13
  92. data/lib/nokogiri/xml/character_data.rb +7 -0
  93. data/lib/nokogiri/xml/document.rb +43 -29
  94. data/lib/nokogiri/xml/document_fragment.rb +26 -6
  95. data/lib/nokogiri/xml/dtd.rb +5 -5
  96. data/lib/nokogiri/xml/element_content.rb +36 -0
  97. data/lib/nokogiri/xml/element_decl.rb +13 -0
  98. data/lib/nokogiri/xml/entity_decl.rb +15 -0
  99. data/lib/nokogiri/xml/fragment_handler.rb +22 -11
  100. data/lib/nokogiri/xml/namespace.rb +6 -0
  101. data/lib/nokogiri/xml/node.rb +33 -15
  102. data/lib/nokogiri/xml/node_set.rb +66 -44
  103. data/lib/nokogiri/xml/pp.rb +2 -0
  104. data/lib/nokogiri/xml/pp/character_data.rb +18 -0
  105. data/lib/nokogiri/xml/pp/node.rb +56 -0
  106. data/lib/nokogiri/xml/reader.rb +8 -0
  107. data/lib/nokogiri/xml/sax.rb +1 -1
  108. data/lib/nokogiri/xml/sax/document.rb +18 -1
  109. data/lib/nokogiri/xml/sax/parser.rb +15 -8
  110. data/lib/nokogiri/xml/sax/parser_context.rb +16 -0
  111. data/lib/nokogiri/xml/sax/push_parser.rb +0 -3
  112. data/lib/nokogiri/xml/syntax_error.rb +4 -0
  113. data/lib/nokogiri/xslt/stylesheet.rb +1 -1
  114. data/test/css/test_nthiness.rb +1 -1
  115. data/test/css/test_parser.rb +1 -1
  116. data/test/css/test_tokenizer.rb +1 -1
  117. data/test/css/test_xpath_visitor.rb +1 -1
  118. data/test/ffi/test_document.rb +1 -1
  119. data/test/files/shift_jis.html +10 -0
  120. data/test/files/staff.dtd +10 -0
  121. data/test/helper.rb +12 -3
  122. data/test/html/sax/test_parser.rb +1 -1
  123. data/test/html/sax/test_parser_context.rb +48 -0
  124. data/test/html/test_builder.rb +8 -2
  125. data/test/html/test_document.rb +23 -1
  126. data/test/html/test_document_encoding.rb +15 -1
  127. data/test/html/test_document_fragment.rb +10 -1
  128. data/test/html/test_element_description.rb +1 -2
  129. data/test/html/test_named_characters.rb +1 -1
  130. data/test/html/test_node.rb +61 -1
  131. data/test/html/test_node_encoding.rb +27 -0
  132. data/test/test_convert_xpath.rb +1 -3
  133. data/test/test_css_cache.rb +1 -1
  134. data/test/test_gc.rb +1 -1
  135. data/test/test_memory_leak.rb +1 -1
  136. data/test/test_nokogiri.rb +3 -3
  137. data/test/test_reader.rb +29 -1
  138. data/test/test_xslt_transforms.rb +1 -1
  139. data/test/xml/node/test_save_options.rb +1 -1
  140. data/test/xml/node/test_subclass.rb +1 -1
  141. data/test/xml/sax/test_parser.rb +64 -3
  142. data/test/xml/sax/test_parser_context.rb +56 -0
  143. data/test/xml/sax/test_push_parser.rb +11 -1
  144. data/test/xml/test_attr.rb +1 -1
  145. data/test/xml/test_attribute_decl.rb +82 -0
  146. data/test/xml/test_builder.rb +95 -1
  147. data/test/xml/test_cdata.rb +1 -1
  148. data/test/xml/test_comment.rb +7 -1
  149. data/test/xml/test_document.rb +147 -6
  150. data/test/xml/test_document_encoding.rb +1 -1
  151. data/test/xml/test_document_fragment.rb +55 -5
  152. data/test/xml/test_dtd.rb +40 -5
  153. data/test/xml/test_dtd_encoding.rb +3 -1
  154. data/test/xml/test_element_content.rb +56 -0
  155. data/test/xml/test_element_decl.rb +73 -0
  156. data/test/xml/test_entity_decl.rb +83 -0
  157. data/test/xml/test_entity_reference.rb +1 -1
  158. data/test/xml/test_namespace.rb +21 -1
  159. data/test/xml/test_node.rb +70 -4
  160. data/test/xml/test_node_attributes.rb +1 -1
  161. data/test/xml/test_node_encoding.rb +1 -1
  162. data/test/xml/test_node_set.rb +136 -2
  163. data/test/xml/test_parse_options.rb +1 -1
  164. data/test/xml/test_processing_instruction.rb +1 -1
  165. data/test/xml/test_reader_encoding.rb +1 -1
  166. data/test/xml/test_relax_ng.rb +1 -1
  167. data/test/xml/test_schema.rb +1 -1
  168. data/test/xml/test_syntax_error.rb +27 -0
  169. data/test/xml/test_text.rb +13 -1
  170. data/test/xml/test_unparented_node.rb +1 -1
  171. data/test/xml/test_xpath.rb +1 -1
  172. metadata +57 -40
  173. data/ext/nokogiri/html_sax_parser.c +0 -57
  174. data/ext/nokogiri/html_sax_parser.h +0 -11
  175. data/lib/action-nokogiri.rb +0 -38
  176. data/lib/nokogiri/decorators.rb +0 -2
  177. data/lib/nokogiri/decorators/hpricot.rb +0 -3
  178. data/lib/nokogiri/decorators/hpricot/node.rb +0 -56
  179. data/lib/nokogiri/decorators/hpricot/node_set.rb +0 -54
  180. data/lib/nokogiri/decorators/hpricot/xpath_visitor.rb +0 -30
  181. data/lib/nokogiri/ffi/html/sax/parser.rb +0 -21
  182. data/lib/nokogiri/hpricot.rb +0 -92
  183. data/lib/nokogiri/xml/entity_declaration.rb +0 -11
  184. data/lib/nokogiri/xml/sax/legacy_handlers.rb +0 -65
  185. data/test/hpricot/files/basic.xhtml +0 -17
  186. data/test/hpricot/files/boingboing.html +0 -2266
  187. data/test/hpricot/files/cy0.html +0 -3653
  188. data/test/hpricot/files/immob.html +0 -400
  189. data/test/hpricot/files/pace_application.html +0 -1320
  190. data/test/hpricot/files/tenderlove.html +0 -16
  191. data/test/hpricot/files/uswebgen.html +0 -220
  192. data/test/hpricot/files/utf8.html +0 -1054
  193. data/test/hpricot/files/week9.html +0 -1723
  194. data/test/hpricot/files/why.xml +0 -19
  195. data/test/hpricot/load_files.rb +0 -11
  196. data/test/hpricot/test_alter.rb +0 -68
  197. data/test/hpricot/test_builder.rb +0 -20
  198. data/test/hpricot/test_parser.rb +0 -350
  199. data/test/hpricot/test_paths.rb +0 -15
  200. data/test/hpricot/test_preserved.rb +0 -77
  201. data/test/hpricot/test_xml.rb +0 -30
@@ -6,6 +6,8 @@ require 'uri'
6
6
  require 'rubygems'
7
7
  require 'nokogiri'
8
8
 
9
+ parse_class = Nokogiri
10
+
9
11
  opts = OptionParser.new do |opts|
10
12
  opts.banner = "Nokogiri: an HTML, XML, SAX, and Reader parser"
11
13
  opts.define_head "Usage: nokogiri <uri|path> [options]"
@@ -16,6 +18,10 @@ opts = OptionParser.new do |opts|
16
18
  opts.separator ""
17
19
  opts.separator "Options:"
18
20
 
21
+ opts.on("--type [TYPE]", [:xml, :html]) do |v|
22
+ parse_class = {:xml => Nokogiri::XML, :html => Nokogiri::HTML}[v]
23
+ end
24
+
19
25
  opts.on_tail("-?", "--help", "Show this message") do
20
26
  puts opts
21
27
  exit
@@ -36,11 +42,7 @@ if uri.to_s.strip.empty?
36
42
  exit 1
37
43
  end
38
44
 
39
- if URI.parse(uri).scheme
40
- @doc = Nokogiri(open(uri).read)
41
- else
42
- @doc = Nokogiri(File.read(uri))
43
- end
45
+ @doc = parse_class.parse(open(uri).read)
44
46
 
45
47
  puts "Your document is stored in @doc..."
46
48
  IRB.start
@@ -129,27 +129,11 @@ unless find_library('exslt', 'exsltFuncRegister', *LIB_DIRS)
129
129
  abort "libxslt is missing. try 'port install libxslt' or 'yum install libxslt-devel'"
130
130
  end
131
131
 
132
- def nokogiri_link_command ldflags, opt='', libpath=$LIBPATH
133
- old_link_command ldflags, opt, libpath
134
- end
135
-
136
- def with_custom_link
137
- alias :old_link_command :link_command
138
- alias :link_command :nokogiri_link_command
139
- yield
140
- ensure
141
- alias :link_command :old_link_command
142
- end
143
-
144
- with_custom_link do
145
- with_cppflags $INCFLAGS do
146
- have_func('xmlRelaxNGSetParserStructuredErrors')
147
- have_func('xmlRelaxNGSetParserStructuredErrors')
148
- have_func('xmlRelaxNGSetValidStructuredErrors')
149
- have_func('xmlSchemaSetValidStructuredErrors')
150
- have_func('xmlSchemaSetParserStructuredErrors')
151
- end
152
- end
132
+ have_func('xmlRelaxNGSetParserStructuredErrors')
133
+ have_func('xmlRelaxNGSetParserStructuredErrors')
134
+ have_func('xmlRelaxNGSetValidStructuredErrors')
135
+ have_func('xmlSchemaSetValidStructuredErrors')
136
+ have_func('xmlSchemaSetParserStructuredErrors')
153
137
 
154
138
  if ENV['CPUPROFILE']
155
139
  unless find_library('profiler', 'ProfilerEnable', *LIB_DIRS)
@@ -11,15 +11,15 @@ static VALUE new(int argc, VALUE *argv, VALUE klass)
11
11
  VALUE uri, external_id, rest, rb_doc;
12
12
 
13
13
  rb_scan_args(argc, argv, "0*", &rest);
14
- uri = rb_ary_entry(rest, 0);
15
- external_id = rb_ary_entry(rest, 1);
14
+ uri = rb_ary_entry(rest, (long)0);
15
+ external_id = rb_ary_entry(rest, (long)1);
16
16
 
17
17
  htmlDocPtr doc = htmlNewDoc(
18
18
  RTEST(uri) ? (const xmlChar *)StringValuePtr(uri) : NULL,
19
19
  RTEST(external_id) ? (const xmlChar *)StringValuePtr(external_id) : NULL
20
20
  );
21
21
  rb_doc = Nokogiri_wrap_xml_document(klass, doc);
22
- rb_funcall2(rb_doc, rb_intern("initialize"), argc, argv);
22
+ rb_obj_call_init(rb_doc, argc, argv);
23
23
  return rb_doc ;
24
24
  }
25
25
 
@@ -36,8 +36,8 @@ static VALUE read_io( VALUE klass,
36
36
  VALUE encoding,
37
37
  VALUE options )
38
38
  {
39
- const char * c_url = (url == Qnil) ? NULL : StringValuePtr(url);
40
- const char * c_enc = (encoding == Qnil) ? NULL : StringValuePtr(encoding);
39
+ const char * c_url = NIL_P(url) ? NULL : StringValuePtr(url);
40
+ const char * c_enc = NIL_P(encoding) ? NULL : StringValuePtr(encoding);
41
41
  VALUE error_list = rb_ary_new();
42
42
 
43
43
  xmlResetLastError();
@@ -49,7 +49,7 @@ static VALUE read_io( VALUE klass,
49
49
  (void *)io,
50
50
  c_url,
51
51
  c_enc,
52
- NUM2INT(options)
52
+ (int)NUM2INT(options)
53
53
  );
54
54
  xmlSetStructuredErrorFunc(NULL, NULL);
55
55
 
@@ -58,9 +58,7 @@ static VALUE read_io( VALUE klass,
58
58
 
59
59
  xmlErrorPtr error = xmlGetLastError();
60
60
  if(error)
61
- rb_funcall(rb_mKernel, rb_intern("raise"), 1,
62
- Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error)
63
- );
61
+ rb_exc_raise(Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error));
64
62
  else
65
63
  rb_raise(rb_eRuntimeError, "Could not parse document");
66
64
 
@@ -68,7 +66,7 @@ static VALUE read_io( VALUE klass,
68
66
  }
69
67
 
70
68
  VALUE document = Nokogiri_wrap_xml_document(klass, doc);
71
- rb_funcall(document, rb_intern("errors="), 1, error_list);
69
+ rb_iv_set(document, "@errors", error_list);
72
70
  return document;
73
71
  }
74
72
 
@@ -86,15 +84,15 @@ static VALUE read_memory( VALUE klass,
86
84
  VALUE options )
87
85
  {
88
86
  const char * c_buffer = StringValuePtr(string);
89
- const char * c_url = (url == Qnil) ? NULL : StringValuePtr(url);
90
- const char * c_enc = (encoding == Qnil) ? NULL : StringValuePtr(encoding);
87
+ const char * c_url = NIL_P(url) ? NULL : StringValuePtr(url);
88
+ const char * c_enc = NIL_P(encoding) ? NULL : StringValuePtr(encoding);
91
89
  int len = RSTRING_LEN(string);
92
90
  VALUE error_list = rb_ary_new();
93
91
 
94
92
  xmlResetLastError();
95
93
  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);
96
94
 
97
- htmlDocPtr doc = htmlReadMemory(c_buffer, len, c_url, c_enc, NUM2INT(options));
95
+ htmlDocPtr doc = htmlReadMemory(c_buffer, len, c_url, c_enc, (int)NUM2INT(options));
98
96
  xmlSetStructuredErrorFunc(NULL, NULL);
99
97
 
100
98
  if(doc == NULL) {
@@ -102,9 +100,7 @@ static VALUE read_memory( VALUE klass,
102
100
 
103
101
  xmlErrorPtr error = xmlGetLastError();
104
102
  if(error)
105
- rb_funcall(rb_mKernel, rb_intern("raise"), 1,
106
- Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error)
107
- );
103
+ rb_exc_raise(Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error));
108
104
  else
109
105
  rb_raise(rb_eRuntimeError, "Could not parse document");
110
106
 
@@ -112,7 +108,7 @@ static VALUE read_memory( VALUE klass,
112
108
  }
113
109
 
114
110
  VALUE document = Nokogiri_wrap_xml_document(klass, doc);
115
- rb_funcall(document, rb_intern("errors="), 1, error_list);
111
+ rb_iv_set(document, "@errors", error_list);
116
112
  return document;
117
113
  }
118
114
 
@@ -126,37 +122,7 @@ static VALUE type(VALUE self)
126
122
  {
127
123
  htmlDocPtr doc;
128
124
  Data_Get_Struct(self, xmlDoc, doc);
129
- return INT2NUM((int)doc->type);
130
- }
131
-
132
- /*
133
- * call-seq:
134
- * meta_encoding=
135
- *
136
- * Set the meta tag encoding for this document.
137
- */
138
- static VALUE set_meta_encoding(VALUE self, VALUE encoding)
139
- {
140
- htmlDocPtr doc;
141
- Data_Get_Struct(self, xmlDoc, doc);
142
-
143
- htmlSetMetaEncoding(doc, (const xmlChar *)StringValuePtr(encoding));
144
-
145
- return encoding;
146
- }
147
-
148
- /*
149
- * call-seq:
150
- * meta_encoding
151
- *
152
- * Get the meta tag encoding for this document.
153
- */
154
- static VALUE meta_encoding(VALUE self)
155
- {
156
- htmlDocPtr doc;
157
- Data_Get_Struct(self, xmlDoc, doc);
158
-
159
- return NOKOGIRI_STR_NEW2(htmlGetMetaEncoding(doc), doc->encoding);
125
+ return INT2NUM((long)doc->type);
160
126
  }
161
127
 
162
128
  VALUE cNokogiriHtmlDocument ;
@@ -176,6 +142,4 @@ void init_html_document()
176
142
  rb_define_singleton_method(klass, "new", new, -1);
177
143
 
178
144
  rb_define_method(klass, "type", type, 0);
179
- rb_define_method(klass, "meta_encoding", meta_encoding, 0);
180
- rb_define_method(klass, "meta_encoding=", set_meta_encoding, 1);
181
145
  }
@@ -17,7 +17,7 @@ static VALUE required_attributes(VALUE self)
17
17
 
18
18
  int i = 0;
19
19
  while(description->attrs_req[i]) {
20
- rb_ary_push(list, NOKOGIRI_STR_NEW2(description->attrs_req[i], "UTF-8"));
20
+ rb_ary_push(list, NOKOGIRI_STR_NEW2(description->attrs_req[i]));
21
21
  i++;
22
22
  }
23
23
 
@@ -41,7 +41,7 @@ static VALUE deprecated_attributes(VALUE self)
41
41
 
42
42
  int i = 0;
43
43
  while(description->attrs_depr[i]) {
44
- rb_ary_push(list, NOKOGIRI_STR_NEW2(description->attrs_depr[i], "UTF-8"));
44
+ rb_ary_push(list, NOKOGIRI_STR_NEW2(description->attrs_depr[i]));
45
45
  i++;
46
46
  }
47
47
 
@@ -65,7 +65,7 @@ static VALUE optional_attributes(VALUE self)
65
65
 
66
66
  int i = 0;
67
67
  while(description->attrs_opt[i]) {
68
- rb_ary_push(list, NOKOGIRI_STR_NEW2(description->attrs_opt[i], "UTF-8"));
68
+ rb_ary_push(list, NOKOGIRI_STR_NEW2(description->attrs_opt[i]));
69
69
  i++;
70
70
  }
71
71
 
@@ -83,7 +83,7 @@ static VALUE default_sub_element(VALUE self)
83
83
  htmlElemDesc * description;
84
84
  Data_Get_Struct(self, htmlElemDesc, description);
85
85
 
86
- return NOKOGIRI_STR_NEW2(description->defaultsubelt, "UTF-8");
86
+ return NOKOGIRI_STR_NEW2(description->defaultsubelt);
87
87
  }
88
88
 
89
89
  /*
@@ -103,7 +103,7 @@ static VALUE sub_elements(VALUE self)
103
103
 
104
104
  int i = 0;
105
105
  while(description->subelts[i]) {
106
- rb_ary_push(list, NOKOGIRI_STR_NEW2(description->subelts[i], "UTF-8"));
106
+ rb_ary_push(list, NOKOGIRI_STR_NEW2(description->subelts[i]));
107
107
  i++;
108
108
  }
109
109
 
@@ -121,7 +121,7 @@ static VALUE description(VALUE self)
121
121
  htmlElemDesc * description;
122
122
  Data_Get_Struct(self, htmlElemDesc, description);
123
123
 
124
- return NOKOGIRI_STR_NEW2(description->desc, "UTF-8");
124
+ return NOKOGIRI_STR_NEW2(description->desc);
125
125
  }
126
126
 
127
127
  /*
@@ -226,7 +226,7 @@ static VALUE name(VALUE self)
226
226
  Data_Get_Struct(self, htmlElemDesc, description);
227
227
 
228
228
  if(NULL == description->name) return Qnil;
229
- return NOKOGIRI_STR_NEW2(description->name, "UTF-8");
229
+ return NOKOGIRI_STR_NEW2(description->name);
230
230
  }
231
231
 
232
232
  /*
@@ -14,10 +14,12 @@ static VALUE get(VALUE self, VALUE key)
14
14
  if(NULL == desc) return Qnil;
15
15
  VALUE klass = rb_const_get(mNokogiriHtml, rb_intern("EntityDescription"));
16
16
 
17
- return rb_funcall(klass, rb_intern("new"), 3,
18
- INT2NUM((int)desc->value),
19
- NOKOGIRI_STR_NEW2(desc->name, "UTF-8"),
20
- NOKOGIRI_STR_NEW2(desc->desc, "UTF-8"));
17
+ VALUE args[3];
18
+ args[0] = INT2NUM((long)desc->value);
19
+ args[1] = NOKOGIRI_STR_NEW2(desc->name);
20
+ args[2] = NOKOGIRI_STR_NEW2(desc->desc);
21
+
22
+ return rb_class_new_instance(3, args, klass);
21
23
  }
22
24
 
23
25
  void init_html_entity_lookup()
@@ -0,0 +1,92 @@
1
+ #include <html_sax_parser_context.h>
2
+
3
+ VALUE cNokogiriHtmlSaxParserContext ;
4
+
5
+ static void deallocate(xmlParserCtxtPtr ctxt)
6
+ {
7
+ NOKOGIRI_DEBUG_START(handler);
8
+
9
+ ctxt->sax = NULL;
10
+
11
+ htmlFreeParserCtxt(ctxt);
12
+
13
+ NOKOGIRI_DEBUG_END(handler);
14
+ }
15
+
16
+ static VALUE parse_memory(VALUE klass, VALUE data, VALUE encoding)
17
+ {
18
+ if(NIL_P(data)) rb_raise(rb_eArgError, "data cannot be nil");
19
+ if(!(int)RSTRING_LEN(data))
20
+ rb_raise(rb_eRuntimeError, "data cannot be empty");
21
+
22
+ htmlParserCtxtPtr ctxt = htmlCreateMemoryParserCtxt(
23
+ StringValuePtr(data),
24
+ (int)RSTRING_LEN(data)
25
+ );
26
+
27
+ if(RTEST(encoding)) {
28
+ xmlCharEncoding enc = xmlParseCharEncoding(StringValuePtr(encoding));
29
+ if(enc != XML_CHAR_ENCODING_ERROR) {
30
+ xmlSwitchEncoding(ctxt, enc);
31
+ if(ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
32
+ rb_raise(rb_eRuntimeError, "Unsupported encoding %s",
33
+ StringValuePtr(encoding));
34
+ }
35
+ }
36
+ }
37
+
38
+ return Data_Wrap_Struct(klass, NULL, deallocate, ctxt);
39
+ }
40
+
41
+ static VALUE parse_file(VALUE klass, VALUE filename, VALUE encoding)
42
+ {
43
+ htmlParserCtxtPtr ctxt = htmlCreateFileParserCtxt(
44
+ StringValuePtr(filename),
45
+ StringValuePtr(encoding)
46
+ );
47
+ return Data_Wrap_Struct(klass, NULL, deallocate, ctxt);
48
+ }
49
+
50
+ static VALUE parse_with(VALUE self, VALUE sax_handler)
51
+ {
52
+ if(!rb_obj_is_kind_of(sax_handler, cNokogiriXmlSaxParser))
53
+ rb_raise(rb_eArgError, "argument must be a Nokogiri::XML::SAX::Parser");
54
+
55
+ htmlParserCtxtPtr ctxt;
56
+ Data_Get_Struct(self, htmlParserCtxt, ctxt);
57
+
58
+ htmlSAXHandlerPtr sax;
59
+ Data_Get_Struct(sax_handler, htmlSAXHandler, sax);
60
+
61
+ // Free the sax handler since we'll assign our own
62
+ if(ctxt->sax && ctxt->sax != (xmlSAXHandlerPtr)&xmlDefaultSAXHandler)
63
+ xmlFree(ctxt->sax);
64
+
65
+ ctxt->sax = sax;
66
+ ctxt->userData = (void *)NOKOGIRI_SAX_TUPLE_NEW(ctxt, sax_handler);
67
+
68
+ htmlParseDocument(ctxt);
69
+
70
+ if(NULL != ctxt->myDoc) xmlFreeDoc(ctxt->myDoc);
71
+
72
+ NOKOGIRI_SAX_TUPLE_DESTROY(ctxt->userData);
73
+ return self;
74
+ }
75
+
76
+ void init_html_sax_parser_context()
77
+ {
78
+ VALUE nokogiri = rb_define_module("Nokogiri");
79
+ VALUE xml = rb_define_module_under(nokogiri, "XML");
80
+ VALUE html = rb_define_module_under(nokogiri, "HTML");
81
+ VALUE sax = rb_define_module_under(xml, "SAX");
82
+ VALUE hsax = rb_define_module_under(html, "SAX");
83
+ VALUE pc = rb_define_class_under(sax, "ParserContext", rb_cObject);
84
+ VALUE klass = rb_define_class_under(hsax, "ParserContext", pc);
85
+
86
+ cNokogiriHtmlSaxParserContext = klass;
87
+
88
+ rb_define_singleton_method(klass, "memory", parse_memory, 2);
89
+ rb_define_singleton_method(klass, "file", parse_file, 2);
90
+
91
+ rb_define_method(klass, "parse_with", parse_with, 1);
92
+ }
@@ -0,0 +1,11 @@
1
+ #ifndef NOKOGIRI_HTML_SAX_PARSER_CONTEXT
2
+ #define NOKOGIRI_HTML_SAX_PARSER_CONTEXT
3
+
4
+ #include <nokogiri.h>
5
+
6
+ extern VALUE cNokogiriHtmlSaxParserContext;
7
+
8
+ void init_html_sax_parser_context();
9
+
10
+ #endif
11
+
@@ -46,11 +46,11 @@ void Init_nokogiri()
46
46
 
47
47
  rb_const_set( mNokogiri,
48
48
  rb_intern("LIBXML_VERSION"),
49
- NOKOGIRI_STR_NEW2(LIBXML_DOTTED_VERSION, "UTF-8")
49
+ NOKOGIRI_STR_NEW2(LIBXML_DOTTED_VERSION)
50
50
  );
51
51
  rb_const_set( mNokogiri,
52
52
  rb_intern("LIBXML_PARSER_VERSION"),
53
- NOKOGIRI_STR_NEW2(xmlParserVersion, "UTF-8")
53
+ NOKOGIRI_STR_NEW2(xmlParserVersion)
54
54
  );
55
55
 
56
56
  xmlInitParser();
@@ -68,16 +68,22 @@ void Init_nokogiri()
68
68
  init_xml_node_set();
69
69
  init_xml_xpath_context();
70
70
  init_xml_xpath();
71
+ init_xml_sax_parser_context();
71
72
  init_xml_sax_parser();
72
73
  init_xml_sax_push_parser();
73
74
  init_xml_reader();
74
75
  init_xml_dtd();
76
+ init_xml_element_content();
77
+ init_xml_attribute_decl();
78
+ init_xml_element_decl();
79
+ init_xml_entity_decl();
75
80
  init_xml_namespace();
76
- init_html_sax_parser();
81
+ init_html_sax_parser_context();
77
82
  init_xslt_stylesheet();
78
83
  init_xml_syntax_error();
79
84
  init_html_entity_lookup();
80
85
  init_html_element_description();
81
86
  init_xml_schema();
82
87
  init_xml_relax_ng();
88
+ init_nokogiri_io();
83
89
  }
@@ -3,8 +3,8 @@
3
3
 
4
4
  #include <stdlib.h>
5
5
  #include <assert.h>
6
- #include <ruby.h>
7
6
  #include <libxml/parser.h>
7
+ #include <libxml/parserInternals.h>
8
8
  #include <libxml/xpath.h>
9
9
  #include <libxml/xpathInternals.h>
10
10
  #include <libxml/xmlreader.h>
@@ -13,6 +13,7 @@
13
13
  #include <libxml/HTMLparser.h>
14
14
  #include <libxml/HTMLtree.h>
15
15
  #include <libxml/relaxng.h>
16
+ #include <ruby.h>
16
17
 
17
18
  #ifdef USE_INCLUDED_VASPRINTF
18
19
  int vasprintf (char **strp, const char *fmt, va_list ap);
@@ -42,38 +43,28 @@ int is_2_6_16(void) ;
42
43
 
43
44
  #include <ruby/encoding.h>
44
45
 
45
- #define NOKOGIRI_STR_NEW2(str, encoding) \
46
+ #define NOKOGIRI_STR_NEW2(str) \
46
47
  ({ \
47
48
  VALUE _string = rb_str_new2((const char *)str); \
48
- if(NULL != encoding) { \
49
- int _enc = rb_enc_find_index("UTF-8"); \
50
- if(_enc == -1) \
51
- rb_enc_associate_index(_string, rb_enc_find_index("ASCII")); \
52
- else \
53
- rb_enc_associate_index(_string, _enc); \
54
- } \
49
+ int _enc = rb_enc_find_index("UTF-8"); \
50
+ rb_enc_associate_index(_string, _enc); \
55
51
  _string; \
56
52
  })
57
53
 
58
- #define NOKOGIRI_STR_NEW(str, len, encoding) \
54
+ #define NOKOGIRI_STR_NEW(str, len) \
59
55
  ({ \
60
56
  VALUE _string = rb_str_new((const char *)str, (long)len); \
61
- if(NULL != encoding) { \
62
- int _enc = rb_enc_find_index("UTF-8"); \
63
- if(_enc == -1) \
64
- rb_enc_associate_index(_string, rb_enc_find_index("ASCII")); \
65
- else \
66
- rb_enc_associate_index(_string, _enc); \
67
- } \
57
+ int _enc = rb_enc_find_index("UTF-8"); \
58
+ rb_enc_associate_index(_string, _enc); \
68
59
  _string; \
69
60
  })
70
61
 
71
62
  #else
72
63
 
73
- #define NOKOGIRI_STR_NEW2(str, doc) \
64
+ #define NOKOGIRI_STR_NEW2(str) \
74
65
  rb_str_new2((const char *)str)
75
66
 
76
- #define NOKOGIRI_STR_NEW(str, len, doc) \
67
+ #define NOKOGIRI_STR_NEW(str, len) \
77
68
  rb_str_new((const char *)str, (long)len)
78
69
  #endif
79
70
 
@@ -92,11 +83,16 @@ int is_2_6_16(void) ;
92
83
  #include <xml_node_set.h>
93
84
  #include <xml_xpath.h>
94
85
  #include <xml_dtd.h>
86
+ #include <xml_attribute_decl.h>
87
+ #include <xml_element_decl.h>
88
+ #include <xml_entity_decl.h>
95
89
  #include <xml_xpath_context.h>
90
+ #include <xml_element_content.h>
91
+ #include <xml_sax_parser_context.h>
96
92
  #include <xml_sax_parser.h>
97
93
  #include <xml_sax_push_parser.h>
98
94
  #include <xml_reader.h>
99
- #include <html_sax_parser.h>
95
+ #include <html_sax_parser_context.h>
100
96
  #include <xslt_stylesheet.h>
101
97
  #include <xml_syntax_error.h>
102
98
  #include <xml_schema.h>