nokogiri 1.2.0 → 1.2.1

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

@@ -1,3 +1,10 @@
1
+ === 1.2.1 / 2009年2月23日
2
+
3
+ * 修正
4
+
5
+ * CSS のセレクターのスペースを修正
6
+ * Ruby 1.9 のStringのエンコードを修正 (角谷さんに感謝!)
7
+
1
8
  === 1.2.0 / 2009年2月22日
2
9
 
3
10
  * 新しい機能
@@ -1,3 +1,10 @@
1
+ === 1.2.1 / 2009-02-23
2
+
3
+ * Bugfixes
4
+
5
+ * Fixed a CSS selector space bug
6
+ * Fixed Ruby 1.9 String Encoding (Thanks 角谷さん!)
7
+
1
8
  === 1.2.0 / 2009-02-22
2
9
 
3
10
  * New features
@@ -151,12 +151,16 @@ test/xml/test_builder.rb
151
151
  test/xml/test_cdata.rb
152
152
  test/xml/test_comment.rb
153
153
  test/xml/test_document.rb
154
+ test/xml/test_document_encoding.rb
154
155
  test/xml/test_document_fragment.rb
155
156
  test/xml/test_dtd.rb
157
+ test/xml/test_dtd_encoding.rb
156
158
  test/xml/test_entity_reference.rb
157
159
  test/xml/test_node.rb
160
+ test/xml/test_node_encoding.rb
158
161
  test/xml/test_node_set.rb
159
162
  test/xml/test_processing_instruction.rb
163
+ test/xml/test_reader_encoding.rb
160
164
  test/xml/test_text.rb
161
165
  test/xml/test_xpath.rb
162
166
  vendor/hoe.rb
@@ -25,7 +25,7 @@ void Init_native()
25
25
 
26
26
  rb_const_set( mNokogiri,
27
27
  rb_intern("LIBXML_VERSION"),
28
- rb_str_new2(LIBXML_DOTTED_VERSION)
28
+ NOKOGIRI_STR_NEW2(LIBXML_DOTTED_VERSION, "UTF-8")
29
29
  );
30
30
 
31
31
  init_xml_document();
@@ -12,6 +12,46 @@
12
12
  #include <libxml/HTMLparser.h>
13
13
  #include <libxml/HTMLtree.h>
14
14
 
15
+
16
+ #ifndef UNUSED
17
+ # if defined(__GNUC__)
18
+ # define MAYBE_UNUSED(name) name __attribute__((unused))
19
+ # define UNUSED(name) MAYBE_UNUSED(UNUSED_ ## name)
20
+ # else
21
+ # define MAYBE_UNUSED(name) name
22
+ # define UNUSED(name) name
23
+ # endif
24
+ #endif
25
+
26
+ #ifdef HAVE_RUBY_ENCODING_H
27
+
28
+ #include <ruby/encoding.h>
29
+
30
+ #define NOKOGIRI_STR_NEW2(str, encoding) \
31
+ ({ \
32
+ VALUE _string = rb_str_new2((const char *)str); \
33
+ if(NULL != encoding) \
34
+ rb_enc_associate_index(_string, rb_enc_find_index(encoding)); \
35
+ _string; \
36
+ })
37
+
38
+ #define NOKOGIRI_STR_NEW(str, len, encoding) \
39
+ ({ \
40
+ VALUE _string = rb_str_new((const char *)str, (long)len); \
41
+ if(NULL != encoding) \
42
+ rb_enc_associate_index(_string, rb_enc_find_index(encoding)); \
43
+ _string; \
44
+ })
45
+
46
+ #else
47
+
48
+ #define NOKOGIRI_STR_NEW2(str, doc) \
49
+ rb_str_new2((const char *)str)
50
+
51
+ #define NOKOGIRI_STR_NEW(str, len, doc) \
52
+ rb_str_new((const char *)str, (long)len)
53
+ #endif
54
+
15
55
  #include <xml_io.h>
16
56
  #include <xml_document.h>
17
57
  #include <html_document.h>
@@ -20,7 +20,7 @@ static VALUE url(VALUE self)
20
20
  Data_Get_Struct(self, xmlDoc, doc);
21
21
 
22
22
  if(doc->URL)
23
- return rb_str_new2((const char *)doc->URL);
23
+ return NOKOGIRI_STR_NEW2(doc->URL, doc->encoding);
24
24
 
25
25
  return Qnil;
26
26
  }
@@ -72,7 +72,7 @@ static VALUE encoding(VALUE self)
72
72
  Data_Get_Struct(self, xmlDoc, doc);
73
73
 
74
74
  if(!doc->encoding) return Qnil;
75
- return rb_str_new2((const char *)doc->encoding);
75
+ return NOKOGIRI_STR_NEW2(doc->encoding, doc->encoding);
76
76
  }
77
77
 
78
78
  /*
@@ -8,20 +8,21 @@ static void notation_copier(void *payload, void *data, xmlChar *name)
8
8
  xmlNotationPtr c_notation = (xmlNotationPtr)payload;
9
9
 
10
10
  VALUE notation = rb_funcall(klass, rb_intern("new"), 3,
11
- c_notation->name ? rb_str_new2((const char *)c_notation->name) : Qnil,
12
- c_notation->PublicID ? rb_str_new2((const char *)c_notation->PublicID) : Qnil,
13
- c_notation->SystemID ? rb_str_new2((const char *)c_notation->SystemID) : Qnil);
11
+ c_notation->name ? NOKOGIRI_STR_NEW2(c_notation->name, "UTF-8") : Qnil,
12
+ c_notation->PublicID ? NOKOGIRI_STR_NEW2(c_notation->PublicID, "UTF-8") : Qnil,
13
+ c_notation->SystemID ? NOKOGIRI_STR_NEW2(c_notation->SystemID, "UTF-8") : Qnil);
14
14
 
15
- rb_hash_aset(hash, rb_str_new2((const char *)name), notation);
15
+ rb_hash_aset(hash, NOKOGIRI_STR_NEW2(name, "UTF-8"),notation);
16
16
  }
17
17
 
18
- static void element_copier(void *payload, void *data, xmlChar *name)
18
+ static void element_copier(void *_payload, void *data, xmlChar *name)
19
19
  {
20
20
  VALUE hash = (VALUE)data;
21
+ xmlNodePtr payload = (xmlNodePtr)_payload;
21
22
 
22
- VALUE element = Nokogiri_wrap_xml_node((xmlNodePtr)payload);
23
+ VALUE element = Nokogiri_wrap_xml_node(payload);
23
24
 
24
- rb_hash_aset(hash, rb_str_new2((const char *)name), element);
25
+ rb_hash_aset(hash, NOKOGIRI_STR_NEW2(name, payload->doc->encoding), element);
25
26
  }
26
27
 
27
28
  /*
@@ -39,7 +39,7 @@ static VALUE encode_special_chars(VALUE self, VALUE string)
39
39
  (const xmlChar *)StringValuePtr(string)
40
40
  );
41
41
 
42
- VALUE encoded_str = rb_str_new2((const char *)encoded);
42
+ VALUE encoded_str = NOKOGIRI_STR_NEW2(encoded, node->doc->encoding);
43
43
  xmlFree(encoded);
44
44
 
45
45
  return encoded_str;
@@ -236,7 +236,8 @@ static VALUE get(VALUE self, VALUE attribute)
236
236
 
237
237
  if(NULL == propstr) return Qnil;
238
238
 
239
- rval = rb_str_new2((char *)propstr) ;
239
+ rval = NOKOGIRI_STR_NEW2(propstr, node->doc->encoding);
240
+
240
241
  xmlFree(propstr);
241
242
  return rval ;
242
243
  }
@@ -289,8 +290,9 @@ static VALUE namespace(VALUE self)
289
290
  {
290
291
  xmlNodePtr node ;
291
292
  Data_Get_Struct(self, xmlNode, node);
292
- if (node->ns && node->ns->prefix)
293
- return rb_str_new2((const char *)node->ns->prefix) ;
293
+ if (node->ns && node->ns->prefix) {
294
+ return NOKOGIRI_STR_NEW2(node->ns->prefix, node->doc->encoding);
295
+ }
294
296
  return Qnil ;
295
297
  }
296
298
 
@@ -354,7 +356,7 @@ static VALUE get_content(VALUE self)
354
356
 
355
357
  xmlChar * content = xmlNodeGetContent(node);
356
358
  if(content) {
357
- VALUE rval = rb_str_new2((char *)content);
359
+ VALUE rval = NOKOGIRI_STR_NEW2(content, node->doc->encoding);
358
360
  xmlFree(content);
359
361
  return rval;
360
362
  }
@@ -427,7 +429,8 @@ static VALUE get_name(VALUE self)
427
429
  {
428
430
  xmlNodePtr node;
429
431
  Data_Get_Struct(self, xmlNode, node);
430
- if(node->name) return rb_str_new2((const char *)node->name);
432
+ if(node->name)
433
+ return NOKOGIRI_STR_NEW2(node->name, node->doc->encoding);
431
434
  return Qnil;
432
435
  }
433
436
 
@@ -441,11 +444,10 @@ static VALUE path(VALUE self)
441
444
  {
442
445
  xmlNodePtr node;
443
446
  xmlChar *path ;
444
- VALUE rval ;
445
447
  Data_Get_Struct(self, xmlNode, node);
446
448
 
447
449
  path = xmlGetNodePath(node);
448
- rval = rb_str_new2((char *)path);
450
+ VALUE rval = NOKOGIRI_STR_NEW2(path, node->doc->encoding);
449
451
  xmlFree(path);
450
452
  return rval ;
451
453
  }
@@ -602,14 +604,12 @@ static VALUE dump_html(VALUE self)
602
604
  xmlNodePtr node ;
603
605
  Data_Get_Struct(self, xmlNode, node);
604
606
 
605
- VALUE html;
606
-
607
607
  if(node->doc->type == XML_DOCUMENT_NODE)
608
608
  return rb_funcall(self, rb_intern("to_xml"), 0);
609
609
 
610
610
  buf = xmlBufferCreate() ;
611
611
  htmlNodeDump(buf, node->doc, node);
612
- html = rb_str_new2((char*)buf->content);
612
+ VALUE html = NOKOGIRI_STR_NEW2(buf->content, node->doc->encoding);
613
613
  xmlBufferFree(buf);
614
614
  return html ;
615
615
  }
@@ -722,7 +722,10 @@ void Nokogiri_xml_node_namespaces(xmlNodePtr node, VALUE attr_hash)
722
722
  sprintf(key, "%s", XMLNS_PREFIX);
723
723
  }
724
724
 
725
- rb_hash_aset(attr_hash, rb_str_new2(key), rb_str_new2((const char*)ns->href)) ;
725
+ rb_hash_aset(attr_hash,
726
+ NOKOGIRI_STR_NEW2(key, node->doc->encoding),
727
+ NOKOGIRI_STR_NEW2(ns->href, node->doc->encoding)
728
+ );
726
729
  if (key != buffer) {
727
730
  free(key);
728
731
  }
@@ -131,6 +131,16 @@ static VALUE attribute_nodes(VALUE self)
131
131
  rb_iv_set(rb_doc, "@decorators", Qnil);
132
132
  ptr->doc->_private = (void *)rb_doc;
133
133
  }
134
+ VALUE enc = rb_iv_get(self, "@encoding");
135
+
136
+ if(enc != Qnil && NULL == ptr->doc->encoding) {
137
+ ptr->doc->encoding = calloc((size_t)RSTRING_LEN(enc), sizeof(char));
138
+ strncpy(
139
+ (char *)ptr->doc->encoding,
140
+ StringValuePtr(enc),
141
+ (size_t)RSTRING_LEN(enc)
142
+ );
143
+ }
134
144
 
135
145
  Nokogiri_xml_node_properties(ptr, attr);
136
146
 
@@ -157,7 +167,9 @@ static VALUE attribute_at(VALUE self, VALUE index)
157
167
  );
158
168
  if(value == NULL) return Qnil;
159
169
 
160
- VALUE rb_value = rb_str_new2((const char *)value);
170
+ VALUE MAYBE_UNUSED(enc) = rb_iv_get(self, "@encoding");
171
+ VALUE rb_value = NOKOGIRI_STR_NEW2(value,
172
+ RTEST(enc) ? StringValuePtr(enc) : NULL);
161
173
  xmlFree(value);
162
174
  return rb_value;
163
175
  }
@@ -193,7 +205,9 @@ static VALUE reader_attribute(VALUE self, VALUE name)
193
205
  }
194
206
  if(value == NULL) return Qnil;
195
207
 
196
- VALUE rb_value = rb_str_new2((const char *)value);
208
+ VALUE MAYBE_UNUSED(enc) = rb_iv_get(self, "@encoding");
209
+ VALUE rb_value = NOKOGIRI_STR_NEW2(value,
210
+ RTEST(enc) ? StringValuePtr(enc) : NULL);
197
211
  xmlFree(value);
198
212
  return rb_value;
199
213
  }
@@ -230,22 +244,6 @@ static VALUE depth(VALUE self)
230
244
  return INT2NUM(depth);
231
245
  }
232
246
 
233
- /*
234
- * call-seq:
235
- * encoding
236
- *
237
- * Get the encoding for the document
238
- */
239
- static VALUE encoding(VALUE self)
240
- {
241
- xmlTextReaderPtr reader;
242
- Data_Get_Struct(self, xmlTextReader, reader);
243
- const char * encoding = (const char *)xmlTextReaderConstEncoding(reader);
244
- if(encoding == NULL) return Qnil;
245
-
246
- return rb_str_new2(encoding);
247
- }
248
-
249
247
  /*
250
248
  * call-seq:
251
249
  * xml_version
@@ -259,7 +257,7 @@ static VALUE xml_version(VALUE self)
259
257
  const char * version = (const char *)xmlTextReaderConstXmlVersion(reader);
260
258
  if(version == NULL) return Qnil;
261
259
 
262
- return rb_str_new2(version);
260
+ return NOKOGIRI_STR_NEW2(version, "UTF-8");
263
261
  }
264
262
 
265
263
  /*
@@ -275,7 +273,9 @@ static VALUE lang(VALUE self)
275
273
  const char * lang = (const char *)xmlTextReaderConstXmlLang(reader);
276
274
  if(lang == NULL) return Qnil;
277
275
 
278
- return rb_str_new2(lang);
276
+ VALUE MAYBE_UNUSED(enc) = rb_iv_get(self, "@encoding");
277
+ return NOKOGIRI_STR_NEW2(lang,
278
+ RTEST(enc) ? StringValuePtr(enc) : NULL);
279
279
  }
280
280
 
281
281
  /*
@@ -291,7 +291,9 @@ static VALUE value(VALUE self)
291
291
  const char * value = (const char *)xmlTextReaderConstValue(reader);
292
292
  if(value == NULL) return Qnil;
293
293
 
294
- return rb_str_new2(value);
294
+ VALUE MAYBE_UNUSED(enc) = rb_iv_get(self, "@encoding");
295
+ return NOKOGIRI_STR_NEW2(value,
296
+ RTEST(enc) ? StringValuePtr(enc) : NULL);
295
297
  }
296
298
 
297
299
  /*
@@ -307,7 +309,9 @@ static VALUE prefix(VALUE self)
307
309
  const char * prefix = (const char *)xmlTextReaderConstPrefix(reader);
308
310
  if(prefix == NULL) return Qnil;
309
311
 
310
- return rb_str_new2(prefix);
312
+ VALUE MAYBE_UNUSED(enc) = rb_iv_get(self, "@encoding");
313
+ return NOKOGIRI_STR_NEW2(prefix,
314
+ RTEST(enc) ? StringValuePtr(enc) : NULL);
311
315
  }
312
316
 
313
317
  /*
@@ -323,7 +327,9 @@ static VALUE namespace_uri(VALUE self)
323
327
  const char * uri = (const char *)xmlTextReaderConstNamespaceUri(reader);
324
328
  if(uri == NULL) return Qnil;
325
329
 
326
- return rb_str_new2(uri);
330
+ VALUE MAYBE_UNUSED(enc) = rb_iv_get(self, "@encoding");
331
+ return NOKOGIRI_STR_NEW2(uri,
332
+ RTEST(enc) ? StringValuePtr(enc) : NULL);
327
333
  }
328
334
 
329
335
  /*
@@ -339,7 +345,9 @@ static VALUE local_name(VALUE self)
339
345
  const char * name = (const char *)xmlTextReaderConstLocalName(reader);
340
346
  if(name == NULL) return Qnil;
341
347
 
342
- return rb_str_new2(name);
348
+ VALUE MAYBE_UNUSED(enc) = rb_iv_get(self, "@encoding");
349
+ return NOKOGIRI_STR_NEW2(name,
350
+ RTEST(enc) ? StringValuePtr(enc) : NULL);
343
351
  }
344
352
 
345
353
  /*
@@ -355,7 +363,9 @@ static VALUE name(VALUE self)
355
363
  const char * name = (const char *)xmlTextReaderConstName(reader);
356
364
  if(name == NULL) return Qnil;
357
365
 
358
- return rb_str_new2(name);
366
+ VALUE MAYBE_UNUSED(enc) = rb_iv_get(self, "@encoding");
367
+ return NOKOGIRI_STR_NEW2(name,
368
+ RTEST(enc) ? StringValuePtr(enc) : NULL);
359
369
  }
360
370
 
361
371
  /*
@@ -437,7 +447,7 @@ static VALUE from_memory(int argc, VALUE *argv, VALUE klass)
437
447
  }
438
448
 
439
449
  VALUE rb_reader = Data_Wrap_Struct(klass, NULL, dealloc, reader);
440
- rb_funcall(rb_reader, rb_intern("initialize"), 0);
450
+ rb_funcall(rb_reader, rb_intern("initialize"), 2, rb_url, encoding);
441
451
 
442
452
  return rb_reader;
443
453
  }
@@ -468,7 +478,6 @@ void init_xml_reader()
468
478
  rb_define_method(klass, "value", value, 0);
469
479
  rb_define_method(klass, "lang", lang, 0);
470
480
  rb_define_method(klass, "xml_version", xml_version, 0);
471
- rb_define_method(klass, "encoding", encoding, 0);
472
481
  rb_define_method(klass, "depth", depth, 0);
473
482
  rb_define_method(klass, "attribute_count", attribute_count, 0);
474
483
  rb_define_method(klass, "attribute", reader_attribute, 1);
@@ -81,11 +81,14 @@ static void start_element(void * ctx, const xmlChar *name, const xmlChar **atts)
81
81
  VALUE self = (VALUE)ctx;
82
82
  VALUE doc = rb_funcall(self, rb_intern("document"), 0);
83
83
  VALUE attributes = rb_ary_new();
84
+ VALUE MAYBE_UNUSED(enc) = rb_iv_get(self, "@encoding");
84
85
  const xmlChar * attr;
85
86
  int i = 0;
86
87
  if(atts) {
87
88
  while((attr = atts[i]) != NULL) {
88
- rb_funcall(attributes, rb_intern("<<"), 1, rb_str_new2((const char *)attr));
89
+ rb_funcall(attributes, rb_intern("<<"), 1,
90
+ NOKOGIRI_STR_NEW2(attr, RTEST(enc) ? StringValuePtr(enc) : NULL)
91
+ );
89
92
  i++;
90
93
  }
91
94
  }
@@ -93,7 +96,7 @@ static void start_element(void * ctx, const xmlChar *name, const xmlChar **atts)
93
96
  rb_funcall( doc,
94
97
  rb_intern("start_element"),
95
98
  2,
96
- rb_str_new2((const char *)name),
99
+ NOKOGIRI_STR_NEW2(name, RTEST(enc) ? StringValuePtr(enc) : NULL),
97
100
  attributes
98
101
  );
99
102
  }
@@ -101,23 +104,28 @@ static void start_element(void * ctx, const xmlChar *name, const xmlChar **atts)
101
104
  static void end_element(void * ctx, const xmlChar *name)
102
105
  {
103
106
  VALUE self = (VALUE)ctx;
107
+ VALUE MAYBE_UNUSED(enc) = rb_iv_get(self, "@encoding");
104
108
  VALUE doc = rb_funcall(self, rb_intern("document"), 0);
105
- rb_funcall(doc, rb_intern("end_element"), 1, rb_str_new2((const char *)name));
109
+ rb_funcall(doc, rb_intern("end_element"), 1,
110
+ NOKOGIRI_STR_NEW2(name, RTEST(enc) ? StringValuePtr(enc) : NULL)
111
+ );
106
112
  }
107
113
 
108
114
  static void characters_func(void * ctx, const xmlChar * ch, int len)
109
115
  {
110
116
  VALUE self = (VALUE)ctx;
117
+ VALUE MAYBE_UNUSED(enc) = rb_iv_get(self, "@encoding");
111
118
  VALUE doc = rb_funcall(self, rb_intern("document"), 0);
112
- VALUE str = rb_str_new((const char *)ch, (long)len);
119
+ VALUE str = NOKOGIRI_STR_NEW(ch, len, RTEST(enc) ? StringValuePtr(enc):NULL);
113
120
  rb_funcall(doc, rb_intern("characters"), 1, str);
114
121
  }
115
122
 
116
123
  static void comment_func(void * ctx, const xmlChar * value)
117
124
  {
118
125
  VALUE self = (VALUE)ctx;
126
+ VALUE MAYBE_UNUSED(enc) = rb_iv_get(self, "@encoding");
119
127
  VALUE doc = rb_funcall(self, rb_intern("document"), 0);
120
- VALUE str = rb_str_new2((const char *)value);
128
+ VALUE str = NOKOGIRI_STR_NEW2(value, RTEST(enc) ? StringValuePtr(enc):NULL);
121
129
  rb_funcall(doc, rb_intern("comment"), 1, str);
122
130
  }
123
131
 
@@ -162,6 +170,7 @@ static void warning_func(void * ctx, const char *msg, ...)
162
170
  {
163
171
  VALUE self = (VALUE)ctx;
164
172
  VALUE doc = rb_funcall(self, rb_intern("document"), 0);
173
+ VALUE MAYBE_UNUSED(enc) = rb_iv_get(self, "@encoding");
165
174
  char * message;
166
175
 
167
176
  va_list args;
@@ -169,13 +178,16 @@ static void warning_func(void * ctx, const char *msg, ...)
169
178
  vasprintf(&message, msg, args);
170
179
  va_end(args);
171
180
 
172
- rb_funcall(doc, rb_intern("warning"), 1, rb_str_new2(message));
181
+ rb_funcall(doc, rb_intern("warning"), 1,
182
+ NOKOGIRI_STR_NEW2(message, RTEST(enc) ? StringValuePtr(enc) : NULL)
183
+ );
173
184
  free(message);
174
185
  }
175
186
 
176
187
  static void error_func(void * ctx, const char *msg, ...)
177
188
  {
178
189
  VALUE self = (VALUE)ctx;
190
+ VALUE MAYBE_UNUSED(enc) = rb_iv_get(self, "@encoding");
179
191
  VALUE doc = rb_funcall(self, rb_intern("document"), 0);
180
192
  char * message;
181
193
 
@@ -184,15 +196,19 @@ static void error_func(void * ctx, const char *msg, ...)
184
196
  vasprintf(&message, msg, args);
185
197
  va_end(args);
186
198
 
187
- rb_funcall(doc, rb_intern("error"), 1, rb_str_new2(message));
199
+ rb_funcall(doc, rb_intern("error"), 1,
200
+ NOKOGIRI_STR_NEW2(message, RTEST(enc) ? StringValuePtr(enc) : NULL)
201
+ );
188
202
  free(message);
189
203
  }
190
204
 
191
205
  static void cdata_block(void * ctx, const xmlChar * value, int len)
192
206
  {
193
207
  VALUE self = (VALUE)ctx;
208
+ VALUE MAYBE_UNUSED(enc) = rb_iv_get(self, "@encoding");
194
209
  VALUE doc = rb_funcall(self, rb_intern("document"), 0);
195
- VALUE string = rb_str_new((const char *)value, (long)len);
210
+ VALUE string =
211
+ NOKOGIRI_STR_NEW(value, len, RTEST(enc) ? StringValuePtr(enc) : NULL);
196
212
  rb_funcall(doc, rb_intern("cdata_block"), 1, string);
197
213
  }
198
214
 
@@ -45,7 +45,7 @@ static VALUE str3(VALUE self)
45
45
  xmlErrorPtr error;
46
46
  Data_Get_Struct(self, xmlError, error);
47
47
  if(error->str3)
48
- return rb_str_new2(error->str3);
48
+ return NOKOGIRI_STR_NEW2(error->str3, "UTF-8");
49
49
  return Qnil;
50
50
  }
51
51
 
@@ -60,7 +60,7 @@ static VALUE str2(VALUE self)
60
60
  xmlErrorPtr error;
61
61
  Data_Get_Struct(self, xmlError, error);
62
62
  if(error->str2)
63
- return rb_str_new2(error->str2);
63
+ return NOKOGIRI_STR_NEW2(error->str2, "UTF-8");
64
64
  return Qnil;
65
65
  }
66
66
 
@@ -75,7 +75,7 @@ static VALUE str1(VALUE self)
75
75
  xmlErrorPtr error;
76
76
  Data_Get_Struct(self, xmlError, error);
77
77
  if(error->str1)
78
- return rb_str_new2(error->str1);
78
+ return NOKOGIRI_STR_NEW2(error->str1, "UTF-8");
79
79
  return Qnil;
80
80
  }
81
81
 
@@ -103,7 +103,7 @@ static VALUE file(VALUE self)
103
103
  xmlErrorPtr error;
104
104
  Data_Get_Struct(self, xmlError, error);
105
105
  if(error->file)
106
- return rb_str_new2(error->file);
106
+ return NOKOGIRI_STR_NEW2(error->file, "UTF-8");
107
107
 
108
108
  return Qnil;
109
109
  }
@@ -157,7 +157,7 @@ static VALUE message(VALUE self)
157
157
  {
158
158
  xmlErrorPtr error;
159
159
  Data_Get_Struct(self, xmlError, error);
160
- return rb_str_new2(error->message);
160
+ return NOKOGIRI_STR_NEW2(error->message, "UTF-8");
161
161
  }
162
162
 
163
163
  void Nokogiri_error_array_pusher(void * ctx, xmlErrorPtr error)
@@ -46,7 +46,7 @@ static void ruby_funcall(xmlXPathParserContextPtr ctx, int nargs)
46
46
  obj = valuePop(ctx);
47
47
  switch(obj->type) {
48
48
  case XPATH_STRING:
49
- argv[i] = rb_str_new2((char *)obj->stringval);
49
+ argv[i] = NOKOGIRI_STR_NEW2(obj->stringval, ctx->context->doc->encoding);
50
50
  break;
51
51
  case XPATH_BOOLEAN:
52
52
  argv[i] = obj->boolval == 1 ? Qtrue : Qfalse;
@@ -58,7 +58,9 @@ static void ruby_funcall(xmlXPathParserContextPtr ctx, int nargs)
58
58
  argv[i] = Nokogiri_wrap_xml_node_set(obj->nodesetval);
59
59
  break;
60
60
  default:
61
- argv[i] = rb_str_new2((char *)xmlXPathCastToString(obj));
61
+ argv[i] = NOKOGIRI_STR_NEW2(
62
+ xmlXPathCastToString(obj), ctx->context->doc->encoding
63
+ );
62
64
  }
63
65
  xmlXPathFreeNodeSetList(obj);
64
66
  } while(i-- > 0);
@@ -46,7 +46,7 @@ static VALUE serialize(VALUE self, VALUE xmlobj)
46
46
  Data_Get_Struct(xmlobj, xmlDoc, xml);
47
47
  Data_Get_Struct(self, xsltStylesheet, ss);
48
48
  xsltSaveResultToString(&doc_ptr, &doc_len, xml, ss);
49
- rval = rb_str_new((char*)doc_ptr, doc_len);
49
+ rval = NOKOGIRI_STR_NEW(doc_ptr, doc_len, xml->encoding);
50
50
  xmlFree(doc_ptr);
51
51
  return rval ;
52
52
  }
@@ -83,13 +83,13 @@ class GeneratedTokenizer < GeneratedParser
83
83
  when (text = ss.scan(/[\s\r\n\f]*=[\s\r\n\f]*/))
84
84
  @rex_tokens.push action { [:EQUAL, text] }
85
85
 
86
- when (text = ss.scan(/[\s\r\n\f]*\)[\s\r\n\f]*/))
86
+ when (text = ss.scan(/[\s\r\n\f]*\)/))
87
87
  @rex_tokens.push action { [:RPAREN, text] }
88
88
 
89
89
  when (text = ss.scan(/[\s\r\n\f]*\[[\s\r\n\f]*/))
90
90
  @rex_tokens.push action { [:LSQUARE, text] }
91
91
 
92
- when (text = ss.scan(/[\s\r\n\f]*\][\s\r\n\f]*/))
92
+ when (text = ss.scan(/[\s\r\n\f]*\]/))
93
93
  @rex_tokens.push action { [:RSQUARE, text] }
94
94
 
95
95
  when (text = ss.scan(/[\s\r\n\f]*\+[\s\r\n\f]*/))
@@ -32,9 +32,9 @@ rule
32
32
  {w}\*={w} { [:SUBSTRINGMATCH, text] }
33
33
  {w}!={w} { [:NOT_EQUAL, text] }
34
34
  {w}={w} { [:EQUAL, text] }
35
- {w}\){w} { [:RPAREN, text] }
35
+ {w}\) { [:RPAREN, text] }
36
36
  {w}\[{w} { [:LSQUARE, text] }
37
- {w}\]{w} { [:RSQUARE, text] }
37
+ {w}\] { [:RSQUARE, text] }
38
38
  {w}\+{w} { [:PLUS, text] }
39
39
  {w}>{w} { [:GREATER, text] }
40
40
  {w},{w} { [:COMMA, text] }
@@ -1,4 +1,4 @@
1
1
  module Nokogiri
2
2
  # The version of Nokogiri you are using
3
- VERSION = '1.2.0'
3
+ VERSION = '1.2.1'
4
4
  end
@@ -55,6 +55,10 @@ module Nokogiri
55
55
  PARSE_NOXINCNODE = 1 << 15 # do not generate XINCLUDE START/END nodes
56
56
 
57
57
  class << self
58
+ def Reader string, url = nil, encoding = nil, options = 0
59
+ Reader.from_memory(string, url, encoding, options)
60
+ end
61
+
58
62
  ###
59
63
  # Parse an XML document. See Nokogiri.XML.
60
64
  def parse string_or_io, url = nil, encoding = nil, options = 2159
@@ -3,9 +3,11 @@ module Nokogiri
3
3
  class Reader
4
4
  include Enumerable
5
5
  attr_accessor :errors
6
+ attr_reader :encoding
6
7
 
7
- def initialize
8
+ def initialize url = nil, encoding = nil
8
9
  @errors = []
10
+ @encoding = encoding
9
11
  end
10
12
 
11
13
  def attributes
@@ -2,8 +2,35 @@ module Nokogiri
2
2
  module XML
3
3
  module SAX
4
4
  class Parser
5
+ ENCODINGS = {
6
+ 'NONE' => 0, # No char encoding detected
7
+ 'UTF-8' => 1, # UTF-8
8
+ 'UTF16LE' => 2, # UTF-16 little endian
9
+ 'UTF16BE' => 3, # UTF-16 big endian
10
+ 'UCS4LE' => 4, # UCS-4 little endian
11
+ 'UCS4BE' => 5, # UCS-4 big endian
12
+ 'EBCDIC' => 6, # EBCDIC uh!
13
+ 'UCS4-2143' => 7, # UCS-4 unusual ordering
14
+ 'UCS4-3412' => 8, # UCS-4 unusual ordering
15
+ 'UCS2' => 9, # UCS-2
16
+ 'ISO-8859-1' => 10, # ISO-8859-1 ISO Latin 1
17
+ 'ISO-8859-2' => 11, # ISO-8859-2 ISO Latin 2
18
+ 'ISO-8859-3' => 12, # ISO-8859-3
19
+ 'ISO-8859-4' => 13, # ISO-8859-4
20
+ 'ISO-8859-5' => 14, # ISO-8859-5
21
+ 'ISO-8859-6' => 15, # ISO-8859-6
22
+ 'ISO-8859-7' => 16, # ISO-8859-7
23
+ 'ISO-8859-8' => 17, # ISO-8859-8
24
+ 'ISO-8859-9' => 18, # ISO-8859-9
25
+ 'ISO-2022-JP' => 19, # ISO-2022-JP
26
+ 'SHIFT-JIS' => 20, # Shift_JIS
27
+ 'EUC-JP' => 21, # EUC-JP
28
+ 'ASCII' => 22, # pure ASCII
29
+ }
30
+
5
31
  attr_accessor :document
6
32
  def initialize(doc = XML::SAX::Document.new)
33
+ @encoding = 'ASCII'
7
34
  @document = doc
8
35
  end
9
36
 
@@ -20,8 +47,9 @@ module Nokogiri
20
47
 
21
48
  ###
22
49
  # Parse given +io+
23
- def parse_io io, encoding = 0
24
- native_parse_io io, encoding
50
+ def parse_io io, encoding = 'ASCII'
51
+ @encoding = encoding
52
+ native_parse_io io, ENCODINGS[@encoding] || ENCODINGS['ASCII']
25
53
  end
26
54
 
27
55
  ###
@@ -25,8 +25,9 @@ module Nokogiri
25
25
  class PushParser
26
26
  attr_accessor :document
27
27
 
28
- def initialize(doc = XML::SAX::Document.new, file_name = nil)
28
+ def initialize(doc = XML::SAX::Document.new, file_name = nil, encoding = 'ASCII')
29
29
  @document = doc
30
+ @encoding = encoding
30
31
  @sax_parser = XML::SAX::Parser.new(doc)
31
32
 
32
33
  ## Create our push parser context
@@ -71,6 +71,12 @@ module Nokogiri
71
71
  @parser.parse("a[@id='Boing']")
72
72
  end
73
73
 
74
+ def test_attributes_with_at_and_stuff
75
+ ## This is non standard CSS
76
+ assert_xpath "//a[@id = 'Boing']//div",
77
+ @parser.parse("a[@id='Boing'] div")
78
+ end
79
+
74
80
  def test_not_equal
75
81
  ## This is non standard CSS
76
82
  assert_xpath "//a[child::text() != 'Boing']",
@@ -105,6 +105,20 @@ module Nokogiri
105
105
  assert_equal 3, found.length
106
106
  end
107
107
 
108
+ def test_find_by_css_with_square_brackets
109
+ found = @html.css("div[@id='header'] > h1")
110
+ found = @html.css("div[@id='header'] h1") # this blows up on commit 6fa0f6d329d9dbf1cc21c0ac72f7e627bb4c05fc
111
+ assert_equal 1, found.length
112
+ end
113
+
114
+ def test_find_with_function
115
+ found = @html.css("div:awesome() h1", Class.new {
116
+ def awesome divs
117
+ [divs.first]
118
+ end
119
+ }.new)
120
+ end
121
+
108
122
  def test_dup_shallow
109
123
  found = @html.search('//div/a').first
110
124
  dup = found.dup(0)
@@ -136,8 +136,7 @@ class TestReader < Nokogiri::TestCase
136
136
  </awesome>
137
137
  eoxml
138
138
  reader = Nokogiri::XML::Reader.from_memory(string, nil, 'UTF-8')
139
- assert_nil reader.encoding
140
- assert_equal [nil], reader.map { |x| x.encoding }.uniq
139
+ assert_equal ['UTF-8'], reader.map { |x| x.encoding }.uniq
141
140
  end
142
141
 
143
142
  def test_xml_version
@@ -22,6 +22,12 @@ module Nokogiri
22
22
  assert @parser.document.errors
23
23
  assert @parser.document.errors.length > 0
24
24
 
25
+ if RUBY_VERSION =~ /^1\.9/
26
+ doc.errors.each do |error|
27
+ assert_equal 'UTF-8', error.message.encoding.name
28
+ end
29
+ end
30
+
25
31
  assert_equal doc.errors.length, @parser.document.errors.length
26
32
  end
27
33
 
@@ -35,9 +41,45 @@ module Nokogiri
35
41
 
36
42
  def test_parse_io
37
43
  File.open(XML_FILE, 'rb') { |f|
38
- @parser.parse_io(f)
44
+ @parser.parse_io(f, 'UTF-8')
39
45
  }
40
46
  assert(@parser.document.cdata_blocks.length > 0)
47
+ if RUBY_VERSION =~ /^1\.9/
48
+ called = false
49
+ @parser.document.start_elements.flatten.each do |thing|
50
+ assert_equal 'UTF-8', thing.encoding.name
51
+ called = true
52
+ end
53
+ assert called
54
+
55
+ called = false
56
+ @parser.document.end_elements.flatten.each do |thing|
57
+ assert_equal 'UTF-8', thing.encoding.name
58
+ called = true
59
+ end
60
+ assert called
61
+
62
+ called = false
63
+ @parser.document.data.each do |thing|
64
+ assert_equal 'UTF-8', thing.encoding.name
65
+ called = true
66
+ end
67
+ assert called
68
+
69
+ called = false
70
+ @parser.document.comments.flatten.each do |thing|
71
+ assert_equal 'UTF-8', thing.encoding.name
72
+ called = true
73
+ end
74
+ assert called
75
+
76
+ called = false
77
+ @parser.document.cdata_blocks.flatten.each do |thing|
78
+ assert_equal 'UTF-8', thing.encoding.name
79
+ called = true
80
+ end
81
+ assert called
82
+ end
41
83
  end
42
84
 
43
85
  def test_parse_file
@@ -0,0 +1,25 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), '..', "helper"))
2
+
3
+ module Nokogiri
4
+ module XML
5
+ if RUBY_VERSION =~ /^1\.9/
6
+ class TestDocumentEncoding < Nokogiri::TestCase
7
+ def setup
8
+ @xml = Nokogiri::XML(File.read(XML_FILE), XML_FILE, 'UTF-8')
9
+ end
10
+
11
+ def test_url
12
+ assert_equal @xml.encoding, @xml.url.encoding.name
13
+ end
14
+
15
+ def test_encoding
16
+ assert_equal @xml.encoding, @xml.encoding.encoding.name
17
+ end
18
+
19
+ def test_dotted_version
20
+ assert_equal 'UTF-8', Nokogiri::LIBXML_VERSION.encoding.name
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,30 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), '..', "helper"))
2
+
3
+ module Nokogiri
4
+ module XML
5
+ if RUBY_VERSION =~ /^1\.9/
6
+ class TestDTDEncoding < Nokogiri::TestCase
7
+ def setup
8
+ @xml = Nokogiri::XML(File.read(XML_FILE), XML_FILE, 'UTF-8')
9
+ assert @dtd = @xml.internal_subset
10
+ end
11
+
12
+ def test_entities
13
+ @dtd.entities.each do |k,v|
14
+ assert_equal @xml.encoding, k.encoding.name
15
+ end
16
+ end
17
+
18
+ def test_notations
19
+ @dtd.notations.each do |k,notation|
20
+ assert_equal 'UTF-8', k.encoding.name
21
+ %w{ name public_id system_id }.each do |attribute|
22
+ v = notation.send(:"#{attribute}") || next
23
+ assert_equal 'UTF-8', v.encoding.name
24
+ end
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,76 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), '..', "helper"))
2
+
3
+ module Nokogiri
4
+ module XML
5
+ if RUBY_VERSION =~ /^1\.9/
6
+ class TestNodeEncoding < Nokogiri::TestCase
7
+ def setup
8
+ @html = Nokogiri::HTML(File.read(HTML_FILE), HTML_FILE)
9
+ end
10
+
11
+ def test_get_attribute
12
+ node = @html.css('a').first
13
+ assert_equal @html.encoding, node['href'].encoding.name
14
+ end
15
+
16
+ def test_encode_special_chars
17
+ foo = @html.css('a').first.encode_special_chars('foo')
18
+ assert_equal @html.encoding, foo.encoding.name
19
+ end
20
+
21
+ def test_content
22
+ node = @html.css('a').first
23
+ assert_equal @html.encoding, node.content.encoding.name
24
+ end
25
+
26
+ def test_name
27
+ node = @html.css('a').first
28
+ assert_equal @html.encoding, node.name.encoding.name
29
+ end
30
+
31
+ def test_path
32
+ node = @html.css('a').first
33
+ assert_equal @html.encoding, node.path.encoding.name
34
+ end
35
+
36
+ def test_namespace
37
+ xml = <<-eoxml
38
+ <root>
39
+ <car xmlns:part="http://general-motors.com/">
40
+ <part:tire>Michelin Model XGV</part:tire>
41
+ </car>
42
+ <bicycle xmlns:part="http://schwinn.com/">
43
+ <part:tire>I'm a bicycle tire!</part:tire>
44
+ </bicycle>
45
+ </root>
46
+ eoxml
47
+ doc = Nokogiri::XML(xml, nil, 'UTF-8')
48
+ assert_equal 'UTF-8', doc.encoding
49
+ n = doc.xpath('//part:tire', { 'part' => 'http://schwinn.com/' }).first
50
+ assert n
51
+ assert_equal doc.encoding, n.namespace.encoding.name
52
+ end
53
+
54
+ def test_namespace_as_hash
55
+ xml = <<-eoxml
56
+ <root>
57
+ <car xmlns:part="http://general-motors.com/">
58
+ <part:tire>Michelin Model XGV</part:tire>
59
+ </car>
60
+ <bicycle xmlns:part="http://schwinn.com/">
61
+ <part:tire>I'm a bicycle tire!</part:tire>
62
+ </bicycle>
63
+ </root>
64
+ eoxml
65
+ doc = Nokogiri::XML(xml, nil, 'UTF-8')
66
+ assert_equal 'UTF-8', doc.encoding
67
+ assert n = doc.xpath('//car').first
68
+ n.namespaces.each do |k,v|
69
+ assert_equal doc.encoding, v.encoding.name
70
+ assert_equal doc.encoding, k.encoding.name
71
+ end
72
+ end
73
+ end
74
+ end
75
+ end
76
+ end
@@ -0,0 +1,125 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.expand_path(File.join(File.dirname(__FILE__), '..', "helper"))
3
+
4
+ module Nokogiri
5
+ module XML
6
+ if RUBY_VERSION =~ /^1\.9/
7
+ class TestReaderEncoding < Nokogiri::TestCase
8
+ def setup
9
+ @reader = Nokogiri::XML::Reader(
10
+ File.read(XML_FILE),
11
+ XML_FILE,
12
+ 'UTF-8'
13
+ )
14
+ end
15
+
16
+ def test_attribute_at
17
+ @reader.each do |node|
18
+ next unless attribute = node.attribute_at(0)
19
+ assert_equal @reader.encoding, attribute.encoding.name
20
+ end
21
+ end
22
+
23
+ def test_attributes
24
+ @reader.each do |node|
25
+ node.attributes.each do |k,v|
26
+ assert_equal @reader.encoding, k.encoding.name
27
+ assert_equal @reader.encoding, v.encoding.name
28
+ end
29
+ end
30
+ end
31
+
32
+ def test_attribute
33
+ xml = <<-eoxml
34
+ <x xmlns:tenderlove='http://tenderlovemaking.com/'>
35
+ <tenderlove:foo awesome='true'>snuggles!</tenderlove:foo>
36
+ </x>
37
+ eoxml
38
+ reader = Nokogiri::XML::Reader(xml, nil, 'UTF-8')
39
+ reader.each do |node|
40
+ next unless attribute = node.attribute('awesome')
41
+ assert_equal reader.encoding, attribute.encoding.name
42
+ end
43
+ end
44
+
45
+ def test_xml_version
46
+ @reader.each do |node|
47
+ next unless version = node.xml_version
48
+ assert_equal @reader.encoding, version.encoding.name
49
+ end
50
+ end
51
+
52
+ def test_lang
53
+ xml = <<-eoxml
54
+ <awesome>
55
+ <p xml:lang="en">The quick brown fox jumps over the lazy dog.</p>
56
+ <p xml:lang="ja">日本語が上手です</p>
57
+ </awesome>
58
+ eoxml
59
+
60
+ reader = Nokogiri::XML::Reader(xml, nil, 'UTF-8')
61
+ reader.each do |node|
62
+ next unless lang = node.lang
63
+ assert_equal reader.encoding, lang.encoding.name
64
+ end
65
+ end
66
+
67
+ def test_value
68
+ called = false
69
+ @reader.each do |node|
70
+ next unless value = node.value
71
+ assert_equal @reader.encoding, value.encoding.name
72
+ called = true
73
+ end
74
+ assert called
75
+ end
76
+
77
+ def test_prefix
78
+ xml = <<-eoxml
79
+ <x xmlns:edi='http://ecommerce.example.org/schema'>
80
+ <edi:foo>hello</edi:foo>
81
+ </x>
82
+ eoxml
83
+ reader = Nokogiri::XML::Reader(xml, nil, 'UTF-8')
84
+ reader.each do |node|
85
+ next unless prefix = node.prefix
86
+ assert_equal reader.encoding, prefix.encoding.name
87
+ end
88
+ end
89
+
90
+ def test_ns_uri
91
+ xml = <<-eoxml
92
+ <x xmlns:edi='http://ecommerce.example.org/schema'>
93
+ <edi:foo>hello</edi:foo>
94
+ </x>
95
+ eoxml
96
+ reader = Nokogiri::XML::Reader(xml, nil, 'UTF-8')
97
+ reader.each do |node|
98
+ next unless uri = node.namespace_uri
99
+ assert_equal reader.encoding, uri.encoding.name
100
+ end
101
+ end
102
+
103
+ def test_local_name
104
+ xml = <<-eoxml
105
+ <x xmlns:edi='http://ecommerce.example.org/schema'>
106
+ <edi:foo>hello</edi:foo>
107
+ </x>
108
+ eoxml
109
+ reader = Nokogiri::XML::Reader(xml, nil, 'UTF-8')
110
+ reader.each do |node|
111
+ next unless lname = node.local_name
112
+ assert_equal reader.encoding, lname.encoding.name
113
+ end
114
+ end
115
+
116
+ def test_name
117
+ @reader.each do |node|
118
+ next unless name = node.name
119
+ assert_equal @reader.encoding, name.encoding.name
120
+ end
121
+ end
122
+ end
123
+ end
124
+ end
125
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nokogiri
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aaron Patterson
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2009-02-22 00:00:00 -08:00
13
+ date: 2009-02-23 00:00:00 -08:00
14
14
  default_executable:
15
15
  dependencies: []
16
16
 
@@ -182,12 +182,16 @@ files:
182
182
  - test/xml/test_cdata.rb
183
183
  - test/xml/test_comment.rb
184
184
  - test/xml/test_document.rb
185
+ - test/xml/test_document_encoding.rb
185
186
  - test/xml/test_document_fragment.rb
186
187
  - test/xml/test_dtd.rb
188
+ - test/xml/test_dtd_encoding.rb
187
189
  - test/xml/test_entity_reference.rb
188
190
  - test/xml/test_node.rb
191
+ - test/xml/test_node_encoding.rb
189
192
  - test/xml/test_node_set.rb
190
193
  - test/xml/test_processing_instruction.rb
194
+ - test/xml/test_reader_encoding.rb
191
195
  - test/xml/test_text.rb
192
196
  - test/xml/test_xpath.rb
193
197
  - vendor/hoe.rb
@@ -249,11 +253,15 @@ test_files:
249
253
  - test/xml/test_cdata.rb
250
254
  - test/xml/test_comment.rb
251
255
  - test/xml/test_document.rb
256
+ - test/xml/test_document_encoding.rb
252
257
  - test/xml/test_document_fragment.rb
253
258
  - test/xml/test_dtd.rb
259
+ - test/xml/test_dtd_encoding.rb
254
260
  - test/xml/test_entity_reference.rb
255
261
  - test/xml/test_node.rb
262
+ - test/xml/test_node_encoding.rb
256
263
  - test/xml/test_node_set.rb
257
264
  - test/xml/test_processing_instruction.rb
265
+ - test/xml/test_reader_encoding.rb
258
266
  - test/xml/test_text.rb
259
267
  - test/xml/test_xpath.rb