nokogiri 1.5.3.rc5-java → 1.5.3.rc6-java

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

@@ -20,11 +20,12 @@
20
20
  * Nokogiri::XML::Attribute が JRuby 上で nil namespace を返す #647
21
21
  * Nokogiri::XML::Node#namespace= メソッドが JRuby 上で prefix
22
22
   が無い namespace を設定できない #648
23
- * JRuby 1.9 モードで rake を実行するとデッドロックを引き起こす #571
23
+ * [JRuby] 1.9 モードで rake を実行するとデッドロックを引き起こす #571
24
24
  * HTML::Document#meta_encoding does not raise exception on docs with
25
25
  malformed content-type. #655
26
26
  * Fixing segfault related to unsupported encodings in in-context
27
27
  parsing on 1.8.7. #643
28
+ * [JRuby] Concurrency issue in XPath parsing. #682
28
29
 
29
30
 
30
31
  == 1.5.2 / 2012-03-09
@@ -20,11 +20,12 @@
20
20
  * Nokogiri::XML::Attribute on JRuby returns a nil namespace #647
21
21
  * Nokogiri::XML::Node#namespace= cannot set a namespace without a
22
22
  prefix on JRuby #648
23
- * JRuby 1.9 mode causes dead lock while running rake #571
23
+ * [JRuby] 1.9 mode causes a deadlock while running rake #571
24
24
  * HTML::Document#meta_encoding does not raise exception on docs with
25
25
  malformed content-type. #655
26
26
  * Fixing segfault related to unsupported encodings in in-context
27
27
  parsing on 1.8.7. #643
28
+ * [JRuby] Concurrency issue in XPath parsing. #682
28
29
 
29
30
 
30
31
  == 1.5.2 / 2012-03-09
@@ -91,7 +91,7 @@ encoded like the source document.
91
91
  Some documents declare one particular encoding, but use a different
92
92
  one. So, which encoding should the parser choose?
93
93
 
94
- Remember that data is just a stream of bytes. Only us humans add
94
+ Remember that data is just a stream of bytes. Only we humans add
95
95
  meaning to that stream. Any particular set of bytes could be valid
96
96
  characters in multiple encodings, so detecting encoding with 100%
97
97
  accuracy is not possible. libxml2 does its best, but it can't be right
data/ROADMAP.md CHANGED
@@ -14,6 +14,15 @@
14
14
  * see fairy wing throwdown - SAX parsing is wicked slow.
15
15
 
16
16
 
17
+ ## Node should not be Enumerable; and should have a better attributes API
18
+
19
+ * https://github.com/tenderlove/nokogiri/issues/679
20
+ Mixing in Enumerable has some unintended consequences; plus we want to improve the attributes API
21
+
22
+ * (closed) https://github.com/tenderlove/nokogiri/issues/666
23
+ Some ideas for a better attributes API?
24
+
25
+
17
26
  ## improve CSS query parsing
18
27
 
19
28
  * https://github.com/tenderlove/nokogiri/issues/528
@@ -27,6 +36,7 @@
27
36
  * https://github.com/tenderlove/nokogiri/issues/342
28
37
  * https://github.com/tenderlove/nokogiri/issues/628
29
38
  * https://github.com/tenderlove/nokogiri/issues/652
39
+ * https://github.com/tenderlove/nokogiri/issues/688
30
40
 
31
41
  * https://github.com/tenderlove/nokogiri/issues/394
32
42
  nth-of-type is wrong, and possibly other selectors as well
@@ -1,7 +1,7 @@
1
1
  /**
2
2
  * (The MIT License)
3
3
  *
4
- * Copyright (c) 2008 - 2011:
4
+ * Copyright (c) 2008 - 2012:
5
5
  *
6
6
  * * {Aaron Patterson}[http://tenderlovemaking.com]
7
7
  * * {Mike Dalessio}[http://mike.daless.io]
@@ -55,6 +55,7 @@ import org.w3c.dom.NodeList;
55
55
  */
56
56
  @JRubyClass(name="Nokogiri::HTML::Document", parent="Nokogiri::XML::Document")
57
57
  public class HtmlDocument extends XmlDocument {
58
+ private String parsed_encoding = null;
58
59
 
59
60
  public HtmlDocument(Ruby ruby, RubyClass klazz) {
60
61
  super(ruby, klazz);
@@ -123,6 +124,14 @@ public class HtmlDocument extends XmlDocument {
123
124
  stabilzeAttrValue(children.item(i));
124
125
  }
125
126
  }
127
+
128
+ public void setParsedEncoding(String encoding) {
129
+ parsed_encoding = encoding;
130
+ }
131
+
132
+ public String getPraedEncoding() {
133
+ return parsed_encoding;
134
+ }
126
135
 
127
136
  /*
128
137
  * call-seq:
@@ -128,6 +128,10 @@ public class XmlDocument extends XmlNode {
128
128
  this.encoding = encoding;
129
129
  }
130
130
 
131
+ public IRubyObject getEncoding() {
132
+ return encoding;
133
+ }
134
+
131
135
  // not sure, but like attribute values, text value will be lost
132
136
  // unless it is referred once before this document is used.
133
137
  // this seems to happen only when the fragment is parsed from Node#in_context.
@@ -42,13 +42,8 @@ import static nokogiri.internals.NokogiriHelpers.stringOrNil;
42
42
 
43
43
  import java.io.ByteArrayInputStream;
44
44
  import java.io.InputStream;
45
- import java.io.UnsupportedEncodingException;
46
- import java.nio.ByteBuffer;
47
- import java.nio.CharBuffer;
48
45
  import java.nio.charset.CharacterCodingException;
49
46
  import java.nio.charset.Charset;
50
- import java.nio.charset.CharsetDecoder;
51
- import java.nio.charset.CharsetEncoder;
52
47
  import java.util.ArrayList;
53
48
  import java.util.List;
54
49
 
@@ -767,6 +762,7 @@ public class XmlNode extends RubyObject {
767
762
  } else {
768
763
  textContent = this.node.getTextContent();
769
764
  }
765
+ textContent = NokogiriHelpers.convertEncodingByNKFIfNecessary(context.getRuntime(), (XmlDocument)document(context), textContent);
770
766
  String decodedText = null;
771
767
  if (textContent != null) decodedText = NokogiriHelpers.decodeJavaString(textContent);
772
768
  return stringOrNil(context.getRuntime(), decodedText);
@@ -1,7 +1,7 @@
1
1
  /**
2
2
  * (The MIT License)
3
3
  *
4
- * Copyright (c) 2008 - 2011:
4
+ * Copyright (c) 2008 - 2012:
5
5
  *
6
6
  * * {Aaron Patterson}[http://tenderlovemaking.com]
7
7
  * * {Mike Dalessio}[http://mike.daless.io]
@@ -35,6 +35,7 @@ package nokogiri;
35
35
  import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
36
36
  import static nokogiri.internals.NokogiriHelpers.stringOrNil;
37
37
 
38
+ import org.jruby.CompatVersion;
38
39
  import org.jruby.Ruby;
39
40
  import org.jruby.RubyClass;
40
41
  import org.jruby.RubyException;
@@ -108,12 +109,23 @@ public class XmlSyntaxError extends RubyException {
108
109
  return new XmlSyntaxError(runtime, klazz, e);
109
110
  }
110
111
 
111
- @Override
112
- @JRubyMethod(name = "to_s")
112
+ //@Override
113
+ //"to_s" method was branched in 1.8 and 1.9 since JRuby 1.6.6
114
+ // to support older versions of JRuby, the @Override annotation is commented out
115
+ @JRubyMethod(name = "to_s", compat = CompatVersion.RUBY1_8)
113
116
  public IRubyObject to_s(ThreadContext context) {
114
117
  if (exception != null && exception.getMessage() != null)
115
118
  return context.getRuntime().newString(exception.getMessage());
116
119
  else
117
120
  return super.to_s(context);
118
121
  }
122
+
123
+ //@Override
124
+ //"to_s" method was branched in 1.8 and 1.9 since JRuby 1.6.6
125
+ // to support older versions of JRuby, the @Override annotation is commented out
126
+ @JRubyMethod(name = "to_s", compat = CompatVersion.RUBY1_9)
127
+ public IRubyObject to_s19(ThreadContext context) {
128
+ return this.to_s(context);
129
+ }
130
+
119
131
  }
@@ -1,7 +1,7 @@
1
1
  /**
2
2
  * (The MIT License)
3
3
  *
4
- * Copyright (c) 2008 - 2011:
4
+ * Copyright (c) 2008 - 2012:
5
5
  *
6
6
  * * {Aaron Patterson}[http://tenderlovemaking.com]
7
7
  * * {Mike Dalessio}[http://mike.daless.io]
@@ -53,7 +53,6 @@ import org.jruby.RubyBoolean;
53
53
  import org.jruby.RubyClass;
54
54
  import org.jruby.RubyException;
55
55
  import org.jruby.RubyFloat;
56
- import org.jruby.RubyNumeric;
57
56
  import org.jruby.RubyObject;
58
57
  import org.jruby.RubyString;
59
58
  import org.jruby.anno.JRubyClass;
@@ -72,7 +71,7 @@ import org.w3c.dom.NodeList;
72
71
  @JRubyClass(name="Nokogiri::XML::XPathContext")
73
72
  public class XmlXpathContext extends RubyObject {
74
73
  private XmlNode context;
75
- private static final XPath xpath = XPathFactory.newInstance().newXPath();;
74
+ private XPath xpath;
76
75
 
77
76
  public XmlXpathContext(Ruby ruby, RubyClass rubyClass) {
78
77
  super(ruby, rubyClass);
@@ -98,6 +97,7 @@ public class XmlXpathContext extends RubyObject {
98
97
  public static IRubyObject rbNew(ThreadContext context, IRubyObject klazz, IRubyObject node) {
99
98
  XmlNode xmlNode = (XmlNode)node;
100
99
  XmlXpathContext xmlXpathContext = (XmlXpathContext) NokogiriService.XML_XPATHCONTEXT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass)klazz);
100
+ xmlXpathContext.xpath = XPathFactory.newInstance().newXPath();
101
101
  xmlXpathContext.setNode(xmlNode);
102
102
  return xmlXpathContext;
103
103
  }
@@ -1,7 +1,7 @@
1
1
  /**
2
2
  * (The MIT License)
3
3
  *
4
- * Copyright (c) 2008 - 2011:
4
+ * Copyright (c) 2008 - 2012:
5
5
  *
6
6
  * * {Aaron Patterson}[http://tenderlovemaking.com]
7
7
  * * {Mike Dalessio}[http://mike.daless.io]
@@ -135,6 +135,7 @@ public class HtmlDomParserContext extends XmlDomParserContext {
135
135
  }
136
136
  }
137
137
  htmlDocument.setEncoding(ruby_encoding);
138
+ htmlDocument.setParsedEncoding(java_encoding);
138
139
  return htmlDocument;
139
140
  }
140
141
 
@@ -34,6 +34,8 @@ package nokogiri.internals;
34
34
 
35
35
  import java.io.File;
36
36
  import java.io.UnsupportedEncodingException;
37
+ import java.lang.reflect.InvocationTargetException;
38
+ import java.lang.reflect.Method;
37
39
  import java.nio.ByteBuffer;
38
40
  import java.nio.CharBuffer;
39
41
  import java.nio.charset.CharacterCodingException;
@@ -42,6 +44,8 @@ import java.nio.charset.CharsetDecoder;
42
44
  import java.nio.charset.CharsetEncoder;
43
45
  import java.util.ArrayList;
44
46
  import java.util.List;
47
+ import java.util.Set;
48
+ import java.util.SortedMap;
45
49
  import java.util.regex.Matcher;
46
50
  import java.util.regex.Pattern;
47
51
 
@@ -657,13 +661,29 @@ public class NokogiriHelpers {
657
661
  return n;
658
662
  }
659
663
 
660
- public static String guessEncoding(Ruby ruby) {
664
+ public static String getValidEncoding(Ruby runtime, IRubyObject encoding) {
665
+ if (encoding.isNil()) {
666
+ return guessEncoding();
667
+ } else {
668
+ return ignoreInvalidEncoding(runtime, encoding);
669
+ }
670
+ }
671
+
672
+ private static String guessEncoding() {
661
673
  String name = null;
662
674
  if (name == null) name = System.getProperty("file.encoding");
663
675
  if (name == null) name = "UTF-8";
664
676
  return name;
665
677
  }
666
678
 
679
+ private static Set<String> charsetNames = ((SortedMap<String, Charset>)Charset.availableCharsets()).keySet();
680
+
681
+ private static String ignoreInvalidEncoding(Ruby runtime, IRubyObject encoding) {
682
+ String givenEncoding = rubyStringToString(encoding);
683
+ if (charsetNames.contains(givenEncoding)) return givenEncoding;
684
+ else return guessEncoding();
685
+ }
686
+
667
687
  public static String adjustSystemIdIfNecessary(String currentDir, String scriptFileName, String baseURI, String systemId) {
668
688
  if (systemId == null) return systemId;
669
689
  File file = new File(systemId);
@@ -710,4 +730,71 @@ public class NokogiriHelpers {
710
730
  return bytes;
711
731
  }
712
732
 
733
+ public static String convertEncodingByNKFIfNecessary(Ruby runtime, XmlDocument doc, String thing) {
734
+ if (!(doc instanceof HtmlDocument)) return thing;
735
+ String parsed_encoding = ((HtmlDocument)doc).getPraedEncoding();
736
+ if (parsed_encoding == null) return thing;
737
+ String ruby_encoding = rubyStringToString(doc.getEncoding());
738
+ if (ruby_encoding == null) return thing;
739
+ if (Charset.forName(parsed_encoding).compareTo(Charset.forName(ruby_encoding)) == 0) {
740
+ return thing;
741
+ } else {
742
+ return NokogiriHelpers.nkf(runtime, ruby_encoding, thing);
743
+ }
744
+
745
+ }
746
+
747
+ // This method is used from HTML documents. HTML meta tag with encoding specification
748
+ // might appear after non-ascii characters are used. For example, a title tag before
749
+ // a meta tag. In such a case, Xerces encodes characters in UTF-8 without seeing meta tag.
750
+ // Nokogiri uses the NKF library to convert characters to the correct encoding. This means the method
751
+ // works only for JIS/Shift_JIS/EUC-JP.
752
+ public static String nkf(Ruby runtime, String ruby_encoding, String thing) {
753
+ StringBuffer sb = new StringBuffer("-");
754
+ Charset that = Charset.forName(ruby_encoding);
755
+ if (NokogiriHelpers.shift_jis.compareTo(that) == 0) {
756
+ sb.append("S");
757
+ } else if (NokogiriHelpers.jis.compareTo(that) == 0) {
758
+ sb.append("J");
759
+ } else if (NokogiriHelpers.euc_jp.compareTo(that) == 0) {
760
+ sb.append("E");
761
+ } else {
762
+ // should not come here. should be treated before this method.
763
+ sb.append("W");
764
+ }
765
+ sb.append("w");
766
+ Class nkfClass = null;
767
+ try {
768
+ // JRuby 1.7 and later
769
+ nkfClass = runtime.getClassLoader().loadClass("org.jruby.ext.nkf.RubyNKF");
770
+ } catch (ClassNotFoundException e1) {
771
+ try {
772
+ // Before JRuby 1.7
773
+ nkfClass = runtime.getClassLoader().loadClass("org.jruby.RubyNKF");
774
+ } catch (ClassNotFoundException e2) {
775
+ return thing;
776
+ }
777
+ }
778
+ Method nkf_method;
779
+ try {
780
+ nkf_method = nkfClass.getMethod("nkf", ThreadContext.class, IRubyObject.class, IRubyObject.class, IRubyObject.class);
781
+ RubyString r_str =
782
+ (RubyString)nkf_method.invoke(null, runtime.getCurrentContext(), null, runtime.newString(new String(sb)), runtime.newString(thing));
783
+ return NokogiriHelpers.rubyStringToString(r_str);
784
+ } catch (SecurityException e) {
785
+ return thing;
786
+ } catch (NoSuchMethodException e) {
787
+ return thing;
788
+ } catch (IllegalArgumentException e) {
789
+ return thing;
790
+ } catch (IllegalAccessException e) {
791
+ return thing;
792
+ } catch (InvocationTargetException e) {
793
+ return thing;
794
+ }
795
+ }
796
+
797
+ private static Charset shift_jis = Charset.forName("Shift_JIS");
798
+ private static Charset jis = Charset.forName("ISO-2022-JP");
799
+ private static Charset euc_jp = Charset.forName("EUC-JP");
713
800
  }
@@ -40,6 +40,9 @@ import java.io.ByteArrayInputStream;
40
40
  import java.io.File;
41
41
  import java.io.IOException;
42
42
  import java.io.InputStream;
43
+ import java.io.StringReader;
44
+ import java.nio.charset.Charset;
45
+ import java.nio.charset.UnsupportedCharsetException;
43
46
 
44
47
  import org.jruby.Ruby;
45
48
  import org.jruby.RubyClass;
@@ -159,9 +162,27 @@ public class ParserContext extends RubyObject {
159
162
  }
160
163
  }
161
164
  if (stringData != null) {
165
+ String encName = null;
166
+ if (stringData.encoding(context) != null) {
167
+ encName = stringData.encoding(context).toString();
168
+ }
169
+ Charset charset = null;
170
+ if (encName != null) {
171
+ try {
172
+ charset = Charset.forName(encName);
173
+ } catch (UnsupportedCharsetException e) {
174
+ // do nothing;
175
+ }
176
+ }
162
177
  ByteList bytes = stringData.getByteList();
163
- stringDataSize = bytes.length() - bytes.begin();
164
- source = new InputSource(new ByteArrayInputStream(bytes.unsafeBytes(), bytes.begin(), bytes.length()));
178
+ if (charset != null) {
179
+ StringReader reader = new StringReader(new String(bytes.unsafeBytes(), bytes.begin(), bytes.length(), charset));
180
+ source = new InputSource(reader);
181
+ source.setEncoding(charset.name());
182
+ } else {
183
+ stringDataSize = bytes.length() - bytes.begin();
184
+ source = new InputSource(new ByteArrayInputStream(bytes.unsafeBytes(), bytes.begin(), bytes.length()));
185
+ }
165
186
  }
166
187
  }
167
188
 
@@ -45,6 +45,8 @@ import java.util.Deque;
45
45
  import java.util.Iterator;
46
46
  import java.util.List;
47
47
  import java.util.Stack;
48
+ import java.util.regex.Matcher;
49
+ import java.util.regex.Pattern;
48
50
 
49
51
  import org.cyberneko.html.HTMLElements;
50
52
  import org.w3c.dom.Attr;
@@ -252,12 +254,27 @@ public class SaveContextVisitor {
252
254
  if (!asHtml || !isHtmlBooleanAttr(name)) {
253
255
  buffer.append("=");
254
256
  buffer.append("\"");
255
- buffer.append(serializeAttrTextContent(attr.getValue(), htmlDoc));
257
+ String value = replaceCharsetIfNecessary(attr);
258
+ buffer.append(serializeAttrTextContent(value, htmlDoc));
256
259
  buffer.append("\"");
257
260
  }
258
261
  return true;
259
262
  }
260
263
 
264
+ private static Pattern p =
265
+ Pattern.compile("charset(()|\\s+)=(()|\\s+)(\\w|\\_|\\.|\\-)+", Pattern.CASE_INSENSITIVE);
266
+
267
+ private String replaceCharsetIfNecessary(Attr attr) {
268
+ String value = attr.getValue();
269
+ if (encoding == null) return value; // unable to replace in any case
270
+ if (!"content".equals(attr.getName().toLowerCase())) return value; // must be content attr
271
+ if (!"meta".equals(attr.getOwnerElement().getNodeName().toLowerCase())) return value;
272
+ Matcher m = p.matcher(value);
273
+ if (!m.find()) return value;
274
+ if (value.contains(encoding)) return value; // no need to replace
275
+ return value.replace(m.group(), "charset=" + encoding);
276
+ }
277
+
261
278
  public static final String[] HTML_BOOLEAN_ATTRS = {
262
279
  "checked", "compact", "declare", "defer", "disabled", "ismap",
263
280
  "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
@@ -1,7 +1,7 @@
1
1
  /**
2
2
  * (The MIT License)
3
3
  *
4
- * Copyright (c) 2008 - 2011:
4
+ * Copyright (c) 2008 - 2012:
5
5
  *
6
6
  * * {Aaron Patterson}[http://tenderlovemaking.com]
7
7
  * * {Mike Dalessio}[http://mike.daless.io]
@@ -33,7 +33,6 @@
33
33
  package nokogiri.internals;
34
34
 
35
35
  import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
36
- import static nokogiri.internals.NokogiriHelpers.rubyStringToString;
37
36
  import java.io.ByteArrayInputStream;
38
37
  import java.io.IOException;
39
38
  import java.util.ArrayList;
@@ -88,7 +87,7 @@ public class XmlDomParserContext extends ParserContext {
88
87
  public XmlDomParserContext(Ruby runtime, IRubyObject encoding, IRubyObject options) {
89
88
  super(runtime);
90
89
  this.options = new ParserContext.Options((Long)options.toJava(Long.class));
91
- this.java_encoding = encoding.isNil() ? NokogiriHelpers.guessEncoding(runtime) : rubyStringToString(encoding);
90
+ java_encoding = NokogiriHelpers.getValidEncoding(runtime, encoding);
92
91
  ruby_encoding = encoding;
93
92
  initErrorHandler();
94
93
  initParser(runtime);
Binary file
@@ -1,6 +1,6 @@
1
1
  module Nokogiri
2
2
  # The version of Nokogiri you are using
3
- VERSION = '1.5.3.rc5'
3
+ VERSION = '1.5.3.rc6'
4
4
 
5
5
  class VersionInfo # :nodoc:
6
6
  def jruby?
@@ -33,7 +33,7 @@ module Nokogiri
33
33
  # parser = Nokogiri::XML::SAX::Parser.new(MyDocument.new)
34
34
  #
35
35
  # # Feed the parser some XML
36
- # parser.parse(File.read(ARGV[0], 'rb'))
36
+ # parser.parse(File.open(ARGV[0]))
37
37
  #
38
38
  # Now my document handler will be called when each node starts, and when
39
39
  # the document ends. To see what kinds of events are available, take
@@ -25,7 +25,7 @@ module Nokogiri
25
25
  # parser = Nokogiri::XML::SAX::Parser.new(MyDoc.new)
26
26
  #
27
27
  # # Send some XML to the parser
28
- # parser.parse(File.read(ARGV[0]))
28
+ # parser.parse(File.open(ARGV[0]))
29
29
  #
30
30
  # For more information about SAX parsers, see Nokogiri::XML::SAX. Also
31
31
  # see Nokogiri::XML::SAX::Document for the available events.
@@ -18,7 +18,9 @@ module Nokogiri
18
18
  end
19
19
 
20
20
  def test_dotted_version
21
- assert_equal 'UTF-8', Nokogiri::LIBXML_VERSION.encoding.name
21
+ if Nokogiri.uses_libxml?
22
+ assert_equal 'UTF-8', Nokogiri::LIBXML_VERSION.encoding.name
23
+ end
22
24
  end
23
25
  end
24
26
  end
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: nokogiri
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease: 6
5
- version: 1.5.3.rc5
5
+ version: 1.5.3.rc6
6
6
  platform: java
7
7
  authors:
8
8
  - Aaron Patterson
@@ -12,7 +12,7 @@ autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
14
 
15
- date: 2012-04-27 00:00:00 Z
15
+ date: 2012-05-30 00:00:00 Z
16
16
  dependencies:
17
17
  - !ruby/object:Gem::Dependency
18
18
  name: hoe-bundler