nokogiri 1.11.0.rc3 → 1.11.0.rc4

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (37)
  1. checksums.yaml +4 -4
  2. data/LICENSE-DEPENDENCIES.md +1015 -947
  3. data/README.md +1 -1
  4. data/ext/nokogiri/depend +476 -357
  5. data/ext/nokogiri/extconf.rb +441 -321
  6. data/ext/nokogiri/html_document.c +79 -78
  7. data/ext/nokogiri/html_sax_parser_context.c +2 -2
  8. data/ext/nokogiri/nokogiri.c +34 -46
  9. data/ext/nokogiri/nokogiri.h +22 -26
  10. data/ext/nokogiri/xml_document.c +2 -2
  11. data/ext/nokogiri/xml_node.c +1 -1
  12. data/ext/nokogiri/xml_node_set.c +1 -1
  13. data/ext/nokogiri/xml_relax_ng.c +29 -11
  14. data/ext/nokogiri/xml_sax_parser.c +2 -7
  15. data/ext/nokogiri/xml_sax_parser_context.c +2 -2
  16. data/ext/nokogiri/xml_schema.c +55 -13
  17. data/ext/nokogiri/xml_xpath_context.c +80 -4
  18. data/ext/nokogiri/xslt_stylesheet.c +1 -4
  19. data/lib/nokogiri.rb +1 -1
  20. data/lib/nokogiri/css/parser.rb +3 -3
  21. data/lib/nokogiri/css/parser.y +2 -2
  22. data/lib/nokogiri/css/xpath_visitor.rb +70 -42
  23. data/lib/nokogiri/html/document.rb +12 -26
  24. data/lib/nokogiri/version.rb +2 -149
  25. data/lib/nokogiri/version/constant.rb +5 -0
  26. data/lib/nokogiri/version/info.rb +182 -0
  27. data/lib/nokogiri/xml/document.rb +17 -7
  28. data/lib/nokogiri/xml/document_fragment.rb +4 -6
  29. data/lib/nokogiri/xml/node.rb +50 -27
  30. data/lib/nokogiri/xml/parse_options.rb +6 -0
  31. data/lib/nokogiri/xml/relax_ng.rb +6 -2
  32. data/lib/nokogiri/xml/schema.rb +12 -4
  33. data/lib/nokogiri/xml/searchable.rb +3 -1
  34. data/patches/libxml2/0006-htmlParseComment-treat-as-if-it-closed-the-comment.patch +73 -0
  35. data/patches/libxml2/0007-use-new-htmlParseLookupCommentEnd-to-find-comment-en.patch +103 -0
  36. data/patches/libxml2/0008-use-glibc-strlen.patch +53 -0
  37. metadata +34 -22
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
  #
3
3
  # DO NOT MODIFY!!!!
4
- # This file is automatically generated by Racc 1.4.16
4
+ # This file is automatically generated by Racc 1.5.1
5
5
  # from Racc grammar file "".
6
6
  #
7
7
 
@@ -476,7 +476,7 @@ def _reduce_26(val, _values, result)
476
476
  end
477
477
 
478
478
  def _reduce_27(val, _values, result)
479
- # Non standard, but hpricot supports it.
479
+ # non-standard, from hpricot
480
480
  result = Node.new(:PSEUDO_CLASS,
481
481
  [Node.new(:FUNCTION, ['nth-child(', val[1]])]
482
482
  )
@@ -558,7 +558,7 @@ def _reduce_40(val, _values, result)
558
558
  when 'n'
559
559
  result = Node.new(:NTH, ['1','n','+','0'])
560
560
  else
561
- # This is not CSS standard. It allows us to support this:
561
+ # non-standard to support custom functions:
562
562
  # assert_xpath("//a[foo(., @href)]", @parser.parse('a:foo(@href)'))
563
563
  # assert_xpath("//a[foo(., @a, b)]", @parser.parse('a:foo(@a, b)'))
564
564
  # assert_xpath("//a[foo(., a, 10)]", @parser.parse('a:foo(a, 10)'))
@@ -88,7 +88,7 @@ rule
88
88
  )
89
89
  }
90
90
  | LSQUARE NUMBER RSQUARE {
91
- # Non standard, but hpricot supports it.
91
+ # non-standard, from hpricot
92
92
  result = Node.new(:PSEUDO_CLASS,
93
93
  [Node.new(:FUNCTION, ['nth-child(', val[1]])]
94
94
  )
@@ -139,7 +139,7 @@ rule
139
139
  when 'n'
140
140
  result = Node.new(:NTH, ['1','n','+','0'])
141
141
  else
142
- # This is not CSS standard. It allows us to support this:
142
+ # non-standard to support custom functions:
143
143
  # assert_xpath("//a[foo(., @href)]", @parser.parse('a:foo(@href)'))
144
144
  # assert_xpath("//a[foo(., @a, b)]", @parser.parse('a:foo(@a, b)'))
145
145
  # assert_xpath("//a[foo(., a, 10)]", @parser.parse('a:foo(a, 10)'))
@@ -3,7 +3,6 @@ module Nokogiri
3
3
  module CSS
4
4
  class XPathVisitor # :nodoc:
5
5
  def visit_function node
6
-
7
6
  msg = :"visit_function_#{node.value.first.gsub(/[(]/, '')}"
8
7
  return self.send(msg, node) if self.respond_to?(msg)
9
8
 
@@ -13,50 +12,51 @@ module Nokogiri
13
12
  when /^self\(/
14
13
  "self::#{node.value[1]}"
15
14
  when /^eq\(/
16
- "position() = #{node.value[1]}"
15
+ "position()=#{node.value[1]}"
17
16
  when /^(nth|nth-of-type)\(/
18
17
  if node.value[1].is_a?(Nokogiri::CSS::Node) and node.value[1].type == :NTH
19
18
  nth(node.value[1])
20
19
  else
21
- "position() = #{node.value[1]}"
20
+ "position()=#{node.value[1]}"
22
21
  end
23
22
  when /^nth-child\(/
24
23
  if node.value[1].is_a?(Nokogiri::CSS::Node) and node.value[1].type == :NTH
25
24
  nth(node.value[1], :child => true)
26
25
  else
27
- "count(preceding-sibling::*) = #{node.value[1].to_i-1}"
26
+ "count(preceding-sibling::*)=#{node.value[1].to_i-1}"
28
27
  end
29
28
  when /^nth-last-of-type\(/
30
29
  if node.value[1].is_a?(Nokogiri::CSS::Node) and node.value[1].type == :NTH
31
30
  nth(node.value[1], :last => true)
32
31
  else
33
32
  index = node.value[1].to_i - 1
34
- index == 0 ? "position() = last()" : "position() = last() - #{index}"
33
+ index == 0 ? "position()=last()" : "position()=last()-#{index}"
35
34
  end
36
35
  when /^nth-last-child\(/
37
36
  if node.value[1].is_a?(Nokogiri::CSS::Node) and node.value[1].type == :NTH
38
37
  nth(node.value[1], :last => true, :child => true)
39
38
  else
40
- "count(following-sibling::*) = #{node.value[1].to_i-1}"
39
+ "count(following-sibling::*)=#{node.value[1].to_i-1}"
41
40
  end
42
41
  when /^(first|first-of-type)\(/
43
- "position() = 1"
42
+ "position()=1"
44
43
  when /^(last|last-of-type)\(/
45
- "position() = last()"
44
+ "position()=last()"
46
45
  when /^contains\(/
47
- "contains(., #{node.value[1]})"
46
+ "contains(.,#{node.value[1]})"
48
47
  when /^gt\(/
49
- "position() > #{node.value[1]}"
48
+ "position()>#{node.value[1]}"
50
49
  when /^only-child\(/
51
- "last() = 1"
50
+ "last()=1"
52
51
  when /^comment\(/
53
52
  "comment()"
54
53
  when /^has\(/
55
54
  is_direct = node.value[1].value[0].nil? # e.g. "has(> a)", "has(~ a)", "has(+ a)"
56
55
  ".#{"//" if !is_direct}#{node.value[1].accept(self)}"
57
56
  else
57
+ # non-standard. this looks like a function call.
58
58
  args = ['.'] + node.value[1..-1]
59
- "#{node.value.first}#{args.join(', ')})"
59
+ "#{node.value.first}#{args.join(',')})"
60
60
  end
61
61
  end
62
62
 
@@ -71,18 +71,18 @@ module Nokogiri
71
71
 
72
72
  def visit_id node
73
73
  node.value.first =~ /^#(.*)$/
74
- "@id = '#{$1}'"
74
+ "@id='#{$1}'"
75
75
  end
76
76
 
77
77
  def visit_attribute_condition node
78
- attribute = if (node.value.first.type == :FUNCTION) or (node.value.first.value.first =~ /::/)
79
- ''
80
- else
81
- '@'
82
- end
78
+ attribute = if (node.value.first.type == :FUNCTION) or (node.value.first.value.first =~ /::/)
79
+ ''
80
+ else
81
+ '@'
82
+ end
83
83
  attribute += node.value.first.accept(self)
84
84
 
85
- # Support non-standard css
85
+ # non-standard. attributes starting with '@'
86
86
  attribute.gsub!(/^@@/, '@')
87
87
 
88
88
  return attribute unless node.value.length == 3
@@ -90,29 +90,30 @@ module Nokogiri
90
90
  value = node.value.last
91
91
  value = "'#{value}'" if value !~ /^['"]/
92
92
 
93
+ # quoted values - see test_attribute_value_with_quotes in test/css/test_parser.rb
93
94
  if (value[0]==value[-1]) && %q{"'}.include?(value[0])
94
95
  str_value = value[1..-2]
95
96
  if str_value.include?(value[0])
96
- value = 'concat("' + str_value.split('"', -1).join(%q{", '"', "}) + '", "")'
97
+ value = 'concat("' + str_value.split('"', -1).join(%q{",'"',"}) + '","")'
97
98
  end
98
99
  end
99
100
 
100
101
  case node.value[1]
101
102
  when :equal
102
- attribute + " = " + "#{value}"
103
+ attribute + "=" + "#{value}"
103
104
  when :not_equal
104
- attribute + " != " + "#{value}"
105
+ attribute + "!=" + "#{value}"
105
106
  when :substring_match
106
- "contains(#{attribute}, #{value})"
107
+ "contains(#{attribute},#{value})"
107
108
  when :prefix_match
108
- "starts-with(#{attribute}, #{value})"
109
+ "starts-with(#{attribute},#{value})"
109
110
  when :dash_match
110
- "#{attribute} = #{value} or starts-with(#{attribute}, concat(#{value}, '-'))"
111
+ "#{attribute}=#{value} or starts-with(#{attribute},concat(#{value},'-'))"
111
112
  when :includes
112
- "contains(concat(\" \", #{attribute}, \" \"),concat(\" \", #{value}, \" \"))"
113
+ value = value[1..-2] # strip quotes
114
+ css_class(attribute, value)
113
115
  when :suffix_match
114
- "substring(#{attribute}, string-length(#{attribute}) - " +
115
- "string-length(#{value}) + 1, string-length(#{value})) = #{value}"
116
+ "substring(#{attribute},string-length(#{attribute})-string-length(#{value})+1,string-length(#{value}))=#{value}"
116
117
  else
117
118
  attribute + " #{node.value[1]} " + "#{value}"
118
119
  end
@@ -126,14 +127,14 @@ module Nokogiri
126
127
  return self.send(msg, node) if self.respond_to?(msg)
127
128
 
128
129
  case node.value.first
129
- when "first" then "position() = 1"
130
- when "first-child" then "count(preceding-sibling::*) = 0"
131
- when "last" then "position() = last()"
132
- when "last-child" then "count(following-sibling::*) = 0"
133
- when "first-of-type" then "position() = 1"
134
- when "last-of-type" then "position() = last()"
135
- when "only-child" then "count(preceding-sibling::*) = 0 and count(following-sibling::*) = 0"
136
- when "only-of-type" then "last() = 1"
130
+ when "first" then "position()=1"
131
+ when "first-child" then "count(preceding-sibling::*)=0"
132
+ when "last" then "position()=last()"
133
+ when "last-child" then "count(following-sibling::*)=0"
134
+ when "first-of-type" then "position()=1"
135
+ when "last-of-type" then "position()=last()"
136
+ when "only-child" then "count(preceding-sibling::*)=0 and count(following-sibling::*)=0"
137
+ when "only-of-type" then "last()=1"
137
138
  when "empty" then "not(node())"
138
139
  when "parent" then "node()"
139
140
  when "root" then "not(parent::*)"
@@ -144,7 +145,7 @@ module Nokogiri
144
145
  end
145
146
 
146
147
  def visit_class_condition node
147
- "contains(concat(' ', normalize-space(@class), ' '), ' #{node.value.first} ')"
148
+ css_class("@class", node.value.first)
148
149
  end
149
150
 
150
151
  def visit_combinator node
@@ -181,25 +182,26 @@ module Nokogiri
181
182
  node.accept(self)
182
183
  end
183
184
 
184
- private
185
+ private
186
+
185
187
  def nth node, options={}
186
188
  raise ArgumentError, "expected an+b node to contain 4 tokens, but is #{node.value.inspect}" unless node.value.size == 4
187
189
 
188
190
  a, b = read_a_and_positive_b node.value
189
191
  position = if options[:child]
190
- options[:last] ? "(count(following-sibling::*) + 1)" : "(count(preceding-sibling::*) + 1)"
192
+ options[:last] ? "(count(following-sibling::*)+1)" : "(count(preceding-sibling::*)+1)"
191
193
  else
192
194
  options[:last] ? "(last()-position()+1)" : "position()"
193
195
  end
194
196
 
195
197
  if b.zero?
196
- "(#{position} mod #{a}) = 0"
198
+ "(#{position} mod #{a})=0"
197
199
  else
198
200
  compare = a < 0 ? "<=" : ">="
199
201
  if a.abs == 1
200
- "#{position} #{compare} #{b}"
202
+ "#{position}#{compare}#{b}"
201
203
  else
202
- "(#{position} #{compare} #{b}) and (((#{position}-#{b}) mod #{a.abs}) = 0)"
204
+ "(#{position}#{compare}#{b}) and (((#{position}-#{b}) mod #{a.abs})=0)"
203
205
  end
204
206
  end
205
207
  end
@@ -227,6 +229,32 @@ module Nokogiri
227
229
  end =~ /(nth|first|last|only)-of-type(\()?/
228
230
  end
229
231
  end
232
+
233
+ # use only ordinary xpath functions
234
+ def css_class_standard(hay, needle)
235
+ "contains(concat(' ',normalize-space(#{hay}),' '),' #{needle} ')"
236
+ end
237
+
238
+ # use the builtin implementation
239
+ def css_class_builtin(hay, needle)
240
+ "nokogiri-builtin:css-class(#{hay},'#{needle}')"
241
+ end
242
+
243
+ alias_method :css_class, :css_class_standard
244
+ end
245
+
246
+ class XPathVisitorAlwaysUseBuiltins < XPathVisitor # :nodoc:
247
+ private
248
+ alias_method :css_class, :css_class_builtin
249
+ end
250
+
251
+ class XPathVisitorOptimallyUseBuiltins < XPathVisitor # :nodoc:
252
+ private
253
+ if Nokogiri.uses_libxml?
254
+ alias_method :css_class, :css_class_builtin
255
+ else
256
+ alias_method :css_class, :css_class_standard
257
+ end
230
258
  end
231
259
  end
232
260
  end
@@ -1,4 +1,7 @@
1
1
  # frozen_string_literal: true
2
+
3
+ require 'pathname'
4
+
2
5
  module Nokogiri
3
6
  module HTML
4
7
  class Document < Nokogiri::XML::Document
@@ -161,11 +164,12 @@ module Nokogiri
161
164
  # Nokogiri::XML::ParseOptions::RECOVER. See the constants in
162
165
  # Nokogiri::XML::ParseOptions.
163
166
  def parse string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML
164
-
165
167
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
166
- # Give the options to the user
168
+
167
169
  yield options if block_given?
168
170
 
171
+ url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
172
+
169
173
  if string_or_io.respond_to?(:encoding)
170
174
  unless string_or_io.encoding.name == "ASCII-8BIT"
171
175
  encoding ||= string_or_io.encoding.name
@@ -173,7 +177,12 @@ module Nokogiri
173
177
  end
174
178
 
175
179
  if string_or_io.respond_to?(:read)
176
- url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
180
+ if string_or_io.is_a?(Pathname)
181
+ # resolve the Pathname to the file and open it as an IO object, see #2110
182
+ string_or_io = string_or_io.expand_path.open
183
+ url ||= string_or_io.path
184
+ end
185
+
177
186
  unless encoding
178
187
  # Libxml2's parser has poor support for encoding
179
188
  # detection. First, it does not recognize the HTML5
@@ -252,9 +261,6 @@ module Nokogiri
252
261
  end
253
262
 
254
263
  def self.detect_encoding(chunk)
255
- if Nokogiri.jruby? && EncodingReader.is_jruby_without_fix?
256
- return EncodingReader.detect_encoding_for_jruby_without_fix(chunk)
257
- end
258
264
  m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
259
265
  return Nokogiri.XML(m[1]).encoding
260
266
 
@@ -273,26 +279,6 @@ module Nokogiri
273
279
  end
274
280
  end
275
281
 
276
- def self.is_jruby_without_fix?
277
- JRUBY_VERSION.split('.').join.to_i < 165
278
- end
279
-
280
- def self.detect_encoding_for_jruby_without_fix(chunk)
281
- m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
282
- return Nokogiri.XML(m[1]).encoding
283
-
284
- m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
285
- return m[4]
286
-
287
- catch(:encoding_found) {
288
- Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found.to_s)).parse(chunk)
289
- nil
290
- }
291
- rescue Nokogiri::SyntaxError, RuntimeError
292
- # Ignore parser errors that nokogiri may raise
293
- nil
294
- end
295
-
296
282
  def initialize(io)
297
283
  @io = io
298
284
  @firstchunk = nil
@@ -1,150 +1,3 @@
1
1
  # frozen_string_literal: true
2
- module Nokogiri
3
- # The version of Nokogiri you are using
4
- VERSION = "1.11.0.rc3"
5
-
6
- class VersionInfo # :nodoc:
7
- def jruby?
8
- ::JRUBY_VERSION if RUBY_PLATFORM == "java"
9
- end
10
-
11
- def engine
12
- defined?(RUBY_ENGINE) ? RUBY_ENGINE : "mri"
13
- end
14
-
15
- def loaded_libxml_version
16
- Gem::Version.new(LIBXML_LOADED_VERSION.
17
- scan(/^(\d+)(\d\d)(\d\d)(?!\d)/).first.
18
- collect(&:to_i).
19
- join("."))
20
- end
21
-
22
- def compiled_libxml_version
23
- Gem::Version.new LIBXML_COMPILED_VERSION
24
- end
25
-
26
- def loaded_libxslt_version
27
- Gem::Version.new(LIBXSLT_LOADED_VERSION.
28
- scan(/^(\d+)(\d\d)(\d\d)(?!\d)/).first.
29
- collect(&:to_i).
30
- join("."))
31
- end
32
-
33
- def compiled_libxslt_version
34
- Gem::Version.new LIBXSLT_COMPILED_VERSION
35
- end
36
-
37
- def libxml2?
38
- defined?(LIBXML_COMPILED_VERSION)
39
- end
40
-
41
- def libxml2_using_system?
42
- !libxml2_using_packaged?
43
- end
44
-
45
- def libxml2_using_packaged?
46
- NOKOGIRI_USE_PACKAGED_LIBRARIES
47
- end
48
-
49
- def warnings
50
- warnings = []
51
-
52
- if libxml2?
53
- if compiled_libxml_version != loaded_libxml_version
54
- warnings << "Nokogiri was built against libxml version #{compiled_libxml_version}, but has dynamically loaded #{loaded_libxml_version}"
55
- end
56
-
57
- if compiled_libxslt_version != loaded_libxslt_version
58
- warnings << "Nokogiri was built against libxslt version #{compiled_libxslt_version}, but has dynamically loaded #{loaded_libxslt_version}"
59
- end
60
- end
61
-
62
- warnings
63
- end
64
-
65
- def to_hash
66
- {}.tap do |vi|
67
- vi["warnings"] = []
68
- vi["nokogiri"] = Nokogiri::VERSION
69
- vi["ruby"] = {}.tap do |ruby|
70
- ruby["version"] = ::RUBY_VERSION
71
- ruby["platform"] = ::RUBY_PLATFORM
72
- ruby["gem_platform"] = ::Gem::Platform.local.to_s
73
- ruby["description"] = ::RUBY_DESCRIPTION
74
- ruby["engine"] = engine
75
- ruby["jruby"] = jruby? if jruby?
76
- end
77
-
78
- if libxml2?
79
- vi["libxml"] = {}.tap do |libxml|
80
- if libxml2_using_packaged?
81
- libxml["source"] = "packaged"
82
- libxml["patches"] = NOKOGIRI_LIBXML2_PATCHES
83
- else
84
- libxml["source"] = "system"
85
- end
86
- libxml["compiled"] = compiled_libxml_version.to_s
87
- libxml["loaded"] = loaded_libxml_version.to_s
88
- end
89
-
90
- vi["libxslt"] = {}.tap do |libxslt|
91
- if libxml2_using_packaged?
92
- libxslt["source"] = "packaged"
93
- libxslt["patches"] = NOKOGIRI_LIBXSLT_PATCHES
94
- else
95
- libxslt["source"] = "system"
96
- end
97
- libxslt["compiled"] = compiled_libxslt_version.to_s
98
- libxslt["loaded"] = loaded_libxslt_version.to_s
99
- end
100
-
101
- vi["warnings"] = warnings
102
- elsif jruby?
103
- vi["xerces"] = Nokogiri::XERCES_VERSION
104
- vi["nekohtml"] = Nokogiri::NEKO_VERSION
105
- end
106
- end
107
- end
108
-
109
- def to_markdown
110
- begin
111
- require "psych"
112
- rescue LoadError
113
- end
114
- require "yaml"
115
- "# Nokogiri (#{Nokogiri::VERSION})\n" +
116
- YAML.dump(to_hash).each_line.map { |line| " #{line}" }.join
117
- end
118
-
119
- # FIXME: maybe switch to singleton?
120
- @@instance = new
121
- @@instance.warnings.each do |warning|
122
- warn "WARNING: #{warning}"
123
- end
124
- def self.instance; @@instance; end
125
- end
126
-
127
- def self.uses_libxml?(requirement = nil) # :nodoc:
128
- return false unless VersionInfo.instance.libxml2?
129
- return true unless requirement
130
- return Gem::Requirement.new(requirement).satisfied_by?(VersionInfo.instance.loaded_libxml_version)
131
- end
132
-
133
- def self.jruby? # :nodoc:
134
- VersionInfo.instance.jruby?
135
- end
136
-
137
- # Ensure constants used in this file are loaded - see #1896
138
- if Nokogiri.jruby?
139
- require "nokogiri/jruby/dependencies"
140
- end
141
- begin
142
- RUBY_VERSION =~ /(\d+\.\d+)/
143
- require "nokogiri/#{$1}/nokogiri"
144
- rescue LoadError
145
- require "nokogiri/nokogiri"
146
- end
147
-
148
- # More complete version information about libxml
149
- VERSION_INFO = VersionInfo.instance.to_hash
150
- end
2
+ require_relative "version/constant"
3
+ require_relative "version/info"