nokogiri 1.12.2 → 1.13.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (97)
  1. checksums.yaml +4 -4
  2. data/Gemfile +2 -0
  3. data/README.md +9 -7
  4. data/bin/nokogiri +63 -50
  5. data/dependencies.yml +5 -6
  6. data/ext/nokogiri/extconf.rb +51 -35
  7. data/ext/nokogiri/gumbo.c +11 -11
  8. data/ext/nokogiri/html4_element_description.c +1 -1
  9. data/ext/nokogiri/html4_sax_parser_context.c +2 -1
  10. data/ext/nokogiri/nokogiri.c +1 -1
  11. data/ext/nokogiri/nokogiri.h +3 -0
  12. data/ext/nokogiri/xml_document.c +36 -36
  13. data/ext/nokogiri/xml_document_fragment.c +0 -2
  14. data/ext/nokogiri/xml_dtd.c +2 -2
  15. data/ext/nokogiri/xml_encoding_handler.c +25 -11
  16. data/ext/nokogiri/xml_namespace.c +2 -2
  17. data/ext/nokogiri/xml_node.c +647 -335
  18. data/ext/nokogiri/xml_reader.c +37 -11
  19. data/ext/nokogiri/xml_xpath_context.c +72 -49
  20. data/gumbo-parser/src/parser.c +0 -11
  21. data/lib/nokogiri/class_resolver.rb +67 -0
  22. data/lib/nokogiri/css/node.rb +9 -8
  23. data/lib/nokogiri/css/parser.rb +11 -3
  24. data/lib/nokogiri/css/parser.y +10 -2
  25. data/lib/nokogiri/css/parser_extras.rb +20 -20
  26. data/lib/nokogiri/css/syntax_error.rb +1 -0
  27. data/lib/nokogiri/css/tokenizer.rb +2 -1
  28. data/lib/nokogiri/css/tokenizer.rex +2 -1
  29. data/lib/nokogiri/css/xpath_visitor.rb +174 -75
  30. data/lib/nokogiri/css.rb +38 -6
  31. data/lib/nokogiri/decorators/slop.rb +8 -7
  32. data/lib/nokogiri/extension.rb +1 -1
  33. data/lib/nokogiri/gumbo.rb +1 -0
  34. data/lib/nokogiri/html.rb +16 -10
  35. data/lib/nokogiri/html4/builder.rb +1 -0
  36. data/lib/nokogiri/html4/document.rb +84 -75
  37. data/lib/nokogiri/html4/document_fragment.rb +11 -7
  38. data/lib/nokogiri/html4/element_description.rb +1 -0
  39. data/lib/nokogiri/html4/element_description_defaults.rb +426 -520
  40. data/lib/nokogiri/html4/entity_lookup.rb +2 -1
  41. data/lib/nokogiri/html4/sax/parser.rb +2 -1
  42. data/lib/nokogiri/html4/sax/parser_context.rb +1 -0
  43. data/lib/nokogiri/html4/sax/push_parser.rb +7 -7
  44. data/lib/nokogiri/html4.rb +11 -5
  45. data/lib/nokogiri/html5/document.rb +24 -10
  46. data/lib/nokogiri/html5/document_fragment.rb +5 -2
  47. data/lib/nokogiri/html5/node.rb +6 -3
  48. data/lib/nokogiri/html5.rb +68 -64
  49. data/lib/nokogiri/jruby/dependencies.rb +10 -9
  50. data/lib/nokogiri/syntax_error.rb +1 -0
  51. data/lib/nokogiri/version/constant.rb +2 -1
  52. data/lib/nokogiri/version/info.rb +19 -13
  53. data/lib/nokogiri/version.rb +1 -0
  54. data/lib/nokogiri/xml/attr.rb +5 -3
  55. data/lib/nokogiri/xml/attribute_decl.rb +2 -1
  56. data/lib/nokogiri/xml/builder.rb +69 -31
  57. data/lib/nokogiri/xml/cdata.rb +2 -1
  58. data/lib/nokogiri/xml/character_data.rb +1 -0
  59. data/lib/nokogiri/xml/document.rb +178 -96
  60. data/lib/nokogiri/xml/document_fragment.rb +41 -38
  61. data/lib/nokogiri/xml/dtd.rb +3 -2
  62. data/lib/nokogiri/xml/element_content.rb +1 -0
  63. data/lib/nokogiri/xml/element_decl.rb +2 -1
  64. data/lib/nokogiri/xml/entity_decl.rb +3 -2
  65. data/lib/nokogiri/xml/entity_reference.rb +1 -0
  66. data/lib/nokogiri/xml/namespace.rb +2 -0
  67. data/lib/nokogiri/xml/node/save_options.rb +7 -4
  68. data/lib/nokogiri/xml/node.rb +512 -348
  69. data/lib/nokogiri/xml/node_set.rb +46 -54
  70. data/lib/nokogiri/xml/notation.rb +12 -0
  71. data/lib/nokogiri/xml/parse_options.rb +11 -7
  72. data/lib/nokogiri/xml/pp/character_data.rb +8 -6
  73. data/lib/nokogiri/xml/pp/node.rb +24 -26
  74. data/lib/nokogiri/xml/pp.rb +1 -0
  75. data/lib/nokogiri/xml/processing_instruction.rb +2 -1
  76. data/lib/nokogiri/xml/reader.rb +17 -19
  77. data/lib/nokogiri/xml/relax_ng.rb +1 -0
  78. data/lib/nokogiri/xml/sax/document.rb +20 -19
  79. data/lib/nokogiri/xml/sax/parser.rb +36 -34
  80. data/lib/nokogiri/xml/sax/parser_context.rb +7 -3
  81. data/lib/nokogiri/xml/sax/push_parser.rb +5 -5
  82. data/lib/nokogiri/xml/sax.rb +1 -0
  83. data/lib/nokogiri/xml/schema.rb +7 -6
  84. data/lib/nokogiri/xml/searchable.rb +42 -22
  85. data/lib/nokogiri/xml/syntax_error.rb +4 -4
  86. data/lib/nokogiri/xml/text.rb +1 -0
  87. data/lib/nokogiri/xml/xpath/syntax_error.rb +2 -1
  88. data/lib/nokogiri/xml/xpath.rb +12 -0
  89. data/lib/nokogiri/xml/xpath_context.rb +2 -3
  90. data/lib/nokogiri/xml.rb +3 -3
  91. data/lib/nokogiri/xslt/stylesheet.rb +1 -0
  92. data/lib/nokogiri/xslt.rb +3 -2
  93. data/lib/nokogiri.rb +19 -16
  94. data/lib/xsd/xmlparser/nokogiri.rb +25 -24
  95. data/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch +61 -0
  96. data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
  97. metadata +101 -27
@@ -1,6 +1,7 @@
1
+ # coding: utf-8
1
2
  # frozen_string_literal: true
2
3
 
3
- require 'pathname'
4
+ require "pathname"
4
5
 
5
6
  module Nokogiri
6
7
  module HTML4
@@ -9,11 +10,10 @@ module Nokogiri
9
10
  # Get the meta tag encoding for this document. If there is no meta tag,
10
11
  # then nil is returned.
11
12
  def meta_encoding
12
- case
13
- when meta = at('//meta[@charset]')
13
+ if (meta = at_xpath("//meta[@charset]"))
14
14
  meta[:charset]
15
- when meta = meta_content_type
16
- meta['content'][/charset\s*=\s*([\w-]+)/i, 1]
15
+ elsif (meta = meta_content_type)
16
+ meta["content"][/charset\s*=\s*([\w-]+)/i, 1]
17
17
  end
18
18
  end
19
19
 
@@ -33,24 +33,22 @@ module Nokogiri
33
33
  #
34
34
  # Beware that in CRuby, libxml2 automatically inserts a meta tag
35
35
  # into a head element.
36
- def meta_encoding= encoding
37
- case
38
- when meta = meta_content_type
39
- meta['content'] = 'text/html; charset=%s' % encoding
36
+ def meta_encoding=(encoding)
37
+ if (meta = meta_content_type)
38
+ meta["content"] = format("text/html; charset=%s", encoding)
40
39
  encoding
41
- when meta = at('//meta[@charset]')
42
- meta['charset'] = encoding
40
+ elsif (meta = at_xpath("//meta[@charset]"))
41
+ meta["charset"] = encoding
43
42
  else
44
- meta = XML::Node.new('meta', self)
45
- if dtd = internal_subset and dtd.html5_dtd?
46
- meta['charset'] = encoding
43
+ meta = XML::Node.new("meta", self)
44
+ if (dtd = internal_subset) && dtd.html5_dtd?
45
+ meta["charset"] = encoding
47
46
  else
48
- meta['http-equiv'] = 'Content-Type'
49
- meta['content'] = 'text/html; charset=%s' % encoding
47
+ meta["http-equiv"] = "Content-Type"
48
+ meta["content"] = format("text/html; charset=%s", encoding)
50
49
  end
51
50
 
52
- case
53
- when head = at('//head')
51
+ if (head = at_xpath("//head"))
54
52
  head.prepend_child(meta)
55
53
  else
56
54
  set_metadata_element(meta)
@@ -60,9 +58,9 @@ module Nokogiri
60
58
  end
61
59
 
62
60
  def meta_content_type
63
- xpath('//meta[@http-equiv and boolean(@content)]').find { |node|
64
- node['http-equiv'] =~ /\AContent-Type\z/i
65
- }
61
+ xpath("//meta[@http-equiv and boolean(@content)]").find do |node|
62
+ node["http-equiv"] =~ /\AContent-Type\z/i
63
+ end
66
64
  end
67
65
  private :meta_content_type
68
66
 
@@ -70,7 +68,7 @@ module Nokogiri
70
68
  # Get the title string of this document. Return nil if there is
71
69
  # no title tag.
72
70
  def title
73
- title = at('//title') and title.inner_text
71
+ (title = at_xpath("//title")) && title.inner_text
74
72
  end
75
73
 
76
74
  ###
@@ -86,44 +84,41 @@ module Nokogiri
86
84
  # content element (typically <body>) if any.
87
85
  def title=(text)
88
86
  tnode = XML::Text.new(text, self)
89
- if title = at('//title')
87
+ if (title = at_xpath("//title"))
90
88
  title.children = tnode
91
89
  return text
92
90
  end
93
91
 
94
- title = XML::Node.new('title', self) << tnode
95
- case
96
- when head = at('//head')
92
+ title = XML::Node.new("title", self) << tnode
93
+ if (head = at_xpath("//head"))
97
94
  head << title
98
- when meta = at('//meta[@charset]') || meta_content_type
95
+ elsif (meta = (at_xpath("//meta[@charset]") || meta_content_type))
99
96
  # better put after charset declaration
100
97
  meta.add_next_sibling(title)
101
98
  else
102
99
  set_metadata_element(title)
103
100
  end
104
- text
105
101
  end
106
102
 
107
- def set_metadata_element(element)
108
- case
109
- when head = at('//head')
103
+ def set_metadata_element(element) # rubocop:disable Naming/AccessorMethodName
104
+ if (head = at_xpath("//head"))
110
105
  head << element
111
- when html = at('//html')
112
- head = html.prepend_child(XML::Node.new('head', self))
106
+ elsif (html = at_xpath("//html"))
107
+ head = html.prepend_child(XML::Node.new("head", self))
113
108
  head.prepend_child(element)
114
- when first = children.find { |node|
115
- case node
116
- when XML::Element, XML::Text
117
- true
118
- end
119
- }
109
+ elsif (first = children.find do |node|
110
+ case node
111
+ when XML::Element, XML::Text
112
+ true
113
+ end
114
+ end)
120
115
  # We reach here only if the underlying document model
121
116
  # allows <html>/<head> elements to be omitted and does not
122
117
  # automatically supply them.
123
118
  first.add_previous_sibling(element)
124
119
  else
125
- html = add_child(XML::Node.new('html', self))
126
- head = html.add_child(XML::Node.new('head', self))
120
+ html = add_child(XML::Node.new("html", self))
121
+ head = html.add_child(XML::Node.new("head", self))
127
122
  head.prepend_child(element)
128
123
  end
129
124
  end
@@ -143,15 +138,25 @@ module Nokogiri
143
138
  # config.format.as_xml
144
139
  # end
145
140
  #
146
- def serialize options = {}
141
+ def serialize(options = {})
147
142
  options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
148
143
  super
149
144
  end
150
145
 
151
146
  ####
152
147
  # Create a Nokogiri::XML::DocumentFragment from +tags+
153
- def fragment tags = nil
154
- DocumentFragment.new(self, tags, self.root)
148
+ def fragment(tags = nil)
149
+ DocumentFragment.new(self, tags, root)
150
+ end
151
+
152
+ # :call-seq:
153
+ # xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig
154
+ #
155
+ # [Returns] The document type which determines CSS-to-XPath translation.
156
+ #
157
+ # See XPathVisitor for more information.
158
+ def xpath_doctype
159
+ Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML4
155
160
  end
156
161
 
157
162
  class << self
@@ -163,9 +168,8 @@ module Nokogiri
163
168
  # is a number that sets options in the parser, such as
164
169
  # Nokogiri::XML::ParseOptions::RECOVER. See the constants in
165
170
  # Nokogiri::XML::ParseOptions.
166
- def parse string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML
171
+ def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML)
167
172
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
168
-
169
173
  yield options if block_given?
170
174
 
171
175
  url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
@@ -206,7 +210,7 @@ module Nokogiri
206
210
  end
207
211
 
208
212
  # read_memory pukes on empty docs
209
- if string_or_io.nil? or string_or_io.empty?
213
+ if string_or_io.nil? || string_or_io.empty?
210
214
  return encoding ? new.tap { |i| i.encoding = encoding } : new
211
215
  end
212
216
 
@@ -216,37 +220,38 @@ module Nokogiri
216
220
  end
217
221
  end
218
222
 
219
- class EncodingFound < StandardError # :nodoc:
223
+ class EncodingFound < StandardError # :nodoc: all
220
224
  attr_reader :found_encoding
221
225
 
222
226
  def initialize(encoding)
223
227
  @found_encoding = encoding
224
- super("encoding found: %s" % encoding)
228
+ super(format("encoding found: %s", encoding))
225
229
  end
226
230
  end
227
231
 
228
- class EncodingReader # :nodoc:
229
- class SAXHandler < Nokogiri::XML::SAX::Document # :nodoc:
232
+ # :nodoc: all
233
+ class EncodingReader
234
+ class SAXHandler < Nokogiri::XML::SAX::Document
230
235
  attr_reader :encoding
231
-
236
+
232
237
  def initialize
233
238
  @encoding = nil
234
239
  super()
235
240
  end
236
-
241
+
237
242
  def start_element(name, attrs = [])
238
- return unless name == 'meta'
243
+ return unless name == "meta"
239
244
  attr = Hash[attrs]
240
- charset = attr['charset'] and
241
- @encoding = charset
242
- http_equiv = attr['http-equiv'] and
243
- http_equiv.match(/\AContent-Type\z/i) and
244
- content = attr['content'] and
245
- m = content.match(/;\s*charset\s*=\s*([\w-]+)/) and
246
- @encoding = m[1]
245
+ (charset = attr["charset"]) &&
246
+ (@encoding = charset)
247
+ (http_equiv = attr["http-equiv"]) &&
248
+ http_equiv.match(/\AContent-Type\z/i) &&
249
+ (content = attr["content"]) &&
250
+ (m = content.match(/;\s*charset\s*=\s*([\w-]+)/)) &&
251
+ (@encoding = m[1])
247
252
  end
248
253
  end
249
-
254
+
250
255
  class JumpSAXHandler < SAXHandler
251
256
  def initialize(jumptag)
252
257
  @jumptag = jumptag
@@ -255,26 +260,30 @@ module Nokogiri
255
260
 
256
261
  def start_element(name, attrs = [])
257
262
  super
258
- throw @jumptag, @encoding if @encoding
259
- throw @jumptag, nil if name =~ /\A(?:div|h1|img|p|br)\z/
263
+ throw(@jumptag, @encoding) if @encoding
264
+ throw(@jumptag, nil) if /\A(?:div|h1|img|p|br)\z/.match?(name)
260
265
  end
261
266
  end
262
267
 
263
268
  def self.detect_encoding(chunk)
264
- m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
265
- return Nokogiri.XML(m[1]).encoding
269
+ (m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/)) &&
270
+ (return Nokogiri.XML(m[1]).encoding)
266
271
 
267
272
  if Nokogiri.jruby?
268
- m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
269
- return m[4]
270
- catch(:encoding_found) {
273
+ (m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)) &&
274
+ (return m[4])
275
+ catch(:encoding_found) do
271
276
  Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
272
277
  nil
273
- }
278
+ end
274
279
  else
275
280
  handler = SAXHandler.new
276
281
  parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
277
- parser << chunk rescue Nokogiri::SyntaxError
282
+ begin
283
+ parser << chunk
284
+ rescue
285
+ Nokogiri::SyntaxError
286
+ end
278
287
  handler.encoding
279
288
  end
280
289
  end
@@ -293,13 +302,13 @@ module Nokogiri
293
302
  def read(len)
294
303
  # no support for a call without len
295
304
 
296
- if !@firstchunk
297
- @firstchunk = @io.read(len) or return nil
305
+ unless @firstchunk
306
+ (@firstchunk = @io.read(len)) || (return nil)
298
307
 
299
308
  # This implementation expects that the first call from
300
309
  # htmlReadIO() is made with a length long enough (~1KB) to
301
310
  # achieve advanced encoding detection.
302
- if encoding = EncodingReader.detect_encoding(@firstchunk)
311
+ if (encoding = EncodingReader.detect_encoding(@firstchunk))
303
312
  # The first chunk is stored for the next read in retry.
304
313
  raise @encoding_found = EncodingFound.new(encoding)
305
314
  end
@@ -308,7 +317,7 @@ module Nokogiri
308
317
 
309
318
  ret = @firstchunk.slice!(0, len)
310
319
  if (len -= ret.length) > 0
311
- rest = @io.read(len) and ret << rest
320
+ (rest = @io.read(len)) && ret << (rest)
312
321
  end
313
322
  if ret.empty?
314
323
  nil
@@ -1,34 +1,38 @@
1
1
  # frozen_string_literal: true
2
+
2
3
  module Nokogiri
3
4
  module HTML4
4
5
  class DocumentFragment < Nokogiri::XML::DocumentFragment
5
6
  ####
6
7
  # Create a Nokogiri::XML::DocumentFragment from +tags+, using +encoding+
7
- def self.parse(tags, encoding = nil)
8
+ def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
8
9
  doc = HTML4::Document.new
9
10
 
10
11
  encoding ||= if tags.respond_to?(:encoding)
11
12
  encoding = tags.encoding
12
13
  if encoding == ::Encoding::ASCII_8BIT
13
- 'UTF-8'
14
+ "UTF-8"
14
15
  else
15
16
  encoding.name
16
17
  end
17
18
  else
18
- 'UTF-8'
19
+ "UTF-8"
19
20
  end
20
21
 
21
22
  doc.encoding = encoding
22
23
 
23
- new(doc, tags)
24
+ new(doc, tags, nil, options, &block)
24
25
  end
25
26
 
26
- def initialize(document, tags = nil, ctx = nil)
27
+ def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEFAULT_HTML)
27
28
  return self unless tags
28
29
 
30
+ options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
31
+ yield options if block_given?
32
+
29
33
  if ctx
30
34
  preexisting_errors = document.errors.dup
31
- node_set = ctx.parse("<div>#{tags}</div>")
35
+ node_set = ctx.parse("<div>#{tags}</div>", options)
32
36
  node_set.first.children.each { |child| child.parent = self } unless node_set.empty?
33
37
  self.errors = document.errors - preexisting_errors
34
38
  else
@@ -39,7 +43,7 @@ module Nokogiri
39
43
  "/html/body/node()"
40
44
  end
41
45
 
42
- temp_doc = HTML4::Document.parse("<html><body>#{tags}", nil, document.encoding)
46
+ temp_doc = HTML4::Document.parse("<html><body>#{tags}", nil, document.encoding, options)
43
47
  temp_doc.xpath(path).each { |child| child.parent = self }
44
48
  self.errors = temp_doc.errors
45
49
  end
@@ -1,4 +1,5 @@
1
1
  # frozen_string_literal: true
2
+
2
3
  module Nokogiri
3
4
  module HTML4
4
5
  class ElementDescription