nokogiri 1.12.5 → 1.13.8

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +2 -0
  3. data/README.md +9 -7
  4. data/bin/nokogiri +63 -50
  5. data/dependencies.yml +13 -64
  6. data/ext/nokogiri/extconf.rb +66 -44
  7. data/ext/nokogiri/gumbo.c +1 -1
  8. data/ext/nokogiri/html4_sax_parser_context.c +2 -3
  9. data/ext/nokogiri/nokogiri.h +8 -0
  10. data/ext/nokogiri/xml_attr.c +2 -2
  11. data/ext/nokogiri/xml_attribute_decl.c +3 -3
  12. data/ext/nokogiri/xml_cdata.c +1 -1
  13. data/ext/nokogiri/xml_document.c +36 -36
  14. data/ext/nokogiri/xml_document_fragment.c +0 -2
  15. data/ext/nokogiri/xml_dtd.c +10 -10
  16. data/ext/nokogiri/xml_element_decl.c +3 -3
  17. data/ext/nokogiri/xml_encoding_handler.c +25 -11
  18. data/ext/nokogiri/xml_entity_decl.c +5 -5
  19. data/ext/nokogiri/xml_node.c +707 -381
  20. data/ext/nokogiri/xml_node_set.c +4 -4
  21. data/ext/nokogiri/xml_reader.c +88 -11
  22. data/ext/nokogiri/xml_sax_parser_context.c +10 -3
  23. data/ext/nokogiri/xml_schema.c +3 -3
  24. data/ext/nokogiri/xml_text.c +1 -1
  25. data/ext/nokogiri/xml_xpath_context.c +73 -50
  26. data/ext/nokogiri/xslt_stylesheet.c +107 -9
  27. data/gumbo-parser/src/parser.c +0 -11
  28. data/lib/nokogiri/class_resolver.rb +67 -0
  29. data/lib/nokogiri/css/node.rb +9 -8
  30. data/lib/nokogiri/css/parser.rb +360 -341
  31. data/lib/nokogiri/css/parser.y +249 -244
  32. data/lib/nokogiri/css/parser_extras.rb +22 -20
  33. data/lib/nokogiri/css/syntax_error.rb +1 -0
  34. data/lib/nokogiri/css/tokenizer.rb +4 -3
  35. data/lib/nokogiri/css/tokenizer.rex +3 -2
  36. data/lib/nokogiri/css/xpath_visitor.rb +179 -82
  37. data/lib/nokogiri/css.rb +38 -6
  38. data/lib/nokogiri/decorators/slop.rb +8 -7
  39. data/lib/nokogiri/extension.rb +1 -1
  40. data/lib/nokogiri/gumbo.rb +1 -0
  41. data/lib/nokogiri/html.rb +16 -10
  42. data/lib/nokogiri/html4/builder.rb +1 -0
  43. data/lib/nokogiri/html4/document.rb +88 -77
  44. data/lib/nokogiri/html4/document_fragment.rb +11 -7
  45. data/lib/nokogiri/html4/element_description.rb +1 -0
  46. data/lib/nokogiri/html4/element_description_defaults.rb +426 -520
  47. data/lib/nokogiri/html4/entity_lookup.rb +2 -1
  48. data/lib/nokogiri/html4/sax/parser.rb +5 -2
  49. data/lib/nokogiri/html4/sax/parser_context.rb +1 -0
  50. data/lib/nokogiri/html4/sax/push_parser.rb +7 -7
  51. data/lib/nokogiri/html4.rb +11 -5
  52. data/lib/nokogiri/html5/document.rb +27 -10
  53. data/lib/nokogiri/html5/document_fragment.rb +5 -2
  54. data/lib/nokogiri/html5/node.rb +10 -3
  55. data/lib/nokogiri/html5.rb +69 -64
  56. data/lib/nokogiri/jruby/dependencies.rb +10 -9
  57. data/lib/nokogiri/syntax_error.rb +1 -0
  58. data/lib/nokogiri/version/constant.rb +2 -1
  59. data/lib/nokogiri/version/info.rb +20 -13
  60. data/lib/nokogiri/version.rb +1 -0
  61. data/lib/nokogiri/xml/attr.rb +5 -3
  62. data/lib/nokogiri/xml/attribute_decl.rb +2 -1
  63. data/lib/nokogiri/xml/builder.rb +34 -32
  64. data/lib/nokogiri/xml/cdata.rb +2 -1
  65. data/lib/nokogiri/xml/character_data.rb +1 -0
  66. data/lib/nokogiri/xml/document.rb +144 -103
  67. data/lib/nokogiri/xml/document_fragment.rb +41 -38
  68. data/lib/nokogiri/xml/dtd.rb +3 -2
  69. data/lib/nokogiri/xml/element_content.rb +1 -0
  70. data/lib/nokogiri/xml/element_decl.rb +2 -1
  71. data/lib/nokogiri/xml/entity_decl.rb +3 -2
  72. data/lib/nokogiri/xml/entity_reference.rb +1 -0
  73. data/lib/nokogiri/xml/namespace.rb +2 -0
  74. data/lib/nokogiri/xml/node/save_options.rb +8 -4
  75. data/lib/nokogiri/xml/node.rb +521 -351
  76. data/lib/nokogiri/xml/node_set.rb +50 -54
  77. data/lib/nokogiri/xml/notation.rb +12 -0
  78. data/lib/nokogiri/xml/parse_options.rb +12 -7
  79. data/lib/nokogiri/xml/pp/character_data.rb +8 -6
  80. data/lib/nokogiri/xml/pp/node.rb +24 -26
  81. data/lib/nokogiri/xml/pp.rb +1 -0
  82. data/lib/nokogiri/xml/processing_instruction.rb +2 -1
  83. data/lib/nokogiri/xml/reader.rb +20 -24
  84. data/lib/nokogiri/xml/relax_ng.rb +1 -0
  85. data/lib/nokogiri/xml/sax/document.rb +20 -19
  86. data/lib/nokogiri/xml/sax/parser.rb +37 -34
  87. data/lib/nokogiri/xml/sax/parser_context.rb +7 -3
  88. data/lib/nokogiri/xml/sax/push_parser.rb +5 -5
  89. data/lib/nokogiri/xml/sax.rb +1 -0
  90. data/lib/nokogiri/xml/schema.rb +7 -6
  91. data/lib/nokogiri/xml/searchable.rb +93 -62
  92. data/lib/nokogiri/xml/syntax_error.rb +5 -4
  93. data/lib/nokogiri/xml/text.rb +1 -0
  94. data/lib/nokogiri/xml/xpath/syntax_error.rb +2 -1
  95. data/lib/nokogiri/xml/xpath.rb +12 -0
  96. data/lib/nokogiri/xml/xpath_context.rb +2 -3
  97. data/lib/nokogiri/xml.rb +4 -3
  98. data/lib/nokogiri/xslt/stylesheet.rb +1 -0
  99. data/lib/nokogiri/xslt.rb +21 -13
  100. data/lib/nokogiri.rb +19 -16
  101. data/lib/xsd/xmlparser/nokogiri.rb +25 -24
  102. data/patches/libxml2/0004-use-glibc-strlen.patch +3 -3
  103. data/patches/libxml2/0006-update-automake-files-for-arm64.patch +2443 -1914
  104. data/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch +61 -0
  105. data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
  106. data/patches/libxslt/0001-update-automake-files-for-arm64.patch +2445 -1919
  107. data/ports/archives/libxml2-2.9.14.tar.xz +0 -0
  108. data/ports/archives/libxslt-1.1.35.tar.xz +0 -0
  109. metadata +104 -32
  110. data/patches/libxml2/0007-Fix-XPath-recursion-limit.patch +0 -31
  111. data/patches/libxslt/0002-Fix-xml2-config-check-in-configure-script.patch +0 -19
  112. data/ports/archives/libxml2-2.9.12.tar.gz +0 -0
  113. data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
@@ -1,6 +1,7 @@
1
+ # coding: utf-8
1
2
  # frozen_string_literal: true
2
3
 
3
- require 'pathname'
4
+ require "pathname"
4
5
 
5
6
  module Nokogiri
6
7
  module HTML4
@@ -9,11 +10,10 @@ module Nokogiri
9
10
  # Get the meta tag encoding for this document. If there is no meta tag,
10
11
  # then nil is returned.
11
12
  def meta_encoding
12
- case
13
- when meta = at('//meta[@charset]')
13
+ if (meta = at_xpath("//meta[@charset]"))
14
14
  meta[:charset]
15
- when meta = meta_content_type
16
- meta['content'][/charset\s*=\s*([\w-]+)/i, 1]
15
+ elsif (meta = meta_content_type)
16
+ meta["content"][/charset\s*=\s*([\w-]+)/i, 1]
17
17
  end
18
18
  end
19
19
 
@@ -33,24 +33,22 @@ module Nokogiri
33
33
  #
34
34
  # Beware in CRuby, that libxml2 automatically inserts a meta tag
35
35
  # into a head element.
36
- def meta_encoding= encoding
37
- case
38
- when meta = meta_content_type
39
- meta['content'] = 'text/html; charset=%s' % encoding
36
+ def meta_encoding=(encoding)
37
+ if (meta = meta_content_type)
38
+ meta["content"] = format("text/html; charset=%s", encoding)
40
39
  encoding
41
- when meta = at('//meta[@charset]')
42
- meta['charset'] = encoding
40
+ elsif (meta = at_xpath("//meta[@charset]"))
41
+ meta["charset"] = encoding
43
42
  else
44
- meta = XML::Node.new('meta', self)
45
- if dtd = internal_subset and dtd.html5_dtd?
46
- meta['charset'] = encoding
43
+ meta = XML::Node.new("meta", self)
44
+ if (dtd = internal_subset) && dtd.html5_dtd?
45
+ meta["charset"] = encoding
47
46
  else
48
- meta['http-equiv'] = 'Content-Type'
49
- meta['content'] = 'text/html; charset=%s' % encoding
47
+ meta["http-equiv"] = "Content-Type"
48
+ meta["content"] = format("text/html; charset=%s", encoding)
50
49
  end
51
50
 
52
- case
53
- when head = at('//head')
51
+ if (head = at_xpath("//head"))
54
52
  head.prepend_child(meta)
55
53
  else
56
54
  set_metadata_element(meta)
@@ -60,9 +58,9 @@ module Nokogiri
60
58
  end
61
59
 
62
60
  def meta_content_type
63
- xpath('//meta[@http-equiv and boolean(@content)]').find { |node|
64
- node['http-equiv'] =~ /\AContent-Type\z/i
65
- }
61
+ xpath("//meta[@http-equiv and boolean(@content)]").find do |node|
62
+ node["http-equiv"] =~ /\AContent-Type\z/i
63
+ end
66
64
  end
67
65
  private :meta_content_type
68
66
 
@@ -70,7 +68,7 @@ module Nokogiri
70
68
  # Get the title string of this document. Return nil if there is
71
69
  # no title tag.
72
70
  def title
73
- title = at('//title') and title.inner_text
71
+ (title = at_xpath("//title")) && title.inner_text
74
72
  end
75
73
 
76
74
  ###
@@ -86,52 +84,50 @@ module Nokogiri
86
84
  # content element (typically <body>) if any.
87
85
  def title=(text)
88
86
  tnode = XML::Text.new(text, self)
89
- if title = at('//title')
87
+ if (title = at_xpath("//title"))
90
88
  title.children = tnode
91
89
  return text
92
90
  end
93
91
 
94
- title = XML::Node.new('title', self) << tnode
95
- case
96
- when head = at('//head')
92
+ title = XML::Node.new("title", self) << tnode
93
+ if (head = at_xpath("//head"))
97
94
  head << title
98
- when meta = at('//meta[@charset]') || meta_content_type
95
+ elsif (meta = (at_xpath("//meta[@charset]") || meta_content_type))
99
96
  # better put after charset declaration
100
97
  meta.add_next_sibling(title)
101
98
  else
102
99
  set_metadata_element(title)
103
100
  end
104
- text
105
101
  end
106
102
 
107
- def set_metadata_element(element)
108
- case
109
- when head = at('//head')
103
+ def set_metadata_element(element) # rubocop:disable Naming/AccessorMethodName
104
+ if (head = at_xpath("//head"))
110
105
  head << element
111
- when html = at('//html')
112
- head = html.prepend_child(XML::Node.new('head', self))
106
+ elsif (html = at_xpath("//html"))
107
+ head = html.prepend_child(XML::Node.new("head", self))
113
108
  head.prepend_child(element)
114
- when first = children.find { |node|
115
- case node
116
- when XML::Element, XML::Text
117
- true
118
- end
119
- }
109
+ elsif (first = children.find do |node|
110
+ case node
111
+ when XML::Element, XML::Text
112
+ true
113
+ end
114
+ end)
120
115
  # We reach here only if the underlying document model
121
116
  # allows <html>/<head> elements to be omitted and does not
122
117
  # automatically supply them.
123
118
  first.add_previous_sibling(element)
124
119
  else
125
- html = add_child(XML::Node.new('html', self))
126
- head = html.add_child(XML::Node.new('head', self))
120
+ html = add_child(XML::Node.new("html", self))
121
+ head = html.add_child(XML::Node.new("head", self))
127
122
  head.prepend_child(element)
128
123
  end
129
124
  end
130
125
  private :set_metadata_element
131
126
 
132
127
  ####
133
- # Serialize Node using +options+. Save options can also be set using a
134
- # block. See SaveOptions.
128
+ # Serialize Node using +options+. Save options can also be set using a block.
129
+ #
130
+ # See also Nokogiri::XML::Node::SaveOptions and Node@Serialization+and+Generating+Output.
135
131
  #
136
132
  # These two statements are equivalent:
137
133
  #
@@ -143,15 +139,25 @@ module Nokogiri
143
139
  # config.format.as_xml
144
140
  # end
145
141
  #
146
- def serialize options = {}
142
+ def serialize(options = {})
147
143
  options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
148
144
  super
149
145
  end
150
146
 
151
147
  ####
152
148
  # Create a Nokogiri::XML::DocumentFragment from +tags+
153
- def fragment tags = nil
154
- DocumentFragment.new(self, tags, self.root)
149
+ def fragment(tags = nil)
150
+ DocumentFragment.new(self, tags, root)
151
+ end
152
+
153
+ # :call-seq:
154
+ # xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig
155
+ #
156
+ # [Returns] The document type which determines CSS-to-XPath translation.
157
+ #
158
+ # See XPathVisitor for more information.
159
+ def xpath_doctype
160
+ Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML4
155
161
  end
156
162
 
157
163
  class << self
@@ -163,9 +169,8 @@ module Nokogiri
163
169
  # is a number that sets options in the parser, such as
164
170
  # Nokogiri::XML::ParseOptions::RECOVER. See the constants in
165
171
  # Nokogiri::XML::ParseOptions.
166
- def parse string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML
172
+ def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML)
167
173
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
168
-
169
174
  yield options if block_given?
170
175
 
171
176
  url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
@@ -206,7 +211,7 @@ module Nokogiri
206
211
  end
207
212
 
208
213
  # read_memory pukes on empty docs
209
- if string_or_io.nil? or string_or_io.empty?
214
+ if string_or_io.nil? || string_or_io.empty?
210
215
  return encoding ? new.tap { |i| i.encoding = encoding } : new
211
216
  end
212
217
 
@@ -216,37 +221,39 @@ module Nokogiri
216
221
  end
217
222
  end
218
223
 
219
- class EncodingFound < StandardError # :nodoc:
224
+ class EncodingFound < StandardError # :nodoc: all
220
225
  attr_reader :found_encoding
221
226
 
222
227
  def initialize(encoding)
223
228
  @found_encoding = encoding
224
- super("encoding found: %s" % encoding)
229
+ super(format("encoding found: %s", encoding))
225
230
  end
226
231
  end
227
232
 
228
- class EncodingReader # :nodoc:
229
- class SAXHandler < Nokogiri::XML::SAX::Document # :nodoc:
233
+ # :nodoc: all
234
+ class EncodingReader
235
+ class SAXHandler < Nokogiri::XML::SAX::Document
230
236
  attr_reader :encoding
231
-
237
+
232
238
  def initialize
233
239
  @encoding = nil
234
240
  super()
235
241
  end
236
-
242
+
237
243
  def start_element(name, attrs = [])
238
- return unless name == 'meta'
244
+ return unless name == "meta"
245
+
239
246
  attr = Hash[attrs]
240
- charset = attr['charset'] and
241
- @encoding = charset
242
- http_equiv = attr['http-equiv'] and
243
- http_equiv.match(/\AContent-Type\z/i) and
244
- content = attr['content'] and
245
- m = content.match(/;\s*charset\s*=\s*([\w-]+)/) and
246
- @encoding = m[1]
247
+ (charset = attr["charset"]) &&
248
+ (@encoding = charset)
249
+ (http_equiv = attr["http-equiv"]) &&
250
+ http_equiv.match(/\AContent-Type\z/i) &&
251
+ (content = attr["content"]) &&
252
+ (m = content.match(/;\s*charset\s*=\s*([\w-]+)/)) &&
253
+ (@encoding = m[1])
247
254
  end
248
255
  end
249
-
256
+
250
257
  class JumpSAXHandler < SAXHandler
251
258
  def initialize(jumptag)
252
259
  @jumptag = jumptag
@@ -255,26 +262,30 @@ module Nokogiri
255
262
 
256
263
  def start_element(name, attrs = [])
257
264
  super
258
- throw @jumptag, @encoding if @encoding
259
- throw @jumptag, nil if name =~ /\A(?:div|h1|img|p|br)\z/
265
+ throw(@jumptag, @encoding) if @encoding
266
+ throw(@jumptag, nil) if /\A(?:div|h1|img|p|br)\z/.match?(name)
260
267
  end
261
268
  end
262
269
 
263
270
  def self.detect_encoding(chunk)
264
- m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
265
- return Nokogiri.XML(m[1]).encoding
271
+ (m = chunk.match(/\A(<\?xml[ \t\r\n][^>]*>)/)) &&
272
+ (return Nokogiri.XML(m[1]).encoding)
266
273
 
267
274
  if Nokogiri.jruby?
268
- m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
269
- return m[4]
270
- catch(:encoding_found) {
275
+ (m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)) &&
276
+ (return m[4])
277
+ catch(:encoding_found) do
271
278
  Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
272
279
  nil
273
- }
280
+ end
274
281
  else
275
282
  handler = SAXHandler.new
276
283
  parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
277
- parser << chunk rescue Nokogiri::SyntaxError
284
+ begin
285
+ parser << chunk
286
+ rescue
287
+ Nokogiri::SyntaxError
288
+ end
278
289
  handler.encoding
279
290
  end
280
291
  end
@@ -293,13 +304,13 @@ module Nokogiri
293
304
  def read(len)
294
305
  # no support for a call without len
295
306
 
296
- if !@firstchunk
297
- @firstchunk = @io.read(len) or return nil
307
+ unless @firstchunk
308
+ (@firstchunk = @io.read(len)) || (return nil)
298
309
 
299
310
  # This implementation expects that the first call from
300
311
  # htmlReadIO() is made with a length long enough (~1KB) to
301
312
  # achieve advanced encoding detection.
302
- if encoding = EncodingReader.detect_encoding(@firstchunk)
313
+ if (encoding = EncodingReader.detect_encoding(@firstchunk))
303
314
  # The first chunk is stored for the next read in retry.
304
315
  raise @encoding_found = EncodingFound.new(encoding)
305
316
  end
@@ -308,7 +319,7 @@ module Nokogiri
308
319
 
309
320
  ret = @firstchunk.slice!(0, len)
310
321
  if (len -= ret.length) > 0
311
- rest = @io.read(len) and ret << rest
322
+ (rest = @io.read(len)) && ret << (rest)
312
323
  end
313
324
  if ret.empty?
314
325
  nil
@@ -1,34 +1,38 @@
1
1
  # frozen_string_literal: true
2
+
2
3
  module Nokogiri
3
4
  module HTML4
4
5
  class DocumentFragment < Nokogiri::XML::DocumentFragment
5
6
  ####
6
7
  # Create a Nokogiri::XML::DocumentFragment from +tags+, using +encoding+
7
- def self.parse(tags, encoding = nil)
8
+ def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
8
9
  doc = HTML4::Document.new
9
10
 
10
11
  encoding ||= if tags.respond_to?(:encoding)
11
12
  encoding = tags.encoding
12
13
  if encoding == ::Encoding::ASCII_8BIT
13
- 'UTF-8'
14
+ "UTF-8"
14
15
  else
15
16
  encoding.name
16
17
  end
17
18
  else
18
- 'UTF-8'
19
+ "UTF-8"
19
20
  end
20
21
 
21
22
  doc.encoding = encoding
22
23
 
23
- new(doc, tags)
24
+ new(doc, tags, nil, options, &block)
24
25
  end
25
26
 
26
- def initialize(document, tags = nil, ctx = nil)
27
+ def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEFAULT_HTML)
27
28
  return self unless tags
28
29
 
30
+ options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
31
+ yield options if block_given?
32
+
29
33
  if ctx
30
34
  preexisting_errors = document.errors.dup
31
- node_set = ctx.parse("<div>#{tags}</div>")
35
+ node_set = ctx.parse("<div>#{tags}</div>", options)
32
36
  node_set.first.children.each { |child| child.parent = self } unless node_set.empty?
33
37
  self.errors = document.errors - preexisting_errors
34
38
  else
@@ -39,7 +43,7 @@ module Nokogiri
39
43
  "/html/body/node()"
40
44
  end
41
45
 
42
- temp_doc = HTML4::Document.parse("<html><body>#{tags}", nil, document.encoding)
46
+ temp_doc = HTML4::Document.parse("<html><body>#{tags}", nil, document.encoding, options)
43
47
  temp_doc.xpath(path).each { |child| child.parent = self }
44
48
  self.errors = temp_doc.errors
45
49
  end
@@ -1,4 +1,5 @@
1
1
  # frozen_string_literal: true
2
+
2
3
  module Nokogiri
3
4
  module HTML4
4
5
  class ElementDescription