nokogiri 1.12.5 → 1.14.3

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (156) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +41 -0
  3. data/LICENSE-DEPENDENCIES.md +830 -509
  4. data/LICENSE.md +1 -1
  5. data/README.md +23 -14
  6. data/bin/nokogiri +63 -50
  7. data/dependencies.yml +33 -66
  8. data/ext/nokogiri/extconf.rb +159 -63
  9. data/ext/nokogiri/gumbo.c +21 -11
  10. data/ext/nokogiri/html4_document.c +2 -2
  11. data/ext/nokogiri/html4_element_description.c +1 -1
  12. data/ext/nokogiri/html4_entity_lookup.c +2 -2
  13. data/ext/nokogiri/html4_sax_parser_context.c +3 -9
  14. data/ext/nokogiri/html4_sax_push_parser.c +1 -1
  15. data/ext/nokogiri/nokogiri.c +38 -51
  16. data/ext/nokogiri/nokogiri.h +26 -14
  17. data/ext/nokogiri/test_global_handlers.c +1 -1
  18. data/ext/nokogiri/xml_attr.c +3 -3
  19. data/ext/nokogiri/xml_attribute_decl.c +5 -5
  20. data/ext/nokogiri/xml_cdata.c +3 -3
  21. data/ext/nokogiri/xml_comment.c +1 -1
  22. data/ext/nokogiri/xml_document.c +53 -44
  23. data/ext/nokogiri/xml_document_fragment.c +1 -3
  24. data/ext/nokogiri/xml_dtd.c +11 -11
  25. data/ext/nokogiri/xml_element_content.c +3 -3
  26. data/ext/nokogiri/xml_element_decl.c +5 -5
  27. data/ext/nokogiri/xml_encoding_handler.c +28 -14
  28. data/ext/nokogiri/xml_entity_decl.c +6 -6
  29. data/ext/nokogiri/xml_entity_reference.c +1 -1
  30. data/ext/nokogiri/xml_namespace.c +80 -14
  31. data/ext/nokogiri/xml_node.c +982 -396
  32. data/ext/nokogiri/xml_node_set.c +4 -6
  33. data/ext/nokogiri/xml_processing_instruction.c +1 -1
  34. data/ext/nokogiri/xml_reader.c +133 -32
  35. data/ext/nokogiri/xml_relax_ng.c +1 -3
  36. data/ext/nokogiri/xml_sax_parser.c +23 -17
  37. data/ext/nokogiri/xml_sax_parser_context.c +11 -9
  38. data/ext/nokogiri/xml_sax_push_parser.c +1 -3
  39. data/ext/nokogiri/xml_schema.c +4 -6
  40. data/ext/nokogiri/xml_syntax_error.c +1 -1
  41. data/ext/nokogiri/xml_text.c +2 -2
  42. data/ext/nokogiri/xml_xpath_context.c +144 -114
  43. data/ext/nokogiri/xslt_stylesheet.c +122 -23
  44. data/gumbo-parser/Makefile +10 -0
  45. data/gumbo-parser/src/attribute.h +1 -1
  46. data/gumbo-parser/src/error.c +2 -2
  47. data/gumbo-parser/src/error.h +1 -1
  48. data/gumbo-parser/src/foreign_attrs.c +2 -2
  49. data/gumbo-parser/src/{gumbo.h → nokogiri_gumbo.h} +1 -0
  50. data/gumbo-parser/src/parser.c +8 -16
  51. data/gumbo-parser/src/replacement.h +1 -1
  52. data/gumbo-parser/src/string_buffer.h +1 -1
  53. data/gumbo-parser/src/string_piece.c +1 -1
  54. data/gumbo-parser/src/svg_attrs.c +2 -2
  55. data/gumbo-parser/src/svg_tags.c +2 -2
  56. data/gumbo-parser/src/tag.c +2 -1
  57. data/gumbo-parser/src/tag_lookup.c +7 -7
  58. data/gumbo-parser/src/tag_lookup.gperf +1 -0
  59. data/gumbo-parser/src/tag_lookup.h +1 -1
  60. data/gumbo-parser/src/token_buffer.h +1 -1
  61. data/gumbo-parser/src/tokenizer.c +1 -1
  62. data/gumbo-parser/src/tokenizer.h +1 -1
  63. data/gumbo-parser/src/utf8.c +1 -1
  64. data/gumbo-parser/src/utf8.h +1 -1
  65. data/gumbo-parser/src/util.c +1 -3
  66. data/gumbo-parser/src/util.h +4 -0
  67. data/gumbo-parser/src/vector.h +1 -1
  68. data/lib/nokogiri/class_resolver.rb +67 -0
  69. data/lib/nokogiri/css/node.rb +9 -8
  70. data/lib/nokogiri/css/parser.rb +360 -341
  71. data/lib/nokogiri/css/parser.y +249 -244
  72. data/lib/nokogiri/css/parser_extras.rb +22 -20
  73. data/lib/nokogiri/css/syntax_error.rb +1 -0
  74. data/lib/nokogiri/css/tokenizer.rb +4 -3
  75. data/lib/nokogiri/css/tokenizer.rex +3 -2
  76. data/lib/nokogiri/css/xpath_visitor.rb +184 -85
  77. data/lib/nokogiri/css.rb +44 -6
  78. data/lib/nokogiri/decorators/slop.rb +8 -7
  79. data/lib/nokogiri/encoding_handler.rb +57 -0
  80. data/lib/nokogiri/extension.rb +4 -3
  81. data/lib/nokogiri/gumbo.rb +1 -0
  82. data/lib/nokogiri/html.rb +16 -10
  83. data/lib/nokogiri/html4/builder.rb +1 -0
  84. data/lib/nokogiri/html4/document.rb +56 -164
  85. data/lib/nokogiri/html4/document_fragment.rb +11 -7
  86. data/lib/nokogiri/html4/element_description.rb +1 -0
  87. data/lib/nokogiri/html4/element_description_defaults.rb +432 -532
  88. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  89. data/lib/nokogiri/html4/entity_lookup.rb +2 -1
  90. data/lib/nokogiri/html4/sax/parser.rb +5 -2
  91. data/lib/nokogiri/html4/sax/parser_context.rb +1 -0
  92. data/lib/nokogiri/html4/sax/push_parser.rb +7 -7
  93. data/lib/nokogiri/html4.rb +12 -5
  94. data/lib/nokogiri/html5/document.rb +126 -32
  95. data/lib/nokogiri/html5/document_fragment.rb +14 -4
  96. data/lib/nokogiri/html5/node.rb +12 -7
  97. data/lib/nokogiri/html5.rb +138 -222
  98. data/lib/nokogiri/jruby/dependencies.rb +2 -19
  99. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  100. data/lib/nokogiri/syntax_error.rb +1 -0
  101. data/lib/nokogiri/version/constant.rb +2 -1
  102. data/lib/nokogiri/version/info.rb +32 -24
  103. data/lib/nokogiri/version.rb +1 -0
  104. data/lib/nokogiri/xml/attr.rb +54 -3
  105. data/lib/nokogiri/xml/attribute_decl.rb +2 -1
  106. data/lib/nokogiri/xml/builder.rb +35 -33
  107. data/lib/nokogiri/xml/cdata.rb +2 -1
  108. data/lib/nokogiri/xml/character_data.rb +1 -0
  109. data/lib/nokogiri/xml/document.rb +232 -143
  110. data/lib/nokogiri/xml/document_fragment.rb +88 -42
  111. data/lib/nokogiri/xml/dtd.rb +3 -2
  112. data/lib/nokogiri/xml/element_content.rb +1 -0
  113. data/lib/nokogiri/xml/element_decl.rb +2 -1
  114. data/lib/nokogiri/xml/entity_decl.rb +3 -2
  115. data/lib/nokogiri/xml/entity_reference.rb +1 -0
  116. data/lib/nokogiri/xml/namespace.rb +44 -0
  117. data/lib/nokogiri/xml/node/save_options.rb +14 -8
  118. data/lib/nokogiri/xml/node.rb +708 -383
  119. data/lib/nokogiri/xml/node_set.rb +134 -59
  120. data/lib/nokogiri/xml/notation.rb +12 -0
  121. data/lib/nokogiri/xml/parse_options.rb +140 -56
  122. data/lib/nokogiri/xml/pp/character_data.rb +8 -6
  123. data/lib/nokogiri/xml/pp/node.rb +26 -26
  124. data/lib/nokogiri/xml/pp.rb +1 -0
  125. data/lib/nokogiri/xml/processing_instruction.rb +3 -1
  126. data/lib/nokogiri/xml/reader.rb +20 -24
  127. data/lib/nokogiri/xml/relax_ng.rb +1 -0
  128. data/lib/nokogiri/xml/sax/document.rb +20 -19
  129. data/lib/nokogiri/xml/sax/parser.rb +38 -36
  130. data/lib/nokogiri/xml/sax/parser_context.rb +7 -3
  131. data/lib/nokogiri/xml/sax/push_parser.rb +5 -5
  132. data/lib/nokogiri/xml/sax.rb +1 -0
  133. data/lib/nokogiri/xml/schema.rb +7 -6
  134. data/lib/nokogiri/xml/searchable.rb +93 -62
  135. data/lib/nokogiri/xml/syntax_error.rb +5 -4
  136. data/lib/nokogiri/xml/text.rb +1 -0
  137. data/lib/nokogiri/xml/xpath/syntax_error.rb +2 -1
  138. data/lib/nokogiri/xml/xpath.rb +12 -0
  139. data/lib/nokogiri/xml/xpath_context.rb +2 -3
  140. data/lib/nokogiri/xml.rb +4 -3
  141. data/lib/nokogiri/xslt/stylesheet.rb +1 -0
  142. data/lib/nokogiri/xslt.rb +21 -13
  143. data/lib/nokogiri.rb +22 -27
  144. data/lib/xsd/xmlparser/nokogiri.rb +28 -25
  145. data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
  146. data/patches/libxslt/0001-update-automake-files-for-arm64.patch +2445 -1919
  147. data/ports/archives/libxml2-2.10.4.tar.xz +0 -0
  148. data/ports/archives/libxslt-1.1.37.tar.xz +0 -0
  149. metadata +20 -171
  150. data/patches/libxml2/0004-use-glibc-strlen.patch +0 -53
  151. data/patches/libxml2/0005-avoid-isnan-isinf.patch +0 -81
  152. data/patches/libxml2/0006-update-automake-files-for-arm64.patch +0 -2511
  153. data/patches/libxml2/0007-Fix-XPath-recursion-limit.patch +0 -31
  154. data/patches/libxslt/0002-Fix-xml2-config-check-in-configure-script.patch +0 -19
  155. data/ports/archives/libxml2-2.9.12.tar.gz +0 -0
  156. data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
@@ -1,5 +1,6 @@
1
1
  # coding: utf-8
2
2
  # frozen_string_literal: true
3
+
3
4
  #
4
5
  # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
5
6
  #
@@ -16,13 +17,15 @@
16
17
  # limitations under the License.
17
18
  #
18
19
 
19
- require_relative 'html5/document'
20
- require_relative 'html5/document_fragment'
21
- require_relative 'html5/node'
20
+ require_relative "html5/document"
21
+ require_relative "html5/document_fragment"
22
+ require_relative "html5/node"
22
23
 
23
24
  module Nokogiri
24
- # @since v1.12.0
25
- # @note HTML5 functionality is not available when running JRuby.
25
+ # Since v1.12.0
26
+ #
27
+ # ⚠ HTML5 functionality is not available when running JRuby.
28
+ #
26
29
  # Parse an HTML5 document. Convenience method for {Nokogiri::HTML5::Document.parse}
27
30
  def self.HTML5(input, url = nil, encoding = nil, **options, &block)
28
31
  Nokogiri::HTML5::Document.parse(input, url, encoding, **options, &block)
@@ -30,6 +33,8 @@ module Nokogiri
30
33
 
31
34
  # == Usage
32
35
  #
36
+ # ⚠ HTML5 functionality is not available when running JRuby.
37
+ #
33
38
  # Parse an HTML5 document:
34
39
  #
35
40
  # doc = Nokogiri.HTML5(string)
@@ -220,254 +225,165 @@ module Nokogiri
220
225
  # * Instead of returning +unknown+ as the element name for unknown tags, the
221
226
  # original tag name is returned verbatim.
222
227
  #
223
- # @since v1.12.0
224
- # @note HTML5 functionality is not available when running JRuby.
228
+ # Since v1.12.0
225
229
  module HTML5
226
- # HTML uses the XHTML namespace.
227
- HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml'.freeze
228
- MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML'.freeze
229
- SVG_NAMESPACE = 'http://www.w3.org/2000/svg'.freeze
230
- XLINK_NAMESPACE = 'http://www.w3.org/1999/xlink'.freeze
231
- XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace'.freeze
232
- XMLNS_NAMESPACE = 'http://www.w3.org/2000/xmlns/'.freeze
230
+ class << self
231
+ # Parse an HTML 5 document. Convenience method for {Nokogiri::HTML5::Document.parse}
232
+ def parse(string, url = nil, encoding = nil, **options, &block)
233
+ Document.parse(string, url, encoding, **options, &block)
234
+ end
233
235
 
234
- # Parse an HTML 5 document. Convenience method for {Nokogiri::HTML5::Document.parse}
235
- def self.parse(string, url = nil, encoding = nil, **options, &block)
236
- Document.parse(string, url, encoding, **options, &block)
237
- end
236
+ # Parse a fragment from +string+. Convenience method for
237
+ # {Nokogiri::HTML5::DocumentFragment.parse}.
238
+ def fragment(string, encoding = nil, **options)
239
+ DocumentFragment.parse(string, encoding, options)
240
+ end
238
241
 
239
- # Parse a fragment from +string+. Convenience method for
240
- # {Nokogiri::HTML5::DocumentFragment.parse}.
241
- def self.fragment(string, encoding = nil, **options)
242
- DocumentFragment.parse(string, encoding, options)
243
- end
242
+ # Fetch and parse a HTML document from the web, following redirects,
243
+ # handling https, and determining the character encoding using HTML5
244
+ # rules. +uri+ may be a +String+ or a +URI+. +options+ contains
245
+ # http headers and special options. Everything which is not a
246
+ # special option is considered a header. Special options include:
247
+ # * :follow_limit => number of redirects which are followed
248
+ # * :basic_auth => [username, password]
249
+ def get(uri, options = {})
250
+ # TODO: deprecate
251
+ warn("Nokogiri::HTML5.get is deprecated and will be removed in a future version of Nokogiri.",
252
+ uplevel: 1, category: :deprecated)
253
+ get_impl(uri, options)
254
+ end
244
255
 
245
- # Fetch and parse a HTML document from the web, following redirects,
246
- # handling https, and determining the character encoding using HTML5
247
- # rules. +uri+ may be a +String+ or a +URI+. +options+ contains
248
- # http headers and special options. Everything which is not a
249
- # special option is considered a header. Special options include:
250
- # * :follow_limit => number of redirects which are followed
251
- # * :basic_auth => [username, password]
252
- def self.get(uri, options={})
253
- warn("Nokogiri::HTML5.get is deprecated and will be removed in a future version of Nokogiri.",
254
- uplevel: 1, category: :deprecated)
255
- get_impl(uri, options)
256
- end
256
+ # :nodoc:
257
+ def read_and_encode(string, encoding)
258
+ # Read the string with the given encoding.
259
+ if string.respond_to?(:read)
260
+ string = if encoding.nil?
261
+ string.read
262
+ else
263
+ string.read(encoding: encoding)
264
+ end
265
+ else
266
+ # Otherwise the string has the given encoding.
267
+ string = string.to_s
268
+ if encoding
269
+ string = string.dup
270
+ string.force_encoding(encoding)
271
+ end
272
+ end
257
273
 
258
- private
274
+ # convert to UTF-8
275
+ if string.encoding != Encoding::UTF_8
276
+ string = reencode(string)
277
+ end
278
+ string
279
+ end
259
280
 
260
- def self.get_impl(uri, options={})
261
- headers = options.clone
262
- headers = {:follow_limit => headers} if Numeric === headers # deprecated
263
- limit=headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
281
+ private
264
282
 
265
- require 'net/http'
266
- uri = URI(uri) unless URI === uri
283
+ def get_impl(uri, options = {})
284
+ headers = options.clone
285
+ headers = { follow_limit: headers } if Numeric === headers # deprecated
286
+ limit = headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
267
287
 
268
- http = Net::HTTP.new(uri.host, uri.port)
288
+ require "net/http"
289
+ uri = URI(uri) unless URI === uri
269
290
 
270
- # TLS / SSL support
271
- http.use_ssl = true if uri.scheme == 'https'
291
+ http = Net::HTTP.new(uri.host, uri.port)
272
292
 
273
- # Pass through Net::HTTP override values, which currently include:
274
- # :ca_file, :ca_path, :cert, :cert_store, :ciphers,
275
- # :close_on_empty_response, :continue_timeout, :key, :open_timeout,
276
- # :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
277
- # :verify_callback, :verify_depth, :verify_mode
278
- options.each do |key, value|
279
- http.send "#{key}=", headers.delete(key) if http.respond_to? "#{key}="
280
- end
293
+ # TLS / SSL support
294
+ http.use_ssl = true if uri.scheme == "https"
281
295
 
282
- request = Net::HTTP::Get.new(uri.request_uri)
296
+ # Pass through Net::HTTP override values, which currently include:
297
+ # :ca_file, :ca_path, :cert, :cert_store, :ciphers,
298
+ # :close_on_empty_response, :continue_timeout, :key, :open_timeout,
299
+ # :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
300
+ # :verify_callback, :verify_depth, :verify_mode
301
+ options.each do |key, _value|
302
+ http.send("#{key}=", headers.delete(key)) if http.respond_to?("#{key}=")
303
+ end
283
304
 
284
- # basic authentication
285
- auth = headers.delete(:basic_auth)
286
- auth ||= [uri.user, uri.password] if uri.user && uri.password
287
- request.basic_auth auth.first, auth.last if auth
305
+ request = Net::HTTP::Get.new(uri.request_uri)
288
306
 
289
- # remaining options are treated as headers
290
- headers.each {|key, value| request[key.to_s] = value.to_s}
307
+ # basic authentication
308
+ auth = headers.delete(:basic_auth)
309
+ auth ||= [uri.user, uri.password] if uri.user && uri.password
310
+ request.basic_auth(auth.first, auth.last) if auth
291
311
 
292
- response = http.request(request)
312
+ # remaining options are treated as headers
313
+ headers.each { |key, value| request[key.to_s] = value.to_s }
293
314
 
294
- case response
295
- when Net::HTTPSuccess
296
- doc = parse(reencode(response.body, response['content-type']), options)
297
- doc.instance_variable_set('@response', response)
298
- doc.class.send(:attr_reader, :response)
299
- doc
300
- when Net::HTTPRedirection
301
- response.value if limit <= 1
302
- location = URI.join(uri, response['location'])
303
- get_impl(location, options.merge(:follow_limit => limit-1))
304
- else
305
- response.value
306
- end
307
- end
315
+ response = http.request(request)
308
316
 
309
- def self.read_and_encode(string, encoding)
310
- # Read the string with the given encoding.
311
- if string.respond_to?(:read)
312
- if encoding.nil?
313
- string = string.read
317
+ case response
318
+ when Net::HTTPSuccess
319
+ doc = parse(reencode(response.body, response["content-type"]), options)
320
+ doc.instance_variable_set(:@response, response)
321
+ doc.class.send(:attr_reader, :response)
322
+ doc
323
+ when Net::HTTPRedirection
324
+ response.value if limit <= 1
325
+ location = URI.join(uri, response["location"])
326
+ get_impl(location, options.merge(follow_limit: limit - 1))
314
327
  else
315
- string = string.read(encoding: encoding)
316
- end
317
- else
318
- # Otherwise the string has the given encoding.
319
- string = string.to_s
320
- if encoding
321
- string = string.dup
322
- string.force_encoding(encoding)
328
+ response.value
323
329
  end
324
330
  end
325
331
 
326
- # convert to UTF-8
327
- if string.encoding != Encoding::UTF_8
328
- string = reencode(string)
329
- end
330
- string
331
- end
332
-
333
- # Charset sniffing is a complex and controversial topic that understandably isn't done _by
334
- # default_ by the Ruby Net::HTTP library. This being said, it is a very real problem for
335
- # consumers of HTML as the default for HTML is iso-8859-1, most "good" producers use utf-8, and
336
- # the Gumbo parser *only* supports utf-8.
337
- #
338
- # Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection. Following
339
- # this lead, Nokogiri::HTML5 attempts to do likewise, while attempting to more closely follow
340
- # the HTML5 standard.
341
- #
342
- # http://bugs.ruby-lang.org/issues/2567
343
- # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
344
- #
345
- def self.reencode(body, content_type=nil)
346
- if body.encoding == Encoding::ASCII_8BIT
347
- encoding = nil
332
+ # Charset sniffing is a complex and controversial topic that understandably isn't done _by
333
+ # default_ by the Ruby Net::HTTP library. This being said, it is a very real problem for
334
+ # consumers of HTML as the default for HTML is iso-8859-1, most "good" producers use utf-8, and
335
+ # the Gumbo parser *only* supports utf-8.
336
+ #
337
+ # Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection. Following
338
+ # this lead, Nokogiri::HTML5 attempts to do likewise, while attempting to more closely follow
339
+ # the HTML5 standard.
340
+ #
341
+ # http://bugs.ruby-lang.org/issues/2567
342
+ # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
343
+ #
344
+ def reencode(body, content_type = nil)
345
+ if body.encoding == Encoding::ASCII_8BIT
346
+ encoding = nil
348
347
 
349
- # look for a Byte Order Mark (BOM)
350
- initial_bytes = body[0..2].bytes
351
- if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
352
- encoding = Encoding::UTF_8
353
- elsif initial_bytes[0..1] == [0xFE, 0xFF]
354
- encoding = Encoding::UTF_16BE
355
- elsif initial_bytes[0..1] == [0xFF, 0xFE]
356
- encoding = Encoding::UTF_16LE
357
- end
358
-
359
- # look for a charset in a content-encoding header
360
- if content_type
361
- encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
362
- end
363
-
364
- # look for a charset in a meta tag in the first 1024 bytes
365
- if not encoding
366
- data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, '')
367
- data.scan(/<meta.*?>/m).each do |meta|
368
- encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
348
+ # look for a Byte Order Mark (BOM)
349
+ initial_bytes = body[0..2].bytes
350
+ if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
351
+ encoding = Encoding::UTF_8
352
+ elsif initial_bytes[0..1] == [0xFE, 0xFF]
353
+ encoding = Encoding::UTF_16BE
354
+ elsif initial_bytes[0..1] == [0xFF, 0xFE]
355
+ encoding = Encoding::UTF_16LE
369
356
  end
370
- end
371
-
372
- # if all else fails, default to the official default encoding for HTML
373
- encoding ||= Encoding::ISO_8859_1
374
357
 
375
- # change the encoding to match the detected or inferred encoding
376
- body = body.dup
377
- begin
378
- body.force_encoding(encoding)
379
- rescue ArgumentError
380
- body.force_encoding(Encoding::ISO_8859_1)
381
- end
382
- end
383
-
384
- body.encode(Encoding::UTF_8)
385
- end
358
+ # look for a charset in a content-encoding header
359
+ if content_type
360
+ encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
361
+ end
386
362
 
387
- def self.serialize_node_internal(current_node, io, encoding, options)
388
- case current_node.type
389
- when XML::Node::ELEMENT_NODE
390
- ns = current_node.namespace
391
- ns_uri = ns.nil? ? nil : ns.href
392
- # XXX(sfc): attach namespaces to all nodes, even html?
393
- if ns_uri.nil? || ns_uri == HTML_NAMESPACE || ns_uri == MATHML_NAMESPACE || ns_uri == SVG_NAMESPACE
394
- tagname = current_node.name
395
- else
396
- tagname = "#{ns.prefix}:#{current_node.name}"
397
- end
398
- io << '<' << tagname
399
- current_node.attribute_nodes.each do |attr|
400
- attr_ns = attr.namespace
401
- if attr_ns.nil?
402
- attr_name = attr.name
403
- else
404
- ns_uri = attr_ns.href
405
- if ns_uri == XML_NAMESPACE
406
- attr_name = 'xml:' + attr.name.sub(/^[^:]*:/, '')
407
- elsif ns_uri == XMLNS_NAMESPACE && attr.name.sub(/^[^:]*:/, '') == 'xmlns'
408
- attr_name = 'xmlns'
409
- elsif ns_uri == XMLNS_NAMESPACE
410
- attr_name = 'xmlns:' + attr.name.sub(/^[^:]*:/, '')
411
- elsif ns_uri == XLINK_NAMESPACE
412
- attr_name = 'xlink:' + attr.name.sub(/^[^:]*:/, '')
413
- else
414
- attr_name = "#{attr_ns.prefix}:#{attr.name}"
363
+ # look for a charset in a meta tag in the first 1024 bytes
364
+ unless encoding
365
+ data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, "")
366
+ data.scan(/<meta.*?>/im).each do |meta|
367
+ encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
415
368
  end
416
369
  end
417
- io << ' ' << attr_name << '="' << escape_text(attr.content, encoding, true) << '"'
418
- end
419
- io << '>'
420
- if !%w[area base basefont bgsound br col embed frame hr img input keygen
421
- link meta param source track wbr].include?(current_node.name)
422
- io << "\n" if options[:preserve_newline] && prepend_newline?(current_node)
423
- current_node.children.each do |child|
424
- # XXX(sfc): Templates handled specially?
425
- serialize_node_internal(child, io, encoding, options)
370
+
371
+ # if all else fails, default to the official default encoding for HTML
372
+ encoding ||= Encoding::ISO_8859_1
373
+
374
+ # change the encoding to match the detected or inferred encoding
375
+ body = body.dup
376
+ begin
377
+ body.force_encoding(encoding)
378
+ rescue ArgumentError
379
+ body.force_encoding(Encoding::ISO_8859_1)
426
380
  end
427
- io << '</' << tagname << '>'
428
- end
429
- when XML::Node::TEXT_NODE
430
- parent = current_node.parent
431
- if parent.element? && %w[style script xmp iframe noembed noframes plaintext noscript].include?(parent.name)
432
- io << current_node.content
433
- else
434
- io << escape_text(current_node.content, encoding, false)
435
- end
436
- when XML::Node::CDATA_SECTION_NODE
437
- io << '<![CDATA[' << current_node.content << ']]>'
438
- when XML::Node::COMMENT_NODE
439
- io << '<!--' << current_node.content << '-->'
440
- when XML::Node::PI_NODE
441
- io << '<?' << current_node.content << '>'
442
- when XML::Node::DOCUMENT_TYPE_NODE, XML::Node::DTD_NODE
443
- io << '<!DOCTYPE ' << current_node.name << '>'
444
- when XML::Node::HTML_DOCUMENT_NODE, XML::Node::DOCUMENT_FRAG_NODE
445
- current_node.children.each do |child|
446
- serialize_node_internal(child, io, encoding, options)
447
381
  end
448
- else
449
- raise "Unexpected node '#{current_node.name}' of type #{current_node.type}"
450
- end
451
- end
452
382
 
453
- def self.escape_text(text, encoding, attribute_mode)
454
- if attribute_mode
455
- text = text.gsub(/[&\u00a0"]/,
456
- '&' => '&amp;', "\u00a0" => '&nbsp;', '"' => '&quot;')
457
- else
458
- text = text.gsub(/[&\u00a0<>]/,
459
- '&' => '&amp;', "\u00a0" => '&nbsp;', '<' => '&lt;', '>' => '&gt;')
383
+ body.encode(Encoding::UTF_8)
460
384
  end
461
- # Not part of the standard
462
- text.encode(encoding, fallback: lambda { |c| "&\#x#{c.ord.to_s(16)};" })
463
- end
464
-
465
- def self.prepend_newline?(node)
466
- return false unless %w[pre textarea listing].include?(node.name) && !node.children.empty?
467
- first_child = node.children[0]
468
- first_child.text? && first_child.content.start_with?("\n")
469
385
  end
470
386
  end
471
387
  end
472
388
 
473
- require_relative 'gumbo'
389
+ require_relative "gumbo"
@@ -1,20 +1,3 @@
1
1
  # frozen_string_literal: true
2
- # The line below caused a problem on non-GAE rack environment.
3
- # unless defined?(JRuby::Rack::VERSION) || defined?(AppEngine::ApiProxy)
4
- #
5
- # However, simply cutting defined?(JRuby::Rack::VERSION) off resulted in
6
- # an unable-to-load-nokogiri problem. Thus, now, Nokogiri checks the presense
7
- # of appengine-rack.jar in $LOAD_PATH. If Nokogiri is on GAE, Nokogiri
8
- # should skip loading xml jars. This is because those are in WEB-INF/lib and
9
- # already set in the classpath.
10
- unless $LOAD_PATH.to_s.include?("appengine-rack")
11
- require 'stringio'
12
- require 'isorelax.jar'
13
- require 'jing.jar'
14
- require 'nekohtml.jar'
15
- require 'nekodtd.jar'
16
- require 'xercesImpl.jar'
17
- require 'serializer.jar'
18
- require 'xalan.jar'
19
- require 'xml-apis.jar'
20
- end
2
+
3
+ require_relative "nokogiri_jars"
@@ -0,0 +1,43 @@
1
+ # this is a generated file, to avoid over-writing it just delete this comment
2
+ begin
3
+ require 'jar_dependencies'
4
+ rescue LoadError
5
+ require 'xalan/xalan/2.7.2/xalan-2.7.2.jar'
6
+ require 'net/sourceforge/htmlunit/neko-htmlunit/2.63.0/neko-htmlunit-2.63.0.jar'
7
+ require 'nu/validator/jing/20200702VNU/jing-20200702VNU.jar'
8
+ require 'xerces/xercesImpl/2.12.2/xercesImpl-2.12.2.jar'
9
+ require 'org/nokogiri/nekodtd/0.1.11.noko1/nekodtd-0.1.11.noko1.jar'
10
+ require 'net/sf/saxon/Saxon-HE/9.6.0-4/Saxon-HE-9.6.0-4.jar'
11
+ require 'xml-apis/xml-apis/1.4.01/xml-apis-1.4.01.jar'
12
+ require 'xalan/serializer/2.7.2/serializer-2.7.2.jar'
13
+ require 'isorelax/isorelax/20030108/isorelax-20030108.jar'
14
+ end
15
+
16
+ if defined? Jars
17
+ require_jar 'xalan', 'xalan', '2.7.2'
18
+ require_jar 'net.sourceforge.htmlunit', 'neko-htmlunit', '2.63.0'
19
+ require_jar 'nu.validator', 'jing', '20200702VNU'
20
+ require_jar 'xerces', 'xercesImpl', '2.12.2'
21
+ require_jar 'org.nokogiri', 'nekodtd', '0.1.11.noko1'
22
+ require_jar 'net.sf.saxon', 'Saxon-HE', '9.6.0-4'
23
+ require_jar 'xml-apis', 'xml-apis', '1.4.01'
24
+ require_jar 'xalan', 'serializer', '2.7.2'
25
+ require_jar 'isorelax', 'isorelax', '20030108'
26
+ end
27
+
28
+ module Nokogiri
29
+ # generated by the :vendor_jars rake task
30
+ JAR_DEPENDENCIES = {
31
+ "isorelax:isorelax" => "20030108",
32
+ "net.sf.saxon:Saxon-HE" => "9.6.0-4",
33
+ "net.sourceforge.htmlunit:neko-htmlunit" => "2.63.0",
34
+ "nu.validator:jing" => "20200702VNU",
35
+ "org.nokogiri:nekodtd" => "0.1.11.noko1",
36
+ "xalan:serializer" => "2.7.2",
37
+ "xalan:xalan" => "2.7.2",
38
+ "xerces:xercesImpl" => "2.12.2",
39
+ "xml-apis:xml-apis" => "1.4.01",
40
+ }.freeze
41
+ XERCES_VERSION = JAR_DEPENDENCIES["xerces:xercesImpl"]
42
+ NEKO_VERSION = JAR_DEPENDENCIES["net.sourceforge.htmlunit:neko-htmlunit"]
43
+ end
@@ -1,4 +1,5 @@
1
1
  # frozen_string_literal: true
2
+
2
3
  module Nokogiri
3
4
  class SyntaxError < ::StandardError
4
5
  end
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
+
2
3
  module Nokogiri
3
4
  # The version of Nokogiri you are using
4
- VERSION = "1.12.5"
5
+ VERSION = "1.14.3"
5
6
  end
@@ -1,4 +1,5 @@
1
1
  # frozen_string_literal: true
2
+
2
3
  require "singleton"
3
4
  require "shellwords"
4
5
 
@@ -102,18 +103,18 @@ module Nokogiri
102
103
  ldflags = []
103
104
 
104
105
  if libxml2_using_packaged?
105
- cppflags << "-I#{File.join(header_directory, 'include').shellescape}"
106
- cppflags << "-I#{File.join(header_directory, 'include/libxml2').shellescape}"
107
-
108
- if windows?
109
- # on windows, nokogumbo needs to link against nokogiri.so to resolve symbols. see #2167
110
- lib_directory = File.expand_path(File.join(File.dirname(__FILE__), "../#{ruby_minor}"))
111
- unless File.exist?(lib_directory)
112
- lib_directory = File.expand_path(File.join(File.dirname(__FILE__), ".."))
113
- end
114
- ldflags << "-L#{lib_directory.shellescape}"
115
- ldflags << "-l:nokogiri.so"
106
+ cppflags << "-I#{File.join(header_directory, "include").shellescape}"
107
+ cppflags << "-I#{File.join(header_directory, "include/libxml2").shellescape}"
108
+ end
109
+
110
+ if windows?
111
+ # on windows, nokogumbo needs to link against nokogiri.so to resolve symbols. see #2167
112
+ lib_directory = File.expand_path(File.join(File.dirname(__FILE__), "../#{ruby_minor}"))
113
+ unless File.exist?(lib_directory)
114
+ lib_directory = File.expand_path(File.join(File.dirname(__FILE__), ".."))
116
115
  end
116
+ ldflags << "-L#{lib_directory.shellescape}"
117
+ ldflags << "-l:nokogiri.so"
117
118
  end
118
119
 
119
120
  nokogiri["cppflags"] = cppflags
@@ -168,21 +169,18 @@ module Nokogiri
168
169
  vi["other_libraries"] = Hash[*Nokogiri::OTHER_LIBRARY_VERSIONS.split(/[,:]/)]
169
170
  elsif jruby?
170
171
  vi["other_libraries"] = {}.tap do |ol|
171
- ol["xerces"] = Nokogiri::XERCES_VERSION
172
- ol["nekohtml"] = Nokogiri::NEKO_VERSION
172
+ Nokogiri::JAR_DEPENDENCIES.each do |k, v|
173
+ ol[k] = v
174
+ end
173
175
  end
174
176
  end
175
177
  end
176
178
  end
177
179
 
178
180
  def to_markdown
179
- begin
180
- require "psych"
181
- rescue LoadError
182
- end
183
181
  require "yaml"
184
182
  "# Nokogiri (#{Nokogiri::VERSION})\n" +
185
- YAML.dump(to_hash).each_line.map { |line| " #{line}" }.join
183
+ YAML.dump(to_hash).each_line.map { |line| " #{line}" }.join
186
184
  end
187
185
 
188
186
  instance.warnings.each do |warning|
@@ -190,26 +188,36 @@ module Nokogiri
190
188
  end
191
189
  end
192
190
 
193
- def self.uses_libxml?(requirement = nil) # :nodoc:
191
+ # :nodoc:
192
+ def self.uses_libxml?(requirement = nil)
194
193
  return false unless VersionInfo.instance.libxml2?
195
194
  return true unless requirement
195
+
196
196
  Gem::Requirement.new(requirement).satisfied_by?(VersionInfo.instance.loaded_libxml_version)
197
197
  end
198
198
 
199
+ # :nodoc:
199
200
  def self.uses_gumbo?
200
201
  uses_libxml? # TODO: replace with Gumbo functionality
201
202
  end
202
203
 
203
- def self.jruby? # :nodoc:
204
+ # :nodoc:
205
+ def self.jruby?
204
206
  VersionInfo.instance.jruby?
205
207
  end
206
208
 
207
- # Ensure constants used in this file are loaded - see #1896
208
- if Nokogiri.jruby?
209
- require_relative "../jruby/dependencies"
209
+ # :nodoc:
210
+ def self.libxml2_patches
211
+ if VersionInfo.instance.libxml2_using_packaged?
212
+ Nokogiri::VERSION_INFO["libxml"]["patches"]
213
+ else
214
+ []
215
+ end
210
216
  end
217
+
218
+ require_relative "../jruby/dependencies" if Nokogiri.jruby?
211
219
  require_relative "../extension"
212
220
 
213
- # More complete version information about libxml
221
+ # Detailed version info about Nokogiri and the installed extension dependencies.
214
222
  VERSION_INFO = VersionInfo.instance.to_hash
215
223
  end
@@ -1,3 +1,4 @@
1
1
  # frozen_string_literal: true
2
+
2
3
  require_relative "version/constant"
3
4
  require_relative "version/info"