nokogiri 1.13.8 → 1.15.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (120)
  1. checksums.yaml +4 -4
  2. data/Gemfile +40 -0
  3. data/LICENSE-DEPENDENCIES.md +830 -509
  4. data/LICENSE.md +1 -1
  5. data/README.md +18 -11
  6. data/dependencies.yml +33 -15
  7. data/ext/nokogiri/extconf.rb +164 -46
  8. data/ext/nokogiri/gumbo.c +20 -10
  9. data/ext/nokogiri/html4_document.c +3 -4
  10. data/ext/nokogiri/html4_element_description.c +20 -15
  11. data/ext/nokogiri/html4_entity_lookup.c +2 -2
  12. data/ext/nokogiri/html4_sax_parser_context.c +11 -22
  13. data/ext/nokogiri/html4_sax_push_parser.c +3 -3
  14. data/ext/nokogiri/nokogiri.c +84 -75
  15. data/ext/nokogiri/nokogiri.h +31 -16
  16. data/ext/nokogiri/test_global_handlers.c +1 -1
  17. data/ext/nokogiri/xml_attr.c +2 -2
  18. data/ext/nokogiri/xml_attribute_decl.c +2 -2
  19. data/ext/nokogiri/xml_cdata.c +32 -18
  20. data/ext/nokogiri/xml_comment.c +2 -2
  21. data/ext/nokogiri/xml_document.c +127 -34
  22. data/ext/nokogiri/xml_document_fragment.c +2 -2
  23. data/ext/nokogiri/xml_dtd.c +2 -2
  24. data/ext/nokogiri/xml_element_content.c +34 -31
  25. data/ext/nokogiri/xml_element_decl.c +7 -7
  26. data/ext/nokogiri/xml_encoding_handler.c +15 -7
  27. data/ext/nokogiri/xml_entity_decl.c +1 -1
  28. data/ext/nokogiri/xml_entity_reference.c +2 -2
  29. data/ext/nokogiri/xml_namespace.c +79 -14
  30. data/ext/nokogiri/xml_node.c +300 -34
  31. data/ext/nokogiri/xml_node_set.c +125 -107
  32. data/ext/nokogiri/xml_processing_instruction.c +2 -2
  33. data/ext/nokogiri/xml_reader.c +81 -48
  34. data/ext/nokogiri/xml_relax_ng.c +66 -81
  35. data/ext/nokogiri/xml_sax_parser.c +45 -20
  36. data/ext/nokogiri/xml_sax_parser_context.c +46 -30
  37. data/ext/nokogiri/xml_sax_push_parser.c +30 -11
  38. data/ext/nokogiri/xml_schema.c +95 -117
  39. data/ext/nokogiri/xml_syntax_error.c +1 -1
  40. data/ext/nokogiri/xml_text.c +28 -14
  41. data/ext/nokogiri/xml_xpath_context.c +216 -136
  42. data/ext/nokogiri/xslt_stylesheet.c +118 -64
  43. data/gumbo-parser/Makefile +10 -0
  44. data/gumbo-parser/src/attribute.h +1 -1
  45. data/gumbo-parser/src/error.c +10 -6
  46. data/gumbo-parser/src/error.h +1 -1
  47. data/gumbo-parser/src/foreign_attrs.c +15 -16
  48. data/gumbo-parser/src/foreign_attrs.gperf +1 -1
  49. data/gumbo-parser/src/{gumbo.h → nokogiri_gumbo.h} +1 -0
  50. data/gumbo-parser/src/parser.c +21 -5
  51. data/gumbo-parser/src/replacement.h +1 -1
  52. data/gumbo-parser/src/string_buffer.h +1 -1
  53. data/gumbo-parser/src/string_piece.c +1 -1
  54. data/gumbo-parser/src/svg_attrs.c +2 -2
  55. data/gumbo-parser/src/svg_tags.c +2 -2
  56. data/gumbo-parser/src/tag.c +2 -1
  57. data/gumbo-parser/src/tag_lookup.c +7 -7
  58. data/gumbo-parser/src/tag_lookup.gperf +1 -0
  59. data/gumbo-parser/src/tag_lookup.h +1 -1
  60. data/gumbo-parser/src/token_buffer.h +1 -1
  61. data/gumbo-parser/src/tokenizer.c +1 -1
  62. data/gumbo-parser/src/tokenizer.h +1 -1
  63. data/gumbo-parser/src/utf8.c +1 -1
  64. data/gumbo-parser/src/utf8.h +1 -1
  65. data/gumbo-parser/src/util.c +1 -3
  66. data/gumbo-parser/src/util.h +4 -0
  67. data/gumbo-parser/src/vector.h +1 -1
  68. data/lib/nokogiri/css/node.rb +2 -2
  69. data/lib/nokogiri/css/xpath_visitor.rb +7 -5
  70. data/lib/nokogiri/css.rb +6 -0
  71. data/lib/nokogiri/decorators/slop.rb +1 -1
  72. data/lib/nokogiri/encoding_handler.rb +57 -0
  73. data/lib/nokogiri/extension.rb +4 -3
  74. data/lib/nokogiri/html4/document.rb +2 -121
  75. data/lib/nokogiri/html4/document_fragment.rb +1 -1
  76. data/lib/nokogiri/html4/element_description_defaults.rb +1827 -365
  77. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  78. data/lib/nokogiri/html4.rb +1 -0
  79. data/lib/nokogiri/html5/document.rb +113 -36
  80. data/lib/nokogiri/html5/document_fragment.rb +10 -3
  81. data/lib/nokogiri/html5/node.rb +8 -5
  82. data/lib/nokogiri/html5.rb +130 -216
  83. data/lib/nokogiri/jruby/dependencies.rb +1 -19
  84. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  85. data/lib/nokogiri/version/constant.rb +1 -1
  86. data/lib/nokogiri/version/info.rb +11 -10
  87. data/lib/nokogiri/xml/attr.rb +49 -0
  88. data/lib/nokogiri/xml/attribute_decl.rb +4 -2
  89. data/lib/nokogiri/xml/builder.rb +1 -1
  90. data/lib/nokogiri/xml/document.rb +102 -55
  91. data/lib/nokogiri/xml/document_fragment.rb +50 -7
  92. data/lib/nokogiri/xml/element_content.rb +10 -2
  93. data/lib/nokogiri/xml/element_decl.rb +4 -2
  94. data/lib/nokogiri/xml/entity_decl.rb +4 -2
  95. data/lib/nokogiri/xml/namespace.rb +42 -0
  96. data/lib/nokogiri/xml/node/save_options.rb +14 -4
  97. data/lib/nokogiri/xml/node.rb +212 -48
  98. data/lib/nokogiri/xml/node_set.rb +88 -9
  99. data/lib/nokogiri/xml/parse_options.rb +129 -50
  100. data/lib/nokogiri/xml/pp/node.rb +28 -15
  101. data/lib/nokogiri/xml/processing_instruction.rb +2 -1
  102. data/lib/nokogiri/xml/sax/document.rb +1 -1
  103. data/lib/nokogiri/xml/sax/parser.rb +2 -3
  104. data/lib/nokogiri/xml/searchable.rb +18 -10
  105. data/lib/nokogiri/xslt.rb +74 -4
  106. data/lib/nokogiri.rb +15 -15
  107. data/lib/xsd/xmlparser/nokogiri.rb +4 -2
  108. data/patches/libxml2/0010-update-config.guess-and-config.sub-for-libxml2.patch +224 -0
  109. data/patches/libxml2/0011-rip-out-libxml2-s-libc_single_threaded-support.patch +30 -0
  110. data/patches/libxslt/0001-update-config.guess-and-config.sub-for-libxslt.patch +224 -0
  111. data/ports/archives/libxml2-2.11.7.tar.xz +0 -0
  112. data/ports/archives/libxslt-1.1.39.tar.xz +0 -0
  113. metadata +19 -242
  114. data/patches/libxml2/0004-use-glibc-strlen.patch +0 -53
  115. data/patches/libxml2/0005-avoid-isnan-isinf.patch +0 -81
  116. data/patches/libxml2/0006-update-automake-files-for-arm64.patch +0 -3040
  117. data/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch +0 -61
  118. data/patches/libxslt/0001-update-automake-files-for-arm64.patch +0 -3037
  119. data/ports/archives/libxml2-2.9.14.tar.xz +0 -0
  120. data/ports/archives/libxslt-1.1.35.tar.xz +0 -0
@@ -227,250 +227,164 @@ module Nokogiri
227
227
  #
228
228
  # Since v1.12.0
229
229
  module HTML5
230
- # HTML uses the XHTML namespace.
231
- HTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
232
- MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML"
233
- SVG_NAMESPACE = "http://www.w3.org/2000/svg"
234
- XLINK_NAMESPACE = "http://www.w3.org/1999/xlink"
235
- XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
236
- XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/"
230
+ class << self
231
+ # Parse an HTML 5 document. Convenience method for {Nokogiri::HTML5::Document.parse}
232
+ def parse(string, url = nil, encoding = nil, **options, &block)
233
+ Document.parse(string, url, encoding, **options, &block)
234
+ end
237
235
 
238
- # Parse an HTML 5 document. Convenience method for {Nokogiri::HTML5::Document.parse}
239
- def self.parse(string, url = nil, encoding = nil, **options, &block)
240
- Document.parse(string, url, encoding, **options, &block)
241
- end
236
+ # Parse a fragment from +string+. Convenience method for
237
+ # {Nokogiri::HTML5::DocumentFragment.parse}.
238
+ def fragment(string, encoding = nil, **options)
239
+ DocumentFragment.parse(string, encoding, options)
240
+ end
242
241
 
243
- # Parse a fragment from +string+. Convenience method for
244
- # {Nokogiri::HTML5::DocumentFragment.parse}.
245
- def self.fragment(string, encoding = nil, **options)
246
- DocumentFragment.parse(string, encoding, options)
247
- end
242
+ # Fetch and parse a HTML document from the web, following redirects,
243
+ # handling https, and determining the character encoding using HTML5
244
+ # rules. +uri+ may be a +String+ or a +URI+. +options+ contains
245
+ # http headers and special options. Everything which is not a
246
+ # special option is considered a header. Special options include:
247
+ # * :follow_limit => number of redirects which are followed
248
+ # * :basic_auth => [username, password]
249
+ def get(uri, options = {})
250
+ # TODO: deprecate
251
+ warn(
252
+ "Nokogiri::HTML5.get is deprecated and will be removed in a future version of Nokogiri.",
253
+ uplevel: 1,
254
+ category: :deprecated,
255
+ )
256
+ get_impl(uri, options)
257
+ end
248
258
 
249
- # Fetch and parse a HTML document from the web, following redirects,
250
- # handling https, and determining the character encoding using HTML5
251
- # rules. +uri+ may be a +String+ or a +URI+. +options+ contains
252
- # http headers and special options. Everything which is not a
253
- # special option is considered a header. Special options include:
254
- # * :follow_limit => number of redirects which are followed
255
- # * :basic_auth => [username, password]
256
- def self.get(uri, options = {})
257
- # TODO: deprecate
258
- warn("Nokogiri::HTML5.get is deprecated and will be removed in a future version of Nokogiri.",
259
- uplevel: 1, category: :deprecated)
260
- get_impl(uri, options)
261
- end
259
+ # :nodoc:
260
+ def read_and_encode(string, encoding)
261
+ # Read the string with the given encoding.
262
+ if string.respond_to?(:read)
263
+ string = if encoding.nil?
264
+ string.read
265
+ else
266
+ string.read(encoding: encoding)
267
+ end
268
+ else
269
+ # Otherwise the string has the given encoding.
270
+ string = string.to_s
271
+ if encoding
272
+ string = string.dup
273
+ string.force_encoding(encoding)
274
+ end
275
+ end
262
276
 
263
- private
277
+ # convert to UTF-8
278
+ if string.encoding != Encoding::UTF_8
279
+ string = reencode(string)
280
+ end
281
+ string
282
+ end
264
283
 
265
- def self.get_impl(uri, options = {})
266
- headers = options.clone
267
- headers = { follow_limit: headers } if Numeric === headers # deprecated
268
- limit = headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
284
+ private
269
285
 
270
- require "net/http"
271
- uri = URI(uri) unless URI === uri
286
+ def get_impl(uri, options = {})
287
+ headers = options.clone
288
+ headers = { follow_limit: headers } if Numeric === headers # deprecated
289
+ limit = headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
272
290
 
273
- http = Net::HTTP.new(uri.host, uri.port)
291
+ require "net/http"
292
+ uri = URI(uri) unless URI === uri
274
293
 
275
- # TLS / SSL support
276
- http.use_ssl = true if uri.scheme == "https"
294
+ http = Net::HTTP.new(uri.host, uri.port)
277
295
 
278
- # Pass through Net::HTTP override values, which currently include:
279
- # :ca_file, :ca_path, :cert, :cert_store, :ciphers,
280
- # :close_on_empty_response, :continue_timeout, :key, :open_timeout,
281
- # :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
282
- # :verify_callback, :verify_depth, :verify_mode
283
- options.each do |key, _value|
284
- http.send("#{key}=", headers.delete(key)) if http.respond_to?("#{key}=")
285
- end
296
+ # TLS / SSL support
297
+ http.use_ssl = true if uri.scheme == "https"
286
298
 
287
- request = Net::HTTP::Get.new(uri.request_uri)
299
+ # Pass through Net::HTTP override values, which currently include:
300
+ # :ca_file, :ca_path, :cert, :cert_store, :ciphers,
301
+ # :close_on_empty_response, :continue_timeout, :key, :open_timeout,
302
+ # :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
303
+ # :verify_callback, :verify_depth, :verify_mode
304
+ options.each do |key, _value|
305
+ http.send("#{key}=", headers.delete(key)) if http.respond_to?("#{key}=")
306
+ end
288
307
 
289
- # basic authentication
290
- auth = headers.delete(:basic_auth)
291
- auth ||= [uri.user, uri.password] if uri.user && uri.password
292
- request.basic_auth(auth.first, auth.last) if auth
308
+ request = Net::HTTP::Get.new(uri.request_uri)
293
309
 
294
- # remaining options are treated as headers
295
- headers.each { |key, value| request[key.to_s] = value.to_s }
310
+ # basic authentication
311
+ auth = headers.delete(:basic_auth)
312
+ auth ||= [uri.user, uri.password] if uri.user && uri.password
313
+ request.basic_auth(auth.first, auth.last) if auth
296
314
 
297
- response = http.request(request)
315
+ # remaining options are treated as headers
316
+ headers.each { |key, value| request[key.to_s] = value.to_s }
298
317
 
299
- case response
300
- when Net::HTTPSuccess
301
- doc = parse(reencode(response.body, response["content-type"]), options)
302
- doc.instance_variable_set("@response", response)
303
- doc.class.send(:attr_reader, :response)
304
- doc
305
- when Net::HTTPRedirection
306
- response.value if limit <= 1
307
- location = URI.join(uri, response["location"])
308
- get_impl(location, options.merge(follow_limit: limit - 1))
309
- else
310
- response.value
311
- end
312
- end
318
+ response = http.request(request)
313
319
 
314
- def self.read_and_encode(string, encoding)
315
- # Read the string with the given encoding.
316
- if string.respond_to?(:read)
317
- string = if encoding.nil?
318
- string.read
320
+ case response
321
+ when Net::HTTPSuccess
322
+ doc = parse(reencode(response.body, response["content-type"]), options)
323
+ doc.instance_variable_set(:@response, response)
324
+ doc.class.send(:attr_reader, :response)
325
+ doc
326
+ when Net::HTTPRedirection
327
+ response.value if limit <= 1
328
+ location = URI.join(uri, response["location"])
329
+ get_impl(location, options.merge(follow_limit: limit - 1))
319
330
  else
320
- string.read(encoding: encoding)
321
- end
322
- else
323
- # Otherwise the string has the given encoding.
324
- string = string.to_s
325
- if encoding
326
- string = string.dup
327
- string.force_encoding(encoding)
331
+ response.value
328
332
  end
329
333
  end
330
334
 
331
- # convert to UTF-8
332
- if string.encoding != Encoding::UTF_8
333
- string = reencode(string)
334
- end
335
- string
336
- end
337
-
338
- # Charset sniffing is a complex and controversial topic that understandably isn't done _by
339
- # default_ by the Ruby Net::HTTP library. This being said, it is a very real problem for
340
- # consumers of HTML as the default for HTML is iso-8859-1, most "good" producers use utf-8, and
341
- # the Gumbo parser *only* supports utf-8.
342
- #
343
- # Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection. Following
344
- # this lead, Nokogiri::HTML5 attempts to do likewise, while attempting to more closely follow
345
- # the HTML5 standard.
346
- #
347
- # http://bugs.ruby-lang.org/issues/2567
348
- # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
349
- #
350
- def self.reencode(body, content_type = nil)
351
- if body.encoding == Encoding::ASCII_8BIT
352
- encoding = nil
353
-
354
- # look for a Byte Order Mark (BOM)
355
- initial_bytes = body[0..2].bytes
356
- if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
357
- encoding = Encoding::UTF_8
358
- elsif initial_bytes[0..1] == [0xFE, 0xFF]
359
- encoding = Encoding::UTF_16BE
360
- elsif initial_bytes[0..1] == [0xFF, 0xFE]
361
- encoding = Encoding::UTF_16LE
362
- end
363
-
364
- # look for a charset in a content-encoding header
365
- if content_type
366
- encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
367
- end
368
-
369
- # look for a charset in a meta tag in the first 1024 bytes
370
- unless encoding
371
- data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, "")
372
- data.scan(/<meta.*?>/m).each do |meta|
373
- encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
335
+ # Charset sniffing is a complex and controversial topic that understandably isn't done _by
336
+ # default_ by the Ruby Net::HTTP library. This being said, it is a very real problem for
337
+ # consumers of HTML as the default for HTML is iso-8859-1, most "good" producers use utf-8, and
338
+ # the Gumbo parser *only* supports utf-8.
339
+ #
340
+ # Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection. Following
341
+ # this lead, Nokogiri::HTML5 attempts to do likewise, while attempting to more closely follow
342
+ # the HTML5 standard.
343
+ #
344
+ # http://bugs.ruby-lang.org/issues/2567
345
+ # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
346
+ #
347
+ def reencode(body, content_type = nil)
348
+ if body.encoding == Encoding::ASCII_8BIT
349
+ encoding = nil
350
+
351
+ # look for a Byte Order Mark (BOM)
352
+ initial_bytes = body[0..2].bytes
353
+ if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
354
+ encoding = Encoding::UTF_8
355
+ elsif initial_bytes[0..1] == [0xFE, 0xFF]
356
+ encoding = Encoding::UTF_16BE
357
+ elsif initial_bytes[0..1] == [0xFF, 0xFE]
358
+ encoding = Encoding::UTF_16LE
374
359
  end
375
- end
376
-
377
- # if all else fails, default to the official default encoding for HTML
378
- encoding ||= Encoding::ISO_8859_1
379
-
380
- # change the encoding to match the detected or inferred encoding
381
- body = body.dup
382
- begin
383
- body.force_encoding(encoding)
384
- rescue ArgumentError
385
- body.force_encoding(Encoding::ISO_8859_1)
386
- end
387
- end
388
360
 
389
- body.encode(Encoding::UTF_8)
390
- end
361
+ # look for a charset in a content-encoding header
362
+ if content_type
363
+ encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
364
+ end
391
365
 
392
- def self.serialize_node_internal(current_node, io, encoding, options)
393
- case current_node.type
394
- when XML::Node::ELEMENT_NODE
395
- ns = current_node.namespace
396
- ns_uri = ns.nil? ? nil : ns.href
397
- # XXX(sfc): attach namespaces to all nodes, even html?
398
- tagname = if ns_uri.nil? || ns_uri == HTML_NAMESPACE || ns_uri == MATHML_NAMESPACE || ns_uri == SVG_NAMESPACE
399
- current_node.name
400
- else
401
- "#{ns.prefix}:#{current_node.name}"
402
- end
403
- io << "<" << tagname
404
- current_node.attribute_nodes.each do |attr|
405
- attr_ns = attr.namespace
406
- if attr_ns.nil?
407
- attr_name = attr.name
408
- else
409
- ns_uri = attr_ns.href
410
- attr_name = if ns_uri == XML_NAMESPACE
411
- "xml:" + attr.name.sub(/^[^:]*:/, "")
412
- elsif ns_uri == XMLNS_NAMESPACE && attr.name.sub(/^[^:]*:/, "") == "xmlns"
413
- "xmlns"
414
- elsif ns_uri == XMLNS_NAMESPACE
415
- "xmlns:" + attr.name.sub(/^[^:]*:/, "")
416
- elsif ns_uri == XLINK_NAMESPACE
417
- "xlink:" + attr.name.sub(/^[^:]*:/, "")
418
- else
419
- "#{attr_ns.prefix}:#{attr.name}"
366
+ # look for a charset in a meta tag in the first 1024 bytes
367
+ unless encoding
368
+ data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, "")
369
+ data.scan(/<meta.*?>/im).each do |meta|
370
+ encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
420
371
  end
421
372
  end
422
- io << " " << attr_name << '="' << escape_text(attr.content, encoding, true) << '"'
423
- end
424
- io << ">"
425
- unless ["area", "base", "basefont", "bgsound", "br", "col", "embed", "frame", "hr", "img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr"].include?(current_node.name)
426
- io << "\n" if options[:preserve_newline] && prepend_newline?(current_node)
427
- current_node.children.each do |child|
428
- # XXX(sfc): Templates handled specially?
429
- serialize_node_internal(child, io, encoding, options)
373
+
374
+ # if all else fails, default to the official default encoding for HTML
375
+ encoding ||= Encoding::ISO_8859_1
376
+
377
+ # change the encoding to match the detected or inferred encoding
378
+ body = body.dup
379
+ begin
380
+ body.force_encoding(encoding)
381
+ rescue ArgumentError
382
+ body.force_encoding(Encoding::ISO_8859_1)
430
383
  end
431
- io << "</" << tagname << ">"
432
- end
433
- when XML::Node::TEXT_NODE
434
- parent = current_node.parent
435
- io << if parent.element? && ["style", "script", "xmp", "iframe", "noembed", "noframes", "plaintext", "noscript"].include?(parent.name)
436
- current_node.content
437
- else
438
- escape_text(current_node.content, encoding, false)
439
- end
440
- when XML::Node::CDATA_SECTION_NODE
441
- io << "<![CDATA[" << current_node.content << "]]>"
442
- when XML::Node::COMMENT_NODE
443
- io << "<!--" << current_node.content << "-->"
444
- when XML::Node::PI_NODE
445
- io << "<?" << current_node.content << ">"
446
- when XML::Node::DOCUMENT_TYPE_NODE, XML::Node::DTD_NODE
447
- io << "<!DOCTYPE " << current_node.name << ">"
448
- when XML::Node::HTML_DOCUMENT_NODE, XML::Node::DOCUMENT_FRAG_NODE
449
- current_node.children.each do |child|
450
- serialize_node_internal(child, io, encoding, options)
451
384
  end
452
- else
453
- raise "Unexpected node '#{current_node.name}' of type #{current_node.type}"
454
- end
455
- end
456
385
 
457
- def self.escape_text(text, encoding, attribute_mode)
458
- text = if attribute_mode
459
- text.gsub(/[&\u00a0"]/,
460
- "&" => "&amp;", "\u00a0" => "&nbsp;", '"' => "&quot;")
461
- else
462
- text.gsub(/[&\u00a0<>]/,
463
- "&" => "&amp;", "\u00a0" => "&nbsp;", "<" => "&lt;", ">" => "&gt;")
386
+ body.encode(Encoding::UTF_8)
464
387
  end
465
- # Not part of the standard
466
- text.encode(encoding, fallback: lambda { |c| "&\#x#{c.ord.to_s(16)};" })
467
- end
468
-
469
- def self.prepend_newline?(node)
470
- return false unless ["pre", "textarea", "listing"].include?(node.name) && !node.children.empty?
471
-
472
- first_child = node.children[0]
473
- first_child.text? && first_child.content.start_with?("\n")
474
388
  end
475
389
  end
476
390
  end
@@ -1,21 +1,3 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # The line below caused a problem on non-GAE rack environment.
4
- # unless defined?(JRuby::Rack::VERSION) || defined?(AppEngine::ApiProxy)
5
- #
6
- # However, simply cutting defined?(JRuby::Rack::VERSION) off resulted in
7
- # an unable-to-load-nokogiri problem. Thus, now, Nokogiri checks the presense
8
- # of appengine-rack.jar in $LOAD_PATH. If Nokogiri is on GAE, Nokogiri
9
- # should skip loading xml jars. This is because those are in WEB-INF/lib and
10
- # already set in the classpath.
11
- unless $LOAD_PATH.to_s.include?("appengine-rack")
12
- require "stringio"
13
- require "isorelax.jar"
14
- require "jing.jar"
15
- require "nekohtml.jar"
16
- require "nekodtd.jar"
17
- require "xercesImpl.jar"
18
- require "serializer.jar"
19
- require "xalan.jar"
20
- require "xml-apis.jar"
21
- end
3
+ require_relative "nokogiri_jars"
@@ -0,0 +1,43 @@
1
+ # this is a generated file, to avoid over-writing it just delete this comment
2
+ begin
3
+ require 'jar_dependencies'
4
+ rescue LoadError
5
+ require 'xalan/serializer/2.7.3/serializer-2.7.3.jar'
6
+ require 'net/sourceforge/htmlunit/neko-htmlunit/2.63.0/neko-htmlunit-2.63.0.jar'
7
+ require 'nu/validator/jing/20200702VNU/jing-20200702VNU.jar'
8
+ require 'xerces/xercesImpl/2.12.2/xercesImpl-2.12.2.jar'
9
+ require 'net/sf/saxon/Saxon-HE/9.6.0-4/Saxon-HE-9.6.0-4.jar'
10
+ require 'xalan/xalan/2.7.3/xalan-2.7.3.jar'
11
+ require 'xml-apis/xml-apis/1.4.01/xml-apis-1.4.01.jar'
12
+ require 'org/nokogiri/nekodtd/0.1.11.noko2/nekodtd-0.1.11.noko2.jar'
13
+ require 'isorelax/isorelax/20030108/isorelax-20030108.jar'
14
+ end
15
+
16
+ if defined? Jars
17
+ require_jar 'xalan', 'serializer', '2.7.3'
18
+ require_jar 'net.sourceforge.htmlunit', 'neko-htmlunit', '2.63.0'
19
+ require_jar 'nu.validator', 'jing', '20200702VNU'
20
+ require_jar 'xerces', 'xercesImpl', '2.12.2'
21
+ require_jar 'net.sf.saxon', 'Saxon-HE', '9.6.0-4'
22
+ require_jar 'xalan', 'xalan', '2.7.3'
23
+ require_jar 'xml-apis', 'xml-apis', '1.4.01'
24
+ require_jar 'org.nokogiri', 'nekodtd', '0.1.11.noko2'
25
+ require_jar 'isorelax', 'isorelax', '20030108'
26
+ end
27
+
28
+ module Nokogiri
29
+ # generated by the :vendor_jars rake task
30
+ JAR_DEPENDENCIES = {
31
+ "isorelax:isorelax" => "20030108",
32
+ "net.sf.saxon:Saxon-HE" => "9.6.0-4",
33
+ "net.sourceforge.htmlunit:neko-htmlunit" => "2.63.0",
34
+ "nu.validator:jing" => "20200702VNU",
35
+ "org.nokogiri:nekodtd" => "0.1.11.noko2",
36
+ "xalan:serializer" => "2.7.3",
37
+ "xalan:xalan" => "2.7.3",
38
+ "xerces:xercesImpl" => "2.12.2",
39
+ "xml-apis:xml-apis" => "1.4.01",
40
+ }.freeze
41
+ XERCES_VERSION = JAR_DEPENDENCIES["xerces:xercesImpl"]
42
+ NEKO_VERSION = JAR_DEPENDENCIES["net.sourceforge.htmlunit:neko-htmlunit"]
43
+ end
@@ -2,5 +2,5 @@
2
2
 
3
3
  module Nokogiri
4
4
  # The version of Nokogiri you are using
5
- VERSION = "1.13.8"
5
+ VERSION = "1.15.7"
6
6
  end
@@ -105,16 +105,16 @@ module Nokogiri
105
105
  if libxml2_using_packaged?
106
106
  cppflags << "-I#{File.join(header_directory, "include").shellescape}"
107
107
  cppflags << "-I#{File.join(header_directory, "include/libxml2").shellescape}"
108
+ end
108
109
 
109
- if windows?
110
- # on windows, nokogumbo needs to link against nokogiri.so to resolve symbols. see #2167
111
- lib_directory = File.expand_path(File.join(File.dirname(__FILE__), "../#{ruby_minor}"))
112
- unless File.exist?(lib_directory)
113
- lib_directory = File.expand_path(File.join(File.dirname(__FILE__), ".."))
114
- end
115
- ldflags << "-L#{lib_directory.shellescape}"
116
- ldflags << "-l:nokogiri.so"
110
+ if windows?
111
+ # on windows, nokogumbo needs to link against nokogiri.so to resolve symbols. see #2167
112
+ lib_directory = File.expand_path(File.join(File.dirname(__FILE__), "../#{ruby_minor}"))
113
+ unless File.exist?(lib_directory)
114
+ lib_directory = File.expand_path(File.join(File.dirname(__FILE__), ".."))
117
115
  end
116
+ ldflags << "-L#{lib_directory.shellescape}"
117
+ ldflags << "-l:nokogiri.so"
118
118
  end
119
119
 
120
120
  nokogiri["cppflags"] = cppflags
@@ -169,8 +169,9 @@ module Nokogiri
169
169
  vi["other_libraries"] = Hash[*Nokogiri::OTHER_LIBRARY_VERSIONS.split(/[,:]/)]
170
170
  elsif jruby?
171
171
  vi["other_libraries"] = {}.tap do |ol|
172
- ol["xerces"] = Nokogiri::XERCES_VERSION
173
- ol["nekohtml"] = Nokogiri::NEKO_VERSION
172
+ Nokogiri::JAR_DEPENDENCIES.each do |k, v|
173
+ ol[k] = v
174
+ end
174
175
  end
175
176
  end
176
177
  end
@@ -1,3 +1,4 @@
1
+ # coding: utf-8
1
2
  # frozen_string_literal: true
2
3
 
3
4
  module Nokogiri
@@ -7,6 +8,54 @@ module Nokogiri
7
8
  alias_method :to_s, :content
8
9
  alias_method :content=, :value=
9
10
 
11
+ #
12
+ # :call-seq: deconstruct_keys(array_of_names) → Hash
13
+ #
14
+ # Returns a hash describing the Attr, to use in pattern matching.
15
+ #
16
+ # Valid keys and their values:
17
+ # - +name+ → (String) The name of the attribute.
18
+ # - +value+ → (String) The value of the attribute.
19
+ # - +namespace+ → (Namespace, nil) The Namespace of the attribute, or +nil+ if there is no namespace.
20
+ #
21
+ # ⚡ This is an experimental feature, available since v1.14.0
22
+ #
23
+ # *Example*
24
+ #
25
+ # doc = Nokogiri::XML.parse(<<~XML)
26
+ # <?xml version="1.0"?>
27
+ # <root xmlns="http://nokogiri.org/ns/default" xmlns:noko="http://nokogiri.org/ns/noko">
28
+ # <child1 foo="abc" noko:bar="def"/>
29
+ # </root>
30
+ # XML
31
+ #
32
+ # attributes = doc.root.elements.first.attribute_nodes
33
+ # # => [#(Attr:0x35c { name = "foo", value = "abc" }),
34
+ # # #(Attr:0x370 {
35
+ # # name = "bar",
36
+ # # namespace = #(Namespace:0x384 {
37
+ # # prefix = "noko",
38
+ # # href = "http://nokogiri.org/ns/noko"
39
+ # # }),
40
+ # # value = "def"
41
+ # # })]
42
+ #
43
+ # attributes.first.deconstruct_keys([:name, :value, :namespace])
44
+ # # => {:name=>"foo", :value=>"abc", :namespace=>nil}
45
+ #
46
+ # attributes.last.deconstruct_keys([:name, :value, :namespace])
47
+ # # => {:name=>"bar",
48
+ # # :value=>"def",
49
+ # # :namespace=>
50
+ # # #(Namespace:0x384 {
51
+ # # prefix = "noko",
52
+ # # href = "http://nokogiri.org/ns/noko"
53
+ # # })}
54
+ #
55
+ def deconstruct_keys(keys)
56
+ { name: name, value: value, namespace: namespace }
57
+ end
58
+
10
59
  private
11
60
 
12
61
  def inspect_attributes
@@ -12,8 +12,10 @@ module Nokogiri
12
12
  undef_method :namespace_definitions
13
13
  undef_method :line if method_defined?(:line)
14
14
 
15
- def inspect
16
- "#<#{self.class.name}:#{format("0x%x", object_id)} #{to_s.inspect}>"
15
+ private
16
+
17
+ def inspect_attributes
18
+ [:to_s]
17
19
  end
18
20
  end
19
21
  end
@@ -234,7 +234,7 @@ module Nokogiri
234
234
  #
235
235
  # == Document Types
236
236
  #
237
- # To create a document type (DTD), access use the Builder#doc method to get
237
+ # To create a document type (DTD), use the Builder#doc method to get
238
238
  # the current context document. Then call Node#create_internal_subset to
239
239
  # create the DTD node.
240
240
  #