nokogiri 1.13.6 → 1.14.2

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (109) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +39 -0
  3. data/LICENSE-DEPENDENCIES.md +830 -509
  4. data/LICENSE.md +1 -1
  5. data/README.md +18 -11
  6. data/dependencies.yml +33 -15
  7. data/ext/nokogiri/extconf.rb +100 -24
  8. data/ext/nokogiri/gumbo.c +21 -11
  9. data/ext/nokogiri/html4_document.c +2 -2
  10. data/ext/nokogiri/html4_element_description.c +1 -1
  11. data/ext/nokogiri/html4_entity_lookup.c +2 -2
  12. data/ext/nokogiri/html4_sax_parser_context.c +1 -6
  13. data/ext/nokogiri/html4_sax_push_parser.c +1 -1
  14. data/ext/nokogiri/nokogiri.c +38 -51
  15. data/ext/nokogiri/nokogiri.h +26 -14
  16. data/ext/nokogiri/test_global_handlers.c +1 -1
  17. data/ext/nokogiri/xml_attr.c +3 -3
  18. data/ext/nokogiri/xml_attribute_decl.c +5 -5
  19. data/ext/nokogiri/xml_cdata.c +3 -3
  20. data/ext/nokogiri/xml_comment.c +1 -1
  21. data/ext/nokogiri/xml_document.c +23 -14
  22. data/ext/nokogiri/xml_document_fragment.c +1 -1
  23. data/ext/nokogiri/xml_dtd.c +9 -9
  24. data/ext/nokogiri/xml_element_content.c +3 -3
  25. data/ext/nokogiri/xml_element_decl.c +5 -5
  26. data/ext/nokogiri/xml_encoding_handler.c +3 -3
  27. data/ext/nokogiri/xml_entity_decl.c +6 -6
  28. data/ext/nokogiri/xml_entity_reference.c +1 -1
  29. data/ext/nokogiri/xml_namespace.c +80 -14
  30. data/ext/nokogiri/xml_node.c +363 -82
  31. data/ext/nokogiri/xml_node_set.c +4 -6
  32. data/ext/nokogiri/xml_processing_instruction.c +1 -1
  33. data/ext/nokogiri/xml_reader.c +97 -22
  34. data/ext/nokogiri/xml_relax_ng.c +1 -3
  35. data/ext/nokogiri/xml_sax_parser.c +23 -17
  36. data/ext/nokogiri/xml_sax_parser_context.c +1 -6
  37. data/ext/nokogiri/xml_sax_push_parser.c +1 -3
  38. data/ext/nokogiri/xml_schema.c +4 -6
  39. data/ext/nokogiri/xml_syntax_error.c +1 -1
  40. data/ext/nokogiri/xml_text.c +2 -2
  41. data/ext/nokogiri/xml_xpath_context.c +91 -84
  42. data/ext/nokogiri/xslt_stylesheet.c +15 -14
  43. data/gumbo-parser/Makefile +10 -0
  44. data/gumbo-parser/src/attribute.h +1 -1
  45. data/gumbo-parser/src/error.c +2 -2
  46. data/gumbo-parser/src/error.h +1 -1
  47. data/gumbo-parser/src/foreign_attrs.c +2 -2
  48. data/gumbo-parser/src/{gumbo.h → nokogiri_gumbo.h} +1 -0
  49. data/gumbo-parser/src/parser.c +8 -5
  50. data/gumbo-parser/src/replacement.h +1 -1
  51. data/gumbo-parser/src/string_buffer.h +1 -1
  52. data/gumbo-parser/src/string_piece.c +1 -1
  53. data/gumbo-parser/src/svg_attrs.c +2 -2
  54. data/gumbo-parser/src/svg_tags.c +2 -2
  55. data/gumbo-parser/src/tag.c +2 -1
  56. data/gumbo-parser/src/tag_lookup.c +7 -7
  57. data/gumbo-parser/src/tag_lookup.gperf +1 -0
  58. data/gumbo-parser/src/tag_lookup.h +1 -1
  59. data/gumbo-parser/src/token_buffer.h +1 -1
  60. data/gumbo-parser/src/tokenizer.c +1 -1
  61. data/gumbo-parser/src/tokenizer.h +1 -1
  62. data/gumbo-parser/src/utf8.c +1 -1
  63. data/gumbo-parser/src/utf8.h +1 -1
  64. data/gumbo-parser/src/util.c +1 -3
  65. data/gumbo-parser/src/util.h +4 -0
  66. data/gumbo-parser/src/vector.h +1 -1
  67. data/lib/nokogiri/css/node.rb +2 -2
  68. data/lib/nokogiri/css/xpath_visitor.rb +5 -3
  69. data/lib/nokogiri/css.rb +6 -0
  70. data/lib/nokogiri/decorators/slop.rb +1 -1
  71. data/lib/nokogiri/encoding_handler.rb +57 -0
  72. data/lib/nokogiri/extension.rb +3 -2
  73. data/lib/nokogiri/html4/document.rb +2 -121
  74. data/lib/nokogiri/html4/element_description_defaults.rb +6 -12
  75. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  76. data/lib/nokogiri/html4.rb +1 -0
  77. data/lib/nokogiri/html5/document.rb +113 -36
  78. data/lib/nokogiri/html5/document_fragment.rb +9 -2
  79. data/lib/nokogiri/html5/node.rb +3 -5
  80. data/lib/nokogiri/html5.rb +127 -216
  81. data/lib/nokogiri/jruby/dependencies.rb +1 -19
  82. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  83. data/lib/nokogiri/version/constant.rb +1 -1
  84. data/lib/nokogiri/version/info.rb +11 -10
  85. data/lib/nokogiri/xml/attr.rb +49 -0
  86. data/lib/nokogiri/xml/builder.rb +1 -1
  87. data/lib/nokogiri/xml/document.rb +103 -55
  88. data/lib/nokogiri/xml/document_fragment.rb +49 -6
  89. data/lib/nokogiri/xml/namespace.rb +42 -0
  90. data/lib/nokogiri/xml/node/save_options.rb +6 -4
  91. data/lib/nokogiri/xml/node.rb +190 -35
  92. data/lib/nokogiri/xml/node_set.rb +88 -9
  93. data/lib/nokogiri/xml/parse_options.rb +129 -50
  94. data/lib/nokogiri/xml/pp/node.rb +6 -4
  95. data/lib/nokogiri/xml/processing_instruction.rb +2 -1
  96. data/lib/nokogiri/xml/reader.rb +6 -8
  97. data/lib/nokogiri/xml/sax/parser.rb +2 -3
  98. data/lib/nokogiri/xslt.rb +1 -1
  99. data/lib/nokogiri.rb +3 -11
  100. data/lib/xsd/xmlparser/nokogiri.rb +3 -1
  101. data/ports/archives/libxml2-2.10.3.tar.xz +0 -0
  102. data/ports/archives/libxslt-1.1.37.tar.xz +0 -0
  103. metadata +11 -242
  104. data/patches/libxml2/0004-use-glibc-strlen.patch +0 -53
  105. data/patches/libxml2/0005-avoid-isnan-isinf.patch +0 -81
  106. data/patches/libxml2/0006-update-automake-files-for-arm64.patch +0 -3040
  107. data/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch +0 -61
  108. data/ports/archives/libxml2-2.9.14.tar.xz +0 -0
  109. data/ports/archives/libxslt-1.1.35.tar.xz +0 -0
@@ -21,48 +21,137 @@ require_relative "../html4/document"
21
21
 
22
22
  module Nokogiri
23
23
  module HTML5
24
+ # Enum for the HTML5 parser quirks mode values. Values returned by HTML5::Document#quirks_mode
25
+ #
26
+ # See https://dom.spec.whatwg.org/#concept-document-quirks for more information on HTML5 quirks
27
+ # mode.
28
+ #
29
+ # Since v1.14.0
30
+ module QuirksMode
31
+ NO_QUIRKS = 0 # The document was parsed in "no-quirks" mode
32
+ QUIRKS = 1 # The document was parsed in "quirks" mode
33
+ LIMITED_QUIRKS = 2 # The document was parsed in "limited-quirks" mode
34
+ end
35
+
24
36
  # Since v1.12.0
25
37
  #
26
38
  # 💡 HTML5 functionality is not available when running JRuby.
27
39
  class Document < Nokogiri::HTML4::Document
28
- def self.parse(string_or_io, url = nil, encoding = nil, **options, &block)
29
- yield options if block
30
- string_or_io = "" unless string_or_io
40
+ # Get the url name for this document, as passed into Document.parse, Document.read_io, or
41
+ # Document.read_memory
42
+ attr_reader :url
31
43
 
32
- if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != "ASCII-8BIT"
33
- encoding ||= string_or_io.encoding.name
34
- end
44
+ # Get the parser's quirks mode value. See HTML5::QuirksMode.
45
+ #
46
+ # This method returns `nil` if the parser was not invoked (e.g., `Nokogiri::HTML5::Document.new`).
47
+ #
48
+ # Since v1.14.0
49
+ attr_reader :quirks_mode
35
50
 
36
- if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
37
- url ||= string_or_io.path
51
+ class << self
52
+ # :call-seq:
53
+ # parse(input)
54
+ # parse(input, url=nil, encoding=nil, **options)
55
+ # parse(input, url=nil, encoding=nil) { |options| ... }
56
+ #
57
+ # Parse HTML5 input.
58
+ #
59
+ # [Parameters]
60
+ # - +input+ may be a String, or any object that responds to _read_ and _close_ such as an
61
+ # IO, or StringIO.
62
+ #
63
+ # - +url+ (optional) is a String indicating the canonical URI where this document is located.
64
+ #
65
+ # - +encoding+ (optional) is the encoding that should be used when processing
66
+ # the document.
67
+ #
68
+ # - +options+ (optional) is a configuration Hash (or keyword arguments) to set options
69
+ # during parsing. The three currently supported options are +:max_errors+,
70
+ # +:max_tree_depth+ and +:max_attributes+, described at Nokogiri::HTML5.
71
+ #
72
+ # ⚠ Note that these options are different than those made available by
73
+ # Nokogiri::XML::Document and Nokogiri::HTML4::Document.
74
+ #
75
+ # - +block+ (optional) is passed a configuration Hash on which parse options may be set. See
76
+ # Nokogiri::HTML5 for more information and usage.
77
+ #
78
+ # [Returns] Nokogiri::HTML5::Document
79
+ #
80
+ def parse(string_or_io, url = nil, encoding = nil, **options, &block)
81
+ yield options if block
82
+ string_or_io = "" unless string_or_io
83
+
84
+ if string_or_io.respond_to?(:encoding) && string_or_io.encoding != Encoding::ASCII_8BIT
85
+ encoding ||= string_or_io.encoding.name
86
+ end
87
+
88
+ if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
89
+ url ||= string_or_io.path
90
+ end
91
+ unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
92
+ raise ArgumentError, "not a string or IO object"
93
+ end
94
+
95
+ do_parse(string_or_io, url, encoding, options)
38
96
  end
39
- unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
40
- raise ArgumentError, "not a string or IO object"
97
+
98
+ # Create a new document from an IO object.
99
+ #
100
+ # 💡 Most users should prefer Document.parse to this method.
101
+ def read_io(io, url = nil, encoding = nil, **options)
102
+ raise ArgumentError, "io object doesn't respond to :read" unless io.respond_to?(:read)
103
+
104
+ do_parse(io, url, encoding, options)
41
105
  end
42
106
 
43
- do_parse(string_or_io, url, encoding, options)
44
- end
107
+ # Create a new document from a String.
108
+ #
109
+ # 💡 Most users should prefer Document.parse to this method.
110
+ def read_memory(string, url = nil, encoding = nil, **options)
111
+ raise ArgumentError, "string object doesn't respond to :to_str" unless string.respond_to?(:to_str)
45
112
 
46
- def self.read_io(io, url = nil, encoding = nil, **options)
47
- raise ArgumentError, "io object doesn't respond to :read" unless io.respond_to?(:read)
113
+ do_parse(string, url, encoding, options)
114
+ end
48
115
 
49
- do_parse(io, url, encoding, options)
50
- end
116
+ private
51
117
 
52
- def self.read_memory(string, url = nil, encoding = nil, **options)
53
- raise ArgumentError, "string object doesn't respond to :to_str" unless string.respond_to?(:to_str)
118
+ def do_parse(string_or_io, url, encoding, options)
119
+ string = HTML5.read_and_encode(string_or_io, encoding)
120
+ max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
121
+ max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
122
+ max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
123
+ doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth, self)
124
+ doc.encoding = "UTF-8"
125
+ doc
126
+ end
127
+ end
54
128
 
55
- do_parse(string, url, encoding, options)
129
+ def initialize(*args) # :nodoc:
130
+ super
131
+ @url = nil
132
+ @quirks_mode = nil
56
133
  end
57
134
 
58
- def fragment(tags = nil)
59
- DocumentFragment.new(self, tags, root)
135
+ # :call-seq:
136
+ # fragment() → Nokogiri::HTML5::DocumentFragment
137
+ # fragment(markup) → Nokogiri::HTML5::DocumentFragment
138
+ #
139
+ # Parse an HTML5 document fragment from +markup+, returning a Nokogiri::HTML5::DocumentFragment.
140
+ #
141
+ # [Properties]
142
+ # - +markup+ (String) The HTML5 markup fragment to be parsed
143
+ #
144
+ # [Returns]
145
+ # Nokogiri::HTML5::DocumentFragment. This object's children will be empty if `markup` is not passed, is empty, or is `nil`.
146
+ #
147
+ def fragment(markup = nil)
148
+ DocumentFragment.new(self, markup)
60
149
  end
61
150
 
62
- def to_xml(options = {}, &block)
151
+ def to_xml(options = {}, &block) # :nodoc:
63
152
  # Bypass XML::Document#to_xml which doesn't add
64
153
  # XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
65
- XML::Node.instance_method(:to_xml).bind(self).call(options, &block)
154
+ XML::Node.instance_method(:to_xml).bind_call(self, options, &block)
66
155
  end
67
156
 
68
157
  # :call-seq:
@@ -70,22 +159,10 @@ module Nokogiri
70
159
  #
71
160
  # [Returns] The document type which determines CSS-to-XPath translation.
72
161
  #
73
- # See XPathVisitor for more information.
162
+ # See CSS::XPathVisitor for more information.
74
163
  def xpath_doctype
75
164
  Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML5
76
165
  end
77
-
78
- private
79
-
80
- def self.do_parse(string_or_io, url, encoding, options)
81
- string = HTML5.read_and_encode(string_or_io, encoding)
82
- max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
83
- max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
84
- max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
85
- doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth)
86
- doc.encoding = "UTF-8"
87
- doc
88
- end
89
166
  end
90
167
  end
91
168
  end
@@ -28,6 +28,13 @@ module Nokogiri
28
28
  attr_accessor :document
29
29
  attr_accessor :errors
30
30
 
31
+ # Get the parser's quirks mode value. See HTML5::QuirksMode.
32
+ #
33
+ # This method returns `nil` if the parser was not invoked (e.g., `Nokogiri::HTML5::DocumentFragment.new(doc)`).
34
+ #
35
+ # Since v1.14.0
36
+ attr_reader :quirks_mode
37
+
31
38
  # Create a document fragment.
32
39
  def initialize(doc, tags = nil, ctx = nil, options = {})
33
40
  self.document = doc
@@ -41,10 +48,10 @@ module Nokogiri
41
48
  Nokogiri::Gumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
42
49
  end
43
50
 
44
- def serialize(options = {}, &block)
51
+ def serialize(options = {}, &block) # :nodoc:
45
52
  # Bypass XML::Document.serialize which doesn't support options even
46
53
  # though XML::Node.serialize does!
47
- XML::Node.instance_method(:serialize).bind(self).call(options, &block)
54
+ XML::Node.instance_method(:serialize).bind_call(self, options, &block)
48
55
  end
49
56
 
50
57
  # Parse a document fragment from +tags+, returning a Nodeset.
@@ -28,7 +28,7 @@ module Nokogiri
28
28
  def inner_html(options = {})
29
29
  return super(options) unless document.is_a?(HTML5::Document)
30
30
 
31
- result = options[:preserve_newline] && HTML5.prepend_newline?(self) ? +"\n" : +""
31
+ result = options[:preserve_newline] && prepend_newline? ? +"\n" : +""
32
32
  result << children.map { |child| child.to_html(options) }.join
33
33
  result
34
34
  end
@@ -56,11 +56,9 @@ module Nokogiri
56
56
  native_write_to(io, encoding, indent_string, config_options)
57
57
  else
58
58
  # Serialize including the current node.
59
+ html = html_standard_serialize(options[:preserve_newline] || false)
59
60
  encoding ||= document.encoding || Encoding::UTF_8
60
- internal_ops = {
61
- preserve_newline: options[:preserve_newline] || false,
62
- }
63
- HTML5.serialize_node_internal(self, io, encoding, internal_ops)
61
+ io << html.encode(encoding, fallback: lambda { |c| "&#x#{c.ord.to_s(16)};" })
64
62
  end
65
63
  end
66
64
 
@@ -227,250 +227,161 @@ module Nokogiri
227
227
  #
228
228
  # Since v1.12.0
229
229
  module HTML5
230
- # HTML uses the XHTML namespace.
231
- HTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
232
- MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML"
233
- SVG_NAMESPACE = "http://www.w3.org/2000/svg"
234
- XLINK_NAMESPACE = "http://www.w3.org/1999/xlink"
235
- XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
236
- XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/"
230
+ class << self
231
+ # Parse an HTML 5 document. Convenience method for {Nokogiri::HTML5::Document.parse}
232
+ def parse(string, url = nil, encoding = nil, **options, &block)
233
+ Document.parse(string, url, encoding, **options, &block)
234
+ end
237
235
 
238
- # Parse an HTML 5 document. Convenience method for {Nokogiri::HTML5::Document.parse}
239
- def self.parse(string, url = nil, encoding = nil, **options, &block)
240
- Document.parse(string, url, encoding, **options, &block)
241
- end
236
+ # Parse a fragment from +string+. Convenience method for
237
+ # {Nokogiri::HTML5::DocumentFragment.parse}.
238
+ def fragment(string, encoding = nil, **options)
239
+ DocumentFragment.parse(string, encoding, options)
240
+ end
242
241
 
243
- # Parse a fragment from +string+. Convenience method for
244
- # {Nokogiri::HTML5::DocumentFragment.parse}.
245
- def self.fragment(string, encoding = nil, **options)
246
- DocumentFragment.parse(string, encoding, options)
247
- end
242
+ # Fetch and parse a HTML document from the web, following redirects,
243
+ # handling https, and determining the character encoding using HTML5
244
+ # rules. +uri+ may be a +String+ or a +URI+. +options+ contains
245
+ # http headers and special options. Everything which is not a
246
+ # special option is considered a header. Special options include:
247
+ # * :follow_limit => number of redirects which are followed
248
+ # * :basic_auth => [username, password]
249
+ def get(uri, options = {})
250
+ # TODO: deprecate
251
+ warn("Nokogiri::HTML5.get is deprecated and will be removed in a future version of Nokogiri.",
252
+ uplevel: 1, category: :deprecated)
253
+ get_impl(uri, options)
254
+ end
248
255
 
249
- # Fetch and parse a HTML document from the web, following redirects,
250
- # handling https, and determining the character encoding using HTML5
251
- # rules. +uri+ may be a +String+ or a +URI+. +options+ contains
252
- # http headers and special options. Everything which is not a
253
- # special option is considered a header. Special options include:
254
- # * :follow_limit => number of redirects which are followed
255
- # * :basic_auth => [username, password]
256
- def self.get(uri, options = {})
257
- # TODO: deprecate
258
- warn("Nokogiri::HTML5.get is deprecated and will be removed in a future version of Nokogiri.",
259
- uplevel: 1, category: :deprecated)
260
- get_impl(uri, options)
261
- end
256
+ # :nodoc:
257
+ def read_and_encode(string, encoding)
258
+ # Read the string with the given encoding.
259
+ if string.respond_to?(:read)
260
+ string = if encoding.nil?
261
+ string.read
262
+ else
263
+ string.read(encoding: encoding)
264
+ end
265
+ else
266
+ # Otherwise the string has the given encoding.
267
+ string = string.to_s
268
+ if encoding
269
+ string = string.dup
270
+ string.force_encoding(encoding)
271
+ end
272
+ end
262
273
 
263
- private
274
+ # convert to UTF-8
275
+ if string.encoding != Encoding::UTF_8
276
+ string = reencode(string)
277
+ end
278
+ string
279
+ end
264
280
 
265
- def self.get_impl(uri, options = {})
266
- headers = options.clone
267
- headers = { follow_limit: headers } if Numeric === headers # deprecated
268
- limit = headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
281
+ private
269
282
 
270
- require "net/http"
271
- uri = URI(uri) unless URI === uri
283
+ def get_impl(uri, options = {})
284
+ headers = options.clone
285
+ headers = { follow_limit: headers } if Numeric === headers # deprecated
286
+ limit = headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
272
287
 
273
- http = Net::HTTP.new(uri.host, uri.port)
288
+ require "net/http"
289
+ uri = URI(uri) unless URI === uri
274
290
 
275
- # TLS / SSL support
276
- http.use_ssl = true if uri.scheme == "https"
291
+ http = Net::HTTP.new(uri.host, uri.port)
277
292
 
278
- # Pass through Net::HTTP override values, which currently include:
279
- # :ca_file, :ca_path, :cert, :cert_store, :ciphers,
280
- # :close_on_empty_response, :continue_timeout, :key, :open_timeout,
281
- # :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
282
- # :verify_callback, :verify_depth, :verify_mode
283
- options.each do |key, _value|
284
- http.send("#{key}=", headers.delete(key)) if http.respond_to?("#{key}=")
285
- end
293
+ # TLS / SSL support
294
+ http.use_ssl = true if uri.scheme == "https"
286
295
 
287
- request = Net::HTTP::Get.new(uri.request_uri)
296
+ # Pass through Net::HTTP override values, which currently include:
297
+ # :ca_file, :ca_path, :cert, :cert_store, :ciphers,
298
+ # :close_on_empty_response, :continue_timeout, :key, :open_timeout,
299
+ # :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
300
+ # :verify_callback, :verify_depth, :verify_mode
301
+ options.each do |key, _value|
302
+ http.send("#{key}=", headers.delete(key)) if http.respond_to?("#{key}=")
303
+ end
288
304
 
289
- # basic authentication
290
- auth = headers.delete(:basic_auth)
291
- auth ||= [uri.user, uri.password] if uri.user && uri.password
292
- request.basic_auth(auth.first, auth.last) if auth
305
+ request = Net::HTTP::Get.new(uri.request_uri)
293
306
 
294
- # remaining options are treated as headers
295
- headers.each { |key, value| request[key.to_s] = value.to_s }
307
+ # basic authentication
308
+ auth = headers.delete(:basic_auth)
309
+ auth ||= [uri.user, uri.password] if uri.user && uri.password
310
+ request.basic_auth(auth.first, auth.last) if auth
296
311
 
297
- response = http.request(request)
312
+ # remaining options are treated as headers
313
+ headers.each { |key, value| request[key.to_s] = value.to_s }
298
314
 
299
- case response
300
- when Net::HTTPSuccess
301
- doc = parse(reencode(response.body, response["content-type"]), options)
302
- doc.instance_variable_set("@response", response)
303
- doc.class.send(:attr_reader, :response)
304
- doc
305
- when Net::HTTPRedirection
306
- response.value if limit <= 1
307
- location = URI.join(uri, response["location"])
308
- get_impl(location, options.merge(follow_limit: limit - 1))
309
- else
310
- response.value
311
- end
312
- end
315
+ response = http.request(request)
313
316
 
314
- def self.read_and_encode(string, encoding)
315
- # Read the string with the given encoding.
316
- if string.respond_to?(:read)
317
- string = if encoding.nil?
318
- string.read
317
+ case response
318
+ when Net::HTTPSuccess
319
+ doc = parse(reencode(response.body, response["content-type"]), options)
320
+ doc.instance_variable_set(:@response, response)
321
+ doc.class.send(:attr_reader, :response)
322
+ doc
323
+ when Net::HTTPRedirection
324
+ response.value if limit <= 1
325
+ location = URI.join(uri, response["location"])
326
+ get_impl(location, options.merge(follow_limit: limit - 1))
319
327
  else
320
- string.read(encoding: encoding)
321
- end
322
- else
323
- # Otherwise the string has the given encoding.
324
- string = string.to_s
325
- if encoding
326
- string = string.dup
327
- string.force_encoding(encoding)
328
+ response.value
328
329
  end
329
330
  end
330
331
 
331
- # convert to UTF-8
332
- if string.encoding != Encoding::UTF_8
333
- string = reencode(string)
334
- end
335
- string
336
- end
337
-
338
- # Charset sniffing is a complex and controversial topic that understandably isn't done _by
339
- # default_ by the Ruby Net::HTTP library. This being said, it is a very real problem for
340
- # consumers of HTML as the default for HTML is iso-8859-1, most "good" producers use utf-8, and
341
- # the Gumbo parser *only* supports utf-8.
342
- #
343
- # Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection. Following
344
- # this lead, Nokogiri::HTML5 attempts to do likewise, while attempting to more closely follow
345
- # the HTML5 standard.
346
- #
347
- # http://bugs.ruby-lang.org/issues/2567
348
- # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
349
- #
350
- def self.reencode(body, content_type = nil)
351
- if body.encoding == Encoding::ASCII_8BIT
352
- encoding = nil
353
-
354
- # look for a Byte Order Mark (BOM)
355
- initial_bytes = body[0..2].bytes
356
- if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
357
- encoding = Encoding::UTF_8
358
- elsif initial_bytes[0..1] == [0xFE, 0xFF]
359
- encoding = Encoding::UTF_16BE
360
- elsif initial_bytes[0..1] == [0xFF, 0xFE]
361
- encoding = Encoding::UTF_16LE
362
- end
363
-
364
- # look for a charset in a content-type header
365
- if content_type
366
- encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
367
- end
368
-
369
- # look for a charset in a meta tag in the first 1024 bytes
370
- unless encoding
371
- data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, "")
372
- data.scan(/<meta.*?>/m).each do |meta|
373
- encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
332
+ # Charset sniffing is a complex and controversial topic that understandably isn't done _by
333
+ # default_ by the Ruby Net::HTTP library. This being said, it is a very real problem for
334
+ # consumers of HTML as the default for HTML is iso-8859-1, most "good" producers use utf-8, and
335
+ # the Gumbo parser *only* supports utf-8.
336
+ #
337
+ # Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection. Following
338
+ # this lead, Nokogiri::HTML5 attempts to do likewise, while attempting to more closely follow
339
+ # the HTML5 standard.
340
+ #
341
+ # http://bugs.ruby-lang.org/issues/2567
342
+ # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
343
+ #
344
+ def reencode(body, content_type = nil)
345
+ if body.encoding == Encoding::ASCII_8BIT
346
+ encoding = nil
347
+
348
+ # look for a Byte Order Mark (BOM)
349
+ initial_bytes = body[0..2].bytes
350
+ if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
351
+ encoding = Encoding::UTF_8
352
+ elsif initial_bytes[0..1] == [0xFE, 0xFF]
353
+ encoding = Encoding::UTF_16BE
354
+ elsif initial_bytes[0..1] == [0xFF, 0xFE]
355
+ encoding = Encoding::UTF_16LE
374
356
  end
375
- end
376
-
377
- # if all else fails, default to the official default encoding for HTML
378
- encoding ||= Encoding::ISO_8859_1
379
-
380
- # change the encoding to match the detected or inferred encoding
381
- body = body.dup
382
- begin
383
- body.force_encoding(encoding)
384
- rescue ArgumentError
385
- body.force_encoding(Encoding::ISO_8859_1)
386
- end
387
- end
388
357
 
389
- body.encode(Encoding::UTF_8)
390
- end
358
+ # look for a charset in a content-type header
359
+ if content_type
360
+ encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
361
+ end
391
362
 
392
- def self.serialize_node_internal(current_node, io, encoding, options)
393
- case current_node.type
394
- when XML::Node::ELEMENT_NODE
395
- ns = current_node.namespace
396
- ns_uri = ns.nil? ? nil : ns.href
397
- # XXX(sfc): attach namespaces to all nodes, even html?
398
- tagname = if ns_uri.nil? || ns_uri == HTML_NAMESPACE || ns_uri == MATHML_NAMESPACE || ns_uri == SVG_NAMESPACE
399
- current_node.name
400
- else
401
- "#{ns.prefix}:#{current_node.name}"
402
- end
403
- io << "<" << tagname
404
- current_node.attribute_nodes.each do |attr|
405
- attr_ns = attr.namespace
406
- if attr_ns.nil?
407
- attr_name = attr.name
408
- else
409
- ns_uri = attr_ns.href
410
- attr_name = if ns_uri == XML_NAMESPACE
411
- "xml:" + attr.name.sub(/^[^:]*:/, "")
412
- elsif ns_uri == XMLNS_NAMESPACE && attr.name.sub(/^[^:]*:/, "") == "xmlns"
413
- "xmlns"
414
- elsif ns_uri == XMLNS_NAMESPACE
415
- "xmlns:" + attr.name.sub(/^[^:]*:/, "")
416
- elsif ns_uri == XLINK_NAMESPACE
417
- "xlink:" + attr.name.sub(/^[^:]*:/, "")
418
- else
419
- "#{attr_ns.prefix}:#{attr.name}"
363
+ # look for a charset in a meta tag in the first 1024 bytes
364
+ unless encoding
365
+ data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, "")
366
+ data.scan(/<meta.*?>/im).each do |meta|
367
+ encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
420
368
  end
421
369
  end
422
- io << " " << attr_name << '="' << escape_text(attr.content, encoding, true) << '"'
423
- end
424
- io << ">"
425
- unless ["area", "base", "basefont", "bgsound", "br", "col", "embed", "frame", "hr", "img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr"].include?(current_node.name)
426
- io << "\n" if options[:preserve_newline] && prepend_newline?(current_node)
427
- current_node.children.each do |child|
428
- # XXX(sfc): Templates handled specially?
429
- serialize_node_internal(child, io, encoding, options)
370
+
371
+ # if all else fails, default to the official default encoding for HTML
372
+ encoding ||= Encoding::ISO_8859_1
373
+
374
+ # change the encoding to match the detected or inferred encoding
375
+ body = body.dup
376
+ begin
377
+ body.force_encoding(encoding)
378
+ rescue ArgumentError
379
+ body.force_encoding(Encoding::ISO_8859_1)
430
380
  end
431
- io << "</" << tagname << ">"
432
- end
433
- when XML::Node::TEXT_NODE
434
- parent = current_node.parent
435
- io << if parent.element? && ["style", "script", "xmp", "iframe", "noembed", "noframes", "plaintext", "noscript"].include?(parent.name)
436
- current_node.content
437
- else
438
- escape_text(current_node.content, encoding, false)
439
- end
440
- when XML::Node::CDATA_SECTION_NODE
441
- io << "<![CDATA[" << current_node.content << "]]>"
442
- when XML::Node::COMMENT_NODE
443
- io << "<!--" << current_node.content << "-->"
444
- when XML::Node::PI_NODE
445
- io << "<?" << current_node.content << ">"
446
- when XML::Node::DOCUMENT_TYPE_NODE, XML::Node::DTD_NODE
447
- io << "<!DOCTYPE " << current_node.name << ">"
448
- when XML::Node::HTML_DOCUMENT_NODE, XML::Node::DOCUMENT_FRAG_NODE
449
- current_node.children.each do |child|
450
- serialize_node_internal(child, io, encoding, options)
451
381
  end
452
- else
453
- raise "Unexpected node '#{current_node.name}' of type #{current_node.type}"
454
- end
455
- end
456
382
 
457
- def self.escape_text(text, encoding, attribute_mode)
458
- text = if attribute_mode
459
- text.gsub(/[&\u00a0"]/,
460
- "&" => "&amp;", "\u00a0" => "&nbsp;", '"' => "&quot;")
461
- else
462
- text.gsub(/[&\u00a0<>]/,
463
- "&" => "&amp;", "\u00a0" => "&nbsp;", "<" => "&lt;", ">" => "&gt;")
383
+ body.encode(Encoding::UTF_8)
464
384
  end
465
- # Not part of the standard
466
- text.encode(encoding, fallback: lambda { |c| "&\#x#{c.ord.to_s(16)};" })
467
- end
468
-
469
- def self.prepend_newline?(node)
470
- return false unless ["pre", "textarea", "listing"].include?(node.name) && !node.children.empty?
471
-
472
- first_child = node.children[0]
473
- first_child.text? && first_child.content.start_with?("\n")
474
385
  end
475
386
  end
476
387
  end
@@ -1,21 +1,3 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # The line below caused a problem on non-GAE rack environment.
4
- # unless defined?(JRuby::Rack::VERSION) || defined?(AppEngine::ApiProxy)
5
- #
6
- # However, simply cutting defined?(JRuby::Rack::VERSION) off resulted in
7
- # an unable-to-load-nokogiri problem. Thus, now, Nokogiri checks the presence
8
- # of appengine-rack.jar in $LOAD_PATH. If Nokogiri is on GAE, Nokogiri
9
- # should skip loading xml jars. This is because those are in WEB-INF/lib and
10
- # already set in the classpath.
11
- unless $LOAD_PATH.to_s.include?("appengine-rack")
12
- require "stringio"
13
- require "isorelax.jar"
14
- require "jing.jar"
15
- require "nekohtml.jar"
16
- require "nekodtd.jar"
17
- require "xercesImpl.jar"
18
- require "serializer.jar"
19
- require "xalan.jar"
20
- require "xml-apis.jar"
21
- end
3
+ require_relative "nokogiri_jars"