nokogiri 1.13.6 → 1.14.2
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +39 -0
- data/LICENSE-DEPENDENCIES.md +830 -509
- data/LICENSE.md +1 -1
- data/README.md +18 -11
- data/dependencies.yml +33 -15
- data/ext/nokogiri/extconf.rb +100 -24
- data/ext/nokogiri/gumbo.c +21 -11
- data/ext/nokogiri/html4_document.c +2 -2
- data/ext/nokogiri/html4_element_description.c +1 -1
- data/ext/nokogiri/html4_entity_lookup.c +2 -2
- data/ext/nokogiri/html4_sax_parser_context.c +1 -6
- data/ext/nokogiri/html4_sax_push_parser.c +1 -1
- data/ext/nokogiri/nokogiri.c +38 -51
- data/ext/nokogiri/nokogiri.h +26 -14
- data/ext/nokogiri/test_global_handlers.c +1 -1
- data/ext/nokogiri/xml_attr.c +3 -3
- data/ext/nokogiri/xml_attribute_decl.c +5 -5
- data/ext/nokogiri/xml_cdata.c +3 -3
- data/ext/nokogiri/xml_comment.c +1 -1
- data/ext/nokogiri/xml_document.c +23 -14
- data/ext/nokogiri/xml_document_fragment.c +1 -1
- data/ext/nokogiri/xml_dtd.c +9 -9
- data/ext/nokogiri/xml_element_content.c +3 -3
- data/ext/nokogiri/xml_element_decl.c +5 -5
- data/ext/nokogiri/xml_encoding_handler.c +3 -3
- data/ext/nokogiri/xml_entity_decl.c +6 -6
- data/ext/nokogiri/xml_entity_reference.c +1 -1
- data/ext/nokogiri/xml_namespace.c +80 -14
- data/ext/nokogiri/xml_node.c +363 -82
- data/ext/nokogiri/xml_node_set.c +4 -6
- data/ext/nokogiri/xml_processing_instruction.c +1 -1
- data/ext/nokogiri/xml_reader.c +97 -22
- data/ext/nokogiri/xml_relax_ng.c +1 -3
- data/ext/nokogiri/xml_sax_parser.c +23 -17
- data/ext/nokogiri/xml_sax_parser_context.c +1 -6
- data/ext/nokogiri/xml_sax_push_parser.c +1 -3
- data/ext/nokogiri/xml_schema.c +4 -6
- data/ext/nokogiri/xml_syntax_error.c +1 -1
- data/ext/nokogiri/xml_text.c +2 -2
- data/ext/nokogiri/xml_xpath_context.c +91 -84
- data/ext/nokogiri/xslt_stylesheet.c +15 -14
- data/gumbo-parser/Makefile +10 -0
- data/gumbo-parser/src/attribute.h +1 -1
- data/gumbo-parser/src/error.c +2 -2
- data/gumbo-parser/src/error.h +1 -1
- data/gumbo-parser/src/foreign_attrs.c +2 -2
- data/gumbo-parser/src/{gumbo.h → nokogiri_gumbo.h} +1 -0
- data/gumbo-parser/src/parser.c +8 -5
- data/gumbo-parser/src/replacement.h +1 -1
- data/gumbo-parser/src/string_buffer.h +1 -1
- data/gumbo-parser/src/string_piece.c +1 -1
- data/gumbo-parser/src/svg_attrs.c +2 -2
- data/gumbo-parser/src/svg_tags.c +2 -2
- data/gumbo-parser/src/tag.c +2 -1
- data/gumbo-parser/src/tag_lookup.c +7 -7
- data/gumbo-parser/src/tag_lookup.gperf +1 -0
- data/gumbo-parser/src/tag_lookup.h +1 -1
- data/gumbo-parser/src/token_buffer.h +1 -1
- data/gumbo-parser/src/tokenizer.c +1 -1
- data/gumbo-parser/src/tokenizer.h +1 -1
- data/gumbo-parser/src/utf8.c +1 -1
- data/gumbo-parser/src/utf8.h +1 -1
- data/gumbo-parser/src/util.c +1 -3
- data/gumbo-parser/src/util.h +4 -0
- data/gumbo-parser/src/vector.h +1 -1
- data/lib/nokogiri/css/node.rb +2 -2
- data/lib/nokogiri/css/xpath_visitor.rb +5 -3
- data/lib/nokogiri/css.rb +6 -0
- data/lib/nokogiri/decorators/slop.rb +1 -1
- data/lib/nokogiri/encoding_handler.rb +57 -0
- data/lib/nokogiri/extension.rb +3 -2
- data/lib/nokogiri/html4/document.rb +2 -121
- data/lib/nokogiri/html4/element_description_defaults.rb +6 -12
- data/lib/nokogiri/html4/encoding_reader.rb +121 -0
- data/lib/nokogiri/html4.rb +1 -0
- data/lib/nokogiri/html5/document.rb +113 -36
- data/lib/nokogiri/html5/document_fragment.rb +9 -2
- data/lib/nokogiri/html5/node.rb +3 -5
- data/lib/nokogiri/html5.rb +127 -216
- data/lib/nokogiri/jruby/dependencies.rb +1 -19
- data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/version/info.rb +11 -10
- data/lib/nokogiri/xml/attr.rb +49 -0
- data/lib/nokogiri/xml/builder.rb +1 -1
- data/lib/nokogiri/xml/document.rb +103 -55
- data/lib/nokogiri/xml/document_fragment.rb +49 -6
- data/lib/nokogiri/xml/namespace.rb +42 -0
- data/lib/nokogiri/xml/node/save_options.rb +6 -4
- data/lib/nokogiri/xml/node.rb +190 -35
- data/lib/nokogiri/xml/node_set.rb +88 -9
- data/lib/nokogiri/xml/parse_options.rb +129 -50
- data/lib/nokogiri/xml/pp/node.rb +6 -4
- data/lib/nokogiri/xml/processing_instruction.rb +2 -1
- data/lib/nokogiri/xml/reader.rb +6 -8
- data/lib/nokogiri/xml/sax/parser.rb +2 -3
- data/lib/nokogiri/xslt.rb +1 -1
- data/lib/nokogiri.rb +3 -11
- data/lib/xsd/xmlparser/nokogiri.rb +3 -1
- data/ports/archives/libxml2-2.10.3.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.37.tar.xz +0 -0
- metadata +11 -242
- data/patches/libxml2/0004-use-glibc-strlen.patch +0 -53
- data/patches/libxml2/0005-avoid-isnan-isinf.patch +0 -81
- data/patches/libxml2/0006-update-automake-files-for-arm64.patch +0 -3040
- data/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch +0 -61
- data/ports/archives/libxml2-2.9.14.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.35.tar.xz +0 -0
@@ -21,48 +21,137 @@ require_relative "../html4/document"
|
|
21
21
|
|
22
22
|
module Nokogiri
|
23
23
|
module HTML5
|
24
|
+
# Enum for the HTML5 parser quirks mode values. Values returned by HTML5::Document#quirks_mode
|
25
|
+
#
|
26
|
+
# See https://dom.spec.whatwg.org/#concept-document-quirks for more information on HTML5 quirks
|
27
|
+
# mode.
|
28
|
+
#
|
29
|
+
# Since v1.14.0
|
30
|
+
module QuirksMode
|
31
|
+
NO_QUIRKS = 0 # The document was parsed in "no-quirks" mode
|
32
|
+
QUIRKS = 1 # The document was parsed in "quirks" mode
|
33
|
+
LIMITED_QUIRKS = 2 # The document was parsed in "limited-quirks" mode
|
34
|
+
end
|
35
|
+
|
24
36
|
# Since v1.12.0
|
25
37
|
#
|
26
38
|
# 💡 HTML5 functionality is not available when running JRuby.
|
27
39
|
class Document < Nokogiri::HTML4::Document
|
28
|
-
|
29
|
-
|
30
|
-
|
40
|
+
# Get the url name for this document, as passed into Document.parse, Document.read_io, or
|
41
|
+
# Document.read_memory
|
42
|
+
attr_reader :url
|
31
43
|
|
32
|
-
|
33
|
-
|
34
|
-
|
44
|
+
# Get the parser's quirks mode value. See HTML5::QuirksMode.
|
45
|
+
#
|
46
|
+
# This method returns `nil` if the parser was not invoked (e.g., `Nokogiri::HTML5::Document.new`).
|
47
|
+
#
|
48
|
+
# Since v1.14.0
|
49
|
+
attr_reader :quirks_mode
|
35
50
|
|
36
|
-
|
37
|
-
|
51
|
+
class << self
|
52
|
+
# :call-seq:
|
53
|
+
# parse(input)
|
54
|
+
# parse(input, url=nil, encoding=nil, **options)
|
55
|
+
# parse(input, url=nil, encoding=nil) { |options| ... }
|
56
|
+
#
|
57
|
+
# Parse HTML5 input.
|
58
|
+
#
|
59
|
+
# [Parameters]
|
60
|
+
# - +input+ may be a String, or any object that responds to _read_ and _close_ such as an
|
61
|
+
# IO, or StringIO.
|
62
|
+
#
|
63
|
+
# - +url+ (optional) is a String indicating the canonical URI where this document is located.
|
64
|
+
#
|
65
|
+
# - +encoding+ (optional) is the encoding that should be used when processing
|
66
|
+
# the document.
|
67
|
+
#
|
68
|
+
# - +options+ (optional) is a configuration Hash (or keyword arguments) to set options
|
69
|
+
# during parsing. The three currently supported options are +:max_errors+,
|
70
|
+
# +:max_tree_depth+ and +:max_attributes+, described at Nokogiri::HTML5.
|
71
|
+
#
|
72
|
+
# ⚠ Note that these options are different than those made available by
|
73
|
+
# Nokogiri::XML::Document and Nokogiri::HTML4::Document.
|
74
|
+
#
|
75
|
+
# - +block+ (optional) is passed a configuration Hash on which parse options may be set. See
|
76
|
+
# Nokogiri::HTML5 for more information and usage.
|
77
|
+
#
|
78
|
+
# [Returns] Nokogiri::HTML5::Document
|
79
|
+
#
|
80
|
+
def parse(string_or_io, url = nil, encoding = nil, **options, &block)
|
81
|
+
yield options if block
|
82
|
+
string_or_io = "" unless string_or_io
|
83
|
+
|
84
|
+
if string_or_io.respond_to?(:encoding) && string_or_io.encoding != Encoding::ASCII_8BIT
|
85
|
+
encoding ||= string_or_io.encoding.name
|
86
|
+
end
|
87
|
+
|
88
|
+
if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
|
89
|
+
url ||= string_or_io.path
|
90
|
+
end
|
91
|
+
unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
|
92
|
+
raise ArgumentError, "not a string or IO object"
|
93
|
+
end
|
94
|
+
|
95
|
+
do_parse(string_or_io, url, encoding, options)
|
38
96
|
end
|
39
|
-
|
40
|
-
|
97
|
+
|
98
|
+
# Create a new document from an IO object.
|
99
|
+
#
|
100
|
+
# 💡 Most users should prefer Document.parse to this method.
|
101
|
+
def read_io(io, url = nil, encoding = nil, **options)
|
102
|
+
raise ArgumentError, "io object doesn't respond to :read" unless io.respond_to?(:read)
|
103
|
+
|
104
|
+
do_parse(io, url, encoding, options)
|
41
105
|
end
|
42
106
|
|
43
|
-
|
44
|
-
|
107
|
+
# Create a new document from a String.
|
108
|
+
#
|
109
|
+
# 💡 Most users should prefer Document.parse to this method.
|
110
|
+
def read_memory(string, url = nil, encoding = nil, **options)
|
111
|
+
raise ArgumentError, "string object doesn't respond to :to_str" unless string.respond_to?(:to_str)
|
45
112
|
|
46
|
-
|
47
|
-
|
113
|
+
do_parse(string, url, encoding, options)
|
114
|
+
end
|
48
115
|
|
49
|
-
|
50
|
-
end
|
116
|
+
private
|
51
117
|
|
52
|
-
|
53
|
-
|
118
|
+
def do_parse(string_or_io, url, encoding, options)
|
119
|
+
string = HTML5.read_and_encode(string_or_io, encoding)
|
120
|
+
max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
|
121
|
+
max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
|
122
|
+
max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
|
123
|
+
doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth, self)
|
124
|
+
doc.encoding = "UTF-8"
|
125
|
+
doc
|
126
|
+
end
|
127
|
+
end
|
54
128
|
|
55
|
-
|
129
|
+
def initialize(*args) # :nodoc:
|
130
|
+
super
|
131
|
+
@url = nil
|
132
|
+
@quirks_mode = nil
|
56
133
|
end
|
57
134
|
|
58
|
-
|
59
|
-
|
135
|
+
# :call-seq:
|
136
|
+
# fragment() → Nokogiri::HTML5::DocumentFragment
|
137
|
+
# fragment(markup) → Nokogiri::HTML5::DocumentFragment
|
138
|
+
#
|
139
|
+
# Parse an HTML5 document fragment from +markup+, returning a Nokogiri::HTML5::DocumentFragment.
|
140
|
+
#
|
141
|
+
# [Properties]
|
142
|
+
# - +markup+ (String) The HTML5 markup fragment to be parsed
|
143
|
+
#
|
144
|
+
# [Returns]
|
145
|
+
# Nokogiri::HTML5::DocumentFragment. This object's children will be empty if `markup` is not passed, is empty, or is `nil`.
|
146
|
+
#
|
147
|
+
def fragment(markup = nil)
|
148
|
+
DocumentFragment.new(self, markup)
|
60
149
|
end
|
61
150
|
|
62
|
-
def to_xml(options = {}, &block)
|
151
|
+
def to_xml(options = {}, &block) # :nodoc:
|
63
152
|
# Bypass XML::Document#to_xml which doesn't add
|
64
153
|
# XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
|
65
|
-
XML::Node.instance_method(:to_xml).
|
154
|
+
XML::Node.instance_method(:to_xml).bind_call(self, options, &block)
|
66
155
|
end
|
67
156
|
|
68
157
|
# :call-seq:
|
@@ -70,22 +159,10 @@ module Nokogiri
|
|
70
159
|
#
|
71
160
|
# [Returns] The document type which determines CSS-to-XPath translation.
|
72
161
|
#
|
73
|
-
# See XPathVisitor for more information.
|
162
|
+
# See CSS::XPathVisitor for more information.
|
74
163
|
def xpath_doctype
|
75
164
|
Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML5
|
76
165
|
end
|
77
|
-
|
78
|
-
private
|
79
|
-
|
80
|
-
def self.do_parse(string_or_io, url, encoding, options)
|
81
|
-
string = HTML5.read_and_encode(string_or_io, encoding)
|
82
|
-
max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
|
83
|
-
max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
|
84
|
-
max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
|
85
|
-
doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth)
|
86
|
-
doc.encoding = "UTF-8"
|
87
|
-
doc
|
88
|
-
end
|
89
166
|
end
|
90
167
|
end
|
91
168
|
end
|
@@ -28,6 +28,13 @@ module Nokogiri
|
|
28
28
|
attr_accessor :document
|
29
29
|
attr_accessor :errors
|
30
30
|
|
31
|
+
# Get the parser's quirks mode value. See HTML5::QuirksMode.
|
32
|
+
#
|
33
|
+
# This method returns `nil` if the parser was not invoked (e.g., `Nokogiri::HTML5::DocumentFragment.new(doc)`).
|
34
|
+
#
|
35
|
+
# Since v1.14.0
|
36
|
+
attr_reader :quirks_mode
|
37
|
+
|
31
38
|
# Create a document fragment.
|
32
39
|
def initialize(doc, tags = nil, ctx = nil, options = {})
|
33
40
|
self.document = doc
|
@@ -41,10 +48,10 @@ module Nokogiri
|
|
41
48
|
Nokogiri::Gumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
|
42
49
|
end
|
43
50
|
|
44
|
-
def serialize(options = {}, &block)
|
51
|
+
def serialize(options = {}, &block) # :nodoc:
|
45
52
|
# Bypass XML::Document.serialize which doesn't support options even
|
46
53
|
# though XML::Node.serialize does!
|
47
|
-
XML::Node.instance_method(:serialize).
|
54
|
+
XML::Node.instance_method(:serialize).bind_call(self, options, &block)
|
48
55
|
end
|
49
56
|
|
50
57
|
# Parse a document fragment from +tags+, returning a Nodeset.
|
data/lib/nokogiri/html5/node.rb
CHANGED
@@ -28,7 +28,7 @@ module Nokogiri
|
|
28
28
|
def inner_html(options = {})
|
29
29
|
return super(options) unless document.is_a?(HTML5::Document)
|
30
30
|
|
31
|
-
result = options[:preserve_newline] &&
|
31
|
+
result = options[:preserve_newline] && prepend_newline? ? +"\n" : +""
|
32
32
|
result << children.map { |child| child.to_html(options) }.join
|
33
33
|
result
|
34
34
|
end
|
@@ -56,11 +56,9 @@ module Nokogiri
|
|
56
56
|
native_write_to(io, encoding, indent_string, config_options)
|
57
57
|
else
|
58
58
|
# Serialize including the current node.
|
59
|
+
html = html_standard_serialize(options[:preserve_newline] || false)
|
59
60
|
encoding ||= document.encoding || Encoding::UTF_8
|
60
|
-
|
61
|
-
preserve_newline: options[:preserve_newline] || false,
|
62
|
-
}
|
63
|
-
HTML5.serialize_node_internal(self, io, encoding, internal_ops)
|
61
|
+
io << html.encode(encoding, fallback: lambda { |c| "&#x#{c.ord.to_s(16)};" })
|
64
62
|
end
|
65
63
|
end
|
66
64
|
|
data/lib/nokogiri/html5.rb
CHANGED
@@ -227,250 +227,161 @@ module Nokogiri
|
|
227
227
|
#
|
228
228
|
# Since v1.12.0
|
229
229
|
module HTML5
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
|
236
|
-
XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/"
|
230
|
+
class << self
|
231
|
+
# Parse an HTML 5 document. Convenience method for {Nokogiri::HTML5::Document.parse}
|
232
|
+
def parse(string, url = nil, encoding = nil, **options, &block)
|
233
|
+
Document.parse(string, url, encoding, **options, &block)
|
234
|
+
end
|
237
235
|
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
236
|
+
# Parse a fragment from +string+. Convenience method for
|
237
|
+
# {Nokogiri::HTML5::DocumentFragment.parse}.
|
238
|
+
def fragment(string, encoding = nil, **options)
|
239
|
+
DocumentFragment.parse(string, encoding, options)
|
240
|
+
end
|
242
241
|
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
242
|
+
# Fetch and parse an HTML document from the web, following redirects,
|
243
|
+
# handling https, and determining the character encoding using HTML5
|
244
|
+
# rules. +uri+ may be a +String+ or a +URI+. +options+ contains
|
245
|
+
# http headers and special options. Everything which is not a
|
246
|
+
# special option is considered a header. Special options include:
|
247
|
+
# * :follow_limit => number of redirects which are followed
|
248
|
+
# * :basic_auth => [username, password]
|
249
|
+
def get(uri, options = {})
|
250
|
+
# TODO: deprecate
|
251
|
+
warn("Nokogiri::HTML5.get is deprecated and will be removed in a future version of Nokogiri.",
|
252
|
+
uplevel: 1, category: :deprecated)
|
253
|
+
get_impl(uri, options)
|
254
|
+
end
|
248
255
|
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
256
|
+
# :nodoc:
|
257
|
+
def read_and_encode(string, encoding)
|
258
|
+
# Read the string with the given encoding.
|
259
|
+
if string.respond_to?(:read)
|
260
|
+
string = if encoding.nil?
|
261
|
+
string.read
|
262
|
+
else
|
263
|
+
string.read(encoding: encoding)
|
264
|
+
end
|
265
|
+
else
|
266
|
+
# Otherwise the string has the given encoding.
|
267
|
+
string = string.to_s
|
268
|
+
if encoding
|
269
|
+
string = string.dup
|
270
|
+
string.force_encoding(encoding)
|
271
|
+
end
|
272
|
+
end
|
262
273
|
|
263
|
-
|
274
|
+
# convert to UTF-8
|
275
|
+
if string.encoding != Encoding::UTF_8
|
276
|
+
string = reencode(string)
|
277
|
+
end
|
278
|
+
string
|
279
|
+
end
|
264
280
|
|
265
|
-
|
266
|
-
headers = options.clone
|
267
|
-
headers = { follow_limit: headers } if Numeric === headers # deprecated
|
268
|
-
limit = headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
|
281
|
+
private
|
269
282
|
|
270
|
-
|
271
|
-
|
283
|
+
def get_impl(uri, options = {})
|
284
|
+
headers = options.clone
|
285
|
+
headers = { follow_limit: headers } if Numeric === headers # deprecated
|
286
|
+
limit = headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
|
272
287
|
|
273
|
-
|
288
|
+
require "net/http"
|
289
|
+
uri = URI(uri) unless URI === uri
|
274
290
|
|
275
|
-
|
276
|
-
http.use_ssl = true if uri.scheme == "https"
|
291
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
277
292
|
|
278
|
-
|
279
|
-
|
280
|
-
# :close_on_empty_response, :continue_timeout, :key, :open_timeout,
|
281
|
-
# :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
|
282
|
-
# :verify_callback, :verify_depth, :verify_mode
|
283
|
-
options.each do |key, _value|
|
284
|
-
http.send("#{key}=", headers.delete(key)) if http.respond_to?("#{key}=")
|
285
|
-
end
|
293
|
+
# TLS / SSL support
|
294
|
+
http.use_ssl = true if uri.scheme == "https"
|
286
295
|
|
287
|
-
|
296
|
+
# Pass through Net::HTTP override values, which currently include:
|
297
|
+
# :ca_file, :ca_path, :cert, :cert_store, :ciphers,
|
298
|
+
# :close_on_empty_response, :continue_timeout, :key, :open_timeout,
|
299
|
+
# :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
|
300
|
+
# :verify_callback, :verify_depth, :verify_mode
|
301
|
+
options.each do |key, _value|
|
302
|
+
http.send("#{key}=", headers.delete(key)) if http.respond_to?("#{key}=")
|
303
|
+
end
|
288
304
|
|
289
|
-
|
290
|
-
auth = headers.delete(:basic_auth)
|
291
|
-
auth ||= [uri.user, uri.password] if uri.user && uri.password
|
292
|
-
request.basic_auth(auth.first, auth.last) if auth
|
305
|
+
request = Net::HTTP::Get.new(uri.request_uri)
|
293
306
|
|
294
|
-
|
295
|
-
|
307
|
+
# basic authentication
|
308
|
+
auth = headers.delete(:basic_auth)
|
309
|
+
auth ||= [uri.user, uri.password] if uri.user && uri.password
|
310
|
+
request.basic_auth(auth.first, auth.last) if auth
|
296
311
|
|
297
|
-
|
312
|
+
# remaining options are treated as headers
|
313
|
+
headers.each { |key, value| request[key.to_s] = value.to_s }
|
298
314
|
|
299
|
-
|
300
|
-
when Net::HTTPSuccess
|
301
|
-
doc = parse(reencode(response.body, response["content-type"]), options)
|
302
|
-
doc.instance_variable_set("@response", response)
|
303
|
-
doc.class.send(:attr_reader, :response)
|
304
|
-
doc
|
305
|
-
when Net::HTTPRedirection
|
306
|
-
response.value if limit <= 1
|
307
|
-
location = URI.join(uri, response["location"])
|
308
|
-
get_impl(location, options.merge(follow_limit: limit - 1))
|
309
|
-
else
|
310
|
-
response.value
|
311
|
-
end
|
312
|
-
end
|
315
|
+
response = http.request(request)
|
313
316
|
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
317
|
+
case response
|
318
|
+
when Net::HTTPSuccess
|
319
|
+
doc = parse(reencode(response.body, response["content-type"]), options)
|
320
|
+
doc.instance_variable_set(:@response, response)
|
321
|
+
doc.class.send(:attr_reader, :response)
|
322
|
+
doc
|
323
|
+
when Net::HTTPRedirection
|
324
|
+
response.value if limit <= 1
|
325
|
+
location = URI.join(uri, response["location"])
|
326
|
+
get_impl(location, options.merge(follow_limit: limit - 1))
|
319
327
|
else
|
320
|
-
|
321
|
-
end
|
322
|
-
else
|
323
|
-
# Otherwise the string has the given encoding.
|
324
|
-
string = string.to_s
|
325
|
-
if encoding
|
326
|
-
string = string.dup
|
327
|
-
string.force_encoding(encoding)
|
328
|
+
response.value
|
328
329
|
end
|
329
330
|
end
|
330
331
|
|
331
|
-
#
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
initial_bytes = body[0..2].bytes
|
356
|
-
if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
|
357
|
-
encoding = Encoding::UTF_8
|
358
|
-
elsif initial_bytes[0..1] == [0xFE, 0xFF]
|
359
|
-
encoding = Encoding::UTF_16BE
|
360
|
-
elsif initial_bytes[0..1] == [0xFF, 0xFE]
|
361
|
-
encoding = Encoding::UTF_16LE
|
362
|
-
end
|
363
|
-
|
364
|
-
# look for a charset in a content-encoding header
|
365
|
-
if content_type
|
366
|
-
encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
|
367
|
-
end
|
368
|
-
|
369
|
-
# look for a charset in a meta tag in the first 1024 bytes
|
370
|
-
unless encoding
|
371
|
-
data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, "")
|
372
|
-
data.scan(/<meta.*?>/m).each do |meta|
|
373
|
-
encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
|
332
|
+
# Charset sniffing is a complex and controversial topic that understandably isn't done _by
|
333
|
+
# default_ by the Ruby Net::HTTP library. This being said, it is a very real problem for
|
334
|
+
# consumers of HTML as the default for HTML is iso-8859-1, most "good" producers use utf-8, and
|
335
|
+
# the Gumbo parser *only* supports utf-8.
|
336
|
+
#
|
337
|
+
# Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection. Following
|
338
|
+
# this lead, Nokogiri::HTML5 attempts to do likewise, while attempting to more closely follow
|
339
|
+
# the HTML5 standard.
|
340
|
+
#
|
341
|
+
# http://bugs.ruby-lang.org/issues/2567
|
342
|
+
# http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
|
343
|
+
#
|
344
|
+
def reencode(body, content_type = nil)
|
345
|
+
if body.encoding == Encoding::ASCII_8BIT
|
346
|
+
encoding = nil
|
347
|
+
|
348
|
+
# look for a Byte Order Mark (BOM)
|
349
|
+
initial_bytes = body[0..2].bytes
|
350
|
+
if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
|
351
|
+
encoding = Encoding::UTF_8
|
352
|
+
elsif initial_bytes[0..1] == [0xFE, 0xFF]
|
353
|
+
encoding = Encoding::UTF_16BE
|
354
|
+
elsif initial_bytes[0..1] == [0xFF, 0xFE]
|
355
|
+
encoding = Encoding::UTF_16LE
|
374
356
|
end
|
375
|
-
end
|
376
|
-
|
377
|
-
# if all else fails, default to the official default encoding for HTML
|
378
|
-
encoding ||= Encoding::ISO_8859_1
|
379
|
-
|
380
|
-
# change the encoding to match the detected or inferred encoding
|
381
|
-
body = body.dup
|
382
|
-
begin
|
383
|
-
body.force_encoding(encoding)
|
384
|
-
rescue ArgumentError
|
385
|
-
body.force_encoding(Encoding::ISO_8859_1)
|
386
|
-
end
|
387
|
-
end
|
388
357
|
|
389
|
-
|
390
|
-
|
358
|
+
# look for a charset in a content-type header
|
359
|
+
if content_type
|
360
|
+
encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
|
361
|
+
end
|
391
362
|
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
# XXX(sfc): attach namespaces to all nodes, even html?
|
398
|
-
tagname = if ns_uri.nil? || ns_uri == HTML_NAMESPACE || ns_uri == MATHML_NAMESPACE || ns_uri == SVG_NAMESPACE
|
399
|
-
current_node.name
|
400
|
-
else
|
401
|
-
"#{ns.prefix}:#{current_node.name}"
|
402
|
-
end
|
403
|
-
io << "<" << tagname
|
404
|
-
current_node.attribute_nodes.each do |attr|
|
405
|
-
attr_ns = attr.namespace
|
406
|
-
if attr_ns.nil?
|
407
|
-
attr_name = attr.name
|
408
|
-
else
|
409
|
-
ns_uri = attr_ns.href
|
410
|
-
attr_name = if ns_uri == XML_NAMESPACE
|
411
|
-
"xml:" + attr.name.sub(/^[^:]*:/, "")
|
412
|
-
elsif ns_uri == XMLNS_NAMESPACE && attr.name.sub(/^[^:]*:/, "") == "xmlns"
|
413
|
-
"xmlns"
|
414
|
-
elsif ns_uri == XMLNS_NAMESPACE
|
415
|
-
"xmlns:" + attr.name.sub(/^[^:]*:/, "")
|
416
|
-
elsif ns_uri == XLINK_NAMESPACE
|
417
|
-
"xlink:" + attr.name.sub(/^[^:]*:/, "")
|
418
|
-
else
|
419
|
-
"#{attr_ns.prefix}:#{attr.name}"
|
363
|
+
# look for a charset in a meta tag in the first 1024 bytes
|
364
|
+
unless encoding
|
365
|
+
data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, "")
|
366
|
+
data.scan(/<meta.*?>/im).each do |meta|
|
367
|
+
encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
|
420
368
|
end
|
421
369
|
end
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
370
|
+
|
371
|
+
# if all else fails, default to the official default encoding for HTML
|
372
|
+
encoding ||= Encoding::ISO_8859_1
|
373
|
+
|
374
|
+
# change the encoding to match the detected or inferred encoding
|
375
|
+
body = body.dup
|
376
|
+
begin
|
377
|
+
body.force_encoding(encoding)
|
378
|
+
rescue ArgumentError
|
379
|
+
body.force_encoding(Encoding::ISO_8859_1)
|
430
380
|
end
|
431
|
-
io << "</" << tagname << ">"
|
432
|
-
end
|
433
|
-
when XML::Node::TEXT_NODE
|
434
|
-
parent = current_node.parent
|
435
|
-
io << if parent.element? && ["style", "script", "xmp", "iframe", "noembed", "noframes", "plaintext", "noscript"].include?(parent.name)
|
436
|
-
current_node.content
|
437
|
-
else
|
438
|
-
escape_text(current_node.content, encoding, false)
|
439
|
-
end
|
440
|
-
when XML::Node::CDATA_SECTION_NODE
|
441
|
-
io << "<![CDATA[" << current_node.content << "]]>"
|
442
|
-
when XML::Node::COMMENT_NODE
|
443
|
-
io << "<!--" << current_node.content << "-->"
|
444
|
-
when XML::Node::PI_NODE
|
445
|
-
io << "<?" << current_node.content << ">"
|
446
|
-
when XML::Node::DOCUMENT_TYPE_NODE, XML::Node::DTD_NODE
|
447
|
-
io << "<!DOCTYPE " << current_node.name << ">"
|
448
|
-
when XML::Node::HTML_DOCUMENT_NODE, XML::Node::DOCUMENT_FRAG_NODE
|
449
|
-
current_node.children.each do |child|
|
450
|
-
serialize_node_internal(child, io, encoding, options)
|
451
381
|
end
|
452
|
-
else
|
453
|
-
raise "Unexpected node '#{current_node.name}' of type #{current_node.type}"
|
454
|
-
end
|
455
|
-
end
|
456
382
|
|
457
|
-
|
458
|
-
text = if attribute_mode
|
459
|
-
text.gsub(/[&\u00a0"]/,
|
460
|
-
"&" => "&", "\u00a0" => " ", '"' => """)
|
461
|
-
else
|
462
|
-
text.gsub(/[&\u00a0<>]/,
|
463
|
-
"&" => "&", "\u00a0" => " ", "<" => "<", ">" => ">")
|
383
|
+
body.encode(Encoding::UTF_8)
|
464
384
|
end
|
465
|
-
# Not part of the standard
|
466
|
-
text.encode(encoding, fallback: lambda { |c| "&\#x#{c.ord.to_s(16)};" })
|
467
|
-
end
|
468
|
-
|
469
|
-
def self.prepend_newline?(node)
|
470
|
-
return false unless ["pre", "textarea", "listing"].include?(node.name) && !node.children.empty?
|
471
|
-
|
472
|
-
first_child = node.children[0]
|
473
|
-
first_child.text? && first_child.content.start_with?("\n")
|
474
385
|
end
|
475
386
|
end
|
476
387
|
end
|
@@ -1,21 +1,3 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
|
4
|
-
# unless defined?(JRuby::Rack::VERSION) || defined?(AppEngine::ApiProxy)
|
5
|
-
#
|
6
|
-
# However, simply cutting defined?(JRuby::Rack::VERSION) off resulted in
|
7
|
-
# an unable-to-load-nokogiri problem. Thus, now, Nokogiri checks the presence
|
8
|
-
# of appengine-rack.jar in $LOAD_PATH. If Nokogiri is on GAE, Nokogiri
|
9
|
-
# should skip loading xml jars. This is because those are in WEB-INF/lib and
|
10
|
-
# already set in the classpath.
|
11
|
-
unless $LOAD_PATH.to_s.include?("appengine-rack")
|
12
|
-
require "stringio"
|
13
|
-
require "isorelax.jar"
|
14
|
-
require "jing.jar"
|
15
|
-
require "nekohtml.jar"
|
16
|
-
require "nekodtd.jar"
|
17
|
-
require "xercesImpl.jar"
|
18
|
-
require "serializer.jar"
|
19
|
-
require "xalan.jar"
|
20
|
-
require "xml-apis.jar"
|
21
|
-
end
|
3
|
+
require_relative "nokogiri_jars"
|