nokogiri 1.12.5 → 1.13.8
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +9 -7
- data/bin/nokogiri +63 -50
- data/dependencies.yml +13 -64
- data/ext/nokogiri/extconf.rb +66 -44
- data/ext/nokogiri/gumbo.c +1 -1
- data/ext/nokogiri/html4_sax_parser_context.c +2 -3
- data/ext/nokogiri/nokogiri.h +8 -0
- data/ext/nokogiri/xml_attr.c +2 -2
- data/ext/nokogiri/xml_attribute_decl.c +3 -3
- data/ext/nokogiri/xml_cdata.c +1 -1
- data/ext/nokogiri/xml_document.c +36 -36
- data/ext/nokogiri/xml_document_fragment.c +0 -2
- data/ext/nokogiri/xml_dtd.c +10 -10
- data/ext/nokogiri/xml_element_decl.c +3 -3
- data/ext/nokogiri/xml_encoding_handler.c +25 -11
- data/ext/nokogiri/xml_entity_decl.c +5 -5
- data/ext/nokogiri/xml_node.c +707 -381
- data/ext/nokogiri/xml_node_set.c +4 -4
- data/ext/nokogiri/xml_reader.c +88 -11
- data/ext/nokogiri/xml_sax_parser_context.c +10 -3
- data/ext/nokogiri/xml_schema.c +3 -3
- data/ext/nokogiri/xml_text.c +1 -1
- data/ext/nokogiri/xml_xpath_context.c +73 -50
- data/ext/nokogiri/xslt_stylesheet.c +107 -9
- data/gumbo-parser/src/parser.c +0 -11
- data/lib/nokogiri/class_resolver.rb +67 -0
- data/lib/nokogiri/css/node.rb +9 -8
- data/lib/nokogiri/css/parser.rb +360 -341
- data/lib/nokogiri/css/parser.y +249 -244
- data/lib/nokogiri/css/parser_extras.rb +22 -20
- data/lib/nokogiri/css/syntax_error.rb +1 -0
- data/lib/nokogiri/css/tokenizer.rb +4 -3
- data/lib/nokogiri/css/tokenizer.rex +3 -2
- data/lib/nokogiri/css/xpath_visitor.rb +179 -82
- data/lib/nokogiri/css.rb +38 -6
- data/lib/nokogiri/decorators/slop.rb +8 -7
- data/lib/nokogiri/extension.rb +1 -1
- data/lib/nokogiri/gumbo.rb +1 -0
- data/lib/nokogiri/html.rb +16 -10
- data/lib/nokogiri/html4/builder.rb +1 -0
- data/lib/nokogiri/html4/document.rb +88 -77
- data/lib/nokogiri/html4/document_fragment.rb +11 -7
- data/lib/nokogiri/html4/element_description.rb +1 -0
- data/lib/nokogiri/html4/element_description_defaults.rb +426 -520
- data/lib/nokogiri/html4/entity_lookup.rb +2 -1
- data/lib/nokogiri/html4/sax/parser.rb +5 -2
- data/lib/nokogiri/html4/sax/parser_context.rb +1 -0
- data/lib/nokogiri/html4/sax/push_parser.rb +7 -7
- data/lib/nokogiri/html4.rb +11 -5
- data/lib/nokogiri/html5/document.rb +27 -10
- data/lib/nokogiri/html5/document_fragment.rb +5 -2
- data/lib/nokogiri/html5/node.rb +10 -3
- data/lib/nokogiri/html5.rb +69 -64
- data/lib/nokogiri/jruby/dependencies.rb +10 -9
- data/lib/nokogiri/syntax_error.rb +1 -0
- data/lib/nokogiri/version/constant.rb +2 -1
- data/lib/nokogiri/version/info.rb +20 -13
- data/lib/nokogiri/version.rb +1 -0
- data/lib/nokogiri/xml/attr.rb +5 -3
- data/lib/nokogiri/xml/attribute_decl.rb +2 -1
- data/lib/nokogiri/xml/builder.rb +34 -32
- data/lib/nokogiri/xml/cdata.rb +2 -1
- data/lib/nokogiri/xml/character_data.rb +1 -0
- data/lib/nokogiri/xml/document.rb +144 -103
- data/lib/nokogiri/xml/document_fragment.rb +41 -38
- data/lib/nokogiri/xml/dtd.rb +3 -2
- data/lib/nokogiri/xml/element_content.rb +1 -0
- data/lib/nokogiri/xml/element_decl.rb +2 -1
- data/lib/nokogiri/xml/entity_decl.rb +3 -2
- data/lib/nokogiri/xml/entity_reference.rb +1 -0
- data/lib/nokogiri/xml/namespace.rb +2 -0
- data/lib/nokogiri/xml/node/save_options.rb +8 -4
- data/lib/nokogiri/xml/node.rb +521 -351
- data/lib/nokogiri/xml/node_set.rb +50 -54
- data/lib/nokogiri/xml/notation.rb +12 -0
- data/lib/nokogiri/xml/parse_options.rb +12 -7
- data/lib/nokogiri/xml/pp/character_data.rb +8 -6
- data/lib/nokogiri/xml/pp/node.rb +24 -26
- data/lib/nokogiri/xml/pp.rb +1 -0
- data/lib/nokogiri/xml/processing_instruction.rb +2 -1
- data/lib/nokogiri/xml/reader.rb +20 -24
- data/lib/nokogiri/xml/relax_ng.rb +1 -0
- data/lib/nokogiri/xml/sax/document.rb +20 -19
- data/lib/nokogiri/xml/sax/parser.rb +37 -34
- data/lib/nokogiri/xml/sax/parser_context.rb +7 -3
- data/lib/nokogiri/xml/sax/push_parser.rb +5 -5
- data/lib/nokogiri/xml/sax.rb +1 -0
- data/lib/nokogiri/xml/schema.rb +7 -6
- data/lib/nokogiri/xml/searchable.rb +93 -62
- data/lib/nokogiri/xml/syntax_error.rb +5 -4
- data/lib/nokogiri/xml/text.rb +1 -0
- data/lib/nokogiri/xml/xpath/syntax_error.rb +2 -1
- data/lib/nokogiri/xml/xpath.rb +12 -0
- data/lib/nokogiri/xml/xpath_context.rb +2 -3
- data/lib/nokogiri/xml.rb +4 -3
- data/lib/nokogiri/xslt/stylesheet.rb +1 -0
- data/lib/nokogiri/xslt.rb +21 -13
- data/lib/nokogiri.rb +19 -16
- data/lib/xsd/xmlparser/nokogiri.rb +25 -24
- data/patches/libxml2/0004-use-glibc-strlen.patch +3 -3
- data/patches/libxml2/0006-update-automake-files-for-arm64.patch +2443 -1914
- data/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch +61 -0
- data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
- data/patches/libxslt/0001-update-automake-files-for-arm64.patch +2445 -1919
- data/ports/archives/libxml2-2.9.14.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.35.tar.xz +0 -0
- metadata +104 -32
- data/patches/libxml2/0007-Fix-XPath-recursion-limit.patch +0 -31
- data/patches/libxslt/0002-Fix-xml2-config-check-in-configure-script.patch +0 -19
- data/ports/archives/libxml2-2.9.12.tar.gz +0 -0
- data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
@@ -1,4 +1,5 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module Nokogiri
|
3
4
|
module HTML4
|
4
5
|
class EntityDescription < Struct.new(:value, :name, :description); end
|
@@ -6,7 +7,7 @@ module Nokogiri
|
|
6
7
|
class EntityLookup
|
7
8
|
###
|
8
9
|
# Look up entity with +name+
|
9
|
-
def []
|
10
|
+
def [](name)
|
10
11
|
(val = get(name)) && val.value
|
11
12
|
end
|
12
13
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module Nokogiri
|
3
4
|
module HTML4
|
4
5
|
###
|
@@ -27,8 +28,9 @@ module Nokogiri
|
|
27
28
|
###
|
28
29
|
# Parse html stored in +data+ using +encoding+
|
29
30
|
def parse_memory(data, encoding = "UTF-8")
|
30
|
-
raise
|
31
|
-
return
|
31
|
+
raise TypeError unless String === data
|
32
|
+
return if data.empty?
|
33
|
+
|
32
34
|
ctx = ParserContext.memory(data, encoding)
|
33
35
|
yield ctx if block_given?
|
34
36
|
ctx.parse_with(self)
|
@@ -50,6 +52,7 @@ module Nokogiri
|
|
50
52
|
raise ArgumentError unless filename
|
51
53
|
raise Errno::ENOENT unless File.exist?(filename)
|
52
54
|
raise Errno::EISDIR if File.directory?(filename)
|
55
|
+
|
53
56
|
ctx = ParserContext.file(filename, encoding)
|
54
57
|
yield ctx if block_given?
|
55
58
|
ctx.parse_with(self)
|
@@ -1,14 +1,14 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module Nokogiri
|
3
4
|
module HTML4
|
4
5
|
module SAX
|
5
6
|
class PushParser
|
6
|
-
|
7
7
|
# The Nokogiri::HTML4::SAX::Document on which the PushParser will be
|
8
8
|
# operating
|
9
9
|
attr_accessor :document
|
10
|
-
|
11
|
-
def initialize(doc = HTML4::SAX::Document.new, file_name = nil, encoding =
|
10
|
+
|
11
|
+
def initialize(doc = HTML4::SAX::Document.new, file_name = nil, encoding = "UTF-8")
|
12
12
|
@document = doc
|
13
13
|
@encoding = encoding
|
14
14
|
@sax_parser = HTML4::SAX::Parser.new(doc, @encoding)
|
@@ -16,20 +16,20 @@ module Nokogiri
|
|
16
16
|
## Create our push parser context
|
17
17
|
initialize_native(@sax_parser, file_name, encoding)
|
18
18
|
end
|
19
|
-
|
19
|
+
|
20
20
|
###
|
21
21
|
# Write a +chunk+ of HTML to the PushParser. Any callback methods
|
22
22
|
# that can be called will be called immediately.
|
23
|
-
def write
|
23
|
+
def write(chunk, last_chunk = false)
|
24
24
|
native_write(chunk, last_chunk)
|
25
25
|
end
|
26
|
-
|
26
|
+
alias_method :<<, :write
|
27
27
|
|
28
28
|
###
|
29
29
|
# Finish the parsing. This method is only necessary for
|
30
30
|
# Nokogiri::HTML4::SAX::Document#end_document to be called.
|
31
31
|
def finish
|
32
|
-
write
|
32
|
+
write("", true)
|
33
33
|
end
|
34
34
|
end
|
35
35
|
end
|
data/lib/nokogiri/html4.rb
CHANGED
@@ -1,15 +1,21 @@
|
|
1
|
+
# coding: utf-8
|
1
2
|
# frozen_string_literal: true
|
3
|
+
|
2
4
|
module Nokogiri
|
3
5
|
class << self
|
4
|
-
|
6
|
+
# :call-seq:
|
7
|
+
# HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block) → Nokogiri::HTML4::Document
|
8
|
+
#
|
5
9
|
# Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
|
6
10
|
def HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
|
7
11
|
Nokogiri::HTML4::Document.parse(input, url, encoding, options, &block)
|
8
12
|
end
|
9
13
|
end
|
10
14
|
|
11
|
-
#
|
12
|
-
#
|
15
|
+
# Since v1.12.0
|
16
|
+
#
|
17
|
+
# 💡 Before v1.12.0, Nokogiri::HTML4 did not exist, and Nokogiri::HTML was the module/namespace
|
18
|
+
# for parsing HTML.
|
13
19
|
module HTML4
|
14
20
|
class << self
|
15
21
|
###
|
@@ -20,8 +26,8 @@ module Nokogiri
|
|
20
26
|
|
21
27
|
####
|
22
28
|
# Parse a fragment from +string+ in to a NodeSet.
|
23
|
-
def fragment(string, encoding = nil)
|
24
|
-
HTML4::DocumentFragment.parse(string, encoding)
|
29
|
+
def fragment(string, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
|
30
|
+
HTML4::DocumentFragment.parse(string, encoding, options, &block)
|
25
31
|
end
|
26
32
|
end
|
27
33
|
|
@@ -1,4 +1,6 @@
|
|
1
|
+
# coding: utf-8
|
1
2
|
# frozen_string_literal: true
|
3
|
+
|
2
4
|
#
|
3
5
|
# Copyright 2013-2021 Sam Ruby, Stephen Checkoway
|
4
6
|
#
|
@@ -19,14 +21,15 @@ require_relative "../html4/document"
|
|
19
21
|
|
20
22
|
module Nokogiri
|
21
23
|
module HTML5
|
22
|
-
#
|
23
|
-
#
|
24
|
+
# Since v1.12.0
|
25
|
+
#
|
26
|
+
# 💡 HTML5 functionality is not available when running JRuby.
|
24
27
|
class Document < Nokogiri::HTML4::Document
|
25
28
|
def self.parse(string_or_io, url = nil, encoding = nil, **options, &block)
|
26
|
-
yield options if
|
27
|
-
|
29
|
+
yield options if block
|
30
|
+
string_or_io = "" unless string_or_io
|
28
31
|
|
29
|
-
if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name !=
|
32
|
+
if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != "ASCII-8BIT"
|
30
33
|
encoding ||= string_or_io.encoding.name
|
31
34
|
end
|
32
35
|
|
@@ -34,23 +37,26 @@ module Nokogiri
|
|
34
37
|
url ||= string_or_io.path
|
35
38
|
end
|
36
39
|
unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
|
37
|
-
raise ArgumentError
|
40
|
+
raise ArgumentError, "not a string or IO object"
|
38
41
|
end
|
42
|
+
|
39
43
|
do_parse(string_or_io, url, encoding, options)
|
40
44
|
end
|
41
45
|
|
42
46
|
def self.read_io(io, url = nil, encoding = nil, **options)
|
43
|
-
raise ArgumentError
|
47
|
+
raise ArgumentError, "io object doesn't respond to :read" unless io.respond_to?(:read)
|
48
|
+
|
44
49
|
do_parse(io, url, encoding, options)
|
45
50
|
end
|
46
51
|
|
47
52
|
def self.read_memory(string, url = nil, encoding = nil, **options)
|
48
|
-
raise ArgumentError
|
53
|
+
raise ArgumentError, "string object doesn't respond to :to_str" unless string.respond_to?(:to_str)
|
54
|
+
|
49
55
|
do_parse(string, url, encoding, options)
|
50
56
|
end
|
51
57
|
|
52
58
|
def fragment(tags = nil)
|
53
|
-
DocumentFragment.new(self, tags,
|
59
|
+
DocumentFragment.new(self, tags, root)
|
54
60
|
end
|
55
61
|
|
56
62
|
def to_xml(options = {}, &block)
|
@@ -59,14 +65,25 @@ module Nokogiri
|
|
59
65
|
XML::Node.instance_method(:to_xml).bind(self).call(options, &block)
|
60
66
|
end
|
61
67
|
|
68
|
+
# :call-seq:
|
69
|
+
# xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig
|
70
|
+
#
|
71
|
+
# [Returns] The document type which determines CSS-to-XPath translation.
|
72
|
+
#
|
73
|
+
# See XPathVisitor for more information.
|
74
|
+
def xpath_doctype
|
75
|
+
Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML5
|
76
|
+
end
|
77
|
+
|
62
78
|
private
|
79
|
+
|
63
80
|
def self.do_parse(string_or_io, url, encoding, options)
|
64
81
|
string = HTML5.read_and_encode(string_or_io, encoding)
|
65
82
|
max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
|
66
83
|
max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
|
67
84
|
max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
|
68
85
|
doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth)
|
69
|
-
doc.encoding =
|
86
|
+
doc.encoding = "UTF-8"
|
70
87
|
doc
|
71
88
|
end
|
72
89
|
end
|
@@ -1,4 +1,6 @@
|
|
1
|
+
# coding: utf-8
|
1
2
|
# frozen_string_literal: true
|
3
|
+
|
2
4
|
#
|
3
5
|
# Copyright 2013-2021 Sam Ruby, Stephen Checkoway
|
4
6
|
#
|
@@ -19,8 +21,9 @@ require_relative "../html4/document_fragment"
|
|
19
21
|
|
20
22
|
module Nokogiri
|
21
23
|
module HTML5
|
22
|
-
#
|
23
|
-
#
|
24
|
+
# Since v1.12.0
|
25
|
+
#
|
26
|
+
# 💡 HTML5 functionality is not available when running JRuby.
|
24
27
|
class DocumentFragment < Nokogiri::HTML4::DocumentFragment
|
25
28
|
attr_accessor :document
|
26
29
|
attr_accessor :errors
|
data/lib/nokogiri/html5/node.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
+
# coding: utf-8
|
1
2
|
# frozen_string_literal: true
|
3
|
+
|
2
4
|
#
|
3
5
|
# Copyright 2013-2021 Sam Ruby, Stephen Checkoway
|
4
6
|
#
|
@@ -19,18 +21,21 @@ require_relative "../xml/node"
|
|
19
21
|
|
20
22
|
module Nokogiri
|
21
23
|
module HTML5
|
22
|
-
#
|
23
|
-
#
|
24
|
+
# Since v1.12.0
|
25
|
+
#
|
26
|
+
# 💡 HTML5 functionality is not available when running JRuby.
|
24
27
|
module Node
|
25
28
|
def inner_html(options = {})
|
26
29
|
return super(options) unless document.is_a?(HTML5::Document)
|
27
|
-
|
30
|
+
|
31
|
+
result = options[:preserve_newline] && HTML5.prepend_newline?(self) ? +"\n" : +""
|
28
32
|
result << children.map { |child| child.to_html(options) }.join
|
29
33
|
result
|
30
34
|
end
|
31
35
|
|
32
36
|
def write_to(io, *options)
|
33
37
|
return super(io, *options) unless document.is_a?(HTML5::Document)
|
38
|
+
|
34
39
|
options = options.first.is_a?(Hash) ? options.shift : {}
|
35
40
|
encoding = options[:encoding] || options[0]
|
36
41
|
if Nokogiri.jruby?
|
@@ -61,6 +66,7 @@ module Nokogiri
|
|
61
66
|
|
62
67
|
def fragment(tags)
|
63
68
|
return super(tags) unless document.is_a?(HTML5::Document)
|
69
|
+
|
64
70
|
DocumentFragment.new(document, tags, self)
|
65
71
|
end
|
66
72
|
|
@@ -73,6 +79,7 @@ module Nokogiri
|
|
73
79
|
# actually create the xml namespace if it doesn't exist already.
|
74
80
|
def add_child_node_and_reparent_attrs(node)
|
75
81
|
return super(node) unless document.is_a?(HTML5::Document)
|
82
|
+
|
76
83
|
# I'm not sure what this method is supposed to do. Reparenting
|
77
84
|
# namespaces is handled by libxml2, including child namespaces which
|
78
85
|
# this method wouldn't handle.
|
data/lib/nokogiri/html5.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
# frozen_string_literal: true
|
3
|
+
|
3
4
|
#
|
4
5
|
# Copyright 2013-2021 Sam Ruby, Stephen Checkoway
|
5
6
|
#
|
@@ -16,13 +17,15 @@
|
|
16
17
|
# limitations under the License.
|
17
18
|
#
|
18
19
|
|
19
|
-
require_relative
|
20
|
-
require_relative
|
21
|
-
require_relative
|
20
|
+
require_relative "html5/document"
|
21
|
+
require_relative "html5/document_fragment"
|
22
|
+
require_relative "html5/node"
|
22
23
|
|
23
24
|
module Nokogiri
|
24
|
-
#
|
25
|
-
#
|
25
|
+
# Since v1.12.0
|
26
|
+
#
|
27
|
+
# ⚠ HTML5 functionality is not available when running JRuby.
|
28
|
+
#
|
26
29
|
# Parse an HTML5 document. Convenience method for {Nokogiri::HTML5::Document.parse}
|
27
30
|
def self.HTML5(input, url = nil, encoding = nil, **options, &block)
|
28
31
|
Nokogiri::HTML5::Document.parse(input, url, encoding, **options, &block)
|
@@ -30,6 +33,8 @@ module Nokogiri
|
|
30
33
|
|
31
34
|
# == Usage
|
32
35
|
#
|
36
|
+
# ⚠ HTML5 functionality is not available when running JRuby.
|
37
|
+
#
|
33
38
|
# Parse an HTML5 document:
|
34
39
|
#
|
35
40
|
# doc = Nokogiri.HTML5(string)
|
@@ -220,16 +225,15 @@ module Nokogiri
|
|
220
225
|
# * Instead of returning +unknown+ as the element name for unknown tags, the
|
221
226
|
# original tag name is returned verbatim.
|
222
227
|
#
|
223
|
-
#
|
224
|
-
# @note HTML5 functionality is not available when running JRuby.
|
228
|
+
# Since v1.12.0
|
225
229
|
module HTML5
|
226
230
|
# HTML uses the XHTML namespace.
|
227
|
-
HTML_NAMESPACE =
|
228
|
-
MATHML_NAMESPACE =
|
229
|
-
SVG_NAMESPACE =
|
230
|
-
XLINK_NAMESPACE =
|
231
|
-
XML_NAMESPACE =
|
232
|
-
XMLNS_NAMESPACE =
|
231
|
+
HTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
|
232
|
+
MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML"
|
233
|
+
SVG_NAMESPACE = "http://www.w3.org/2000/svg"
|
234
|
+
XLINK_NAMESPACE = "http://www.w3.org/1999/xlink"
|
235
|
+
XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
|
236
|
+
XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/"
|
233
237
|
|
234
238
|
# Parse an HTML 5 document. Convenience method for {Nokogiri::HTML5::Document.parse}
|
235
239
|
def self.parse(string, url = nil, encoding = nil, **options, &block)
|
@@ -249,34 +253,35 @@ module Nokogiri
|
|
249
253
|
# special option is considered a header. Special options include:
|
250
254
|
# * :follow_limit => number of redirects which are followed
|
251
255
|
# * :basic_auth => [username, password]
|
252
|
-
def self.get(uri, options={})
|
256
|
+
def self.get(uri, options = {})
|
257
|
+
# TODO: deprecate
|
253
258
|
warn("Nokogiri::HTML5.get is deprecated and will be removed in a future version of Nokogiri.",
|
254
|
-
|
259
|
+
uplevel: 1, category: :deprecated)
|
255
260
|
get_impl(uri, options)
|
256
261
|
end
|
257
262
|
|
258
263
|
private
|
259
264
|
|
260
|
-
def self.get_impl(uri, options={})
|
265
|
+
def self.get_impl(uri, options = {})
|
261
266
|
headers = options.clone
|
262
|
-
headers = {:
|
263
|
-
limit=headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
|
267
|
+
headers = { follow_limit: headers } if Numeric === headers # deprecated
|
268
|
+
limit = headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
|
264
269
|
|
265
|
-
require
|
270
|
+
require "net/http"
|
266
271
|
uri = URI(uri) unless URI === uri
|
267
272
|
|
268
273
|
http = Net::HTTP.new(uri.host, uri.port)
|
269
274
|
|
270
275
|
# TLS / SSL support
|
271
|
-
http.use_ssl = true if uri.scheme ==
|
276
|
+
http.use_ssl = true if uri.scheme == "https"
|
272
277
|
|
273
278
|
# Pass through Net::HTTP override values, which currently include:
|
274
279
|
# :ca_file, :ca_path, :cert, :cert_store, :ciphers,
|
275
280
|
# :close_on_empty_response, :continue_timeout, :key, :open_timeout,
|
276
281
|
# :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
|
277
282
|
# :verify_callback, :verify_depth, :verify_mode
|
278
|
-
options.each do |key,
|
279
|
-
http.send
|
283
|
+
options.each do |key, _value|
|
284
|
+
http.send("#{key}=", headers.delete(key)) if http.respond_to?("#{key}=")
|
280
285
|
end
|
281
286
|
|
282
287
|
request = Net::HTTP::Get.new(uri.request_uri)
|
@@ -284,23 +289,23 @@ module Nokogiri
|
|
284
289
|
# basic authentication
|
285
290
|
auth = headers.delete(:basic_auth)
|
286
291
|
auth ||= [uri.user, uri.password] if uri.user && uri.password
|
287
|
-
request.basic_auth
|
292
|
+
request.basic_auth(auth.first, auth.last) if auth
|
288
293
|
|
289
294
|
# remaining options are treated as headers
|
290
|
-
headers.each {|key, value| request[key.to_s] = value.to_s}
|
295
|
+
headers.each { |key, value| request[key.to_s] = value.to_s }
|
291
296
|
|
292
297
|
response = http.request(request)
|
293
298
|
|
294
299
|
case response
|
295
300
|
when Net::HTTPSuccess
|
296
|
-
doc = parse(reencode(response.body, response[
|
297
|
-
doc.instance_variable_set(
|
301
|
+
doc = parse(reencode(response.body, response["content-type"]), options)
|
302
|
+
doc.instance_variable_set("@response", response)
|
298
303
|
doc.class.send(:attr_reader, :response)
|
299
304
|
doc
|
300
305
|
when Net::HTTPRedirection
|
301
306
|
response.value if limit <= 1
|
302
|
-
location = URI.join(uri, response[
|
303
|
-
get_impl(location, options.merge(:
|
307
|
+
location = URI.join(uri, response["location"])
|
308
|
+
get_impl(location, options.merge(follow_limit: limit - 1))
|
304
309
|
else
|
305
310
|
response.value
|
306
311
|
end
|
@@ -309,10 +314,10 @@ module Nokogiri
|
|
309
314
|
def self.read_and_encode(string, encoding)
|
310
315
|
# Read the string with the given encoding.
|
311
316
|
if string.respond_to?(:read)
|
312
|
-
if encoding.nil?
|
313
|
-
string
|
317
|
+
string = if encoding.nil?
|
318
|
+
string.read
|
314
319
|
else
|
315
|
-
string
|
320
|
+
string.read(encoding: encoding)
|
316
321
|
end
|
317
322
|
else
|
318
323
|
# Otherwise the string has the given encoding.
|
@@ -342,7 +347,7 @@ module Nokogiri
|
|
342
347
|
# http://bugs.ruby-lang.org/issues/2567
|
343
348
|
# http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
|
344
349
|
#
|
345
|
-
def self.reencode(body, content_type=nil)
|
350
|
+
def self.reencode(body, content_type = nil)
|
346
351
|
if body.encoding == Encoding::ASCII_8BIT
|
347
352
|
encoding = nil
|
348
353
|
|
@@ -362,8 +367,8 @@ module Nokogiri
|
|
362
367
|
end
|
363
368
|
|
364
369
|
# look for a charset in a meta tag in the first 1024 bytes
|
365
|
-
|
366
|
-
data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m,
|
370
|
+
unless encoding
|
371
|
+
data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, "")
|
367
372
|
data.scan(/<meta.*?>/m).each do |meta|
|
368
373
|
encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
|
369
374
|
end
|
@@ -390,57 +395,56 @@ module Nokogiri
|
|
390
395
|
ns = current_node.namespace
|
391
396
|
ns_uri = ns.nil? ? nil : ns.href
|
392
397
|
# XXX(sfc): attach namespaces to all nodes, even html?
|
393
|
-
if ns_uri.nil? || ns_uri == HTML_NAMESPACE || ns_uri == MATHML_NAMESPACE || ns_uri == SVG_NAMESPACE
|
394
|
-
|
398
|
+
tagname = if ns_uri.nil? || ns_uri == HTML_NAMESPACE || ns_uri == MATHML_NAMESPACE || ns_uri == SVG_NAMESPACE
|
399
|
+
current_node.name
|
395
400
|
else
|
396
|
-
|
401
|
+
"#{ns.prefix}:#{current_node.name}"
|
397
402
|
end
|
398
|
-
io <<
|
403
|
+
io << "<" << tagname
|
399
404
|
current_node.attribute_nodes.each do |attr|
|
400
405
|
attr_ns = attr.namespace
|
401
406
|
if attr_ns.nil?
|
402
407
|
attr_name = attr.name
|
403
408
|
else
|
404
409
|
ns_uri = attr_ns.href
|
405
|
-
if ns_uri == XML_NAMESPACE
|
406
|
-
|
407
|
-
elsif ns_uri == XMLNS_NAMESPACE && attr.name.sub(/^[^:]*:/,
|
408
|
-
|
410
|
+
attr_name = if ns_uri == XML_NAMESPACE
|
411
|
+
"xml:" + attr.name.sub(/^[^:]*:/, "")
|
412
|
+
elsif ns_uri == XMLNS_NAMESPACE && attr.name.sub(/^[^:]*:/, "") == "xmlns"
|
413
|
+
"xmlns"
|
409
414
|
elsif ns_uri == XMLNS_NAMESPACE
|
410
|
-
|
415
|
+
"xmlns:" + attr.name.sub(/^[^:]*:/, "")
|
411
416
|
elsif ns_uri == XLINK_NAMESPACE
|
412
|
-
|
417
|
+
"xlink:" + attr.name.sub(/^[^:]*:/, "")
|
413
418
|
else
|
414
|
-
|
419
|
+
"#{attr_ns.prefix}:#{attr.name}"
|
415
420
|
end
|
416
421
|
end
|
417
|
-
io <<
|
422
|
+
io << " " << attr_name << '="' << escape_text(attr.content, encoding, true) << '"'
|
418
423
|
end
|
419
|
-
io <<
|
420
|
-
|
421
|
-
link meta param source track wbr].include?(current_node.name)
|
424
|
+
io << ">"
|
425
|
+
unless ["area", "base", "basefont", "bgsound", "br", "col", "embed", "frame", "hr", "img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr"].include?(current_node.name)
|
422
426
|
io << "\n" if options[:preserve_newline] && prepend_newline?(current_node)
|
423
427
|
current_node.children.each do |child|
|
424
428
|
# XXX(sfc): Templates handled specially?
|
425
429
|
serialize_node_internal(child, io, encoding, options)
|
426
430
|
end
|
427
|
-
io <<
|
431
|
+
io << "</" << tagname << ">"
|
428
432
|
end
|
429
433
|
when XML::Node::TEXT_NODE
|
430
434
|
parent = current_node.parent
|
431
|
-
if parent.element? &&
|
432
|
-
|
435
|
+
io << if parent.element? && ["style", "script", "xmp", "iframe", "noembed", "noframes", "plaintext", "noscript"].include?(parent.name)
|
436
|
+
current_node.content
|
433
437
|
else
|
434
|
-
|
438
|
+
escape_text(current_node.content, encoding, false)
|
435
439
|
end
|
436
440
|
when XML::Node::CDATA_SECTION_NODE
|
437
|
-
io <<
|
441
|
+
io << "<![CDATA[" << current_node.content << "]]>"
|
438
442
|
when XML::Node::COMMENT_NODE
|
439
|
-
io <<
|
443
|
+
io << "<!--" << current_node.content << "-->"
|
440
444
|
when XML::Node::PI_NODE
|
441
|
-
io <<
|
445
|
+
io << "<?" << current_node.content << ">"
|
442
446
|
when XML::Node::DOCUMENT_TYPE_NODE, XML::Node::DTD_NODE
|
443
|
-
|
447
|
+
io << "<!DOCTYPE " << current_node.name << ">"
|
444
448
|
when XML::Node::HTML_DOCUMENT_NODE, XML::Node::DOCUMENT_FRAG_NODE
|
445
449
|
current_node.children.each do |child|
|
446
450
|
serialize_node_internal(child, io, encoding, options)
|
@@ -451,23 +455,24 @@ module Nokogiri
|
|
451
455
|
end
|
452
456
|
|
453
457
|
def self.escape_text(text, encoding, attribute_mode)
|
454
|
-
if attribute_mode
|
455
|
-
text
|
456
|
-
|
458
|
+
text = if attribute_mode
|
459
|
+
text.gsub(/[&\u00a0"]/,
|
460
|
+
"&" => "&amp;", "\u00a0" => "&nbsp;", '"' => "&quot;")
|
457
461
|
else
|
458
|
-
text
|
459
|
-
|
462
|
+
text.gsub(/[&\u00a0<>]/,
|
463
|
+
"&" => "&amp;", "\u00a0" => "&nbsp;", "<" => "&lt;", ">" => "&gt;")
|
460
464
|
end
|
461
465
|
# Not part of the standard
|
462
466
|
text.encode(encoding, fallback: lambda { |c| "&\#x#{c.ord.to_s(16)};" })
|
463
467
|
end
|
464
468
|
|
465
469
|
def self.prepend_newline?(node)
|
466
|
-
return false unless
|
470
|
+
return false unless ["pre", "textarea", "listing"].include?(node.name) && !node.children.empty?
|
471
|
+
|
467
472
|
first_child = node.children[0]
|
468
473
|
first_child.text? && first_child.content.start_with?("\n")
|
469
474
|
end
|
470
475
|
end
|
471
476
|
end
|
472
477
|
|
473
|
-
require_relative
|
478
|
+
require_relative "gumbo"
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
# The line below caused a problem on non-GAE rack environment.
|
3
4
|
# unless defined?(JRuby::Rack::VERSION) || defined?(AppEngine::ApiProxy)
|
4
5
|
#
|
@@ -8,13 +9,13 @@
|
|
8
9
|
# should skip loading xml jars. This is because those are in WEB-INF/lib and
|
9
10
|
# already set in the classpath.
|
10
11
|
unless $LOAD_PATH.to_s.include?("appengine-rack")
|
11
|
-
require
|
12
|
-
require
|
13
|
-
require
|
14
|
-
require
|
15
|
-
require
|
16
|
-
require
|
17
|
-
require
|
18
|
-
require
|
19
|
-
require
|
12
|
+
require "stringio"
|
13
|
+
require "isorelax.jar"
|
14
|
+
require "jing.jar"
|
15
|
+
require "nekohtml.jar"
|
16
|
+
require "nekodtd.jar"
|
17
|
+
require "xercesImpl.jar"
|
18
|
+
require "serializer.jar"
|
19
|
+
require "xalan.jar"
|
20
|
+
require "xml-apis.jar"
|
20
21
|
end
|