nokogiri 1.15.3 → 1.18.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +12 -17
- data/LICENSE-DEPENDENCIES.md +6 -6
- data/README.md +11 -5
- data/dependencies.yml +9 -8
- data/ext/nokogiri/extconf.rb +191 -154
- data/ext/nokogiri/gumbo.c +69 -53
- data/ext/nokogiri/html4_document.c +10 -4
- data/ext/nokogiri/html4_element_description.c +18 -18
- data/ext/nokogiri/html4_sax_parser.c +40 -0
- data/ext/nokogiri/html4_sax_parser_context.c +48 -58
- data/ext/nokogiri/html4_sax_push_parser.c +26 -25
- data/ext/nokogiri/libxml2_polyfill.c +114 -0
- data/ext/nokogiri/nokogiri.c +9 -2
- data/ext/nokogiri/nokogiri.h +25 -33
- data/ext/nokogiri/test_global_handlers.c +1 -1
- data/ext/nokogiri/xml_attr.c +1 -1
- data/ext/nokogiri/xml_cdata.c +3 -12
- data/ext/nokogiri/xml_comment.c +3 -8
- data/ext/nokogiri/xml_document.c +173 -158
- data/ext/nokogiri/xml_document_fragment.c +10 -25
- data/ext/nokogiri/xml_dtd.c +1 -1
- data/ext/nokogiri/xml_element_content.c +9 -9
- data/ext/nokogiri/xml_encoding_handler.c +4 -4
- data/ext/nokogiri/xml_namespace.c +6 -10
- data/ext/nokogiri/xml_node.c +142 -108
- data/ext/nokogiri/xml_node_set.c +46 -44
- data/ext/nokogiri/xml_reader.c +74 -100
- data/ext/nokogiri/xml_relax_ng.c +35 -56
- data/ext/nokogiri/xml_sax_parser.c +156 -88
- data/ext/nokogiri/xml_sax_parser_context.c +220 -128
- data/ext/nokogiri/xml_sax_push_parser.c +69 -50
- data/ext/nokogiri/xml_schema.c +51 -87
- data/ext/nokogiri/xml_syntax_error.c +19 -11
- data/ext/nokogiri/xml_text.c +3 -6
- data/ext/nokogiri/xml_xpath_context.c +104 -104
- data/ext/nokogiri/xslt_stylesheet.c +16 -11
- data/gumbo-parser/Makefile +18 -0
- data/gumbo-parser/src/ascii.c +2 -2
- data/gumbo-parser/src/error.c +76 -48
- data/gumbo-parser/src/error.h +5 -1
- data/gumbo-parser/src/nokogiri_gumbo.h +11 -2
- data/gumbo-parser/src/parser.c +66 -25
- data/gumbo-parser/src/tokenizer.c +7 -6
- data/lib/nokogiri/class_resolver.rb +1 -1
- data/lib/nokogiri/css/node.rb +6 -2
- data/lib/nokogiri/css/parser.rb +6 -4
- data/lib/nokogiri/css/parser.y +2 -2
- data/lib/nokogiri/css/parser_extras.rb +6 -66
- data/lib/nokogiri/css/selector_cache.rb +38 -0
- data/lib/nokogiri/css/tokenizer.rb +4 -4
- data/lib/nokogiri/css/tokenizer.rex +9 -8
- data/lib/nokogiri/css/xpath_visitor.rb +44 -27
- data/lib/nokogiri/css.rb +86 -20
- data/lib/nokogiri/decorators/slop.rb +3 -5
- data/lib/nokogiri/encoding_handler.rb +2 -2
- data/lib/nokogiri/html4/document.rb +45 -24
- data/lib/nokogiri/html4/document_fragment.rb +124 -12
- data/lib/nokogiri/html4/encoding_reader.rb +2 -2
- data/lib/nokogiri/html4/sax/parser.rb +23 -38
- data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
- data/lib/nokogiri/html4.rb +9 -14
- data/lib/nokogiri/html5/builder.rb +40 -0
- data/lib/nokogiri/html5/document.rb +61 -30
- data/lib/nokogiri/html5/document_fragment.rb +130 -20
- data/lib/nokogiri/html5/node.rb +4 -4
- data/lib/nokogiri/html5.rb +114 -138
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/version/info.rb +6 -5
- data/lib/nokogiri/xml/attr.rb +2 -2
- data/lib/nokogiri/xml/builder.rb +8 -1
- data/lib/nokogiri/xml/document.rb +74 -31
- data/lib/nokogiri/xml/document_fragment.rb +86 -15
- data/lib/nokogiri/xml/namespace.rb +1 -2
- data/lib/nokogiri/xml/node.rb +113 -35
- data/lib/nokogiri/xml/node_set.rb +12 -10
- data/lib/nokogiri/xml/parse_options.rb +1 -1
- data/lib/nokogiri/xml/pp/node.rb +6 -1
- data/lib/nokogiri/xml/reader.rb +51 -17
- data/lib/nokogiri/xml/relax_ng.rb +57 -20
- data/lib/nokogiri/xml/sax/document.rb +174 -83
- data/lib/nokogiri/xml/sax/parser.rb +115 -41
- data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
- data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
- data/lib/nokogiri/xml/sax.rb +48 -0
- data/lib/nokogiri/xml/schema.rb +112 -45
- data/lib/nokogiri/xml/searchable.rb +39 -43
- data/lib/nokogiri/xml/syntax_error.rb +23 -1
- data/lib/nokogiri/xml/xpath_context.rb +14 -3
- data/lib/nokogiri/xml.rb +14 -25
- data/lib/nokogiri/xslt/stylesheet.rb +29 -7
- data/lib/nokogiri/xslt.rb +4 -10
- data/lib/nokogiri.rb +1 -1
- data/lib/xsd/xmlparser/nokogiri.rb +3 -4
- data/patches/libxml2/0019-xpath-Use-separate-static-hash-table-for-standard-fu.patch +244 -0
- data/ports/archives/libxml2-2.13.7.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.43.tar.xz +0 -0
- metadata +13 -14
- data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
- data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +0 -25
- data/ports/archives/libxml2-2.11.4.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.38.tar.xz +0 -0
@@ -44,6 +44,18 @@ module Nokogiri
|
|
44
44
|
VALUES = [XML, HTML4, HTML5]
|
45
45
|
end
|
46
46
|
|
47
|
+
# The visitor configuration set via the +builtins:+ keyword argument to XPathVisitor.new.
|
48
|
+
attr_reader :builtins
|
49
|
+
|
50
|
+
# The visitor configuration set via the +doctype:+ keyword argument to XPathVisitor.new.
|
51
|
+
attr_reader :doctype
|
52
|
+
|
53
|
+
# The visitor configuration set via the +prefix:+ keyword argument to XPathVisitor.new.
|
54
|
+
attr_reader :prefix
|
55
|
+
|
56
|
+
# The visitor configuration set via the +namespaces:+ keyword argument to XPathVisitor.new.
|
57
|
+
attr_reader :namespaces
|
58
|
+
|
47
59
|
# :call-seq:
|
48
60
|
# new() → XPathVisitor
|
49
61
|
# new(builtins:, doctype:) → XPathVisitor
|
@@ -54,7 +66,12 @@ module Nokogiri
|
|
54
66
|
#
|
55
67
|
# [Returns] XPathVisitor
|
56
68
|
#
|
57
|
-
def initialize(
|
69
|
+
def initialize(
|
70
|
+
builtins: BuiltinsConfig::NEVER,
|
71
|
+
doctype: DoctypeConfig::XML,
|
72
|
+
prefix: Nokogiri::XML::XPath::GLOBAL_SEARCH_PREFIX,
|
73
|
+
namespaces: nil
|
74
|
+
)
|
58
75
|
unless BuiltinsConfig::VALUES.include?(builtins)
|
59
76
|
raise(ArgumentError, "Invalid values #{builtins.inspect} for builtins: keyword parameter")
|
60
77
|
end
|
@@ -64,6 +81,8 @@ module Nokogiri
|
|
64
81
|
|
65
82
|
@builtins = builtins
|
66
83
|
@doctype = doctype
|
84
|
+
@prefix = prefix
|
85
|
+
@namespaces = namespaces
|
67
86
|
end
|
68
87
|
|
69
88
|
# :call-seq: config() → Hash
|
@@ -72,7 +91,7 @@ module Nokogiri
|
|
72
91
|
# a Hash representing the configuration of the XPathVisitor, suitable for use as
|
73
92
|
# part of the CSS cache key.
|
74
93
|
def config
|
75
|
-
{ builtins: @builtins, doctype: @doctype }
|
94
|
+
{ builtins: @builtins, doctype: @doctype, prefix: @prefix, namespaces: @namespaces }
|
76
95
|
end
|
77
96
|
|
78
97
|
# :stopdoc:
|
@@ -128,6 +147,8 @@ module Nokogiri
|
|
128
147
|
is_direct = node.value[1].value[0].nil? # e.g. "has(> a)", "has(~ a)", "has(+ a)"
|
129
148
|
".#{"//" unless is_direct}#{node.value[1].accept(self)}"
|
130
149
|
else
|
150
|
+
validate_xpath_function_name(node.value.first)
|
151
|
+
|
131
152
|
# xpath function call, let's marshal those arguments
|
132
153
|
args = ["."]
|
133
154
|
args += node.value[1..-1].map do |n|
|
@@ -207,6 +228,7 @@ module Nokogiri
|
|
207
228
|
when "parent" then "node()"
|
208
229
|
when "root" then "not(parent::*)"
|
209
230
|
else
|
231
|
+
validate_xpath_function_name(node.value.first)
|
210
232
|
"nokogiri:#{node.value.first}(.)"
|
211
233
|
end
|
212
234
|
end
|
@@ -255,6 +277,15 @@ module Nokogiri
|
|
255
277
|
else
|
256
278
|
"*[local-name()='#{node.value.first}']"
|
257
279
|
end
|
280
|
+
elsif node.value.length == 2 # has a namespace prefix
|
281
|
+
if node.value.first.nil? # namespace prefix is empty
|
282
|
+
node.value.last
|
283
|
+
else
|
284
|
+
node.value.join(":")
|
285
|
+
end
|
286
|
+
elsif node.value.first != "*" && @namespaces&.key?("xmlns")
|
287
|
+
# apply the default namespace (if one is present) to a non-wildcard selector
|
288
|
+
"xmlns:#{node.value.first}"
|
258
289
|
else
|
259
290
|
node.value.first
|
260
291
|
end
|
@@ -270,11 +301,17 @@ module Nokogiri
|
|
270
301
|
|
271
302
|
private
|
272
303
|
|
304
|
+
def validate_xpath_function_name(name)
|
305
|
+
if name.start_with?("-")
|
306
|
+
raise Nokogiri::CSS::SyntaxError, "Invalid XPath function name '#{name}'"
|
307
|
+
end
|
308
|
+
end
|
309
|
+
|
273
310
|
def html5_element_name_needs_namespace_handling(node)
|
274
|
-
# if
|
275
|
-
node.value.
|
276
|
-
# if
|
277
|
-
|
311
|
+
# if there is already a namespace (i.e., it is a prefixed QName), use it as normal
|
312
|
+
node.value.length == 1 &&
|
313
|
+
# if this is the wildcard selector "*", use it as normal
|
314
|
+
node.value.first != "*"
|
278
315
|
end
|
279
316
|
|
280
317
|
def nth(node, options = {})
|
@@ -302,7 +339,7 @@ module Nokogiri
|
|
302
339
|
end
|
303
340
|
|
304
341
|
def read_a_and_positive_b(values)
|
305
|
-
op = values[2]
|
342
|
+
op = values[2].strip
|
306
343
|
if op == "+"
|
307
344
|
a = values[0].to_i
|
308
345
|
b = values[3].to_i
|
@@ -335,25 +372,5 @@ module Nokogiri
|
|
335
372
|
end
|
336
373
|
end
|
337
374
|
end
|
338
|
-
|
339
|
-
module XPathVisitorAlwaysUseBuiltins # :nodoc:
|
340
|
-
def self.new
|
341
|
-
warn(
|
342
|
-
"Nokogiri::CSS::XPathVisitorAlwaysUseBuiltins is deprecated and will be removed in a future version of Nokogiri",
|
343
|
-
{ uplevel: 1 },
|
344
|
-
)
|
345
|
-
XPathVisitor.new(builtins: :always)
|
346
|
-
end
|
347
|
-
end
|
348
|
-
|
349
|
-
module XPathVisitorOptimallyUseBuiltins # :nodoc:
|
350
|
-
def self.new
|
351
|
-
warn(
|
352
|
-
"Nokogiri::CSS::XPathVisitorOptimallyUseBuiltins is deprecated and will be removed in a future version of Nokogiri",
|
353
|
-
{ uplevel: 1 },
|
354
|
-
)
|
355
|
-
XPathVisitor.new(builtins: :optimal)
|
356
|
-
end
|
357
|
-
end
|
358
375
|
end
|
359
376
|
end
|
data/lib/nokogiri/css.rb
CHANGED
@@ -8,53 +8,119 @@ module Nokogiri
|
|
8
8
|
# TODO: Deprecate this method ahead of 2.0 and delete it in 2.0.
|
9
9
|
# It is not used by Nokogiri and shouldn't be part of the public API.
|
10
10
|
def parse(selector) # :nodoc:
|
11
|
+
warn("Nokogiri::CSS.parse is deprecated and will be removed in a future version of Nokogiri. Use Nokogiri::CSS::Parser#parse instead.", uplevel: 1, category: :deprecated)
|
11
12
|
Parser.new.parse(selector)
|
12
13
|
end
|
13
14
|
|
14
15
|
# :call-seq:
|
15
|
-
# xpath_for(
|
16
|
-
# xpath_for(
|
16
|
+
# xpath_for(selector_list) → Array<String>
|
17
|
+
# xpath_for(selector_list [, prefix:] [, ns:] [, visitor:] [, cache:]) → Array<String>
|
17
18
|
#
|
18
|
-
# Translate a CSS selector to the equivalent XPath
|
19
|
+
# Translate a CSS selector list to the equivalent XPath expressions.
|
20
|
+
#
|
21
|
+
# 💡 Note that translated queries are cached by default for performance concerns.
|
22
|
+
#
|
23
|
+
# ⚠ Users should prefer Nokogiri::XML::Searchable#css, which is mixed into all document and
|
24
|
+
# node classes, for querying documents with CSS selectors. This method is the underlying
|
25
|
+
# mechanism used by XML::Searchable and is provided solely for advanced users to translate
|
26
|
+
# \CSS selectors to XPath directly.
|
27
|
+
#
|
28
|
+
# Also see Nokogiri::XML::Searchable#css for documentation on supported CSS selector features,
|
29
|
+
# some extended syntax that Nokogiri supports, and advanced CSS features like pseudo-class
|
30
|
+
# functions.
|
19
31
|
#
|
20
32
|
# [Parameters]
|
21
|
-
# - +
|
33
|
+
# - +selector_list+ (String)
|
22
34
|
#
|
35
|
+
# The CSS selector to be translated into XPath. This is always a String, but that string
|
36
|
+
# value may be a {selector list}[https://www.w3.org/TR/selectors-4/#grouping] (see
|
37
|
+
# examples).
|
38
|
+
#
|
39
|
+
# [Keyword arguments]
|
23
40
|
# - +prefix:+ (String)
|
24
41
|
#
|
25
|
-
# The XPath prefix
|
26
|
-
# +
|
42
|
+
# The XPath expression prefix which determines the search context. See Nokogiri::XML::XPath
|
43
|
+
# for standard options. Default is +XPath::GLOBAL_SEARCH_PREFIX+.
|
44
|
+
#
|
45
|
+
# - +ns:+ (Hash<String ⇒ String>, nil)
|
46
|
+
#
|
47
|
+
# Namespaces that are referenced in the query, if any. This is a hash where the keys are the
|
48
|
+
# namespace prefix and the values are the namespace URIs. Default is +nil+ indicating an
|
49
|
+
# empty set of namespaces.
|
27
50
|
#
|
28
51
|
# - +visitor:+ (Nokogiri::CSS::XPathVisitor)
|
29
52
|
#
|
30
|
-
#
|
31
|
-
#
|
53
|
+
# Use this XPathVisitor object to transform the CSS AST into XPath expressions. See
|
54
|
+
# Nokogiri::CSS::XPathVisitor for more information on some of the complex behavior that can
|
55
|
+
# be customized for your document type. Default is +Nokogiri::CSS::XPathVisitor.new+.
|
56
|
+
#
|
57
|
+
# ⚠ Note that this option is mutually exclusive with +prefix+ and +ns+. If +visitor+ is
|
58
|
+
# provided, +prefix+ and +ns+ must not be present.
|
59
|
+
#
|
60
|
+
# - +cache:+ (Boolean)
|
61
|
+
#
|
62
|
+
# Whether to use the SelectorCache for the translated query to ensure that repeated queries
|
63
|
+
# don't incur the overhead of re-parsing the selector. Default is +true+.
|
32
64
|
#
|
33
|
-
#
|
65
|
+
# [Returns] (Array<String>) The equivalent set of XPath expressions for +selector_list+
|
34
66
|
#
|
35
|
-
#
|
36
|
-
# the namespace prefix and the values are the namespace URIs. Default is an empty Hash.
|
67
|
+
# *Example* with a simple selector:
|
37
68
|
#
|
38
|
-
#
|
69
|
+
# Nokogiri::CSS.xpath_for("div") # => ["//div"]
|
39
70
|
#
|
40
|
-
#
|
71
|
+
# *Example* with a compound selector:
|
41
72
|
#
|
42
|
-
|
43
|
-
|
73
|
+
# Nokogiri::CSS.xpath_for("div.xl") # => ["//div[contains(concat(' ',normalize-space(@class),' '),' xl ')]"]
|
74
|
+
#
|
75
|
+
# *Example* with a complex selector:
|
76
|
+
#
|
77
|
+
# Nokogiri::CSS.xpath_for("h1 + div") # => ["//h1/following-sibling::*[1]/self::div"]
|
78
|
+
#
|
79
|
+
# *Example* with a selector list:
|
80
|
+
#
|
81
|
+
# Nokogiri::CSS.xpath_for("h1, h2, h3") # => ["//h1", "//h2", "//h3"]
|
82
|
+
#
|
83
|
+
def xpath_for(
|
84
|
+
selector, options = nil,
|
85
|
+
prefix: options&.delete(:prefix),
|
86
|
+
visitor: options&.delete(:visitor),
|
87
|
+
ns: options&.delete(:ns),
|
88
|
+
cache: true
|
89
|
+
)
|
90
|
+
unless options.nil?
|
91
|
+
warn("Nokogiri::CSS.xpath_for: Passing options as an explicit hash is deprecated. Use keyword arguments instead. This will become an error in a future release.", uplevel: 1, category: :deprecated)
|
92
|
+
end
|
93
|
+
|
94
|
+
raise(TypeError, "no implicit conversion of #{selector.inspect} to String") unless selector.respond_to?(:to_str)
|
44
95
|
|
45
96
|
selector = selector.to_str
|
46
|
-
raise
|
97
|
+
raise(Nokogiri::CSS::SyntaxError, "empty CSS selector") if selector.empty?
|
98
|
+
|
99
|
+
if visitor
|
100
|
+
raise ArgumentError, "cannot provide both :prefix and :visitor" if prefix
|
101
|
+
raise ArgumentError, "cannot provide both :ns and :visitor" if ns
|
102
|
+
end
|
103
|
+
|
104
|
+
visitor ||= begin
|
105
|
+
visitor_kw = {}
|
106
|
+
visitor_kw[:prefix] = prefix if prefix
|
107
|
+
visitor_kw[:namespaces] = ns if ns
|
47
108
|
|
48
|
-
|
49
|
-
|
50
|
-
ns = options.fetch(:ns, {})
|
109
|
+
Nokogiri::CSS::XPathVisitor.new(**visitor_kw)
|
110
|
+
end
|
51
111
|
|
52
|
-
|
112
|
+
if cache
|
113
|
+
key = SelectorCache.key(selector: selector, visitor: visitor)
|
114
|
+
SelectorCache[key] ||= Parser.new.xpath_for(selector, visitor)
|
115
|
+
else
|
116
|
+
Parser.new.xpath_for(selector, visitor)
|
117
|
+
end
|
53
118
|
end
|
54
119
|
end
|
55
120
|
end
|
56
121
|
end
|
57
122
|
|
123
|
+
require_relative "css/selector_cache"
|
58
124
|
require_relative "css/node"
|
59
125
|
require_relative "css/xpath_visitor"
|
60
126
|
x = $-w
|
@@ -23,11 +23,9 @@ module Nokogiri
|
|
23
23
|
list = xpath("#{XPATH_PREFIX}#{name}[#{conds}]")
|
24
24
|
end
|
25
25
|
else
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
)
|
30
|
-
end
|
26
|
+
list = xpath(
|
27
|
+
*CSS.xpath_for("#{name}#{args.first}", prefix: XPATH_PREFIX, cache: false),
|
28
|
+
)
|
31
29
|
end
|
32
30
|
|
33
31
|
super if list.empty?
|
@@ -6,9 +6,9 @@ module Nokogiri
|
|
6
6
|
# Popular encoding aliases not known by all iconv implementations that Nokogiri should support.
|
7
7
|
USEFUL_ALIASES = {
|
8
8
|
# alias_name => true_name
|
9
|
-
"
|
9
|
+
"ISO-2022-JP" => "ISO-2022-JP", # only for JRuby tests, this is a no-op in CRuby
|
10
|
+
"NOKOGIRI-SENTINEL" => "ISO-2022-JP", # indicating the Nokogiri has installed aliases
|
10
11
|
"Windows-31J" => "CP932", # Windows-31J is the IANA registered name of CP932.
|
11
|
-
"UTF-8" => "UTF-8", # for JRuby tests, this is a no-op in CRuby
|
12
12
|
}
|
13
13
|
|
14
14
|
class << self
|
@@ -92,7 +92,7 @@ module Nokogiri
|
|
92
92
|
title = XML::Node.new("title", self) << tnode
|
93
93
|
if (head = at_xpath("//head"))
|
94
94
|
head << title
|
95
|
-
elsif (meta =
|
95
|
+
elsif (meta = at_xpath("//meta[@charset]") || meta_content_type)
|
96
96
|
# better put after charset declaration
|
97
97
|
meta.add_next_sibling(title)
|
98
98
|
else
|
@@ -161,52 +161,73 @@ module Nokogiri
|
|
161
161
|
end
|
162
162
|
|
163
163
|
class << self
|
164
|
-
|
165
|
-
#
|
166
|
-
#
|
167
|
-
#
|
168
|
-
#
|
169
|
-
#
|
170
|
-
#
|
171
|
-
#
|
172
|
-
|
164
|
+
# :call-seq:
|
165
|
+
# parse(input) { |options| ... } => Nokogiri::HTML4::Document
|
166
|
+
# parse(input, url:, encoding:, options:) => Nokogiri::HTML4::Document
|
167
|
+
#
|
168
|
+
# Parse \HTML4 input from a String or IO object, and return a new HTML4::Document.
|
169
|
+
#
|
170
|
+
# [Required Parameters]
|
171
|
+
# - +input+ (String | IO) The content to be parsed.
|
172
|
+
#
|
173
|
+
# [Optional Keyword Arguments]
|
174
|
+
# - +url:+ (String) The base URI for this document.
|
175
|
+
#
|
176
|
+
# - +encoding:+ (String) The name of the encoding that should be used when processing the
|
177
|
+
# document. When not provided, the encoding will be determined based on the document
|
178
|
+
# content.
|
179
|
+
#
|
180
|
+
# - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
|
181
|
+
# behaviors during parsing. See ParseOptions for more information. The default value is
|
182
|
+
# +ParseOptions::DEFAULT_HTML+.
|
183
|
+
#
|
184
|
+
# [Yields]
|
185
|
+
# If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
|
186
|
+
# can be configured before parsing. See Nokogiri::XML::ParseOptions for more information.
|
187
|
+
#
|
188
|
+
# [Returns] Nokogiri::HTML4::Document
|
189
|
+
def parse(
|
190
|
+
input,
|
191
|
+
url_ = nil, encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
|
192
|
+
url: url_, encoding: encoding_, options: options_
|
193
|
+
)
|
173
194
|
options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
|
174
195
|
yield options if block_given?
|
175
196
|
|
176
|
-
url ||=
|
197
|
+
url ||= input.respond_to?(:path) ? input.path : nil
|
177
198
|
|
178
|
-
if
|
179
|
-
unless
|
180
|
-
encoding ||=
|
199
|
+
if input.respond_to?(:encoding)
|
200
|
+
unless input.encoding == Encoding::ASCII_8BIT
|
201
|
+
encoding ||= input.encoding.name
|
181
202
|
end
|
182
203
|
end
|
183
204
|
|
184
|
-
if
|
185
|
-
if
|
205
|
+
if input.respond_to?(:read)
|
206
|
+
if input.is_a?(Pathname)
|
186
207
|
# resolve the Pathname to the file and open it as an IO object, see #2110
|
187
|
-
|
188
|
-
url ||=
|
208
|
+
input = input.expand_path.open
|
209
|
+
url ||= input.path
|
189
210
|
end
|
190
211
|
|
191
212
|
unless encoding
|
192
|
-
|
213
|
+
input = EncodingReader.new(input)
|
193
214
|
begin
|
194
|
-
return read_io(
|
215
|
+
return read_io(input, url, encoding, options.to_i)
|
195
216
|
rescue EncodingReader::EncodingFound => e
|
196
217
|
encoding = e.found_encoding
|
197
218
|
end
|
198
219
|
end
|
199
|
-
return read_io(
|
220
|
+
return read_io(input, url, encoding, options.to_i)
|
200
221
|
end
|
201
222
|
|
202
223
|
# read_memory pukes on empty docs
|
203
|
-
if
|
224
|
+
if input.nil? || input.empty?
|
204
225
|
return encoding ? new.tap { |i| i.encoding = encoding } : new
|
205
226
|
end
|
206
227
|
|
207
|
-
encoding ||= EncodingReader.detect_encoding(
|
228
|
+
encoding ||= EncodingReader.detect_encoding(input)
|
208
229
|
|
209
|
-
read_memory(
|
230
|
+
read_memory(input, url, encoding, options.to_i)
|
210
231
|
end
|
211
232
|
end
|
212
233
|
end
|
@@ -3,13 +3,83 @@
|
|
3
3
|
module Nokogiri
|
4
4
|
module HTML4
|
5
5
|
class DocumentFragment < Nokogiri::XML::DocumentFragment
|
6
|
-
|
7
|
-
#
|
8
|
-
|
6
|
+
#
|
7
|
+
# :call-seq:
|
8
|
+
# parse(input) { |options| ... } → HTML4::DocumentFragment
|
9
|
+
# parse(input, encoding:, options:) { |options| ... } → HTML4::DocumentFragment
|
10
|
+
#
|
11
|
+
# Parse \HTML4 fragment input from a String, and return a new HTML4::DocumentFragment. This
|
12
|
+
# method creates a new, empty HTML4::Document to contain the fragment.
|
13
|
+
#
|
14
|
+
# [Required Parameters]
|
15
|
+
# - +input+ (String | IO) The content to be parsed.
|
16
|
+
#
|
17
|
+
# [Optional Keyword Arguments]
|
18
|
+
# - +encoding:+ (String) The name of the encoding that should be used when processing the
|
19
|
+
# document. When not provided, the encoding will be determined based on the document
|
20
|
+
# content.
|
21
|
+
#
|
22
|
+
# - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
|
23
|
+
# behaviors during parsing. See ParseOptions for more information. The default value is
|
24
|
+
# +ParseOptions::DEFAULT_HTML+.
|
25
|
+
#
|
26
|
+
# [Yields]
|
27
|
+
# If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
|
28
|
+
# can be configured before parsing. See ParseOptions for more information.
|
29
|
+
#
|
30
|
+
# [Returns] HTML4::DocumentFragment
|
31
|
+
#
|
32
|
+
# *Example:* Parsing a string
|
33
|
+
#
|
34
|
+
# fragment = HTML4::DocumentFragment.parse("<div>Hello World</div>")
|
35
|
+
#
|
36
|
+
# *Example:* Parsing an IO
|
37
|
+
#
|
38
|
+
# fragment = File.open("fragment.html") do |file|
|
39
|
+
# HTML4::DocumentFragment.parse(file)
|
40
|
+
# end
|
41
|
+
#
|
42
|
+
# *Example:* Specifying encoding
|
43
|
+
#
|
44
|
+
# fragment = HTML4::DocumentFragment.parse(input, encoding: "EUC-JP")
|
45
|
+
#
|
46
|
+
# *Example:* Setting parse options dynamically
|
47
|
+
#
|
48
|
+
# HTML4::DocumentFragment.parse("<div>Hello World") do |options|
|
49
|
+
# options.huge.pedantic
|
50
|
+
# end
|
51
|
+
#
|
52
|
+
def self.parse(
|
53
|
+
input,
|
54
|
+
encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
|
55
|
+
encoding: encoding_, options: options_,
|
56
|
+
&block
|
57
|
+
)
|
58
|
+
# TODO: this method should take a context node.
|
9
59
|
doc = HTML4::Document.new
|
10
60
|
|
11
|
-
|
12
|
-
|
61
|
+
if input.respond_to?(:read)
|
62
|
+
# Handle IO-like objects (IO, File, StringIO, etc.)
|
63
|
+
# The _read_ method of these objects doesn't accept an +encoding+ parameter.
|
64
|
+
# Encoding is usually set when the IO object is created or opened,
|
65
|
+
# or by using the _set_encoding_ method.
|
66
|
+
#
|
67
|
+
# 1. If +encoding+ is provided and the object supports _set_encoding_,
|
68
|
+
# set the encoding before reading.
|
69
|
+
# 2. Read the content from the IO-like object.
|
70
|
+
#
|
71
|
+
# Note: After reading, the content's encoding will be:
|
72
|
+
# - The encoding set by _set_encoding_ if it was called
|
73
|
+
# - The default encoding of the IO object otherwise
|
74
|
+
#
|
75
|
+
# For StringIO specifically, _set_encoding_ affects only the internal string,
|
76
|
+
# not how the data is read out.
|
77
|
+
input.set_encoding(encoding) if encoding && input.respond_to?(:set_encoding)
|
78
|
+
input = input.read
|
79
|
+
end
|
80
|
+
|
81
|
+
encoding ||= if input.respond_to?(:encoding)
|
82
|
+
encoding = input.encoding
|
13
83
|
if encoding == ::Encoding::ASCII_8BIT
|
14
84
|
"UTF-8"
|
15
85
|
else
|
@@ -21,29 +91,71 @@ module Nokogiri
|
|
21
91
|
|
22
92
|
doc.encoding = encoding
|
23
93
|
|
24
|
-
new(doc,
|
94
|
+
new(doc, input, options: options, &block)
|
25
95
|
end
|
26
96
|
|
27
|
-
|
28
|
-
|
97
|
+
#
|
98
|
+
# :call-seq:
|
99
|
+
# new(document) { |options| ... } → HTML4::DocumentFragment
|
100
|
+
# new(document, input) { |options| ... } → HTML4::DocumentFragment
|
101
|
+
# new(document, input, context:, options:) { |options| ... } → HTML4::DocumentFragment
|
102
|
+
#
|
103
|
+
# Parse \HTML4 fragment input from a String, and return a new HTML4::DocumentFragment.
|
104
|
+
#
|
105
|
+
# 💡 It's recommended to use either HTML4::DocumentFragment.parse or XML::Node#parse rather
|
106
|
+
# than call this method directly.
|
107
|
+
#
|
108
|
+
# [Required Parameters]
|
109
|
+
# - +document+ (HTML4::Document) The parent document to associate the returned fragment with.
|
110
|
+
#
|
111
|
+
# [Optional Parameters]
|
112
|
+
# - +input+ (String) The content to be parsed.
|
113
|
+
#
|
114
|
+
# [Optional Keyword Arguments]
|
115
|
+
# - +context:+ (Nokogiri::XML::Node) The <b>context node</b> for the subtree created. See
|
116
|
+
# below for more information.
|
117
|
+
#
|
118
|
+
# - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
|
119
|
+
# behaviors during parsing. See ParseOptions for more information. The default value is
|
120
|
+
# +ParseOptions::DEFAULT_HTML+.
|
121
|
+
#
|
122
|
+
# [Yields]
|
123
|
+
# If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
|
124
|
+
# can be configured before parsing. See ParseOptions for more information.
|
125
|
+
#
|
126
|
+
# [Returns] HTML4::DocumentFragment
|
127
|
+
#
|
128
|
+
# === Context \Node
|
129
|
+
#
|
130
|
+
# If a context node is specified using +context:+, then the fragment will be created by
|
131
|
+
# calling XML::Node#parse on that node, so the parser will behave as if that Node is the
|
132
|
+
# parent of the fragment subtree.
|
133
|
+
#
|
134
|
+
def initialize(
|
135
|
+
document, input = nil,
|
136
|
+
context_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
|
137
|
+
context: context_, options: options_
|
138
|
+
) # rubocop:disable Lint/MissingSuper
|
139
|
+
return self unless input
|
29
140
|
|
30
141
|
options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
|
142
|
+
@parse_options = options
|
31
143
|
yield options if block_given?
|
32
144
|
|
33
|
-
if
|
145
|
+
if context
|
34
146
|
preexisting_errors = document.errors.dup
|
35
|
-
node_set =
|
147
|
+
node_set = context.parse("<div>#{input}</div>", options)
|
36
148
|
node_set.first.children.each { |child| child.parent = self } unless node_set.empty?
|
37
149
|
self.errors = document.errors - preexisting_errors
|
38
150
|
else
|
39
151
|
# This is a horrible hack, but I don't care
|
40
|
-
path = if /^\s*?<body/i.match?(
|
152
|
+
path = if /^\s*?<body/i.match?(input)
|
41
153
|
"/html/body"
|
42
154
|
else
|
43
155
|
"/html/body/node()"
|
44
156
|
end
|
45
157
|
|
46
|
-
temp_doc = HTML4::Document.parse("<html><body>#{
|
158
|
+
temp_doc = HTML4::Document.parse("<html><body>#{input}", nil, document.encoding, options)
|
47
159
|
temp_doc.xpath(path).each { |child| child.parent = self }
|
48
160
|
self.errors = temp_doc.errors
|
49
161
|
end
|
@@ -26,7 +26,7 @@ module Nokogiri
|
|
26
26
|
|
27
27
|
def initialize
|
28
28
|
@encoding = nil
|
29
|
-
super
|
29
|
+
super
|
30
30
|
end
|
31
31
|
|
32
32
|
def start_element(name, attrs = [])
|
@@ -94,7 +94,7 @@ module Nokogiri
|
|
94
94
|
# no support for a call without len
|
95
95
|
|
96
96
|
unless @firstchunk
|
97
|
-
(@firstchunk = @io.read(len)) ||
|
97
|
+
(@firstchunk = @io.read(len)) || return
|
98
98
|
|
99
99
|
# This implementation expects that the first call from
|
100
100
|
# htmlReadIO() is made with a length long enough (~1KB) to
|