nokogiri 1.12.2 → 1.13.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +9 -7
- data/bin/nokogiri +63 -50
- data/dependencies.yml +5 -6
- data/ext/nokogiri/extconf.rb +51 -35
- data/ext/nokogiri/gumbo.c +11 -11
- data/ext/nokogiri/html4_element_description.c +1 -1
- data/ext/nokogiri/html4_sax_parser_context.c +2 -1
- data/ext/nokogiri/nokogiri.c +1 -1
- data/ext/nokogiri/nokogiri.h +3 -0
- data/ext/nokogiri/xml_document.c +36 -36
- data/ext/nokogiri/xml_document_fragment.c +0 -2
- data/ext/nokogiri/xml_dtd.c +2 -2
- data/ext/nokogiri/xml_encoding_handler.c +25 -11
- data/ext/nokogiri/xml_namespace.c +2 -2
- data/ext/nokogiri/xml_node.c +647 -335
- data/ext/nokogiri/xml_reader.c +37 -11
- data/ext/nokogiri/xml_xpath_context.c +72 -49
- data/gumbo-parser/src/parser.c +0 -11
- data/lib/nokogiri/class_resolver.rb +67 -0
- data/lib/nokogiri/css/node.rb +9 -8
- data/lib/nokogiri/css/parser.rb +11 -3
- data/lib/nokogiri/css/parser.y +10 -2
- data/lib/nokogiri/css/parser_extras.rb +20 -20
- data/lib/nokogiri/css/syntax_error.rb +1 -0
- data/lib/nokogiri/css/tokenizer.rb +2 -1
- data/lib/nokogiri/css/tokenizer.rex +2 -1
- data/lib/nokogiri/css/xpath_visitor.rb +174 -75
- data/lib/nokogiri/css.rb +38 -6
- data/lib/nokogiri/decorators/slop.rb +8 -7
- data/lib/nokogiri/extension.rb +1 -1
- data/lib/nokogiri/gumbo.rb +1 -0
- data/lib/nokogiri/html.rb +16 -10
- data/lib/nokogiri/html4/builder.rb +1 -0
- data/lib/nokogiri/html4/document.rb +84 -75
- data/lib/nokogiri/html4/document_fragment.rb +11 -7
- data/lib/nokogiri/html4/element_description.rb +1 -0
- data/lib/nokogiri/html4/element_description_defaults.rb +426 -520
- data/lib/nokogiri/html4/entity_lookup.rb +2 -1
- data/lib/nokogiri/html4/sax/parser.rb +2 -1
- data/lib/nokogiri/html4/sax/parser_context.rb +1 -0
- data/lib/nokogiri/html4/sax/push_parser.rb +7 -7
- data/lib/nokogiri/html4.rb +11 -5
- data/lib/nokogiri/html5/document.rb +24 -10
- data/lib/nokogiri/html5/document_fragment.rb +5 -2
- data/lib/nokogiri/html5/node.rb +6 -3
- data/lib/nokogiri/html5.rb +68 -64
- data/lib/nokogiri/jruby/dependencies.rb +10 -9
- data/lib/nokogiri/syntax_error.rb +1 -0
- data/lib/nokogiri/version/constant.rb +2 -1
- data/lib/nokogiri/version/info.rb +19 -13
- data/lib/nokogiri/version.rb +1 -0
- data/lib/nokogiri/xml/attr.rb +5 -3
- data/lib/nokogiri/xml/attribute_decl.rb +2 -1
- data/lib/nokogiri/xml/builder.rb +69 -31
- data/lib/nokogiri/xml/cdata.rb +2 -1
- data/lib/nokogiri/xml/character_data.rb +1 -0
- data/lib/nokogiri/xml/document.rb +178 -96
- data/lib/nokogiri/xml/document_fragment.rb +41 -38
- data/lib/nokogiri/xml/dtd.rb +3 -2
- data/lib/nokogiri/xml/element_content.rb +1 -0
- data/lib/nokogiri/xml/element_decl.rb +2 -1
- data/lib/nokogiri/xml/entity_decl.rb +3 -2
- data/lib/nokogiri/xml/entity_reference.rb +1 -0
- data/lib/nokogiri/xml/namespace.rb +2 -0
- data/lib/nokogiri/xml/node/save_options.rb +7 -4
- data/lib/nokogiri/xml/node.rb +512 -348
- data/lib/nokogiri/xml/node_set.rb +46 -54
- data/lib/nokogiri/xml/notation.rb +12 -0
- data/lib/nokogiri/xml/parse_options.rb +11 -7
- data/lib/nokogiri/xml/pp/character_data.rb +8 -6
- data/lib/nokogiri/xml/pp/node.rb +24 -26
- data/lib/nokogiri/xml/pp.rb +1 -0
- data/lib/nokogiri/xml/processing_instruction.rb +2 -1
- data/lib/nokogiri/xml/reader.rb +17 -19
- data/lib/nokogiri/xml/relax_ng.rb +1 -0
- data/lib/nokogiri/xml/sax/document.rb +20 -19
- data/lib/nokogiri/xml/sax/parser.rb +36 -34
- data/lib/nokogiri/xml/sax/parser_context.rb +7 -3
- data/lib/nokogiri/xml/sax/push_parser.rb +5 -5
- data/lib/nokogiri/xml/sax.rb +1 -0
- data/lib/nokogiri/xml/schema.rb +7 -6
- data/lib/nokogiri/xml/searchable.rb +42 -22
- data/lib/nokogiri/xml/syntax_error.rb +4 -4
- data/lib/nokogiri/xml/text.rb +1 -0
- data/lib/nokogiri/xml/xpath/syntax_error.rb +2 -1
- data/lib/nokogiri/xml/xpath.rb +12 -0
- data/lib/nokogiri/xml/xpath_context.rb +2 -3
- data/lib/nokogiri/xml.rb +3 -3
- data/lib/nokogiri/xslt/stylesheet.rb +1 -0
- data/lib/nokogiri/xslt.rb +3 -2
- data/lib/nokogiri.rb +19 -16
- data/lib/xsd/xmlparser/nokogiri.rb +25 -24
- data/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch +61 -0
- data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
- metadata +101 -27
@@ -1,6 +1,7 @@
|
|
1
|
+
# coding: utf-8
|
1
2
|
# frozen_string_literal: true
|
2
3
|
|
3
|
-
require
|
4
|
+
require "pathname"
|
4
5
|
|
5
6
|
module Nokogiri
|
6
7
|
module HTML4
|
@@ -9,11 +10,10 @@ module Nokogiri
|
|
9
10
|
# Get the meta tag encoding for this document. If there is no meta tag,
|
10
11
|
# then nil is returned.
|
11
12
|
def meta_encoding
|
12
|
-
|
13
|
-
when meta = at('//meta[@charset]')
|
13
|
+
if (meta = at_xpath("//meta[@charset]"))
|
14
14
|
meta[:charset]
|
15
|
-
|
16
|
-
meta[
|
15
|
+
elsif (meta = meta_content_type)
|
16
|
+
meta["content"][/charset\s*=\s*([\w-]+)/i, 1]
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
@@ -33,24 +33,22 @@ module Nokogiri
|
|
33
33
|
#
|
34
34
|
# Beware in CRuby, that libxml2 automatically inserts a meta tag
|
35
35
|
# into a head element.
|
36
|
-
def meta_encoding=
|
37
|
-
|
38
|
-
|
39
|
-
meta['content'] = 'text/html; charset=%s' % encoding
|
36
|
+
def meta_encoding=(encoding)
|
37
|
+
if (meta = meta_content_type)
|
38
|
+
meta["content"] = format("text/html; charset=%s", encoding)
|
40
39
|
encoding
|
41
|
-
|
42
|
-
meta[
|
40
|
+
elsif (meta = at_xpath("//meta[@charset]"))
|
41
|
+
meta["charset"] = encoding
|
43
42
|
else
|
44
|
-
meta = XML::Node.new(
|
45
|
-
if dtd = internal_subset
|
46
|
-
meta[
|
43
|
+
meta = XML::Node.new("meta", self)
|
44
|
+
if (dtd = internal_subset) && dtd.html5_dtd?
|
45
|
+
meta["charset"] = encoding
|
47
46
|
else
|
48
|
-
meta[
|
49
|
-
meta[
|
47
|
+
meta["http-equiv"] = "Content-Type"
|
48
|
+
meta["content"] = format("text/html; charset=%s", encoding)
|
50
49
|
end
|
51
50
|
|
52
|
-
|
53
|
-
when head = at('//head')
|
51
|
+
if (head = at_xpath("//head"))
|
54
52
|
head.prepend_child(meta)
|
55
53
|
else
|
56
54
|
set_metadata_element(meta)
|
@@ -60,9 +58,9 @@ module Nokogiri
|
|
60
58
|
end
|
61
59
|
|
62
60
|
def meta_content_type
|
63
|
-
xpath(
|
64
|
-
node[
|
65
|
-
|
61
|
+
xpath("//meta[@http-equiv and boolean(@content)]").find do |node|
|
62
|
+
node["http-equiv"] =~ /\AContent-Type\z/i
|
63
|
+
end
|
66
64
|
end
|
67
65
|
private :meta_content_type
|
68
66
|
|
@@ -70,7 +68,7 @@ module Nokogiri
|
|
70
68
|
# Get the title string of this document. Return nil if there is
|
71
69
|
# no title tag.
|
72
70
|
def title
|
73
|
-
title =
|
71
|
+
(title = at_xpath("//title")) && title.inner_text
|
74
72
|
end
|
75
73
|
|
76
74
|
###
|
@@ -86,44 +84,41 @@ module Nokogiri
|
|
86
84
|
# content element (typically <body>) if any.
|
87
85
|
def title=(text)
|
88
86
|
tnode = XML::Text.new(text, self)
|
89
|
-
if title =
|
87
|
+
if (title = at_xpath("//title"))
|
90
88
|
title.children = tnode
|
91
89
|
return text
|
92
90
|
end
|
93
91
|
|
94
|
-
title = XML::Node.new(
|
95
|
-
|
96
|
-
when head = at('//head')
|
92
|
+
title = XML::Node.new("title", self) << tnode
|
93
|
+
if (head = at_xpath("//head"))
|
97
94
|
head << title
|
98
|
-
|
95
|
+
elsif (meta = (at_xpath("//meta[@charset]") || meta_content_type))
|
99
96
|
# better put after charset declaration
|
100
97
|
meta.add_next_sibling(title)
|
101
98
|
else
|
102
99
|
set_metadata_element(title)
|
103
100
|
end
|
104
|
-
text
|
105
101
|
end
|
106
102
|
|
107
|
-
def set_metadata_element(element)
|
108
|
-
|
109
|
-
when head = at('//head')
|
103
|
+
def set_metadata_element(element) # rubocop:disable Naming/AccessorMethodName
|
104
|
+
if (head = at_xpath("//head"))
|
110
105
|
head << element
|
111
|
-
|
112
|
-
head = html.prepend_child(XML::Node.new(
|
106
|
+
elsif (html = at_xpath("//html"))
|
107
|
+
head = html.prepend_child(XML::Node.new("head", self))
|
113
108
|
head.prepend_child(element)
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
109
|
+
elsif (first = children.find do |node|
|
110
|
+
case node
|
111
|
+
when XML::Element, XML::Text
|
112
|
+
true
|
113
|
+
end
|
114
|
+
end)
|
120
115
|
# We reach here only if the underlying document model
|
121
116
|
# allows <html>/<head> elements to be omitted and does not
|
122
117
|
# automatically supply them.
|
123
118
|
first.add_previous_sibling(element)
|
124
119
|
else
|
125
|
-
html = add_child(XML::Node.new(
|
126
|
-
head = html.add_child(XML::Node.new(
|
120
|
+
html = add_child(XML::Node.new("html", self))
|
121
|
+
head = html.add_child(XML::Node.new("head", self))
|
127
122
|
head.prepend_child(element)
|
128
123
|
end
|
129
124
|
end
|
@@ -143,15 +138,25 @@ module Nokogiri
|
|
143
138
|
# config.format.as_xml
|
144
139
|
# end
|
145
140
|
#
|
146
|
-
def serialize
|
141
|
+
def serialize(options = {})
|
147
142
|
options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
|
148
143
|
super
|
149
144
|
end
|
150
145
|
|
151
146
|
####
|
152
147
|
# Create a Nokogiri::XML::DocumentFragment from +tags+
|
153
|
-
def fragment
|
154
|
-
DocumentFragment.new(self, tags,
|
148
|
+
def fragment(tags = nil)
|
149
|
+
DocumentFragment.new(self, tags, root)
|
150
|
+
end
|
151
|
+
|
152
|
+
# :call-seq:
|
153
|
+
# xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig
|
154
|
+
#
|
155
|
+
# [Returns] The document type which determines CSS-to-XPath translation.
|
156
|
+
#
|
157
|
+
# See XPathVisitor for more information.
|
158
|
+
def xpath_doctype
|
159
|
+
Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML4
|
155
160
|
end
|
156
161
|
|
157
162
|
class << self
|
@@ -163,9 +168,8 @@ module Nokogiri
|
|
163
168
|
# is a number that sets options in the parser, such as
|
164
169
|
# Nokogiri::XML::ParseOptions::RECOVER. See the constants in
|
165
170
|
# Nokogiri::XML::ParseOptions.
|
166
|
-
def parse
|
171
|
+
def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML)
|
167
172
|
options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
|
168
|
-
|
169
173
|
yield options if block_given?
|
170
174
|
|
171
175
|
url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
|
@@ -206,7 +210,7 @@ module Nokogiri
|
|
206
210
|
end
|
207
211
|
|
208
212
|
# read_memory pukes on empty docs
|
209
|
-
if string_or_io.nil?
|
213
|
+
if string_or_io.nil? || string_or_io.empty?
|
210
214
|
return encoding ? new.tap { |i| i.encoding = encoding } : new
|
211
215
|
end
|
212
216
|
|
@@ -216,37 +220,38 @@ module Nokogiri
|
|
216
220
|
end
|
217
221
|
end
|
218
222
|
|
219
|
-
class EncodingFound < StandardError # :nodoc:
|
223
|
+
class EncodingFound < StandardError # :nodoc: all
|
220
224
|
attr_reader :found_encoding
|
221
225
|
|
222
226
|
def initialize(encoding)
|
223
227
|
@found_encoding = encoding
|
224
|
-
super("encoding found: %s"
|
228
|
+
super(format("encoding found: %s", encoding))
|
225
229
|
end
|
226
230
|
end
|
227
231
|
|
228
|
-
|
229
|
-
|
232
|
+
# :nodoc: all
|
233
|
+
class EncodingReader
|
234
|
+
class SAXHandler < Nokogiri::XML::SAX::Document
|
230
235
|
attr_reader :encoding
|
231
|
-
|
236
|
+
|
232
237
|
def initialize
|
233
238
|
@encoding = nil
|
234
239
|
super()
|
235
240
|
end
|
236
|
-
|
241
|
+
|
237
242
|
def start_element(name, attrs = [])
|
238
|
-
return unless name ==
|
243
|
+
return unless name == "meta"
|
239
244
|
attr = Hash[attrs]
|
240
|
-
charset = attr[
|
241
|
-
@encoding = charset
|
242
|
-
http_equiv = attr[
|
243
|
-
http_equiv.match(/\AContent-Type\z/i)
|
244
|
-
content = attr[
|
245
|
-
m = content.match(/;\s*charset\s*=\s*([\w-]+)/)
|
246
|
-
@encoding = m[1]
|
245
|
+
(charset = attr["charset"]) &&
|
246
|
+
(@encoding = charset)
|
247
|
+
(http_equiv = attr["http-equiv"]) &&
|
248
|
+
http_equiv.match(/\AContent-Type\z/i) &&
|
249
|
+
(content = attr["content"]) &&
|
250
|
+
(m = content.match(/;\s*charset\s*=\s*([\w-]+)/)) &&
|
251
|
+
(@encoding = m[1])
|
247
252
|
end
|
248
253
|
end
|
249
|
-
|
254
|
+
|
250
255
|
class JumpSAXHandler < SAXHandler
|
251
256
|
def initialize(jumptag)
|
252
257
|
@jumptag = jumptag
|
@@ -255,26 +260,30 @@ module Nokogiri
|
|
255
260
|
|
256
261
|
def start_element(name, attrs = [])
|
257
262
|
super
|
258
|
-
throw
|
259
|
-
throw
|
263
|
+
throw(@jumptag, @encoding) if @encoding
|
264
|
+
throw(@jumptag, nil) if /\A(?:div|h1|img|p|br)\z/.match?(name)
|
260
265
|
end
|
261
266
|
end
|
262
267
|
|
263
268
|
def self.detect_encoding(chunk)
|
264
|
-
m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/)
|
265
|
-
return Nokogiri.XML(m[1]).encoding
|
269
|
+
(m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/)) &&
|
270
|
+
(return Nokogiri.XML(m[1]).encoding)
|
266
271
|
|
267
272
|
if Nokogiri.jruby?
|
268
|
-
m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)
|
269
|
-
return m[4]
|
270
|
-
catch(:encoding_found)
|
273
|
+
(m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)) &&
|
274
|
+
(return m[4])
|
275
|
+
catch(:encoding_found) do
|
271
276
|
Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
|
272
277
|
nil
|
273
|
-
|
278
|
+
end
|
274
279
|
else
|
275
280
|
handler = SAXHandler.new
|
276
281
|
parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
|
277
|
-
|
282
|
+
begin
|
283
|
+
parser << chunk
|
284
|
+
rescue
|
285
|
+
Nokogiri::SyntaxError
|
286
|
+
end
|
278
287
|
handler.encoding
|
279
288
|
end
|
280
289
|
end
|
@@ -293,13 +302,13 @@ module Nokogiri
|
|
293
302
|
def read(len)
|
294
303
|
# no support for a call without len
|
295
304
|
|
296
|
-
|
297
|
-
@firstchunk = @io.read(len)
|
305
|
+
unless @firstchunk
|
306
|
+
(@firstchunk = @io.read(len)) || (return nil)
|
298
307
|
|
299
308
|
# This implementation expects that the first call from
|
300
309
|
# htmlReadIO() is made with a length long enough (~1KB) to
|
301
310
|
# achieve advanced encoding detection.
|
302
|
-
if encoding = EncodingReader.detect_encoding(@firstchunk)
|
311
|
+
if (encoding = EncodingReader.detect_encoding(@firstchunk))
|
303
312
|
# The first chunk is stored for the next read in retry.
|
304
313
|
raise @encoding_found = EncodingFound.new(encoding)
|
305
314
|
end
|
@@ -308,7 +317,7 @@ module Nokogiri
|
|
308
317
|
|
309
318
|
ret = @firstchunk.slice!(0, len)
|
310
319
|
if (len -= ret.length) > 0
|
311
|
-
rest = @io.read(len)
|
320
|
+
(rest = @io.read(len)) && ret << (rest)
|
312
321
|
end
|
313
322
|
if ret.empty?
|
314
323
|
nil
|
@@ -1,34 +1,38 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module Nokogiri
|
3
4
|
module HTML4
|
4
5
|
class DocumentFragment < Nokogiri::XML::DocumentFragment
|
5
6
|
####
|
6
7
|
# Create a Nokogiri::XML::DocumentFragment from +tags+, using +encoding+
|
7
|
-
def self.parse(tags, encoding = nil)
|
8
|
+
def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
|
8
9
|
doc = HTML4::Document.new
|
9
10
|
|
10
11
|
encoding ||= if tags.respond_to?(:encoding)
|
11
12
|
encoding = tags.encoding
|
12
13
|
if encoding == ::Encoding::ASCII_8BIT
|
13
|
-
|
14
|
+
"UTF-8"
|
14
15
|
else
|
15
16
|
encoding.name
|
16
17
|
end
|
17
18
|
else
|
18
|
-
|
19
|
+
"UTF-8"
|
19
20
|
end
|
20
21
|
|
21
22
|
doc.encoding = encoding
|
22
23
|
|
23
|
-
new(doc, tags)
|
24
|
+
new(doc, tags, nil, options, &block)
|
24
25
|
end
|
25
26
|
|
26
|
-
def initialize(document, tags = nil, ctx = nil)
|
27
|
+
def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEFAULT_HTML)
|
27
28
|
return self unless tags
|
28
29
|
|
30
|
+
options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
|
31
|
+
yield options if block_given?
|
32
|
+
|
29
33
|
if ctx
|
30
34
|
preexisting_errors = document.errors.dup
|
31
|
-
node_set = ctx.parse("<div>#{tags}</div>")
|
35
|
+
node_set = ctx.parse("<div>#{tags}</div>", options)
|
32
36
|
node_set.first.children.each { |child| child.parent = self } unless node_set.empty?
|
33
37
|
self.errors = document.errors - preexisting_errors
|
34
38
|
else
|
@@ -39,7 +43,7 @@ module Nokogiri
|
|
39
43
|
"/html/body/node()"
|
40
44
|
end
|
41
45
|
|
42
|
-
temp_doc = HTML4::Document.parse("<html><body>#{tags}", nil, document.encoding)
|
46
|
+
temp_doc = HTML4::Document.parse("<html><body>#{tags}", nil, document.encoding, options)
|
43
47
|
temp_doc.xpath(path).each { |child| child.parent = self }
|
44
48
|
self.errors = temp_doc.errors
|
45
49
|
end
|