nokogiri 1.12.5 → 1.13.6
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +9 -7
- data/bin/nokogiri +63 -50
- data/dependencies.yml +13 -64
- data/ext/nokogiri/extconf.rb +64 -44
- data/ext/nokogiri/html4_sax_parser_context.c +2 -3
- data/ext/nokogiri/xml_document.c +35 -35
- data/ext/nokogiri/xml_document_fragment.c +0 -2
- data/ext/nokogiri/xml_dtd.c +2 -2
- data/ext/nokogiri/xml_encoding_handler.c +25 -11
- data/ext/nokogiri/xml_node.c +638 -333
- data/ext/nokogiri/xml_reader.c +37 -11
- data/ext/nokogiri/xml_sax_parser_context.c +10 -3
- data/ext/nokogiri/xml_xpath_context.c +72 -49
- data/ext/nokogiri/xslt_stylesheet.c +107 -9
- data/gumbo-parser/src/parser.c +0 -11
- data/lib/nokogiri/class_resolver.rb +67 -0
- data/lib/nokogiri/css/node.rb +9 -8
- data/lib/nokogiri/css/parser.rb +360 -341
- data/lib/nokogiri/css/parser.y +249 -244
- data/lib/nokogiri/css/parser_extras.rb +22 -20
- data/lib/nokogiri/css/syntax_error.rb +1 -0
- data/lib/nokogiri/css/tokenizer.rb +4 -3
- data/lib/nokogiri/css/tokenizer.rex +3 -2
- data/lib/nokogiri/css/xpath_visitor.rb +179 -82
- data/lib/nokogiri/css.rb +38 -6
- data/lib/nokogiri/decorators/slop.rb +8 -7
- data/lib/nokogiri/extension.rb +1 -1
- data/lib/nokogiri/gumbo.rb +1 -0
- data/lib/nokogiri/html.rb +16 -10
- data/lib/nokogiri/html4/builder.rb +1 -0
- data/lib/nokogiri/html4/document.rb +88 -77
- data/lib/nokogiri/html4/document_fragment.rb +11 -7
- data/lib/nokogiri/html4/element_description.rb +1 -0
- data/lib/nokogiri/html4/element_description_defaults.rb +426 -520
- data/lib/nokogiri/html4/entity_lookup.rb +2 -1
- data/lib/nokogiri/html4/sax/parser.rb +5 -2
- data/lib/nokogiri/html4/sax/parser_context.rb +1 -0
- data/lib/nokogiri/html4/sax/push_parser.rb +7 -7
- data/lib/nokogiri/html4.rb +11 -5
- data/lib/nokogiri/html5/document.rb +27 -10
- data/lib/nokogiri/html5/document_fragment.rb +5 -2
- data/lib/nokogiri/html5/node.rb +10 -3
- data/lib/nokogiri/html5.rb +69 -64
- data/lib/nokogiri/jruby/dependencies.rb +10 -9
- data/lib/nokogiri/syntax_error.rb +1 -0
- data/lib/nokogiri/version/constant.rb +2 -1
- data/lib/nokogiri/version/info.rb +20 -13
- data/lib/nokogiri/version.rb +1 -0
- data/lib/nokogiri/xml/attr.rb +5 -3
- data/lib/nokogiri/xml/attribute_decl.rb +2 -1
- data/lib/nokogiri/xml/builder.rb +34 -32
- data/lib/nokogiri/xml/cdata.rb +2 -1
- data/lib/nokogiri/xml/character_data.rb +1 -0
- data/lib/nokogiri/xml/document.rb +144 -103
- data/lib/nokogiri/xml/document_fragment.rb +41 -38
- data/lib/nokogiri/xml/dtd.rb +3 -2
- data/lib/nokogiri/xml/element_content.rb +1 -0
- data/lib/nokogiri/xml/element_decl.rb +2 -1
- data/lib/nokogiri/xml/entity_decl.rb +3 -2
- data/lib/nokogiri/xml/entity_reference.rb +1 -0
- data/lib/nokogiri/xml/namespace.rb +2 -0
- data/lib/nokogiri/xml/node/save_options.rb +8 -4
- data/lib/nokogiri/xml/node.rb +521 -351
- data/lib/nokogiri/xml/node_set.rb +50 -54
- data/lib/nokogiri/xml/notation.rb +12 -0
- data/lib/nokogiri/xml/parse_options.rb +12 -7
- data/lib/nokogiri/xml/pp/character_data.rb +8 -6
- data/lib/nokogiri/xml/pp/node.rb +24 -26
- data/lib/nokogiri/xml/pp.rb +1 -0
- data/lib/nokogiri/xml/processing_instruction.rb +2 -1
- data/lib/nokogiri/xml/reader.rb +17 -19
- data/lib/nokogiri/xml/relax_ng.rb +1 -0
- data/lib/nokogiri/xml/sax/document.rb +20 -19
- data/lib/nokogiri/xml/sax/parser.rb +37 -34
- data/lib/nokogiri/xml/sax/parser_context.rb +7 -3
- data/lib/nokogiri/xml/sax/push_parser.rb +5 -5
- data/lib/nokogiri/xml/sax.rb +1 -0
- data/lib/nokogiri/xml/schema.rb +7 -6
- data/lib/nokogiri/xml/searchable.rb +93 -62
- data/lib/nokogiri/xml/syntax_error.rb +5 -4
- data/lib/nokogiri/xml/text.rb +1 -0
- data/lib/nokogiri/xml/xpath/syntax_error.rb +2 -1
- data/lib/nokogiri/xml/xpath.rb +12 -0
- data/lib/nokogiri/xml/xpath_context.rb +2 -3
- data/lib/nokogiri/xml.rb +4 -3
- data/lib/nokogiri/xslt/stylesheet.rb +1 -0
- data/lib/nokogiri/xslt.rb +21 -13
- data/lib/nokogiri.rb +19 -16
- data/lib/xsd/xmlparser/nokogiri.rb +25 -24
- data/patches/libxml2/0004-use-glibc-strlen.patch +3 -3
- data/patches/libxml2/0006-update-automake-files-for-arm64.patch +2443 -1914
- data/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch +61 -0
- data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
- data/patches/libxslt/0001-update-automake-files-for-arm64.patch +2445 -1919
- data/ports/archives/libxml2-2.9.14.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.35.tar.xz +0 -0
- metadata +109 -31
- data/patches/libxml2/0007-Fix-XPath-recursion-limit.patch +0 -31
- data/patches/libxslt/0002-Fix-xml2-config-check-in-configure-script.patch +0 -19
- data/ports/archives/libxml2-2.9.12.tar.gz +0 -0
- data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
@@ -1,6 +1,7 @@
|
|
1
|
+
# coding: utf-8
|
1
2
|
# frozen_string_literal: true
|
2
3
|
|
3
|
-
require
|
4
|
+
require "pathname"
|
4
5
|
|
5
6
|
module Nokogiri
|
6
7
|
module HTML4
|
@@ -9,11 +10,10 @@ module Nokogiri
|
|
9
10
|
# Get the meta tag encoding for this document. If there is no meta tag,
|
10
11
|
# then nil is returned.
|
11
12
|
def meta_encoding
|
12
|
-
|
13
|
-
when meta = at('//meta[@charset]')
|
13
|
+
if (meta = at_xpath("//meta[@charset]"))
|
14
14
|
meta[:charset]
|
15
|
-
|
16
|
-
meta[
|
15
|
+
elsif (meta = meta_content_type)
|
16
|
+
meta["content"][/charset\s*=\s*([\w-]+)/i, 1]
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
@@ -33,24 +33,22 @@ module Nokogiri
|
|
33
33
|
#
|
34
34
|
# Beware in CRuby, that libxml2 automatically inserts a meta tag
|
35
35
|
# into a head element.
|
36
|
-
def meta_encoding=
|
37
|
-
|
38
|
-
|
39
|
-
meta['content'] = 'text/html; charset=%s' % encoding
|
36
|
+
def meta_encoding=(encoding)
|
37
|
+
if (meta = meta_content_type)
|
38
|
+
meta["content"] = format("text/html; charset=%s", encoding)
|
40
39
|
encoding
|
41
|
-
|
42
|
-
meta[
|
40
|
+
elsif (meta = at_xpath("//meta[@charset]"))
|
41
|
+
meta["charset"] = encoding
|
43
42
|
else
|
44
|
-
meta = XML::Node.new(
|
45
|
-
if dtd = internal_subset
|
46
|
-
meta[
|
43
|
+
meta = XML::Node.new("meta", self)
|
44
|
+
if (dtd = internal_subset) && dtd.html5_dtd?
|
45
|
+
meta["charset"] = encoding
|
47
46
|
else
|
48
|
-
meta[
|
49
|
-
meta[
|
47
|
+
meta["http-equiv"] = "Content-Type"
|
48
|
+
meta["content"] = format("text/html; charset=%s", encoding)
|
50
49
|
end
|
51
50
|
|
52
|
-
|
53
|
-
when head = at('//head')
|
51
|
+
if (head = at_xpath("//head"))
|
54
52
|
head.prepend_child(meta)
|
55
53
|
else
|
56
54
|
set_metadata_element(meta)
|
@@ -60,9 +58,9 @@ module Nokogiri
|
|
60
58
|
end
|
61
59
|
|
62
60
|
def meta_content_type
|
63
|
-
xpath(
|
64
|
-
node[
|
65
|
-
|
61
|
+
xpath("//meta[@http-equiv and boolean(@content)]").find do |node|
|
62
|
+
node["http-equiv"] =~ /\AContent-Type\z/i
|
63
|
+
end
|
66
64
|
end
|
67
65
|
private :meta_content_type
|
68
66
|
|
@@ -70,7 +68,7 @@ module Nokogiri
|
|
70
68
|
# Get the title string of this document. Return nil if there is
|
71
69
|
# no title tag.
|
72
70
|
def title
|
73
|
-
title =
|
71
|
+
(title = at_xpath("//title")) && title.inner_text
|
74
72
|
end
|
75
73
|
|
76
74
|
###
|
@@ -86,52 +84,50 @@ module Nokogiri
|
|
86
84
|
# content element (typically <body>) if any.
|
87
85
|
def title=(text)
|
88
86
|
tnode = XML::Text.new(text, self)
|
89
|
-
if title =
|
87
|
+
if (title = at_xpath("//title"))
|
90
88
|
title.children = tnode
|
91
89
|
return text
|
92
90
|
end
|
93
91
|
|
94
|
-
title = XML::Node.new(
|
95
|
-
|
96
|
-
when head = at('//head')
|
92
|
+
title = XML::Node.new("title", self) << tnode
|
93
|
+
if (head = at_xpath("//head"))
|
97
94
|
head << title
|
98
|
-
|
95
|
+
elsif (meta = (at_xpath("//meta[@charset]") || meta_content_type))
|
99
96
|
# better put after charset declaration
|
100
97
|
meta.add_next_sibling(title)
|
101
98
|
else
|
102
99
|
set_metadata_element(title)
|
103
100
|
end
|
104
|
-
text
|
105
101
|
end
|
106
102
|
|
107
|
-
def set_metadata_element(element)
|
108
|
-
|
109
|
-
when head = at('//head')
|
103
|
+
def set_metadata_element(element) # rubocop:disable Naming/AccessorMethodName
|
104
|
+
if (head = at_xpath("//head"))
|
110
105
|
head << element
|
111
|
-
|
112
|
-
head = html.prepend_child(XML::Node.new(
|
106
|
+
elsif (html = at_xpath("//html"))
|
107
|
+
head = html.prepend_child(XML::Node.new("head", self))
|
113
108
|
head.prepend_child(element)
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
109
|
+
elsif (first = children.find do |node|
|
110
|
+
case node
|
111
|
+
when XML::Element, XML::Text
|
112
|
+
true
|
113
|
+
end
|
114
|
+
end)
|
120
115
|
# We reach here only if the underlying document model
|
121
116
|
# allows <html>/<head> elements to be omitted and does not
|
122
117
|
# automatically supply them.
|
123
118
|
first.add_previous_sibling(element)
|
124
119
|
else
|
125
|
-
html = add_child(XML::Node.new(
|
126
|
-
head = html.add_child(XML::Node.new(
|
120
|
+
html = add_child(XML::Node.new("html", self))
|
121
|
+
head = html.add_child(XML::Node.new("head", self))
|
127
122
|
head.prepend_child(element)
|
128
123
|
end
|
129
124
|
end
|
130
125
|
private :set_metadata_element
|
131
126
|
|
132
127
|
####
|
133
|
-
# Serialize Node using +options+.
|
134
|
-
#
|
128
|
+
# Serialize Node using +options+. Save options can also be set using a block.
|
129
|
+
#
|
130
|
+
# See also Nokogiri::XML::Node::SaveOptions and Node@Serialization+and+Generating+Output.
|
135
131
|
#
|
136
132
|
# These two statements are equivalent:
|
137
133
|
#
|
@@ -143,15 +139,25 @@ module Nokogiri
|
|
143
139
|
# config.format.as_xml
|
144
140
|
# end
|
145
141
|
#
|
146
|
-
def serialize
|
142
|
+
def serialize(options = {})
|
147
143
|
options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
|
148
144
|
super
|
149
145
|
end
|
150
146
|
|
151
147
|
####
|
152
148
|
# Create a Nokogiri::XML::DocumentFragment from +tags+
|
153
|
-
def fragment
|
154
|
-
DocumentFragment.new(self, tags,
|
149
|
+
def fragment(tags = nil)
|
150
|
+
DocumentFragment.new(self, tags, root)
|
151
|
+
end
|
152
|
+
|
153
|
+
# :call-seq:
|
154
|
+
# xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig
|
155
|
+
#
|
156
|
+
# [Returns] The document type which determines CSS-to-XPath translation.
|
157
|
+
#
|
158
|
+
# See XPathVisitor for more information.
|
159
|
+
def xpath_doctype
|
160
|
+
Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML4
|
155
161
|
end
|
156
162
|
|
157
163
|
class << self
|
@@ -163,9 +169,8 @@ module Nokogiri
|
|
163
169
|
# is a number that sets options in the parser, such as
|
164
170
|
# Nokogiri::XML::ParseOptions::RECOVER. See the constants in
|
165
171
|
# Nokogiri::XML::ParseOptions.
|
166
|
-
def parse
|
172
|
+
def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML)
|
167
173
|
options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
|
168
|
-
|
169
174
|
yield options if block_given?
|
170
175
|
|
171
176
|
url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
|
@@ -206,7 +211,7 @@ module Nokogiri
|
|
206
211
|
end
|
207
212
|
|
208
213
|
# read_memory pukes on empty docs
|
209
|
-
if string_or_io.nil?
|
214
|
+
if string_or_io.nil? || string_or_io.empty?
|
210
215
|
return encoding ? new.tap { |i| i.encoding = encoding } : new
|
211
216
|
end
|
212
217
|
|
@@ -216,37 +221,39 @@ module Nokogiri
|
|
216
221
|
end
|
217
222
|
end
|
218
223
|
|
219
|
-
class EncodingFound < StandardError # :nodoc:
|
224
|
+
class EncodingFound < StandardError # :nodoc: all
|
220
225
|
attr_reader :found_encoding
|
221
226
|
|
222
227
|
def initialize(encoding)
|
223
228
|
@found_encoding = encoding
|
224
|
-
super("encoding found: %s"
|
229
|
+
super(format("encoding found: %s", encoding))
|
225
230
|
end
|
226
231
|
end
|
227
232
|
|
228
|
-
|
229
|
-
|
233
|
+
# :nodoc: all
|
234
|
+
class EncodingReader
|
235
|
+
class SAXHandler < Nokogiri::XML::SAX::Document
|
230
236
|
attr_reader :encoding
|
231
|
-
|
237
|
+
|
232
238
|
def initialize
|
233
239
|
@encoding = nil
|
234
240
|
super()
|
235
241
|
end
|
236
|
-
|
242
|
+
|
237
243
|
def start_element(name, attrs = [])
|
238
|
-
return unless name ==
|
244
|
+
return unless name == "meta"
|
245
|
+
|
239
246
|
attr = Hash[attrs]
|
240
|
-
charset = attr[
|
241
|
-
@encoding = charset
|
242
|
-
http_equiv = attr[
|
243
|
-
http_equiv.match(/\AContent-Type\z/i)
|
244
|
-
content = attr[
|
245
|
-
m = content.match(/;\s*charset\s*=\s*([\w-]+)/)
|
246
|
-
@encoding = m[1]
|
247
|
+
(charset = attr["charset"]) &&
|
248
|
+
(@encoding = charset)
|
249
|
+
(http_equiv = attr["http-equiv"]) &&
|
250
|
+
http_equiv.match(/\AContent-Type\z/i) &&
|
251
|
+
(content = attr["content"]) &&
|
252
|
+
(m = content.match(/;\s*charset\s*=\s*([\w-]+)/)) &&
|
253
|
+
(@encoding = m[1])
|
247
254
|
end
|
248
255
|
end
|
249
|
-
|
256
|
+
|
250
257
|
class JumpSAXHandler < SAXHandler
|
251
258
|
def initialize(jumptag)
|
252
259
|
@jumptag = jumptag
|
@@ -255,26 +262,30 @@ module Nokogiri
|
|
255
262
|
|
256
263
|
def start_element(name, attrs = [])
|
257
264
|
super
|
258
|
-
throw
|
259
|
-
throw
|
265
|
+
throw(@jumptag, @encoding) if @encoding
|
266
|
+
throw(@jumptag, nil) if /\A(?:div|h1|img|p|br)\z/.match?(name)
|
260
267
|
end
|
261
268
|
end
|
262
269
|
|
263
270
|
def self.detect_encoding(chunk)
|
264
|
-
m = chunk.match(/\A(<\?xml[ \t\r\n]
|
265
|
-
return Nokogiri.XML(m[1]).encoding
|
271
|
+
(m = chunk.match(/\A(<\?xml[ \t\r\n][^>]*>)/)) &&
|
272
|
+
(return Nokogiri.XML(m[1]).encoding)
|
266
273
|
|
267
274
|
if Nokogiri.jruby?
|
268
|
-
m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)
|
269
|
-
return m[4]
|
270
|
-
catch(:encoding_found)
|
275
|
+
(m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)) &&
|
276
|
+
(return m[4])
|
277
|
+
catch(:encoding_found) do
|
271
278
|
Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
|
272
279
|
nil
|
273
|
-
|
280
|
+
end
|
274
281
|
else
|
275
282
|
handler = SAXHandler.new
|
276
283
|
parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
|
277
|
-
|
284
|
+
begin
|
285
|
+
parser << chunk
|
286
|
+
rescue
|
287
|
+
Nokogiri::SyntaxError
|
288
|
+
end
|
278
289
|
handler.encoding
|
279
290
|
end
|
280
291
|
end
|
@@ -293,13 +304,13 @@ module Nokogiri
|
|
293
304
|
def read(len)
|
294
305
|
# no support for a call without len
|
295
306
|
|
296
|
-
|
297
|
-
@firstchunk = @io.read(len)
|
307
|
+
unless @firstchunk
|
308
|
+
(@firstchunk = @io.read(len)) || (return nil)
|
298
309
|
|
299
310
|
# This implementation expects that the first call from
|
300
311
|
# htmlReadIO() is made with a length long enough (~1KB) to
|
301
312
|
# achieve advanced encoding detection.
|
302
|
-
if encoding = EncodingReader.detect_encoding(@firstchunk)
|
313
|
+
if (encoding = EncodingReader.detect_encoding(@firstchunk))
|
303
314
|
# The first chunk is stored for the next read in retry.
|
304
315
|
raise @encoding_found = EncodingFound.new(encoding)
|
305
316
|
end
|
@@ -308,7 +319,7 @@ module Nokogiri
|
|
308
319
|
|
309
320
|
ret = @firstchunk.slice!(0, len)
|
310
321
|
if (len -= ret.length) > 0
|
311
|
-
rest = @io.read(len)
|
322
|
+
(rest = @io.read(len)) && ret << (rest)
|
312
323
|
end
|
313
324
|
if ret.empty?
|
314
325
|
nil
|
@@ -1,34 +1,38 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module Nokogiri
|
3
4
|
module HTML4
|
4
5
|
class DocumentFragment < Nokogiri::XML::DocumentFragment
|
5
6
|
####
|
6
7
|
# Create a Nokogiri::XML::DocumentFragment from +tags+, using +encoding+
|
7
|
-
def self.parse(tags, encoding = nil)
|
8
|
+
def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
|
8
9
|
doc = HTML4::Document.new
|
9
10
|
|
10
11
|
encoding ||= if tags.respond_to?(:encoding)
|
11
12
|
encoding = tags.encoding
|
12
13
|
if encoding == ::Encoding::ASCII_8BIT
|
13
|
-
|
14
|
+
"UTF-8"
|
14
15
|
else
|
15
16
|
encoding.name
|
16
17
|
end
|
17
18
|
else
|
18
|
-
|
19
|
+
"UTF-8"
|
19
20
|
end
|
20
21
|
|
21
22
|
doc.encoding = encoding
|
22
23
|
|
23
|
-
new(doc, tags)
|
24
|
+
new(doc, tags, nil, options, &block)
|
24
25
|
end
|
25
26
|
|
26
|
-
def initialize(document, tags = nil, ctx = nil)
|
27
|
+
def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEFAULT_HTML)
|
27
28
|
return self unless tags
|
28
29
|
|
30
|
+
options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
|
31
|
+
yield options if block_given?
|
32
|
+
|
29
33
|
if ctx
|
30
34
|
preexisting_errors = document.errors.dup
|
31
|
-
node_set = ctx.parse("<div>#{tags}</div>")
|
35
|
+
node_set = ctx.parse("<div>#{tags}</div>", options)
|
32
36
|
node_set.first.children.each { |child| child.parent = self } unless node_set.empty?
|
33
37
|
self.errors = document.errors - preexisting_errors
|
34
38
|
else
|
@@ -39,7 +43,7 @@ module Nokogiri
|
|
39
43
|
"/html/body/node()"
|
40
44
|
end
|
41
45
|
|
42
|
-
temp_doc = HTML4::Document.parse("<html><body>#{tags}", nil, document.encoding)
|
46
|
+
temp_doc = HTML4::Document.parse("<html><body>#{tags}", nil, document.encoding, options)
|
43
47
|
temp_doc.xpath(path).each { |child| child.parent = self }
|
44
48
|
self.errors = temp_doc.errors
|
45
49
|
end
|