nokogiri 1.12.5 → 1.14.3
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +41 -0
- data/LICENSE-DEPENDENCIES.md +830 -509
- data/LICENSE.md +1 -1
- data/README.md +23 -14
- data/bin/nokogiri +63 -50
- data/dependencies.yml +33 -66
- data/ext/nokogiri/extconf.rb +159 -63
- data/ext/nokogiri/gumbo.c +21 -11
- data/ext/nokogiri/html4_document.c +2 -2
- data/ext/nokogiri/html4_element_description.c +1 -1
- data/ext/nokogiri/html4_entity_lookup.c +2 -2
- data/ext/nokogiri/html4_sax_parser_context.c +3 -9
- data/ext/nokogiri/html4_sax_push_parser.c +1 -1
- data/ext/nokogiri/nokogiri.c +38 -51
- data/ext/nokogiri/nokogiri.h +26 -14
- data/ext/nokogiri/test_global_handlers.c +1 -1
- data/ext/nokogiri/xml_attr.c +3 -3
- data/ext/nokogiri/xml_attribute_decl.c +5 -5
- data/ext/nokogiri/xml_cdata.c +3 -3
- data/ext/nokogiri/xml_comment.c +1 -1
- data/ext/nokogiri/xml_document.c +53 -44
- data/ext/nokogiri/xml_document_fragment.c +1 -3
- data/ext/nokogiri/xml_dtd.c +11 -11
- data/ext/nokogiri/xml_element_content.c +3 -3
- data/ext/nokogiri/xml_element_decl.c +5 -5
- data/ext/nokogiri/xml_encoding_handler.c +28 -14
- data/ext/nokogiri/xml_entity_decl.c +6 -6
- data/ext/nokogiri/xml_entity_reference.c +1 -1
- data/ext/nokogiri/xml_namespace.c +80 -14
- data/ext/nokogiri/xml_node.c +982 -396
- data/ext/nokogiri/xml_node_set.c +4 -6
- data/ext/nokogiri/xml_processing_instruction.c +1 -1
- data/ext/nokogiri/xml_reader.c +133 -32
- data/ext/nokogiri/xml_relax_ng.c +1 -3
- data/ext/nokogiri/xml_sax_parser.c +23 -17
- data/ext/nokogiri/xml_sax_parser_context.c +11 -9
- data/ext/nokogiri/xml_sax_push_parser.c +1 -3
- data/ext/nokogiri/xml_schema.c +4 -6
- data/ext/nokogiri/xml_syntax_error.c +1 -1
- data/ext/nokogiri/xml_text.c +2 -2
- data/ext/nokogiri/xml_xpath_context.c +144 -114
- data/ext/nokogiri/xslt_stylesheet.c +122 -23
- data/gumbo-parser/Makefile +10 -0
- data/gumbo-parser/src/attribute.h +1 -1
- data/gumbo-parser/src/error.c +2 -2
- data/gumbo-parser/src/error.h +1 -1
- data/gumbo-parser/src/foreign_attrs.c +2 -2
- data/gumbo-parser/src/{gumbo.h → nokogiri_gumbo.h} +1 -0
- data/gumbo-parser/src/parser.c +8 -16
- data/gumbo-parser/src/replacement.h +1 -1
- data/gumbo-parser/src/string_buffer.h +1 -1
- data/gumbo-parser/src/string_piece.c +1 -1
- data/gumbo-parser/src/svg_attrs.c +2 -2
- data/gumbo-parser/src/svg_tags.c +2 -2
- data/gumbo-parser/src/tag.c +2 -1
- data/gumbo-parser/src/tag_lookup.c +7 -7
- data/gumbo-parser/src/tag_lookup.gperf +1 -0
- data/gumbo-parser/src/tag_lookup.h +1 -1
- data/gumbo-parser/src/token_buffer.h +1 -1
- data/gumbo-parser/src/tokenizer.c +1 -1
- data/gumbo-parser/src/tokenizer.h +1 -1
- data/gumbo-parser/src/utf8.c +1 -1
- data/gumbo-parser/src/utf8.h +1 -1
- data/gumbo-parser/src/util.c +1 -3
- data/gumbo-parser/src/util.h +4 -0
- data/gumbo-parser/src/vector.h +1 -1
- data/lib/nokogiri/class_resolver.rb +67 -0
- data/lib/nokogiri/css/node.rb +9 -8
- data/lib/nokogiri/css/parser.rb +360 -341
- data/lib/nokogiri/css/parser.y +249 -244
- data/lib/nokogiri/css/parser_extras.rb +22 -20
- data/lib/nokogiri/css/syntax_error.rb +1 -0
- data/lib/nokogiri/css/tokenizer.rb +4 -3
- data/lib/nokogiri/css/tokenizer.rex +3 -2
- data/lib/nokogiri/css/xpath_visitor.rb +184 -85
- data/lib/nokogiri/css.rb +44 -6
- data/lib/nokogiri/decorators/slop.rb +8 -7
- data/lib/nokogiri/encoding_handler.rb +57 -0
- data/lib/nokogiri/extension.rb +4 -3
- data/lib/nokogiri/gumbo.rb +1 -0
- data/lib/nokogiri/html.rb +16 -10
- data/lib/nokogiri/html4/builder.rb +1 -0
- data/lib/nokogiri/html4/document.rb +56 -164
- data/lib/nokogiri/html4/document_fragment.rb +11 -7
- data/lib/nokogiri/html4/element_description.rb +1 -0
- data/lib/nokogiri/html4/element_description_defaults.rb +432 -532
- data/lib/nokogiri/html4/encoding_reader.rb +121 -0
- data/lib/nokogiri/html4/entity_lookup.rb +2 -1
- data/lib/nokogiri/html4/sax/parser.rb +5 -2
- data/lib/nokogiri/html4/sax/parser_context.rb +1 -0
- data/lib/nokogiri/html4/sax/push_parser.rb +7 -7
- data/lib/nokogiri/html4.rb +12 -5
- data/lib/nokogiri/html5/document.rb +126 -32
- data/lib/nokogiri/html5/document_fragment.rb +14 -4
- data/lib/nokogiri/html5/node.rb +12 -7
- data/lib/nokogiri/html5.rb +138 -222
- data/lib/nokogiri/jruby/dependencies.rb +2 -19
- data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
- data/lib/nokogiri/syntax_error.rb +1 -0
- data/lib/nokogiri/version/constant.rb +2 -1
- data/lib/nokogiri/version/info.rb +32 -24
- data/lib/nokogiri/version.rb +1 -0
- data/lib/nokogiri/xml/attr.rb +54 -3
- data/lib/nokogiri/xml/attribute_decl.rb +2 -1
- data/lib/nokogiri/xml/builder.rb +35 -33
- data/lib/nokogiri/xml/cdata.rb +2 -1
- data/lib/nokogiri/xml/character_data.rb +1 -0
- data/lib/nokogiri/xml/document.rb +232 -143
- data/lib/nokogiri/xml/document_fragment.rb +88 -42
- data/lib/nokogiri/xml/dtd.rb +3 -2
- data/lib/nokogiri/xml/element_content.rb +1 -0
- data/lib/nokogiri/xml/element_decl.rb +2 -1
- data/lib/nokogiri/xml/entity_decl.rb +3 -2
- data/lib/nokogiri/xml/entity_reference.rb +1 -0
- data/lib/nokogiri/xml/namespace.rb +44 -0
- data/lib/nokogiri/xml/node/save_options.rb +14 -8
- data/lib/nokogiri/xml/node.rb +708 -383
- data/lib/nokogiri/xml/node_set.rb +134 -59
- data/lib/nokogiri/xml/notation.rb +12 -0
- data/lib/nokogiri/xml/parse_options.rb +140 -56
- data/lib/nokogiri/xml/pp/character_data.rb +8 -6
- data/lib/nokogiri/xml/pp/node.rb +26 -26
- data/lib/nokogiri/xml/pp.rb +1 -0
- data/lib/nokogiri/xml/processing_instruction.rb +3 -1
- data/lib/nokogiri/xml/reader.rb +20 -24
- data/lib/nokogiri/xml/relax_ng.rb +1 -0
- data/lib/nokogiri/xml/sax/document.rb +20 -19
- data/lib/nokogiri/xml/sax/parser.rb +38 -36
- data/lib/nokogiri/xml/sax/parser_context.rb +7 -3
- data/lib/nokogiri/xml/sax/push_parser.rb +5 -5
- data/lib/nokogiri/xml/sax.rb +1 -0
- data/lib/nokogiri/xml/schema.rb +7 -6
- data/lib/nokogiri/xml/searchable.rb +93 -62
- data/lib/nokogiri/xml/syntax_error.rb +5 -4
- data/lib/nokogiri/xml/text.rb +1 -0
- data/lib/nokogiri/xml/xpath/syntax_error.rb +2 -1
- data/lib/nokogiri/xml/xpath.rb +12 -0
- data/lib/nokogiri/xml/xpath_context.rb +2 -3
- data/lib/nokogiri/xml.rb +4 -3
- data/lib/nokogiri/xslt/stylesheet.rb +1 -0
- data/lib/nokogiri/xslt.rb +21 -13
- data/lib/nokogiri.rb +22 -27
- data/lib/xsd/xmlparser/nokogiri.rb +28 -25
- data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
- data/patches/libxslt/0001-update-automake-files-for-arm64.patch +2445 -1919
- data/ports/archives/libxml2-2.10.4.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.37.tar.xz +0 -0
- metadata +20 -171
- data/patches/libxml2/0004-use-glibc-strlen.patch +0 -53
- data/patches/libxml2/0005-avoid-isnan-isinf.patch +0 -81
- data/patches/libxml2/0006-update-automake-files-for-arm64.patch +0 -2511
- data/patches/libxml2/0007-Fix-XPath-recursion-limit.patch +0 -31
- data/patches/libxslt/0002-Fix-xml2-config-check-in-configure-script.patch +0 -19
- data/ports/archives/libxml2-2.9.12.tar.gz +0 -0
- data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
data/lib/nokogiri/html5.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
# frozen_string_literal: true
|
3
|
+
|
3
4
|
#
|
4
5
|
# Copyright 2013-2021 Sam Ruby, Stephen Checkoway
|
5
6
|
#
|
@@ -16,13 +17,15 @@
|
|
16
17
|
# limitations under the License.
|
17
18
|
#
|
18
19
|
|
19
|
-
require_relative
|
20
|
-
require_relative
|
21
|
-
require_relative
|
20
|
+
require_relative "html5/document"
|
21
|
+
require_relative "html5/document_fragment"
|
22
|
+
require_relative "html5/node"
|
22
23
|
|
23
24
|
module Nokogiri
|
24
|
-
#
|
25
|
-
#
|
25
|
+
# Since v1.12.0
|
26
|
+
#
|
27
|
+
# ⚠ HTML5 functionality is not available when running JRuby.
|
28
|
+
#
|
26
29
|
# Parse an HTML5 document. Convenience method for {Nokogiri::HTML5::Document.parse}
|
27
30
|
def self.HTML5(input, url = nil, encoding = nil, **options, &block)
|
28
31
|
Nokogiri::HTML5::Document.parse(input, url, encoding, **options, &block)
|
@@ -30,6 +33,8 @@ module Nokogiri
|
|
30
33
|
|
31
34
|
# == Usage
|
32
35
|
#
|
36
|
+
# ⚠ HTML5 functionality is not available when running JRuby.
|
37
|
+
#
|
33
38
|
# Parse an HTML5 document:
|
34
39
|
#
|
35
40
|
# doc = Nokogiri.HTML5(string)
|
@@ -220,254 +225,165 @@ module Nokogiri
|
|
220
225
|
# * Instead of returning +unknown+ as the element name for unknown tags, the
|
221
226
|
# original tag name is returned verbatim.
|
222
227
|
#
|
223
|
-
#
|
224
|
-
# @note HTML5 functionality is not available when running JRuby.
|
228
|
+
# Since v1.12.0
|
225
229
|
module HTML5
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace'.freeze
|
232
|
-
XMLNS_NAMESPACE = 'http://www.w3.org/2000/xmlns/'.freeze
|
230
|
+
class << self
|
231
|
+
# Parse an HTML 5 document. Convenience method for {Nokogiri::HTML5::Document.parse}
|
232
|
+
def parse(string, url = nil, encoding = nil, **options, &block)
|
233
|
+
Document.parse(string, url, encoding, **options, &block)
|
234
|
+
end
|
233
235
|
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
236
|
+
# Parse a fragment from +string+. Convenience method for
|
237
|
+
# {Nokogiri::HTML5::DocumentFragment.parse}.
|
238
|
+
def fragment(string, encoding = nil, **options)
|
239
|
+
DocumentFragment.parse(string, encoding, options)
|
240
|
+
end
|
238
241
|
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
242
|
+
# Fetch and parse a HTML document from the web, following redirects,
|
243
|
+
# handling https, and determining the character encoding using HTML5
|
244
|
+
# rules. +uri+ may be a +String+ or a +URI+. +options+ contains
|
245
|
+
# http headers and special options. Everything which is not a
|
246
|
+
# special option is considered a header. Special options include:
|
247
|
+
# * :follow_limit => number of redirects which are followed
|
248
|
+
# * :basic_auth => [username, password]
|
249
|
+
def get(uri, options = {})
|
250
|
+
# TODO: deprecate
|
251
|
+
warn("Nokogiri::HTML5.get is deprecated and will be removed in a future version of Nokogiri.",
|
252
|
+
uplevel: 1, category: :deprecated)
|
253
|
+
get_impl(uri, options)
|
254
|
+
end
|
244
255
|
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
256
|
+
# :nodoc:
|
257
|
+
def read_and_encode(string, encoding)
|
258
|
+
# Read the string with the given encoding.
|
259
|
+
if string.respond_to?(:read)
|
260
|
+
string = if encoding.nil?
|
261
|
+
string.read
|
262
|
+
else
|
263
|
+
string.read(encoding: encoding)
|
264
|
+
end
|
265
|
+
else
|
266
|
+
# Otherwise the string has the given encoding.
|
267
|
+
string = string.to_s
|
268
|
+
if encoding
|
269
|
+
string = string.dup
|
270
|
+
string.force_encoding(encoding)
|
271
|
+
end
|
272
|
+
end
|
257
273
|
|
258
|
-
|
274
|
+
# convert to UTF-8
|
275
|
+
if string.encoding != Encoding::UTF_8
|
276
|
+
string = reencode(string)
|
277
|
+
end
|
278
|
+
string
|
279
|
+
end
|
259
280
|
|
260
|
-
|
261
|
-
headers = options.clone
|
262
|
-
headers = {:follow_limit => headers} if Numeric === headers # deprecated
|
263
|
-
limit=headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
|
281
|
+
private
|
264
282
|
|
265
|
-
|
266
|
-
|
283
|
+
def get_impl(uri, options = {})
|
284
|
+
headers = options.clone
|
285
|
+
headers = { follow_limit: headers } if Numeric === headers # deprecated
|
286
|
+
limit = headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
|
267
287
|
|
268
|
-
|
288
|
+
require "net/http"
|
289
|
+
uri = URI(uri) unless URI === uri
|
269
290
|
|
270
|
-
|
271
|
-
http.use_ssl = true if uri.scheme == 'https'
|
291
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
272
292
|
|
273
|
-
|
274
|
-
|
275
|
-
# :close_on_empty_response, :continue_timeout, :key, :open_timeout,
|
276
|
-
# :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
|
277
|
-
# :verify_callback, :verify_depth, :verify_mode
|
278
|
-
options.each do |key, value|
|
279
|
-
http.send "#{key}=", headers.delete(key) if http.respond_to? "#{key}="
|
280
|
-
end
|
293
|
+
# TLS / SSL support
|
294
|
+
http.use_ssl = true if uri.scheme == "https"
|
281
295
|
|
282
|
-
|
296
|
+
# Pass through Net::HTTP override values, which currently include:
|
297
|
+
# :ca_file, :ca_path, :cert, :cert_store, :ciphers,
|
298
|
+
# :close_on_empty_response, :continue_timeout, :key, :open_timeout,
|
299
|
+
# :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
|
300
|
+
# :verify_callback, :verify_depth, :verify_mode
|
301
|
+
options.each do |key, _value|
|
302
|
+
http.send("#{key}=", headers.delete(key)) if http.respond_to?("#{key}=")
|
303
|
+
end
|
283
304
|
|
284
|
-
|
285
|
-
auth = headers.delete(:basic_auth)
|
286
|
-
auth ||= [uri.user, uri.password] if uri.user && uri.password
|
287
|
-
request.basic_auth auth.first, auth.last if auth
|
305
|
+
request = Net::HTTP::Get.new(uri.request_uri)
|
288
306
|
|
289
|
-
|
290
|
-
|
307
|
+
# basic authentication
|
308
|
+
auth = headers.delete(:basic_auth)
|
309
|
+
auth ||= [uri.user, uri.password] if uri.user && uri.password
|
310
|
+
request.basic_auth(auth.first, auth.last) if auth
|
291
311
|
|
292
|
-
|
312
|
+
# remaining options are treated as headers
|
313
|
+
headers.each { |key, value| request[key.to_s] = value.to_s }
|
293
314
|
|
294
|
-
|
295
|
-
when Net::HTTPSuccess
|
296
|
-
doc = parse(reencode(response.body, response['content-type']), options)
|
297
|
-
doc.instance_variable_set('@response', response)
|
298
|
-
doc.class.send(:attr_reader, :response)
|
299
|
-
doc
|
300
|
-
when Net::HTTPRedirection
|
301
|
-
response.value if limit <= 1
|
302
|
-
location = URI.join(uri, response['location'])
|
303
|
-
get_impl(location, options.merge(:follow_limit => limit-1))
|
304
|
-
else
|
305
|
-
response.value
|
306
|
-
end
|
307
|
-
end
|
315
|
+
response = http.request(request)
|
308
316
|
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
317
|
+
case response
|
318
|
+
when Net::HTTPSuccess
|
319
|
+
doc = parse(reencode(response.body, response["content-type"]), options)
|
320
|
+
doc.instance_variable_set(:@response, response)
|
321
|
+
doc.class.send(:attr_reader, :response)
|
322
|
+
doc
|
323
|
+
when Net::HTTPRedirection
|
324
|
+
response.value if limit <= 1
|
325
|
+
location = URI.join(uri, response["location"])
|
326
|
+
get_impl(location, options.merge(follow_limit: limit - 1))
|
314
327
|
else
|
315
|
-
|
316
|
-
end
|
317
|
-
else
|
318
|
-
# Otherwise the string has the given encoding.
|
319
|
-
string = string.to_s
|
320
|
-
if encoding
|
321
|
-
string = string.dup
|
322
|
-
string.force_encoding(encoding)
|
328
|
+
response.value
|
323
329
|
end
|
324
330
|
end
|
325
331
|
|
326
|
-
#
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
#
|
342
|
-
# http://bugs.ruby-lang.org/issues/2567
|
343
|
-
# http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
|
344
|
-
#
|
345
|
-
def self.reencode(body, content_type=nil)
|
346
|
-
if body.encoding == Encoding::ASCII_8BIT
|
347
|
-
encoding = nil
|
332
|
+
# Charset sniffing is a complex and controversial topic that understandably isn't done _by
|
333
|
+
# default_ by the Ruby Net::HTTP library. This being said, it is a very real problem for
|
334
|
+
# consumers of HTML as the default for HTML is iso-8859-1, most "good" producers use utf-8, and
|
335
|
+
# the Gumbo parser *only* supports utf-8.
|
336
|
+
#
|
337
|
+
# Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection. Following
|
338
|
+
# this lead, Nokogiri::HTML5 attempts to do likewise, while attempting to more closely follow
|
339
|
+
# the HTML5 standard.
|
340
|
+
#
|
341
|
+
# http://bugs.ruby-lang.org/issues/2567
|
342
|
+
# http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
|
343
|
+
#
|
344
|
+
def reencode(body, content_type = nil)
|
345
|
+
if body.encoding == Encoding::ASCII_8BIT
|
346
|
+
encoding = nil
|
348
347
|
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
end
|
358
|
-
|
359
|
-
# look for a charset in a content-encoding header
|
360
|
-
if content_type
|
361
|
-
encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
|
362
|
-
end
|
363
|
-
|
364
|
-
# look for a charset in a meta tag in the first 1024 bytes
|
365
|
-
if not encoding
|
366
|
-
data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, '')
|
367
|
-
data.scan(/<meta.*?>/m).each do |meta|
|
368
|
-
encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
|
348
|
+
# look for a Byte Order Mark (BOM)
|
349
|
+
initial_bytes = body[0..2].bytes
|
350
|
+
if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
|
351
|
+
encoding = Encoding::UTF_8
|
352
|
+
elsif initial_bytes[0..1] == [0xFE, 0xFF]
|
353
|
+
encoding = Encoding::UTF_16BE
|
354
|
+
elsif initial_bytes[0..1] == [0xFF, 0xFE]
|
355
|
+
encoding = Encoding::UTF_16LE
|
369
356
|
end
|
370
|
-
end
|
371
|
-
|
372
|
-
# if all else fails, default to the official default encoding for HTML
|
373
|
-
encoding ||= Encoding::ISO_8859_1
|
374
357
|
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
rescue ArgumentError
|
380
|
-
body.force_encoding(Encoding::ISO_8859_1)
|
381
|
-
end
|
382
|
-
end
|
383
|
-
|
384
|
-
body.encode(Encoding::UTF_8)
|
385
|
-
end
|
358
|
+
# look for a charset in a content-encoding header
|
359
|
+
if content_type
|
360
|
+
encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
|
361
|
+
end
|
386
362
|
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
# XXX(sfc): attach namespaces to all nodes, even html?
|
393
|
-
if ns_uri.nil? || ns_uri == HTML_NAMESPACE || ns_uri == MATHML_NAMESPACE || ns_uri == SVG_NAMESPACE
|
394
|
-
tagname = current_node.name
|
395
|
-
else
|
396
|
-
tagname = "#{ns.prefix}:#{current_node.name}"
|
397
|
-
end
|
398
|
-
io << '<' << tagname
|
399
|
-
current_node.attribute_nodes.each do |attr|
|
400
|
-
attr_ns = attr.namespace
|
401
|
-
if attr_ns.nil?
|
402
|
-
attr_name = attr.name
|
403
|
-
else
|
404
|
-
ns_uri = attr_ns.href
|
405
|
-
if ns_uri == XML_NAMESPACE
|
406
|
-
attr_name = 'xml:' + attr.name.sub(/^[^:]*:/, '')
|
407
|
-
elsif ns_uri == XMLNS_NAMESPACE && attr.name.sub(/^[^:]*:/, '') == 'xmlns'
|
408
|
-
attr_name = 'xmlns'
|
409
|
-
elsif ns_uri == XMLNS_NAMESPACE
|
410
|
-
attr_name = 'xmlns:' + attr.name.sub(/^[^:]*:/, '')
|
411
|
-
elsif ns_uri == XLINK_NAMESPACE
|
412
|
-
attr_name = 'xlink:' + attr.name.sub(/^[^:]*:/, '')
|
413
|
-
else
|
414
|
-
attr_name = "#{attr_ns.prefix}:#{attr.name}"
|
363
|
+
# look for a charset in a meta tag in the first 1024 bytes
|
364
|
+
unless encoding
|
365
|
+
data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, "")
|
366
|
+
data.scan(/<meta.*?>/im).each do |meta|
|
367
|
+
encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
|
415
368
|
end
|
416
369
|
end
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
370
|
+
|
371
|
+
# if all else fails, default to the official default encoding for HTML
|
372
|
+
encoding ||= Encoding::ISO_8859_1
|
373
|
+
|
374
|
+
# change the encoding to match the detected or inferred encoding
|
375
|
+
body = body.dup
|
376
|
+
begin
|
377
|
+
body.force_encoding(encoding)
|
378
|
+
rescue ArgumentError
|
379
|
+
body.force_encoding(Encoding::ISO_8859_1)
|
426
380
|
end
|
427
|
-
io << '</' << tagname << '>'
|
428
|
-
end
|
429
|
-
when XML::Node::TEXT_NODE
|
430
|
-
parent = current_node.parent
|
431
|
-
if parent.element? && %w[style script xmp iframe noembed noframes plaintext noscript].include?(parent.name)
|
432
|
-
io << current_node.content
|
433
|
-
else
|
434
|
-
io << escape_text(current_node.content, encoding, false)
|
435
|
-
end
|
436
|
-
when XML::Node::CDATA_SECTION_NODE
|
437
|
-
io << '<![CDATA[' << current_node.content << ']]>'
|
438
|
-
when XML::Node::COMMENT_NODE
|
439
|
-
io << '<!--' << current_node.content << '-->'
|
440
|
-
when XML::Node::PI_NODE
|
441
|
-
io << '<?' << current_node.content << '>'
|
442
|
-
when XML::Node::DOCUMENT_TYPE_NODE, XML::Node::DTD_NODE
|
443
|
-
io << '<!DOCTYPE ' << current_node.name << '>'
|
444
|
-
when XML::Node::HTML_DOCUMENT_NODE, XML::Node::DOCUMENT_FRAG_NODE
|
445
|
-
current_node.children.each do |child|
|
446
|
-
serialize_node_internal(child, io, encoding, options)
|
447
381
|
end
|
448
|
-
else
|
449
|
-
raise "Unexpected node '#{current_node.name}' of type #{current_node.type}"
|
450
|
-
end
|
451
|
-
end
|
452
382
|
|
453
|
-
|
454
|
-
if attribute_mode
|
455
|
-
text = text.gsub(/[&\u00a0"]/,
|
456
|
-
'&' => '&', "\u00a0" => ' ', '"' => '"')
|
457
|
-
else
|
458
|
-
text = text.gsub(/[&\u00a0<>]/,
|
459
|
-
'&' => '&', "\u00a0" => ' ', '<' => '<', '>' => '>')
|
383
|
+
body.encode(Encoding::UTF_8)
|
460
384
|
end
|
461
|
-
# Not part of the standard
|
462
|
-
text.encode(encoding, fallback: lambda { |c| "&\#x#{c.ord.to_s(16)};" })
|
463
|
-
end
|
464
|
-
|
465
|
-
def self.prepend_newline?(node)
|
466
|
-
return false unless %w[pre textarea listing].include?(node.name) && !node.children.empty?
|
467
|
-
first_child = node.children[0]
|
468
|
-
first_child.text? && first_child.content.start_with?("\n")
|
469
385
|
end
|
470
386
|
end
|
471
387
|
end
|
472
388
|
|
473
|
-
require_relative
|
389
|
+
require_relative "gumbo"
|
@@ -1,20 +1,3 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
-
|
3
|
-
|
4
|
-
#
|
5
|
-
# However, simply cutting defined?(JRuby::Rack::VERSION) off resulted in
|
6
|
-
# an unable-to-load-nokogiri problem. Thus, now, Nokogiri checks the presense
|
7
|
-
# of appengine-rack.jar in $LOAD_PATH. If Nokogiri is on GAE, Nokogiri
|
8
|
-
# should skip loading xml jars. This is because those are in WEB-INF/lib and
|
9
|
-
# already set in the classpath.
|
10
|
-
unless $LOAD_PATH.to_s.include?("appengine-rack")
|
11
|
-
require 'stringio'
|
12
|
-
require 'isorelax.jar'
|
13
|
-
require 'jing.jar'
|
14
|
-
require 'nekohtml.jar'
|
15
|
-
require 'nekodtd.jar'
|
16
|
-
require 'xercesImpl.jar'
|
17
|
-
require 'serializer.jar'
|
18
|
-
require 'xalan.jar'
|
19
|
-
require 'xml-apis.jar'
|
20
|
-
end
|
2
|
+
|
3
|
+
require_relative "nokogiri_jars"
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# this is a generated file, to avoid over-writing it just delete this comment
|
2
|
+
begin
|
3
|
+
require 'jar_dependencies'
|
4
|
+
rescue LoadError
|
5
|
+
require 'xalan/xalan/2.7.2/xalan-2.7.2.jar'
|
6
|
+
require 'net/sourceforge/htmlunit/neko-htmlunit/2.63.0/neko-htmlunit-2.63.0.jar'
|
7
|
+
require 'nu/validator/jing/20200702VNU/jing-20200702VNU.jar'
|
8
|
+
require 'xerces/xercesImpl/2.12.2/xercesImpl-2.12.2.jar'
|
9
|
+
require 'org/nokogiri/nekodtd/0.1.11.noko1/nekodtd-0.1.11.noko1.jar'
|
10
|
+
require 'net/sf/saxon/Saxon-HE/9.6.0-4/Saxon-HE-9.6.0-4.jar'
|
11
|
+
require 'xml-apis/xml-apis/1.4.01/xml-apis-1.4.01.jar'
|
12
|
+
require 'xalan/serializer/2.7.2/serializer-2.7.2.jar'
|
13
|
+
require 'isorelax/isorelax/20030108/isorelax-20030108.jar'
|
14
|
+
end
|
15
|
+
|
16
|
+
if defined? Jars
|
17
|
+
require_jar 'xalan', 'xalan', '2.7.2'
|
18
|
+
require_jar 'net.sourceforge.htmlunit', 'neko-htmlunit', '2.63.0'
|
19
|
+
require_jar 'nu.validator', 'jing', '20200702VNU'
|
20
|
+
require_jar 'xerces', 'xercesImpl', '2.12.2'
|
21
|
+
require_jar 'org.nokogiri', 'nekodtd', '0.1.11.noko1'
|
22
|
+
require_jar 'net.sf.saxon', 'Saxon-HE', '9.6.0-4'
|
23
|
+
require_jar 'xml-apis', 'xml-apis', '1.4.01'
|
24
|
+
require_jar 'xalan', 'serializer', '2.7.2'
|
25
|
+
require_jar 'isorelax', 'isorelax', '20030108'
|
26
|
+
end
|
27
|
+
|
28
|
+
module Nokogiri
|
29
|
+
# generated by the :vendor_jars rake task
|
30
|
+
JAR_DEPENDENCIES = {
|
31
|
+
"isorelax:isorelax" => "20030108",
|
32
|
+
"net.sf.saxon:Saxon-HE" => "9.6.0-4",
|
33
|
+
"net.sourceforge.htmlunit:neko-htmlunit" => "2.63.0",
|
34
|
+
"nu.validator:jing" => "20200702VNU",
|
35
|
+
"org.nokogiri:nekodtd" => "0.1.11.noko1",
|
36
|
+
"xalan:serializer" => "2.7.2",
|
37
|
+
"xalan:xalan" => "2.7.2",
|
38
|
+
"xerces:xercesImpl" => "2.12.2",
|
39
|
+
"xml-apis:xml-apis" => "1.4.01",
|
40
|
+
}.freeze
|
41
|
+
XERCES_VERSION = JAR_DEPENDENCIES["xerces:xercesImpl"]
|
42
|
+
NEKO_VERSION = JAR_DEPENDENCIES["net.sourceforge.htmlunit:neko-htmlunit"]
|
43
|
+
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require "singleton"
|
3
4
|
require "shellwords"
|
4
5
|
|
@@ -102,18 +103,18 @@ module Nokogiri
|
|
102
103
|
ldflags = []
|
103
104
|
|
104
105
|
if libxml2_using_packaged?
|
105
|
-
cppflags << "-I#{File.join(header_directory,
|
106
|
-
cppflags << "-I#{File.join(header_directory,
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
ldflags << "-L#{lib_directory.shellescape}"
|
115
|
-
ldflags << "-l:nokogiri.so"
|
106
|
+
cppflags << "-I#{File.join(header_directory, "include").shellescape}"
|
107
|
+
cppflags << "-I#{File.join(header_directory, "include/libxml2").shellescape}"
|
108
|
+
end
|
109
|
+
|
110
|
+
if windows?
|
111
|
+
# on windows, nokogumbo needs to link against nokogiri.so to resolve symbols. see #2167
|
112
|
+
lib_directory = File.expand_path(File.join(File.dirname(__FILE__), "../#{ruby_minor}"))
|
113
|
+
unless File.exist?(lib_directory)
|
114
|
+
lib_directory = File.expand_path(File.join(File.dirname(__FILE__), ".."))
|
116
115
|
end
|
116
|
+
ldflags << "-L#{lib_directory.shellescape}"
|
117
|
+
ldflags << "-l:nokogiri.so"
|
117
118
|
end
|
118
119
|
|
119
120
|
nokogiri["cppflags"] = cppflags
|
@@ -168,21 +169,18 @@ module Nokogiri
|
|
168
169
|
vi["other_libraries"] = Hash[*Nokogiri::OTHER_LIBRARY_VERSIONS.split(/[,:]/)]
|
169
170
|
elsif jruby?
|
170
171
|
vi["other_libraries"] = {}.tap do |ol|
|
171
|
-
|
172
|
-
|
172
|
+
Nokogiri::JAR_DEPENDENCIES.each do |k, v|
|
173
|
+
ol[k] = v
|
174
|
+
end
|
173
175
|
end
|
174
176
|
end
|
175
177
|
end
|
176
178
|
end
|
177
179
|
|
178
180
|
def to_markdown
|
179
|
-
begin
|
180
|
-
require "psych"
|
181
|
-
rescue LoadError
|
182
|
-
end
|
183
181
|
require "yaml"
|
184
182
|
"# Nokogiri (#{Nokogiri::VERSION})\n" +
|
185
|
-
|
183
|
+
YAML.dump(to_hash).each_line.map { |line| " #{line}" }.join
|
186
184
|
end
|
187
185
|
|
188
186
|
instance.warnings.each do |warning|
|
@@ -190,26 +188,36 @@ module Nokogiri
|
|
190
188
|
end
|
191
189
|
end
|
192
190
|
|
193
|
-
|
191
|
+
# :nodoc:
|
192
|
+
def self.uses_libxml?(requirement = nil)
|
194
193
|
return false unless VersionInfo.instance.libxml2?
|
195
194
|
return true unless requirement
|
195
|
+
|
196
196
|
Gem::Requirement.new(requirement).satisfied_by?(VersionInfo.instance.loaded_libxml_version)
|
197
197
|
end
|
198
198
|
|
199
|
+
# :nodoc:
|
199
200
|
def self.uses_gumbo?
|
200
201
|
uses_libxml? # TODO: replace with Gumbo functionality
|
201
202
|
end
|
202
203
|
|
203
|
-
|
204
|
+
# :nodoc:
|
205
|
+
def self.jruby?
|
204
206
|
VersionInfo.instance.jruby?
|
205
207
|
end
|
206
208
|
|
207
|
-
#
|
208
|
-
|
209
|
-
|
209
|
+
# :nodoc:
|
210
|
+
def self.libxml2_patches
|
211
|
+
if VersionInfo.instance.libxml2_using_packaged?
|
212
|
+
Nokogiri::VERSION_INFO["libxml"]["patches"]
|
213
|
+
else
|
214
|
+
[]
|
215
|
+
end
|
210
216
|
end
|
217
|
+
|
218
|
+
require_relative "../jruby/dependencies" if Nokogiri.jruby?
|
211
219
|
require_relative "../extension"
|
212
220
|
|
213
|
-
#
|
221
|
+
# Detailed version info about Nokogiri and the installed extension dependencies.
|
214
222
|
VERSION_INFO = VersionInfo.instance.to_hash
|
215
223
|
end
|
data/lib/nokogiri/version.rb
CHANGED