nokogiri 1.13.8 → 1.15.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +40 -0
- data/LICENSE-DEPENDENCIES.md +830 -509
- data/LICENSE.md +1 -1
- data/README.md +18 -11
- data/dependencies.yml +33 -15
- data/ext/nokogiri/extconf.rb +164 -46
- data/ext/nokogiri/gumbo.c +20 -10
- data/ext/nokogiri/html4_document.c +3 -4
- data/ext/nokogiri/html4_element_description.c +20 -15
- data/ext/nokogiri/html4_entity_lookup.c +2 -2
- data/ext/nokogiri/html4_sax_parser_context.c +11 -22
- data/ext/nokogiri/html4_sax_push_parser.c +3 -3
- data/ext/nokogiri/nokogiri.c +84 -75
- data/ext/nokogiri/nokogiri.h +31 -16
- data/ext/nokogiri/test_global_handlers.c +1 -1
- data/ext/nokogiri/xml_attr.c +2 -2
- data/ext/nokogiri/xml_attribute_decl.c +2 -2
- data/ext/nokogiri/xml_cdata.c +32 -18
- data/ext/nokogiri/xml_comment.c +2 -2
- data/ext/nokogiri/xml_document.c +127 -34
- data/ext/nokogiri/xml_document_fragment.c +2 -2
- data/ext/nokogiri/xml_dtd.c +2 -2
- data/ext/nokogiri/xml_element_content.c +34 -31
- data/ext/nokogiri/xml_element_decl.c +7 -7
- data/ext/nokogiri/xml_encoding_handler.c +15 -7
- data/ext/nokogiri/xml_entity_decl.c +1 -1
- data/ext/nokogiri/xml_entity_reference.c +2 -2
- data/ext/nokogiri/xml_namespace.c +79 -14
- data/ext/nokogiri/xml_node.c +300 -34
- data/ext/nokogiri/xml_node_set.c +125 -107
- data/ext/nokogiri/xml_processing_instruction.c +2 -2
- data/ext/nokogiri/xml_reader.c +81 -48
- data/ext/nokogiri/xml_relax_ng.c +66 -81
- data/ext/nokogiri/xml_sax_parser.c +45 -20
- data/ext/nokogiri/xml_sax_parser_context.c +46 -30
- data/ext/nokogiri/xml_sax_push_parser.c +30 -11
- data/ext/nokogiri/xml_schema.c +95 -117
- data/ext/nokogiri/xml_syntax_error.c +1 -1
- data/ext/nokogiri/xml_text.c +28 -14
- data/ext/nokogiri/xml_xpath_context.c +216 -136
- data/ext/nokogiri/xslt_stylesheet.c +118 -64
- data/gumbo-parser/Makefile +10 -0
- data/gumbo-parser/src/attribute.h +1 -1
- data/gumbo-parser/src/error.c +10 -6
- data/gumbo-parser/src/error.h +1 -1
- data/gumbo-parser/src/foreign_attrs.c +15 -16
- data/gumbo-parser/src/foreign_attrs.gperf +1 -1
- data/gumbo-parser/src/{gumbo.h → nokogiri_gumbo.h} +1 -0
- data/gumbo-parser/src/parser.c +21 -5
- data/gumbo-parser/src/replacement.h +1 -1
- data/gumbo-parser/src/string_buffer.h +1 -1
- data/gumbo-parser/src/string_piece.c +1 -1
- data/gumbo-parser/src/svg_attrs.c +2 -2
- data/gumbo-parser/src/svg_tags.c +2 -2
- data/gumbo-parser/src/tag.c +2 -1
- data/gumbo-parser/src/tag_lookup.c +7 -7
- data/gumbo-parser/src/tag_lookup.gperf +1 -0
- data/gumbo-parser/src/tag_lookup.h +1 -1
- data/gumbo-parser/src/token_buffer.h +1 -1
- data/gumbo-parser/src/tokenizer.c +1 -1
- data/gumbo-parser/src/tokenizer.h +1 -1
- data/gumbo-parser/src/utf8.c +1 -1
- data/gumbo-parser/src/utf8.h +1 -1
- data/gumbo-parser/src/util.c +1 -3
- data/gumbo-parser/src/util.h +4 -0
- data/gumbo-parser/src/vector.h +1 -1
- data/lib/nokogiri/css/node.rb +2 -2
- data/lib/nokogiri/css/xpath_visitor.rb +7 -5
- data/lib/nokogiri/css.rb +6 -0
- data/lib/nokogiri/decorators/slop.rb +1 -1
- data/lib/nokogiri/encoding_handler.rb +57 -0
- data/lib/nokogiri/extension.rb +4 -3
- data/lib/nokogiri/html4/document.rb +2 -121
- data/lib/nokogiri/html4/document_fragment.rb +1 -1
- data/lib/nokogiri/html4/element_description_defaults.rb +1827 -365
- data/lib/nokogiri/html4/encoding_reader.rb +121 -0
- data/lib/nokogiri/html4.rb +1 -0
- data/lib/nokogiri/html5/document.rb +113 -36
- data/lib/nokogiri/html5/document_fragment.rb +10 -3
- data/lib/nokogiri/html5/node.rb +8 -5
- data/lib/nokogiri/html5.rb +130 -216
- data/lib/nokogiri/jruby/dependencies.rb +1 -19
- data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/version/info.rb +11 -10
- data/lib/nokogiri/xml/attr.rb +49 -0
- data/lib/nokogiri/xml/attribute_decl.rb +4 -2
- data/lib/nokogiri/xml/builder.rb +1 -1
- data/lib/nokogiri/xml/document.rb +102 -55
- data/lib/nokogiri/xml/document_fragment.rb +50 -7
- data/lib/nokogiri/xml/element_content.rb +10 -2
- data/lib/nokogiri/xml/element_decl.rb +4 -2
- data/lib/nokogiri/xml/entity_decl.rb +4 -2
- data/lib/nokogiri/xml/namespace.rb +42 -0
- data/lib/nokogiri/xml/node/save_options.rb +14 -4
- data/lib/nokogiri/xml/node.rb +212 -48
- data/lib/nokogiri/xml/node_set.rb +88 -9
- data/lib/nokogiri/xml/parse_options.rb +129 -50
- data/lib/nokogiri/xml/pp/node.rb +28 -15
- data/lib/nokogiri/xml/processing_instruction.rb +2 -1
- data/lib/nokogiri/xml/sax/document.rb +1 -1
- data/lib/nokogiri/xml/sax/parser.rb +2 -3
- data/lib/nokogiri/xml/searchable.rb +18 -10
- data/lib/nokogiri/xslt.rb +74 -4
- data/lib/nokogiri.rb +15 -15
- data/lib/xsd/xmlparser/nokogiri.rb +4 -2
- data/patches/libxml2/0010-update-config.guess-and-config.sub-for-libxml2.patch +224 -0
- data/patches/libxml2/0011-rip-out-libxml2-s-libc_single_threaded-support.patch +30 -0
- data/patches/libxslt/0001-update-config.guess-and-config.sub-for-libxslt.patch +224 -0
- data/ports/archives/libxml2-2.11.7.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.39.tar.xz +0 -0
- metadata +19 -242
- data/patches/libxml2/0004-use-glibc-strlen.patch +0 -53
- data/patches/libxml2/0005-avoid-isnan-isinf.patch +0 -81
- data/patches/libxml2/0006-update-automake-files-for-arm64.patch +0 -3040
- data/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch +0 -61
- data/patches/libxslt/0001-update-automake-files-for-arm64.patch +0 -3037
- data/ports/archives/libxml2-2.9.14.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.35.tar.xz +0 -0
data/lib/nokogiri/html5.rb
CHANGED
@@ -227,250 +227,164 @@ module Nokogiri
|
|
227
227
|
#
|
228
228
|
# Since v1.12.0
|
229
229
|
module HTML5
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
|
236
|
-
XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/"
|
230
|
+
class << self
|
231
|
+
# Parse an HTML 5 document. Convenience method for {Nokogiri::HTML5::Document.parse}
|
232
|
+
def parse(string, url = nil, encoding = nil, **options, &block)
|
233
|
+
Document.parse(string, url, encoding, **options, &block)
|
234
|
+
end
|
237
235
|
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
236
|
+
# Parse a fragment from +string+. Convenience method for
|
237
|
+
# {Nokogiri::HTML5::DocumentFragment.parse}.
|
238
|
+
def fragment(string, encoding = nil, **options)
|
239
|
+
DocumentFragment.parse(string, encoding, options)
|
240
|
+
end
|
242
241
|
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
242
|
+
# Fetch and parse a HTML document from the web, following redirects,
|
243
|
+
# handling https, and determining the character encoding using HTML5
|
244
|
+
# rules. +uri+ may be a +String+ or a +URI+. +options+ contains
|
245
|
+
# http headers and special options. Everything which is not a
|
246
|
+
# special option is considered a header. Special options include:
|
247
|
+
# * :follow_limit => number of redirects which are followed
|
248
|
+
# * :basic_auth => [username, password]
|
249
|
+
def get(uri, options = {})
|
250
|
+
# TODO: deprecate
|
251
|
+
warn(
|
252
|
+
"Nokogiri::HTML5.get is deprecated and will be removed in a future version of Nokogiri.",
|
253
|
+
uplevel: 1,
|
254
|
+
category: :deprecated,
|
255
|
+
)
|
256
|
+
get_impl(uri, options)
|
257
|
+
end
|
248
258
|
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
259
|
+
# :nodoc:
|
260
|
+
def read_and_encode(string, encoding)
|
261
|
+
# Read the string with the given encoding.
|
262
|
+
if string.respond_to?(:read)
|
263
|
+
string = if encoding.nil?
|
264
|
+
string.read
|
265
|
+
else
|
266
|
+
string.read(encoding: encoding)
|
267
|
+
end
|
268
|
+
else
|
269
|
+
# Otherwise the string has the given encoding.
|
270
|
+
string = string.to_s
|
271
|
+
if encoding
|
272
|
+
string = string.dup
|
273
|
+
string.force_encoding(encoding)
|
274
|
+
end
|
275
|
+
end
|
262
276
|
|
263
|
-
|
277
|
+
# convert to UTF-8
|
278
|
+
if string.encoding != Encoding::UTF_8
|
279
|
+
string = reencode(string)
|
280
|
+
end
|
281
|
+
string
|
282
|
+
end
|
264
283
|
|
265
|
-
|
266
|
-
headers = options.clone
|
267
|
-
headers = { follow_limit: headers } if Numeric === headers # deprecated
|
268
|
-
limit = headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
|
284
|
+
private
|
269
285
|
|
270
|
-
|
271
|
-
|
286
|
+
def get_impl(uri, options = {})
|
287
|
+
headers = options.clone
|
288
|
+
headers = { follow_limit: headers } if Numeric === headers # deprecated
|
289
|
+
limit = headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
|
272
290
|
|
273
|
-
|
291
|
+
require "net/http"
|
292
|
+
uri = URI(uri) unless URI === uri
|
274
293
|
|
275
|
-
|
276
|
-
http.use_ssl = true if uri.scheme == "https"
|
294
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
277
295
|
|
278
|
-
|
279
|
-
|
280
|
-
# :close_on_empty_response, :continue_timeout, :key, :open_timeout,
|
281
|
-
# :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
|
282
|
-
# :verify_callback, :verify_depth, :verify_mode
|
283
|
-
options.each do |key, _value|
|
284
|
-
http.send("#{key}=", headers.delete(key)) if http.respond_to?("#{key}=")
|
285
|
-
end
|
296
|
+
# TLS / SSL support
|
297
|
+
http.use_ssl = true if uri.scheme == "https"
|
286
298
|
|
287
|
-
|
299
|
+
# Pass through Net::HTTP override values, which currently include:
|
300
|
+
# :ca_file, :ca_path, :cert, :cert_store, :ciphers,
|
301
|
+
# :close_on_empty_response, :continue_timeout, :key, :open_timeout,
|
302
|
+
# :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
|
303
|
+
# :verify_callback, :verify_depth, :verify_mode
|
304
|
+
options.each do |key, _value|
|
305
|
+
http.send("#{key}=", headers.delete(key)) if http.respond_to?("#{key}=")
|
306
|
+
end
|
288
307
|
|
289
|
-
|
290
|
-
auth = headers.delete(:basic_auth)
|
291
|
-
auth ||= [uri.user, uri.password] if uri.user && uri.password
|
292
|
-
request.basic_auth(auth.first, auth.last) if auth
|
308
|
+
request = Net::HTTP::Get.new(uri.request_uri)
|
293
309
|
|
294
|
-
|
295
|
-
|
310
|
+
# basic authentication
|
311
|
+
auth = headers.delete(:basic_auth)
|
312
|
+
auth ||= [uri.user, uri.password] if uri.user && uri.password
|
313
|
+
request.basic_auth(auth.first, auth.last) if auth
|
296
314
|
|
297
|
-
|
315
|
+
# remaining options are treated as headers
|
316
|
+
headers.each { |key, value| request[key.to_s] = value.to_s }
|
298
317
|
|
299
|
-
|
300
|
-
when Net::HTTPSuccess
|
301
|
-
doc = parse(reencode(response.body, response["content-type"]), options)
|
302
|
-
doc.instance_variable_set("@response", response)
|
303
|
-
doc.class.send(:attr_reader, :response)
|
304
|
-
doc
|
305
|
-
when Net::HTTPRedirection
|
306
|
-
response.value if limit <= 1
|
307
|
-
location = URI.join(uri, response["location"])
|
308
|
-
get_impl(location, options.merge(follow_limit: limit - 1))
|
309
|
-
else
|
310
|
-
response.value
|
311
|
-
end
|
312
|
-
end
|
318
|
+
response = http.request(request)
|
313
319
|
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
320
|
+
case response
|
321
|
+
when Net::HTTPSuccess
|
322
|
+
doc = parse(reencode(response.body, response["content-type"]), options)
|
323
|
+
doc.instance_variable_set(:@response, response)
|
324
|
+
doc.class.send(:attr_reader, :response)
|
325
|
+
doc
|
326
|
+
when Net::HTTPRedirection
|
327
|
+
response.value if limit <= 1
|
328
|
+
location = URI.join(uri, response["location"])
|
329
|
+
get_impl(location, options.merge(follow_limit: limit - 1))
|
319
330
|
else
|
320
|
-
|
321
|
-
end
|
322
|
-
else
|
323
|
-
# Otherwise the string has the given encoding.
|
324
|
-
string = string.to_s
|
325
|
-
if encoding
|
326
|
-
string = string.dup
|
327
|
-
string.force_encoding(encoding)
|
331
|
+
response.value
|
328
332
|
end
|
329
333
|
end
|
330
334
|
|
331
|
-
#
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
initial_bytes = body[0..2].bytes
|
356
|
-
if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
|
357
|
-
encoding = Encoding::UTF_8
|
358
|
-
elsif initial_bytes[0..1] == [0xFE, 0xFF]
|
359
|
-
encoding = Encoding::UTF_16BE
|
360
|
-
elsif initial_bytes[0..1] == [0xFF, 0xFE]
|
361
|
-
encoding = Encoding::UTF_16LE
|
362
|
-
end
|
363
|
-
|
364
|
-
# look for a charset in a content-encoding header
|
365
|
-
if content_type
|
366
|
-
encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
|
367
|
-
end
|
368
|
-
|
369
|
-
# look for a charset in a meta tag in the first 1024 bytes
|
370
|
-
unless encoding
|
371
|
-
data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, "")
|
372
|
-
data.scan(/<meta.*?>/m).each do |meta|
|
373
|
-
encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
|
335
|
+
# Charset sniffing is a complex and controversial topic that understandably isn't done _by
|
336
|
+
# default_ by the Ruby Net::HTTP library. This being said, it is a very real problem for
|
337
|
+
# consumers of HTML as the default for HTML is iso-8859-1, most "good" producers use utf-8, and
|
338
|
+
# the Gumbo parser *only* supports utf-8.
|
339
|
+
#
|
340
|
+
# Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection. Following
|
341
|
+
# this lead, Nokogiri::HTML5 attempts to do likewise, while attempting to more closely follow
|
342
|
+
# the HTML5 standard.
|
343
|
+
#
|
344
|
+
# http://bugs.ruby-lang.org/issues/2567
|
345
|
+
# http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
|
346
|
+
#
|
347
|
+
def reencode(body, content_type = nil)
|
348
|
+
if body.encoding == Encoding::ASCII_8BIT
|
349
|
+
encoding = nil
|
350
|
+
|
351
|
+
# look for a Byte Order Mark (BOM)
|
352
|
+
initial_bytes = body[0..2].bytes
|
353
|
+
if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
|
354
|
+
encoding = Encoding::UTF_8
|
355
|
+
elsif initial_bytes[0..1] == [0xFE, 0xFF]
|
356
|
+
encoding = Encoding::UTF_16BE
|
357
|
+
elsif initial_bytes[0..1] == [0xFF, 0xFE]
|
358
|
+
encoding = Encoding::UTF_16LE
|
374
359
|
end
|
375
|
-
end
|
376
|
-
|
377
|
-
# if all else fails, default to the official default encoding for HTML
|
378
|
-
encoding ||= Encoding::ISO_8859_1
|
379
|
-
|
380
|
-
# change the encoding to match the detected or inferred encoding
|
381
|
-
body = body.dup
|
382
|
-
begin
|
383
|
-
body.force_encoding(encoding)
|
384
|
-
rescue ArgumentError
|
385
|
-
body.force_encoding(Encoding::ISO_8859_1)
|
386
|
-
end
|
387
|
-
end
|
388
360
|
|
389
|
-
|
390
|
-
|
361
|
+
# look for a charset in a content-encoding header
|
362
|
+
if content_type
|
363
|
+
encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
|
364
|
+
end
|
391
365
|
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
# XXX(sfc): attach namespaces to all nodes, even html?
|
398
|
-
tagname = if ns_uri.nil? || ns_uri == HTML_NAMESPACE || ns_uri == MATHML_NAMESPACE || ns_uri == SVG_NAMESPACE
|
399
|
-
current_node.name
|
400
|
-
else
|
401
|
-
"#{ns.prefix}:#{current_node.name}"
|
402
|
-
end
|
403
|
-
io << "<" << tagname
|
404
|
-
current_node.attribute_nodes.each do |attr|
|
405
|
-
attr_ns = attr.namespace
|
406
|
-
if attr_ns.nil?
|
407
|
-
attr_name = attr.name
|
408
|
-
else
|
409
|
-
ns_uri = attr_ns.href
|
410
|
-
attr_name = if ns_uri == XML_NAMESPACE
|
411
|
-
"xml:" + attr.name.sub(/^[^:]*:/, "")
|
412
|
-
elsif ns_uri == XMLNS_NAMESPACE && attr.name.sub(/^[^:]*:/, "") == "xmlns"
|
413
|
-
"xmlns"
|
414
|
-
elsif ns_uri == XMLNS_NAMESPACE
|
415
|
-
"xmlns:" + attr.name.sub(/^[^:]*:/, "")
|
416
|
-
elsif ns_uri == XLINK_NAMESPACE
|
417
|
-
"xlink:" + attr.name.sub(/^[^:]*:/, "")
|
418
|
-
else
|
419
|
-
"#{attr_ns.prefix}:#{attr.name}"
|
366
|
+
# look for a charset in a meta tag in the first 1024 bytes
|
367
|
+
unless encoding
|
368
|
+
data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, "")
|
369
|
+
data.scan(/<meta.*?>/im).each do |meta|
|
370
|
+
encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
|
420
371
|
end
|
421
372
|
end
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
373
|
+
|
374
|
+
# if all else fails, default to the official default encoding for HTML
|
375
|
+
encoding ||= Encoding::ISO_8859_1
|
376
|
+
|
377
|
+
# change the encoding to match the detected or inferred encoding
|
378
|
+
body = body.dup
|
379
|
+
begin
|
380
|
+
body.force_encoding(encoding)
|
381
|
+
rescue ArgumentError
|
382
|
+
body.force_encoding(Encoding::ISO_8859_1)
|
430
383
|
end
|
431
|
-
io << "</" << tagname << ">"
|
432
|
-
end
|
433
|
-
when XML::Node::TEXT_NODE
|
434
|
-
parent = current_node.parent
|
435
|
-
io << if parent.element? && ["style", "script", "xmp", "iframe", "noembed", "noframes", "plaintext", "noscript"].include?(parent.name)
|
436
|
-
current_node.content
|
437
|
-
else
|
438
|
-
escape_text(current_node.content, encoding, false)
|
439
|
-
end
|
440
|
-
when XML::Node::CDATA_SECTION_NODE
|
441
|
-
io << "<![CDATA[" << current_node.content << "]]>"
|
442
|
-
when XML::Node::COMMENT_NODE
|
443
|
-
io << "<!--" << current_node.content << "-->"
|
444
|
-
when XML::Node::PI_NODE
|
445
|
-
io << "<?" << current_node.content << ">"
|
446
|
-
when XML::Node::DOCUMENT_TYPE_NODE, XML::Node::DTD_NODE
|
447
|
-
io << "<!DOCTYPE " << current_node.name << ">"
|
448
|
-
when XML::Node::HTML_DOCUMENT_NODE, XML::Node::DOCUMENT_FRAG_NODE
|
449
|
-
current_node.children.each do |child|
|
450
|
-
serialize_node_internal(child, io, encoding, options)
|
451
384
|
end
|
452
|
-
else
|
453
|
-
raise "Unexpected node '#{current_node.name}' of type #{current_node.type}"
|
454
|
-
end
|
455
|
-
end
|
456
385
|
|
457
|
-
|
458
|
-
text = if attribute_mode
|
459
|
-
text.gsub(/[&\u00a0"]/,
|
460
|
-
"&" => "&amp;", "\u00a0" => "&nbsp;", '"' => "&quot;")
|
461
|
-
else
|
462
|
-
text.gsub(/[&\u00a0<>]/,
|
463
|
-
"&" => "&amp;", "\u00a0" => "&nbsp;", "<" => "&lt;", ">" => "&gt;")
|
386
|
+
body.encode(Encoding::UTF_8)
|
464
387
|
end
|
465
|
-
# Not part of the standard
|
466
|
-
text.encode(encoding, fallback: lambda { |c| "&\#x#{c.ord.to_s(16)};" })
|
467
|
-
end
|
468
|
-
|
469
|
-
def self.prepend_newline?(node)
|
470
|
-
return false unless ["pre", "textarea", "listing"].include?(node.name) && !node.children.empty?
|
471
|
-
|
472
|
-
first_child = node.children[0]
|
473
|
-
first_child.text? && first_child.content.start_with?("\n")
|
474
388
|
end
|
475
389
|
end
|
476
390
|
end
|
@@ -1,21 +1,3 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
|
4
|
-
# unless defined?(JRuby::Rack::VERSION) || defined?(AppEngine::ApiProxy)
|
5
|
-
#
|
6
|
-
# However, simply cutting defined?(JRuby::Rack::VERSION) off resulted in
|
7
|
-
# an unable-to-load-nokogiri problem. Thus, now, Nokogiri checks the presense
|
8
|
-
# of appengine-rack.jar in $LOAD_PATH. If Nokogiri is on GAE, Nokogiri
|
9
|
-
# should skip loading xml jars. This is because those are in WEB-INF/lib and
|
10
|
-
# already set in the classpath.
|
11
|
-
unless $LOAD_PATH.to_s.include?("appengine-rack")
|
12
|
-
require "stringio"
|
13
|
-
require "isorelax.jar"
|
14
|
-
require "jing.jar"
|
15
|
-
require "nekohtml.jar"
|
16
|
-
require "nekodtd.jar"
|
17
|
-
require "xercesImpl.jar"
|
18
|
-
require "serializer.jar"
|
19
|
-
require "xalan.jar"
|
20
|
-
require "xml-apis.jar"
|
21
|
-
end
|
3
|
+
require_relative "nokogiri_jars"
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# this is a generated file, to avoid over-writing it just delete this comment
|
2
|
+
begin
|
3
|
+
require 'jar_dependencies'
|
4
|
+
rescue LoadError
|
5
|
+
require 'xalan/serializer/2.7.3/serializer-2.7.3.jar'
|
6
|
+
require 'net/sourceforge/htmlunit/neko-htmlunit/2.63.0/neko-htmlunit-2.63.0.jar'
|
7
|
+
require 'nu/validator/jing/20200702VNU/jing-20200702VNU.jar'
|
8
|
+
require 'xerces/xercesImpl/2.12.2/xercesImpl-2.12.2.jar'
|
9
|
+
require 'net/sf/saxon/Saxon-HE/9.6.0-4/Saxon-HE-9.6.0-4.jar'
|
10
|
+
require 'xalan/xalan/2.7.3/xalan-2.7.3.jar'
|
11
|
+
require 'xml-apis/xml-apis/1.4.01/xml-apis-1.4.01.jar'
|
12
|
+
require 'org/nokogiri/nekodtd/0.1.11.noko2/nekodtd-0.1.11.noko2.jar'
|
13
|
+
require 'isorelax/isorelax/20030108/isorelax-20030108.jar'
|
14
|
+
end
|
15
|
+
|
16
|
+
if defined? Jars
|
17
|
+
require_jar 'xalan', 'serializer', '2.7.3'
|
18
|
+
require_jar 'net.sourceforge.htmlunit', 'neko-htmlunit', '2.63.0'
|
19
|
+
require_jar 'nu.validator', 'jing', '20200702VNU'
|
20
|
+
require_jar 'xerces', 'xercesImpl', '2.12.2'
|
21
|
+
require_jar 'net.sf.saxon', 'Saxon-HE', '9.6.0-4'
|
22
|
+
require_jar 'xalan', 'xalan', '2.7.3'
|
23
|
+
require_jar 'xml-apis', 'xml-apis', '1.4.01'
|
24
|
+
require_jar 'org.nokogiri', 'nekodtd', '0.1.11.noko2'
|
25
|
+
require_jar 'isorelax', 'isorelax', '20030108'
|
26
|
+
end
|
27
|
+
|
28
|
+
module Nokogiri
|
29
|
+
# generated by the :vendor_jars rake task
|
30
|
+
JAR_DEPENDENCIES = {
|
31
|
+
"isorelax:isorelax" => "20030108",
|
32
|
+
"net.sf.saxon:Saxon-HE" => "9.6.0-4",
|
33
|
+
"net.sourceforge.htmlunit:neko-htmlunit" => "2.63.0",
|
34
|
+
"nu.validator:jing" => "20200702VNU",
|
35
|
+
"org.nokogiri:nekodtd" => "0.1.11.noko2",
|
36
|
+
"xalan:serializer" => "2.7.3",
|
37
|
+
"xalan:xalan" => "2.7.3",
|
38
|
+
"xerces:xercesImpl" => "2.12.2",
|
39
|
+
"xml-apis:xml-apis" => "1.4.01",
|
40
|
+
}.freeze
|
41
|
+
XERCES_VERSION = JAR_DEPENDENCIES["xerces:xercesImpl"]
|
42
|
+
NEKO_VERSION = JAR_DEPENDENCIES["net.sourceforge.htmlunit:neko-htmlunit"]
|
43
|
+
end
|
@@ -105,16 +105,16 @@ module Nokogiri
|
|
105
105
|
if libxml2_using_packaged?
|
106
106
|
cppflags << "-I#{File.join(header_directory, "include").shellescape}"
|
107
107
|
cppflags << "-I#{File.join(header_directory, "include/libxml2").shellescape}"
|
108
|
+
end
|
108
109
|
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
end
|
115
|
-
ldflags << "-L#{lib_directory.shellescape}"
|
116
|
-
ldflags << "-l:nokogiri.so"
|
110
|
+
if windows?
|
111
|
+
# on windows, nokogumbo needs to link against nokogiri.so to resolve symbols. see #2167
|
112
|
+
lib_directory = File.expand_path(File.join(File.dirname(__FILE__), "../#{ruby_minor}"))
|
113
|
+
unless File.exist?(lib_directory)
|
114
|
+
lib_directory = File.expand_path(File.join(File.dirname(__FILE__), ".."))
|
117
115
|
end
|
116
|
+
ldflags << "-L#{lib_directory.shellescape}"
|
117
|
+
ldflags << "-l:nokogiri.so"
|
118
118
|
end
|
119
119
|
|
120
120
|
nokogiri["cppflags"] = cppflags
|
@@ -169,8 +169,9 @@ module Nokogiri
|
|
169
169
|
vi["other_libraries"] = Hash[*Nokogiri::OTHER_LIBRARY_VERSIONS.split(/[,:]/)]
|
170
170
|
elsif jruby?
|
171
171
|
vi["other_libraries"] = {}.tap do |ol|
|
172
|
-
|
173
|
-
|
172
|
+
Nokogiri::JAR_DEPENDENCIES.each do |k, v|
|
173
|
+
ol[k] = v
|
174
|
+
end
|
174
175
|
end
|
175
176
|
end
|
176
177
|
end
|
data/lib/nokogiri/xml/attr.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# coding: utf-8
|
1
2
|
# frozen_string_literal: true
|
2
3
|
|
3
4
|
module Nokogiri
|
@@ -7,6 +8,54 @@ module Nokogiri
|
|
7
8
|
alias_method :to_s, :content
|
8
9
|
alias_method :content=, :value=
|
9
10
|
|
11
|
+
#
|
12
|
+
# :call-seq: deconstruct_keys(array_of_names) → Hash
|
13
|
+
#
|
14
|
+
# Returns a hash describing the Attr, to use in pattern matching.
|
15
|
+
#
|
16
|
+
# Valid keys and their values:
|
17
|
+
# - +name+ → (String) The name of the attribute.
|
18
|
+
# - +value+ → (String) The value of the attribute.
|
19
|
+
# - +namespace+ → (Namespace, nil) The Namespace of the attribute, or +nil+ if there is no namespace.
|
20
|
+
#
|
21
|
+
# ⚡ This is an experimental feature, available since v1.14.0
|
22
|
+
#
|
23
|
+
# *Example*
|
24
|
+
#
|
25
|
+
# doc = Nokogiri::XML.parse(<<~XML)
|
26
|
+
# <?xml version="1.0"?>
|
27
|
+
# <root xmlns="http://nokogiri.org/ns/default" xmlns:noko="http://nokogiri.org/ns/noko">
|
28
|
+
# <child1 foo="abc" noko:bar="def"/>
|
29
|
+
# </root>
|
30
|
+
# XML
|
31
|
+
#
|
32
|
+
# attributes = doc.root.elements.first.attribute_nodes
|
33
|
+
# # => [#(Attr:0x35c { name = "foo", value = "abc" }),
|
34
|
+
# # #(Attr:0x370 {
|
35
|
+
# # name = "bar",
|
36
|
+
# # namespace = #(Namespace:0x384 {
|
37
|
+
# # prefix = "noko",
|
38
|
+
# # href = "http://nokogiri.org/ns/noko"
|
39
|
+
# # }),
|
40
|
+
# # value = "def"
|
41
|
+
# # })]
|
42
|
+
#
|
43
|
+
# attributes.first.deconstruct_keys([:name, :value, :namespace])
|
44
|
+
# # => {:name=>"foo", :value=>"abc", :namespace=>nil}
|
45
|
+
#
|
46
|
+
# attributes.last.deconstruct_keys([:name, :value, :namespace])
|
47
|
+
# # => {:name=>"bar",
|
48
|
+
# # :value=>"def",
|
49
|
+
# # :namespace=>
|
50
|
+
# # #(Namespace:0x384 {
|
51
|
+
# # prefix = "noko",
|
52
|
+
# # href = "http://nokogiri.org/ns/noko"
|
53
|
+
# # })}
|
54
|
+
#
|
55
|
+
def deconstruct_keys(keys)
|
56
|
+
{ name: name, value: value, namespace: namespace }
|
57
|
+
end
|
58
|
+
|
10
59
|
private
|
11
60
|
|
12
61
|
def inspect_attributes
|
@@ -12,8 +12,10 @@ module Nokogiri
|
|
12
12
|
undef_method :namespace_definitions
|
13
13
|
undef_method :line if method_defined?(:line)
|
14
14
|
|
15
|
-
|
16
|
-
|
15
|
+
private
|
16
|
+
|
17
|
+
def inspect_attributes
|
18
|
+
[:to_s]
|
17
19
|
end
|
18
20
|
end
|
19
21
|
end
|
data/lib/nokogiri/xml/builder.rb
CHANGED
@@ -234,7 +234,7 @@ module Nokogiri
|
|
234
234
|
#
|
235
235
|
# == Document Types
|
236
236
|
#
|
237
|
-
# To create a document type (DTD),
|
237
|
+
# To create a document type (DTD), use the Builder#doc method to get
|
238
238
|
# the current context document. Then call Node#create_internal_subset to
|
239
239
|
# create the DTD node.
|
240
240
|
#
|