nokogiri 1.13.10-x86-linux → 1.14.0.rc1-x86-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +33 -0
- data/LICENSE-DEPENDENCIES.md +830 -509
- data/README.md +18 -11
- data/dependencies.yml +25 -7
- data/ext/nokogiri/extconf.rb +79 -20
- data/ext/nokogiri/gumbo.c +19 -9
- data/ext/nokogiri/html4_document.c +1 -1
- data/ext/nokogiri/html4_entity_lookup.c +1 -1
- data/ext/nokogiri/html4_sax_parser_context.c +0 -5
- data/ext/nokogiri/nokogiri.c +32 -51
- data/ext/nokogiri/nokogiri.h +17 -14
- data/ext/nokogiri/xml_attribute_decl.c +1 -1
- data/ext/nokogiri/xml_cdata.c +1 -1
- data/ext/nokogiri/xml_document.c +16 -11
- data/ext/nokogiri/xml_element_content.c +2 -2
- data/ext/nokogiri/xml_element_decl.c +1 -1
- data/ext/nokogiri/xml_encoding_handler.c +2 -2
- data/ext/nokogiri/xml_namespace.c +38 -8
- data/ext/nokogiri/xml_node.c +286 -26
- data/ext/nokogiri/xml_node_set.c +0 -2
- data/ext/nokogiri/xml_reader.c +40 -20
- data/ext/nokogiri/xml_relax_ng.c +0 -2
- data/ext/nokogiri/xml_sax_parser.c +22 -16
- data/ext/nokogiri/xml_sax_parser_context.c +0 -5
- data/ext/nokogiri/xml_sax_push_parser.c +0 -2
- data/ext/nokogiri/xml_schema.c +0 -2
- data/ext/nokogiri/xml_xpath_context.c +87 -83
- data/ext/nokogiri/xslt_stylesheet.c +14 -13
- data/gumbo-parser/Makefile +10 -0
- data/lib/nokogiri/2.7/nokogiri.so +0 -0
- data/lib/nokogiri/3.0/nokogiri.so +0 -0
- data/lib/nokogiri/3.1/nokogiri.so +0 -0
- data/lib/nokogiri/3.2/nokogiri.so +0 -0
- data/lib/nokogiri/css/node.rb +2 -2
- data/lib/nokogiri/css/xpath_visitor.rb +3 -1
- data/lib/nokogiri/css.rb +6 -0
- data/lib/nokogiri/encoding_handler.rb +57 -0
- data/lib/nokogiri/extension.rb +3 -2
- data/lib/nokogiri/html4/document.rb +2 -121
- data/lib/nokogiri/html4/element_description_defaults.rb +6 -12
- data/lib/nokogiri/html4/encoding_reader.rb +121 -0
- data/lib/nokogiri/html4.rb +1 -0
- data/lib/nokogiri/html5/document.rb +113 -36
- data/lib/nokogiri/html5/document_fragment.rb +9 -2
- data/lib/nokogiri/html5/node.rb +3 -5
- data/lib/nokogiri/html5.rb +127 -216
- data/lib/nokogiri/jruby/dependencies.rb +1 -19
- data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/version/info.rb +11 -10
- data/lib/nokogiri/xml/attr.rb +49 -0
- data/lib/nokogiri/xml/builder.rb +1 -1
- data/lib/nokogiri/xml/document.rb +102 -54
- data/lib/nokogiri/xml/document_fragment.rb +49 -6
- data/lib/nokogiri/xml/namespace.rb +42 -0
- data/lib/nokogiri/xml/node/save_options.rb +4 -2
- data/lib/nokogiri/xml/node.rb +190 -35
- data/lib/nokogiri/xml/node_set.rb +87 -9
- data/lib/nokogiri/xml/parse_options.rb +127 -48
- data/lib/nokogiri/xml/pp/node.rb +6 -4
- data/lib/nokogiri/xml/processing_instruction.rb +2 -1
- data/lib/nokogiri/xml/sax/parser.rb +2 -3
- data/lib/nokogiri/xslt.rb +1 -1
- data/lib/nokogiri.rb +3 -11
- metadata +15 -250
- data/lib/nokogiri/2.6/nokogiri.so +0 -0
    
        data/lib/nokogiri/html5/node.rb
    CHANGED
    
    | @@ -28,7 +28,7 @@ module Nokogiri | |
| 28 28 | 
             
                  def inner_html(options = {})
         | 
| 29 29 | 
             
                    return super(options) unless document.is_a?(HTML5::Document)
         | 
| 30 30 |  | 
| 31 | 
            -
                    result = options[:preserve_newline] &&  | 
| 31 | 
            +
                    result = options[:preserve_newline] && prepend_newline? ? +"\n" : +""
         | 
| 32 32 | 
             
                    result << children.map { |child| child.to_html(options) }.join
         | 
| 33 33 | 
             
                    result
         | 
| 34 34 | 
             
                  end
         | 
| @@ -56,11 +56,9 @@ module Nokogiri | |
| 56 56 | 
             
                      native_write_to(io, encoding, indent_string, config_options)
         | 
| 57 57 | 
             
                    else
         | 
| 58 58 | 
             
                      # Serialize including the current node.
         | 
| 59 | 
            +
                      html = html_standard_serialize(options[:preserve_newline] || false)
         | 
| 59 60 | 
             
                      encoding ||= document.encoding || Encoding::UTF_8
         | 
| 60 | 
            -
                       | 
| 61 | 
            -
                        preserve_newline: options[:preserve_newline] || false,
         | 
| 62 | 
            -
                      }
         | 
| 63 | 
            -
                      HTML5.serialize_node_internal(self, io, encoding, internal_ops)
         | 
| 61 | 
            +
                      io << html.encode(encoding, fallback: lambda { |c| "&#x#{c.ord.to_s(16)};" })
         | 
| 64 62 | 
             
                    end
         | 
| 65 63 | 
             
                  end
         | 
| 66 64 |  | 
    
        data/lib/nokogiri/html5.rb
    CHANGED
    
    | @@ -227,250 +227,161 @@ module Nokogiri | |
| 227 227 | 
             
              #
         | 
| 228 228 | 
             
              # Since v1.12.0
         | 
| 229 229 | 
             
              module HTML5
         | 
| 230 | 
            -
                 | 
| 231 | 
            -
             | 
| 232 | 
            -
             | 
| 233 | 
            -
             | 
| 234 | 
            -
             | 
| 235 | 
            -
                XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
         | 
| 236 | 
            -
                XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/"
         | 
| 230 | 
            +
                class << self
         | 
| 231 | 
            +
                  # Parse an HTML 5 document. Convenience method for {Nokogiri::HTML5::Document.parse}
         | 
| 232 | 
            +
                  def parse(string, url = nil, encoding = nil, **options, &block)
         | 
| 233 | 
            +
                    Document.parse(string, url, encoding, **options, &block)
         | 
| 234 | 
            +
                  end
         | 
| 237 235 |  | 
| 238 | 
            -
             | 
| 239 | 
            -
             | 
| 240 | 
            -
                   | 
| 241 | 
            -
             | 
| 236 | 
            +
                  # Parse a fragment from +string+. Convenience method for
         | 
| 237 | 
            +
                  # {Nokogiri::HTML5::DocumentFragment.parse}.
         | 
| 238 | 
            +
                  def fragment(string, encoding = nil, **options)
         | 
| 239 | 
            +
                    DocumentFragment.parse(string, encoding, options)
         | 
| 240 | 
            +
                  end
         | 
| 242 241 |  | 
| 243 | 
            -
             | 
| 244 | 
            -
             | 
| 245 | 
            -
             | 
| 246 | 
            -
                   | 
| 247 | 
            -
             | 
| 242 | 
            +
                  # Fetch and parse a HTML document from the web, following redirects,
         | 
| 243 | 
            +
                  # handling https, and determining the character encoding using HTML5
         | 
| 244 | 
            +
                  # rules.  +uri+ may be a +String+ or a +URI+.  +options+ contains
         | 
| 245 | 
            +
                  # http headers and special options.  Everything which is not a
         | 
| 246 | 
            +
                  # special option is considered a header.  Special options include:
         | 
| 247 | 
            +
                  #  * :follow_limit => number of redirects which are followed
         | 
| 248 | 
            +
                  #  * :basic_auth => [username, password]
         | 
| 249 | 
            +
                  def get(uri, options = {})
         | 
| 250 | 
            +
                    # TODO: deprecate
         | 
| 251 | 
            +
                    warn("Nokogiri::HTML5.get is deprecated and will be removed in a future version of Nokogiri.",
         | 
| 252 | 
            +
                      uplevel: 1, category: :deprecated)
         | 
| 253 | 
            +
                    get_impl(uri, options)
         | 
| 254 | 
            +
                  end
         | 
| 248 255 |  | 
| 249 | 
            -
             | 
| 250 | 
            -
             | 
| 251 | 
            -
             | 
| 252 | 
            -
             | 
| 253 | 
            -
             | 
| 254 | 
            -
             | 
| 255 | 
            -
             | 
| 256 | 
            -
             | 
| 257 | 
            -
             | 
| 258 | 
            -
             | 
| 259 | 
            -
             | 
| 260 | 
            -
             | 
| 261 | 
            -
             | 
| 256 | 
            +
                  # :nodoc:
         | 
| 257 | 
            +
                  def read_and_encode(string, encoding)
         | 
| 258 | 
            +
                    # Read the string with the given encoding.
         | 
| 259 | 
            +
                    if string.respond_to?(:read)
         | 
| 260 | 
            +
                      string = if encoding.nil?
         | 
| 261 | 
            +
                        string.read
         | 
| 262 | 
            +
                      else
         | 
| 263 | 
            +
                        string.read(encoding: encoding)
         | 
| 264 | 
            +
                      end
         | 
| 265 | 
            +
                    else
         | 
| 266 | 
            +
                      # Otherwise the string has the given encoding.
         | 
| 267 | 
            +
                      string = string.to_s
         | 
| 268 | 
            +
                      if encoding
         | 
| 269 | 
            +
                        string = string.dup
         | 
| 270 | 
            +
                        string.force_encoding(encoding)
         | 
| 271 | 
            +
                      end
         | 
| 272 | 
            +
                    end
         | 
| 262 273 |  | 
| 263 | 
            -
             | 
| 274 | 
            +
                    # convert to UTF-8
         | 
| 275 | 
            +
                    if string.encoding != Encoding::UTF_8
         | 
| 276 | 
            +
                      string = reencode(string)
         | 
| 277 | 
            +
                    end
         | 
| 278 | 
            +
                    string
         | 
| 279 | 
            +
                  end
         | 
| 264 280 |  | 
| 265 | 
            -
             | 
| 266 | 
            -
                  headers = options.clone
         | 
| 267 | 
            -
                  headers = { follow_limit: headers } if Numeric === headers # deprecated
         | 
| 268 | 
            -
                  limit = headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
         | 
| 281 | 
            +
                  private
         | 
| 269 282 |  | 
| 270 | 
            -
                   | 
| 271 | 
            -
             | 
| 283 | 
            +
                  def get_impl(uri, options = {})
         | 
| 284 | 
            +
                    headers = options.clone
         | 
| 285 | 
            +
                    headers = { follow_limit: headers } if Numeric === headers # deprecated
         | 
| 286 | 
            +
                    limit = headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
         | 
| 272 287 |  | 
| 273 | 
            -
             | 
| 288 | 
            +
                    require "net/http"
         | 
| 289 | 
            +
                    uri = URI(uri) unless URI === uri
         | 
| 274 290 |  | 
| 275 | 
            -
             | 
| 276 | 
            -
                  http.use_ssl = true if uri.scheme == "https"
         | 
| 291 | 
            +
                    http = Net::HTTP.new(uri.host, uri.port)
         | 
| 277 292 |  | 
| 278 | 
            -
             | 
| 279 | 
            -
             | 
| 280 | 
            -
                  #   :close_on_empty_response, :continue_timeout, :key, :open_timeout,
         | 
| 281 | 
            -
                  #   :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
         | 
| 282 | 
            -
                  #   :verify_callback, :verify_depth, :verify_mode
         | 
| 283 | 
            -
                  options.each do |key, _value|
         | 
| 284 | 
            -
                    http.send("#{key}=", headers.delete(key)) if http.respond_to?("#{key}=")
         | 
| 285 | 
            -
                  end
         | 
| 293 | 
            +
                    # TLS / SSL support
         | 
| 294 | 
            +
                    http.use_ssl = true if uri.scheme == "https"
         | 
| 286 295 |  | 
| 287 | 
            -
             | 
| 296 | 
            +
                    # Pass through Net::HTTP override values, which currently include:
         | 
| 297 | 
            +
                    #   :ca_file, :ca_path, :cert, :cert_store, :ciphers,
         | 
| 298 | 
            +
                    #   :close_on_empty_response, :continue_timeout, :key, :open_timeout,
         | 
| 299 | 
            +
                    #   :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
         | 
| 300 | 
            +
                    #   :verify_callback, :verify_depth, :verify_mode
         | 
| 301 | 
            +
                    options.each do |key, _value|
         | 
| 302 | 
            +
                      http.send("#{key}=", headers.delete(key)) if http.respond_to?("#{key}=")
         | 
| 303 | 
            +
                    end
         | 
| 288 304 |  | 
| 289 | 
            -
             | 
| 290 | 
            -
                  auth = headers.delete(:basic_auth)
         | 
| 291 | 
            -
                  auth ||= [uri.user, uri.password] if uri.user && uri.password
         | 
| 292 | 
            -
                  request.basic_auth(auth.first, auth.last) if auth
         | 
| 305 | 
            +
                    request = Net::HTTP::Get.new(uri.request_uri)
         | 
| 293 306 |  | 
| 294 | 
            -
             | 
| 295 | 
            -
             | 
| 307 | 
            +
                    # basic authentication
         | 
| 308 | 
            +
                    auth = headers.delete(:basic_auth)
         | 
| 309 | 
            +
                    auth ||= [uri.user, uri.password] if uri.user && uri.password
         | 
| 310 | 
            +
                    request.basic_auth(auth.first, auth.last) if auth
         | 
| 296 311 |  | 
| 297 | 
            -
             | 
| 312 | 
            +
                    # remaining options are treated as headers
         | 
| 313 | 
            +
                    headers.each { |key, value| request[key.to_s] = value.to_s }
         | 
| 298 314 |  | 
| 299 | 
            -
             | 
| 300 | 
            -
                  when Net::HTTPSuccess
         | 
| 301 | 
            -
                    doc = parse(reencode(response.body, response["content-type"]), options)
         | 
| 302 | 
            -
                    doc.instance_variable_set("@response", response)
         | 
| 303 | 
            -
                    doc.class.send(:attr_reader, :response)
         | 
| 304 | 
            -
                    doc
         | 
| 305 | 
            -
                  when Net::HTTPRedirection
         | 
| 306 | 
            -
                    response.value if limit <= 1
         | 
| 307 | 
            -
                    location = URI.join(uri, response["location"])
         | 
| 308 | 
            -
                    get_impl(location, options.merge(follow_limit: limit - 1))
         | 
| 309 | 
            -
                  else
         | 
| 310 | 
            -
                    response.value
         | 
| 311 | 
            -
                  end
         | 
| 312 | 
            -
                end
         | 
| 315 | 
            +
                    response = http.request(request)
         | 
| 313 316 |  | 
| 314 | 
            -
             | 
| 315 | 
            -
             | 
| 316 | 
            -
             | 
| 317 | 
            -
             | 
| 318 | 
            -
                       | 
| 317 | 
            +
                    case response
         | 
| 318 | 
            +
                    when Net::HTTPSuccess
         | 
| 319 | 
            +
                      doc = parse(reencode(response.body, response["content-type"]), options)
         | 
| 320 | 
            +
                      doc.instance_variable_set(:@response, response)
         | 
| 321 | 
            +
                      doc.class.send(:attr_reader, :response)
         | 
| 322 | 
            +
                      doc
         | 
| 323 | 
            +
                    when Net::HTTPRedirection
         | 
| 324 | 
            +
                      response.value if limit <= 1
         | 
| 325 | 
            +
                      location = URI.join(uri, response["location"])
         | 
| 326 | 
            +
                      get_impl(location, options.merge(follow_limit: limit - 1))
         | 
| 319 327 | 
             
                    else
         | 
| 320 | 
            -
                       | 
| 321 | 
            -
                    end
         | 
| 322 | 
            -
                  else
         | 
| 323 | 
            -
                    # Otherwise the string has the given encoding.
         | 
| 324 | 
            -
                    string = string.to_s
         | 
| 325 | 
            -
                    if encoding
         | 
| 326 | 
            -
                      string = string.dup
         | 
| 327 | 
            -
                      string.force_encoding(encoding)
         | 
| 328 | 
            +
                      response.value
         | 
| 328 329 | 
             
                    end
         | 
| 329 330 | 
             
                  end
         | 
| 330 331 |  | 
| 331 | 
            -
                  #  | 
| 332 | 
            -
                   | 
| 333 | 
            -
             | 
| 334 | 
            -
                   | 
| 335 | 
            -
                   | 
| 336 | 
            -
             | 
| 337 | 
            -
             | 
| 338 | 
            -
             | 
| 339 | 
            -
             | 
| 340 | 
            -
             | 
| 341 | 
            -
             | 
| 342 | 
            -
             | 
| 343 | 
            -
             | 
| 344 | 
            -
             | 
| 345 | 
            -
             | 
| 346 | 
            -
             | 
| 347 | 
            -
             | 
| 348 | 
            -
             | 
| 349 | 
            -
             | 
| 350 | 
            -
             | 
| 351 | 
            -
             | 
| 352 | 
            -
             | 
| 353 | 
            -
             | 
| 354 | 
            -
             | 
| 355 | 
            -
                    initial_bytes = body[0..2].bytes
         | 
| 356 | 
            -
                    if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
         | 
| 357 | 
            -
                      encoding = Encoding::UTF_8
         | 
| 358 | 
            -
                    elsif initial_bytes[0..1] == [0xFE, 0xFF]
         | 
| 359 | 
            -
                      encoding = Encoding::UTF_16BE
         | 
| 360 | 
            -
                    elsif initial_bytes[0..1] == [0xFF, 0xFE]
         | 
| 361 | 
            -
                      encoding = Encoding::UTF_16LE
         | 
| 362 | 
            -
                    end
         | 
| 363 | 
            -
             | 
| 364 | 
            -
                    # look for a charset in a content-encoding header
         | 
| 365 | 
            -
                    if content_type
         | 
| 366 | 
            -
                      encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
         | 
| 367 | 
            -
                    end
         | 
| 368 | 
            -
             | 
| 369 | 
            -
                    # look for a charset in a meta tag in the first 1024 bytes
         | 
| 370 | 
            -
                    unless encoding
         | 
| 371 | 
            -
                      data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, "")
         | 
| 372 | 
            -
                      data.scan(/<meta.*?>/m).each do |meta|
         | 
| 373 | 
            -
                        encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
         | 
| 332 | 
            +
                  # Charset sniffing is a complex and controversial topic that understandably isn't done _by
         | 
| 333 | 
            +
                  # default_ by the Ruby Net::HTTP library.  This being said, it is a very real problem for
         | 
| 334 | 
            +
                  # consumers of HTML as the default for HTML is iso-8859-1, most "good" producers use utf-8, and
         | 
| 335 | 
            +
                  # the Gumbo parser *only* supports utf-8.
         | 
| 336 | 
            +
                  #
         | 
| 337 | 
            +
                  # Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection.  Following
         | 
| 338 | 
            +
                  # this lead, Nokogiri::HTML5 attempts to do likewise, while attempting to more closely follow
         | 
| 339 | 
            +
                  # the HTML5 standard.
         | 
| 340 | 
            +
                  #
         | 
| 341 | 
            +
                  # http://bugs.ruby-lang.org/issues/2567
         | 
| 342 | 
            +
                  # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
         | 
| 343 | 
            +
                  #
         | 
| 344 | 
            +
                  def reencode(body, content_type = nil)
         | 
| 345 | 
            +
                    if body.encoding == Encoding::ASCII_8BIT
         | 
| 346 | 
            +
                      encoding = nil
         | 
| 347 | 
            +
             | 
| 348 | 
            +
                      # look for a Byte Order Mark (BOM)
         | 
| 349 | 
            +
                      initial_bytes = body[0..2].bytes
         | 
| 350 | 
            +
                      if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
         | 
| 351 | 
            +
                        encoding = Encoding::UTF_8
         | 
| 352 | 
            +
                      elsif initial_bytes[0..1] == [0xFE, 0xFF]
         | 
| 353 | 
            +
                        encoding = Encoding::UTF_16BE
         | 
| 354 | 
            +
                      elsif initial_bytes[0..1] == [0xFF, 0xFE]
         | 
| 355 | 
            +
                        encoding = Encoding::UTF_16LE
         | 
| 374 356 | 
             
                      end
         | 
| 375 | 
            -
                    end
         | 
| 376 | 
            -
             | 
| 377 | 
            -
                    # if all else fails, default to the official default encoding for HTML
         | 
| 378 | 
            -
                    encoding ||= Encoding::ISO_8859_1
         | 
| 379 | 
            -
             | 
| 380 | 
            -
                    # change the encoding to match the detected or inferred encoding
         | 
| 381 | 
            -
                    body = body.dup
         | 
| 382 | 
            -
                    begin
         | 
| 383 | 
            -
                      body.force_encoding(encoding)
         | 
| 384 | 
            -
                    rescue ArgumentError
         | 
| 385 | 
            -
                      body.force_encoding(Encoding::ISO_8859_1)
         | 
| 386 | 
            -
                    end
         | 
| 387 | 
            -
                  end
         | 
| 388 357 |  | 
| 389 | 
            -
             | 
| 390 | 
            -
             | 
| 358 | 
            +
                      # look for a charset in a content-encoding header
         | 
| 359 | 
            +
                      if content_type
         | 
| 360 | 
            +
                        encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
         | 
| 361 | 
            +
                      end
         | 
| 391 362 |  | 
| 392 | 
            -
             | 
| 393 | 
            -
             | 
| 394 | 
            -
             | 
| 395 | 
            -
             | 
| 396 | 
            -
             | 
| 397 | 
            -
                    # XXX(sfc): attach namespaces to all nodes, even html?
         | 
| 398 | 
            -
                    tagname = if ns_uri.nil? || ns_uri == HTML_NAMESPACE || ns_uri == MATHML_NAMESPACE || ns_uri == SVG_NAMESPACE
         | 
| 399 | 
            -
                      current_node.name
         | 
| 400 | 
            -
                    else
         | 
| 401 | 
            -
                      "#{ns.prefix}:#{current_node.name}"
         | 
| 402 | 
            -
                    end
         | 
| 403 | 
            -
                    io << "<" << tagname
         | 
| 404 | 
            -
                    current_node.attribute_nodes.each do |attr|
         | 
| 405 | 
            -
                      attr_ns = attr.namespace
         | 
| 406 | 
            -
                      if attr_ns.nil?
         | 
| 407 | 
            -
                        attr_name = attr.name
         | 
| 408 | 
            -
                      else
         | 
| 409 | 
            -
                        ns_uri = attr_ns.href
         | 
| 410 | 
            -
                        attr_name = if ns_uri == XML_NAMESPACE
         | 
| 411 | 
            -
                          "xml:" + attr.name.sub(/^[^:]*:/, "")
         | 
| 412 | 
            -
                        elsif ns_uri == XMLNS_NAMESPACE && attr.name.sub(/^[^:]*:/, "") == "xmlns"
         | 
| 413 | 
            -
                          "xmlns"
         | 
| 414 | 
            -
                        elsif ns_uri == XMLNS_NAMESPACE
         | 
| 415 | 
            -
                          "xmlns:" + attr.name.sub(/^[^:]*:/, "")
         | 
| 416 | 
            -
                        elsif ns_uri == XLINK_NAMESPACE
         | 
| 417 | 
            -
                          "xlink:" + attr.name.sub(/^[^:]*:/, "")
         | 
| 418 | 
            -
                        else
         | 
| 419 | 
            -
                          "#{attr_ns.prefix}:#{attr.name}"
         | 
| 363 | 
            +
                      # look for a charset in a meta tag in the first 1024 bytes
         | 
| 364 | 
            +
                      unless encoding
         | 
| 365 | 
            +
                        data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, "")
         | 
| 366 | 
            +
                        data.scan(/<meta.*?>/im).each do |meta|
         | 
| 367 | 
            +
                          encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
         | 
| 420 368 | 
             
                        end
         | 
| 421 369 | 
             
                      end
         | 
| 422 | 
            -
             | 
| 423 | 
            -
             | 
| 424 | 
            -
             | 
| 425 | 
            -
             | 
| 426 | 
            -
                       | 
| 427 | 
            -
                       | 
| 428 | 
            -
             | 
| 429 | 
            -
                         | 
| 370 | 
            +
             | 
| 371 | 
            +
                      # if all else fails, default to the official default encoding for HTML
         | 
| 372 | 
            +
                      encoding ||= Encoding::ISO_8859_1
         | 
| 373 | 
            +
             | 
| 374 | 
            +
                      # change the encoding to match the detected or inferred encoding
         | 
| 375 | 
            +
                      body = body.dup
         | 
| 376 | 
            +
                      begin
         | 
| 377 | 
            +
                        body.force_encoding(encoding)
         | 
| 378 | 
            +
                      rescue ArgumentError
         | 
| 379 | 
            +
                        body.force_encoding(Encoding::ISO_8859_1)
         | 
| 430 380 | 
             
                      end
         | 
| 431 | 
            -
                      io << "</" << tagname << ">"
         | 
| 432 | 
            -
                    end
         | 
| 433 | 
            -
                  when XML::Node::TEXT_NODE
         | 
| 434 | 
            -
                    parent = current_node.parent
         | 
| 435 | 
            -
                    io << if parent.element? && ["style", "script", "xmp", "iframe", "noembed", "noframes", "plaintext", "noscript"].include?(parent.name)
         | 
| 436 | 
            -
                      current_node.content
         | 
| 437 | 
            -
                    else
         | 
| 438 | 
            -
                      escape_text(current_node.content, encoding, false)
         | 
| 439 | 
            -
                    end
         | 
| 440 | 
            -
                  when XML::Node::CDATA_SECTION_NODE
         | 
| 441 | 
            -
                    io << "<![CDATA[" << current_node.content << "]]>"
         | 
| 442 | 
            -
                  when XML::Node::COMMENT_NODE
         | 
| 443 | 
            -
                    io << "<!--" << current_node.content << "-->"
         | 
| 444 | 
            -
                  when XML::Node::PI_NODE
         | 
| 445 | 
            -
                    io << "<?" << current_node.content << ">"
         | 
| 446 | 
            -
                  when XML::Node::DOCUMENT_TYPE_NODE, XML::Node::DTD_NODE
         | 
| 447 | 
            -
                    io << "<!DOCTYPE " << current_node.name << ">"
         | 
| 448 | 
            -
                  when XML::Node::HTML_DOCUMENT_NODE, XML::Node::DOCUMENT_FRAG_NODE
         | 
| 449 | 
            -
                    current_node.children.each do |child|
         | 
| 450 | 
            -
                      serialize_node_internal(child, io, encoding, options)
         | 
| 451 381 | 
             
                    end
         | 
| 452 | 
            -
                  else
         | 
| 453 | 
            -
                    raise "Unexpected node '#{current_node.name}' of type #{current_node.type}"
         | 
| 454 | 
            -
                  end
         | 
| 455 | 
            -
                end
         | 
| 456 382 |  | 
| 457 | 
            -
             | 
| 458 | 
            -
                  text = if attribute_mode
         | 
| 459 | 
            -
                    text.gsub(/[&\u00a0"]/,
         | 
| 460 | 
            -
                      "&" => "&", "\u00a0" => " ", '"' => """)
         | 
| 461 | 
            -
                  else
         | 
| 462 | 
            -
                    text.gsub(/[&\u00a0<>]/,
         | 
| 463 | 
            -
                      "&" => "&", "\u00a0" => " ", "<" => "<", ">" => ">")
         | 
| 383 | 
            +
                    body.encode(Encoding::UTF_8)
         | 
| 464 384 | 
             
                  end
         | 
| 465 | 
            -
                  # Not part of the standard
         | 
| 466 | 
            -
                  text.encode(encoding, fallback: lambda { |c| "&\#x#{c.ord.to_s(16)};" })
         | 
| 467 | 
            -
                end
         | 
| 468 | 
            -
             | 
| 469 | 
            -
                def self.prepend_newline?(node)
         | 
| 470 | 
            -
                  return false unless ["pre", "textarea", "listing"].include?(node.name) && !node.children.empty?
         | 
| 471 | 
            -
             | 
| 472 | 
            -
                  first_child = node.children[0]
         | 
| 473 | 
            -
                  first_child.text? && first_child.content.start_with?("\n")
         | 
| 474 385 | 
             
                end
         | 
| 475 386 | 
             
              end
         | 
| 476 387 | 
             
            end
         | 
| @@ -1,21 +1,3 @@ | |
| 1 1 | 
             
            # frozen_string_literal: true
         | 
| 2 2 |  | 
| 3 | 
            -
             | 
| 4 | 
            -
            # unless defined?(JRuby::Rack::VERSION) || defined?(AppEngine::ApiProxy)
         | 
| 5 | 
            -
            #
         | 
| 6 | 
            -
            # However, simply cutting defined?(JRuby::Rack::VERSION) off resulted in
         | 
| 7 | 
            -
            # an unable-to-load-nokogiri problem. Thus, now, Nokogiri checks the presense
         | 
| 8 | 
            -
            # of appengine-rack.jar in $LOAD_PATH. If Nokogiri is on GAE, Nokogiri
         | 
| 9 | 
            -
            # should skip loading xml jars. This is because those are in WEB-INF/lib and
         | 
| 10 | 
            -
            # already set in the classpath.
         | 
| 11 | 
            -
            unless $LOAD_PATH.to_s.include?("appengine-rack")
         | 
| 12 | 
            -
              require "stringio"
         | 
| 13 | 
            -
              require "isorelax.jar"
         | 
| 14 | 
            -
              require "jing.jar"
         | 
| 15 | 
            -
              require "nekohtml.jar"
         | 
| 16 | 
            -
              require "nekodtd.jar"
         | 
| 17 | 
            -
              require "xercesImpl.jar"
         | 
| 18 | 
            -
              require "serializer.jar"
         | 
| 19 | 
            -
              require "xalan.jar"
         | 
| 20 | 
            -
              require "xml-apis.jar"
         | 
| 21 | 
            -
            end
         | 
| 3 | 
            +
            require_relative "nokogiri_jars"
         | 
| @@ -0,0 +1,43 @@ | |
| 1 | 
            +
            # this is a generated file, to avoid over-writing it just delete this comment
         | 
| 2 | 
            +
            begin
         | 
| 3 | 
            +
              require 'jar_dependencies'
         | 
| 4 | 
            +
            rescue LoadError
         | 
| 5 | 
            +
              require 'xalan/xalan/2.7.2/xalan-2.7.2.jar'
         | 
| 6 | 
            +
              require 'net/sourceforge/htmlunit/neko-htmlunit/2.63.0/neko-htmlunit-2.63.0.jar'
         | 
| 7 | 
            +
              require 'nu/validator/jing/20200702VNU/jing-20200702VNU.jar'
         | 
| 8 | 
            +
              require 'xerces/xercesImpl/2.12.2/xercesImpl-2.12.2.jar'
         | 
| 9 | 
            +
              require 'org/nokogiri/nekodtd/0.1.11.noko1/nekodtd-0.1.11.noko1.jar'
         | 
| 10 | 
            +
              require 'net/sf/saxon/Saxon-HE/9.6.0-4/Saxon-HE-9.6.0-4.jar'
         | 
| 11 | 
            +
              require 'xml-apis/xml-apis/1.4.01/xml-apis-1.4.01.jar'
         | 
| 12 | 
            +
              require 'xalan/serializer/2.7.2/serializer-2.7.2.jar'
         | 
| 13 | 
            +
              require 'isorelax/isorelax/20030108/isorelax-20030108.jar'
         | 
| 14 | 
            +
            end
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            if defined? Jars
         | 
| 17 | 
            +
              require_jar 'xalan', 'xalan', '2.7.2'
         | 
| 18 | 
            +
              require_jar 'net.sourceforge.htmlunit', 'neko-htmlunit', '2.63.0'
         | 
| 19 | 
            +
              require_jar 'nu.validator', 'jing', '20200702VNU'
         | 
| 20 | 
            +
              require_jar 'xerces', 'xercesImpl', '2.12.2'
         | 
| 21 | 
            +
              require_jar 'org.nokogiri', 'nekodtd', '0.1.11.noko1'
         | 
| 22 | 
            +
              require_jar 'net.sf.saxon', 'Saxon-HE', '9.6.0-4'
         | 
| 23 | 
            +
              require_jar 'xml-apis', 'xml-apis', '1.4.01'
         | 
| 24 | 
            +
              require_jar 'xalan', 'serializer', '2.7.2'
         | 
| 25 | 
            +
              require_jar 'isorelax', 'isorelax', '20030108'
         | 
| 26 | 
            +
            end
         | 
| 27 | 
            +
             | 
| 28 | 
            +
            module Nokogiri
         | 
| 29 | 
            +
              # generated by the :vendor_jars rake task
         | 
| 30 | 
            +
              JAR_DEPENDENCIES = {
         | 
| 31 | 
            +
                "isorelax:isorelax" => "20030108",
         | 
| 32 | 
            +
                "net.sf.saxon:Saxon-HE" => "9.6.0-4",
         | 
| 33 | 
            +
                "net.sourceforge.htmlunit:neko-htmlunit" => "2.63.0",
         | 
| 34 | 
            +
                "nu.validator:jing" => "20200702VNU",
         | 
| 35 | 
            +
                "org.nokogiri:nekodtd" => "0.1.11.noko1",
         | 
| 36 | 
            +
                "xalan:serializer" => "2.7.2",
         | 
| 37 | 
            +
                "xalan:xalan" => "2.7.2",
         | 
| 38 | 
            +
                "xerces:xercesImpl" => "2.12.2",
         | 
| 39 | 
            +
                "xml-apis:xml-apis" => "1.4.01",
         | 
| 40 | 
            +
              }.freeze
         | 
| 41 | 
            +
              XERCES_VERSION = JAR_DEPENDENCIES["xerces:xercesImpl"]
         | 
| 42 | 
            +
              NEKO_VERSION = JAR_DEPENDENCIES["net.sourceforge.htmlunit:neko-htmlunit"]
         | 
| 43 | 
            +
            end
         | 
| @@ -105,16 +105,16 @@ module Nokogiri | |
| 105 105 | 
             
                        if libxml2_using_packaged?
         | 
| 106 106 | 
             
                          cppflags << "-I#{File.join(header_directory, "include").shellescape}"
         | 
| 107 107 | 
             
                          cppflags << "-I#{File.join(header_directory, "include/libxml2").shellescape}"
         | 
| 108 | 
            +
                        end
         | 
| 108 109 |  | 
| 109 | 
            -
             | 
| 110 | 
            -
             | 
| 111 | 
            -
             | 
| 112 | 
            -
             | 
| 113 | 
            -
             | 
| 114 | 
            -
                            end
         | 
| 115 | 
            -
                            ldflags << "-L#{lib_directory.shellescape}"
         | 
| 116 | 
            -
                            ldflags << "-l:nokogiri.so"
         | 
| 110 | 
            +
                        if windows?
         | 
| 111 | 
            +
                          # on windows, nokogumbo needs to link against nokogiri.so to resolve symbols. see #2167
         | 
| 112 | 
            +
                          lib_directory = File.expand_path(File.join(File.dirname(__FILE__), "../#{ruby_minor}"))
         | 
| 113 | 
            +
                          unless File.exist?(lib_directory)
         | 
| 114 | 
            +
                            lib_directory = File.expand_path(File.join(File.dirname(__FILE__), ".."))
         | 
| 117 115 | 
             
                          end
         | 
| 116 | 
            +
                          ldflags << "-L#{lib_directory.shellescape}"
         | 
| 117 | 
            +
                          ldflags << "-l:nokogiri.so"
         | 
| 118 118 | 
             
                        end
         | 
| 119 119 |  | 
| 120 120 | 
             
                        nokogiri["cppflags"] = cppflags
         | 
| @@ -169,8 +169,9 @@ module Nokogiri | |
| 169 169 | 
             
                      vi["other_libraries"] = Hash[*Nokogiri::OTHER_LIBRARY_VERSIONS.split(/[,:]/)]
         | 
| 170 170 | 
             
                    elsif jruby?
         | 
| 171 171 | 
             
                      vi["other_libraries"] = {}.tap do |ol|
         | 
| 172 | 
            -
                         | 
| 173 | 
            -
             | 
| 172 | 
            +
                        Nokogiri::JAR_DEPENDENCIES.each do |k, v|
         | 
| 173 | 
            +
                          ol[k] = v
         | 
| 174 | 
            +
                        end
         | 
| 174 175 | 
             
                      end
         | 
| 175 176 | 
             
                    end
         | 
| 176 177 | 
             
                  end
         | 
    
        data/lib/nokogiri/xml/attr.rb
    CHANGED
    
    | @@ -1,3 +1,4 @@ | |
| 1 | 
            +
            # coding: utf-8
         | 
| 1 2 | 
             
            # frozen_string_literal: true
         | 
| 2 3 |  | 
| 3 4 | 
             
            module Nokogiri
         | 
| @@ -7,6 +8,54 @@ module Nokogiri | |
| 7 8 | 
             
                  alias_method :to_s, :content
         | 
| 8 9 | 
             
                  alias_method :content=, :value=
         | 
| 9 10 |  | 
| 11 | 
            +
                  #
         | 
| 12 | 
            +
                  #  :call-seq: deconstruct_keys(array_of_names) → Hash
         | 
| 13 | 
            +
                  #
         | 
| 14 | 
            +
                  #  Returns a hash describing the Attr, to use in pattern matching.
         | 
| 15 | 
            +
                  #
         | 
| 16 | 
            +
                  #  Valid keys and their values:
         | 
| 17 | 
            +
                  #  - +name+ → (String) The name of the attribute.
         | 
| 18 | 
            +
                  #  - +value+ → (String) The value of the attribute.
         | 
| 19 | 
            +
                  #  - +namespace+ → (Namespace, nil) The Namespace of the attribute, or +nil+ if there is no namespace.
         | 
| 20 | 
            +
                  #
         | 
| 21 | 
            +
                  #  ⚡ This is an experimental feature, available since v1.14.0
         | 
| 22 | 
            +
                  #
         | 
| 23 | 
            +
                  #  *Example*
         | 
| 24 | 
            +
                  #
         | 
| 25 | 
            +
                  #    doc = Nokogiri::XML.parse(<<~XML)
         | 
| 26 | 
            +
                  #      <?xml version="1.0"?>
         | 
| 27 | 
            +
                  #      <root xmlns="http://nokogiri.org/ns/default" xmlns:noko="http://nokogiri.org/ns/noko">
         | 
| 28 | 
            +
                  #        <child1 foo="abc" noko:bar="def"/>
         | 
| 29 | 
            +
                  #      </root>
         | 
| 30 | 
            +
                  #    XML
         | 
| 31 | 
            +
                  #
         | 
| 32 | 
            +
                  #    attributes = doc.root.elements.first.attribute_nodes
         | 
| 33 | 
            +
                  #    # => [#(Attr:0x35c { name = "foo", value = "abc" }),
         | 
| 34 | 
            +
                  #    #     #(Attr:0x370 {
         | 
| 35 | 
            +
                  #    #       name = "bar",
         | 
| 36 | 
            +
                  #    #       namespace = #(Namespace:0x384 {
         | 
| 37 | 
            +
                  #    #         prefix = "noko",
         | 
| 38 | 
            +
                  #    #         href = "http://nokogiri.org/ns/noko"
         | 
| 39 | 
            +
                  #    #         }),
         | 
| 40 | 
            +
                  #    #       value = "def"
         | 
| 41 | 
            +
                  #    #       })]
         | 
| 42 | 
            +
                  #
         | 
| 43 | 
            +
                  #    attributes.first.deconstruct_keys([:name, :value, :namespace])
         | 
| 44 | 
            +
                  #    # => {:name=>"foo", :value=>"abc", :namespace=>nil}
         | 
| 45 | 
            +
                  #
         | 
| 46 | 
            +
                  #    attributes.last.deconstruct_keys([:name, :value, :namespace])
         | 
| 47 | 
            +
                  #    # => {:name=>"bar",
         | 
| 48 | 
            +
                  #    #     :value=>"def",
         | 
| 49 | 
            +
                  #    #     :namespace=>
         | 
| 50 | 
            +
                  #    #      #(Namespace:0x384 {
         | 
| 51 | 
            +
                  #    #        prefix = "noko",
         | 
| 52 | 
            +
                  #    #        href = "http://nokogiri.org/ns/noko"
         | 
| 53 | 
            +
                  #    #        })}
         | 
| 54 | 
            +
                  #
         | 
| 55 | 
            +
                  def deconstruct_keys(keys)
         | 
| 56 | 
            +
                    { name: name, value: value, namespace: namespace }
         | 
| 57 | 
            +
                  end
         | 
| 58 | 
            +
             | 
| 10 59 | 
             
                  private
         | 
| 11 60 |  | 
| 12 61 | 
             
                  def inspect_attributes
         | 
    
        data/lib/nokogiri/xml/builder.rb
    CHANGED
    
    | @@ -234,7 +234,7 @@ module Nokogiri | |
| 234 234 | 
             
                #
         | 
| 235 235 | 
             
                # == Document Types
         | 
| 236 236 | 
             
                #
         | 
| 237 | 
            -
                # To create a document type (DTD),  | 
| 237 | 
            +
                # To create a document type (DTD), use the Builder#doc method to get
         | 
| 238 238 | 
             
                # the current context document.  Then call Node#create_internal_subset to
         | 
| 239 239 | 
             
                # create the DTD node.
         | 
| 240 240 | 
             
                #
         |