nokogiri 1.13.10-x86-linux → 1.14.0.rc1-x86-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +33 -0
- data/LICENSE-DEPENDENCIES.md +830 -509
- data/README.md +18 -11
- data/dependencies.yml +25 -7
- data/ext/nokogiri/extconf.rb +79 -20
- data/ext/nokogiri/gumbo.c +19 -9
- data/ext/nokogiri/html4_document.c +1 -1
- data/ext/nokogiri/html4_entity_lookup.c +1 -1
- data/ext/nokogiri/html4_sax_parser_context.c +0 -5
- data/ext/nokogiri/nokogiri.c +32 -51
- data/ext/nokogiri/nokogiri.h +17 -14
- data/ext/nokogiri/xml_attribute_decl.c +1 -1
- data/ext/nokogiri/xml_cdata.c +1 -1
- data/ext/nokogiri/xml_document.c +16 -11
- data/ext/nokogiri/xml_element_content.c +2 -2
- data/ext/nokogiri/xml_element_decl.c +1 -1
- data/ext/nokogiri/xml_encoding_handler.c +2 -2
- data/ext/nokogiri/xml_namespace.c +38 -8
- data/ext/nokogiri/xml_node.c +286 -26
- data/ext/nokogiri/xml_node_set.c +0 -2
- data/ext/nokogiri/xml_reader.c +40 -20
- data/ext/nokogiri/xml_relax_ng.c +0 -2
- data/ext/nokogiri/xml_sax_parser.c +22 -16
- data/ext/nokogiri/xml_sax_parser_context.c +0 -5
- data/ext/nokogiri/xml_sax_push_parser.c +0 -2
- data/ext/nokogiri/xml_schema.c +0 -2
- data/ext/nokogiri/xml_xpath_context.c +87 -83
- data/ext/nokogiri/xslt_stylesheet.c +14 -13
- data/gumbo-parser/Makefile +10 -0
- data/lib/nokogiri/2.7/nokogiri.so +0 -0
- data/lib/nokogiri/3.0/nokogiri.so +0 -0
- data/lib/nokogiri/3.1/nokogiri.so +0 -0
- data/lib/nokogiri/3.2/nokogiri.so +0 -0
- data/lib/nokogiri/css/node.rb +2 -2
- data/lib/nokogiri/css/xpath_visitor.rb +3 -1
- data/lib/nokogiri/css.rb +6 -0
- data/lib/nokogiri/encoding_handler.rb +57 -0
- data/lib/nokogiri/extension.rb +3 -2
- data/lib/nokogiri/html4/document.rb +2 -121
- data/lib/nokogiri/html4/element_description_defaults.rb +6 -12
- data/lib/nokogiri/html4/encoding_reader.rb +121 -0
- data/lib/nokogiri/html4.rb +1 -0
- data/lib/nokogiri/html5/document.rb +113 -36
- data/lib/nokogiri/html5/document_fragment.rb +9 -2
- data/lib/nokogiri/html5/node.rb +3 -5
- data/lib/nokogiri/html5.rb +127 -216
- data/lib/nokogiri/jruby/dependencies.rb +1 -19
- data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/version/info.rb +11 -10
- data/lib/nokogiri/xml/attr.rb +49 -0
- data/lib/nokogiri/xml/builder.rb +1 -1
- data/lib/nokogiri/xml/document.rb +102 -54
- data/lib/nokogiri/xml/document_fragment.rb +49 -6
- data/lib/nokogiri/xml/namespace.rb +42 -0
- data/lib/nokogiri/xml/node/save_options.rb +4 -2
- data/lib/nokogiri/xml/node.rb +190 -35
- data/lib/nokogiri/xml/node_set.rb +87 -9
- data/lib/nokogiri/xml/parse_options.rb +127 -48
- data/lib/nokogiri/xml/pp/node.rb +6 -4
- data/lib/nokogiri/xml/processing_instruction.rb +2 -1
- data/lib/nokogiri/xml/sax/parser.rb +2 -3
- data/lib/nokogiri/xslt.rb +1 -1
- data/lib/nokogiri.rb +3 -11
- metadata +15 -250
- data/lib/nokogiri/2.6/nokogiri.so +0 -0
    
        data/gumbo-parser/Makefile
    CHANGED
    
    | @@ -13,8 +13,18 @@ LDFLAGS := -pthread | |
| 13 13 |  | 
| 14 14 | 
             
            all: check
         | 
| 15 15 |  | 
| 16 | 
            +
            # don't try to regenerate ragel or gperf files in CI, that should be a development-only action and
         | 
| 17 | 
            +
            # the generated files should be committed to SCM
         | 
| 18 | 
            +
            ifneq ($(CI),true)
         | 
| 19 | 
            +
            src/foreign_attrs.c: src/foreign_attrs.gperf
         | 
| 20 | 
            +
            	gperf -m100 -n $< | ./gperf-filter.sed > $@
         | 
| 21 | 
            +
             | 
| 22 | 
            +
            src/%.c: src/%.gperf
         | 
| 23 | 
            +
            	gperf -m100 $< | ./gperf-filter.sed > $@
         | 
| 24 | 
            +
             | 
| 16 25 | 
             
            src/%.c: src/%.rl
         | 
| 17 26 | 
             
            	ragel -F1 -o $@ $<
         | 
| 27 | 
            +
            endif
         | 
| 18 28 |  | 
| 19 29 | 
             
            build/src:
         | 
| 20 30 | 
             
            	mkdir -p $@
         | 
| Binary file | 
| Binary file | 
| Binary file | 
| Binary file | 
    
        data/lib/nokogiri/css/node.rb
    CHANGED
    
    
| @@ -278,7 +278,9 @@ module Nokogiri | |
| 278 278 | 
             
                  end
         | 
| 279 279 |  | 
| 280 280 | 
             
                  def nth(node, options = {})
         | 
| 281 | 
            -
                     | 
| 281 | 
            +
                    unless node.value.size == 4
         | 
| 282 | 
            +
                      raise(ArgumentError, "expected an+b node to contain 4 tokens, but is #{node.value.inspect}")
         | 
| 283 | 
            +
                    end
         | 
| 282 284 |  | 
| 283 285 | 
             
                    a, b = read_a_and_positive_b(node.value)
         | 
| 284 286 | 
             
                    position = if options[:child]
         | 
    
        data/lib/nokogiri/css.rb
    CHANGED
    
    | @@ -40,9 +40,15 @@ module Nokogiri | |
| 40 40 | 
             
                  # 💡 Note that translated queries are cached for performance concerns.
         | 
| 41 41 | 
             
                  #
         | 
| 42 42 | 
             
                  def xpath_for(selector, options = {})
         | 
| 43 | 
            +
                    raise TypeError, "no implicit conversion of #{selector.inspect} to String" unless selector.respond_to?(:to_str)
         | 
| 44 | 
            +
             | 
| 45 | 
            +
                    selector = selector.to_str
         | 
| 46 | 
            +
                    raise Nokogiri::CSS::SyntaxError, "empty CSS selector" if selector.empty?
         | 
| 47 | 
            +
             | 
| 43 48 | 
             
                    prefix = options.fetch(:prefix, Nokogiri::XML::XPath::GLOBAL_SEARCH_PREFIX)
         | 
| 44 49 | 
             
                    visitor = options.fetch(:visitor) { Nokogiri::CSS::XPathVisitor.new }
         | 
| 45 50 | 
             
                    ns = options.fetch(:ns, {})
         | 
| 51 | 
            +
             | 
| 46 52 | 
             
                    Parser.new(ns).xpath_for(selector, prefix, visitor)
         | 
| 47 53 | 
             
                  end
         | 
| 48 54 | 
             
                end
         | 
| @@ -0,0 +1,57 @@ | |
| 1 | 
            +
            # encoding: utf-8
         | 
| 2 | 
            +
            # frozen_string_literal: true
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            module Nokogiri
         | 
| 5 | 
            +
              class EncodingHandler
         | 
| 6 | 
            +
                # Popular encoding aliases not known by all iconv implementations that Nokogiri should support.
         | 
| 7 | 
            +
                USEFUL_ALIASES = {
         | 
| 8 | 
            +
                  # alias_name => true_name
         | 
| 9 | 
            +
                  "NOKOGIRI-SENTINEL" => "UTF-8", # indicating the Nokogiri has installed aliases
         | 
| 10 | 
            +
                  "Windows-31J" => "CP932", # Windows-31J is the IANA registered name of CP932.
         | 
| 11 | 
            +
                  "UTF-8" => "UTF-8", # for JRuby tests, this is a no-op in CRuby
         | 
| 12 | 
            +
                }
         | 
| 13 | 
            +
             | 
| 14 | 
            +
                class << self
         | 
| 15 | 
            +
                  def install_default_aliases
         | 
| 16 | 
            +
                    USEFUL_ALIASES.each do |alias_name, name|
         | 
| 17 | 
            +
                      EncodingHandler.alias(name, alias_name) if EncodingHandler[alias_name].nil?
         | 
| 18 | 
            +
                    end
         | 
| 19 | 
            +
                  end
         | 
| 20 | 
            +
                end
         | 
| 21 | 
            +
             | 
| 22 | 
            +
                # :stopdoc:
         | 
| 23 | 
            +
                if Nokogiri.jruby?
         | 
| 24 | 
            +
                  class << self
         | 
| 25 | 
            +
                    def [](name)
         | 
| 26 | 
            +
                      storage.key?(name) ? new(storage[name]) : nil
         | 
| 27 | 
            +
                    end
         | 
| 28 | 
            +
             | 
| 29 | 
            +
                    def alias(name, alias_name)
         | 
| 30 | 
            +
                      storage[alias_name] = name
         | 
| 31 | 
            +
                    end
         | 
| 32 | 
            +
             | 
| 33 | 
            +
                    def delete(name)
         | 
| 34 | 
            +
                      storage.delete(name)
         | 
| 35 | 
            +
                    end
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                    def clear_aliases!
         | 
| 38 | 
            +
                      storage.clear
         | 
| 39 | 
            +
                    end
         | 
| 40 | 
            +
             | 
| 41 | 
            +
                    private
         | 
| 42 | 
            +
             | 
| 43 | 
            +
                    def storage
         | 
| 44 | 
            +
                      @storage ||= {}
         | 
| 45 | 
            +
                    end
         | 
| 46 | 
            +
                  end
         | 
| 47 | 
            +
             | 
| 48 | 
            +
                  def initialize(name)
         | 
| 49 | 
            +
                    @name = name
         | 
| 50 | 
            +
                  end
         | 
| 51 | 
            +
             | 
| 52 | 
            +
                  attr_reader :name
         | 
| 53 | 
            +
                end
         | 
| 54 | 
            +
              end
         | 
| 55 | 
            +
            end
         | 
| 56 | 
            +
             | 
| 57 | 
            +
            Nokogiri::EncodingHandler.install_default_aliases
         | 
    
        data/lib/nokogiri/extension.rb
    CHANGED
    
    | @@ -3,13 +3,14 @@ | |
| 3 3 | 
             
            # load the C or Java extension
         | 
| 4 4 | 
             
            begin
         | 
| 5 5 | 
             
              # native precompiled gems package shared libraries in <gem_dir>/lib/nokogiri/<ruby_version>
         | 
| 6 | 
            -
               | 
| 6 | 
            +
              RUBY_VERSION =~ /(\d+\.\d+)/
         | 
| 7 7 | 
             
              require_relative "#{Regexp.last_match(1)}/nokogiri"
         | 
| 8 8 | 
             
            rescue LoadError => e
         | 
| 9 9 | 
             
              if /GLIBC/.match?(e.message)
         | 
| 10 10 | 
             
                warn(<<~EOM)
         | 
| 11 11 |  | 
| 12 | 
            -
                  ERROR: It looks like you're trying to use Nokogiri as a precompiled native gem on a system | 
| 12 | 
            +
                  ERROR: It looks like you're trying to use Nokogiri as a precompiled native gem on a system
         | 
| 13 | 
            +
                         with an unsupported version of glibc.
         | 
| 13 14 |  | 
| 14 15 | 
             
                    #{e.message}
         | 
| 15 16 |  | 
| @@ -176,7 +176,7 @@ module Nokogiri | |
| 176 176 | 
             
                      url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
         | 
| 177 177 |  | 
| 178 178 | 
             
                      if string_or_io.respond_to?(:encoding)
         | 
| 179 | 
            -
                        unless string_or_io.encoding | 
| 179 | 
            +
                        unless string_or_io.encoding == Encoding::ASCII_8BIT
         | 
| 180 180 | 
             
                          encoding ||= string_or_io.encoding.name
         | 
| 181 181 | 
             
                        end
         | 
| 182 182 | 
             
                      end
         | 
| @@ -189,21 +189,10 @@ module Nokogiri | |
| 189 189 | 
             
                        end
         | 
| 190 190 |  | 
| 191 191 | 
             
                        unless encoding
         | 
| 192 | 
            -
                          # Libxml2's parser has poor support for encoding
         | 
| 193 | 
            -
                          # detection.  First, it does not recognize the HTML5
         | 
| 194 | 
            -
                          # style meta charset declaration.  Secondly, even if it
         | 
| 195 | 
            -
                          # successfully detects an encoding hint, it does not
         | 
| 196 | 
            -
                          # re-decode or re-parse the preceding part which may be
         | 
| 197 | 
            -
                          # garbled.
         | 
| 198 | 
            -
                          #
         | 
| 199 | 
            -
                          # EncodingReader aims to perform advanced encoding
         | 
| 200 | 
            -
                          # detection beyond what Libxml2 does, and to emulate
         | 
| 201 | 
            -
                          # rewinding of a stream and make Libxml2 redo parsing
         | 
| 202 | 
            -
                          # from the start when an encoding hint is found.
         | 
| 203 192 | 
             
                          string_or_io = EncodingReader.new(string_or_io)
         | 
| 204 193 | 
             
                          begin
         | 
| 205 194 | 
             
                            return read_io(string_or_io, url, encoding, options.to_i)
         | 
| 206 | 
            -
                          rescue EncodingFound => e
         | 
| 195 | 
            +
                          rescue EncodingReader::EncodingFound => e
         | 
| 207 196 | 
             
                            encoding = e.found_encoding
         | 
| 208 197 | 
             
                          end
         | 
| 209 198 | 
             
                        end
         | 
| @@ -220,114 +209,6 @@ module Nokogiri | |
| 220 209 | 
             
                      read_memory(string_or_io, url, encoding, options.to_i)
         | 
| 221 210 | 
             
                    end
         | 
| 222 211 | 
             
                  end
         | 
| 223 | 
            -
             | 
| 224 | 
            -
                  class EncodingFound < StandardError # :nodoc: all
         | 
| 225 | 
            -
                    attr_reader :found_encoding
         | 
| 226 | 
            -
             | 
| 227 | 
            -
                    def initialize(encoding)
         | 
| 228 | 
            -
                      @found_encoding = encoding
         | 
| 229 | 
            -
                      super(format("encoding found: %s", encoding))
         | 
| 230 | 
            -
                    end
         | 
| 231 | 
            -
                  end
         | 
| 232 | 
            -
             | 
| 233 | 
            -
                  # :nodoc: all
         | 
| 234 | 
            -
                  class EncodingReader
         | 
| 235 | 
            -
                    class SAXHandler < Nokogiri::XML::SAX::Document
         | 
| 236 | 
            -
                      attr_reader :encoding
         | 
| 237 | 
            -
             | 
| 238 | 
            -
                      def initialize
         | 
| 239 | 
            -
                        @encoding = nil
         | 
| 240 | 
            -
                        super()
         | 
| 241 | 
            -
                      end
         | 
| 242 | 
            -
             | 
| 243 | 
            -
                      def start_element(name, attrs = [])
         | 
| 244 | 
            -
                        return unless name == "meta"
         | 
| 245 | 
            -
             | 
| 246 | 
            -
                        attr = Hash[attrs]
         | 
| 247 | 
            -
                        (charset = attr["charset"]) &&
         | 
| 248 | 
            -
                          (@encoding = charset)
         | 
| 249 | 
            -
                        (http_equiv = attr["http-equiv"]) &&
         | 
| 250 | 
            -
                          http_equiv.match(/\AContent-Type\z/i) &&
         | 
| 251 | 
            -
                          (content = attr["content"]) &&
         | 
| 252 | 
            -
                          (m = content.match(/;\s*charset\s*=\s*([\w-]+)/)) &&
         | 
| 253 | 
            -
                          (@encoding = m[1])
         | 
| 254 | 
            -
                      end
         | 
| 255 | 
            -
                    end
         | 
| 256 | 
            -
             | 
| 257 | 
            -
                    class JumpSAXHandler < SAXHandler
         | 
| 258 | 
            -
                      def initialize(jumptag)
         | 
| 259 | 
            -
                        @jumptag = jumptag
         | 
| 260 | 
            -
                        super()
         | 
| 261 | 
            -
                      end
         | 
| 262 | 
            -
             | 
| 263 | 
            -
                      def start_element(name, attrs = [])
         | 
| 264 | 
            -
                        super
         | 
| 265 | 
            -
                        throw(@jumptag, @encoding) if @encoding
         | 
| 266 | 
            -
                        throw(@jumptag, nil) if /\A(?:div|h1|img|p|br)\z/.match?(name)
         | 
| 267 | 
            -
                      end
         | 
| 268 | 
            -
                    end
         | 
| 269 | 
            -
             | 
| 270 | 
            -
                    def self.detect_encoding(chunk)
         | 
| 271 | 
            -
                      (m = chunk.match(/\A(<\?xml[ \t\r\n][^>]*>)/)) &&
         | 
| 272 | 
            -
                        (return Nokogiri.XML(m[1]).encoding)
         | 
| 273 | 
            -
             | 
| 274 | 
            -
                      if Nokogiri.jruby?
         | 
| 275 | 
            -
                        (m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)) &&
         | 
| 276 | 
            -
                          (return m[4])
         | 
| 277 | 
            -
                        catch(:encoding_found) do
         | 
| 278 | 
            -
                          Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
         | 
| 279 | 
            -
                          nil
         | 
| 280 | 
            -
                        end
         | 
| 281 | 
            -
                      else
         | 
| 282 | 
            -
                        handler = SAXHandler.new
         | 
| 283 | 
            -
                        parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
         | 
| 284 | 
            -
                        begin
         | 
| 285 | 
            -
                          parser << chunk
         | 
| 286 | 
            -
                        rescue
         | 
| 287 | 
            -
                          Nokogiri::SyntaxError
         | 
| 288 | 
            -
                        end
         | 
| 289 | 
            -
                        handler.encoding
         | 
| 290 | 
            -
                      end
         | 
| 291 | 
            -
                    end
         | 
| 292 | 
            -
             | 
| 293 | 
            -
                    def initialize(io)
         | 
| 294 | 
            -
                      @io = io
         | 
| 295 | 
            -
                      @firstchunk = nil
         | 
| 296 | 
            -
                      @encoding_found = nil
         | 
| 297 | 
            -
                    end
         | 
| 298 | 
            -
             | 
| 299 | 
            -
                    # This method is used by the C extension so that
         | 
| 300 | 
            -
                    # Nokogiri::HTML4::Document#read_io() does not leak memory when
         | 
| 301 | 
            -
                    # EncodingFound is raised.
         | 
| 302 | 
            -
                    attr_reader :encoding_found
         | 
| 303 | 
            -
             | 
| 304 | 
            -
                    def read(len)
         | 
| 305 | 
            -
                      # no support for a call without len
         | 
| 306 | 
            -
             | 
| 307 | 
            -
                      unless @firstchunk
         | 
| 308 | 
            -
                        (@firstchunk = @io.read(len)) || (return nil)
         | 
| 309 | 
            -
             | 
| 310 | 
            -
                        # This implementation expects that the first call from
         | 
| 311 | 
            -
                        # htmlReadIO() is made with a length long enough (~1KB) to
         | 
| 312 | 
            -
                        # achieve advanced encoding detection.
         | 
| 313 | 
            -
                        if (encoding = EncodingReader.detect_encoding(@firstchunk))
         | 
| 314 | 
            -
                          # The first chunk is stored for the next read in retry.
         | 
| 315 | 
            -
                          raise @encoding_found = EncodingFound.new(encoding)
         | 
| 316 | 
            -
                        end
         | 
| 317 | 
            -
                      end
         | 
| 318 | 
            -
                      @encoding_found = nil
         | 
| 319 | 
            -
             | 
| 320 | 
            -
                      ret = @firstchunk.slice!(0, len)
         | 
| 321 | 
            -
                      if (len -= ret.length) > 0
         | 
| 322 | 
            -
                        (rest = @io.read(len)) && ret << (rest)
         | 
| 323 | 
            -
                      end
         | 
| 324 | 
            -
                      if ret.empty?
         | 
| 325 | 
            -
                        nil
         | 
| 326 | 
            -
                      else
         | 
| 327 | 
            -
                        ret
         | 
| 328 | 
            -
                      end
         | 
| 329 | 
            -
                    end
         | 
| 330 | 
            -
                  end
         | 
| 331 212 | 
             
                end
         | 
| 332 213 | 
             
              end
         | 
| 333 214 | 
             
            end
         | 
| @@ -25,43 +25,37 @@ module Nokogiri | |
| 25 25 |  | 
| 26 26 | 
             
                  unless method_defined?(:implied_start_tag?)
         | 
| 27 27 | 
             
                    def implied_start_tag?
         | 
| 28 | 
            -
                      d = default_desc
         | 
| 29 | 
            -
                      d ? d.startTag : nil
         | 
| 28 | 
            +
                      default_desc&.startTag
         | 
| 30 29 | 
             
                    end
         | 
| 31 30 | 
             
                  end
         | 
| 32 31 |  | 
| 33 32 | 
             
                  unless method_defined?(:implied_end_tag?)
         | 
| 34 33 | 
             
                    def implied_end_tag?
         | 
| 35 | 
            -
                      d = default_desc
         | 
| 36 | 
            -
                      d ? d.endTag : nil
         | 
| 34 | 
            +
                      default_desc&.endTag
         | 
| 37 35 | 
             
                    end
         | 
| 38 36 | 
             
                  end
         | 
| 39 37 |  | 
| 40 38 | 
             
                  unless method_defined?(:save_end_tag?)
         | 
| 41 39 | 
             
                    def save_end_tag?
         | 
| 42 | 
            -
                      d = default_desc
         | 
| 43 | 
            -
                      d ? d.saveEndTag : nil
         | 
| 40 | 
            +
                      default_desc&.saveEndTag
         | 
| 44 41 | 
             
                    end
         | 
| 45 42 | 
             
                  end
         | 
| 46 43 |  | 
| 47 44 | 
             
                  unless method_defined?(:deprecated?)
         | 
| 48 45 | 
             
                    def deprecated?
         | 
| 49 | 
            -
                      d = default_desc
         | 
| 50 | 
            -
                      d ? d.depr : nil
         | 
| 46 | 
            +
                      default_desc&.depr
         | 
| 51 47 | 
             
                    end
         | 
| 52 48 | 
             
                  end
         | 
| 53 49 |  | 
| 54 50 | 
             
                  unless method_defined?(:description)
         | 
| 55 51 | 
             
                    def description
         | 
| 56 | 
            -
                      d = default_desc
         | 
| 57 | 
            -
                      d ? d.desc : nil
         | 
| 52 | 
            +
                      default_desc&.desc
         | 
| 58 53 | 
             
                    end
         | 
| 59 54 | 
             
                  end
         | 
| 60 55 |  | 
| 61 56 | 
             
                  unless method_defined?(:default_sub_element)
         | 
| 62 57 | 
             
                    def default_sub_element
         | 
| 63 | 
            -
                      d = default_desc
         | 
| 64 | 
            -
                      d ? d.defaultsubelt : nil
         | 
| 58 | 
            +
                      default_desc&.defaultsubelt
         | 
| 65 59 | 
             
                    end
         | 
| 66 60 | 
             
                  end
         | 
| 67 61 |  | 
| @@ -0,0 +1,121 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Nokogiri
         | 
| 4 | 
            +
              module HTML4
         | 
| 5 | 
            +
                # Libxml2's parser has poor support for encoding detection.  First, it does not recognize the
         | 
| 6 | 
            +
                # HTML5 style meta charset declaration.  Secondly, even if it successfully detects an encoding
         | 
| 7 | 
            +
                # hint, it does not re-decode or re-parse the preceding part which may be garbled.
         | 
| 8 | 
            +
                #
         | 
| 9 | 
            +
                # EncodingReader aims to perform advanced encoding detection beyond what Libxml2 does, and to
         | 
| 10 | 
            +
                # emulate rewinding of a stream and make Libxml2 redo parsing from the start when an encoding
         | 
| 11 | 
            +
                # hint is found.
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                # :nodoc: all
         | 
| 14 | 
            +
                class EncodingReader
         | 
| 15 | 
            +
                  class EncodingFound < StandardError
         | 
| 16 | 
            +
                    attr_reader :found_encoding
         | 
| 17 | 
            +
             | 
| 18 | 
            +
                    def initialize(encoding)
         | 
| 19 | 
            +
                      @found_encoding = encoding
         | 
| 20 | 
            +
                      super(format("encoding found: %s", encoding))
         | 
| 21 | 
            +
                    end
         | 
| 22 | 
            +
                  end
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                  class SAXHandler < Nokogiri::XML::SAX::Document
         | 
| 25 | 
            +
                    attr_reader :encoding
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                    def initialize
         | 
| 28 | 
            +
                      @encoding = nil
         | 
| 29 | 
            +
                      super()
         | 
| 30 | 
            +
                    end
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                    def start_element(name, attrs = [])
         | 
| 33 | 
            +
                      return unless name == "meta"
         | 
| 34 | 
            +
             | 
| 35 | 
            +
                      attr = Hash[attrs]
         | 
| 36 | 
            +
                      (charset = attr["charset"]) &&
         | 
| 37 | 
            +
                        (@encoding = charset)
         | 
| 38 | 
            +
                      (http_equiv = attr["http-equiv"]) &&
         | 
| 39 | 
            +
                        http_equiv.match(/\AContent-Type\z/i) &&
         | 
| 40 | 
            +
                        (content = attr["content"]) &&
         | 
| 41 | 
            +
                        (m = content.match(/;\s*charset\s*=\s*([\w-]+)/)) &&
         | 
| 42 | 
            +
                        (@encoding = m[1])
         | 
| 43 | 
            +
                    end
         | 
| 44 | 
            +
                  end
         | 
| 45 | 
            +
             | 
| 46 | 
            +
                  class JumpSAXHandler < SAXHandler
         | 
| 47 | 
            +
                    def initialize(jumptag)
         | 
| 48 | 
            +
                      @jumptag = jumptag
         | 
| 49 | 
            +
                      super()
         | 
| 50 | 
            +
                    end
         | 
| 51 | 
            +
             | 
| 52 | 
            +
                    def start_element(name, attrs = [])
         | 
| 53 | 
            +
                      super
         | 
| 54 | 
            +
                      throw(@jumptag, @encoding) if @encoding
         | 
| 55 | 
            +
                      throw(@jumptag, nil) if /\A(?:div|h1|img|p|br)\z/.match?(name)
         | 
| 56 | 
            +
                    end
         | 
| 57 | 
            +
                  end
         | 
| 58 | 
            +
             | 
| 59 | 
            +
                  def self.detect_encoding(chunk)
         | 
| 60 | 
            +
                    (m = chunk.match(/\A(<\?xml[ \t\r\n][^>]*>)/)) &&
         | 
| 61 | 
            +
                      (return Nokogiri.XML(m[1]).encoding)
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                    if Nokogiri.jruby?
         | 
| 64 | 
            +
                      (m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)) &&
         | 
| 65 | 
            +
                        (return m[4])
         | 
| 66 | 
            +
                      catch(:encoding_found) do
         | 
| 67 | 
            +
                        Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
         | 
| 68 | 
            +
                        nil
         | 
| 69 | 
            +
                      end
         | 
| 70 | 
            +
                    else
         | 
| 71 | 
            +
                      handler = SAXHandler.new
         | 
| 72 | 
            +
                      parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
         | 
| 73 | 
            +
                      begin
         | 
| 74 | 
            +
                        parser << chunk
         | 
| 75 | 
            +
                      rescue
         | 
| 76 | 
            +
                        Nokogiri::SyntaxError
         | 
| 77 | 
            +
                      end
         | 
| 78 | 
            +
                      handler.encoding
         | 
| 79 | 
            +
                    end
         | 
| 80 | 
            +
                  end
         | 
| 81 | 
            +
             | 
| 82 | 
            +
                  def initialize(io)
         | 
| 83 | 
            +
                    @io = io
         | 
| 84 | 
            +
                    @firstchunk = nil
         | 
| 85 | 
            +
                    @encoding_found = nil
         | 
| 86 | 
            +
                  end
         | 
| 87 | 
            +
             | 
| 88 | 
            +
                  # This method is used by the C extension so that
         | 
| 89 | 
            +
                  # Nokogiri::HTML4::Document#read_io() does not leak memory when
         | 
| 90 | 
            +
                  # EncodingFound is raised.
         | 
| 91 | 
            +
                  attr_reader :encoding_found
         | 
| 92 | 
            +
             | 
| 93 | 
            +
                  def read(len)
         | 
| 94 | 
            +
                    # no support for a call without len
         | 
| 95 | 
            +
             | 
| 96 | 
            +
                    unless @firstchunk
         | 
| 97 | 
            +
                      (@firstchunk = @io.read(len)) || (return nil)
         | 
| 98 | 
            +
             | 
| 99 | 
            +
                      # This implementation expects that the first call from
         | 
| 100 | 
            +
                      # htmlReadIO() is made with a length long enough (~1KB) to
         | 
| 101 | 
            +
                      # achieve advanced encoding detection.
         | 
| 102 | 
            +
                      if (encoding = EncodingReader.detect_encoding(@firstchunk))
         | 
| 103 | 
            +
                        # The first chunk is stored for the next read in retry.
         | 
| 104 | 
            +
                        raise @encoding_found = EncodingFound.new(encoding)
         | 
| 105 | 
            +
                      end
         | 
| 106 | 
            +
                    end
         | 
| 107 | 
            +
                    @encoding_found = nil
         | 
| 108 | 
            +
             | 
| 109 | 
            +
                    ret = @firstchunk.slice!(0, len)
         | 
| 110 | 
            +
                    if (len -= ret.length) > 0
         | 
| 111 | 
            +
                      (rest = @io.read(len)) && ret << (rest)
         | 
| 112 | 
            +
                    end
         | 
| 113 | 
            +
                    if ret.empty?
         | 
| 114 | 
            +
                      nil
         | 
| 115 | 
            +
                    else
         | 
| 116 | 
            +
                      ret
         | 
| 117 | 
            +
                    end
         | 
| 118 | 
            +
                  end
         | 
| 119 | 
            +
                end
         | 
| 120 | 
            +
              end
         | 
| 121 | 
            +
            end
         | 
    
        data/lib/nokogiri/html4.rb
    CHANGED
    
    | @@ -39,6 +39,7 @@ end | |
| 39 39 | 
             
            require_relative "html4/entity_lookup"
         | 
| 40 40 | 
             
            require_relative "html4/document"
         | 
| 41 41 | 
             
            require_relative "html4/document_fragment"
         | 
| 42 | 
            +
            require_relative "html4/encoding_reader"
         | 
| 42 43 | 
             
            require_relative "html4/sax/parser_context"
         | 
| 43 44 | 
             
            require_relative "html4/sax/parser"
         | 
| 44 45 | 
             
            require_relative "html4/sax/push_parser"
         | 
| @@ -21,48 +21,137 @@ require_relative "../html4/document" | |
| 21 21 |  | 
| 22 22 | 
             
            module Nokogiri
         | 
| 23 23 | 
             
              module HTML5
         | 
| 24 | 
            +
                # Enum for the HTML5 parser quirks mode values. Values returned by HTML5::Document#quirks_mode
         | 
| 25 | 
            +
                #
         | 
| 26 | 
            +
                # See https://dom.spec.whatwg.org/#concept-document-quirks for more information on HTML5 quirks
         | 
| 27 | 
            +
                # mode.
         | 
| 28 | 
            +
                #
         | 
| 29 | 
            +
                # Since v1.14.0
         | 
| 30 | 
            +
                module QuirksMode
         | 
| 31 | 
            +
                  NO_QUIRKS = 0 # The document was parsed in "no-quirks" mode
         | 
| 32 | 
            +
                  QUIRKS = 1 # The document was parsed in "quirks" mode
         | 
| 33 | 
            +
                  LIMITED_QUIRKS = 2 # The document was parsed in "limited-quirks" mode
         | 
| 34 | 
            +
                end
         | 
| 35 | 
            +
             | 
| 24 36 | 
             
                # Since v1.12.0
         | 
| 25 37 | 
             
                #
         | 
| 26 38 | 
             
                # 💡 HTML5 functionality is not available when running JRuby.
         | 
| 27 39 | 
             
                class Document < Nokogiri::HTML4::Document
         | 
| 28 | 
            -
                   | 
| 29 | 
            -
             | 
| 30 | 
            -
             | 
| 40 | 
            +
                  # Get the url name for this document, as passed into Document.parse, Document.read_io, or
         | 
| 41 | 
            +
                  # Document.read_memory
         | 
| 42 | 
            +
                  attr_reader :url
         | 
| 31 43 |  | 
| 32 | 
            -
             | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 44 | 
            +
                  # Get the parser's quirks mode value. See HTML5::QuirksMode.
         | 
| 45 | 
            +
                  #
         | 
| 46 | 
            +
                  # This method returns `nil` if the parser was not invoked (e.g., `Nokogiri::HTML5::Document.new`).
         | 
| 47 | 
            +
                  #
         | 
| 48 | 
            +
                  # Since v1.14.0
         | 
| 49 | 
            +
                  attr_reader :quirks_mode
         | 
| 35 50 |  | 
| 36 | 
            -
             | 
| 37 | 
            -
             | 
| 51 | 
            +
                  class << self
         | 
| 52 | 
            +
                    # :call-seq:
         | 
| 53 | 
            +
                    #   parse(input)
         | 
| 54 | 
            +
                    #   parse(input, url=nil, encoding=nil, **options)
         | 
| 55 | 
            +
                    #   parse(input, url=nil, encoding=nil) { |options| ... }
         | 
| 56 | 
            +
                    #
         | 
| 57 | 
            +
                    # Parse HTML5 input.
         | 
| 58 | 
            +
                    #
         | 
| 59 | 
            +
                    # [Parameters]
         | 
| 60 | 
            +
                    # - +input+ may be a String, or any object that responds to _read_ and _close_ such as an
         | 
| 61 | 
            +
                    #   IO, or StringIO.
         | 
| 62 | 
            +
                    #
         | 
| 63 | 
            +
                    # - +url+ (optional) is a String indicating the canonical URI where this document is located.
         | 
| 64 | 
            +
                    #
         | 
| 65 | 
            +
                    # - +encoding+ (optional) is the encoding that should be used when processing
         | 
| 66 | 
            +
                    #   the document.
         | 
| 67 | 
            +
                    #
         | 
| 68 | 
            +
                    # - +options+ (optional) is a configuration Hash (or keyword arguments) to set options
         | 
| 69 | 
            +
                    #   during parsing. The three currently supported options are +:max_errors+,
         | 
| 70 | 
            +
                    #   +:max_tree_depth+ and +:max_attributes+, described at Nokogiri::HTML5.
         | 
| 71 | 
            +
                    #
         | 
| 72 | 
            +
                    #   ⚠ Note that these options are different than those made available by
         | 
| 73 | 
            +
                    #   Nokogiri::XML::Document and Nokogiri::HTML4::Document.
         | 
| 74 | 
            +
                    #
         | 
| 75 | 
            +
                    # - +block+ (optional) is passed a configuration Hash on which parse options may be set. See
         | 
| 76 | 
            +
                    #   Nokogiri::HTML5 for more information and usage.
         | 
| 77 | 
            +
                    #
         | 
| 78 | 
            +
                    # [Returns] Nokogiri::HTML5::Document
         | 
| 79 | 
            +
                    #
         | 
| 80 | 
            +
                    def parse(string_or_io, url = nil, encoding = nil, **options, &block)
         | 
| 81 | 
            +
                      yield options if block
         | 
| 82 | 
            +
                      string_or_io = "" unless string_or_io
         | 
| 83 | 
            +
             | 
| 84 | 
            +
                      if string_or_io.respond_to?(:encoding) && string_or_io.encoding != Encoding::ASCII_8BIT
         | 
| 85 | 
            +
                        encoding ||= string_or_io.encoding.name
         | 
| 86 | 
            +
                      end
         | 
| 87 | 
            +
             | 
| 88 | 
            +
                      if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
         | 
| 89 | 
            +
                        url ||= string_or_io.path
         | 
| 90 | 
            +
                      end
         | 
| 91 | 
            +
                      unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
         | 
| 92 | 
            +
                        raise ArgumentError, "not a string or IO object"
         | 
| 93 | 
            +
                      end
         | 
| 94 | 
            +
             | 
| 95 | 
            +
                      do_parse(string_or_io, url, encoding, options)
         | 
| 38 96 | 
             
                    end
         | 
| 39 | 
            -
             | 
| 40 | 
            -
             | 
| 97 | 
            +
             | 
| 98 | 
            +
                    # Create a new document from an IO object.
         | 
| 99 | 
            +
                    #
         | 
| 100 | 
            +
                    # 💡 Most users should prefer Document.parse to this method.
         | 
| 101 | 
            +
                    def read_io(io, url = nil, encoding = nil, **options)
         | 
| 102 | 
            +
                      raise ArgumentError, "io object doesn't respond to :read" unless io.respond_to?(:read)
         | 
| 103 | 
            +
             | 
| 104 | 
            +
                      do_parse(io, url, encoding, options)
         | 
| 41 105 | 
             
                    end
         | 
| 42 106 |  | 
| 43 | 
            -
                     | 
| 44 | 
            -
             | 
| 107 | 
            +
                    # Create a new document from a String.
         | 
| 108 | 
            +
                    #
         | 
| 109 | 
            +
                    # 💡 Most users should prefer Document.parse to this method.
         | 
| 110 | 
            +
                    def read_memory(string, url = nil, encoding = nil, **options)
         | 
| 111 | 
            +
                      raise ArgumentError, "string object doesn't respond to :to_str" unless string.respond_to?(:to_str)
         | 
| 45 112 |  | 
| 46 | 
            -
             | 
| 47 | 
            -
                     | 
| 113 | 
            +
                      do_parse(string, url, encoding, options)
         | 
| 114 | 
            +
                    end
         | 
| 48 115 |  | 
| 49 | 
            -
                     | 
| 50 | 
            -
                  end
         | 
| 116 | 
            +
                    private
         | 
| 51 117 |  | 
| 52 | 
            -
             | 
| 53 | 
            -
             | 
| 118 | 
            +
                    def do_parse(string_or_io, url, encoding, options)
         | 
| 119 | 
            +
                      string = HTML5.read_and_encode(string_or_io, encoding)
         | 
| 120 | 
            +
                      max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
         | 
| 121 | 
            +
                      max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
         | 
| 122 | 
            +
                      max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
         | 
| 123 | 
            +
                      doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth, self)
         | 
| 124 | 
            +
                      doc.encoding = "UTF-8"
         | 
| 125 | 
            +
                      doc
         | 
| 126 | 
            +
                    end
         | 
| 127 | 
            +
                  end
         | 
| 54 128 |  | 
| 55 | 
            -
             | 
| 129 | 
            +
                  def initialize(*args) # :nodoc:
         | 
| 130 | 
            +
                    super
         | 
| 131 | 
            +
                    @url = nil
         | 
| 132 | 
            +
                    @quirks_mode = nil
         | 
| 56 133 | 
             
                  end
         | 
| 57 134 |  | 
| 58 | 
            -
                   | 
| 59 | 
            -
             | 
| 135 | 
            +
                  # :call-seq:
         | 
| 136 | 
            +
                  #   fragment() → Nokogiri::HTML5::DocumentFragment
         | 
| 137 | 
            +
                  #   fragment(markup) → Nokogiri::HTML5::DocumentFragment
         | 
| 138 | 
            +
                  #
         | 
| 139 | 
            +
                  # Parse a HTML5 document fragment from +markup+, returning a Nokogiri::HTML5::DocumentFragment.
         | 
| 140 | 
            +
                  #
         | 
| 141 | 
            +
                  # [Properties]
         | 
| 142 | 
            +
                  # - +markup+ (String) The HTML5 markup fragment to be parsed
         | 
| 143 | 
            +
                  #
         | 
| 144 | 
            +
                  # [Returns]
         | 
| 145 | 
            +
                  #   Nokogiri::HTML5::DocumentFragment. This object's children will be empty if `markup` is not passed, is empty, or is `nil`.
         | 
| 146 | 
            +
                  #
         | 
| 147 | 
            +
                  def fragment(markup = nil)
         | 
| 148 | 
            +
                    DocumentFragment.new(self, markup)
         | 
| 60 149 | 
             
                  end
         | 
| 61 150 |  | 
| 62 | 
            -
                  def to_xml(options = {}, &block)
         | 
| 151 | 
            +
                  def to_xml(options = {}, &block) # :nodoc:
         | 
| 63 152 | 
             
                    # Bypass XML::Document#to_xml which doesn't add
         | 
| 64 153 | 
             
                    # XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
         | 
| 65 | 
            -
                    XML::Node.instance_method(:to_xml). | 
| 154 | 
            +
                    XML::Node.instance_method(:to_xml).bind_call(self, options, &block)
         | 
| 66 155 | 
             
                  end
         | 
| 67 156 |  | 
| 68 157 | 
             
                  # :call-seq:
         | 
| @@ -70,22 +159,10 @@ module Nokogiri | |
| 70 159 | 
             
                  #
         | 
| 71 160 | 
             
                  # [Returns] The document type which determines CSS-to-XPath translation.
         | 
| 72 161 | 
             
                  #
         | 
| 73 | 
            -
                  # See XPathVisitor for more information.
         | 
| 162 | 
            +
                  # See CSS::XPathVisitor for more information.
         | 
| 74 163 | 
             
                  def xpath_doctype
         | 
| 75 164 | 
             
                    Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML5
         | 
| 76 165 | 
             
                  end
         | 
| 77 | 
            -
             | 
| 78 | 
            -
                  private
         | 
| 79 | 
            -
             | 
| 80 | 
            -
                  def self.do_parse(string_or_io, url, encoding, options)
         | 
| 81 | 
            -
                    string = HTML5.read_and_encode(string_or_io, encoding)
         | 
| 82 | 
            -
                    max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
         | 
| 83 | 
            -
                    max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
         | 
| 84 | 
            -
                    max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
         | 
| 85 | 
            -
                    doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth)
         | 
| 86 | 
            -
                    doc.encoding = "UTF-8"
         | 
| 87 | 
            -
                    doc
         | 
| 88 | 
            -
                  end
         | 
| 89 166 | 
             
                end
         | 
| 90 167 | 
             
              end
         | 
| 91 168 | 
             
            end
         | 
| @@ -28,6 +28,13 @@ module Nokogiri | |
| 28 28 | 
             
                  attr_accessor :document
         | 
| 29 29 | 
             
                  attr_accessor :errors
         | 
| 30 30 |  | 
| 31 | 
            +
                  # Get the parser's quirks mode value. See HTML5::QuirksMode.
         | 
| 32 | 
            +
                  #
         | 
| 33 | 
            +
                  # This method returns `nil` if the parser was not invoked (e.g., `Nokogiri::HTML5::DocumentFragment.new(doc)`).
         | 
| 34 | 
            +
                  #
         | 
| 35 | 
            +
                  # Since v1.14.0
         | 
| 36 | 
            +
                  attr_reader :quirks_mode
         | 
| 37 | 
            +
             | 
| 31 38 | 
             
                  # Create a document fragment.
         | 
| 32 39 | 
             
                  def initialize(doc, tags = nil, ctx = nil, options = {})
         | 
| 33 40 | 
             
                    self.document = doc
         | 
| @@ -41,10 +48,10 @@ module Nokogiri | |
| 41 48 | 
             
                    Nokogiri::Gumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
         | 
| 42 49 | 
             
                  end
         | 
| 43 50 |  | 
| 44 | 
            -
                  def serialize(options = {}, &block)
         | 
| 51 | 
            +
                  def serialize(options = {}, &block) # :nodoc:
         | 
| 45 52 | 
             
                    # Bypass XML::Document.serialize which doesn't support options even
         | 
| 46 53 | 
             
                    # though XML::Node.serialize does!
         | 
| 47 | 
            -
                    XML::Node.instance_method(:serialize). | 
| 54 | 
            +
                    XML::Node.instance_method(:serialize).bind_call(self, options, &block)
         | 
| 48 55 | 
             
                  end
         | 
| 49 56 |  | 
| 50 57 | 
             
                  # Parse a document fragment from +tags+, returning a Nodeset.
         |