RubyGems - nokogiri - Versions diffs - 1.4.4 → 1.4.5 - Mend

nokogiri 1.4.4 → 1.4.5

Potentially problematic release.

This version of nokogiri might be problematic. Click here for more details.

Files changed (54) hide show

data/.gemtest +0 -0
data/CHANGELOG.ja.rdoc +16 -0
data/CHANGELOG.rdoc +23 -1
data/Manifest.txt +4 -3
data/Rakefile +41 -37
data/ext/nokogiri/xml_document.c +9 -0
data/ext/nokogiri/xml_io.c +32 -7
data/ext/nokogiri/xml_node.c +14 -13
data/ext/nokogiri/xml_sax_parser.c +4 -2
data/ext/nokogiri/xslt_stylesheet.c +9 -3
data/lib/nokogiri/css.rb +6 -3
data/lib/nokogiri/css/parser.rb +665 -70
data/lib/nokogiri/css/parser.y +3 -1
data/lib/nokogiri/css/parser_extras.rb +91 -0
data/lib/nokogiri/css/tokenizer.rb +148 -3
data/lib/nokogiri/css/tokenizer.rex +1 -1
data/lib/nokogiri/ffi/structs/xml_attr.rb +2 -1
data/lib/nokogiri/ffi/structs/xml_node_set.rb +1 -1
data/lib/nokogiri/ffi/weak_bucket.rb +10 -10
data/lib/nokogiri/ffi/xml/document.rb +8 -0
data/lib/nokogiri/ffi/xml/node_set.rb +1 -0
data/lib/nokogiri/ffi/xml/sax/parser.rb +9 -1
data/lib/nokogiri/ffi/xslt/stylesheet.rb +4 -0
data/lib/nokogiri/html/document.rb +134 -15
data/lib/nokogiri/html/sax/parser.rb +6 -2
data/lib/nokogiri/version.rb +6 -1
data/lib/nokogiri/xml/node.rb +8 -23
data/lib/nokogiri/xml/node/save_options.rb +10 -0
data/lib/nokogiri/xml/node_set.rb +1 -1
data/lib/nokogiri/xml/parse_options.rb +8 -0
data/lib/nokogiri/xml/reader.rb +6 -6
data/lib/nokogiri/xml/sax/document.rb +2 -2
data/lib/nokogiri/xml/schema.rb +7 -1
data/tasks/cross_compile.rb +8 -15
data/test/css/test_tokenizer.rb +8 -0
data/test/files/encoding.html +82 -0
data/test/files/encoding.xhtml +84 -0
data/test/helper.rb +2 -0
data/test/html/sax/test_parser.rb +45 -0
data/test/html/test_document.rb +55 -0
data/test/html/test_document_encoding.rb +46 -0
data/test/html/test_element_description.rb +1 -1
data/test/test_memory_leak.rb +20 -0
data/test/test_reader.rb +13 -0
data/test/test_xslt_transforms.rb +6 -2
data/test/xml/sax/test_parser.rb +16 -0
data/test/xml/test_document.rb +3 -1
data/test/xml/test_node.rb +13 -1
data/test/xml/test_node_set.rb +10 -0
data/test/xml/test_schema.rb +5 -0
metadata +94 -109
data/deps.rip +0 -5
data/lib/nokogiri/css/generated_parser.rb +0 -676
data/lib/nokogiri/css/generated_tokenizer.rb +0 -145

@@ -1,4 +1,4 @@
-class Nokogiri::CSS::GeneratedParser
+class Nokogiri::CSS::Parser
 token FUNCTION INCLUDES DASHMATCH LBRACE HASH PLUS GREATER S STRING IDENT
 token COMMA NUMBER PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH TILDE NOT_EQUAL
@@ -233,3 +233,5 @@ end
 ---- header
+require 'nokogiri/css/parser_extras'

data/lib/nokogiri/css/parser_extras.rb ADDED

@@ -0,0 +1,91 @@
+require 'thread'
+module Nokogiri
+  module CSS
+    class Parser < Racc::Parser
+      @cache_on = true
+      @cache    = {}
+      @mutex    = Mutex.new
+      class << self
+        # Turn on CSS parse caching
+        attr_accessor :cache_on
+        alias :cache_on? :cache_on
+        alias :set_cache :cache_on=
+        # Get the css selector in +string+ from the cache
+        def [] string
+          return unless @cache_on
+          @mutex.synchronize { @cache[string] }
+        end
+        # Set the css selector in +string+ in the cache to +value+
+        def []= string, value
+          return value unless @cache_on
+          @mutex.synchronize { @cache[string] = value }
+        end
+        # Clear the cache
+        def clear_cache
+          @mutex.synchronize { @cache = {} }
+        end
+        # Execute +block+ without cache
+        def without_cache &block
+          tmp = @cache_on
+          @cache_on = false
+          block.call
+          @cache_on = tmp
+        end
+        ###
+        # Parse this CSS selector in +selector+.  Returns an AST.
+        def parse selector
+          @warned ||= false
+          unless @warned
+            $stderr.puts('Nokogiri::CSS::Parser.parse is deprecated, call Nokogiri::CSS.parse(), this will be removed August 1st or version 1.4.0 (whichever is first)')
+            @warned = true
+          end
+          new.parse selector
+        end
+      end
+      # Create a new CSS parser with respect to +namespaces+
+      def initialize namespaces = {}
+        @tokenizer  = Tokenizer.new
+        @namespaces = namespaces
+        super()
+      end
+      def parse string
+        @tokenizer.scan_setup string
+        do_parse
+      end
+      def next_token
+        @tokenizer.next_token
+      end
+      # Get the xpath for +string+ using +options+
+      def xpath_for string, options={}
+        key = "#{string}#{options[:ns]}#{options[:prefix]}"
+        v = self.class[key]
+        return v if v
+        args = [
+          options[:prefix] || '//',
+          options[:visitor] || XPathVisitor.new
+        ]
+        self.class[key] = parse(string).map { |ast|
+          ast.to_xpath(*args)
+        }
+      end
+      # On CSS parser error, raise an exception
+      def on_error error_token_id, error_value, value_stack
+        after = value_stack.compact.last
+        raise SyntaxError.new("unexpected '#{error_value}' after '#{after}'")
+      end
+    end
+  end
+end

data/lib/nokogiri/css/tokenizer.rb CHANGED

@@ -1,7 +1,152 @@
+#--
+# DO NOT MODIFY!!!!
+# This file is automatically generated by rex 1.0.5
+# from lexical definition file "lib/nokogiri/css/tokenizer.rex".
+#++
 module Nokogiri
-  module CSS
-    class Tokenizer < GeneratedTokenizer
-      alias :scan :scan_setup
+module CSS
+class Tokenizer
+  require 'strscan'
+  class ScanError < StandardError ; end
+  attr_reader   :lineno
+  attr_reader   :filename
+  attr_accessor :state
+  def scan_setup(str)
+    @ss = StringScanner.new(str)
+    @lineno =  1
+    @state  = nil
+  end
+  def action
+    yield
+  end
+  def scan_str(str)
+    scan_setup(str)
+    do_parse
+  end
+  alias :scan :scan_str
+  def load_file( filename )
+    @filename = filename
+    open(filename, "r") do |f|
+      scan_setup(f.read)
     end
   end
+  def scan_file( filename )
+    load_file(filename)
+    do_parse
+  end
+  def next_token
+    return if @ss.eos?
+    # skips empty actions
+    until token = _next_token or @ss.eos?; end
+    token
+  end
+  def _next_token
+    text = @ss.peek(1)
+    @lineno  +=  1  if text == "\n"
+    token = case @state
+    when nil
+      case
+      when (text = @ss.scan(/has\([\s]*/))
+         action { [:HAS, text] }
+      when (text = @ss.scan(/[-@]?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*\([\s]*/))
+         action { [:FUNCTION, text] }
+      when (text = @ss.scan(/[-@]?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*/))
+         action { [:IDENT, text] }
+      when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])+/))
+         action { [:HASH, text] }
+      when (text = @ss.scan(/[\s]*~=[\s]*/))
+         action { [:INCLUDES, text] }
+      when (text = @ss.scan(/[\s]*\|=[\s]*/))
+         action { [:DASHMATCH, text] }
+      when (text = @ss.scan(/[\s]*\^=[\s]*/))
+         action { [:PREFIXMATCH, text] }
+      when (text = @ss.scan(/[\s]*\$=[\s]*/))
+         action { [:SUFFIXMATCH, text] }
+      when (text = @ss.scan(/[\s]*\*=[\s]*/))
+         action { [:SUBSTRINGMATCH, text] }
+      when (text = @ss.scan(/[\s]*!=[\s]*/))
+         action { [:NOT_EQUAL, text] }
+      when (text = @ss.scan(/[\s]*=[\s]*/))
+         action { [:EQUAL, text] }
+      when (text = @ss.scan(/[\s]*\)/))
+         action { [:RPAREN, text] }
+      when (text = @ss.scan(/[\s]*\[[\s]*/))
+         action { [:LSQUARE, text] }
+      when (text = @ss.scan(/[\s]*\]/))
+         action { [:RSQUARE, text] }
+      when (text = @ss.scan(/[\s]*\+[\s]*/))
+         action { [:PLUS, text] }
+      when (text = @ss.scan(/[\s]*>[\s]*/))
+         action { [:GREATER, text] }
+      when (text = @ss.scan(/[\s]*,[\s]*/))
+         action { [:COMMA, text] }
+      when (text = @ss.scan(/[\s]*~[\s]*/))
+         action { [:TILDE, text] }
+      when (text = @ss.scan(/\:not\([\s]*/))
+         action { [:NOT, text] }
+      when (text = @ss.scan(/-?([0-9]+|[0-9]*\.[0-9]+)/))
+         action { [:NUMBER, text] }
+      when (text = @ss.scan(/[\s]*\/\/[\s]*/))
+         action { [:DOUBLESLASH, text] }
+      when (text = @ss.scan(/[\s]*\/[\s]*/))
+         action { [:SLASH, text] }
+      when (text = @ss.scan(/U\+[0-9a-f?]{1,6}(-[0-9a-f]{1,6})?/))
+         action {[:UNICODE_RANGE, text] }
+      when (text = @ss.scan(/[\s]+/))
+         action { [:S, text] }
+      when (text = @ss.scan(/"([^\n\r\f"]|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*"|'([^\n\r\f']|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*'/))
+         action { [:STRING, text] }
+      when (text = @ss.scan(/./))
+         action { [text, text] }
+      else
+        text = @ss.string[@ss.pos .. -1]
+        raise  ScanError, "can not match: '" + text + "'"
+      end  # if
+    else
+      raise  ScanError, "undefined state: '" + state.to_s + "'"
+    end  # case state
+    token
+  end  # def _next_token
+end # class
+end
 end

data/lib/nokogiri/css/tokenizer.rex CHANGED

@@ -1,6 +1,6 @@
 module Nokogiri
 module CSS
-class GeneratedTokenizer < GeneratedParser
+class Tokenizer
 macro
   nl        \n|\r\n|\r|\f

data/lib/nokogiri/ffi/structs/xml_attr.rb CHANGED

@@ -11,7 +11,8 @@ module Nokogiri
         :parent,        :pointer,
         :next,          :pointer,
         :prev,          :pointer,
-        :doc,           :pointer
+        :doc,           :pointer,
+        :ns,            :pointer
         )
     end

data/lib/nokogiri/ffi/structs/xml_node_set.rb CHANGED

@@ -24,7 +24,7 @@ module Nokogiri
       end
       def nodeTab
-        self[:nodeTab].read_array_of_pointer(self[:nodeNr])
+        self[:nodeTab].null? ? [] : self[:nodeTab].read_array_of_pointer(self[:nodeNr])
       end
       def nodeTab=(array)

data/lib/nokogiri/ffi/weak_bucket.rb CHANGED

@@ -5,27 +5,27 @@ else
   require 'weakling'
   Nokogiri::VERSION_INFO['refs'] = "weakling"
 end
-require 'singleton'
 module Nokogiri
   class WeakBucket
-    include Singleton
     if Nokogiri::VERSION_INFO['refs'] == "weakling"
-      attr_accessor :bucket
-      def initialize
-        @bucket = Weakling::IdHash.new
-      end
+      @@bucket = Weakling::IdHash.new
+      @@semaphore = Mutex.new
       def WeakBucket.get_object(cstruct)
-        instance.bucket[cstruct.ruby_node_pointer]
+        @@semaphore.synchronize do
+          @@bucket[cstruct.ruby_node_pointer]
+        end
       end
       def WeakBucket.set_object(cstruct, object)
-        cstruct.ruby_node_pointer = instance.bucket.add(object)
+        @@semaphore.synchronize do
+          cstruct.ruby_node_pointer = @@bucket.add(object)
+        end
       end
     else
       def WeakBucket.get_object(cstruct)
         ptr = cstruct.ruby_node_pointer
         ptr != 0 ? ObjectSpace._id2ref(ptr) : nil

data/lib/nokogiri/ffi/xml/document.rb CHANGED

@@ -157,6 +157,14 @@ module Nokogiri
             LibXML.xmlFreeNsList(node.cstruct[:nsDef])
             node.cstruct[:nsDef] = nil
           end
+          unless node.cstruct[:properties].nil?
+            prop_ptr = node.cstruct[:properties]
+            while ! prop_ptr.null?
+              prop_cstruct = LibXML::XmlAttr.new(node.cstruct[:properties])
+              prop_cstruct[:ns] = nil unless prop_cstruct[:ns].nil?
+              prop_ptr = prop_cstruct[:next]
+            end
+          end
         end
       end

data/lib/nokogiri/ffi/xml/node_set.rb CHANGED

@@ -87,6 +87,7 @@ module Nokogiri
       end
       def unlink # :nodoc:
+        return if cstruct[:nodeNr] == 0
         nodetab = cstruct.nodeTab
         cstruct[:nodeNr].times do |j|
           node_cstruct = LibXML::XmlNode.new(nodetab[j])

data/lib/nokogiri/ffi/xml/sax/parser.rb CHANGED

@@ -48,7 +48,15 @@ module Nokogiri
         end
         def __internal__startElement(_, name, attributes)
-          attrs = attributes.null? ? [] : attributes.get_array_of_string(0)
+          attrs = []
+          offset = 0
+          if ! attributes.null?
+            while ! attributes.get_pointer(LibXML.pointer_offset(offset)).null? do
+              cons = attributes.get_array_of_string(LibXML.pointer_offset(offset), 2)
+              attrs << cons
+              offset += 2
+            end
+          end
           @document.start_element name, attrs
         end

data/lib/nokogiri/ffi/xslt/stylesheet.rb CHANGED

@@ -51,6 +51,10 @@ module Nokogiri
       end
       def transform(document, params=[]) # :nodoc:
+        unless document.kind_of? Nokogiri::XML::Document
+          raise ArgumentError, "argument must be a Nokogiri::XML::Document"
+        end
         params = params.to_a.flatten if params.is_a?(Hash)
         raise(TypeError) unless params.is_a?(Array)

data/lib/nokogiri/html/document.rb CHANGED

@@ -3,25 +3,44 @@ module Nokogiri
     class Document < Nokogiri::XML::Document
       ###
       # Get the meta tag encoding for this document.  If there is no meta tag,
-      # then nil is returned
+      # then nil is returned.
       def meta_encoding
-        return nil unless meta = css('meta').find { |node|
-          node['http-equiv'] =~ /Content-Type/i
-        }
-        /charset\s*=\s*([\w-]+)/i.match(meta['content'])[1]
+        meta = meta_content_type and
+          /charset\s*=\s*([\w-]+)/i.match(meta['content'])[1]
       end
       ###
       # Set the meta tag encoding for this document.  If there is no meta
-      # content tag, nil is returned and the encoding is not set.
+      # content tag, the encoding is not set.
       def meta_encoding= encoding
-        return nil unless meta = css('meta').find { |node|
-          node['http-equiv'] =~ /Content-Type/i
+        meta = meta_content_type and
+          meta['content'] = "text/html; charset=%s" % encoding
+      end
+      def meta_content_type
+        css('meta[@http-equiv]').find { |node|
+          node['http-equiv'] =~ /\AContent-Type\z/i
         }
+      end
+      private :meta_content_type
-        meta['content'] = "text/html; charset=%s" % encoding
-        encoding
+      ###
+      # Get the title string of this document.  Return nil if there is
+      # no title tag.
+      def title
+        title = at('title') and title.inner_text
+      end
+      ###
+      # Set the title string of this document.  If there is no head
+      # element, the title is not set.
+      def title=(text)
+        unless title = at('title')
+          head = at('head') or return nil
+          title = Nokogiri::XML::Node.new('title', self)
+          head << title
+        end
+        title.children = XML::Text.new(text, self)
       end
       ####
@@ -39,10 +58,7 @@ module Nokogiri
       #   end
       #
       def serialize options = {}
-        options[:save_with] ||= XML::Node::SaveOptions::FORMAT |
-            XML::Node::SaveOptions::AS_HTML |
-            XML::Node::SaveOptions::NO_DECLARATION |
-            XML::Node::SaveOptions::NO_EMPTY_TAGS
+        options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
         super
       end
@@ -75,16 +91,119 @@ module Nokogiri
           if string_or_io.respond_to?(:read)
             url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
+            if !encoding
+              # Perform further encoding detection that libxml2 does
+              # not do.
+              string_or_io = EncodingReader.new(string_or_io)
+              begin
+                return read_io(string_or_io, url, encoding, options.to_i)
+              rescue EncodingFoundException => e
+                # A retry is required because libxml2 has a problem in
+                # that it cannot switch encoding well in the middle of
+                # parsing, especially if it has already seen a
+                # non-ASCII character when it finds an encoding hint.
+                encoding = e.encoding
+              end
+            end
             return read_io(string_or_io, url, encoding, options.to_i)
           end
           # read_memory pukes on empty docs
           return new if string_or_io.nil? or string_or_io.empty?
+          if !encoding
+            encoding = EncodingReader.detect_encoding(string_or_io)
+          end
           read_memory(string_or_io, url, encoding, options.to_i)
         end
       end
+      class EncodingFoundException < Exception # :nodoc:
+        attr_reader :encoding # the encoding value given to the constructor
+        def initialize(encoding) # keeps +encoding+ and embeds it in the message
+          @encoding = encoding
+          super("encoding found: %s" % encoding)
+        end
+      end
+      class EncodingReader # :nodoc:
+        class SAXHandler < Nokogiri::XML::SAX::Document # :nodoc:
+          attr_reader :encoding
+          def found(encoding)
+            @encoding = encoding
+            throw :found
+          end
+          def not_found(encoding)
+            found nil
+          end
+          def start_element(name, attrs = [])
+            case name
+            when /\A(?:div|h1|img|p|br)\z/
+              not_found
+            when 'meta'
+              attr = Hash[attrs]
+              http_equiv = attr['http-equiv'] and
+                http_equiv.match(/\AContent-Type\z/i) and
+                content = attr['content'] and
+                m = content.match(/;\s*charset\s*=\s*([\w-]+)/) and
+                found m[1]
+            end
+          end
+        end
+        def self.detect_encoding(chunk)
+          m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
+            return Nokogiri.XML(m[1]).encoding
+          handler = SAXHandler.new
+          parser = Nokogiri::HTML::SAX::Parser.new(handler)
+          catch(:found) {
+            parser.parse(chunk)
+          }
+          handler.encoding
+        rescue => e
+          nil
+        end
+        def initialize(io)
+          @io = io
+          @firstchunk = nil
+        end
+        def read(len)
+          # no support for a call without len
+          if !@firstchunk
+            @firstchunk = @io.read(len) or return nil
+            # This implementation expects and assumes that the first
+            # call from htmlReadIO() is made with a length long enough
+            # (~1KB) to achieve further encoding detection that
+            # libxml2 does not do.
+            if encoding = EncodingReader.detect_encoding(@firstchunk)
+              raise EncodingFoundException, encoding
+            end
+            # This chunk is stored for the next read in retry.
+            return @firstchunk
+          end
+          ret = @firstchunk.slice!(0, len)
+          if (len -= ret.length) > 0
+            rest = @io.read(len) and ret << rest
+          end
+          if ret.empty?
+            nil
+          else
+            ret
+          end
+        end
+      end
     end
   end
 end