RubyGems - nokogiri - Versions diffs - 1.5.0 → 1.5.1.rc1 - Mend

nokogiri 1.5.0 → 1.5.1.rc1

Potentially problematic release.

This version of nokogiri might be problematic. Click here for more details.

Files changed (61) hide show

data/CHANGELOG.ja.rdoc +39 -12
data/CHANGELOG.rdoc +28 -0
data/C_CODING_STYLE.rdoc +27 -0
data/Manifest.txt +4 -0
data/README.rdoc +11 -7
data/Rakefile +42 -29
data/bin/nokogiri +10 -2
data/ext/nokogiri/extconf.rb +9 -1
data/ext/nokogiri/html_document.c +16 -0
data/ext/nokogiri/html_sax_parser_context.c +59 -37
data/ext/nokogiri/html_sax_push_parser.c +87 -0
data/ext/nokogiri/html_sax_push_parser.h +9 -0
data/ext/nokogiri/nokogiri.c +6 -8
data/ext/nokogiri/nokogiri.h +3 -0
data/ext/nokogiri/xml_document.c +101 -3
data/ext/nokogiri/xml_document.h +3 -3
data/ext/nokogiri/xml_node.c +150 -58
data/ext/nokogiri/xml_node_set.c +169 -120
data/ext/nokogiri/xml_node_set.h +5 -0
data/ext/nokogiri/xml_sax_parser_context.c +64 -41
data/ext/nokogiri/xml_text.c +2 -0
data/ext/nokogiri/xml_xpath_context.c +30 -24
data/ext/nokogiri/xslt_stylesheet.c +62 -16
data/ext/nokogiri/xslt_stylesheet.h +5 -0
data/lib/nokogiri/css/parser.rb +165 -159
data/lib/nokogiri/css/parser.y +6 -3
data/lib/nokogiri/css/tokenizer.rb +1 -1
data/lib/nokogiri/css/tokenizer.rex +1 -1
data/lib/nokogiri/html.rb +1 -0
data/lib/nokogiri/html/document.rb +82 -42
data/lib/nokogiri/html/sax/push_parser.rb +16 -0
data/lib/nokogiri/version.rb +1 -1
data/lib/nokogiri/xml.rb +6 -0
data/lib/nokogiri/xml/builder.rb +7 -1
data/lib/nokogiri/xml/document.rb +32 -17
data/lib/nokogiri/xml/document_fragment.rb +6 -1
data/lib/nokogiri/xml/node.rb +40 -9
data/lib/nokogiri/xslt.rb +5 -1
data/tasks/cross_compile.rb +1 -0
data/tasks/nokogiri.org.rb +6 -0
data/tasks/test.rb +1 -0
data/test/css/test_xpath_visitor.rb +6 -0
data/test/helper.rb +1 -0
data/test/html/test_document.rb +26 -0
data/test/html/test_document_fragment.rb +1 -2
data/test/test_memory_leak.rb +81 -1
data/test/test_xslt_transforms.rb +152 -123
data/test/xml/test_builder.rb +24 -2
data/test/xml/test_c14n.rb +151 -0
data/test/xml/test_document.rb +48 -0
data/test/xml/test_namespace.rb +5 -0
data/test/xml/test_node.rb +82 -1
data/test/xml/test_node_attributes.rb +19 -0
data/test/xml/test_node_inheritance.rb +32 -0
data/test/xml/test_node_reparenting.rb +32 -0
data/test/xml/test_node_set.rb +16 -8
data/test/xml/test_reader_encoding.rb +16 -0
data/test/xml/test_unparented_node.rb +24 -0
data/test/xml/test_xinclude.rb +83 -0
data/test/xml/test_xpath.rb +22 -0
metadata +208 -241

@@ -69,6 +69,10 @@ rule
     : '.' IDENT { result = Node.new(:CLASS_CONDITION, [val[1]]) }
     ;
   element_name
+    : namespaced_ident
+    | '*' { result = Node.new(:ELEMENT_NAME, val) }
+    ;
+  namespaced_ident
     : namespace '|' IDENT {
         result = Node.new(:ELEMENT_NAME,
           [[val.first, val.last].compact.join(':')]
@@ -78,16 +82,15 @@ rule
         name = @namespaces.key?('xmlns') ? "xmlns:#{val.first}" : val.first
         result = Node.new(:ELEMENT_NAME, [name])
       }
-    | '*' { result = Node.new(:ELEMENT_NAME, val) }
     ;
   namespace
     : IDENT { result = val[0] }
     |
     ;
   attrib
-    : LSQUARE IDENT attrib_val_0or1 RSQUARE {
+    : LSQUARE namespaced_ident attrib_val_0or1 RSQUARE {
         result = Node.new(:ATTRIBUTE_CONDITION,
-          [Node.new(:ELEMENT_NAME, [val[1]])] + (val[2] || [])
+          [val[1]] + (val[2] || [])
         )
       }
     | LSQUARE function attrib_val_0or1 RSQUARE {

data/lib/nokogiri/css/tokenizer.rb CHANGED

@@ -6,7 +6,7 @@
 module Nokogiri
 module CSS
-class Tokenizer
+class Tokenizer # :nodoc:
   require 'strscan'
   class ScanError < StandardError ; end

data/lib/nokogiri/css/tokenizer.rex CHANGED

@@ -1,6 +1,6 @@
 module Nokogiri
 module CSS
-class Tokenizer
+class Tokenizer # :nodoc:
 macro
   nl        \n|\r\n|\r|\f

data/lib/nokogiri/html.rb CHANGED

@@ -3,6 +3,7 @@ require 'nokogiri/html/document'
 require 'nokogiri/html/document_fragment'
 require 'nokogiri/html/sax/parser_context'
 require 'nokogiri/html/sax/parser'
+require 'nokogiri/html/sax/push_parser'
 require 'nokogiri/html/element_description'
 require 'nokogiri/html/element_description_defaults'

data/lib/nokogiri/html/document.rb CHANGED

@@ -19,7 +19,9 @@ module Nokogiri
       def meta_content_type
         css('meta[@http-equiv]').find { |node|
-          node['http-equiv'] =~ /\AContent-Type\z/i
+          node['http-equiv'] =~ /\AContent-Type\z/i and
+            !node['content'].nil? and
+            !node['content'].empty?
         }
       end
       private :meta_content_type
@@ -92,17 +94,22 @@ module Nokogiri
           if string_or_io.respond_to?(:read)
             url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
             if !encoding
-              # Perform advanced encoding detection that libxml2 does
-              # not do.
+              # Libxml2's parser has poor support for encoding
+              # detection.  First, it does not recognize the HTML5
+              # style meta charset declaration.  Secondly, even if it
+              # successfully detects an encoding hint, it does not
+              # re-decode or re-parse the preceding part which may be
+              # garbled.
+              #
+              # EncodingReader aims to perform advanced encoding
+              # detection beyond what Libxml2 does, and to emulate
+              # rewinding of a stream and make Libxml2 redo parsing
+              # from the start when an encoding hint is found.
               string_or_io = EncodingReader.new(string_or_io)
               begin
                 return read_io(string_or_io, url, encoding, options.to_i)
-              rescue EncodingFoundException => e
-                # A retry is required because libxml2 has a problem in
-                # that it cannot switch encoding well in the middle of
-                # parsing, especially if it has already seen a
-                # non-ASCII character when it finds an encoding hint.
-                encoding = e.encoding
+              rescue EncodingFound => e
+                encoding = e.found_encoding
               end
             end
             return read_io(string_or_io, url, encoding, options.to_i)
@@ -111,19 +118,17 @@ module Nokogiri
           # read_memory pukes on empty docs
           return new if string_or_io.nil? or string_or_io.empty?
-          if !encoding
-            encoding = EncodingReader.detect_encoding(string_or_io)
-          end
+          encoding ||= EncodingReader.detect_encoding(string_or_io)
           read_memory(string_or_io, url, encoding, options.to_i)
         end
       end
-      class EncodingFoundException < Exception # :nodoc:
-        attr_reader :encoding
+      class EncodingFound < StandardError # :nodoc:
+        attr_reader :found_encoding
         def initialize(encoding)
-          @encoding = encoding
+          @found_encoding = encoding
           super("encoding found: %s" % encoding)
         end
       end
@@ -131,57 +136,91 @@ module Nokogiri
       class EncodingReader # :nodoc:
         class SAXHandler < Nokogiri::XML::SAX::Document # :nodoc:
           attr_reader :encoding
-          def found(encoding)
-            @encoding = encoding
-            throw :found
+          def initialize
+            @encoding = nil
+            super()
           end
-          def not_found(encoding)
-            found nil
+          def start_element(name, attrs = [])
+            return unless name == 'meta'
+            attr = Hash[attrs]
+            charset = attr['charset'] and
+              @encoding = charset
+            http_equiv = attr['http-equiv'] and
+              http_equiv.match(/\AContent-Type\z/i) and
+              content = attr['content'] and
+              m = content.match(/;\s*charset\s*=\s*([\w-]+)/) and
+              @encoding = m[1]
+          end
+        end
+        class JumpSAXHandler < SAXHandler
+          def initialize(jumptag)
+            @jumptag = jumptag
+            super()
           end
           def start_element(name, attrs = [])
-            case name
-            when /\A(?:div|h1|img|p|br)\z/
-              not_found
-            when 'meta'
-              attr = Hash[attrs]
-              charset = attr['charset'] and
-                found charset
-              http_equiv = attr['http-equiv'] and
-                http_equiv.match(/\AContent-Type\z/i) and
-                content = attr['content'] and
-                m = content.match(/;\s*charset\s*=\s*([\w-]+)/) and
-                found m[1]
-            end
+            super
+            throw @jumptag, @encoding if @encoding
+            throw @jumptag, nil if name =~ /\A(?:div|h1|img|p|br)\z/
           end
         end
         def self.detect_encoding(chunk)
+          if Nokogiri.jruby? && EncodingReader.is_jruby_without_fix?
+            return EncodingReader.detect_encoding_for_jruby_without_fix(chunk)
+          end
           m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
             return Nokogiri.XML(m[1]).encoding
           if Nokogiri.jruby?
             m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
               return m[4]
+            catch(:encoding_found) {
+              Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found.to_s)).parse(chunk)
+              nil
+            }
+          else
+            handler = SAXHandler.new
+            parser = Nokogiri::HTML::SAX::PushParser.new(handler)
+            parser << chunk rescue Nokogiri::SyntaxError
+            handler.encoding
           end
+        end
+        def self.is_jruby_without_fix?
+          JRUBY_VERSION.split('.').join.to_i < 165
+        end
-          handler = SAXHandler.new
-          parser = Nokogiri::HTML::SAX::Parser.new(handler)
-          catch(:found) {
-            parser.parse(chunk)
+        def self.detect_encoding_for_jruby_without_fix(chunk)
+          m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
+            return Nokogiri.XML(m[1]).encoding
+          m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
+            return m[4]
+          catch(:encoding_found) {
+            Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found.to_s)).parse(chunk)
+            nil
           }
-          handler.encoding
-        rescue
+        rescue Nokogiri::SyntaxError, RuntimeError
+          # Ignore parser errors that nokogiri may raise
           nil
         end
         def initialize(io)
           @io = io
           @firstchunk = nil
+          @encoding_found = nil
         end
+        # This method is used by the C extension so that
+        # Nokogiri::HTML::Document#read_io() does not leak memory when
+        # EncodingFound is raised.
+        attr_reader :encoding_found
         def read(len)
           # no support for a call without len
@@ -193,9 +232,10 @@ module Nokogiri
             # achieve advanced encoding detection.
             if encoding = EncodingReader.detect_encoding(@firstchunk)
               # The first chunk is stored for the next read in retry.
-              raise EncodingFoundException, encoding
+              raise @encoding_found = EncodingFound.new(encoding)
             end
           end
+          @encoding_found = nil
           ret = @firstchunk.slice!(0, len)
           if (len -= ret.length) > 0

data/lib/nokogiri/html/sax/push_parser.rb ADDED

@@ -0,0 +1,16 @@
+module Nokogiri
+  module HTML
+    module SAX
+      class PushParser
+        def initialize(doc = XML::SAX::Document.new, file_name = nil, encoding = 'UTF-8')
+          @document = doc
+          @encoding = encoding
+          @sax_parser = HTML::SAX::Parser.new(doc, @encoding)
+          ## Create our push parser context
+          initialize_native(@sax_parser, file_name, @encoding)
+        end
+      end
+    end
+  end
+end

data/lib/nokogiri/version.rb CHANGED

@@ -1,6 +1,6 @@
 module Nokogiri
   # The version of Nokogiri you are using
-  VERSION = '1.5.0'
+  VERSION = '1.5.1.rc1'
   class VersionInfo # :nodoc:
     def jruby?

data/lib/nokogiri/xml.rb CHANGED

@@ -35,6 +35,12 @@ module Nokogiri
   end
   module XML
+    # Original C14N 1.0 spec canonicalization
+    XML_C14N_1_0 =       0
+    # Exclusive C14N 1.0 spec canonicalization
+    XML_C14N_EXCLUSIVE_1_0 =     1
+    # C14N 1.1 spec canonicalization
+    XML_C14N_1_1 = 2
     class << self
       ###
       # Parse an XML document using the Nokogiri::XML::Reader API.  See

data/lib/nokogiri/xml/builder.rb CHANGED

@@ -305,7 +305,13 @@ module Nokogiri
       ###
       # Create a CDATA Node with content of +string+
       def cdata string
-        insert(doc.create_cdata(string))
+        insert doc.create_cdata(string)
+      end
+      ###
+      # Create a Comment Node with content of +string+
+      def comment string
+        insert doc.create_comment(string)
       end
       ###

data/lib/nokogiri/xml/document.rb CHANGED

@@ -8,6 +8,12 @@ module Nokogiri
     # For searching a Document, see Nokogiri::XML::Node#css and
     # Nokogiri::XML::Node#xpath
     class Document < Nokogiri::XML::Node
+      # I'm ignoring unicode characters here.
+      # See http://www.w3.org/TR/REC-xml-names/#ns-decl for more details.
+      NCNAME_START_CHAR = "A-Za-z_"
+      NCNAME_CHAR       = NCNAME_START_CHAR + "\\-.0-9"
+      NCNAME_RE         = /^xmlns(:[#{NCNAME_START_CHAR}][#{NCNAME_CHAR}]*)?$/
       ##
       # Parse an XML file.  +string_or_io+ may be a String, or any object that
       # responds to _read_ and _close_ such as an IO, or StringIO.
@@ -17,20 +23,23 @@ module Nokogiri
       # Nokogiri::XML::ParseOptions::RECOVER.  See the constants in
       # Nokogiri::XML::ParseOptions.
       def self.parse string_or_io, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML, &block
         options = Nokogiri::XML::ParseOptions.new(options) if Fixnum === options
         # Give the options to the user
         yield options if block_given?
-        if string_or_io.respond_to?(:read)
+        doc = if string_or_io.respond_to?(:read)
           url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
-          return read_io(string_or_io, url, encoding, options.to_i)
+          read_io(string_or_io, url, encoding, options.to_i)
+        else
+          # read_memory pukes on empty docs
+          return new if string_or_io.nil? or string_or_io.empty?
+          read_memory(string_or_io, url, encoding, options.to_i)
         end
-        # read_memory pukes on empty docs
-        return new if string_or_io.nil? or string_or_io.empty?
+        # do xinclude processing
+        doc.do_xinclude(options) if options.xinclude?
-        read_memory(string_or_io, url, encoding, options.to_i)
+        return doc
       end
       # A list of Nokogiri::XML::SyntaxError found when parsing a document
@@ -57,7 +66,7 @@ module Nokogiri
           when Hash
             arg.each { |k,v|
               key = k.to_s
-              if key =~ /^xmlns(:\w+)?$/
+              if key =~ NCNAME_RE
                 ns_name = key.split(":", 2)[1]
                 elm.add_namespace_definition ns_name, v
                 next
@@ -71,14 +80,19 @@ module Nokogiri
         elm
       end
-      # Create a text node with +text+
-      def create_text_node text, &block
-        Nokogiri::XML::Text.new(text.to_s, self, &block)
+      # Create a Text Node with +string+
+      def create_text_node string, &block
+        Nokogiri::XML::Text.new string.to_s, self, &block
+      end
+      # Create a CDATA Node containing +string+
+      def create_cdata string, &block
+        Nokogiri::XML::CDATA.new self, string.to_s, &block
       end
-      # Create a CDATA element containing +text+
-      def create_cdata text
-        Nokogiri::XML::CDATA.new(self, text.to_s)
+      # Create a Comment Node containing +string+
+      def create_comment string, &block
+        Nokogiri::XML::Comment.new self, string.to_s, &block
       end
       # The name of this document.  Always returns "document"
@@ -194,11 +208,12 @@ module Nokogiri
       undef_method :add_namespace_definition, :attributes
       undef_method :namespace_definitions, :line, :add_namespace
-      def add_child child
+      def add_child node_or_tags
         raise "Document already has a root node" if root
-        if child.type == Node::DOCUMENT_FRAG_NODE
-          raise "Document cannot have multiple root nodes" if child.children.size > 1
-          super(child.children.first)
+        node_or_tags = coerce(node_or_tags)
+        if node_or_tags.is_a?(XML::NodeSet)
+          raise "Document cannot have multiple root nodes" if node_or_tags.size > 1
+          super(node_or_tags.first)
         else
           super
         end

data/lib/nokogiri/xml/document_fragment.rb CHANGED

@@ -11,7 +11,12 @@ module Nokogiri
         return self unless tags
         children = if ctx
-                     ctx.parse(tags)
+                     # Fix for issue#490
+                     if Nokogiri.jruby?
+                       ctx.parse("<root>#{tags}</root>").xpath("/root/node()")
+                     else
+                       ctx.parse(tags)
+                     end
                    else
                      XML::Document.parse("<root>#{tags}</root>") \
                        .xpath("/root/node()")

data/lib/nokogiri/xml/node.rb CHANGED

@@ -255,6 +255,12 @@ module Nokogiri
         get(name.to_s)
       end
+      ###
+      # Set the attribute value for the attribute +name+ to +value+
+      def []= name, value
+        set name.to_s, value
+      end
       ###
       # Add +node_or_tags+ as a child of this Node.
       # +node_or_tags+ can be a Nokogiri::XML::Node, a ::DocumentFragment, a ::NodeSet, or a string containing markup.
@@ -291,6 +297,8 @@ module Nokogiri
       #
       # Also see related method +before+.
       def add_previous_sibling node_or_tags
+        raise ArgumentError.new("A document may not have multiple root nodes.") if parent.is_a?(XML::Document)
         node_or_tags = coerce(node_or_tags)
         if node_or_tags.is_a?(XML::NodeSet)
           if text?
@@ -315,6 +323,8 @@ module Nokogiri
       #
       # Also see related method +after+.
       def add_next_sibling node_or_tags
+        raise ArgumentError.new("A document may not have multiple root nodes.") if parent.is_a?(XML::Document)
         node_or_tags = coerce(node_or_tags)
         if node_or_tags.is_a?(XML::NodeSet)
           if text?
@@ -452,9 +462,9 @@ module Nokogiri
       # If you need to distinguish attributes with the same name, with different namespaces
       # use #attribute_nodes instead.
       def attributes
-        Hash[*(attribute_nodes.map { |node|
+        Hash[attribute_nodes.map { |node|
           [node.node_name, node]
-        }.flatten)]
+        }]
       end
       ###
@@ -471,9 +481,9 @@ module Nokogiri
       ###
       # Iterate over each attribute name and value pair for this Node.
-      def each &block
+      def each
         attribute_nodes.each { |node|
-          block.call([node.node_name, node.value])
+          yield [node.node_name, node.value]
         }
       end
@@ -555,7 +565,7 @@ module Nokogiri
       # default namespaces set on ancestor will NOT be, even if self
       # has no explicit default namespace.
       def namespaces
-        Hash[*namespace_scopes.map { |nd|
+        Hash[namespace_scopes.map { |nd|
           key = ['xmlns', nd.prefix].compact.join(':')
           if RUBY_VERSION >= '1.9' && document.encoding
             begin
@@ -564,7 +574,7 @@ module Nokogiri
             end
           end
           [key, nd.href]
-        }.flatten]
+        }]
       end
       # Returns true if this is a Comment
@@ -766,8 +776,7 @@ module Nokogiri
       #
       # See Node#write_to for a list of +options+
       def to_xml options = {}
-        options[:save_with] |= SaveOptions::DEFAULT_XML if options[:save_with]
-        options[:save_with] = SaveOptions::DEFAULT_XML unless options[:save_with]
+        options[:save_with] ||= SaveOptions::DEFAULT_XML
         serialize(options)
       end
@@ -865,6 +874,28 @@ module Nokogiri
         compare other
       end
+      ###
+      # Do xinclude substitution on the subtree below node. If given a block, a
+      # Nokogiri::XML::ParseOptions object initialized from +options+, will be
+      # passed to it, allowing more convenient modification of the parser options.
+      def do_xinclude options = XML::ParseOptions::DEFAULT_XML, &block
+        options = Nokogiri::XML::ParseOptions.new(options) if Fixnum === options
+        # give options to user
+        yield options if block_given?
+        # call c extension
+        process_xincludes(options.to_i)
+      end
+      def canonicalize(mode=XML::XML_C14N_1_0,inclusive_namespaces=nil,with_comments=false)
+        c14n_root = self
+        document.canonicalize(mode, inclusive_namespaces, with_comments) do |node, parent|
+          tn = node.is_a?(XML::Node) ? node : parent
+          tn == c14n_root || tn.ancestors.include?(c14n_root)
+        end
+      end
       private
       def extract_params params # :nodoc:
@@ -893,7 +924,7 @@ module Nokogiri
         return data.children           if data.is_a?(XML::DocumentFragment)
         return fragment(data).children if data.is_a?(String)
-        if data.is_a?(Document) || !data.is_a?(XML::Node)
+        if data.is_a?(Document) || data.is_a?(XML::Attr) || !data.is_a?(XML::Node)
           raise ArgumentError, <<-EOERR
 Requires a Node, NodeSet or String argument, and cannot accept a #{data.class}.
 (You probably want to select a node from the Document with at() or search(), or create a new Node via Node.new().)