RubyGems - nokogiri-maglev- - Version diff - 1.5.0.1 → 1.5.2 - Mend

nokogiri-maglev- 1.5.0.1 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61)

data/CHANGELOG.ja.rdoc +56 -12
data/CHANGELOG.rdoc +49 -0
data/C_CODING_STYLE.rdoc +27 -0
data/Manifest.txt +4 -0
data/README.rdoc +11 -7
data/Rakefile +42 -27
data/bin/nokogiri +10 -2
data/ext/nokogiri/extconf.rb +11 -3
data/ext/nokogiri/html_document.c +16 -0
data/ext/nokogiri/html_sax_parser_context.c +59 -37
data/ext/nokogiri/html_sax_push_parser.c +87 -0
data/ext/nokogiri/html_sax_push_parser.h +9 -0
data/ext/nokogiri/nokogiri.c +7 -9
data/ext/nokogiri/nokogiri.h +3 -0
data/ext/nokogiri/xml_document.c +101 -3
data/ext/nokogiri/xml_document.h +3 -3
data/ext/nokogiri/xml_node.c +151 -58
data/ext/nokogiri/xml_node_set.c +169 -120
data/ext/nokogiri/xml_node_set.h +5 -0
data/ext/nokogiri/xml_sax_parser_context.c +64 -41
data/ext/nokogiri/xml_text.c +2 -0
data/ext/nokogiri/xml_xpath_context.c +31 -25
data/ext/nokogiri/xslt_stylesheet.c +62 -16
data/ext/nokogiri/xslt_stylesheet.h +5 -0
data/lib/nokogiri/css/parser.rb +165 -159
data/lib/nokogiri/css/parser.y +6 -3
data/lib/nokogiri/css/tokenizer.rb +1 -1
data/lib/nokogiri/css/tokenizer.rex +1 -1
data/lib/nokogiri/html.rb +1 -0
data/lib/nokogiri/html/document.rb +82 -42
data/lib/nokogiri/html/sax/push_parser.rb +16 -0
data/lib/nokogiri/version.rb +1 -1
data/lib/nokogiri/xml.rb +6 -0
data/lib/nokogiri/xml/builder.rb +7 -1
data/lib/nokogiri/xml/document.rb +32 -17
data/lib/nokogiri/xml/document_fragment.rb +6 -1
data/lib/nokogiri/xml/node.rb +40 -9
data/lib/nokogiri/xslt.rb +5 -1
data/tasks/cross_compile.rb +1 -0
data/tasks/nokogiri.org.rb +6 -0
data/tasks/test.rb +1 -0
data/test/css/test_xpath_visitor.rb +6 -0
data/test/helper.rb +1 -0
data/test/html/test_document.rb +26 -0
data/test/html/test_document_fragment.rb +1 -2
data/test/test_memory_leak.rb +81 -1
data/test/test_xslt_transforms.rb +152 -123
data/test/xml/test_builder.rb +24 -2
data/test/xml/test_c14n.rb +151 -0
data/test/xml/test_document.rb +48 -0
data/test/xml/test_namespace.rb +5 -0
data/test/xml/test_node.rb +82 -1
data/test/xml/test_node_attributes.rb +19 -0
data/test/xml/test_node_inheritance.rb +32 -0
data/test/xml/test_node_reparenting.rb +32 -0
data/test/xml/test_node_set.rb +16 -8
data/test/xml/test_reader_encoding.rb +16 -0
data/test/xml/test_unparented_node.rb +32 -0
data/test/xml/test_xinclude.rb +83 -0
data/test/xml/test_xpath.rb +22 -0
metadata +208 -241

data/lib/nokogiri/css/parser.y CHANGED Viewed

@@ -69,6 +69,10 @@ rule
     : '.' IDENT { result = Node.new(:CLASS_CONDITION, [val[1]]) }
     ;
   element_name
+    : namespaced_ident
+    | '*' { result = Node.new(:ELEMENT_NAME, val) }
+    ;
+  namespaced_ident
     : namespace '|' IDENT {
         result = Node.new(:ELEMENT_NAME,
           [[val.first, val.last].compact.join(':')]
@@ -78,16 +82,15 @@ rule
         name = @namespaces.key?('xmlns') ? "xmlns:#{val.first}" : val.first
         result = Node.new(:ELEMENT_NAME, [name])
       }
-    | '*' { result = Node.new(:ELEMENT_NAME, val) }
     ;
   namespace
     : IDENT { result = val[0] }
     |
     ;
   attrib
-    : LSQUARE IDENT attrib_val_0or1 RSQUARE {
+    : LSQUARE namespaced_ident attrib_val_0or1 RSQUARE {
         result = Node.new(:ATTRIBUTE_CONDITION,
-          [Node.new(:ELEMENT_NAME, [val[1]])] + (val[2] || [])
+          [val[1]] + (val[2] || [])
         )
       }
     | LSQUARE function attrib_val_0or1 RSQUARE {

data/lib/nokogiri/css/tokenizer.rb CHANGED Viewed

@@ -6,7 +6,7 @@
 module Nokogiri
 module CSS
-class Tokenizer
+class Tokenizer # :nodoc:
   require 'strscan'
   class ScanError < StandardError ; end

data/lib/nokogiri/css/tokenizer.rex CHANGED Viewed

@@ -1,6 +1,6 @@
 module Nokogiri
 module CSS
-class Tokenizer
+class Tokenizer # :nodoc:
 macro
   nl        \n|\r\n|\r|\f

data/lib/nokogiri/html.rb CHANGED Viewed

@@ -3,6 +3,7 @@ require 'nokogiri/html/document'
 require 'nokogiri/html/document_fragment'
 require 'nokogiri/html/sax/parser_context'
 require 'nokogiri/html/sax/parser'
+require 'nokogiri/html/sax/push_parser'
 require 'nokogiri/html/element_description'
 require 'nokogiri/html/element_description_defaults'

data/lib/nokogiri/html/document.rb CHANGED Viewed

@@ -19,7 +19,9 @@ module Nokogiri
       def meta_content_type
         css('meta[@http-equiv]').find { |node|
-          node['http-equiv'] =~ /\AContent-Type\z/i
+          node['http-equiv'] =~ /\AContent-Type\z/i and
+            !node['content'].nil? and
+            !node['content'].empty?
         }
       end
       private :meta_content_type
@@ -92,17 +94,22 @@ module Nokogiri
           if string_or_io.respond_to?(:read)
             url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
             if !encoding
-              # Perform advanced encoding detection that libxml2 does
-              # not do.
+              # Libxml2's parser has poor support for encoding
+              # detection.  First, it does not recognize the HTML5
+              # style meta charset declaration.  Secondly, even if it
+              # successfully detects an encoding hint, it does not
+              # re-decode or re-parse the preceding part which may be
+              # garbled.
+              #
+              # EncodingReader aims to perform advanced encoding
+              # detection beyond what Libxml2 does, and to emulate
+              # rewinding of a stream and make Libxml2 redo parsing
+              # from the start when an encoding hint is found.
               string_or_io = EncodingReader.new(string_or_io)
               begin
                 return read_io(string_or_io, url, encoding, options.to_i)
-              rescue EncodingFoundException => e
-                # A retry is required because libxml2 has a problem in
-                # that it cannot switch encoding well in the middle of
-                # parsing, especially if it has already seen a
-                # non-ASCII character when it finds an encoding hint.
-                encoding = e.encoding
+              rescue EncodingFound => e
+                encoding = e.found_encoding
               end
             end
             return read_io(string_or_io, url, encoding, options.to_i)
@@ -111,19 +118,17 @@ module Nokogiri
           # read_memory pukes on empty docs
           return new if string_or_io.nil? or string_or_io.empty?
-          if !encoding
-            encoding = EncodingReader.detect_encoding(string_or_io)
-          end
+          encoding ||= EncodingReader.detect_encoding(string_or_io)
           read_memory(string_or_io, url, encoding, options.to_i)
         end
       end
-      class EncodingFoundException < Exception # :nodoc:
-        attr_reader :encoding
+      class EncodingFound < StandardError # :nodoc:
+        attr_reader :found_encoding
         def initialize(encoding)
-          @encoding = encoding
+          @found_encoding = encoding
           super("encoding found: %s" % encoding)
         end
       end
@@ -131,57 +136,91 @@ module Nokogiri
       class EncodingReader # :nodoc:
         class SAXHandler < Nokogiri::XML::SAX::Document # :nodoc:
           attr_reader :encoding
-          def found(encoding)
-            @encoding = encoding
-            throw :found
+          def initialize
+            @encoding = nil
+            super()
           end
-          def not_found(encoding)
-            found nil
+          def start_element(name, attrs = [])
+            return unless name == 'meta'
+            attr = Hash[attrs]
+            charset = attr['charset'] and
+              @encoding = charset
+            http_equiv = attr['http-equiv'] and
+              http_equiv.match(/\AContent-Type\z/i) and
+              content = attr['content'] and
+              m = content.match(/;\s*charset\s*=\s*([\w-]+)/) and
+              @encoding = m[1]
+          end
+        end
+        class JumpSAXHandler < SAXHandler
+          def initialize(jumptag)
+            @jumptag = jumptag
+            super()
           end
           def start_element(name, attrs = [])
-            case name
-            when /\A(?:div|h1|img|p|br)\z/
-              not_found
-            when 'meta'
-              attr = Hash[attrs]
-              charset = attr['charset'] and
-                found charset
-              http_equiv = attr['http-equiv'] and
-                http_equiv.match(/\AContent-Type\z/i) and
-                content = attr['content'] and
-                m = content.match(/;\s*charset\s*=\s*([\w-]+)/) and
-                found m[1]
-            end
+            super
+            throw @jumptag, @encoding if @encoding
+            throw @jumptag, nil if name =~ /\A(?:div|h1|img|p|br)\z/
           end
         end
         def self.detect_encoding(chunk)
+          if Nokogiri.jruby? && EncodingReader.is_jruby_without_fix?
+            return EncodingReader.detect_encoding_for_jruby_without_fix(chunk)
+          end
           m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
             return Nokogiri.XML(m[1]).encoding
           if Nokogiri.jruby?
             m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
               return m[4]
+            catch(:encoding_found) {
+              Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found.to_s)).parse(chunk)
+              nil
+            }
+          else
+            handler = SAXHandler.new
+            parser = Nokogiri::HTML::SAX::PushParser.new(handler)
+            parser << chunk rescue Nokogiri::SyntaxError
+            handler.encoding
           end
+        end
+        def self.is_jruby_without_fix?
+          JRUBY_VERSION.split('.').join.to_i < 165
+        end
-          handler = SAXHandler.new
-          parser = Nokogiri::HTML::SAX::Parser.new(handler)
-          catch(:found) {
-            parser.parse(chunk)
+        def self.detect_encoding_for_jruby_without_fix(chunk)
+          m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
+            return Nokogiri.XML(m[1]).encoding
+          m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
+            return m[4]
+          catch(:encoding_found) {
+            Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found.to_s)).parse(chunk)
+            nil
           }
-          handler.encoding
-        rescue
+        rescue Nokogiri::SyntaxError, RuntimeError
+          # Ignore parser errors that nokogiri may raise
           nil
         end
         def initialize(io)
           @io = io
           @firstchunk = nil
+          @encoding_found = nil
         end
+        # This method is used by the C extension so that
+        # Nokogiri::HTML::Document#read_io() does not leak memory when
+        # EncodingFound is raised.
+        attr_reader :encoding_found
         def read(len)
           # no support for a call without len
@@ -193,9 +232,10 @@ module Nokogiri
             # achieve advanced encoding detection.
             if encoding = EncodingReader.detect_encoding(@firstchunk)
               # The first chunk is stored for the next read in retry.
-              raise EncodingFoundException, encoding
+              raise @encoding_found = EncodingFound.new(encoding)
             end
           end
+          @encoding_found = nil
           ret = @firstchunk.slice!(0, len)
           if (len -= ret.length) > 0

data/lib/nokogiri/html/sax/push_parser.rb ADDED Viewed

@@ -0,0 +1,16 @@
+module Nokogiri
+  module HTML
+    module SAX
+      class PushParser
+        def initialize(doc = XML::SAX::Document.new, file_name = nil, encoding = 'UTF-8')
+          @document = doc
+          @encoding = encoding
+          @sax_parser = HTML::SAX::Parser.new(doc, @encoding)
+          ## Create our push parser context
+          initialize_native(@sax_parser, file_name, @encoding)
+        end
+      end
+    end
+  end
+end

data/lib/nokogiri/version.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Nokogiri
   # The version of Nokogiri you are using
-  VERSION = '1.5.0.1'
+  VERSION = '1.5.2'
   class VersionInfo # :nodoc:
     def jruby?

data/lib/nokogiri/xml.rb CHANGED Viewed

@@ -35,6 +35,12 @@ module Nokogiri
   end
   module XML
+    # Original C14N 1.0 spec canonicalization
+    XML_C14N_1_0 =       0
+    # Exclusive C14N 1.0 spec canonicalization
+    XML_C14N_EXCLUSIVE_1_0 =     1
+    # C14N 1.1 spec canonicalization
+    XML_C14N_1_1 = 2
     class << self
       ###
       # Parse an XML document using the Nokogiri::XML::Reader API.  See

data/lib/nokogiri/xml/builder.rb CHANGED Viewed

@@ -306,7 +306,13 @@ module Nokogiri
       ###
       # Create a CDATA Node with content of +string+
       def cdata string
-        insert(doc.create_cdata(string))
+        insert doc.create_cdata(string)
+      end
+      ###
+      # Create a Comment Node with content of +string+
+      def comment string
+        insert doc.create_comment(string)
       end
       ###

data/lib/nokogiri/xml/document.rb CHANGED Viewed

@@ -8,6 +8,12 @@ module Nokogiri
     # For searching a Document, see Nokogiri::XML::Node#css and
     # Nokogiri::XML::Node#xpath
     class Document < Nokogiri::XML::Node
+      # I'm ignoring unicode characters here.
+      # See http://www.w3.org/TR/REC-xml-names/#ns-decl for more details.
+      NCNAME_START_CHAR = "A-Za-z_"
+      NCNAME_CHAR       = NCNAME_START_CHAR + "\\-.0-9"
+      NCNAME_RE         = /^xmlns(:[#{NCNAME_START_CHAR}][#{NCNAME_CHAR}]*)?$/
       ##
       # Parse an XML file.  +string_or_io+ may be a String, or any object that
       # responds to _read_ and _close_ such as an IO, or StringIO.
@@ -17,20 +23,23 @@ module Nokogiri
       # Nokogiri::XML::ParseOptions::RECOVER.  See the constants in
       # Nokogiri::XML::ParseOptions.
       def self.parse string_or_io, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML, &block
         options = Nokogiri::XML::ParseOptions.new(options) if Fixnum === options
         # Give the options to the user
         yield options if block_given?
-        if string_or_io.respond_to?(:read)
+        doc = if string_or_io.respond_to?(:read)
           url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
-          return read_io(string_or_io, url, encoding, options.to_i)
+          read_io(string_or_io, url, encoding, options.to_i)
+        else
+          # read_memory pukes on empty docs
+          return new if string_or_io.nil? or string_or_io.empty?
+          read_memory(string_or_io, url, encoding, options.to_i)
         end
-        # read_memory pukes on empty docs
-        return new if string_or_io.nil? or string_or_io.empty?
+        # do xinclude processing
+        doc.do_xinclude(options) if options.xinclude?
-        read_memory(string_or_io, url, encoding, options.to_i)
+        return doc
       end
       # A list of Nokogiri::XML::SyntaxError found when parsing a document
@@ -57,7 +66,7 @@ module Nokogiri
           when Hash
             arg.each { |k,v|
               key = k.to_s
-              if key =~ /^xmlns(:\w+)?$/
+              if key =~ NCNAME_RE
                 ns_name = key.split(":", 2)[1]
                 elm.add_namespace_definition ns_name, v
                 next
@@ -71,14 +80,19 @@ module Nokogiri
         elm
       end
-      # Create a text node with +text+
-      def create_text_node text, &block
-        Nokogiri::XML::Text.new(text.to_s, self, &block)
+      # Create a Text Node with +string+
+      def create_text_node string, &block
+        Nokogiri::XML::Text.new string.to_s, self, &block
+      end
+      # Create a CDATA Node containing +string+
+      def create_cdata string, &block
+        Nokogiri::XML::CDATA.new self, string.to_s, &block
       end
-      # Create a CDATA element containing +text+
-      def create_cdata text
-        Nokogiri::XML::CDATA.new(self, text.to_s)
+      # Create a Comment Node containing +string+
+      def create_comment string, &block
+        Nokogiri::XML::Comment.new self, string.to_s, &block
       end
       # The name of this document.  Always returns "document"
@@ -194,11 +208,12 @@ module Nokogiri
       undef_method :add_namespace_definition, :attributes
       undef_method :namespace_definitions, :line, :add_namespace
-      def add_child child
+      def add_child node_or_tags
         raise "Document already has a root node" if root
-        if child.type == Node::DOCUMENT_FRAG_NODE
-          raise "Document cannot have multiple root nodes" if child.children.size > 1
-          super(child.children.first)
+        node_or_tags = coerce(node_or_tags)
+        if node_or_tags.is_a?(XML::NodeSet)
+          raise "Document cannot have multiple root nodes" if node_or_tags.size > 1
+          super(node_or_tags.first)
         else
           super
         end

data/lib/nokogiri/xml/document_fragment.rb CHANGED Viewed

@@ -11,7 +11,12 @@ module Nokogiri
         return self unless tags
         children = if ctx
-                     ctx.parse(tags)
+                     # Fix for issue#490
+                     if Nokogiri.jruby?
+                       ctx.parse("<root>#{tags}</root>").xpath("/root/node()")
+                     else
+                       ctx.parse(tags)
+                     end
                    else
                      XML::Document.parse("<root>#{tags}</root>") \
                        .xpath("/root/node()")

data/lib/nokogiri/xml/node.rb CHANGED Viewed

@@ -255,6 +255,12 @@ module Nokogiri
         get(name.to_s)
       end
+      ###
+      # Set the attribute value for the attribute +name+ to +value+
+      def []= name, value
+        set name.to_s, value
+      end
       ###
       # Add +node_or_tags+ as a child of this Node.
       # +node_or_tags+ can be a Nokogiri::XML::Node, a ::DocumentFragment, a ::NodeSet, or a string containing markup.
@@ -291,6 +297,8 @@ module Nokogiri
       #
       # Also see related method +before+.
       def add_previous_sibling node_or_tags
+        raise ArgumentError.new("A document may not have multiple root nodes.") if parent.is_a?(XML::Document) && !node_or_tags.is_a?(XML::ProcessingInstruction)
         node_or_tags = coerce(node_or_tags)
         if node_or_tags.is_a?(XML::NodeSet)
           if text?
@@ -315,6 +323,8 @@ module Nokogiri
       #
       # Also see related method +after+.
       def add_next_sibling node_or_tags
+        raise ArgumentError.new("A document may not have multiple root nodes.") if parent.is_a?(XML::Document)
         node_or_tags = coerce(node_or_tags)
         if node_or_tags.is_a?(XML::NodeSet)
           if text?
@@ -452,9 +462,9 @@ module Nokogiri
       # If you need to distinguish attributes with the same name, with different namespaces
       # use #attribute_nodes instead.
       def attributes
-        Hash[*(attribute_nodes.map { |node|
+        Hash[attribute_nodes.map { |node|
           [node.node_name, node]
-        }.flatten)]
+        }]
       end
       ###
@@ -471,9 +481,9 @@ module Nokogiri
       ###
       # Iterate over each attribute name and value pair for this Node.
-      def each &block
+      def each
         attribute_nodes.each { |node|
-          block.call([node.node_name, node.value])
+          yield [node.node_name, node.value]
         }
       end
@@ -555,7 +565,7 @@ module Nokogiri
       # default namespaces set on ancestor will NOT be, even if self
       # has no explicit default namespace.
       def namespaces
-        Hash[*namespace_scopes.map { |nd|
+        Hash[namespace_scopes.map { |nd|
           key = ['xmlns', nd.prefix].compact.join(':')
           if RUBY_VERSION >= '1.9' && document.encoding
             begin
@@ -564,7 +574,7 @@ module Nokogiri
             end
           end
           [key, nd.href]
-        }.flatten]
+        }]
       end
       # Returns true if this is a Comment
@@ -766,8 +776,7 @@ module Nokogiri
       #
       # See Node#write_to for a list of +options+
       def to_xml options = {}
-        options[:save_with] |= SaveOptions::DEFAULT_XML if options[:save_with]
-        options[:save_with] = SaveOptions::DEFAULT_XML unless options[:save_with]
+        options[:save_with] ||= SaveOptions::DEFAULT_XML
         serialize(options)
       end
@@ -865,6 +874,28 @@ module Nokogiri
         compare other
       end
+      ###
+      # Do xinclude substitution on the subtree below node. If given a block, a
+      # Nokogiri::XML::ParseOptions object initialized from +options+, will be
+      # passed to it, allowing more convenient modification of the parser options.
+      def do_xinclude options = XML::ParseOptions::DEFAULT_XML, &block
+        options = Nokogiri::XML::ParseOptions.new(options) if Fixnum === options
+        # give options to user
+        yield options if block_given?
+        # call c extension
+        process_xincludes(options.to_i)
+      end
+      def canonicalize(mode=XML::XML_C14N_1_0,inclusive_namespaces=nil,with_comments=false)
+        c14n_root = self
+        document.canonicalize(mode, inclusive_namespaces, with_comments) do |node, parent|
+          tn = node.is_a?(XML::Node) ? node : parent
+          tn == c14n_root || tn.ancestors.include?(c14n_root)
+        end
+      end
       private
       def extract_params params # :nodoc:
@@ -893,7 +924,7 @@ module Nokogiri
         return data.children           if data.is_a?(XML::DocumentFragment)
         return fragment(data).children if data.is_a?(String)
-        if data.is_a?(Document) || !data.is_a?(XML::Node)
+        if data.is_a?(Document) || data.is_a?(XML::Attr) || !data.is_a?(XML::Node)
           raise ArgumentError, <<-EOERR
 Requires a Node, NodeSet or String argument, and cannot accept a #{data.class}.
 (You probably want to select a node from the Document with at() or search(), or create a new Node via Node.new().)