RubyGems - rexml - Versions diffs - 3.2.6 → 3.3.9 - Mend

rexml 3.2.6 → 3.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

checksums.yaml +4 -4
data/NEWS.md +370 -0
data/lib/rexml/attribute.rb +3 -2
data/lib/rexml/document.rb +5 -1
data/lib/rexml/element.rb +16 -31
data/lib/rexml/entity.rb +9 -48
data/lib/rexml/formatters/pretty.rb +1 -1
data/lib/rexml/functions.rb +1 -2
data/lib/rexml/node.rb +8 -4
data/lib/rexml/parseexception.rb +1 -0
data/lib/rexml/parsers/baseparser.rb +433 -265
data/lib/rexml/parsers/pullparser.rb +12 -0
data/lib/rexml/parsers/sax2parser.rb +16 -19
data/lib/rexml/parsers/streamparser.rb +16 -10
data/lib/rexml/parsers/treeparser.rb +9 -21
data/lib/rexml/rexml.rb +1 -1
data/lib/rexml/source.rb +134 -98
data/lib/rexml/text.rb +39 -17
data/lib/rexml/xpath_parser.rb +7 -3
metadata +6 -50

data/lib/rexml/parsers/pullparser.rb CHANGED Viewed

@@ -47,6 +47,18 @@ module REXML
         @listeners << listener
       end
+      def entity_expansion_count
+        @parser.entity_expansion_count
+      end
+      def entity_expansion_limit=( limit )
+        @parser.entity_expansion_limit = limit
+      end
+      def entity_expansion_text_limit=( limit )
+        @parser.entity_expansion_text_limit = limit
+      end
       def each
         while has_next?
           yield self.pull

data/lib/rexml/parsers/sax2parser.rb CHANGED Viewed

@@ -22,6 +22,18 @@ module REXML
         @parser.source
       end
+      def entity_expansion_count
+        @parser.entity_expansion_count
+      end
+      def entity_expansion_limit=( limit )
+        @parser.entity_expansion_limit = limit
+      end
+      def entity_expansion_text_limit=( limit )
+        @parser.entity_expansion_text_limit = limit
+      end
       def add_listener( listener )
         @parser.add_listener( listener )
       end
@@ -157,25 +169,8 @@ module REXML
               end
             end
           when :text
-            #normalized = @parser.normalize( event[1] )
-            #handle( :characters, normalized )
-            copy = event[1].clone
-            esub = proc { |match|
-              if @entities.has_key?($1)
-                @entities[$1].gsub(Text::REFERENCE, &esub)
-              else
-                match
-              end
-            }
-            copy.gsub!( Text::REFERENCE, &esub )
-            copy.gsub!( Text::NUMERICENTITY ) {|m|
-              m=$1
-              m = "0#{m}" if m[0] == ?x
-              [Integer(m)].pack('U*')
-            }
-            handle( :characters, copy )
+            unnormalized = @parser.unnormalize( event[1], @entities )
+            handle( :characters, unnormalized )
           when :entitydecl
             handle_entitydecl( event )
           when :processing_instruction, :comment, :attlistdecl,
@@ -264,6 +259,8 @@ module REXML
       end
       def get_namespace( prefix )
+        return nil if @namespace_stack.empty?
         uris = (@namespace_stack.find_all { |ns| not ns[prefix].nil? }) ||
           (@namespace_stack.find { |ns| not ns[nil].nil? })
         uris[-1][prefix] unless uris.nil? or 0 == uris.size

data/lib/rexml/parsers/streamparser.rb CHANGED Viewed

@@ -7,37 +7,42 @@ module REXML
       def initialize source, listener
         @listener = listener
         @parser = BaseParser.new( source )
-        @tag_stack = []
+        @entities = {}
       end
       def add_listener( listener )
         @parser.add_listener( listener )
       end
+      def entity_expansion_count
+        @parser.entity_expansion_count
+      end
+      def entity_expansion_limit=( limit )
+        @parser.entity_expansion_limit = limit
+      end
+      def entity_expansion_text_limit=( limit )
+        @parser.entity_expansion_text_limit = limit
+      end
       def parse
         # entity string
         while true
           event = @parser.pull
           case event[0]
           when :end_document
-            unless @tag_stack.empty?
-              tag_path = "/" + @tag_stack.join("/")
-              raise ParseException.new("Missing end tag for '#{tag_path}'",
-                                       @parser.source)
-            end
             return
           when :start_element
-            @tag_stack << event[1]
             attrs = event[2].each do |n, v|
               event[2][n] = @parser.unnormalize( v )
             end
             @listener.tag_start( event[1], attrs )
           when :end_element
             @listener.tag_end( event[1] )
-            @tag_stack.pop
           when :text
-            normalized = @parser.unnormalize( event[1] )
-            @listener.text( normalized )
+            unnormalized = @parser.unnormalize( event[1], @entities )
+            @listener.text( unnormalized )
           when :processing_instruction
             @listener.instruction( *event[1,2] )
           when :start_doctype
@@ -48,6 +53,7 @@ module REXML
           when :comment, :attlistdecl, :cdata, :xmldecl, :elementdecl
             @listener.send( event[0].to_s, *event[1..-1] )
           when :entitydecl, :notationdecl
+            @entities[ event[1] ] = event[2] if event.size == 3
             @listener.send( event[0].to_s, event[1..-1] )
           when :externalentity
             entity_reference = event[1]

data/lib/rexml/parsers/treeparser.rb CHANGED Viewed

@@ -15,8 +15,6 @@ module REXML
       end
       def parse
-        tag_stack = []
-        in_doctype = false
         entities = nil
         begin
           while true
@@ -24,32 +22,24 @@ module REXML
             #STDERR.puts "TREEPARSER GOT #{event.inspect}"
             case event[0]
             when :end_document
-              unless tag_stack.empty?
-                raise ParseException.new("No close tag for #{@build_context.xpath}",
-                                         @parser.source, @parser)
-              end
               return
             when :start_element
-              tag_stack.push(event[1])
               el = @build_context = @build_context.add_element( event[1] )
               event[2].each do |key, value|
                 el.attributes[key]=Attribute.new(key,value,self)
               end
             when :end_element
-              tag_stack.pop
               @build_context = @build_context.parent
             when :text
-              if not in_doctype
-                if @build_context[-1].instance_of? Text
-                  @build_context[-1] << event[1]
-                else
-                  @build_context.add(
-                    Text.new(event[1], @build_context.whitespace, nil, true)
-                  ) unless (
-                    @build_context.ignore_whitespace_nodes and
-                    event[1].strip.size==0
-                  )
-                end
+              if @build_context[-1].instance_of? Text
+                @build_context[-1] << event[1]
+              else
+                @build_context.add(
+                  Text.new(event[1], @build_context.whitespace, nil, true)
+                ) unless (
+                  @build_context.ignore_whitespace_nodes and
+                  event[1].strip.size==0
+                )
               end
             when :comment
               c = Comment.new( event[1] )
@@ -60,14 +50,12 @@ module REXML
             when :processing_instruction
               @build_context.add( Instruction.new( event[1], event[2] ) )
             when :end_doctype
-              in_doctype = false
               entities.each { |k,v| entities[k] = @build_context.entities[k].value }
               @build_context = @build_context.parent
             when :start_doctype
               doctype = DocType.new( event[1..-1], @build_context )
               @build_context = doctype
               entities = {}
-              in_doctype = true
             when :attlistdecl
               n = AttlistDecl.new( event[1..-1] )
               @build_context.add( n )

data/lib/rexml/rexml.rb CHANGED Viewed

@@ -31,7 +31,7 @@
 module REXML
   COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
   DATE = "2008/019"
-  VERSION = "3.2.6"
+  VERSION = "3.3.9"
   REVISION = ""
   Copyright = COPYRIGHT

data/lib/rexml/source.rb CHANGED Viewed

@@ -1,8 +1,28 @@
 # coding: US-ASCII
 # frozen_string_literal: false
+require "strscan"
 require_relative 'encoding'
 module REXML
+  if StringScanner::Version < "1.0.0"
+    module StringScannerCheckScanString
+      refine StringScanner do
+        def check(pattern)
+          pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
+          super(pattern)
+        end
+        def scan(pattern)
+          pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
+          super(pattern)
+        end
+      end
+    end
+    using StringScannerCheckScanString
+  end
   # Generates Source-s.  USE THIS CLASS.
   class SourceFactory
     # Generates a Source object
@@ -30,26 +50,50 @@ module REXML
   # objects and provides consumption of text
   class Source
     include Encoding
-    # The current buffer (what we're going to read next)
-    attr_reader :buffer
     # The line number of the last consumed text
     attr_reader :line
     attr_reader :encoding
+    module Private
+      SCANNER_RESET_SIZE = 100000
+      PRE_DEFINED_TERM_PATTERNS = {}
+      pre_defined_terms = ["'", '"', "<"]
+      pre_defined_terms.each do |term|
+        PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/
+      end
+    end
+    private_constant :Private
     # Constructor
     # @param arg must be a String, and should be a valid XML document
     # @param encoding if non-null, sets the encoding of the source to this
     # value, overriding all encoding detection
     def initialize(arg, encoding=nil)
-      @orig = @buffer = arg
+      @orig = arg
+      @scanner = StringScanner.new(@orig)
       if encoding
         self.encoding = encoding
       else
         detect_encoding
       end
       @line = 0
+      @term_encord = {}
     end
+    # The current buffer (what we're going to read next)
+    def buffer
+      @scanner.rest
+    end
+    def drop_parsed_content
+      if @scanner.pos > Private::SCANNER_RESET_SIZE
+        @scanner.string = @scanner.rest
+      end
+    end
+    def buffer_encoding=(encoding)
+      @scanner.string.force_encoding(encoding)
+    end
     # Inherited from Encoding
     # Overridden to support optimized en/decoding
@@ -58,98 +102,78 @@ module REXML
       encoding_updated
     end
-    # Scans the source for a given pattern.  Note, that this is not your
-    # usual scan() method.  For one thing, the pattern argument has some
-    # requirements; for another, the source can be consumed.  You can easily
-    # confuse this method.  Originally, the patterns were easier
-    # to construct and this method more robust, because this method
-    # generated search regexps on the fly; however, this was
-    # computationally expensive and slowed down the entire REXML package
-    # considerably, since this is by far the most commonly called method.
-    # @param pattern must be a Regexp, and must be in the form of
-    # /^\s*(#{your pattern, with no groups})(.*)/.  The first group
-    # will be returned; the second group is used if the consume flag is
-    # set.
-    # @param consume if true, the pattern returned will be consumed, leaving
-    # everything after it in the Source.
-    # @return the pattern, if found, or nil if the Source is empty or the
-    # pattern is not found.
-    def scan(pattern, cons=false)
-      return nil if @buffer.nil?
-      rv = @buffer.scan(pattern)
-      @buffer = $' if cons and rv.size>0
-      rv
+    def read(term = nil)
     end
-    def read
+    def read_until(term)
+      pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
+      data = @scanner.scan_until(pattern)
+      unless data
+        data = @scanner.rest
+        @scanner.pos = @scanner.string.bytesize
+      end
+      data
     end
-    def consume( pattern )
-      @buffer = $' if pattern.match( @buffer )
+    def ensure_buffer
     end
-    def match_to( char, pattern )
-      return pattern.match(@buffer)
+    def match(pattern, cons=false)
+      if cons
+        @scanner.scan(pattern).nil? ? nil : @scanner
+      else
+        @scanner.check(pattern).nil? ? nil : @scanner
+      end
     end
-    def match_to_consume( char, pattern )
-      md = pattern.match(@buffer)
-      @buffer = $'
-      return md
+    def position
+      @scanner.pos
     end
-    def match(pattern, cons=false)
-      md = pattern.match(@buffer)
-      @buffer = $' if cons and md
-      return md
+    def position=(pos)
+      @scanner.pos = pos
     end
     # @return true if the Source is exhausted
     def empty?
-      @buffer == ""
-    end
-    def position
-      @orig.index( @buffer )
+      @scanner.eos?
     end
     # @return the current line in the source
     def current_line
       lines = @orig.split
-      res = lines.grep @buffer[0..30]
+      res = lines.grep @scanner.rest[0..30]
       res = res[-1] if res.kind_of? Array
       lines.index( res ) if res
     end
     private
     def detect_encoding
-      buffer_encoding = @buffer.encoding
+      scanner_encoding = @scanner.rest.encoding
       detected_encoding = "UTF-8"
       begin
-        @buffer.force_encoding("ASCII-8BIT")
-        if @buffer[0, 2] == "\xfe\xff"
-          @buffer[0, 2] = ""
+        @scanner.string.force_encoding("ASCII-8BIT")
+        if @scanner.scan(/\xfe\xff/n)
           detected_encoding = "UTF-16BE"
-        elsif @buffer[0, 2] == "\xff\xfe"
-          @buffer[0, 2] = ""
+        elsif @scanner.scan(/\xff\xfe/n)
           detected_encoding = "UTF-16LE"
-        elsif @buffer[0, 3] == "\xef\xbb\xbf"
-          @buffer[0, 3] = ""
+        elsif @scanner.scan(/\xef\xbb\xbf/n)
           detected_encoding = "UTF-8"
         end
       ensure
-        @buffer.force_encoding(buffer_encoding)
+        @scanner.string.force_encoding(scanner_encoding)
       end
       self.encoding = detected_encoding
     end
     def encoding_updated
       if @encoding != 'UTF-8'
-        @buffer = decode(@buffer)
+        @scanner.string = decode(@scanner.rest)
         @to_utf = true
       else
         @to_utf = false
-        @buffer.force_encoding ::Encoding::UTF_8
+        @scanner.string.force_encoding(::Encoding::UTF_8)
       end
     end
   end
@@ -172,7 +196,7 @@ module REXML
       end
       if !@to_utf and
-          @buffer.respond_to?(:force_encoding) and
+          @orig.respond_to?(:force_encoding) and
           @source.respond_to?(:external_encoding) and
           @source.external_encoding != ::Encoding::UTF_8
         @force_utf8 = true
@@ -181,65 +205,72 @@ module REXML
       end
     end
-    def scan(pattern, cons=false)
-      rv = super
-      # You'll notice that this next section is very similar to the same
-      # section in match(), but just a liiittle different.  This is
-      # because it is a touch faster to do it this way with scan()
-      # than the way match() does it; enough faster to warrant duplicating
-      # some code
-      if rv.size == 0
-        until @buffer =~ pattern or @source.nil?
-          begin
-            @buffer << readline
-          rescue Iconv::IllegalSequence
-            raise
-          rescue
-            @source = nil
+    def read(term = nil, min_bytes = 1)
+      term = encode(term) if term
+      begin
+        str = readline(term)
+        @scanner << str
+        read_bytes = str.bytesize
+        begin
+          while read_bytes < min_bytes
+            str = readline(term)
+            @scanner << str
+            read_bytes += str.bytesize
           end
+        rescue IOError
         end
-        rv = super
+        true
+      rescue Exception, NameError
+        @source = nil
+        false
       end
-      rv.taint if RUBY_VERSION < '2.7'
-      rv
     end
-    def read
-      begin
-        @buffer << readline
-      rescue Exception, NameError
-        @source = nil
+    def read_until(term)
+      pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
+      term = @term_encord[term] ||= encode(term)
+      until str = @scanner.scan_until(pattern)
+        break if @source.nil?
+        break if @source.eof?
+        @scanner << readline(term)
+      end
+      if str
+        read if @scanner.eos? and !@source.eof?
+        str
+      else
+        rest = @scanner.rest
+        @scanner.pos = @scanner.string.bytesize
+        rest
       end
     end
-    def consume( pattern )
-      match( pattern, true )
+    def ensure_buffer
+      read if @scanner.eos? && @source
     end
     def match( pattern, cons=false )
-      rv = pattern.match(@buffer)
-      @buffer = $' if cons and rv
-      while !rv and @source
-        begin
-          @buffer << readline
-          rv = pattern.match(@buffer)
-          @buffer = $' if cons and rv
-        rescue
-          @source = nil
+      # To avoid performance issue, we need to increase bytes to read per scan
+      min_bytes = 1
+      while true
+        if cons
+          md = @scanner.scan(pattern)
+        else
+          md = @scanner.check(pattern)
         end
+        break if md
+        return nil if pattern.is_a?(String)
+        return nil if @source.nil?
+        return nil unless read(nil, min_bytes)
+        min_bytes *= 2
       end
-      rv.taint if RUBY_VERSION < '2.7'
-      rv
+      md.nil? ? nil : @scanner
     end
     def empty?
       super and ( @source.nil? || @source.eof? )
     end
-    def position
-      @er_source.pos rescue 0
-    end
     # @return the current line in the source
     def current_line
       begin
@@ -263,15 +294,20 @@ module REXML
     end
     private
-    def readline
-      str = @source.readline(@line_break)
+    def readline(term = nil)
       if @pending_buffer
+        begin
+          str = @source.readline(term || @line_break)
+        rescue IOError
+        end
         if str.nil?
           str = @pending_buffer
         else
           str = @pending_buffer + str
         end
         @pending_buffer = nil
+      else
+        str = @source.readline(term || @line_break)
       end
       return nil if str.nil?
@@ -290,7 +326,7 @@ module REXML
         @source.set_encoding(@encoding, @encoding)
       end
       @line_break = encode(">")
-      @pending_buffer, @buffer = @buffer, ""
+      @pending_buffer, @scanner.string = @scanner.rest, ""
       @pending_buffer.force_encoding(@encoding)
       super
     end

data/lib/rexml/text.rb CHANGED Viewed

@@ -151,25 +151,45 @@ module REXML
         end
       end
-      # context sensitive
-      string.scan(pattern) do
-        if $1[-1] != ?;
-          raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
-        elsif $1[0] == ?&
-          if $5 and $5[0] == ?#
-            case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
-            when *VALID_CHAR
+      pos = 0
+      while (index = string.index(/<|&/, pos))
+        if string[index] == "<"
+          raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
+        end
+        unless (end_index = string.index(/[^\s];/, index + 1))
+          raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
+        end
+        value = string[(index + 1)..end_index]
+        if /\s/.match?(value)
+          raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
+        end
+        if value[0] == "#"
+          character_reference = value[1..-1]
+          unless (/\A(\d+|x[0-9a-fA-F]+)\z/.match?(character_reference))
+            if character_reference[0] == "x" || character_reference[-1] == "x"
+              raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
             else
-              raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
+              raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
             end
-          # FIXME: below can't work but this needs API change.
-          # elsif @parent and $3 and !SUBSTITUTES.include?($1)
-          #   if !doctype or !doctype.entities.has_key?($3)
-          #     raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
-          #   end
           end
+          case (character_reference[0] == "x" ? character_reference[1..-1].to_i(16) : character_reference[0..-1].to_i)
+          when *VALID_CHAR
+          else
+            raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
+          end
+        elsif !(/\A#{Entity::NAME}\z/um.match?(value))
+          raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
         end
+        pos = end_index + 1
       end
+      string
     end
     def node_type
@@ -248,7 +268,8 @@ module REXML
     #   u = Text.new( "sean russell", false, nil, true )
     #   u.value   #-> "sean russell"
     def value
-      @unnormalized ||= Text::unnormalize( @string, doctype )
+      @unnormalized ||= Text::unnormalize(@string, doctype,
+                                          entity_expansion_text_limit: document&.entity_expansion_text_limit)
     end
     # Sets the contents of this text node.  This expects the text to be
@@ -391,11 +412,12 @@ module REXML
     end
     # Unescapes all possible entities
-    def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil )
+    def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expansion_text_limit: nil )
+      entity_expansion_text_limit ||= Security.entity_expansion_text_limit
       sum = 0
       string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) {
         s = Text.expand($&, doctype, filter)
-        if sum + s.bytesize > Security.entity_expansion_text_limit
+        if sum + s.bytesize > entity_expansion_text_limit
           raise "entity expansion has grown too large"
         else
           sum += s.bytesize