RubyGems - rexml - Versions diffs - 3.2.6 → 3.4.2 - Mend

rexml 3.2.6 → 3.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33)

checksums.yaml +4 -4
data/NEWS.md +525 -0
data/lib/rexml/attribute.rb +10 -10
data/lib/rexml/cdata.rb +1 -1
data/lib/rexml/child.rb +2 -3
data/lib/rexml/comment.rb +1 -1
data/lib/rexml/doctype.rb +3 -8
data/lib/rexml/document.rb +23 -5
data/lib/rexml/element.rb +63 -84
data/lib/rexml/encoding.rb +3 -6
data/lib/rexml/entity.rb +9 -48
data/lib/rexml/formatters/pretty.rb +1 -1
data/lib/rexml/functions.rb +4 -5
data/lib/rexml/instruction.rb +1 -1
data/lib/rexml/namespace.rb +4 -4
data/lib/rexml/node.rb +10 -6
data/lib/rexml/parseexception.rb +1 -0
data/lib/rexml/parsers/baseparser.rb +538 -288
data/lib/rexml/parsers/pullparser.rb +16 -0
data/lib/rexml/parsers/sax2parser.rb +16 -19
data/lib/rexml/parsers/streamparser.rb +16 -10
data/lib/rexml/parsers/treeparser.rb +9 -21
data/lib/rexml/parsers/xpathparser.rb +4 -4
data/lib/rexml/quickpath.rb +19 -18
data/lib/rexml/rexml.rb +1 -1
data/lib/rexml/security.rb +2 -2
data/lib/rexml/source.rb +190 -100
data/lib/rexml/text.rb +68 -74
data/lib/rexml/validation/relaxng.rb +27 -26
data/lib/rexml/validation/validation.rb +8 -8
data/lib/rexml/xpath.rb +2 -13
data/lib/rexml/xpath_parser.rb +51 -45
metadata +6 -50

data/lib/rexml/source.rb CHANGED

@@ -1,8 +1,39 @@
 # coding: US-ASCII
 # frozen_string_literal: false
+require "stringio"
+require "strscan"
 require_relative 'encoding'
 module REXML
+  if StringScanner::Version < "1.0.0"
+    module StringScannerCheckScanString
+      refine StringScanner do
+        def check(pattern)
+          pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
+          super(pattern)
+        end
+        def scan(pattern)
+          pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
+          super(pattern)
+        end
+        def match?(pattern)
+          pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
+          super(pattern)
+        end
+        def skip(pattern)
+          pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
+          super(pattern)
+        end
+      end
+    end
+    using StringScannerCheckScanString
+  end
   # Generates Source-s.  USE THIS CLASS.
   class SourceFactory
     # Generates a Source object
@@ -15,7 +46,6 @@ module REXML
           arg.respond_to? :eof?
         IOSource.new(arg)
       elsif arg.respond_to? :to_str
-        require 'stringio'
         IOSource.new(StringIO.new(arg))
       elsif arg.kind_of? Source
         arg
@@ -30,26 +60,57 @@ module REXML
   # objects and provides consumption of text
   class Source
     include Encoding
-    # The current buffer (what we're going to read next)
-    attr_reader :buffer
     # The line number of the last consumed text
     attr_reader :line
     attr_reader :encoding
+    module Private
+      SPACES_PATTERN = /\s+/um
+      SCANNER_RESET_SIZE = 100000
+      PRE_DEFINED_TERM_PATTERNS = {}
+      pre_defined_terms = ["'", '"', "<", "]]>", "?>"]
+      if StringScanner::Version < "3.1.1"
+        pre_defined_terms.each do |term|
+          PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/
+        end
+      else
+        pre_defined_terms.each do |term|
+          PRE_DEFINED_TERM_PATTERNS[term] = term
+        end
+      end
+    end
+    private_constant :Private
     # Constructor
     # @param arg must be a String, and should be a valid XML document
     # @param encoding if non-null, sets the encoding of the source to this
     # value, overriding all encoding detection
     def initialize(arg, encoding=nil)
-      @orig = @buffer = arg
+      @orig = arg
+      @scanner = StringScanner.new(@orig)
       if encoding
         self.encoding = encoding
       else
         detect_encoding
       end
       @line = 0
+      @encoded_terms = {}
+    end
+    # The current buffer (what we're going to read next)
+    def buffer
+      @scanner.rest
     end
+    def drop_parsed_content
+      if @scanner.pos > Private::SCANNER_RESET_SIZE
+        @scanner.string = @scanner.rest
+      end
+    end
+    def buffer_encoding=(encoding)
+      @scanner.string.force_encoding(encoding)
+    end
     # Inherited from Encoding
     # Overridden to support optimized en/decoding
@@ -58,98 +119,98 @@ module REXML
       encoding_updated
     end
-    # Scans the source for a given pattern.  Note, that this is not your
-    # usual scan() method.  For one thing, the pattern argument has some
-    # requirements; for another, the source can be consumed.  You can easily
-    # confuse this method.  Originally, the patterns were easier
-    # to construct and this method more robust, because this method
-    # generated search regexps on the fly; however, this was
-    # computationally expensive and slowed down the entire REXML package
-    # considerably, since this is by far the most commonly called method.
-    # @param pattern must be a Regexp, and must be in the form of
-    # /^\s*(#{your pattern, with no groups})(.*)/.  The first group
-    # will be returned; the second group is used if the consume flag is
-    # set.
-    # @param consume if true, the pattern returned will be consumed, leaving
-    # everything after it in the Source.
-    # @return the pattern, if found, or nil if the Source is empty or the
-    # pattern is not found.
-    def scan(pattern, cons=false)
-      return nil if @buffer.nil?
-      rv = @buffer.scan(pattern)
-      @buffer = $' if cons and rv.size>0
-      rv
+    def read(term = nil)
     end
-    def read
+    def read_until(term)
+      pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
+      data = @scanner.scan_until(pattern)
+      unless data
+        data = @scanner.rest
+        @scanner.pos = @scanner.string.bytesize
+      end
+      data
     end
-    def consume( pattern )
-      @buffer = $' if pattern.match( @buffer )
+    def ensure_buffer
     end
-    def match_to( char, pattern )
-      return pattern.match(@buffer)
+    def match(pattern, cons=false)
+      if cons
+        @scanner.scan(pattern).nil? ? nil : @scanner
+      else
+        @scanner.check(pattern).nil? ? nil : @scanner
+      end
     end
-    def match_to_consume( char, pattern )
-      md = pattern.match(@buffer)
-      @buffer = $'
-      return md
+    def match?(pattern, cons=false)
+      if cons
+        !@scanner.skip(pattern).nil?
+      else
+        !@scanner.match?(pattern).nil?
+      end
     end
-    def match(pattern, cons=false)
-      md = pattern.match(@buffer)
-      @buffer = $' if cons and md
-      return md
+    def skip_spaces
+      @scanner.skip(Private::SPACES_PATTERN) ? true : false
     end
-    # @return true if the Source is exhausted
-    def empty?
-      @buffer == ""
+    def position
+      @scanner.pos
     end
-    def position
-      @orig.index( @buffer )
+    def position=(pos)
+      @scanner.pos = pos
+    end
+    def peek_byte
+      @scanner.peek_byte
+    end
+    def scan_byte
+      @scanner.scan_byte
+    end
+    # @return true if the Source is exhausted
+    def empty?
+      @scanner.eos?
     end
     # @return the current line in the source
     def current_line
       lines = @orig.split
-      res = lines.grep @buffer[0..30]
+      res = lines.grep @scanner.rest[0..30]
       res = res[-1] if res.kind_of? Array
       lines.index( res ) if res
     end
     private
     def detect_encoding
-      buffer_encoding = @buffer.encoding
+      scanner_encoding = @scanner.rest.encoding
       detected_encoding = "UTF-8"
       begin
-        @buffer.force_encoding("ASCII-8BIT")
-        if @buffer[0, 2] == "\xfe\xff"
-          @buffer[0, 2] = ""
+        @scanner.string.force_encoding("ASCII-8BIT")
+        if @scanner.scan(/\xfe\xff/n)
           detected_encoding = "UTF-16BE"
-        elsif @buffer[0, 2] == "\xff\xfe"
-          @buffer[0, 2] = ""
+        elsif @scanner.scan(/\xff\xfe/n)
           detected_encoding = "UTF-16LE"
-        elsif @buffer[0, 3] == "\xef\xbb\xbf"
-          @buffer[0, 3] = ""
+        elsif @scanner.scan(/\xef\xbb\xbf/n)
           detected_encoding = "UTF-8"
         end
       ensure
-        @buffer.force_encoding(buffer_encoding)
+        @scanner.string.force_encoding(scanner_encoding)
       end
       self.encoding = detected_encoding
     end
     def encoding_updated
       if @encoding != 'UTF-8'
-        @buffer = decode(@buffer)
+        @scanner.string = decode(@scanner.rest)
         @to_utf = true
       else
         @to_utf = false
-        @buffer.force_encoding ::Encoding::UTF_8
+        @scanner.string.force_encoding(::Encoding::UTF_8)
       end
     end
   end
@@ -172,7 +233,7 @@ module REXML
       end
       if !@to_utf and
-          @buffer.respond_to?(:force_encoding) and
+          @orig.respond_to?(:force_encoding) and
           @source.respond_to?(:external_encoding) and
           @source.external_encoding != ::Encoding::UTF_8
         @force_utf8 = true
@@ -181,63 +242,87 @@ module REXML
       end
     end
-    def scan(pattern, cons=false)
-      rv = super
-      # You'll notice that this next section is very similar to the same
-      # section in match(), but just a liiittle different.  This is
-      # because it is a touch faster to do it this way with scan()
-      # than the way match() does it; enough faster to warrant duplicating
-      # some code
-      if rv.size == 0
-        until @buffer =~ pattern or @source.nil?
-          begin
-            @buffer << readline
-          rescue Iconv::IllegalSequence
-            raise
-          rescue
-            @source = nil
+    def read(term = nil, min_bytes = 1)
+      term = encode(term) if term
+      begin
+        str = readline(term)
+        @scanner << str
+        read_bytes = str.bytesize
+        begin
+          while read_bytes < min_bytes
+            str = readline(term)
+            @scanner << str
+            read_bytes += str.bytesize
           end
+        rescue IOError
         end
-        rv = super
+        true
+      rescue Exception, NameError
+        @source = nil
+        false
       end
-      rv.taint if RUBY_VERSION < '2.7'
-      rv
     end
-    def read
-      begin
-        @buffer << readline
-      rescue Exception, NameError
-        @source = nil
+    def read_until(term)
+      pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
+      term = @encoded_terms[term] ||= encode(term)
+      until str = @scanner.scan_until(pattern)
+        break if @source.nil?
+        break if @source.eof?
+        @scanner << readline(term)
+      end
+      if str
+        read if @scanner.eos? and !@source.eof?
+        str
+      else
+        rest = @scanner.rest
+        @scanner.pos = @scanner.string.bytesize
+        rest
       end
     end
-    def consume( pattern )
-      match( pattern, true )
+    def ensure_buffer
+      read if @scanner.eos? && @source
     end
     def match( pattern, cons=false )
-      rv = pattern.match(@buffer)
-      @buffer = $' if cons and rv
-      while !rv and @source
-        begin
-          @buffer << readline
-          rv = pattern.match(@buffer)
-          @buffer = $' if cons and rv
-        rescue
-          @source = nil
+      # To avoid performance issue, we need to increase bytes to read per scan
+      min_bytes = 1
+      while true
+        if cons
+          md = @scanner.scan(pattern)
+        else
+          md = @scanner.check(pattern)
         end
+        break if md
+        return nil if pattern.is_a?(String)
+        return nil if @source.nil?
+        return nil unless read(nil, min_bytes)
+        min_bytes *= 2
       end
-      rv.taint if RUBY_VERSION < '2.7'
-      rv
+      md.nil? ? nil : @scanner
     end
-    def empty?
-      super and ( @source.nil? || @source.eof? )
+    def match?( pattern, cons=false )
+      # To avoid performance issue, we need to increase bytes to read per scan
+      min_bytes = 1
+      while true
+        if cons
+          n_matched_bytes = @scanner.skip(pattern)
+        else
+          n_matched_bytes = @scanner.match?(pattern)
+        end
+        return true if n_matched_bytes
+        return false if pattern.is_a?(String)
+        return false if @source.nil?
+        return false unless read(nil, min_bytes)
+        min_bytes *= 2
+      end
     end
-    def position
-      @er_source.pos rescue 0
+    def empty?
+      super and ( @source.nil? || @source.eof? )
     end
     # @return the current line in the source
@@ -255,7 +340,7 @@ module REXML
         rescue
         end
         @er_source.seek(pos)
-      rescue IOError
+      rescue IOError, SystemCallError
         pos = -1
         line = -1
       end
@@ -263,15 +348,20 @@ module REXML
     end
     private
-    def readline
-      str = @source.readline(@line_break)
+    def readline(term = nil)
       if @pending_buffer
+        begin
+          str = @source.readline(term || @line_break)
+        rescue IOError
+        end
         if str.nil?
           str = @pending_buffer
         else
           str = @pending_buffer + str
         end
         @pending_buffer = nil
+      else
+        str = @source.readline(term || @line_break)
       end
       return nil if str.nil?
@@ -290,7 +380,7 @@ module REXML
         @source.set_encoding(@encoding, @encoding)
       end
       @line_break = encode(">")
-      @pending_buffer, @buffer = @buffer, ""
+      @pending_buffer, @scanner.string = @scanner.rest, ""
       @pending_buffer.force_encoding(@encoding)
       super
     end

data/lib/rexml/text.rb CHANGED

@@ -29,31 +29,16 @@ module REXML
       (0x10000..0x10FFFF)
     ]
-    if String.method_defined? :encode
-      VALID_XML_CHARS = Regexp.new('^['+
-        VALID_CHAR.map { |item|
-          case item
-          when Integer
-            [item].pack('U').force_encoding('utf-8')
-          when Range
-            [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
-          end
-        }.join +
-      ']*$')
-    else
-      VALID_XML_CHARS = /^(
-           [\x09\x0A\x0D\x20-\x7E]            # ASCII
-         | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
-         |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
-         | [\xE1-\xEC\xEE][\x80-\xBF]{2}      # straight 3-byte
-         |  \xEF[\x80-\xBE]{2}                #
-         |  \xEF\xBF[\x80-\xBD]               # excluding U+fffe and U+ffff
-         |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
-         |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
-         | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
-         |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
-       )*$/nx;
-    end
+    VALID_XML_CHARS = Regexp.new('^['+
+      VALID_CHAR.map { |item|
+        case item
+        when Integer
+          [item].pack('U').force_encoding('utf-8')
+        when Range
+          [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
+        end
+      }.join +
+    ']*$')
     # Constructor
     # +arg+ if a String, the content is set to the String.  If a Text,
@@ -119,57 +104,67 @@ module REXML
       @entity_filter = entity_filter if entity_filter
       clear_cache
-      Text.check(@string, illegal, doctype) if @raw
+      Text.check(@string, illegal) if @raw
     end
     def parent= parent
       super(parent)
-      Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent
+      Text.check(@string, NEEDS_A_SECOND_CHECK) if @raw and @parent
     end
     # check for illegal characters
-    def Text.check string, pattern, doctype
+    def Text.check string, pattern, doctype = nil
       # illegal anywhere
       if !string.match?(VALID_XML_CHARS)
-        if String.method_defined? :encode
-          string.chars.each do |c|
-            case c.ord
-            when *VALID_CHAR
-            else
-              raise "Illegal character #{c.inspect} in raw string #{string.inspect}"
-            end
-          end
-        else
-          string.scan(/[\x00-\x7F]|[\x80-\xBF][\xC0-\xF0]*|[\xC0-\xF0]/n) do |c|
-            case c.unpack('U')
-            when *VALID_CHAR
-            else
-              raise "Illegal character #{c.inspect} in raw string #{string.inspect}"
-            end
+        string.chars.each do |c|
+          case c.ord
+          when *VALID_CHAR
+          else
+            raise "Illegal character #{c.inspect} in raw string #{string.inspect}"
           end
         end
       end
-      # context sensitive
-      string.scan(pattern) do
-        if $1[-1] != ?;
-          raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
-        elsif $1[0] == ?&
-          if $5 and $5[0] == ?#
-            case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
-            when *VALID_CHAR
+      pos = 0
+      while (index = string.index(/<|&/, pos))
+        if string[index] == "<"
+          raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
+        end
+        unless (end_index = string.index(/[^\s];/, index + 1))
+          raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
+        end
+        value = string[(index + 1)..end_index]
+        if /\s/.match?(value)
+          raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
+        end
+        if value[0] == "#"
+          character_reference = value[1..-1]
+          unless (/\A(\d+|x[0-9a-fA-F]+)\z/.match?(character_reference))
+            if character_reference[0] == "x" || character_reference[-1] == "x"
+              raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
             else
-              raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
+              raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
             end
-          # FIXME: below can't work but this needs API change.
-          # elsif @parent and $3 and !SUBSTITUTES.include?($1)
-          #   if !doctype or !doctype.entities.has_key?($3)
-          #     raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
-          #   end
           end
+          case (character_reference[0] == "x" ? character_reference[1..-1].to_i(16) : character_reference[0..-1].to_i)
+          when *VALID_CHAR
+          else
+            raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
+          end
+        elsif !(/\A#{Entity::NAME}\z/um.match?(value))
+          raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
         end
+        pos = end_index + 1
       end
+      string
     end
     def node_type
@@ -182,7 +177,7 @@ module REXML
     def clone
-      return Text.new(self, true)
+      Text.new(self, true)
     end
@@ -205,10 +200,7 @@ module REXML
     end
     def doctype
-      if @parent
-        doc = @parent.document
-        doc.doctype if doc
-      end
+      @parent&.document&.doctype
     end
     REFERENCE = /#{Entity::REFERENCE}/
@@ -248,7 +240,8 @@ module REXML
     #   u = Text.new( "sean russell", false, nil, true )
     #   u.value   #-> "sean russell"
     def value
-      @unnormalized ||= Text::unnormalize( @string, doctype )
+      @unnormalized ||= Text::unnormalize(@string, doctype,
+                                          entity_expansion_text_limit: document&.entity_expansion_text_limit)
     end
     # Sets the contents of this text node.  This expects the text to be
@@ -268,30 +261,32 @@ module REXML
       # Recursively wrap string at width.
       return string if string.length <= width
       place = string.rindex(' ', width) # Position in string with last ' ' before cutoff
-      if addnewline then
-        return "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width)
+      if addnewline
+        "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width)
       else
-        return string[0,place] + "\n" + wrap(string[place+1..-1], width)
+        string[0,place] + "\n" + wrap(string[place+1..-1], width)
       end
     end
     def indent_text(string, level=1, style="\t", indentfirstline=true)
+      Kernel.warn("#{self.class.name}#indent_text is deprecated. See REXML::Formatters", uplevel: 1)
       return string if level < 0
-      new_string = ''
+      new_string = +''
       string.each_line { |line|
         indent_string = style * level
         new_line = (indent_string + line).sub(/[\s]+$/,'')
         new_string << new_line
       }
       new_string.strip! unless indentfirstline
-      return new_string
+      new_string
     end
     # == DEPRECATED
     # See REXML::Formatters
     #
     def write( writer, indent=-1, transitive=false, ie_hack=false )
-      Kernel.warn("#{self.class.name}.write is deprecated.  See REXML::Formatters", uplevel: 1)
+      Kernel.warn("#{self.class.name}#write is deprecated.  See REXML::Formatters", uplevel: 1)
       formatter = if indent > -1
           REXML::Formatters::Pretty.new( indent )
         else
@@ -303,9 +298,7 @@ module REXML
     # FIXME
     # This probably won't work properly
     def xpath
-      path = @parent.xpath
-      path += "/text()"
-      return path
+      @parent.xpath + "/text()"
     end
     # Writes out text, substituting special characters beforehand.
@@ -391,11 +384,12 @@ module REXML
     end
     # Unescapes all possible entities
-    def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil )
+    def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expansion_text_limit: nil )
+      entity_expansion_text_limit ||= Security.entity_expansion_text_limit
       sum = 0
       string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) {
         s = Text.expand($&, doctype, filter)
-        if sum + s.bytesize > Security.entity_expansion_text_limit
+        if sum + s.bytesize > entity_expansion_text_limit
           raise "entity expansion has grown too large"
         else
           sum += s.bytesize