RubyGems - rexml - Versions diffs - 3.2.5 → 3.3.8 - Mend

rexml 3.2.5 → 3.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rexml might be problematic. Click here for more details.

Files changed (25) hide show

checksums.yaml +4 -4
data/NEWS.md +449 -2
data/README.md +10 -1
data/doc/rexml/tasks/rdoc/element.rdoc +2 -2
data/doc/rexml/tutorial.rdoc +1358 -0
data/lib/rexml/attribute.rb +17 -11
data/lib/rexml/document.rb +6 -2
data/lib/rexml/element.rb +19 -34
data/lib/rexml/entity.rb +9 -38
data/lib/rexml/formatters/pretty.rb +3 -3
data/lib/rexml/functions.rb +1 -2
data/lib/rexml/namespace.rb +8 -4
data/lib/rexml/node.rb +8 -4
data/lib/rexml/parseexception.rb +1 -0
data/lib/rexml/parsers/baseparser.rb +426 -263
data/lib/rexml/parsers/pullparser.rb +12 -0
data/lib/rexml/parsers/sax2parser.rb +16 -19
data/lib/rexml/parsers/streamparser.rb +16 -10
data/lib/rexml/parsers/treeparser.rb +9 -21
data/lib/rexml/parsers/xpathparser.rb +136 -86
data/lib/rexml/rexml.rb +3 -1
data/lib/rexml/source.rb +128 -98
data/lib/rexml/text.rb +45 -21
data/lib/rexml/xpath_parser.rb +7 -3
metadata +10 -52

data/lib/rexml/source.rb CHANGED Viewed

@@ -1,8 +1,28 @@
 # coding: US-ASCII
 # frozen_string_literal: false
+require "strscan"
 require_relative 'encoding'
 module REXML
+  if StringScanner::Version < "1.0.0"
+    module StringScannerCheckScanString
+      refine StringScanner do
+        def check(pattern)
+          pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
+          super(pattern)
+        end
+        def scan(pattern)
+          pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
+          super(pattern)
+        end
+      end
+    end
+    using StringScannerCheckScanString
+  end
   # Generates Source-s.  USE THIS CLASS.
   class SourceFactory
     # Generates a Source object
@@ -30,18 +50,27 @@ module REXML
   # objects and provides consumption of text
   class Source
     include Encoding
-    # The current buffer (what we're going to read next)
-    attr_reader :buffer
     # The line number of the last consumed text
     attr_reader :line
     attr_reader :encoding
+    module Private
+      SCANNER_RESET_SIZE = 100000
+      PRE_DEFINED_TERM_PATTERNS = {}
+      pre_defined_terms = ["'", '"', "<"]
+      pre_defined_terms.each do |term|
+        PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/
+      end
+    end
+    private_constant :Private
     # Constructor
     # @param arg must be a String, and should be a valid XML document
     # @param encoding if non-null, sets the encoding of the source to this
     # value, overriding all encoding detection
     def initialize(arg, encoding=nil)
-      @orig = @buffer = arg
+      @orig = arg
+      @scanner = StringScanner.new(@orig)
       if encoding
         self.encoding = encoding
       else
@@ -50,6 +79,20 @@ module REXML
       @line = 0
     end
+    # The current buffer (what we're going to read next)
+    def buffer
+      @scanner.rest
+    end
+    def drop_parsed_content
+      if @scanner.pos > Private::SCANNER_RESET_SIZE
+        @scanner.string = @scanner.rest
+      end
+    end
+    def buffer_encoding=(encoding)
+      @scanner.string.force_encoding(encoding)
+    end
     # Inherited from Encoding
     # Overridden to support optimized en/decoding
@@ -58,98 +101,78 @@ module REXML
       encoding_updated
     end
-    # Scans the source for a given pattern.  Note, that this is not your
-    # usual scan() method.  For one thing, the pattern argument has some
-    # requirements; for another, the source can be consumed.  You can easily
-    # confuse this method.  Originally, the patterns were easier
-    # to construct and this method more robust, because this method
-    # generated search regexps on the fly; however, this was
-    # computationally expensive and slowed down the entire REXML package
-    # considerably, since this is by far the most commonly called method.
-    # @param pattern must be a Regexp, and must be in the form of
-    # /^\s*(#{your pattern, with no groups})(.*)/.  The first group
-    # will be returned; the second group is used if the consume flag is
-    # set.
-    # @param consume if true, the pattern returned will be consumed, leaving
-    # everything after it in the Source.
-    # @return the pattern, if found, or nil if the Source is empty or the
-    # pattern is not found.
-    def scan(pattern, cons=false)
-      return nil if @buffer.nil?
-      rv = @buffer.scan(pattern)
-      @buffer = $' if cons and rv.size>0
-      rv
+    def read(term = nil)
     end
-    def read
+    def read_until(term)
+      pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
+      data = @scanner.scan_until(pattern)
+      unless data
+        data = @scanner.rest
+        @scanner.pos = @scanner.string.bytesize
+      end
+      data
     end
-    def consume( pattern )
-      @buffer = $' if pattern.match( @buffer )
+    def ensure_buffer
     end
-    def match_to( char, pattern )
-      return pattern.match(@buffer)
+    def match(pattern, cons=false)
+      if cons
+        @scanner.scan(pattern).nil? ? nil : @scanner
+      else
+        @scanner.check(pattern).nil? ? nil : @scanner
+      end
     end
-    def match_to_consume( char, pattern )
-      md = pattern.match(@buffer)
-      @buffer = $'
-      return md
+    def position
+      @scanner.pos
     end
-    def match(pattern, cons=false)
-      md = pattern.match(@buffer)
-      @buffer = $' if cons and md
-      return md
+    def position=(pos)
+      @scanner.pos = pos
     end
     # @return true if the Source is exhausted
     def empty?
-      @buffer == ""
-    end
-    def position
-      @orig.index( @buffer )
+      @scanner.eos?
     end
     # @return the current line in the source
     def current_line
       lines = @orig.split
-      res = lines.grep @buffer[0..30]
+      res = lines.grep @scanner.rest[0..30]
       res = res[-1] if res.kind_of? Array
       lines.index( res ) if res
     end
     private
     def detect_encoding
-      buffer_encoding = @buffer.encoding
+      scanner_encoding = @scanner.rest.encoding
       detected_encoding = "UTF-8"
       begin
-        @buffer.force_encoding("ASCII-8BIT")
-        if @buffer[0, 2] == "\xfe\xff"
-          @buffer[0, 2] = ""
+        @scanner.string.force_encoding("ASCII-8BIT")
+        if @scanner.scan(/\xfe\xff/n)
           detected_encoding = "UTF-16BE"
-        elsif @buffer[0, 2] == "\xff\xfe"
-          @buffer[0, 2] = ""
+        elsif @scanner.scan(/\xff\xfe/n)
           detected_encoding = "UTF-16LE"
-        elsif @buffer[0, 3] == "\xef\xbb\xbf"
-          @buffer[0, 3] = ""
+        elsif @scanner.scan(/\xef\xbb\xbf/n)
           detected_encoding = "UTF-8"
         end
       ensure
-        @buffer.force_encoding(buffer_encoding)
+        @scanner.string.force_encoding(scanner_encoding)
       end
       self.encoding = detected_encoding
     end
     def encoding_updated
       if @encoding != 'UTF-8'
-        @buffer = decode(@buffer)
+        @scanner.string = decode(@scanner.rest)
         @to_utf = true
       else
         @to_utf = false
-        @buffer.force_encoding ::Encoding::UTF_8
+        @scanner.string.force_encoding(::Encoding::UTF_8)
       end
     end
   end
@@ -172,7 +195,7 @@ module REXML
       end
       if !@to_utf and
-          @buffer.respond_to?(:force_encoding) and
+          @orig.respond_to?(:force_encoding) and
           @source.respond_to?(:external_encoding) and
           @source.external_encoding != ::Encoding::UTF_8
         @force_utf8 = true
@@ -181,65 +204,72 @@ module REXML
       end
     end
-    def scan(pattern, cons=false)
-      rv = super
-      # You'll notice that this next section is very similar to the same
-      # section in match(), but just a liiittle different.  This is
-      # because it is a touch faster to do it this way with scan()
-      # than the way match() does it; enough faster to warrant duplicating
-      # some code
-      if rv.size == 0
-        until @buffer =~ pattern or @source.nil?
-          begin
-            @buffer << readline
-          rescue Iconv::IllegalSequence
-            raise
-          rescue
-            @source = nil
+    def read(term = nil, min_bytes = 1)
+      term = encode(term) if term
+      begin
+        str = readline(term)
+        @scanner << str
+        read_bytes = str.bytesize
+        begin
+          while read_bytes < min_bytes
+            str = readline(term)
+            @scanner << str
+            read_bytes += str.bytesize
           end
+        rescue IOError
         end
-        rv = super
+        true
+      rescue Exception, NameError
+        @source = nil
+        false
       end
-      rv.taint if RUBY_VERSION < '2.7'
-      rv
     end
-    def read
-      begin
-        @buffer << readline
-      rescue Exception, NameError
-        @source = nil
+    def read_until(term)
+      pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
+      term = encode(term)
+      until str = @scanner.scan_until(pattern)
+        break if @source.nil?
+        break if @source.eof?
+        @scanner << readline(term)
+      end
+      if str
+        read if @scanner.eos? and !@source.eof?
+        str
+      else
+        rest = @scanner.rest
+        @scanner.pos = @scanner.string.bytesize
+        rest
       end
     end
-    def consume( pattern )
-      match( pattern, true )
+    def ensure_buffer
+      read if @scanner.eos? && @source
     end
     def match( pattern, cons=false )
-      rv = pattern.match(@buffer)
-      @buffer = $' if cons and rv
-      while !rv and @source
-        begin
-          @buffer << readline
-          rv = pattern.match(@buffer)
-          @buffer = $' if cons and rv
-        rescue
-          @source = nil
+      # To avoid performance issue, we need to increase bytes to read per scan
+      min_bytes = 1
+      while true
+        if cons
+          md = @scanner.scan(pattern)
+        else
+          md = @scanner.check(pattern)
         end
+        break if md
+        return nil if pattern.is_a?(String)
+        return nil if @source.nil?
+        return nil unless read(nil, min_bytes)
+        min_bytes *= 2
       end
-      rv.taint if RUBY_VERSION < '2.7'
-      rv
+      md.nil? ? nil : @scanner
     end
     def empty?
       super and ( @source.nil? || @source.eof? )
     end
-    def position
-      @er_source.pos rescue 0
-    end
     # @return the current line in the source
     def current_line
       begin
@@ -263,8 +293,8 @@ module REXML
     end
     private
-    def readline
-      str = @source.readline(@line_break)
+    def readline(term = nil)
+      str = @source.readline(term || @line_break)
       if @pending_buffer
         if str.nil?
           str = @pending_buffer
@@ -290,7 +320,7 @@ module REXML
         @source.set_encoding(@encoding, @encoding)
       end
       @line_break = encode(">")
-      @pending_buffer, @buffer = @buffer, ""
+      @pending_buffer, @scanner.string = @scanner.rest, ""
       @pending_buffer.force_encoding(@encoding)
       super
     end

data/lib/rexml/text.rb CHANGED Viewed

@@ -1,4 +1,4 @@
-# frozen_string_literal: false
+# frozen_string_literal: true
 require_relative 'security'
 require_relative 'entity'
 require_relative 'doctype'
@@ -131,7 +131,7 @@ module REXML
     def Text.check string, pattern, doctype
       # illegal anywhere
-      if string !~ VALID_XML_CHARS
+      if !string.match?(VALID_XML_CHARS)
         if String.method_defined? :encode
           string.chars.each do |c|
             case c.ord
@@ -151,25 +151,45 @@ module REXML
         end
       end
-      # context sensitive
-      string.scan(pattern) do
-        if $1[-1] != ?;
-          raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
-        elsif $1[0] == ?&
-          if $5 and $5[0] == ?#
-            case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
-            when *VALID_CHAR
+      pos = 0
+      while (index = string.index(/<|&/, pos))
+        if string[index] == "<"
+          raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
+        end
+        unless (end_index = string.index(/[^\s];/, index + 1))
+          raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
+        end
+        value = string[(index + 1)..end_index]
+        if /\s/.match?(value)
+          raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
+        end
+        if value[0] == "#"
+          character_reference = value[1..-1]
+          unless (/\A(\d+|x[0-9a-fA-F]+)\z/.match?(character_reference))
+            if character_reference[0] == "x" || character_reference[-1] == "x"
+              raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
             else
-              raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
+              raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
             end
-          # FIXME: below can't work but this needs API change.
-          # elsif @parent and $3 and !SUBSTITUTES.include?($1)
-          #   if !doctype or !doctype.entities.has_key?($3)
-          #     raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
-          #   end
           end
+          case (character_reference[0] == "x" ? character_reference[1..-1].to_i(16) : character_reference[0..-1].to_i)
+          when *VALID_CHAR
+          else
+            raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
+          end
+        elsif !(/\A#{Entity::NAME}\z/um.match?(value))
+          raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
         end
+        pos = end_index + 1
       end
+      string
     end
     def node_type
@@ -248,7 +268,8 @@ module REXML
     #   u = Text.new( "sean russell", false, nil, true )
     #   u.value   #-> "sean russell"
     def value
-      @unnormalized ||= Text::unnormalize( @string, doctype )
+      @unnormalized ||= Text::unnormalize(@string, doctype,
+                                          entity_expansion_text_limit: document&.entity_expansion_text_limit)
     end
     # Sets the contents of this text node.  This expects the text to be
@@ -371,7 +392,7 @@ module REXML
       copy = input.to_s
       # Doing it like this rather than in a loop improves the speed
       #copy = copy.gsub( EREFERENCE, '&amp;' )
-      copy = copy.gsub( "&", "&amp;" )
+      copy = copy.gsub( "&", "&amp;" ) if copy.include?("&")
       if doctype
         # Replace all ampersands that aren't part of an entity
         doctype.entities.each_value do |entity|
@@ -382,18 +403,21 @@ module REXML
       else
         # Replace all ampersands that aren't part of an entity
         DocType::DEFAULT_ENTITIES.each_value do |entity|
-          copy = copy.gsub(entity.value, "&#{entity.name};" )
+          if copy.include?(entity.value)
+            copy = copy.gsub(entity.value, "&#{entity.name};" )
+          end
         end
       end
       copy
     end
     # Unescapes all possible entities
-    def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil )
+    def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expansion_text_limit: nil )
+      entity_expansion_text_limit ||= Security.entity_expansion_text_limit
       sum = 0
       string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) {
         s = Text.expand($&, doctype, filter)
-        if sum + s.bytesize > Security.entity_expansion_text_limit
+        if sum + s.bytesize > entity_expansion_text_limit
           raise "entity expansion has grown too large"
         else
           sum += s.bytesize

data/lib/rexml/xpath_parser.rb CHANGED Viewed

@@ -590,6 +590,7 @@ module REXML
     def evaluate_predicate(expression, nodesets)
       enter(:predicate, expression, nodesets) if @debug
+      new_nodeset_count = 0
       new_nodesets = nodesets.collect do |nodeset|
         new_nodeset = []
         subcontext = { :size => nodeset.size }
@@ -606,17 +607,20 @@ module REXML
           result = result[0] if result.kind_of? Array and result.length == 1
           if result.kind_of? Numeric
             if result == node.position
-              new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
+              new_nodeset_count += 1
+              new_nodeset << XPathNode.new(node, position: new_nodeset_count)
             end
           elsif result.instance_of? Array
             if result.size > 0 and result.inject(false) {|k,s| s or k}
               if result.size > 0
-                new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
+                new_nodeset_count += 1
+                new_nodeset << XPathNode.new(node, position: new_nodeset_count)
               end
             end
           else
             if result
-              new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
+              new_nodeset_count += 1
+              new_nodeset << XPathNode.new(node, position: new_nodeset_count)
             end
           end
         end

metadata CHANGED Viewed

@@ -1,57 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rexml
 version: !ruby/object:Gem::Version
-  version: 3.2.5
+  version: 3.3.8
 platform: ruby
 authors:
 - Kouhei Sutou
-autorequire:
-bindir: exe
+bindir: bin
 cert_chain: []
-date: 2021-04-05 00:00:00.000000000 Z
-dependencies:
-- !ruby/object:Gem::Dependency
-  name: bundler
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '0'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '0'
-- !ruby/object:Gem::Dependency
-  name: rake
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '0'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '0'
-- !ruby/object:Gem::Dependency
-  name: test-unit
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '0'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '0'
+date: 2024-09-29 00:00:00.000000000 Z
+dependencies: []
 description: An XML toolkit for Ruby
 email:
 - kou@cozmixng.org
@@ -73,6 +30,7 @@ extra_rdoc_files:
 - doc/rexml/tasks/tocs/master_toc.rdoc
 - doc/rexml/tasks/tocs/node_toc.rdoc
 - doc/rexml/tasks/tocs/parent_toc.rdoc
+- doc/rexml/tutorial.rdoc
 files:
 - LICENSE.txt
 - NEWS.md
@@ -89,6 +47,7 @@ files:
 - doc/rexml/tasks/tocs/master_toc.rdoc
 - doc/rexml/tasks/tocs/node_toc.rdoc
 - doc/rexml/tasks/tocs/parent_toc.rdoc
+- doc/rexml/tutorial.rdoc
 - lib/rexml.rb
 - lib/rexml/attlistdecl.rb
 - lib/rexml/attribute.rb
@@ -142,8 +101,8 @@ files:
 homepage: https://github.com/ruby/rexml
 licenses:
 - BSD-2-Clause
-metadata: {}
-post_install_message:
+metadata:
+  changelog_uri: https://github.com/ruby/rexml/releases/tag/v3.3.8
 rdoc_options:
 - "--main"
 - README.md
@@ -153,15 +112,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '0'
+      version: 2.5.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.2.3
-signing_key:
+rubygems_version: 3.6.0.dev
 specification_version: 4
 summary: An XML toolkit for Ruby
 test_files: []