RubyGems - regexp_parser - Versions diffs - 2.6.2 → 2.7.0 - Mend

regexp_parser 2.6.2 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

checksums.yaml +4 -4
data/CHANGELOG.md +16 -1
data/lib/regexp_parser/expression/classes/backreference.rb +4 -0
data/lib/regexp_parser/expression/classes/conditional.rb +8 -0
data/lib/regexp_parser/expression/methods/traverse.rb +6 -3
data/lib/regexp_parser/expression/sequence.rb +0 -1
data/lib/regexp_parser/expression/shared.rb +5 -1
data/lib/regexp_parser/expression/subexpression.rb +4 -1
data/lib/regexp_parser/lexer.rb +61 -29
data/lib/regexp_parser/parser.rb +12 -11
data/lib/regexp_parser/scanner/property.rl +1 -1
data/lib/regexp_parser/scanner/scanner.rl +55 -40
data/lib/regexp_parser/scanner.rb +344 -298
data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
data/lib/regexp_parser/syntax/versions.rb +2 -0
data/lib/regexp_parser/version.rb +1 -1
metadata +3 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 66568005494b517613155277c6be4731eb8a26bb9b48a692a9430507286ce583
-  data.tar.gz: d1fc6c6f1a0c7f939c51703ac844c2dbb134f96e0e55780646cb7e3e87d7a652
+  metadata.gz: 04af46818e9d560362fea9b3fd24802b557ac145ed95f6e02580dd7cf5e8ddfc
+  data.tar.gz: 75b7d30241f48ddf90c8cd68228fa928904ab6055ea755f4bdcf28361e645a4b
 SHA512:
-  metadata.gz: b955b2215b71c94497e52841142fab8c2b9930d0d6cea6ea2b3eeb8ed9fe84575e2f34aae3a6051af2b56429f98cf070b9151805f2cb93ddb511ec1e0e50dd7c
-  data.tar.gz: 3a4f083942b66ddb4b67ab33f14bb1c0b724a60c2b30605059d32ce3648e9cb46e31e797b7a526a2028c1e018d73365f5ef955256de4e63397d6ea105714ff12
+  metadata.gz: 407025a9b14af76463260fca2a48f9fef4ab863e3dddf3f7f54101c1348611afa49d9973e850d9e1c84d6e5faf8f1a9d3d2da5dceaefe8dc4fefe7069ecd9280
+  data.tar.gz: 9f3d2eb4264318511a82e9034c4c4a8a8e73e67e427945f0c9f745fd37b2f2f0ae8e30ba942f0920da3109b59436a5518dfc5e2f7669317de0214a0deb6f0e07

data/CHANGELOG.md CHANGED Viewed

@@ -5,7 +5,22 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
-## [Unreleased]
+## [2.7.0] - 2023-02-08 - [Janosch Müller](mailto:janosch84@gmail.com)
+### Added
+- `Regexp::Lexer.lex` now streams tokens when called with a block
+  - it can now take arbitrarily large input, just like `Regexp::Scanner`
+  - this also slightly improves `Regexp::Parser.parse` performance
+  - note: `Regexp::Parser.parse` still does not and will not support streaming
+- improved performance of `Subexpression#each_expression`
+- minor improvements to `Regexp::Scanner` performance
+- overall improvement of parse performance: about 10% for large Regexps
+### Fixed
+- parsing of octal escape sequences in sets, e.g. `[\141]`
+  * thanks to [Randy Stauner](https://github.com/rwstauner) for the report
 ## [2.6.2] - 2023-01-19 - [Janosch Müller](mailto:janosch84@gmail.com)

data/lib/regexp_parser/expression/classes/backreference.rb CHANGED Viewed

@@ -20,6 +20,10 @@ module Regexp::Expression
         super
       end
+      def referential?
+        true
+      end
     end
     class Number < Backreference::Base

data/lib/regexp_parser/expression/classes/conditional.rb CHANGED Viewed

@@ -20,6 +20,10 @@ module Regexp::Expression
         self.referenced_expression = orig.referenced_expression.dup
         super
       end
+      def referential?
+        true
+      end
     end
     class Branch < Regexp::Expression::Sequence; end
@@ -55,6 +59,10 @@ module Regexp::Expression
         condition.reference
       end
+      def referential?
+        true
+      end
       def parts
         [text.dup, condition, *intersperse(branches, '|'), ')']
       end

data/lib/regexp_parser/expression/methods/traverse.rb CHANGED Viewed

@@ -36,11 +36,14 @@ module Regexp::Expression
     # Iterates over the expressions of this expression as an array, passing
     # the expression and its index within its parent to the given block.
-    def each_expression(include_self = false)
+    def each_expression(include_self = false, &block)
       return enum_for(__method__, include_self) unless block_given?
-      traverse(include_self) do |event, exp, index|
-        yield(exp, index) unless event == :exit
+      block.call(self, 0) if include_self
+      each_with_index do |exp, index|
+        block.call(exp, index)
+        exp.each_expression(&block) unless exp.terminal?
       end
     end

data/lib/regexp_parser/expression/sequence.rb CHANGED Viewed

@@ -13,7 +13,6 @@ module Regexp::Expression
           set_level:         exp.set_level,
           conditional_level: params[:conditional_level] || exp.conditional_level,
         )
-        sequence.nesting_level = exp.nesting_level + 1
         sequence.options = active_opts
         exp.expressions << sequence
         sequence

data/lib/regexp_parser/expression/shared.rb CHANGED Viewed

@@ -77,7 +77,11 @@ module Regexp::Expression
     end
     def terminal?
-      !respond_to?(:expressions)
+      true # overridden to be false in Expression::Subexpression
+    end
+    def referential?
+      false # overridden to be true e.g. in Expression::Backreference::Base
     end
     def nesting_level=(lvl)

data/lib/regexp_parser/expression/subexpression.rb CHANGED Viewed

@@ -19,7 +19,6 @@ module Regexp::Expression
       if exp.is_a?(WhiteSpace) && last && last.is_a?(WhiteSpace)
         last.merge(exp)
       else
-        exp.nesting_level = nesting_level + 1
         expressions << exp
       end
     end
@@ -53,6 +52,10 @@ module Regexp::Expression
       )
     end
+    def terminal?
+      false
+    end
     private
     def intersperse(expressions, separator)

data/lib/regexp_parser/lexer.rb CHANGED Viewed

@@ -13,50 +13,68 @@ class Regexp::Lexer
   CONDITION_TOKENS = %i[condition condition_close].freeze
-  def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
-    new.lex(input, syntax, options: options, &block)
+  def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
+    new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
   end
-  def lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
-    syntax = Regexp::Syntax.for(syntax)
+  def lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
+    syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT
+    self.block = block
+    self.collect_tokens = collect_tokens
     self.tokens = []
+    self.prev_token = nil
+    self.preprev_token = nil
     self.nesting = 0
     self.set_nesting = 0
     self.conditional_nesting = 0
     self.shift = 0
-    last = nil
-    Regexp::Scanner.scan(input, options: options) do |type, token, text, ts, te|
+    Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te|
       type, token = *syntax.normalize(type, token)
       syntax.check! type, token
       ascend(type, token)
-      if type == :quantifier and last
-        break_literal(last)        if last.type == :literal
-        break_codepoint_list(last) if last.token == :codepoint_list
+      if (last = prev_token) &&
+         type == :quantifier &&
+         (
+           (last.type == :literal         && (parts = break_literal(last))) ||
+           (last.token == :codepoint_list && (parts = break_codepoint_list(last)))
+         )
+        emit(parts[0])
+        last = parts[1]
       end
       current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
                                   nesting, set_nesting, conditional_nesting)
-      current = merge_condition(current) if type == :conditional and
-        CONDITION_TOKENS.include?(token)
-      last.next = current if last
-      current.previous = last if last
+      if type == :conditional && CONDITION_TOKENS.include?(token)
+        current = merge_condition(current, last)
+      elsif last
+        last.next = current
+        current.previous = last
+        emit(last)
+      end
-      tokens << current
-      last = current
+      self.preprev_token = last
+      self.prev_token = current
       descend(type, token)
     end
-    if block_given?
-      tokens.map { |t| block.call(t) }
+    emit(prev_token) if prev_token
+    collect_tokens ? tokens : nil
+  end
+  def emit(token)
+    if block
+      # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
+      res = block.call(token)
+      tokens << res if collect_tokens
     else
-      tokens
+      tokens << token
     end
   end
@@ -66,7 +84,9 @@ class Regexp::Lexer
   private
-  attr_accessor :tokens, :nesting, :set_nesting, :conditional_nesting, :shift
+  attr_accessor :block,
+                :collect_tokens, :tokens, :prev_token, :preprev_token,
+                :nesting, :set_nesting, :conditional_nesting, :shift
   def ascend(type, token)
     case type
@@ -96,34 +116,46 @@ class Regexp::Lexer
     lead, last, _ = token.text.partition(/.\z/mu)
     return if lead.empty?
-    tokens.pop
-    tokens << Regexp::Token.new(:literal, :literal, lead,
+    token_1 = Regexp::Token.new(:literal, :literal, lead,
               token.ts, (token.te - last.length),
               nesting, set_nesting, conditional_nesting)
-    tokens << Regexp::Token.new(:literal, :literal, last,
+    token_2 = Regexp::Token.new(:literal, :literal, last,
               (token.ts + lead.length), token.te,
               nesting, set_nesting, conditional_nesting)
+    token_1.previous = preprev_token
+    token_1.next = token_2
+    token_2.previous = token_1 # .next will be set by #lex
+    [token_1, token_2]
   end
+  # if a codepoint list is followed by a quantifier, that quantifier applies
+  # to the last codepoint, e.g. /\u{61 62 63}{3}/ =~ 'abccc'
+  # c.f. #break_literal.
   def break_codepoint_list(token)
     lead, _, tail = token.text.rpartition(' ')
     return if lead.empty?
-    tokens.pop
-    tokens << Regexp::Token.new(:escape, :codepoint_list, lead + '}',
+    token_1 = Regexp::Token.new(:escape, :codepoint_list, lead + '}',
               token.ts, (token.te - tail.length),
               nesting, set_nesting, conditional_nesting)
-    tokens << Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
+    token_2 = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
               (token.ts + lead.length + 1), (token.te + 3),
               nesting, set_nesting, conditional_nesting)
     self.shift = shift + 3 # one space less, but extra \, u, {, and }
+    token_1.previous = preprev_token
+    token_1.next = token_2
+    token_2.previous = token_1 # .next will be set by #lex
+    [token_1, token_2]
   end
-  def merge_condition(current)
-    last = tokens.pop
-    Regexp::Token.new(:conditional, :condition, last.text + current.text,
+  def merge_condition(current, last)
+    token = Regexp::Token.new(:conditional, :condition, last.text + current.text,
       last.ts, current.te, nesting, set_nesting, conditional_nesting)
+    token.previous = preprev_token # .next will be set by #lex
+    token
   end
 end # module Regexp::Lexer

data/lib/regexp_parser/parser.rb CHANGED Viewed

@@ -18,11 +18,11 @@ class Regexp::Parser
     end
   end
-  def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
+  def self.parse(input, syntax = nil, options: nil, &block)
     new.parse(input, syntax, options: options, &block)
   end
-  def parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
+  def parse(input, syntax = nil, options: nil, &block)
     root = Root.construct(options: extract_options(input, options))
     self.root = root
@@ -35,7 +35,7 @@ class Regexp::Parser
     self.captured_group_counts = Hash.new(0)
-    Regexp::Lexer.scan(input, syntax, options: options) do |token|
+    Regexp::Lexer.scan(input, syntax, options: options, collect_tokens: false) do |token|
       parse_token(token)
     end
@@ -379,7 +379,7 @@ class Regexp::Parser
   end
   def sequence_operation(klass, token)
-    unless node.is_a?(klass)
+    unless node.instance_of?(klass)
       operator = klass.new(token, active_opts)
       sequence = operator.add_sequence(active_opts)
       sequence.expressions = node.expressions
@@ -541,7 +541,7 @@ class Regexp::Parser
   def range(token)
     exp = CharacterSet::Range.new(token, active_opts)
-    scope = node.last.is_a?(CharacterSet::IntersectedSequence) ? node.last : node
+    scope = node.last.instance_of?(CharacterSet::IntersectedSequence) ? node.last : node
     exp << scope.expressions.pop
     nest(exp)
   end
@@ -568,7 +568,7 @@ class Regexp::Parser
   end
   def close_completed_character_set_range
-    decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
+    decrease_nesting if node.instance_of?(CharacterSet::Range) && node.complete?
   end
   def active_opts
@@ -579,15 +579,16 @@ class Regexp::Parser
   # an instance of Backreference::Number, its #referenced_expression is set to
   # the instance of Group::Capture that it refers to via its number.
   def assign_referenced_expressions
-    # find all referencable expressions
+    # find all referencable and refering expressions
     targets = { 0 => root }
+    referrers = []
     root.each_expression do |exp|
       exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
+      referrers << exp if exp.referential?
     end
-    # assign them to any refering expressions
-    root.each_expression do |exp|
-      next unless exp.respond_to?(:reference)
+    # assign reference expression to refering expressions
+    # (in a second iteration because there might be forward references)
+    referrers.each do |exp|
       exp.referenced_expression = targets[exp.reference] ||
         raise(ParserError, "Invalid reference: #{exp.reference}")
     end

data/lib/regexp_parser/scanner/property.rl CHANGED Viewed

@@ -17,7 +17,7 @@
       text = copy(data, ts-1, te)
       type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
-      name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
+      name = text[3..-2].gsub(/[\^\s_\-]/, '').downcase
       token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
       validation_error(:property, name) unless token

data/lib/regexp_parser/scanner/scanner.rl CHANGED Viewed

@@ -59,9 +59,6 @@
   one_or_more           = '+' | '+?' | '++';
   quantifier_greedy     = '?'  | '*'  | '+';
-  quantifier_reluctant  = '??' | '*?' | '+?';
-  quantifier_possessive = '?+' | '*+' | '++';
-  quantifier_mode       = '?'  | '+';
   quantity_exact        = (digit+);
   quantity_minimum      = (digit+) . ',';
@@ -70,9 +67,6 @@
   quantifier_interval   = range_open . ( quantity_exact | quantity_minimum |
                           quantity_maximum | quantity_range ) . range_close;
-  quantifiers           = quantifier_greedy | quantifier_reluctant |
-                          quantifier_possessive | quantifier_interval;
   conditional           = '(?(';
   group_comment         = '?#' . [^)]* . group_close;
@@ -132,7 +126,8 @@
                           keep_mark | sequence_char;
   # escapes that also work within a character set
-  set_escape            = backslash | brackets | escaped_ascii | property_char |
+  set_escape            = backslash | brackets | escaped_ascii |
+                          octal_sequence | property_char |
                           sequence_char | single_codepoint_char_type;
@@ -168,8 +163,8 @@
     };
     '-]' @set_closed { # special case, emits two tokens
-      emit(:literal, :literal, copy(data, ts, te-1))
-      emit(:set, :close, copy(data, ts+1, te))
+      emit(:literal, :literal, '-')
+      emit(:set, :close, ']')
       if in_set?
         fret;
       else
@@ -183,28 +178,27 @@
     };
     '^' {
-      text = copy(data, ts, te)
-      if tokens.last[1] == :open
-        emit(:set, :negate, text)
+      if prev_token[1] == :open
+        emit(:set, :negate, '^')
       else
-        emit(:literal, :literal, text)
+        emit(:literal, :literal, '^')
       end
     };
     '-' {
-      text = copy(data, ts, te)
-      # ranges cant start with a subset or intersection/negation/range operator
-      if tokens.last[0] == :set
-        emit(:literal, :literal, text)
+      # ranges cant start with the opening bracket, a subset, or
+      # intersection/negation/range operators
+      if prev_token[0] == :set
+        emit(:literal, :literal, '-')
       else
-        emit(:set, :range, text)
+        emit(:set, :range, '-')
       end
     };
     # Unlike ranges, intersections can start or end at set boundaries, whereupon
     # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
     '&&' {
-      emit(:set, :intersection, copy(data, ts, te))
+      emit(:set, :intersection, '&&')
     };
     backslash {
@@ -212,7 +206,7 @@
     };
     set_open >(open_bracket, 1) >set_opened {
-      emit(:set, :open, copy(data, ts, te))
+      emit(:set, :open, '[')
       fcall character_set;
     };
@@ -254,12 +248,22 @@
   # set escapes scanner
   # --------------------------------------------------------------------------
   set_escape_sequence := |*
+    # Special case: in sets, octal sequences have higher priority than backrefs
+    octal_sequence {
+      emit(:escape, :octal, copy(data, ts-1, te))
+      fret;
+    };
+    # Scan all other escapes that work in sets with the generic escape scanner
     set_escape > (escaped_set_alpha, 2) {
       fhold;
       fnext character_set;
       fcall escape_sequence;
     };
+    # Treat all remaining escapes - those not supported in sets - as literal.
+    # (This currently includes \^, \-, \&, \:, although these could potentially
+    # be meta chars when not escaped, depending on their position in the set.)
     any > (escaped_set_alpha, 1) {
       emit(:escape, :literal, copy(data, ts-1, te))
       fret;
@@ -528,7 +532,7 @@
     group_close @group_closed {
       if conditional_stack.last == group_depth + 1
         conditional_stack.pop
-        emit(:conditional, :close, copy(data, ts, te))
+        emit(:conditional, :close, ')')
       else
         if spacing_stack.length > 1 &&
            spacing_stack.last[:depth] == group_depth + 1
@@ -536,7 +540,7 @@
           self.free_spacing = spacing_stack.last[:free_spacing]
         end
-        emit(:group, :close, copy(data, ts, te))
+        emit(:group, :close, ')')
       end
     };
@@ -717,23 +721,24 @@ class Regexp::Scanner
   #
   # This method may raise errors if a syntax error is encountered.
   # --------------------------------------------------------------------------
-  def self.scan(input_object, options: nil, &block)
-    new.scan(input_object, options: options, &block)
+  def self.scan(input_object, options: nil, collect_tokens: true, &block)
+    new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
   end
-  def scan(input_object, options: nil, &block)
-    self.literal = nil
+  def scan(input_object, options: nil, collect_tokens: true, &block)
+    self.collect_tokens = collect_tokens
+    self.literal_run = nil
     stack = []
     input = input_object.is_a?(Regexp) ? input_object.source : input_object
     self.free_spacing = free_spacing?(input_object, options)
     self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
-    data  = input.unpack("c*") if input.is_a?(String)
+    data  = input.unpack("c*")
     eof   = data.length
     self.tokens = []
-    self.block  = block_given? ? block : nil
+    self.block  = block
     self.set_depth = 0
     self.group_depth = 0
@@ -758,7 +763,7 @@ class Regexp::Scanner
           "[#{set_depth}]") if in_set?
     # when the entire expression is a literal run
-    emit_literal if literal
+    emit_literal if literal_run
     tokens
   end
@@ -785,26 +790,37 @@ class Regexp::Scanner
   def emit(type, token, text)
     #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
-    emit_literal if literal
+    emit_literal if literal_run
     # Ragel runs with byte-based indices (ts, te). These are of little value to
     # end-users, so we keep track of char-based indices and emit those instead.
     ts_char_pos = char_pos
     te_char_pos = char_pos + text.length
-    if block
-      block.call type, token, text, ts_char_pos, te_char_pos
-    end
+    tok = [type, token, text, ts_char_pos, te_char_pos]
-    tokens << [type, token, text, ts_char_pos, te_char_pos]
+    self.prev_token = tok
     self.char_pos = te_char_pos
+    if block
+      block.call type, token, text, ts_char_pos, te_char_pos
+      # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
+      tokens << tok if collect_tokens
+    elsif collect_tokens
+      tokens << tok
+    end
   end
+  attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
   private
-  attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
-                :group_depth, :set_depth, :conditional_stack, :char_pos
+  attr_accessor :block,
+                :collect_tokens, :tokens, :prev_token,
+                :free_spacing, :spacing_stack,
+                :group_depth, :set_depth, :conditional_stack,
+                :char_pos
   def free_spacing?(input_object, options)
     if options && !input_object.is_a?(String)
@@ -834,14 +850,13 @@ class Regexp::Scanner
   # Appends one or more characters to the literal buffer, to be emitted later
   # by a call to emit_literal.
   def append_literal(data, ts, te)
-    self.literal = literal || []
-    literal << copy(data, ts, te)
+    (self.literal_run ||= []) << copy(data, ts, te)
   end
   # Emits the literal run collected by calls to the append_literal method.
   def emit_literal
-    text = literal.join
-    self.literal = nil
+    text = literal_run.join
+    self.literal_run = nil
     emit(:literal, :literal, text)
   end