regexp_parser 2.6.2 → 2.8.0

Sign up to get free protection for your applications and access to all the features.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +67 -0
  3. data/Gemfile +2 -2
  4. data/README.md +32 -29
  5. data/lib/regexp_parser/expression/base.rb +0 -7
  6. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  7. data/lib/regexp_parser/expression/classes/backreference.rb +4 -2
  8. data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
  9. data/lib/regexp_parser/expression/classes/character_set.rb +3 -4
  10. data/lib/regexp_parser/expression/classes/conditional.rb +2 -6
  11. data/lib/regexp_parser/expression/classes/escape_sequence.rb +3 -1
  12. data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
  13. data/lib/regexp_parser/expression/classes/group.rb +0 -22
  14. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
  15. data/lib/regexp_parser/expression/classes/unicode_property.rb +5 -2
  16. data/lib/regexp_parser/expression/methods/construct.rb +2 -4
  17. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  18. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  19. data/lib/regexp_parser/expression/methods/tests.rb +40 -3
  20. data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
  21. data/lib/regexp_parser/expression/quantifier.rb +30 -17
  22. data/lib/regexp_parser/expression/sequence.rb +5 -10
  23. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  24. data/lib/regexp_parser/expression/shared.rb +37 -20
  25. data/lib/regexp_parser/expression/subexpression.rb +20 -15
  26. data/lib/regexp_parser/expression.rb +2 -0
  27. data/lib/regexp_parser/lexer.rb +76 -36
  28. data/lib/regexp_parser/parser.rb +97 -97
  29. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  30. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  31. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  32. data/lib/regexp_parser/scanner/mapping.rb +89 -0
  33. data/lib/regexp_parser/scanner/property.rl +2 -2
  34. data/lib/regexp_parser/scanner/scanner.rl +90 -169
  35. data/lib/regexp_parser/scanner.rb +1157 -1330
  36. data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
  37. data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
  38. data/lib/regexp_parser/syntax/token/escape.rb +3 -1
  39. data/lib/regexp_parser/syntax/token/meta.rb +9 -2
  40. data/lib/regexp_parser/syntax/token/unicode_property.rb +3 -0
  41. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  42. data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
  43. data/lib/regexp_parser/syntax/versions.rb +2 -0
  44. data/lib/regexp_parser/version.rb +1 -1
  45. metadata +10 -3
@@ -8,14 +8,10 @@ module Regexp::Expression
8
8
 
9
9
  MODES = %i[greedy possessive reluctant]
10
10
 
11
- attr_reader :min, :max, :mode
12
-
13
11
  def initialize(*args)
14
12
  deprecated_old_init(*args) and return if args.count == 4 || args.count == 5
15
13
 
16
14
  init_from_token_and_options(*args)
17
- @mode = (token.to_s[/greedy|reluctant|possessive/] || :greedy).to_sym
18
- @min, @max = minmax
19
15
  # TODO: remove in v3.0.0, stop removing parts of #token (?)
20
16
  self.token = token.to_s.sub(/_(greedy|possessive|reluctant)/, '').to_sym
21
17
  end
@@ -39,9 +35,21 @@ module Regexp::Expression
39
35
  end
40
36
  alias :lazy? :reluctant?
41
37
 
38
+ def min
39
+ derived_data[:min]
40
+ end
41
+
42
+ def max
43
+ derived_data[:max]
44
+ end
45
+
46
+ def mode
47
+ derived_data[:mode]
48
+ end
49
+
42
50
  private
43
51
 
44
- def deprecated_old_init(token, text, min, max, mode = :greedy)
52
+ def deprecated_old_init(token, text, _min, _max, _mode = :greedy)
45
53
  warn "Calling `Expression::Base#quantify` or `#{self.class}.new` with 4+ arguments "\
46
54
  "is deprecated.\nIt will no longer be supported in regexp_parser v3.0.0.\n"\
47
55
  "Please pass a Regexp::Token instead, e.g. replace `token, text, min, max, mode` "\
@@ -51,20 +59,25 @@ module Regexp::Expression
51
59
  "This is consistent with how Expression::Base instances are created. "
52
60
  @token = token
53
61
  @text = text
54
- @min = min
55
- @max = max
56
- @mode = mode
57
62
  end
58
63
 
59
- def minmax
60
- case token
61
- when /zero_or_one/ then [0, 1]
62
- when /zero_or_more/ then [0, -1]
63
- when /one_or_more/ then [1, -1]
64
- when :interval
65
- int_min = text[/\{(\d*)/, 1]
66
- int_max = text[/,?(\d*)\}/, 1]
67
- [int_min.to_i, (int_max.empty? ? -1 : int_max.to_i)]
64
+ def derived_data
65
+ @derived_data ||= begin
66
+ min, max =
67
+ case text[0]
68
+ when '?'; [0, 1]
69
+ when '*'; [0, -1]
70
+ when '+'; [1, -1]
71
+ else
72
+ int_min = text[/\{(\d*)/, 1]
73
+ int_max = text[/,?(\d*)\}/, 1]
74
+ [int_min.to_i, (int_max.empty? ? -1 : int_max.to_i)]
75
+ end
76
+
77
+ mod = text[/.([?+])/, 1]
78
+ mode = (mod == '?' && :reluctant) || (mod == '+' && :possessive) || :greedy
79
+
80
+ { min: min, max: max, mode: mode }
68
81
  end
69
82
  end
70
83
  end
@@ -12,25 +12,20 @@ module Regexp::Expression
12
12
  level: exp.level,
13
13
  set_level: exp.set_level,
14
14
  conditional_level: params[:conditional_level] || exp.conditional_level,
15
+ ts: params[:ts],
15
16
  )
16
- sequence.nesting_level = exp.nesting_level + 1
17
17
  sequence.options = active_opts
18
18
  exp.expressions << sequence
19
19
  sequence
20
20
  end
21
21
  end
22
22
 
23
- def starts_at
24
- expressions.first.starts_at
23
+ def ts
24
+ (head = expressions.first) ? head.ts : @ts
25
25
  end
26
- alias :ts :starts_at
27
26
 
28
- def quantify(*args)
29
- target = expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
30
- target or raise Regexp::Parser::Error,
31
- "No valid target found for '#{text}' quantifier"
32
-
33
- target.quantify(*args)
27
+ def quantify(token, *args)
28
+ extract_quantifier_target(token.text).quantify(token, *args)
34
29
  end
35
30
  end
36
31
  end
@@ -5,21 +5,16 @@ module Regexp::Expression
5
5
  alias :operands :expressions
6
6
  alias :operator :text
7
7
 
8
- def starts_at
9
- expressions.first.starts_at
8
+ def ts
9
+ (head = expressions.first) ? head.ts : @ts
10
10
  end
11
- alias :ts :starts_at
12
11
 
13
12
  def <<(exp)
14
13
  expressions.last << exp
15
14
  end
16
15
 
17
- def add_sequence(active_opts = {})
18
- self.class::OPERAND.add_to(self, {}, active_opts)
19
- end
20
-
21
- def parts
22
- intersperse(expressions, text.dup)
16
+ def add_sequence(active_opts = {}, params = { ts: 0 })
17
+ self.class::OPERAND.add_to(self, params, active_opts)
23
18
  end
24
19
  end
25
20
  end
@@ -8,7 +8,8 @@ module Regexp::Expression
8
8
 
9
9
  attr_accessor :type, :token, :text, :ts, :te,
10
10
  :level, :set_level, :conditional_level,
11
- :options
11
+ :options, :parent,
12
+ :custom_to_s_handling, :pre_quantifier_decorations
12
13
 
13
14
  attr_reader :nesting_level, :quantifier
14
15
  end
@@ -32,6 +33,10 @@ module Regexp::Expression
32
33
  self.text = orig.text.dup if orig.text
33
34
  self.options = orig.options.dup if orig.options
34
35
  self.quantifier = orig.quantifier.clone if orig.quantifier
36
+ self.parent = nil # updated by Subexpression#initialize_copy
37
+ if orig.pre_quantifier_decorations
38
+ self.pre_quantifier_decorations = orig.pre_quantifier_decorations.map(&:dup)
39
+ end
35
40
  super
36
41
  end
37
42
 
@@ -39,35 +44,51 @@ module Regexp::Expression
39
44
  ts
40
45
  end
41
46
 
47
+ def ends_at(include_quantifier = true)
48
+ ts + (include_quantifier ? full_length : base_length)
49
+ end
50
+
42
51
  def base_length
43
52
  to_s(:base).length
44
53
  end
45
54
 
46
55
  def full_length
47
- to_s.length
48
- end
49
-
56
+ to_s(:original).length
57
+ end
58
+
59
+ # #to_s reproduces the original source, as an unparser would.
60
+ #
61
+ # It takes an optional format argument.
62
+ #
63
+ # Example:
64
+ #
65
+ # lit = Regexp::Parser.parse(/a +/x)[0]
66
+ #
67
+ # lit.to_s # => 'a+' # default; with quantifier
68
+ # lit.to_s(:full) # => 'a+' # default; with quantifier
69
+ # lit.to_s(:base) # => 'a' # without quantifier
70
+ # lit.to_s(:original) # => 'a +' # with quantifier AND intermittent decorations
71
+ #
50
72
  def to_s(format = :full)
51
- "#{parts.join}#{quantifier_affix(format)}"
73
+ base = parts.each_with_object(''.dup) do |part, buff|
74
+ if part.instance_of?(String)
75
+ buff << part
76
+ elsif !part.custom_to_s_handling
77
+ buff << part.to_s(:original)
78
+ end
79
+ end
80
+ "#{base}#{pre_quantifier_decoration(format)}#{quantifier_affix(format)}"
52
81
  end
53
82
  alias :to_str :to_s
54
83
 
55
- def parts
56
- [text.dup]
84
+ def pre_quantifier_decoration(expression_format = :original)
85
+ pre_quantifier_decorations.to_a.join if expression_format == :original
57
86
  end
58
87
 
59
- def quantifier_affix(expression_format)
88
+ def quantifier_affix(expression_format = :full)
60
89
  quantifier.to_s if quantified? && expression_format != :base
61
90
  end
62
91
 
63
- def quantified?
64
- !quantifier.nil?
65
- end
66
-
67
- def optional?
68
- quantified? && quantifier.min == 0
69
- end
70
-
71
92
  def offset
72
93
  [starts_at, full_length]
73
94
  end
@@ -76,10 +97,6 @@ module Regexp::Expression
76
97
  '@%d+%d' % offset
77
98
  end
78
99
 
79
- def terminal?
80
- !respond_to?(:expressions)
81
- end
82
-
83
100
  def nesting_level=(lvl)
84
101
  @nesting_level = lvl
85
102
  quantifier && quantifier.nesting_level = lvl
@@ -11,17 +11,15 @@ module Regexp::Expression
11
11
 
12
12
  # Override base method to clone the expressions as well.
13
13
  def initialize_copy(orig)
14
- self.expressions = orig.expressions.map(&:clone)
14
+ self.expressions = orig.expressions.map do |exp|
15
+ exp.clone.tap { |copy| copy.parent = self }
16
+ end
15
17
  super
16
18
  end
17
19
 
18
20
  def <<(exp)
19
- if exp.is_a?(WhiteSpace) && last && last.is_a?(WhiteSpace)
20
- last.merge(exp)
21
- else
22
- exp.nesting_level = nesting_level + 1
23
- expressions << exp
24
- end
21
+ exp.parent = self
22
+ expressions << exp
25
23
  end
26
24
 
27
25
  %w[[] at each empty? fetch index join last length values_at].each do |method|
@@ -39,11 +37,7 @@ module Regexp::Expression
39
37
  end
40
38
 
41
39
  def te
42
- ts + to_s.length
43
- end
44
-
45
- def parts
46
- expressions
40
+ ts + base_length
47
41
  end
48
42
 
49
43
  def to_h
@@ -53,10 +47,21 @@ module Regexp::Expression
53
47
  )
54
48
  end
55
49
 
56
- private
50
+ def extract_quantifier_target(quantifier_description)
51
+ pre_quantifier_decorations = []
52
+ target = expressions.reverse.find do |exp|
53
+ if exp.decorative?
54
+ exp.custom_to_s_handling = true
55
+ pre_quantifier_decorations << exp.text
56
+ next
57
+ end
58
+ exp
59
+ end
60
+ target or raise Regexp::Parser::ParserError,
61
+ "No valid target found for '#{quantifier_description}' quantifier"
57
62
 
58
- def intersperse(expressions, separator)
59
- expressions.flat_map { |exp| [exp, separator] }.slice(0...-1)
63
+ target.pre_quantifier_decorations = pre_quantifier_decorations
64
+ target
60
65
  end
61
66
  end
62
67
  end
@@ -29,6 +29,8 @@ require 'regexp_parser/expression/methods/human_name'
29
29
  require 'regexp_parser/expression/methods/match'
30
30
  require 'regexp_parser/expression/methods/match_length'
31
31
  require 'regexp_parser/expression/methods/options'
32
+ require 'regexp_parser/expression/methods/parts'
33
+ require 'regexp_parser/expression/methods/printing'
32
34
  require 'regexp_parser/expression/methods/strfregexp'
33
35
  require 'regexp_parser/expression/methods/tests'
34
36
  require 'regexp_parser/expression/methods/traverse'
@@ -6,57 +6,75 @@ class Regexp::Lexer
6
6
 
7
7
  OPENING_TOKENS = %i[
8
8
  capture passive lookahead nlookahead lookbehind nlookbehind
9
- atomic options options_switch named absence
9
+ atomic options options_switch named absence open
10
10
  ].freeze
11
11
 
12
12
  CLOSING_TOKENS = %i[close].freeze
13
13
 
14
14
  CONDITION_TOKENS = %i[condition condition_close].freeze
15
15
 
16
- def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
17
- new.lex(input, syntax, options: options, &block)
16
+ def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
17
+ new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
18
18
  end
19
19
 
20
- def lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
21
- syntax = Regexp::Syntax.for(syntax)
20
+ def lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
21
+ syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT
22
22
 
23
+ self.block = block
24
+ self.collect_tokens = collect_tokens
23
25
  self.tokens = []
26
+ self.prev_token = nil
27
+ self.preprev_token = nil
24
28
  self.nesting = 0
25
29
  self.set_nesting = 0
26
30
  self.conditional_nesting = 0
27
31
  self.shift = 0
28
32
 
29
- last = nil
30
- Regexp::Scanner.scan(input, options: options) do |type, token, text, ts, te|
33
+ Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te|
31
34
  type, token = *syntax.normalize(type, token)
32
35
  syntax.check! type, token
33
36
 
34
37
  ascend(type, token)
35
38
 
36
- if type == :quantifier and last
37
- break_literal(last) if last.type == :literal
38
- break_codepoint_list(last) if last.token == :codepoint_list
39
+ if (last = prev_token) &&
40
+ type == :quantifier &&
41
+ (
42
+ (last.type == :literal && (parts = break_literal(last))) ||
43
+ (last.token == :codepoint_list && (parts = break_codepoint_list(last)))
44
+ )
45
+ emit(parts[0])
46
+ last = parts[1]
39
47
  end
40
48
 
41
49
  current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
42
50
  nesting, set_nesting, conditional_nesting)
43
51
 
44
- current = merge_condition(current) if type == :conditional and
45
- CONDITION_TOKENS.include?(token)
46
-
47
- last.next = current if last
48
- current.previous = last if last
52
+ if type == :conditional && CONDITION_TOKENS.include?(token)
53
+ current = merge_condition(current, last)
54
+ elsif last
55
+ last.next = current
56
+ current.previous = last
57
+ emit(last)
58
+ end
49
59
 
50
- tokens << current
51
- last = current
60
+ self.preprev_token = last
61
+ self.prev_token = current
52
62
 
53
63
  descend(type, token)
54
64
  end
55
65
 
56
- if block_given?
57
- tokens.map { |t| block.call(t) }
66
+ emit(prev_token) if prev_token
67
+
68
+ collect_tokens ? tokens : nil
69
+ end
70
+
71
+ def emit(token)
72
+ if block
73
+ # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
74
+ res = block.call(token)
75
+ tokens << res if collect_tokens
58
76
  else
59
- tokens
77
+ tokens << token
60
78
  end
61
79
  end
62
80
 
@@ -66,27 +84,37 @@ class Regexp::Lexer
66
84
 
67
85
  private
68
86
 
69
- attr_accessor :tokens, :nesting, :set_nesting, :conditional_nesting, :shift
87
+ attr_accessor :block,
88
+ :collect_tokens, :tokens, :prev_token, :preprev_token,
89
+ :nesting, :set_nesting, :conditional_nesting, :shift
70
90
 
71
91
  def ascend(type, token)
92
+ return unless CLOSING_TOKENS.include?(token)
93
+
72
94
  case type
73
95
  when :group, :assertion
74
- self.nesting = nesting - 1 if CLOSING_TOKENS.include?(token)
96
+ self.nesting = nesting - 1
75
97
  when :set
76
- self.set_nesting = set_nesting - 1 if token == :close
98
+ self.set_nesting = set_nesting - 1
77
99
  when :conditional
78
- self.conditional_nesting = conditional_nesting - 1 if token == :close
100
+ self.conditional_nesting = conditional_nesting - 1
101
+ else
102
+ raise "unhandled nesting type #{type}"
79
103
  end
80
104
  end
81
105
 
82
106
  def descend(type, token)
107
+ return unless OPENING_TOKENS.include?(token)
108
+
83
109
  case type
84
110
  when :group, :assertion
85
- self.nesting = nesting + 1 if OPENING_TOKENS.include?(token)
111
+ self.nesting = nesting + 1
86
112
  when :set
87
- self.set_nesting = set_nesting + 1 if token == :open
113
+ self.set_nesting = set_nesting + 1
88
114
  when :conditional
89
- self.conditional_nesting = conditional_nesting + 1 if token == :open
115
+ self.conditional_nesting = conditional_nesting + 1
116
+ else
117
+ raise "unhandled nesting type #{type}"
90
118
  end
91
119
  end
92
120
 
@@ -96,34 +124,46 @@ class Regexp::Lexer
96
124
  lead, last, _ = token.text.partition(/.\z/mu)
97
125
  return if lead.empty?
98
126
 
99
- tokens.pop
100
- tokens << Regexp::Token.new(:literal, :literal, lead,
127
+ token_1 = Regexp::Token.new(:literal, :literal, lead,
101
128
  token.ts, (token.te - last.length),
102
129
  nesting, set_nesting, conditional_nesting)
103
- tokens << Regexp::Token.new(:literal, :literal, last,
130
+ token_2 = Regexp::Token.new(:literal, :literal, last,
104
131
  (token.ts + lead.length), token.te,
105
132
  nesting, set_nesting, conditional_nesting)
133
+
134
+ token_1.previous = preprev_token
135
+ token_1.next = token_2
136
+ token_2.previous = token_1 # .next will be set by #lex
137
+ [token_1, token_2]
106
138
  end
107
139
 
140
+ # if a codepoint list is followed by a quantifier, that quantifier applies
141
+ # to the last codepoint, e.g. /\u{61 62 63}{3}/ =~ 'abccc'
142
+ # c.f. #break_literal.
108
143
  def break_codepoint_list(token)
109
144
  lead, _, tail = token.text.rpartition(' ')
110
145
  return if lead.empty?
111
146
 
112
- tokens.pop
113
- tokens << Regexp::Token.new(:escape, :codepoint_list, lead + '}',
147
+ token_1 = Regexp::Token.new(:escape, :codepoint_list, lead + '}',
114
148
  token.ts, (token.te - tail.length),
115
149
  nesting, set_nesting, conditional_nesting)
116
- tokens << Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
150
+ token_2 = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
117
151
  (token.ts + lead.length + 1), (token.te + 3),
118
152
  nesting, set_nesting, conditional_nesting)
119
153
 
120
154
  self.shift = shift + 3 # one space less, but extra \, u, {, and }
155
+
156
+ token_1.previous = preprev_token
157
+ token_1.next = token_2
158
+ token_2.previous = token_1 # .next will be set by #lex
159
+ [token_1, token_2]
121
160
  end
122
161
 
123
- def merge_condition(current)
124
- last = tokens.pop
125
- Regexp::Token.new(:conditional, :condition, last.text + current.text,
162
+ def merge_condition(current, last)
163
+ token = Regexp::Token.new(:conditional, :condition, last.text + current.text,
126
164
  last.ts, current.te, nesting, set_nesting, conditional_nesting)
165
+ token.previous = preprev_token # .next will be set by #lex
166
+ token
127
167
  end
128
168
 
129
169
  end # module Regexp::Lexer