RubyGems - regexp_parser - Versions diffs - 2.7.0 → 2.8.1 - Mend

regexp_parser 2.7.0 → 2.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +62 -3
data/Gemfile +3 -3
data/LICENSE +1 -1
data/README.md +33 -30
data/lib/regexp_parser/expression/base.rb +0 -7
data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
data/lib/regexp_parser/expression/classes/backreference.rb +4 -6
data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
data/lib/regexp_parser/expression/classes/character_set.rb +3 -4
data/lib/regexp_parser/expression/classes/conditional.rb +2 -14
data/lib/regexp_parser/expression/classes/escape_sequence.rb +3 -1
data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
data/lib/regexp_parser/expression/classes/group.rb +0 -22
data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
data/lib/regexp_parser/expression/classes/unicode_property.rb +5 -2
data/lib/regexp_parser/expression/methods/construct.rb +2 -4
data/lib/regexp_parser/expression/methods/parts.rb +23 -0
data/lib/regexp_parser/expression/methods/printing.rb +26 -0
data/lib/regexp_parser/expression/methods/tests.rb +40 -3
data/lib/regexp_parser/expression/methods/traverse.rb +33 -20
data/lib/regexp_parser/expression/quantifier.rb +30 -17
data/lib/regexp_parser/expression/sequence.rb +5 -9
data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
data/lib/regexp_parser/expression/shared.rb +37 -24
data/lib/regexp_parser/expression/subexpression.rb +20 -18
data/lib/regexp_parser/expression.rb +2 -0
data/lib/regexp_parser/lexer.rb +15 -7
data/lib/regexp_parser/parser.rb +85 -86
data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
data/lib/regexp_parser/scanner/properties/long.csv +11 -0
data/lib/regexp_parser/scanner/properties/short.csv +2 -0
data/lib/regexp_parser/scanner/property.rl +1 -1
data/lib/regexp_parser/scanner/scanner.rl +35 -129
data/lib/regexp_parser/scanner.rb +1084 -1303
data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
data/lib/regexp_parser/syntax/token/escape.rb +3 -1
data/lib/regexp_parser/syntax/token/meta.rb +9 -2
data/lib/regexp_parser/syntax/token/unicode_property.rb +17 -1
data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
data/lib/regexp_parser/version.rb +1 -1
metadata +9 -3

data/lib/regexp_parser/expression/quantifier.rb CHANGED Viewed

@@ -8,14 +8,10 @@ module Regexp::Expression
     MODES = %i[greedy possessive reluctant]
-    attr_reader :min, :max, :mode
     def initialize(*args)
       deprecated_old_init(*args) and return if args.count == 4 || args.count == 5
       init_from_token_and_options(*args)
-      @mode = (token.to_s[/greedy|reluctant|possessive/] || :greedy).to_sym
-      @min, @max = minmax
       # TODO: remove in v3.0.0, stop removing parts of #token (?)
       self.token = token.to_s.sub(/_(greedy|possessive|reluctant)/, '').to_sym
     end
@@ -39,9 +35,21 @@ module Regexp::Expression
     end
     alias :lazy? :reluctant?
+    def min
+      derived_data[:min]
+    end
+    def max
+      derived_data[:max]
+    end
+    def mode
+      derived_data[:mode]
+    end
     private
-    def deprecated_old_init(token, text, min, max, mode = :greedy)
+    def deprecated_old_init(token, text, _min, _max, _mode = :greedy)
       warn "Calling `Expression::Base#quantify` or `#{self.class}.new` with 4+ arguments "\
            "is deprecated.\nIt will no longer be supported in regexp_parser v3.0.0.\n"\
            "Please pass a Regexp::Token instead, e.g. replace `token, text, min, max, mode` "\
@@ -51,20 +59,25 @@ module Regexp::Expression
            "This is consistent with how Expression::Base instances are created. "
       @token = token
       @text  = text
-      @min   = min
-      @max   = max
-      @mode  = mode
     end
-    def minmax
-      case token
-      when /zero_or_one/  then [0, 1]
-      when /zero_or_more/ then [0, -1]
-      when /one_or_more/  then [1, -1]
-      when :interval
-        int_min = text[/\{(\d*)/, 1]
-        int_max = text[/,?(\d*)\}/, 1]
-        [int_min.to_i, (int_max.empty? ? -1 : int_max.to_i)]
+    def derived_data
+      @derived_data ||= begin
+        min, max =
+          case text[0]
+          when '?'; [0, 1]
+          when '*'; [0, -1]
+          when '+'; [1, -1]
+          else
+            int_min = text[/\{(\d*)/, 1]
+            int_max = text[/,?(\d*)\}/, 1]
+            [int_min.to_i, (int_max.empty? ? -1 : int_max.to_i)]
+          end
+        mod = text[/.([?+])/, 1]
+        mode = (mod == '?' && :reluctant) || (mod == '+' && :possessive) || :greedy
+        { min: min, max: max, mode: mode }
       end
     end
   end

data/lib/regexp_parser/expression/sequence.rb CHANGED Viewed

@@ -12,6 +12,7 @@ module Regexp::Expression
           level:             exp.level,
           set_level:         exp.set_level,
           conditional_level: params[:conditional_level] || exp.conditional_level,
+          ts:                params[:ts],
         )
         sequence.options = active_opts
         exp.expressions << sequence
@@ -19,17 +20,12 @@ module Regexp::Expression
       end
     end
-    def starts_at
-      expressions.first.starts_at
+    def ts
+      (head = expressions.first) ? head.ts : @ts
     end
-    alias :ts :starts_at
-    def quantify(*args)
-      target = expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
-      target or raise Regexp::Parser::Error,
-        "No valid target found for '#{text}' quantifier"
-      target.quantify(*args)
+    def quantify(token, *args)
+      extract_quantifier_target(token.text).quantify(token, *args)
     end
   end
 end

data/lib/regexp_parser/expression/sequence_operation.rb CHANGED Viewed

@@ -5,21 +5,16 @@ module Regexp::Expression
     alias :operands :expressions
     alias :operator :text
-    def starts_at
-      expressions.first.starts_at
+    def ts
+      (head = expressions.first) ? head.ts : @ts
     end
-    alias :ts :starts_at
     def <<(exp)
       expressions.last << exp
     end
-    def add_sequence(active_opts = {})
-      self.class::OPERAND.add_to(self, {}, active_opts)
-    end
-    def parts
-      intersperse(expressions, text.dup)
+    def add_sequence(active_opts = {}, params = { ts: 0 })
+      self.class::OPERAND.add_to(self, params, active_opts)
     end
   end
 end

data/lib/regexp_parser/expression/shared.rb CHANGED Viewed

@@ -8,7 +8,8 @@ module Regexp::Expression
         attr_accessor :type, :token, :text, :ts, :te,
                       :level, :set_level, :conditional_level,
-                      :options
+                      :options, :parent,
+                      :custom_to_s_handling, :pre_quantifier_decorations
         attr_reader   :nesting_level, :quantifier
       end
@@ -32,6 +33,10 @@ module Regexp::Expression
       self.text       = orig.text.dup         if orig.text
       self.options    = orig.options.dup      if orig.options
       self.quantifier = orig.quantifier.clone if orig.quantifier
+      self.parent     = nil # updated by Subexpression#initialize_copy
+      if orig.pre_quantifier_decorations
+        self.pre_quantifier_decorations = orig.pre_quantifier_decorations.map(&:dup)
+      end
       super
     end
@@ -39,35 +44,51 @@ module Regexp::Expression
       ts
     end
+    def ends_at(include_quantifier = true)
+      ts + (include_quantifier ? full_length : base_length)
+    end
     def base_length
       to_s(:base).length
     end
     def full_length
-      to_s.length
-    end
+      to_s(:original).length
+    end
+    # #to_s reproduces the original source, as an unparser would.
+    #
+    # It takes an optional format argument.
+    #
+    # Example:
+    #
+    # lit = Regexp::Parser.parse(/a +/x)[0]
+    #
+    # lit.to_s            # => 'a+'  # default; with quantifier
+    # lit.to_s(:full)     # => 'a+'  # default; with quantifier
+    # lit.to_s(:base)     # => 'a'   # without quantifier
+    # lit.to_s(:original) # => 'a +' # with quantifier AND intermittent decorations
+    #
     def to_s(format = :full)
-      "#{parts.join}#{quantifier_affix(format)}"
+      base = parts.each_with_object(''.dup) do |part, buff|
+        if part.instance_of?(String)
+          buff << part
+        elsif !part.custom_to_s_handling
+          buff << part.to_s(:original)
+        end
+      end
+      "#{base}#{pre_quantifier_decoration(format)}#{quantifier_affix(format)}"
     end
     alias :to_str :to_s
-    def parts
-      [text.dup]
+    def pre_quantifier_decoration(expression_format = :original)
+      pre_quantifier_decorations.to_a.join if expression_format == :original
     end
-    def quantifier_affix(expression_format)
+    def quantifier_affix(expression_format = :full)
       quantifier.to_s if quantified? && expression_format != :base
     end
-    def quantified?
-      !quantifier.nil?
-    end
-    def optional?
-      quantified? && quantifier.min == 0
-    end
     def offset
       [starts_at, full_length]
     end
@@ -76,14 +97,6 @@ module Regexp::Expression
       '@%d+%d' % offset
     end
-    def terminal?
-      true # overridden to be false in Expression::Subexpression
-    end
-    def referential?
-      false # overridden to be true e.g. in Expression::Backreference::Base
-    end
     def nesting_level=(lvl)
       @nesting_level = lvl
       quantifier && quantifier.nesting_level = lvl

data/lib/regexp_parser/expression/subexpression.rb CHANGED Viewed

@@ -11,16 +11,15 @@ module Regexp::Expression
     # Override base method to clone the expressions as well.
     def initialize_copy(orig)
-      self.expressions = orig.expressions.map(&:clone)
+      self.expressions = orig.expressions.map do |exp|
+        exp.clone.tap { |copy| copy.parent = self }
+      end
       super
     end
     def <<(exp)
-      if exp.is_a?(WhiteSpace) && last && last.is_a?(WhiteSpace)
-        last.merge(exp)
-      else
-        expressions << exp
-      end
+      exp.parent = self
+      expressions << exp
     end
     %w[[] at each empty? fetch index join last length values_at].each do |method|
@@ -38,11 +37,7 @@ module Regexp::Expression
     end
     def te
-      ts + to_s.length
-    end
-    def parts
-      expressions
+      ts + base_length
     end
     def to_h
@@ -52,14 +47,21 @@ module Regexp::Expression
       )
     end
-    def terminal?
-      false
-    end
-    private
+    def extract_quantifier_target(quantifier_description)
+      pre_quantifier_decorations = []
+      target = expressions.reverse.find do |exp|
+        if exp.decorative?
+          exp.custom_to_s_handling = true
+          pre_quantifier_decorations << exp.text
+          next
+        end
+        exp
+      end
+      target or raise Regexp::Parser::ParserError,
+        "No valid target found for '#{quantifier_description}' quantifier"
-    def intersperse(expressions, separator)
-      expressions.flat_map { |exp| [exp, separator] }.slice(0...-1)
+      target.pre_quantifier_decorations = pre_quantifier_decorations
+      target
     end
   end
 end

data/lib/regexp_parser/expression.rb CHANGED Viewed

@@ -29,6 +29,8 @@ require 'regexp_parser/expression/methods/human_name'
 require 'regexp_parser/expression/methods/match'
 require 'regexp_parser/expression/methods/match_length'
 require 'regexp_parser/expression/methods/options'
+require 'regexp_parser/expression/methods/parts'
+require 'regexp_parser/expression/methods/printing'
 require 'regexp_parser/expression/methods/strfregexp'
 require 'regexp_parser/expression/methods/tests'
 require 'regexp_parser/expression/methods/traverse'

data/lib/regexp_parser/lexer.rb CHANGED Viewed

@@ -6,7 +6,7 @@ class Regexp::Lexer
   OPENING_TOKENS = %i[
     capture passive lookahead nlookahead lookbehind nlookbehind
-    atomic options options_switch named absence
+    atomic options options_switch named absence open
   ].freeze
   CLOSING_TOKENS = %i[close].freeze
@@ -89,24 +89,32 @@ class Regexp::Lexer
                 :nesting, :set_nesting, :conditional_nesting, :shift
   def ascend(type, token)
+    return unless CLOSING_TOKENS.include?(token)
     case type
     when :group, :assertion
-      self.nesting = nesting - 1 if CLOSING_TOKENS.include?(token)
+      self.nesting = nesting - 1
     when :set
-      self.set_nesting = set_nesting - 1 if token == :close
+      self.set_nesting = set_nesting - 1
     when :conditional
-      self.conditional_nesting = conditional_nesting - 1 if token == :close
+      self.conditional_nesting = conditional_nesting - 1
+    else
+      raise "unhandled nesting type #{type}"
     end
   end
   def descend(type, token)
+    return unless OPENING_TOKENS.include?(token)
     case type
     when :group, :assertion
-      self.nesting = nesting + 1 if OPENING_TOKENS.include?(token)
+      self.nesting = nesting + 1
     when :set
-      self.set_nesting = set_nesting + 1 if token == :open
+      self.set_nesting = set_nesting + 1
     when :conditional
-      self.conditional_nesting = conditional_nesting + 1 if token == :open
+      self.conditional_nesting = conditional_nesting + 1
+    else
+      raise "unhandled nesting type #{type}"
     end
   end

data/lib/regexp_parser/parser.rb CHANGED Viewed

@@ -232,7 +232,7 @@ class Regexp::Parser
       node << Backreference::NameRecursionLevel.new(token, active_opts)
     when :name_call
       node << Backreference::NameCall.new(token, active_opts)
-    when :number, :number_ref
+    when :number, :number_ref # TODO: split in v3.0.0
       node << Backreference::Number.new(token, active_opts)
     when :number_recursion_ref
       node << Backreference::NumberRecursionLevel.new(token, active_opts).tap do |exp|
@@ -272,9 +272,9 @@ class Regexp::Parser
       nest_conditional(Conditional::Expression.new(token, active_opts))
     when :condition
       conditional_nesting.last.condition = Conditional::Condition.new(token, active_opts)
-      conditional_nesting.last.add_sequence(active_opts)
+      conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
     when :separator
-      conditional_nesting.last.add_sequence(active_opts)
+      conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
       self.node = conditional_nesting.last.branches.last
     when :close
       conditional_nesting.pop
@@ -322,6 +322,7 @@ class Regexp::Parser
     when :control
       if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
+        # TODO: emit :meta_control_sequence token in v3.0.0
         node << EscapeSequence::MetaControl.new(token, active_opts)
       else
         node << EscapeSequence::Control.new(token, active_opts)
@@ -329,6 +330,7 @@ class Regexp::Parser
     when :meta_sequence
       if token.text =~ /\A\\M-\\[Cc]/
+        # TODO: emit :meta_control_sequence token in v3.0.0:
         node << EscapeSequence::MetaControl.new(token, active_opts)
       else
         node << EscapeSequence::Meta.new(token, active_opts)
@@ -349,11 +351,7 @@ class Regexp::Parser
     when :comment
       node << Comment.new(token, active_opts)
     when :whitespace
-      if node.last.is_a?(WhiteSpace)
-        node.last.merge(WhiteSpace.new(token, active_opts))
-      else
-        node << WhiteSpace.new(token, active_opts)
-      end
+      node << WhiteSpace.new(token, active_opts)
     else
       raise UnknownTokenError.new('FreeSpace', token)
     end
@@ -381,96 +379,96 @@ class Regexp::Parser
   def sequence_operation(klass, token)
     unless node.instance_of?(klass)
       operator = klass.new(token, active_opts)
-      sequence = operator.add_sequence(active_opts)
+      sequence = operator.add_sequence(active_opts, { ts: token.ts })
       sequence.expressions = node.expressions
       node.expressions = []
       nest(operator)
     end
-    node.add_sequence(active_opts)
+    node.add_sequence(active_opts, { ts: token.te })
   end
   def posixclass(token)
     node << PosixClass.new(token, active_opts)
   end
-  include Regexp::Expression::UnicodeProperty
-  UPTokens = Regexp::Syntax::Token::UnicodeProperty
+  UP = Regexp::Expression::Property
+  UPTokens = Regexp::Syntax::Token::Property
   def property(token)
     case token.token
-    when :alnum;                  node << Alnum.new(token, active_opts)
-    when :alpha;                  node << Alpha.new(token, active_opts)
-    when :ascii;                  node << Ascii.new(token, active_opts)
-    when :blank;                  node << Blank.new(token, active_opts)
-    when :cntrl;                  node << Cntrl.new(token, active_opts)
-    when :digit;                  node << Digit.new(token, active_opts)
-    when :graph;                  node << Graph.new(token, active_opts)
-    when :lower;                  node << Lower.new(token, active_opts)
-    when :print;                  node << Print.new(token, active_opts)
-    when :punct;                  node << Punct.new(token, active_opts)
-    when :space;                  node << Space.new(token, active_opts)
-    when :upper;                  node << Upper.new(token, active_opts)
-    when :word;                   node << Word.new(token, active_opts)
-    when :xdigit;                 node << Xdigit.new(token, active_opts)
-    when :xposixpunct;            node << XPosixPunct.new(token, active_opts)
+    when :alnum;                  node << UP::Alnum.new(token, active_opts)
+    when :alpha;                  node << UP::Alpha.new(token, active_opts)
+    when :ascii;                  node << UP::Ascii.new(token, active_opts)
+    when :blank;                  node << UP::Blank.new(token, active_opts)
+    when :cntrl;                  node << UP::Cntrl.new(token, active_opts)
+    when :digit;                  node << UP::Digit.new(token, active_opts)
+    when :graph;                  node << UP::Graph.new(token, active_opts)
+    when :lower;                  node << UP::Lower.new(token, active_opts)
+    when :print;                  node << UP::Print.new(token, active_opts)
+    when :punct;                  node << UP::Punct.new(token, active_opts)
+    when :space;                  node << UP::Space.new(token, active_opts)
+    when :upper;                  node << UP::Upper.new(token, active_opts)
+    when :word;                   node << UP::Word.new(token, active_opts)
+    when :xdigit;                 node << UP::Xdigit.new(token, active_opts)
+    when :xposixpunct;            node << UP::XPosixPunct.new(token, active_opts)
     # only in Oniguruma (old rubies)
-    when :newline;                node << Newline.new(token, active_opts)
-    when :any;                    node << Any.new(token, active_opts)
-    when :assigned;               node << Assigned.new(token, active_opts)
-    when :letter;                 node << Letter::Any.new(token, active_opts)
-    when :cased_letter;           node << Letter::Cased.new(token, active_opts)
-    when :uppercase_letter;       node << Letter::Uppercase.new(token, active_opts)
-    when :lowercase_letter;       node << Letter::Lowercase.new(token, active_opts)
-    when :titlecase_letter;       node << Letter::Titlecase.new(token, active_opts)
-    when :modifier_letter;        node << Letter::Modifier.new(token, active_opts)
-    when :other_letter;           node << Letter::Other.new(token, active_opts)
-    when :mark;                   node << Mark::Any.new(token, active_opts)
-    when :combining_mark;         node << Mark::Combining.new(token, active_opts)
-    when :nonspacing_mark;        node << Mark::Nonspacing.new(token, active_opts)
-    when :spacing_mark;           node << Mark::Spacing.new(token, active_opts)
-    when :enclosing_mark;         node << Mark::Enclosing.new(token, active_opts)
-    when :number;                 node << Number::Any.new(token, active_opts)
-    when :decimal_number;         node << Number::Decimal.new(token, active_opts)
-    when :letter_number;          node << Number::Letter.new(token, active_opts)
-    when :other_number;           node << Number::Other.new(token, active_opts)
-    when :punctuation;            node << Punctuation::Any.new(token, active_opts)
-    when :connector_punctuation;  node << Punctuation::Connector.new(token, active_opts)
-    when :dash_punctuation;       node << Punctuation::Dash.new(token, active_opts)
-    when :open_punctuation;       node << Punctuation::Open.new(token, active_opts)
-    when :close_punctuation;      node << Punctuation::Close.new(token, active_opts)
-    when :initial_punctuation;    node << Punctuation::Initial.new(token, active_opts)
-    when :final_punctuation;      node << Punctuation::Final.new(token, active_opts)
-    when :other_punctuation;      node << Punctuation::Other.new(token, active_opts)
-    when :separator;              node << Separator::Any.new(token, active_opts)
-    when :space_separator;        node << Separator::Space.new(token, active_opts)
-    when :line_separator;         node << Separator::Line.new(token, active_opts)
-    when :paragraph_separator;    node << Separator::Paragraph.new(token, active_opts)
-    when :symbol;                 node << Symbol::Any.new(token, active_opts)
-    when :math_symbol;            node << Symbol::Math.new(token, active_opts)
-    when :currency_symbol;        node << Symbol::Currency.new(token, active_opts)
-    when :modifier_symbol;        node << Symbol::Modifier.new(token, active_opts)
-    when :other_symbol;           node << Symbol::Other.new(token, active_opts)
-    when :other;                  node << Codepoint::Any.new(token, active_opts)
-    when :control;                node << Codepoint::Control.new(token, active_opts)
-    when :format;                 node << Codepoint::Format.new(token, active_opts)
-    when :surrogate;              node << Codepoint::Surrogate.new(token, active_opts)
-    when :private_use;            node << Codepoint::PrivateUse.new(token, active_opts)
-    when :unassigned;             node << Codepoint::Unassigned.new(token, active_opts)
-    when *UPTokens::Age;          node << Age.new(token, active_opts)
-    when *UPTokens::Derived;      node << Derived.new(token, active_opts)
-    when *UPTokens::Emoji;        node << Emoji.new(token, active_opts)
-    when *UPTokens::Script;       node << Script.new(token, active_opts)
-    when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts)
+    when :newline;                node << UP::Newline.new(token, active_opts)
+    when :any;                    node << UP::Any.new(token, active_opts)
+    when :assigned;               node << UP::Assigned.new(token, active_opts)
+    when :letter;                 node << UP::Letter::Any.new(token, active_opts)
+    when :cased_letter;           node << UP::Letter::Cased.new(token, active_opts)
+    when :uppercase_letter;       node << UP::Letter::Uppercase.new(token, active_opts)
+    when :lowercase_letter;       node << UP::Letter::Lowercase.new(token, active_opts)
+    when :titlecase_letter;       node << UP::Letter::Titlecase.new(token, active_opts)
+    when :modifier_letter;        node << UP::Letter::Modifier.new(token, active_opts)
+    when :other_letter;           node << UP::Letter::Other.new(token, active_opts)
+    when :mark;                   node << UP::Mark::Any.new(token, active_opts)
+    when :combining_mark;         node << UP::Mark::Combining.new(token, active_opts)
+    when :nonspacing_mark;        node << UP::Mark::Nonspacing.new(token, active_opts)
+    when :spacing_mark;           node << UP::Mark::Spacing.new(token, active_opts)
+    when :enclosing_mark;         node << UP::Mark::Enclosing.new(token, active_opts)
+    when :number;                 node << UP::Number::Any.new(token, active_opts)
+    when :decimal_number;         node << UP::Number::Decimal.new(token, active_opts)
+    when :letter_number;          node << UP::Number::Letter.new(token, active_opts)
+    when :other_number;           node << UP::Number::Other.new(token, active_opts)
+    when :punctuation;            node << UP::Punctuation::Any.new(token, active_opts)
+    when :connector_punctuation;  node << UP::Punctuation::Connector.new(token, active_opts)
+    when :dash_punctuation;       node << UP::Punctuation::Dash.new(token, active_opts)
+    when :open_punctuation;       node << UP::Punctuation::Open.new(token, active_opts)
+    when :close_punctuation;      node << UP::Punctuation::Close.new(token, active_opts)
+    when :initial_punctuation;    node << UP::Punctuation::Initial.new(token, active_opts)
+    when :final_punctuation;      node << UP::Punctuation::Final.new(token, active_opts)
+    when :other_punctuation;      node << UP::Punctuation::Other.new(token, active_opts)
+    when :separator;              node << UP::Separator::Any.new(token, active_opts)
+    when :space_separator;        node << UP::Separator::Space.new(token, active_opts)
+    when :line_separator;         node << UP::Separator::Line.new(token, active_opts)
+    when :paragraph_separator;    node << UP::Separator::Paragraph.new(token, active_opts)
+    when :symbol;                 node << UP::Symbol::Any.new(token, active_opts)
+    when :math_symbol;            node << UP::Symbol::Math.new(token, active_opts)
+    when :currency_symbol;        node << UP::Symbol::Currency.new(token, active_opts)
+    when :modifier_symbol;        node << UP::Symbol::Modifier.new(token, active_opts)
+    when :other_symbol;           node << UP::Symbol::Other.new(token, active_opts)
+    when :other;                  node << UP::Codepoint::Any.new(token, active_opts)
+    when :control;                node << UP::Codepoint::Control.new(token, active_opts)
+    when :format;                 node << UP::Codepoint::Format.new(token, active_opts)
+    when :surrogate;              node << UP::Codepoint::Surrogate.new(token, active_opts)
+    when :private_use;            node << UP::Codepoint::PrivateUse.new(token, active_opts)
+    when :unassigned;             node << UP::Codepoint::Unassigned.new(token, active_opts)
+    when *UPTokens::Age;          node << UP::Age.new(token, active_opts)
+    when *UPTokens::Derived;      node << UP::Derived.new(token, active_opts)
+    when *UPTokens::Emoji;        node << UP::Emoji.new(token, active_opts)
+    when *UPTokens::Script;       node << UP::Script.new(token, active_opts)
+    when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts)
     else
       raise UnknownTokenError.new('UnicodeProperty', token)
@@ -478,8 +476,7 @@ class Regexp::Parser
   end
   def quantifier(token)
-    target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
-    target_node or raise ParserError, "No valid target found for '#{token.text}'"
+    target_node = node.extract_quantifier_target(token.text)
     # in case of chained quantifiers, wrap target in an implicit passive group
     # description of the problem: https://github.com/ammar/regexp_parser/issues/3
@@ -527,6 +524,8 @@ class Regexp::Parser
   end
   def open_set(token)
+    # TODO: this and Quantifier are the only cases where Expression#token
+    # does not match the scanner/lexer output. Fix in v3.0.0.
     token.token = :character
     nest(CharacterSet.new(token, active_opts))
   end
@@ -590,7 +589,7 @@ class Regexp::Parser
     # (in a second iteration because there might be forward references)
     referrers.each do |exp|
       exp.referenced_expression = targets[exp.reference] ||
-        raise(ParserError, "Invalid reference: #{exp.reference}")
+        raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}")
     end
   end
 end # module Regexp::Parser

data/lib/regexp_parser/scanner/errors/premature_end_error.rb ADDED Viewed

@@ -0,0 +1,8 @@
+class Regexp::Scanner
+  # Unexpected end of pattern
+  class PrematureEndError < ScannerError
+    def initialize(where = '')
+      super "Premature end of pattern at #{where}"
+    end
+  end
+end

data/lib/regexp_parser/scanner/errors/scanner_error.rb ADDED Viewed

@@ -0,0 +1,6 @@
+require 'regexp_parser/error'
+class Regexp::Scanner
+  # General scanner error (catch all)
+  class ScannerError < Regexp::Parser::Error; end
+end