RubyGems - regexp_parser - Versions diffs - 2.1.1 → 2.9.2 - Mend

regexp_parser 2.1.1 → 2.9.2

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (167)

checksums.yaml +4 -4
data/Gemfile +6 -5
data/LICENSE +1 -1
data/Rakefile +6 -70
data/lib/regexp_parser/error.rb +1 -1
data/lib/regexp_parser/expression/base.rb +76 -0
data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +18 -3
data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -7
data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +4 -8
data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
data/lib/regexp_parser/expression/classes/conditional.rb +2 -6
data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
data/lib/regexp_parser/expression/classes/free_space.rb +4 -4
data/lib/regexp_parser/expression/classes/group.rb +10 -22
data/lib/regexp_parser/expression/classes/keep.rb +2 -0
data/lib/regexp_parser/expression/classes/literal.rb +1 -5
data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
data/lib/regexp_parser/expression/classes/root.rb +3 -6
data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +10 -11
data/lib/regexp_parser/expression/methods/construct.rb +41 -0
data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
data/lib/regexp_parser/expression/methods/match_length.rb +9 -5
data/lib/regexp_parser/expression/methods/negative.rb +20 -0
data/lib/regexp_parser/expression/methods/parts.rb +23 -0
data/lib/regexp_parser/expression/methods/printing.rb +26 -0
data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
data/lib/regexp_parser/expression/methods/tests.rb +47 -1
data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
data/lib/regexp_parser/expression/quantifier.rb +55 -24
data/lib/regexp_parser/expression/sequence.rb +11 -31
data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
data/lib/regexp_parser/expression/shared.rb +111 -0
data/lib/regexp_parser/expression/subexpression.rb +26 -18
data/lib/regexp_parser/expression.rb +37 -155
data/lib/regexp_parser/lexer.rb +81 -39
data/lib/regexp_parser/parser.rb +135 -173
data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
data/lib/regexp_parser/scanner/properties/long.csv +651 -0
data/lib/regexp_parser/scanner/properties/short.csv +249 -0
data/lib/regexp_parser/scanner/property.rl +2 -2
data/lib/regexp_parser/scanner/scanner.rl +127 -185
data/lib/regexp_parser/scanner.rb +1185 -1402
data/lib/regexp_parser/syntax/any.rb +2 -7
data/lib/regexp_parser/syntax/base.rb +91 -66
data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
data/lib/regexp_parser/syntax/token/escape.rb +33 -0
data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
data/lib/regexp_parser/syntax/token/meta.rb +20 -0
data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
data/lib/regexp_parser/syntax/token/unicode_property.rb +751 -0
data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
data/lib/regexp_parser/syntax/token.rb +45 -0
data/lib/regexp_parser/syntax/version_lookup.rb +17 -34
data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
data/lib/regexp_parser/syntax/versions.rb +4 -2
data/lib/regexp_parser/syntax.rb +2 -2
data/lib/regexp_parser/token.rb +9 -20
data/lib/regexp_parser/version.rb +1 -1
data/lib/regexp_parser.rb +6 -8
data/regexp_parser.gemspec +20 -22
metadata +49 -171
data/CHANGELOG.md +0 -494
data/README.md +0 -479
data/lib/regexp_parser/scanner/properties/long.yml +0 -594
data/lib/regexp_parser/scanner/properties/short.yml +0 -237
data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
data/lib/regexp_parser/syntax/tokens.rb +0 -45
data/spec/expression/base_spec.rb +0 -104
data/spec/expression/clone_spec.rb +0 -152
data/spec/expression/conditional_spec.rb +0 -89
data/spec/expression/free_space_spec.rb +0 -27
data/spec/expression/methods/match_length_spec.rb +0 -161
data/spec/expression/methods/match_spec.rb +0 -25
data/spec/expression/methods/strfregexp_spec.rb +0 -224
data/spec/expression/methods/tests_spec.rb +0 -99
data/spec/expression/methods/traverse_spec.rb +0 -161
data/spec/expression/options_spec.rb +0 -128
data/spec/expression/subexpression_spec.rb +0 -50
data/spec/expression/to_h_spec.rb +0 -26
data/spec/expression/to_s_spec.rb +0 -108
data/spec/lexer/all_spec.rb +0 -22
data/spec/lexer/conditionals_spec.rb +0 -53
data/spec/lexer/delimiters_spec.rb +0 -68
data/spec/lexer/escapes_spec.rb +0 -14
data/spec/lexer/keep_spec.rb +0 -10
data/spec/lexer/literals_spec.rb +0 -64
data/spec/lexer/nesting_spec.rb +0 -99
data/spec/lexer/refcalls_spec.rb +0 -60
data/spec/parser/all_spec.rb +0 -43
data/spec/parser/alternation_spec.rb +0 -88
data/spec/parser/anchors_spec.rb +0 -17
data/spec/parser/conditionals_spec.rb +0 -179
data/spec/parser/errors_spec.rb +0 -30
data/spec/parser/escapes_spec.rb +0 -121
data/spec/parser/free_space_spec.rb +0 -130
data/spec/parser/groups_spec.rb +0 -108
data/spec/parser/keep_spec.rb +0 -6
data/spec/parser/options_spec.rb +0 -28
data/spec/parser/posix_classes_spec.rb +0 -8
data/spec/parser/properties_spec.rb +0 -115
data/spec/parser/quantifiers_spec.rb +0 -68
data/spec/parser/refcalls_spec.rb +0 -117
data/spec/parser/set/intersections_spec.rb +0 -127
data/spec/parser/set/ranges_spec.rb +0 -111
data/spec/parser/sets_spec.rb +0 -178
data/spec/parser/types_spec.rb +0 -18
data/spec/scanner/all_spec.rb +0 -18
data/spec/scanner/anchors_spec.rb +0 -21
data/spec/scanner/conditionals_spec.rb +0 -128
data/spec/scanner/delimiters_spec.rb +0 -52
data/spec/scanner/errors_spec.rb +0 -67
data/spec/scanner/escapes_spec.rb +0 -64
data/spec/scanner/free_space_spec.rb +0 -165
data/spec/scanner/groups_spec.rb +0 -61
data/spec/scanner/keep_spec.rb +0 -10
data/spec/scanner/literals_spec.rb +0 -39
data/spec/scanner/meta_spec.rb +0 -18
data/spec/scanner/options_spec.rb +0 -36
data/spec/scanner/properties_spec.rb +0 -64
data/spec/scanner/quantifiers_spec.rb +0 -25
data/spec/scanner/refcalls_spec.rb +0 -55
data/spec/scanner/sets_spec.rb +0 -151
data/spec/scanner/types_spec.rb +0 -14
data/spec/spec_helper.rb +0 -16
data/spec/support/runner.rb +0 -42
data/spec/support/shared_examples.rb +0 -77
data/spec/support/warning_extractor.rb +0 -60
data/spec/syntax/syntax_spec.rb +0 -48
data/spec/syntax/syntax_token_map_spec.rb +0 -23
data/spec/syntax/versions/1.8.6_spec.rb +0 -17
data/spec/syntax/versions/1.9.1_spec.rb +0 -10
data/spec/syntax/versions/1.9.3_spec.rb +0 -9
data/spec/syntax/versions/2.0.0_spec.rb +0 -13
data/spec/syntax/versions/2.2.0_spec.rb +0 -9
data/spec/syntax/versions/aliases_spec.rb +0 -37
data/spec/token/token_spec.rb +0 -85
data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0

data/lib/regexp_parser/parser.rb CHANGED Viewed

@@ -1,5 +1,5 @@
-require 'regexp_parser/error'
-require 'regexp_parser/expression'
+require_relative 'error'
+require_relative 'expression'
 class Regexp::Parser
   include Regexp::Expression
@@ -18,12 +18,12 @@ class Regexp::Parser
     end
   end
-  def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
+  def self.parse(input, syntax = nil, options: nil, &block)
     new.parse(input, syntax, options: options, &block)
   end
-  def parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
-    root = Root.build(extract_options(input, options))
+  def parse(input, syntax = nil, options: nil, &block)
+    root = Root.construct(options: extract_options(input, options))
     self.root = root
     self.node = root
@@ -35,10 +35,13 @@ class Regexp::Parser
     self.captured_group_counts = Hash.new(0)
-    Regexp::Lexer.scan(input, syntax, options: options) do |token|
+    Regexp::Lexer.scan(input, syntax, options: options, collect_tokens: false) do |token|
       parse_token(token)
     end
+    # Trigger recursive setting of #nesting_level, which reflects how deep
+    # a node is in the tree. Do this at the end to account for tree rewrites.
+    root.nesting_level = 0
     assign_referenced_expressions
     if block_given?
@@ -197,11 +200,11 @@ class Regexp::Parser
   end
   def captured_group_count_at_level
-    captured_group_counts[node.level]
+    captured_group_counts[node]
   end
   def count_captured_group
-    captured_group_counts[node.level] += 1
+    captured_group_counts[node] += 1
   end
   def close_group
@@ -229,10 +232,18 @@ class Regexp::Parser
       node << Backreference::NameRecursionLevel.new(token, active_opts)
     when :name_call
       node << Backreference::NameCall.new(token, active_opts)
-    when :number, :number_ref
+    when :number, :number_ref # TODO: split in v3.0.0
       node << Backreference::Number.new(token, active_opts)
     when :number_recursion_ref
-      node << Backreference::NumberRecursionLevel.new(token, active_opts)
+      node << Backreference::NumberRecursionLevel.new(token, active_opts).tap do |exp|
+        # TODO: should split off new token number_recursion_rel_ref and new
+        # class NumberRelativeRecursionLevel in v3.0.0 to get rid of this
+        if exp.text =~ /[<'][+-]/
+          assign_effective_number(exp)
+        else
+          exp.effective_number = exp.number
+        end
+      end
     when :number_call
       node << Backreference::NumberCall.new(token, active_opts)
     when :number_rel_ref
@@ -251,6 +262,8 @@ class Regexp::Parser
   def assign_effective_number(exp)
     exp.effective_number =
       exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
+    exp.effective_number > 0 ||
+      raise(ParserError, "Invalid reference: #{exp.reference}")
   end
   def conditional(token)
@@ -259,9 +272,9 @@ class Regexp::Parser
       nest_conditional(Conditional::Expression.new(token, active_opts))
     when :condition
       conditional_nesting.last.condition = Conditional::Condition.new(token, active_opts)
-      conditional_nesting.last.add_sequence(active_opts)
+      conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
     when :separator
-      conditional_nesting.last.add_sequence(active_opts)
+      conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
       self.node = conditional_nesting.last.branches.last
     when :close
       conditional_nesting.pop
@@ -286,17 +299,9 @@ class Regexp::Parser
   def nest(exp)
     nesting.push(exp)
     node << exp
-    update_transplanted_subtree(exp, node)
     self.node = exp
   end
-  # subtrees are transplanted to build Alternations, Intersections, Ranges
-  def update_transplanted_subtree(exp, new_parent)
-    exp.nesting_level = new_parent.nesting_level + 1
-    exp.respond_to?(:each) &&
-      exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
-  end
   def escape(token)
     case token.token
@@ -317,6 +322,7 @@ class Regexp::Parser
     when :control
       if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
+        # TODO: emit :meta_control_sequence token in v3.0.0
         node << EscapeSequence::MetaControl.new(token, active_opts)
       else
         node << EscapeSequence::Control.new(token, active_opts)
@@ -324,6 +330,7 @@ class Regexp::Parser
     when :meta_sequence
       if token.text =~ /\A\\M-\\[Cc]/
+        # TODO: emit :meta_control_sequence token in v3.0.0:
         node << EscapeSequence::MetaControl.new(token, active_opts)
       else
         node << EscapeSequence::Meta.new(token, active_opts)
@@ -344,11 +351,7 @@ class Regexp::Parser
     when :comment
       node << Comment.new(token, active_opts)
     when :whitespace
-      if node.last.is_a?(WhiteSpace)
-        node.last.merge(WhiteSpace.new(token, active_opts))
-      else
-        node << WhiteSpace.new(token, active_opts)
-      end
+      node << WhiteSpace.new(token, active_opts)
     else
       raise UnknownTokenError.new('FreeSpace', token)
     end
@@ -374,98 +377,99 @@ class Regexp::Parser
   end
   def sequence_operation(klass, token)
-    unless node.is_a?(klass)
+    unless node.instance_of?(klass)
       operator = klass.new(token, active_opts)
-      sequence = operator.add_sequence(active_opts)
+      sequence = operator.add_sequence(active_opts, { ts: token.ts })
       sequence.expressions = node.expressions
       node.expressions = []
       nest(operator)
     end
-    node.add_sequence(active_opts)
+    node.add_sequence(active_opts, { ts: token.te })
   end
   def posixclass(token)
     node << PosixClass.new(token, active_opts)
   end
-  include Regexp::Expression::UnicodeProperty
-  UPTokens = Regexp::Syntax::Token::UnicodeProperty
+  UP = Regexp::Expression::Property
+  UPTokens = Regexp::Syntax::Token::Property
   def property(token)
     case token.token
-    when :alnum;                  node << Alnum.new(token, active_opts)
-    when :alpha;                  node << Alpha.new(token, active_opts)
-    when :ascii;                  node << Ascii.new(token, active_opts)
-    when :blank;                  node << Blank.new(token, active_opts)
-    when :cntrl;                  node << Cntrl.new(token, active_opts)
-    when :digit;                  node << Digit.new(token, active_opts)
-    when :graph;                  node << Graph.new(token, active_opts)
-    when :lower;                  node << Lower.new(token, active_opts)
-    when :print;                  node << Print.new(token, active_opts)
-    when :punct;                  node << Punct.new(token, active_opts)
-    when :space;                  node << Space.new(token, active_opts)
-    when :upper;                  node << Upper.new(token, active_opts)
-    when :word;                   node << Word.new(token, active_opts)
-    when :xdigit;                 node << Xdigit.new(token, active_opts)
-    when :xposixpunct;            node << XPosixPunct.new(token, active_opts)
+    when :alnum;                  node << UP::Alnum.new(token, active_opts)
+    when :alpha;                  node << UP::Alpha.new(token, active_opts)
+    when :ascii;                  node << UP::Ascii.new(token, active_opts)
+    when :blank;                  node << UP::Blank.new(token, active_opts)
+    when :cntrl;                  node << UP::Cntrl.new(token, active_opts)
+    when :digit;                  node << UP::Digit.new(token, active_opts)
+    when :graph;                  node << UP::Graph.new(token, active_opts)
+    when :lower;                  node << UP::Lower.new(token, active_opts)
+    when :print;                  node << UP::Print.new(token, active_opts)
+    when :punct;                  node << UP::Punct.new(token, active_opts)
+    when :space;                  node << UP::Space.new(token, active_opts)
+    when :upper;                  node << UP::Upper.new(token, active_opts)
+    when :word;                   node << UP::Word.new(token, active_opts)
+    when :xdigit;                 node << UP::Xdigit.new(token, active_opts)
+    when :xposixpunct;            node << UP::XPosixPunct.new(token, active_opts)
     # only in Oniguruma (old rubies)
-    when :newline;                node << Newline.new(token, active_opts)
-    when :any;                    node << Any.new(token, active_opts)
-    when :assigned;               node << Assigned.new(token, active_opts)
-    when :letter;                 node << Letter::Any.new(token, active_opts)
-    when :cased_letter;           node << Letter::Cased.new(token, active_opts)
-    when :uppercase_letter;       node << Letter::Uppercase.new(token, active_opts)
-    when :lowercase_letter;       node << Letter::Lowercase.new(token, active_opts)
-    when :titlecase_letter;       node << Letter::Titlecase.new(token, active_opts)
-    when :modifier_letter;        node << Letter::Modifier.new(token, active_opts)
-    when :other_letter;           node << Letter::Other.new(token, active_opts)
-    when :mark;                   node << Mark::Any.new(token, active_opts)
-    when :combining_mark;         node << Mark::Combining.new(token, active_opts)
-    when :nonspacing_mark;        node << Mark::Nonspacing.new(token, active_opts)
-    when :spacing_mark;           node << Mark::Spacing.new(token, active_opts)
-    when :enclosing_mark;         node << Mark::Enclosing.new(token, active_opts)
-    when :number;                 node << Number::Any.new(token, active_opts)
-    when :decimal_number;         node << Number::Decimal.new(token, active_opts)
-    when :letter_number;          node << Number::Letter.new(token, active_opts)
-    when :other_number;           node << Number::Other.new(token, active_opts)
-    when :punctuation;            node << Punctuation::Any.new(token, active_opts)
-    when :connector_punctuation;  node << Punctuation::Connector.new(token, active_opts)
-    when :dash_punctuation;       node << Punctuation::Dash.new(token, active_opts)
-    when :open_punctuation;       node << Punctuation::Open.new(token, active_opts)
-    when :close_punctuation;      node << Punctuation::Close.new(token, active_opts)
-    when :initial_punctuation;    node << Punctuation::Initial.new(token, active_opts)
-    when :final_punctuation;      node << Punctuation::Final.new(token, active_opts)
-    when :other_punctuation;      node << Punctuation::Other.new(token, active_opts)
-    when :separator;              node << Separator::Any.new(token, active_opts)
-    when :space_separator;        node << Separator::Space.new(token, active_opts)
-    when :line_separator;         node << Separator::Line.new(token, active_opts)
-    when :paragraph_separator;    node << Separator::Paragraph.new(token, active_opts)
-    when :symbol;                 node << Symbol::Any.new(token, active_opts)
-    when :math_symbol;            node << Symbol::Math.new(token, active_opts)
-    when :currency_symbol;        node << Symbol::Currency.new(token, active_opts)
-    when :modifier_symbol;        node << Symbol::Modifier.new(token, active_opts)
-    when :other_symbol;           node << Symbol::Other.new(token, active_opts)
-    when :other;                  node << Codepoint::Any.new(token, active_opts)
-    when :control;                node << Codepoint::Control.new(token, active_opts)
-    when :format;                 node << Codepoint::Format.new(token, active_opts)
-    when :surrogate;              node << Codepoint::Surrogate.new(token, active_opts)
-    when :private_use;            node << Codepoint::PrivateUse.new(token, active_opts)
-    when :unassigned;             node << Codepoint::Unassigned.new(token, active_opts)
-    when *UPTokens::Age;          node << Age.new(token, active_opts)
-    when *UPTokens::Derived;      node << Derived.new(token, active_opts)
-    when *UPTokens::Emoji;        node << Emoji.new(token, active_opts)
-    when *UPTokens::Script;       node << Script.new(token, active_opts)
-    when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts)
+    when :newline;                node << UP::Newline.new(token, active_opts)
+    when :any;                    node << UP::Any.new(token, active_opts)
+    when :assigned;               node << UP::Assigned.new(token, active_opts)
+    when :letter;                 node << UP::Letter::Any.new(token, active_opts)
+    when :cased_letter;           node << UP::Letter::Cased.new(token, active_opts)
+    when :uppercase_letter;       node << UP::Letter::Uppercase.new(token, active_opts)
+    when :lowercase_letter;       node << UP::Letter::Lowercase.new(token, active_opts)
+    when :titlecase_letter;       node << UP::Letter::Titlecase.new(token, active_opts)
+    when :modifier_letter;        node << UP::Letter::Modifier.new(token, active_opts)
+    when :other_letter;           node << UP::Letter::Other.new(token, active_opts)
+    when :mark;                   node << UP::Mark::Any.new(token, active_opts)
+    when :combining_mark;         node << UP::Mark::Combining.new(token, active_opts)
+    when :nonspacing_mark;        node << UP::Mark::Nonspacing.new(token, active_opts)
+    when :spacing_mark;           node << UP::Mark::Spacing.new(token, active_opts)
+    when :enclosing_mark;         node << UP::Mark::Enclosing.new(token, active_opts)
+    when :number;                 node << UP::Number::Any.new(token, active_opts)
+    when :decimal_number;         node << UP::Number::Decimal.new(token, active_opts)
+    when :letter_number;          node << UP::Number::Letter.new(token, active_opts)
+    when :other_number;           node << UP::Number::Other.new(token, active_opts)
+    when :punctuation;            node << UP::Punctuation::Any.new(token, active_opts)
+    when :connector_punctuation;  node << UP::Punctuation::Connector.new(token, active_opts)
+    when :dash_punctuation;       node << UP::Punctuation::Dash.new(token, active_opts)
+    when :open_punctuation;       node << UP::Punctuation::Open.new(token, active_opts)
+    when :close_punctuation;      node << UP::Punctuation::Close.new(token, active_opts)
+    when :initial_punctuation;    node << UP::Punctuation::Initial.new(token, active_opts)
+    when :final_punctuation;      node << UP::Punctuation::Final.new(token, active_opts)
+    when :other_punctuation;      node << UP::Punctuation::Other.new(token, active_opts)
+    when :separator;              node << UP::Separator::Any.new(token, active_opts)
+    when :space_separator;        node << UP::Separator::Space.new(token, active_opts)
+    when :line_separator;         node << UP::Separator::Line.new(token, active_opts)
+    when :paragraph_separator;    node << UP::Separator::Paragraph.new(token, active_opts)
+    when :symbol;                 node << UP::Symbol::Any.new(token, active_opts)
+    when :math_symbol;            node << UP::Symbol::Math.new(token, active_opts)
+    when :currency_symbol;        node << UP::Symbol::Currency.new(token, active_opts)
+    when :modifier_symbol;        node << UP::Symbol::Modifier.new(token, active_opts)
+    when :other_symbol;           node << UP::Symbol::Other.new(token, active_opts)
+    when :other;                  node << UP::Codepoint::Any.new(token, active_opts)
+    when :control;                node << UP::Codepoint::Control.new(token, active_opts)
+    when :format;                 node << UP::Codepoint::Format.new(token, active_opts)
+    when :surrogate;              node << UP::Codepoint::Surrogate.new(token, active_opts)
+    when :private_use;            node << UP::Codepoint::PrivateUse.new(token, active_opts)
+    when :unassigned;             node << UP::Codepoint::Unassigned.new(token, active_opts)
+    when *UPTokens::Age;          node << UP::Age.new(token, active_opts)
+    when *UPTokens::Derived;      node << UP::Derived.new(token, active_opts)
+    when *UPTokens::Emoji;        node << UP::Emoji.new(token, active_opts)
+    when *UPTokens::Enumerated;   node << UP::Enumerated.new(token, active_opts)
+    when *UPTokens::Script;       node << UP::Script.new(token, active_opts)
+    when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts)
     else
       raise UnknownTokenError.new('UnicodeProperty', token)
@@ -473,86 +477,39 @@ class Regexp::Parser
   end
   def quantifier(token)
-    target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
-    target_node or raise ParserError, "No valid target found for '#{token.text}'"
+    target_node = node.extract_quantifier_target(token.text)
     # in case of chained quantifiers, wrap target in an implicit passive group
     # description of the problem: https://github.com/ammar/regexp_parser/issues/3
     # rationale for this solution: https://github.com/ammar/regexp_parser/pull/69
     if target_node.quantified?
-      new_token = Regexp::Token.new(
-        :group,
-        :passive,
-        '', # text
-        target_node.ts,
-        nil, # te (unused)
-        target_node.level,
-        target_node.set_level,
-        target_node.conditional_level
+      new_group = Group::Passive.construct(
+        token:             :passive,
+        ts:                target_node.ts,
+        level:             target_node.level,
+        set_level:         target_node.set_level,
+        conditional_level: target_node.conditional_level,
+        options:           active_opts,
       )
-      new_group = Group::Passive.new(new_token, active_opts)
       new_group.implicit = true
       new_group << target_node
-      increase_level(target_node)
+      increase_group_level(target_node)
       node.expressions[node.expressions.index(target_node)] = new_group
       target_node = new_group
     end
-    case token.token
-    when :zero_or_one
-      target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
-    when :zero_or_one_reluctant
-      target_node.quantify(:zero_or_one, token.text, 0, 1, :reluctant)
-    when :zero_or_one_possessive
-      target_node.quantify(:zero_or_one, token.text, 0, 1, :possessive)
-    when :zero_or_more
-      target_node.quantify(:zero_or_more, token.text, 0, -1, :greedy)
-    when :zero_or_more_reluctant
-      target_node.quantify(:zero_or_more, token.text, 0, -1, :reluctant)
-    when :zero_or_more_possessive
-      target_node.quantify(:zero_or_more, token.text, 0, -1, :possessive)
-    when :one_or_more
-      target_node.quantify(:one_or_more, token.text, 1, -1, :greedy)
-    when :one_or_more_reluctant
-      target_node.quantify(:one_or_more, token.text, 1, -1, :reluctant)
-    when :one_or_more_possessive
-      target_node.quantify(:one_or_more, token.text, 1, -1, :possessive)
-    when :interval
-      interval(target_node, token)
-    else
+    unless token.token =~ /\A(?:zero_or_one|zero_or_more|one_or_more|interval)
+                             (?:_greedy|_reluctant|_possessive)?\z/x
       raise UnknownTokenError.new('Quantifier', token)
     end
+    target_node.quantify(token, active_opts)
   end
-  def increase_level(exp)
+  def increase_group_level(exp)
     exp.level += 1
-    exp.respond_to?(:each) && exp.each { |subexp| increase_level(subexp) }
-  end
-  def interval(target_node, token)
-    text = token.text
-    mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
-    case mchr
-    when '?'
-      range_text = text[0...-1]
-      mode = :reluctant
-    when '+'
-      range_text = text[0...-1]
-      mode = :possessive
-    else
-      range_text = text
-      mode = :greedy
-    end
-    range = range_text.gsub(/\{|\}/, '').split(',', 2)
-    min = range[0].empty? ? 0 : range[0]
-    max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
-    target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
+    exp.quantifier.level += 1 if exp.quantifier
+    exp.terminal? || exp.each { |subexp| increase_group_level(subexp) }
   end
   def set(token)
@@ -568,6 +525,8 @@ class Regexp::Parser
   end
   def open_set(token)
+    # TODO: this and Quantifier are the only cases where Expression#token
+    # does not match the scanner/lexer output. Fix in v3.0.0.
     token.token = :character
     nest(CharacterSet.new(token, active_opts))
   end
@@ -582,7 +541,7 @@ class Regexp::Parser
   def range(token)
     exp = CharacterSet::Range.new(token, active_opts)
-    scope = node.last.is_a?(CharacterSet::IntersectedSequence) ? node.last : node
+    scope = node.last.instance_of?(CharacterSet::IntersectedSequence) ? node.last : node
     exp << scope.expressions.pop
     nest(exp)
   end
@@ -609,26 +568,29 @@ class Regexp::Parser
   end
   def close_completed_character_set_range
-    decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
+    decrease_nesting if node.instance_of?(CharacterSet::Range) && node.complete?
   end
   def active_opts
     options_stack.last
   end
-  # Assigns referenced expressions to refering expressions, e.g. if there is
+  # Assigns referenced expressions to referring expressions, e.g. if there is
   # an instance of Backreference::Number, its #referenced_expression is set to
   # the instance of Group::Capture that it refers to via its number.
   def assign_referenced_expressions
-    targets = {}
-    # find all referencable expressions
+    # find all referenceable and referring expressions
+    targets = { 0 => root }
+    referrers = []
     root.each_expression do |exp|
       exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
+      referrers << exp if exp.referential?
     end
-    # assign them to any refering expressions
-    root.each_expression do |exp|
-      exp.respond_to?(:reference) &&
-        exp.referenced_expression = targets[exp.reference]
+    # assign reference expression to referring expressions
+    # (in a second iteration because there might be forward references)
+    referrers.each do |exp|
+      exp.referenced_expression = targets[exp.reference] ||
+        raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}")
     end
   end
 end # class Regexp::Parser

data/lib/regexp_parser/scanner/errors/premature_end_error.rb ADDED Viewed

@@ -0,0 +1,8 @@
+class Regexp::Scanner
+  # Unexpected end of pattern
+  class PrematureEndError < ScannerError
+    def initialize(where = '')
+      super "Premature end of pattern at #{where}"
+    end
+  end
+end

data/lib/regexp_parser/scanner/errors/scanner_error.rb ADDED Viewed

@@ -0,0 +1,6 @@
+require_relative '../../../regexp_parser/error'
+class Regexp::Scanner
+  # General scanner error (catch all)
+  class ScannerError < Regexp::Parser::Error; end
+end

data/lib/regexp_parser/scanner/errors/validation_error.rb ADDED Viewed

@@ -0,0 +1,63 @@
+class Regexp::Scanner
+  # Base for all scanner validation errors
+  class ValidationError < ScannerError
+    # Centralizes and unifies the handling of validation related errors.
+    def self.for(type, problem, reason = nil)
+      types.fetch(type).new(problem, reason)
+    end
+    def self.types
+      @types ||= {
+        backref:      InvalidBackrefError,
+        group:        InvalidGroupError,
+        group_option: InvalidGroupOption,
+        posix_class:  UnknownPosixClassError,
+        property:     UnknownUnicodePropertyError,
+        sequence:     InvalidSequenceError,
+      }
+    end
+  end
+  # Invalid sequence format. Used for escape sequences, mainly.
+  class InvalidSequenceError < ValidationError
+    def initialize(what = 'sequence', where = '')
+      super "Invalid #{what} at #{where}"
+    end
+  end
+  # Invalid group. Used for named groups.
+  class InvalidGroupError < ValidationError
+    def initialize(what, reason)
+      super "Invalid #{what}, #{reason}."
+    end
+  end
+  # Invalid group option. Used for inline options.
+  # TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
+  class InvalidGroupOption < ValidationError
+    def initialize(option, text)
+      super "Invalid group option #{option} in #{text}"
+    end
+  end
+  # Invalid back reference. Used for name and number refs/calls.
+  class InvalidBackrefError < ValidationError
+    def initialize(what, reason)
+      super "Invalid back reference #{what}, #{reason}"
+    end
+  end
+  # The property name was not recognized by the scanner.
+  class UnknownUnicodePropertyError < ValidationError
+    def initialize(name, _)
+      super "Unknown unicode character property name #{name}"
+    end
+  end
+  # The POSIX class name was not recognized by the scanner.
+  class UnknownPosixClassError < ValidationError
+    def initialize(text, _)
+      super "Unknown POSIX class #{text}"
+    end
+  end
+end