RubyGems - regexp_parser - Versions diffs - 1.7.0 → 2.9.0 - Mend

regexp_parser 1.7.0 → 2.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (166)

checksums.yaml +4 -4
data/Gemfile +9 -3
data/LICENSE +1 -1
data/Rakefile +6 -70
data/lib/regexp_parser/error.rb +4 -0
data/lib/regexp_parser/expression/base.rb +76 -0
data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +22 -2
data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +4 -8
data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +4 -8
data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
data/lib/regexp_parser/expression/classes/conditional.rb +11 -5
data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
data/lib/regexp_parser/expression/classes/free_space.rb +5 -5
data/lib/regexp_parser/expression/classes/group.rb +28 -15
data/lib/regexp_parser/expression/classes/keep.rb +2 -0
data/lib/regexp_parser/expression/classes/literal.rb +1 -5
data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
data/lib/regexp_parser/expression/classes/root.rb +4 -19
data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +11 -12
data/lib/regexp_parser/expression/methods/construct.rb +41 -0
data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
data/lib/regexp_parser/expression/methods/match_length.rb +11 -7
data/lib/regexp_parser/expression/methods/negative.rb +20 -0
data/lib/regexp_parser/expression/methods/parts.rb +23 -0
data/lib/regexp_parser/expression/methods/printing.rb +26 -0
data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
data/lib/regexp_parser/expression/methods/tests.rb +47 -1
data/lib/regexp_parser/expression/methods/traverse.rb +34 -18
data/lib/regexp_parser/expression/quantifier.rb +57 -17
data/lib/regexp_parser/expression/sequence.rb +11 -47
data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
data/lib/regexp_parser/expression/shared.rb +111 -0
data/lib/regexp_parser/expression/subexpression.rb +27 -19
data/lib/regexp_parser/expression.rb +15 -141
data/lib/regexp_parser/lexer.rb +83 -41
data/lib/regexp_parser/parser.rb +372 -429
data/lib/regexp_parser/scanner/char_type.rl +11 -11
data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
data/lib/regexp_parser/scanner/properties/long.csv +651 -0
data/lib/regexp_parser/scanner/properties/short.csv +249 -0
data/lib/regexp_parser/scanner/property.rl +4 -4
data/lib/regexp_parser/scanner/scanner.rl +303 -368
data/lib/regexp_parser/scanner.rb +1423 -1674
data/lib/regexp_parser/syntax/any.rb +2 -7
data/lib/regexp_parser/syntax/base.rb +92 -67
data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
data/lib/regexp_parser/syntax/token/escape.rb +33 -0
data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
data/lib/regexp_parser/syntax/token/meta.rb +20 -0
data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
data/lib/regexp_parser/syntax/token/unicode_property.rb +751 -0
data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
data/lib/regexp_parser/syntax/token.rb +45 -0
data/lib/regexp_parser/syntax/version_lookup.rb +19 -36
data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
data/lib/regexp_parser/syntax/versions.rb +3 -1
data/lib/regexp_parser/syntax.rb +8 -6
data/lib/regexp_parser/token.rb +9 -20
data/lib/regexp_parser/version.rb +1 -1
data/lib/regexp_parser.rb +0 -2
data/regexp_parser.gemspec +19 -23
metadata +53 -171
data/CHANGELOG.md +0 -349
data/README.md +0 -470
data/lib/regexp_parser/scanner/properties/long.yml +0 -594
data/lib/regexp_parser/scanner/properties/short.yml +0 -237
data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
data/lib/regexp_parser/syntax/tokens.rb +0 -45
data/spec/expression/base_spec.rb +0 -94
data/spec/expression/clone_spec.rb +0 -120
data/spec/expression/conditional_spec.rb +0 -89
data/spec/expression/free_space_spec.rb +0 -27
data/spec/expression/methods/match_length_spec.rb +0 -161
data/spec/expression/methods/match_spec.rb +0 -25
data/spec/expression/methods/strfregexp_spec.rb +0 -224
data/spec/expression/methods/tests_spec.rb +0 -99
data/spec/expression/methods/traverse_spec.rb +0 -161
data/spec/expression/options_spec.rb +0 -128
data/spec/expression/root_spec.rb +0 -9
data/spec/expression/sequence_spec.rb +0 -9
data/spec/expression/subexpression_spec.rb +0 -50
data/spec/expression/to_h_spec.rb +0 -26
data/spec/expression/to_s_spec.rb +0 -100
data/spec/lexer/all_spec.rb +0 -22
data/spec/lexer/conditionals_spec.rb +0 -53
data/spec/lexer/escapes_spec.rb +0 -14
data/spec/lexer/keep_spec.rb +0 -10
data/spec/lexer/literals_spec.rb +0 -89
data/spec/lexer/nesting_spec.rb +0 -99
data/spec/lexer/refcalls_spec.rb +0 -55
data/spec/parser/all_spec.rb +0 -43
data/spec/parser/alternation_spec.rb +0 -88
data/spec/parser/anchors_spec.rb +0 -17
data/spec/parser/conditionals_spec.rb +0 -179
data/spec/parser/errors_spec.rb +0 -30
data/spec/parser/escapes_spec.rb +0 -121
data/spec/parser/free_space_spec.rb +0 -130
data/spec/parser/groups_spec.rb +0 -108
data/spec/parser/keep_spec.rb +0 -6
data/spec/parser/posix_classes_spec.rb +0 -8
data/spec/parser/properties_spec.rb +0 -115
data/spec/parser/quantifiers_spec.rb +0 -51
data/spec/parser/refcalls_spec.rb +0 -112
data/spec/parser/set/intersections_spec.rb +0 -127
data/spec/parser/set/ranges_spec.rb +0 -111
data/spec/parser/sets_spec.rb +0 -178
data/spec/parser/types_spec.rb +0 -18
data/spec/scanner/all_spec.rb +0 -18
data/spec/scanner/anchors_spec.rb +0 -21
data/spec/scanner/conditionals_spec.rb +0 -128
data/spec/scanner/errors_spec.rb +0 -68
data/spec/scanner/escapes_spec.rb +0 -53
data/spec/scanner/free_space_spec.rb +0 -133
data/spec/scanner/groups_spec.rb +0 -52
data/spec/scanner/keep_spec.rb +0 -10
data/spec/scanner/literals_spec.rb +0 -49
data/spec/scanner/meta_spec.rb +0 -18
data/spec/scanner/properties_spec.rb +0 -64
data/spec/scanner/quantifiers_spec.rb +0 -20
data/spec/scanner/refcalls_spec.rb +0 -36
data/spec/scanner/sets_spec.rb +0 -102
data/spec/scanner/types_spec.rb +0 -14
data/spec/spec_helper.rb +0 -15
data/spec/support/runner.rb +0 -42
data/spec/support/shared_examples.rb +0 -77
data/spec/support/warning_extractor.rb +0 -60
data/spec/syntax/syntax_spec.rb +0 -48
data/spec/syntax/syntax_token_map_spec.rb +0 -23
data/spec/syntax/versions/1.8.6_spec.rb +0 -17
data/spec/syntax/versions/1.9.1_spec.rb +0 -10
data/spec/syntax/versions/1.9.3_spec.rb +0 -9
data/spec/syntax/versions/2.0.0_spec.rb +0 -13
data/spec/syntax/versions/2.2.0_spec.rb +0 -9
data/spec/syntax/versions/aliases_spec.rb +0 -37
data/spec/token/token_spec.rb +0 -85
data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0

data/lib/regexp_parser/parser.rb CHANGED

@@ -1,10 +1,10 @@
+require 'regexp_parser/error'
 require 'regexp_parser/expression'
 class Regexp::Parser
   include Regexp::Expression
-  include Regexp::Syntax
-  class ParserError < StandardError; end
+  class ParserError < Regexp::Parser::Error; end
   class UnknownTokenTypeError < ParserError
     def initialize(type, token)
@@ -18,12 +18,12 @@ class Regexp::Parser
     end
   end
-  def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
-    new.parse(input, syntax, &block)
+  def self.parse(input, syntax = nil, options: nil, &block)
+    new.parse(input, syntax, options: options, &block)
   end
-  def parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
-    root = Root.build(options_from_input(input))
+  def parse(input, syntax = nil, options: nil, &block)
+    root = Root.construct(options: extract_options(input, options))
     self.root = root
     self.node = root
@@ -35,10 +35,13 @@ class Regexp::Parser
     self.captured_group_counts = Hash.new(0)
-    Regexp::Lexer.scan(input, syntax) do |token|
+    Regexp::Lexer.scan(input, syntax, options: options, collect_tokens: false) do |token|
       parse_token(token)
     end
+    # Trigger recursive setting of #nesting_level, which reflects how deep
+    # a node is in the tree. Do this at the end to account for tree rewrites.
+    root.nesting_level = 0
     assign_referenced_expressions
     if block_given?
@@ -54,107 +57,173 @@ class Regexp::Parser
                 :options_stack, :switching_options, :conditional_nesting,
                 :captured_group_counts
-  def options_from_input(input)
-    return {} unless input.is_a?(::Regexp)
+  def extract_options(input, options)
+    if options && !input.is_a?(String)
+      raise ArgumentError, 'options cannot be supplied unless parsing a String'
+    end
-    options = {}
-    options[:i] = true if input.options & ::Regexp::IGNORECASE != 0
-    options[:m] = true if input.options & ::Regexp::MULTILINE  != 0
-    options[:x] = true if input.options & ::Regexp::EXTENDED   != 0
-    options
-  end
+    options = input.options if input.is_a?(::Regexp)
-  def nest(exp)
-    nesting.push(exp)
-    node << exp
-    update_transplanted_subtree(exp, node)
-    self.node = exp
-  end
+    return {} unless options
-  # subtrees are transplanted to build Alternations, Intersections, Ranges
-  def update_transplanted_subtree(exp, new_parent)
-    exp.nesting_level = new_parent.nesting_level + 1
-    exp.respond_to?(:each) &&
-      exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
-  end
-  def decrease_nesting
-    while nesting.last.is_a?(SequenceOperation)
-      nesting.pop
-      self.node = nesting.last
-    end
-    nesting.pop
-    yield(node) if block_given?
-    self.node = nesting.last
-    self.node = node.last if node.last.is_a?(SequenceOperation)
-  end
-  def nest_conditional(exp)
-    conditional_nesting.push(exp)
-    nest(exp)
+    enabled_options = {}
+    enabled_options[:i] = true if options & ::Regexp::IGNORECASE != 0
+    enabled_options[:m] = true if options & ::Regexp::MULTILINE  != 0
+    enabled_options[:x] = true if options & ::Regexp::EXTENDED   != 0
+    enabled_options
   end
   def parse_token(token)
-    close_completed_character_set_range
     case token.type
-    when :meta;         meta(token)
-    when :quantifier;   quantifier(token)
-    when :anchor;       anchor(token)
-    when :escape;       escape(token)
-    when :group;        group(token)
-    when :assertion;    group(token)
-    when :set;          set(token)
-    when :type;         type(token)
-    when :backref;      backref(token)
-    when :conditional;  conditional(token)
-    when :keep;         keep(token)
-    when :posixclass, :nonposixclass
-      posixclass(token)
-    when :property, :nonproperty
-      property(token)
-    when :literal
-      node << Literal.new(token, active_opts)
-    when :free_space
-      free_space(token)
+    when :anchor;                     anchor(token)
+    when :assertion, :group;          group(token)
+    when :backref;                    backref(token)
+    when :conditional;                conditional(token)
+    when :escape;                     escape(token)
+    when :free_space;                 free_space(token)
+    when :keep;                       keep(token)
+    when :literal;                    literal(token)
+    when :meta;                       meta(token)
+    when :posixclass, :nonposixclass; posixclass(token)
+    when :property, :nonproperty;     property(token)
+    when :quantifier;                 quantifier(token)
+    when :set;                        set(token)
+    when :type;                       type(token)
     else
       raise UnknownTokenTypeError.new(token.type, token)
     end
+    close_completed_character_set_range
   end
-  def set(token)
+  def anchor(token)
     case token.token
-    when :open
-      open_set(token)
-    when :close
-      close_set
-    when :negate
-      negate_set
-    when :range
-      range(token)
-    when :intersection
-      intersection(token)
-    when :collation, :equivalent
-      node << Literal.new(token, active_opts)
+    when :bol;              node << Anchor::BeginningOfLine.new(token, active_opts)
+    when :bos;              node << Anchor::BOS.new(token, active_opts)
+    when :eol;              node << Anchor::EndOfLine.new(token, active_opts)
+    when :eos;              node << Anchor::EOS.new(token, active_opts)
+    when :eos_ob_eol;       node << Anchor::EOSobEOL.new(token, active_opts)
+    when :match_start;      node << Anchor::MatchStart.new(token, active_opts)
+    when :nonword_boundary; node << Anchor::NonWordBoundary.new(token, active_opts)
+    when :word_boundary;    node << Anchor::WordBoundary.new(token, active_opts)
     else
-      raise UnknownTokenError.new('CharacterSet', token)
+      raise UnknownTokenError.new('Anchor', token)
     end
   end
-  def meta(token)
+  def group(token)
     case token.token
-    when :dot
-      node << CharacterType::Any.new(token, active_opts)
-    when :alternation
-      sequence_operation(Alternation, token)
+    when :options, :options_switch
+      options_group(token)
+    when :close
+      close_group
+    when :comment
+      node << Group::Comment.new(token, active_opts)
     else
-      raise UnknownTokenError.new('Meta', token)
+      open_group(token)
     end
   end
+  MOD_FLAGS = %w[i m x].map(&:to_sym)
+  ENC_FLAGS = %w[a d u].map(&:to_sym)
+  def options_group(token)
+    positive, negative = token.text.split('-', 2)
+    negative ||= ''
+    self.switching_options = token.token.equal?(:options_switch)
+    opt_changes = {}
+    new_active_opts = active_opts.dup
+    MOD_FLAGS.each do |flag|
+      if positive.include?(flag.to_s)
+        opt_changes[flag] = new_active_opts[flag] = true
+      end
+      if negative.include?(flag.to_s)
+        opt_changes[flag] = false
+        new_active_opts.delete(flag)
+      end
+    end
+    if (enc_flag = positive.reverse[/[adu]/])
+      enc_flag = enc_flag.to_sym
+      (ENC_FLAGS - [enc_flag]).each do |other|
+        opt_changes[other] = false if new_active_opts[other]
+        new_active_opts.delete(other)
+      end
+      opt_changes[enc_flag] = new_active_opts[enc_flag] = true
+    end
+    options_stack << new_active_opts
+    options_group = Group::Options.new(token, active_opts)
+    options_group.option_changes = opt_changes
+    nest(options_group)
+  end
+  def open_group(token)
+    group_class =
+      case token.token
+      when :absence;     Group::Absence
+      when :atomic;      Group::Atomic
+      when :capture;     Group::Capture
+      when :named;       Group::Named
+      when :passive;     Group::Passive
+      when :lookahead;   Assertion::Lookahead
+      when :lookbehind;  Assertion::Lookbehind
+      when :nlookahead;  Assertion::NegativeLookahead
+      when :nlookbehind; Assertion::NegativeLookbehind
+      else
+        raise UnknownTokenError.new('Group type open', token)
+      end
+    group = group_class.new(token, active_opts)
+    if group.capturing?
+      group.number          = total_captured_group_count + 1
+      group.number_at_level = captured_group_count_at_level + 1
+      count_captured_group
+    end
+    # Push the active options to the stack again. This way we can simply pop the
+    # stack for any group we close, no matter if it had its own options or not.
+    options_stack << active_opts
+    nest(group)
+  end
+  def total_captured_group_count
+    captured_group_counts.values.reduce(0, :+)
+  end
+  def captured_group_count_at_level
+    captured_group_counts[node]
+  end
+  def count_captured_group
+    captured_group_counts[node] += 1
+  end
+  def close_group
+    options_stack.pop unless switching_options
+    self.switching_options = false
+    decrease_nesting
+  end
+  def decrease_nesting
+    while nesting.last.is_a?(SequenceOperation)
+      nesting.pop
+      self.node = nesting.last
+    end
+    nesting.pop
+    yield(node) if block_given?
+    self.node = nesting.last
+    self.node = node.last if node.last.is_a?(SequenceOperation)
+  end
   def backref(token)
     case token.token
     when :name_ref
@@ -163,10 +232,18 @@ class Regexp::Parser
       node << Backreference::NameRecursionLevel.new(token, active_opts)
     when :name_call
       node << Backreference::NameCall.new(token, active_opts)
-    when :number, :number_ref
+    when :number, :number_ref # TODO: split in v3.0.0
       node << Backreference::Number.new(token, active_opts)
     when :number_recursion_ref
-      node << Backreference::NumberRecursionLevel.new(token, active_opts)
+      node << Backreference::NumberRecursionLevel.new(token, active_opts).tap do |exp|
+        # TODO: should split off new token number_recursion_rel_ref and new
+        # class NumberRelativeRecursionLevel in v3.0.0 to get rid of this
+        if exp.text =~ /[<'][+-]/
+          assign_effective_number(exp)
+        else
+          exp.effective_number = exp.number
+        end
+      end
     when :number_call
       node << Backreference::NumberCall.new(token, active_opts)
     when :number_rel_ref
@@ -182,31 +259,11 @@ class Regexp::Parser
     end
   end
-  def type(token)
-    case token.token
-    when :digit
-      node << CharacterType::Digit.new(token, active_opts)
-    when :nondigit
-      node << CharacterType::NonDigit.new(token, active_opts)
-    when :hex
-      node << CharacterType::Hex.new(token, active_opts)
-    when :nonhex
-      node << CharacterType::NonHex.new(token, active_opts)
-    when :space
-      node << CharacterType::Space.new(token, active_opts)
-    when :nonspace
-      node << CharacterType::NonSpace.new(token, active_opts)
-    when :word
-      node << CharacterType::Word.new(token, active_opts)
-    when :nonword
-      node << CharacterType::NonWord.new(token, active_opts)
-    when :linebreak
-      node << CharacterType::Linebreak.new(token, active_opts)
-    when :xgrapheme
-      node << CharacterType::ExtendedGrapheme.new(token, active_opts)
-    else
-      raise UnknownTokenError.new('CharacterType', token)
-    end
+  def assign_effective_number(exp)
+    exp.effective_number =
+      exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
+    exp.effective_number > 0 ||
+      raise(ParserError, "Invalid reference: #{exp.reference}")
   end
   def conditional(token)
@@ -215,9 +272,9 @@ class Regexp::Parser
       nest_conditional(Conditional::Expression.new(token, active_opts))
     when :condition
       conditional_nesting.last.condition = Conditional::Condition.new(token, active_opts)
-      conditional_nesting.last.add_sequence(active_opts)
+      conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
     when :separator
-      conditional_nesting.last.add_sequence(active_opts)
+      conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
       self.node = conditional_nesting.last.branches.last
     when :close
       conditional_nesting.pop
@@ -234,157 +291,38 @@ class Regexp::Parser
     end
   end
-  def posixclass(token)
-    node << PosixClass.new(token, active_opts)
-  end
-  include Regexp::Expression::UnicodeProperty
-  def property(token)
-    case token.token
-    when :alnum;                  node << Alnum.new(token, active_opts)
-    when :alpha;                  node << Alpha.new(token, active_opts)
-    when :ascii;                  node << Ascii.new(token, active_opts)
-    when :blank;                  node << Blank.new(token, active_opts)
-    when :cntrl;                  node << Cntrl.new(token, active_opts)
-    when :digit;                  node << Digit.new(token, active_opts)
-    when :graph;                  node << Graph.new(token, active_opts)
-    when :lower;                  node << Lower.new(token, active_opts)
-    when :print;                  node << Print.new(token, active_opts)
-    when :punct;                  node << Punct.new(token, active_opts)
-    when :space;                  node << Space.new(token, active_opts)
-    when :upper;                  node << Upper.new(token, active_opts)
-    when :word;                   node << Word.new(token, active_opts)
-    when :xdigit;                 node << Xdigit.new(token, active_opts)
-    when :xposixpunct;            node << XPosixPunct.new(token, active_opts)
-    # only in Oniguruma (old rubies)
-    when :newline;                node << Newline.new(token, active_opts)
-    when :any;                    node << Any.new(token, active_opts)
-    when :assigned;               node << Assigned.new(token, active_opts)
-    when :letter;                 node << Letter::Any.new(token, active_opts)
-    when :cased_letter;           node << Letter::Cased.new(token, active_opts)
-    when :uppercase_letter;       node << Letter::Uppercase.new(token, active_opts)
-    when :lowercase_letter;       node << Letter::Lowercase.new(token, active_opts)
-    when :titlecase_letter;       node << Letter::Titlecase.new(token, active_opts)
-    when :modifier_letter;        node << Letter::Modifier.new(token, active_opts)
-    when :other_letter;           node << Letter::Other.new(token, active_opts)
-    when :mark;                   node << Mark::Any.new(token, active_opts)
-    when :combining_mark;         node << Mark::Combining.new(token, active_opts)
-    when :nonspacing_mark;        node << Mark::Nonspacing.new(token, active_opts)
-    when :spacing_mark;           node << Mark::Spacing.new(token, active_opts)
-    when :enclosing_mark;         node << Mark::Enclosing.new(token, active_opts)
-    when :number;                 node << Number::Any.new(token, active_opts)
-    when :decimal_number;         node << Number::Decimal.new(token, active_opts)
-    when :letter_number;          node << Number::Letter.new(token, active_opts)
-    when :other_number;           node << Number::Other.new(token, active_opts)
-    when :punctuation;            node << Punctuation::Any.new(token, active_opts)
-    when :connector_punctuation;  node << Punctuation::Connector.new(token, active_opts)
-    when :dash_punctuation;       node << Punctuation::Dash.new(token, active_opts)
-    when :open_punctuation;       node << Punctuation::Open.new(token, active_opts)
-    when :close_punctuation;      node << Punctuation::Close.new(token, active_opts)
-    when :initial_punctuation;    node << Punctuation::Initial.new(token, active_opts)
-    when :final_punctuation;      node << Punctuation::Final.new(token, active_opts)
-    when :other_punctuation;      node << Punctuation::Other.new(token, active_opts)
-    when :separator;              node << Separator::Any.new(token, active_opts)
-    when :space_separator;        node << Separator::Space.new(token, active_opts)
-    when :line_separator;         node << Separator::Line.new(token, active_opts)
-    when :paragraph_separator;    node << Separator::Paragraph.new(token, active_opts)
-    when :symbol;                 node << Symbol::Any.new(token, active_opts)
-    when :math_symbol;            node << Symbol::Math.new(token, active_opts)
-    when :currency_symbol;        node << Symbol::Currency.new(token, active_opts)
-    when :modifier_symbol;        node << Symbol::Modifier.new(token, active_opts)
-    when :other_symbol;           node << Symbol::Other.new(token, active_opts)
-    when :other;                  node << Codepoint::Any.new(token, active_opts)
-    when :control;                node << Codepoint::Control.new(token, active_opts)
-    when :format;                 node << Codepoint::Format.new(token, active_opts)
-    when :surrogate;              node << Codepoint::Surrogate.new(token, active_opts)
-    when :private_use;            node << Codepoint::PrivateUse.new(token, active_opts)
-    when :unassigned;             node << Codepoint::Unassigned.new(token, active_opts)
-    when *Token::UnicodeProperty::Age
-      node << Age.new(token, active_opts)
-    when *Token::UnicodeProperty::Derived
-      node << Derived.new(token, active_opts)
-    when *Token::UnicodeProperty::Emoji
-      node << Emoji.new(token, active_opts)
-    when *Token::UnicodeProperty::Script
-      node << Script.new(token, active_opts)
-    when *Token::UnicodeProperty::UnicodeBlock
-      node << Block.new(token, active_opts)
-    else
-      raise UnknownTokenError.new('UnicodeProperty', token)
-    end
+  def nest_conditional(exp)
+    conditional_nesting.push(exp)
+    nest(exp)
   end
-  def anchor(token)
-    case token.token
-    when :bol
-      node << Anchor::BeginningOfLine.new(token, active_opts)
-    when :eol
-      node << Anchor::EndOfLine.new(token, active_opts)
-    when :bos
-      node << Anchor::BOS.new(token, active_opts)
-    when :eos
-      node << Anchor::EOS.new(token, active_opts)
-    when :eos_ob_eol
-      node << Anchor::EOSobEOL.new(token, active_opts)
-    when :word_boundary
-      node << Anchor::WordBoundary.new(token, active_opts)
-    when :nonword_boundary
-      node << Anchor::NonWordBoundary.new(token, active_opts)
-    when :match_start
-      node << Anchor::MatchStart.new(token, active_opts)
-    else
-      raise UnknownTokenError.new('Anchor', token)
-    end
+  def nest(exp)
+    nesting.push(exp)
+    node << exp
+    self.node = exp
   end
   def escape(token)
     case token.token
-    when :backspace
-      node << EscapeSequence::Backspace.new(token, active_opts)
-    when :escape
-      node << EscapeSequence::AsciiEscape.new(token, active_opts)
-    when :bell
-      node << EscapeSequence::Bell.new(token, active_opts)
-    when :form_feed
-      node << EscapeSequence::FormFeed.new(token, active_opts)
-    when :newline
-      node << EscapeSequence::Newline.new(token, active_opts)
-    when :carriage
-      node << EscapeSequence::Return.new(token, active_opts)
-    when :tab
-      node << EscapeSequence::Tab.new(token, active_opts)
-    when :vertical_tab
-      node << EscapeSequence::VerticalTab.new(token, active_opts)
-    when :hex
-      node << EscapeSequence::Hex.new(token, active_opts)
-    when :octal
-      node << EscapeSequence::Octal.new(token, active_opts)
-    when :codepoint
-      node << EscapeSequence::Codepoint.new(token, active_opts)
-    when :codepoint_list
-      node << EscapeSequence::CodepointList.new(token, active_opts)
+    when :backspace;      node << EscapeSequence::Backspace.new(token, active_opts)
+    when :escape;         node << EscapeSequence::AsciiEscape.new(token, active_opts)
+    when :bell;           node << EscapeSequence::Bell.new(token, active_opts)
+    when :form_feed;      node << EscapeSequence::FormFeed.new(token, active_opts)
+    when :newline;        node << EscapeSequence::Newline.new(token, active_opts)
+    when :carriage;       node << EscapeSequence::Return.new(token, active_opts)
+    when :tab;            node << EscapeSequence::Tab.new(token, active_opts)
+    when :vertical_tab;   node << EscapeSequence::VerticalTab.new(token, active_opts)
+    when :codepoint;      node << EscapeSequence::Codepoint.new(token, active_opts)
+    when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
+    when :hex;            node << EscapeSequence::Hex.new(token, active_opts)
+    when :octal;          node << EscapeSequence::Octal.new(token, active_opts)
     when :control
       if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
+        # TODO: emit :meta_control_sequence token in v3.0.0
         node << EscapeSequence::MetaControl.new(token, active_opts)
       else
         node << EscapeSequence::Control.new(token, active_opts)
@@ -392,6 +330,7 @@ class Regexp::Parser
     when :meta_sequence
       if token.text =~ /\A\\M-\\[Cc]/
+        # TODO: emit :meta_control_sequence token in v3.0.0:
         node << EscapeSequence::MetaControl.new(token, active_opts)
       else
         node << EscapeSequence::Meta.new(token, active_opts)
@@ -399,188 +338,195 @@ class Regexp::Parser
     else
       # treating everything else as a literal
+      # TODO: maybe split this up a bit more in v3.0.0?
+      # E.g. escaped quantifiers or set meta chars are not the same
+      # as stuff that would be a literal even without the backslash.
+      # Right now, they all end up here.
       node << EscapeSequence::Literal.new(token, active_opts)
     end
   end
-  def keep(token)
-    node << Keep::Mark.new(token, active_opts)
-  end
   def free_space(token)
     case token.token
     when :comment
       node << Comment.new(token, active_opts)
     when :whitespace
-      if node.last.is_a?(WhiteSpace)
-        node.last.merge(WhiteSpace.new(token, active_opts))
-      else
-        node << WhiteSpace.new(token, active_opts)
-      end
+      node << WhiteSpace.new(token, active_opts)
     else
       raise UnknownTokenError.new('FreeSpace', token)
     end
   end
-  def quantifier(token)
-    offset = -1
-    target_node = node.expressions[offset]
-    while target_node.is_a?(FreeSpace)
-      target_node = node.expressions[offset -= 1]
-    end
+  def keep(token)
+    node << Keep::Mark.new(token, active_opts)
+  end
-    target_node || raise(ArgumentError, 'No valid target found for '\
-                                        "'#{token.text}' ")
+  def literal(token)
+    node << Literal.new(token, active_opts)
+  end
+  def meta(token)
     case token.token
-    when :zero_or_one
-      target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
-    when :zero_or_one_reluctant
-      target_node.quantify(:zero_or_one, token.text, 0, 1, :reluctant)
-    when :zero_or_one_possessive
-      target_node.quantify(:zero_or_one, token.text, 0, 1, :possessive)
-    when :zero_or_more
-      target_node.quantify(:zero_or_more, token.text, 0, -1, :greedy)
-    when :zero_or_more_reluctant
-      target_node.quantify(:zero_or_more, token.text, 0, -1, :reluctant)
-    when :zero_or_more_possessive
-      target_node.quantify(:zero_or_more, token.text, 0, -1, :possessive)
-    when :one_or_more
-      target_node.quantify(:one_or_more, token.text, 1, -1, :greedy)
-    when :one_or_more_reluctant
-      target_node.quantify(:one_or_more, token.text, 1, -1, :reluctant)
-    when :one_or_more_possessive
-      target_node.quantify(:one_or_more, token.text, 1, -1, :possessive)
-    when :interval
-      interval(target_node, token)
+    when :dot
+      node << CharacterType::Any.new(token, active_opts)
+    when :alternation
+      sequence_operation(Alternation, token)
     else
-      raise UnknownTokenError.new('Quantifier', token)
+      raise UnknownTokenError.new('Meta', token)
     end
   end
-  def interval(target_node, token)
-    text = token.text
-    mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
-    case mchr
-    when '?'
-      range_text = text[0...-1]
-      mode = :reluctant
-    when '+'
-      range_text = text[0...-1]
-      mode = :possessive
-    else
-      range_text = text
-      mode = :greedy
+  def sequence_operation(klass, token)
+    unless node.instance_of?(klass)
+      operator = klass.new(token, active_opts)
+      sequence = operator.add_sequence(active_opts, { ts: token.ts })
+      sequence.expressions = node.expressions
+      node.expressions = []
+      nest(operator)
     end
-    range = range_text.gsub(/\{|\}/, '').split(',', 2)
-    min = range[0].empty? ? 0 : range[0]
-    max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
-    target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
+    node.add_sequence(active_opts, { ts: token.te })
   end
-  def group(token)
-    case token.token
-    when :options, :options_switch
-      options_group(token)
-    when :close
-      close_group
-    when :comment
-      node << Group::Comment.new(token, active_opts)
-    else
-      open_group(token)
-    end
+  def posixclass(token)
+    node << PosixClass.new(token, active_opts)
   end
-  MOD_FLAGS = %w[i m x].map(&:to_sym)
-  ENC_FLAGS = %w[a d u].map(&:to_sym)
+  UP = Regexp::Expression::Property
+  UPTokens = Regexp::Syntax::Token::Property
-  def options_group(token)
-    positive, negative = token.text.split('-', 2)
-    negative ||= ''
-    self.switching_options = token.token.equal?(:options_switch)
+  def property(token)
+    case token.token
+    when :alnum;                  node << UP::Alnum.new(token, active_opts)
+    when :alpha;                  node << UP::Alpha.new(token, active_opts)
+    when :ascii;                  node << UP::Ascii.new(token, active_opts)
+    when :blank;                  node << UP::Blank.new(token, active_opts)
+    when :cntrl;                  node << UP::Cntrl.new(token, active_opts)
+    when :digit;                  node << UP::Digit.new(token, active_opts)
+    when :graph;                  node << UP::Graph.new(token, active_opts)
+    when :lower;                  node << UP::Lower.new(token, active_opts)
+    when :print;                  node << UP::Print.new(token, active_opts)
+    when :punct;                  node << UP::Punct.new(token, active_opts)
+    when :space;                  node << UP::Space.new(token, active_opts)
+    when :upper;                  node << UP::Upper.new(token, active_opts)
+    when :word;                   node << UP::Word.new(token, active_opts)
+    when :xdigit;                 node << UP::Xdigit.new(token, active_opts)
+    when :xposixpunct;            node << UP::XPosixPunct.new(token, active_opts)
-    opt_changes = {}
-    new_active_opts = active_opts.dup
+    # only in Oniguruma (old rubies)
+    when :newline;                node << UP::Newline.new(token, active_opts)
+    when :any;                    node << UP::Any.new(token, active_opts)
+    when :assigned;               node << UP::Assigned.new(token, active_opts)
+    when :letter;                 node << UP::Letter::Any.new(token, active_opts)
+    when :cased_letter;           node << UP::Letter::Cased.new(token, active_opts)
+    when :uppercase_letter;       node << UP::Letter::Uppercase.new(token, active_opts)
+    when :lowercase_letter;       node << UP::Letter::Lowercase.new(token, active_opts)
+    when :titlecase_letter;       node << UP::Letter::Titlecase.new(token, active_opts)
+    when :modifier_letter;        node << UP::Letter::Modifier.new(token, active_opts)
+    when :other_letter;           node << UP::Letter::Other.new(token, active_opts)
+    when :mark;                   node << UP::Mark::Any.new(token, active_opts)
+    when :combining_mark;         node << UP::Mark::Combining.new(token, active_opts)
+    when :nonspacing_mark;        node << UP::Mark::Nonspacing.new(token, active_opts)
+    when :spacing_mark;           node << UP::Mark::Spacing.new(token, active_opts)
+    when :enclosing_mark;         node << UP::Mark::Enclosing.new(token, active_opts)
+    when :number;                 node << UP::Number::Any.new(token, active_opts)
+    when :decimal_number;         node << UP::Number::Decimal.new(token, active_opts)
+    when :letter_number;          node << UP::Number::Letter.new(token, active_opts)
+    when :other_number;           node << UP::Number::Other.new(token, active_opts)
+    when :punctuation;            node << UP::Punctuation::Any.new(token, active_opts)
+    when :connector_punctuation;  node << UP::Punctuation::Connector.new(token, active_opts)
+    when :dash_punctuation;       node << UP::Punctuation::Dash.new(token, active_opts)
+    when :open_punctuation;       node << UP::Punctuation::Open.new(token, active_opts)
+    when :close_punctuation;      node << UP::Punctuation::Close.new(token, active_opts)
+    when :initial_punctuation;    node << UP::Punctuation::Initial.new(token, active_opts)
+    when :final_punctuation;      node << UP::Punctuation::Final.new(token, active_opts)
+    when :other_punctuation;      node << UP::Punctuation::Other.new(token, active_opts)
+    when :separator;              node << UP::Separator::Any.new(token, active_opts)
+    when :space_separator;        node << UP::Separator::Space.new(token, active_opts)
+    when :line_separator;         node << UP::Separator::Line.new(token, active_opts)
+    when :paragraph_separator;    node << UP::Separator::Paragraph.new(token, active_opts)
+    when :symbol;                 node << UP::Symbol::Any.new(token, active_opts)
+    when :math_symbol;            node << UP::Symbol::Math.new(token, active_opts)
+    when :currency_symbol;        node << UP::Symbol::Currency.new(token, active_opts)
+    when :modifier_symbol;        node << UP::Symbol::Modifier.new(token, active_opts)
+    when :other_symbol;           node << UP::Symbol::Other.new(token, active_opts)
+    when :other;                  node << UP::Codepoint::Any.new(token, active_opts)
+    when :control;                node << UP::Codepoint::Control.new(token, active_opts)
+    when :format;                 node << UP::Codepoint::Format.new(token, active_opts)
+    when :surrogate;              node << UP::Codepoint::Surrogate.new(token, active_opts)
+    when :private_use;            node << UP::Codepoint::PrivateUse.new(token, active_opts)
+    when :unassigned;             node << UP::Codepoint::Unassigned.new(token, active_opts)
+    when *UPTokens::Age;          node << UP::Age.new(token, active_opts)
+    when *UPTokens::Derived;      node << UP::Derived.new(token, active_opts)
+    when *UPTokens::Emoji;        node << UP::Emoji.new(token, active_opts)
+    when *UPTokens::Enumerated;   node << UP::Enumerated.new(token, active_opts)
+    when *UPTokens::Script;       node << UP::Script.new(token, active_opts)
+    when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts)
-    MOD_FLAGS.each do |flag|
-      if positive.include?(flag.to_s)
-        opt_changes[flag] = new_active_opts[flag] = true
-      end
-      if negative.include?(flag.to_s)
-        opt_changes[flag] = false
-        new_active_opts.delete(flag)
-      end
+    else
+      raise UnknownTokenError.new('UnicodeProperty', token)
     end
+  end
-    if (enc_flag = positive.reverse[/[adu]/])
-      enc_flag = enc_flag.to_sym
-      (ENC_FLAGS - [enc_flag]).each do |other|
-        opt_changes[other] = false if new_active_opts[other]
-        new_active_opts.delete(other)
-      end
-      opt_changes[enc_flag] = new_active_opts[enc_flag] = true
+  def quantifier(token)
+    target_node = node.extract_quantifier_target(token.text)
+    # in case of chained quantifiers, wrap target in an implicit passive group
+    # description of the problem: https://github.com/ammar/regexp_parser/issues/3
+    # rationale for this solution: https://github.com/ammar/regexp_parser/pull/69
+    if target_node.quantified?
+      new_group = Group::Passive.construct(
+        token:             :passive,
+        ts:                target_node.ts,
+        level:             target_node.level,
+        set_level:         target_node.set_level,
+        conditional_level: target_node.conditional_level,
+        options:           active_opts,
+      )
+      new_group.implicit = true
+      new_group << target_node
+      increase_group_level(target_node)
+      node.expressions[node.expressions.index(target_node)] = new_group
+      target_node = new_group
     end
-    options_stack << new_active_opts
+    unless token.token =~ /\A(?:zero_or_one|zero_or_more|one_or_more|interval)
+                             (?:_greedy|_reluctant|_possessive)?\z/x
+      raise UnknownTokenError.new('Quantifier', token)
+    end
-    options_group = Group::Options.new(token, active_opts)
-    options_group.option_changes = opt_changes
+    target_node.quantify(token, active_opts)
+  end
-    nest(options_group)
+  def increase_group_level(exp)
+    exp.level += 1
+    exp.quantifier.level += 1 if exp.quantifier
+    exp.terminal? || exp.each { |subexp| increase_group_level(subexp) }
   end
-  def open_group(token)
+  def set(token)
     case token.token
-    when :passive
-      exp = Group::Passive.new(token, active_opts)
-    when :atomic
-      exp = Group::Atomic.new(token, active_opts)
-    when :named
-      exp = Group::Named.new(token, active_opts)
-    when :capture
-      exp = Group::Capture.new(token, active_opts)
-    when :absence
-      exp = Group::Absence.new(token, active_opts)
-    when :lookahead
-      exp = Assertion::Lookahead.new(token, active_opts)
-    when :nlookahead
-      exp = Assertion::NegativeLookahead.new(token, active_opts)
-    when :lookbehind
-      exp = Assertion::Lookbehind.new(token, active_opts)
-    when :nlookbehind
-      exp = Assertion::NegativeLookbehind.new(token, active_opts)
+    when :open;         open_set(token)
+    when :close;        close_set
+    when :negate;       negate_set
+    when :range;        range(token)
+    when :intersection; intersection(token)
     else
-      raise UnknownTokenError.new('Group type open', token)
-    end
-    if exp.capturing?
-      exp.number          = total_captured_group_count + 1
-      exp.number_at_level = captured_group_count_at_level + 1
-      count_captured_group
+      raise UnknownTokenError.new('CharacterSet', token)
     end
-    # Push the active options to the stack again. This way we can simply pop the
-    # stack for any group we close, no matter if it had its own options or not.
-    options_stack << active_opts
-    nest(exp)
-  end
-  def close_group
-    options_stack.pop unless switching_options
-    self.switching_options = false
-    decrease_nesting
   end
   def open_set(token)
+    # TODO: this and Quantifier are the only cases where Expression#token
+    # does not match the scanner/lexer output. Fix in v3.0.0.
     token.token = :character
     nest(CharacterSet.new(token, active_opts))
   end
@@ -595,59 +541,56 @@ class Regexp::Parser
   def range(token)
     exp = CharacterSet::Range.new(token, active_opts)
-    scope = node.last.is_a?(CharacterSet::IntersectedSequence) ? node.last : node
+    scope = node.last.instance_of?(CharacterSet::IntersectedSequence) ? node.last : node
     exp << scope.expressions.pop
     nest(exp)
   end
-  def close_completed_character_set_range
-    decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
-  end
   def intersection(token)
     sequence_operation(CharacterSet::Intersection, token)
   end
-  def sequence_operation(klass, token)
-    unless node.is_a?(klass)
-      operator = klass.new(token, active_opts)
-      sequence = operator.add_sequence(active_opts)
-      sequence.expressions = node.expressions
-      node.expressions = []
-      nest(operator)
+  def type(token)
+    case token.token
+    when :digit;     node << CharacterType::Digit.new(token, active_opts)
+    when :hex;       node << CharacterType::Hex.new(token, active_opts)
+    when :linebreak; node << CharacterType::Linebreak.new(token, active_opts)
+    when :nondigit;  node << CharacterType::NonDigit.new(token, active_opts)
+    when :nonhex;    node << CharacterType::NonHex.new(token, active_opts)
+    when :nonspace;  node << CharacterType::NonSpace.new(token, active_opts)
+    when :nonword;   node << CharacterType::NonWord.new(token, active_opts)
+    when :space;     node << CharacterType::Space.new(token, active_opts)
+    when :word;      node << CharacterType::Word.new(token, active_opts)
+    when :xgrapheme; node << CharacterType::ExtendedGrapheme.new(token, active_opts)
+    else
+      raise UnknownTokenError.new('CharacterType', token)
     end
-    node.add_sequence(active_opts)
-  end
-  def active_opts
-    options_stack.last
-  end
-  def total_captured_group_count
-    captured_group_counts.values.reduce(0, :+)
-  end
-  def captured_group_count_at_level
-    captured_group_counts[node.level]
   end
-  def count_captured_group
-    captured_group_counts[node.level] += 1
+  def close_completed_character_set_range
+    decrease_nesting if node.instance_of?(CharacterSet::Range) && node.complete?
   end
-  def assign_effective_number(exp)
-    exp.effective_number =
-      exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
+  def active_opts
+    options_stack.last
   end
+  # Assigns referenced expressions to referring expressions, e.g. if there is
+  # an instance of Backreference::Number, its #referenced_expression is set to
+  # the instance of Group::Capture that it refers to via its number.
   def assign_referenced_expressions
-    targets = {}
+    # find all referenceable and referring expressions
+    targets = { 0 => root }
+    referrers = []
     root.each_expression do |exp|
       exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
+      referrers << exp if exp.referential?
     end
-    root.each_expression do |exp|
-      exp.respond_to?(:reference) &&
-        exp.referenced_expression = targets[exp.reference]
+    # assign referenced expressions to referring expressions
+    # (in a second iteration because there might be forward references)
+    referrers.each do |exp|
+      exp.referenced_expression = targets[exp.reference] ||
+        raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}")
     end
   end
 end # module Regexp::Parser