RubyGems - regexp_parser - Versions diffs - 1.7.1 → 2.2.1 - Mend

regexp_parser 1.7.1 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (139)

checksums.yaml +4 -4
data/CHANGELOG.md +157 -1
data/Gemfile +6 -1
data/LICENSE +1 -1
data/README.md +38 -32
data/Rakefile +18 -27
data/lib/regexp_parser/error.rb +4 -0
data/lib/regexp_parser/expression/base.rb +123 -0
data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +5 -0
data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -1
data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +0 -0
data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +13 -7
data/lib/regexp_parser/expression/classes/free_space.rb +2 -4
data/lib/regexp_parser/expression/classes/group.rb +28 -3
data/lib/regexp_parser/expression/classes/literal.rb +1 -5
data/lib/regexp_parser/expression/classes/property.rb +1 -3
data/lib/regexp_parser/expression/classes/root.rb +4 -17
data/lib/regexp_parser/expression/classes/type.rb +0 -2
data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
data/lib/regexp_parser/expression/quantifier.rb +11 -2
data/lib/regexp_parser/expression/sequence.rb +3 -20
data/lib/regexp_parser/expression/subexpression.rb +1 -2
data/lib/regexp_parser/expression.rb +7 -139
data/lib/regexp_parser/lexer.rb +13 -11
data/lib/regexp_parser/parser.rb +325 -344
data/lib/regexp_parser/scanner/char_type.rl +11 -11
data/lib/regexp_parser/scanner/properties/long.csv +604 -0
data/lib/regexp_parser/scanner/properties/short.csv +242 -0
data/lib/regexp_parser/scanner/property.rl +2 -2
data/lib/regexp_parser/scanner/scanner.rl +235 -255
data/lib/regexp_parser/scanner.rb +1324 -1387
data/lib/regexp_parser/syntax/any.rb +4 -6
data/lib/regexp_parser/syntax/base.rb +13 -15
data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
data/lib/regexp_parser/syntax/token/backreference.rb +30 -0
data/lib/regexp_parser/syntax/{tokens → token}/character_set.rb +2 -2
data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
data/lib/regexp_parser/syntax/token/escape.rb +31 -0
data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
data/lib/regexp_parser/syntax/{tokens → token}/meta.rb +2 -2
data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
data/lib/regexp_parser/syntax/token/unicode_property.rb +696 -0
data/lib/regexp_parser/syntax/token.rb +45 -0
data/lib/regexp_parser/syntax/version_lookup.rb +4 -4
data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -2
data/lib/regexp_parser/syntax/versions/1.9.1.rb +1 -1
data/lib/regexp_parser/syntax/versions/3.1.0.rb +10 -0
data/lib/regexp_parser/syntax.rb +8 -6
data/lib/regexp_parser/token.rb +9 -20
data/lib/regexp_parser/version.rb +1 -1
data/lib/regexp_parser.rb +0 -2
data/regexp_parser.gemspec +20 -22
metadata +34 -165
data/lib/regexp_parser/scanner/properties/long.yml +0 -594
data/lib/regexp_parser/scanner/properties/short.yml +0 -237
data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
data/lib/regexp_parser/syntax/tokens.rb +0 -45
data/spec/expression/base_spec.rb +0 -94
data/spec/expression/clone_spec.rb +0 -120
data/spec/expression/conditional_spec.rb +0 -89
data/spec/expression/free_space_spec.rb +0 -27
data/spec/expression/methods/match_length_spec.rb +0 -161
data/spec/expression/methods/match_spec.rb +0 -25
data/spec/expression/methods/strfregexp_spec.rb +0 -224
data/spec/expression/methods/tests_spec.rb +0 -99
data/spec/expression/methods/traverse_spec.rb +0 -161
data/spec/expression/options_spec.rb +0 -128
data/spec/expression/root_spec.rb +0 -9
data/spec/expression/sequence_spec.rb +0 -9
data/spec/expression/subexpression_spec.rb +0 -50
data/spec/expression/to_h_spec.rb +0 -26
data/spec/expression/to_s_spec.rb +0 -100
data/spec/lexer/all_spec.rb +0 -22
data/spec/lexer/conditionals_spec.rb +0 -53
data/spec/lexer/delimiters_spec.rb +0 -68
data/spec/lexer/escapes_spec.rb +0 -14
data/spec/lexer/keep_spec.rb +0 -10
data/spec/lexer/literals_spec.rb +0 -89
data/spec/lexer/nesting_spec.rb +0 -99
data/spec/lexer/refcalls_spec.rb +0 -55
data/spec/parser/all_spec.rb +0 -43
data/spec/parser/alternation_spec.rb +0 -88
data/spec/parser/anchors_spec.rb +0 -17
data/spec/parser/conditionals_spec.rb +0 -179
data/spec/parser/errors_spec.rb +0 -30
data/spec/parser/escapes_spec.rb +0 -121
data/spec/parser/free_space_spec.rb +0 -130
data/spec/parser/groups_spec.rb +0 -108
data/spec/parser/keep_spec.rb +0 -6
data/spec/parser/posix_classes_spec.rb +0 -8
data/spec/parser/properties_spec.rb +0 -115
data/spec/parser/quantifiers_spec.rb +0 -52
data/spec/parser/refcalls_spec.rb +0 -112
data/spec/parser/set/intersections_spec.rb +0 -127
data/spec/parser/set/ranges_spec.rb +0 -111
data/spec/parser/sets_spec.rb +0 -178
data/spec/parser/types_spec.rb +0 -18
data/spec/scanner/all_spec.rb +0 -18
data/spec/scanner/anchors_spec.rb +0 -21
data/spec/scanner/conditionals_spec.rb +0 -128
data/spec/scanner/delimiters_spec.rb +0 -52
data/spec/scanner/errors_spec.rb +0 -67
data/spec/scanner/escapes_spec.rb +0 -53
data/spec/scanner/free_space_spec.rb +0 -133
data/spec/scanner/groups_spec.rb +0 -52
data/spec/scanner/keep_spec.rb +0 -10
data/spec/scanner/literals_spec.rb +0 -49
data/spec/scanner/meta_spec.rb +0 -18
data/spec/scanner/properties_spec.rb +0 -64
data/spec/scanner/quantifiers_spec.rb +0 -20
data/spec/scanner/refcalls_spec.rb +0 -36
data/spec/scanner/sets_spec.rb +0 -102
data/spec/scanner/types_spec.rb +0 -14
data/spec/spec_helper.rb +0 -15
data/spec/support/runner.rb +0 -42
data/spec/support/shared_examples.rb +0 -77
data/spec/support/warning_extractor.rb +0 -60
data/spec/syntax/syntax_spec.rb +0 -48
data/spec/syntax/syntax_token_map_spec.rb +0 -23
data/spec/syntax/versions/1.8.6_spec.rb +0 -17
data/spec/syntax/versions/1.9.1_spec.rb +0 -10
data/spec/syntax/versions/1.9.3_spec.rb +0 -9
data/spec/syntax/versions/2.0.0_spec.rb +0 -13
data/spec/syntax/versions/2.2.0_spec.rb +0 -9
data/spec/syntax/versions/aliases_spec.rb +0 -37
data/spec/token/token_spec.rb +0 -85

data/lib/regexp_parser/parser.rb CHANGED

@@ -1,10 +1,10 @@
+require 'regexp_parser/error'
 require 'regexp_parser/expression'
 class Regexp::Parser
   include Regexp::Expression
-  include Regexp::Syntax
-  class ParserError < StandardError; end
+  class ParserError < Regexp::Parser::Error; end
   class UnknownTokenTypeError < ParserError
     def initialize(type, token)
@@ -18,12 +18,12 @@ class Regexp::Parser
     end
   end
-  def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
-    new.parse(input, syntax, &block)
+  def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
+    new.parse(input, syntax, options: options, &block)
   end
-  def parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
-    root = Root.build(options_from_input(input))
+  def parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
+    root = Root.build(extract_options(input, options))
     self.root = root
     self.node = root
@@ -35,7 +35,7 @@ class Regexp::Parser
     self.captured_group_counts = Hash.new(0)
-    Regexp::Lexer.scan(input, syntax) do |token|
+    Regexp::Lexer.scan(input, syntax, options: options) do |token|
       parse_token(token)
     end
@@ -54,105 +54,171 @@ class Regexp::Parser
                 :options_stack, :switching_options, :conditional_nesting,
                 :captured_group_counts
-  def options_from_input(input)
-    return {} unless input.is_a?(::Regexp)
+  def extract_options(input, options)
+    if options && !input.is_a?(String)
+      raise ArgumentError, 'options cannot be supplied unless parsing a String'
+    end
-    options = {}
-    options[:i] = true if input.options & ::Regexp::IGNORECASE != 0
-    options[:m] = true if input.options & ::Regexp::MULTILINE  != 0
-    options[:x] = true if input.options & ::Regexp::EXTENDED   != 0
-    options
-  end
+    options = input.options if input.is_a?(::Regexp)
-  def nest(exp)
-    nesting.push(exp)
-    node << exp
-    update_transplanted_subtree(exp, node)
-    self.node = exp
-  end
-  # subtrees are transplanted to build Alternations, Intersections, Ranges
-  def update_transplanted_subtree(exp, new_parent)
-    exp.nesting_level = new_parent.nesting_level + 1
-    exp.respond_to?(:each) &&
-      exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
-  end
-  def decrease_nesting
-    while nesting.last.is_a?(SequenceOperation)
-      nesting.pop
-      self.node = nesting.last
-    end
-    nesting.pop
-    yield(node) if block_given?
-    self.node = nesting.last
-    self.node = node.last if node.last.is_a?(SequenceOperation)
-  end
+    return {} unless options
-  def nest_conditional(exp)
-    conditional_nesting.push(exp)
-    nest(exp)
+    enabled_options = {}
+    enabled_options[:i] = true if options & ::Regexp::IGNORECASE != 0
+    enabled_options[:m] = true if options & ::Regexp::MULTILINE  != 0
+    enabled_options[:x] = true if options & ::Regexp::EXTENDED   != 0
+    enabled_options
   end
   def parse_token(token)
-    close_completed_character_set_range
     case token.type
-    when :meta;         meta(token)
-    when :quantifier;   quantifier(token)
-    when :anchor;       anchor(token)
-    when :escape;       escape(token)
-    when :group;        group(token)
-    when :assertion;    group(token)
-    when :set;          set(token)
-    when :type;         type(token)
-    when :backref;      backref(token)
-    when :conditional;  conditional(token)
-    when :keep;         keep(token)
-    when :posixclass, :nonposixclass
-      posixclass(token)
-    when :property, :nonproperty
-      property(token)
-    when :literal
-      node << Literal.new(token, active_opts)
-    when :free_space
-      free_space(token)
+    when :anchor;                     anchor(token)
+    when :assertion, :group;          group(token)
+    when :backref;                    backref(token)
+    when :conditional;                conditional(token)
+    when :escape;                     escape(token)
+    when :free_space;                 free_space(token)
+    when :keep;                       keep(token)
+    when :literal;                    literal(token)
+    when :meta;                       meta(token)
+    when :posixclass, :nonposixclass; posixclass(token)
+    when :property, :nonproperty;     property(token)
+    when :quantifier;                 quantifier(token)
+    when :set;                        set(token)
+    when :type;                       type(token)
     else
       raise UnknownTokenTypeError.new(token.type, token)
     end
+    close_completed_character_set_range
   end
-  def set(token)
+  def anchor(token)
     case token.token
-    when :open
-      open_set(token)
-    when :close
-      close_set
-    when :negate
-      negate_set
-    when :range
-      range(token)
-    when :intersection
-      intersection(token)
-    when :collation, :equivalent
-      node << Literal.new(token, active_opts)
+    when :bol;              node << Anchor::BeginningOfLine.new(token, active_opts)
+    when :bos;              node << Anchor::BOS.new(token, active_opts)
+    when :eol;              node << Anchor::EndOfLine.new(token, active_opts)
+    when :eos;              node << Anchor::EOS.new(token, active_opts)
+    when :eos_ob_eol;       node << Anchor::EOSobEOL.new(token, active_opts)
+    when :match_start;      node << Anchor::MatchStart.new(token, active_opts)
+    when :nonword_boundary; node << Anchor::NonWordBoundary.new(token, active_opts)
+    when :word_boundary;    node << Anchor::WordBoundary.new(token, active_opts)
     else
-      raise UnknownTokenError.new('CharacterSet', token)
+      raise UnknownTokenError.new('Anchor', token)
     end
   end
-  def meta(token)
+  def group(token)
     case token.token
-    when :dot
-      node << CharacterType::Any.new(token, active_opts)
-    when :alternation
-      sequence_operation(Alternation, token)
+    when :options, :options_switch
+      options_group(token)
+    when :close
+      close_group
+    when :comment
+      node << Group::Comment.new(token, active_opts)
     else
-      raise UnknownTokenError.new('Meta', token)
+      open_group(token)
+    end
+  end
+  MOD_FLAGS = %w[i m x].map(&:to_sym)
+  ENC_FLAGS = %w[a d u].map(&:to_sym)
+  def options_group(token)
+    positive, negative = token.text.split('-', 2)
+    negative ||= ''
+    self.switching_options = token.token.equal?(:options_switch)
+    opt_changes = {}
+    new_active_opts = active_opts.dup
+    MOD_FLAGS.each do |flag|
+      if positive.include?(flag.to_s)
+        opt_changes[flag] = new_active_opts[flag] = true
+      end
+      if negative.include?(flag.to_s)
+        opt_changes[flag] = false
+        new_active_opts.delete(flag)
+      end
+    end
+    if (enc_flag = positive.reverse[/[adu]/])
+      enc_flag = enc_flag.to_sym
+      (ENC_FLAGS - [enc_flag]).each do |other|
+        opt_changes[other] = false if new_active_opts[other]
+        new_active_opts.delete(other)
+      end
+      opt_changes[enc_flag] = new_active_opts[enc_flag] = true
+    end
+    options_stack << new_active_opts
+    options_group = Group::Options.new(token, active_opts)
+    options_group.option_changes = opt_changes
+    nest(options_group)
+  end
+  def open_group(token)
+    group_class =
+      case token.token
+      when :absence;     Group::Absence
+      when :atomic;      Group::Atomic
+      when :capture;     Group::Capture
+      when :named;       Group::Named
+      when :passive;     Group::Passive
+      when :lookahead;   Assertion::Lookahead
+      when :lookbehind;  Assertion::Lookbehind
+      when :nlookahead;  Assertion::NegativeLookahead
+      when :nlookbehind; Assertion::NegativeLookbehind
+      else
+        raise UnknownTokenError.new('Group type open', token)
+      end
+    group = group_class.new(token, active_opts)
+    if group.capturing?
+      group.number          = total_captured_group_count + 1
+      group.number_at_level = captured_group_count_at_level + 1
+      count_captured_group
+    end
+    # Push the active options to the stack again. This way we can simply pop the
+    # stack for any group we close, no matter if it had its own options or not.
+    options_stack << active_opts
+    nest(group)
+  end
+  def total_captured_group_count
+    captured_group_counts.values.reduce(0, :+)
+  end
+  def captured_group_count_at_level
+    captured_group_counts[node.level]
+  end
+  def count_captured_group
+    captured_group_counts[node.level] += 1
+  end
+  def close_group
+    options_stack.pop unless switching_options
+    self.switching_options = false
+    decrease_nesting
+  end
+  def decrease_nesting
+    while nesting.last.is_a?(SequenceOperation)
+      nesting.pop
+      self.node = nesting.last
     end
+    nesting.pop
+    yield(node) if block_given?
+    self.node = nesting.last
+    self.node = node.last if node.last.is_a?(SequenceOperation)
   end
   def backref(token)
@@ -182,31 +248,9 @@ class Regexp::Parser
     end
   end
-  def type(token)
-    case token.token
-    when :digit
-      node << CharacterType::Digit.new(token, active_opts)
-    when :nondigit
-      node << CharacterType::NonDigit.new(token, active_opts)
-    when :hex
-      node << CharacterType::Hex.new(token, active_opts)
-    when :nonhex
-      node << CharacterType::NonHex.new(token, active_opts)
-    when :space
-      node << CharacterType::Space.new(token, active_opts)
-    when :nonspace
-      node << CharacterType::NonSpace.new(token, active_opts)
-    when :word
-      node << CharacterType::Word.new(token, active_opts)
-    when :nonword
-      node << CharacterType::NonWord.new(token, active_opts)
-    when :linebreak
-      node << CharacterType::Linebreak.new(token, active_opts)
-    when :xgrapheme
-      node << CharacterType::ExtendedGrapheme.new(token, active_opts)
-    else
-      raise UnknownTokenError.new('CharacterType', token)
-    end
+  def assign_effective_number(exp)
+    exp.effective_number =
+      exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
   end
   def conditional(token)
@@ -234,11 +278,118 @@ class Regexp::Parser
     end
   end
+  def nest_conditional(exp)
+    conditional_nesting.push(exp)
+    nest(exp)
+  end
+  def nest(exp)
+    nesting.push(exp)
+    node << exp
+    update_transplanted_subtree(exp, node)
+    self.node = exp
+  end
+  # subtrees are transplanted to build Alternations, Intersections, Ranges
+  def update_transplanted_subtree(exp, new_parent)
+    exp.nesting_level = new_parent.nesting_level + 1
+    exp.respond_to?(:each) &&
+      exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
+  end
+  def escape(token)
+    case token.token
+    when :backspace;      node << EscapeSequence::Backspace.new(token, active_opts)
+    when :escape;         node << EscapeSequence::AsciiEscape.new(token, active_opts)
+    when :bell;           node << EscapeSequence::Bell.new(token, active_opts)
+    when :form_feed;      node << EscapeSequence::FormFeed.new(token, active_opts)
+    when :newline;        node << EscapeSequence::Newline.new(token, active_opts)
+    when :carriage;       node << EscapeSequence::Return.new(token, active_opts)
+    when :tab;            node << EscapeSequence::Tab.new(token, active_opts)
+    when :vertical_tab;   node << EscapeSequence::VerticalTab.new(token, active_opts)
+    when :codepoint;      node << EscapeSequence::Codepoint.new(token, active_opts)
+    when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
+    when :hex;            node << EscapeSequence::Hex.new(token, active_opts)
+    when :octal;          node << EscapeSequence::Octal.new(token, active_opts)
+    when :control
+      if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
+        node << EscapeSequence::MetaControl.new(token, active_opts)
+      else
+        node << EscapeSequence::Control.new(token, active_opts)
+      end
+    when :meta_sequence
+      if token.text =~ /\A\\M-\\[Cc]/
+        node << EscapeSequence::MetaControl.new(token, active_opts)
+      else
+        node << EscapeSequence::Meta.new(token, active_opts)
+      end
+    else
+      # treating everything else as a literal
+      # TODO: maybe split this up a bit more in v3.0.0?
+      # E.g. escaped quantifiers or set meta chars are not the same
+      # as stuff that would be a literal even without the backslash.
+      # Right now, they all end up here.
+      node << EscapeSequence::Literal.new(token, active_opts)
+    end
+  end
+  def free_space(token)
+    case token.token
+    when :comment
+      node << Comment.new(token, active_opts)
+    when :whitespace
+      if node.last.is_a?(WhiteSpace)
+        node.last.merge(WhiteSpace.new(token, active_opts))
+      else
+        node << WhiteSpace.new(token, active_opts)
+      end
+    else
+      raise UnknownTokenError.new('FreeSpace', token)
+    end
+  end
+  def keep(token)
+    node << Keep::Mark.new(token, active_opts)
+  end
+  def literal(token)
+    node << Literal.new(token, active_opts)
+  end
+  def meta(token)
+    case token.token
+    when :dot
+      node << CharacterType::Any.new(token, active_opts)
+    when :alternation
+      sequence_operation(Alternation, token)
+    else
+      raise UnknownTokenError.new('Meta', token)
+    end
+  end
+  def sequence_operation(klass, token)
+    unless node.is_a?(klass)
+      operator = klass.new(token, active_opts)
+      sequence = operator.add_sequence(active_opts)
+      sequence.expressions = node.expressions
+      node.expressions = []
+      nest(operator)
+    end
+    node.add_sequence(active_opts)
+  end
   def posixclass(token)
     node << PosixClass.new(token, active_opts)
   end
   include Regexp::Expression::UnicodeProperty
+  UPTokens = Regexp::Syntax::Token::UnicodeProperty
   def property(token)
     case token.token
@@ -310,128 +461,43 @@ class Regexp::Parser
     when :private_use;            node << Codepoint::PrivateUse.new(token, active_opts)
     when :unassigned;             node << Codepoint::Unassigned.new(token, active_opts)
-    when *Token::UnicodeProperty::Age
-      node << Age.new(token, active_opts)
-    when *Token::UnicodeProperty::Derived
-      node << Derived.new(token, active_opts)
-    when *Token::UnicodeProperty::Emoji
-      node << Emoji.new(token, active_opts)
-    when *Token::UnicodeProperty::Script
-      node << Script.new(token, active_opts)
-    when *Token::UnicodeProperty::UnicodeBlock
-      node << Block.new(token, active_opts)
+    when *UPTokens::Age;          node << Age.new(token, active_opts)
+    when *UPTokens::Derived;      node << Derived.new(token, active_opts)
+    when *UPTokens::Emoji;        node << Emoji.new(token, active_opts)
+    when *UPTokens::Script;       node << Script.new(token, active_opts)
+    when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts)
     else
       raise UnknownTokenError.new('UnicodeProperty', token)
     end
   end
-  def anchor(token)
-    case token.token
-    when :bol
-      node << Anchor::BeginningOfLine.new(token, active_opts)
-    when :eol
-      node << Anchor::EndOfLine.new(token, active_opts)
-    when :bos
-      node << Anchor::BOS.new(token, active_opts)
-    when :eos
-      node << Anchor::EOS.new(token, active_opts)
-    when :eos_ob_eol
-      node << Anchor::EOSobEOL.new(token, active_opts)
-    when :word_boundary
-      node << Anchor::WordBoundary.new(token, active_opts)
-    when :nonword_boundary
-      node << Anchor::NonWordBoundary.new(token, active_opts)
-    when :match_start
-      node << Anchor::MatchStart.new(token, active_opts)
-    else
-      raise UnknownTokenError.new('Anchor', token)
-    end
-  end
-  def escape(token)
-    case token.token
-    when :backspace
-      node << EscapeSequence::Backspace.new(token, active_opts)
-    when :escape
-      node << EscapeSequence::AsciiEscape.new(token, active_opts)
-    when :bell
-      node << EscapeSequence::Bell.new(token, active_opts)
-    when :form_feed
-      node << EscapeSequence::FormFeed.new(token, active_opts)
-    when :newline
-      node << EscapeSequence::Newline.new(token, active_opts)
-    when :carriage
-      node << EscapeSequence::Return.new(token, active_opts)
-    when :tab
-      node << EscapeSequence::Tab.new(token, active_opts)
-    when :vertical_tab
-      node << EscapeSequence::VerticalTab.new(token, active_opts)
-    when :hex
-      node << EscapeSequence::Hex.new(token, active_opts)
-    when :octal
-      node << EscapeSequence::Octal.new(token, active_opts)
-    when :codepoint
-      node << EscapeSequence::Codepoint.new(token, active_opts)
-    when :codepoint_list
-      node << EscapeSequence::CodepointList.new(token, active_opts)
-    when :control
-      if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
-        node << EscapeSequence::MetaControl.new(token, active_opts)
-      else
-        node << EscapeSequence::Control.new(token, active_opts)
-      end
-    when :meta_sequence
-      if token.text =~ /\A\\M-\\[Cc]/
-        node << EscapeSequence::MetaControl.new(token, active_opts)
-      else
-        node << EscapeSequence::Meta.new(token, active_opts)
-      end
-    else
-      # treating everything else as a literal
-      node << EscapeSequence::Literal.new(token, active_opts)
-    end
-  end
-  def keep(token)
-    node << Keep::Mark.new(token, active_opts)
-  end
-  def free_space(token)
-    case token.token
-    when :comment
-      node << Comment.new(token, active_opts)
-    when :whitespace
-      if node.last.is_a?(WhiteSpace)
-        node.last.merge(WhiteSpace.new(token, active_opts))
-      else
-        node << WhiteSpace.new(token, active_opts)
-      end
-    else
-      raise UnknownTokenError.new('FreeSpace', token)
-    end
-  end
   def quantifier(token)
-    offset = -1
-    target_node = node.expressions[offset]
-    while target_node.is_a?(FreeSpace)
-      target_node = node.expressions[offset -= 1]
+    target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
+    target_node or raise ParserError, "No valid target found for '#{token.text}'"
+    # in case of chained quantifiers, wrap target in an implicit passive group
+    # description of the problem: https://github.com/ammar/regexp_parser/issues/3
+    # rationale for this solution: https://github.com/ammar/regexp_parser/pull/69
+    if target_node.quantified?
+      new_token = Regexp::Token.new(
+        :group,
+        :passive,
+        '', # text
+        target_node.ts,
+        nil, # te (unused)
+        target_node.level,
+        target_node.set_level,
+        target_node.conditional_level
+      )
+      new_group = Group::Passive.new(new_token, active_opts)
+      new_group.implicit = true
+      new_group << target_node
+      increase_level(target_node)
+      node.expressions[node.expressions.index(target_node)] = new_group
+      target_node = new_group
     end
-    target_node || raise(ArgumentError, 'No valid target found for '\
-                                        "'#{token.text}' ")
     case token.token
     when :zero_or_one
       target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
@@ -462,6 +528,11 @@ class Regexp::Parser
     end
   end
+  def increase_level(exp)
+    exp.level += 1
+    exp.respond_to?(:each) && exp.each { |subexp| increase_level(subexp) }
+  end
   def interval(target_node, token)
     text = token.text
     mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
@@ -484,100 +555,16 @@ class Regexp::Parser
     target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
   end
-  def group(token)
-    case token.token
-    when :options, :options_switch
-      options_group(token)
-    when :close
-      close_group
-    when :comment
-      node << Group::Comment.new(token, active_opts)
-    else
-      open_group(token)
-    end
-  end
-  MOD_FLAGS = %w[i m x].map(&:to_sym)
-  ENC_FLAGS = %w[a d u].map(&:to_sym)
-  def options_group(token)
-    positive, negative = token.text.split('-', 2)
-    negative ||= ''
-    self.switching_options = token.token.equal?(:options_switch)
-    opt_changes = {}
-    new_active_opts = active_opts.dup
-    MOD_FLAGS.each do |flag|
-      if positive.include?(flag.to_s)
-        opt_changes[flag] = new_active_opts[flag] = true
-      end
-      if negative.include?(flag.to_s)
-        opt_changes[flag] = false
-        new_active_opts.delete(flag)
-      end
-    end
-    if (enc_flag = positive.reverse[/[adu]/])
-      enc_flag = enc_flag.to_sym
-      (ENC_FLAGS - [enc_flag]).each do |other|
-        opt_changes[other] = false if new_active_opts[other]
-        new_active_opts.delete(other)
-      end
-      opt_changes[enc_flag] = new_active_opts[enc_flag] = true
-    end
-    options_stack << new_active_opts
-    options_group = Group::Options.new(token, active_opts)
-    options_group.option_changes = opt_changes
-    nest(options_group)
-  end
-  def open_group(token)
+  def set(token)
     case token.token
-    when :passive
-      exp = Group::Passive.new(token, active_opts)
-    when :atomic
-      exp = Group::Atomic.new(token, active_opts)
-    when :named
-      exp = Group::Named.new(token, active_opts)
-    when :capture
-      exp = Group::Capture.new(token, active_opts)
-    when :absence
-      exp = Group::Absence.new(token, active_opts)
-    when :lookahead
-      exp = Assertion::Lookahead.new(token, active_opts)
-    when :nlookahead
-      exp = Assertion::NegativeLookahead.new(token, active_opts)
-    when :lookbehind
-      exp = Assertion::Lookbehind.new(token, active_opts)
-    when :nlookbehind
-      exp = Assertion::NegativeLookbehind.new(token, active_opts)
+    when :open;         open_set(token)
+    when :close;        close_set
+    when :negate;       negate_set
+    when :range;        range(token)
+    when :intersection; intersection(token)
     else
-      raise UnknownTokenError.new('Group type open', token)
-    end
-    if exp.capturing?
-      exp.number          = total_captured_group_count + 1
-      exp.number_at_level = captured_group_count_at_level + 1
-      count_captured_group
+      raise UnknownTokenError.new('CharacterSet', token)
     end
-    # Push the active options to the stack again. This way we can simply pop the
-    # stack for any group we close, no matter if it had its own options or not.
-    options_stack << active_opts
-    nest(exp)
-  end
-  def close_group
-    options_stack.pop unless switching_options
-    self.switching_options = false
-    decrease_nesting
   end
   def open_set(token)
@@ -600,51 +587,45 @@ class Regexp::Parser
     nest(exp)
   end
-  def close_completed_character_set_range
-    decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
-  end
   def intersection(token)
     sequence_operation(CharacterSet::Intersection, token)
   end
-  def sequence_operation(klass, token)
-    unless node.is_a?(klass)
-      operator = klass.new(token, active_opts)
-      sequence = operator.add_sequence(active_opts)
-      sequence.expressions = node.expressions
-      node.expressions = []
-      nest(operator)
+  def type(token)
+    case token.token
+    when :digit;     node << CharacterType::Digit.new(token, active_opts)
+    when :hex;       node << CharacterType::Hex.new(token, active_opts)
+    when :linebreak; node << CharacterType::Linebreak.new(token, active_opts)
+    when :nondigit;  node << CharacterType::NonDigit.new(token, active_opts)
+    when :nonhex;    node << CharacterType::NonHex.new(token, active_opts)
+    when :nonspace;  node << CharacterType::NonSpace.new(token, active_opts)
+    when :nonword;   node << CharacterType::NonWord.new(token, active_opts)
+    when :space;     node << CharacterType::Space.new(token, active_opts)
+    when :word;      node << CharacterType::Word.new(token, active_opts)
+    when :xgrapheme; node << CharacterType::ExtendedGrapheme.new(token, active_opts)
+    else
+      raise UnknownTokenError.new('CharacterType', token)
     end
-    node.add_sequence(active_opts)
-  end
-  def active_opts
-    options_stack.last
-  end
-  def total_captured_group_count
-    captured_group_counts.values.reduce(0, :+)
-  end
-  def captured_group_count_at_level
-    captured_group_counts[node.level]
   end
-  def count_captured_group
-    captured_group_counts[node.level] += 1
+  def close_completed_character_set_range
+    decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
   end
-  def assign_effective_number(exp)
-    exp.effective_number =
-      exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
+  def active_opts
+    options_stack.last
   end
+  # Assigns referenced expressions to refering expressions, e.g. if there is
+  # an instance of Backreference::Number, its #referenced_expression is set to
+  # the instance of Group::Capture that it refers to via its number.
   def assign_referenced_expressions
     targets = {}
+    # find all referencable expressions
     root.each_expression do |exp|
       exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
     end
+    # assign them to any refering expressions
     root.each_expression do |exp|
       exp.respond_to?(:reference) &&
         exp.referenced_expression = targets[exp.reference]