regexp_parser 0.1.6 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ChangeLog +57 -0
- data/Gemfile +8 -0
- data/LICENSE +1 -1
- data/README.md +225 -206
- data/Rakefile +9 -3
- data/lib/regexp_parser.rb +7 -11
- data/lib/regexp_parser/expression.rb +72 -14
- data/lib/regexp_parser/expression/classes/alternation.rb +3 -16
- data/lib/regexp_parser/expression/classes/conditional.rb +57 -0
- data/lib/regexp_parser/expression/classes/free_space.rb +17 -0
- data/lib/regexp_parser/expression/classes/keep.rb +7 -0
- data/lib/regexp_parser/expression/classes/set.rb +28 -7
- data/lib/regexp_parser/expression/methods/strfregexp.rb +113 -0
- data/lib/regexp_parser/expression/methods/tests.rb +116 -0
- data/lib/regexp_parser/expression/methods/traverse.rb +63 -0
- data/lib/regexp_parser/expression/quantifier.rb +10 -0
- data/lib/regexp_parser/expression/sequence.rb +45 -0
- data/lib/regexp_parser/expression/subexpression.rb +29 -1
- data/lib/regexp_parser/lexer.rb +31 -8
- data/lib/regexp_parser/parser.rb +118 -45
- data/lib/regexp_parser/scanner.rb +1745 -1404
- data/lib/regexp_parser/scanner/property.rl +57 -3
- data/lib/regexp_parser/scanner/scanner.rl +161 -34
- data/lib/regexp_parser/syntax.rb +12 -2
- data/lib/regexp_parser/syntax/ruby/1.9.1.rb +3 -3
- data/lib/regexp_parser/syntax/ruby/1.9.3.rb +2 -7
- data/lib/regexp_parser/syntax/ruby/2.0.0.rb +4 -1
- data/lib/regexp_parser/syntax/ruby/2.1.4.rb +13 -0
- data/lib/regexp_parser/syntax/ruby/2.1.5.rb +13 -0
- data/lib/regexp_parser/syntax/ruby/2.1.rb +2 -2
- data/lib/regexp_parser/syntax/ruby/2.2.0.rb +16 -0
- data/lib/regexp_parser/syntax/ruby/2.2.rb +8 -0
- data/lib/regexp_parser/syntax/tokens.rb +19 -2
- data/lib/regexp_parser/syntax/tokens/conditional.rb +22 -0
- data/lib/regexp_parser/syntax/tokens/keep.rb +14 -0
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +45 -4
- data/lib/regexp_parser/token.rb +23 -8
- data/lib/regexp_parser/version.rb +5 -0
- data/regexp_parser.gemspec +35 -0
- data/test/expression/test_all.rb +6 -1
- data/test/expression/test_base.rb +19 -0
- data/test/expression/test_conditionals.rb +114 -0
- data/test/expression/test_free_space.rb +33 -0
- data/test/expression/test_set.rb +61 -0
- data/test/expression/test_strfregexp.rb +214 -0
- data/test/expression/test_subexpression.rb +24 -0
- data/test/expression/test_tests.rb +99 -0
- data/test/expression/test_to_h.rb +48 -0
- data/test/expression/test_to_s.rb +46 -0
- data/test/expression/test_traverse.rb +164 -0
- data/test/lexer/test_all.rb +16 -3
- data/test/lexer/test_conditionals.rb +101 -0
- data/test/lexer/test_keep.rb +24 -0
- data/test/lexer/test_literals.rb +51 -51
- data/test/lexer/test_nesting.rb +62 -62
- data/test/lexer/test_refcalls.rb +18 -20
- data/test/parser/test_all.rb +18 -3
- data/test/parser/test_alternation.rb +11 -14
- data/test/parser/test_conditionals.rb +148 -0
- data/test/parser/test_escapes.rb +29 -5
- data/test/parser/test_free_space.rb +139 -0
- data/test/parser/test_groups.rb +40 -0
- data/test/parser/test_keep.rb +21 -0
- data/test/scanner/test_all.rb +8 -2
- data/test/scanner/test_conditionals.rb +166 -0
- data/test/scanner/test_escapes.rb +8 -5
- data/test/scanner/test_free_space.rb +133 -0
- data/test/scanner/test_groups.rb +28 -0
- data/test/scanner/test_keep.rb +33 -0
- data/test/scanner/test_properties.rb +4 -0
- data/test/scanner/test_scripts.rb +71 -1
- data/test/syntax/ruby/test_1.9.3.rb +2 -2
- data/test/syntax/ruby/test_2.0.0.rb +38 -0
- data/test/syntax/ruby/test_2.2.0.rb +38 -0
- data/test/syntax/ruby/test_all.rb +1 -8
- data/test/syntax/ruby/test_files.rb +104 -0
- data/test/test_all.rb +2 -1
- data/test/token/test_all.rb +2 -0
- data/test/token/test_token.rb +109 -0
- metadata +75 -21
- data/VERSION.yml +0 -5
- data/lib/regexp_parser/ctype.rb +0 -48
- data/test/syntax/ruby/test_2.x.rb +0 -46
| @@ -0,0 +1,116 @@ | |
| 1 | 
            +
            module Regexp::Expression
              class Base

                # Test if this expression has the given test_type, which can be either
                # a symbol or an array of symbols to check against the expression's type.
                # The wildcard :* (alone or inside the array) matches any type.
                #
                #   # is it a :group expression
                #   exp.type? :group
                #
                #   # is it a :set, :subset, or :meta
                #   exp.type? [:set, :subset, :meta]
                #
                # Returns true or false. Raises a RuntimeError when test_type is
                # neither an Array nor a Symbol.
                def type?(test_type)
                  case test_type
                  when Array
                    test_type.include?(:*) || test_type.include?(type)
                  when Symbol
                    test_type == :* || type == test_type
                  else
                    raise "Array or Symbol expected, #{test_type.class.name} given"
                  end
                end

                # Test if this expression has the given test_token, and optionally a given
                # test_type (anything accepted by type?).
                #
                #   # Any expressions
                #   exp.is? :*  # always returns true
                #
                #   # is it a :capture
                #   exp.is? :capture
                #
                #   # is it a :character and a :set
                #   exp.is? :character, :set
                #
                #   # is it a :meta :dot
                #   exp.is? :dot, :meta
                #
                #   # is it a :meta or :escape :dot
                #   exp.is? :dot, [:meta, :escape]
                #
                def is?(test_token, test_type = nil)
                  # Plain equality on purpose: case equality (===) would let a Class
                  # or Regexp argument (e.g. Symbol === :*) match every expression.
                  return true if test_token == :*

                  token == test_token && (test_type ? type?(test_type) : true)
                end

                # Test if this expression matches an entry in the given scope spec.
                #
                # A scope spec can be one of:
                #
                #   . An array: Interpreted as a set of tokens, tested for inclusion
                #               of the expression's token.
                #
                #   . A hash:   Where the key is interpreted as the expression type
                #               and the value is either a symbol or an array. In this
                #               case, when the scope is a hash, one_of? calls itself to
                #               evaluate the key's value. A :* key acts as the fallback
                #               entry for types that have no key of their own.
                #
                #   . A symbol: matches the expression's token or type, depending on
                #               the level of the call. If one_of? is called directly with
                #               a symbol then it will always be checked against the
                #               type of the expression. If it's being called for a value
                #               from a hash, it will be checked against the token of the
                #               expression.
                #
                #   # any expression
                #   exp.one_of?(:*) # always true
                #
                #   # like exp.type?(:group)
                #   exp.one_of?(:group)
                #
                #   # any expression of type meta
                #   exp.one_of?(:meta => :*)
                #
                #   # meta dots and alternations
                #   exp.one_of?(:meta => [:dot, :alternation])
                #
                #   # meta dots and any set tokens
                #   exp.one_of?({meta: [:dot], set: :*})
                #
                # Raises a RuntimeError when scope is not an Array, Hash, or Symbol.
                def one_of?(scope, top = true)
                  case scope
                  when Array
                    scope.include?(:*) || scope.include?(token)

                  when Hash
                    if scope.has_key?(:*)
                      # Prefer the entry for this expression's own type, falling
                      # back to the wildcard entry.
                      test_type = scope.has_key?(type) ? type : :*
                      one_of?(scope[test_type], false)
                    else
                      scope.has_key?(type) && one_of?(scope[type], false)
                    end

                  when Symbol
                    return true if scope == :*

                    # Direct calls compare against the type; calls made for a
                    # hash value compare against the token.
                    top ? type?(scope) : is?(scope)

                  else
                    raise "Array, Hash, or Symbol expected, #{scope.class.name} given"
                  end
                end

              end
            end
         | 
| @@ -0,0 +1,63 @@ | |
| 1 | 
            +
            module Regexp::Expression
              class Subexpression < Regexp::Expression::Base

                # Walks this subexpression depth-first, in pre-order, invoking the
                # given block with three arguments: the traversal event, the
                # expression, and the expression's index within its parent.
                #
                # Events:
                #
                # - :enter when entering a subexpression and :exit when leaving it.
                #
                # - :visit, once, for each terminal expression.
                #
                # When include_self is true, this expression itself is reported with
                # :enter / :exit events and an index of 0.
                #
                # Raises a RuntimeError if no block is given. Returns self.
                def traverse(include_self = false, &visitor)
                  raise 'traverse requires a block' unless visitor

                  visitor.call(:enter, self, 0) if include_self

                  each_with_index do |child, position|
                    next visitor.call(:visit, child, position) if child.terminal?

                    visitor.call(:enter, child, position)
                    child.traverse(&visitor)
                    visitor.call(:exit, child, position)
                  end

                  visitor.call(:exit, self, 0) if include_self

                  self
                end
                alias :walk :traverse

                # Yields every expression reached by traverse, together with its
                # index within its parent. Each expression is yielded once: :exit
                # events are skipped.
                def each_expression(include_self = false, &block)
                  traverse(include_self) do |event, exp, index|
                    next if event == :exit

                    yield(exp, index)
                  end
                end

                # Collects the block's result for every expression into a new array.
                # Without a block, each entry is an [expression, index] pair.
                def map(include_self = false, &block)
                  collected = []

                  each_expression(include_self) do |exp, index|
                    collected << (block_given? ? yield(exp, index) : [exp, index])
                  end

                  collected
                end

              end
            end
         | 
| @@ -0,0 +1,45 @@ | |
| 1 | 
            +
            module Regexp::Expression

              # A sequence of expressions. Unlike a plain Subexpression, a quantifier
              # given to a sequence is applied to its last element rather than to the
              # sequence as a whole.
              #
              # Used as the base class for the Alternation alternatives and Conditional
              # branches.
              class Sequence < Regexp::Expression::Subexpression
                # Builds the sequence around a synthetic :expression / :sequence token
                # that has empty text and no start/end offsets.
                def initialize(level, set_level, conditional_level)
                  sequence_token = Regexp::Token.new(
                    :expression,
                    :sequence,
                    '',
                    nil, # ts
                    nil, # te
                    level,
                    set_level,
                    conditional_level
                  )

                  super(sequence_token)
                end

                # The text of a sequence is its string representation.
                def text
                  to_s
                end

                # A sequence starts where its first expression starts.
                def starts_at
                  @expressions.first.starts_at
                end

                # Applies the quantifier to the last expression that is not a
                # FreeSpace. Raises ArgumentError when there is no such expression.
                def quantify(token, text, min = nil, max = nil, mode = :greedy)
                  target = expressions.reverse_each.find do |candidate|
                    !candidate.is_a?(FreeSpace)
                  end

                  unless target
                    raise ArgumentError, "No valid target found for '#{text}' quantifier"
                  end

                  target.quantify(token, text, min, max, mode)
                end
              end

            end
         | 
| @@ -17,7 +17,12 @@ module Regexp::Expression | |
| 17 17 | 
             
                end
         | 
| 18 18 |  | 
| 19 19 | 
             
                def <<(exp)
         | 
| 20 | 
            -
                  @expressions  | 
| 20 | 
            +
                  if exp.is_a?(WhiteSpace) and @expressions.last and
         | 
| 21 | 
            +
                    @expressions.last.is_a?(WhiteSpace)
         | 
| 22 | 
            +
                    @expressions.last.merge(exp)
         | 
| 23 | 
            +
                  else
         | 
| 24 | 
            +
                    @expressions << exp
         | 
| 25 | 
            +
                  end
         | 
| 21 26 | 
             
                end
         | 
| 22 27 |  | 
| 23 28 | 
             
                def insert(exp)
         | 
| @@ -48,6 +53,22 @@ module Regexp::Expression | |
| 48 53 | 
             
                  @expressions.length
         | 
| 49 54 | 
             
                end
         | 
| 50 55 |  | 
| 56 | 
            +
                def empty?
         | 
| 57 | 
            +
                  @expressions.empty?
         | 
| 58 | 
            +
                end
         | 
| 59 | 
            +
             | 
| 60 | 
            +
                # Returns true when the block returns a truthy value for every child
                # expression. The block is forwarded to Array#all?, so calling this
                # without a block tests the child expressions themselves for
                # truthiness instead of raising LocalJumpError.
                def all?(&block)
                  @expressions.all?(&block)
                end
         | 
| 63 | 
            +
             | 
| 64 | 
            +
                def ts
         | 
| 65 | 
            +
                  starts_at
         | 
| 66 | 
            +
                end
         | 
| 67 | 
            +
             | 
| 68 | 
            +
                def te
         | 
| 69 | 
            +
                  ts + to_s.length
         | 
| 70 | 
            +
                end
         | 
| 71 | 
            +
             | 
| 51 72 | 
             
                def to_s(format = :full)
         | 
| 52 73 | 
             
                  s = ''
         | 
| 53 74 |  | 
| @@ -64,6 +85,13 @@ module Regexp::Expression | |
| 64 85 |  | 
| 65 86 | 
             
                  s
         | 
| 66 87 | 
             
                end
         | 
| 88 | 
            +
             | 
| 89 | 
            +
                def to_h
         | 
| 90 | 
            +
                  h = super
         | 
| 91 | 
            +
                  h[:text] = to_s(:base)
         | 
| 92 | 
            +
                  h[:expressions] = @expressions.map(&:to_h)
         | 
| 93 | 
            +
                  h
         | 
| 94 | 
            +
                end
         | 
| 67 95 | 
             
              end
         | 
| 68 96 |  | 
| 69 97 | 
             
            end
         | 
    
        data/lib/regexp_parser/lexer.rb
    CHANGED
    
    | @@ -10,11 +10,11 @@ module Regexp::Lexer | |
| 10 10 |  | 
| 11 11 | 
             
              CLOSING_TOKENS = [:close].freeze
         | 
| 12 12 |  | 
| 13 | 
            -
              def self. | 
| 13 | 
            +
              def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", &block)
         | 
| 14 14 | 
             
                syntax = Regexp::Syntax.new(syntax)
         | 
| 15 15 |  | 
| 16 16 | 
             
                @tokens = []
         | 
| 17 | 
            -
                @nesting, @set_nesting = 0, 0
         | 
| 17 | 
            +
                @nesting, @set_nesting, @conditional_nesting = 0, 0, 0
         | 
| 18 18 |  | 
| 19 19 | 
             
                last = nil
         | 
| 20 20 | 
             
                Regexp::Scanner.scan(input) do |type, token, text, ts, te|
         | 
| @@ -27,11 +27,14 @@ module Regexp::Lexer | |
| 27 27 | 
             
                    last and last.type == :literal
         | 
| 28 28 |  | 
| 29 29 | 
             
                  current = Regexp::Token.new(type, token, text, ts, te,
         | 
| 30 | 
            -
             | 
| 30 | 
            +
                            @nesting, @set_nesting, @conditional_nesting)
         | 
| 31 31 |  | 
| 32 32 | 
             
                  current = merge_literal(current) if type == :literal and
         | 
| 33 33 | 
             
                    last and last.type == :literal
         | 
| 34 34 |  | 
| 35 | 
            +
                  current = merge_condition(current) if type == :conditional and
         | 
| 36 | 
            +
                    [:condition, :condition_close].include?(token)
         | 
| 37 | 
            +
             | 
| 35 38 | 
             
                  last.next(current) if last
         | 
| 36 39 | 
             
                  current.previous(last) if last
         | 
| 37 40 |  | 
| @@ -42,12 +45,18 @@ module Regexp::Lexer | |
| 42 45 | 
             
                end
         | 
| 43 46 |  | 
| 44 47 | 
             
                if block_given?
         | 
| 45 | 
            -
                  @tokens. | 
| 48 | 
            +
                  @tokens.map {|t| block.call(t)}
         | 
| 46 49 | 
             
                else
         | 
| 47 50 | 
             
                  @tokens
         | 
| 48 51 | 
             
                end
         | 
| 49 52 | 
             
              end
         | 
| 50 53 |  | 
| 54 | 
            +
              class << self
         | 
| 55 | 
            +
                alias :scan :lex
         | 
| 56 | 
            +
              end
         | 
| 57 | 
            +
             | 
| 58 | 
            +
              protected
         | 
| 59 | 
            +
             | 
| 51 60 | 
             
              def self.ascend(type, token)
         | 
| 52 61 | 
             
                if type == :group or type == :assertion
         | 
| 53 62 | 
             
                  @nesting -= 1 if CLOSING_TOKENS.include?(token)
         | 
| @@ -56,6 +65,10 @@ module Regexp::Lexer | |
| 56 65 | 
             
                if type == :set or type == :subset
         | 
| 57 66 | 
             
                  @set_nesting -= 1 if token == :close
         | 
| 58 67 | 
             
                end
         | 
| 68 | 
            +
             | 
| 69 | 
            +
                if type == :conditional
         | 
| 70 | 
            +
                  @conditional_nesting -= 1 if token == :close
         | 
| 71 | 
            +
                end
         | 
| 59 72 | 
             
              end
         | 
| 60 73 |  | 
| 61 74 | 
             
              def self.descend(type, token)
         | 
| @@ -66,6 +79,10 @@ module Regexp::Lexer | |
| 66 79 | 
             
                if type == :set or type == :subset
         | 
| 67 80 | 
             
                  @set_nesting += 1 if token == :open
         | 
| 68 81 | 
             
                end
         | 
| 82 | 
            +
             | 
| 83 | 
            +
                if type == :conditional
         | 
| 84 | 
            +
                  @conditional_nesting += 1 if token == :open
         | 
| 85 | 
            +
                end
         | 
| 69 86 | 
             
              end
         | 
| 70 87 |  | 
| 71 88 | 
             
              # called by scan to break a literal run that is longer than one character
         | 
| @@ -86,11 +103,11 @@ module Regexp::Lexer | |
| 86 103 |  | 
| 87 104 | 
             
                  @tokens.pop
         | 
| 88 105 | 
             
                  @tokens << Regexp::Token.new(:literal, :literal, lead, token.ts,
         | 
| 89 | 
            -
             | 
| 106 | 
            +
                              (token.te - last_length), @nesting, @set_nesting, @conditional_nesting)
         | 
| 90 107 |  | 
| 91 108 | 
             
                  @tokens << Regexp::Token.new(:literal, :literal, last,
         | 
| 92 | 
            -
             | 
| 93 | 
            -
             | 
| 109 | 
            +
                              (token.ts + lead_length),
         | 
| 110 | 
            +
                              token.te, @nesting, @set_nesting, @conditional_nesting)
         | 
| 94 111 | 
             
                end
         | 
| 95 112 | 
             
              end
         | 
| 96 113 |  | 
| @@ -99,7 +116,13 @@ module Regexp::Lexer | |
| 99 116 | 
             
              def self.merge_literal(current)
         | 
| 100 117 | 
             
                last = @tokens.pop
         | 
| 101 118 | 
             
                replace = Regexp::Token.new(:literal, :literal, last.text + current.text,
         | 
| 102 | 
            -
             | 
| 119 | 
            +
                            last.ts, current.te, @nesting, @set_nesting, @conditional_nesting)
         | 
| 120 | 
            +
              end
         | 
| 121 | 
            +
             | 
| 122 | 
            +
              def self.merge_condition(current)
         | 
| 123 | 
            +
                last = @tokens.pop
         | 
| 124 | 
            +
                Regexp::Token.new(:conditional, :condition, last.text + current.text,
         | 
| 125 | 
            +
                  last.ts, current.te, @nesting, @set_nesting, @conditional_nesting)
         | 
| 103 126 | 
             
              end
         | 
| 104 127 |  | 
| 105 128 | 
             
            end # module Regexp::Lexer
         | 
    
        data/lib/regexp_parser/parser.rb
    CHANGED
    
    | @@ -1,18 +1,14 @@ | |
| 1 | 
            -
            require  | 
| 1 | 
            +
            require 'regexp_parser/expression'
         | 
| 2 2 |  | 
| 3 3 | 
             
            module Regexp::Parser
         | 
| 4 4 | 
             
              include Regexp::Expression
         | 
| 5 5 | 
             
              include Regexp::Syntax
         | 
| 6 6 |  | 
| 7 | 
            -
              class ParserError < StandardError
         | 
| 8 | 
            -
                def initialize(what)
         | 
| 9 | 
            -
                  super what
         | 
| 10 | 
            -
                end
         | 
| 11 | 
            -
              end
         | 
| 7 | 
            +
              class ParserError < StandardError; end
         | 
| 12 8 |  | 
| 13 9 | 
             
              class UnknownTokenTypeError < ParserError
         | 
| 14 10 | 
             
                def initialize(type, token)
         | 
| 15 | 
            -
                  super "Unknown #{type}  | 
| 11 | 
            +
                  super "Unknown token type #{type} #{token.inspect}"
         | 
| 16 12 | 
             
                end
         | 
| 17 13 | 
             
              end
         | 
| 18 14 |  | 
| @@ -25,8 +21,10 @@ module Regexp::Parser | |
| 25 21 | 
             
              def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
         | 
| 26 22 | 
             
                @nesting = [@root = @node = Root.new]
         | 
| 27 23 |  | 
| 24 | 
            +
                @conditional_nesting = []
         | 
| 25 | 
            +
             | 
| 28 26 | 
             
                Regexp::Lexer.scan(input, syntax) do |token|
         | 
| 29 | 
            -
                   | 
| 27 | 
            +
                  parse_token token
         | 
| 30 28 | 
             
                end
         | 
| 31 29 |  | 
| 32 30 | 
             
                if block_given?
         | 
| @@ -43,23 +41,34 @@ module Regexp::Parser | |
| 43 41 | 
             
                @node  = exp
         | 
| 44 42 | 
             
              end
         | 
| 45 43 |  | 
| 44 | 
            +
              def self.nest_conditional(exp)
         | 
| 45 | 
            +
                @conditional_nesting.push exp
         | 
| 46 | 
            +
             | 
| 47 | 
            +
                @node << exp
         | 
| 48 | 
            +
                @node  = exp
         | 
| 49 | 
            +
              end
         | 
| 50 | 
            +
             | 
| 46 51 | 
             
              def self.parse_token(token)
         | 
| 47 52 | 
             
                case token.type
         | 
| 48 | 
            -
                when :meta;          | 
| 49 | 
            -
                when :quantifier;    | 
| 50 | 
            -
                when :anchor;        | 
| 51 | 
            -
                when :escape;        | 
| 52 | 
            -
                when :group;         | 
| 53 | 
            -
                when :assertion;     | 
| 54 | 
            -
                when :set, :subset;  | 
| 55 | 
            -
                when :type;          | 
| 56 | 
            -
                when :backref;       | 
| 53 | 
            +
                when :meta;         meta(token)
         | 
| 54 | 
            +
                when :quantifier;   quantifier(token)
         | 
| 55 | 
            +
                when :anchor;       anchor(token)
         | 
| 56 | 
            +
                when :escape;       escape(token)
         | 
| 57 | 
            +
                when :group;        group(token)
         | 
| 58 | 
            +
                when :assertion;    group(token)
         | 
| 59 | 
            +
                when :set, :subset; set(token)
         | 
| 60 | 
            +
                when :type;         type(token)
         | 
| 61 | 
            +
                when :backref;      backref(token)
         | 
| 62 | 
            +
                when :conditional;  conditional(token)
         | 
| 63 | 
            +
                when :keep;         keep(token)
         | 
| 57 64 |  | 
| 58 65 | 
             
                when :property, :nonproperty
         | 
| 59 | 
            -
                   | 
| 66 | 
            +
                  property(token)
         | 
| 60 67 |  | 
| 61 68 | 
             
                when :literal
         | 
| 62 69 | 
             
                  @node << Literal.new(token)
         | 
| 70 | 
            +
                when :free_space
         | 
| 71 | 
            +
                  free_space(token)
         | 
| 63 72 |  | 
| 64 73 | 
             
                else
         | 
| 65 74 | 
             
                  raise UnknownTokenTypeError.new(token.type, token)
         | 
| @@ -69,19 +78,19 @@ module Regexp::Parser | |
| 69 78 | 
             
              def self.set(token)
         | 
| 70 79 | 
             
                case token.token
         | 
| 71 80 | 
             
                when :open
         | 
| 72 | 
            -
                   | 
| 81 | 
            +
                  open_set(token)
         | 
| 73 82 | 
             
                when :close
         | 
| 74 | 
            -
                   | 
| 83 | 
            +
                  close_set(token)
         | 
| 75 84 | 
             
                when :negate
         | 
| 76 | 
            -
                   | 
| 85 | 
            +
                  negate_set
         | 
| 77 86 | 
             
                when :member, :range, :escape, :collation, :equivalent
         | 
| 78 | 
            -
                   | 
| 87 | 
            +
                  append_set(token)
         | 
| 79 88 | 
             
                when *Token::Escape::All
         | 
| 80 | 
            -
                   | 
| 89 | 
            +
                  append_set(token)
         | 
| 81 90 | 
             
                when *Token::CharacterSet::All
         | 
| 82 | 
            -
                   | 
| 91 | 
            +
                  append_set(token)
         | 
| 83 92 | 
             
                when *Token::UnicodeProperty::All
         | 
| 84 | 
            -
                   | 
| 93 | 
            +
                  append_set(token)
         | 
| 85 94 | 
             
                else
         | 
| 86 95 | 
             
                  raise UnknownTokenError.new('CharacterSet', token)
         | 
| 87 96 | 
             
                end
         | 
| @@ -95,7 +104,7 @@ module Regexp::Parser | |
| 95 104 | 
             
                  unless @node.token == :alternation
         | 
| 96 105 | 
             
                    unless @node.last.is_a?(Alternation)
         | 
| 97 106 | 
             
                      alt = Alternation.new(token)
         | 
| 98 | 
            -
                      seq =  | 
| 107 | 
            +
                      seq = Alternative.new(alt.level, alt.set_level, alt.conditional_level)
         | 
| 99 108 |  | 
| 100 109 | 
             
                      while @node.expressions.last
         | 
| 101 110 | 
             
                        seq.insert @node.expressions.pop
         | 
| @@ -163,6 +172,30 @@ module Regexp::Parser | |
| 163 172 | 
             
                end
         | 
| 164 173 | 
             
              end
         | 
| 165 174 |  | 
| 175 | 
            +
              def self.conditional(token)
         | 
| 176 | 
            +
                case token.token
         | 
| 177 | 
            +
                when :open
         | 
| 178 | 
            +
                  nest_conditional(Conditional::Expression.new(token))
         | 
| 179 | 
            +
                when :condition
         | 
| 180 | 
            +
                  @conditional_nesting.last.condition(Conditional::Condition.new(token))
         | 
| 181 | 
            +
                  @conditional_nesting.last.branch
         | 
| 182 | 
            +
                when :separator
         | 
| 183 | 
            +
                  @conditional_nesting.last.branch
         | 
| 184 | 
            +
                  @node = @conditional_nesting.last.branches.last
         | 
| 185 | 
            +
                when :close
         | 
| 186 | 
            +
                  @conditional_nesting.pop
         | 
| 187 | 
            +
             | 
| 188 | 
            +
                  @node = if @conditional_nesting.empty?
         | 
| 189 | 
            +
                    @nesting.last
         | 
| 190 | 
            +
                  else
         | 
| 191 | 
            +
                    @conditional_nesting.last
         | 
| 192 | 
            +
                  end
         | 
| 193 | 
            +
             | 
| 194 | 
            +
                else
         | 
| 195 | 
            +
                  raise UnknownTokenError.new('Conditional', token)
         | 
| 196 | 
            +
                end
         | 
| 197 | 
            +
              end
         | 
| 198 | 
            +
             | 
| 166 199 | 
             
              def self.property(token)
         | 
| 167 200 | 
             
                include Regexp::Expression::UnicodeProperty
         | 
| 168 201 |  | 
| @@ -291,14 +324,50 @@ module Regexp::Parser | |
| 291 324 | 
             
                when :control
         | 
| 292 325 | 
             
                  @node << EscapeSequence::Control.new(token)
         | 
| 293 326 |  | 
| 327 | 
            +
                when :meta_sequence
         | 
| 328 | 
            +
                  if token.text =~ /\A\\M-\\C/
         | 
| 329 | 
            +
                    @node << EscapeSequence::MetaControl.new(token)
         | 
| 330 | 
            +
                  else
         | 
| 331 | 
            +
                    @node << EscapeSequence::Meta.new(token)
         | 
| 332 | 
            +
                  end
         | 
| 333 | 
            +
             | 
| 294 334 | 
             
                else
         | 
| 295 335 | 
             
                  # treating everything else as a literal
         | 
| 296 336 | 
             
                  @node << EscapeSequence::Literal.new(token)
         | 
| 297 337 | 
             
                end
         | 
| 298 338 | 
             
              end
         | 
| 299 339 |  | 
| 340 | 
            +
             | 
| 341 | 
            +
              def self.keep(token)
         | 
| 342 | 
            +
                @node << Keep::Mark.new(token)
         | 
| 343 | 
            +
              end
         | 
| 344 | 
            +
             | 
| 345 | 
            +
              def self.free_space(token)
         | 
| 346 | 
            +
                case token.token
         | 
| 347 | 
            +
                when :comment
         | 
| 348 | 
            +
                  @node << Comment.new(token)
         | 
| 349 | 
            +
                when :whitespace
         | 
| 350 | 
            +
                  if @node.last and @node.last.is_a?(WhiteSpace)
         | 
| 351 | 
            +
                    @node.last.merge(WhiteSpace.new(token))
         | 
| 352 | 
            +
                  else
         | 
| 353 | 
            +
                    @node << WhiteSpace.new(token)
         | 
| 354 | 
            +
                  end
         | 
| 355 | 
            +
                else
         | 
| 356 | 
            +
                  raise UnknownTokenError.new('FreeSpace', token)
         | 
| 357 | 
            +
                end
         | 
| 358 | 
            +
              end
         | 
| 359 | 
            +
             | 
| 300 360 | 
             
              def self.quantifier(token)
         | 
| 301 | 
            -
                 | 
| 361 | 
            +
                offset = -1
         | 
| 362 | 
            +
                target_node = @node.expressions[offset]
         | 
| 363 | 
            +
                while target_node and target_node.is_a?(FreeSpace)
         | 
| 364 | 
            +
                  target_node = @node.expressions[offset -= 1]
         | 
| 365 | 
            +
                end
         | 
| 366 | 
            +
             | 
| 367 | 
            +
                raise ArgumentError.new("No valid target found for '#{token.text}' "+
         | 
| 368 | 
            +
                                        "quantifier") unless target_node
         | 
| 369 | 
            +
             | 
| 370 | 
            +
                unless target_node
         | 
| 302 371 | 
             
                  if token.token == :zero_or_one
         | 
| 303 372 | 
             
                    raise "Quantifier given without a target, or the syntax of the group " +
         | 
| 304 373 | 
             
                          "or its options is incorrect"
         | 
| @@ -309,35 +378,36 @@ module Regexp::Parser | |
| 309 378 |  | 
| 310 379 | 
             
                case token.token
         | 
| 311 380 | 
             
                when :zero_or_one
         | 
| 312 | 
            -
                   | 
| 381 | 
            +
                  target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
         | 
| 313 382 | 
             
                when :zero_or_one_reluctant
         | 
| 314 | 
            -
                   | 
| 383 | 
            +
                  target_node.quantify(:zero_or_one, token.text, 0, 1, :reluctant)
         | 
| 315 384 | 
             
                when :zero_or_one_possessive
         | 
| 316 | 
            -
                   | 
| 385 | 
            +
                  target_node.quantify(:zero_or_one, token.text, 0, 1, :possessive)
         | 
| 317 386 |  | 
| 318 387 | 
             
                when :zero_or_more
         | 
| 319 | 
            -
                   | 
| 388 | 
            +
                  target_node.quantify(:zero_or_more, token.text, 0, -1, :greedy)
         | 
| 320 389 | 
             
                when :zero_or_more_reluctant
         | 
| 321 | 
            -
                   | 
| 390 | 
            +
                  target_node.quantify(:zero_or_more, token.text, 0, -1, :reluctant)
         | 
| 322 391 | 
             
                when :zero_or_more_possessive
         | 
| 323 | 
            -
                   | 
| 392 | 
            +
                  target_node.quantify(:zero_or_more, token.text, 0, -1, :possessive)
         | 
| 324 393 |  | 
| 325 394 | 
             
                when :one_or_more
         | 
| 326 | 
            -
                   | 
| 395 | 
            +
                  target_node.quantify(:one_or_more, token.text, 1, -1, :greedy)
         | 
| 327 396 | 
             
                when :one_or_more_reluctant
         | 
| 328 | 
            -
                   | 
| 397 | 
            +
                  target_node.quantify(:one_or_more, token.text, 1, -1, :reluctant)
         | 
| 329 398 | 
             
                when :one_or_more_possessive
         | 
| 330 | 
            -
                   | 
| 399 | 
            +
                  target_node.quantify(:one_or_more, token.text, 1, -1, :possessive)
         | 
| 331 400 |  | 
| 332 401 | 
             
                when :interval
         | 
| 333 | 
            -
                   | 
| 402 | 
            +
                  interval(target_node, token)
         | 
| 334 403 |  | 
| 335 404 | 
             
                else
         | 
| 336 405 | 
             
                  raise UnknownTokenError.new('Quantifier', token)
         | 
| 337 406 | 
             
                end
         | 
| 338 407 | 
             
              end
         | 
| 339 408 |  | 
| 340 | 
            -
              def self.interval( | 
| 409 | 
            +
              def self.interval(target_node, token)
         | 
| 410 | 
            +
                text = token.text
         | 
| 341 411 | 
             
                mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
         | 
| 342 412 | 
             
                mode = case mchr
         | 
| 343 413 | 
             
                when '?'; text.chop!; :reluctant
         | 
| @@ -349,19 +419,19 @@ module Regexp::Parser | |
| 349 419 | 
             
                min = range[0].empty? ? 0 : range[0]
         | 
| 350 420 | 
             
                max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
         | 
| 351 421 |  | 
| 352 | 
            -
                 | 
| 422 | 
            +
                target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
         | 
| 353 423 | 
             
              end
         | 
| 354 424 |  | 
| 355 425 | 
             
              def self.group(token)
         | 
| 356 426 | 
             
                case token.token
         | 
| 357 427 | 
             
                when :options
         | 
| 358 | 
            -
                   | 
| 428 | 
            +
                  options(token)
         | 
| 359 429 | 
             
                when :close
         | 
| 360 | 
            -
                   | 
| 430 | 
            +
                  close_group
         | 
| 361 431 | 
             
                when :comment
         | 
| 362 432 | 
             
                  @node << Group::Comment.new(token)
         | 
| 363 433 | 
             
                else
         | 
| 364 | 
            -
                   | 
| 434 | 
            +
                  open_group(token)
         | 
| 365 435 | 
             
                end
         | 
| 366 436 | 
             
              end
         | 
| 367 437 |  | 
| @@ -372,10 +442,13 @@ module Regexp::Parser | |
| 372 442 | 
             
                exp.options = {
         | 
| 373 443 | 
             
                  :m => opt[0].include?('m') ? true : false,
         | 
| 374 444 | 
             
                  :i => opt[0].include?('i') ? true : false,
         | 
| 375 | 
            -
                  :x => opt[0].include?('x') ? true : false
         | 
| 445 | 
            +
                  :x => opt[0].include?('x') ? true : false,
         | 
| 446 | 
            +
                  :d => opt[0].include?('d') ? true : false,
         | 
| 447 | 
            +
                  :a => opt[0].include?('a') ? true : false,
         | 
| 448 | 
            +
                  :u => opt[0].include?('u') ? true : false
         | 
| 376 449 | 
             
                }
         | 
| 377 450 |  | 
| 378 | 
            -
                 | 
| 451 | 
            +
                nest(exp)
         | 
| 379 452 | 
             
              end
         | 
| 380 453 |  | 
| 381 454 | 
             
              def self.open_group(token)
         | 
| @@ -402,7 +475,7 @@ module Regexp::Parser | |
| 402 475 | 
             
                  raise UnknownTokenError.new('Group type open', token)
         | 
| 403 476 | 
             
                end
         | 
| 404 477 |  | 
| 405 | 
            -
                 | 
| 478 | 
            +
                nest(exp)
         | 
| 406 479 | 
             
              end
         | 
| 407 480 |  | 
| 408 481 | 
             
              def self.close_group
         |