regexp_parser 1.7.1 → 2.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (58) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +138 -0
  3. data/Gemfile +6 -1
  4. data/README.md +23 -11
  5. data/Rakefile +8 -8
  6. data/lib/regexp_parser/error.rb +4 -0
  7. data/lib/regexp_parser/expression.rb +13 -21
  8. data/lib/regexp_parser/expression/classes/backref.rb +5 -0
  9. data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
  10. data/lib/regexp_parser/expression/classes/free_space.rb +2 -2
  11. data/lib/regexp_parser/expression/classes/group.rb +28 -3
  12. data/lib/regexp_parser/expression/classes/property.rb +1 -1
  13. data/lib/regexp_parser/expression/classes/root.rb +4 -16
  14. data/lib/regexp_parser/expression/classes/set/range.rb +2 -1
  15. data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
  16. data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
  17. data/lib/regexp_parser/expression/quantifier.rb +10 -1
  18. data/lib/regexp_parser/expression/sequence.rb +3 -19
  19. data/lib/regexp_parser/expression/subexpression.rb +1 -1
  20. data/lib/regexp_parser/lexer.rb +6 -6
  21. data/lib/regexp_parser/parser.rb +325 -344
  22. data/lib/regexp_parser/scanner.rb +1320 -1385
  23. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  24. data/lib/regexp_parser/scanner/property.rl +2 -2
  25. data/lib/regexp_parser/scanner/scanner.rl +231 -253
  26. data/lib/regexp_parser/syntax.rb +8 -6
  27. data/lib/regexp_parser/syntax/any.rb +3 -3
  28. data/lib/regexp_parser/syntax/base.rb +1 -1
  29. data/lib/regexp_parser/syntax/version_lookup.rb +4 -4
  30. data/lib/regexp_parser/version.rb +1 -1
  31. data/regexp_parser.gemspec +1 -1
  32. data/spec/expression/base_spec.rb +10 -0
  33. data/spec/expression/clone_spec.rb +36 -4
  34. data/spec/expression/free_space_spec.rb +2 -2
  35. data/spec/expression/methods/match_length_spec.rb +2 -2
  36. data/spec/expression/subexpression_spec.rb +1 -1
  37. data/spec/expression/to_s_spec.rb +39 -31
  38. data/spec/lexer/literals_spec.rb +24 -49
  39. data/spec/lexer/refcalls_spec.rb +5 -0
  40. data/spec/parser/all_spec.rb +2 -2
  41. data/spec/parser/errors_spec.rb +1 -1
  42. data/spec/parser/escapes_spec.rb +1 -1
  43. data/spec/parser/options_spec.rb +28 -0
  44. data/spec/parser/quantifiers_spec.rb +16 -0
  45. data/spec/parser/refcalls_spec.rb +5 -0
  46. data/spec/parser/set/ranges_spec.rb +3 -3
  47. data/spec/scanner/escapes_spec.rb +12 -1
  48. data/spec/scanner/free_space_spec.rb +32 -0
  49. data/spec/scanner/groups_spec.rb +10 -1
  50. data/spec/scanner/literals_spec.rb +28 -38
  51. data/spec/scanner/options_spec.rb +36 -0
  52. data/spec/scanner/quantifiers_spec.rb +18 -13
  53. data/spec/scanner/refcalls_spec.rb +19 -0
  54. data/spec/scanner/sets_spec.rb +65 -16
  55. data/spec/spec_helper.rb +1 -0
  56. metadata +61 -60
  57. data/spec/expression/root_spec.rb +0 -9
  58. data/spec/expression/sequence_spec.rb +0 -9
@@ -7,7 +7,7 @@ module Regexp::Expression
7
7
  end
8
8
 
9
9
  def name
10
- text =~ /\A\\[pP]\{([^}]+)\}\z/; $1
10
+ text[/\A\\[pP]\{([^}]+)\}\z/, 1]
11
11
  end
12
12
 
13
13
  def shortcut
@@ -1,24 +1,12 @@
1
1
  module Regexp::Expression
2
2
 
3
3
  class Root < Regexp::Expression::Subexpression
4
- # TODO: this override is here for backwards compatibility, remove in 2.0.0
5
- def initialize(*args)
6
- unless args.first.is_a?(Regexp::Token)
7
- warn('WARNING: Root.new without a Token argument is deprecated and '\
8
- 'will be removed in 2.0.0. Use Root.build for the old behavior.')
9
- return super(self.class.build_token, *args)
10
- end
11
- super
4
+ def self.build(options = {})
5
+ new(build_token, options)
12
6
  end
13
7
 
14
- class << self
15
- def build(options = {})
16
- new(build_token, options)
17
- end
18
-
19
- def build_token
20
- Regexp::Token.new(:expression, :root, '', 0)
21
- end
8
+ def self.build_token
9
+ Regexp::Token.new(:expression, :root, '', 0)
22
10
  end
23
11
  end
24
12
  end
@@ -7,7 +7,8 @@ module Regexp::Expression
7
7
  alias :ts :starts_at
8
8
 
9
9
  def <<(exp)
10
- complete? && raise("Can't add more than 2 expressions to a Range")
10
+ complete? and raise Regexp::Parser::Error,
11
+ "Can't add more than 2 expressions to a Range"
11
12
  super
12
13
  end
13
14
 
@@ -10,7 +10,7 @@ class Regexp::MatchLength
10
10
  self.exp_class = exp.class
11
11
  self.min_rep = exp.repetitions.min
12
12
  self.max_rep = exp.repetitions.max
13
- if base = opts[:base]
13
+ if (base = opts[:base])
14
14
  self.base_min = base
15
15
  self.base_max = base
16
16
  self.reify = ->{ '.' * base }
@@ -32,7 +32,7 @@ class Regexp::MatchLength
32
32
  end
33
33
  end
34
34
 
35
- def endless_each(&block)
35
+ def endless_each
36
36
  return enum_for(__method__) unless block_given?
37
37
  (min..max).each { |num| yield(num) if include?(num) }
38
38
  end
@@ -36,7 +36,7 @@ module Regexp::Expression
36
36
 
37
37
  # Iterates over the expressions of this expression as an array, passing
38
38
  # the expression and its index within its parent to the given block.
39
- def each_expression(include_self = false, &block)
39
+ def each_expression(include_self = false)
40
40
  return enum_for(__method__, include_self) unless block_given?
41
41
 
42
42
  traverse(include_self) do |event, exp, index|
@@ -47,7 +47,7 @@ module Regexp::Expression
47
47
  # Returns a new array with the results of calling the given block once
48
48
  # for every expression. If a block is not given, returns an array with
49
49
  # each expression and its level index as an array.
50
- def flat_map(include_self = false, &block)
50
+ def flat_map(include_self = false)
51
51
  result = []
52
52
 
53
53
  each_expression(include_self) do |exp, index|
@@ -12,7 +12,7 @@ module Regexp::Expression
12
12
  @max = max
13
13
  end
14
14
 
15
- def initialize_clone(orig)
15
+ def initialize_copy(orig)
16
16
  @text = orig.text.dup
17
17
  super
18
18
  end
@@ -40,5 +40,14 @@ module Regexp::Expression
40
40
  RUBY
41
41
  end
42
42
  alias :lazy? :reluctant?
43
+
44
+ def ==(other)
45
+ other.class == self.class &&
46
+ other.token == token &&
47
+ other.mode == mode &&
48
+ other.min == min &&
49
+ other.max == max
50
+ end
51
+ alias :eq :==
43
52
  end
44
53
  end
@@ -7,16 +7,6 @@ module Regexp::Expression
7
7
  # Used as the base class for the Alternation alternatives, Conditional
8
8
  # branches, and CharacterSet::Intersection intersected sequences.
9
9
  class Sequence < Regexp::Expression::Subexpression
10
- # TODO: this override is here for backwards compatibility, remove in 2.0.0
11
- def initialize(*args)
12
- if args.count == 3
13
- warn('WARNING: Sequence.new without a Regexp::Token argument is '\
14
- 'deprecated and will be removed in 2.0.0.')
15
- return self.class.at_levels(*args)
16
- end
17
- super
18
- end
19
-
20
10
  class << self
21
11
  def add_to(subexpression, params = {}, active_opts = {})
22
12
  sequence = at_levels(
@@ -51,17 +41,11 @@ module Regexp::Expression
51
41
  alias :ts :starts_at
52
42
 
53
43
  def quantify(token, text, min = nil, max = nil, mode = :greedy)
54
- offset = -1
55
- target = expressions[offset]
56
- while target.is_a?(FreeSpace)
57
- target = expressions[offset -= 1]
58
- end
59
-
60
- target || raise(ArgumentError, "No valid target found for '#{text}' "\
61
- 'quantifier')
44
+ target = expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
45
+ target or raise Regexp::Parser::Error,
46
+ "No valid target found for '#{text}' quantifier"
62
47
 
63
48
  target.quantify(token, text, min, max, mode)
64
49
  end
65
50
  end
66
-
67
51
  end
@@ -12,7 +12,7 @@ module Regexp::Expression
12
12
  end
13
13
 
14
14
  # Override base method to clone the expressions as well.
15
- def initialize_clone(orig)
15
+ def initialize_copy(orig)
16
16
  self.expressions = orig.expressions.map(&:clone)
17
17
  super
18
18
  end
@@ -11,11 +11,11 @@ class Regexp::Lexer
11
11
 
12
12
  CLOSING_TOKENS = [:close].freeze
13
13
 
14
- def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", &block)
15
- new.lex(input, syntax, &block)
14
+ def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
15
+ new.lex(input, syntax, options: options, &block)
16
16
  end
17
17
 
18
- def lex(input, syntax = "ruby/#{RUBY_VERSION}", &block)
18
+ def lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
19
19
  syntax = Regexp::Syntax.new(syntax)
20
20
 
21
21
  self.tokens = []
@@ -25,7 +25,7 @@ class Regexp::Lexer
25
25
  self.shift = 0
26
26
 
27
27
  last = nil
28
- Regexp::Scanner.scan(input) do |type, token, text, ts, te|
28
+ Regexp::Scanner.scan(input, options: options) do |type, token, text, ts, te|
29
29
  type, token = *syntax.normalize(type, token)
30
30
  syntax.check! type, token
31
31
 
@@ -96,10 +96,10 @@ class Regexp::Lexer
96
96
 
97
97
  tokens.pop
98
98
  tokens << Regexp::Token.new(:literal, :literal, lead,
99
- token.ts, (token.te - last.bytesize),
99
+ token.ts, (token.te - last.length),
100
100
  nesting, set_nesting, conditional_nesting)
101
101
  tokens << Regexp::Token.new(:literal, :literal, last,
102
- (token.ts + lead.bytesize), token.te,
102
+ (token.ts + lead.length), token.te,
103
103
  nesting, set_nesting, conditional_nesting)
104
104
  end
105
105
 
@@ -1,10 +1,10 @@
1
+ require 'regexp_parser/error'
1
2
  require 'regexp_parser/expression'
2
3
 
3
4
  class Regexp::Parser
4
5
  include Regexp::Expression
5
- include Regexp::Syntax
6
6
 
7
- class ParserError < StandardError; end
7
+ class ParserError < Regexp::Parser::Error; end
8
8
 
9
9
  class UnknownTokenTypeError < ParserError
10
10
  def initialize(type, token)
@@ -18,12 +18,12 @@ class Regexp::Parser
18
18
  end
19
19
  end
20
20
 
21
- def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
22
- new.parse(input, syntax, &block)
21
+ def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
22
+ new.parse(input, syntax, options: options, &block)
23
23
  end
24
24
 
25
- def parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
26
- root = Root.build(options_from_input(input))
25
+ def parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
26
+ root = Root.build(extract_options(input, options))
27
27
 
28
28
  self.root = root
29
29
  self.node = root
@@ -35,7 +35,7 @@ class Regexp::Parser
35
35
 
36
36
  self.captured_group_counts = Hash.new(0)
37
37
 
38
- Regexp::Lexer.scan(input, syntax) do |token|
38
+ Regexp::Lexer.scan(input, syntax, options: options) do |token|
39
39
  parse_token(token)
40
40
  end
41
41
 
@@ -54,105 +54,171 @@ class Regexp::Parser
54
54
  :options_stack, :switching_options, :conditional_nesting,
55
55
  :captured_group_counts
56
56
 
57
- def options_from_input(input)
58
- return {} unless input.is_a?(::Regexp)
57
+ def extract_options(input, options)
58
+ if options && !input.is_a?(String)
59
+ raise ArgumentError, 'options cannot be supplied unless parsing a String'
60
+ end
59
61
 
60
- options = {}
61
- options[:i] = true if input.options & ::Regexp::IGNORECASE != 0
62
- options[:m] = true if input.options & ::Regexp::MULTILINE != 0
63
- options[:x] = true if input.options & ::Regexp::EXTENDED != 0
64
- options
65
- end
62
+ options = input.options if input.is_a?(::Regexp)
66
63
 
67
- def nest(exp)
68
- nesting.push(exp)
69
- node << exp
70
- update_transplanted_subtree(exp, node)
71
- self.node = exp
72
- end
73
-
74
- # subtrees are transplanted to build Alternations, Intersections, Ranges
75
- def update_transplanted_subtree(exp, new_parent)
76
- exp.nesting_level = new_parent.nesting_level + 1
77
- exp.respond_to?(:each) &&
78
- exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
79
- end
80
-
81
- def decrease_nesting
82
- while nesting.last.is_a?(SequenceOperation)
83
- nesting.pop
84
- self.node = nesting.last
85
- end
86
- nesting.pop
87
- yield(node) if block_given?
88
- self.node = nesting.last
89
- self.node = node.last if node.last.is_a?(SequenceOperation)
90
- end
64
+ return {} unless options
91
65
 
92
- def nest_conditional(exp)
93
- conditional_nesting.push(exp)
94
- nest(exp)
66
+ enabled_options = {}
67
+ enabled_options[:i] = true if options & ::Regexp::IGNORECASE != 0
68
+ enabled_options[:m] = true if options & ::Regexp::MULTILINE != 0
69
+ enabled_options[:x] = true if options & ::Regexp::EXTENDED != 0
70
+ enabled_options
95
71
  end
96
72
 
97
73
  def parse_token(token)
98
- close_completed_character_set_range
99
-
100
74
  case token.type
101
- when :meta; meta(token)
102
- when :quantifier; quantifier(token)
103
- when :anchor; anchor(token)
104
- when :escape; escape(token)
105
- when :group; group(token)
106
- when :assertion; group(token)
107
- when :set; set(token)
108
- when :type; type(token)
109
- when :backref; backref(token)
110
- when :conditional; conditional(token)
111
- when :keep; keep(token)
112
-
113
- when :posixclass, :nonposixclass
114
- posixclass(token)
115
- when :property, :nonproperty
116
- property(token)
117
-
118
- when :literal
119
- node << Literal.new(token, active_opts)
120
- when :free_space
121
- free_space(token)
122
-
75
+ when :anchor; anchor(token)
76
+ when :assertion, :group; group(token)
77
+ when :backref; backref(token)
78
+ when :conditional; conditional(token)
79
+ when :escape; escape(token)
80
+ when :free_space; free_space(token)
81
+ when :keep; keep(token)
82
+ when :literal; literal(token)
83
+ when :meta; meta(token)
84
+ when :posixclass, :nonposixclass; posixclass(token)
85
+ when :property, :nonproperty; property(token)
86
+ when :quantifier; quantifier(token)
87
+ when :set; set(token)
88
+ when :type; type(token)
123
89
  else
124
90
  raise UnknownTokenTypeError.new(token.type, token)
125
91
  end
92
+
93
+ close_completed_character_set_range
126
94
  end
127
95
 
128
- def set(token)
96
+ def anchor(token)
129
97
  case token.token
130
- when :open
131
- open_set(token)
132
- when :close
133
- close_set
134
- when :negate
135
- negate_set
136
- when :range
137
- range(token)
138
- when :intersection
139
- intersection(token)
140
- when :collation, :equivalent
141
- node << Literal.new(token, active_opts)
98
+ when :bol; node << Anchor::BeginningOfLine.new(token, active_opts)
99
+ when :bos; node << Anchor::BOS.new(token, active_opts)
100
+ when :eol; node << Anchor::EndOfLine.new(token, active_opts)
101
+ when :eos; node << Anchor::EOS.new(token, active_opts)
102
+ when :eos_ob_eol; node << Anchor::EOSobEOL.new(token, active_opts)
103
+ when :match_start; node << Anchor::MatchStart.new(token, active_opts)
104
+ when :nonword_boundary; node << Anchor::NonWordBoundary.new(token, active_opts)
105
+ when :word_boundary; node << Anchor::WordBoundary.new(token, active_opts)
142
106
  else
143
- raise UnknownTokenError.new('CharacterSet', token)
107
+ raise UnknownTokenError.new('Anchor', token)
144
108
  end
145
109
  end
146
110
 
147
- def meta(token)
111
+ def group(token)
148
112
  case token.token
149
- when :dot
150
- node << CharacterType::Any.new(token, active_opts)
151
- when :alternation
152
- sequence_operation(Alternation, token)
113
+ when :options, :options_switch
114
+ options_group(token)
115
+ when :close
116
+ close_group
117
+ when :comment
118
+ node << Group::Comment.new(token, active_opts)
153
119
  else
154
- raise UnknownTokenError.new('Meta', token)
120
+ open_group(token)
121
+ end
122
+ end
123
+
124
+ MOD_FLAGS = %w[i m x].map(&:to_sym)
125
+ ENC_FLAGS = %w[a d u].map(&:to_sym)
126
+
127
+ def options_group(token)
128
+ positive, negative = token.text.split('-', 2)
129
+ negative ||= ''
130
+ self.switching_options = token.token.equal?(:options_switch)
131
+
132
+ opt_changes = {}
133
+ new_active_opts = active_opts.dup
134
+
135
+ MOD_FLAGS.each do |flag|
136
+ if positive.include?(flag.to_s)
137
+ opt_changes[flag] = new_active_opts[flag] = true
138
+ end
139
+ if negative.include?(flag.to_s)
140
+ opt_changes[flag] = false
141
+ new_active_opts.delete(flag)
142
+ end
143
+ end
144
+
145
+ if (enc_flag = positive.reverse[/[adu]/])
146
+ enc_flag = enc_flag.to_sym
147
+ (ENC_FLAGS - [enc_flag]).each do |other|
148
+ opt_changes[other] = false if new_active_opts[other]
149
+ new_active_opts.delete(other)
150
+ end
151
+ opt_changes[enc_flag] = new_active_opts[enc_flag] = true
152
+ end
153
+
154
+ options_stack << new_active_opts
155
+
156
+ options_group = Group::Options.new(token, active_opts)
157
+ options_group.option_changes = opt_changes
158
+
159
+ nest(options_group)
160
+ end
161
+
162
+ def open_group(token)
163
+ group_class =
164
+ case token.token
165
+ when :absence; Group::Absence
166
+ when :atomic; Group::Atomic
167
+ when :capture; Group::Capture
168
+ when :named; Group::Named
169
+ when :passive; Group::Passive
170
+
171
+ when :lookahead; Assertion::Lookahead
172
+ when :lookbehind; Assertion::Lookbehind
173
+ when :nlookahead; Assertion::NegativeLookahead
174
+ when :nlookbehind; Assertion::NegativeLookbehind
175
+
176
+ else
177
+ raise UnknownTokenError.new('Group type open', token)
178
+ end
179
+
180
+ group = group_class.new(token, active_opts)
181
+
182
+ if group.capturing?
183
+ group.number = total_captured_group_count + 1
184
+ group.number_at_level = captured_group_count_at_level + 1
185
+ count_captured_group
186
+ end
187
+
188
+ # Push the active options to the stack again. This way we can simply pop the
189
+ # stack for any group we close, no matter if it had its own options or not.
190
+ options_stack << active_opts
191
+
192
+ nest(group)
193
+ end
194
+
195
+ def total_captured_group_count
196
+ captured_group_counts.values.reduce(0, :+)
197
+ end
198
+
199
+ def captured_group_count_at_level
200
+ captured_group_counts[node.level]
201
+ end
202
+
203
+ def count_captured_group
204
+ captured_group_counts[node.level] += 1
205
+ end
206
+
207
+ def close_group
208
+ options_stack.pop unless switching_options
209
+ self.switching_options = false
210
+ decrease_nesting
211
+ end
212
+
213
+ def decrease_nesting
214
+ while nesting.last.is_a?(SequenceOperation)
215
+ nesting.pop
216
+ self.node = nesting.last
155
217
  end
218
+ nesting.pop
219
+ yield(node) if block_given?
220
+ self.node = nesting.last
221
+ self.node = node.last if node.last.is_a?(SequenceOperation)
156
222
  end
157
223
 
158
224
  def backref(token)
@@ -182,31 +248,9 @@ class Regexp::Parser
182
248
  end
183
249
  end
184
250
 
185
- def type(token)
186
- case token.token
187
- when :digit
188
- node << CharacterType::Digit.new(token, active_opts)
189
- when :nondigit
190
- node << CharacterType::NonDigit.new(token, active_opts)
191
- when :hex
192
- node << CharacterType::Hex.new(token, active_opts)
193
- when :nonhex
194
- node << CharacterType::NonHex.new(token, active_opts)
195
- when :space
196
- node << CharacterType::Space.new(token, active_opts)
197
- when :nonspace
198
- node << CharacterType::NonSpace.new(token, active_opts)
199
- when :word
200
- node << CharacterType::Word.new(token, active_opts)
201
- when :nonword
202
- node << CharacterType::NonWord.new(token, active_opts)
203
- when :linebreak
204
- node << CharacterType::Linebreak.new(token, active_opts)
205
- when :xgrapheme
206
- node << CharacterType::ExtendedGrapheme.new(token, active_opts)
207
- else
208
- raise UnknownTokenError.new('CharacterType', token)
209
- end
251
+ def assign_effective_number(exp)
252
+ exp.effective_number =
253
+ exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
210
254
  end
211
255
 
212
256
  def conditional(token)
@@ -234,11 +278,118 @@ class Regexp::Parser
234
278
  end
235
279
  end
236
280
 
281
+ def nest_conditional(exp)
282
+ conditional_nesting.push(exp)
283
+ nest(exp)
284
+ end
285
+
286
+ def nest(exp)
287
+ nesting.push(exp)
288
+ node << exp
289
+ update_transplanted_subtree(exp, node)
290
+ self.node = exp
291
+ end
292
+
293
+ # subtrees are transplanted to build Alternations, Intersections, Ranges
294
+ def update_transplanted_subtree(exp, new_parent)
295
+ exp.nesting_level = new_parent.nesting_level + 1
296
+ exp.respond_to?(:each) &&
297
+ exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
298
+ end
299
+
300
+ def escape(token)
301
+ case token.token
302
+
303
+ when :backspace; node << EscapeSequence::Backspace.new(token, active_opts)
304
+
305
+ when :escape; node << EscapeSequence::AsciiEscape.new(token, active_opts)
306
+ when :bell; node << EscapeSequence::Bell.new(token, active_opts)
307
+ when :form_feed; node << EscapeSequence::FormFeed.new(token, active_opts)
308
+ when :newline; node << EscapeSequence::Newline.new(token, active_opts)
309
+ when :carriage; node << EscapeSequence::Return.new(token, active_opts)
310
+ when :tab; node << EscapeSequence::Tab.new(token, active_opts)
311
+ when :vertical_tab; node << EscapeSequence::VerticalTab.new(token, active_opts)
312
+
313
+ when :codepoint; node << EscapeSequence::Codepoint.new(token, active_opts)
314
+ when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
315
+ when :hex; node << EscapeSequence::Hex.new(token, active_opts)
316
+ when :octal; node << EscapeSequence::Octal.new(token, active_opts)
317
+
318
+ when :control
319
+ if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
320
+ node << EscapeSequence::MetaControl.new(token, active_opts)
321
+ else
322
+ node << EscapeSequence::Control.new(token, active_opts)
323
+ end
324
+
325
+ when :meta_sequence
326
+ if token.text =~ /\A\\M-\\[Cc]/
327
+ node << EscapeSequence::MetaControl.new(token, active_opts)
328
+ else
329
+ node << EscapeSequence::Meta.new(token, active_opts)
330
+ end
331
+
332
+ else
333
+ # treating everything else as a literal
334
+ # TODO: maybe split this up a bit more in v3.0.0?
335
+ # E.g. escaped quantifiers or set meta chars are not the same
336
+ # as stuff that would be a literal even without the backslash.
337
+ # Right now, they all end up here.
338
+ node << EscapeSequence::Literal.new(token, active_opts)
339
+ end
340
+ end
341
+
342
+ def free_space(token)
343
+ case token.token
344
+ when :comment
345
+ node << Comment.new(token, active_opts)
346
+ when :whitespace
347
+ if node.last.is_a?(WhiteSpace)
348
+ node.last.merge(WhiteSpace.new(token, active_opts))
349
+ else
350
+ node << WhiteSpace.new(token, active_opts)
351
+ end
352
+ else
353
+ raise UnknownTokenError.new('FreeSpace', token)
354
+ end
355
+ end
356
+
357
+ def keep(token)
358
+ node << Keep::Mark.new(token, active_opts)
359
+ end
360
+
361
+ def literal(token)
362
+ node << Literal.new(token, active_opts)
363
+ end
364
+
365
+ def meta(token)
366
+ case token.token
367
+ when :dot
368
+ node << CharacterType::Any.new(token, active_opts)
369
+ when :alternation
370
+ sequence_operation(Alternation, token)
371
+ else
372
+ raise UnknownTokenError.new('Meta', token)
373
+ end
374
+ end
375
+
376
+ def sequence_operation(klass, token)
377
+ unless node.is_a?(klass)
378
+ operator = klass.new(token, active_opts)
379
+ sequence = operator.add_sequence(active_opts)
380
+ sequence.expressions = node.expressions
381
+ node.expressions = []
382
+ nest(operator)
383
+ end
384
+ node.add_sequence(active_opts)
385
+ end
386
+
237
387
  def posixclass(token)
238
388
  node << PosixClass.new(token, active_opts)
239
389
  end
240
390
 
241
391
  include Regexp::Expression::UnicodeProperty
392
+ UPTokens = Regexp::Syntax::Token::UnicodeProperty
242
393
 
243
394
  def property(token)
244
395
  case token.token
@@ -310,128 +461,43 @@ class Regexp::Parser
310
461
  when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
311
462
  when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
312
463
 
313
- when *Token::UnicodeProperty::Age
314
- node << Age.new(token, active_opts)
315
-
316
- when *Token::UnicodeProperty::Derived
317
- node << Derived.new(token, active_opts)
318
-
319
- when *Token::UnicodeProperty::Emoji
320
- node << Emoji.new(token, active_opts)
321
-
322
- when *Token::UnicodeProperty::Script
323
- node << Script.new(token, active_opts)
324
-
325
- when *Token::UnicodeProperty::UnicodeBlock
326
- node << Block.new(token, active_opts)
464
+ when *UPTokens::Age; node << Age.new(token, active_opts)
465
+ when *UPTokens::Derived; node << Derived.new(token, active_opts)
466
+ when *UPTokens::Emoji; node << Emoji.new(token, active_opts)
467
+ when *UPTokens::Script; node << Script.new(token, active_opts)
468
+ when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts)
327
469
 
328
470
  else
329
471
  raise UnknownTokenError.new('UnicodeProperty', token)
330
472
  end
331
473
  end
332
474
 
333
- def anchor(token)
334
- case token.token
335
- when :bol
336
- node << Anchor::BeginningOfLine.new(token, active_opts)
337
- when :eol
338
- node << Anchor::EndOfLine.new(token, active_opts)
339
- when :bos
340
- node << Anchor::BOS.new(token, active_opts)
341
- when :eos
342
- node << Anchor::EOS.new(token, active_opts)
343
- when :eos_ob_eol
344
- node << Anchor::EOSobEOL.new(token, active_opts)
345
- when :word_boundary
346
- node << Anchor::WordBoundary.new(token, active_opts)
347
- when :nonword_boundary
348
- node << Anchor::NonWordBoundary.new(token, active_opts)
349
- when :match_start
350
- node << Anchor::MatchStart.new(token, active_opts)
351
- else
352
- raise UnknownTokenError.new('Anchor', token)
353
- end
354
- end
355
-
356
- def escape(token)
357
- case token.token
358
-
359
- when :backspace
360
- node << EscapeSequence::Backspace.new(token, active_opts)
361
-
362
- when :escape
363
- node << EscapeSequence::AsciiEscape.new(token, active_opts)
364
- when :bell
365
- node << EscapeSequence::Bell.new(token, active_opts)
366
- when :form_feed
367
- node << EscapeSequence::FormFeed.new(token, active_opts)
368
- when :newline
369
- node << EscapeSequence::Newline.new(token, active_opts)
370
- when :carriage
371
- node << EscapeSequence::Return.new(token, active_opts)
372
- when :tab
373
- node << EscapeSequence::Tab.new(token, active_opts)
374
- when :vertical_tab
375
- node << EscapeSequence::VerticalTab.new(token, active_opts)
376
-
377
- when :hex
378
- node << EscapeSequence::Hex.new(token, active_opts)
379
- when :octal
380
- node << EscapeSequence::Octal.new(token, active_opts)
381
- when :codepoint
382
- node << EscapeSequence::Codepoint.new(token, active_opts)
383
- when :codepoint_list
384
- node << EscapeSequence::CodepointList.new(token, active_opts)
385
-
386
- when :control
387
- if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
388
- node << EscapeSequence::MetaControl.new(token, active_opts)
389
- else
390
- node << EscapeSequence::Control.new(token, active_opts)
391
- end
392
-
393
- when :meta_sequence
394
- if token.text =~ /\A\\M-\\[Cc]/
395
- node << EscapeSequence::MetaControl.new(token, active_opts)
396
- else
397
- node << EscapeSequence::Meta.new(token, active_opts)
398
- end
399
-
400
- else
401
- # treating everything else as a literal
402
- node << EscapeSequence::Literal.new(token, active_opts)
403
- end
404
- end
405
-
406
- def keep(token)
407
- node << Keep::Mark.new(token, active_opts)
408
- end
409
-
410
- def free_space(token)
411
- case token.token
412
- when :comment
413
- node << Comment.new(token, active_opts)
414
- when :whitespace
415
- if node.last.is_a?(WhiteSpace)
416
- node.last.merge(WhiteSpace.new(token, active_opts))
417
- else
418
- node << WhiteSpace.new(token, active_opts)
419
- end
420
- else
421
- raise UnknownTokenError.new('FreeSpace', token)
422
- end
423
- end
424
-
425
475
  def quantifier(token)
426
- offset = -1
427
- target_node = node.expressions[offset]
428
- while target_node.is_a?(FreeSpace)
429
- target_node = node.expressions[offset -= 1]
476
+ target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
477
+ target_node or raise ParserError, "No valid target found for '#{token.text}'"
478
+
479
+ # in case of chained quantifiers, wrap target in an implicit passive group
480
+ # description of the problem: https://github.com/ammar/regexp_parser/issues/3
481
+ # rationale for this solution: https://github.com/ammar/regexp_parser/pull/69
482
+ if target_node.quantified?
483
+ new_token = Regexp::Token.new(
484
+ :group,
485
+ :passive,
486
+ '', # text
487
+ target_node.ts,
488
+ nil, # te (unused)
489
+ target_node.level,
490
+ target_node.set_level,
491
+ target_node.conditional_level
492
+ )
493
+ new_group = Group::Passive.new(new_token, active_opts)
494
+ new_group.implicit = true
495
+ new_group << target_node
496
+ increase_level(target_node)
497
+ node.expressions[node.expressions.index(target_node)] = new_group
498
+ target_node = new_group
430
499
  end
431
500
 
432
- target_node || raise(ArgumentError, 'No valid target found for '\
433
- "'#{token.text}' ")
434
-
435
501
  case token.token
436
502
  when :zero_or_one
437
503
  target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
@@ -462,6 +528,11 @@ class Regexp::Parser
462
528
  end
463
529
  end
464
530
 
531
+ def increase_level(exp)
532
+ exp.level += 1
533
+ exp.respond_to?(:each) && exp.each { |subexp| increase_level(subexp) }
534
+ end
535
+
465
536
  def interval(target_node, token)
466
537
  text = token.text
467
538
  mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
@@ -484,100 +555,16 @@ class Regexp::Parser
484
555
  target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
485
556
  end
486
557
 
487
- def group(token)
488
- case token.token
489
- when :options, :options_switch
490
- options_group(token)
491
- when :close
492
- close_group
493
- when :comment
494
- node << Group::Comment.new(token, active_opts)
495
- else
496
- open_group(token)
497
- end
498
- end
499
-
500
- MOD_FLAGS = %w[i m x].map(&:to_sym)
501
- ENC_FLAGS = %w[a d u].map(&:to_sym)
502
-
503
- def options_group(token)
504
- positive, negative = token.text.split('-', 2)
505
- negative ||= ''
506
- self.switching_options = token.token.equal?(:options_switch)
507
-
508
- opt_changes = {}
509
- new_active_opts = active_opts.dup
510
-
511
- MOD_FLAGS.each do |flag|
512
- if positive.include?(flag.to_s)
513
- opt_changes[flag] = new_active_opts[flag] = true
514
- end
515
- if negative.include?(flag.to_s)
516
- opt_changes[flag] = false
517
- new_active_opts.delete(flag)
518
- end
519
- end
520
-
521
- if (enc_flag = positive.reverse[/[adu]/])
522
- enc_flag = enc_flag.to_sym
523
- (ENC_FLAGS - [enc_flag]).each do |other|
524
- opt_changes[other] = false if new_active_opts[other]
525
- new_active_opts.delete(other)
526
- end
527
- opt_changes[enc_flag] = new_active_opts[enc_flag] = true
528
- end
529
-
530
- options_stack << new_active_opts
531
-
532
- options_group = Group::Options.new(token, active_opts)
533
- options_group.option_changes = opt_changes
534
-
535
- nest(options_group)
536
- end
537
-
538
- def open_group(token)
558
+ def set(token)
539
559
  case token.token
540
- when :passive
541
- exp = Group::Passive.new(token, active_opts)
542
- when :atomic
543
- exp = Group::Atomic.new(token, active_opts)
544
- when :named
545
- exp = Group::Named.new(token, active_opts)
546
- when :capture
547
- exp = Group::Capture.new(token, active_opts)
548
- when :absence
549
- exp = Group::Absence.new(token, active_opts)
550
-
551
- when :lookahead
552
- exp = Assertion::Lookahead.new(token, active_opts)
553
- when :nlookahead
554
- exp = Assertion::NegativeLookahead.new(token, active_opts)
555
- when :lookbehind
556
- exp = Assertion::Lookbehind.new(token, active_opts)
557
- when :nlookbehind
558
- exp = Assertion::NegativeLookbehind.new(token, active_opts)
559
-
560
+ when :open; open_set(token)
561
+ when :close; close_set
562
+ when :negate; negate_set
563
+ when :range; range(token)
564
+ when :intersection; intersection(token)
560
565
  else
561
- raise UnknownTokenError.new('Group type open', token)
562
- end
563
-
564
- if exp.capturing?
565
- exp.number = total_captured_group_count + 1
566
- exp.number_at_level = captured_group_count_at_level + 1
567
- count_captured_group
566
+ raise UnknownTokenError.new('CharacterSet', token)
568
567
  end
569
-
570
- # Push the active options to the stack again. This way we can simply pop the
571
- # stack for any group we close, no matter if it had its own options or not.
572
- options_stack << active_opts
573
-
574
- nest(exp)
575
- end
576
-
577
- def close_group
578
- options_stack.pop unless switching_options
579
- self.switching_options = false
580
- decrease_nesting
581
568
  end
582
569
 
583
570
  def open_set(token)
@@ -600,51 +587,45 @@ class Regexp::Parser
600
587
  nest(exp)
601
588
  end
602
589
 
603
- def close_completed_character_set_range
604
- decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
605
- end
606
-
607
590
  def intersection(token)
608
591
  sequence_operation(CharacterSet::Intersection, token)
609
592
  end
610
593
 
611
- def sequence_operation(klass, token)
612
- unless node.is_a?(klass)
613
- operator = klass.new(token, active_opts)
614
- sequence = operator.add_sequence(active_opts)
615
- sequence.expressions = node.expressions
616
- node.expressions = []
617
- nest(operator)
594
+ def type(token)
595
+ case token.token
596
+ when :digit; node << CharacterType::Digit.new(token, active_opts)
597
+ when :hex; node << CharacterType::Hex.new(token, active_opts)
598
+ when :linebreak; node << CharacterType::Linebreak.new(token, active_opts)
599
+ when :nondigit; node << CharacterType::NonDigit.new(token, active_opts)
600
+ when :nonhex; node << CharacterType::NonHex.new(token, active_opts)
601
+ when :nonspace; node << CharacterType::NonSpace.new(token, active_opts)
602
+ when :nonword; node << CharacterType::NonWord.new(token, active_opts)
603
+ when :space; node << CharacterType::Space.new(token, active_opts)
604
+ when :word; node << CharacterType::Word.new(token, active_opts)
605
+ when :xgrapheme; node << CharacterType::ExtendedGrapheme.new(token, active_opts)
606
+ else
607
+ raise UnknownTokenError.new('CharacterType', token)
618
608
  end
619
- node.add_sequence(active_opts)
620
- end
621
-
622
- def active_opts
623
- options_stack.last
624
- end
625
-
626
- def total_captured_group_count
627
- captured_group_counts.values.reduce(0, :+)
628
- end
629
-
630
- def captured_group_count_at_level
631
- captured_group_counts[node.level]
632
609
  end
633
610
 
634
- def count_captured_group
635
- captured_group_counts[node.level] += 1
611
+ def close_completed_character_set_range
612
+ decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
636
613
  end
637
614
 
638
- def assign_effective_number(exp)
639
- exp.effective_number =
640
- exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
615
+ def active_opts
616
+ options_stack.last
641
617
  end
642
618
 
619
+ # Assigns referenced expressions to referring expressions, e.g. if there is
620
+ # an instance of Backreference::Number, its #referenced_expression is set to
621
+ # the instance of Group::Capture that it refers to via its number.
643
622
  def assign_referenced_expressions
644
623
  targets = {}
624
+ # find all referencable expressions
645
625
  root.each_expression do |exp|
646
626
  exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
647
627
  end
628
+ # assign them to any referring expressions
648
629
  root.each_expression do |exp|
649
630
  exp.respond_to?(:reference) &&
650
631
  exp.referenced_expression = targets[exp.reference]