regexp_parser 1.7.1 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58):
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +138 -0
  3. data/Gemfile +6 -1
  4. data/README.md +23 -11
  5. data/Rakefile +8 -8
  6. data/lib/regexp_parser/error.rb +4 -0
  7. data/lib/regexp_parser/expression.rb +13 -21
  8. data/lib/regexp_parser/expression/classes/backref.rb +5 -0
  9. data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
  10. data/lib/regexp_parser/expression/classes/free_space.rb +2 -2
  11. data/lib/regexp_parser/expression/classes/group.rb +28 -3
  12. data/lib/regexp_parser/expression/classes/property.rb +1 -1
  13. data/lib/regexp_parser/expression/classes/root.rb +4 -16
  14. data/lib/regexp_parser/expression/classes/set/range.rb +2 -1
  15. data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
  16. data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
  17. data/lib/regexp_parser/expression/quantifier.rb +10 -1
  18. data/lib/regexp_parser/expression/sequence.rb +3 -19
  19. data/lib/regexp_parser/expression/subexpression.rb +1 -1
  20. data/lib/regexp_parser/lexer.rb +6 -6
  21. data/lib/regexp_parser/parser.rb +325 -344
  22. data/lib/regexp_parser/scanner.rb +1320 -1385
  23. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  24. data/lib/regexp_parser/scanner/property.rl +2 -2
  25. data/lib/regexp_parser/scanner/scanner.rl +231 -253
  26. data/lib/regexp_parser/syntax.rb +8 -6
  27. data/lib/regexp_parser/syntax/any.rb +3 -3
  28. data/lib/regexp_parser/syntax/base.rb +1 -1
  29. data/lib/regexp_parser/syntax/version_lookup.rb +4 -4
  30. data/lib/regexp_parser/version.rb +1 -1
  31. data/regexp_parser.gemspec +1 -1
  32. data/spec/expression/base_spec.rb +10 -0
  33. data/spec/expression/clone_spec.rb +36 -4
  34. data/spec/expression/free_space_spec.rb +2 -2
  35. data/spec/expression/methods/match_length_spec.rb +2 -2
  36. data/spec/expression/subexpression_spec.rb +1 -1
  37. data/spec/expression/to_s_spec.rb +39 -31
  38. data/spec/lexer/literals_spec.rb +24 -49
  39. data/spec/lexer/refcalls_spec.rb +5 -0
  40. data/spec/parser/all_spec.rb +2 -2
  41. data/spec/parser/errors_spec.rb +1 -1
  42. data/spec/parser/escapes_spec.rb +1 -1
  43. data/spec/parser/options_spec.rb +28 -0
  44. data/spec/parser/quantifiers_spec.rb +16 -0
  45. data/spec/parser/refcalls_spec.rb +5 -0
  46. data/spec/parser/set/ranges_spec.rb +3 -3
  47. data/spec/scanner/escapes_spec.rb +12 -1
  48. data/spec/scanner/free_space_spec.rb +32 -0
  49. data/spec/scanner/groups_spec.rb +10 -1
  50. data/spec/scanner/literals_spec.rb +28 -38
  51. data/spec/scanner/options_spec.rb +36 -0
  52. data/spec/scanner/quantifiers_spec.rb +18 -13
  53. data/spec/scanner/refcalls_spec.rb +19 -0
  54. data/spec/scanner/sets_spec.rb +65 -16
  55. data/spec/spec_helper.rb +1 -0
  56. metadata +61 -60
  57. data/spec/expression/root_spec.rb +0 -9
  58. data/spec/expression/sequence_spec.rb +0 -9
@@ -7,7 +7,7 @@ module Regexp::Expression
7
7
  end
8
8
 
9
9
  def name
10
- text =~ /\A\\[pP]\{([^}]+)\}\z/; $1
10
+ text[/\A\\[pP]\{([^}]+)\}\z/, 1]
11
11
  end
12
12
 
13
13
  def shortcut
@@ -1,24 +1,12 @@
1
1
  module Regexp::Expression
2
2
 
3
3
  class Root < Regexp::Expression::Subexpression
4
- # TODO: this override is here for backwards compatibility, remove in 2.0.0
5
- def initialize(*args)
6
- unless args.first.is_a?(Regexp::Token)
7
- warn('WARNING: Root.new without a Token argument is deprecated and '\
8
- 'will be removed in 2.0.0. Use Root.build for the old behavior.')
9
- return super(self.class.build_token, *args)
10
- end
11
- super
4
+ def self.build(options = {})
5
+ new(build_token, options)
12
6
  end
13
7
 
14
- class << self
15
- def build(options = {})
16
- new(build_token, options)
17
- end
18
-
19
- def build_token
20
- Regexp::Token.new(:expression, :root, '', 0)
21
- end
8
+ def self.build_token
9
+ Regexp::Token.new(:expression, :root, '', 0)
22
10
  end
23
11
  end
24
12
  end
@@ -7,7 +7,8 @@ module Regexp::Expression
7
7
  alias :ts :starts_at
8
8
 
9
9
  def <<(exp)
10
- complete? && raise("Can't add more than 2 expressions to a Range")
10
+ complete? and raise Regexp::Parser::Error,
11
+ "Can't add more than 2 expressions to a Range"
11
12
  super
12
13
  end
13
14
 
@@ -10,7 +10,7 @@ class Regexp::MatchLength
10
10
  self.exp_class = exp.class
11
11
  self.min_rep = exp.repetitions.min
12
12
  self.max_rep = exp.repetitions.max
13
- if base = opts[:base]
13
+ if (base = opts[:base])
14
14
  self.base_min = base
15
15
  self.base_max = base
16
16
  self.reify = ->{ '.' * base }
@@ -32,7 +32,7 @@ class Regexp::MatchLength
32
32
  end
33
33
  end
34
34
 
35
- def endless_each(&block)
35
+ def endless_each
36
36
  return enum_for(__method__) unless block_given?
37
37
  (min..max).each { |num| yield(num) if include?(num) }
38
38
  end
@@ -36,7 +36,7 @@ module Regexp::Expression
36
36
 
37
37
  # Iterates over the expressions of this expression as an array, passing
38
38
  # the expression and its index within its parent to the given block.
39
- def each_expression(include_self = false, &block)
39
+ def each_expression(include_self = false)
40
40
  return enum_for(__method__, include_self) unless block_given?
41
41
 
42
42
  traverse(include_self) do |event, exp, index|
@@ -47,7 +47,7 @@ module Regexp::Expression
47
47
  # Returns a new array with the results of calling the given block once
48
48
  # for every expression. If a block is not given, returns an array with
49
49
  # each expression and its level index as an array.
50
- def flat_map(include_self = false, &block)
50
+ def flat_map(include_self = false)
51
51
  result = []
52
52
 
53
53
  each_expression(include_self) do |exp, index|
@@ -12,7 +12,7 @@ module Regexp::Expression
12
12
  @max = max
13
13
  end
14
14
 
15
- def initialize_clone(orig)
15
+ def initialize_copy(orig)
16
16
  @text = orig.text.dup
17
17
  super
18
18
  end
@@ -40,5 +40,14 @@ module Regexp::Expression
40
40
  RUBY
41
41
  end
42
42
  alias :lazy? :reluctant?
43
+
44
+ def ==(other)
45
+ other.class == self.class &&
46
+ other.token == token &&
47
+ other.mode == mode &&
48
+ other.min == min &&
49
+ other.max == max
50
+ end
51
+ alias :eq :==
43
52
  end
44
53
  end
@@ -7,16 +7,6 @@ module Regexp::Expression
7
7
  # Used as the base class for the Alternation alternatives, Conditional
8
8
  # branches, and CharacterSet::Intersection intersected sequences.
9
9
  class Sequence < Regexp::Expression::Subexpression
10
- # TODO: this override is here for backwards compatibility, remove in 2.0.0
11
- def initialize(*args)
12
- if args.count == 3
13
- warn('WARNING: Sequence.new without a Regexp::Token argument is '\
14
- 'deprecated and will be removed in 2.0.0.')
15
- return self.class.at_levels(*args)
16
- end
17
- super
18
- end
19
-
20
10
  class << self
21
11
  def add_to(subexpression, params = {}, active_opts = {})
22
12
  sequence = at_levels(
@@ -51,17 +41,11 @@ module Regexp::Expression
51
41
  alias :ts :starts_at
52
42
 
53
43
  def quantify(token, text, min = nil, max = nil, mode = :greedy)
54
- offset = -1
55
- target = expressions[offset]
56
- while target.is_a?(FreeSpace)
57
- target = expressions[offset -= 1]
58
- end
59
-
60
- target || raise(ArgumentError, "No valid target found for '#{text}' "\
61
- 'quantifier')
44
+ target = expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
45
+ target or raise Regexp::Parser::Error,
46
+ "No valid target found for '#{text}' quantifier"
62
47
 
63
48
  target.quantify(token, text, min, max, mode)
64
49
  end
65
50
  end
66
-
67
51
  end
@@ -12,7 +12,7 @@ module Regexp::Expression
12
12
  end
13
13
 
14
14
  # Override base method to clone the expressions as well.
15
- def initialize_clone(orig)
15
+ def initialize_copy(orig)
16
16
  self.expressions = orig.expressions.map(&:clone)
17
17
  super
18
18
  end
@@ -11,11 +11,11 @@ class Regexp::Lexer
11
11
 
12
12
  CLOSING_TOKENS = [:close].freeze
13
13
 
14
- def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", &block)
15
- new.lex(input, syntax, &block)
14
+ def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
15
+ new.lex(input, syntax, options: options, &block)
16
16
  end
17
17
 
18
- def lex(input, syntax = "ruby/#{RUBY_VERSION}", &block)
18
+ def lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
19
19
  syntax = Regexp::Syntax.new(syntax)
20
20
 
21
21
  self.tokens = []
@@ -25,7 +25,7 @@ class Regexp::Lexer
25
25
  self.shift = 0
26
26
 
27
27
  last = nil
28
- Regexp::Scanner.scan(input) do |type, token, text, ts, te|
28
+ Regexp::Scanner.scan(input, options: options) do |type, token, text, ts, te|
29
29
  type, token = *syntax.normalize(type, token)
30
30
  syntax.check! type, token
31
31
 
@@ -96,10 +96,10 @@ class Regexp::Lexer
96
96
 
97
97
  tokens.pop
98
98
  tokens << Regexp::Token.new(:literal, :literal, lead,
99
- token.ts, (token.te - last.bytesize),
99
+ token.ts, (token.te - last.length),
100
100
  nesting, set_nesting, conditional_nesting)
101
101
  tokens << Regexp::Token.new(:literal, :literal, last,
102
- (token.ts + lead.bytesize), token.te,
102
+ (token.ts + lead.length), token.te,
103
103
  nesting, set_nesting, conditional_nesting)
104
104
  end
105
105
 
@@ -1,10 +1,10 @@
1
+ require 'regexp_parser/error'
1
2
  require 'regexp_parser/expression'
2
3
 
3
4
  class Regexp::Parser
4
5
  include Regexp::Expression
5
- include Regexp::Syntax
6
6
 
7
- class ParserError < StandardError; end
7
+ class ParserError < Regexp::Parser::Error; end
8
8
 
9
9
  class UnknownTokenTypeError < ParserError
10
10
  def initialize(type, token)
@@ -18,12 +18,12 @@ class Regexp::Parser
18
18
  end
19
19
  end
20
20
 
21
- def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
22
- new.parse(input, syntax, &block)
21
+ def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
22
+ new.parse(input, syntax, options: options, &block)
23
23
  end
24
24
 
25
- def parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
26
- root = Root.build(options_from_input(input))
25
+ def parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
26
+ root = Root.build(extract_options(input, options))
27
27
 
28
28
  self.root = root
29
29
  self.node = root
@@ -35,7 +35,7 @@ class Regexp::Parser
35
35
 
36
36
  self.captured_group_counts = Hash.new(0)
37
37
 
38
- Regexp::Lexer.scan(input, syntax) do |token|
38
+ Regexp::Lexer.scan(input, syntax, options: options) do |token|
39
39
  parse_token(token)
40
40
  end
41
41
 
@@ -54,105 +54,171 @@ class Regexp::Parser
54
54
  :options_stack, :switching_options, :conditional_nesting,
55
55
  :captured_group_counts
56
56
 
57
- def options_from_input(input)
58
- return {} unless input.is_a?(::Regexp)
57
+ def extract_options(input, options)
58
+ if options && !input.is_a?(String)
59
+ raise ArgumentError, 'options cannot be supplied unless parsing a String'
60
+ end
59
61
 
60
- options = {}
61
- options[:i] = true if input.options & ::Regexp::IGNORECASE != 0
62
- options[:m] = true if input.options & ::Regexp::MULTILINE != 0
63
- options[:x] = true if input.options & ::Regexp::EXTENDED != 0
64
- options
65
- end
62
+ options = input.options if input.is_a?(::Regexp)
66
63
 
67
- def nest(exp)
68
- nesting.push(exp)
69
- node << exp
70
- update_transplanted_subtree(exp, node)
71
- self.node = exp
72
- end
73
-
74
- # subtrees are transplanted to build Alternations, Intersections, Ranges
75
- def update_transplanted_subtree(exp, new_parent)
76
- exp.nesting_level = new_parent.nesting_level + 1
77
- exp.respond_to?(:each) &&
78
- exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
79
- end
80
-
81
- def decrease_nesting
82
- while nesting.last.is_a?(SequenceOperation)
83
- nesting.pop
84
- self.node = nesting.last
85
- end
86
- nesting.pop
87
- yield(node) if block_given?
88
- self.node = nesting.last
89
- self.node = node.last if node.last.is_a?(SequenceOperation)
90
- end
64
+ return {} unless options
91
65
 
92
- def nest_conditional(exp)
93
- conditional_nesting.push(exp)
94
- nest(exp)
66
+ enabled_options = {}
67
+ enabled_options[:i] = true if options & ::Regexp::IGNORECASE != 0
68
+ enabled_options[:m] = true if options & ::Regexp::MULTILINE != 0
69
+ enabled_options[:x] = true if options & ::Regexp::EXTENDED != 0
70
+ enabled_options
95
71
  end
96
72
 
97
73
  def parse_token(token)
98
- close_completed_character_set_range
99
-
100
74
  case token.type
101
- when :meta; meta(token)
102
- when :quantifier; quantifier(token)
103
- when :anchor; anchor(token)
104
- when :escape; escape(token)
105
- when :group; group(token)
106
- when :assertion; group(token)
107
- when :set; set(token)
108
- when :type; type(token)
109
- when :backref; backref(token)
110
- when :conditional; conditional(token)
111
- when :keep; keep(token)
112
-
113
- when :posixclass, :nonposixclass
114
- posixclass(token)
115
- when :property, :nonproperty
116
- property(token)
117
-
118
- when :literal
119
- node << Literal.new(token, active_opts)
120
- when :free_space
121
- free_space(token)
122
-
75
+ when :anchor; anchor(token)
76
+ when :assertion, :group; group(token)
77
+ when :backref; backref(token)
78
+ when :conditional; conditional(token)
79
+ when :escape; escape(token)
80
+ when :free_space; free_space(token)
81
+ when :keep; keep(token)
82
+ when :literal; literal(token)
83
+ when :meta; meta(token)
84
+ when :posixclass, :nonposixclass; posixclass(token)
85
+ when :property, :nonproperty; property(token)
86
+ when :quantifier; quantifier(token)
87
+ when :set; set(token)
88
+ when :type; type(token)
123
89
  else
124
90
  raise UnknownTokenTypeError.new(token.type, token)
125
91
  end
92
+
93
+ close_completed_character_set_range
126
94
  end
127
95
 
128
- def set(token)
96
+ def anchor(token)
129
97
  case token.token
130
- when :open
131
- open_set(token)
132
- when :close
133
- close_set
134
- when :negate
135
- negate_set
136
- when :range
137
- range(token)
138
- when :intersection
139
- intersection(token)
140
- when :collation, :equivalent
141
- node << Literal.new(token, active_opts)
98
+ when :bol; node << Anchor::BeginningOfLine.new(token, active_opts)
99
+ when :bos; node << Anchor::BOS.new(token, active_opts)
100
+ when :eol; node << Anchor::EndOfLine.new(token, active_opts)
101
+ when :eos; node << Anchor::EOS.new(token, active_opts)
102
+ when :eos_ob_eol; node << Anchor::EOSobEOL.new(token, active_opts)
103
+ when :match_start; node << Anchor::MatchStart.new(token, active_opts)
104
+ when :nonword_boundary; node << Anchor::NonWordBoundary.new(token, active_opts)
105
+ when :word_boundary; node << Anchor::WordBoundary.new(token, active_opts)
142
106
  else
143
- raise UnknownTokenError.new('CharacterSet', token)
107
+ raise UnknownTokenError.new('Anchor', token)
144
108
  end
145
109
  end
146
110
 
147
- def meta(token)
111
+ def group(token)
148
112
  case token.token
149
- when :dot
150
- node << CharacterType::Any.new(token, active_opts)
151
- when :alternation
152
- sequence_operation(Alternation, token)
113
+ when :options, :options_switch
114
+ options_group(token)
115
+ when :close
116
+ close_group
117
+ when :comment
118
+ node << Group::Comment.new(token, active_opts)
153
119
  else
154
- raise UnknownTokenError.new('Meta', token)
120
+ open_group(token)
121
+ end
122
+ end
123
+
124
+ MOD_FLAGS = %w[i m x].map(&:to_sym)
125
+ ENC_FLAGS = %w[a d u].map(&:to_sym)
126
+
127
+ def options_group(token)
128
+ positive, negative = token.text.split('-', 2)
129
+ negative ||= ''
130
+ self.switching_options = token.token.equal?(:options_switch)
131
+
132
+ opt_changes = {}
133
+ new_active_opts = active_opts.dup
134
+
135
+ MOD_FLAGS.each do |flag|
136
+ if positive.include?(flag.to_s)
137
+ opt_changes[flag] = new_active_opts[flag] = true
138
+ end
139
+ if negative.include?(flag.to_s)
140
+ opt_changes[flag] = false
141
+ new_active_opts.delete(flag)
142
+ end
143
+ end
144
+
145
+ if (enc_flag = positive.reverse[/[adu]/])
146
+ enc_flag = enc_flag.to_sym
147
+ (ENC_FLAGS - [enc_flag]).each do |other|
148
+ opt_changes[other] = false if new_active_opts[other]
149
+ new_active_opts.delete(other)
150
+ end
151
+ opt_changes[enc_flag] = new_active_opts[enc_flag] = true
152
+ end
153
+
154
+ options_stack << new_active_opts
155
+
156
+ options_group = Group::Options.new(token, active_opts)
157
+ options_group.option_changes = opt_changes
158
+
159
+ nest(options_group)
160
+ end
161
+
162
+ def open_group(token)
163
+ group_class =
164
+ case token.token
165
+ when :absence; Group::Absence
166
+ when :atomic; Group::Atomic
167
+ when :capture; Group::Capture
168
+ when :named; Group::Named
169
+ when :passive; Group::Passive
170
+
171
+ when :lookahead; Assertion::Lookahead
172
+ when :lookbehind; Assertion::Lookbehind
173
+ when :nlookahead; Assertion::NegativeLookahead
174
+ when :nlookbehind; Assertion::NegativeLookbehind
175
+
176
+ else
177
+ raise UnknownTokenError.new('Group type open', token)
178
+ end
179
+
180
+ group = group_class.new(token, active_opts)
181
+
182
+ if group.capturing?
183
+ group.number = total_captured_group_count + 1
184
+ group.number_at_level = captured_group_count_at_level + 1
185
+ count_captured_group
186
+ end
187
+
188
+ # Push the active options to the stack again. This way we can simply pop the
189
+ # stack for any group we close, no matter if it had its own options or not.
190
+ options_stack << active_opts
191
+
192
+ nest(group)
193
+ end
194
+
195
+ def total_captured_group_count
196
+ captured_group_counts.values.reduce(0, :+)
197
+ end
198
+
199
+ def captured_group_count_at_level
200
+ captured_group_counts[node.level]
201
+ end
202
+
203
+ def count_captured_group
204
+ captured_group_counts[node.level] += 1
205
+ end
206
+
207
+ def close_group
208
+ options_stack.pop unless switching_options
209
+ self.switching_options = false
210
+ decrease_nesting
211
+ end
212
+
213
+ def decrease_nesting
214
+ while nesting.last.is_a?(SequenceOperation)
215
+ nesting.pop
216
+ self.node = nesting.last
155
217
  end
218
+ nesting.pop
219
+ yield(node) if block_given?
220
+ self.node = nesting.last
221
+ self.node = node.last if node.last.is_a?(SequenceOperation)
156
222
  end
157
223
 
158
224
  def backref(token)
@@ -182,31 +248,9 @@ class Regexp::Parser
182
248
  end
183
249
  end
184
250
 
185
- def type(token)
186
- case token.token
187
- when :digit
188
- node << CharacterType::Digit.new(token, active_opts)
189
- when :nondigit
190
- node << CharacterType::NonDigit.new(token, active_opts)
191
- when :hex
192
- node << CharacterType::Hex.new(token, active_opts)
193
- when :nonhex
194
- node << CharacterType::NonHex.new(token, active_opts)
195
- when :space
196
- node << CharacterType::Space.new(token, active_opts)
197
- when :nonspace
198
- node << CharacterType::NonSpace.new(token, active_opts)
199
- when :word
200
- node << CharacterType::Word.new(token, active_opts)
201
- when :nonword
202
- node << CharacterType::NonWord.new(token, active_opts)
203
- when :linebreak
204
- node << CharacterType::Linebreak.new(token, active_opts)
205
- when :xgrapheme
206
- node << CharacterType::ExtendedGrapheme.new(token, active_opts)
207
- else
208
- raise UnknownTokenError.new('CharacterType', token)
209
- end
251
+ def assign_effective_number(exp)
252
+ exp.effective_number =
253
+ exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
210
254
  end
211
255
 
212
256
  def conditional(token)
@@ -234,11 +278,118 @@ class Regexp::Parser
234
278
  end
235
279
  end
236
280
 
281
+ def nest_conditional(exp)
282
+ conditional_nesting.push(exp)
283
+ nest(exp)
284
+ end
285
+
286
+ def nest(exp)
287
+ nesting.push(exp)
288
+ node << exp
289
+ update_transplanted_subtree(exp, node)
290
+ self.node = exp
291
+ end
292
+
293
+ # subtrees are transplanted to build Alternations, Intersections, Ranges
294
+ def update_transplanted_subtree(exp, new_parent)
295
+ exp.nesting_level = new_parent.nesting_level + 1
296
+ exp.respond_to?(:each) &&
297
+ exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
298
+ end
299
+
300
+ def escape(token)
301
+ case token.token
302
+
303
+ when :backspace; node << EscapeSequence::Backspace.new(token, active_opts)
304
+
305
+ when :escape; node << EscapeSequence::AsciiEscape.new(token, active_opts)
306
+ when :bell; node << EscapeSequence::Bell.new(token, active_opts)
307
+ when :form_feed; node << EscapeSequence::FormFeed.new(token, active_opts)
308
+ when :newline; node << EscapeSequence::Newline.new(token, active_opts)
309
+ when :carriage; node << EscapeSequence::Return.new(token, active_opts)
310
+ when :tab; node << EscapeSequence::Tab.new(token, active_opts)
311
+ when :vertical_tab; node << EscapeSequence::VerticalTab.new(token, active_opts)
312
+
313
+ when :codepoint; node << EscapeSequence::Codepoint.new(token, active_opts)
314
+ when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
315
+ when :hex; node << EscapeSequence::Hex.new(token, active_opts)
316
+ when :octal; node << EscapeSequence::Octal.new(token, active_opts)
317
+
318
+ when :control
319
+ if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
320
+ node << EscapeSequence::MetaControl.new(token, active_opts)
321
+ else
322
+ node << EscapeSequence::Control.new(token, active_opts)
323
+ end
324
+
325
+ when :meta_sequence
326
+ if token.text =~ /\A\\M-\\[Cc]/
327
+ node << EscapeSequence::MetaControl.new(token, active_opts)
328
+ else
329
+ node << EscapeSequence::Meta.new(token, active_opts)
330
+ end
331
+
332
+ else
333
+ # treating everything else as a literal
334
+ # TODO: maybe split this up a bit more in v3.0.0?
335
+ # E.g. escaped quantifiers or set meta chars are not the same
336
+ # as stuff that would be a literal even without the backslash.
337
+ # Right now, they all end up here.
338
+ node << EscapeSequence::Literal.new(token, active_opts)
339
+ end
340
+ end
341
+
342
+ def free_space(token)
343
+ case token.token
344
+ when :comment
345
+ node << Comment.new(token, active_opts)
346
+ when :whitespace
347
+ if node.last.is_a?(WhiteSpace)
348
+ node.last.merge(WhiteSpace.new(token, active_opts))
349
+ else
350
+ node << WhiteSpace.new(token, active_opts)
351
+ end
352
+ else
353
+ raise UnknownTokenError.new('FreeSpace', token)
354
+ end
355
+ end
356
+
357
+ def keep(token)
358
+ node << Keep::Mark.new(token, active_opts)
359
+ end
360
+
361
+ def literal(token)
362
+ node << Literal.new(token, active_opts)
363
+ end
364
+
365
+ def meta(token)
366
+ case token.token
367
+ when :dot
368
+ node << CharacterType::Any.new(token, active_opts)
369
+ when :alternation
370
+ sequence_operation(Alternation, token)
371
+ else
372
+ raise UnknownTokenError.new('Meta', token)
373
+ end
374
+ end
375
+
376
+ def sequence_operation(klass, token)
377
+ unless node.is_a?(klass)
378
+ operator = klass.new(token, active_opts)
379
+ sequence = operator.add_sequence(active_opts)
380
+ sequence.expressions = node.expressions
381
+ node.expressions = []
382
+ nest(operator)
383
+ end
384
+ node.add_sequence(active_opts)
385
+ end
386
+
237
387
  def posixclass(token)
238
388
  node << PosixClass.new(token, active_opts)
239
389
  end
240
390
 
241
391
  include Regexp::Expression::UnicodeProperty
392
+ UPTokens = Regexp::Syntax::Token::UnicodeProperty
242
393
 
243
394
  def property(token)
244
395
  case token.token
@@ -310,128 +461,43 @@ class Regexp::Parser
310
461
  when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
311
462
  when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
312
463
 
313
- when *Token::UnicodeProperty::Age
314
- node << Age.new(token, active_opts)
315
-
316
- when *Token::UnicodeProperty::Derived
317
- node << Derived.new(token, active_opts)
318
-
319
- when *Token::UnicodeProperty::Emoji
320
- node << Emoji.new(token, active_opts)
321
-
322
- when *Token::UnicodeProperty::Script
323
- node << Script.new(token, active_opts)
324
-
325
- when *Token::UnicodeProperty::UnicodeBlock
326
- node << Block.new(token, active_opts)
464
+ when *UPTokens::Age; node << Age.new(token, active_opts)
465
+ when *UPTokens::Derived; node << Derived.new(token, active_opts)
466
+ when *UPTokens::Emoji; node << Emoji.new(token, active_opts)
467
+ when *UPTokens::Script; node << Script.new(token, active_opts)
468
+ when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts)
327
469
 
328
470
  else
329
471
  raise UnknownTokenError.new('UnicodeProperty', token)
330
472
  end
331
473
  end
332
474
 
333
- def anchor(token)
334
- case token.token
335
- when :bol
336
- node << Anchor::BeginningOfLine.new(token, active_opts)
337
- when :eol
338
- node << Anchor::EndOfLine.new(token, active_opts)
339
- when :bos
340
- node << Anchor::BOS.new(token, active_opts)
341
- when :eos
342
- node << Anchor::EOS.new(token, active_opts)
343
- when :eos_ob_eol
344
- node << Anchor::EOSobEOL.new(token, active_opts)
345
- when :word_boundary
346
- node << Anchor::WordBoundary.new(token, active_opts)
347
- when :nonword_boundary
348
- node << Anchor::NonWordBoundary.new(token, active_opts)
349
- when :match_start
350
- node << Anchor::MatchStart.new(token, active_opts)
351
- else
352
- raise UnknownTokenError.new('Anchor', token)
353
- end
354
- end
355
-
356
- def escape(token)
357
- case token.token
358
-
359
- when :backspace
360
- node << EscapeSequence::Backspace.new(token, active_opts)
361
-
362
- when :escape
363
- node << EscapeSequence::AsciiEscape.new(token, active_opts)
364
- when :bell
365
- node << EscapeSequence::Bell.new(token, active_opts)
366
- when :form_feed
367
- node << EscapeSequence::FormFeed.new(token, active_opts)
368
- when :newline
369
- node << EscapeSequence::Newline.new(token, active_opts)
370
- when :carriage
371
- node << EscapeSequence::Return.new(token, active_opts)
372
- when :tab
373
- node << EscapeSequence::Tab.new(token, active_opts)
374
- when :vertical_tab
375
- node << EscapeSequence::VerticalTab.new(token, active_opts)
376
-
377
- when :hex
378
- node << EscapeSequence::Hex.new(token, active_opts)
379
- when :octal
380
- node << EscapeSequence::Octal.new(token, active_opts)
381
- when :codepoint
382
- node << EscapeSequence::Codepoint.new(token, active_opts)
383
- when :codepoint_list
384
- node << EscapeSequence::CodepointList.new(token, active_opts)
385
-
386
- when :control
387
- if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
388
- node << EscapeSequence::MetaControl.new(token, active_opts)
389
- else
390
- node << EscapeSequence::Control.new(token, active_opts)
391
- end
392
-
393
- when :meta_sequence
394
- if token.text =~ /\A\\M-\\[Cc]/
395
- node << EscapeSequence::MetaControl.new(token, active_opts)
396
- else
397
- node << EscapeSequence::Meta.new(token, active_opts)
398
- end
399
-
400
- else
401
- # treating everything else as a literal
402
- node << EscapeSequence::Literal.new(token, active_opts)
403
- end
404
- end
405
-
406
- def keep(token)
407
- node << Keep::Mark.new(token, active_opts)
408
- end
409
-
410
- def free_space(token)
411
- case token.token
412
- when :comment
413
- node << Comment.new(token, active_opts)
414
- when :whitespace
415
- if node.last.is_a?(WhiteSpace)
416
- node.last.merge(WhiteSpace.new(token, active_opts))
417
- else
418
- node << WhiteSpace.new(token, active_opts)
419
- end
420
- else
421
- raise UnknownTokenError.new('FreeSpace', token)
422
- end
423
- end
424
-
425
475
  def quantifier(token)
426
- offset = -1
427
- target_node = node.expressions[offset]
428
- while target_node.is_a?(FreeSpace)
429
- target_node = node.expressions[offset -= 1]
476
+ target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
477
+ target_node or raise ParserError, "No valid target found for '#{token.text}'"
478
+
479
+ # in case of chained quantifiers, wrap target in an implicit passive group
480
+ # description of the problem: https://github.com/ammar/regexp_parser/issues/3
481
+ # rationale for this solution: https://github.com/ammar/regexp_parser/pull/69
482
+ if target_node.quantified?
483
+ new_token = Regexp::Token.new(
484
+ :group,
485
+ :passive,
486
+ '', # text
487
+ target_node.ts,
488
+ nil, # te (unused)
489
+ target_node.level,
490
+ target_node.set_level,
491
+ target_node.conditional_level
492
+ )
493
+ new_group = Group::Passive.new(new_token, active_opts)
494
+ new_group.implicit = true
495
+ new_group << target_node
496
+ increase_level(target_node)
497
+ node.expressions[node.expressions.index(target_node)] = new_group
498
+ target_node = new_group
430
499
  end
431
500
 
432
- target_node || raise(ArgumentError, 'No valid target found for '\
433
- "'#{token.text}' ")
434
-
435
501
  case token.token
436
502
  when :zero_or_one
437
503
  target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
@@ -462,6 +528,11 @@ class Regexp::Parser
462
528
  end
463
529
  end
464
530
 
531
+ def increase_level(exp)
532
+ exp.level += 1
533
+ exp.respond_to?(:each) && exp.each { |subexp| increase_level(subexp) }
534
+ end
535
+
465
536
  def interval(target_node, token)
466
537
  text = token.text
467
538
  mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
@@ -484,100 +555,16 @@ class Regexp::Parser
484
555
  target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
485
556
  end
486
557
 
487
- def group(token)
488
- case token.token
489
- when :options, :options_switch
490
- options_group(token)
491
- when :close
492
- close_group
493
- when :comment
494
- node << Group::Comment.new(token, active_opts)
495
- else
496
- open_group(token)
497
- end
498
- end
499
-
500
- MOD_FLAGS = %w[i m x].map(&:to_sym)
501
- ENC_FLAGS = %w[a d u].map(&:to_sym)
502
-
503
- def options_group(token)
504
- positive, negative = token.text.split('-', 2)
505
- negative ||= ''
506
- self.switching_options = token.token.equal?(:options_switch)
507
-
508
- opt_changes = {}
509
- new_active_opts = active_opts.dup
510
-
511
- MOD_FLAGS.each do |flag|
512
- if positive.include?(flag.to_s)
513
- opt_changes[flag] = new_active_opts[flag] = true
514
- end
515
- if negative.include?(flag.to_s)
516
- opt_changes[flag] = false
517
- new_active_opts.delete(flag)
518
- end
519
- end
520
-
521
- if (enc_flag = positive.reverse[/[adu]/])
522
- enc_flag = enc_flag.to_sym
523
- (ENC_FLAGS - [enc_flag]).each do |other|
524
- opt_changes[other] = false if new_active_opts[other]
525
- new_active_opts.delete(other)
526
- end
527
- opt_changes[enc_flag] = new_active_opts[enc_flag] = true
528
- end
529
-
530
- options_stack << new_active_opts
531
-
532
- options_group = Group::Options.new(token, active_opts)
533
- options_group.option_changes = opt_changes
534
-
535
- nest(options_group)
536
- end
537
-
538
- def open_group(token)
558
+ def set(token)
539
559
  case token.token
540
- when :passive
541
- exp = Group::Passive.new(token, active_opts)
542
- when :atomic
543
- exp = Group::Atomic.new(token, active_opts)
544
- when :named
545
- exp = Group::Named.new(token, active_opts)
546
- when :capture
547
- exp = Group::Capture.new(token, active_opts)
548
- when :absence
549
- exp = Group::Absence.new(token, active_opts)
550
-
551
- when :lookahead
552
- exp = Assertion::Lookahead.new(token, active_opts)
553
- when :nlookahead
554
- exp = Assertion::NegativeLookahead.new(token, active_opts)
555
- when :lookbehind
556
- exp = Assertion::Lookbehind.new(token, active_opts)
557
- when :nlookbehind
558
- exp = Assertion::NegativeLookbehind.new(token, active_opts)
559
-
560
+ when :open; open_set(token)
561
+ when :close; close_set
562
+ when :negate; negate_set
563
+ when :range; range(token)
564
+ when :intersection; intersection(token)
560
565
  else
561
- raise UnknownTokenError.new('Group type open', token)
562
- end
563
-
564
- if exp.capturing?
565
- exp.number = total_captured_group_count + 1
566
- exp.number_at_level = captured_group_count_at_level + 1
567
- count_captured_group
566
+ raise UnknownTokenError.new('CharacterSet', token)
568
567
  end
569
-
570
- # Push the active options to the stack again. This way we can simply pop the
571
- # stack for any group we close, no matter if it had its own options or not.
572
- options_stack << active_opts
573
-
574
- nest(exp)
575
- end
576
-
577
- def close_group
578
- options_stack.pop unless switching_options
579
- self.switching_options = false
580
- decrease_nesting
581
568
  end
582
569
 
583
570
  def open_set(token)
@@ -600,51 +587,45 @@ class Regexp::Parser
600
587
  nest(exp)
601
588
  end
602
589
 
603
- def close_completed_character_set_range
604
- decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
605
- end
606
-
607
590
  def intersection(token)
608
591
  sequence_operation(CharacterSet::Intersection, token)
609
592
  end
610
593
 
611
- def sequence_operation(klass, token)
612
- unless node.is_a?(klass)
613
- operator = klass.new(token, active_opts)
614
- sequence = operator.add_sequence(active_opts)
615
- sequence.expressions = node.expressions
616
- node.expressions = []
617
- nest(operator)
594
+ def type(token)
595
+ case token.token
596
+ when :digit; node << CharacterType::Digit.new(token, active_opts)
597
+ when :hex; node << CharacterType::Hex.new(token, active_opts)
598
+ when :linebreak; node << CharacterType::Linebreak.new(token, active_opts)
599
+ when :nondigit; node << CharacterType::NonDigit.new(token, active_opts)
600
+ when :nonhex; node << CharacterType::NonHex.new(token, active_opts)
601
+ when :nonspace; node << CharacterType::NonSpace.new(token, active_opts)
602
+ when :nonword; node << CharacterType::NonWord.new(token, active_opts)
603
+ when :space; node << CharacterType::Space.new(token, active_opts)
604
+ when :word; node << CharacterType::Word.new(token, active_opts)
605
+ when :xgrapheme; node << CharacterType::ExtendedGrapheme.new(token, active_opts)
606
+ else
607
+ raise UnknownTokenError.new('CharacterType', token)
618
608
  end
619
- node.add_sequence(active_opts)
620
- end
621
-
622
- def active_opts
623
- options_stack.last
624
- end
625
-
626
- def total_captured_group_count
627
- captured_group_counts.values.reduce(0, :+)
628
- end
629
-
630
- def captured_group_count_at_level
631
- captured_group_counts[node.level]
632
609
  end
633
610
 
634
- def count_captured_group
635
- captured_group_counts[node.level] += 1
611
+ def close_completed_character_set_range
612
+ decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
636
613
  end
637
614
 
638
- def assign_effective_number(exp)
639
- exp.effective_number =
640
- exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
615
+ def active_opts
616
+ options_stack.last
641
617
  end
642
618
 
619
+ # Assigns referenced expressions to refering expressions, e.g. if there is
620
+ # an instance of Backreference::Number, its #referenced_expression is set to
621
+ # the instance of Group::Capture that it refers to via its number.
643
622
  def assign_referenced_expressions
644
623
  targets = {}
624
+ # find all referencable expressions
645
625
  root.each_expression do |exp|
646
626
  exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
647
627
  end
628
+ # assign them to any refering expressions
648
629
  root.each_expression do |exp|
649
630
  exp.respond_to?(:reference) &&
650
631
  exp.referenced_expression = targets[exp.reference]