regexp_parser 1.8.2 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +93 -0
  3. data/Gemfile +6 -1
  4. data/README.md +1 -4
  5. data/Rakefile +8 -8
  6. data/lib/regexp_parser.rb +1 -0
  7. data/lib/regexp_parser/error.rb +4 -0
  8. data/lib/regexp_parser/expression.rb +5 -18
  9. data/lib/regexp_parser/expression/classes/backref.rb +5 -0
  10. data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
  11. data/lib/regexp_parser/expression/classes/free_space.rb +2 -2
  12. data/lib/regexp_parser/expression/classes/group.rb +28 -3
  13. data/lib/regexp_parser/expression/classes/property.rb +1 -1
  14. data/lib/regexp_parser/expression/classes/root.rb +4 -16
  15. data/lib/regexp_parser/expression/classes/set/range.rb +2 -1
  16. data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
  17. data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
  18. data/lib/regexp_parser/expression/quantifier.rb +10 -1
  19. data/lib/regexp_parser/expression/sequence.rb +3 -19
  20. data/lib/regexp_parser/expression/subexpression.rb +1 -1
  21. data/lib/regexp_parser/lexer.rb +2 -2
  22. data/lib/regexp_parser/parser.rb +306 -332
  23. data/lib/regexp_parser/scanner.rb +1272 -1338
  24. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  25. data/lib/regexp_parser/scanner/property.rl +2 -2
  26. data/lib/regexp_parser/scanner/scanner.rl +206 -238
  27. data/lib/regexp_parser/syntax.rb +7 -7
  28. data/lib/regexp_parser/syntax/any.rb +3 -3
  29. data/lib/regexp_parser/syntax/base.rb +1 -1
  30. data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
  31. data/lib/regexp_parser/syntax/versions.rb +1 -1
  32. data/lib/regexp_parser/version.rb +1 -1
  33. data/spec/expression/base_spec.rb +10 -0
  34. data/spec/expression/clone_spec.rb +36 -4
  35. data/spec/expression/free_space_spec.rb +2 -2
  36. data/spec/expression/methods/match_length_spec.rb +2 -2
  37. data/spec/expression/subexpression_spec.rb +1 -1
  38. data/spec/expression/to_s_spec.rb +39 -31
  39. data/spec/lexer/literals_spec.rb +24 -49
  40. data/spec/lexer/refcalls_spec.rb +5 -0
  41. data/spec/parser/all_spec.rb +2 -2
  42. data/spec/parser/errors_spec.rb +1 -1
  43. data/spec/parser/escapes_spec.rb +1 -1
  44. data/spec/parser/quantifiers_spec.rb +16 -0
  45. data/spec/parser/refcalls_spec.rb +5 -0
  46. data/spec/parser/set/ranges_spec.rb +3 -3
  47. data/spec/scanner/escapes_spec.rb +8 -1
  48. data/spec/scanner/groups_spec.rb +10 -1
  49. data/spec/scanner/literals_spec.rb +28 -38
  50. data/spec/scanner/quantifiers_spec.rb +18 -13
  51. data/spec/scanner/refcalls_spec.rb +19 -0
  52. data/spec/scanner/sets_spec.rb +65 -16
  53. data/spec/spec_helper.rb +1 -0
  54. metadata +4 -7
  55. data/spec/expression/root_spec.rb +0 -9
  56. data/spec/expression/sequence_spec.rb +0 -9
@@ -7,16 +7,6 @@ module Regexp::Expression
7
7
  # Used as the base class for the Alternation alternatives, Conditional
8
8
  # branches, and CharacterSet::Intersection intersected sequences.
9
9
  class Sequence < Regexp::Expression::Subexpression
10
- # TODO: this override is here for backwards compatibility, remove in 2.0.0
11
- def initialize(*args)
12
- if args.count == 3
13
- warn('WARNING: Sequence.new without a Regexp::Token argument is '\
14
- 'deprecated and will be removed in 2.0.0.')
15
- return self.class.at_levels(*args)
16
- end
17
- super
18
- end
19
-
20
10
  class << self
21
11
  def add_to(subexpression, params = {}, active_opts = {})
22
12
  sequence = at_levels(
@@ -51,17 +41,11 @@ module Regexp::Expression
51
41
  alias :ts :starts_at
52
42
 
53
43
  def quantify(token, text, min = nil, max = nil, mode = :greedy)
54
- offset = -1
55
- target = expressions[offset]
56
- while target.is_a?(FreeSpace)
57
- target = expressions[offset -= 1]
58
- end
59
-
60
- target || raise(ArgumentError, "No valid target found for '#{text}' "\
61
- 'quantifier')
44
+ target = expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
45
+ target or raise Regexp::Parser::Error,
46
+ "No valid target found for '#{text}' quantifier"
62
47
 
63
48
  target.quantify(token, text, min, max, mode)
64
49
  end
65
50
  end
66
-
67
51
  end
@@ -12,7 +12,7 @@ module Regexp::Expression
12
12
  end
13
13
 
14
14
  # Override base method to clone the expressions as well.
15
- def initialize_clone(orig)
15
+ def initialize_copy(orig)
16
16
  self.expressions = orig.expressions.map(&:clone)
17
17
  super
18
18
  end
@@ -96,10 +96,10 @@ class Regexp::Lexer
96
96
 
97
97
  tokens.pop
98
98
  tokens << Regexp::Token.new(:literal, :literal, lead,
99
- token.ts, (token.te - last.bytesize),
99
+ token.ts, (token.te - last.length),
100
100
  nesting, set_nesting, conditional_nesting)
101
101
  tokens << Regexp::Token.new(:literal, :literal, last,
102
- (token.ts + lead.bytesize), token.te,
102
+ (token.ts + lead.length), token.te,
103
103
  nesting, set_nesting, conditional_nesting)
104
104
  end
105
105
 
@@ -2,9 +2,8 @@ require 'regexp_parser/expression'
2
2
 
3
3
  class Regexp::Parser
4
4
  include Regexp::Expression
5
- include Regexp::Syntax
6
5
 
7
- class ParserError < StandardError; end
6
+ class ParserError < Regexp::Parser::Error; end
8
7
 
9
8
  class UnknownTokenTypeError < ParserError
10
9
  def initialize(type, token)
@@ -70,95 +69,155 @@ class Regexp::Parser
70
69
  enabled_options
71
70
  end
72
71
 
73
- def nest(exp)
74
- nesting.push(exp)
75
- node << exp
76
- update_transplanted_subtree(exp, node)
77
- self.node = exp
78
- end
72
+ def parse_token(token)
73
+ case token.type
74
+ when :anchor; anchor(token)
75
+ when :assertion, :group; group(token)
76
+ when :backref; backref(token)
77
+ when :conditional; conditional(token)
78
+ when :escape; escape(token)
79
+ when :free_space; free_space(token)
80
+ when :keep; keep(token)
81
+ when :literal; literal(token)
82
+ when :meta; meta(token)
83
+ when :posixclass, :nonposixclass; posixclass(token)
84
+ when :property, :nonproperty; property(token)
85
+ when :quantifier; quantifier(token)
86
+ when :set; set(token)
87
+ when :type; type(token)
88
+ else
89
+ raise UnknownTokenTypeError.new(token.type, token)
90
+ end
79
91
 
80
- # subtrees are transplanted to build Alternations, Intersections, Ranges
81
- def update_transplanted_subtree(exp, new_parent)
82
- exp.nesting_level = new_parent.nesting_level + 1
83
- exp.respond_to?(:each) &&
84
- exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
92
+ close_completed_character_set_range
85
93
  end
86
94
 
87
- def decrease_nesting
88
- while nesting.last.is_a?(SequenceOperation)
89
- nesting.pop
90
- self.node = nesting.last
95
+ def anchor(token)
96
+ case token.token
97
+ when :bol; node << Anchor::BeginningOfLine.new(token, active_opts)
98
+ when :bos; node << Anchor::BOS.new(token, active_opts)
99
+ when :eol; node << Anchor::EndOfLine.new(token, active_opts)
100
+ when :eos; node << Anchor::EOS.new(token, active_opts)
101
+ when :eos_ob_eol; node << Anchor::EOSobEOL.new(token, active_opts)
102
+ when :match_start; node << Anchor::MatchStart.new(token, active_opts)
103
+ when :nonword_boundary; node << Anchor::NonWordBoundary.new(token, active_opts)
104
+ when :word_boundary; node << Anchor::WordBoundary.new(token, active_opts)
105
+ else
106
+ raise UnknownTokenError.new('Anchor', token)
91
107
  end
92
- nesting.pop
93
- yield(node) if block_given?
94
- self.node = nesting.last
95
- self.node = node.last if node.last.is_a?(SequenceOperation)
96
108
  end
97
109
 
98
- def nest_conditional(exp)
99
- conditional_nesting.push(exp)
100
- nest(exp)
110
+ def group(token)
111
+ case token.token
112
+ when :options, :options_switch
113
+ options_group(token)
114
+ when :close
115
+ close_group
116
+ when :comment
117
+ node << Group::Comment.new(token, active_opts)
118
+ else
119
+ open_group(token)
120
+ end
101
121
  end
102
122
 
103
- def parse_token(token)
104
- close_completed_character_set_range
123
+ MOD_FLAGS = %w[i m x].map(&:to_sym)
124
+ ENC_FLAGS = %w[a d u].map(&:to_sym)
105
125
 
106
- case token.type
107
- when :meta; meta(token)
108
- when :quantifier; quantifier(token)
109
- when :anchor; anchor(token)
110
- when :escape; escape(token)
111
- when :group; group(token)
112
- when :assertion; group(token)
113
- when :set; set(token)
114
- when :type; type(token)
115
- when :backref; backref(token)
116
- when :conditional; conditional(token)
117
- when :keep; keep(token)
118
-
119
- when :posixclass, :nonposixclass
120
- posixclass(token)
121
- when :property, :nonproperty
122
- property(token)
123
-
124
- when :literal
125
- node << Literal.new(token, active_opts)
126
- when :free_space
127
- free_space(token)
126
+ def options_group(token)
127
+ positive, negative = token.text.split('-', 2)
128
+ negative ||= ''
129
+ self.switching_options = token.token.equal?(:options_switch)
128
130
 
129
- else
130
- raise UnknownTokenTypeError.new(token.type, token)
131
+ opt_changes = {}
132
+ new_active_opts = active_opts.dup
133
+
134
+ MOD_FLAGS.each do |flag|
135
+ if positive.include?(flag.to_s)
136
+ opt_changes[flag] = new_active_opts[flag] = true
137
+ end
138
+ if negative.include?(flag.to_s)
139
+ opt_changes[flag] = false
140
+ new_active_opts.delete(flag)
141
+ end
142
+ end
143
+
144
+ if (enc_flag = positive.reverse[/[adu]/])
145
+ enc_flag = enc_flag.to_sym
146
+ (ENC_FLAGS - [enc_flag]).each do |other|
147
+ opt_changes[other] = false if new_active_opts[other]
148
+ new_active_opts.delete(other)
149
+ end
150
+ opt_changes[enc_flag] = new_active_opts[enc_flag] = true
131
151
  end
152
+
153
+ options_stack << new_active_opts
154
+
155
+ options_group = Group::Options.new(token, active_opts)
156
+ options_group.option_changes = opt_changes
157
+
158
+ nest(options_group)
132
159
  end
133
160
 
134
- def set(token)
135
- case token.token
136
- when :open
137
- open_set(token)
138
- when :close
139
- close_set
140
- when :negate
141
- negate_set
142
- when :range
143
- range(token)
144
- when :intersection
145
- intersection(token)
146
- when :collation, :equivalent
147
- node << Literal.new(token, active_opts)
148
- else
149
- raise UnknownTokenError.new('CharacterSet', token)
161
+ def open_group(token)
162
+ group_class =
163
+ case token.token
164
+ when :absence; Group::Absence
165
+ when :atomic; Group::Atomic
166
+ when :capture; Group::Capture
167
+ when :named; Group::Named
168
+ when :passive; Group::Passive
169
+
170
+ when :lookahead; Assertion::Lookahead
171
+ when :lookbehind; Assertion::Lookbehind
172
+ when :nlookahead; Assertion::NegativeLookahead
173
+ when :nlookbehind; Assertion::NegativeLookbehind
174
+
175
+ else
176
+ raise UnknownTokenError.new('Group type open', token)
177
+ end
178
+
179
+ group = group_class.new(token, active_opts)
180
+
181
+ if group.capturing?
182
+ group.number = total_captured_group_count + 1
183
+ group.number_at_level = captured_group_count_at_level + 1
184
+ count_captured_group
150
185
  end
186
+
187
+ # Push the active options to the stack again. This way we can simply pop the
188
+ # stack for any group we close, no matter if it had its own options or not.
189
+ options_stack << active_opts
190
+
191
+ nest(group)
151
192
  end
152
193
 
153
- def meta(token)
154
- case token.token
155
- when :dot
156
- node << CharacterType::Any.new(token, active_opts)
157
- when :alternation
158
- sequence_operation(Alternation, token)
159
- else
160
- raise UnknownTokenError.new('Meta', token)
194
+ def total_captured_group_count
195
+ captured_group_counts.values.reduce(0, :+)
196
+ end
197
+
198
+ def captured_group_count_at_level
199
+ captured_group_counts[node.level]
200
+ end
201
+
202
+ def count_captured_group
203
+ captured_group_counts[node.level] += 1
204
+ end
205
+
206
+ def close_group
207
+ options_stack.pop unless switching_options
208
+ self.switching_options = false
209
+ decrease_nesting
210
+ end
211
+
212
+ def decrease_nesting
213
+ while nesting.last.is_a?(SequenceOperation)
214
+ nesting.pop
215
+ self.node = nesting.last
161
216
  end
217
+ nesting.pop
218
+ yield(node) if block_given?
219
+ self.node = nesting.last
220
+ self.node = node.last if node.last.is_a?(SequenceOperation)
162
221
  end
163
222
 
164
223
  def backref(token)
@@ -188,31 +247,9 @@ class Regexp::Parser
188
247
  end
189
248
  end
190
249
 
191
- def type(token)
192
- case token.token
193
- when :digit
194
- node << CharacterType::Digit.new(token, active_opts)
195
- when :nondigit
196
- node << CharacterType::NonDigit.new(token, active_opts)
197
- when :hex
198
- node << CharacterType::Hex.new(token, active_opts)
199
- when :nonhex
200
- node << CharacterType::NonHex.new(token, active_opts)
201
- when :space
202
- node << CharacterType::Space.new(token, active_opts)
203
- when :nonspace
204
- node << CharacterType::NonSpace.new(token, active_opts)
205
- when :word
206
- node << CharacterType::Word.new(token, active_opts)
207
- when :nonword
208
- node << CharacterType::NonWord.new(token, active_opts)
209
- when :linebreak
210
- node << CharacterType::Linebreak.new(token, active_opts)
211
- when :xgrapheme
212
- node << CharacterType::ExtendedGrapheme.new(token, active_opts)
213
- else
214
- raise UnknownTokenError.new('CharacterType', token)
215
- end
250
+ def assign_effective_number(exp)
251
+ exp.effective_number =
252
+ exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
216
253
  end
217
254
 
218
255
  def conditional(token)
@@ -240,11 +277,118 @@ class Regexp::Parser
240
277
  end
241
278
  end
242
279
 
280
+ def nest_conditional(exp)
281
+ conditional_nesting.push(exp)
282
+ nest(exp)
283
+ end
284
+
285
+ def nest(exp)
286
+ nesting.push(exp)
287
+ node << exp
288
+ update_transplanted_subtree(exp, node)
289
+ self.node = exp
290
+ end
291
+
292
+ # subtrees are transplanted to build Alternations, Intersections, Ranges
293
+ def update_transplanted_subtree(exp, new_parent)
294
+ exp.nesting_level = new_parent.nesting_level + 1
295
+ exp.respond_to?(:each) &&
296
+ exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
297
+ end
298
+
299
+ def escape(token)
300
+ case token.token
301
+
302
+ when :backspace; node << EscapeSequence::Backspace.new(token, active_opts)
303
+
304
+ when :escape; node << EscapeSequence::AsciiEscape.new(token, active_opts)
305
+ when :bell; node << EscapeSequence::Bell.new(token, active_opts)
306
+ when :form_feed; node << EscapeSequence::FormFeed.new(token, active_opts)
307
+ when :newline; node << EscapeSequence::Newline.new(token, active_opts)
308
+ when :carriage; node << EscapeSequence::Return.new(token, active_opts)
309
+ when :tab; node << EscapeSequence::Tab.new(token, active_opts)
310
+ when :vertical_tab; node << EscapeSequence::VerticalTab.new(token, active_opts)
311
+
312
+ when :codepoint; node << EscapeSequence::Codepoint.new(token, active_opts)
313
+ when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
314
+ when :hex; node << EscapeSequence::Hex.new(token, active_opts)
315
+ when :octal; node << EscapeSequence::Octal.new(token, active_opts)
316
+
317
+ when :control
318
+ if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
319
+ node << EscapeSequence::MetaControl.new(token, active_opts)
320
+ else
321
+ node << EscapeSequence::Control.new(token, active_opts)
322
+ end
323
+
324
+ when :meta_sequence
325
+ if token.text =~ /\A\\M-\\[Cc]/
326
+ node << EscapeSequence::MetaControl.new(token, active_opts)
327
+ else
328
+ node << EscapeSequence::Meta.new(token, active_opts)
329
+ end
330
+
331
+ else
332
+ # treating everything else as a literal
333
+ # TODO: maybe split this up a bit more in v3.0.0?
334
+ # E.g. escaped quantifiers or set meta chars are not the same
335
+ # as stuff that would be a literal even without the backslash.
336
+ # Right now, they all end up here.
337
+ node << EscapeSequence::Literal.new(token, active_opts)
338
+ end
339
+ end
340
+
341
+ def free_space(token)
342
+ case token.token
343
+ when :comment
344
+ node << Comment.new(token, active_opts)
345
+ when :whitespace
346
+ if node.last.is_a?(WhiteSpace)
347
+ node.last.merge(WhiteSpace.new(token, active_opts))
348
+ else
349
+ node << WhiteSpace.new(token, active_opts)
350
+ end
351
+ else
352
+ raise UnknownTokenError.new('FreeSpace', token)
353
+ end
354
+ end
355
+
356
+ def keep(token)
357
+ node << Keep::Mark.new(token, active_opts)
358
+ end
359
+
360
+ def literal(token)
361
+ node << Literal.new(token, active_opts)
362
+ end
363
+
364
+ def meta(token)
365
+ case token.token
366
+ when :dot
367
+ node << CharacterType::Any.new(token, active_opts)
368
+ when :alternation
369
+ sequence_operation(Alternation, token)
370
+ else
371
+ raise UnknownTokenError.new('Meta', token)
372
+ end
373
+ end
374
+
375
+ def sequence_operation(klass, token)
376
+ unless node.is_a?(klass)
377
+ operator = klass.new(token, active_opts)
378
+ sequence = operator.add_sequence(active_opts)
379
+ sequence.expressions = node.expressions
380
+ node.expressions = []
381
+ nest(operator)
382
+ end
383
+ node.add_sequence(active_opts)
384
+ end
385
+
243
386
  def posixclass(token)
244
387
  node << PosixClass.new(token, active_opts)
245
388
  end
246
389
 
247
390
  include Regexp::Expression::UnicodeProperty
391
+ UPTokens = Regexp::Syntax::Token::UnicodeProperty
248
392
 
249
393
  def property(token)
250
394
  case token.token
@@ -316,128 +460,43 @@ class Regexp::Parser
316
460
  when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
317
461
  when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
318
462
 
319
- when *Token::UnicodeProperty::Age
320
- node << Age.new(token, active_opts)
321
-
322
- when *Token::UnicodeProperty::Derived
323
- node << Derived.new(token, active_opts)
324
-
325
- when *Token::UnicodeProperty::Emoji
326
- node << Emoji.new(token, active_opts)
327
-
328
- when *Token::UnicodeProperty::Script
329
- node << Script.new(token, active_opts)
330
-
331
- when *Token::UnicodeProperty::UnicodeBlock
332
- node << Block.new(token, active_opts)
463
+ when *UPTokens::Age; node << Age.new(token, active_opts)
464
+ when *UPTokens::Derived; node << Derived.new(token, active_opts)
465
+ when *UPTokens::Emoji; node << Emoji.new(token, active_opts)
466
+ when *UPTokens::Script; node << Script.new(token, active_opts)
467
+ when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts)
333
468
 
334
469
  else
335
470
  raise UnknownTokenError.new('UnicodeProperty', token)
336
471
  end
337
472
  end
338
473
 
339
- def anchor(token)
340
- case token.token
341
- when :bol
342
- node << Anchor::BeginningOfLine.new(token, active_opts)
343
- when :eol
344
- node << Anchor::EndOfLine.new(token, active_opts)
345
- when :bos
346
- node << Anchor::BOS.new(token, active_opts)
347
- when :eos
348
- node << Anchor::EOS.new(token, active_opts)
349
- when :eos_ob_eol
350
- node << Anchor::EOSobEOL.new(token, active_opts)
351
- when :word_boundary
352
- node << Anchor::WordBoundary.new(token, active_opts)
353
- when :nonword_boundary
354
- node << Anchor::NonWordBoundary.new(token, active_opts)
355
- when :match_start
356
- node << Anchor::MatchStart.new(token, active_opts)
357
- else
358
- raise UnknownTokenError.new('Anchor', token)
359
- end
360
- end
361
-
362
- def escape(token)
363
- case token.token
364
-
365
- when :backspace
366
- node << EscapeSequence::Backspace.new(token, active_opts)
367
-
368
- when :escape
369
- node << EscapeSequence::AsciiEscape.new(token, active_opts)
370
- when :bell
371
- node << EscapeSequence::Bell.new(token, active_opts)
372
- when :form_feed
373
- node << EscapeSequence::FormFeed.new(token, active_opts)
374
- when :newline
375
- node << EscapeSequence::Newline.new(token, active_opts)
376
- when :carriage
377
- node << EscapeSequence::Return.new(token, active_opts)
378
- when :tab
379
- node << EscapeSequence::Tab.new(token, active_opts)
380
- when :vertical_tab
381
- node << EscapeSequence::VerticalTab.new(token, active_opts)
382
-
383
- when :hex
384
- node << EscapeSequence::Hex.new(token, active_opts)
385
- when :octal
386
- node << EscapeSequence::Octal.new(token, active_opts)
387
- when :codepoint
388
- node << EscapeSequence::Codepoint.new(token, active_opts)
389
- when :codepoint_list
390
- node << EscapeSequence::CodepointList.new(token, active_opts)
391
-
392
- when :control
393
- if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
394
- node << EscapeSequence::MetaControl.new(token, active_opts)
395
- else
396
- node << EscapeSequence::Control.new(token, active_opts)
397
- end
398
-
399
- when :meta_sequence
400
- if token.text =~ /\A\\M-\\[Cc]/
401
- node << EscapeSequence::MetaControl.new(token, active_opts)
402
- else
403
- node << EscapeSequence::Meta.new(token, active_opts)
404
- end
405
-
406
- else
407
- # treating everything else as a literal
408
- node << EscapeSequence::Literal.new(token, active_opts)
409
- end
410
- end
411
-
412
- def keep(token)
413
- node << Keep::Mark.new(token, active_opts)
414
- end
415
-
416
- def free_space(token)
417
- case token.token
418
- when :comment
419
- node << Comment.new(token, active_opts)
420
- when :whitespace
421
- if node.last.is_a?(WhiteSpace)
422
- node.last.merge(WhiteSpace.new(token, active_opts))
423
- else
424
- node << WhiteSpace.new(token, active_opts)
425
- end
426
- else
427
- raise UnknownTokenError.new('FreeSpace', token)
428
- end
429
- end
430
-
431
474
  def quantifier(token)
432
- offset = -1
433
- target_node = node.expressions[offset]
434
- while target_node.is_a?(FreeSpace)
435
- target_node = node.expressions[offset -= 1]
475
+ target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
476
+ target_node or raise ParserError, "No valid target found for '#{token.text}'"
477
+
478
+ # in case of chained quantifiers, wrap target in an implicit passive group
479
+ # description of the problem: https://github.com/ammar/regexp_parser/issues/3
480
+ # rationale for this solution: https://github.com/ammar/regexp_parser/pull/69
481
+ if target_node.quantified?
482
+ new_token = Regexp::Token.new(
483
+ :group,
484
+ :passive,
485
+ '', # text
486
+ target_node.ts,
487
+ nil, # te (unused)
488
+ target_node.level,
489
+ target_node.set_level,
490
+ target_node.conditional_level
491
+ )
492
+ new_group = Group::Passive.new(new_token, active_opts)
493
+ new_group.implicit = true
494
+ new_group << target_node
495
+ increase_level(target_node)
496
+ node.expressions[node.expressions.index(target_node)] = new_group
497
+ target_node = new_group
436
498
  end
437
499
 
438
- target_node || raise(ArgumentError, 'No valid target found for '\
439
- "'#{token.text}' ")
440
-
441
500
  case token.token
442
501
  when :zero_or_one
443
502
  target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
@@ -468,6 +527,11 @@ class Regexp::Parser
468
527
  end
469
528
  end
470
529
 
530
+ def increase_level(exp)
531
+ exp.level += 1
532
+ exp.respond_to?(:each) && exp.each { |subexp| increase_level(subexp) }
533
+ end
534
+
471
535
  def interval(target_node, token)
472
536
  text = token.text
473
537
  mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
@@ -490,100 +554,16 @@ class Regexp::Parser
490
554
  target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
491
555
  end
492
556
 
493
- def group(token)
494
- case token.token
495
- when :options, :options_switch
496
- options_group(token)
497
- when :close
498
- close_group
499
- when :comment
500
- node << Group::Comment.new(token, active_opts)
501
- else
502
- open_group(token)
503
- end
504
- end
505
-
506
- MOD_FLAGS = %w[i m x].map(&:to_sym)
507
- ENC_FLAGS = %w[a d u].map(&:to_sym)
508
-
509
- def options_group(token)
510
- positive, negative = token.text.split('-', 2)
511
- negative ||= ''
512
- self.switching_options = token.token.equal?(:options_switch)
513
-
514
- opt_changes = {}
515
- new_active_opts = active_opts.dup
516
-
517
- MOD_FLAGS.each do |flag|
518
- if positive.include?(flag.to_s)
519
- opt_changes[flag] = new_active_opts[flag] = true
520
- end
521
- if negative.include?(flag.to_s)
522
- opt_changes[flag] = false
523
- new_active_opts.delete(flag)
524
- end
525
- end
526
-
527
- if (enc_flag = positive.reverse[/[adu]/])
528
- enc_flag = enc_flag.to_sym
529
- (ENC_FLAGS - [enc_flag]).each do |other|
530
- opt_changes[other] = false if new_active_opts[other]
531
- new_active_opts.delete(other)
532
- end
533
- opt_changes[enc_flag] = new_active_opts[enc_flag] = true
534
- end
535
-
536
- options_stack << new_active_opts
537
-
538
- options_group = Group::Options.new(token, active_opts)
539
- options_group.option_changes = opt_changes
540
-
541
- nest(options_group)
542
- end
543
-
544
- def open_group(token)
557
+ def set(token)
545
558
  case token.token
546
- when :passive
547
- exp = Group::Passive.new(token, active_opts)
548
- when :atomic
549
- exp = Group::Atomic.new(token, active_opts)
550
- when :named
551
- exp = Group::Named.new(token, active_opts)
552
- when :capture
553
- exp = Group::Capture.new(token, active_opts)
554
- when :absence
555
- exp = Group::Absence.new(token, active_opts)
556
-
557
- when :lookahead
558
- exp = Assertion::Lookahead.new(token, active_opts)
559
- when :nlookahead
560
- exp = Assertion::NegativeLookahead.new(token, active_opts)
561
- when :lookbehind
562
- exp = Assertion::Lookbehind.new(token, active_opts)
563
- when :nlookbehind
564
- exp = Assertion::NegativeLookbehind.new(token, active_opts)
565
-
559
+ when :open; open_set(token)
560
+ when :close; close_set
561
+ when :negate; negate_set
562
+ when :range; range(token)
563
+ when :intersection; intersection(token)
566
564
  else
567
- raise UnknownTokenError.new('Group type open', token)
568
- end
569
-
570
- if exp.capturing?
571
- exp.number = total_captured_group_count + 1
572
- exp.number_at_level = captured_group_count_at_level + 1
573
- count_captured_group
565
+ raise UnknownTokenError.new('CharacterSet', token)
574
566
  end
575
-
576
- # Push the active options to the stack again. This way we can simply pop the
577
- # stack for any group we close, no matter if it had its own options or not.
578
- options_stack << active_opts
579
-
580
- nest(exp)
581
- end
582
-
583
- def close_group
584
- options_stack.pop unless switching_options
585
- self.switching_options = false
586
- decrease_nesting
587
567
  end
588
568
 
589
569
  def open_set(token)
@@ -606,51 +586,45 @@ class Regexp::Parser
606
586
  nest(exp)
607
587
  end
608
588
 
609
- def close_completed_character_set_range
610
- decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
611
- end
612
-
613
589
  def intersection(token)
614
590
  sequence_operation(CharacterSet::Intersection, token)
615
591
  end
616
592
 
617
- def sequence_operation(klass, token)
618
- unless node.is_a?(klass)
619
- operator = klass.new(token, active_opts)
620
- sequence = operator.add_sequence(active_opts)
621
- sequence.expressions = node.expressions
622
- node.expressions = []
623
- nest(operator)
593
+ def type(token)
594
+ case token.token
595
+ when :digit; node << CharacterType::Digit.new(token, active_opts)
596
+ when :hex; node << CharacterType::Hex.new(token, active_opts)
597
+ when :linebreak; node << CharacterType::Linebreak.new(token, active_opts)
598
+ when :nondigit; node << CharacterType::NonDigit.new(token, active_opts)
599
+ when :nonhex; node << CharacterType::NonHex.new(token, active_opts)
600
+ when :nonspace; node << CharacterType::NonSpace.new(token, active_opts)
601
+ when :nonword; node << CharacterType::NonWord.new(token, active_opts)
602
+ when :space; node << CharacterType::Space.new(token, active_opts)
603
+ when :word; node << CharacterType::Word.new(token, active_opts)
604
+ when :xgrapheme; node << CharacterType::ExtendedGrapheme.new(token, active_opts)
605
+ else
606
+ raise UnknownTokenError.new('CharacterType', token)
624
607
  end
625
- node.add_sequence(active_opts)
626
- end
627
-
628
- def active_opts
629
- options_stack.last
630
- end
631
-
632
- def total_captured_group_count
633
- captured_group_counts.values.reduce(0, :+)
634
- end
635
-
636
- def captured_group_count_at_level
637
- captured_group_counts[node.level]
638
608
  end
639
609
 
640
- def count_captured_group
641
- captured_group_counts[node.level] += 1
610
+ def close_completed_character_set_range
611
+ decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
642
612
  end
643
613
 
644
- def assign_effective_number(exp)
645
- exp.effective_number =
646
- exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
614
+ def active_opts
615
+ options_stack.last
647
616
  end
648
617
 
618
+ # Assigns referenced expressions to referring expressions, e.g. if there is
619
+ # an instance of Backreference::Number, its #referenced_expression is set to
620
+ # the instance of Group::Capture that it refers to via its number.
649
621
  def assign_referenced_expressions
650
622
  targets = {}
623
+ # find all referenceable expressions
651
624
  root.each_expression do |exp|
652
625
  exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
653
626
  end
627
+ # assign them to any referring expressions
654
628
  root.each_expression do |exp|
655
629
  exp.respond_to?(:reference) &&
656
630
  exp.referenced_expression = targets[exp.reference]