regexp_parser 1.8.2 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +100 -0
  3. data/Gemfile +6 -1
  4. data/README.md +1 -4
  5. data/Rakefile +8 -8
  6. data/lib/regexp_parser/error.rb +4 -0
  7. data/lib/regexp_parser/expression/classes/backref.rb +5 -0
  8. data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
  9. data/lib/regexp_parser/expression/classes/free_space.rb +2 -2
  10. data/lib/regexp_parser/expression/classes/group.rb +28 -3
  11. data/lib/regexp_parser/expression/classes/property.rb +1 -1
  12. data/lib/regexp_parser/expression/classes/root.rb +4 -16
  13. data/lib/regexp_parser/expression/classes/set/range.rb +2 -1
  14. data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
  15. data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
  16. data/lib/regexp_parser/expression/quantifier.rb +10 -1
  17. data/lib/regexp_parser/expression/sequence.rb +3 -19
  18. data/lib/regexp_parser/expression/subexpression.rb +1 -1
  19. data/lib/regexp_parser/expression.rb +7 -19
  20. data/lib/regexp_parser/lexer.rb +2 -2
  21. data/lib/regexp_parser/parser.rb +307 -332
  22. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  23. data/lib/regexp_parser/scanner/property.rl +2 -2
  24. data/lib/regexp_parser/scanner/scanner.rl +209 -240
  25. data/lib/regexp_parser/scanner.rb +1275 -1340
  26. data/lib/regexp_parser/syntax/any.rb +3 -3
  27. data/lib/regexp_parser/syntax/base.rb +1 -1
  28. data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
  29. data/lib/regexp_parser/syntax.rb +8 -6
  30. data/lib/regexp_parser/version.rb +1 -1
  31. data/spec/expression/base_spec.rb +10 -0
  32. data/spec/expression/clone_spec.rb +36 -4
  33. data/spec/expression/free_space_spec.rb +2 -2
  34. data/spec/expression/methods/match_length_spec.rb +2 -2
  35. data/spec/expression/subexpression_spec.rb +1 -1
  36. data/spec/expression/to_s_spec.rb +39 -31
  37. data/spec/lexer/literals_spec.rb +24 -49
  38. data/spec/lexer/refcalls_spec.rb +5 -0
  39. data/spec/parser/all_spec.rb +2 -2
  40. data/spec/parser/errors_spec.rb +1 -1
  41. data/spec/parser/escapes_spec.rb +1 -1
  42. data/spec/parser/quantifiers_spec.rb +16 -0
  43. data/spec/parser/refcalls_spec.rb +5 -0
  44. data/spec/parser/set/ranges_spec.rb +3 -3
  45. data/spec/scanner/escapes_spec.rb +8 -1
  46. data/spec/scanner/groups_spec.rb +10 -1
  47. data/spec/scanner/literals_spec.rb +28 -38
  48. data/spec/scanner/quantifiers_spec.rb +18 -13
  49. data/spec/scanner/refcalls_spec.rb +19 -0
  50. data/spec/scanner/sets_spec.rb +65 -16
  51. data/spec/spec_helper.rb +1 -0
  52. metadata +4 -7
  53. data/spec/expression/root_spec.rb +0 -9
  54. data/spec/expression/sequence_spec.rb +0 -9
@@ -12,7 +12,7 @@ module Regexp::Expression
12
12
  end
13
13
 
14
14
  # Override base method to clone the expressions as well.
15
- def initialize_clone(orig)
15
+ def initialize_copy(orig)
16
16
  self.expressions = orig.expressions.map(&:clone)
17
17
  super
18
18
  end
@@ -1,5 +1,6 @@
1
- module Regexp::Expression
1
+ require 'regexp_parser/error'
2
2
 
3
+ module Regexp::Expression
3
4
  class Base
4
5
  attr_accessor :type, :token
5
6
  attr_accessor :text, :ts
@@ -21,7 +22,7 @@ module Regexp::Expression
21
22
  self.options = options
22
23
  end
23
24
 
24
- def initialize_clone(orig)
25
+ def initialize_copy(orig)
25
26
  self.text = (orig.text ? orig.text.dup : nil)
26
27
  self.options = (orig.options ? orig.options.dup : nil)
27
28
  self.quantifier = (orig.quantifier ? orig.quantifier.clone : nil)
@@ -34,6 +35,10 @@ module Regexp::Expression
34
35
 
35
36
  alias :starts_at :ts
36
37
 
38
+ def base_length
39
+ to_s(:base).length
40
+ end
41
+
37
42
  def full_length
38
43
  to_s.length
39
44
  end
@@ -118,23 +123,6 @@ module Regexp::Expression
118
123
  alias :to_h :attributes
119
124
  end
120
125
 
121
- def self.parsed(exp)
122
- warn('WARNING: Regexp::Expression::Base.parsed is buggy and '\
123
- 'will be removed in 2.0.0. Use Regexp::Parser.parse instead.')
124
- case exp
125
- when String
126
- Regexp::Parser.parse(exp)
127
- when Regexp
128
- Regexp::Parser.parse(exp.source) # <- causes loss of root options
129
- when Regexp::Expression # <- never triggers
130
- exp
131
- else
132
- raise ArgumentError, 'Expression.parsed accepts a String, Regexp, or '\
133
- 'a Regexp::Expression as a value for exp, but it '\
134
- "was given #{exp.class.name}."
135
- end
136
- end
137
-
138
126
  end # module Regexp::Expression
139
127
 
140
128
  require 'regexp_parser/expression/quantifier'
@@ -96,10 +96,10 @@ class Regexp::Lexer
96
96
 
97
97
  tokens.pop
98
98
  tokens << Regexp::Token.new(:literal, :literal, lead,
99
- token.ts, (token.te - last.bytesize),
99
+ token.ts, (token.te - last.length),
100
100
  nesting, set_nesting, conditional_nesting)
101
101
  tokens << Regexp::Token.new(:literal, :literal, last,
102
- (token.ts + lead.bytesize), token.te,
102
+ (token.ts + lead.length), token.te,
103
103
  nesting, set_nesting, conditional_nesting)
104
104
  end
105
105
 
@@ -1,10 +1,10 @@
1
+ require 'regexp_parser/error'
1
2
  require 'regexp_parser/expression'
2
3
 
3
4
  class Regexp::Parser
4
5
  include Regexp::Expression
5
- include Regexp::Syntax
6
6
 
7
- class ParserError < StandardError; end
7
+ class ParserError < Regexp::Parser::Error; end
8
8
 
9
9
  class UnknownTokenTypeError < ParserError
10
10
  def initialize(type, token)
@@ -70,95 +70,155 @@ class Regexp::Parser
70
70
  enabled_options
71
71
  end
72
72
 
73
- def nest(exp)
74
- nesting.push(exp)
75
- node << exp
76
- update_transplanted_subtree(exp, node)
77
- self.node = exp
78
- end
73
+ def parse_token(token)
74
+ case token.type
75
+ when :anchor; anchor(token)
76
+ when :assertion, :group; group(token)
77
+ when :backref; backref(token)
78
+ when :conditional; conditional(token)
79
+ when :escape; escape(token)
80
+ when :free_space; free_space(token)
81
+ when :keep; keep(token)
82
+ when :literal; literal(token)
83
+ when :meta; meta(token)
84
+ when :posixclass, :nonposixclass; posixclass(token)
85
+ when :property, :nonproperty; property(token)
86
+ when :quantifier; quantifier(token)
87
+ when :set; set(token)
88
+ when :type; type(token)
89
+ else
90
+ raise UnknownTokenTypeError.new(token.type, token)
91
+ end
79
92
 
80
- # subtrees are transplanted to build Alternations, Intersections, Ranges
81
- def update_transplanted_subtree(exp, new_parent)
82
- exp.nesting_level = new_parent.nesting_level + 1
83
- exp.respond_to?(:each) &&
84
- exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
93
+ close_completed_character_set_range
85
94
  end
86
95
 
87
- def decrease_nesting
88
- while nesting.last.is_a?(SequenceOperation)
89
- nesting.pop
90
- self.node = nesting.last
96
+ def anchor(token)
97
+ case token.token
98
+ when :bol; node << Anchor::BeginningOfLine.new(token, active_opts)
99
+ when :bos; node << Anchor::BOS.new(token, active_opts)
100
+ when :eol; node << Anchor::EndOfLine.new(token, active_opts)
101
+ when :eos; node << Anchor::EOS.new(token, active_opts)
102
+ when :eos_ob_eol; node << Anchor::EOSobEOL.new(token, active_opts)
103
+ when :match_start; node << Anchor::MatchStart.new(token, active_opts)
104
+ when :nonword_boundary; node << Anchor::NonWordBoundary.new(token, active_opts)
105
+ when :word_boundary; node << Anchor::WordBoundary.new(token, active_opts)
106
+ else
107
+ raise UnknownTokenError.new('Anchor', token)
91
108
  end
92
- nesting.pop
93
- yield(node) if block_given?
94
- self.node = nesting.last
95
- self.node = node.last if node.last.is_a?(SequenceOperation)
96
109
  end
97
110
 
98
- def nest_conditional(exp)
99
- conditional_nesting.push(exp)
100
- nest(exp)
111
+ def group(token)
112
+ case token.token
113
+ when :options, :options_switch
114
+ options_group(token)
115
+ when :close
116
+ close_group
117
+ when :comment
118
+ node << Group::Comment.new(token, active_opts)
119
+ else
120
+ open_group(token)
121
+ end
101
122
  end
102
123
 
103
- def parse_token(token)
104
- close_completed_character_set_range
124
+ MOD_FLAGS = %w[i m x].map(&:to_sym)
125
+ ENC_FLAGS = %w[a d u].map(&:to_sym)
105
126
 
106
- case token.type
107
- when :meta; meta(token)
108
- when :quantifier; quantifier(token)
109
- when :anchor; anchor(token)
110
- when :escape; escape(token)
111
- when :group; group(token)
112
- when :assertion; group(token)
113
- when :set; set(token)
114
- when :type; type(token)
115
- when :backref; backref(token)
116
- when :conditional; conditional(token)
117
- when :keep; keep(token)
118
-
119
- when :posixclass, :nonposixclass
120
- posixclass(token)
121
- when :property, :nonproperty
122
- property(token)
123
-
124
- when :literal
125
- node << Literal.new(token, active_opts)
126
- when :free_space
127
- free_space(token)
127
+ def options_group(token)
128
+ positive, negative = token.text.split('-', 2)
129
+ negative ||= ''
130
+ self.switching_options = token.token.equal?(:options_switch)
128
131
 
129
- else
130
- raise UnknownTokenTypeError.new(token.type, token)
132
+ opt_changes = {}
133
+ new_active_opts = active_opts.dup
134
+
135
+ MOD_FLAGS.each do |flag|
136
+ if positive.include?(flag.to_s)
137
+ opt_changes[flag] = new_active_opts[flag] = true
138
+ end
139
+ if negative.include?(flag.to_s)
140
+ opt_changes[flag] = false
141
+ new_active_opts.delete(flag)
142
+ end
143
+ end
144
+
145
+ if (enc_flag = positive.reverse[/[adu]/])
146
+ enc_flag = enc_flag.to_sym
147
+ (ENC_FLAGS - [enc_flag]).each do |other|
148
+ opt_changes[other] = false if new_active_opts[other]
149
+ new_active_opts.delete(other)
150
+ end
151
+ opt_changes[enc_flag] = new_active_opts[enc_flag] = true
131
152
  end
153
+
154
+ options_stack << new_active_opts
155
+
156
+ options_group = Group::Options.new(token, active_opts)
157
+ options_group.option_changes = opt_changes
158
+
159
+ nest(options_group)
132
160
  end
133
161
 
134
- def set(token)
135
- case token.token
136
- when :open
137
- open_set(token)
138
- when :close
139
- close_set
140
- when :negate
141
- negate_set
142
- when :range
143
- range(token)
144
- when :intersection
145
- intersection(token)
146
- when :collation, :equivalent
147
- node << Literal.new(token, active_opts)
148
- else
149
- raise UnknownTokenError.new('CharacterSet', token)
162
+ def open_group(token)
163
+ group_class =
164
+ case token.token
165
+ when :absence; Group::Absence
166
+ when :atomic; Group::Atomic
167
+ when :capture; Group::Capture
168
+ when :named; Group::Named
169
+ when :passive; Group::Passive
170
+
171
+ when :lookahead; Assertion::Lookahead
172
+ when :lookbehind; Assertion::Lookbehind
173
+ when :nlookahead; Assertion::NegativeLookahead
174
+ when :nlookbehind; Assertion::NegativeLookbehind
175
+
176
+ else
177
+ raise UnknownTokenError.new('Group type open', token)
178
+ end
179
+
180
+ group = group_class.new(token, active_opts)
181
+
182
+ if group.capturing?
183
+ group.number = total_captured_group_count + 1
184
+ group.number_at_level = captured_group_count_at_level + 1
185
+ count_captured_group
150
186
  end
187
+
188
+ # Push the active options to the stack again. This way we can simply pop the
189
+ # stack for any group we close, no matter if it had its own options or not.
190
+ options_stack << active_opts
191
+
192
+ nest(group)
151
193
  end
152
194
 
153
- def meta(token)
154
- case token.token
155
- when :dot
156
- node << CharacterType::Any.new(token, active_opts)
157
- when :alternation
158
- sequence_operation(Alternation, token)
159
- else
160
- raise UnknownTokenError.new('Meta', token)
195
+ def total_captured_group_count
196
+ captured_group_counts.values.reduce(0, :+)
197
+ end
198
+
199
+ def captured_group_count_at_level
200
+ captured_group_counts[node.level]
201
+ end
202
+
203
+ def count_captured_group
204
+ captured_group_counts[node.level] += 1
205
+ end
206
+
207
+ def close_group
208
+ options_stack.pop unless switching_options
209
+ self.switching_options = false
210
+ decrease_nesting
211
+ end
212
+
213
+ def decrease_nesting
214
+ while nesting.last.is_a?(SequenceOperation)
215
+ nesting.pop
216
+ self.node = nesting.last
161
217
  end
218
+ nesting.pop
219
+ yield(node) if block_given?
220
+ self.node = nesting.last
221
+ self.node = node.last if node.last.is_a?(SequenceOperation)
162
222
  end
163
223
 
164
224
  def backref(token)
@@ -188,31 +248,9 @@ class Regexp::Parser
188
248
  end
189
249
  end
190
250
 
191
- def type(token)
192
- case token.token
193
- when :digit
194
- node << CharacterType::Digit.new(token, active_opts)
195
- when :nondigit
196
- node << CharacterType::NonDigit.new(token, active_opts)
197
- when :hex
198
- node << CharacterType::Hex.new(token, active_opts)
199
- when :nonhex
200
- node << CharacterType::NonHex.new(token, active_opts)
201
- when :space
202
- node << CharacterType::Space.new(token, active_opts)
203
- when :nonspace
204
- node << CharacterType::NonSpace.new(token, active_opts)
205
- when :word
206
- node << CharacterType::Word.new(token, active_opts)
207
- when :nonword
208
- node << CharacterType::NonWord.new(token, active_opts)
209
- when :linebreak
210
- node << CharacterType::Linebreak.new(token, active_opts)
211
- when :xgrapheme
212
- node << CharacterType::ExtendedGrapheme.new(token, active_opts)
213
- else
214
- raise UnknownTokenError.new('CharacterType', token)
215
- end
251
+ def assign_effective_number(exp)
252
+ exp.effective_number =
253
+ exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
216
254
  end
217
255
 
218
256
  def conditional(token)
@@ -240,11 +278,118 @@ class Regexp::Parser
240
278
  end
241
279
  end
242
280
 
281
+ def nest_conditional(exp)
282
+ conditional_nesting.push(exp)
283
+ nest(exp)
284
+ end
285
+
286
+ def nest(exp)
287
+ nesting.push(exp)
288
+ node << exp
289
+ update_transplanted_subtree(exp, node)
290
+ self.node = exp
291
+ end
292
+
293
+ # subtrees are transplanted to build Alternations, Intersections, Ranges
294
+ def update_transplanted_subtree(exp, new_parent)
295
+ exp.nesting_level = new_parent.nesting_level + 1
296
+ exp.respond_to?(:each) &&
297
+ exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
298
+ end
299
+
300
+ def escape(token)
301
+ case token.token
302
+
303
+ when :backspace; node << EscapeSequence::Backspace.new(token, active_opts)
304
+
305
+ when :escape; node << EscapeSequence::AsciiEscape.new(token, active_opts)
306
+ when :bell; node << EscapeSequence::Bell.new(token, active_opts)
307
+ when :form_feed; node << EscapeSequence::FormFeed.new(token, active_opts)
308
+ when :newline; node << EscapeSequence::Newline.new(token, active_opts)
309
+ when :carriage; node << EscapeSequence::Return.new(token, active_opts)
310
+ when :tab; node << EscapeSequence::Tab.new(token, active_opts)
311
+ when :vertical_tab; node << EscapeSequence::VerticalTab.new(token, active_opts)
312
+
313
+ when :codepoint; node << EscapeSequence::Codepoint.new(token, active_opts)
314
+ when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
315
+ when :hex; node << EscapeSequence::Hex.new(token, active_opts)
316
+ when :octal; node << EscapeSequence::Octal.new(token, active_opts)
317
+
318
+ when :control
319
+ if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
320
+ node << EscapeSequence::MetaControl.new(token, active_opts)
321
+ else
322
+ node << EscapeSequence::Control.new(token, active_opts)
323
+ end
324
+
325
+ when :meta_sequence
326
+ if token.text =~ /\A\\M-\\[Cc]/
327
+ node << EscapeSequence::MetaControl.new(token, active_opts)
328
+ else
329
+ node << EscapeSequence::Meta.new(token, active_opts)
330
+ end
331
+
332
+ else
333
+ # treating everything else as a literal
334
+ # TODO: maybe split this up a bit more in v3.0.0?
335
+ # E.g. escaped quantifiers or set meta chars are not the same
336
+ # as stuff that would be a literal even without the backslash.
337
+ # Right now, they all end up here.
338
+ node << EscapeSequence::Literal.new(token, active_opts)
339
+ end
340
+ end
341
+
342
+ def free_space(token)
343
+ case token.token
344
+ when :comment
345
+ node << Comment.new(token, active_opts)
346
+ when :whitespace
347
+ if node.last.is_a?(WhiteSpace)
348
+ node.last.merge(WhiteSpace.new(token, active_opts))
349
+ else
350
+ node << WhiteSpace.new(token, active_opts)
351
+ end
352
+ else
353
+ raise UnknownTokenError.new('FreeSpace', token)
354
+ end
355
+ end
356
+
357
+ def keep(token)
358
+ node << Keep::Mark.new(token, active_opts)
359
+ end
360
+
361
+ def literal(token)
362
+ node << Literal.new(token, active_opts)
363
+ end
364
+
365
+ def meta(token)
366
+ case token.token
367
+ when :dot
368
+ node << CharacterType::Any.new(token, active_opts)
369
+ when :alternation
370
+ sequence_operation(Alternation, token)
371
+ else
372
+ raise UnknownTokenError.new('Meta', token)
373
+ end
374
+ end
375
+
376
+ def sequence_operation(klass, token)
377
+ unless node.is_a?(klass)
378
+ operator = klass.new(token, active_opts)
379
+ sequence = operator.add_sequence(active_opts)
380
+ sequence.expressions = node.expressions
381
+ node.expressions = []
382
+ nest(operator)
383
+ end
384
+ node.add_sequence(active_opts)
385
+ end
386
+
243
387
  def posixclass(token)
244
388
  node << PosixClass.new(token, active_opts)
245
389
  end
246
390
 
247
391
  include Regexp::Expression::UnicodeProperty
392
+ UPTokens = Regexp::Syntax::Token::UnicodeProperty
248
393
 
249
394
  def property(token)
250
395
  case token.token
@@ -316,128 +461,43 @@ class Regexp::Parser
316
461
  when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
317
462
  when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
318
463
 
319
- when *Token::UnicodeProperty::Age
320
- node << Age.new(token, active_opts)
321
-
322
- when *Token::UnicodeProperty::Derived
323
- node << Derived.new(token, active_opts)
324
-
325
- when *Token::UnicodeProperty::Emoji
326
- node << Emoji.new(token, active_opts)
327
-
328
- when *Token::UnicodeProperty::Script
329
- node << Script.new(token, active_opts)
330
-
331
- when *Token::UnicodeProperty::UnicodeBlock
332
- node << Block.new(token, active_opts)
464
+ when *UPTokens::Age; node << Age.new(token, active_opts)
465
+ when *UPTokens::Derived; node << Derived.new(token, active_opts)
466
+ when *UPTokens::Emoji; node << Emoji.new(token, active_opts)
467
+ when *UPTokens::Script; node << Script.new(token, active_opts)
468
+ when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts)
333
469
 
334
470
  else
335
471
  raise UnknownTokenError.new('UnicodeProperty', token)
336
472
  end
337
473
  end
338
474
 
339
- def anchor(token)
340
- case token.token
341
- when :bol
342
- node << Anchor::BeginningOfLine.new(token, active_opts)
343
- when :eol
344
- node << Anchor::EndOfLine.new(token, active_opts)
345
- when :bos
346
- node << Anchor::BOS.new(token, active_opts)
347
- when :eos
348
- node << Anchor::EOS.new(token, active_opts)
349
- when :eos_ob_eol
350
- node << Anchor::EOSobEOL.new(token, active_opts)
351
- when :word_boundary
352
- node << Anchor::WordBoundary.new(token, active_opts)
353
- when :nonword_boundary
354
- node << Anchor::NonWordBoundary.new(token, active_opts)
355
- when :match_start
356
- node << Anchor::MatchStart.new(token, active_opts)
357
- else
358
- raise UnknownTokenError.new('Anchor', token)
359
- end
360
- end
361
-
362
- def escape(token)
363
- case token.token
364
-
365
- when :backspace
366
- node << EscapeSequence::Backspace.new(token, active_opts)
367
-
368
- when :escape
369
- node << EscapeSequence::AsciiEscape.new(token, active_opts)
370
- when :bell
371
- node << EscapeSequence::Bell.new(token, active_opts)
372
- when :form_feed
373
- node << EscapeSequence::FormFeed.new(token, active_opts)
374
- when :newline
375
- node << EscapeSequence::Newline.new(token, active_opts)
376
- when :carriage
377
- node << EscapeSequence::Return.new(token, active_opts)
378
- when :tab
379
- node << EscapeSequence::Tab.new(token, active_opts)
380
- when :vertical_tab
381
- node << EscapeSequence::VerticalTab.new(token, active_opts)
382
-
383
- when :hex
384
- node << EscapeSequence::Hex.new(token, active_opts)
385
- when :octal
386
- node << EscapeSequence::Octal.new(token, active_opts)
387
- when :codepoint
388
- node << EscapeSequence::Codepoint.new(token, active_opts)
389
- when :codepoint_list
390
- node << EscapeSequence::CodepointList.new(token, active_opts)
391
-
392
- when :control
393
- if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
394
- node << EscapeSequence::MetaControl.new(token, active_opts)
395
- else
396
- node << EscapeSequence::Control.new(token, active_opts)
397
- end
398
-
399
- when :meta_sequence
400
- if token.text =~ /\A\\M-\\[Cc]/
401
- node << EscapeSequence::MetaControl.new(token, active_opts)
402
- else
403
- node << EscapeSequence::Meta.new(token, active_opts)
404
- end
405
-
406
- else
407
- # treating everything else as a literal
408
- node << EscapeSequence::Literal.new(token, active_opts)
409
- end
410
- end
411
-
412
- def keep(token)
413
- node << Keep::Mark.new(token, active_opts)
414
- end
415
-
416
- def free_space(token)
417
- case token.token
418
- when :comment
419
- node << Comment.new(token, active_opts)
420
- when :whitespace
421
- if node.last.is_a?(WhiteSpace)
422
- node.last.merge(WhiteSpace.new(token, active_opts))
423
- else
424
- node << WhiteSpace.new(token, active_opts)
425
- end
426
- else
427
- raise UnknownTokenError.new('FreeSpace', token)
428
- end
429
- end
430
-
431
475
  def quantifier(token)
432
- offset = -1
433
- target_node = node.expressions[offset]
434
- while target_node.is_a?(FreeSpace)
435
- target_node = node.expressions[offset -= 1]
476
+ target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
477
+ target_node or raise ParserError, "No valid target found for '#{token.text}'"
478
+
479
+ # in case of chained quantifiers, wrap target in an implicit passive group
480
+ # description of the problem: https://github.com/ammar/regexp_parser/issues/3
481
+ # rationale for this solution: https://github.com/ammar/regexp_parser/pull/69
482
+ if target_node.quantified?
483
+ new_token = Regexp::Token.new(
484
+ :group,
485
+ :passive,
486
+ '', # text
487
+ target_node.ts,
488
+ nil, # te (unused)
489
+ target_node.level,
490
+ target_node.set_level,
491
+ target_node.conditional_level
492
+ )
493
+ new_group = Group::Passive.new(new_token, active_opts)
494
+ new_group.implicit = true
495
+ new_group << target_node
496
+ increase_level(target_node)
497
+ node.expressions[node.expressions.index(target_node)] = new_group
498
+ target_node = new_group
436
499
  end
437
500
 
438
- target_node || raise(ArgumentError, 'No valid target found for '\
439
- "'#{token.text}' ")
440
-
441
501
  case token.token
442
502
  when :zero_or_one
443
503
  target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
@@ -468,6 +528,11 @@ class Regexp::Parser
468
528
  end
469
529
  end
470
530
 
531
+ def increase_level(exp)
532
+ exp.level += 1
533
+ exp.respond_to?(:each) && exp.each { |subexp| increase_level(subexp) }
534
+ end
535
+
471
536
  def interval(target_node, token)
472
537
  text = token.text
473
538
  mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
@@ -490,100 +555,16 @@ class Regexp::Parser
490
555
  target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
491
556
  end
492
557
 
493
- def group(token)
494
- case token.token
495
- when :options, :options_switch
496
- options_group(token)
497
- when :close
498
- close_group
499
- when :comment
500
- node << Group::Comment.new(token, active_opts)
501
- else
502
- open_group(token)
503
- end
504
- end
505
-
506
- MOD_FLAGS = %w[i m x].map(&:to_sym)
507
- ENC_FLAGS = %w[a d u].map(&:to_sym)
508
-
509
- def options_group(token)
510
- positive, negative = token.text.split('-', 2)
511
- negative ||= ''
512
- self.switching_options = token.token.equal?(:options_switch)
513
-
514
- opt_changes = {}
515
- new_active_opts = active_opts.dup
516
-
517
- MOD_FLAGS.each do |flag|
518
- if positive.include?(flag.to_s)
519
- opt_changes[flag] = new_active_opts[flag] = true
520
- end
521
- if negative.include?(flag.to_s)
522
- opt_changes[flag] = false
523
- new_active_opts.delete(flag)
524
- end
525
- end
526
-
527
- if (enc_flag = positive.reverse[/[adu]/])
528
- enc_flag = enc_flag.to_sym
529
- (ENC_FLAGS - [enc_flag]).each do |other|
530
- opt_changes[other] = false if new_active_opts[other]
531
- new_active_opts.delete(other)
532
- end
533
- opt_changes[enc_flag] = new_active_opts[enc_flag] = true
534
- end
535
-
536
- options_stack << new_active_opts
537
-
538
- options_group = Group::Options.new(token, active_opts)
539
- options_group.option_changes = opt_changes
540
-
541
- nest(options_group)
542
- end
543
-
544
- def open_group(token)
558
+ def set(token)
545
559
  case token.token
546
- when :passive
547
- exp = Group::Passive.new(token, active_opts)
548
- when :atomic
549
- exp = Group::Atomic.new(token, active_opts)
550
- when :named
551
- exp = Group::Named.new(token, active_opts)
552
- when :capture
553
- exp = Group::Capture.new(token, active_opts)
554
- when :absence
555
- exp = Group::Absence.new(token, active_opts)
556
-
557
- when :lookahead
558
- exp = Assertion::Lookahead.new(token, active_opts)
559
- when :nlookahead
560
- exp = Assertion::NegativeLookahead.new(token, active_opts)
561
- when :lookbehind
562
- exp = Assertion::Lookbehind.new(token, active_opts)
563
- when :nlookbehind
564
- exp = Assertion::NegativeLookbehind.new(token, active_opts)
565
-
560
+ when :open; open_set(token)
561
+ when :close; close_set
562
+ when :negate; negate_set
563
+ when :range; range(token)
564
+ when :intersection; intersection(token)
566
565
  else
567
- raise UnknownTokenError.new('Group type open', token)
568
- end
569
-
570
- if exp.capturing?
571
- exp.number = total_captured_group_count + 1
572
- exp.number_at_level = captured_group_count_at_level + 1
573
- count_captured_group
566
+ raise UnknownTokenError.new('CharacterSet', token)
574
567
  end
575
-
576
- # Push the active options to the stack again. This way we can simply pop the
577
- # stack for any group we close, no matter if it had its own options or not.
578
- options_stack << active_opts
579
-
580
- nest(exp)
581
- end
582
-
583
- def close_group
584
- options_stack.pop unless switching_options
585
- self.switching_options = false
586
- decrease_nesting
587
568
  end
588
569
 
589
570
  def open_set(token)
@@ -606,51 +587,45 @@ class Regexp::Parser
606
587
  nest(exp)
607
588
  end
608
589
 
609
- def close_completed_character_set_range
610
- decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
611
- end
612
-
613
590
  def intersection(token)
614
591
  sequence_operation(CharacterSet::Intersection, token)
615
592
  end
616
593
 
617
- def sequence_operation(klass, token)
618
- unless node.is_a?(klass)
619
- operator = klass.new(token, active_opts)
620
- sequence = operator.add_sequence(active_opts)
621
- sequence.expressions = node.expressions
622
- node.expressions = []
623
- nest(operator)
594
+ def type(token)
595
+ case token.token
596
+ when :digit; node << CharacterType::Digit.new(token, active_opts)
597
+ when :hex; node << CharacterType::Hex.new(token, active_opts)
598
+ when :linebreak; node << CharacterType::Linebreak.new(token, active_opts)
599
+ when :nondigit; node << CharacterType::NonDigit.new(token, active_opts)
600
+ when :nonhex; node << CharacterType::NonHex.new(token, active_opts)
601
+ when :nonspace; node << CharacterType::NonSpace.new(token, active_opts)
602
+ when :nonword; node << CharacterType::NonWord.new(token, active_opts)
603
+ when :space; node << CharacterType::Space.new(token, active_opts)
604
+ when :word; node << CharacterType::Word.new(token, active_opts)
605
+ when :xgrapheme; node << CharacterType::ExtendedGrapheme.new(token, active_opts)
606
+ else
607
+ raise UnknownTokenError.new('CharacterType', token)
624
608
  end
625
- node.add_sequence(active_opts)
626
- end
627
-
628
- def active_opts
629
- options_stack.last
630
- end
631
-
632
- def total_captured_group_count
633
- captured_group_counts.values.reduce(0, :+)
634
- end
635
-
636
- def captured_group_count_at_level
637
- captured_group_counts[node.level]
638
609
  end
639
610
 
640
- def count_captured_group
641
- captured_group_counts[node.level] += 1
611
+ def close_completed_character_set_range
612
+ decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
642
613
  end
643
614
 
644
- def assign_effective_number(exp)
645
- exp.effective_number =
646
- exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
615
+ def active_opts
616
+ options_stack.last
647
617
  end
648
618
 
619
+ # Assigns referenced expressions to referring expressions, e.g. if there is
620
+ # an instance of Backreference::Number, its #referenced_expression is set to
621
+ # the instance of Group::Capture that it refers to via its number.
649
622
  def assign_referenced_expressions
650
623
  targets = {}
624
+ # find all referencable expressions
651
625
  root.each_expression do |exp|
652
626
  exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
653
627
  end
628
+ # assign them to any referring expressions
654
629
  root.each_expression do |exp|
655
630
  exp.respond_to?(:reference) &&
656
631
  exp.referenced_expression = targets[exp.reference]