regexp_parser 2.0.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +54 -0
  3. data/Gemfile +5 -1
  4. data/README.md +15 -21
  5. data/Rakefile +11 -17
  6. data/lib/regexp_parser/error.rb +4 -0
  7. data/lib/regexp_parser/expression/base.rb +123 -0
  8. data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
  9. data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +5 -0
  10. data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
  11. data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -1
  12. data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +0 -0
  13. data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
  14. data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +1 -0
  15. data/lib/regexp_parser/expression/classes/free_space.rb +1 -3
  16. data/lib/regexp_parser/expression/classes/group.rb +6 -1
  17. data/lib/regexp_parser/expression/classes/literal.rb +1 -5
  18. data/lib/regexp_parser/expression/classes/property.rb +1 -3
  19. data/lib/regexp_parser/expression/classes/root.rb +0 -1
  20. data/lib/regexp_parser/expression/classes/type.rb +0 -2
  21. data/lib/regexp_parser/expression/quantifier.rb +2 -2
  22. data/lib/regexp_parser/expression/sequence.rb +3 -10
  23. data/lib/regexp_parser/expression/subexpression.rb +1 -2
  24. data/lib/regexp_parser/expression.rb +7 -130
  25. data/lib/regexp_parser/lexer.rb +7 -5
  26. data/lib/regexp_parser/parser.rb +282 -334
  27. data/lib/regexp_parser/scanner/properties/long.yml +13 -0
  28. data/lib/regexp_parser/scanner/properties/short.yml +9 -1
  29. data/lib/regexp_parser/scanner/scanner.rl +64 -87
  30. data/lib/regexp_parser/scanner.rb +1024 -1073
  31. data/lib/regexp_parser/syntax/any.rb +2 -4
  32. data/lib/regexp_parser/syntax/base.rb +10 -10
  33. data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
  34. data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
  35. data/lib/regexp_parser/syntax/{tokens/backref.rb → token/backreference.rb} +6 -5
  36. data/lib/regexp_parser/syntax/{tokens → token}/character_set.rb +2 -2
  37. data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
  38. data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
  39. data/lib/regexp_parser/syntax/token/escape.rb +31 -0
  40. data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
  41. data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
  42. data/lib/regexp_parser/syntax/{tokens → token}/meta.rb +2 -2
  43. data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
  44. data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
  45. data/lib/regexp_parser/syntax/token/unicode_property.rb +696 -0
  46. data/lib/regexp_parser/syntax/token.rb +45 -0
  47. data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
  48. data/lib/regexp_parser/syntax/versions/1.8.6.rb +1 -1
  49. data/lib/regexp_parser/syntax/versions/3.1.0.rb +10 -0
  50. data/lib/regexp_parser/syntax.rb +8 -6
  51. data/lib/regexp_parser/token.rb +9 -20
  52. data/lib/regexp_parser/version.rb +1 -1
  53. data/lib/regexp_parser.rb +0 -2
  54. data/spec/expression/clone_spec.rb +36 -4
  55. data/spec/expression/free_space_spec.rb +2 -2
  56. data/spec/expression/methods/match_length_spec.rb +2 -2
  57. data/spec/lexer/nesting_spec.rb +2 -2
  58. data/spec/lexer/refcalls_spec.rb +5 -0
  59. data/spec/parser/all_spec.rb +2 -2
  60. data/spec/parser/escapes_spec.rb +43 -31
  61. data/spec/parser/properties_spec.rb +6 -4
  62. data/spec/parser/refcalls_spec.rb +5 -0
  63. data/spec/parser/set/ranges_spec.rb +26 -16
  64. data/spec/scanner/escapes_spec.rb +29 -20
  65. data/spec/scanner/refcalls_spec.rb +19 -0
  66. data/spec/scanner/sets_spec.rb +66 -23
  67. data/spec/spec_helper.rb +13 -1
  68. data/spec/support/capturing_stderr.rb +9 -0
  69. data/spec/syntax/versions/1.8.6_spec.rb +2 -2
  70. data/spec/syntax/versions/2.0.0_spec.rb +2 -2
  71. data/spec/syntax/versions/aliases_spec.rb +1 -0
  72. metadata +27 -26
  73. data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
  74. data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
  75. data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
  76. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
  77. data/lib/regexp_parser/syntax/tokens.rb +0 -45
  78. data/spec/support/runner.rb +0 -42
  79. data/spec/support/warning_extractor.rb +0 -60
@@ -1,10 +1,10 @@
1
+ require 'regexp_parser/error'
1
2
  require 'regexp_parser/expression'
2
3
 
3
4
  class Regexp::Parser
4
5
  include Regexp::Expression
5
- include Regexp::Syntax
6
6
 
7
- class ParserError < StandardError; end
7
+ class ParserError < Regexp::Parser::Error; end
8
8
 
9
9
  class UnknownTokenTypeError < ParserError
10
10
  def initialize(type, token)
@@ -70,95 +70,155 @@ class Regexp::Parser
70
70
  enabled_options
71
71
  end
72
72
 
73
- def nest(exp)
74
- nesting.push(exp)
75
- node << exp
76
- update_transplanted_subtree(exp, node)
77
- self.node = exp
78
- end
73
+ def parse_token(token)
74
+ case token.type
75
+ when :anchor; anchor(token)
76
+ when :assertion, :group; group(token)
77
+ when :backref; backref(token)
78
+ when :conditional; conditional(token)
79
+ when :escape; escape(token)
80
+ when :free_space; free_space(token)
81
+ when :keep; keep(token)
82
+ when :literal; literal(token)
83
+ when :meta; meta(token)
84
+ when :posixclass, :nonposixclass; posixclass(token)
85
+ when :property, :nonproperty; property(token)
86
+ when :quantifier; quantifier(token)
87
+ when :set; set(token)
88
+ when :type; type(token)
89
+ else
90
+ raise UnknownTokenTypeError.new(token.type, token)
91
+ end
79
92
 
80
- # subtrees are transplanted to build Alternations, Intersections, Ranges
81
- def update_transplanted_subtree(exp, new_parent)
82
- exp.nesting_level = new_parent.nesting_level + 1
83
- exp.respond_to?(:each) &&
84
- exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
93
+ close_completed_character_set_range
85
94
  end
86
95
 
87
- def decrease_nesting
88
- while nesting.last.is_a?(SequenceOperation)
89
- nesting.pop
90
- self.node = nesting.last
96
+ def anchor(token)
97
+ case token.token
98
+ when :bol; node << Anchor::BeginningOfLine.new(token, active_opts)
99
+ when :bos; node << Anchor::BOS.new(token, active_opts)
100
+ when :eol; node << Anchor::EndOfLine.new(token, active_opts)
101
+ when :eos; node << Anchor::EOS.new(token, active_opts)
102
+ when :eos_ob_eol; node << Anchor::EOSobEOL.new(token, active_opts)
103
+ when :match_start; node << Anchor::MatchStart.new(token, active_opts)
104
+ when :nonword_boundary; node << Anchor::NonWordBoundary.new(token, active_opts)
105
+ when :word_boundary; node << Anchor::WordBoundary.new(token, active_opts)
106
+ else
107
+ raise UnknownTokenError.new('Anchor', token)
91
108
  end
92
- nesting.pop
93
- yield(node) if block_given?
94
- self.node = nesting.last
95
- self.node = node.last if node.last.is_a?(SequenceOperation)
96
109
  end
97
110
 
98
- def nest_conditional(exp)
99
- conditional_nesting.push(exp)
100
- nest(exp)
111
+ def group(token)
112
+ case token.token
113
+ when :options, :options_switch
114
+ options_group(token)
115
+ when :close
116
+ close_group
117
+ when :comment
118
+ node << Group::Comment.new(token, active_opts)
119
+ else
120
+ open_group(token)
121
+ end
101
122
  end
102
123
 
103
- def parse_token(token)
104
- close_completed_character_set_range
124
+ MOD_FLAGS = %w[i m x].map(&:to_sym)
125
+ ENC_FLAGS = %w[a d u].map(&:to_sym)
105
126
 
106
- case token.type
107
- when :meta; meta(token)
108
- when :quantifier; quantifier(token)
109
- when :anchor; anchor(token)
110
- when :escape; escape(token)
111
- when :group; group(token)
112
- when :assertion; group(token)
113
- when :set; set(token)
114
- when :type; type(token)
115
- when :backref; backref(token)
116
- when :conditional; conditional(token)
117
- when :keep; keep(token)
118
-
119
- when :posixclass, :nonposixclass
120
- posixclass(token)
121
- when :property, :nonproperty
122
- property(token)
123
-
124
- when :literal
125
- node << Literal.new(token, active_opts)
126
- when :free_space
127
- free_space(token)
127
+ def options_group(token)
128
+ positive, negative = token.text.split('-', 2)
129
+ negative ||= ''
130
+ self.switching_options = token.token.equal?(:options_switch)
128
131
 
129
- else
130
- raise UnknownTokenTypeError.new(token.type, token)
132
+ opt_changes = {}
133
+ new_active_opts = active_opts.dup
134
+
135
+ MOD_FLAGS.each do |flag|
136
+ if positive.include?(flag.to_s)
137
+ opt_changes[flag] = new_active_opts[flag] = true
138
+ end
139
+ if negative.include?(flag.to_s)
140
+ opt_changes[flag] = false
141
+ new_active_opts.delete(flag)
142
+ end
143
+ end
144
+
145
+ if (enc_flag = positive.reverse[/[adu]/])
146
+ enc_flag = enc_flag.to_sym
147
+ (ENC_FLAGS - [enc_flag]).each do |other|
148
+ opt_changes[other] = false if new_active_opts[other]
149
+ new_active_opts.delete(other)
150
+ end
151
+ opt_changes[enc_flag] = new_active_opts[enc_flag] = true
131
152
  end
153
+
154
+ options_stack << new_active_opts
155
+
156
+ options_group = Group::Options.new(token, active_opts)
157
+ options_group.option_changes = opt_changes
158
+
159
+ nest(options_group)
132
160
  end
133
161
 
134
- def set(token)
135
- case token.token
136
- when :open
137
- open_set(token)
138
- when :close
139
- close_set
140
- when :negate
141
- negate_set
142
- when :range
143
- range(token)
144
- when :intersection
145
- intersection(token)
146
- when :collation, :equivalent
147
- node << Literal.new(token, active_opts)
148
- else
149
- raise UnknownTokenError.new('CharacterSet', token)
162
+ def open_group(token)
163
+ group_class =
164
+ case token.token
165
+ when :absence; Group::Absence
166
+ when :atomic; Group::Atomic
167
+ when :capture; Group::Capture
168
+ when :named; Group::Named
169
+ when :passive; Group::Passive
170
+
171
+ when :lookahead; Assertion::Lookahead
172
+ when :lookbehind; Assertion::Lookbehind
173
+ when :nlookahead; Assertion::NegativeLookahead
174
+ when :nlookbehind; Assertion::NegativeLookbehind
175
+
176
+ else
177
+ raise UnknownTokenError.new('Group type open', token)
178
+ end
179
+
180
+ group = group_class.new(token, active_opts)
181
+
182
+ if group.capturing?
183
+ group.number = total_captured_group_count + 1
184
+ group.number_at_level = captured_group_count_at_level + 1
185
+ count_captured_group
150
186
  end
187
+
188
+ # Push the active options to the stack again. This way we can simply pop the
189
+ # stack for any group we close, no matter if it had its own options or not.
190
+ options_stack << active_opts
191
+
192
+ nest(group)
151
193
  end
152
194
 
153
- def meta(token)
154
- case token.token
155
- when :dot
156
- node << CharacterType::Any.new(token, active_opts)
157
- when :alternation
158
- sequence_operation(Alternation, token)
159
- else
160
- raise UnknownTokenError.new('Meta', token)
195
+ def total_captured_group_count
196
+ captured_group_counts.values.reduce(0, :+)
197
+ end
198
+
199
+ def captured_group_count_at_level
200
+ captured_group_counts[node.level]
201
+ end
202
+
203
+ def count_captured_group
204
+ captured_group_counts[node.level] += 1
205
+ end
206
+
207
+ def close_group
208
+ options_stack.pop unless switching_options
209
+ self.switching_options = false
210
+ decrease_nesting
211
+ end
212
+
213
+ def decrease_nesting
214
+ while nesting.last.is_a?(SequenceOperation)
215
+ nesting.pop
216
+ self.node = nesting.last
161
217
  end
218
+ nesting.pop
219
+ yield(node) if block_given?
220
+ self.node = nesting.last
221
+ self.node = node.last if node.last.is_a?(SequenceOperation)
162
222
  end
163
223
 
164
224
  def backref(token)
@@ -188,31 +248,9 @@ class Regexp::Parser
188
248
  end
189
249
  end
190
250
 
191
- def type(token)
192
- case token.token
193
- when :digit
194
- node << CharacterType::Digit.new(token, active_opts)
195
- when :nondigit
196
- node << CharacterType::NonDigit.new(token, active_opts)
197
- when :hex
198
- node << CharacterType::Hex.new(token, active_opts)
199
- when :nonhex
200
- node << CharacterType::NonHex.new(token, active_opts)
201
- when :space
202
- node << CharacterType::Space.new(token, active_opts)
203
- when :nonspace
204
- node << CharacterType::NonSpace.new(token, active_opts)
205
- when :word
206
- node << CharacterType::Word.new(token, active_opts)
207
- when :nonword
208
- node << CharacterType::NonWord.new(token, active_opts)
209
- when :linebreak
210
- node << CharacterType::Linebreak.new(token, active_opts)
211
- when :xgrapheme
212
- node << CharacterType::ExtendedGrapheme.new(token, active_opts)
213
- else
214
- raise UnknownTokenError.new('CharacterType', token)
215
- end
251
+ def assign_effective_number(exp)
252
+ exp.effective_number =
253
+ exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
216
254
  end
217
255
 
218
256
  def conditional(token)
@@ -240,11 +278,118 @@ class Regexp::Parser
240
278
  end
241
279
  end
242
280
 
281
+ def nest_conditional(exp)
282
+ conditional_nesting.push(exp)
283
+ nest(exp)
284
+ end
285
+
286
+ def nest(exp)
287
+ nesting.push(exp)
288
+ node << exp
289
+ update_transplanted_subtree(exp, node)
290
+ self.node = exp
291
+ end
292
+
293
+ # subtrees are transplanted to build Alternations, Intersections, Ranges
294
+ def update_transplanted_subtree(exp, new_parent)
295
+ exp.nesting_level = new_parent.nesting_level + 1
296
+ exp.respond_to?(:each) &&
297
+ exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
298
+ end
299
+
300
+ def escape(token)
301
+ case token.token
302
+
303
+ when :backspace; node << EscapeSequence::Backspace.new(token, active_opts)
304
+
305
+ when :escape; node << EscapeSequence::AsciiEscape.new(token, active_opts)
306
+ when :bell; node << EscapeSequence::Bell.new(token, active_opts)
307
+ when :form_feed; node << EscapeSequence::FormFeed.new(token, active_opts)
308
+ when :newline; node << EscapeSequence::Newline.new(token, active_opts)
309
+ when :carriage; node << EscapeSequence::Return.new(token, active_opts)
310
+ when :tab; node << EscapeSequence::Tab.new(token, active_opts)
311
+ when :vertical_tab; node << EscapeSequence::VerticalTab.new(token, active_opts)
312
+
313
+ when :codepoint; node << EscapeSequence::Codepoint.new(token, active_opts)
314
+ when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
315
+ when :hex; node << EscapeSequence::Hex.new(token, active_opts)
316
+ when :octal; node << EscapeSequence::Octal.new(token, active_opts)
317
+
318
+ when :control
319
+ if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
320
+ node << EscapeSequence::MetaControl.new(token, active_opts)
321
+ else
322
+ node << EscapeSequence::Control.new(token, active_opts)
323
+ end
324
+
325
+ when :meta_sequence
326
+ if token.text =~ /\A\\M-\\[Cc]/
327
+ node << EscapeSequence::MetaControl.new(token, active_opts)
328
+ else
329
+ node << EscapeSequence::Meta.new(token, active_opts)
330
+ end
331
+
332
+ else
333
+ # treating everything else as a literal
334
+ # TODO: maybe split this up a bit more in v3.0.0?
335
+ # E.g. escaped quantifiers or set meta chars are not the same
336
+ # as stuff that would be a literal even without the backslash.
337
+ # Right now, they all end up here.
338
+ node << EscapeSequence::Literal.new(token, active_opts)
339
+ end
340
+ end
341
+
342
+ def free_space(token)
343
+ case token.token
344
+ when :comment
345
+ node << Comment.new(token, active_opts)
346
+ when :whitespace
347
+ if node.last.is_a?(WhiteSpace)
348
+ node.last.merge(WhiteSpace.new(token, active_opts))
349
+ else
350
+ node << WhiteSpace.new(token, active_opts)
351
+ end
352
+ else
353
+ raise UnknownTokenError.new('FreeSpace', token)
354
+ end
355
+ end
356
+
357
+ def keep(token)
358
+ node << Keep::Mark.new(token, active_opts)
359
+ end
360
+
361
+ def literal(token)
362
+ node << Literal.new(token, active_opts)
363
+ end
364
+
365
+ def meta(token)
366
+ case token.token
367
+ when :dot
368
+ node << CharacterType::Any.new(token, active_opts)
369
+ when :alternation
370
+ sequence_operation(Alternation, token)
371
+ else
372
+ raise UnknownTokenError.new('Meta', token)
373
+ end
374
+ end
375
+
376
+ def sequence_operation(klass, token)
377
+ unless node.is_a?(klass)
378
+ operator = klass.new(token, active_opts)
379
+ sequence = operator.add_sequence(active_opts)
380
+ sequence.expressions = node.expressions
381
+ node.expressions = []
382
+ nest(operator)
383
+ end
384
+ node.add_sequence(active_opts)
385
+ end
386
+
243
387
  def posixclass(token)
244
388
  node << PosixClass.new(token, active_opts)
245
389
  end
246
390
 
247
391
  include Regexp::Expression::UnicodeProperty
392
+ UPTokens = Regexp::Syntax::Token::UnicodeProperty
248
393
 
249
394
  def property(token)
250
395
  case token.token
@@ -316,127 +461,20 @@ class Regexp::Parser
316
461
  when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
317
462
  when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
318
463
 
319
- when *Token::UnicodeProperty::Age
320
- node << Age.new(token, active_opts)
321
-
322
- when *Token::UnicodeProperty::Derived
323
- node << Derived.new(token, active_opts)
324
-
325
- when *Token::UnicodeProperty::Emoji
326
- node << Emoji.new(token, active_opts)
327
-
328
- when *Token::UnicodeProperty::Script
329
- node << Script.new(token, active_opts)
330
-
331
- when *Token::UnicodeProperty::UnicodeBlock
332
- node << Block.new(token, active_opts)
464
+ when *UPTokens::Age; node << Age.new(token, active_opts)
465
+ when *UPTokens::Derived; node << Derived.new(token, active_opts)
466
+ when *UPTokens::Emoji; node << Emoji.new(token, active_opts)
467
+ when *UPTokens::Script; node << Script.new(token, active_opts)
468
+ when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts)
333
469
 
334
470
  else
335
471
  raise UnknownTokenError.new('UnicodeProperty', token)
336
472
  end
337
473
  end
338
474
 
339
- def anchor(token)
340
- case token.token
341
- when :bol
342
- node << Anchor::BeginningOfLine.new(token, active_opts)
343
- when :eol
344
- node << Anchor::EndOfLine.new(token, active_opts)
345
- when :bos
346
- node << Anchor::BOS.new(token, active_opts)
347
- when :eos
348
- node << Anchor::EOS.new(token, active_opts)
349
- when :eos_ob_eol
350
- node << Anchor::EOSobEOL.new(token, active_opts)
351
- when :word_boundary
352
- node << Anchor::WordBoundary.new(token, active_opts)
353
- when :nonword_boundary
354
- node << Anchor::NonWordBoundary.new(token, active_opts)
355
- when :match_start
356
- node << Anchor::MatchStart.new(token, active_opts)
357
- else
358
- raise UnknownTokenError.new('Anchor', token)
359
- end
360
- end
361
-
362
- def escape(token)
363
- case token.token
364
-
365
- when :backspace
366
- node << EscapeSequence::Backspace.new(token, active_opts)
367
-
368
- when :escape
369
- node << EscapeSequence::AsciiEscape.new(token, active_opts)
370
- when :bell
371
- node << EscapeSequence::Bell.new(token, active_opts)
372
- when :form_feed
373
- node << EscapeSequence::FormFeed.new(token, active_opts)
374
- when :newline
375
- node << EscapeSequence::Newline.new(token, active_opts)
376
- when :carriage
377
- node << EscapeSequence::Return.new(token, active_opts)
378
- when :tab
379
- node << EscapeSequence::Tab.new(token, active_opts)
380
- when :vertical_tab
381
- node << EscapeSequence::VerticalTab.new(token, active_opts)
382
-
383
- when :hex
384
- node << EscapeSequence::Hex.new(token, active_opts)
385
- when :octal
386
- node << EscapeSequence::Octal.new(token, active_opts)
387
- when :codepoint
388
- node << EscapeSequence::Codepoint.new(token, active_opts)
389
- when :codepoint_list
390
- node << EscapeSequence::CodepointList.new(token, active_opts)
391
-
392
- when :control
393
- if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
394
- node << EscapeSequence::MetaControl.new(token, active_opts)
395
- else
396
- node << EscapeSequence::Control.new(token, active_opts)
397
- end
398
-
399
- when :meta_sequence
400
- if token.text =~ /\A\\M-\\[Cc]/
401
- node << EscapeSequence::MetaControl.new(token, active_opts)
402
- else
403
- node << EscapeSequence::Meta.new(token, active_opts)
404
- end
405
-
406
- else
407
- # treating everything else as a literal
408
- node << EscapeSequence::Literal.new(token, active_opts)
409
- end
410
- end
411
-
412
- def keep(token)
413
- node << Keep::Mark.new(token, active_opts)
414
- end
415
-
416
- def free_space(token)
417
- case token.token
418
- when :comment
419
- node << Comment.new(token, active_opts)
420
- when :whitespace
421
- if node.last.is_a?(WhiteSpace)
422
- node.last.merge(WhiteSpace.new(token, active_opts))
423
- else
424
- node << WhiteSpace.new(token, active_opts)
425
- end
426
- else
427
- raise UnknownTokenError.new('FreeSpace', token)
428
- end
429
- end
430
-
431
475
  def quantifier(token)
432
- offset = -1
433
- target_node = node.expressions[offset]
434
- while target_node.is_a?(FreeSpace)
435
- target_node = node.expressions[offset -= 1]
436
- end
437
-
438
- target_node || raise(ArgumentError, 'No valid target found for '\
439
- "'#{token.text}' ")
476
+ target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
477
+ target_node or raise ParserError, "No valid target found for '#{token.text}'"
440
478
 
441
479
  # in case of chained quantifiers, wrap target in an implicit passive group
442
480
  # description of the problem: https://github.com/ammar/regexp_parser/issues/3
@@ -456,7 +494,7 @@ class Regexp::Parser
456
494
  new_group.implicit = true
457
495
  new_group << target_node
458
496
  increase_level(target_node)
459
- node.expressions[offset] = new_group
497
+ node.expressions[node.expressions.index(target_node)] = new_group
460
498
  target_node = new_group
461
499
  end
462
500
 
@@ -517,100 +555,16 @@ class Regexp::Parser
517
555
  target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
518
556
  end
519
557
 
520
- def group(token)
521
- case token.token
522
- when :options, :options_switch
523
- options_group(token)
524
- when :close
525
- close_group
526
- when :comment
527
- node << Group::Comment.new(token, active_opts)
528
- else
529
- open_group(token)
530
- end
531
- end
532
-
533
- MOD_FLAGS = %w[i m x].map(&:to_sym)
534
- ENC_FLAGS = %w[a d u].map(&:to_sym)
535
-
536
- def options_group(token)
537
- positive, negative = token.text.split('-', 2)
538
- negative ||= ''
539
- self.switching_options = token.token.equal?(:options_switch)
540
-
541
- opt_changes = {}
542
- new_active_opts = active_opts.dup
543
-
544
- MOD_FLAGS.each do |flag|
545
- if positive.include?(flag.to_s)
546
- opt_changes[flag] = new_active_opts[flag] = true
547
- end
548
- if negative.include?(flag.to_s)
549
- opt_changes[flag] = false
550
- new_active_opts.delete(flag)
551
- end
552
- end
553
-
554
- if (enc_flag = positive.reverse[/[adu]/])
555
- enc_flag = enc_flag.to_sym
556
- (ENC_FLAGS - [enc_flag]).each do |other|
557
- opt_changes[other] = false if new_active_opts[other]
558
- new_active_opts.delete(other)
559
- end
560
- opt_changes[enc_flag] = new_active_opts[enc_flag] = true
561
- end
562
-
563
- options_stack << new_active_opts
564
-
565
- options_group = Group::Options.new(token, active_opts)
566
- options_group.option_changes = opt_changes
567
-
568
- nest(options_group)
569
- end
570
-
571
- def open_group(token)
558
+ def set(token)
572
559
  case token.token
573
- when :passive
574
- exp = Group::Passive.new(token, active_opts)
575
- when :atomic
576
- exp = Group::Atomic.new(token, active_opts)
577
- when :named
578
- exp = Group::Named.new(token, active_opts)
579
- when :capture
580
- exp = Group::Capture.new(token, active_opts)
581
- when :absence
582
- exp = Group::Absence.new(token, active_opts)
583
-
584
- when :lookahead
585
- exp = Assertion::Lookahead.new(token, active_opts)
586
- when :nlookahead
587
- exp = Assertion::NegativeLookahead.new(token, active_opts)
588
- when :lookbehind
589
- exp = Assertion::Lookbehind.new(token, active_opts)
590
- when :nlookbehind
591
- exp = Assertion::NegativeLookbehind.new(token, active_opts)
592
-
560
+ when :open; open_set(token)
561
+ when :close; close_set
562
+ when :negate; negate_set
563
+ when :range; range(token)
564
+ when :intersection; intersection(token)
593
565
  else
594
- raise UnknownTokenError.new('Group type open', token)
595
- end
596
-
597
- if exp.capturing?
598
- exp.number = total_captured_group_count + 1
599
- exp.number_at_level = captured_group_count_at_level + 1
600
- count_captured_group
566
+ raise UnknownTokenError.new('CharacterSet', token)
601
567
  end
602
-
603
- # Push the active options to the stack again. This way we can simply pop the
604
- # stack for any group we close, no matter if it had its own options or not.
605
- options_stack << active_opts
606
-
607
- nest(exp)
608
- end
609
-
610
- def close_group
611
- options_stack.pop unless switching_options
612
- self.switching_options = false
613
- decrease_nesting
614
568
  end
615
569
 
616
570
  def open_set(token)
@@ -633,51 +587,45 @@ class Regexp::Parser
633
587
  nest(exp)
634
588
  end
635
589
 
636
- def close_completed_character_set_range
637
- decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
638
- end
639
-
640
590
  def intersection(token)
641
591
  sequence_operation(CharacterSet::Intersection, token)
642
592
  end
643
593
 
644
- def sequence_operation(klass, token)
645
- unless node.is_a?(klass)
646
- operator = klass.new(token, active_opts)
647
- sequence = operator.add_sequence(active_opts)
648
- sequence.expressions = node.expressions
649
- node.expressions = []
650
- nest(operator)
594
+ def type(token)
595
+ case token.token
596
+ when :digit; node << CharacterType::Digit.new(token, active_opts)
597
+ when :hex; node << CharacterType::Hex.new(token, active_opts)
598
+ when :linebreak; node << CharacterType::Linebreak.new(token, active_opts)
599
+ when :nondigit; node << CharacterType::NonDigit.new(token, active_opts)
600
+ when :nonhex; node << CharacterType::NonHex.new(token, active_opts)
601
+ when :nonspace; node << CharacterType::NonSpace.new(token, active_opts)
602
+ when :nonword; node << CharacterType::NonWord.new(token, active_opts)
603
+ when :space; node << CharacterType::Space.new(token, active_opts)
604
+ when :word; node << CharacterType::Word.new(token, active_opts)
605
+ when :xgrapheme; node << CharacterType::ExtendedGrapheme.new(token, active_opts)
606
+ else
607
+ raise UnknownTokenError.new('CharacterType', token)
651
608
  end
652
- node.add_sequence(active_opts)
653
- end
654
-
655
- def active_opts
656
- options_stack.last
657
- end
658
-
659
- def total_captured_group_count
660
- captured_group_counts.values.reduce(0, :+)
661
- end
662
-
663
- def captured_group_count_at_level
664
- captured_group_counts[node.level]
665
609
  end
666
610
 
667
- def count_captured_group
668
- captured_group_counts[node.level] += 1
611
+ def close_completed_character_set_range
612
+ decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
669
613
  end
670
614
 
671
- def assign_effective_number(exp)
672
- exp.effective_number =
673
- exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
615
+ def active_opts
616
+ options_stack.last
674
617
  end
675
618
 
619
+ # Assigns referenced expressions to refering expressions, e.g. if there is
620
+ # an instance of Backreference::Number, its #referenced_expression is set to
621
+ # the instance of Group::Capture that it refers to via its number.
676
622
  def assign_referenced_expressions
677
623
  targets = {}
624
+ # find all referencable expressions
678
625
  root.each_expression do |exp|
679
626
  exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
680
627
  end
628
+ # assign them to any refering expressions
681
629
  root.each_expression do |exp|
682
630
  exp.respond_to?(:reference) &&
683
631
  exp.referenced_expression = targets[exp.reference]