regexp_parser 2.6.0 → 2.9.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +5 -5
  3. data/LICENSE +1 -1
  4. data/lib/regexp_parser/expression/base.rb +0 -7
  5. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  6. data/lib/regexp_parser/expression/classes/backreference.rb +17 -3
  7. data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
  8. data/lib/regexp_parser/expression/classes/character_set.rb +4 -8
  9. data/lib/regexp_parser/expression/classes/conditional.rb +2 -6
  10. data/lib/regexp_parser/expression/classes/escape_sequence.rb +3 -1
  11. data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
  12. data/lib/regexp_parser/expression/classes/group.rb +0 -22
  13. data/lib/regexp_parser/expression/classes/keep.rb +1 -1
  14. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
  15. data/lib/regexp_parser/expression/classes/unicode_property.rb +11 -11
  16. data/lib/regexp_parser/expression/methods/construct.rb +2 -4
  17. data/lib/regexp_parser/expression/methods/match_length.rb +8 -4
  18. data/lib/regexp_parser/expression/methods/negative.rb +20 -0
  19. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  20. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  21. data/lib/regexp_parser/expression/methods/tests.rb +40 -3
  22. data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
  23. data/lib/regexp_parser/expression/quantifier.rb +30 -17
  24. data/lib/regexp_parser/expression/sequence.rb +5 -10
  25. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  26. data/lib/regexp_parser/expression/shared.rb +37 -20
  27. data/lib/regexp_parser/expression/subexpression.rb +20 -15
  28. data/lib/regexp_parser/expression.rb +34 -31
  29. data/lib/regexp_parser/lexer.rb +76 -36
  30. data/lib/regexp_parser/parser.rb +101 -100
  31. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  32. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  33. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  34. data/lib/regexp_parser/scanner/properties/long.csv +29 -0
  35. data/lib/regexp_parser/scanner/properties/short.csv +3 -0
  36. data/lib/regexp_parser/scanner/property.rl +2 -2
  37. data/lib/regexp_parser/scanner/scanner.rl +101 -172
  38. data/lib/regexp_parser/scanner.rb +1132 -1283
  39. data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
  40. data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
  41. data/lib/regexp_parser/syntax/token/escape.rb +3 -1
  42. data/lib/regexp_parser/syntax/token/meta.rb +9 -2
  43. data/lib/regexp_parser/syntax/token/unicode_property.rb +35 -1
  44. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  45. data/lib/regexp_parser/syntax/token.rb +13 -13
  46. data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
  47. data/lib/regexp_parser/syntax/versions.rb +3 -1
  48. data/lib/regexp_parser/syntax.rb +1 -1
  49. data/lib/regexp_parser/version.rb +1 -1
  50. data/lib/regexp_parser.rb +6 -6
  51. data/regexp_parser.gemspec +5 -5
  52. metadata +14 -8
  53. data/CHANGELOG.md +0 -601
  54. data/README.md +0 -503
@@ -30,11 +30,6 @@
30
30
 
31
31
  class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
32
32
 
33
-
34
- # these are not supported in ruby at the moment
35
- collating_sequence = '[.' . (alpha | [\-])+ . '.]';
36
- character_equivalent = '[=' . alpha . '=]';
37
-
38
33
  line_anchor = beginning_of_line | end_of_line;
39
34
  anchor_char = [AbBzZG];
40
35
 
@@ -59,9 +54,6 @@
59
54
  one_or_more = '+' | '+?' | '++';
60
55
 
61
56
  quantifier_greedy = '?' | '*' | '+';
62
- quantifier_reluctant = '??' | '*?' | '+?';
63
- quantifier_possessive = '?+' | '*+' | '++';
64
- quantifier_mode = '?' | '+';
65
57
 
66
58
  quantity_exact = (digit+);
67
59
  quantity_minimum = (digit+) . ',';
@@ -70,9 +62,6 @@
70
62
  quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
71
63
  quantity_maximum | quantity_range ) . range_close;
72
64
 
73
- quantifiers = quantifier_greedy | quantifier_reluctant |
74
- quantifier_possessive | quantifier_interval;
75
-
76
65
  conditional = '(?(';
77
66
 
78
67
  group_comment = '?#' . [^)]* . group_close;
@@ -89,10 +78,9 @@
89
78
  # try to treat every other group head as options group, like Ruby
90
79
  group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
91
80
 
92
- group_ref = [gk];
93
- group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
94
- group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
95
- group_number = '-'? . [1-9] . [0-9]*;
81
+ group_name_id_ab = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
82
+ group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
83
+ group_number = '-'? . [0-9]+;
96
84
  group_level = [+\-] . [0-9]+;
97
85
 
98
86
  group_name = ('<' . group_name_id_ab? . '>') |
@@ -101,15 +89,11 @@
101
89
 
102
90
  group_named = ('?' . group_name );
103
91
 
104
- group_name_backref = 'k' . (('<' . group_name_id_ab? . group_level? '>') |
105
- ("'" . group_name_id_sq? . group_level? "'"));
106
- group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
107
- ("'" . group_name_id_sq? . group_level? "'"));
92
+ group_ref_body = (('<' . (group_name_id_ab? | group_number) . group_level? '>') |
93
+ ("'" . (group_name_id_sq? | group_number) . group_level? "'"));
108
94
 
109
- group_number_backref = 'k' . (('<' . group_number . group_level? '>') |
110
- ("'" . group_number . group_level? "'"));
111
- group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
112
- ("'" . ((group_number . group_level?) | '0') "'"));
95
+ group_ref = 'k' . group_ref_body;
96
+ group_call = 'g' . group_ref_body;
113
97
 
114
98
  group_type = group_atomic | group_passive | group_absence | group_named;
115
99
 
@@ -132,20 +116,21 @@
132
116
  keep_mark | sequence_char;
133
117
 
134
118
  # escapes that also work within a character set
135
- set_escape = backslash | brackets | escaped_ascii | property_char |
119
+ set_escape = backslash | brackets | escaped_ascii |
120
+ octal_sequence | property_char |
136
121
  sequence_char | single_codepoint_char_type;
137
122
 
138
123
 
139
124
  # EOF error, used where it can be detected
140
125
  action premature_end_error {
141
126
  text = copy(data, ts ? ts-1 : 0, -1)
142
- raise PrematureEndError.new( text )
127
+ raise PrematureEndError.new(text)
143
128
  }
144
129
 
145
130
  # Invalid sequence error, used from sequences, like escapes and sets
146
131
  action invalid_sequence_error {
147
132
  text = copy(data, ts ? ts-1 : 0, -1)
148
- validation_error(:sequence, 'sequence', text)
133
+ raise ValidationError.for(:sequence, 'sequence', text)
149
134
  }
150
135
 
151
136
  # group (nesting) and set open/close actions
@@ -168,8 +153,8 @@
168
153
  };
169
154
 
170
155
  '-]' @set_closed { # special case, emits two tokens
171
- emit(:literal, :literal, copy(data, ts, te-1))
172
- emit(:set, :close, copy(data, ts+1, te))
156
+ emit(:literal, :literal, '-')
157
+ emit(:set, :close, ']')
173
158
  if in_set?
174
159
  fret;
175
160
  else
@@ -183,28 +168,27 @@
183
168
  };
184
169
 
185
170
  '^' {
186
- text = copy(data, ts, te)
187
- if tokens.last[1] == :open
188
- emit(:set, :negate, text)
171
+ if prev_token[1] == :open
172
+ emit(:set, :negate, '^')
189
173
  else
190
- emit(:literal, :literal, text)
174
+ emit(:literal, :literal, '^')
191
175
  end
192
176
  };
193
177
 
194
178
  '-' {
195
- text = copy(data, ts, te)
196
- # ranges cant start with a subset or intersection/negation/range operator
197
- if tokens.last[0] == :set
198
- emit(:literal, :literal, text)
179
+ # ranges cant start with the opening bracket, a subset, or
180
+ # intersection/negation/range operators
181
+ if prev_token[0] == :set
182
+ emit(:literal, :literal, '-')
199
183
  else
200
- emit(:set, :range, text)
184
+ emit(:set, :range, '-')
201
185
  end
202
186
  };
203
187
 
204
188
  # Unlike ranges, intersections can start or end at set boundaries, whereupon
205
189
  # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
206
190
  '&&' {
207
- emit(:set, :intersection, copy(data, ts, te))
191
+ emit(:set, :intersection, '&&')
208
192
  };
209
193
 
210
194
  backslash {
@@ -212,7 +196,7 @@
212
196
  };
213
197
 
214
198
  set_open >(open_bracket, 1) >set_opened {
215
- emit(:set, :open, copy(data, ts, te))
199
+ emit(:set, :open, '[')
216
200
  fcall character_set;
217
201
  };
218
202
 
@@ -227,20 +211,12 @@
227
211
  end
228
212
 
229
213
  unless self.class.posix_classes.include?(class_name)
230
- validation_error(:posix_class, text)
214
+ raise ValidationError.for(:posix_class, text)
231
215
  end
232
216
 
233
217
  emit(type, class_name.to_sym, text)
234
218
  };
235
219
 
236
- # These are not supported in ruby at the moment. Enable them if they are.
237
- # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
238
- # emit(:set, :collation, copy(data, ts, te))
239
- # };
240
- # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
241
- # emit(:set, :equivalent, copy(data, ts, te))
242
- # };
243
-
244
220
  meta_char > (set_meta, 1) {
245
221
  emit(:literal, :literal, copy(data, ts, te))
246
222
  };
@@ -254,12 +230,22 @@
254
230
  # set escapes scanner
255
231
  # --------------------------------------------------------------------------
256
232
  set_escape_sequence := |*
233
+ # Special case: in sets, octal sequences have higher priority than backrefs
234
+ octal_sequence {
235
+ emit(:escape, :octal, copy(data, ts-1, te))
236
+ fret;
237
+ };
238
+
239
+ # Scan all other escapes that work in sets with the generic escape scanner
257
240
  set_escape > (escaped_set_alpha, 2) {
258
241
  fhold;
259
242
  fnext character_set;
260
243
  fcall escape_sequence;
261
244
  };
262
245
 
246
+ # Treat all remaining escapes - those not supported in sets - as literal.
247
+ # (This currently includes \^, \-, \&, \:, although these could potentially
248
+ # be meta chars when not escaped, depending on their position in the set.)
263
249
  any > (escaped_set_alpha, 1) {
264
250
  emit(:escape, :literal, copy(data, ts-1, te))
265
251
  fret;
@@ -281,6 +267,13 @@
281
267
  fret;
282
268
  };
283
269
 
270
+ [8-9] . [0-9] { # special case, emits two tokens
271
+ text = copy(data, ts-1, te)
272
+ emit(:escape, :literal, text[0, 2])
273
+ emit(:literal, :literal, text[2])
274
+ fret;
275
+ };
276
+
284
277
  meta_char {
285
278
  case text = copy(data, ts-1, te)
286
279
  when '\.'; emit(:escape, :dot, text)
@@ -371,6 +364,7 @@
371
364
  conditional_expression := |*
372
365
  group_lookup . ')' {
373
366
  text = copy(data, ts, te-1)
367
+ text =~ /[^0]/ or raise ValidationError.for(:backref, 'condition', 'invalid ref ID')
374
368
  emit(:conditional, :condition, text)
375
369
  emit(:conditional, :condition_close, ')')
376
370
  };
@@ -453,10 +447,9 @@
453
447
 
454
448
  # (?#...) comments: parsed as a single expression, without introducing a
455
449
  # new nesting level. Comments may not include parentheses, escaped or not.
456
- # special case for close, action performed on all transitions to get the
457
- # correct closing count.
450
+ # special case for close to get the correct closing count.
458
451
  # ------------------------------------------------------------------------
459
- group_open . group_comment $group_closed {
452
+ (group_open . group_comment) @group_closed {
460
453
  emit(:group, :comment, copy(data, ts, te))
461
454
  };
462
455
 
@@ -471,10 +464,10 @@
471
464
  #
472
465
  # (?imxdau-imx:subexp) option on/off for subexp
473
466
  # ------------------------------------------------------------------------
474
- group_open . group_options >group_opened {
467
+ (group_open . group_options) >group_opened {
475
468
  text = copy(data, ts, te)
476
469
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
477
- validation_error(:group_option, $1 || "-#{$2}", text)
470
+ raise ValidationError.for(:group_option, $1 || "-#{$2}", text)
478
471
  end
479
472
  emit_options(text)
480
473
  };
@@ -485,7 +478,7 @@
485
478
  # (?<=subexp) look-behind
486
479
  # (?<!subexp) negative look-behind
487
480
  # ------------------------------------------------------------------------
488
- group_open . assertion_type >group_opened {
481
+ (group_open . assertion_type) >group_opened {
489
482
  case text = copy(data, ts, te)
490
483
  when '(?='; emit(:assertion, :lookahead, text)
491
484
  when '(?!'; emit(:assertion, :nlookahead, text)
@@ -502,14 +495,14 @@
502
495
  # (?'name'subexp) named group (single quoted version)
503
496
  # (subexp) captured group
504
497
  # ------------------------------------------------------------------------
505
- group_open . group_type >group_opened {
498
+ (group_open . group_type) >group_opened {
506
499
  case text = copy(data, ts, te)
507
500
  when '(?:'; emit(:group, :passive, text)
508
501
  when '(?>'; emit(:group, :atomic, text)
509
502
  when '(?~'; emit(:group, :absence, text)
510
503
 
511
504
  when /^\(\?(?:<>|'')/
512
- validation_error(:group, 'named group', 'name is empty')
505
+ raise ValidationError.for(:group, 'named group', 'name is empty')
513
506
 
514
507
  when /^\(\?<[^>]+>/
515
508
  emit(:group, :named_ab, text)
@@ -528,50 +521,52 @@
528
521
  group_close @group_closed {
529
522
  if conditional_stack.last == group_depth + 1
530
523
  conditional_stack.pop
531
- emit(:conditional, :close, copy(data, ts, te))
532
- else
524
+ emit(:conditional, :close, ')')
525
+ elsif group_depth >= 0
533
526
  if spacing_stack.length > 1 &&
534
527
  spacing_stack.last[:depth] == group_depth + 1
535
528
  spacing_stack.pop
536
529
  self.free_spacing = spacing_stack.last[:free_spacing]
537
530
  end
538
531
 
539
- emit(:group, :close, copy(data, ts, te))
532
+ emit(:group, :close, ')')
533
+ else
534
+ raise ValidationError.for(:group, 'group', 'unmatched close parenthesis')
540
535
  end
541
536
  };
542
537
 
543
538
 
544
539
  # Group backreference, named and numbered
545
540
  # ------------------------------------------------------------------------
546
- backslash . (group_name_backref | group_number_backref) > (backslashed, 4) {
541
+ backslash . (group_ref) > (backslashed, 4) {
547
542
  case text = copy(data, ts, te)
548
- when /^\\k(<>|'')/
549
- validation_error(:backref, 'backreference', 'ref ID is empty')
550
- when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
543
+ when /^\\k(.)[^0-9\-][^+\-]*['>]$/
551
544
  emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
552
- when /^\\k(.)\d+\D$/
545
+ when /^\\k(.)0*[1-9]\d*['>]$/
553
546
  emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
554
- when /^\\k(.)-\d+\D$/
547
+ when /^\\k(.)-0*[1-9]\d*['>]$/
555
548
  emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
556
- when /^\\k(.)[^\p{digit}\-].*[+\-]\d+\D$/
549
+ when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/
557
550
  emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
558
- when /^\\k(.)-?\d+[+\-]\d+\D$/
551
+ when /^\\k(.)-?0*[1-9]\d*[+\-]\d+['>]$/
559
552
  emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
553
+ else
554
+ raise ValidationError.for(:backref, 'backreference', 'invalid ref ID')
560
555
  end
561
556
  };
562
557
 
563
558
  # Group call, named and numbered
564
559
  # ------------------------------------------------------------------------
565
- backslash . (group_name_call | group_number_call) > (backslashed, 4) {
560
+ backslash . (group_call) > (backslashed, 4) {
566
561
  case text = copy(data, ts, te)
567
- when /^\\g(<>|'')/
568
- validation_error(:backref, 'subexpression call', 'ref ID is empty')
569
- when /^\\g(.)[^\p{digit}+\->][^+\-]*/
562
+ when /^\\g(.)[^0-9+\-].*['>]$/
570
563
  emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
571
- when /^\\g(.)\d+\D$/
564
+ when /^\\g(.)(?:0|0*[1-9]\d*)['>]$/
572
565
  emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
573
- when /^\\g(.)[+-]\d+/
566
+ when /^\\g(.)[+-]0*[1-9]\d*/
574
567
  emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
568
+ else
569
+ raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID')
575
570
  end
576
571
  };
577
572
 
@@ -645,95 +640,35 @@
645
640
  *|;
646
641
  }%%
647
642
 
648
- # THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
649
- # This file was generated from lib/regexp_parser/scanner/scanner.rl
650
-
651
- require 'regexp_parser/error'
643
+ require_relative 'scanner/errors/scanner_error'
644
+ require_relative 'scanner/errors/premature_end_error'
645
+ require_relative 'scanner/errors/validation_error'
652
646
 
653
647
  class Regexp::Scanner
654
- # General scanner error (catch all)
655
- class ScannerError < Regexp::Parser::Error; end
656
-
657
- # Base for all scanner validation errors
658
- class ValidationError < Regexp::Parser::Error
659
- def initialize(reason)
660
- super reason
661
- end
662
- end
663
-
664
- # Unexpected end of pattern
665
- class PrematureEndError < ScannerError
666
- def initialize(where = '')
667
- super "Premature end of pattern at #{where}"
668
- end
669
- end
670
-
671
- # Invalid sequence format. Used for escape sequences, mainly.
672
- class InvalidSequenceError < ValidationError
673
- def initialize(what = 'sequence', where = '')
674
- super "Invalid #{what} at #{where}"
675
- end
676
- end
677
-
678
- # Invalid group. Used for named groups.
679
- class InvalidGroupError < ValidationError
680
- def initialize(what, reason)
681
- super "Invalid #{what}, #{reason}."
682
- end
683
- end
684
-
685
- # Invalid groupOption. Used for inline options.
686
- # TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
687
- class InvalidGroupOption < ValidationError
688
- def initialize(option, text)
689
- super "Invalid group option #{option} in #{text}"
690
- end
691
- end
692
-
693
- # Invalid back reference. Used for name a number refs/calls.
694
- class InvalidBackrefError < ValidationError
695
- def initialize(what, reason)
696
- super "Invalid back reference #{what}, #{reason}"
697
- end
698
- end
699
-
700
- # The property name was not recognized by the scanner.
701
- class UnknownUnicodePropertyError < ValidationError
702
- def initialize(name)
703
- super "Unknown unicode character property name #{name}"
704
- end
705
- end
706
-
707
- # The POSIX class name was not recognized by the scanner.
708
- class UnknownPosixClassError < ValidationError
709
- def initialize(text)
710
- super "Unknown POSIX class #{text}"
711
- end
712
- end
713
-
714
648
  # Scans the given regular expression text, or Regexp object and collects the
715
649
  # emitted token into an array that gets returned at the end. If a block is
716
650
  # given, it gets called for each emitted token.
717
651
  #
718
652
  # This method may raise errors if a syntax error is encountered.
719
653
  # --------------------------------------------------------------------------
720
- def self.scan(input_object, options: nil, &block)
721
- new.scan(input_object, options: options, &block)
654
+ def self.scan(input_object, options: nil, collect_tokens: true, &block)
655
+ new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
722
656
  end
723
657
 
724
- def scan(input_object, options: nil, &block)
725
- self.literal = nil
658
+ def scan(input_object, options: nil, collect_tokens: true, &block)
659
+ self.collect_tokens = collect_tokens
660
+ self.literal_run = nil
726
661
  stack = []
727
662
 
728
663
  input = input_object.is_a?(Regexp) ? input_object.source : input_object
729
664
  self.free_spacing = free_spacing?(input_object, options)
730
665
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
731
666
 
732
- data = input.unpack("c*") if input.is_a?(String)
667
+ data = input.unpack("c*")
733
668
  eof = data.length
734
669
 
735
670
  self.tokens = []
736
- self.block = block_given? ? block : nil
671
+ self.block = block
737
672
 
738
673
  self.set_depth = 0
739
674
  self.group_depth = 0
@@ -758,7 +693,7 @@ class Regexp::Scanner
758
693
  "[#{set_depth}]") if in_set?
759
694
 
760
695
  # when the entire expression is a literal run
761
- emit_literal if literal
696
+ emit_literal if literal_run
762
697
 
763
698
  tokens
764
699
  end
@@ -785,26 +720,37 @@ class Regexp::Scanner
785
720
  def emit(type, token, text)
786
721
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
787
722
 
788
- emit_literal if literal
723
+ emit_literal if literal_run
789
724
 
790
725
  # Ragel runs with byte-based indices (ts, te). These are of little value to
791
726
  # end-users, so we keep track of char-based indices and emit those instead.
792
727
  ts_char_pos = char_pos
793
728
  te_char_pos = char_pos + text.length
794
729
 
795
- if block
796
- block.call type, token, text, ts_char_pos, te_char_pos
797
- end
730
+ tok = [type, token, text, ts_char_pos, te_char_pos]
798
731
 
799
- tokens << [type, token, text, ts_char_pos, te_char_pos]
732
+ self.prev_token = tok
800
733
 
801
734
  self.char_pos = te_char_pos
735
+
736
+ if block
737
+ block.call type, token, text, ts_char_pos, te_char_pos
738
+ # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
739
+ tokens << tok if collect_tokens
740
+ elsif collect_tokens
741
+ tokens << tok
742
+ end
802
743
  end
803
744
 
745
+ attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
746
+
804
747
  private
805
748
 
806
- attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
807
- :group_depth, :set_depth, :conditional_stack, :char_pos
749
+ attr_accessor :block,
750
+ :collect_tokens, :tokens, :prev_token,
751
+ :free_spacing, :spacing_stack,
752
+ :group_depth, :set_depth, :conditional_stack,
753
+ :char_pos
808
754
 
809
755
  def free_spacing?(input_object, options)
810
756
  if options && !input_object.is_a?(String)
@@ -834,14 +780,13 @@ class Regexp::Scanner
834
780
  # Appends one or more characters to the literal buffer, to be emitted later
835
781
  # by a call to emit_literal.
836
782
  def append_literal(data, ts, te)
837
- self.literal = literal || []
838
- literal << copy(data, ts, te)
783
+ (self.literal_run ||= []) << copy(data, ts, te)
839
784
  end
840
785
 
841
786
  # Emits the literal run collected by calls to the append_literal method.
842
787
  def emit_literal
843
- text = literal.join
844
- self.literal = nil
788
+ text = literal_run.join
789
+ self.literal_run = nil
845
790
  emit(:literal, :literal, text)
846
791
  end
847
792
 
@@ -876,24 +821,8 @@ class Regexp::Scanner
876
821
 
877
822
  def emit_meta_control_sequence(data, ts, te, token)
878
823
  if data.last < 0x00 || data.last > 0x7F
879
- validation_error(:sequence, 'escape', token.to_s)
824
+ raise ValidationError.for(:sequence, 'escape', token.to_s)
880
825
  end
881
826
  emit(:escape, token, copy(data, ts-1, te))
882
827
  end
883
-
884
- # Centralizes and unifies the handling of validation related
885
- # errors.
886
- def validation_error(type, what, reason = nil)
887
- error =
888
- case type
889
- when :backref then InvalidBackrefError.new(what, reason)
890
- when :group then InvalidGroupError.new(what, reason)
891
- when :group_option then InvalidGroupOption.new(what, reason)
892
- when :posix_class then UnknownPosixClassError.new(what)
893
- when :property then UnknownUnicodePropertyError.new(what)
894
- when :sequence then InvalidSequenceError.new(what, reason)
895
- end
896
-
897
- raise error # unless @@config.validation_ignore
898
- end
899
828
  end # module Regexp::Scanner