RubyGems - regexp_parser - Versions diffs - 2.6.0 → 2.10.0 - Mend

regexp_parser 2.6.0 → 2.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57)

checksums.yaml +4 -4
data/Gemfile +5 -5
data/LICENSE +1 -1
data/lib/regexp_parser/expression/base.rb +0 -7
data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
data/lib/regexp_parser/expression/classes/backreference.rb +5 -10
data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
data/lib/regexp_parser/expression/classes/character_set.rb +4 -8
data/lib/regexp_parser/expression/classes/conditional.rb +2 -20
data/lib/regexp_parser/expression/classes/escape_sequence.rb +21 -91
data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
data/lib/regexp_parser/expression/classes/group.rb +0 -22
data/lib/regexp_parser/expression/classes/keep.rb +1 -1
data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
data/lib/regexp_parser/expression/classes/unicode_property.rb +11 -11
data/lib/regexp_parser/expression/methods/construct.rb +2 -4
data/lib/regexp_parser/expression/methods/escape_sequence_char.rb +5 -0
data/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb +68 -0
data/lib/regexp_parser/expression/methods/match_length.rb +8 -4
data/lib/regexp_parser/expression/methods/negative.rb +20 -0
data/lib/regexp_parser/expression/methods/parts.rb +23 -0
data/lib/regexp_parser/expression/methods/printing.rb +26 -0
data/lib/regexp_parser/expression/methods/referenced_expressions.rb +28 -0
data/lib/regexp_parser/expression/methods/tests.rb +40 -3
data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
data/lib/regexp_parser/expression/quantifier.rb +30 -17
data/lib/regexp_parser/expression/sequence.rb +5 -10
data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
data/lib/regexp_parser/expression/shared.rb +37 -20
data/lib/regexp_parser/expression/subexpression.rb +20 -15
data/lib/regexp_parser/expression.rb +37 -31
data/lib/regexp_parser/lexer.rb +76 -36
data/lib/regexp_parser/parser.rb +107 -103
data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
data/lib/regexp_parser/scanner/properties/long.csv +29 -0
data/lib/regexp_parser/scanner/properties/short.csv +3 -0
data/lib/regexp_parser/scanner/property.rl +2 -2
data/lib/regexp_parser/scanner/scanner.rl +101 -172
data/lib/regexp_parser/scanner.rb +1171 -1365
data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
data/lib/regexp_parser/syntax/token/escape.rb +3 -1
data/lib/regexp_parser/syntax/token/meta.rb +9 -2
data/lib/regexp_parser/syntax/token/unicode_property.rb +35 -1
data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
data/lib/regexp_parser/syntax/token.rb +13 -13
data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
data/lib/regexp_parser/syntax/versions.rb +3 -1
data/lib/regexp_parser/syntax.rb +1 -1
data/lib/regexp_parser/version.rb +1 -1
data/lib/regexp_parser.rb +6 -6
data/regexp_parser.gemspec +5 -5
metadata +17 -8
data/CHANGELOG.md +0 -601
data/README.md +0 -503

data/lib/regexp_parser/scanner/properties/long.csv CHANGED

@@ -7,6 +7,8 @@ age=12.0,age=12.0
 age=12.1,age=12.1
 age=13.0,age=13.0
 age=14.0,age=14.0
+age=15.0,age=15.0
+age=15.1,age=15.1
 age=2.0,age=2.0
 age=2.1,age=2.1
 age=3.0,age=3.0
@@ -97,6 +99,7 @@ emojimodifierbase,emoji_modifier_base
 emojipresentation,emoji_presentation
 enclosingmark,enclosing_mark
 ethiopic,ethiopic
+extendedpictographic,extended_pictographic
 extender,extender
 finalpunctuation,final_punctuation
 format,format
@@ -106,6 +109,19 @@ gothic,gothic
 grantha,grantha
 graph,graph
 graphemebase,grapheme_base
+graphemeclusterbreak=control,grapheme_cluster_break=control
+graphemeclusterbreak=cr,grapheme_cluster_break=cr
+graphemeclusterbreak=extend,grapheme_cluster_break=extend
+graphemeclusterbreak=l,grapheme_cluster_break=l
+graphemeclusterbreak=lf,grapheme_cluster_break=lf
+graphemeclusterbreak=lv,grapheme_cluster_break=lv
+graphemeclusterbreak=lvt,grapheme_cluster_break=lvt
+graphemeclusterbreak=prepend,grapheme_cluster_break=prepend
+graphemeclusterbreak=regionalindicator,grapheme_cluster_break=regional_indicator
+graphemeclusterbreak=spacingmark,grapheme_cluster_break=spacingmark
+graphemeclusterbreak=t,grapheme_cluster_break=t
+graphemeclusterbreak=v,grapheme_cluster_break=v
+graphemeclusterbreak=zwj,grapheme_cluster_break=zwj
 graphemeextend,grapheme_extend
 graphemelink,grapheme_link
 greek,greek
@@ -121,11 +137,14 @@ hebrew,hebrew
 hexdigit,hex_digit
 hiragana,hiragana
 hyphen,hyphen
+idcompatmathcontinue,id_compat_math_continue
+idcompatmathstart,id_compat_math_start
 idcontinue,id_continue
 ideographic,ideographic
 idsbinaryoperator,ids_binary_operator
 idstart,id_start
 idstrinaryoperator,ids_trinary_operator
+idsunaryoperator,ids_unary_operator
 imperialaramaic,imperial_aramaic
 inadlam,in_adlam
 inaegeannumbers,in_aegean_numbers
@@ -139,6 +158,7 @@ inancientsymbols,in_ancient_symbols
 inarabic,in_arabic
 inarabicextendeda,in_arabic_extended_a
 inarabicextendedb,in_arabic_extended_b
+inarabicextendedc,in_arabic_extended_c
 inarabicmathematicalalphabeticsymbols,in_arabic_mathematical_alphabetic_symbols
 inarabicpresentationformsa,in_arabic_presentation_forms_a
 inarabicpresentationformsb,in_arabic_presentation_forms_b
@@ -186,6 +206,8 @@ incjkunifiedideographsextensiond,in_cjk_unified_ideographs_extension_d
 incjkunifiedideographsextensione,in_cjk_unified_ideographs_extension_e
 incjkunifiedideographsextensionf,in_cjk_unified_ideographs_extension_f
 incjkunifiedideographsextensiong,in_cjk_unified_ideographs_extension_g
+incjkunifiedideographsextensionh,in_cjk_unified_ideographs_extension_h
+incjkunifiedideographsextensioni,in_cjk_unified_ideographs_extension_i
 incombiningdiacriticalmarks,in_combining_diacritical_marks
 incombiningdiacriticalmarksextended,in_combining_diacritical_marks_extended
 incombiningdiacriticalmarksforsymbols,in_combining_diacritical_marks_for_symbols
@@ -205,10 +227,12 @@ incyrillic,in_cyrillic
 incyrillicextendeda,in_cyrillic_extended_a
 incyrillicextendedb,in_cyrillic_extended_b
 incyrillicextendedc,in_cyrillic_extended_c
+incyrillicextendedd,in_cyrillic_extended_d
 incyrillicsupplement,in_cyrillic_supplement
 indeseret,in_deseret
 indevanagari,in_devanagari
 indevanagariextended,in_devanagari_extended
+indevanagariextendeda,in_devanagari_extended_a
 indingbats,in_dingbats
 indivesakuru,in_dives_akuru
 indogra,in_dogra
@@ -268,6 +292,7 @@ inipaextensions,in_ipa_extensions
 initialpunctuation,initial_punctuation
 injavanese,in_javanese
 inkaithi,in_kaithi
+inkaktoviknumerals,in_kaktovik_numerals
 inkanaextendeda,in_kana_extended_a
 inkanaextendedb,in_kana_extended_b
 inkanasupplement,in_kana_supplement
@@ -276,6 +301,7 @@ inkangxiradicals,in_kangxi_radicals
 inkannada,in_kannada
 inkatakana,in_katakana
 inkatakanaphoneticextensions,in_katakana_phonetic_extensions
+inkawi,in_kawi
 inkayahli,in_kayah_li
 inkharoshthi,in_kharoshthi
 inkhitansmallscript,in_khitan_small_script
@@ -339,6 +365,7 @@ inmyanmar,in_myanmar
 inmyanmarextendeda,in_myanmar_extended_a
 inmyanmarextendedb,in_myanmar_extended_b
 innabataean,in_nabataean
+innagmundari,in_nag_mundari
 innandinagari,in_nandinagari
 innewa,in_newa
 innewtailue,in_new_tai_lue
@@ -457,6 +484,7 @@ joincontrol,join_control
 kaithi,kaithi
 kannada,kannada
 katakana,katakana
+kawi,kawi
 kayahli,kayah_li
 kharoshthi,kharoshthi
 khitansmallscript,khitan_small_script
@@ -503,6 +531,7 @@ mro,mro
 multani,multani
 myanmar,myanmar
 nabataean,nabataean
+nagmundari,nag_mundari
 nandinagari,nandinagari
 newa,newa
 newline,newline

data/lib/regexp_parser/scanner/properties/short.csv CHANGED

@@ -57,6 +57,7 @@ emod,emoji_modifier
 epres,emoji_presentation
 ethi,ethiopic
 ext,extender
+extpict,extended_pictographic
 geor,georgian
 glag,glagolitic
 gong,gunjala_gondi
@@ -85,6 +86,7 @@ ideo,ideographic
 ids,id_start
 idsb,ids_binary_operator
 idst,ids_trinary_operator
+idsu,ids_unary_operator
 ital,old_italic
 java,javanese
 joinc,join_control
@@ -133,6 +135,7 @@ mtei,meetei_mayek
 mult,multani
 mymr,myanmar
 n,number
+nagm,nag_mundari
 nand,nandinagari
 narb,old_north_arabian
 nbat,nabataean

data/lib/regexp_parser/scanner/property.rl CHANGED

@@ -17,10 +17,10 @@
       text = copy(data, ts-1, te)
       type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
-      name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
+      name = text[3..-2].gsub(/[\^\s_\-]/, '').downcase
       token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
-      validation_error(:property, name) unless token
+      raise ValidationError.for(:property, name) unless token
       self.emit(type, token.to_sym, text)

data/lib/regexp_parser/scanner/scanner.rl CHANGED

@@ -30,11 +30,6 @@
   class_posix           = ('[:' . '^'? . [^\[\]]* . ':]');
-  # these are not supported in ruby at the moment
-  collating_sequence    = '[.' . (alpha | [\-])+ . '.]';
-  character_equivalent  = '[=' . alpha . '=]';
   line_anchor           = beginning_of_line | end_of_line;
   anchor_char           = [AbBzZG];
@@ -59,9 +54,6 @@
   one_or_more           = '+' | '+?' | '++';
   quantifier_greedy     = '?'  | '*'  | '+';
-  quantifier_reluctant  = '??' | '*?' | '+?';
-  quantifier_possessive = '?+' | '*+' | '++';
-  quantifier_mode       = '?'  | '+';
   quantity_exact        = (digit+);
   quantity_minimum      = (digit+) . ',';
@@ -70,9 +62,6 @@
   quantifier_interval   = range_open . ( quantity_exact | quantity_minimum |
                           quantity_maximum | quantity_range ) . range_close;
-  quantifiers           = quantifier_greedy | quantifier_reluctant |
-                          quantifier_possessive | quantifier_interval;
   conditional           = '(?(';
   group_comment         = '?#' . [^)]* . group_close;
@@ -89,10 +78,9 @@
   # try to treat every other group head as options group, like Ruby
   group_options         = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
-  group_ref             = [gk];
-  group_name_id_ab      = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
-  group_name_id_sq      = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
-  group_number          = '-'? . [1-9] . [0-9]*;
+  group_name_id_ab      = ([^!=0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
+  group_name_id_sq      = ([^0-9\-']   | utf8_multibyte) . ([^'] | utf8_multibyte)*;
+  group_number          = '-'? . [0-9]+;
   group_level           = [+\-] . [0-9]+;
   group_name            = ('<' . group_name_id_ab? . '>') |
@@ -101,15 +89,11 @@
   group_named           = ('?' . group_name );
-  group_name_backref    = 'k' . (('<' . group_name_id_ab? . group_level? '>') |
-                                 ("'" . group_name_id_sq? . group_level? "'"));
-  group_name_call       = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
-                                 ("'" . group_name_id_sq? . group_level? "'"));
+  group_ref_body        = (('<' . (group_name_id_ab? | group_number) . group_level? '>') |
+                           ("'" . (group_name_id_sq? | group_number) . group_level? "'"));
-  group_number_backref  = 'k' . (('<' . group_number . group_level? '>') |
-                                 ("'" . group_number . group_level? "'"));
-  group_number_call     = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
-                                 ("'" . ((group_number . group_level?) | '0') "'"));
+  group_ref             = 'k' . group_ref_body;
+  group_call            = 'g' . group_ref_body;
   group_type            = group_atomic | group_passive | group_absence | group_named;
@@ -132,20 +116,21 @@
                           keep_mark | sequence_char;
   # escapes that also work within a character set
-  set_escape            = backslash | brackets | escaped_ascii | property_char |
+  set_escape            = backslash | brackets | escaped_ascii |
+                          octal_sequence | property_char |
                           sequence_char | single_codepoint_char_type;
   # EOF error, used where it can be detected
   action premature_end_error {
     text = copy(data, ts ? ts-1 : 0, -1)
-    raise PrematureEndError.new( text )
+    raise PrematureEndError.new(text)
   }
   # Invalid sequence error, used from sequences, like escapes and sets
   action invalid_sequence_error {
     text = copy(data, ts ? ts-1 : 0, -1)
-    validation_error(:sequence, 'sequence', text)
+    raise ValidationError.for(:sequence, 'sequence', text)
   }
   # group (nesting) and set open/close actions
@@ -168,8 +153,8 @@
     };
     '-]' @set_closed { # special case, emits two tokens
-      emit(:literal, :literal, copy(data, ts, te-1))
-      emit(:set, :close, copy(data, ts+1, te))
+      emit(:literal, :literal, '-')
+      emit(:set, :close, ']')
       if in_set?
         fret;
       else
@@ -183,28 +168,27 @@
     };
     '^' {
-      text = copy(data, ts, te)
-      if tokens.last[1] == :open
-        emit(:set, :negate, text)
+      if prev_token[1] == :open
+        emit(:set, :negate, '^')
       else
-        emit(:literal, :literal, text)
+        emit(:literal, :literal, '^')
       end
     };
     '-' {
-      text = copy(data, ts, te)
-      # ranges cant start with a subset or intersection/negation/range operator
-      if tokens.last[0] == :set
-        emit(:literal, :literal, text)
+      # ranges cant start with the opening bracket, a subset, or
+      # intersection/negation/range operators
+      if prev_token[0] == :set
+        emit(:literal, :literal, '-')
       else
-        emit(:set, :range, text)
+        emit(:set, :range, '-')
       end
     };
     # Unlike ranges, intersections can start or end at set boundaries, whereupon
     # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
     '&&' {
-      emit(:set, :intersection, copy(data, ts, te))
+      emit(:set, :intersection, '&&')
     };
     backslash {
@@ -212,7 +196,7 @@
     };
     set_open >(open_bracket, 1) >set_opened {
-      emit(:set, :open, copy(data, ts, te))
+      emit(:set, :open, '[')
       fcall character_set;
     };
@@ -227,20 +211,12 @@
       end
       unless self.class.posix_classes.include?(class_name)
-        validation_error(:posix_class, text)
+        raise ValidationError.for(:posix_class, text)
       end
       emit(type, class_name.to_sym, text)
     };
-    # These are not supported in ruby at the moment. Enable them if they are.
-    # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
-    #   emit(:set, :collation, copy(data, ts, te))
-    # };
-    # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
-    #   emit(:set, :equivalent, copy(data, ts, te))
-    # };
     meta_char > (set_meta, 1) {
       emit(:literal, :literal, copy(data, ts, te))
     };
@@ -254,12 +230,22 @@
   # set escapes scanner
   # --------------------------------------------------------------------------
   set_escape_sequence := |*
+    # Special case: in sets, octal sequences have higher priority than backrefs
+    octal_sequence {
+      emit(:escape, :octal, copy(data, ts-1, te))
+      fret;
+    };
+    # Scan all other escapes that work in sets with the generic escape scanner
     set_escape > (escaped_set_alpha, 2) {
       fhold;
       fnext character_set;
       fcall escape_sequence;
     };
+    # Treat all remaining escapes - those not supported in sets - as literal.
+    # (This currently includes \^, \-, \&, \:, although these could potentially
+    # be meta chars when not escaped, depending on their position in the set.)
     any > (escaped_set_alpha, 1) {
       emit(:escape, :literal, copy(data, ts-1, te))
       fret;
@@ -281,6 +267,13 @@
       fret;
     };
+    [8-9] . [0-9] { # special case, emits two tokens
+      text = copy(data, ts-1, te)
+      emit(:escape, :literal, text[0, 2])
+      emit(:literal, :literal, text[2])
+      fret;
+    };
     meta_char {
       case text = copy(data, ts-1, te)
       when '\.';  emit(:escape, :dot,               text)
@@ -371,6 +364,7 @@
   conditional_expression := |*
     group_lookup . ')' {
       text = copy(data, ts, te-1)
+      text =~ /[^0]/ or raise ValidationError.for(:backref, 'condition', 'invalid ref ID')
       emit(:conditional, :condition, text)
       emit(:conditional, :condition_close, ')')
     };
@@ -453,10 +447,9 @@
     # (?#...) comments: parsed as a single expression, without introducing a
     # new nesting level. Comments may not include parentheses, escaped or not.
-    # special case for close, action performed on all transitions to get the
-    # correct closing count.
+    # special case for close to get the correct closing count.
     # ------------------------------------------------------------------------
-    group_open . group_comment $group_closed {
+    (group_open . group_comment) @group_closed {
       emit(:group, :comment, copy(data, ts, te))
     };
@@ -471,10 +464,10 @@
     #
     #   (?imxdau-imx:subexp)  option on/off for subexp
     # ------------------------------------------------------------------------
-    group_open . group_options >group_opened {
+    (group_open . group_options) >group_opened {
       text = copy(data, ts, te)
       if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
-        validation_error(:group_option, $1 || "-#{$2}", text)
+        raise ValidationError.for(:group_option, $1 || "-#{$2}", text)
       end
       emit_options(text)
     };
@@ -485,7 +478,7 @@
     #   (?<=subexp)         look-behind
     #   (?<!subexp)         negative look-behind
     # ------------------------------------------------------------------------
-    group_open . assertion_type >group_opened {
+    (group_open . assertion_type) >group_opened {
       case text = copy(data, ts, te)
       when '(?=';  emit(:assertion, :lookahead,    text)
       when '(?!';  emit(:assertion, :nlookahead,   text)
@@ -502,14 +495,14 @@
     #   (?'name'subexp)     named group (single quoted version)
     #   (subexp)            captured group
     # ------------------------------------------------------------------------
-    group_open . group_type >group_opened {
+    (group_open . group_type) >group_opened {
       case text = copy(data, ts, te)
       when '(?:';  emit(:group, :passive,      text)
       when '(?>';  emit(:group, :atomic,       text)
       when '(?~';  emit(:group, :absence,      text)
       when /^\(\?(?:<>|'')/
-        validation_error(:group, 'named group', 'name is empty')
+        raise ValidationError.for(:group, 'named group', 'name is empty')
       when /^\(\?<[^>]+>/
         emit(:group, :named_ab,  text)
@@ -528,50 +521,52 @@
     group_close @group_closed {
       if conditional_stack.last == group_depth + 1
         conditional_stack.pop
-        emit(:conditional, :close, copy(data, ts, te))
-      else
+        emit(:conditional, :close, ')')
+      elsif group_depth >= 0
         if spacing_stack.length > 1 &&
            spacing_stack.last[:depth] == group_depth + 1
           spacing_stack.pop
           self.free_spacing = spacing_stack.last[:free_spacing]
         end
-        emit(:group, :close, copy(data, ts, te))
+        emit(:group, :close, ')')
+      else
+        raise ValidationError.for(:group, 'group', 'unmatched close parenthesis')
       end
     };
     # Group backreference, named and numbered
     # ------------------------------------------------------------------------
-    backslash . (group_name_backref | group_number_backref) > (backslashed, 4) {
+    backslash . (group_ref) > (backslashed, 4) {
       case text = copy(data, ts, te)
-      when /^\\k(<>|'')/
-        validation_error(:backref, 'backreference', 'ref ID is empty')
-      when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
+      when /^\\k(.)[^0-9\-][^+\-]*['>]$/
         emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
-      when /^\\k(.)\d+\D$/
+      when /^\\k(.)0*[1-9]\d*['>]$/
         emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
-      when /^\\k(.)-\d+\D$/
+      when /^\\k(.)-0*[1-9]\d*['>]$/
         emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
-      when /^\\k(.)[^\p{digit}\-].*[+\-]\d+\D$/
+      when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/
         emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
-      when /^\\k(.)-?\d+[+\-]\d+\D$/
+      when /^\\k(.)-?0*[1-9]\d*[+\-]\d+['>]$/
         emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
+      else
+        raise ValidationError.for(:backref, 'backreference', 'invalid ref ID')
       end
     };
     # Group call, named and numbered
     # ------------------------------------------------------------------------
-    backslash . (group_name_call | group_number_call) > (backslashed, 4) {
+    backslash . (group_call) > (backslashed, 4) {
       case text = copy(data, ts, te)
-      when /^\\g(<>|'')/
-        validation_error(:backref, 'subexpression call', 'ref ID is empty')
-      when /^\\g(.)[^\p{digit}+\->][^+\-]*/
+      when /^\\g(.)[^0-9+\-].*['>]$/
         emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
-      when /^\\g(.)\d+\D$/
+      when /^\\g(.)(?:0|0*[1-9]\d*)['>]$/
         emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
-      when /^\\g(.)[+-]\d+/
+      when /^\\g(.)[+-]0*[1-9]\d*/
         emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
+      else
+        raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID')
       end
     };
@@ -645,95 +640,35 @@
   *|;
 }%%
-# THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
-# This file was generated from lib/regexp_parser/scanner/scanner.rl
-require 'regexp_parser/error'
+require_relative 'scanner/errors/scanner_error'
+require_relative 'scanner/errors/premature_end_error'
+require_relative 'scanner/errors/validation_error'
 class Regexp::Scanner
-  # General scanner error (catch all)
-  class ScannerError < Regexp::Parser::Error; end
-  # Base for all scanner validation errors
-  class ValidationError < Regexp::Parser::Error
-    def initialize(reason)
-      super reason
-    end
-  end
-  # Unexpected end of pattern
-  class PrematureEndError < ScannerError
-    def initialize(where = '')
-      super "Premature end of pattern at #{where}"
-    end
-  end
-  # Invalid sequence format. Used for escape sequences, mainly.
-  class InvalidSequenceError < ValidationError
-    def initialize(what = 'sequence', where = '')
-      super "Invalid #{what} at #{where}"
-    end
-  end
-  # Invalid group. Used for named groups.
-  class InvalidGroupError < ValidationError
-    def initialize(what, reason)
-      super "Invalid #{what}, #{reason}."
-    end
-  end
-  # Invalid groupOption. Used for inline options.
-  # TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
-  class InvalidGroupOption < ValidationError
-    def initialize(option, text)
-      super "Invalid group option #{option} in #{text}"
-    end
-  end
-  # Invalid back reference. Used for name a number refs/calls.
-  class InvalidBackrefError < ValidationError
-    def initialize(what, reason)
-      super "Invalid back reference #{what}, #{reason}"
-    end
-  end
-  # The property name was not recognized by the scanner.
-  class UnknownUnicodePropertyError < ValidationError
-    def initialize(name)
-      super "Unknown unicode character property name #{name}"
-    end
-  end
-  # The POSIX class name was not recognized by the scanner.
-  class UnknownPosixClassError < ValidationError
-    def initialize(text)
-      super "Unknown POSIX class #{text}"
-    end
-  end
   # Scans the given regular expression text, or Regexp object and collects the
   # emitted token into an array that gets returned at the end. If a block is
   # given, it gets called for each emitted token.
   #
   # This method may raise errors if a syntax error is encountered.
   # --------------------------------------------------------------------------
-  def self.scan(input_object, options: nil, &block)
-    new.scan(input_object, options: options, &block)
+  def self.scan(input_object, options: nil, collect_tokens: true, &block)
+    new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
   end
-  def scan(input_object, options: nil, &block)
-    self.literal = nil
+  def scan(input_object, options: nil, collect_tokens: true, &block)
+    self.collect_tokens = collect_tokens
+    self.literal_run = nil
     stack = []
     input = input_object.is_a?(Regexp) ? input_object.source : input_object
     self.free_spacing = free_spacing?(input_object, options)
     self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
-    data  = input.unpack("c*") if input.is_a?(String)
+    data  = input.unpack("c*")
     eof   = data.length
     self.tokens = []
-    self.block  = block_given? ? block : nil
+    self.block  = block
     self.set_depth = 0
     self.group_depth = 0
@@ -758,7 +693,7 @@ class Regexp::Scanner
           "[#{set_depth}]") if in_set?
     # when the entire expression is a literal run
-    emit_literal if literal
+    emit_literal if literal_run
     tokens
   end
@@ -785,26 +720,37 @@ class Regexp::Scanner
   def emit(type, token, text)
     #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
-    emit_literal if literal
+    emit_literal if literal_run
     # Ragel runs with byte-based indices (ts, te). These are of little value to
     # end-users, so we keep track of char-based indices and emit those instead.
     ts_char_pos = char_pos
     te_char_pos = char_pos + text.length
-    if block
-      block.call type, token, text, ts_char_pos, te_char_pos
-    end
+    tok = [type, token, text, ts_char_pos, te_char_pos]
-    tokens << [type, token, text, ts_char_pos, te_char_pos]
+    self.prev_token = tok
     self.char_pos = te_char_pos
+    if block
+      block.call type, token, text, ts_char_pos, te_char_pos
+      # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
+      tokens << tok if collect_tokens
+    elsif collect_tokens
+      tokens << tok
+    end
   end
+  attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
   private
-  attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
-                :group_depth, :set_depth, :conditional_stack, :char_pos
+  attr_accessor :block,
+                :collect_tokens, :tokens, :prev_token,
+                :free_spacing, :spacing_stack,
+                :group_depth, :set_depth, :conditional_stack,
+                :char_pos
   def free_spacing?(input_object, options)
     if options && !input_object.is_a?(String)
@@ -834,14 +780,13 @@ class Regexp::Scanner
   # Appends one or more characters to the literal buffer, to be emitted later
   # by a call to emit_literal.
   def append_literal(data, ts, te)
-    self.literal = literal || []
-    literal << copy(data, ts, te)
+    (self.literal_run ||= []) << copy(data, ts, te)
   end
   # Emits the literal run collected by calls to the append_literal method.
   def emit_literal
-    text = literal.join
-    self.literal = nil
+    text = literal_run.join
+    self.literal_run = nil
     emit(:literal, :literal, text)
   end
@@ -876,24 +821,8 @@ class Regexp::Scanner
   def emit_meta_control_sequence(data, ts, te, token)
     if data.last < 0x00 || data.last > 0x7F
-      validation_error(:sequence, 'escape', token.to_s)
+      raise ValidationError.for(:sequence, 'escape', token.to_s)
     end
     emit(:escape, token, copy(data, ts-1, te))
   end
-  # Centralizes and unifies the handling of validation related
-  # errors.
-  def validation_error(type, what, reason = nil)
-    error =
-      case type
-      when :backref      then InvalidBackrefError.new(what, reason)
-      when :group        then InvalidGroupError.new(what, reason)
-      when :group_option then InvalidGroupOption.new(what, reason)
-      when :posix_class  then UnknownPosixClassError.new(what)
-      when :property     then UnknownUnicodePropertyError.new(what)
-      when :sequence     then InvalidSequenceError.new(what, reason)
-      end
-    raise error # unless @@config.validation_ignore
-  end
 end # module Regexp::Scanner