RubyGems - regexp_parser - Versions diffs - 1.7.0 → 2.8.1 - Mend

regexp_parser 1.7.0 → 2.8.1

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (165)

checksums.yaml +4 -4
data/CHANGELOG.md +364 -22
data/Gemfile +8 -2
data/LICENSE +1 -1
data/README.md +124 -88
data/Rakefile +6 -70
data/lib/regexp_parser/error.rb +4 -0
data/lib/regexp_parser/expression/base.rb +76 -0
data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +22 -2
data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +4 -8
data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +3 -4
data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
data/lib/regexp_parser/expression/classes/conditional.rb +11 -5
data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
data/lib/regexp_parser/expression/classes/free_space.rb +5 -5
data/lib/regexp_parser/expression/classes/group.rb +28 -15
data/lib/regexp_parser/expression/classes/keep.rb +2 -0
data/lib/regexp_parser/expression/classes/literal.rb +1 -5
data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
data/lib/regexp_parser/expression/classes/root.rb +4 -19
data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +5 -3
data/lib/regexp_parser/expression/methods/construct.rb +41 -0
data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
data/lib/regexp_parser/expression/methods/match_length.rb +11 -7
data/lib/regexp_parser/expression/methods/parts.rb +23 -0
data/lib/regexp_parser/expression/methods/printing.rb +26 -0
data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
data/lib/regexp_parser/expression/methods/tests.rb +47 -1
data/lib/regexp_parser/expression/methods/traverse.rb +34 -18
data/lib/regexp_parser/expression/quantifier.rb +57 -17
data/lib/regexp_parser/expression/sequence.rb +11 -47
data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
data/lib/regexp_parser/expression/shared.rb +111 -0
data/lib/regexp_parser/expression/subexpression.rb +27 -19
data/lib/regexp_parser/expression.rb +14 -141
data/lib/regexp_parser/lexer.rb +83 -41
data/lib/regexp_parser/parser.rb +371 -429
data/lib/regexp_parser/scanner/char_type.rl +11 -11
data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
data/lib/regexp_parser/scanner/properties/long.csv +633 -0
data/lib/regexp_parser/scanner/properties/short.csv +248 -0
data/lib/regexp_parser/scanner/property.rl +4 -4
data/lib/regexp_parser/scanner/scanner.rl +295 -368
data/lib/regexp_parser/scanner.rb +1405 -1674
data/lib/regexp_parser/syntax/any.rb +2 -7
data/lib/regexp_parser/syntax/base.rb +92 -67
data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
data/lib/regexp_parser/syntax/token/escape.rb +33 -0
data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
data/lib/regexp_parser/syntax/token/meta.rb +20 -0
data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
data/lib/regexp_parser/syntax/token/unicode_property.rb +733 -0
data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
data/lib/regexp_parser/syntax/token.rb +45 -0
data/lib/regexp_parser/syntax/version_lookup.rb +19 -36
data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
data/lib/regexp_parser/syntax/versions.rb +3 -1
data/lib/regexp_parser/syntax.rb +8 -6
data/lib/regexp_parser/token.rb +9 -20
data/lib/regexp_parser/version.rb +1 -1
data/lib/regexp_parser.rb +0 -2
data/regexp_parser.gemspec +20 -22
metadata +49 -166
data/lib/regexp_parser/scanner/properties/long.yml +0 -594
data/lib/regexp_parser/scanner/properties/short.yml +0 -237
data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
data/lib/regexp_parser/syntax/tokens.rb +0 -45
data/spec/expression/base_spec.rb +0 -94
data/spec/expression/clone_spec.rb +0 -120
data/spec/expression/conditional_spec.rb +0 -89
data/spec/expression/free_space_spec.rb +0 -27
data/spec/expression/methods/match_length_spec.rb +0 -161
data/spec/expression/methods/match_spec.rb +0 -25
data/spec/expression/methods/strfregexp_spec.rb +0 -224
data/spec/expression/methods/tests_spec.rb +0 -99
data/spec/expression/methods/traverse_spec.rb +0 -161
data/spec/expression/options_spec.rb +0 -128
data/spec/expression/root_spec.rb +0 -9
data/spec/expression/sequence_spec.rb +0 -9
data/spec/expression/subexpression_spec.rb +0 -50
data/spec/expression/to_h_spec.rb +0 -26
data/spec/expression/to_s_spec.rb +0 -100
data/spec/lexer/all_spec.rb +0 -22
data/spec/lexer/conditionals_spec.rb +0 -53
data/spec/lexer/escapes_spec.rb +0 -14
data/spec/lexer/keep_spec.rb +0 -10
data/spec/lexer/literals_spec.rb +0 -89
data/spec/lexer/nesting_spec.rb +0 -99
data/spec/lexer/refcalls_spec.rb +0 -55
data/spec/parser/all_spec.rb +0 -43
data/spec/parser/alternation_spec.rb +0 -88
data/spec/parser/anchors_spec.rb +0 -17
data/spec/parser/conditionals_spec.rb +0 -179
data/spec/parser/errors_spec.rb +0 -30
data/spec/parser/escapes_spec.rb +0 -121
data/spec/parser/free_space_spec.rb +0 -130
data/spec/parser/groups_spec.rb +0 -108
data/spec/parser/keep_spec.rb +0 -6
data/spec/parser/posix_classes_spec.rb +0 -8
data/spec/parser/properties_spec.rb +0 -115
data/spec/parser/quantifiers_spec.rb +0 -51
data/spec/parser/refcalls_spec.rb +0 -112
data/spec/parser/set/intersections_spec.rb +0 -127
data/spec/parser/set/ranges_spec.rb +0 -111
data/spec/parser/sets_spec.rb +0 -178
data/spec/parser/types_spec.rb +0 -18
data/spec/scanner/all_spec.rb +0 -18
data/spec/scanner/anchors_spec.rb +0 -21
data/spec/scanner/conditionals_spec.rb +0 -128
data/spec/scanner/errors_spec.rb +0 -68
data/spec/scanner/escapes_spec.rb +0 -53
data/spec/scanner/free_space_spec.rb +0 -133
data/spec/scanner/groups_spec.rb +0 -52
data/spec/scanner/keep_spec.rb +0 -10
data/spec/scanner/literals_spec.rb +0 -49
data/spec/scanner/meta_spec.rb +0 -18
data/spec/scanner/properties_spec.rb +0 -64
data/spec/scanner/quantifiers_spec.rb +0 -20
data/spec/scanner/refcalls_spec.rb +0 -36
data/spec/scanner/sets_spec.rb +0 -102
data/spec/scanner/types_spec.rb +0 -14
data/spec/spec_helper.rb +0 -15
data/spec/support/runner.rb +0 -42
data/spec/support/shared_examples.rb +0 -77
data/spec/support/warning_extractor.rb +0 -60
data/spec/syntax/syntax_spec.rb +0 -48
data/spec/syntax/syntax_token_map_spec.rb +0 -23
data/spec/syntax/versions/1.8.6_spec.rb +0 -17
data/spec/syntax/versions/1.9.1_spec.rb +0 -10
data/spec/syntax/versions/1.9.3_spec.rb +0 -9
data/spec/syntax/versions/2.0.0_spec.rb +0 -13
data/spec/syntax/versions/2.2.0_spec.rb +0 -9
data/spec/syntax/versions/aliases_spec.rb +0 -37
data/spec/token/token_spec.rb +0 -85
/data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0

data/lib/regexp_parser/scanner/scanner.rl CHANGED Viewed

@@ -3,6 +3,11 @@
   include re_char_type "char_type.rl";
   include re_property  "property.rl";
+  utf8_2_byte           = (0xc2..0xdf 0x80..0xbf);
+  utf8_3_byte           = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
+  utf8_4_byte           = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
+  utf8_multibyte        = utf8_2_byte | utf8_3_byte | utf8_4_byte;
   dot                   = '.';
   backslash             = '\\';
   alternation           = '|';
@@ -15,26 +20,15 @@
   group_open            = '(';
   group_close           = ')';
-  parantheses           = group_open | group_close;
+  parentheses           = group_open | group_close;
   set_open              = '[';
   set_close             = ']';
   brackets              = set_open | set_close;
-  comment               = ('#' . [^\n]* . '\n');
-  class_name_posix      = 'alnum' | 'alpha' | 'blank' |
-                          'cntrl' | 'digit' | 'graph' |
-                          'lower' | 'print' | 'punct' |
-                          'space' | 'upper' | 'xdigit' |
-                          'word'  | 'ascii';
-  class_posix           = ('[:' . '^'? . class_name_posix . ':]');
+  comment               = ('#' . [^\n]* . '\n'?);
-  # these are not supported in ruby, and need verification
-  collating_sequence    = '[.' . (alpha | [\-])+ . '.]';
-  character_equivalent  = '[=' . alpha . '=]';
+  class_posix           = ('[:' . '^'? . [^\[\]]* . ':]');
   line_anchor           = beginning_of_line | end_of_line;
   anchor_char           = [AbBzZG];
@@ -53,21 +47,20 @@
   meta_sequence         = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
+  sequence_char         = [CMcux];
   zero_or_one           = '?' | '??' | '?+';
   zero_or_more          = '*' | '*?' | '*+';
   one_or_more           = '+' | '+?' | '++';
   quantifier_greedy     = '?'  | '*'  | '+';
-  quantifier_reluctant  = '??' | '*?' | '+?';
-  quantifier_possessive = '?+' | '*+' | '++';
-  quantifier_mode       = '?'  | '+';
-  quantifier_interval   = range_open . (digit+)? . ','? . (digit+)? .
-                          range_close . quantifier_mode?;
-  quantifiers           = quantifier_greedy | quantifier_reluctant |
-                          quantifier_possessive | quantifier_interval;
+  quantity_exact        = (digit+);
+  quantity_minimum      = (digit+) . ',';
+  quantity_maximum      = ',' . (digit+);
+  quantity_range        = (digit+) . ',' . (digit+);
+  quantifier_interval   = range_open . ( quantity_exact | quantity_minimum |
+                          quantity_maximum | quantity_range ) . range_close;
   conditional           = '(?(';
@@ -85,22 +78,22 @@
   # try to treat every other group head as options group, like Ruby
   group_options         = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
-  group_ref             = [gk];
-  group_name_char       = (alnum | '_');
-  group_name_id         = (group_name_char . (group_name_char+)?)?;
-  group_number          = '-'? . [1-9] . ([0-9]+)?;
+  group_name_id_ab      = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
+  group_name_id_sq      = ([^0-9\-']  | utf8_multibyte) . ([^'] | utf8_multibyte)*;
+  group_number          = '-'? . [0-9]+;
   group_level           = [+\-] . [0-9]+;
-  group_name            = ('<' . group_name_id . '>') | ("'" . group_name_id . "'");
+  group_name            = ('<' . group_name_id_ab? . '>') |
+                          ("'" . group_name_id_sq? . "'");
   group_lookup          = group_name | group_number;
   group_named           = ('?' . group_name );
-  group_name_ref        = group_ref . (('<' . group_name_id . group_level? '>') |
-                                       ("'" . group_name_id . group_level? "'"));
+  group_ref_body        = (('<' . (group_name_id_ab? | group_number) . group_level? '>') |
+                           ("'" . (group_name_id_sq? | group_number) . group_level? "'"));
-  group_number_ref      = group_ref . (('<' . group_number . group_level? '>') |
-                                       ("'" . group_number . group_level? "'"));
+  group_ref             = 'k' . group_ref_body;
+  group_call            = 'g' . group_ref_body;
   group_type            = group_atomic | group_passive | group_absence | group_named;
@@ -111,32 +104,33 @@
   # characters that 'break' a literal
   meta_char             = dot | backslash | alternation |
-                          curlies | parantheses | brackets |
+                          curlies | parentheses | brackets |
                           line_anchor | quantifier_greedy;
-  ascii_print           = ((0x20..0x7e) - meta_char);
-  ascii_nonprint        = (0x01..0x1f | 0x7f);
+  literal_delimiters    = ']' | '}';
-  utf8_2_byte           = (0xc2..0xdf 0x80..0xbf);
-  utf8_3_byte           = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
-  utf8_4_byte           = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
+  ascii_print           = ((0x20..0x7e) - meta_char - '#');
+  ascii_nonprint        = (0x01..0x1f | 0x7f);
   non_literal_escape    = char_type_char | anchor_char | escaped_ascii |
-                          group_ref | keep_mark | [xucCM];
+                          keep_mark | sequence_char;
+  # escapes that also work within a character set
+  set_escape            = backslash | brackets | escaped_ascii |
+                          octal_sequence | property_char |
+                          sequence_char | single_codepoint_char_type;
-  non_set_escape        = (anchor_char - 'b') | group_ref | keep_mark |
-                          multi_codepoint_char_type | [0-9cCM];
   # EOF error, used where it can be detected
   action premature_end_error {
-    text = ts ? copy(data, ts-1..-1) : data.pack('c*')
-    raise PrematureEndError.new( text )
+    text = copy(data, ts ? ts-1 : 0, -1)
+    raise PrematureEndError.new(text)
   }
   # Invalid sequence error, used from sequences, like escapes and sets
   action invalid_sequence_error {
-    text = ts ? copy(data, ts-1..-1) : data.pack('c*')
-    validation_error(:sequence, 'sequence', text)
+    text = copy(data, ts ? ts-1 : 0, -1)
+    raise ValidationError.for(:sequence, 'sequence', text)
   }
   # group (nesting) and set open/close actions
@@ -150,7 +144,7 @@
   # --------------------------------------------------------------------------
   character_set := |*
     set_close > (set_meta, 2) @set_closed {
-      emit(:set, :close, *text(data, ts, te))
+      emit(:set, :close, copy(data, ts, te))
       if in_set?
         fret;
       else
@@ -159,8 +153,8 @@
     };
     '-]' @set_closed { # special case, emits two tokens
-      emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
-      emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
+      emit(:literal, :literal, '-')
+      emit(:set, :close, ']')
       if in_set?
         fret;
       else
@@ -169,33 +163,32 @@
     };
     '-&&' { # special case, emits two tokens
-      emit(:literal, :literal, '-', ts, te)
-      emit(:set, :intersection, '&&', ts, te)
+      emit(:literal, :literal, '-')
+      emit(:set, :intersection, '&&')
     };
     '^' {
-      text = text(data, ts, te).first
-      if tokens.last[1] == :open
-        emit(:set, :negate, text, ts, te)
+      if prev_token[1] == :open
+        emit(:set, :negate, '^')
       else
-        emit(:literal, :literal, text, ts, te)
+        emit(:literal, :literal, '^')
       end
     };
     '-' {
-      text = text(data, ts, te).first
-      # ranges cant start with a subset or intersection/negation/range operator
-      if tokens.last[0] == :set
-        emit(:literal, :literal, text, ts, te)
+      # ranges cant start with the opening bracket, a subset, or
+      # intersection/negation/range operators
+      if prev_token[0] == :set
+        emit(:literal, :literal, '-')
       else
-        emit(:set, :range, text, ts, te)
+        emit(:set, :range, '-')
       end
     };
     # Unlike ranges, intersections can start or end at set boundaries, whereupon
     # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
     '&&' {
-      emit(:set, :intersection, *text(data, ts, te))
+      emit(:set, :intersection, '&&')
     };
     backslash {
@@ -203,59 +196,60 @@
     };
     set_open >(open_bracket, 1) >set_opened {
-      emit(:set, :open, *text(data, ts, te))
+      emit(:set, :open, '[')
       fcall character_set;
     };
-    class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error)  {
-      text = text(data, ts, te).first
+    class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
+      text = copy(data, ts, te)
       type = :posixclass
       class_name = text[2..-3]
-      if class_name[0].chr == '^'
+      if class_name[0] == '^'
         class_name = class_name[1..-1]
         type = :nonposixclass
       end
-      emit(type, class_name.to_sym, text, ts, te)
-    };
-    collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error)  {
-      emit(:set, :collation, *text(data, ts, te))
-    };
+      unless self.class.posix_classes.include?(class_name)
+        raise ValidationError.for(:posix_class, text)
+      end
-    character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error)  {
-      emit(:set, :equivalent, *text(data, ts, te))
+      emit(type, class_name.to_sym, text)
     };
     meta_char > (set_meta, 1) {
-      emit(:literal, :literal, *text(data, ts, te))
+      emit(:literal, :literal, copy(data, ts, te))
     };
-    any            |
-    ascii_nonprint |
-    utf8_2_byte    |
-    utf8_3_byte    |
-    utf8_4_byte    {
-      char, *rest = *text(data, ts, te)
-      char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
-      emit(:literal, :literal, char, *rest)
+    any | ascii_nonprint | utf8_multibyte {
+      text = copy(data, ts, te)
+      emit(:literal, :literal, text)
     };
   *|;
   # set escapes scanner
   # --------------------------------------------------------------------------
   set_escape_sequence := |*
-    non_set_escape > (escaped_set_alpha, 2) {
-      emit(:escape, :literal, *text(data, ts, te, 1))
+    # Special case: in sets, octal sequences have higher priority than backrefs
+    octal_sequence {
+      emit(:escape, :octal, copy(data, ts-1, te))
       fret;
     };
-    any > (escaped_set_alpha, 1) {
+    # Scan all other escapes that work in sets with the generic escape scanner
+    set_escape > (escaped_set_alpha, 2) {
       fhold;
       fnext character_set;
       fcall escape_sequence;
     };
+    # Treat all remaining escapes - those not supported in sets - as literal.
+    # (This currently includes \^, \-, \&, \:, although these could potentially
+    # be meta chars when not escaped, depending on their position in the set.)
+    any > (escaped_set_alpha, 1) {
+      emit(:escape, :literal, copy(data, ts-1, te))
+      fret;
+    };
   *|;
@@ -263,33 +257,33 @@
   # --------------------------------------------------------------------------
   escape_sequence := |*
     [1-9] {
-      text = text(data, ts, te, 1).first
-      emit(:backref, :number, text, ts-1, te)
+      text = copy(data, ts-1, te)
+      emit(:backref, :number, text)
       fret;
     };
     octal_sequence {
-      emit(:escape, :octal, *text(data, ts, te, 1))
+      emit(:escape, :octal, copy(data, ts-1, te))
       fret;
     };
     meta_char {
-      case text = text(data, ts, te, 1).first
-      when '\.';  emit(:escape, :dot,               text, ts-1, te)
-      when '\|';  emit(:escape, :alternation,       text, ts-1, te)
-      when '\^';  emit(:escape, :bol,               text, ts-1, te)
-      when '\$';  emit(:escape, :eol,               text, ts-1, te)
-      when '\?';  emit(:escape, :zero_or_one,       text, ts-1, te)
-      when '\*';  emit(:escape, :zero_or_more,      text, ts-1, te)
-      when '\+';  emit(:escape, :one_or_more,       text, ts-1, te)
-      when '\(';  emit(:escape, :group_open,        text, ts-1, te)
-      when '\)';  emit(:escape, :group_close,       text, ts-1, te)
-      when '\{';  emit(:escape, :interval_open,     text, ts-1, te)
-      when '\}';  emit(:escape, :interval_close,    text, ts-1, te)
-      when '\[';  emit(:escape, :set_open,          text, ts-1, te)
-      when '\]';  emit(:escape, :set_close,         text, ts-1, te)
+      case text = copy(data, ts-1, te)
+      when '\.';  emit(:escape, :dot,               text)
+      when '\|';  emit(:escape, :alternation,       text)
+      when '\^';  emit(:escape, :bol,               text)
+      when '\$';  emit(:escape, :eol,               text)
+      when '\?';  emit(:escape, :zero_or_one,       text)
+      when '\*';  emit(:escape, :zero_or_more,      text)
+      when '\+';  emit(:escape, :one_or_more,       text)
+      when '\(';  emit(:escape, :group_open,        text)
+      when '\)';  emit(:escape, :group_close,       text)
+      when '\{';  emit(:escape, :interval_open,     text)
+      when '\}';  emit(:escape, :interval_close,    text)
+      when '\[';  emit(:escape, :set_open,          text)
+      when '\]';  emit(:escape, :set_close,         text)
       when "\\\\";
-        emit(:escape, :backslash, text, ts-1, te)
+        emit(:escape, :backslash, text)
       end
       fret;
     };
@@ -297,31 +291,31 @@
     escaped_ascii > (escaped_alpha, 7) {
       # \b is emitted as backspace only when inside a character set, otherwise
       # it is a word boundary anchor. A syntax might "normalize" it if needed.
-      case text = text(data, ts, te, 1).first
-      when '\a'; emit(:escape, :bell,           text, ts-1, te)
-      when '\b'; emit(:escape, :backspace,      text, ts-1, te)
-      when '\e'; emit(:escape, :escape,         text, ts-1, te)
-      when '\f'; emit(:escape, :form_feed,      text, ts-1, te)
-      when '\n'; emit(:escape, :newline,        text, ts-1, te)
-      when '\r'; emit(:escape, :carriage,       text, ts-1, te)
-      when '\t'; emit(:escape, :tab,            text, ts-1, te)
-      when '\v'; emit(:escape, :vertical_tab,   text, ts-1, te)
+      case text = copy(data, ts-1, te)
+      when '\a'; emit(:escape, :bell,           text)
+      when '\b'; emit(:escape, :backspace,      text)
+      when '\e'; emit(:escape, :escape,         text)
+      when '\f'; emit(:escape, :form_feed,      text)
+      when '\n'; emit(:escape, :newline,        text)
+      when '\r'; emit(:escape, :carriage,       text)
+      when '\t'; emit(:escape, :tab,            text)
+      when '\v'; emit(:escape, :vertical_tab,   text)
       end
       fret;
     };
     codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
-      text = text(data, ts, te, 1).first
-      if text[2].chr == '{'
-        emit(:escape, :codepoint_list, text, ts-1, te)
+      text = copy(data, ts-1, te)
+      if text[2] == '{'
+        emit(:escape, :codepoint_list, text)
       else
-        emit(:escape, :codepoint,      text, ts-1, te)
+        emit(:escape, :codepoint,      text)
       end
       fret;
     };
-    hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
-      emit(:escape, :hex, *text(data, ts, te, 1))
+    hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
+      emit(:escape, :hex, copy(data, ts-1, te))
       fret;
     };
@@ -351,8 +345,8 @@
       fcall unicode_property;
     };
-    (any -- non_literal_escape) > (escaped_alpha, 1)  {
-      emit(:escape, :literal, *text(data, ts, te, 1))
+    (any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
+      emit(:escape, :literal, copy(data, ts-1, te))
       fret;
     };
   *|;
@@ -362,9 +356,9 @@
   # --------------------------------------------------------------------------
   conditional_expression := |*
     group_lookup . ')' {
-      text = text(data, ts, te-1).first
-      emit(:conditional, :condition, text, ts, te-1)
-      emit(:conditional, :condition_close, ')', te-1, te)
+      text = copy(data, ts, te-1)
+      emit(:conditional, :condition, text)
+      emit(:conditional, :condition_close, ')')
     };
     any {
@@ -381,46 +375,50 @@
     # Meta characters
     # ------------------------------------------------------------------------
     dot {
-      emit(:meta, :dot, *text(data, ts, te))
+      emit(:meta, :dot, copy(data, ts, te))
     };
     alternation {
       if conditional_stack.last == group_depth
-        emit(:conditional, :separator, *text(data, ts, te))
+        emit(:conditional, :separator, copy(data, ts, te))
       else
-        emit(:meta, :alternation, *text(data, ts, te))
+        emit(:meta, :alternation, copy(data, ts, te))
       end
     };
     # Anchors
     # ------------------------------------------------------------------------
     beginning_of_line {
-      emit(:anchor, :bol, *text(data, ts, te))
+      emit(:anchor, :bol, copy(data, ts, te))
     };
     end_of_line {
-      emit(:anchor, :eol, *text(data, ts, te))
+      emit(:anchor, :eol, copy(data, ts, te))
     };
     backslash . keep_mark > (backslashed, 4) {
-      emit(:keep, :mark, *text(data, ts, te))
+      emit(:keep, :mark, copy(data, ts, te))
     };
     backslash . anchor_char > (backslashed, 3) {
-      case text = text(data, ts, te).first
-      when '\\A'; emit(:anchor, :bos,                text, ts, te)
-      when '\\z'; emit(:anchor, :eos,                text, ts, te)
-      when '\\Z'; emit(:anchor, :eos_ob_eol,         text, ts, te)
-      when '\\b'; emit(:anchor, :word_boundary,      text, ts, te)
-      when '\\B'; emit(:anchor, :nonword_boundary,   text, ts, te)
-      when '\\G'; emit(:anchor, :match_start,        text, ts, te)
+      case text = copy(data, ts, te)
+      when '\A';  emit(:anchor, :bos,                text)
+      when '\z';  emit(:anchor, :eos,                text)
+      when '\Z';  emit(:anchor, :eos_ob_eol,         text)
+      when '\b';  emit(:anchor, :word_boundary,      text)
+      when '\B';  emit(:anchor, :nonword_boundary,   text)
+      when '\G';  emit(:anchor, :match_start,        text)
       end
     };
+    literal_delimiters {
+      append_literal(data, ts, te)
+    };
     # Character sets
     # ------------------------------------------------------------------------
     set_open >set_opened {
-      emit(:set, :open, *text(data, ts, te))
+      emit(:set, :open, copy(data, ts, te))
       fcall character_set;
     };
@@ -429,23 +427,22 @@
     #   (?(condition)Y|N)   conditional expression
     # ------------------------------------------------------------------------
     conditional {
-      text = text(data, ts, te).first
+      text = copy(data, ts, te)
       conditional_stack << group_depth
-      emit(:conditional, :open, text[0..-2], ts, te-1)
-      emit(:conditional, :condition_open, '(', te-1, te)
+      emit(:conditional, :open, text[0..-2])
+      emit(:conditional, :condition_open, '(')
       fcall conditional_expression;
     };
     # (?#...) comments: parsed as a single expression, without introducing a
     # new nesting level. Comments may not include parentheses, escaped or not.
-    # special case for close, action performed on all transitions to get the
-    # correct closing count.
+    # special case for close to get the correct closing count.
     # ------------------------------------------------------------------------
-    group_open . group_comment $group_closed {
-      emit(:group, :comment, *text(data, ts, te))
+    (group_open . group_comment) @group_closed {
+      emit(:group, :comment, copy(data, ts, te))
     };
     # Expression options:
@@ -459,12 +456,12 @@
     #
     #   (?imxdau-imx:subexp)  option on/off for subexp
     # ------------------------------------------------------------------------
-    group_open . group_options >group_opened {
-      text = text(data, ts, te).first
+    (group_open . group_options) >group_opened {
+      text = copy(data, ts, te)
       if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
-        raise InvalidGroupOption.new($1 || "-#{$2}", text)
+        raise ValidationError.for(:group_option, $1 || "-#{$2}", text)
       end
-      emit_options(text, ts, te)
+      emit_options(text)
     };
     # Assertions
@@ -473,12 +470,12 @@
     #   (?<=subexp)         look-behind
     #   (?<!subexp)         negative look-behind
     # ------------------------------------------------------------------------
-    group_open . assertion_type >group_opened {
-      case text = text(data, ts, te).first
-      when '(?=';  emit(:assertion, :lookahead,    text, ts, te)
-      when '(?!';  emit(:assertion, :nlookahead,   text, ts, te)
-      when '(?<='; emit(:assertion, :lookbehind,   text, ts, te)
-      when '(?<!'; emit(:assertion, :nlookbehind,  text, ts, te)
+    (group_open . assertion_type) >group_opened {
+      case text = copy(data, ts, te)
+      when '(?=';  emit(:assertion, :lookahead,    text)
+      when '(?!';  emit(:assertion, :nlookahead,   text)
+      when '(?<='; emit(:assertion, :lookbehind,   text)
+      when '(?<!'; emit(:assertion, :nlookbehind,  text)
       end
     };
@@ -490,106 +487,78 @@
     #   (?'name'subexp)     named group (single quoted version)
     #   (subexp)            captured group
     # ------------------------------------------------------------------------
-    group_open . group_type >group_opened {
-      case text = text(data, ts, te).first
-      when '(?:';  emit(:group, :passive,      text, ts, te)
-      when '(?>';  emit(:group, :atomic,       text, ts, te)
-      when '(?~';  emit(:group, :absence,      text, ts, te)
+    (group_open . group_type) >group_opened {
+      case text = copy(data, ts, te)
+      when '(?:';  emit(:group, :passive,      text)
+      when '(?>';  emit(:group, :atomic,       text)
+      when '(?~';  emit(:group, :absence,      text)
       when /^\(\?(?:<>|'')/
-        validation_error(:group, 'named group', 'name is empty')
+        raise ValidationError.for(:group, 'named group', 'name is empty')
-      when /^\(\?<\w*>/
-        emit(:group, :named_ab,  text, ts, te)
+      when /^\(\?<[^>]+>/
+        emit(:group, :named_ab,  text)
-      when /^\(\?'\w*'/
-        emit(:group, :named_sq,  text, ts, te)
+      when /^\(\?'[^']+'/
+        emit(:group, :named_sq,  text)
       end
     };
     group_open @group_opened {
-      text = text(data, ts, te).first
-      emit(:group, :capture, text, ts, te)
+      text = copy(data, ts, te)
+      emit(:group, :capture, text)
     };
     group_close @group_closed {
       if conditional_stack.last == group_depth + 1
         conditional_stack.pop
-        emit(:conditional, :close, *text(data, ts, te))
-      else
+        emit(:conditional, :close, ')')
+      elsif group_depth >= 0
         if spacing_stack.length > 1 &&
            spacing_stack.last[:depth] == group_depth + 1
           spacing_stack.pop
           self.free_spacing = spacing_stack.last[:free_spacing]
         end
-        emit(:group, :close, *text(data, ts, te))
+        emit(:group, :close, ')')
+      else
+        raise ValidationError.for(:group, 'group', 'unmatched close parenthesis')
       end
     };
     # Group backreference, named and numbered
     # ------------------------------------------------------------------------
-    backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
-      case text = text(data, ts, te).first
-      when /^\\([gk])(<>|'')/ # angle brackets
-        validation_error(:backref, 'ref/call', 'ref ID is empty')
-      when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
-        if $1 == 'k'
-          emit(:backref, :name_ref_ab, text, ts, te)
-        else
-          emit(:backref, :name_call_ab, text, ts, te)
-        end
-      when /^\\([gk])'[^\d+-]\w*'/ #single quotes
-        if $1 == 'k'
-          emit(:backref, :name_ref_sq, text, ts, te)
-        else
-          emit(:backref, :name_call_sq, text, ts, te)
-        end
-      when /^\\([gk])<\d+>/ # angle-brackets
-        if $1 == 'k'
-          emit(:backref, :number_ref_ab, text, ts, te)
-        else
-          emit(:backref, :number_call_ab, text, ts, te)
-        end
-      when /^\\([gk])'\d+'/ # single quotes
-        if $1 == 'k'
-          emit(:backref, :number_ref_sq, text, ts, te)
-        else
-          emit(:backref, :number_call_sq, text, ts, te)
-        end
-      when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
-        if $1 == 'k'
-          emit(:backref, :number_rel_ref_ab, text, ts, te)
-        else
-          emit(:backref, :number_rel_call_ab, text, ts, te)
-        end
-      when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
-        if $1 == 'k'
-          emit(:backref, :number_rel_ref_sq, text, ts, te)
-        else
-          emit(:backref, :number_rel_call_sq, text, ts, te)
-        end
-      when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
-        emit(:backref, :name_recursion_ref_ab, text, ts, te)
-      when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
-        emit(:backref, :name_recursion_ref_sq, text, ts, te)
-      when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
-        emit(:backref, :number_recursion_ref_ab, text, ts, te)
-      when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
-        emit(:backref, :number_recursion_ref_sq, text, ts, te)
+    backslash . (group_ref) > (backslashed, 4) {
+      case text = copy(data, ts, te)
+      when /^\\k(.)[^0-9\-][^+\-]*['>]$/
+        emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
+      when /^\\k(.)[1-9]\d*['>]$/
+        emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
+      when /^\\k(.)-[1-9]\d*['>]$/
+        emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
+      when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/
+        emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
+      when /^\\k(.)-?[1-9]\d*[+\-]\d+['>]$/
+        emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
+      else
+        raise ValidationError.for(:backref, 'backreference', 'invalid ref ID')
+      end
+    };
+    # Group call, named and numbered
+    # ------------------------------------------------------------------------
+    backslash . (group_call) > (backslashed, 4) {
+      case text = copy(data, ts, te)
+      when /^\\g(.)[^0-9+\-].*['>]$/
+        emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
+      when /^\\g(.)\d+['>]$/
+        emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
+      when /^\\g(.)[+-]\d+/
+        emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
+      else
+        raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID')
       end
     };
@@ -597,31 +566,36 @@
     # Quantifiers
     # ------------------------------------------------------------------------
     zero_or_one {
-      case text = text(data, ts, te).first
-      when '?' ;  emit(:quantifier, :zero_or_one,            text, ts, te)
-      when '??';  emit(:quantifier, :zero_or_one_reluctant,  text, ts, te)
-      when '?+';  emit(:quantifier, :zero_or_one_possessive, text, ts, te)
+      case text = copy(data, ts, te)
+      when '?' ;  emit(:quantifier, :zero_or_one,            text)
+      when '??';  emit(:quantifier, :zero_or_one_reluctant,  text)
+      when '?+';  emit(:quantifier, :zero_or_one_possessive, text)
       end
     };
     zero_or_more {
-      case text = text(data, ts, te).first
-      when '*' ;  emit(:quantifier, :zero_or_more,            text, ts, te)
-      when '*?';  emit(:quantifier, :zero_or_more_reluctant,  text, ts, te)
-      when '*+';  emit(:quantifier, :zero_or_more_possessive, text, ts, te)
+      case text = copy(data, ts, te)
+      when '*' ;  emit(:quantifier, :zero_or_more,            text)
+      when '*?';  emit(:quantifier, :zero_or_more_reluctant,  text)
+      when '*+';  emit(:quantifier, :zero_or_more_possessive, text)
       end
     };
     one_or_more {
-      case text = text(data, ts, te).first
-      when '+' ;  emit(:quantifier, :one_or_more,            text, ts, te)
-      when '+?';  emit(:quantifier, :one_or_more_reluctant,  text, ts, te)
-      when '++';  emit(:quantifier, :one_or_more_possessive, text, ts, te)
+      case text = copy(data, ts, te)
+      when '+' ;  emit(:quantifier, :one_or_more,            text)
+      when '+?';  emit(:quantifier, :one_or_more_reluctant,  text)
+      when '++';  emit(:quantifier, :one_or_more_possessive, text)
       end
     };
-    quantifier_interval  @err(premature_end_error) {
-      emit(:quantifier, :interval, *text(data, ts, te))
+    quantifier_interval {
+      emit(:quantifier, :interval, copy(data, ts, te))
+    };
+    # Catch unmatched curly braces as literals
+    range_open {
+      append_literal(data, ts, te)
     };
     # Escaped sequences
@@ -632,15 +606,17 @@
     comment {
       if free_spacing
-        emit(:free_space, :comment, *text(data, ts, te))
+        emit(:free_space, :comment, copy(data, ts, te))
       else
-        append_literal(data, ts, te)
+        # consume only the pound sign (#) and backtrack to do regular scanning
+        append_literal(data, ts, ts + 1)
+        fexec ts + 1;
       end
     };
     space+ {
       if free_spacing
-        emit(:free_space, :whitespace, *text(data, ts, te))
+        emit(:free_space, :whitespace, copy(data, ts, te))
       else
         append_literal(data, ts, te)
       end
@@ -649,105 +625,47 @@
     # Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
     # except meta characters.
     # ------------------------------------------------------------------------
-    (ascii_print -- space)+    |
-    ascii_nonprint+ |
-    utf8_2_byte+    |
-    utf8_3_byte+    |
-    utf8_4_byte+    {
+    (ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
       append_literal(data, ts, te)
     };
   *|;
 }%%
-# THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
-# This file was generated from lib/regexp_parser/scanner/scanner.rl
+require 'regexp_parser/scanner/errors/scanner_error'
+require 'regexp_parser/scanner/errors/premature_end_error'
+require 'regexp_parser/scanner/errors/validation_error'
 class Regexp::Scanner
-  # General scanner error (catch all)
-  class ScannerError < StandardError; end
-  # Base for all scanner validation errors
-  class ValidationError < StandardError
-    def initialize(reason)
-      super reason
-    end
-  end
-  # Unexpected end of pattern
-  class PrematureEndError < ScannerError
-    def initialize(where = '')
-      super "Premature end of pattern at #{where}"
-    end
-  end
-  # Invalid sequence format. Used for escape sequences, mainly.
-  class InvalidSequenceError < ValidationError
-    def initialize(what = 'sequence', where = '')
-      super "Invalid #{what} at #{where}"
-    end
-  end
-  # Invalid group. Used for named groups.
-  class InvalidGroupError < ValidationError
-    def initialize(what, reason)
-      super "Invalid #{what}, #{reason}."
-    end
-  end
-  # Invalid groupOption. Used for inline options.
-  class InvalidGroupOption < ValidationError
-    def initialize(option, text)
-      super "Invalid group option #{option} in #{text}"
-    end
-  end
-  # Invalid back reference. Used for name a number refs/calls.
-  class InvalidBackrefError < ValidationError
-    def initialize(what, reason)
-      super "Invalid back reference #{what}, #{reason}"
-    end
-  end
-  # The property name was not recognized by the scanner.
-  class UnknownUnicodePropertyError < ValidationError
-    def initialize(name)
-      super "Unknown unicode character property name #{name}"
-    end
-  end
   # Scans the given regular expression text, or Regexp object and collects the
   # emitted tokens into an array that gets returned at the end. If a block is
   # given, it gets called for each emitted token.
   #
   # This method may raise errors if a syntax error is encountered.
   # --------------------------------------------------------------------------
-  def self.scan(input_object, &block)
-    new.scan(input_object, &block)
+  def self.scan(input_object, options: nil, collect_tokens: true, &block)
+    new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
   end
-  def scan(input_object, &block)
-    self.literal = nil
+  def scan(input_object, options: nil, collect_tokens: true, &block)
+    self.collect_tokens = collect_tokens
+    self.literal_run = nil
     stack = []
-    if input_object.is_a?(Regexp)
-      input = input_object.source
-      self.free_spacing = (input_object.options & Regexp::EXTENDED != 0)
-    else
-      input = input_object
-      self.free_spacing = false
-    end
+    input = input_object.is_a?(Regexp) ? input_object.source : input_object
+    self.free_spacing = free_spacing?(input_object, options)
     self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
-    data  = input.unpack("c*") if input.is_a?(String)
+    data  = input.unpack("c*")
     eof   = data.length
     self.tokens = []
-    self.block  = block_given? ? block : nil
+    self.block  = block
     self.set_depth = 0
     self.group_depth = 0
     self.conditional_stack = []
+    self.char_pos = 0
     %% write data;
     %% write init;
@@ -757,7 +675,7 @@ class Regexp::Scanner
     testEof = testEof
     if cs == re_scanner_error
-      text = ts ? copy(data, ts-1..-1) : data.pack('c*')
+      text = copy(data, ts ? ts-1 : 0, -1)
       raise ScannerError.new("Scan error at '#{text}'")
     end
@@ -767,40 +685,76 @@ class Regexp::Scanner
           "[#{set_depth}]") if in_set?
     # when the entire expression is a literal run
-    emit_literal if literal
+    emit_literal if literal_run
     tokens
   end
   # lazy-load property maps when first needed
-  require 'yaml'
-  PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
   def self.short_prop_map
-    @short_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/short.yml")
+    @short_prop_map ||= parse_prop_map('short')
   end
   def self.long_prop_map
-    @long_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/long.yml")
+    @long_prop_map ||= parse_prop_map('long')
+  end
+  def self.parse_prop_map(name)
+    File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
+  end
+  def self.posix_classes
+    %w[alnum alpha ascii blank cntrl digit graph
+       lower print punct space upper word xdigit]
   end
   # Emits an array with the details of the scanned pattern
-  def emit(type, token, text, ts, te)
+  def emit(type, token, text)
     #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
-    emit_literal if literal
+    emit_literal if literal_run
+    # Ragel runs with byte-based indices (ts, te). These are of little value to
+    # end-users, so we keep track of char-based indices and emit those instead.
+    ts_char_pos = char_pos
+    te_char_pos = char_pos + text.length
+    tok = [type, token, text, ts_char_pos, te_char_pos]
+    self.prev_token = tok
+    self.char_pos = te_char_pos
     if block
-      block.call type, token, text, ts, te
+      block.call type, token, text, ts_char_pos, te_char_pos
+      # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
+      tokens << tok if collect_tokens
+    elsif collect_tokens
+      tokens << tok
     end
-    tokens << [type, token, text, ts, te]
   end
+  attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
   private
-  attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
-                :group_depth, :set_depth, :conditional_stack
+  attr_accessor :block,
+                :collect_tokens, :tokens, :prev_token,
+                :free_spacing, :spacing_stack,
+                :group_depth, :set_depth, :conditional_stack,
+                :char_pos
+  def free_spacing?(input_object, options)
+    if options && !input_object.is_a?(String)
+      raise ArgumentError, 'options cannot be supplied unless scanning a String'
+    end
+    options = input_object.options if input_object.is_a?(::Regexp)
+    return false unless options
+    options & Regexp::EXTENDED != 0
+  end
   def in_group?
     group_depth > 0
@@ -811,36 +765,24 @@ class Regexp::Scanner
   end
   # Copy from ts to te from data as text
-  def copy(data, range)
-    data[range].pack('c*')
-  end
-  # Copy from ts to te from data as text, returning an array with the text
-  #  and the offsets used to copy it.
-  def text(data, ts, te, soff = 0)
-    [copy(data, ts-soff..te-1), ts-soff, te]
+  def copy(data, ts, te)
+    data[ts...te].pack('c*').force_encoding('utf-8')
   end
   # Appends one or more characters to the literal buffer, to be emitted later
-  # by a call to emit_literal. Contents can be a mix of ASCII and UTF-8.
+  # by a call to emit_literal.
   def append_literal(data, ts, te)
-    self.literal = literal || []
-    literal << text(data, ts, te)
+    (self.literal_run ||= []) << copy(data, ts, te)
   end
-  # Emits the literal run collected by calls to the append_literal method,
-  # using the total start (ts) and end (te) offsets of the run.
+  # Emits the literal run collected by calls to the append_literal method.
   def emit_literal
-    ts, te = literal.first[1], literal.last[2]
-    text = literal.map {|t| t[0]}.join
-    text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
-    self.literal = nil
-    emit(:literal, :literal, text, ts, te)
+    text = literal_run.join
+    self.literal_run = nil
+    emit(:literal, :literal, text)
   end
-  def emit_options(text, ts, te)
+  def emit_options(text)
     token = nil
     # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
@@ -866,28 +808,13 @@ class Regexp::Scanner
       token = :options_switch
     end
-    emit(:group, token, text, ts, te)
+    emit(:group, token, text)
   end
   def emit_meta_control_sequence(data, ts, te, token)
     if data.last < 0x00 || data.last > 0x7F
-      validation_error(:sequence, 'escape', token.to_s)
-    end
-    emit(:escape, token, *text(data, ts, te, 1))
-  end
-  # Centralizes and unifies the handling of validation related
-  # errors.
-  def validation_error(type, what, reason)
-    case type
-    when :group
-      error = InvalidGroupError.new(what, reason)
-    when :backref
-      error = InvalidBackrefError.new(what, reason)
-    when :sequence
-      error = InvalidSequenceError.new(what, reason)
+      raise ValidationError.for(:sequence, 'escape', token.to_s)
     end
-    raise error # unless @@config.validation_ignore
+    emit(:escape, token, copy(data, ts-1, te))
   end
 end # class Regexp::Scanner