RubyGems - regexp_parser - Versions diffs - 1.8.1 → 2.0.3 - Mend

regexp_parser 1.8.1 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41)

checksums.yaml +4 -4
data/CHANGELOG.md +70 -0
data/Gemfile +1 -0
data/README.md +12 -11
data/Rakefile +2 -2
data/lib/regexp_parser/expression.rb +10 -19
data/lib/regexp_parser/expression/classes/free_space.rb +1 -1
data/lib/regexp_parser/expression/classes/group.rb +22 -2
data/lib/regexp_parser/expression/classes/root.rb +4 -16
data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
data/lib/regexp_parser/expression/quantifier.rb +9 -0
data/lib/regexp_parser/expression/sequence.rb +0 -10
data/lib/regexp_parser/lexer.rb +2 -2
data/lib/regexp_parser/parser.rb +27 -2
data/lib/regexp_parser/scanner.rb +1194 -1272
data/lib/regexp_parser/scanner/char_type.rl +11 -11
data/lib/regexp_parser/scanner/property.rl +2 -2
data/lib/regexp_parser/scanner/scanner.rl +178 -186
data/lib/regexp_parser/syntax.rb +4 -4
data/lib/regexp_parser/syntax/any.rb +2 -2
data/lib/regexp_parser/syntax/base.rb +1 -1
data/lib/regexp_parser/syntax/version_lookup.rb +4 -4
data/lib/regexp_parser/version.rb +1 -1
data/spec/expression/base_spec.rb +10 -0
data/spec/expression/subexpression_spec.rb +1 -1
data/spec/expression/to_s_spec.rb +39 -31
data/spec/lexer/literals_spec.rb +24 -49
data/spec/parser/errors_spec.rb +1 -1
data/spec/parser/escapes_spec.rb +1 -1
data/spec/parser/quantifiers_spec.rb +16 -0
data/spec/parser/set/ranges_spec.rb +3 -3
data/spec/scanner/escapes_spec.rb +7 -0
data/spec/scanner/groups_spec.rb +10 -1
data/spec/scanner/literals_spec.rb +28 -38
data/spec/scanner/quantifiers_spec.rb +18 -13
data/spec/scanner/sets_spec.rb +23 -5
data/spec/spec_helper.rb +1 -0
metadata +56 -60
data/spec/expression/root_spec.rb +0 -9
data/spec/expression/sequence_spec.rb +0 -9

data/lib/regexp_parser/scanner/char_type.rl CHANGED

@@ -10,17 +10,17 @@
   # --------------------------------------------------------------------------
   char_type := |*
     char_type_char {
-      case text = text(data, ts, te, 1).first
-      when '\d'; emit(:type, :digit,      text, ts - 1, te)
-      when '\D'; emit(:type, :nondigit,   text, ts - 1, te)
-      when '\h'; emit(:type, :hex,        text, ts - 1, te)
-      when '\H'; emit(:type, :nonhex,     text, ts - 1, te)
-      when '\s'; emit(:type, :space,      text, ts - 1, te)
-      when '\S'; emit(:type, :nonspace,   text, ts - 1, te)
-      when '\w'; emit(:type, :word,       text, ts - 1, te)
-      when '\W'; emit(:type, :nonword,    text, ts - 1, te)
-      when '\R'; emit(:type, :linebreak,  text, ts - 1, te)
-      when '\X'; emit(:type, :xgrapheme,  text, ts - 1, te)
+      case text = copy(data, ts-1, te)
+      when '\d'; emit(:type, :digit,      text)
+      when '\D'; emit(:type, :nondigit,   text)
+      when '\h'; emit(:type, :hex,        text)
+      when '\H'; emit(:type, :nonhex,     text)
+      when '\s'; emit(:type, :space,      text)
+      when '\S'; emit(:type, :nonspace,   text)
+      when '\w'; emit(:type, :word,       text)
+      when '\W'; emit(:type, :nonword,    text)
+      when '\R'; emit(:type, :linebreak,  text)
+      when '\X'; emit(:type, :xgrapheme,  text)
       end
       fret;
     };

data/lib/regexp_parser/scanner/property.rl CHANGED

@@ -14,7 +14,7 @@
   unicode_property := |*
     property_sequence < eof(premature_property_end) {
-      text = text(data, ts, te, 1).first
+      text = copy(data, ts-1, te)
       type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
       name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
@@ -22,7 +22,7 @@
       token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
       raise UnknownUnicodePropertyError.new(name) unless token
-      self.emit(type, token.to_sym, text, ts-1, te)
+      self.emit(type, token.to_sym, text)
       fret;
     };

data/lib/regexp_parser/scanner/scanner.rl CHANGED

@@ -3,6 +3,11 @@
   include re_char_type "char_type.rl";
   include re_property  "property.rl";
+  utf8_2_byte           = (0xc2..0xdf 0x80..0xbf);
+  utf8_3_byte           = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
+  utf8_4_byte           = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
+  utf8_multibyte        = utf8_2_byte | utf8_3_byte | utf8_4_byte;
   dot                   = '.';
   backslash             = '\\';
   alternation           = '|';
@@ -32,7 +37,7 @@
   class_posix           = ('[:' . '^'? . class_name_posix . ':]');
-  # these are not supported in ruby, and need verification
+  # these are not supported in ruby at the moment
   collating_sequence    = '[.' . (alpha | [\-])+ . '.]';
   character_equivalent  = '[=' . alpha . '=]';
@@ -90,18 +95,19 @@
   group_options         = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
   group_ref             = [gk];
-  group_name_char       = (alnum | '_');
-  group_name_id         = (group_name_char . (group_name_char+)?)?;
-  group_number          = '-'? . [1-9] . ([0-9]+)?;
+  group_name_id_ab      = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
+  group_name_id_sq      = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
+  group_number          = '-'? . [1-9] . [0-9]*;
   group_level           = [+\-] . [0-9]+;
-  group_name            = ('<' . group_name_id . '>') | ("'" . group_name_id . "'");
+  group_name            = ('<' . group_name_id_ab? . '>') |
+                          ("'" . group_name_id_sq? . "'");
   group_lookup          = group_name | group_number;
   group_named           = ('?' . group_name );
-  group_name_ref        = group_ref . (('<' . group_name_id . group_level? '>') |
-                                       ("'" . group_name_id . group_level? "'"));
+  group_name_ref        = group_ref . (('<' . group_name_id_ab? . group_level? '>') |
+                                       ("'" . group_name_id_sq? . group_level? "'"));
   group_number_ref      = group_ref . (('<' . group_number . group_level? '>') |
                                        ("'" . group_number . group_level? "'"));
@@ -123,10 +129,6 @@
   ascii_print           = ((0x20..0x7e) - meta_char - '#');
   ascii_nonprint        = (0x01..0x1f | 0x7f);
-  utf8_2_byte           = (0xc2..0xdf 0x80..0xbf);
-  utf8_3_byte           = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
-  utf8_4_byte           = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
   non_literal_escape    = char_type_char | anchor_char | escaped_ascii |
                           keep_mark | [xucCM];
@@ -135,13 +137,13 @@
   # EOF error, used where it can be detected
   action premature_end_error {
-    text = ts ? copy(data, ts-1..-1) : data.pack('c*')
+    text = copy(data, ts ? ts-1 : 0, -1)
     raise PrematureEndError.new( text )
   }
   # Invalid sequence error, used from sequences, like escapes and sets
   action invalid_sequence_error {
-    text = ts ? copy(data, ts-1..-1) : data.pack('c*')
+    text = copy(data, ts ? ts-1 : 0, -1)
     validation_error(:sequence, 'sequence', text)
   }
@@ -156,7 +158,7 @@
   # --------------------------------------------------------------------------
   character_set := |*
     set_close > (set_meta, 2) @set_closed {
-      emit(:set, :close, *text(data, ts, te))
+      emit(:set, :close, copy(data, ts, te))
       if in_set?
         fret;
       else
@@ -165,8 +167,8 @@
     };
     '-]' @set_closed { # special case, emits two tokens
-      emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
-      emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
+      emit(:literal, :literal, copy(data, ts, te-1))
+      emit(:set, :close, copy(data, ts+1, te))
       if in_set?
         fret;
       else
@@ -175,33 +177,33 @@
     };
     '-&&' { # special case, emits two tokens
-      emit(:literal, :literal, '-', ts, te)
-      emit(:set, :intersection, '&&', ts, te)
+      emit(:literal, :literal, '-')
+      emit(:set, :intersection, '&&')
     };
     '^' {
-      text = text(data, ts, te).first
+      text = copy(data, ts, te)
       if tokens.last[1] == :open
-        emit(:set, :negate, text, ts, te)
+        emit(:set, :negate, text)
       else
-        emit(:literal, :literal, text, ts, te)
+        emit(:literal, :literal, text)
       end
     };
     '-' {
-      text = text(data, ts, te).first
+      text = copy(data, ts, te)
       # ranges cant start with a subset or intersection/negation/range operator
       if tokens.last[0] == :set
-        emit(:literal, :literal, text, ts, te)
+        emit(:literal, :literal, text)
       else
-        emit(:set, :range, text, ts, te)
+        emit(:set, :range, text)
       end
     };
     # Unlike ranges, intersections can start or end at set boundaries, whereupon
     # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
     '&&' {
-      emit(:set, :intersection, *text(data, ts, te))
+      emit(:set, :intersection, copy(data, ts, te))
     };
     backslash {
@@ -209,12 +211,12 @@
     };
     set_open >(open_bracket, 1) >set_opened {
-      emit(:set, :open, *text(data, ts, te))
+      emit(:set, :open, copy(data, ts, te))
       fcall character_set;
     };
     class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error)  {
-      text = text(data, ts, te).first
+      text = copy(data, ts, te)
       type = :posixclass
       class_name = text[2..-3]
@@ -223,29 +225,24 @@
         type = :nonposixclass
       end
-      emit(type, class_name.to_sym, text, ts, te)
+      emit(type, class_name.to_sym, text)
     };
-    collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error)  {
-      emit(:set, :collation, *text(data, ts, te))
-    };
-    character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error)  {
-      emit(:set, :equivalent, *text(data, ts, te))
-    };
+    # These are not supported in ruby at the moment. Enable them if they are.
+    # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error)  {
+    #   emit(:set, :collation, copy(data, ts, te))
+    # };
+    # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error)  {
+    #   emit(:set, :equivalent, copy(data, ts, te))
+    # };
     meta_char > (set_meta, 1) {
-      emit(:literal, :literal, *text(data, ts, te))
+      emit(:literal, :literal, copy(data, ts, te))
     };
-    any            |
-    ascii_nonprint |
-    utf8_2_byte    |
-    utf8_3_byte    |
-    utf8_4_byte    {
-      char, *rest = *text(data, ts, te)
-      char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
-      emit(:literal, :literal, char, *rest)
+    any | ascii_nonprint | utf8_multibyte {
+      text = copy(data, ts, te)
+      emit(:literal, :literal, text)
     };
   *|;
@@ -253,7 +250,7 @@
   # --------------------------------------------------------------------------
   set_escape_sequence := |*
     non_set_escape > (escaped_set_alpha, 2) {
-      emit(:escape, :literal, *text(data, ts, te, 1))
+      emit(:escape, :literal, copy(data, ts-1, te))
       fret;
     };
@@ -269,33 +266,33 @@
   # --------------------------------------------------------------------------
   escape_sequence := |*
     [1-9] {
-      text = text(data, ts, te, 1).first
-      emit(:backref, :number, text, ts-1, te)
+      text = copy(data, ts-1, te)
+      emit(:backref, :number, text)
       fret;
     };
     octal_sequence {
-      emit(:escape, :octal, *text(data, ts, te, 1))
+      emit(:escape, :octal, copy(data, ts-1, te))
       fret;
     };
     meta_char {
-      case text = text(data, ts, te, 1).first
-      when '\.';  emit(:escape, :dot,               text, ts-1, te)
-      when '\|';  emit(:escape, :alternation,       text, ts-1, te)
-      when '\^';  emit(:escape, :bol,               text, ts-1, te)
-      when '\$';  emit(:escape, :eol,               text, ts-1, te)
-      when '\?';  emit(:escape, :zero_or_one,       text, ts-1, te)
-      when '\*';  emit(:escape, :zero_or_more,      text, ts-1, te)
-      when '\+';  emit(:escape, :one_or_more,       text, ts-1, te)
-      when '\(';  emit(:escape, :group_open,        text, ts-1, te)
-      when '\)';  emit(:escape, :group_close,       text, ts-1, te)
-      when '\{';  emit(:escape, :interval_open,     text, ts-1, te)
-      when '\}';  emit(:escape, :interval_close,    text, ts-1, te)
-      when '\[';  emit(:escape, :set_open,          text, ts-1, te)
-      when '\]';  emit(:escape, :set_close,         text, ts-1, te)
+      case text = copy(data, ts-1, te)
+      when '\.';  emit(:escape, :dot,               text)
+      when '\|';  emit(:escape, :alternation,       text)
+      when '\^';  emit(:escape, :bol,               text)
+      when '\$';  emit(:escape, :eol,               text)
+      when '\?';  emit(:escape, :zero_or_one,       text)
+      when '\*';  emit(:escape, :zero_or_more,      text)
+      when '\+';  emit(:escape, :one_or_more,       text)
+      when '\(';  emit(:escape, :group_open,        text)
+      when '\)';  emit(:escape, :group_close,       text)
+      when '\{';  emit(:escape, :interval_open,     text)
+      when '\}';  emit(:escape, :interval_close,    text)
+      when '\[';  emit(:escape, :set_open,          text)
+      when '\]';  emit(:escape, :set_close,         text)
       when "\\\\";
-        emit(:escape, :backslash, text, ts-1, te)
+        emit(:escape, :backslash, text)
       end
       fret;
     };
@@ -303,31 +300,31 @@
     escaped_ascii > (escaped_alpha, 7) {
       # \b is emitted as backspace only when inside a character set, otherwise
       # it is a word boundary anchor. A syntax might "normalize" it if needed.
-      case text = text(data, ts, te, 1).first
-      when '\a'; emit(:escape, :bell,           text, ts-1, te)
-      when '\b'; emit(:escape, :backspace,      text, ts-1, te)
-      when '\e'; emit(:escape, :escape,         text, ts-1, te)
-      when '\f'; emit(:escape, :form_feed,      text, ts-1, te)
-      when '\n'; emit(:escape, :newline,        text, ts-1, te)
-      when '\r'; emit(:escape, :carriage,       text, ts-1, te)
-      when '\t'; emit(:escape, :tab,            text, ts-1, te)
-      when '\v'; emit(:escape, :vertical_tab,   text, ts-1, te)
+      case text = copy(data, ts-1, te)
+      when '\a'; emit(:escape, :bell,           text)
+      when '\b'; emit(:escape, :backspace,      text)
+      when '\e'; emit(:escape, :escape,         text)
+      when '\f'; emit(:escape, :form_feed,      text)
+      when '\n'; emit(:escape, :newline,        text)
+      when '\r'; emit(:escape, :carriage,       text)
+      when '\t'; emit(:escape, :tab,            text)
+      when '\v'; emit(:escape, :vertical_tab,   text)
       end
       fret;
     };
     codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
-      text = text(data, ts, te, 1).first
+      text = copy(data, ts-1, te)
       if text[2].chr == '{'
-        emit(:escape, :codepoint_list, text, ts-1, te)
+        emit(:escape, :codepoint_list, text)
       else
-        emit(:escape, :codepoint,      text, ts-1, te)
+        emit(:escape, :codepoint,      text)
       end
       fret;
     };
-    hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
-      emit(:escape, :hex, *text(data, ts, te, 1))
+    hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
+      emit(:escape, :hex, copy(data, ts-1, te))
       fret;
     };
@@ -357,8 +354,8 @@
       fcall unicode_property;
     };
-    (any -- non_literal_escape) > (escaped_alpha, 1)  {
-      emit(:escape, :literal, *text(data, ts, te, 1))
+    (any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
+      emit(:escape, :literal, copy(data, ts-1, te))
       fret;
     };
   *|;
@@ -368,9 +365,9 @@
   # --------------------------------------------------------------------------
   conditional_expression := |*
     group_lookup . ')' {
-      text = text(data, ts, te-1).first
-      emit(:conditional, :condition, text, ts, te-1)
-      emit(:conditional, :condition_close, ')', te-1, te)
+      text = copy(data, ts, te-1)
+      emit(:conditional, :condition, text)
+      emit(:conditional, :condition_close, ')')
     };
     any {
@@ -387,39 +384,39 @@
     # Meta characters
     # ------------------------------------------------------------------------
     dot {
-      emit(:meta, :dot, *text(data, ts, te))
+      emit(:meta, :dot, copy(data, ts, te))
     };
     alternation {
       if conditional_stack.last == group_depth
-        emit(:conditional, :separator, *text(data, ts, te))
+        emit(:conditional, :separator, copy(data, ts, te))
       else
-        emit(:meta, :alternation, *text(data, ts, te))
+        emit(:meta, :alternation, copy(data, ts, te))
       end
     };
     # Anchors
     # ------------------------------------------------------------------------
     beginning_of_line {
-      emit(:anchor, :bol, *text(data, ts, te))
+      emit(:anchor, :bol, copy(data, ts, te))
     };
     end_of_line {
-      emit(:anchor, :eol, *text(data, ts, te))
+      emit(:anchor, :eol, copy(data, ts, te))
     };
     backslash . keep_mark > (backslashed, 4) {
-      emit(:keep, :mark, *text(data, ts, te))
+      emit(:keep, :mark, copy(data, ts, te))
     };
     backslash . anchor_char > (backslashed, 3) {
-      case text = text(data, ts, te).first
-      when '\\A'; emit(:anchor, :bos,                text, ts, te)
-      when '\\z'; emit(:anchor, :eos,                text, ts, te)
-      when '\\Z'; emit(:anchor, :eos_ob_eol,         text, ts, te)
-      when '\\b'; emit(:anchor, :word_boundary,      text, ts, te)
-      when '\\B'; emit(:anchor, :nonword_boundary,   text, ts, te)
-      when '\\G'; emit(:anchor, :match_start,        text, ts, te)
+      case text = copy(data, ts, te)
+      when '\\A'; emit(:anchor, :bos,                text)
+      when '\\z'; emit(:anchor, :eos,                text)
+      when '\\Z'; emit(:anchor, :eos_ob_eol,         text)
+      when '\\b'; emit(:anchor, :word_boundary,      text)
+      when '\\B'; emit(:anchor, :nonword_boundary,   text)
+      when '\\G'; emit(:anchor, :match_start,        text)
       end
     };
@@ -430,7 +427,7 @@
     # Character sets
     # ------------------------------------------------------------------------
     set_open >set_opened {
-      emit(:set, :open, *text(data, ts, te))
+      emit(:set, :open, copy(data, ts, te))
       fcall character_set;
     };
@@ -439,12 +436,12 @@
     #   (?(condition)Y|N)   conditional expression
     # ------------------------------------------------------------------------
     conditional {
-      text = text(data, ts, te).first
+      text = copy(data, ts, te)
       conditional_stack << group_depth
-      emit(:conditional, :open, text[0..-2], ts, te-1)
-      emit(:conditional, :condition_open, '(', te-1, te)
+      emit(:conditional, :open, text[0..-2])
+      emit(:conditional, :condition_open, '(')
       fcall conditional_expression;
     };
@@ -455,7 +452,7 @@
     # correct closing count.
     # ------------------------------------------------------------------------
     group_open . group_comment $group_closed {
-      emit(:group, :comment, *text(data, ts, te))
+      emit(:group, :comment, copy(data, ts, te))
     };
     # Expression options:
@@ -470,11 +467,11 @@
     #   (?imxdau-imx:subexp)  option on/off for subexp
     # ------------------------------------------------------------------------
     group_open . group_options >group_opened {
-      text = text(data, ts, te).first
+      text = copy(data, ts, te)
       if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
         raise InvalidGroupOption.new($1 || "-#{$2}", text)
       end
-      emit_options(text, ts, te)
+      emit_options(text)
     };
     # Assertions
@@ -484,11 +481,11 @@
     #   (?<!subexp)         negative look-behind
     # ------------------------------------------------------------------------
     group_open . assertion_type >group_opened {
-      case text = text(data, ts, te).first
-      when '(?=';  emit(:assertion, :lookahead,    text, ts, te)
-      when '(?!';  emit(:assertion, :nlookahead,   text, ts, te)
-      when '(?<='; emit(:assertion, :lookbehind,   text, ts, te)
-      when '(?<!'; emit(:assertion, :nlookbehind,  text, ts, te)
+      case text = copy(data, ts, te)
+      when '(?=';  emit(:assertion, :lookahead,    text)
+      when '(?!';  emit(:assertion, :nlookahead,   text)
+      when '(?<='; emit(:assertion, :lookbehind,   text)
+      when '(?<!'; emit(:assertion, :nlookbehind,  text)
       end
     };
@@ -501,32 +498,32 @@
     #   (subexp)            captured group
     # ------------------------------------------------------------------------
     group_open . group_type >group_opened {
-      case text = text(data, ts, te).first
-      when '(?:';  emit(:group, :passive,      text, ts, te)
-      when '(?>';  emit(:group, :atomic,       text, ts, te)
-      when '(?~';  emit(:group, :absence,      text, ts, te)
+      case text = copy(data, ts, te)
+      when '(?:';  emit(:group, :passive,      text)
+      when '(?>';  emit(:group, :atomic,       text)
+      when '(?~';  emit(:group, :absence,      text)
       when /^\(\?(?:<>|'')/
         validation_error(:group, 'named group', 'name is empty')
-      when /^\(\?<\w*>/
-        emit(:group, :named_ab,  text, ts, te)
+      when /^\(\?<[^>]+>/
+        emit(:group, :named_ab,  text)
-      when /^\(\?'\w*'/
-        emit(:group, :named_sq,  text, ts, te)
+      when /^\(\?'[^']+'/
+        emit(:group, :named_sq,  text)
       end
     };
     group_open @group_opened {
-      text = text(data, ts, te).first
-      emit(:group, :capture, text, ts, te)
+      text = copy(data, ts, te)
+      emit(:group, :capture, text)
     };
     group_close @group_closed {
       if conditional_stack.last == group_depth + 1
         conditional_stack.pop
-        emit(:conditional, :close, *text(data, ts, te))
+        emit(:conditional, :close, copy(data, ts, te))
       else
         if spacing_stack.length > 1 &&
            spacing_stack.last[:depth] == group_depth + 1
@@ -534,7 +531,7 @@
           self.free_spacing = spacing_stack.last[:free_spacing]
         end
-        emit(:group, :close, *text(data, ts, te))
+        emit(:group, :close, copy(data, ts, te))
       end
     };
@@ -542,63 +539,65 @@
     # Group backreference, named and numbered
     # ------------------------------------------------------------------------
     backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
-      case text = text(data, ts, te).first
+      case text = copy(data, ts, te)
       when /^\\([gk])(<>|'')/ # angle brackets
         validation_error(:backref, 'ref/call', 'ref ID is empty')
-      when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
+      # TODO: finer quirks of choosing recursive or non-recursive refs/calls.
+      # e.g.: `a-1` is a valid group id: 'aa'[/(?<a-1>a)\g<a-1>/] # => 'aa'
+      when /^\\([gk])<[^\p{digit}+\->][^>+\-]*>/ # angle-brackets
         if $1 == 'k'
-          emit(:backref, :name_ref_ab, text, ts, te)
+          emit(:backref, :name_ref_ab, text)
         else
-          emit(:backref, :name_call_ab, text, ts, te)
+          emit(:backref, :name_call_ab, text)
         end
-      when /^\\([gk])'[^\d+-]\w*'/ #single quotes
+      when /^\\([gk])'[^\p{digit}+\-'][^'+\-]*'/ # single quotes
         if $1 == 'k'
-          emit(:backref, :name_ref_sq, text, ts, te)
+          emit(:backref, :name_ref_sq, text)
         else
-          emit(:backref, :name_call_sq, text, ts, te)
+          emit(:backref, :name_call_sq, text)
         end
       when /^\\([gk])<\d+>/ # angle-brackets
         if $1 == 'k'
-          emit(:backref, :number_ref_ab, text, ts, te)
+          emit(:backref, :number_ref_ab, text)
         else
-          emit(:backref, :number_call_ab, text, ts, te)
+          emit(:backref, :number_call_ab, text)
         end
       when /^\\([gk])'\d+'/ # single quotes
         if $1 == 'k'
-          emit(:backref, :number_ref_sq, text, ts, te)
+          emit(:backref, :number_ref_sq, text)
         else
-          emit(:backref, :number_call_sq, text, ts, te)
+          emit(:backref, :number_call_sq, text)
         end
       when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
         if $1 == 'k'
-          emit(:backref, :number_rel_ref_ab, text, ts, te)
+          emit(:backref, :number_rel_ref_ab, text)
         else
-          emit(:backref, :number_rel_call_ab, text, ts, te)
+          emit(:backref, :number_rel_call_ab, text)
         end
       when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
         if $1 == 'k'
-          emit(:backref, :number_rel_ref_sq, text, ts, te)
+          emit(:backref, :number_rel_ref_sq, text)
         else
-          emit(:backref, :number_rel_call_sq, text, ts, te)
+          emit(:backref, :number_rel_call_sq, text)
         end
-      when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
-        emit(:backref, :name_recursion_ref_ab, text, ts, te)
+      when /^\\k<[^\p{digit}+\->][^>]*[+\-]\d+>/ # angle-brackets
+        emit(:backref, :name_recursion_ref_ab, text)
-      when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
-        emit(:backref, :name_recursion_ref_sq, text, ts, te)
+      when /^\\k'[^\p{digit}+\-'][^']*[+\-]\d+'/ # single-quotes
+        emit(:backref, :name_recursion_ref_sq, text)
       when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
-        emit(:backref, :number_recursion_ref_ab, text, ts, te)
+        emit(:backref, :number_recursion_ref_ab, text)
       when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
-        emit(:backref, :number_recursion_ref_sq, text, ts, te)
+        emit(:backref, :number_recursion_ref_sq, text)
       end
     };
@@ -607,31 +606,31 @@
     # Quantifiers
     # ------------------------------------------------------------------------
     zero_or_one {
-      case text = text(data, ts, te).first
-      when '?' ;  emit(:quantifier, :zero_or_one,            text, ts, te)
-      when '??';  emit(:quantifier, :zero_or_one_reluctant,  text, ts, te)
-      when '?+';  emit(:quantifier, :zero_or_one_possessive, text, ts, te)
+      case text = copy(data, ts, te)
+      when '?' ;  emit(:quantifier, :zero_or_one,            text)
+      when '??';  emit(:quantifier, :zero_or_one_reluctant,  text)
+      when '?+';  emit(:quantifier, :zero_or_one_possessive, text)
       end
     };
     zero_or_more {
-      case text = text(data, ts, te).first
-      when '*' ;  emit(:quantifier, :zero_or_more,            text, ts, te)
-      when '*?';  emit(:quantifier, :zero_or_more_reluctant,  text, ts, te)
-      when '*+';  emit(:quantifier, :zero_or_more_possessive, text, ts, te)
+      case text = copy(data, ts, te)
+      when '*' ;  emit(:quantifier, :zero_or_more,            text)
+      when '*?';  emit(:quantifier, :zero_or_more_reluctant,  text)
+      when '*+';  emit(:quantifier, :zero_or_more_possessive, text)
       end
     };
     one_or_more {
-      case text = text(data, ts, te).first
-      when '+' ;  emit(:quantifier, :one_or_more,            text, ts, te)
-      when '+?';  emit(:quantifier, :one_or_more_reluctant,  text, ts, te)
-      when '++';  emit(:quantifier, :one_or_more_possessive, text, ts, te)
+      case text = copy(data, ts, te)
+      when '+' ;  emit(:quantifier, :one_or_more,            text)
+      when '+?';  emit(:quantifier, :one_or_more_reluctant,  text)
+      when '++';  emit(:quantifier, :one_or_more_possessive, text)
       end
     };
     quantifier_interval  {
-      emit(:quantifier, :interval, *text(data, ts, te))
+      emit(:quantifier, :interval, copy(data, ts, te))
     };
     # Catch unmatched curly braces as literals
@@ -647,7 +646,7 @@
     comment {
       if free_spacing
-        emit(:free_space, :comment, *text(data, ts, te))
+        emit(:free_space, :comment, copy(data, ts, te))
       else
         # consume only the pound sign (#) and backtrack to do regular scanning
         append_literal(data, ts, ts + 1)
@@ -657,7 +656,7 @@
     space+ {
       if free_spacing
-        emit(:free_space, :whitespace, *text(data, ts, te))
+        emit(:free_space, :whitespace, copy(data, ts, te))
       else
         append_literal(data, ts, te)
       end
@@ -666,11 +665,7 @@
     # Literal: any run of ASCII (pritable or non-printable), and/or UTF-8,
     # except meta characters.
     # ------------------------------------------------------------------------
-    (ascii_print -- space)+    |
-    ascii_nonprint+ |
-    utf8_2_byte+    |
-    utf8_3_byte+    |
-    utf8_4_byte+    {
+    (ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
       append_literal(data, ts, te)
     };
@@ -760,6 +755,7 @@ class Regexp::Scanner
     self.set_depth = 0
     self.group_depth = 0
     self.conditional_stack = []
+    self.char_pos = 0
     %% write data;
     %% write init;
@@ -769,7 +765,7 @@ class Regexp::Scanner
     testEof = testEof
     if cs == re_scanner_error
-      text = ts ? copy(data, ts-1..-1) : data.pack('c*')
+      text = copy(data, ts ? ts-1 : 0, -1)
       raise ScannerError.new("Scan error at '#{text}'")
     end
@@ -797,22 +793,29 @@ class Regexp::Scanner
   end
   # Emits an array with the details of the scanned pattern
-  def emit(type, token, text, ts, te)
+  def emit(type, token, text)
     #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
     emit_literal if literal
+    # Ragel runs with byte-based indices (ts, te). These are of little value to
+    # end-users, so we keep track of char-based indices and emit those instead.
+    ts_char_pos = char_pos
+    te_char_pos = char_pos + text.length
     if block
-      block.call type, token, text, ts, te
+      block.call type, token, text, ts_char_pos, te_char_pos
     end
-    tokens << [type, token, text, ts, te]
+    tokens << [type, token, text, ts_char_pos, te_char_pos]
+    self.char_pos = te_char_pos
   end
   private
   attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
-                :group_depth, :set_depth, :conditional_stack
+                :group_depth, :set_depth, :conditional_stack, :char_pos
   def free_spacing?(input_object, options)
     if options && !input_object.is_a?(String)
@@ -835,36 +838,25 @@ class Regexp::Scanner
   end
   # Copy from ts to te from data as text
-  def copy(data, range)
-    data[range].pack('c*')
-  end
-  # Copy from ts to te from data as text, returning an array with the text
-  #  and the offsets used to copy it.
-  def text(data, ts, te, soff = 0)
-    [copy(data, ts-soff..te-1), ts-soff, te]
+  def copy(data, ts, te)
+    data[ts...te].pack('c*').force_encoding('utf-8')
   end
   # Appends one or more characters to the literal buffer, to be emitted later
-  # by a call to emit_literal. Contents can be a mix of ASCII and UTF-8.
+  # by a call to emit_literal.
   def append_literal(data, ts, te)
     self.literal = literal || []
-    literal << text(data, ts, te)
+    literal << copy(data, ts, te)
   end
-  # Emits the literal run collected by calls to the append_literal method,
-  # using the total start (ts) and end (te) offsets of the run.
+  # Emits the literal run collected by calls to the append_literal method.
   def emit_literal
-    ts, te = literal.first[1], literal.last[2]
-    text = literal.map {|t| t[0]}.join
-    text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
+    text = literal.join
     self.literal = nil
-    emit(:literal, :literal, text, ts, te)
+    emit(:literal, :literal, text)
   end
-  def emit_options(text, ts, te)
+  def emit_options(text)
     token = nil
     # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
@@ -890,14 +882,14 @@ class Regexp::Scanner
       token = :options_switch
     end
-    emit(:group, token, text, ts, te)
+    emit(:group, token, text)
   end
   def emit_meta_control_sequence(data, ts, te, token)
     if data.last < 0x00 || data.last > 0x7F
       validation_error(:sequence, 'escape', token.to_s)
     end
-    emit(:escape, token, *text(data, ts, te, 1))
+    emit(:escape, token, copy(data, ts-1, te))
   end
   # Centralizes and unifies the handling of validation related