RubyGems - regexp_parser - Versions diffs - 0.1.1 → 0.1.5 - Mend

regexp_parser 0.1.1 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61) hide show

checksums.yaml +7 -0
data/ChangeLog +45 -0
data/Rakefile +12 -44
data/VERSION.yml +5 -0
data/lib/regexp_parser.rb +5 -38
data/lib/regexp_parser/expression.rb +68 -221
data/lib/regexp_parser/expression/classes/alternation.rb +47 -0
data/lib/regexp_parser/expression/classes/anchor.rb +26 -0
data/lib/regexp_parser/expression/classes/backref.rb +42 -0
data/lib/regexp_parser/expression/classes/escape.rb +27 -0
data/lib/regexp_parser/expression/classes/group.rb +67 -0
data/lib/regexp_parser/expression/classes/literal.rb +7 -0
data/lib/regexp_parser/expression/{property.rb → classes/property.rb} +1 -1
data/lib/regexp_parser/expression/classes/root.rb +26 -0
data/lib/regexp_parser/expression/classes/set.rb +100 -0
data/lib/regexp_parser/expression/classes/type.rb +17 -0
data/lib/regexp_parser/expression/quantifier.rb +26 -0
data/lib/regexp_parser/expression/subexpression.rb +69 -0
data/lib/regexp_parser/lexer.rb +4 -4
data/lib/regexp_parser/parser.rb +31 -13
data/lib/regexp_parser/scanner.rb +1849 -1488
data/lib/regexp_parser/scanner/property.rl +7 -2
data/lib/regexp_parser/scanner/scanner.rl +377 -191
data/lib/regexp_parser/syntax.rb +7 -0
data/lib/regexp_parser/syntax/ruby/1.8.6.rb +4 -4
data/lib/regexp_parser/syntax/ruby/1.9.1.rb +9 -9
data/lib/regexp_parser/syntax/ruby/2.0.0.rb +16 -0
data/lib/regexp_parser/syntax/ruby/2.1.0.rb +13 -0
data/lib/regexp_parser/syntax/tokens.rb +21 -320
data/lib/regexp_parser/syntax/tokens/anchor.rb +17 -0
data/lib/regexp_parser/syntax/tokens/assertion.rb +15 -0
data/lib/regexp_parser/syntax/tokens/backref.rb +26 -0
data/lib/regexp_parser/syntax/tokens/character_set.rb +48 -0
data/lib/regexp_parser/syntax/tokens/character_type.rb +16 -0
data/lib/regexp_parser/syntax/tokens/escape.rb +29 -0
data/lib/regexp_parser/syntax/tokens/group.rb +22 -0
data/lib/regexp_parser/syntax/tokens/meta.rb +15 -0
data/lib/regexp_parser/syntax/tokens/quantifier.rb +37 -0
data/lib/regexp_parser/syntax/tokens/unicode_property.rb +204 -0
data/lib/regexp_parser/token.rb +37 -0
data/test/expression/test_all.rb +7 -0
data/test/expression/test_base.rb +72 -0
data/test/expression/test_clone.rb +144 -0
data/test/{parser/test_expression.rb → expression/test_to_s.rb} +10 -10
data/test/helpers.rb +1 -0
data/test/parser/test_all.rb +1 -1
data/test/parser/test_alternation.rb +35 -0
data/test/parser/test_anchors.rb +2 -2
data/test/parser/test_refcalls.rb +1 -1
data/test/parser/test_sets.rb +54 -8
data/test/scanner/test_anchors.rb +2 -2
data/test/scanner/test_conditionals.rb +31 -0
data/test/scanner/test_errors.rb +88 -8
data/test/scanner/test_escapes.rb +4 -4
data/test/scanner/test_groups.rb +7 -0
data/test/scanner/test_quoting.rb +29 -0
data/test/scanner/test_sets.rb +1 -0
data/test/syntax/ruby/test_1.8.rb +3 -3
data/test/test_all.rb +1 -1
metadata +62 -48
data/lib/regexp_parser/expression/set.rb +0 -59

data/lib/regexp_parser/scanner/property.rl CHANGED Viewed

@@ -58,7 +58,7 @@
   unicode_property := |*
     property_sequence < eof(premature_property_end) {
-      text = data[ts-1..te-1].pack('c*')
+      text = text(data, ts, te, 1).first
       if in_set
         type = :set
       else
@@ -525,9 +525,14 @@
         self.emit(type, :script_unknown,                  text, ts-1, te)
       else
-        raise UnknownUnicodePropertyError.new(name)
+        # Should this really be an error? Or would emitting
+        # an :unknown for the property be better?
+        #
+        # self.emit(type, :unknown, text, ts-1, te)
+        raise UnknownUnicodePropertyError.new(name)
       end
       fret;
     };
   *|;

data/lib/regexp_parser/scanner/scanner.rl CHANGED Viewed

@@ -28,6 +28,7 @@
   class_posix           = ('[:' . '^'? . class_name_posix . ':]');
   # these are not supported in ruby, and need verification
   collating_sequence    = '[.' . (alpha | [\-])+ . '.]';
   character_equivalent  = '[=' . alpha . '=]';
@@ -41,14 +42,21 @@
   octal_sequence        = [0-7]{1,3};
   hex_sequence          = 'x' . xdigit{1,2};
+  hex_sequence_err      = 'x' . [^0-9a-fA-F{];
   wide_hex_sequence     = 'x' . '{' . xdigit{1,8} . '}';
+  hex_or_not            = (xdigit|[^0-9a-fA-F}]); # note closing curly at end
+  wide_hex_seq_invalid  = 'x' . '{' . hex_or_not{1,9};
+  wide_hex_seq_empty    = 'x' . '{' . (space+)? . '}';
   codepoint_single      = 'u' . xdigit{4};
   codepoint_list        = 'u{' . (xdigit{4} . space?)+'}';
   codepoint_sequence    = codepoint_single | codepoint_list;
-  control_sequence      = ('c' | 'C-') . alpha;
-  meta_sequence         = 'M-' . ((backslash . control_sequence) | alpha);
+  control_sequence      = ('c' | 'C-');
+  meta_sequence         = 'M-' . (backslash . control_sequence)?;
   zero_or_one           = '?' | '??' | '?+';
   zero_or_more          = '*' | '*?' | '*+';
@@ -59,11 +67,11 @@
   quantifier_possessive = '?+' | '*+' | '++';
   quantifier_mode       = '?'  | '+';
-  quantifier_range      = range_open . (digit+)? . ','? . (digit+)? .
+  quantifier_interval   = range_open . (digit+)? . ','? . (digit+)? .
                           range_close . quantifier_mode?;
   quantifiers           = quantifier_greedy | quantifier_reluctant |
-                          quantifier_possessive | quantifier_range;
+                          quantifier_possessive | quantifier_interval;
   group_comment         = '?#' . [^)]+ . group_close;
@@ -76,10 +84,10 @@
   assertion_lookbehind  = '?<=';
   assertion_nlookbehind = '?<!';
-  group_options         = '?' . ([mix]{1,3})? . '-'? . ([mix]{1,3})?;
+  group_options         = '?' . [\-mix];
   group_ref             = [gk];
-  group_name            = alpha . (alnum+)?;
+  group_name            = (alnum . (alnum+)?)?;
   group_number          = '-'? . [1-9] . ([0-9]+)?;
   group_level           = [+\-] . [0-9]+;
@@ -113,7 +121,16 @@
                           group_ref | [xucCM];
   # EOF error, used where it can be detected
-  action premature_end_error { raise PrematureEndError }
+  action premature_end_error {
+    text = ts ? copy(data, ts-1..-1) : data.pack('c*')
+    raise PrematureEndError.new( text )
+  }
+  # Invalid sequence error, used from sequences, like escapes and sets
+  action invalid_sequence_error {
+    text = ts ? copy(data, ts-1..-1) : data.pack('c*')
+    raise InvalidSequenceError.new('sequence', text)
+  }
   # group (nesting) and set open/close actions
   action group_opened { group_depth += 1; in_group = true }
@@ -127,7 +144,7 @@
       set_type  = set_depth > 1 ? :subset : :set
       set_depth -= 1; in_set = set_depth > 0 ? true : false
-      self.emit(set_type, :close, data[ts..te-1].pack('c*'), ts, te)
+      emit(set_type, :close, *text(data, ts, te))
       if set_depth == 0
         fgoto main;
@@ -140,8 +157,8 @@
       set_type  = set_depth > 1 ? :subset : :set
       set_depth -= 1; in_set = set_depth > 0 ? true : false
-      self.emit(set_type, :member, data[ts..te-2].pack('c*'), ts, te)
-      self.emit(set_type, :close,  data[ts+1..te-1].pack('c*'), ts, te)
+      emit(set_type, :member, copy(data, ts..te-2), ts, te)
+      emit(set_type, :close,  copy(data, ts+1..te-1), ts, te)
       if set_depth == 0
         fgoto main;
@@ -151,20 +168,20 @@
     };
     '^' {
-      text = data[ts..te-1].pack('c*')
+      text = text(data, ts, te).first
       if @tokens.last[1] == :open
-        self.emit(set_type, :negate, text, ts, te)
+        emit(set_type, :negate, text, ts, te)
       else
-        self.emit(set_type, :member, text, ts, te)
+        emit(set_type, :member, text, ts, te)
       end
     };
     alnum . '-' . alnum {
-      self.emit(set_type, :range, data[ts..te-1].pack('c*'), ts, te)
+      emit(set_type, :range, *text(data, ts, te))
     };
     '&&' {
-      self.emit(set_type, :intersection, data[ts..te-1].pack('c*'), ts, te)
+      emit(set_type, :intersection, *text(data, ts, te))
     };
     '\\' {
@@ -175,12 +192,12 @@
       set_depth += 1; in_set = true
       set_type  = set_depth > 1 ? :subset : :set
-      self.emit(set_type, :open, data[ts..te-1].pack('c*'), ts, te)
+      emit(set_type, :open, *text(data, ts, te))
       fcall character_set;
     };
     class_posix >(open_bracket, 1) @eof(premature_end_error) {
-      text = data[ts..te-1].pack('c*')
+      text = text(data, ts, te).first
       class_name = text[2..-3]
       if class_name[0].chr == '^'
@@ -188,21 +205,21 @@
       end
       token_sym = "class_#{class_name}".to_sym
-      self.emit(set_type, token_sym, text, ts, te)
+      emit(set_type, token_sym, text, ts, te)
     };
     collating_sequence >(open_bracket, 1) @eof(premature_end_error) {
-      self.emit(set_type, :collation, data[ts..te-1].pack('c*'), ts, te)
+      emit(set_type, :collation, *text(data, ts, te))
     };
     character_equivalent >(open_bracket, 1) @eof(premature_end_error) {
-      self.emit(set_type, :equivalent, data[ts..te-1].pack('c*'), ts, te)
+      emit(set_type, :equivalent, *text(data, ts, te))
     };
     # exclude the closing bracket as a cleaner workaround for dealing with the
     # ambiguity caused upon exit from the unicode properties machine
     meta_char -- ']' {
-     self.emit(set_type, :member, data[ts..te-1].pack('c*'), ts, te)
+      emit(set_type, :member, *text(data, ts, te))
     };
     any            |
@@ -210,48 +227,48 @@
     utf8_2_byte    |
     utf8_3_byte    |
     utf8_4_byte    {
-      self.emit(set_type, :member, data[ts..te-1].pack('c*'), ts, te)
+      emit(set_type, :member, *text(data, ts, te))
     };
   *|;
   # set escapes scanner
   # --------------------------------------------------------------------------
   set_escape_sequence := |*
-    'b' {
-      self.emit(set_type, :backspace, data[ts-1..te-1].pack('c*'), ts-1, te)
+    'b' > (escaped_set_alpha, 2) {
+      emit(set_type, :backspace, *text(data, ts, te, 1))
       fret;
     };
     char_type {
-      case text = data[ts-1..te-1].pack('c*')
-      when '\d'; self.emit(set_type, :type_digit,     text, ts-1, te)
-      when '\D'; self.emit(set_type, :type_nondigit,  text, ts-1, te)
-      when '\h'; self.emit(set_type, :type_hex,       text, ts-1, te)
-      when '\H'; self.emit(set_type, :type_nonhex,    text, ts-1, te)
-      when '\s'; self.emit(set_type, :type_space,     text, ts-1, te)
-      when '\S'; self.emit(set_type, :type_nonspace,  text, ts-1, te)
-      when '\w'; self.emit(set_type, :type_word,      text, ts-1, te)
-      when '\W'; self.emit(set_type, :type_nonword,   text, ts-1, te)
+      case text = text(data, ts, te, 1).first
+      when '\d'; emit(set_type, :type_digit,     text, ts-1, te)
+      when '\D'; emit(set_type, :type_nondigit,  text, ts-1, te)
+      when '\h'; emit(set_type, :type_hex,       text, ts-1, te)
+      when '\H'; emit(set_type, :type_nonhex,    text, ts-1, te)
+      when '\s'; emit(set_type, :type_space,     text, ts-1, te)
+      when '\S'; emit(set_type, :type_nonspace,  text, ts-1, te)
+      when '\w'; emit(set_type, :type_word,      text, ts-1, te)
+      when '\W'; emit(set_type, :type_nonword,   text, ts-1, te)
       end
       fret;
     };
     hex_sequence . '-\\' . hex_sequence {
-      self.emit(set_type, :range_hex, data[ts-1..te-1].pack('c*'), ts-1, te)
+      emit(set_type, :range_hex, *text(data, ts, te, 1))
       fret;
     };
     hex_sequence {
-      self.emit(set_type, :member_hex, data[ts-1..te-1].pack('c*'), ts-1, te)
+      emit(set_type, :member_hex, *text(data, ts, te, 1))
       fret;
     };
     meta_char | [\\\]\-\,] {
-      self.emit(set_type, :escape, data[ts-1..te-1].pack('c*'), ts-1, te)
+      emit(set_type, :escape, *text(data, ts, te, 1))
       fret;
     };
-    property_char > (escaped_set_alpha, 2) {
+    property_char > (escaped_set_alpha, 3) {
       fhold;
       fnext character_set;
       fcall unicode_property;
@@ -264,7 +281,7 @@
     utf8_2_byte               |
     utf8_3_byte               |
     utf8_4_byte               {
-      self.emit(set_type, :escape, data[ts-1..te-1].pack('c*'), ts-1, te)
+      emit(set_type, :escape, *text(data, ts, te, 1))
       fret;
     };
   *|;
@@ -274,33 +291,33 @@
   # --------------------------------------------------------------------------
   escape_sequence := |*
     [1-9] {
-      text = data[ts-1..te-1].pack('c*')
-      self.emit(:backref, :number, text, ts-1, te)
+      text = text(data, ts, te, 1).first
+      emit(:backref, :number, text, ts-1, te)
       fret;
     };
     octal_sequence {
-      self.emit(:escape, :octal, data[ts-1..te-1].pack('c*'), ts-1, te)
+      emit(:escape, :octal, *text(data, ts, te, 1))
       fret;
     };
     meta_char {
-      case text = data[ts-1..te-1].pack('c*')
-      when '\.';  self.emit(:escape, :dot,               text, ts-1, te)
-      when '\|';  self.emit(:escape, :alternation,       text, ts-1, te)
-      when '\^';  self.emit(:escape, :beginning_of_line, text, ts-1, te)
-      when '\$';  self.emit(:escape, :end_of_line,       text, ts-1, te)
-      when '\?';  self.emit(:escape, :zero_or_one,       text, ts-1, te)
-      when '\*';  self.emit(:escape, :zero_or_more,      text, ts-1, te)
-      when '\+';  self.emit(:escape, :one_or_more,       text, ts-1, te)
-      when '\(';  self.emit(:escape, :group_open,        text, ts-1, te)
-      when '\)';  self.emit(:escape, :group_close,       text, ts-1, te)
-      when '\{';  self.emit(:escape, :interval_open,     text, ts-1, te)
-      when '\}';  self.emit(:escape, :interval_close,    text, ts-1, te)
-      when '\[';  self.emit(:escape, :set_open,          text, ts-1, te)
-      when '\]';  self.emit(:escape, :set_close,         text, ts-1, te)
+      case text = text(data, ts, te, 1).first
+      when '\.';  emit(:escape, :dot,               text, ts-1, te)
+      when '\|';  emit(:escape, :alternation,       text, ts-1, te)
+      when '\^';  emit(:escape, :bol,               text, ts-1, te)
+      when '\$';  emit(:escape, :eol,               text, ts-1, te)
+      when '\?';  emit(:escape, :zero_or_one,       text, ts-1, te)
+      when '\*';  emit(:escape, :zero_or_more,      text, ts-1, te)
+      when '\+';  emit(:escape, :one_or_more,       text, ts-1, te)
+      when '\(';  emit(:escape, :group_open,        text, ts-1, te)
+      when '\)';  emit(:escape, :group_close,       text, ts-1, te)
+      when '\{';  emit(:escape, :interval_open,     text, ts-1, te)
+      when '\}';  emit(:escape, :interval_close,    text, ts-1, te)
+      when '\[';  emit(:escape, :set_open,          text, ts-1, te)
+      when '\]';  emit(:escape, :set_close,         text, ts-1, te)
       when "\\\\";
-        self.emit(:escape, :backslash, text, ts-1, te)
+        emit(:escape, :backslash, text, ts-1, te)
       end
       fret;
     };
@@ -308,46 +325,76 @@
     escaped_ascii > (escaped_alpha, 7) {
       # \b is emitted as backspace only when inside a character set, otherwise
       # it is a word boundary anchor. A syntax might "normalize" it if needed.
-      case text = data[ts-1..te-1].pack('c*')
-      when '\a'; self.emit(:escape, :bell,           text, ts-1, te)
-      when '\e'; self.emit(:escape, :escape,         text, ts-1, te)
-      when '\f'; self.emit(:escape, :form_feed,      text, ts-1, te)
-      when '\n'; self.emit(:escape, :newline,        text, ts-1, te)
-      when '\r'; self.emit(:escape, :carriage,       text, ts-1, te)
-      when '\s'; self.emit(:escape, :space,          text, ts-1, te)
-      when '\t'; self.emit(:escape, :tab,            text, ts-1, te)
-      when '\v'; self.emit(:escape, :vertical_tab,   text, ts-1, te)
+      case text = text(data, ts, te, 1).first
+      when '\a'; emit(:escape, :bell,           text, ts-1, te)
+      when '\e'; emit(:escape, :escape,         text, ts-1, te)
+      when '\f'; emit(:escape, :form_feed,      text, ts-1, te)
+      when '\n'; emit(:escape, :newline,        text, ts-1, te)
+      when '\r'; emit(:escape, :carriage,       text, ts-1, te)
+      when '\s'; emit(:escape, :space,          text, ts-1, te)
+      when '\t'; emit(:escape, :tab,            text, ts-1, te)
+      when '\v'; emit(:escape, :vertical_tab,   text, ts-1, te)
       end
       fret;
     };
-    codepoint_sequence > (escaped_alpha, 6) {
-      text = data[ts-1..te-1].pack('c*')
+    codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
+      text = text(data, ts, te, 1).first
       if text[2].chr == '{'
-        self.emit(:escape, :codepoint_list, text, ts-1, te)
+        emit(:escape, :codepoint_list, text, ts-1, te)
       else
-        self.emit(:escape, :codepoint,      text, ts-1, te)
+        emit(:escape, :codepoint,      text, ts-1, te)
       end
       fret;
     };
-    hex_sequence > (escaped_alpha, 5) {
-      self.emit(:escape, :hex, data[ts-1..te-1].pack('c*'), ts-1, te)
+    hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
+      emit(:escape, :hex, *text(data, ts, te, 1))
+      fret;
+    };
+    wide_hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
+      emit(:escape, :hex_wide, *text(data, ts, te, 1))
       fret;
     };
-    wide_hex_sequence > (escaped_alpha, 5) {
-      self.emit(:escape, :hex_wide, data[ts-1..te-1].pack('c*'), ts-1, te)
+    hex_sequence_err @invalid_sequence_error {
       fret;
     };
-    control_sequence > (escaped_alpha, 4) {
-      self.emit(:escape, :control, data[ts-1..te-1].pack('c*'), ts-1, te)
+    (wide_hex_seq_invalid | wide_hex_seq_empty) {
+      raise InvalidSequenceError.new("wide hex sequence")
       fret;
     };
-    meta_sequence > (backslashed, 3) {
-      self.emit(:escape, :meta_sequence, data[ts-1..te-1].pack('c*'), ts-1, te)
+    control_sequence >(escaped_alpha, 4) $eof(premature_end_error) {
+      if data[te]
+        c = data[te].chr
+        if c =~ /[\x00-\x7F]/
+          emit(:escape, :control, copy(data, ts-1..te), ts-1, te+1)
+          p += 1
+        else
+          raise InvalidSequenceError.new("control sequence")
+        end
+      else
+        raise PrematureEndError.new("control sequence")
+      end
+      fret;
+    };
+    meta_sequence >(backslashed, 3) $eof(premature_end_error) {
+      if data[te]
+        c = data[te].chr
+        if c =~ /[\x00-\x7F]/
+          emit(:escape, :meta_sequence, copy(data, ts-1..te), ts-1, te+1)
+          p += 1
+        else
+          raise InvalidSequenceError.new("meta sequence")
+        end
+      else
+        raise PrematureEndError.new("meta sequence")
+      end
+      fret;
     };
     property_char > (escaped_alpha, 2) {
@@ -357,7 +404,7 @@
     };
     (any -- non_literal_escape) > (escaped_alpha, 1)  {
-      self.emit(:escape, :literal, data[ts-1..te-1].pack('c*'), ts-1, te)
+      emit(:escape, :literal, *text(data, ts, te, 1))
       fret;
     };
   *|;
@@ -370,32 +417,34 @@
     # Meta characters
     # ------------------------------------------------------------------------
     dot {
-      self.emit(:meta, :dot, data[ts..te-1].pack('c*'), ts, te)
+      emit(:meta, :dot, *text(data, ts, te))
     };
     alternation {
-      self.emit(:meta, :alternation, data[ts..te-1].pack('c*'), ts, te)
+      emit(:meta, :alternation, *text(data, ts, te))
     };
     # Anchors
     # ------------------------------------------------------------------------
     beginning_of_line {
-      self.emit(:anchor, :beginning_of_line, data[ts..te-1].pack('c*'), ts, te)
+      emit(:anchor, :bol, *text(data, ts, te))
     };
     end_of_line {
-      self.emit(:anchor, :end_of_line, data[ts..te-1].pack('c*'), ts, te)
+      emit(:anchor, :eol, *text(data, ts, te))
     };
     backslash . anchor_char > (backslashed, 3) {
-      case text = data[ts..te-1].pack('c*')
-      when '\\A'; self.emit(:anchor, :bos,                text, ts, te)
-      when '\\z'; self.emit(:anchor, :eos,                text, ts, te)
-      when '\\Z'; self.emit(:anchor, :eos_ob_eol,         text, ts, te)
-      when '\\b'; self.emit(:anchor, :word_boundary,      text, ts, te)
-      when '\\B'; self.emit(:anchor, :nonword_boundary,   text, ts, te)
-      when '\\G'; self.emit(:anchor, :match_start,        text, ts, te)
-      else raise ScannerError.new("Unsupported anchor at #{text} (char #{ts})")
+      case text = text(data, ts, te).first
+      when '\\A'; emit(:anchor, :bos,                text, ts, te)
+      when '\\z'; emit(:anchor, :eos,                text, ts, te)
+      when '\\Z'; emit(:anchor, :eos_ob_eol,         text, ts, te)
+      when '\\b'; emit(:anchor, :word_boundary,      text, ts, te)
+      when '\\B'; emit(:anchor, :nonword_boundary,   text, ts, te)
+      when '\\G'; emit(:anchor, :match_start,        text, ts, te)
+      else
+        raise ScannerError.new(
+          "Unexpected character in anchor at #{text} (char #{ts})")
       end
     };
@@ -406,15 +455,18 @@
     #   \w, \W    word, non-word
     # ------------------------------------------------------------------------
     backslash . char_type > (backslashed, 2) {
-      case text = data[ts..te-1].pack('c*')
-      when '\\d'; self.emit(:type, :digit,      text, ts, te)
-      when '\\D'; self.emit(:type, :nondigit,   text, ts, te)
-      when '\\h'; self.emit(:type, :hex,        text, ts, te)
-      when '\\H'; self.emit(:type, :nonhex,     text, ts, te)
-      when '\\s'; self.emit(:type, :space,      text, ts, te)
-      when '\\S'; self.emit(:type, :nonspace,   text, ts, te)
-      when '\\w'; self.emit(:type, :word,       text, ts, te)
-      when '\\W'; self.emit(:type, :nonword,    text, ts, te)
+      case text = text(data, ts, te).first
+      when '\\d'; emit(:type, :digit,      text, ts, te)
+      when '\\D'; emit(:type, :nondigit,   text, ts, te)
+      when '\\h'; emit(:type, :hex,        text, ts, te)
+      when '\\H'; emit(:type, :nonhex,     text, ts, te)
+      when '\\s'; emit(:type, :space,      text, ts, te)
+      when '\\S'; emit(:type, :nonspace,   text, ts, te)
+      when '\\w'; emit(:type, :word,       text, ts, te)
+      when '\\W'; emit(:type, :nonword,    text, ts, te)
+      else
+        raise ScannerError.new(
+          "Unexpected character in type at #{text} (char #{ts})")
       end
     };
@@ -425,7 +477,7 @@
       set_depth += 1; in_set = true
       set_type  = set_depth > 1 ? :subset : :set
-      self.emit(set_type, :open, data[ts..te-1].pack('c*'), ts, te)
+      emit(set_type, :open, *text(data, ts, te))
       fcall character_set;
     };
@@ -435,7 +487,7 @@
     # correct closing count.
     # ------------------------------------------------------------------------
     group_open . group_comment $group_closed {
-      self.emit(:group, :comment, data[ts..te-1].pack('c*'), ts, te)
+      emit(:group, :comment, *text(data, ts, te))
     };
     # Expression options:
@@ -447,21 +499,7 @@
     #   (?imx-imx:subexp)   option on/off for subexp
     # ------------------------------------------------------------------------
     group_open . group_options >group_opened {
-      # special handling to resolve ambiguity with passive groups
-      if data[te]
-        c = data[te].chr
-        if c == ':' # include the ':'
-          self.emit(:group, :options, data[ts..te].pack('c*'), ts, te+1)
-          p += 1
-        elsif c == ')' # just options by themselves
-          self.emit(:group, :options, data[ts..te-1].pack('c*'), ts, te)
-        else
-          raise ScannerError.new(
-            "Unexpected '#{c}' in options sequence, ':' or ')' expected")
-        end
-      else
-        raise PrematureEndError.new("options") unless data[te]
-      end
+      p = scan_options(p, data, ts, te)
     };
     # Assertions
@@ -471,11 +509,11 @@
     #   (?<!subexp)         negative look-behind
     # ------------------------------------------------------------------------
     group_open . assertion_type >group_opened {
-      case text =  data[ts..te-1].pack('c*')
-      when '(?=';  self.emit(:assertion, :lookahead,    text, ts, te)
-      when '(?!';  self.emit(:assertion, :nlookahead,   text, ts, te)
-      when '(?<='; self.emit(:assertion, :lookbehind,   text, ts, te)
-      when '(?<!'; self.emit(:assertion, :nlookbehind,  text, ts, te)
+      case text = text(data, ts, te).first
+      when '(?=';  emit(:assertion, :lookahead,    text, ts, te)
+      when '(?!';  emit(:assertion, :nlookahead,   text, ts, te)
+      when '(?<='; emit(:assertion, :lookbehind,   text, ts, te)
+      when '(?<!'; emit(:assertion, :nlookbehind,  text, ts, te)
       end
     };
@@ -487,85 +525,103 @@
     #   (subexp)            captured group
     # ------------------------------------------------------------------------
     group_open . group_type >group_opened {
-      case text =  data[ts..te-1].pack('c*')
-      when '(?:';  self.emit(:group, :passive,      text, ts, te)
-      when '(?>';  self.emit(:group, :atomic,       text, ts, te)
-      when /\(\?<\w+>/
-        self.emit(:group, :named_ab,  text, ts, te)
-      when /\(\?'\w+'/
-        self.emit(:group, :named_sq,  text, ts, te)
+      case text = text(data, ts, te).first
+      when '(?:';  emit(:group, :passive,      text, ts, te)
+      when '(?>';  emit(:group, :atomic,       text, ts, te)
+      when /^\(\?<(\w*)>/
+        empty_name_error(:group, 'named group (ab)') if $1.empty?
+        emit(:group, :named_ab,  text, ts, te)
+      when /^\(\?'(\w*)'/
+        empty_name_error(:group, 'named group (sq)') if $1.empty?
+        emit(:group, :named_sq,  text, ts, te)
+      else
+        raise ScannerError.new(
+          "Unknown subexpression group format '#{text}'")
       end
     };
     group_open @group_opened {
-      text =  data[ts..te-1].pack('c*')
-      self.emit(:group, :capture, text, ts, te)
+      text = text(data, ts, te).first
+      emit(:group, :capture, text, ts, te)
     };
     group_close @group_closed {
-      self.emit(:group, :close, data[ts..te-1].pack('c*'), ts, te)
+      emit(:group, :close, *text(data, ts, te))
     };
-    # Group back-reference, named and numbered
+    # Group backreference, named and numbered
     # ------------------------------------------------------------------------
     backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
-      case text = data[ts..te-1].pack('c*')
-      when /\\([gk])<[^\d-](\w+)?>/ # angle-brackets
+      case text = text(data, ts, te).first
+      when /^\\([gk])<>/ # angle brackets
+        empty_backref_error("ref/call (ab)")
+      when /^\\([gk])''/ # single quotes
+        empty_backref_error("ref/call (sq)")
+      when /^\\([gk])<[^\d-](\w+)?>/ # angle-brackets
         if $1 == 'k'
-          self.emit(:backref, :name_ref_ab,  text, ts, te)
+          emit(:backref, :name_ref_ab,  text, ts, te)
         else
-          self.emit(:backref, :name_call_ab,  text, ts, te)
+          emit(:backref, :name_call_ab,  text, ts, te)
         end
-      when /\\([gk])'[^\d-](\w+)?'/ #single quotes
+      when /^\\([gk])'[^\d-](\w+)?'/ #single quotes
         if $1 == 'k'
-          self.emit(:backref, :name_ref_sq,  text, ts, te)
+          emit(:backref, :name_ref_sq,  text, ts, te)
         else
-          self.emit(:backref, :name_call_sq,  text, ts, te)
+          emit(:backref, :name_call_sq,  text, ts, te)
         end
-      when /\\([gk])<\d+>/ # angle-brackets
+      when /^\\([gk])<\d+>/ # angle-brackets
         if $1 == 'k'
-          self.emit(:backref, :number_ref_ab,  text, ts, te)
+          emit(:backref, :number_ref_ab,  text, ts, te)
         else
-          self.emit(:backref, :number_call_ab,  text, ts, te)
+          emit(:backref, :number_call_ab,  text, ts, te)
         end
-      when /\\([gk])'\d+'/ # single quotes
+      when /^\\([gk])'\d+'/ # single quotes
         if $1 == 'k'
-          self.emit(:backref, :number_ref_sq,  text, ts, te)
+          emit(:backref, :number_ref_sq,  text, ts, te)
         else
-          self.emit(:backref, :number_call_sq,  text, ts, te)
+          emit(:backref, :number_call_sq,  text, ts, te)
         end
-      when /\\([gk])<-\d+>/ # angle-brackets
+      when /^\\([gk])<-\d+>/ # angle-brackets
         if $1 == 'k'
-          self.emit(:backref, :number_rel_ref_ab,  text, ts, te)
+          emit(:backref, :number_rel_ref_ab,  text, ts, te)
         else
-          self.emit(:backref, :number_rel_call_ab,  text, ts, te)
+          emit(:backref, :number_rel_call_ab,  text, ts, te)
         end
-      when /\\([gk])'-\d+'/ # single quotes
+      when /^\\([gk])'-\d+'/ # single quotes
         if $1 == 'k'
-          self.emit(:backref, :number_rel_ref_sq,  text, ts, te)
+          emit(:backref, :number_rel_ref_sq,  text, ts, te)
         else
-          self.emit(:backref, :number_rel_call_sq,  text, ts, te)
+          emit(:backref, :number_rel_call_sq,  text, ts, te)
         end
-      when /\\k<[^\d-](\w+)?[+\-]\d+>/ # angle-brackets
-        self.emit(:backref, :name_nest_ref_ab,  text, ts, te)
+      when /^\\k<[^\d-](\w+)?[+\-]\d+>/ # angle-brackets
+        emit(:backref, :name_nest_ref_ab,  text, ts, te)
-      when /\\k'[^\d-](\w+)?[+\-]\d+'/ # single-quotes
-        self.emit(:backref, :name_nest_ref_sq,  text, ts, te)
+      when /^\\k'[^\d-](\w+)?[+\-]\d+'/ # single-quotes
+        emit(:backref, :name_nest_ref_sq,  text, ts, te)
-      when /\\([gk])<\d+[+\-]\d+>/ # angle-brackets
-        self.emit(:backref, :number_nest_ref_ab,  text, ts, te)
+      when /^\\([gk])<\d+[+\-]\d+>/ # angle-brackets
+        emit(:backref, :number_nest_ref_ab,  text, ts, te)
-      when /\\([gk])'\d+[+\-]\d+'/ # single-quotes
-        self.emit(:backref, :number_nest_ref_sq,  text, ts, te)
+      when /^\\([gk])'\d+[+\-]\d+'/ # single-quotes
+        emit(:backref, :number_nest_ref_sq,  text, ts, te)
+      else
+        raise ScannerError.new(
+          "Unknown backreference format '#{text}'")
       end
     };
@@ -573,31 +629,31 @@
     # Quantifiers
     # ------------------------------------------------------------------------
     zero_or_one {
-      case text =  data[ts..te-1].pack('c*')
-      when '?' ;  self.emit(:quantifier, :zero_or_one,            text, ts, te)
-      when '??';  self.emit(:quantifier, :zero_or_one_reluctant,  text, ts, te)
-      when '?+';  self.emit(:quantifier, :zero_or_one_possessive, text, ts, te)
+      case text = text(data, ts, te).first
+      when '?' ;  emit(:quantifier, :zero_or_one,            text, ts, te)
+      when '??';  emit(:quantifier, :zero_or_one_reluctant,  text, ts, te)
+      when '?+';  emit(:quantifier, :zero_or_one_possessive, text, ts, te)
       end
     };
     zero_or_more {
-      case text =  data[ts..te-1].pack('c*')
-      when '*' ;  self.emit(:quantifier, :zero_or_more,            text, ts, te)
-      when '*?';  self.emit(:quantifier, :zero_or_more_reluctant,  text, ts, te)
-      when '*+';  self.emit(:quantifier, :zero_or_more_possessive, text, ts, te)
+      case text = text(data, ts, te).first
+      when '*' ;  emit(:quantifier, :zero_or_more,            text, ts, te)
+      when '*?';  emit(:quantifier, :zero_or_more_reluctant,  text, ts, te)
+      when '*+';  emit(:quantifier, :zero_or_more_possessive, text, ts, te)
       end
     };
     one_or_more {
-      case text =  data[ts..te-1].pack('c*')
-      when '+' ;  self.emit(:quantifier, :one_or_more,            text, ts, te)
-      when '+?';  self.emit(:quantifier, :one_or_more_reluctant,  text, ts, te)
-      when '++';  self.emit(:quantifier, :one_or_more_possessive, text, ts, te)
+      case text = text(data, ts, te).first
+      when '+' ;  emit(:quantifier, :one_or_more,            text, ts, te)
+      when '+?';  emit(:quantifier, :one_or_more_reluctant,  text, ts, te)
+      when '++';  emit(:quantifier, :one_or_more_possessive, text, ts, te)
       end
     };
-    quantifier_range  @err(premature_end_error) {
-      self.emit(:quantifier, :interval, data[ts..te-1].pack('c*'), ts, te)
+    quantifier_interval  @err(premature_end_error) {
+      emit(:quantifier, :interval, *text(data, ts, te))
     };
     # Escaped sequences
@@ -614,35 +670,67 @@
     utf8_2_byte+    |
     utf8_3_byte+    |
     utf8_4_byte+    {
-      self.append_literal(data, ts, te)
+      append_literal(data, ts, te)
     };
   *|;
 }%%
+# THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
+# This file was generated from scanner.rl
 module Regexp::Scanner
   %% write data;
+  # General scanner error (catch all)
   class ScannerError < StandardError
     def initialize(what)
       super what
     end
   end
+  # Base for all scanner validation errors
+  class ValidationError < StandardError
+    def initialize(reason)
+      super reason
+    end
+  end
+  # Unexpected end of pattern
   class PrematureEndError < ScannerError
     def initialize(where = '')
-      super "Premature end of pattern: #{where}"
+      super "Premature end of pattern at #{where}"
+    end
+  end
+  # Invalid sequence format. Used for escape sequences, mainly.
+  class InvalidSequenceError < ValidationError
+    def initialize(what = 'sequence', where = '')
+      super "Invalid #{what} at #{where}"
+    end
+  end
+  # Invalid group. Used for named groups.
+  class InvalidGroupError < ValidationError
+    def initialize(what, reason)
+      super "Invalid #{what}, #{reason}."
+    end
+  end
+  # Invalid back reference. Used for named and numbered refs/calls.
+  class InvalidBackrefError < ValidationError
+    def initialize(what, reason)
+      super "Invalid back reference #{what}, #{reason}"
     end
   end
-  class UnknownUnicodePropertyError < ScannerError
+  # The property name was not recognized by the scanner.
+  class UnknownUnicodePropertyError < ValidationError
     def initialize(name)
       super "Unknown unicode character property name #{name}"
     end
   end
   # Scans the given regular expression text or Regexp object and collects the
   # emitted tokens into an array that is returned at the end. If a block is
   # given, it is called for each emitted token.
@@ -665,42 +753,107 @@ module Regexp::Scanner
     %% write init;
     %% write exec;
+    if cs == re_scanner_error
+      text = ts ? copy(data, ts-1..-1) : data.pack('c*')
+      raise ScannerError.new("Scan error at '#{text}'")
+    end
     raise PrematureEndError.new("(missing group closing paranthesis) "+
           "[#{in_group}:#{group_depth}]") if in_group
     raise PrematureEndError.new("(missing set closing bracket) "+
           "[#{in_set}:#{set_depth}]") if in_set
     # when the entire expression is a literal run
-    self.emit_literal if @literal
+    emit_literal if @literal
     @tokens
   end
-  # appends one or more characters to the literal buffer, to be emitted later
-  # by a call to emit_literal. contents a mix of ASCII and UTF-8
+  private
+  # Ragel's regex-based scan of the group options introduced a lot of
+  # ambiguity, so we just ask it to find the beginning of what looks
+  # like an options run and handle the rest in here.
+  def self.scan_options(p, data, ts, te)
+    text = text(data, ts, te).first
+    options_char, options_length = true, 0
+    # Copy while we have option characters. The maximum is 7, as in (?mix-mix;
+    # even though that doesn't make sense, it is possible.
+    while options_char and options_length < 7
+      if data[te + options_length]
+        c = data[te + options_length].chr
+        if c =~ /[-mix]/
+          text << c ; p += 1 ; options_length += 1
+        else
+          options_char = false
+        end
+      else
+        raise PrematureEndError.new("expression options `#{text}'")
+      end
+    end
+    if data[te + options_length]
+      c = data[te + options_length].chr
+      if c == ':'
+        # Include the ':' in the options text
+        text << c ; p += 1 ; options_length += 1
+        emit(:group, :options, text, ts, te + options_length)
+      elsif c == ')'
+        # Don't include the closing ')', let group_close handle it.
+        emit(:group, :options, text, ts, te + options_length)
+      else
+        # Plain Regexp reports this as 'undefined group option'
+        raise ScannerError.new(
+          "Unexpected `#{c}' in options sequence, ':' or ')' expected")
+      end
+    else
+      raise PrematureEndError.new("expression options `#{text}'")
+    end
+    p # return the new value of the data pointer
+  end
+  # Copy the given range of data as text (packed with the 'c*' directive)
+  def self.copy(data, range)
+    data[range].pack('c*')
+  end
+  # Copy from ts - soff to te from data as text, returning an array with the
+  #  text and the offsets (ts - soff and te) used to copy it.
+  def self.text(data, ts, te, soff = 0)
+    [copy(data, ts-soff..te-1), ts-soff, te]
+  end
+  # Appends one or more characters to the literal buffer, to be emitted later
+  # by a call to emit_literal. Contents can be a mix of ASCII and UTF-8.
   def self.append_literal(data, ts, te)
     @literal ||= []
-    @literal << [data[ts..te-1].pack('c*'), ts, te]
+    @literal << text(data, ts, te)
   end
-  # emits the collected literal run collected by one or more calls to the
-  # append_literal method
+  # Emits the literal run collected by calls to the append_literal method,
+  # using the total start (ts) and end (te) offsets of the run.
   def self.emit_literal
     ts, te = @literal.first[1], @literal.last[2]
     text = @literal.map {|t| t[0]}.join
     text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
-    self.emit(:literal, :literal, text, ts, te)
     @literal = nil
+    emit(:literal, :literal, text, ts, te)
   end
+  # Emits an array with the details of the scanned pattern
   def self.emit(type, token, text, ts, te)
-    #puts " > emit: #{type}:#{token} '#{text}' [#{ts}..#{te}]"
+    #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
-    if @literal and type != :literal
-      self.emit_literal
-    end
+    emit_literal if @literal
     if @block
       @block.call type, token, text, ts, te
@@ -709,4 +862,37 @@ module Regexp::Scanner
     @tokens << [type, token, text, ts, te]
   end
+  # Centralizes and unifies the handling of validation related
+  # errors.
+  def self.validation_error(type, what, reason)
+    case type
+    when :group
+      error = InvalidGroupError.new(what, reason)
+    when :backref
+      error = InvalidBackrefError.new(what, reason)
+    when :sequence
+      error = InvalidSequenceError.new(what, reason)
+    else
+      error = ValidationError.new('expression')
+    end
+    # TODO: configuration option to treat scanner level validation
+    # errors as warnings or ignore them
+    if false # @@config.validation_warn
+      $stderr.puts error.to_s # unless @@config.validation_ignore
+    else
+      raise error # unless @@config.validation_ignore
+    end
+  end
+  # Used for references with an empty name or number
+  def self.empty_backref_error(type, what)
+    validation_error(:backref, what, 'ref ID is empty')
+  end
+  # Used for named expressions with an empty name
+  def self.empty_name_error(type, what)
+    validation_error(type, what, 'name is empty')
+  end
 end # module Regexp::Scanner