RubyGems - regexp_parser - Versions diffs - 1.5.0 → 1.8.0 - Mend

regexp_parser 1.5.0 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87)

checksums.yaml +4 -4
data/CHANGELOG.md +59 -0
data/Gemfile +3 -3
data/README.md +14 -6
data/Rakefile +3 -4
data/lib/regexp_parser/expression.rb +6 -43
data/lib/regexp_parser/expression/classes/conditional.rb +3 -2
data/lib/regexp_parser/expression/classes/escape.rb +0 -4
data/lib/regexp_parser/expression/methods/match.rb +13 -0
data/lib/regexp_parser/expression/methods/match_length.rb +1 -1
data/lib/regexp_parser/expression/methods/options.rb +35 -0
data/lib/regexp_parser/expression/methods/strfregexp.rb +0 -1
data/lib/regexp_parser/expression/methods/tests.rb +6 -15
data/lib/regexp_parser/expression/methods/traverse.rb +3 -1
data/lib/regexp_parser/expression/sequence.rb +3 -2
data/lib/regexp_parser/expression/sequence_operation.rb +2 -6
data/lib/regexp_parser/lexer.rb +4 -25
data/lib/regexp_parser/parser.rb +40 -33
data/lib/regexp_parser/scanner.rb +1208 -1353
data/lib/regexp_parser/scanner/char_type.rl +0 -3
data/lib/regexp_parser/scanner/properties/long.yml +15 -1
data/lib/regexp_parser/scanner/properties/short.yml +5 -0
data/lib/regexp_parser/scanner/scanner.rl +116 -202
data/lib/regexp_parser/syntax/tokens/unicode_property.rb +30 -0
data/lib/regexp_parser/syntax/versions/2.6.2.rb +10 -0
data/lib/regexp_parser/syntax/versions/2.6.3.rb +10 -0
data/lib/regexp_parser/version.rb +1 -1
data/spec/expression/base_spec.rb +14 -0
data/spec/expression/methods/match_length_spec.rb +20 -0
data/spec/expression/methods/match_spec.rb +25 -0
data/spec/expression/methods/tests_spec.rb +2 -0
data/spec/expression/methods/traverse_spec.rb +21 -0
data/spec/expression/options_spec.rb +128 -0
data/spec/expression/root_spec.rb +9 -0
data/spec/expression/sequence_spec.rb +9 -0
data/spec/lexer/conditionals_spec.rb +49 -119
data/spec/lexer/delimiters_spec.rb +68 -0
data/spec/lexer/escapes_spec.rb +8 -32
data/spec/lexer/keep_spec.rb +5 -17
data/spec/lexer/literals_spec.rb +73 -110
data/spec/lexer/nesting_spec.rb +86 -117
data/spec/lexer/refcalls_spec.rb +51 -50
data/spec/parser/all_spec.rb +13 -1
data/spec/parser/anchors_spec.rb +9 -23
data/spec/parser/conditionals_spec.rb +9 -9
data/spec/parser/errors_spec.rb +22 -43
data/spec/parser/escapes_spec.rb +33 -44
data/spec/parser/free_space_spec.rb +25 -4
data/spec/parser/groups_spec.rb +98 -257
data/spec/parser/keep_spec.rb +2 -15
data/spec/parser/options_spec.rb +28 -0
data/spec/parser/posix_classes_spec.rb +5 -24
data/spec/parser/properties_spec.rb +42 -54
data/spec/parser/quantifiers_spec.rb +42 -283
data/spec/parser/refcalls_spec.rb +60 -185
data/spec/parser/set/intersections_spec.rb +17 -17
data/spec/parser/set/ranges_spec.rb +17 -17
data/spec/parser/sets_spec.rb +5 -5
data/spec/parser/types_spec.rb +11 -36
data/spec/scanner/anchors_spec.rb +13 -28
data/spec/scanner/conditionals_spec.rb +121 -173
data/spec/scanner/delimiters_spec.rb +52 -0
data/spec/scanner/errors_spec.rb +64 -87
data/spec/scanner/escapes_spec.rb +53 -50
data/spec/scanner/free_space_spec.rb +102 -165
data/spec/scanner/groups_spec.rb +45 -64
data/spec/scanner/keep_spec.rb +5 -28
data/spec/scanner/literals_spec.rb +45 -81
data/spec/scanner/meta_spec.rb +13 -33
data/spec/scanner/options_spec.rb +36 -0
data/spec/scanner/properties_spec.rb +43 -286
data/spec/scanner/quantifiers_spec.rb +13 -28
data/spec/scanner/refcalls_spec.rb +32 -48
data/spec/scanner/sets_spec.rb +88 -102
data/spec/scanner/types_spec.rb +10 -25
data/spec/spec_helper.rb +1 -0
data/spec/support/shared_examples.rb +77 -0
data/spec/syntax/syntax_spec.rb +4 -0
data/spec/syntax/versions/1.8.6_spec.rb +12 -33
data/spec/syntax/versions/1.9.1_spec.rb +5 -18
data/spec/syntax/versions/1.9.3_spec.rb +4 -17
data/spec/syntax/versions/2.0.0_spec.rb +8 -23
data/spec/syntax/versions/2.2.0_spec.rb +4 -17
data/spec/syntax/versions/aliases_spec.rb +27 -109
metadata +28 -10
data/spec/scanner/scripts_spec.rb +0 -49
data/spec/scanner/unicode_blocks_spec.rb +0 -28

data/lib/regexp_parser/scanner/char_type.rl CHANGED

@@ -21,9 +21,6 @@
       when '\W'; emit(:type, :nonword,    text, ts - 1, te)
       when '\R'; emit(:type, :linebreak,  text, ts - 1, te)
       when '\X'; emit(:type, :xgrapheme,  text, ts - 1, te)
-      else
-        raise ScannerError.new(
-          "Unexpected character in type at #{text} (char #{ts})")
       end
       fret;
     };

data/lib/regexp_parser/scanner/properties/long.yml CHANGED

@@ -6,6 +6,8 @@ adlam: adlam
 age=1.1: age=1.1
 age=10.0: age=10.0
 age=11.0: age=11.0
+age=12.0: age=12.0
+age=12.1: age=12.1
 age=2.0: age=2.0
 age=2.1: age=2.1
 age=3.0: age=3.0
@@ -64,7 +66,6 @@ changeswhenuppercased: changes_when_uppercased
 cherokee: cherokee
 closepunctuation: close_punctuation
 cntrl: cntrl
-combiningmark: combining_mark
 common: common
 connectorpunctuation: connector_punctuation
 control: control
@@ -86,6 +87,7 @@ dogra: dogra
 duployan: duployan
 egyptianhieroglyphs: egyptian_hieroglyphs
 elbasan: elbasan
+elymaic: elymaic
 emoji: emoji
 emojicomponent: emoji_component
 emojimodifier: emoji_modifier
@@ -206,8 +208,10 @@ indogra: in_dogra
 indominotiles: in_domino_tiles
 induployan: in_duployan
 inearlydynasticcuneiform: in_early_dynastic_cuneiform
+inegyptianhieroglyphformatcontrols: in_egyptian_hieroglyph_format_controls
 inegyptianhieroglyphs: in_egyptian_hieroglyphs
 inelbasan: in_elbasan
+inelymaic: in_elymaic
 inemoticons: in_emoticons
 inenclosedalphanumerics: in_enclosed_alphanumerics
 inenclosedalphanumericsupplement: in_enclosed_alphanumeric_supplement
@@ -322,12 +326,14 @@ inmyanmar: in_myanmar
 inmyanmarextendeda: in_myanmar_extended_a
 inmyanmarextendedb: in_myanmar_extended_b
 innabataean: in_nabataean
+innandinagari: in_nandinagari
 innewa: in_newa
 innewtailue: in_new_tai_lue
 innko: in_nko
 innoblock: in_no_block
 innumberforms: in_number_forms
 innushu: in_nushu
+innyiakengpuachuehmong: in_nyiakeng_puachue_hmong
 inogham: in_ogham
 inolchiki: in_ol_chiki
 inoldhungarian: in_old_hungarian
@@ -343,6 +349,7 @@ inoriya: in_oriya
 inornamentaldingbats: in_ornamental_dingbats
 inosage: in_osage
 inosmanya: in_osmanya
+inottomansiyaqnumbers: in_ottoman_siyaq_numbers
 inpahawhhmong: in_pahawh_hmong
 inpalmyrene: in_palmyrene
 inpaucinhau: in_pau_cin_hau
@@ -368,6 +375,7 @@ insiddham: in_siddham
 insinhala: in_sinhala
 insinhalaarchaicnumbers: in_sinhala_archaic_numbers
 insmallformvariants: in_small_form_variants
+insmallkanaextension: in_small_kana_extension
 insogdian: in_sogdian
 insorasompeng: in_sora_sompeng
 insoyombo: in_soyombo
@@ -386,6 +394,7 @@ insupplementaryprivateuseareaa: in_supplementary_private_use_area_a
 insupplementaryprivateuseareab: in_supplementary_private_use_area_b
 insuttonsignwriting: in_sutton_signwriting
 insylotinagri: in_syloti_nagri
+insymbolsandpictographsextendeda: in_symbols_and_pictographs_extended_a
 insyriac: in_syriac
 insyriacsupplement: in_syriac_supplement
 intagalog: in_tagalog
@@ -397,6 +406,7 @@ intaiviet: in_tai_viet
 intaixuanjingsymbols: in_tai_xuan_jing_symbols
 intakri: in_takri
 intamil: in_tamil
+intamilsupplement: in_tamil_supplement
 intangut: in_tangut
 intangutcomponents: in_tangut_components
 intelugu: in_telugu
@@ -414,6 +424,7 @@ invariationselectors: in_variation_selectors
 invariationselectorssupplement: in_variation_selectors_supplement
 invedicextensions: in_vedic_extensions
 inverticalforms: in_vertical_forms
+inwancho: in_wancho
 inwarangciti: in_warang_citi
 inyijinghexagramsymbols: in_yijing_hexagram_symbols
 inyiradicals: in_yi_radicals
@@ -469,6 +480,7 @@ mro: mro
 multani: multani
 myanmar: myanmar
 nabataean: nabataean
+nandinagari: nandinagari
 newa: newa
 newline: newline
 newtailue: new_tai_lue
@@ -477,6 +489,7 @@ noncharactercodepoint: noncharacter_code_point
 nonspacingmark: nonspacing_mark
 number: number
 nushu: nushu
+nyiakengpuachuehmong: nyiakeng_puachue_hmong
 ogham: ogham
 olchiki: ol_chiki
 oldhungarian: old_hungarian
@@ -569,6 +582,7 @@ uppercase: uppercase
 uppercaseletter: uppercase_letter
 vai: vai
 variationselector: variation_selector
+wancho: wancho
 warangciti: warang_citi
 whitespace: white_space
 word: word

data/lib/regexp_parser/scanner/properties/short.yml CHANGED

@@ -31,6 +31,7 @@ cher: cherokee
 ci: case_ignorable
 cn: unassigned
 co: private_use
+combiningmark: mark
 copt: coptic
 cprt: cypriot
 cs: surrogate
@@ -49,6 +50,7 @@ dsrt: deseret
 dupl: duployan
 egyp: egyptian_hieroglyphs
 elba: elbasan
+elym: elymaic
 ethi: ethiopic
 ext: extender
 geor: georgian
@@ -72,6 +74,7 @@ hex: hex_digit
 hira: hiragana
 hluw: anatolian_hieroglyphs
 hmng: pahawh_hmong
+hmnp: nyiakeng_puachue_hmong
 hung: old_hungarian
 idc: id_continue
 ideo: ideographic
@@ -125,6 +128,7 @@ mtei: meetei_mayek
 mult: multani
 mymr: myanmar
 n: number
+nand: nandinagari
 narb: old_north_arabian
 nbat: nabataean
 nchar: noncharacter_code_point
@@ -216,6 +220,7 @@ uideo: unified_ideograph
 vaii: vai
 vs: variation_selector
 wara: warang_citi
+wcho: wancho
 wspace: white_space
 xidc: xid_continue
 xids: xid_start

data/lib/regexp_parser/scanner/scanner.rl CHANGED

@@ -21,7 +21,7 @@
   set_close             = ']';
   brackets              = set_open | set_close;
-  comment               = ('#' . [^\n]* . '\n');
+  comment               = ('#' . [^\n]* . '\n'?);
   class_name_posix      = 'alnum' | 'alpha' | 'blank' |
                           'cntrl' | 'digit' | 'graph' |
@@ -49,9 +49,9 @@
   codepoint_list        = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
   codepoint_sequence    = codepoint_single | codepoint_list;
-  control_sequence      = ('c' | 'C-') . (backslash . 'M-')?;
+  control_sequence      = ('c' | 'C-') . (backslash . 'M-')? . backslash? . any;
-  meta_sequence         = 'M-' . (backslash . control_sequence)?;
+  meta_sequence         = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
   zero_or_one           = '?' | '??' | '?+';
   zero_or_more          = '*' | '*?' | '*+';
@@ -62,13 +62,17 @@
   quantifier_possessive = '?+' | '*+' | '++';
   quantifier_mode       = '?'  | '+';
-  quantifier_interval   = range_open . (digit+)? . ','? . (digit+)? .
-                          range_close . quantifier_mode?;
+  quantity_exact        = (digit+);
+  quantity_minimum      = (digit+) . ',';
+  quantity_maximum      = ',' . (digit+);
+  quantity_range        = (digit+) . ',' . (digit+);
+  quantifier_interval   = range_open . ( quantity_exact | quantity_minimum |
+                          quantity_maximum | quantity_range ) . range_close .
+                          quantifier_mode?;
   quantifiers           = quantifier_greedy | quantifier_reluctant |
                           quantifier_possessive | quantifier_interval;
   conditional           = '(?(';
   group_comment         = '?#' . [^)]* . group_close;
@@ -82,7 +86,8 @@
   assertion_lookbehind  = '?<=';
   assertion_nlookbehind = '?<!';
-  group_options         = '?' . [\-mixdau];
+  # try to treat every other group head as options group, like Ruby
+  group_options         = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
   group_ref             = [gk];
   group_name_char       = (alnum | '_');
@@ -113,7 +118,9 @@
                           curlies | parantheses | brackets |
                           line_anchor | quantifier_greedy;
-  ascii_print           = ((0x20..0x7e) - meta_char);
+  literal_delimiters    = ']' | '}';
+  ascii_print           = ((0x20..0x7e) - meta_char - '#');
   ascii_nonprint        = (0x01..0x1f | 0x7f);
   utf8_2_byte           = (0xc2..0xdf 0x80..0xbf);
@@ -121,7 +128,7 @@
   utf8_4_byte           = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
   non_literal_escape    = char_type_char | anchor_char | escaped_ascii |
-                          group_ref | keep_mark | [xucCM];
+                          keep_mark | [xucCM];
   non_set_escape        = (anchor_char - 'b') | group_ref | keep_mark |
                           multi_codepoint_char_type | [0-9cCM];
@@ -135,41 +142,35 @@
   # Invalid sequence error, used from sequences, like escapes and sets
   action invalid_sequence_error {
     text = ts ? copy(data, ts-1..-1) : data.pack('c*')
-    raise InvalidSequenceError.new('sequence', text)
+    validation_error(:sequence, 'sequence', text)
   }
   # group (nesting) and set open/close actions
-  action group_opened { self.group_depth = group_depth + 1; in_group = true }
-  action group_closed { self.group_depth = group_depth - 1; in_group = group_depth > 0 ? true : false }
+  action group_opened { self.group_depth = group_depth + 1 }
+  action group_closed { self.group_depth = group_depth - 1 }
+  action set_opened   { self.set_depth   = set_depth   + 1 }
+  action set_closed   { self.set_depth   = set_depth   - 1 }
   # Character set scanner, continues consuming characters until it meets the
   # closing bracket of the set.
   # --------------------------------------------------------------------------
   character_set := |*
-    set_close > (set_meta, 2) {
-      set_depth -= 1
-      in_set = set_depth > 0 ? true : false
+    set_close > (set_meta, 2) @set_closed {
       emit(:set, :close, *text(data, ts, te))
-      if set_depth == 0
-        fgoto main;
-      else
+      if in_set?
         fret;
+      else
+        fgoto main;
       end
     };
-    '-]' { # special case, emits two tokens
-      set_depth -= 1
-      in_set = set_depth > 0 ? true : false
-      emit(:literal, :literal, copy(data, ts..te-2), ts, te)
-      emit(:set, :close, copy(data, ts+1..te-1), ts, te)
-      if set_depth == 0
-        fgoto main;
-      else
+    '-]' @set_closed { # special case, emits two tokens
+      emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
+      emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
+      if in_set?
         fret;
+      else
+        fgoto main;
       end
     };
@@ -207,14 +208,12 @@
       fcall set_escape_sequence;
     };
-    set_open >(open_bracket, 1) {
-      set_depth += 1
+    set_open >(open_bracket, 1) >set_opened {
       emit(:set, :open, *text(data, ts, te))
       fcall character_set;
     };
-    class_posix >(open_bracket, 1) @eof(premature_end_error) {
+    class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error)  {
       text = text(data, ts, te).first
       type = :posixclass
@@ -227,11 +226,11 @@
       emit(type, class_name.to_sym, text, ts, te)
     };
-    collating_sequence >(open_bracket, 1) @eof(premature_end_error) {
+    collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error)  {
       emit(:set, :collation, *text(data, ts, te))
     };
-    character_equivalent >(open_bracket, 1) @eof(premature_end_error) {
+    character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error)  {
       emit(:set, :equivalent, *text(data, ts, te))
     };
@@ -337,44 +336,24 @@
     };
     control_sequence >(escaped_alpha, 4) $eof(premature_end_error) {
-      if data[te]
-        c = data[te].chr
-        if c =~ /[\x00-\x7F]/
-          emit(:escape, :control, copy(data, ts-1..te), ts-1, te+1)
-          p += 1
-        else
-          raise InvalidSequenceError.new("control sequence")
-        end
-      else
-        raise PrematureEndError.new("control sequence")
-      end
+      emit_meta_control_sequence(data, ts, te, :control)
       fret;
     };
     meta_sequence >(backslashed, 3) $eof(premature_end_error) {
-      if data[te]
-        c = data[te].chr
-        if c =~ /[\x00-\x7F]/
-          emit(:escape, :meta_sequence, copy(data, ts-1..te), ts-1, te+1)
-          p += 1
-        else
-          raise InvalidSequenceError.new("meta sequence")
-        end
-      else
-        raise PrematureEndError.new("meta sequence")
-      end
+      emit_meta_control_sequence(data, ts, te, :meta_sequence)
       fret;
     };
     char_type_char > (escaped_alpha, 2) {
       fhold;
-      fnext *(in_set ? fentry(character_set) : fentry(main));
+      fnext *(in_set? ? fentry(character_set) : fentry(main));
       fcall char_type;
     };
     property_char > (escaped_alpha, 2) {
       fhold;
-      fnext *(in_set ? fentry(character_set) : fentry(main));
+      fnext *(in_set? ? fentry(character_set) : fentry(main));
       fcall unicode_property;
     };
@@ -412,8 +391,7 @@
     };
     alternation {
-      if in_conditional and conditional_stack.length > 0 and
-         conditional_stack.last[1] == group_depth
+      if conditional_stack.last == group_depth
         emit(:conditional, :separator, *text(data, ts, te))
       else
         emit(:meta, :alternation, *text(data, ts, te))
@@ -442,18 +420,16 @@
       when '\\b'; emit(:anchor, :word_boundary,      text, ts, te)
       when '\\B'; emit(:anchor, :nonword_boundary,   text, ts, te)
       when '\\G'; emit(:anchor, :match_start,        text, ts, te)
-      else
-        raise ScannerError.new(
-          "Unexpected character in anchor at #{text} (char #{ts})")
       end
     };
+    literal_delimiters {
+      append_literal(data, ts, te)
+    };
     # Character sets
     # ------------------------------------------------------------------------
-    set_open {
-      set_depth += 1
-      in_set = true
+    set_open >set_opened {
       emit(:set, :open, *text(data, ts, te))
       fcall character_set;
     };
@@ -465,9 +441,7 @@
     conditional {
       text = text(data, ts, te).first
-      in_conditional = true unless in_conditional
-      conditional_depth += 1
-      conditional_stack << [conditional_depth, group_depth]
+      conditional_stack << group_depth
       emit(:conditional, :open, text[0..-2], ts, te-1)
       emit(:conditional, :condition_open, '(', te-1, te)
@@ -496,7 +470,11 @@
     #   (?imxdau-imx:subexp)  option on/off for subexp
     # ------------------------------------------------------------------------
     group_open . group_options >group_opened {
-      p = scan_options(p, data, ts, te)
+      text = text(data, ts, te).first
+      if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
+        raise InvalidGroupOption.new($1 || "-#{$2}", text)
+      end
+      emit_options(text, ts, te)
     };
     # Assertions
@@ -528,19 +506,15 @@
       when '(?>';  emit(:group, :atomic,       text, ts, te)
       when '(?~';  emit(:group, :absence,      text, ts, te)
-      when /^\(\?<(\w*)>/
-        empty_name_error(:group, 'named group (ab)') if $1.empty?
+      when /^\(\?(?:<>|'')/
+        validation_error(:group, 'named group', 'name is empty')
+      when /^\(\?<\w*>/
         emit(:group, :named_ab,  text, ts, te)
-      when /^\(\?'(\w*)'/
-        empty_name_error(:group, 'named group (sq)') if $1.empty?
+      when /^\(\?'\w*'/
         emit(:group, :named_sq,  text, ts, te)
-      else
-        raise ScannerError.new(
-          "Unknown subexpression group format '#{text}'")
       end
     };
@@ -550,20 +524,13 @@
     };
     group_close @group_closed {
-      if in_conditional and conditional_stack.last and
-         conditional_stack.last[1] == (group_depth + 1)
-        emit(:conditional, :close, *text(data, ts, te))
+      if conditional_stack.last == group_depth + 1
         conditional_stack.pop
-        if conditional_stack.length == 0
-          in_conditional = false
-        end
+        emit(:conditional, :close, *text(data, ts, te))
       else
-        if spacing_stack.length > 1 and
-          spacing_stack.last[:depth] == (group_depth + 1)
+        if spacing_stack.length > 1 &&
+           spacing_stack.last[:depth] == group_depth + 1
           spacing_stack.pop
           self.free_spacing = spacing_stack.last[:free_spacing]
         end
@@ -576,11 +543,8 @@
     # ------------------------------------------------------------------------
     backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
       case text = text(data, ts, te).first
-      when /^\\([gk])<>/ # angle brackets
-        empty_backref_error("ref/call (ab)")
-      when /^\\([gk])''/ # single quotes
-        empty_backref_error("ref/call (sq)")
+      when /^\\([gk])(<>|'')/ # angle brackets
+        validation_error(:backref, 'ref/call', 'ref ID is empty')
       when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
         if $1 == 'k'
@@ -636,9 +600,6 @@
       when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
         emit(:backref, :number_recursion_ref_sq, text, ts, te)
-      else
-        raise ScannerError.new(
-          "Unknown backreference format '#{text}'")
       end
     };
@@ -669,10 +630,15 @@
       end
     };
-    quantifier_interval  @err(premature_end_error) {
+    quantifier_interval  {
       emit(:quantifier, :interval, *text(data, ts, te))
     };
+    # Catch unmatched curly braces as literals
+    range_open {
+      append_literal(data, ts, te)
+    };
     # Escaped sequences
     # ------------------------------------------------------------------------
     backslash > (backslashed, 1) {
@@ -771,22 +737,17 @@ class Regexp::Scanner
   #
   # This method may raise errors if a syntax error is encountered.
   # --------------------------------------------------------------------------
-  def self.scan(input_object, &block)
-    new.scan(input_object, &block)
+  def self.scan(input_object, options: nil, &block)
+    new.scan(input_object, options: options, &block)
   end
-  def scan(input_object, &block)
+  def scan(input_object, options: nil, &block)
     self.literal = nil
     stack = []
-    if input_object.is_a?(Regexp)
-      input = input_object.source
-      self.free_spacing = (input_object.options & Regexp::EXTENDED != 0)
-    else
-      input = input_object
-      self.free_spacing = false
-    end
+    input = input_object.is_a?(Regexp) ? input_object.source : input_object
+    self.free_spacing = free_spacing?(input_object, options)
+    self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
     data  = input.unpack("c*") if input.is_a?(String)
     eof   = data.length
@@ -794,15 +755,9 @@ class Regexp::Scanner
     self.tokens = []
     self.block  = block_given? ? block : nil
-    self.in_group = false
+    self.set_depth = 0
     self.group_depth = 0
-    self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
-    in_set = false
-    set_depth = 0
-    in_conditional = false
-    conditional_depth = 0
-    conditional_stack = []
+    self.conditional_stack = []
     %% write data;
     %% write init;
@@ -817,9 +772,9 @@ class Regexp::Scanner
     end
     raise PrematureEndError.new("(missing group closing paranthesis) "+
-          "[#{in_group}:#{group_depth}]") if in_group
+          "[#{group_depth}]") if in_group?
     raise PrematureEndError.new("(missing set closing bracket) "+
-          "[#{in_set}:#{set_depth}]") if in_set
+          "[#{set_depth}]") if in_set?
     # when the entire expression is a literal run
     emit_literal if literal
@@ -854,62 +809,27 @@ class Regexp::Scanner
   private
-  attr_accessor :tokens, :literal, :block,
-                :in_group, :group_depth,
-                :free_spacing, :spacing_stack
-  # Ragel's regex-based scan of the group options introduced a lot of
-  # ambiguity, so we just ask it to find the beginning of what looks
-  # like an options run and handle the rest in here.
-  def scan_options(p, data, ts, te)
-    text = text(data, ts, te).first
-    options_char, options_length = true, 0
+  attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
+                :group_depth, :set_depth, :conditional_stack
-    # Copy while we have option characters. There is no maximum length,
-    # as ruby allows things like '(?xxxxxxxxx-xxxxxxxxxxxxx:abc)'.
-    negative_options = false
-    while options_char
-      if data[te + options_length]
-        c = data[te + options_length].chr
-        if c =~ /[-mixdau]/
-          negative_options = true if c == '-'
-          raise InvalidGroupOption.new(c, text) if negative_options and
-            c =~ /[dau]/
-          text << c ; p += 1 ; options_length += 1
-        else
-          options_char = false
-        end
-      else
-        raise PrematureEndError.new("expression options `#{text}'")
-      end
+  def free_spacing?(input_object, options)
+    if options && !input_object.is_a?(String)
+      raise ArgumentError, 'options cannot be supplied unless scanning a String'
     end
-    if data[te + options_length]
-      c = data[te + options_length].chr
+    options = input_object.options if input_object.is_a?(::Regexp)
-      if c == ':'
-        # Include the ':' in the options text
-        text << c ; p += 1 ; options_length += 1
-        emit_options(text, ts, te + options_length)
+    return false unless options
-      elsif c == ')'
-        # Don't include the closing ')', let group_close handle it.
-        emit_options(text, ts, te + options_length)
+    options & Regexp::EXTENDED != 0
+  end
-      else
-        # Plain Regexp reports this as 'undefined group option'
-        raise ScannerError.new(
-          "Unexpected `#{c}' in options sequence, ':' or ')' expected")
-      end
-    else
-      raise PrematureEndError.new("expression options `#{text}'")
-    end
+  def in_group?
+    group_depth > 0
+  end
-    p # return the new value of the data pointer
+  def in_set?
+    set_depth > 0
   end
   # Copy from ts to te from data as text
@@ -945,32 +865,39 @@ class Regexp::Scanner
   def emit_options(text, ts, te)
     token = nil
-    if text =~ /\(\?([mixdau]*)-?([mix]*)(:)?/
-      positive, negative, group_local = $1, $2, $3
+    # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
+    text =~ /\(\?([mixdau]*)(-(?:[mix]*))*(:)?/
+    positive, negative, group_local = $1, $2, $3
-      if positive.include?('x')
-        self.free_spacing = true
-      end
+    if positive.include?('x')
+      self.free_spacing = true
+    end
-      # If the x appears in both, treat it like ruby does, the second cancels
-      # the first.
-      if negative.include?('x')
-        self.free_spacing = false
-      end
+    # If the x appears in both, treat it like ruby does, the second cancels
+    # the first.
+    if negative && negative.include?('x')
+      self.free_spacing = false
+    end
-      if group_local
-        spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
-        token = :options
-      else
-        # switch for parent group level
-        spacing_stack.last[:free_spacing] = free_spacing
-        token = :options_switch
-      end
+    if group_local
+      spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
+      token = :options
+    else
+      # switch for parent group level
+      spacing_stack.last[:free_spacing] = free_spacing
+      token = :options_switch
     end
     emit(:group, token, text, ts, te)
   end
+  def emit_meta_control_sequence(data, ts, te, token)
+    if data.last < 0x00 || data.last > 0x7F
+      validation_error(:sequence, 'escape', token.to_s)
+    end
+    emit(:escape, token, *text(data, ts, te, 1))
+  end
   # Centralizes and unifies the handling of validation related
   # errors.
   def validation_error(type, what, reason)
@@ -981,21 +908,8 @@ class Regexp::Scanner
       error = InvalidBackrefError.new(what, reason)
     when :sequence
       error = InvalidSequenceError.new(what, reason)
-    else
-      error = ValidationError.new('expression')
     end
     raise error # unless @@config.validation_ignore
   end
-  # Used for references with an empty name or number
-  def empty_backref_error(type, what)
-    validation_error(:backref, what, 'ref ID is empty')
-  end
-  # Used for named expressions with an empty name
-  def empty_name_error(type, what)
-    validation_error(type, what, 'name is empty')
-  end
 end # module Regexp::Scanner