RubyGems - regexp_parser - Versions diffs - 0.5.0 → 1.0.0 - Mend

regexp_parser 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (81)

checksums.yaml +4 -4
data/CHANGELOG.md +242 -0
data/Gemfile +1 -0
data/README.md +21 -17
data/Rakefile +31 -0
data/lib/regexp_parser/expression.rb +11 -9
data/lib/regexp_parser/expression/classes/alternation.rb +5 -28
data/lib/regexp_parser/expression/classes/backref.rb +21 -16
data/lib/regexp_parser/expression/classes/escape.rb +81 -10
data/lib/regexp_parser/expression/classes/group.rb +20 -20
data/lib/regexp_parser/expression/classes/{character_class.rb → posix_class.rb} +2 -2
data/lib/regexp_parser/expression/classes/property.rb +6 -0
data/lib/regexp_parser/expression/classes/set.rb +10 -93
data/lib/regexp_parser/expression/classes/set/intersection.rb +9 -0
data/lib/regexp_parser/expression/classes/set/range.rb +23 -0
data/lib/regexp_parser/expression/methods/strfregexp.rb +6 -4
data/lib/regexp_parser/expression/methods/tests.rb +4 -14
data/lib/regexp_parser/expression/methods/traverse.rb +1 -1
data/lib/regexp_parser/expression/quantifier.rb +3 -4
data/lib/regexp_parser/expression/sequence_operation.rb +34 -0
data/lib/regexp_parser/expression/subexpression.rb +6 -10
data/lib/regexp_parser/lexer.rb +13 -17
data/lib/regexp_parser/parser.rb +170 -116
data/lib/regexp_parser/scanner.rb +952 -2431
data/lib/regexp_parser/scanner/char_type.rl +31 -0
data/lib/regexp_parser/scanner/properties/long.yml +561 -0
data/lib/regexp_parser/scanner/properties/short.yml +225 -0
data/lib/regexp_parser/scanner/property.rl +7 -806
data/lib/regexp_parser/scanner/scanner.rl +112 -154
data/lib/regexp_parser/syntax/base.rb +4 -4
data/lib/regexp_parser/syntax/tokens.rb +1 -0
data/lib/regexp_parser/syntax/tokens/backref.rb +2 -2
data/lib/regexp_parser/syntax/tokens/character_set.rb +3 -38
data/lib/regexp_parser/syntax/tokens/escape.rb +2 -3
data/lib/regexp_parser/syntax/tokens/group.rb +5 -4
data/lib/regexp_parser/syntax/tokens/{character_class.rb → posix_class.rb} +5 -5
data/lib/regexp_parser/syntax/tokens/unicode_property.rb +519 -266
data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -4
data/lib/regexp_parser/syntax/versions/1.9.1.rb +4 -10
data/lib/regexp_parser/syntax/versions/2.0.0.rb +0 -2
data/lib/regexp_parser/syntax/versions/2.4.1.rb +1 -1
data/lib/regexp_parser/version.rb +1 -1
data/regexp_parser.gemspec +2 -1
data/test/expression/test_base.rb +2 -1
data/test/expression/test_clone.rb +0 -57
data/test/expression/test_set.rb +31 -8
data/test/expression/test_strfregexp.rb +13 -4
data/test/expression/test_subexpression.rb +25 -0
data/test/expression/test_traverse.rb +25 -25
data/test/helpers.rb +1 -0
data/test/lexer/test_all.rb +1 -1
data/test/lexer/test_conditionals.rb +9 -7
data/test/lexer/test_nesting.rb +39 -21
data/test/lexer/test_refcalls.rb +4 -4
data/test/parser/set/test_intersections.rb +127 -0
data/test/parser/set/test_ranges.rb +111 -0
data/test/parser/test_all.rb +4 -1
data/test/parser/test_escapes.rb +41 -9
data/test/parser/test_groups.rb +22 -3
data/test/parser/test_posix_classes.rb +27 -0
data/test/parser/test_properties.rb +17 -290
data/test/parser/test_refcalls.rb +66 -26
data/test/parser/test_sets.rb +132 -129
data/test/scanner/test_all.rb +1 -7
data/test/scanner/test_conditionals.rb +16 -16
data/test/scanner/test_errors.rb +0 -30
data/test/scanner/test_escapes.rb +1 -2
data/test/scanner/test_free_space.rb +28 -28
data/test/scanner/test_groups.rb +35 -35
data/test/scanner/test_meta.rb +1 -1
data/test/scanner/test_properties.rb +87 -114
data/test/scanner/test_refcalls.rb +18 -18
data/test/scanner/test_scripts.rb +19 -351
data/test/scanner/test_sets.rb +87 -60
data/test/scanner/test_unicode_blocks.rb +4 -105
data/test/support/warning_extractor.rb +1 -1
data/test/syntax/test_syntax.rb +7 -0
data/test/syntax/versions/test_1.8.rb +2 -4
metadata +17 -7
data/ChangeLog +0 -325
data/test/scanner/test_emojis.rb +0 -31

data/lib/regexp_parser/scanner/scanner.rl CHANGED

@@ -1,6 +1,7 @@
 %%{
   machine re_scanner;
-  include re_property "property.rl";
+  include re_char_type "char_type.rl";
+  include re_property  "property.rl";
   dot                   = '.';
   backslash             = '\\';
@@ -35,25 +36,17 @@
   collating_sequence    = '[.' . (alpha | [\-])+ . '.]';
   character_equivalent  = '[=' . alpha . '=]';
-  char_type             = [dDhHsSwWRX];
   line_anchor           = beginning_of_line | end_of_line;
   anchor_char           = [AbBzZG];
-  escaped_ascii         = [abefnrstv];
+  escaped_ascii         = [abefnrtv];
   octal_sequence        = [0-7]{1,3};
   hex_sequence          = 'x' . xdigit{1,2};
   hex_sequence_err      = 'x' . [^0-9a-fA-F{];
-  wide_hex_sequence     = 'x' . '{' . xdigit{1,8} . '}';
-  hex_or_not            = (xdigit|[^0-9a-fA-F}]); # note closing curly at end
-  wide_hex_seq_invalid  = 'x' . '{' . hex_or_not{1,9};
-  wide_hex_seq_empty    = 'x' . '{' . (space+)? . '}';
   codepoint_single      = 'u' . xdigit{4};
-  codepoint_list        = 'u{' . xdigit{1,5} . (space . xdigit{1,5})* . '}';
+  codepoint_list        = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
   codepoint_sequence    = codepoint_single | codepoint_list;
   control_sequence      = ('c' | 'C-') . (backslash . 'M-')?;
@@ -110,6 +103,7 @@
   group_type            = group_atomic | group_passive | group_absence | group_named;
+  keep_mark             = 'K';
   assertion_type        = assertion_lookahead  | assertion_nlookahead |
                           assertion_lookbehind | assertion_nlookbehind;
@@ -119,16 +113,18 @@
                           curlies | parantheses | brackets |
                           line_anchor | quantifier_greedy;
-  ascii_print           = ((0x20..0x7e) - meta_char)+;
-  ascii_nonprint        = (0x01..0x1f | 0x7f)+;
+  ascii_print           = ((0x20..0x7e) - meta_char);
+  ascii_nonprint        = (0x01..0x1f | 0x7f);
+  utf8_2_byte           = (0xc2..0xdf 0x80..0xbf);
+  utf8_3_byte           = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
+  utf8_4_byte           = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
-  utf8_2_byte           = (0xc2..0xdf 0x80..0xbf)+;
-  utf8_3_byte           = (0xe0..0xef 0x80..0xbf 0x80..0xbf)+;
-  utf8_4_byte           = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf)+;
-  utf8_byte_sequence    = utf8_2_byte | utf8_3_byte | utf8_4_byte;
+  non_literal_escape    = char_type_char | anchor_char | escaped_ascii |
+                          group_ref | keep_mark | [xucCM];
-  non_literal_escape    = char_type | anchor_char | escaped_ascii |
-                          group_ref | [xucCM];
+  non_set_escape        = (anchor_char - 'b') | group_ref | keep_mark |
+                          multi_codepoint_char_type | [0-9cCM];
   # EOF error, used where it can be detected
   action premature_end_error {
@@ -150,11 +146,11 @@
   # closing bracket of the set.
   # --------------------------------------------------------------------------
   character_set := |*
-    ']' {
-      set_type  = set_depth > 1 ? :subset : :set
-      set_depth -= 1; in_set = set_depth > 0 ? true : false
+    set_close > (set_meta, 2) {
+      set_depth -= 1
+      in_set = set_depth > 0 ? true : false
-      emit(set_type, :close, *text(data, ts, te))
+      emit(:set, :close, *text(data, ts, te))
       if set_depth == 0
         fgoto main;
@@ -164,11 +160,11 @@
     };
     '-]' { # special case, emits two tokens
-      set_type  = set_depth > 1 ? :subset : :set
-      set_depth -= 1; in_set = set_depth > 0 ? true : false
+      set_depth -= 1
+      in_set = set_depth > 0 ? true : false
-      emit(set_type, :member, copy(data, ts..te-2), ts, te)
-      emit(set_type, :close,  copy(data, ts+1..te-1), ts, te)
+      emit(:literal, :literal, copy(data, ts..te-2), ts, te)
+      emit(:set, :close, copy(data, ts+1..te-1), ts, te)
       if set_depth == 0
         fgoto main;
@@ -177,59 +173,70 @@
       end
     };
+    '-&&' { # special case, emits two tokens
+      emit(:literal, :literal, '-', ts, te)
+      emit(:set, :intersection, '&&', ts, te)
+    };
     '^' {
       text = text(data, ts, te).first
       if tokens.last[1] == :open
-        emit(set_type, :negate, text, ts, te)
+        emit(:set, :negate, text, ts, te)
       else
-        emit(set_type, :member, text, ts, te)
+        emit(:literal, :literal, text, ts, te)
       end
     };
-    alnum . '-' . alnum {
-      emit(set_type, :range, *text(data, ts, te))
+    '-' {
+      text = text(data, ts, te).first
+      # ranges cant start with a subset or intersection/negation/range operator
+      if tokens.last[0] == :set
+        emit(:literal, :literal, text, ts, te)
+      else
+        emit(:set, :range, text, ts, te)
+      end
     };
+    # Unlike ranges, intersections can start or end at set boundaries, whereupon
+    # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
     '&&' {
-      emit(set_type, :intersection, *text(data, ts, te))
+      emit(:set, :intersection, *text(data, ts, te))
     };
-    '\\' {
+    backslash {
       fcall set_escape_sequence;
     };
-    '[' >(open_bracket, 1) {
-      set_depth += 1; in_set = true
-      set_type  = set_depth > 1 ? :subset : :set
+    set_open >(open_bracket, 1) {
+      set_depth += 1
-      emit(set_type, :open, *text(data, ts, te))
+      emit(:set, :open, *text(data, ts, te))
       fcall character_set;
     };
     class_posix >(open_bracket, 1) @eof(premature_end_error) {
       text = text(data, ts, te).first
+      type = :posixclass
       class_name = text[2..-3]
       if class_name[0].chr == '^'
-        class_name = "non#{class_name[1..-1]}"
+        class_name = class_name[1..-1]
+        type = :nonposixclass
       end
-      token_sym = "class_#{class_name}".to_sym
-      emit(set_type, token_sym, text, ts, te)
+      emit(type, class_name.to_sym, text, ts, te)
     };
     collating_sequence >(open_bracket, 1) @eof(premature_end_error) {
-      emit(set_type, :collation, *text(data, ts, te))
+      emit(:set, :collation, *text(data, ts, te))
     };
     character_equivalent >(open_bracket, 1) @eof(premature_end_error) {
-      emit(set_type, :equivalent, *text(data, ts, te))
+      emit(:set, :equivalent, *text(data, ts, te))
     };
-    # exclude the closing bracket as a cleaner workaround for dealing with the
-    # ambiguity caused upon exit from the unicode properties machine
-    meta_char -- ']' {
-      emit(set_type, :member, *text(data, ts, te))
+    meta_char > (set_meta, 1) {
+      emit(:literal, :literal, *text(data, ts, te))
     };
     any            |
@@ -237,63 +244,24 @@
     utf8_2_byte    |
     utf8_3_byte    |
     utf8_4_byte    {
-      emit(set_type, :member, *text(data, ts, te))
+      char, *rest = *text(data, ts, te)
+      char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
+      emit(:literal, :literal, char, *rest)
     };
   *|;
   # set escapes scanner
   # --------------------------------------------------------------------------
   set_escape_sequence := |*
-    'b' > (escaped_set_alpha, 2) {
-      emit(set_type, :backspace, *text(data, ts, te, 1))
-      fret;
-    };
-    char_type > (escaped_set_alpha, 4) {
-      case text = text(data, ts, te, 1).first
-      when '\d'; emit(set_type, :type_digit,     text, ts-1, te)
-      when '\D'; emit(set_type, :type_nondigit,  text, ts-1, te)
-      when '\h'; emit(set_type, :type_hex,       text, ts-1, te)
-      when '\H'; emit(set_type, :type_nonhex,    text, ts-1, te)
-      when '\s'; emit(set_type, :type_space,     text, ts-1, te)
-      when '\S'; emit(set_type, :type_nonspace,  text, ts-1, te)
-      when '\w'; emit(set_type, :type_word,      text, ts-1, te)
-      when '\W'; emit(set_type, :type_nonword,   text, ts-1, te)
-      when '\R'; emit(set_type, :type_linebreak, text, ts-1, te)
-      when '\X'; emit(set_type, :type_xgrapheme, text, ts-1, te)
-      end
-      fret;
-    };
-    hex_sequence . '-\\' . hex_sequence {
-      emit(set_type, :range_hex, *text(data, ts, te, 1))
-      fret;
-    };
-    hex_sequence {
-      emit(set_type, :member_hex, *text(data, ts, te, 1))
-      fret;
-    };
-    meta_char | [\\\]\-\,] {
-      emit(set_type, :escape, *text(data, ts, te, 1))
+    non_set_escape > (escaped_set_alpha, 2) {
+      emit(:escape, :literal, *text(data, ts, te, 1))
       fret;
     };
-    property_char > (escaped_set_alpha, 3) {
+    any > (escaped_set_alpha, 1) {
       fhold;
       fnext character_set;
-      fcall unicode_property;
-    };
-    # special case exclusion of escaped dash, could be cleaner.
-    (ascii_print - char_type -- [\-}]) > (escaped_set_alpha, 1) |
-    ascii_nonprint            |
-    utf8_2_byte               |
-    utf8_3_byte               |
-    utf8_4_byte               {
-      emit(set_type, :escape, *text(data, ts, te, 1))
-      fret;
+      fcall escape_sequence;
     };
   *|;
@@ -338,11 +306,11 @@
       # it is a word boundary anchor. A syntax might "normalize" it if needed.
       case text = text(data, ts, te, 1).first
       when '\a'; emit(:escape, :bell,           text, ts-1, te)
+      when '\b'; emit(:escape, :backspace,      text, ts-1, te)
       when '\e'; emit(:escape, :escape,         text, ts-1, te)
       when '\f'; emit(:escape, :form_feed,      text, ts-1, te)
       when '\n'; emit(:escape, :newline,        text, ts-1, te)
       when '\r'; emit(:escape, :carriage,       text, ts-1, te)
-      when '\s'; emit(:escape, :space,          text, ts-1, te)
       when '\t'; emit(:escape, :tab,            text, ts-1, te)
       when '\v'; emit(:escape, :vertical_tab,   text, ts-1, te)
       end
@@ -364,20 +332,10 @@
       fret;
     };
-    wide_hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
-      emit(:escape, :hex_wide, *text(data, ts, te, 1))
-      fret;
-    };
     hex_sequence_err @invalid_sequence_error {
       fret;
     };
-    (wide_hex_seq_invalid | wide_hex_seq_empty) {
-      raise InvalidSequenceError.new("wide hex sequence")
-      fret;
-    };
     control_sequence >(escaped_alpha, 4) $eof(premature_end_error) {
       if data[te]
         c = data[te].chr
@@ -408,9 +366,15 @@
       fret;
     };
+    char_type_char > (escaped_alpha, 2) {
+      fhold;
+      fnext *(in_set ? fentry(character_set) : fentry(main));
+      fcall char_type;
+    };
     property_char > (escaped_alpha, 2) {
       fhold;
-      fnext main;
+      fnext *(in_set ? fentry(character_set) : fentry(main));
       fcall unicode_property;
     };
@@ -466,7 +430,7 @@
       emit(:anchor, :eol, *text(data, ts, te))
     };
-    backslash . 'K' > (backslashed, 4) {
+    backslash . keep_mark > (backslashed, 4) {
       emit(:keep, :mark, *text(data, ts, te))
     };
@@ -484,38 +448,13 @@
       end
     };
-    # Character types
-    #   \d, \D    digit, non-digit
-    #   \h, \H    hex, non-hex
-    #   \s, \S    space, non-space
-    #   \w, \W    word, non-word
-    # ------------------------------------------------------------------------
-    backslash . char_type > (backslashed, 2) {
-      case text = text(data, ts, te).first
-      when '\\d'; emit(:type, :digit,      text, ts, te)
-      when '\\D'; emit(:type, :nondigit,   text, ts, te)
-      when '\\h'; emit(:type, :hex,        text, ts, te)
-      when '\\H'; emit(:type, :nonhex,     text, ts, te)
-      when '\\s'; emit(:type, :space,      text, ts, te)
-      when '\\S'; emit(:type, :nonspace,   text, ts, te)
-      when '\\w'; emit(:type, :word,       text, ts, te)
-      when '\\W'; emit(:type, :nonword,    text, ts, te)
-      when '\\R'; emit(:type, :linebreak,  text, ts, te)
-      when '\\X'; emit(:type, :xgrapheme,  text, ts, te)
-      else
-        raise ScannerError.new(
-          "Unexpected character in type at #{text} (char #{ts})")
-      end
-    };
     # Character sets
     # ------------------------------------------------------------------------
     set_open {
-      set_depth += 1; in_set = true
-      set_type  = set_depth > 1 ? :subset : :set
+      set_depth += 1
+      in_set = true
-      emit(set_type, :open, *text(data, ts, te))
+      emit(:set, :open, *text(data, ts, te))
       fcall character_set;
     };
@@ -645,57 +584,57 @@
       when /^\\([gk])<[^\d-](\w+)?>/ # angle-brackets
         if $1 == 'k'
-          emit(:backref, :name_ref_ab,  text, ts, te)
+          emit(:backref, :name_ref_ab, text, ts, te)
         else
-          emit(:backref, :name_call_ab,  text, ts, te)
+          emit(:backref, :name_call_ab, text, ts, te)
         end
       when /^\\([gk])'[^\d-](\w+)?'/ #single quotes
         if $1 == 'k'
-          emit(:backref, :name_ref_sq,  text, ts, te)
+          emit(:backref, :name_ref_sq, text, ts, te)
         else
-          emit(:backref, :name_call_sq,  text, ts, te)
+          emit(:backref, :name_call_sq, text, ts, te)
         end
       when /^\\([gk])<\d+>/ # angle-brackets
         if $1 == 'k'
-          emit(:backref, :number_ref_ab,  text, ts, te)
+          emit(:backref, :number_ref_ab, text, ts, te)
         else
-          emit(:backref, :number_call_ab,  text, ts, te)
+          emit(:backref, :number_call_ab, text, ts, te)
         end
       when /^\\([gk])'\d+'/ # single quotes
         if $1 == 'k'
-          emit(:backref, :number_ref_sq,  text, ts, te)
+          emit(:backref, :number_ref_sq, text, ts, te)
         else
-          emit(:backref, :number_call_sq,  text, ts, te)
+          emit(:backref, :number_call_sq, text, ts, te)
         end
       when /^\\([gk])<-\d+>/ # angle-brackets
         if $1 == 'k'
-          emit(:backref, :number_rel_ref_ab,  text, ts, te)
+          emit(:backref, :number_rel_ref_ab, text, ts, te)
         else
-          emit(:backref, :number_rel_call_ab,  text, ts, te)
+          emit(:backref, :number_rel_call_ab, text, ts, te)
         end
       when /^\\([gk])'-\d+'/ # single quotes
         if $1 == 'k'
-          emit(:backref, :number_rel_ref_sq,  text, ts, te)
+          emit(:backref, :number_rel_ref_sq, text, ts, te)
         else
-          emit(:backref, :number_rel_call_sq,  text, ts, te)
+          emit(:backref, :number_rel_call_sq, text, ts, te)
         end
       when /^\\k<[^\d-](\w+)?[+\-]\d+>/ # angle-brackets
-        emit(:backref, :name_nest_ref_ab,  text, ts, te)
+        emit(:backref, :name_recursion_ref_ab, text, ts, te)
       when /^\\k'[^\d-](\w+)?[+\-]\d+'/ # single-quotes
-        emit(:backref, :name_nest_ref_sq,  text, ts, te)
+        emit(:backref, :name_recursion_ref_sq, text, ts, te)
-      when /^\\([gk])<\d+[+\-]\d+>/ # angle-brackets
-        emit(:backref, :number_nest_ref_ab,  text, ts, te)
+      when /^\\([gk])<-?\d+[+\-]\d+>/ # angle-brackets
+        emit(:backref, :number_recursion_ref_ab, text, ts, te)
-      when /^\\([gk])'\d+[+\-]\d+'/ # single-quotes
-        emit(:backref, :number_nest_ref_sq,  text, ts, te)
+      when /^\\([gk])'-?\d+[+\-]\d+'/ # single-quotes
+        emit(:backref, :number_recursion_ref_sq, text, ts, te)
       else
         raise ScannerError.new(
@@ -859,8 +798,11 @@ class Regexp::Scanner
     self.group_depth = 0
     self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
-    in_set, set_depth, set_type = false, 0, :set
-    in_conditional, conditional_depth, conditional_stack = false, 0, []
+    in_set = false
+    set_depth = 0
+    in_conditional = false
+    conditional_depth = 0
+    conditional_stack = []
     %% write data;
     %% write init;
@@ -882,6 +824,18 @@ class Regexp::Scanner
     tokens
   end
+  # lazy-load property maps when first needed
+  require 'yaml'
+  PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
+  def self.short_prop_map
+    @short_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/short.yml")
+  end
+  def self.long_prop_map
+    @long_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/long.yml")
+  end
   # Emits an array with the details of the scanned pattern
   def emit(type, token, text, ts, te)
     #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
@@ -986,6 +940,8 @@ class Regexp::Scanner
   end
   def emit_options(text, ts, te)
+    token = nil
     if text =~ /\(\?([mixdau]*)-?([mix]*)(:)?/
       positive, negative, group_local = $1, $2, $3
@@ -1001,13 +957,15 @@ class Regexp::Scanner
       if group_local
         spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
+        token = :options
       else
         # switch for parent group level
         spacing_stack.last[:free_spacing] = free_spacing
+        token = :options_switch
       end
     end
-    emit(:group, :options, text, ts, te)
+    emit(:group, token, text, ts, te)
   end
   # Centralizes and unifies the handling of validation related