RubyGems - regexp_parser - Versions diffs - 2.4.0 → 2.7.0 - Mend

regexp_parser 2.4.0 → 2.7.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +98 -42
data/README.md +46 -30
data/lib/regexp_parser/expression/base.rb +17 -9
data/lib/regexp_parser/expression/classes/backreference.rb +19 -2
data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -0
data/lib/regexp_parser/expression/classes/conditional.rb +8 -0
data/lib/regexp_parser/expression/classes/escape_sequence.rb +1 -1
data/lib/regexp_parser/expression/classes/group.rb +10 -0
data/lib/regexp_parser/expression/classes/keep.rb +2 -0
data/lib/regexp_parser/expression/classes/root.rb +3 -5
data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +1 -0
data/lib/regexp_parser/expression/methods/construct.rb +43 -0
data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
data/lib/regexp_parser/expression/methods/match_length.rb +9 -5
data/lib/regexp_parser/expression/methods/traverse.rb +6 -3
data/lib/regexp_parser/expression/quantifier.rb +6 -5
data/lib/regexp_parser/expression/sequence.rb +6 -21
data/lib/regexp_parser/expression/shared.rb +20 -3
data/lib/regexp_parser/expression/subexpression.rb +4 -1
data/lib/regexp_parser/expression.rb +4 -2
data/lib/regexp_parser/lexer.rb +61 -29
data/lib/regexp_parser/parser.rb +36 -26
data/lib/regexp_parser/scanner/property.rl +1 -1
data/lib/regexp_parser/scanner/scanner.rl +57 -42
data/lib/regexp_parser/scanner.rb +873 -823
data/lib/regexp_parser/syntax/token/escape.rb +1 -1
data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
data/lib/regexp_parser/syntax/versions.rb +2 -0
data/lib/regexp_parser/version.rb +1 -1
metadata +7 -5

data/lib/regexp_parser/scanner/scanner.rl CHANGED Viewed

@@ -59,9 +59,6 @@
   one_or_more           = '+' | '+?' | '++';
   quantifier_greedy     = '?'  | '*'  | '+';
-  quantifier_reluctant  = '??' | '*?' | '+?';
-  quantifier_possessive = '?+' | '*+' | '++';
-  quantifier_mode       = '?'  | '+';
   quantity_exact        = (digit+);
   quantity_minimum      = (digit+) . ',';
@@ -70,9 +67,6 @@
   quantifier_interval   = range_open . ( quantity_exact | quantity_minimum |
                           quantity_maximum | quantity_range ) . range_close;
-  quantifiers           = quantifier_greedy | quantifier_reluctant |
-                          quantifier_possessive | quantifier_interval;
   conditional           = '(?(';
   group_comment         = '?#' . [^)]* . group_close;
@@ -90,8 +84,8 @@
   group_options         = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
   group_ref             = [gk];
-  group_name_id_ab      = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
-  group_name_id_sq      = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
+  group_name_id_ab      = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
+  group_name_id_sq      = ([^0-9\-']  | utf8_multibyte) . ([^'] | utf8_multibyte)*;
   group_number          = '-'? . [1-9] . [0-9]*;
   group_level           = [+\-] . [0-9]+;
@@ -132,7 +126,8 @@
                           keep_mark | sequence_char;
   # escapes that also work within a character set
-  set_escape            = backslash | brackets | escaped_ascii | property_char |
+  set_escape            = backslash | brackets | escaped_ascii |
+                          octal_sequence | property_char |
                           sequence_char | single_codepoint_char_type;
@@ -168,8 +163,8 @@
     };
     '-]' @set_closed { # special case, emits two tokens
-      emit(:literal, :literal, copy(data, ts, te-1))
-      emit(:set, :close, copy(data, ts+1, te))
+      emit(:literal, :literal, '-')
+      emit(:set, :close, ']')
       if in_set?
         fret;
       else
@@ -183,28 +178,27 @@
     };
     '^' {
-      text = copy(data, ts, te)
-      if tokens.last[1] == :open
-        emit(:set, :negate, text)
+      if prev_token[1] == :open
+        emit(:set, :negate, '^')
       else
-        emit(:literal, :literal, text)
+        emit(:literal, :literal, '^')
       end
     };
     '-' {
-      text = copy(data, ts, te)
-      # ranges cant start with a subset or intersection/negation/range operator
-      if tokens.last[0] == :set
-        emit(:literal, :literal, text)
+      # ranges cant start with the opening bracket, a subset, or
+      # intersection/negation/range operators
+      if prev_token[0] == :set
+        emit(:literal, :literal, '-')
       else
-        emit(:set, :range, text)
+        emit(:set, :range, '-')
       end
     };
     # Unlike ranges, intersections can start or end at set boundaries, whereupon
     # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
     '&&' {
-      emit(:set, :intersection, copy(data, ts, te))
+      emit(:set, :intersection, '&&')
     };
     backslash {
@@ -212,7 +206,7 @@
     };
     set_open >(open_bracket, 1) >set_opened {
-      emit(:set, :open, copy(data, ts, te))
+      emit(:set, :open, '[')
       fcall character_set;
     };
@@ -254,12 +248,22 @@
   # set escapes scanner
   # --------------------------------------------------------------------------
   set_escape_sequence := |*
+    # Special case: in sets, octal sequences have higher priority than backrefs
+    octal_sequence {
+      emit(:escape, :octal, copy(data, ts-1, te))
+      fret;
+    };
+    # Scan all other escapes that work in sets with the generic escape scanner
     set_escape > (escaped_set_alpha, 2) {
       fhold;
       fnext character_set;
       fcall escape_sequence;
     };
+    # Treat all remaining escapes - those not supported in sets - as literal.
+    # (This currently includes \^, \-, \&, \:, although these could potentially
+    # be meta chars when not escaped, depending on their position in the set.)
     any > (escaped_set_alpha, 1) {
       emit(:escape, :literal, copy(data, ts-1, te))
       fret;
@@ -528,7 +532,7 @@
     group_close @group_closed {
       if conditional_stack.last == group_depth + 1
         conditional_stack.pop
-        emit(:conditional, :close, copy(data, ts, te))
+        emit(:conditional, :close, ')')
       else
         if spacing_stack.length > 1 &&
            spacing_stack.last[:depth] == group_depth + 1
@@ -536,7 +540,7 @@
           self.free_spacing = spacing_stack.last[:free_spacing]
         end
-        emit(:group, :close, copy(data, ts, te))
+        emit(:group, :close, ')')
       end
     };
@@ -717,23 +721,24 @@ class Regexp::Scanner
   #
   # This method may raise errors if a syntax error is encountered.
   # --------------------------------------------------------------------------
-  def self.scan(input_object, options: nil, &block)
-    new.scan(input_object, options: options, &block)
+  def self.scan(input_object, options: nil, collect_tokens: true, &block)
+    new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
   end
-  def scan(input_object, options: nil, &block)
-    self.literal = nil
+  def scan(input_object, options: nil, collect_tokens: true, &block)
+    self.collect_tokens = collect_tokens
+    self.literal_run = nil
     stack = []
     input = input_object.is_a?(Regexp) ? input_object.source : input_object
     self.free_spacing = free_spacing?(input_object, options)
     self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
-    data  = input.unpack("c*") if input.is_a?(String)
+    data  = input.unpack("c*")
     eof   = data.length
     self.tokens = []
-    self.block  = block_given? ? block : nil
+    self.block  = block
     self.set_depth = 0
     self.group_depth = 0
@@ -758,7 +763,7 @@ class Regexp::Scanner
           "[#{set_depth}]") if in_set?
     # when the entire expression is a literal run
-    emit_literal if literal
+    emit_literal if literal_run
     tokens
   end
@@ -785,26 +790,37 @@ class Regexp::Scanner
   def emit(type, token, text)
     #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
-    emit_literal if literal
+    emit_literal if literal_run
     # Ragel runs with byte-based indices (ts, te). These are of little value to
     # end-users, so we keep track of char-based indices and emit those instead.
     ts_char_pos = char_pos
     te_char_pos = char_pos + text.length
-    if block
-      block.call type, token, text, ts_char_pos, te_char_pos
-    end
+    tok = [type, token, text, ts_char_pos, te_char_pos]
-    tokens << [type, token, text, ts_char_pos, te_char_pos]
+    self.prev_token = tok
     self.char_pos = te_char_pos
+    if block
+      block.call type, token, text, ts_char_pos, te_char_pos
+      # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
+      tokens << tok if collect_tokens
+    elsif collect_tokens
+      tokens << tok
+    end
   end
+  attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
   private
-  attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
-                :group_depth, :set_depth, :conditional_stack, :char_pos
+  attr_accessor :block,
+                :collect_tokens, :tokens, :prev_token,
+                :free_spacing, :spacing_stack,
+                :group_depth, :set_depth, :conditional_stack,
+                :char_pos
   def free_spacing?(input_object, options)
     if options && !input_object.is_a?(String)
@@ -834,14 +850,13 @@ class Regexp::Scanner
   # Appends one or more characters to the literal buffer, to be emitted later
   # by a call to emit_literal.
   def append_literal(data, ts, te)
-    self.literal = literal || []
-    literal << copy(data, ts, te)
+    (self.literal_run ||= []) << copy(data, ts, te)
   end
   # Emits the literal run collected by calls to the append_literal method.
   def emit_literal
-    text = literal.join
-    self.literal = nil
+    text = literal_run.join
+    self.literal_run = nil
     emit(:literal, :literal, text)
   end