RubyGems - regexp_parser - Versions diffs - 2.6.0 → 2.9.2 - Mend

regexp_parser 2.6.0 → 2.9.2

This diff shows the changes between publicly available versions of this package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (54)

checksums.yaml +4 -4
data/Gemfile +5 -5
data/LICENSE +1 -1
data/lib/regexp_parser/expression/base.rb +0 -7
data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
data/lib/regexp_parser/expression/classes/backreference.rb +17 -3
data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
data/lib/regexp_parser/expression/classes/character_set.rb +4 -8
data/lib/regexp_parser/expression/classes/conditional.rb +2 -6
data/lib/regexp_parser/expression/classes/escape_sequence.rb +3 -1
data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
data/lib/regexp_parser/expression/classes/group.rb +0 -22
data/lib/regexp_parser/expression/classes/keep.rb +1 -1
data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
data/lib/regexp_parser/expression/classes/unicode_property.rb +11 -11
data/lib/regexp_parser/expression/methods/construct.rb +2 -4
data/lib/regexp_parser/expression/methods/match_length.rb +8 -4
data/lib/regexp_parser/expression/methods/negative.rb +20 -0
data/lib/regexp_parser/expression/methods/parts.rb +23 -0
data/lib/regexp_parser/expression/methods/printing.rb +26 -0
data/lib/regexp_parser/expression/methods/tests.rb +40 -3
data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
data/lib/regexp_parser/expression/quantifier.rb +30 -17
data/lib/regexp_parser/expression/sequence.rb +5 -10
data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
data/lib/regexp_parser/expression/shared.rb +37 -20
data/lib/regexp_parser/expression/subexpression.rb +20 -15
data/lib/regexp_parser/expression.rb +34 -31
data/lib/regexp_parser/lexer.rb +76 -36
data/lib/regexp_parser/parser.rb +101 -100
data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
data/lib/regexp_parser/scanner/properties/long.csv +29 -0
data/lib/regexp_parser/scanner/properties/short.csv +3 -0
data/lib/regexp_parser/scanner/property.rl +2 -2
data/lib/regexp_parser/scanner/scanner.rl +101 -172
data/lib/regexp_parser/scanner.rb +1132 -1283
data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
data/lib/regexp_parser/syntax/token/escape.rb +3 -1
data/lib/regexp_parser/syntax/token/meta.rb +9 -2
data/lib/regexp_parser/syntax/token/unicode_property.rb +35 -1
data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
data/lib/regexp_parser/syntax/token.rb +13 -13
data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
data/lib/regexp_parser/syntax/versions.rb +3 -1
data/lib/regexp_parser/syntax.rb +1 -1
data/lib/regexp_parser/version.rb +1 -1
data/lib/regexp_parser.rb +6 -6
data/regexp_parser.gemspec +5 -5
metadata +14 -8
data/CHANGELOG.md +0 -601
data/README.md +0 -503

data/lib/regexp_parser/scanner/scanner.rl CHANGED

@@ -30,11 +30,6 @@
   class_posix           = ('[:' . '^'? . [^\[\]]* . ':]');
-  # these are not supported in ruby at the moment
-  collating_sequence    = '[.' . (alpha | [\-])+ . '.]';
-  character_equivalent  = '[=' . alpha . '=]';
   line_anchor           = beginning_of_line | end_of_line;
   anchor_char           = [AbBzZG];
@@ -59,9 +54,6 @@
   one_or_more           = '+' | '+?' | '++';
   quantifier_greedy     = '?'  | '*'  | '+';
-  quantifier_reluctant  = '??' | '*?' | '+?';
-  quantifier_possessive = '?+' | '*+' | '++';
-  quantifier_mode       = '?'  | '+';
   quantity_exact        = (digit+);
   quantity_minimum      = (digit+) . ',';
@@ -70,9 +62,6 @@
   quantifier_interval   = range_open . ( quantity_exact | quantity_minimum |
                           quantity_maximum | quantity_range ) . range_close;
-  quantifiers           = quantifier_greedy | quantifier_reluctant |
-                          quantifier_possessive | quantifier_interval;
   conditional           = '(?(';
   group_comment         = '?#' . [^)]* . group_close;
@@ -89,10 +78,9 @@
   # try to treat every other group head as options group, like Ruby
   group_options         = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
-  group_ref             = [gk];
-  group_name_id_ab      = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
-  group_name_id_sq      = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
-  group_number          = '-'? . [1-9] . [0-9]*;
+  group_name_id_ab      = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
+  group_name_id_sq      = ([^0-9\-']  | utf8_multibyte) . ([^'] | utf8_multibyte)*;
+  group_number          = '-'? . [0-9]+;
   group_level           = [+\-] . [0-9]+;
   group_name            = ('<' . group_name_id_ab? . '>') |
@@ -101,15 +89,11 @@
   group_named           = ('?' . group_name );
-  group_name_backref    = 'k' . (('<' . group_name_id_ab? . group_level? '>') |
-                                 ("'" . group_name_id_sq? . group_level? "'"));
-  group_name_call       = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
-                                 ("'" . group_name_id_sq? . group_level? "'"));
+  group_ref_body        = (('<' . (group_name_id_ab? | group_number) . group_level? '>') |
+                           ("'" . (group_name_id_sq? | group_number) . group_level? "'"));
-  group_number_backref  = 'k' . (('<' . group_number . group_level? '>') |
-                                 ("'" . group_number . group_level? "'"));
-  group_number_call     = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
-                                 ("'" . ((group_number . group_level?) | '0') "'"));
+  group_ref             = 'k' . group_ref_body;
+  group_call            = 'g' . group_ref_body;
   group_type            = group_atomic | group_passive | group_absence | group_named;
@@ -132,20 +116,21 @@
                           keep_mark | sequence_char;
   # escapes that also work within a character set
-  set_escape            = backslash | brackets | escaped_ascii | property_char |
+  set_escape            = backslash | brackets | escaped_ascii |
+                          octal_sequence | property_char |
                           sequence_char | single_codepoint_char_type;
   # EOF error, used where it can be detected
   action premature_end_error {
     text = copy(data, ts ? ts-1 : 0, -1)
-    raise PrematureEndError.new( text )
+    raise PrematureEndError.new(text)
   }
   # Invalid sequence error, used from sequences, like escapes and sets
   action invalid_sequence_error {
     text = copy(data, ts ? ts-1 : 0, -1)
-    validation_error(:sequence, 'sequence', text)
+    raise ValidationError.for(:sequence, 'sequence', text)
   }
   # group (nesting) and set open/close actions
@@ -168,8 +153,8 @@
     };
     '-]' @set_closed { # special case, emits two tokens
-      emit(:literal, :literal, copy(data, ts, te-1))
-      emit(:set, :close, copy(data, ts+1, te))
+      emit(:literal, :literal, '-')
+      emit(:set, :close, ']')
       if in_set?
         fret;
       else
@@ -183,28 +168,27 @@
     };
     '^' {
-      text = copy(data, ts, te)
-      if tokens.last[1] == :open
-        emit(:set, :negate, text)
+      if prev_token[1] == :open
+        emit(:set, :negate, '^')
       else
-        emit(:literal, :literal, text)
+        emit(:literal, :literal, '^')
       end
     };
     '-' {
-      text = copy(data, ts, te)
-      # ranges cant start with a subset or intersection/negation/range operator
-      if tokens.last[0] == :set
-        emit(:literal, :literal, text)
+      # ranges cant start with the opening bracket, a subset, or
+      # intersection/negation/range operators
+      if prev_token[0] == :set
+        emit(:literal, :literal, '-')
       else
-        emit(:set, :range, text)
+        emit(:set, :range, '-')
       end
     };
     # Unlike ranges, intersections can start or end at set boundaries, whereupon
     # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
     '&&' {
-      emit(:set, :intersection, copy(data, ts, te))
+      emit(:set, :intersection, '&&')
     };
     backslash {
@@ -212,7 +196,7 @@
     };
     set_open >(open_bracket, 1) >set_opened {
-      emit(:set, :open, copy(data, ts, te))
+      emit(:set, :open, '[')
       fcall character_set;
     };
@@ -227,20 +211,12 @@
       end
       unless self.class.posix_classes.include?(class_name)
-        validation_error(:posix_class, text)
+        raise ValidationError.for(:posix_class, text)
       end
       emit(type, class_name.to_sym, text)
     };
-    # These are not supported in ruby at the moment. Enable them if they are.
-    # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
-    #   emit(:set, :collation, copy(data, ts, te))
-    # };
-    # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
-    #   emit(:set, :equivalent, copy(data, ts, te))
-    # };
     meta_char > (set_meta, 1) {
       emit(:literal, :literal, copy(data, ts, te))
     };
@@ -254,12 +230,22 @@
   # set escapes scanner
   # --------------------------------------------------------------------------
   set_escape_sequence := |*
+    # Special case: in sets, octal sequences have higher priority than backrefs
+    octal_sequence {
+      emit(:escape, :octal, copy(data, ts-1, te))
+      fret;
+    };
+    # Scan all other escapes that work in sets with the generic escape scanner
     set_escape > (escaped_set_alpha, 2) {
       fhold;
       fnext character_set;
       fcall escape_sequence;
     };
+    # Treat all remaining escapes - those not supported in sets - as literal.
+    # (This currently includes \^, \-, \&, \:, although these could potentially
+    # be meta chars when not escaped, depending on their position in the set.)
     any > (escaped_set_alpha, 1) {
       emit(:escape, :literal, copy(data, ts-1, te))
       fret;
@@ -281,6 +267,13 @@
       fret;
     };
+    [8-9] . [0-9] { # special case, emits two tokens
+      text = copy(data, ts-1, te)
+      emit(:escape, :literal, text[0, 2])
+      emit(:literal, :literal, text[2])
+      fret;
+    };
     meta_char {
       case text = copy(data, ts-1, te)
       when '\.';  emit(:escape, :dot,               text)
@@ -371,6 +364,7 @@
   conditional_expression := |*
     group_lookup . ')' {
       text = copy(data, ts, te-1)
+      text =~ /[^0]/ or raise ValidationError.for(:backref, 'condition', 'invalid ref ID')
       emit(:conditional, :condition, text)
       emit(:conditional, :condition_close, ')')
     };
@@ -453,10 +447,9 @@
     # (?#...) comments: parsed as a single expression, without introducing a
     # new nesting level. Comments may not include parentheses, escaped or not.
-    # special case for close, action performed on all transitions to get the
-    # correct closing count.
+    # special case for close to get the correct closing count.
     # ------------------------------------------------------------------------
-    group_open . group_comment $group_closed {
+    (group_open . group_comment) @group_closed {
       emit(:group, :comment, copy(data, ts, te))
     };
@@ -471,10 +464,10 @@
     #
     #   (?imxdau-imx:subexp)  option on/off for subexp
     # ------------------------------------------------------------------------
-    group_open . group_options >group_opened {
+    (group_open . group_options) >group_opened {
       text = copy(data, ts, te)
       if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
-        validation_error(:group_option, $1 || "-#{$2}", text)
+        raise ValidationError.for(:group_option, $1 || "-#{$2}", text)
       end
       emit_options(text)
     };
@@ -485,7 +478,7 @@
     #   (?<=subexp)         look-behind
     #   (?<!subexp)         negative look-behind
     # ------------------------------------------------------------------------
-    group_open . assertion_type >group_opened {
+    (group_open . assertion_type) >group_opened {
       case text = copy(data, ts, te)
       when '(?=';  emit(:assertion, :lookahead,    text)
       when '(?!';  emit(:assertion, :nlookahead,   text)
@@ -502,14 +495,14 @@
     #   (?'name'subexp)     named group (single quoted version)
     #   (subexp)            captured group
     # ------------------------------------------------------------------------
-    group_open . group_type >group_opened {
+    (group_open . group_type) >group_opened {
       case text = copy(data, ts, te)
       when '(?:';  emit(:group, :passive,      text)
       when '(?>';  emit(:group, :atomic,       text)
       when '(?~';  emit(:group, :absence,      text)
       when /^\(\?(?:<>|'')/
-        validation_error(:group, 'named group', 'name is empty')
+        raise ValidationError.for(:group, 'named group', 'name is empty')
       when /^\(\?<[^>]+>/
         emit(:group, :named_ab,  text)
@@ -528,50 +521,52 @@
     group_close @group_closed {
       if conditional_stack.last == group_depth + 1
         conditional_stack.pop
-        emit(:conditional, :close, copy(data, ts, te))
-      else
+        emit(:conditional, :close, ')')
+      elsif group_depth >= 0
         if spacing_stack.length > 1 &&
            spacing_stack.last[:depth] == group_depth + 1
           spacing_stack.pop
           self.free_spacing = spacing_stack.last[:free_spacing]
         end
-        emit(:group, :close, copy(data, ts, te))
+        emit(:group, :close, ')')
+      else
+        raise ValidationError.for(:group, 'group', 'unmatched close parenthesis')
       end
     };
     # Group backreference, named and numbered
     # ------------------------------------------------------------------------
-    backslash . (group_name_backref | group_number_backref) > (backslashed, 4) {
+    backslash . (group_ref) > (backslashed, 4) {
       case text = copy(data, ts, te)
-      when /^\\k(<>|'')/
-        validation_error(:backref, 'backreference', 'ref ID is empty')
-      when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
+      when /^\\k(.)[^0-9\-][^+\-]*['>]$/
         emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
-      when /^\\k(.)\d+\D$/
+      when /^\\k(.)0*[1-9]\d*['>]$/
         emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
-      when /^\\k(.)-\d+\D$/
+      when /^\\k(.)-0*[1-9]\d*['>]$/
         emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
-      when /^\\k(.)[^\p{digit}\-].*[+\-]\d+\D$/
+      when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/
         emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
-      when /^\\k(.)-?\d+[+\-]\d+\D$/
+      when /^\\k(.)-?0*[1-9]\d*[+\-]\d+['>]$/
         emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
+      else
+        raise ValidationError.for(:backref, 'backreference', 'invalid ref ID')
       end
     };
     # Group call, named and numbered
     # ------------------------------------------------------------------------
-    backslash . (group_name_call | group_number_call) > (backslashed, 4) {
+    backslash . (group_call) > (backslashed, 4) {
       case text = copy(data, ts, te)
-      when /^\\g(<>|'')/
-        validation_error(:backref, 'subexpression call', 'ref ID is empty')
-      when /^\\g(.)[^\p{digit}+\->][^+\-]*/
+      when /^\\g(.)[^0-9+\-].*['>]$/
         emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
-      when /^\\g(.)\d+\D$/
+      when /^\\g(.)(?:0|0*[1-9]\d*)['>]$/
         emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
-      when /^\\g(.)[+-]\d+/
+      when /^\\g(.)[+-]0*[1-9]\d*/
         emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
+      else
+        raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID')
       end
     };
@@ -645,95 +640,35 @@
   *|;
 }%%
-# THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
-# This file was generated from lib/regexp_parser/scanner/scanner.rl
-require 'regexp_parser/error'
+require_relative 'scanner/errors/scanner_error'
+require_relative 'scanner/errors/premature_end_error'
+require_relative 'scanner/errors/validation_error'
 class Regexp::Scanner
-  # General scanner error (catch all)
-  class ScannerError < Regexp::Parser::Error; end
-  # Base for all scanner validation errors
-  class ValidationError < Regexp::Parser::Error
-    def initialize(reason)
-      super reason
-    end
-  end
-  # Unexpected end of pattern
-  class PrematureEndError < ScannerError
-    def initialize(where = '')
-      super "Premature end of pattern at #{where}"
-    end
-  end
-  # Invalid sequence format. Used for escape sequences, mainly.
-  class InvalidSequenceError < ValidationError
-    def initialize(what = 'sequence', where = '')
-      super "Invalid #{what} at #{where}"
-    end
-  end
-  # Invalid group. Used for named groups.
-  class InvalidGroupError < ValidationError
-    def initialize(what, reason)
-      super "Invalid #{what}, #{reason}."
-    end
-  end
-  # Invalid groupOption. Used for inline options.
-  # TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
-  class InvalidGroupOption < ValidationError
-    def initialize(option, text)
-      super "Invalid group option #{option} in #{text}"
-    end
-  end
-  # Invalid back reference. Used for name a number refs/calls.
-  class InvalidBackrefError < ValidationError
-    def initialize(what, reason)
-      super "Invalid back reference #{what}, #{reason}"
-    end
-  end
-  # The property name was not recognized by the scanner.
-  class UnknownUnicodePropertyError < ValidationError
-    def initialize(name)
-      super "Unknown unicode character property name #{name}"
-    end
-  end
-  # The POSIX class name was not recognized by the scanner.
-  class UnknownPosixClassError < ValidationError
-    def initialize(text)
-      super "Unknown POSIX class #{text}"
-    end
-  end
   # Scans the given regular expression text, or Regexp object and collects the
   # emitted token into an array that gets returned at the end. If a block is
   # given, it gets called for each emitted token.
   #
   # This method may raise errors if a syntax error is encountered.
   # --------------------------------------------------------------------------
-  def self.scan(input_object, options: nil, &block)
-    new.scan(input_object, options: options, &block)
+  def self.scan(input_object, options: nil, collect_tokens: true, &block)
+    new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
   end
-  def scan(input_object, options: nil, &block)
-    self.literal = nil
+  def scan(input_object, options: nil, collect_tokens: true, &block)
+    self.collect_tokens = collect_tokens
+    self.literal_run = nil
     stack = []
     input = input_object.is_a?(Regexp) ? input_object.source : input_object
     self.free_spacing = free_spacing?(input_object, options)
     self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
-    data  = input.unpack("c*") if input.is_a?(String)
+    data  = input.unpack("c*")
     eof   = data.length
     self.tokens = []
-    self.block  = block_given? ? block : nil
+    self.block  = block
     self.set_depth = 0
     self.group_depth = 0
@@ -758,7 +693,7 @@ class Regexp::Scanner
           "[#{set_depth}]") if in_set?
     # when the entire expression is a literal run
-    emit_literal if literal
+    emit_literal if literal_run
     tokens
   end
@@ -785,26 +720,37 @@ class Regexp::Scanner
   def emit(type, token, text)
     #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
-    emit_literal if literal
+    emit_literal if literal_run
     # Ragel runs with byte-based indices (ts, te). These are of little value to
     # end-users, so we keep track of char-based indices and emit those instead.
     ts_char_pos = char_pos
     te_char_pos = char_pos + text.length
-    if block
-      block.call type, token, text, ts_char_pos, te_char_pos
-    end
+    tok = [type, token, text, ts_char_pos, te_char_pos]
-    tokens << [type, token, text, ts_char_pos, te_char_pos]
+    self.prev_token = tok
     self.char_pos = te_char_pos
+    if block
+      block.call type, token, text, ts_char_pos, te_char_pos
+      # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
+      tokens << tok if collect_tokens
+    elsif collect_tokens
+      tokens << tok
+    end
   end
+  attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
   private
-  attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
-                :group_depth, :set_depth, :conditional_stack, :char_pos
+  attr_accessor :block,
+                :collect_tokens, :tokens, :prev_token,
+                :free_spacing, :spacing_stack,
+                :group_depth, :set_depth, :conditional_stack,
+                :char_pos
   def free_spacing?(input_object, options)
     if options && !input_object.is_a?(String)
@@ -834,14 +780,13 @@ class Regexp::Scanner
   # Appends one or more characters to the literal buffer, to be emitted later
   # by a call to emit_literal.
   def append_literal(data, ts, te)
-    self.literal = literal || []
-    literal << copy(data, ts, te)
+    (self.literal_run ||= []) << copy(data, ts, te)
   end
   # Emits the literal run collected by calls to the append_literal method.
   def emit_literal
-    text = literal.join
-    self.literal = nil
+    text = literal_run.join
+    self.literal_run = nil
     emit(:literal, :literal, text)
   end
@@ -876,24 +821,8 @@ class Regexp::Scanner
   def emit_meta_control_sequence(data, ts, te, token)
     if data.last < 0x00 || data.last > 0x7F
-      validation_error(:sequence, 'escape', token.to_s)
+      raise ValidationError.for(:sequence, 'escape', token.to_s)
     end
     emit(:escape, token, copy(data, ts-1, te))
   end
-  # Centralizes and unifies the handling of validation related
-  # errors.
-  def validation_error(type, what, reason = nil)
-    error =
-      case type
-      when :backref      then InvalidBackrefError.new(what, reason)
-      when :group        then InvalidGroupError.new(what, reason)
-      when :group_option then InvalidGroupOption.new(what, reason)
-      when :posix_class  then UnknownPosixClassError.new(what)
-      when :property     then UnknownUnicodePropertyError.new(what)
-      when :sequence     then InvalidSequenceError.new(what, reason)
-      end
-    raise error # unless @@config.validation_ignore
-  end
 end # module Regexp::Scanner