RubyGems - regexp_parser - Versions diffs - 1.5.0 → 1.8.0 - Mend

regexp_parser 1.5.0 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87)

checksums.yaml +4 -4
data/CHANGELOG.md +59 -0
data/Gemfile +3 -3
data/README.md +14 -6
data/Rakefile +3 -4
data/lib/regexp_parser/expression.rb +6 -43
data/lib/regexp_parser/expression/classes/conditional.rb +3 -2
data/lib/regexp_parser/expression/classes/escape.rb +0 -4
data/lib/regexp_parser/expression/methods/match.rb +13 -0
data/lib/regexp_parser/expression/methods/match_length.rb +1 -1
data/lib/regexp_parser/expression/methods/options.rb +35 -0
data/lib/regexp_parser/expression/methods/strfregexp.rb +0 -1
data/lib/regexp_parser/expression/methods/tests.rb +6 -15
data/lib/regexp_parser/expression/methods/traverse.rb +3 -1
data/lib/regexp_parser/expression/sequence.rb +3 -2
data/lib/regexp_parser/expression/sequence_operation.rb +2 -6
data/lib/regexp_parser/lexer.rb +4 -25
data/lib/regexp_parser/parser.rb +40 -33
data/lib/regexp_parser/scanner.rb +1208 -1353
data/lib/regexp_parser/scanner/char_type.rl +0 -3
data/lib/regexp_parser/scanner/properties/long.yml +15 -1
data/lib/regexp_parser/scanner/properties/short.yml +5 -0
data/lib/regexp_parser/scanner/scanner.rl +116 -202
data/lib/regexp_parser/syntax/tokens/unicode_property.rb +30 -0
data/lib/regexp_parser/syntax/versions/2.6.2.rb +10 -0
data/lib/regexp_parser/syntax/versions/2.6.3.rb +10 -0
data/lib/regexp_parser/version.rb +1 -1
data/spec/expression/base_spec.rb +14 -0
data/spec/expression/methods/match_length_spec.rb +20 -0
data/spec/expression/methods/match_spec.rb +25 -0
data/spec/expression/methods/tests_spec.rb +2 -0
data/spec/expression/methods/traverse_spec.rb +21 -0
data/spec/expression/options_spec.rb +128 -0
data/spec/expression/root_spec.rb +9 -0
data/spec/expression/sequence_spec.rb +9 -0
data/spec/lexer/conditionals_spec.rb +49 -119
data/spec/lexer/delimiters_spec.rb +68 -0
data/spec/lexer/escapes_spec.rb +8 -32
data/spec/lexer/keep_spec.rb +5 -17
data/spec/lexer/literals_spec.rb +73 -110
data/spec/lexer/nesting_spec.rb +86 -117
data/spec/lexer/refcalls_spec.rb +51 -50
data/spec/parser/all_spec.rb +13 -1
data/spec/parser/anchors_spec.rb +9 -23
data/spec/parser/conditionals_spec.rb +9 -9
data/spec/parser/errors_spec.rb +22 -43
data/spec/parser/escapes_spec.rb +33 -44
data/spec/parser/free_space_spec.rb +25 -4
data/spec/parser/groups_spec.rb +98 -257
data/spec/parser/keep_spec.rb +2 -15
data/spec/parser/options_spec.rb +28 -0
data/spec/parser/posix_classes_spec.rb +5 -24
data/spec/parser/properties_spec.rb +42 -54
data/spec/parser/quantifiers_spec.rb +42 -283
data/spec/parser/refcalls_spec.rb +60 -185
data/spec/parser/set/intersections_spec.rb +17 -17
data/spec/parser/set/ranges_spec.rb +17 -17
data/spec/parser/sets_spec.rb +5 -5
data/spec/parser/types_spec.rb +11 -36
data/spec/scanner/anchors_spec.rb +13 -28
data/spec/scanner/conditionals_spec.rb +121 -173
data/spec/scanner/delimiters_spec.rb +52 -0
data/spec/scanner/errors_spec.rb +64 -87
data/spec/scanner/escapes_spec.rb +53 -50
data/spec/scanner/free_space_spec.rb +102 -165
data/spec/scanner/groups_spec.rb +45 -64
data/spec/scanner/keep_spec.rb +5 -28
data/spec/scanner/literals_spec.rb +45 -81
data/spec/scanner/meta_spec.rb +13 -33
data/spec/scanner/options_spec.rb +36 -0
data/spec/scanner/properties_spec.rb +43 -286
data/spec/scanner/quantifiers_spec.rb +13 -28
data/spec/scanner/refcalls_spec.rb +32 -48
data/spec/scanner/sets_spec.rb +88 -102
data/spec/scanner/types_spec.rb +10 -25
data/spec/spec_helper.rb +1 -0
data/spec/support/shared_examples.rb +77 -0
data/spec/syntax/syntax_spec.rb +4 -0
data/spec/syntax/versions/1.8.6_spec.rb +12 -33
data/spec/syntax/versions/1.9.1_spec.rb +5 -18
data/spec/syntax/versions/1.9.3_spec.rb +4 -17
data/spec/syntax/versions/2.0.0_spec.rb +8 -23
data/spec/syntax/versions/2.2.0_spec.rb +4 -17
data/spec/syntax/versions/aliases_spec.rb +27 -109
metadata +28 -10
data/spec/scanner/scripts_spec.rb +0 -49
data/spec/scanner/unicode_blocks_spec.rb +0 -28

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 707834a32bc2295b448953730eabddabb11bafc68fdf3148174f7be61d8b1f30
-  data.tar.gz: 72d199d28c342d6aae178a5876a7df6f59abcdf40c6ac4f05ea9dc40d16d9f3a
+  metadata.gz: 9c8e3ba5269d32f57fdcad4ba98f16bc88b74713106a23f63ba2728fa89cf802
+  data.tar.gz: bbb1fe8c3f72b750a707a9f6c237c752455b4cd34d2bc8b0f255284fd32d4ed3
 SHA512:
-  metadata.gz: 4c1402afedc1efb79f633ee93065598b64732519ba587ca3f682eb8bbb4aaa264e31dd916b95f9751f7c6e85e867efa260501b40e55409327efa5b769346a183
-  data.tar.gz: 6335cbc411b08adb64bfca9646eebc3a5c39d4651a2495d34f87fca21927da3a363fc320159a3732cdd9e2d8732986190fcd6c9d523b7308531f91848951ccbd
+  metadata.gz: 93f94773ee6cb173771608ecdaf67e9e444e1ba7922cd97b8124a6ec90868e94b484ad6bdd0599d9ae9c2247e2a1627cdb03c620ce85f62d320ba8b1fdbb63bb
+  data.tar.gz: 9e1927e7c10d182bce099f24964f8c7e88ac76d479d22c632ab09406bc37711c9cc78af53892d46b009d0f12c8c3ffcd2221b2a1b2ca5365b3d3c1797448f3f2

data/CHANGELOG.md CHANGED

@@ -1,5 +1,64 @@
 ## [Unreleased]
+### [1.8.0] - 2020-09-20 - [Janosch Müller](mailto:janosch84@gmail.com)
+### Changed
+- dropped support for running on Ruby 1.9.x
+### Added
+- regexp flags can now be passed when parsing a `String` as regexp body
+  * see the [README](/README.md#usage) for details
+  * thanks to [Owen Stephens](https://github.com/owst)
+- bare occurrences of `\g` and `\k` are now allowed and scanned as literal escapes
+  * matches Onigmo behavior
+  * thanks for the report to [Marc-André Lafortune](https://github.com/marcandre)
+### Fixed
+- fixed parsing comments without preceding space or trailing newline in x-mode
+  * thanks to [Owen Stephens](https://github.com/owst)
+### [1.7.1] - 2020-06-07 - [Ammar Ali](mailto:ammarabuali@gmail.com)
+### Fixed
+- Support for literals that include the unescaped delimiters `{`, `}`, and `]`. These
+  delimiters are informally supported by various regexp engines.
+### [1.7.0] - 2020-02-23 - [Janosch Müller](mailto:janosch84@gmail.com)
+### Added
+- `Expression#each_expression` and `#traverse` can now be called without a block
+  * this returns an `Enumerator` and allows chaining, e.g. `each_expression.select`
+  * thanks to [Masataka Kuwabara](https://github.com/pocke)
+### Fixed
+- `MatchLength#each` no longer ignores the given `limit:` when called without a block
+### [1.6.0] - 2019-06-16 - [Janosch Müller](mailto:janosch84@gmail.com)
+### Added
+- Added support for 16 new unicode properties introduced in Ruby 2.6.2 and 2.6.3
+### [1.5.1] - 2019-05-23 - [Janosch Müller](mailto:janosch84@gmail.com)
+### Fixed
+- Fixed `#options` (and thus `#i?`, `#u?` etc.) not being set for some expressions:
+  * this affected posix classes as well as alternation, conditional, and intersection branches
+  * `#options` was already correct for all child expressions of such branches
+  * this only made an operational difference for posix classes as they respect encoding flags
+- Fixed `#options` not respecting all negative options in weird cases like '(?u-m-x)'
+- Fixed `Group#option_changes` not accounting for indirectly disabled (overridden) encoding flags
+- Fixed `Scanner` allowing negative encoding options if there were no positive options, e.g. '(?-u)'
+- Fixed `ScannerError` for some valid meta/control sequences such as '\\C-\\\\'
+- Fixed `Expression#match` and `#=~` not working with a single argument
 ### [1.5.0] - 2019-05-14 - [Janosch Müller](mailto:janosch84@gmail.com)
 ### Added

data/Gemfile CHANGED

@@ -3,7 +3,7 @@ source 'https://rubygems.org'
 gemspec
 group :development, :test do
-  gem 'rake'
-  gem 'regexp_property_values'
-  gem 'rspec'
+  gem 'rake', '~> 13.0'
+  gem 'regexp_property_values', '~> 1.0'
+  gem 'rspec', '~> 3.8'
 end

data/README.md CHANGED

@@ -8,7 +8,7 @@ A Ruby gem for tokenizing, parsing, and transforming regular expressions.
   * A scanner/tokenizer based on [Ragel](http://www.colm.net/open-source/ragel/)
   * A lexer that produces a "stream" of token objects.
   * A parser that produces a "tree" of Expression objects (OO API)
-* Runs on Ruby 1.9, 2.x, and JRuby (1.9 mode) runtimes.
+* Runs on Ruby 2.x and JRuby runtimes
 * Recognizes Ruby 1.8, 1.9, and 2.x regular expressions [See Supported Syntax](#supported-syntax)
@@ -72,6 +72,17 @@ called with the results as follows:
 * **Parser**: after completion, the block gets passed the root expression.
   _The result of the block is returned._
+All three methods accept either a `Regexp` or `String` (containing the pattern)
+- if a String is passed, `options` can be supplied:
+```ruby
+require 'regexp_parser'
+Regexp::Parser.parse(
+  "a+ # Recognises a and A...",
+  options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE
+)
+```
 ---
 ## Components
@@ -136,11 +147,8 @@ Regexp::Scanner.scan( /(cat?([bhm]at)){3,5}/ ).map {|token| token[2]}
     to the lexer.
   * The MRI implementation may accept expressions that either conflict with
-    the documentation or are undocumented. The scanner does not support such
-    implementation quirks.
-    _(See issues [#3](https://github.com/ammar/regexp_parser/issues/3) and
-    [#15](https://github.com/ammar/regexp_parser/issues/15) for examples)_
+    the documentation or are undocumented, like `{}` and `]` _(unescaped)_.
+    The scanner will try to support as many of these cases as possible.
 ---
 ### Syntax

data/Rakefile CHANGED

@@ -74,14 +74,13 @@ namespace :props do
       puts "Wrote #{hash.count} aliases to `#{path}`"
     end
-    _, long_names = RegexpPropertyValues.short_and_long_names
-    long_names_to_tokens = long_names.map do |name|
-      [name.downcase.gsub(/[^0-9a-z=.]/, ''), name.downcase]
+    long_names_to_tokens = RegexpPropertyValues.all.map do |val|
+      [val.identifier, val.full_name.downcase]
     end
     write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.yml")
     short_names_to_tokens = RegexpPropertyValues.alias_hash.map do |k, v|
-      [k.downcase.gsub(/[^0-9a-z=.]/, ''), v.downcase]
+      [k.identifier, v.full_name.downcase]
     end
     write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.yml")
   end

data/lib/regexp_parser/expression.rb CHANGED

@@ -97,47 +97,6 @@ module Regexp::Expression
       quantified? and quantifier.possessive?
     end
-    def multiline?
-      options[:m] == true
-    end
-    alias :m? :multiline?
-    def case_insensitive?
-      options[:i] == true
-    end
-    alias :i? :case_insensitive?
-    alias :ignore_case? :case_insensitive?
-    def free_spacing?
-      options[:x] == true
-    end
-    alias :x? :free_spacing?
-    alias :extended? :free_spacing?
-    def default_classes?
-      options[:d] == true
-    end
-    alias :d? :default_classes?
-    def ascii_classes?
-      options[:a] == true
-    end
-    alias :a? :ascii_classes?
-    def unicode_classes?
-      options[:u] == true
-    end
-    alias :u? :unicode_classes?
-    def matches?(string)
-      Regexp.new(to_s) =~ string ? true : false
-    end
-    def match(string, offset)
-      Regexp.new(to_s).match(string, offset)
-    end
-    alias :=~ :match
     def attributes
       {
         type:              type,
@@ -156,12 +115,14 @@ module Regexp::Expression
   end
   def self.parsed(exp)
+    warn('WARNING: Regexp::Expression::Base.parsed is buggy and '\
+         'will be removed in 2.0.0. Use Regexp::Parser.parse instead.')
     case exp
     when String
       Regexp::Parser.parse(exp)
     when Regexp
-      Regexp::Parser.parse(exp.source)
-    when Regexp::Expression
+      Regexp::Parser.parse(exp.source) # <- causes loss of root options
+    when Regexp::Expression            # <- never triggers
       exp
     else
       raise ArgumentError, 'Expression.parsed accepts a String, Regexp, or '\
@@ -194,7 +155,9 @@ require 'regexp_parser/expression/classes/set/intersection'
 require 'regexp_parser/expression/classes/set/range'
 require 'regexp_parser/expression/classes/type'
+require 'regexp_parser/expression/methods/match'
 require 'regexp_parser/expression/methods/match_length'
+require 'regexp_parser/expression/methods/options'
 require 'regexp_parser/expression/methods/strfregexp'
 require 'regexp_parser/expression/methods/tests'
 require 'regexp_parser/expression/methods/traverse'

data/lib/regexp_parser/expression/classes/conditional.rb CHANGED

@@ -26,9 +26,10 @@ module Regexp::Expression
         expressions.last << exp
       end
-      def add_sequence
+      def add_sequence(active_opts = {})
         raise TooManyBranches.new if branches.length == 2
-        Branch.add_to(self, { conditional_level: conditional_level + 1 })
+        params = { conditional_level: conditional_level + 1 }
+        Branch.add_to(self, params, active_opts)
       end
       alias :branch :add_sequence

data/lib/regexp_parser/expression/classes/escape.rb CHANGED

@@ -60,10 +60,6 @@ module Regexp::Expression
         codepoint.chr('utf-8')
       end
-      def codepoint
-        raise NotImplementedError, 'implement in subclass'
-      end
       private
       def control_sequence_to_s(control_sequence)

data/lib/regexp_parser/expression/methods/match.rb ADDED

@@ -0,0 +1,13 @@
+module Regexp::Expression
+  class Base
+    def match?(string)
+      !!match(string)
+    end
+    alias :matches? :match?
+    def match(string, offset = 0)
+      Regexp.new(to_s).match(string, offset)
+    end
+    alias :=~ :match
+  end
+end

data/lib/regexp_parser/expression/methods/match_length.rb CHANGED

@@ -22,7 +22,7 @@ class Regexp::MatchLength
   end
   def each(opts = {})
-    return enum_for(__method__) unless block_given?
+    return enum_for(__method__, opts) unless block_given?
     limit = opts[:limit] || 1000
     yielded = 0
     (min..max).each do |num|

data/lib/regexp_parser/expression/methods/options.rb ADDED

@@ -0,0 +1,35 @@
+module Regexp::Expression
+  class Base
+    def multiline?
+      options[:m] == true
+    end
+    alias :m? :multiline?
+    def case_insensitive?
+      options[:i] == true
+    end
+    alias :i? :case_insensitive?
+    alias :ignore_case? :case_insensitive?
+    def free_spacing?
+      options[:x] == true
+    end
+    alias :x? :free_spacing?
+    alias :extended? :free_spacing?
+    def default_classes?
+      options[:d] == true
+    end
+    alias :d? :default_classes?
+    def ascii_classes?
+      options[:a] == true
+    end
+    alias :a? :ascii_classes?
+    def unicode_classes?
+      options[:u] == true
+    end
+    alias :u? :unicode_classes?
+  end
+end

data/lib/regexp_parser/expression/methods/strfregexp.rb CHANGED

@@ -1,5 +1,4 @@
 module Regexp::Expression
   class Base
     #   %l  Level (depth) of the expression. Returns 'root' for the root

data/lib/regexp_parser/expression/methods/tests.rb CHANGED

@@ -75,32 +75,23 @@ module Regexp::Expression
     def one_of?(scope, top = true)
       case scope
       when Array
-        if scope.include?(:*)
-          return (scope.include?(token) or scope.include?(:*))
-        else
-          return scope.include?(token)
-        end
+        scope.include?(:*) || scope.include?(token)
       when Hash
         if scope.has_key?(:*)
           test_type = scope.has_key?(type) ? type : :*
-          return one_of?(scope[test_type], false)
+          one_of?(scope[test_type], false)
         else
-          return (scope.has_key?(type) and one_of?(scope[type], false))
+          scope.has_key?(type) && one_of?(scope[type], false)
         end
       when Symbol
-        return true if scope == :*
-        return is?(scope) unless top
-        return type?(scope) if top
+        scope.equal?(:*) || (top ? type?(scope) : is?(scope))
       else
-        raise "Array, Hash, or Symbol expected, #{scope.class.name} given"
+        raise ArgumentError,
+              "Array, Hash, or Symbol expected, #{scope.class.name} given"
       end
-      false
     end
   end
 end

data/lib/regexp_parser/expression/methods/traverse.rb CHANGED

@@ -14,7 +14,7 @@ module Regexp::Expression
     #
     # Returns self.
     def traverse(include_self = false, &block)
-      raise 'traverse requires a block' unless block_given?
+      return enum_for(__method__, include_self) unless block_given?
       block.call(:enter, self, 0) if include_self
@@ -37,6 +37,8 @@ module Regexp::Expression
     # Iterates over the expressions of this expression as an array, passing
     # the expression and its index within its parent to the given block.
     def each_expression(include_self = false, &block)
+      return enum_for(__method__, include_self) unless block_given?
       traverse(include_self) do |event, exp, index|
         yield(exp, index) unless event == :exit
       end

data/lib/regexp_parser/expression/sequence.rb CHANGED

@@ -18,13 +18,14 @@ module Regexp::Expression
     end
     class << self
-      def add_to(subexpression, options = {})
+      def add_to(subexpression, params = {}, active_opts = {})
         sequence = at_levels(
           subexpression.level,
           subexpression.set_level,
-          options[:conditional_level] || subexpression.conditional_level
+          params[:conditional_level] || subexpression.conditional_level
         )
         sequence.nesting_level = subexpression.nesting_level + 1
+        sequence.options = active_opts
         subexpression.expressions << sequence
         sequence
       end

data/lib/regexp_parser/expression/sequence_operation.rb CHANGED

@@ -14,12 +14,8 @@ module Regexp::Expression
       expressions.last << exp
     end
-    def add_sequence
-      self.class::OPERAND.add_to(self)
-    end
-    def quantify(token, text, min = nil, max = nil, mode = :greedy)
-      sequences.last.last.quantify(token, text, min, max, mode)
+    def add_sequence(active_opts = {})
+      self.class::OPERAND.add_to(self, {}, active_opts)
     end
     def to_s(format = :full)

data/lib/regexp_parser/lexer.rb CHANGED

@@ -11,11 +11,11 @@ class Regexp::Lexer
   CLOSING_TOKENS = [:close].freeze
-  def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", &block)
-    new.lex(input, syntax, &block)
+  def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
+    new.lex(input, syntax, options: options, &block)
   end
-  def lex(input, syntax = "ruby/#{RUBY_VERSION}", &block)
+  def lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
     syntax = Regexp::Syntax.new(syntax)
     self.tokens = []
@@ -25,7 +25,7 @@ class Regexp::Lexer
     self.shift = 0
     last = nil
-    Regexp::Scanner.scan(input) do |type, token, text, ts, te|
+    Regexp::Scanner.scan(input, options: options) do |type, token, text, ts, te|
       type, token = *syntax.normalize(type, token)
       syntax.check! type, token
@@ -39,10 +39,6 @@ class Regexp::Lexer
       current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
                                   nesting, set_nesting, conditional_nesting)
-      current = merge_literal(current) if type == :literal and
-        set_nesting == 0 and
-        last and last.type == :literal
       current = merge_condition(current) if type == :conditional and
         [:condition, :condition_close].include?(token)
@@ -122,23 +118,6 @@ class Regexp::Lexer
     self.shift = shift + 3 # one space less, but extra \, u, {, and }
   end
-  # called by scan to merge two consecutive literals. this happens when tokens
-  # get normalized (as in the case of posix/bre) and end up becoming literals.
-  def merge_literal(current)
-    last = tokens.pop
-    Regexp::Token.new(
-      :literal,
-      :literal,
-      last.text + current.text,
-      last.ts,
-      current.te,
-      nesting,
-      set_nesting,
-      conditional_nesting,
-    )
-  end
   def merge_condition(current)
     last = tokens.pop
     Regexp::Token.new(:conditional, :condition, last.text + current.text,