RubyGems - regexp_parser - Versions diffs - 1.7.0 → 2.8.1 - Mend

regexp_parser 1.7.0 → 2.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (165) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +364 -22
data/Gemfile +8 -2
data/LICENSE +1 -1
data/README.md +124 -88
data/Rakefile +6 -70
data/lib/regexp_parser/error.rb +4 -0
data/lib/regexp_parser/expression/base.rb +76 -0
data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +22 -2
data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +4 -8
data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +3 -4
data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
data/lib/regexp_parser/expression/classes/conditional.rb +11 -5
data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
data/lib/regexp_parser/expression/classes/free_space.rb +5 -5
data/lib/regexp_parser/expression/classes/group.rb +28 -15
data/lib/regexp_parser/expression/classes/keep.rb +2 -0
data/lib/regexp_parser/expression/classes/literal.rb +1 -5
data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
data/lib/regexp_parser/expression/classes/root.rb +4 -19
data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +5 -3
data/lib/regexp_parser/expression/methods/construct.rb +41 -0
data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
data/lib/regexp_parser/expression/methods/match_length.rb +11 -7
data/lib/regexp_parser/expression/methods/parts.rb +23 -0
data/lib/regexp_parser/expression/methods/printing.rb +26 -0
data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
data/lib/regexp_parser/expression/methods/tests.rb +47 -1
data/lib/regexp_parser/expression/methods/traverse.rb +34 -18
data/lib/regexp_parser/expression/quantifier.rb +57 -17
data/lib/regexp_parser/expression/sequence.rb +11 -47
data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
data/lib/regexp_parser/expression/shared.rb +111 -0
data/lib/regexp_parser/expression/subexpression.rb +27 -19
data/lib/regexp_parser/expression.rb +14 -141
data/lib/regexp_parser/lexer.rb +83 -41
data/lib/regexp_parser/parser.rb +371 -429
data/lib/regexp_parser/scanner/char_type.rl +11 -11
data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
data/lib/regexp_parser/scanner/properties/long.csv +633 -0
data/lib/regexp_parser/scanner/properties/short.csv +248 -0
data/lib/regexp_parser/scanner/property.rl +4 -4
data/lib/regexp_parser/scanner/scanner.rl +295 -368
data/lib/regexp_parser/scanner.rb +1405 -1674
data/lib/regexp_parser/syntax/any.rb +2 -7
data/lib/regexp_parser/syntax/base.rb +92 -67
data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
data/lib/regexp_parser/syntax/token/escape.rb +33 -0
data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
data/lib/regexp_parser/syntax/token/meta.rb +20 -0
data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
data/lib/regexp_parser/syntax/token/unicode_property.rb +733 -0
data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
data/lib/regexp_parser/syntax/token.rb +45 -0
data/lib/regexp_parser/syntax/version_lookup.rb +19 -36
data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
data/lib/regexp_parser/syntax/versions.rb +3 -1
data/lib/regexp_parser/syntax.rb +8 -6
data/lib/regexp_parser/token.rb +9 -20
data/lib/regexp_parser/version.rb +1 -1
data/lib/regexp_parser.rb +0 -2
data/regexp_parser.gemspec +20 -22
metadata +49 -166
data/lib/regexp_parser/scanner/properties/long.yml +0 -594
data/lib/regexp_parser/scanner/properties/short.yml +0 -237
data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
data/lib/regexp_parser/syntax/tokens.rb +0 -45
data/spec/expression/base_spec.rb +0 -94
data/spec/expression/clone_spec.rb +0 -120
data/spec/expression/conditional_spec.rb +0 -89
data/spec/expression/free_space_spec.rb +0 -27
data/spec/expression/methods/match_length_spec.rb +0 -161
data/spec/expression/methods/match_spec.rb +0 -25
data/spec/expression/methods/strfregexp_spec.rb +0 -224
data/spec/expression/methods/tests_spec.rb +0 -99
data/spec/expression/methods/traverse_spec.rb +0 -161
data/spec/expression/options_spec.rb +0 -128
data/spec/expression/root_spec.rb +0 -9
data/spec/expression/sequence_spec.rb +0 -9
data/spec/expression/subexpression_spec.rb +0 -50
data/spec/expression/to_h_spec.rb +0 -26
data/spec/expression/to_s_spec.rb +0 -100
data/spec/lexer/all_spec.rb +0 -22
data/spec/lexer/conditionals_spec.rb +0 -53
data/spec/lexer/escapes_spec.rb +0 -14
data/spec/lexer/keep_spec.rb +0 -10
data/spec/lexer/literals_spec.rb +0 -89
data/spec/lexer/nesting_spec.rb +0 -99
data/spec/lexer/refcalls_spec.rb +0 -55
data/spec/parser/all_spec.rb +0 -43
data/spec/parser/alternation_spec.rb +0 -88
data/spec/parser/anchors_spec.rb +0 -17
data/spec/parser/conditionals_spec.rb +0 -179
data/spec/parser/errors_spec.rb +0 -30
data/spec/parser/escapes_spec.rb +0 -121
data/spec/parser/free_space_spec.rb +0 -130
data/spec/parser/groups_spec.rb +0 -108
data/spec/parser/keep_spec.rb +0 -6
data/spec/parser/posix_classes_spec.rb +0 -8
data/spec/parser/properties_spec.rb +0 -115
data/spec/parser/quantifiers_spec.rb +0 -51
data/spec/parser/refcalls_spec.rb +0 -112
data/spec/parser/set/intersections_spec.rb +0 -127
data/spec/parser/set/ranges_spec.rb +0 -111
data/spec/parser/sets_spec.rb +0 -178
data/spec/parser/types_spec.rb +0 -18
data/spec/scanner/all_spec.rb +0 -18
data/spec/scanner/anchors_spec.rb +0 -21
data/spec/scanner/conditionals_spec.rb +0 -128
data/spec/scanner/errors_spec.rb +0 -68
data/spec/scanner/escapes_spec.rb +0 -53
data/spec/scanner/free_space_spec.rb +0 -133
data/spec/scanner/groups_spec.rb +0 -52
data/spec/scanner/keep_spec.rb +0 -10
data/spec/scanner/literals_spec.rb +0 -49
data/spec/scanner/meta_spec.rb +0 -18
data/spec/scanner/properties_spec.rb +0 -64
data/spec/scanner/quantifiers_spec.rb +0 -20
data/spec/scanner/refcalls_spec.rb +0 -36
data/spec/scanner/sets_spec.rb +0 -102
data/spec/scanner/types_spec.rb +0 -14
data/spec/spec_helper.rb +0 -15
data/spec/support/runner.rb +0 -42
data/spec/support/shared_examples.rb +0 -77
data/spec/support/warning_extractor.rb +0 -60
data/spec/syntax/syntax_spec.rb +0 -48
data/spec/syntax/syntax_token_map_spec.rb +0 -23
data/spec/syntax/versions/1.8.6_spec.rb +0 -17
data/spec/syntax/versions/1.9.1_spec.rb +0 -10
data/spec/syntax/versions/1.9.3_spec.rb +0 -9
data/spec/syntax/versions/2.0.0_spec.rb +0 -13
data/spec/syntax/versions/2.2.0_spec.rb +0 -9
data/spec/syntax/versions/aliases_spec.rb +0 -37
data/spec/token/token_spec.rb +0 -85
/data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0

data/README.md CHANGED Viewed

@@ -1,15 +1,18 @@
 # Regexp::Parser
-[![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser) [![Build Status](https://secure.travis-ci.org/ammar/regexp_parser.svg?branch=master)](http://travis-ci.org/ammar/regexp_parser) [![Code Climate](https://codeclimate.com/github/ammar/regexp_parser.svg)](https://codeclimate.com/github/ammar/regexp_parser/badges)
+[![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser)
+[![Build Status](https://github.com/ammar/regexp_parser/workflows/tests/badge.svg)](https://github.com/ammar/regexp_parser/actions)
+[![Build Status](https://github.com/ammar/regexp_parser/workflows/gouteur/badge.svg)](https://github.com/ammar/regexp_parser/actions)
+[![Code Climate](https://codeclimate.com/github/ammar/regexp_parser.svg)](https://codeclimate.com/github/ammar/regexp_parser/badges)
 A Ruby gem for tokenizing, parsing, and transforming regular expressions.
 * Multilayered
   * A scanner/tokenizer based on [Ragel](http://www.colm.net/open-source/ragel/)
-  * A lexer that produces a "stream" of token objects.
-  * A parser that produces a "tree" of Expression objects (OO API)
-* Runs on Ruby 1.9, 2.x, and JRuby (1.9 mode) runtimes.
-* Recognizes Ruby 1.8, 1.9, and 2.x regular expressions [See Supported Syntax](#supported-syntax)
+  * A lexer that produces a "stream" of [Token objects](https://github.com/ammar/regexp_parser/wiki/Token-Objects)
+  * A parser that produces a "tree" of [Expression objects (OO API)](https://github.com/ammar/regexp_parser/wiki/Expression-Objects)
+* Runs on Ruby 2.x, 3.x and JRuby runtimes
+* Recognizes Ruby 1.8, 1.9, 2.x and 3.x regular expressions [See Supported Syntax](#supported-syntax)
 _For examples of regexp_parser in use, see [Example Projects](#example-projects)._
@@ -18,13 +21,10 @@ _For examples of regexp_parser in use, see [Example Projects](#example-projects)
 ---
 ## Requirements
-* Ruby >= 1.9
+* Ruby >= 2.0
 * Ragel >= 6.0, but only if you want to build the gem or work on the scanner.
-_Note: See the .travis.yml file for covered versions._
 ---
 ## Install
@@ -36,14 +36,15 @@ Or, add it to your project's `Gemfile`:
 ```gem 'regexp_parser', '~> X.Y.Z'```
-See rubygems for the the [latest version number](https://rubygems.org/gems/regexp_parser)
+See the badge at the top of this README or [rubygems](https://rubygems.org/gems/regexp_parser)
+for the latest version number.
 ---
 ## Usage
 The three main modules are **Scanner**, **Lexer**, and **Parser**. Each of them
-provides a single method that takes a regular expression (as a RegExp object or
+provides a single method that takes a regular expression (as a Regexp object or
 a string) and returns its results. The **Lexer** and the **Parser** accept an
 optional second argument that specifies the syntax version, like 'ruby/2.0',
 which defaults to the host Ruby version (using RUBY_VERSION).
@@ -66,12 +67,23 @@ called with the results as follows:
 * **Scanner**: the block gets passed the results as they are scanned. See the
   example in the next section for details.
-* **Lexer**: after completion, the block gets passed the tokens one by one.
+* **Lexer**: the block gets passed the tokens one by one as they are scanned.
   _The result of the block is returned._
 * **Parser**: after completion, the block gets passed the root expression.
   _The result of the block is returned._
+All three methods accept either a `Regexp` or a `String` containing the pattern.
+If a `String` is passed, `options` can also be supplied:
+```ruby
+require 'regexp_parser'
+Regexp::Parser.parse(
+  "a+ # Recognizes a and A...",
+  options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE
+)
+```
 ---
 ## Components
@@ -90,7 +102,7 @@ start/end offsets for each token found.
 ```ruby
 require 'regexp_parser'
-Regexp::Scanner.scan /(ab?(cd)*[e-h]+)/  do |type, token, text, ts, te|
+Regexp::Scanner.scan(/(ab?(cd)*[e-h]+)/) do |type, token, text, ts, te|
   puts "type: #{type}, token: #{token}, text: '#{text}' [#{ts}..#{te}]"
 end
@@ -113,8 +125,8 @@ A one-liner that uses map on the result of the scan to return the textual
 parts of the pattern:
 ```ruby
-Regexp::Scanner.scan( /(cat?([bhm]at)){3,5}/ ).map {|token| token[2]}
-#=> ["(", "cat", "?", "(", "[", "b", "h", "m", "]", "at", ")", ")", "{3,5}"]
+Regexp::Scanner.scan(/(cat?([bhm]at)){3,5}/).map { |token| token[2] }
+# => ["(", "cat", "?", "(", "[", "b", "h", "m", "]", "at", ")", ")", "{3,5}"]
 ```
@@ -136,11 +148,8 @@ Regexp::Scanner.scan( /(cat?([bhm]at)){3,5}/ ).map {|token| token[2]}
     to the lexer.
   * The MRI implementation may accept expressions that either conflict with
-    the documentation or are undocumented. The scanner does not support such
-    implementation quirks.
-    _(See issues [#3](https://github.com/ammar/regexp_parser/issues/3) and
-    [#15](https://github.com/ammar/regexp_parser/issues/15) for examples)_
+    the documentation or are undocumented, like `{}` and `]` _(unescaped)_.
+    The scanner will try to support as many of these cases as possible.
 ---
 ### Syntax
@@ -149,31 +158,41 @@ flavor). Syntax classes act as lookup tables, and are layered to create
 flavor variations. Syntax only comes into play in the lexer.
 #### Example
-The following instantiates syntax objects for Ruby 2.0, 1.9, 1.8, and
+The following fetches syntax objects for Ruby 2.0, 1.9, 1.8, and
 checks a few of their implementation features.
 ```ruby
 require 'regexp_parser'
-ruby_20 = Regexp::Syntax.new 'ruby/2.0'
+ruby_20 = Regexp::Syntax.for 'ruby/2.0'
 ruby_20.implements? :quantifier,  :zero_or_one             # => true
 ruby_20.implements? :quantifier,  :zero_or_one_reluctant   # => true
 ruby_20.implements? :quantifier,  :zero_or_one_possessive  # => true
 ruby_20.implements? :conditional, :condition               # => true
-ruby_19 = Regexp::Syntax.new 'ruby/1.9'
+ruby_19 = Regexp::Syntax.for 'ruby/1.9'
 ruby_19.implements? :quantifier,  :zero_or_one             # => true
 ruby_19.implements? :quantifier,  :zero_or_one_reluctant   # => true
 ruby_19.implements? :quantifier,  :zero_or_one_possessive  # => true
 ruby_19.implements? :conditional, :condition               # => false
-ruby_18 = Regexp::Syntax.new 'ruby/1.8'
+ruby_18 = Regexp::Syntax.for 'ruby/1.8'
 ruby_18.implements? :quantifier,  :zero_or_one             # => true
 ruby_18.implements? :quantifier,  :zero_or_one_reluctant   # => true
 ruby_18.implements? :quantifier,  :zero_or_one_possessive  # => false
 ruby_18.implements? :conditional, :condition               # => false
 ```
+Syntax objects can also be queried about their complete and relative feature sets.
+```ruby
+require 'regexp_parser'
+ruby_20 = Regexp::Syntax.for 'ruby/2.0' # => Regexp::Syntax::V2_0_0
+ruby_20.added_features                  # => { conditional: [...], ... }
+ruby_20.removed_features                # => { property: [:newline], ... }
+ruby_20.features                        # => { anchor: [...], ... }
+```
 #### Notes
   * Variations on a token, for example a named group with angle brackets (< and >)
@@ -202,7 +221,7 @@ syntax, and prints the token objects' text indented to their level.
 ```ruby
 require 'regexp_parser'
-Regexp::Lexer.lex /a?(b(c))*[d]+/, 'ruby/1.9' do |token|
+Regexp::Lexer.lex(/a?(b(c))*[d]+/, 'ruby/1.9') do |token|
   puts "#{'  ' * token.level}#{token.text}"
 end
@@ -228,8 +247,8 @@ how the sequence 'cat' is treated. The 't' is separated because it's followed
 by a quantifier that only applies to it.
 ```ruby
-Regexp::Lexer.scan( /(cat?([b]at)){3,5}/ ).map {|token| token.text}
-#=> ["(", "ca", "t", "?", "(", "[", "b", "]", "at", ")", ")", "{3,5}"]
+Regexp::Lexer.scan(/(cat?([b]at)){3,5}/).map { |token| token.text }
+# => ["(", "ca", "t", "?", "(", "[", "b", "]", "at", ")", ")", "{3,5}"]
 ```
 #### Notes
@@ -243,7 +262,7 @@ Regexp::Lexer.scan( /(cat?([b]at)){3,5}/ ).map {|token| token.text}
 ### Parser
 Sits on top of the lexer and transforms the "stream" of Token objects emitted
 by it into a tree of Expression objects represented by an instance of the
-Expression::Root class.
+`Expression::Root` class.
 See the [Expression Objects](https://github.com/ammar/regexp_parser/wiki/Expression-Objects)
 wiki page for attributes and methods.
@@ -251,12 +270,40 @@ wiki page for attributes and methods.
 #### Example
+This example uses the tree traversal method `#each_expression`
+and the method `#strfregexp` to print each object in the tree.
+```ruby
+include_root  = true
+indent_offset = include_root ? 1 : 0
+tree.each_expression(include_root) do |exp|
+  puts exp.strfregexp("%>> %c", indent_offset)
+end
+# Output
+# > Regexp::Expression::Root
+#   > Regexp::Expression::Literal
+#   > Regexp::Expression::Group::Capture
+#     > Regexp::Expression::Literal
+#     > Regexp::Expression::Group::Capture
+#       > Regexp::Expression::Literal
+#     > Regexp::Expression::Literal
+#   > Regexp::Expression::Group::Named
+#     > Regexp::Expression::CharacterSet
+```
+_Note: quantifiers do not appear in the output because they are members of the
+Expression class. See the next section for details._
+Another example, using `#traverse` for a more fine-grained tree traversal:
 ```ruby
 require 'regexp_parser'
 regex = /a?(b+(c)d)*(?<name>[0-9]+)/
-tree = Regexp::Parser.parse( regex, 'ruby/2.1' )
+tree = Regexp::Parser.parse(regex, 'ruby/2.1')
 tree.traverse do |event, exp|
   puts "#{event}: #{exp.type} `#{exp.to_s}`"
@@ -276,40 +323,15 @@ end
 # exit: group `(?<name>[0-9]+)`
 ```
-Another example, using each_expression and strfregexp to print the object tree.
 _See the traverse.rb and strfregexp.rb files under `lib/regexp_parser/expression/methods`
 for more information on these methods._
-```ruby
-include_root  = true
-indent_offset = include_root ? 1 : 0
-tree.each_expression(include_root) do |exp, level_index|
-  puts exp.strfregexp("%>> %c", indent_offset)
-end
-# Output
-# > Regexp::Expression::Root
-#   > Regexp::Expression::Literal
-#   > Regexp::Expression::Group::Capture
-#     > Regexp::Expression::Literal
-#     > Regexp::Expression::Group::Capture
-#       > Regexp::Expression::Literal
-#     > Regexp::Expression::Literal
-#   > Regexp::Expression::Group::Named
-#     > Regexp::Expression::CharacterSet
-```
-_Note: quantifiers do not appear in the output because they are members of the
-Expression class. See the next section for details._
 ---
 ## Supported Syntax
 The three modules support all the regular expression syntax features of Ruby 1.8,
-1.9, and 2.x:
+1.9, 2.x and 3.x:
 _Note that not all of these are available in all versions of Ruby_
@@ -337,7 +359,7 @@ _Note that not all of these are available in all versions of Ruby_
 | &emsp;&emsp;_Nest Level_              | `\k<n-1>`                                               | &#x2713; |
 | &emsp;&emsp;_Numbered_                | `\k<1>`                                                 | &#x2713; |
 | &emsp;&emsp;_Relative_                | `\k<-2>`                                                | &#x2713; |
-| &emsp;&emsp;_Traditional_             | `\1` thru `\9`                                          | &#x2713; |
+| &emsp;&emsp;_Traditional_             | `\1` through `\9`                                       | &#x2713; |
 | &emsp;&nbsp;_**Capturing**_           | `(abc)`                                                 | &#x2713; |
 | &emsp;&nbsp;_**Comments**_            | `(?# comment text)`                                     | &#x2713; |
 | &emsp;&nbsp;_**Named**_               | `(?<name>abc)`, `(?'name'abc)`                          | &#x2713; |
@@ -349,15 +371,15 @@ _Note that not all of these are available in all versions of Ruby_
 | **POSIX Classes**                     | `[:alpha:]`, `[:^digit:]`                               | &#x2713; |
 | **Quantifiers**                       |                                                         | &#x22f1; |
 | &emsp;&nbsp;_**Greedy**_              | `?`, `*`, `+`, `{m,M}`                                  | &#x2713; |
-| &emsp;&nbsp;_**Reluctant** (Lazy)_    | `??`, `*?`, `+?`, `{m,M}?`                              | &#x2713; |
-| &emsp;&nbsp;_**Possessive**_          | `?+`, `*+`, `++`, `{m,M}+`                              | &#x2713; |
+| &emsp;&nbsp;_**Reluctant** (Lazy)_    | `??`, `*?`, `+?` \[1\]                                  | &#x2713; |
+| &emsp;&nbsp;_**Possessive**_          | `?+`, `*+`, `++` \[1\]                                  | &#x2713; |
 | **String Escapes**                    |                                                         | &#x22f1; |
-| &emsp;&nbsp;_**Control**_             | `\C-C`, `\cD`                                           | &#x2713; |
+| &emsp;&nbsp;_**Control** \[2\]_       | `\C-C`, `\cD`                                           | &#x2713; |
 | &emsp;&nbsp;_**Hex**_                 | `\x20`, `\x{701230}`                                    | &#x2713; |
-| &emsp;&nbsp;_**Meta**_                | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C`        | &#x2713; |
+| &emsp;&nbsp;_**Meta** \[2\]_          | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C`        | &#x2713; |
 | &emsp;&nbsp;_**Octal**_               | `\0`, `\01`, `\012`                                     | &#x2713; |
 | &emsp;&nbsp;_**Unicode**_             | `\uHHHH`, `\u{H+ H+}`                                   | &#x2713; |
-| **Unicode Properties**                | _<sub>([Unicode 11.0.0](http://www.unicode.org/versions/Unicode11.0.0/))</sub>_ | &#x22f1; |
+| **Unicode Properties**                | _<sub>([Unicode 13.0.0])</sub>_                         | &#x22f1; |
 | &emsp;&nbsp;_**Age**_                 | `\p{Age=5.2}`, `\P{age=7.0}`, `\p{^age=8.0}`            | &#x2713; |
 | &emsp;&nbsp;_**Blocks**_              | `\p{InArmenian}`, `\P{InKhmer}`, `\p{^InThai}`          | &#x2713; |
 | &emsp;&nbsp;_**Classes**_             | `\p{Alpha}`, `\P{Space}`, `\p{^Alnum}`                  | &#x2713; |
@@ -366,6 +388,18 @@ _Note that not all of these are available in all versions of Ruby_
 | &emsp;&nbsp;_**Scripts**_             | `\p{Arabic}`, `\P{Hiragana}`, `\p{^Greek}`              | &#x2713; |
 | &emsp;&nbsp;_**Simple**_              | `\p{Dash}`, `\p{Extender}`, `\p{^Hyphen}`               | &#x2713; |
+[Unicode 13.0.0]: https://www.unicode.org/versions/Unicode13.0.0/
+**\[1\]**: Ruby does not support lazy or possessive interval quantifiers.
+Any `+` or `?` that follows an interval quantifier will be treated as another,
+chained quantifier. See also [#3](https://github.com/ammar/regexp_parser/issues/3),
+[#69](https://github.com/ammar/regexp_parser/pull/69).
+**\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex
+escapes when used in Regexp literals](https://github.com/ruby/ruby/commit/11ae581),
+so they will only reach the scanner and will only be emitted if a String or a Regexp
+that has been built with the `::new` constructor is scanned.
 ##### Inapplicable Features
 Some modifiers, like `o` and `s`, apply to the **Regexp** object itself and do not
@@ -379,40 +413,29 @@ expressions library (Onigmo). They are not supported by the scanner.
   - **Quotes**: `\Q...\E` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L499)_
   - **Capture History**: `(?@...)`, `(?@<name>...)` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L550)_
 See something missing? Please submit an [issue](https://github.com/ammar/regexp_parser/issues)
-_**Note**: Attempting to process expressions with unsupported syntax features can raise an error,
-or incorrectly return tokens/objects as literals._
+_**Note**: Attempting to process expressions with unsupported syntax features can raise
+an error, or incorrectly return tokens/objects as literals._
 ## Testing
-To run the tests simply run rake from the root directory, as 'test' is the default task.
-It generates the scanner's code from the Ragel source files and runs all the tests, thus it requires Ragel to be installed.
-The tests use RSpec. They can also be run with the test runner that whitelists some warnings:
+To run the tests simply run rake from the root directory.
-```
-bin/test
-```
+The default task generates the scanner's code from the Ragel source files and runs
+all the specs, thus it requires Ragel to be installed.
-You can run a specific test like so:
+Note that changes to Ragel files will not be reflected when running `rspec` on its own,
+so to run individual tests you might want to run:
 ```
-bin/test spec/scanner/properties_spec.rb
-```
-Note that changes to Ragel files will not be reflected when running `rspec` or `bin/test`, so you might want to run:
-```
-rake ragel:rb && bin/test spec/scanner/properties_spec.rb
+rake ragel:rb && rspec spec/scanner/properties_spec.rb
 ```
 ## Building
-Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/) to be
-installed. The build tasks will automatically invoke the 'ragel:rb' task to generate the
-Ruby scanner code.
+Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/)
+to be installed. The build tasks will automatically invoke the 'ragel:rb' task to generate
+the Ruby scanner code.
 The project uses the standard rubygems package tasks, so:
@@ -432,13 +455,26 @@ rake install
 ## Example Projects
 Projects using regexp_parser.
-- [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor with alias support.
+- [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool
+that uses regexp_parser to convert Regexps to css/xpath selectors.
+- [js_regex](https://github.com/jaynetics/js_regex) converts Ruby regular expressions
+to JavaScript-compatible regular expressions.
+- [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor
+with alias support.
+- [mutant](https://github.com/mbj/mutant) manipulates your regular expressions
+(amongst others) to see if your tests cover their behavior.
-- [mutant](https://github.com/mbj/mutant) (before v0.9.0) manipulates your regular expressions (amongst others) to see if your tests cover their behavior.
+- [repper](https://github.com/jaynetics/repper) is a regular expression
+pretty-printer and formatter for Ruby.
-- [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) uses regexp_parser to generate examples of postal codes.
+- [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that
+uses regexp_parser to lint Regexps.
-- [js_regex](https://github.com/janosch-x/js_regex) converts Ruby regular expressions to JavaScript-compatible regular expressions.
+- [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) is a localization helper
+that uses regexp_parser to generate examples of postal codes.
 ## References
@@ -467,4 +503,4 @@ Documentation and books used while working on this project.
 ---
 ##### Copyright
-_Copyright (c) 2010-2019 Ammar Ali. See LICENSE file for details._
+_Copyright (c) 2010-2023 Ammar Ali. See LICENSE file for details._

data/Rakefile CHANGED Viewed

@@ -1,87 +1,23 @@
+require 'bundler'
 require 'rubygems'
+require 'rubygems/package_task'
 require 'rake'
 require 'rake/testtask'
+require 'rspec/core/rake_task'
-require 'bundler'
-require 'rubygems/package_task'
-RAGEL_SOURCE_DIR = File.expand_path '../lib/regexp_parser/scanner', __FILE__
-RAGEL_OUTPUT_DIR = File.expand_path '../lib/regexp_parser', __FILE__
-RAGEL_SOURCE_FILES = %w{scanner} # scanner.rl includes property.rl
+Dir['tasks/**/*.rake'].each { |file| load(file) }
 Bundler::GemHelper.install_tasks
+RSpec::Core::RakeTask.new(:spec)
 task :default => [:'test:full']
 namespace :test do
-  task full: :'ragel:rb' do
-    sh 'bin/test'
-  end
+  task full: [:'ragel:rb', :spec]
 end
-namespace :ragel do
-  desc "Process the ragel source files and output ruby code"
-  task :rb do |t|
-    RAGEL_SOURCE_FILES.each do |file|
-      output_file = "#{RAGEL_OUTPUT_DIR}/#{file}.rb"
-      # using faster flat table driven FSM, about 25% larger code, but about 30% faster
-      sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{file}.rl -o #{output_file}"
-      contents = File.read(output_file)
-      File.open(output_file, 'r+') do |file|
-        contents = "# -*- warn-indent:false;  -*-\n" + contents
-        file.write(contents)
-      end
-    end
-  end
-  desc "Delete the ragel generated source file(s)"
-  task :clean do |t|
-    RAGEL_SOURCE_FILES.each do |file|
-      sh "rm -f #{RAGEL_OUTPUT_DIR}/#{file}.rb"
-    end
-  end
-end
 # Add ragel task as a prerequisite for building the gem to ensure that the
 # latest scanner code is generated and included in the build.
 desc "Runs ragel:rb before building the gem"
 task :build => ['ragel:rb']
-namespace :props do
-  desc 'Write new property value hashes for the properties scanner'
-  task :update do
-    require 'regexp_property_values'
-    RegexpPropertyValues.update
-    dir = File.expand_path('../lib/regexp_parser/scanner/properties', __FILE__)
-    require 'psych'
-    write_hash_to_file = ->(hash, path) do
-      File.open(path, 'w') do |f|
-        f.puts '#',
-               "# THIS FILE IS AUTO-GENERATED BY `rake props:update`, DO NOT EDIT",
-               '#',
-               hash.sort.to_h.to_yaml
-      end
-      puts "Wrote #{hash.count} aliases to `#{path}`"
-    end
-    long_names_to_tokens = RegexpPropertyValues.all.map do |val|
-      [val.identifier, val.full_name.downcase]
-    end
-    write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.yml")
-    short_names_to_tokens = RegexpPropertyValues.alias_hash.map do |k, v|
-      [k.identifier, v.full_name.downcase]
-    end
-    write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.yml")
-  end
-end

data/lib/regexp_parser/error.rb ADDED Viewed

@@ -0,0 +1,4 @@
+class Regexp::Parser
+  # base class for all gem-specific errors
+  class Error < StandardError; end
+end

data/lib/regexp_parser/expression/base.rb ADDED Viewed

@@ -0,0 +1,76 @@
+module Regexp::Expression
+  class Base
+    include Regexp::Expression::Shared
+    def initialize(token, options = {})
+      init_from_token_and_options(token, options)
+    end
+    def to_re(format = :full)
+      if set_level > 0
+        warn "Calling #to_re on character set members is deprecated - "\
+             "their behavior might not be equivalent outside of the set."
+      end
+      ::Regexp.new(to_s(format))
+    end
+    def quantify(*args)
+      self.quantifier = Quantifier.new(*args)
+    end
+    def unquantified_clone
+      clone.tap { |exp| exp.quantifier = nil }
+    end
+    # Deprecated. Prefer `#repetitions` which has a more uniform interface.
+    def quantity
+      return [nil,nil] unless quantified?
+      [quantifier.min, quantifier.max]
+    end
+    def repetitions
+      @repetitions ||=
+        if quantified?
+          min = quantifier.min
+          max = quantifier.max < 0 ? Float::INFINITY : quantifier.max
+          range = min..max
+          # fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807
+          if RUBY_VERSION.to_f < 2.7
+            range.define_singleton_method(:minmax) { [min, max] }
+          end
+          range
+        else
+          1..1
+        end
+    end
+    def greedy?
+      quantified? and quantifier.greedy?
+    end
+    def reluctant?
+      quantified? and quantifier.reluctant?
+    end
+    alias :lazy? :reluctant?
+    def possessive?
+      quantified? and quantifier.possessive?
+    end
+    def to_h
+      {
+        type:              type,
+        token:             token,
+        text:              to_s(:base),
+        starts_at:         ts,
+        length:            full_length,
+        level:             level,
+        set_level:         set_level,
+        conditional_level: conditional_level,
+        options:           options,
+        quantifier:        quantified? ? quantifier.to_h : nil,
+      }
+    end
+    alias :attributes :to_h
+  end
+end

data/lib/regexp_parser/expression/classes/alternation.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 module Regexp::Expression
-  # A sequence of expressions, used by Alternation as one of its alternative.
+  # A sequence of expressions, used by Alternation as one of its alternatives.
   class Alternative < Regexp::Expression::Sequence; end
   class Alternation < Regexp::Expression::SequenceOperation

data/lib/regexp_parser/expression/classes/anchor.rb CHANGED Viewed

@@ -1,5 +1,4 @@
 module Regexp::Expression
   module Anchor
     class Base < Regexp::Expression::Base; end
@@ -22,5 +21,4 @@ module Regexp::Expression
     EOS      = EndOfString
     EOSobEOL = EndOfStringOrBeforeEndOfLine
   end
 end

data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} RENAMED Viewed

@@ -2,6 +2,23 @@ module Regexp::Expression
   module Backreference
     class Base < Regexp::Expression::Base
       attr_accessor :referenced_expression
+      def initialize_copy(orig)
+        exp_id = [self.class, self.starts_at]
+        # prevent infinite recursion for recursive subexp calls
+        copied = @@copied ||= {}
+        self.referenced_expression =
+          if copied[exp_id]
+            orig.referenced_expression
+          else
+            copied[exp_id] = true
+            orig.referenced_expression.dup
+          end
+        copied.clear
+        super
+      end
     end
     class Number < Backreference::Base
@@ -9,7 +26,7 @@ module Regexp::Expression
       alias reference number
       def initialize(token, options = {})
-        @number = token.text[token.token.equal?(:number) ? 1..-1 : 3..-2].to_i
+        @number = token.text[/-?\d+/].to_i
         super
       end
     end
@@ -33,7 +50,7 @@ module Regexp::Expression
     class NameCall           < Backreference::Name; end
     class NumberCallRelative < Backreference::NumberRelative; end
-    class NumberRecursionLevel < Backreference::Number
+    class NumberRecursionLevel < Backreference::NumberRelative
       attr_reader :recursion_level
       def initialize(token, options = {})
@@ -52,4 +69,7 @@ module Regexp::Expression
       end
     end
   end
+  # alias for symmetry between token symbol and Expression class name
+  Backref = Backreference
 end