regexp_parser 2.1.1 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +94 -6
  3. data/Gemfile +2 -1
  4. data/LICENSE +1 -1
  5. data/README.md +40 -30
  6. data/Rakefile +6 -70
  7. data/lib/regexp_parser/error.rb +1 -1
  8. data/lib/regexp_parser/expression/base.rb +75 -0
  9. data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
  10. data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +1 -0
  11. data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
  12. data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -2
  13. data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +2 -2
  14. data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
  15. data/lib/regexp_parser/expression/classes/conditional.rb +2 -2
  16. data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +13 -7
  17. data/lib/regexp_parser/expression/classes/free_space.rb +1 -3
  18. data/lib/regexp_parser/expression/classes/group.rb +6 -6
  19. data/lib/regexp_parser/expression/classes/keep.rb +2 -0
  20. data/lib/regexp_parser/expression/classes/literal.rb +1 -5
  21. data/lib/regexp_parser/expression/classes/root.rb +3 -6
  22. data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +1 -2
  23. data/lib/regexp_parser/expression/methods/construct.rb +43 -0
  24. data/lib/regexp_parser/expression/methods/match_length.rb +1 -1
  25. data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
  26. data/lib/regexp_parser/expression/methods/tests.rb +10 -1
  27. data/lib/regexp_parser/expression/quantifier.rb +41 -23
  28. data/lib/regexp_parser/expression/sequence.rb +9 -24
  29. data/lib/regexp_parser/expression/sequence_operation.rb +2 -2
  30. data/lib/regexp_parser/expression/shared.rb +85 -0
  31. data/lib/regexp_parser/expression/subexpression.rb +11 -8
  32. data/lib/regexp_parser/expression.rb +10 -132
  33. data/lib/regexp_parser/lexer.rb +8 -6
  34. data/lib/regexp_parser/parser.rb +21 -72
  35. data/lib/regexp_parser/scanner/properties/long.csv +622 -0
  36. data/lib/regexp_parser/scanner/properties/short.csv +246 -0
  37. data/lib/regexp_parser/scanner/property.rl +1 -1
  38. data/lib/regexp_parser/scanner/scanner.rl +48 -35
  39. data/lib/regexp_parser/scanner.rb +735 -801
  40. data/lib/regexp_parser/syntax/any.rb +2 -7
  41. data/lib/regexp_parser/syntax/base.rb +91 -66
  42. data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
  43. data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
  44. data/lib/regexp_parser/syntax/token/backreference.rb +30 -0
  45. data/lib/regexp_parser/syntax/{tokens → token}/character_set.rb +2 -2
  46. data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
  47. data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
  48. data/lib/regexp_parser/syntax/token/escape.rb +31 -0
  49. data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
  50. data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
  51. data/lib/regexp_parser/syntax/{tokens → token}/meta.rb +2 -2
  52. data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
  53. data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
  54. data/lib/regexp_parser/syntax/token/unicode_property.rb +717 -0
  55. data/lib/regexp_parser/syntax/token.rb +45 -0
  56. data/lib/regexp_parser/syntax/version_lookup.rb +20 -29
  57. data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
  58. data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
  59. data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
  60. data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
  61. data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
  62. data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
  63. data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
  64. data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
  65. data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
  66. data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
  67. data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
  68. data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
  69. data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
  70. data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
  71. data/lib/regexp_parser/syntax/versions.rb +1 -1
  72. data/lib/regexp_parser/syntax.rb +1 -1
  73. data/lib/regexp_parser/token.rb +9 -20
  74. data/lib/regexp_parser/version.rb +1 -1
  75. data/lib/regexp_parser.rb +0 -2
  76. data/regexp_parser.gemspec +20 -22
  77. metadata +37 -166
  78. data/lib/regexp_parser/scanner/properties/long.yml +0 -594
  79. data/lib/regexp_parser/scanner/properties/short.yml +0 -237
  80. data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
  81. data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
  82. data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
  83. data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
  84. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
  85. data/lib/regexp_parser/syntax/tokens.rb +0 -45
  86. data/spec/expression/base_spec.rb +0 -104
  87. data/spec/expression/clone_spec.rb +0 -152
  88. data/spec/expression/conditional_spec.rb +0 -89
  89. data/spec/expression/free_space_spec.rb +0 -27
  90. data/spec/expression/methods/match_length_spec.rb +0 -161
  91. data/spec/expression/methods/match_spec.rb +0 -25
  92. data/spec/expression/methods/strfregexp_spec.rb +0 -224
  93. data/spec/expression/methods/tests_spec.rb +0 -99
  94. data/spec/expression/methods/traverse_spec.rb +0 -161
  95. data/spec/expression/options_spec.rb +0 -128
  96. data/spec/expression/subexpression_spec.rb +0 -50
  97. data/spec/expression/to_h_spec.rb +0 -26
  98. data/spec/expression/to_s_spec.rb +0 -108
  99. data/spec/lexer/all_spec.rb +0 -22
  100. data/spec/lexer/conditionals_spec.rb +0 -53
  101. data/spec/lexer/delimiters_spec.rb +0 -68
  102. data/spec/lexer/escapes_spec.rb +0 -14
  103. data/spec/lexer/keep_spec.rb +0 -10
  104. data/spec/lexer/literals_spec.rb +0 -64
  105. data/spec/lexer/nesting_spec.rb +0 -99
  106. data/spec/lexer/refcalls_spec.rb +0 -60
  107. data/spec/parser/all_spec.rb +0 -43
  108. data/spec/parser/alternation_spec.rb +0 -88
  109. data/spec/parser/anchors_spec.rb +0 -17
  110. data/spec/parser/conditionals_spec.rb +0 -179
  111. data/spec/parser/errors_spec.rb +0 -30
  112. data/spec/parser/escapes_spec.rb +0 -121
  113. data/spec/parser/free_space_spec.rb +0 -130
  114. data/spec/parser/groups_spec.rb +0 -108
  115. data/spec/parser/keep_spec.rb +0 -6
  116. data/spec/parser/options_spec.rb +0 -28
  117. data/spec/parser/posix_classes_spec.rb +0 -8
  118. data/spec/parser/properties_spec.rb +0 -115
  119. data/spec/parser/quantifiers_spec.rb +0 -68
  120. data/spec/parser/refcalls_spec.rb +0 -117
  121. data/spec/parser/set/intersections_spec.rb +0 -127
  122. data/spec/parser/set/ranges_spec.rb +0 -111
  123. data/spec/parser/sets_spec.rb +0 -178
  124. data/spec/parser/types_spec.rb +0 -18
  125. data/spec/scanner/all_spec.rb +0 -18
  126. data/spec/scanner/anchors_spec.rb +0 -21
  127. data/spec/scanner/conditionals_spec.rb +0 -128
  128. data/spec/scanner/delimiters_spec.rb +0 -52
  129. data/spec/scanner/errors_spec.rb +0 -67
  130. data/spec/scanner/escapes_spec.rb +0 -64
  131. data/spec/scanner/free_space_spec.rb +0 -165
  132. data/spec/scanner/groups_spec.rb +0 -61
  133. data/spec/scanner/keep_spec.rb +0 -10
  134. data/spec/scanner/literals_spec.rb +0 -39
  135. data/spec/scanner/meta_spec.rb +0 -18
  136. data/spec/scanner/options_spec.rb +0 -36
  137. data/spec/scanner/properties_spec.rb +0 -64
  138. data/spec/scanner/quantifiers_spec.rb +0 -25
  139. data/spec/scanner/refcalls_spec.rb +0 -55
  140. data/spec/scanner/sets_spec.rb +0 -151
  141. data/spec/scanner/types_spec.rb +0 -14
  142. data/spec/spec_helper.rb +0 -16
  143. data/spec/support/runner.rb +0 -42
  144. data/spec/support/shared_examples.rb +0 -77
  145. data/spec/support/warning_extractor.rb +0 -60
  146. data/spec/syntax/syntax_spec.rb +0 -48
  147. data/spec/syntax/syntax_token_map_spec.rb +0 -23
  148. data/spec/syntax/versions/1.8.6_spec.rb +0 -17
  149. data/spec/syntax/versions/1.9.1_spec.rb +0 -10
  150. data/spec/syntax/versions/1.9.3_spec.rb +0 -9
  151. data/spec/syntax/versions/2.0.0_spec.rb +0 -13
  152. data/spec/syntax/versions/2.2.0_spec.rb +0 -9
  153. data/spec/syntax/versions/aliases_spec.rb +0 -37
  154. data/spec/token/token_spec.rb +0 -85
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 077b8a0c90d90cf46e44671ec1335a5373eef72c61a0bcf4de43ba5217a188c3
4
- data.tar.gz: b9aed868af73adcdf40c09720c5d10091b25a53b25a792717ceb5591039a2931
3
+ metadata.gz: f871ec3cdea5a594f72f5386f1b344710e6204f7307ba40d966653197f526be8
4
+ data.tar.gz: dd93c880f29ec77531faa2379fbfc8e34a9b67680664c6a3477d38afeaa1809a
5
5
  SHA512:
6
- metadata.gz: 9c04d9a6434c6e3f322e97e8e2a1c86b3ddda88bd8821368a37b92f5836e4c3df1dc27a79165303420c3e8d5eea31bda1483824da01a40ce30961b645ba65ddd
7
- data.tar.gz: 01e5c261e9dca0c4df7c696128dbc0520ca40aa6b9393cc8d6c3bdb8386470aeb773566000b811f98c1407038216c8d2c0b444c7955ea5a881ac759796f8a440
6
+ metadata.gz: 45e52ab0ce7bec3e4a275efa3828532778c49e8d36eec1ea82a43755a87abc9eee97e986027aa8f5c64fd604f15164d2ad4f37e5d6e22a5a1e3e9da6788271b9
7
+ data.tar.gz: 1f5514f3252294d9fe0877cff1d8b0db0400838c97ed78d15bbb794b94595c20d081681e4b1fe9bb6c89be7749514d8b2b8cf385360d002cd89e2a76ce6d2e63
data/CHANGELOG.md CHANGED
@@ -1,5 +1,93 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ### Added
4
+
5
+ - `Regexp::Expression::Base.construct` and `.token_class` methods
6
+
7
+ ## [2.4.0] - 2022-05-09 - [Janosch Müller](mailto:janosch84@gmail.com)
8
+
9
+ ### Fixed
10
+
11
+ - fixed interpretation of `+` and `?` after interval quantifiers (`{n,n}`)
12
+ - they used to be treated as reluctant or possessive mode indicators
13
+ - however, Ruby does not support these modes for interval quantifiers
14
+ - they are now treated as chained quantifiers instead, as Ruby does it
15
+ - c.f. [#3](https://github.com/ammar/regexp_parser/issues/3)
16
+ - fixed `Expression::Base#nesting_level` for some tree rewrite cases
17
+ - e.g. the alternatives in `/a|[b]/` had an inconsistent nesting_level
18
+ - fixed `Scanner` accepting invalid posix classes, e.g. `[[:foo:]]`
19
+ - they raise a `SyntaxError` when used in a Regexp, so could only be passed as String
20
+ - they now raise a `Regexp::Scanner::ValidationError` in the `Scanner`
21
+
22
+ ### Added
23
+
24
+ - added `Expression::Base#==` for (deep) comparison of expressions
25
+ - added `Expression::Base#parts`
26
+ - returns the text elements and subexpressions of an expression
27
+ - e.g. `parse(/(a)/)[0].parts # => ["(", #<Literal @text="a"...>, ")"]`
28
+ - added `Expression::Base#te` (a.k.a. token end index)
29
+ - `Expression::Subexpression` always had `#te`, only terminal nodes lacked it so far
30
+ - made some `Expression::Base` methods available on `Quantifier` instances, too
31
+ - `#type`, `#type?`, `#is?`, `#one_of?`, `#options`, `#terminal?`
32
+ - `#base_length`, `#full_length`, `#starts_at`, `#te`, `#ts`, `#offset`
33
+ - `#conditional_level`, `#level`, `#nesting_level`, `#set_level`
34
+ - this allows a more unified handling with `Expression::Base` instances
35
+ - allowed `Quantifier#initialize` to take a token and options Hash like other nodes
36
+ - added a deprecation warning for initializing Quantifiers with 4+ arguments:
37
+
38
+ Calling `Expression::Base#quantify` or `Quantifier.new` with 4+ arguments
39
+ is deprecated.
40
+
41
+ It will no longer be supported in regexp_parser v3.0.0.
42
+
43
+ Please pass a Regexp::Token instead, e.g. replace `token, text, min, max, mode`
44
+ with `::Regexp::Token.new(:quantifier, token, text)`. min, max, and mode
45
+ will be derived automatically.
46
+
47
+ Or do `exp.quantifier = Quantifier.construct(token: token, text: str)`.
48
+
49
+ This is consistent with how Expression::Base instances are created.
50
+
51
+
52
+ ## [2.3.1] - 2022-04-24 - [Janosch Müller](mailto:janosch84@gmail.com)
53
+
54
+ ### Fixed
55
+
56
+ - removed five nonexistent unicode properties from `Syntax#features`
57
+ - these were never supported by Ruby or the `Regexp::Scanner`
58
+ - thanks to [Markus Schirp](https://github.com/mbj) for the report
59
+
60
+ ## [2.3.0] - 2022-04-08 - [Janosch Müller](mailto:janosch84@gmail.com)
61
+
62
+ ### Added
63
+
64
+ - improved parsing performance through `Syntax` refactoring
65
+ - instead of fresh `Syntax` instances, pre-loaded constants are now re-used
66
+ - this approximately doubles the parsing speed for simple regexps
67
+ - added methods to `Syntax` classes to show relative feature sets
68
+ - e.g. `Regexp::Syntax::V3_2_0.added_features`
69
+ - support for new unicode properties of Ruby 3.2 / Unicode 14.0
70
+
71
+ ## [2.2.1] - 2022-02-11 - [Janosch Müller](mailto:janosch84@gmail.com)
72
+
73
+ ### Fixed
74
+
75
+ - fixed Syntax version of absence groups (`(?~...)`)
76
+ - the lexer accepted them for any Ruby version
77
+ - now they are only recognized for Ruby >= 2.4.1 in which they were introduced
78
+ - reduced gem size by excluding specs from package
79
+ - removed deprecated `test_files` gemspec setting
80
+ - no longer depend on `yaml`/`psych` (except for Ruby <= 2.4)
81
+ - no longer depend on `set`
82
+ - `set` was removed from the stdlib and made a standalone gem as of Ruby 3
83
+ - this made it a hidden/undeclared dependency of `regexp_parser`
84
+
85
+ ## [2.2.0] - 2021-12-04 - [Janosch Müller](mailto:janosch84@gmail.com)
86
+
87
+ ### Added
88
+
89
+ - added support for 13 new unicode properties introduced in Ruby 3.1.0
90
+
3
91
  ## [2.1.1] - 2021-02-23 - [Janosch Müller](mailto:janosch84@gmail.com)
4
92
 
5
93
  ### Fixed
@@ -149,7 +237,7 @@
149
237
 
150
238
  ### Added
151
239
 
152
- - `Expression#each_expression` and `#traverse` can now be called without a block
240
+ - `Expression::Base#each_expression` and `#traverse` can now be called without a block
153
241
  * this returns an `Enumerator` and allows chaining, e.g. `each_expression.select`
154
242
  * thanks to [Masataka Kuwabara](https://github.com/pocke)
155
243
 
@@ -175,7 +263,7 @@
175
263
  - Fixed `Group#option_changes` not accounting for indirectly disabled (overridden) encoding flags
176
264
  - Fixed `Scanner` allowing negative encoding options if there were no positive options, e.g. '(?-u)'
177
265
  - Fixed `ScannerError` for some valid meta/control sequences such as '\\C-\\\\'
178
- - Fixed `Expression#match` and `#=~` not working with a single argument
266
+ - Fixed `Expression::Base#match` and `#=~` not working with a single argument
179
267
 
180
268
  ### [1.5.0] - 2019-05-14 - [Janosch Müller](mailto:janosch84@gmail.com)
181
269
 
@@ -183,15 +271,15 @@
183
271
 
184
272
  - Added `#referenced_expression` for backrefs, subexp calls and conditionals
185
273
  * returns the `Group` expression that is being referenced via name or number
186
- - Added `Expression#repetitions`
274
+ - Added `Expression::Base#repetitions`
187
275
  * returns a `Range` of allowed repetitions (`1..1` if there is no quantifier)
188
276
  * like `#quantity` but with a more uniform interface
189
- - Added `Expression#match_length`
277
+ - Added `Expression::Base#match_length`
190
278
  * allows to inspect and iterate over String lengths matched by the Expression
191
279
 
192
280
  ### Fixed
193
281
 
194
- - Fixed `Expression#clone` "direction"
282
+ - Fixed `Expression::Base#clone` "direction"
195
283
  * it used to dup ivars onto the callee, leaving only the clone referencing the original objects
196
284
  * this will affect you if you call `#eql?`/`#equal?` on expressions or use them as Hash keys
197
285
  - Fixed `#clone` results for `Sequences`, e.g. alternations and conditionals
@@ -353,7 +441,7 @@ This release includes several breaking changes, mostly to character sets, #map a
353
441
  - Fixed a thread safety issue (issue #45)
354
442
  - Some public class methods that were only reliable for
355
443
  internal use are now private instance methods (PR #46)
356
- - Improved the usefulness of Expression#options (issue #43) -
444
+ - Improved the usefulness of Expression::Base#options (issue #43) -
357
445
  #options and derived methods such as #i?, #m? and #x? are now
358
446
  defined for all Expressions that are affected by such flags.
359
447
  - Fixed scanning of whitespace following (?x) (commit 5c94bd2)
data/Gemfile CHANGED
@@ -5,9 +5,10 @@ gemspec
5
5
  group :development, :test do
6
6
  gem 'ice_nine', '~> 0.11.2'
7
7
  gem 'rake', '~> 13.0'
8
- gem 'regexp_property_values', '~> 1.0'
8
+ gem 'regexp_property_values', '~> 1.3'
9
9
  gem 'rspec', '~> 3.10'
10
10
  if RUBY_VERSION.to_f >= 2.7
11
+ gem 'benchmark-ips', '~> 2.1'
11
12
  gem 'gouteur'
12
13
  gem 'rubocop', '~> 1.7'
13
14
  end
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2010, 2012-2015, Ammar Ali
1
+ Copyright (c) 2010, 2012-2022, Ammar Ali
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person
4
4
  obtaining a copy of this software and associated documentation
data/README.md CHANGED
@@ -1,6 +1,9 @@
1
1
  # Regexp::Parser
2
2
 
3
- [![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser) [![Build Status](https://github.com/ammar/regexp_parser/workflows/tests/badge.svg)](https://github.com/ammar/regexp_parser/actions) [![Build Status](https://github.com/ammar/regexp_parser/workflows/gouteur/badge.svg)](https://github.com/ammar/regexp_parser/actions) [![Code Climate](https://codeclimate.com/github/ammar/regexp_parser.svg)](https://codeclimate.com/github/ammar/regexp_parser/badges)
3
+ [![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser)
4
+ [![Build Status](https://github.com/ammar/regexp_parser/workflows/tests/badge.svg)](https://github.com/ammar/regexp_parser/actions)
5
+ [![Build Status](https://github.com/ammar/regexp_parser/workflows/gouteur/badge.svg)](https://github.com/ammar/regexp_parser/actions)
6
+ [![Code Climate](https://codeclimate.com/github/ammar/regexp_parser.svg)](https://codeclimate.com/github/ammar/regexp_parser/badges)
4
7
 
5
8
  A Ruby gem for tokenizing, parsing, and transforming regular expressions.
6
9
 
@@ -154,31 +157,41 @@ flavor). Syntax classes act as lookup tables, and are layered to create
154
157
  flavor variations. Syntax only comes into play in the lexer.
155
158
 
156
159
  #### Example
157
- The following instantiates syntax objects for Ruby 2.0, 1.9, 1.8, and
160
+ The following fetches syntax objects for Ruby 2.0, 1.9, 1.8, and
158
161
  checks a few of their implementation features.
159
162
 
160
163
  ```ruby
161
164
  require 'regexp_parser'
162
165
 
163
- ruby_20 = Regexp::Syntax.new 'ruby/2.0'
166
+ ruby_20 = Regexp::Syntax.for 'ruby/2.0'
164
167
  ruby_20.implements? :quantifier, :zero_or_one # => true
165
168
  ruby_20.implements? :quantifier, :zero_or_one_reluctant # => true
166
169
  ruby_20.implements? :quantifier, :zero_or_one_possessive # => true
167
170
  ruby_20.implements? :conditional, :condition # => true
168
171
 
169
- ruby_19 = Regexp::Syntax.new 'ruby/1.9'
172
+ ruby_19 = Regexp::Syntax.for 'ruby/1.9'
170
173
  ruby_19.implements? :quantifier, :zero_or_one # => true
171
174
  ruby_19.implements? :quantifier, :zero_or_one_reluctant # => true
172
175
  ruby_19.implements? :quantifier, :zero_or_one_possessive # => true
173
176
  ruby_19.implements? :conditional, :condition # => false
174
177
 
175
- ruby_18 = Regexp::Syntax.new 'ruby/1.8'
178
+ ruby_18 = Regexp::Syntax.for 'ruby/1.8'
176
179
  ruby_18.implements? :quantifier, :zero_or_one # => true
177
180
  ruby_18.implements? :quantifier, :zero_or_one_reluctant # => true
178
181
  ruby_18.implements? :quantifier, :zero_or_one_possessive # => false
179
182
  ruby_18.implements? :conditional, :condition # => false
180
183
  ```
181
184
 
185
+ Syntax objects can also be queried about their complete and relative feature sets.
186
+
187
+ ```ruby
188
+ require 'regexp_parser'
189
+
190
+ ruby_20 = Regexp::Syntax.for 'ruby/2.0' # => Regexp::Syntax::V2_0_0
191
+ ruby_20.added_features # => { conditional: [...], ... }
192
+ ruby_20.removed_features # => { property: [:newline], ... }
193
+ ruby_20.features # => { anchor: [...], ... }
194
+ ```
182
195
 
183
196
  #### Notes
184
197
  * Variations on a token, for example a named group with angle brackets (< and >)
@@ -354,15 +367,15 @@ _Note that not all of these are available in all versions of Ruby_
354
367
  | **POSIX Classes** | `[:alpha:]`, `[:^digit:]` | &#x2713; |
355
368
  | **Quantifiers** | | &#x22f1; |
356
369
  | &emsp;&nbsp;_**Greedy**_ | `?`, `*`, `+`, `{m,M}` | &#x2713; |
357
- | &emsp;&nbsp;_**Reluctant** (Lazy)_ | `??`, `*?`, `+?`, `{m,M}?` | &#x2713; |
358
- | &emsp;&nbsp;_**Possessive**_ | `?+`, `*+`, `++`, `{m,M}+` | &#x2713; |
370
+ | &emsp;&nbsp;_**Reluctant** (Lazy)_ | `??`, `*?`, `+?` \[1\] | &#x2713; |
371
+ | &emsp;&nbsp;_**Possessive**_ | `?+`, `*+`, `++` \[1\] | &#x2713; |
359
372
  | **String Escapes** | | &#x22f1; |
360
- | &emsp;&nbsp;_**Control**_ | `\C-C`, `\cD` | &#x2713; |
373
+ | &emsp;&nbsp;_**Control** \[2\]_ | `\C-C`, `\cD` | &#x2713; |
361
374
  | &emsp;&nbsp;_**Hex**_ | `\x20`, `\x{701230}` | &#x2713; |
362
- | &emsp;&nbsp;_**Meta**_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | &#x2713; |
375
+ | &emsp;&nbsp;_**Meta** \[2\]_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | &#x2713; |
363
376
  | &emsp;&nbsp;_**Octal**_ | `\0`, `\01`, `\012` | &#x2713; |
364
377
  | &emsp;&nbsp;_**Unicode**_ | `\uHHHH`, `\u{H+ H+}` | &#x2713; |
365
- | **Unicode Properties** | _<sub>([Unicode 11.0.0](http://www.unicode.org/versions/Unicode11.0.0/))</sub>_ | &#x22f1; |
378
+ | **Unicode Properties** | _<sub>([Unicode 13.0.0](https://www.unicode.org/versions/Unicode13.0.0/))</sub>_ | &#x22f1; |
366
379
  | &emsp;&nbsp;_**Age**_ | `\p{Age=5.2}`, `\P{age=7.0}`, `\p{^age=8.0}` | &#x2713; |
367
380
  | &emsp;&nbsp;_**Blocks**_ | `\p{InArmenian}`, `\P{InKhmer}`, `\p{^InThai}` | &#x2713; |
368
381
  | &emsp;&nbsp;_**Classes**_ | `\p{Alpha}`, `\P{Space}`, `\p{^Alnum}` | &#x2713; |
@@ -371,6 +384,14 @@ _Note that not all of these are available in all versions of Ruby_
371
384
  | &emsp;&nbsp;_**Scripts**_ | `\p{Arabic}`, `\P{Hiragana}`, `\p{^Greek}` | &#x2713; |
372
385
  | &emsp;&nbsp;_**Simple**_ | `\p{Dash}`, `\p{Extender}`, `\p{^Hyphen}` | &#x2713; |
373
386
 
387
+ **\[1\]**: Ruby does not support lazy or possessive interval quantifiers. Any `+` or `?` that follows an interval
388
+ quantifier will be treated as another, chained quantifier. See also [#3](https://github.com/ammar/regexp_parser/issues/3),
389
+ [#69](https://github.com/ammar/regexp_parser/pull/69).
390
+
391
+ **\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex escapes when used in Regexp literals](
392
+ https://github.com/ruby/ruby/commit/11ae581a4a7f5d5f5ec6378872eab8f25381b1b9 ), so they will only reach the
393
+ scanner and will only be emitted if a String or a Regexp that has been built with the `::new` constructor is scanned.
394
+
374
395
  ##### Inapplicable Features
375
396
 
376
397
  Some modifiers, like `o` and `s`, apply to the **Regexp** object itself and do not
@@ -384,7 +405,6 @@ expressions library (Onigmo). They are not supported by the scanner.
384
405
  - **Quotes**: `\Q...\E` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L499)_
385
406
  - **Capture History**: `(?@...)`, `(?@<name>...)` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L550)_
386
407
 
387
-
388
408
  See something missing? Please submit an [issue](https://github.com/ammar/regexp_parser/issues)
389
409
 
390
410
  _**Note**: Attempting to process expressions with unsupported syntax features can raise an error,
@@ -392,26 +412,14 @@ or incorrectly return tokens/objects as literals._
392
412
 
393
413
 
394
414
  ## Testing
395
- To run the tests simply run rake from the root directory, as 'test' is the default task.
415
+ To run the tests simply run rake from the root directory.
396
416
 
397
- It generates the scanner's code from the Ragel source files and runs all the tests, thus it requires Ragel to be installed.
417
+ The default task generates the scanner's code from the Ragel source files and runs all the specs, thus it requires Ragel to be installed.
398
418
 
399
- The tests use RSpec. They can also be run with the test runner that whitelists some warnings:
419
+ Note that changes to Ragel files will not be reflected when running `rspec` on its own, so to run individual tests you might want to run:
400
420
 
401
421
  ```
402
- bin/test
403
- ```
404
-
405
- You can run a specific test like so:
406
-
407
- ```
408
- bin/test spec/scanner/properties_spec.rb
409
- ```
410
-
411
- Note that changes to Ragel files will not be reflected when running `rspec` or `bin/test`, so you might want to run:
412
-
413
- ```
414
- rake ragel:rb && bin/test spec/scanner/properties_spec.rb
422
+ rake ragel:rb && rspec spec/scanner/properties_spec.rb
415
423
  ```
416
424
 
417
425
  ## Building
@@ -439,11 +447,13 @@ Projects using regexp_parser.
439
447
 
440
448
  - [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool that uses regexp_parser to convert Regexps to css/xpath selectors.
441
449
 
442
- - [js_regex](https://github.com/janosch-x/js_regex) converts Ruby regular expressions to JavaScript-compatible regular expressions.
450
+ - [js_regex](https://github.com/jaynetics/js_regex) converts Ruby regular expressions to JavaScript-compatible regular expressions.
443
451
 
444
452
  - [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor with alias support.
445
453
 
446
- - [mutant](https://github.com/mbj/mutant) (before v0.9.0) manipulates your regular expressions (amongst others) to see if your tests cover their behavior.
454
+ - [mutant](https://github.com/mbj/mutant) manipulates your regular expressions (amongst others) to see if your tests cover their behavior.
455
+
456
+ - [repper](https://github.com/jaynetics/repper) is a regular expression pretty-printer for Ruby.
447
457
 
448
458
  - [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that uses regexp_parser to lint Regexps.
449
459
 
@@ -476,4 +486,4 @@ Documentation and books used while working on this project.
476
486
 
477
487
  ---
478
488
  ##### Copyright
479
- _Copyright (c) 2010-2020 Ammar Ali. See LICENSE file for details._
489
+ _Copyright (c) 2010-2022 Ammar Ali. See LICENSE file for details._
data/Rakefile CHANGED
@@ -1,87 +1,23 @@
1
+ require 'bundler'
1
2
  require 'rubygems'
2
-
3
+ require 'rubygems/package_task'
3
4
  require 'rake'
4
5
  require 'rake/testtask'
6
+ require 'rspec/core/rake_task'
5
7
 
6
- require 'bundler'
7
- require 'rubygems/package_task'
8
-
9
-
10
- RAGEL_SOURCE_DIR = File.join(__dir__, 'lib/regexp_parser/scanner')
11
- RAGEL_OUTPUT_DIR = File.join(__dir__, 'lib/regexp_parser')
12
- RAGEL_SOURCE_FILES = %w{scanner} # scanner.rl includes property.rl
13
-
8
+ Dir['tasks/**/*.rake'].each { |file| load(file) }
14
9
 
15
10
  Bundler::GemHelper.install_tasks
16
11
 
12
+ RSpec::Core::RakeTask.new(:spec)
17
13
 
18
14
  task :default => [:'test:full']
19
15
 
20
16
  namespace :test do
21
- task full: :'ragel:rb' do
22
- sh 'bin/test'
23
- end
17
+ task full: [:'ragel:rb', :spec]
24
18
  end
25
19
 
26
- namespace :ragel do
27
- desc "Process the ragel source files and output ruby code"
28
- task :rb do
29
- RAGEL_SOURCE_FILES.each do |source_file|
30
- output_file = "#{RAGEL_OUTPUT_DIR}/#{source_file}.rb"
31
- # using faster flat table driven FSM, about 25% larger code, but about 30% faster
32
- sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{source_file}.rl -o #{output_file}"
33
-
34
- contents = File.read(output_file)
35
-
36
- File.open(output_file, 'r+') do |file|
37
- contents = "# -*- warn-indent:false; -*-\n" + contents
38
-
39
- file.write(contents)
40
- end
41
- end
42
- end
43
-
44
- desc "Delete the ragel generated source file(s)"
45
- task :clean do
46
- RAGEL_SOURCE_FILES.each do |file|
47
- sh "rm -f #{RAGEL_OUTPUT_DIR}/#{file}.rb"
48
- end
49
- end
50
- end
51
-
52
-
53
20
  # Add ragel task as a prerequisite for building the gem to ensure that the
54
21
  # latest scanner code is generated and included in the build.
55
22
  desc "Runs ragel:rb before building the gem"
56
23
  task :build => ['ragel:rb']
57
-
58
-
59
- namespace :props do
60
- desc 'Write new property value hashes for the properties scanner'
61
- task :update do
62
- require 'regexp_property_values'
63
- RegexpPropertyValues.update
64
- dir = File.join(__dir__, 'lib/regexp_parser/scanner/properties')
65
-
66
- require 'psych'
67
- write_hash_to_file = ->(hash, path) do
68
- File.open(path, 'w') do |f|
69
- f.puts '#',
70
- "# THIS FILE IS AUTO-GENERATED BY `rake props:update`, DO NOT EDIT",
71
- '#',
72
- hash.sort.to_h.to_yaml
73
- end
74
- puts "Wrote #{hash.count} aliases to `#{path}`"
75
- end
76
-
77
- long_names_to_tokens = RegexpPropertyValues.all.map do |val|
78
- [val.identifier, val.full_name.downcase]
79
- end
80
- write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.yml")
81
-
82
- short_names_to_tokens = RegexpPropertyValues.alias_hash.map do |k, v|
83
- [k.identifier, v.full_name.downcase]
84
- end
85
- write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.yml")
86
- end
87
- end
@@ -1,4 +1,4 @@
1
1
  class Regexp::Parser
2
- # base class for all gem-specific errors (inherited but never raised itself)
2
+ # base class for all gem-specific errors
3
3
  class Error < StandardError; end
4
4
  end
@@ -0,0 +1,75 @@
1
+ module Regexp::Expression
2
+ class Base
3
+ include Regexp::Expression::Shared
4
+
5
+ def initialize(token, options = {})
6
+ init_from_token_and_options(token, options)
7
+ end
8
+
9
+ def initialize_copy(orig)
10
+ self.text = orig.text.dup if orig.text
11
+ self.options = orig.options.dup if orig.options
12
+ self.quantifier = orig.quantifier.clone if orig.quantifier
13
+ super
14
+ end
15
+
16
+ def to_re(format = :full)
17
+ ::Regexp.new(to_s(format))
18
+ end
19
+
20
+ def quantify(*args)
21
+ self.quantifier = Quantifier.new(*args)
22
+ end
23
+
24
+ def unquantified_clone
25
+ clone.tap { |exp| exp.quantifier = nil }
26
+ end
27
+
28
+ # Deprecated. Prefer `#repetitions` which has a more uniform interface.
29
+ def quantity
30
+ return [nil,nil] unless quantified?
31
+ [quantifier.min, quantifier.max]
32
+ end
33
+
34
+ def repetitions
35
+ return 1..1 unless quantified?
36
+ min = quantifier.min
37
+ max = quantifier.max < 0 ? Float::INFINITY : quantifier.max
38
+ range = min..max
39
+ # fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807
40
+ if RUBY_VERSION.to_f < 2.7
41
+ range.define_singleton_method(:minmax) { [min, max] }
42
+ end
43
+ range
44
+ end
45
+
46
+ def greedy?
47
+ quantified? and quantifier.greedy?
48
+ end
49
+
50
+ def reluctant?
51
+ quantified? and quantifier.reluctant?
52
+ end
53
+ alias :lazy? :reluctant?
54
+
55
+ def possessive?
56
+ quantified? and quantifier.possessive?
57
+ end
58
+
59
+ def to_h
60
+ {
61
+ type: type,
62
+ token: token,
63
+ text: to_s(:base),
64
+ starts_at: ts,
65
+ length: full_length,
66
+ level: level,
67
+ set_level: set_level,
68
+ conditional_level: conditional_level,
69
+ options: options,
70
+ quantifier: quantified? ? quantifier.to_h : nil,
71
+ }
72
+ end
73
+ alias :attributes :to_h
74
+ end
75
+ end
@@ -1,5 +1,4 @@
1
1
  module Regexp::Expression
2
-
3
2
  module Anchor
4
3
  class Base < Regexp::Expression::Base; end
5
4
 
@@ -22,5 +21,4 @@ module Regexp::Expression
22
21
  EOS = EndOfString
23
22
  EOSobEOL = EndOfStringOrBeforeEndOfLine
24
23
  end
25
-
26
24
  end
@@ -1,4 +1,5 @@
1
1
  module Regexp::Expression
2
+ # TODO: unify name with token :backref, one way or the other, in v3.0.0
2
3
  module Backreference
3
4
  class Base < Regexp::Expression::Base
4
5
  attr_accessor :referenced_expression
@@ -16,8 +16,8 @@ module Regexp::Expression
16
16
  count == 2
17
17
  end
18
18
 
19
- def to_s(_format = :full)
20
- expressions.join(text)
19
+ def parts
20
+ intersperse(expressions, text.dup)
21
21
  end
22
22
  end
23
23
  end
@@ -20,8 +20,8 @@ module Regexp::Expression
20
20
  self.closed = true
21
21
  end
22
22
 
23
- def to_s(format = :full)
24
- "#{text}#{'^' if negated?}#{expressions.join}]#{quantifier_affix(format)}"
23
+ def parts
24
+ ["#{text}#{'^' if negated?}", *expressions, ']']
25
25
  end
26
26
  end
27
27
  end # module Regexp::Expression
@@ -1,5 +1,4 @@
1
1
  module Regexp::Expression
2
-
3
2
  module CharacterType
4
3
  class Base < Regexp::Expression::Base; end
5
4
 
@@ -15,5 +14,4 @@ module Regexp::Expression
15
14
  class Linebreak < CharacterType::Base; end
16
15
  class ExtendedGrapheme < CharacterType::Base; end
17
16
  end
18
-
19
17
  end
@@ -55,8 +55,8 @@ module Regexp::Expression
55
55
  condition.reference
56
56
  end
57
57
 
58
- def to_s(format = :full)
59
- "#{text}#{condition}#{branches.join('|')})#{quantifier_affix(format)}"
58
+ def parts
59
+ [text.dup, condition, *intersperse(branches, '|'), ')']
60
60
  end
61
61
 
62
62
  def initialize_copy(orig)
@@ -1,16 +1,22 @@
1
1
  module Regexp::Expression
2
+ # TODO: unify naming with Token::Escape, one way or the other, in v3.0.0
2
3
  module EscapeSequence
3
4
  class Base < Regexp::Expression::Base
4
- require 'yaml'
5
-
6
- def char
7
- # poor man's unescape without using eval
8
- YAML.load(%Q(---\n"#{text}"\n))
9
- end
10
-
11
5
  def codepoint
12
6
  char.ord
13
7
  end
8
+
9
+ if ''.respond_to?(:undump)
10
+ def char
11
+ %("#{text}").undump
12
+ end
13
+ else
14
+ # poor man's unescape without using eval
15
+ require 'yaml'
16
+ def char
17
+ YAML.load(%Q(---\n"#{text}"\n))
18
+ end
19
+ end
14
20
  end
15
21
 
16
22
  class Literal < EscapeSequence::Base
@@ -1,7 +1,6 @@
1
1
  module Regexp::Expression
2
-
3
2
  class FreeSpace < Regexp::Expression::Base
4
- def quantify(_token, _text, _min = nil, _max = nil, _mode = :greedy)
3
+ def quantify(*_args)
5
4
  raise Regexp::Parser::Error, 'Can not quantify a free space object'
6
5
  end
7
6
  end
@@ -13,5 +12,4 @@ module Regexp::Expression
13
12
  text << exp.text
14
13
  end
15
14
  end
16
-
17
15
  end
@@ -1,8 +1,8 @@
1
1
  module Regexp::Expression
2
2
  module Group
3
3
  class Base < Regexp::Expression::Subexpression
4
- def to_s(format = :full)
5
- "#{text}#{expressions.join})#{quantifier_affix(format)}"
4
+ def parts
5
+ [text.dup, *expressions, ')']
6
6
  end
7
7
 
8
8
  def capturing?; false end
@@ -18,9 +18,9 @@ module Regexp::Expression
18
18
  super
19
19
  end
20
20
 
21
- def to_s(format = :full)
21
+ def parts
22
22
  if implicit?
23
- "#{expressions.join}#{quantifier_affix(format)}"
23
+ expressions
24
24
  else
25
25
  super
26
26
  end
@@ -65,8 +65,8 @@ module Regexp::Expression
65
65
  end
66
66
 
67
67
  class Comment < Group::Base
68
- def to_s(_format = :full)
69
- text.dup
68
+ def parts
69
+ [text.dup]
70
70
  end
71
71
 
72
72
  def comment?; true end
@@ -1,5 +1,7 @@
1
1
  module Regexp::Expression
2
2
  module Keep
3
+ # TODO: in regexp_parser v3.0.0 this should possibly be a Subexpression
4
+ # that contains all expressions to its left.
3
5
  class Mark < Regexp::Expression::Base; end
4
6
  end
5
7
  end