regexp_parser 2.1.1 → 2.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (154) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +94 -6
  3. data/Gemfile +2 -1
  4. data/LICENSE +1 -1
  5. data/README.md +40 -30
  6. data/Rakefile +6 -70
  7. data/lib/regexp_parser/error.rb +1 -1
  8. data/lib/regexp_parser/expression/base.rb +75 -0
  9. data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
  10. data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +1 -0
  11. data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
  12. data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -2
  13. data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +2 -2
  14. data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
  15. data/lib/regexp_parser/expression/classes/conditional.rb +2 -2
  16. data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +13 -7
  17. data/lib/regexp_parser/expression/classes/free_space.rb +1 -3
  18. data/lib/regexp_parser/expression/classes/group.rb +6 -6
  19. data/lib/regexp_parser/expression/classes/keep.rb +2 -0
  20. data/lib/regexp_parser/expression/classes/literal.rb +1 -5
  21. data/lib/regexp_parser/expression/classes/root.rb +3 -6
  22. data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +1 -2
  23. data/lib/regexp_parser/expression/methods/construct.rb +43 -0
  24. data/lib/regexp_parser/expression/methods/match_length.rb +1 -1
  25. data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
  26. data/lib/regexp_parser/expression/methods/tests.rb +10 -1
  27. data/lib/regexp_parser/expression/quantifier.rb +41 -23
  28. data/lib/regexp_parser/expression/sequence.rb +9 -24
  29. data/lib/regexp_parser/expression/sequence_operation.rb +2 -2
  30. data/lib/regexp_parser/expression/shared.rb +85 -0
  31. data/lib/regexp_parser/expression/subexpression.rb +11 -8
  32. data/lib/regexp_parser/expression.rb +10 -132
  33. data/lib/regexp_parser/lexer.rb +8 -6
  34. data/lib/regexp_parser/parser.rb +21 -72
  35. data/lib/regexp_parser/scanner/properties/long.csv +622 -0
  36. data/lib/regexp_parser/scanner/properties/short.csv +246 -0
  37. data/lib/regexp_parser/scanner/property.rl +1 -1
  38. data/lib/regexp_parser/scanner/scanner.rl +48 -35
  39. data/lib/regexp_parser/scanner.rb +735 -801
  40. data/lib/regexp_parser/syntax/any.rb +2 -7
  41. data/lib/regexp_parser/syntax/base.rb +91 -66
  42. data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
  43. data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
  44. data/lib/regexp_parser/syntax/token/backreference.rb +30 -0
  45. data/lib/regexp_parser/syntax/{tokens → token}/character_set.rb +2 -2
  46. data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
  47. data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
  48. data/lib/regexp_parser/syntax/token/escape.rb +31 -0
  49. data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
  50. data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
  51. data/lib/regexp_parser/syntax/{tokens → token}/meta.rb +2 -2
  52. data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
  53. data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
  54. data/lib/regexp_parser/syntax/token/unicode_property.rb +717 -0
  55. data/lib/regexp_parser/syntax/token.rb +45 -0
  56. data/lib/regexp_parser/syntax/version_lookup.rb +20 -29
  57. data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
  58. data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
  59. data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
  60. data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
  61. data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
  62. data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
  63. data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
  64. data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
  65. data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
  66. data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
  67. data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
  68. data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
  69. data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
  70. data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
  71. data/lib/regexp_parser/syntax/versions.rb +1 -1
  72. data/lib/regexp_parser/syntax.rb +1 -1
  73. data/lib/regexp_parser/token.rb +9 -20
  74. data/lib/regexp_parser/version.rb +1 -1
  75. data/lib/regexp_parser.rb +0 -2
  76. data/regexp_parser.gemspec +20 -22
  77. metadata +37 -166
  78. data/lib/regexp_parser/scanner/properties/long.yml +0 -594
  79. data/lib/regexp_parser/scanner/properties/short.yml +0 -237
  80. data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
  81. data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
  82. data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
  83. data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
  84. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
  85. data/lib/regexp_parser/syntax/tokens.rb +0 -45
  86. data/spec/expression/base_spec.rb +0 -104
  87. data/spec/expression/clone_spec.rb +0 -152
  88. data/spec/expression/conditional_spec.rb +0 -89
  89. data/spec/expression/free_space_spec.rb +0 -27
  90. data/spec/expression/methods/match_length_spec.rb +0 -161
  91. data/spec/expression/methods/match_spec.rb +0 -25
  92. data/spec/expression/methods/strfregexp_spec.rb +0 -224
  93. data/spec/expression/methods/tests_spec.rb +0 -99
  94. data/spec/expression/methods/traverse_spec.rb +0 -161
  95. data/spec/expression/options_spec.rb +0 -128
  96. data/spec/expression/subexpression_spec.rb +0 -50
  97. data/spec/expression/to_h_spec.rb +0 -26
  98. data/spec/expression/to_s_spec.rb +0 -108
  99. data/spec/lexer/all_spec.rb +0 -22
  100. data/spec/lexer/conditionals_spec.rb +0 -53
  101. data/spec/lexer/delimiters_spec.rb +0 -68
  102. data/spec/lexer/escapes_spec.rb +0 -14
  103. data/spec/lexer/keep_spec.rb +0 -10
  104. data/spec/lexer/literals_spec.rb +0 -64
  105. data/spec/lexer/nesting_spec.rb +0 -99
  106. data/spec/lexer/refcalls_spec.rb +0 -60
  107. data/spec/parser/all_spec.rb +0 -43
  108. data/spec/parser/alternation_spec.rb +0 -88
  109. data/spec/parser/anchors_spec.rb +0 -17
  110. data/spec/parser/conditionals_spec.rb +0 -179
  111. data/spec/parser/errors_spec.rb +0 -30
  112. data/spec/parser/escapes_spec.rb +0 -121
  113. data/spec/parser/free_space_spec.rb +0 -130
  114. data/spec/parser/groups_spec.rb +0 -108
  115. data/spec/parser/keep_spec.rb +0 -6
  116. data/spec/parser/options_spec.rb +0 -28
  117. data/spec/parser/posix_classes_spec.rb +0 -8
  118. data/spec/parser/properties_spec.rb +0 -115
  119. data/spec/parser/quantifiers_spec.rb +0 -68
  120. data/spec/parser/refcalls_spec.rb +0 -117
  121. data/spec/parser/set/intersections_spec.rb +0 -127
  122. data/spec/parser/set/ranges_spec.rb +0 -111
  123. data/spec/parser/sets_spec.rb +0 -178
  124. data/spec/parser/types_spec.rb +0 -18
  125. data/spec/scanner/all_spec.rb +0 -18
  126. data/spec/scanner/anchors_spec.rb +0 -21
  127. data/spec/scanner/conditionals_spec.rb +0 -128
  128. data/spec/scanner/delimiters_spec.rb +0 -52
  129. data/spec/scanner/errors_spec.rb +0 -67
  130. data/spec/scanner/escapes_spec.rb +0 -64
  131. data/spec/scanner/free_space_spec.rb +0 -165
  132. data/spec/scanner/groups_spec.rb +0 -61
  133. data/spec/scanner/keep_spec.rb +0 -10
  134. data/spec/scanner/literals_spec.rb +0 -39
  135. data/spec/scanner/meta_spec.rb +0 -18
  136. data/spec/scanner/options_spec.rb +0 -36
  137. data/spec/scanner/properties_spec.rb +0 -64
  138. data/spec/scanner/quantifiers_spec.rb +0 -25
  139. data/spec/scanner/refcalls_spec.rb +0 -55
  140. data/spec/scanner/sets_spec.rb +0 -151
  141. data/spec/scanner/types_spec.rb +0 -14
  142. data/spec/spec_helper.rb +0 -16
  143. data/spec/support/runner.rb +0 -42
  144. data/spec/support/shared_examples.rb +0 -77
  145. data/spec/support/warning_extractor.rb +0 -60
  146. data/spec/syntax/syntax_spec.rb +0 -48
  147. data/spec/syntax/syntax_token_map_spec.rb +0 -23
  148. data/spec/syntax/versions/1.8.6_spec.rb +0 -17
  149. data/spec/syntax/versions/1.9.1_spec.rb +0 -10
  150. data/spec/syntax/versions/1.9.3_spec.rb +0 -9
  151. data/spec/syntax/versions/2.0.0_spec.rb +0 -13
  152. data/spec/syntax/versions/2.2.0_spec.rb +0 -9
  153. data/spec/syntax/versions/aliases_spec.rb +0 -37
  154. data/spec/token/token_spec.rb +0 -85
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 077b8a0c90d90cf46e44671ec1335a5373eef72c61a0bcf4de43ba5217a188c3
4
- data.tar.gz: b9aed868af73adcdf40c09720c5d10091b25a53b25a792717ceb5591039a2931
3
+ metadata.gz: f871ec3cdea5a594f72f5386f1b344710e6204f7307ba40d966653197f526be8
4
+ data.tar.gz: dd93c880f29ec77531faa2379fbfc8e34a9b67680664c6a3477d38afeaa1809a
5
5
  SHA512:
6
- metadata.gz: 9c04d9a6434c6e3f322e97e8e2a1c86b3ddda88bd8821368a37b92f5836e4c3df1dc27a79165303420c3e8d5eea31bda1483824da01a40ce30961b645ba65ddd
7
- data.tar.gz: 01e5c261e9dca0c4df7c696128dbc0520ca40aa6b9393cc8d6c3bdb8386470aeb773566000b811f98c1407038216c8d2c0b444c7955ea5a881ac759796f8a440
6
+ metadata.gz: 45e52ab0ce7bec3e4a275efa3828532778c49e8d36eec1ea82a43755a87abc9eee97e986027aa8f5c64fd604f15164d2ad4f37e5d6e22a5a1e3e9da6788271b9
7
+ data.tar.gz: 1f5514f3252294d9fe0877cff1d8b0db0400838c97ed78d15bbb794b94595c20d081681e4b1fe9bb6c89be7749514d8b2b8cf385360d002cd89e2a76ce6d2e63
data/CHANGELOG.md CHANGED
@@ -1,5 +1,93 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ### Added
4
+
5
+ - `Regexp::Expression::Base.construct` and `.token_class` methods
6
+
7
+ ## [2.4.0] - 2022-05-09 - [Janosch Müller](mailto:janosch84@gmail.com)
8
+
9
+ ### Fixed
10
+
11
+ - fixed interpretation of `+` and `?` after interval quantifiers (`{n,n}`)
12
+ - they used to be treated as reluctant or possessive mode indicators
13
+ - however, Ruby does not support these modes for interval quantifiers
14
+ - they are now treated as chained quantifiers instead, as Ruby does it
15
+ - c.f. [#3](https://github.com/ammar/regexp_parser/issues/3)
16
+ - fixed `Expression::Base#nesting_level` for some tree rewrite cases
17
+ - e.g. the alternatives in `/a|[b]/` had an inconsistent nesting_level
18
+ - fixed `Scanner` accepting invalid posix classes, e.g. `[[:foo:]]`
19
+ - they raise a `SyntaxError` when used in a Regexp, so could only be passed as String
20
+ - they now raise a `Regexp::Scanner::ValidationError` in the `Scanner`
21
+
22
+ ### Added
23
+
24
+ - added `Expression::Base#==` for (deep) comparison of expressions
25
+ - added `Expression::Base#parts`
26
+ - returns the text elements and subexpressions of an expression
27
+ - e.g. `parse(/(a)/)[0].parts # => ["(", #<Literal @text="a"...>, ")"]`
28
+ - added `Expression::Base#te` (a.k.a. token end index)
29
+ - `Expression::Subexpression` always had `#te`, only terminal nodes lacked it so far
30
+ - made some `Expression::Base` methods available on `Quantifier` instances, too
31
+ - `#type`, `#type?`, `#is?`, `#one_of?`, `#options`, `#terminal?`
32
+ - `#base_length`, `#full_length`, `#starts_at`, `#te`, `#ts`, `#offset`
33
+ - `#conditional_level`, `#level`, `#nesting_level`, `#set_level`
34
+ - this allows a more unified handling with `Expression::Base` instances
35
+ - allowed `Quantifier#initialize` to take a token and options Hash like other nodes
36
+ - added a deprecation warning for initializing Quantifiers with 4+ arguments:
37
+
38
+ Calling `Expression::Base#quantify` or `Quantifier.new` with 4+ arguments
39
+ is deprecated.
40
+
41
+ It will no longer be supported in regexp_parser v3.0.0.
42
+
43
+ Please pass a Regexp::Token instead, e.g. replace `token, text, min, max, mode`
44
+ with `::Regexp::Token.new(:quantifier, token, text)`. min, max, and mode
45
+ will be derived automatically.
46
+
47
+ Or do `exp.quantifier = Quantifier.construct(token: token, text: str)`.
48
+
49
+ This is consistent with how Expression::Base instances are created.
50
+
51
+
52
+ ## [2.3.1] - 2022-04-24 - [Janosch Müller](mailto:janosch84@gmail.com)
53
+
54
+ ### Fixed
55
+
56
+ - removed five inexistent unicode properties from `Syntax#features`
57
+ - these were never supported by Ruby or the `Regexp::Scanner`
58
+ - thanks to [Markus Schirp](https://github.com/mbj) for the report
59
+
60
+ ## [2.3.0] - 2022-04-08 - [Janosch Müller](mailto:janosch84@gmail.com)
61
+
62
+ ### Added
63
+
64
+ - improved parsing performance through `Syntax` refactoring
65
+ - instead of fresh `Syntax` instances, pre-loaded constants are now re-used
66
+ - this approximately doubles the parsing speed for simple regexps
67
+ - added methods to `Syntax` classes to show relative feature sets
68
+ - e.g. `Regexp::Syntax::V3_2_0.added_features`
69
+ - support for new unicode properties of Ruby 3.2 / Unicode 14.0
70
+
71
+ ## [2.2.1] - 2022-02-11 - [Janosch Müller](mailto:janosch84@gmail.com)
72
+
73
+ ### Fixed
74
+
75
+ - fixed Syntax version of absence groups (`(?~...)`)
76
+ - the lexer accepted them for any Ruby version
77
+ - now they are only recognized for Ruby >= 2.4.1 in which they were introduced
78
+ - reduced gem size by excluding specs from package
79
+ - removed deprecated `test_files` gemspec setting
80
+ - no longer depend on `yaml`/`psych` (except for Ruby <= 2.4)
81
+ - no longer depend on `set`
82
+ - `set` was removed from the stdlib and made a standalone gem as of Ruby 3
83
+ - this made it a hidden/undeclared dependency of `regexp_parser`
84
+
85
+ ## [2.2.0] - 2021-12-04 - [Janosch Müller](mailto:janosch84@gmail.com)
86
+
87
+ ### Added
88
+
89
+ - added support for 13 new unicode properties introduced in Ruby 3.1.0
90
+
3
91
  ## [2.1.1] - 2021-02-23 - [Janosch Müller](mailto:janosch84@gmail.com)
4
92
 
5
93
  ### Fixed
@@ -149,7 +237,7 @@
149
237
 
150
238
  ### Added
151
239
 
152
- - `Expression#each_expression` and `#traverse` can now be called without a block
240
+ - `Expression::Base#each_expression` and `#traverse` can now be called without a block
153
241
  * this returns an `Enumerator` and allows chaining, e.g. `each_expression.select`
154
242
  * thanks to [Masataka Kuwabara](https://github.com/pocke)
155
243
 
@@ -175,7 +263,7 @@
175
263
  - Fixed `Group#option_changes` not accounting for indirectly disabled (overridden) encoding flags
176
264
  - Fixed `Scanner` allowing negative encoding options if there were no positive options, e.g. '(?-u)'
177
265
  - Fixed `ScannerError` for some valid meta/control sequences such as '\\C-\\\\'
178
- - Fixed `Expression#match` and `#=~` not working with a single argument
266
+ - Fixed `Expression::Base#match` and `#=~` not working with a single argument
179
267
 
180
268
  ### [1.5.0] - 2019-05-14 - [Janosch Müller](mailto:janosch84@gmail.com)
181
269
 
@@ -183,15 +271,15 @@
183
271
 
184
272
  - Added `#referenced_expression` for backrefs, subexp calls and conditionals
185
273
  * returns the `Group` expression that is being referenced via name or number
186
- - Added `Expression#repetitions`
274
+ - Added `Expression::Base#repetitions`
187
275
  * returns a `Range` of allowed repetitions (`1..1` if there is no quantifier)
188
276
  * like `#quantity` but with a more uniform interface
189
- - Added `Expression#match_length`
277
+ - Added `Expression::Base#match_length`
190
278
  * allows to inspect and iterate over String lengths matched by the Expression
191
279
 
192
280
  ### Fixed
193
281
 
194
- - Fixed `Expression#clone` "direction"
282
+ - Fixed `Expression::Base#clone` "direction"
195
283
  * it used to dup ivars onto the callee, leaving only the clone referencing the original objects
196
284
  * this will affect you if you call `#eql?`/`#equal?` on expressions or use them as Hash keys
197
285
  - Fixed `#clone` results for `Sequences`, e.g. alternations and conditionals
@@ -353,7 +441,7 @@ This release includes several breaking changes, mostly to character sets, #map a
353
441
  - Fixed a thread safety issue (issue #45)
354
442
  - Some public class methods that were only reliable for
355
443
  internal use are now private instance methods (PR #46)
356
- - Improved the usefulness of Expression#options (issue #43) -
444
+ - Improved the usefulness of Expression::Base#options (issue #43) -
357
445
  #options and derived methods such as #i?, #m? and #x? are now
358
446
  defined for all Expressions that are affected by such flags.
359
447
  - Fixed scanning of whitespace following (?x) (commit 5c94bd2)
data/Gemfile CHANGED
@@ -5,9 +5,10 @@ gemspec
5
5
  group :development, :test do
6
6
  gem 'ice_nine', '~> 0.11.2'
7
7
  gem 'rake', '~> 13.0'
8
- gem 'regexp_property_values', '~> 1.0'
8
+ gem 'regexp_property_values', '~> 1.3'
9
9
  gem 'rspec', '~> 3.10'
10
10
  if RUBY_VERSION.to_f >= 2.7
11
+ gem 'benchmark-ips', '~> 2.1'
11
12
  gem 'gouteur'
12
13
  gem 'rubocop', '~> 1.7'
13
14
  end
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2010, 2012-2015, Ammar Ali
1
+ Copyright (c) 2010, 2012-2022, Ammar Ali
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person
4
4
  obtaining a copy of this software and associated documentation
data/README.md CHANGED
@@ -1,6 +1,9 @@
1
1
  # Regexp::Parser
2
2
 
3
- [![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser) [![Build Status](https://github.com/ammar/regexp_parser/workflows/tests/badge.svg)](https://github.com/ammar/regexp_parser/actions) [![Build Status](https://github.com/ammar/regexp_parser/workflows/gouteur/badge.svg)](https://github.com/ammar/regexp_parser/actions) [![Code Climate](https://codeclimate.com/github/ammar/regexp_parser.svg)](https://codeclimate.com/github/ammar/regexp_parser/badges)
3
+ [![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser)
4
+ [![Build Status](https://github.com/ammar/regexp_parser/workflows/tests/badge.svg)](https://github.com/ammar/regexp_parser/actions)
5
+ [![Build Status](https://github.com/ammar/regexp_parser/workflows/gouteur/badge.svg)](https://github.com/ammar/regexp_parser/actions)
6
+ [![Code Climate](https://codeclimate.com/github/ammar/regexp_parser.svg)](https://codeclimate.com/github/ammar/regexp_parser/badges)
4
7
 
5
8
  A Ruby gem for tokenizing, parsing, and transforming regular expressions.
6
9
 
@@ -154,31 +157,41 @@ flavor). Syntax classes act as lookup tables, and are layered to create
154
157
  flavor variations. Syntax only comes into play in the lexer.
155
158
 
156
159
  #### Example
157
- The following instantiates syntax objects for Ruby 2.0, 1.9, 1.8, and
160
+ The following fetches syntax objects for Ruby 2.0, 1.9, 1.8, and
158
161
  checks a few of their implementation features.
159
162
 
160
163
  ```ruby
161
164
  require 'regexp_parser'
162
165
 
163
- ruby_20 = Regexp::Syntax.new 'ruby/2.0'
166
+ ruby_20 = Regexp::Syntax.for 'ruby/2.0'
164
167
  ruby_20.implements? :quantifier, :zero_or_one # => true
165
168
  ruby_20.implements? :quantifier, :zero_or_one_reluctant # => true
166
169
  ruby_20.implements? :quantifier, :zero_or_one_possessive # => true
167
170
  ruby_20.implements? :conditional, :condition # => true
168
171
 
169
- ruby_19 = Regexp::Syntax.new 'ruby/1.9'
172
+ ruby_19 = Regexp::Syntax.for 'ruby/1.9'
170
173
  ruby_19.implements? :quantifier, :zero_or_one # => true
171
174
  ruby_19.implements? :quantifier, :zero_or_one_reluctant # => true
172
175
  ruby_19.implements? :quantifier, :zero_or_one_possessive # => true
173
176
  ruby_19.implements? :conditional, :condition # => false
174
177
 
175
- ruby_18 = Regexp::Syntax.new 'ruby/1.8'
178
+ ruby_18 = Regexp::Syntax.for 'ruby/1.8'
176
179
  ruby_18.implements? :quantifier, :zero_or_one # => true
177
180
  ruby_18.implements? :quantifier, :zero_or_one_reluctant # => true
178
181
  ruby_18.implements? :quantifier, :zero_or_one_possessive # => false
179
182
  ruby_18.implements? :conditional, :condition # => false
180
183
  ```
181
184
 
185
+ Syntax objects can also be queried about their complete and relative feature sets.
186
+
187
+ ```ruby
188
+ require 'regexp_parser'
189
+
190
+ ruby_20 = Regexp::Syntax.for 'ruby/2.0' # => Regexp::Syntax::V2_0_0
191
+ ruby_20.added_features # => { conditional: [...], ... }
192
+ ruby_20.removed_features # => { property: [:newline], ... }
193
+ ruby_20.features # => { anchor: [...], ... }
194
+ ```
182
195
 
183
196
  #### Notes
184
197
  * Variations on a token, for example a named group with angle brackets (< and >)
@@ -354,15 +367,15 @@ _Note that not all of these are available in all versions of Ruby_
354
367
  | **POSIX Classes** | `[:alpha:]`, `[:^digit:]` | &#x2713; |
355
368
  | **Quantifiers** | | &#x22f1; |
356
369
  | &emsp;&nbsp;_**Greedy**_ | `?`, `*`, `+`, `{m,M}` | &#x2713; |
357
- | &emsp;&nbsp;_**Reluctant** (Lazy)_ | `??`, `*?`, `+?`, `{m,M}?` | &#x2713; |
358
- | &emsp;&nbsp;_**Possessive**_ | `?+`, `*+`, `++`, `{m,M}+` | &#x2713; |
370
+ | &emsp;&nbsp;_**Reluctant** (Lazy)_ | `??`, `*?`, `+?` \[1\] | &#x2713; |
371
+ | &emsp;&nbsp;_**Possessive**_ | `?+`, `*+`, `++` \[1\] | &#x2713; |
359
372
  | **String Escapes** | | &#x22f1; |
360
- | &emsp;&nbsp;_**Control**_ | `\C-C`, `\cD` | &#x2713; |
373
+ | &emsp;&nbsp;_**Control** \[2\]_ | `\C-C`, `\cD` | &#x2713; |
361
374
  | &emsp;&nbsp;_**Hex**_ | `\x20`, `\x{701230}` | &#x2713; |
362
- | &emsp;&nbsp;_**Meta**_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | &#x2713; |
375
+ | &emsp;&nbsp;_**Meta** \[2\]_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | &#x2713; |
363
376
  | &emsp;&nbsp;_**Octal**_ | `\0`, `\01`, `\012` | &#x2713; |
364
377
  | &emsp;&nbsp;_**Unicode**_ | `\uHHHH`, `\u{H+ H+}` | &#x2713; |
365
- | **Unicode Properties** | _<sub>([Unicode 11.0.0](http://www.unicode.org/versions/Unicode11.0.0/))</sub>_ | &#x22f1; |
378
+ | **Unicode Properties** | _<sub>([Unicode 13.0.0](https://www.unicode.org/versions/Unicode13.0.0/))</sub>_ | &#x22f1; |
366
379
  | &emsp;&nbsp;_**Age**_ | `\p{Age=5.2}`, `\P{age=7.0}`, `\p{^age=8.0}` | &#x2713; |
367
380
  | &emsp;&nbsp;_**Blocks**_ | `\p{InArmenian}`, `\P{InKhmer}`, `\p{^InThai}` | &#x2713; |
368
381
  | &emsp;&nbsp;_**Classes**_ | `\p{Alpha}`, `\P{Space}`, `\p{^Alnum}` | &#x2713; |
@@ -371,6 +384,14 @@ _Note that not all of these are available in all versions of Ruby_
371
384
  | &emsp;&nbsp;_**Scripts**_ | `\p{Arabic}`, `\P{Hiragana}`, `\p{^Greek}` | &#x2713; |
372
385
  | &emsp;&nbsp;_**Simple**_ | `\p{Dash}`, `\p{Extender}`, `\p{^Hyphen}` | &#x2713; |
373
386
 
387
+ **\[1\]**: Ruby does not support lazy or possessive interval quantifiers. Any `+` or `?` that follows an interval
388
+ quantifier will be treated as another, chained quantifier. See also [#3](https://github.com/ammar/regexp_parser/issues/3),
389
+ [#69](https://github.com/ammar/regexp_parser/pull/69).
390
+
391
+ **\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex escapes when used in Regexp literals](
392
+ https://github.com/ruby/ruby/commit/11ae581a4a7f5d5f5ec6378872eab8f25381b1b9 ), so they will only reach the
393
+ scanner and will only be emitted if a String or a Regexp that has been built with the `::new` constructor is scanned.
394
+
374
395
  ##### Inapplicable Features
375
396
 
376
397
  Some modifiers, like `o` and `s`, apply to the **Regexp** object itself and do not
@@ -384,7 +405,6 @@ expressions library (Onigmo). They are not supported by the scanner.
384
405
  - **Quotes**: `\Q...\E` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L499)_
385
406
  - **Capture History**: `(?@...)`, `(?@<name>...)` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L550)_
386
407
 
387
-
388
408
  See something missing? Please submit an [issue](https://github.com/ammar/regexp_parser/issues)
389
409
 
390
410
  _**Note**: Attempting to process expressions with unsupported syntax features can raise an error,
@@ -392,26 +412,14 @@ or incorrectly return tokens/objects as literals._
392
412
 
393
413
 
394
414
  ## Testing
395
- To run the tests simply run rake from the root directory, as 'test' is the default task.
415
+ To run the tests simply run rake from the root directory.
396
416
 
397
- It generates the scanner's code from the Ragel source files and runs all the tests, thus it requires Ragel to be installed.
417
+ The default task generates the scanner's code from the Ragel source files and runs all the specs, thus it requires Ragel to be installed.
398
418
 
399
- The tests use RSpec. They can also be run with the test runner that whitelists some warnings:
419
+ Note that changes to Ragel files will not be reflected when running `rspec` on its own, so to run individual tests you might want to run:
400
420
 
401
421
  ```
402
- bin/test
403
- ```
404
-
405
- You can run a specific test like so:
406
-
407
- ```
408
- bin/test spec/scanner/properties_spec.rb
409
- ```
410
-
411
- Note that changes to Ragel files will not be reflected when running `rspec` or `bin/test`, so you might want to run:
412
-
413
- ```
414
- rake ragel:rb && bin/test spec/scanner/properties_spec.rb
422
+ rake ragel:rb && rspec spec/scanner/properties_spec.rb
415
423
  ```
416
424
 
417
425
  ## Building
@@ -439,11 +447,13 @@ Projects using regexp_parser.
439
447
 
440
448
  - [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool that uses regexp_parser to convert Regexps to css/xpath selectors.
441
449
 
442
- - [js_regex](https://github.com/janosch-x/js_regex) converts Ruby regular expressions to JavaScript-compatible regular expressions.
450
+ - [js_regex](https://github.com/jaynetics/js_regex) converts Ruby regular expressions to JavaScript-compatible regular expressions.
443
451
 
444
452
  - [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor with alias support.
445
453
 
446
- - [mutant](https://github.com/mbj/mutant) (before v0.9.0) manipulates your regular expressions (amongst others) to see if your tests cover their behavior.
454
+ - [mutant](https://github.com/mbj/mutant) manipulates your regular expressions (amongst others) to see if your tests cover their behavior.
455
+
456
+ - [repper](https://github.com/jaynetics/repper) is a regular expression pretty-printer for Ruby.
447
457
 
448
458
  - [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that uses regexp_parser to lint Regexps.
449
459
 
@@ -476,4 +486,4 @@ Documentation and books used while working on this project.
476
486
 
477
487
  ---
478
488
  ##### Copyright
479
- _Copyright (c) 2010-2020 Ammar Ali. See LICENSE file for details._
489
+ _Copyright (c) 2010-2022 Ammar Ali. See LICENSE file for details._
data/Rakefile CHANGED
@@ -1,87 +1,23 @@
1
+ require 'bundler'
1
2
  require 'rubygems'
2
-
3
+ require 'rubygems/package_task'
3
4
  require 'rake'
4
5
  require 'rake/testtask'
6
+ require 'rspec/core/rake_task'
5
7
 
6
- require 'bundler'
7
- require 'rubygems/package_task'
8
-
9
-
10
- RAGEL_SOURCE_DIR = File.join(__dir__, 'lib/regexp_parser/scanner')
11
- RAGEL_OUTPUT_DIR = File.join(__dir__, 'lib/regexp_parser')
12
- RAGEL_SOURCE_FILES = %w{scanner} # scanner.rl includes property.rl
13
-
8
+ Dir['tasks/**/*.rake'].each { |file| load(file) }
14
9
 
15
10
  Bundler::GemHelper.install_tasks
16
11
 
12
+ RSpec::Core::RakeTask.new(:spec)
17
13
 
18
14
  task :default => [:'test:full']
19
15
 
20
16
  namespace :test do
21
- task full: :'ragel:rb' do
22
- sh 'bin/test'
23
- end
17
+ task full: [:'ragel:rb', :spec]
24
18
  end
25
19
 
26
- namespace :ragel do
27
- desc "Process the ragel source files and output ruby code"
28
- task :rb do
29
- RAGEL_SOURCE_FILES.each do |source_file|
30
- output_file = "#{RAGEL_OUTPUT_DIR}/#{source_file}.rb"
31
- # using faster flat table driven FSM, about 25% larger code, but about 30% faster
32
- sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{source_file}.rl -o #{output_file}"
33
-
34
- contents = File.read(output_file)
35
-
36
- File.open(output_file, 'r+') do |file|
37
- contents = "# -*- warn-indent:false; -*-\n" + contents
38
-
39
- file.write(contents)
40
- end
41
- end
42
- end
43
-
44
- desc "Delete the ragel generated source file(s)"
45
- task :clean do
46
- RAGEL_SOURCE_FILES.each do |file|
47
- sh "rm -f #{RAGEL_OUTPUT_DIR}/#{file}.rb"
48
- end
49
- end
50
- end
51
-
52
-
53
20
  # Add ragel task as a prerequisite for building the gem to ensure that the
54
21
  # latest scanner code is generated and included in the build.
55
22
  desc "Runs ragel:rb before building the gem"
56
23
  task :build => ['ragel:rb']
57
-
58
-
59
- namespace :props do
60
- desc 'Write new property value hashes for the properties scanner'
61
- task :update do
62
- require 'regexp_property_values'
63
- RegexpPropertyValues.update
64
- dir = File.join(__dir__, 'lib/regexp_parser/scanner/properties')
65
-
66
- require 'psych'
67
- write_hash_to_file = ->(hash, path) do
68
- File.open(path, 'w') do |f|
69
- f.puts '#',
70
- "# THIS FILE IS AUTO-GENERATED BY `rake props:update`, DO NOT EDIT",
71
- '#',
72
- hash.sort.to_h.to_yaml
73
- end
74
- puts "Wrote #{hash.count} aliases to `#{path}`"
75
- end
76
-
77
- long_names_to_tokens = RegexpPropertyValues.all.map do |val|
78
- [val.identifier, val.full_name.downcase]
79
- end
80
- write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.yml")
81
-
82
- short_names_to_tokens = RegexpPropertyValues.alias_hash.map do |k, v|
83
- [k.identifier, v.full_name.downcase]
84
- end
85
- write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.yml")
86
- end
87
- end
@@ -1,4 +1,4 @@
1
1
  class Regexp::Parser
2
- # base class for all gem-specific errors (inherited but never raised itself)
2
+ # base class for all gem-specific errors
3
3
  class Error < StandardError; end
4
4
  end
@@ -0,0 +1,75 @@
1
+ module Regexp::Expression
2
+ class Base
3
+ include Regexp::Expression::Shared
4
+
5
+ def initialize(token, options = {})
6
+ init_from_token_and_options(token, options)
7
+ end
8
+
9
+ def initialize_copy(orig)
10
+ self.text = orig.text.dup if orig.text
11
+ self.options = orig.options.dup if orig.options
12
+ self.quantifier = orig.quantifier.clone if orig.quantifier
13
+ super
14
+ end
15
+
16
+ def to_re(format = :full)
17
+ ::Regexp.new(to_s(format))
18
+ end
19
+
20
+ def quantify(*args)
21
+ self.quantifier = Quantifier.new(*args)
22
+ end
23
+
24
+ def unquantified_clone
25
+ clone.tap { |exp| exp.quantifier = nil }
26
+ end
27
+
28
+ # Deprecated. Prefer `#repetitions` which has a more uniform interface.
29
+ def quantity
30
+ return [nil,nil] unless quantified?
31
+ [quantifier.min, quantifier.max]
32
+ end
33
+
34
+ def repetitions
35
+ return 1..1 unless quantified?
36
+ min = quantifier.min
37
+ max = quantifier.max < 0 ? Float::INFINITY : quantifier.max
38
+ range = min..max
39
+ # fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807
40
+ if RUBY_VERSION.to_f < 2.7
41
+ range.define_singleton_method(:minmax) { [min, max] }
42
+ end
43
+ range
44
+ end
45
+
46
+ def greedy?
47
+ quantified? and quantifier.greedy?
48
+ end
49
+
50
+ def reluctant?
51
+ quantified? and quantifier.reluctant?
52
+ end
53
+ alias :lazy? :reluctant?
54
+
55
+ def possessive?
56
+ quantified? and quantifier.possessive?
57
+ end
58
+
59
+ def to_h
60
+ {
61
+ type: type,
62
+ token: token,
63
+ text: to_s(:base),
64
+ starts_at: ts,
65
+ length: full_length,
66
+ level: level,
67
+ set_level: set_level,
68
+ conditional_level: conditional_level,
69
+ options: options,
70
+ quantifier: quantified? ? quantifier.to_h : nil,
71
+ }
72
+ end
73
+ alias :attributes :to_h
74
+ end
75
+ end
@@ -1,5 +1,4 @@
1
1
  module Regexp::Expression
2
-
3
2
  module Anchor
4
3
  class Base < Regexp::Expression::Base; end
5
4
 
@@ -22,5 +21,4 @@ module Regexp::Expression
22
21
  EOS = EndOfString
23
22
  EOSobEOL = EndOfStringOrBeforeEndOfLine
24
23
  end
25
-
26
24
  end
@@ -1,4 +1,5 @@
1
1
  module Regexp::Expression
2
+ # TODO: unify name with token :backref, one way or the other, in v3.0.0
2
3
  module Backreference
3
4
  class Base < Regexp::Expression::Base
4
5
  attr_accessor :referenced_expression
@@ -16,8 +16,8 @@ module Regexp::Expression
16
16
  count == 2
17
17
  end
18
18
 
19
- def to_s(_format = :full)
20
- expressions.join(text)
19
+ def parts
20
+ intersperse(expressions, text.dup)
21
21
  end
22
22
  end
23
23
  end
@@ -20,8 +20,8 @@ module Regexp::Expression
20
20
  self.closed = true
21
21
  end
22
22
 
23
- def to_s(format = :full)
24
- "#{text}#{'^' if negated?}#{expressions.join}]#{quantifier_affix(format)}"
23
+ def parts
24
+ ["#{text}#{'^' if negated?}", *expressions, ']']
25
25
  end
26
26
  end
27
27
  end # module Regexp::Expression
@@ -1,5 +1,4 @@
1
1
  module Regexp::Expression
2
-
3
2
  module CharacterType
4
3
  class Base < Regexp::Expression::Base; end
5
4
 
@@ -15,5 +14,4 @@ module Regexp::Expression
15
14
  class Linebreak < CharacterType::Base; end
16
15
  class ExtendedGrapheme < CharacterType::Base; end
17
16
  end
18
-
19
17
  end
@@ -55,8 +55,8 @@ module Regexp::Expression
55
55
  condition.reference
56
56
  end
57
57
 
58
- def to_s(format = :full)
59
- "#{text}#{condition}#{branches.join('|')})#{quantifier_affix(format)}"
58
+ def parts
59
+ [text.dup, condition, *intersperse(branches, '|'), ')']
60
60
  end
61
61
 
62
62
  def initialize_copy(orig)
@@ -1,16 +1,22 @@
1
1
  module Regexp::Expression
2
+ # TODO: unify naming with Token::Escape, one way or the other, in v3.0.0
2
3
  module EscapeSequence
3
4
  class Base < Regexp::Expression::Base
4
- require 'yaml'
5
-
6
- def char
7
- # poor man's unescape without using eval
8
- YAML.load(%Q(---\n"#{text}"\n))
9
- end
10
-
11
5
  def codepoint
12
6
  char.ord
13
7
  end
8
+
9
+ if ''.respond_to?(:undump)
10
+ def char
11
+ %("#{text}").undump
12
+ end
13
+ else
14
+ # poor man's unescape without using eval
15
+ require 'yaml'
16
+ def char
17
+ YAML.load(%Q(---\n"#{text}"\n))
18
+ end
19
+ end
14
20
  end
15
21
 
16
22
  class Literal < EscapeSequence::Base
@@ -1,7 +1,6 @@
1
1
  module Regexp::Expression
2
-
3
2
  class FreeSpace < Regexp::Expression::Base
4
- def quantify(_token, _text, _min = nil, _max = nil, _mode = :greedy)
3
+ def quantify(*_args)
5
4
  raise Regexp::Parser::Error, 'Can not quantify a free space object'
6
5
  end
7
6
  end
@@ -13,5 +12,4 @@ module Regexp::Expression
13
12
  text << exp.text
14
13
  end
15
14
  end
16
-
17
15
  end
@@ -1,8 +1,8 @@
1
1
  module Regexp::Expression
2
2
  module Group
3
3
  class Base < Regexp::Expression::Subexpression
4
- def to_s(format = :full)
5
- "#{text}#{expressions.join})#{quantifier_affix(format)}"
4
+ def parts
5
+ [text.dup, *expressions, ')']
6
6
  end
7
7
 
8
8
  def capturing?; false end
@@ -18,9 +18,9 @@ module Regexp::Expression
18
18
  super
19
19
  end
20
20
 
21
- def to_s(format = :full)
21
+ def parts
22
22
  if implicit?
23
- "#{expressions.join}#{quantifier_affix(format)}"
23
+ expressions
24
24
  else
25
25
  super
26
26
  end
@@ -65,8 +65,8 @@ module Regexp::Expression
65
65
  end
66
66
 
67
67
  class Comment < Group::Base
68
- def to_s(_format = :full)
69
- text.dup
68
+ def parts
69
+ [text.dup]
70
70
  end
71
71
 
72
72
  def comment?; true end
@@ -1,5 +1,7 @@
1
1
  module Regexp::Expression
2
2
  module Keep
3
+ # TODO: in regexp_parser v3.0.0 this should possibly be a Subexpression
4
+ # that contains all expressions to its left.
3
5
  class Mark < Regexp::Expression::Base; end
4
6
  end
5
7
  end