regexp_parser 1.7.0 → 2.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +364 -22
  3. data/Gemfile +8 -2
  4. data/LICENSE +1 -1
  5. data/README.md +124 -88
  6. data/Rakefile +6 -70
  7. data/lib/regexp_parser/error.rb +4 -0
  8. data/lib/regexp_parser/expression/base.rb +76 -0
  9. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  10. data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
  11. data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +22 -2
  12. data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +4 -8
  13. data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +3 -4
  14. data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
  15. data/lib/regexp_parser/expression/classes/conditional.rb +11 -5
  16. data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
  17. data/lib/regexp_parser/expression/classes/free_space.rb +5 -5
  18. data/lib/regexp_parser/expression/classes/group.rb +28 -15
  19. data/lib/regexp_parser/expression/classes/keep.rb +2 -0
  20. data/lib/regexp_parser/expression/classes/literal.rb +1 -5
  21. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
  22. data/lib/regexp_parser/expression/classes/root.rb +4 -19
  23. data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +5 -3
  24. data/lib/regexp_parser/expression/methods/construct.rb +41 -0
  25. data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
  26. data/lib/regexp_parser/expression/methods/match_length.rb +11 -7
  27. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  28. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  29. data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
  30. data/lib/regexp_parser/expression/methods/tests.rb +47 -1
  31. data/lib/regexp_parser/expression/methods/traverse.rb +34 -18
  32. data/lib/regexp_parser/expression/quantifier.rb +57 -17
  33. data/lib/regexp_parser/expression/sequence.rb +11 -47
  34. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  35. data/lib/regexp_parser/expression/shared.rb +111 -0
  36. data/lib/regexp_parser/expression/subexpression.rb +27 -19
  37. data/lib/regexp_parser/expression.rb +14 -141
  38. data/lib/regexp_parser/lexer.rb +83 -41
  39. data/lib/regexp_parser/parser.rb +371 -429
  40. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  41. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  42. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  43. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  44. data/lib/regexp_parser/scanner/properties/long.csv +633 -0
  45. data/lib/regexp_parser/scanner/properties/short.csv +248 -0
  46. data/lib/regexp_parser/scanner/property.rl +4 -4
  47. data/lib/regexp_parser/scanner/scanner.rl +295 -368
  48. data/lib/regexp_parser/scanner.rb +1405 -1674
  49. data/lib/regexp_parser/syntax/any.rb +2 -7
  50. data/lib/regexp_parser/syntax/base.rb +92 -67
  51. data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
  52. data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
  53. data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
  54. data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
  55. data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
  56. data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
  57. data/lib/regexp_parser/syntax/token/escape.rb +33 -0
  58. data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
  59. data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
  60. data/lib/regexp_parser/syntax/token/meta.rb +20 -0
  61. data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
  62. data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
  63. data/lib/regexp_parser/syntax/token/unicode_property.rb +733 -0
  64. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  65. data/lib/regexp_parser/syntax/token.rb +45 -0
  66. data/lib/regexp_parser/syntax/version_lookup.rb +19 -36
  67. data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
  68. data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
  69. data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
  70. data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
  71. data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
  72. data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
  73. data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
  74. data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
  75. data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
  76. data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
  77. data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
  78. data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
  79. data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
  80. data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
  81. data/lib/regexp_parser/syntax/versions.rb +3 -1
  82. data/lib/regexp_parser/syntax.rb +8 -6
  83. data/lib/regexp_parser/token.rb +9 -20
  84. data/lib/regexp_parser/version.rb +1 -1
  85. data/lib/regexp_parser.rb +0 -2
  86. data/regexp_parser.gemspec +20 -22
  87. metadata +49 -166
  88. data/lib/regexp_parser/scanner/properties/long.yml +0 -594
  89. data/lib/regexp_parser/scanner/properties/short.yml +0 -237
  90. data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
  91. data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
  92. data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
  93. data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
  94. data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
  95. data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
  96. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
  97. data/lib/regexp_parser/syntax/tokens.rb +0 -45
  98. data/spec/expression/base_spec.rb +0 -94
  99. data/spec/expression/clone_spec.rb +0 -120
  100. data/spec/expression/conditional_spec.rb +0 -89
  101. data/spec/expression/free_space_spec.rb +0 -27
  102. data/spec/expression/methods/match_length_spec.rb +0 -161
  103. data/spec/expression/methods/match_spec.rb +0 -25
  104. data/spec/expression/methods/strfregexp_spec.rb +0 -224
  105. data/spec/expression/methods/tests_spec.rb +0 -99
  106. data/spec/expression/methods/traverse_spec.rb +0 -161
  107. data/spec/expression/options_spec.rb +0 -128
  108. data/spec/expression/root_spec.rb +0 -9
  109. data/spec/expression/sequence_spec.rb +0 -9
  110. data/spec/expression/subexpression_spec.rb +0 -50
  111. data/spec/expression/to_h_spec.rb +0 -26
  112. data/spec/expression/to_s_spec.rb +0 -100
  113. data/spec/lexer/all_spec.rb +0 -22
  114. data/spec/lexer/conditionals_spec.rb +0 -53
  115. data/spec/lexer/escapes_spec.rb +0 -14
  116. data/spec/lexer/keep_spec.rb +0 -10
  117. data/spec/lexer/literals_spec.rb +0 -89
  118. data/spec/lexer/nesting_spec.rb +0 -99
  119. data/spec/lexer/refcalls_spec.rb +0 -55
  120. data/spec/parser/all_spec.rb +0 -43
  121. data/spec/parser/alternation_spec.rb +0 -88
  122. data/spec/parser/anchors_spec.rb +0 -17
  123. data/spec/parser/conditionals_spec.rb +0 -179
  124. data/spec/parser/errors_spec.rb +0 -30
  125. data/spec/parser/escapes_spec.rb +0 -121
  126. data/spec/parser/free_space_spec.rb +0 -130
  127. data/spec/parser/groups_spec.rb +0 -108
  128. data/spec/parser/keep_spec.rb +0 -6
  129. data/spec/parser/posix_classes_spec.rb +0 -8
  130. data/spec/parser/properties_spec.rb +0 -115
  131. data/spec/parser/quantifiers_spec.rb +0 -51
  132. data/spec/parser/refcalls_spec.rb +0 -112
  133. data/spec/parser/set/intersections_spec.rb +0 -127
  134. data/spec/parser/set/ranges_spec.rb +0 -111
  135. data/spec/parser/sets_spec.rb +0 -178
  136. data/spec/parser/types_spec.rb +0 -18
  137. data/spec/scanner/all_spec.rb +0 -18
  138. data/spec/scanner/anchors_spec.rb +0 -21
  139. data/spec/scanner/conditionals_spec.rb +0 -128
  140. data/spec/scanner/errors_spec.rb +0 -68
  141. data/spec/scanner/escapes_spec.rb +0 -53
  142. data/spec/scanner/free_space_spec.rb +0 -133
  143. data/spec/scanner/groups_spec.rb +0 -52
  144. data/spec/scanner/keep_spec.rb +0 -10
  145. data/spec/scanner/literals_spec.rb +0 -49
  146. data/spec/scanner/meta_spec.rb +0 -18
  147. data/spec/scanner/properties_spec.rb +0 -64
  148. data/spec/scanner/quantifiers_spec.rb +0 -20
  149. data/spec/scanner/refcalls_spec.rb +0 -36
  150. data/spec/scanner/sets_spec.rb +0 -102
  151. data/spec/scanner/types_spec.rb +0 -14
  152. data/spec/spec_helper.rb +0 -15
  153. data/spec/support/runner.rb +0 -42
  154. data/spec/support/shared_examples.rb +0 -77
  155. data/spec/support/warning_extractor.rb +0 -60
  156. data/spec/syntax/syntax_spec.rb +0 -48
  157. data/spec/syntax/syntax_token_map_spec.rb +0 -23
  158. data/spec/syntax/versions/1.8.6_spec.rb +0 -17
  159. data/spec/syntax/versions/1.9.1_spec.rb +0 -10
  160. data/spec/syntax/versions/1.9.3_spec.rb +0 -9
  161. data/spec/syntax/versions/2.0.0_spec.rb +0 -13
  162. data/spec/syntax/versions/2.2.0_spec.rb +0 -9
  163. data/spec/syntax/versions/aliases_spec.rb +0 -37
  164. data/spec/token/token_spec.rb +0 -85
  165. /data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
data/README.md CHANGED
@@ -1,15 +1,18 @@
1
1
  # Regexp::Parser
2
2
 
3
- [![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser) [![Build Status](https://secure.travis-ci.org/ammar/regexp_parser.svg?branch=master)](http://travis-ci.org/ammar/regexp_parser) [![Code Climate](https://codeclimate.com/github/ammar/regexp_parser.svg)](https://codeclimate.com/github/ammar/regexp_parser/badges)
3
+ [![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser)
4
+ [![Build Status](https://github.com/ammar/regexp_parser/workflows/tests/badge.svg)](https://github.com/ammar/regexp_parser/actions)
5
+ [![Build Status](https://github.com/ammar/regexp_parser/workflows/gouteur/badge.svg)](https://github.com/ammar/regexp_parser/actions)
6
+ [![Code Climate](https://codeclimate.com/github/ammar/regexp_parser.svg)](https://codeclimate.com/github/ammar/regexp_parser/badges)
4
7
 
5
8
  A Ruby gem for tokenizing, parsing, and transforming regular expressions.
6
9
 
7
10
  * Multilayered
8
11
  * A scanner/tokenizer based on [Ragel](http://www.colm.net/open-source/ragel/)
9
- * A lexer that produces a "stream" of token objects.
10
- * A parser that produces a "tree" of Expression objects (OO API)
11
- * Runs on Ruby 1.9, 2.x, and JRuby (1.9 mode) runtimes.
12
- * Recognizes Ruby 1.8, 1.9, and 2.x regular expressions [See Supported Syntax](#supported-syntax)
12
+ * A lexer that produces a "stream" of [Token objects](https://github.com/ammar/regexp_parser/wiki/Token-Objects)
13
+ * A parser that produces a "tree" of [Expression objects (OO API)](https://github.com/ammar/regexp_parser/wiki/Expression-Objects)
14
+ * Runs on Ruby 2.x, 3.x and JRuby runtimes
15
+ * Recognizes Ruby 1.8, 1.9, 2.x and 3.x regular expressions [See Supported Syntax](#supported-syntax)
13
16
 
14
17
 
15
18
  _For examples of regexp_parser in use, see [Example Projects](#example-projects)._
@@ -18,13 +21,10 @@ _For examples of regexp_parser in use, see [Example Projects](#example-projects)
18
21
  ---
19
22
  ## Requirements
20
23
 
21
- * Ruby >= 1.9
24
+ * Ruby >= 2.0
22
25
  * Ragel >= 6.0, but only if you want to build the gem or work on the scanner.
23
26
 
24
27
 
25
- _Note: See the .travis.yml file for covered versions._
26
-
27
-
28
28
  ---
29
29
  ## Install
30
30
 
@@ -36,14 +36,15 @@ Or, add it to your project's `Gemfile`:
36
36
 
37
37
  ```gem 'regexp_parser', '~> X.Y.Z'```
38
38
 
39
- See rubygems for the the [latest version number](https://rubygems.org/gems/regexp_parser)
39
+ See the badge at the top of this README or [rubygems](https://rubygems.org/gems/regexp_parser)
40
+ for the latest version number.
40
41
 
41
42
 
42
43
  ---
43
44
  ## Usage
44
45
 
45
46
  The three main modules are **Scanner**, **Lexer**, and **Parser**. Each of them
46
- provides a single method that takes a regular expression (as a RegExp object or
47
+ provides a single method that takes a regular expression (as a Regexp object or
47
48
  a string) and returns its results. The **Lexer** and the **Parser** accept an
48
49
  optional second argument that specifies the syntax version, like 'ruby/2.0',
49
50
  which defaults to the host Ruby version (using RUBY_VERSION).
@@ -66,12 +67,23 @@ called with the results as follows:
66
67
  * **Scanner**: the block gets passed the results as they are scanned. See the
67
68
  example in the next section for details.
68
69
 
69
- * **Lexer**: after completion, the block gets passed the tokens one by one.
70
+ * **Lexer**: the block gets passed the tokens one by one as they are scanned.
70
71
  _The result of the block is returned._
71
72
 
72
73
  * **Parser**: after completion, the block gets passed the root expression.
73
74
  _The result of the block is returned._
74
75
 
76
+ All three methods accept either a `Regexp` or `String` (containing the pattern)
77
+ - if a String is passed, `options` can be supplied:
78
+
79
+ ```ruby
80
+ require 'regexp_parser'
81
+
82
+ Regexp::Parser.parse(
83
+ "a+ # Recognizes a and A...",
84
+ options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE
85
+ )
86
+ ```
75
87
 
76
88
  ---
77
89
  ## Components
@@ -90,7 +102,7 @@ start/end offsets for each token found.
90
102
  ```ruby
91
103
  require 'regexp_parser'
92
104
 
93
- Regexp::Scanner.scan /(ab?(cd)*[e-h]+)/ do |type, token, text, ts, te|
105
+ Regexp::Scanner.scan(/(ab?(cd)*[e-h]+)/) do |type, token, text, ts, te|
94
106
  puts "type: #{type}, token: #{token}, text: '#{text}' [#{ts}..#{te}]"
95
107
  end
96
108
 
@@ -113,8 +125,8 @@ A one-liner that uses map on the result of the scan to return the textual
113
125
  parts of the pattern:
114
126
 
115
127
  ```ruby
116
- Regexp::Scanner.scan( /(cat?([bhm]at)){3,5}/ ).map {|token| token[2]}
117
- #=> ["(", "cat", "?", "(", "[", "b", "h", "m", "]", "at", ")", ")", "{3,5}"]
128
+ Regexp::Scanner.scan(/(cat?([bhm]at)){3,5}/).map { |token| token[2] }
129
+ # => ["(", "cat", "?", "(", "[", "b", "h", "m", "]", "at", ")", ")", "{3,5}"]
118
130
  ```
119
131
 
120
132
 
@@ -136,11 +148,8 @@ Regexp::Scanner.scan( /(cat?([bhm]at)){3,5}/ ).map {|token| token[2]}
136
148
  to the lexer.
137
149
 
138
150
  * The MRI implementation may accept expressions that either conflict with
139
- the documentation or are undocumented. The scanner does not support such
140
- implementation quirks.
141
- _(See issues [#3](https://github.com/ammar/regexp_parser/issues/3) and
142
- [#15](https://github.com/ammar/regexp_parser/issues/15) for examples)_
143
-
151
+ the documentation or are undocumented, like `{}` and `]` _(unescaped)_.
152
+ The scanner will try to support as many of these cases as possible.
144
153
 
145
154
  ---
146
155
  ### Syntax
@@ -149,31 +158,41 @@ flavor). Syntax classes act as lookup tables, and are layered to create
149
158
  flavor variations. Syntax only comes into play in the lexer.
150
159
 
151
160
  #### Example
152
- The following instantiates syntax objects for Ruby 2.0, 1.9, 1.8, and
161
+ The following fetches syntax objects for Ruby 2.0, 1.9, 1.8, and
153
162
  checks a few of their implementation features.
154
163
 
155
164
  ```ruby
156
165
  require 'regexp_parser'
157
166
 
158
- ruby_20 = Regexp::Syntax.new 'ruby/2.0'
167
+ ruby_20 = Regexp::Syntax.for 'ruby/2.0'
159
168
  ruby_20.implements? :quantifier, :zero_or_one # => true
160
169
  ruby_20.implements? :quantifier, :zero_or_one_reluctant # => true
161
170
  ruby_20.implements? :quantifier, :zero_or_one_possessive # => true
162
171
  ruby_20.implements? :conditional, :condition # => true
163
172
 
164
- ruby_19 = Regexp::Syntax.new 'ruby/1.9'
173
+ ruby_19 = Regexp::Syntax.for 'ruby/1.9'
165
174
  ruby_19.implements? :quantifier, :zero_or_one # => true
166
175
  ruby_19.implements? :quantifier, :zero_or_one_reluctant # => true
167
176
  ruby_19.implements? :quantifier, :zero_or_one_possessive # => true
168
177
  ruby_19.implements? :conditional, :condition # => false
169
178
 
170
- ruby_18 = Regexp::Syntax.new 'ruby/1.8'
179
+ ruby_18 = Regexp::Syntax.for 'ruby/1.8'
171
180
  ruby_18.implements? :quantifier, :zero_or_one # => true
172
181
  ruby_18.implements? :quantifier, :zero_or_one_reluctant # => true
173
182
  ruby_18.implements? :quantifier, :zero_or_one_possessive # => false
174
183
  ruby_18.implements? :conditional, :condition # => false
175
184
  ```
176
185
 
186
+ Syntax objects can also be queried about their complete and relative feature sets.
187
+
188
+ ```ruby
189
+ require 'regexp_parser'
190
+
191
+ ruby_20 = Regexp::Syntax.for 'ruby/2.0' # => Regexp::Syntax::V2_0_0
192
+ ruby_20.added_features # => { conditional: [...], ... }
193
+ ruby_20.removed_features # => { property: [:newline], ... }
194
+ ruby_20.features # => { anchor: [...], ... }
195
+ ```
177
196
 
178
197
  #### Notes
179
198
  * Variations on a token, for example a named group with angle brackets (< and >)
@@ -202,7 +221,7 @@ syntax, and prints the token objects' text indented to their level.
202
221
  ```ruby
203
222
  require 'regexp_parser'
204
223
 
205
- Regexp::Lexer.lex /a?(b(c))*[d]+/, 'ruby/1.9' do |token|
224
+ Regexp::Lexer.lex(/a?(b(c))*[d]+/, 'ruby/1.9') do |token|
206
225
  puts "#{' ' * token.level}#{token.text}"
207
226
  end
208
227
 
@@ -228,8 +247,8 @@ how the sequence 'cat' is treated. The 't' is separated because it's followed
228
247
  by a quantifier that only applies to it.
229
248
 
230
249
  ```ruby
231
- Regexp::Lexer.scan( /(cat?([b]at)){3,5}/ ).map {|token| token.text}
232
- #=> ["(", "ca", "t", "?", "(", "[", "b", "]", "at", ")", ")", "{3,5}"]
250
+ Regexp::Lexer.scan(/(cat?([b]at)){3,5}/).map { |token| token.text }
251
+ # => ["(", "ca", "t", "?", "(", "[", "b", "]", "at", ")", ")", "{3,5}"]
233
252
  ```
234
253
 
235
254
  #### Notes
@@ -243,7 +262,7 @@ Regexp::Lexer.scan( /(cat?([b]at)){3,5}/ ).map {|token| token.text}
243
262
  ### Parser
244
263
  Sits on top of the lexer and transforms the "stream" of Token objects emitted
245
264
  by it into a tree of Expression objects represented by an instance of the
246
- Expression::Root class.
265
+ `Expression::Root` class.
247
266
 
248
267
  See the [Expression Objects](https://github.com/ammar/regexp_parser/wiki/Expression-Objects)
249
268
  wiki page for attributes and methods.
@@ -251,12 +270,40 @@ wiki page for attributes and methods.
251
270
 
252
271
  #### Example
253
272
 
273
+ This example uses the tree traversal method `#each_expression`
274
+ and the method `#strfregexp` to print each object in the tree.
275
+
276
+ ```ruby
277
+ include_root = true
278
+ indent_offset = include_root ? 1 : 0
279
+
280
+ tree.each_expression(include_root) do |exp|
281
+ puts exp.strfregexp("%>> %c", indent_offset)
282
+ end
283
+
284
+ # Output
285
+ # > Regexp::Expression::Root
286
+ # > Regexp::Expression::Literal
287
+ # > Regexp::Expression::Group::Capture
288
+ # > Regexp::Expression::Literal
289
+ # > Regexp::Expression::Group::Capture
290
+ # > Regexp::Expression::Literal
291
+ # > Regexp::Expression::Literal
292
+ # > Regexp::Expression::Group::Named
293
+ # > Regexp::Expression::CharacterSet
294
+ ```
295
+
296
+ _Note: quantifiers do not appear in the output because they are members of the
297
+ Expression class. See the next section for details._
298
+
299
+ Another example, using `#traverse` for a more fine-grained tree traversal:
300
+
254
301
  ```ruby
255
302
  require 'regexp_parser'
256
303
 
257
304
  regex = /a?(b+(c)d)*(?<name>[0-9]+)/
258
305
 
259
- tree = Regexp::Parser.parse( regex, 'ruby/2.1' )
306
+ tree = Regexp::Parser.parse(regex, 'ruby/2.1')
260
307
 
261
308
  tree.traverse do |event, exp|
262
309
  puts "#{event}: #{exp.type} `#{exp.to_s}`"
@@ -276,40 +323,15 @@ end
276
323
  # exit: group `(?<name>[0-9]+)`
277
324
  ```
278
325
 
279
- Another example, using each_expression and strfregexp to print the object tree.
280
326
  _See the traverse.rb and strfregexp.rb files under `lib/regexp_parser/expression/methods`
281
327
  for more information on these methods._
282
328
 
283
- ```ruby
284
- include_root = true
285
- indent_offset = include_root ? 1 : 0
286
-
287
- tree.each_expression(include_root) do |exp, level_index|
288
- puts exp.strfregexp("%>> %c", indent_offset)
289
- end
290
-
291
- # Output
292
- # > Regexp::Expression::Root
293
- # > Regexp::Expression::Literal
294
- # > Regexp::Expression::Group::Capture
295
- # > Regexp::Expression::Literal
296
- # > Regexp::Expression::Group::Capture
297
- # > Regexp::Expression::Literal
298
- # > Regexp::Expression::Literal
299
- # > Regexp::Expression::Group::Named
300
- # > Regexp::Expression::CharacterSet
301
- ```
302
-
303
- _Note: quantifiers do not appear in the output because they are members of the
304
- Expression class. See the next section for details._
305
-
306
-
307
329
  ---
308
330
 
309
331
 
310
332
  ## Supported Syntax
311
333
  The three modules support all the regular expression syntax features of Ruby 1.8,
312
- 1.9, and 2.x:
334
+ 1.9, 2.x and 3.x:
313
335
 
314
336
  _Note that not all of these are available in all versions of Ruby_
315
337
 
@@ -337,7 +359,7 @@ _Note that not all of these are available in all versions of Ruby_
337
359
  | &emsp;&emsp;_Nest Level_ | `\k<n-1>` | &#x2713; |
338
360
  | &emsp;&emsp;_Numbered_ | `\k<1>` | &#x2713; |
339
361
  | &emsp;&emsp;_Relative_ | `\k<-2>` | &#x2713; |
340
- | &emsp;&emsp;_Traditional_ | `\1` thru `\9` | &#x2713; |
362
+ | &emsp;&emsp;_Traditional_ | `\1` through `\9` | &#x2713; |
341
363
  | &emsp;&nbsp;_**Capturing**_ | `(abc)` | &#x2713; |
342
364
  | &emsp;&nbsp;_**Comments**_ | `(?# comment text)` | &#x2713; |
343
365
  | &emsp;&nbsp;_**Named**_ | `(?<name>abc)`, `(?'name'abc)` | &#x2713; |
@@ -349,15 +371,15 @@ _Note that not all of these are available in all versions of Ruby_
349
371
  | **POSIX Classes** | `[:alpha:]`, `[:^digit:]` | &#x2713; |
350
372
  | **Quantifiers** | | &#x22f1; |
351
373
  | &emsp;&nbsp;_**Greedy**_ | `?`, `*`, `+`, `{m,M}` | &#x2713; |
352
- | &emsp;&nbsp;_**Reluctant** (Lazy)_ | `??`, `*?`, `+?`, `{m,M}?` | &#x2713; |
353
- | &emsp;&nbsp;_**Possessive**_ | `?+`, `*+`, `++`, `{m,M}+` | &#x2713; |
374
+ | &emsp;&nbsp;_**Reluctant** (Lazy)_ | `??`, `*?`, `+?` \[1\] | &#x2713; |
375
+ | &emsp;&nbsp;_**Possessive**_ | `?+`, `*+`, `++` \[1\] | &#x2713; |
354
376
  | **String Escapes** | | &#x22f1; |
355
- | &emsp;&nbsp;_**Control**_ | `\C-C`, `\cD` | &#x2713; |
377
+ | &emsp;&nbsp;_**Control** \[2\]_ | `\C-C`, `\cD` | &#x2713; |
356
378
  | &emsp;&nbsp;_**Hex**_ | `\x20`, `\x{701230}` | &#x2713; |
357
- | &emsp;&nbsp;_**Meta**_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | &#x2713; |
379
+ | &emsp;&nbsp;_**Meta** \[2\]_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | &#x2713; |
358
380
  | &emsp;&nbsp;_**Octal**_ | `\0`, `\01`, `\012` | &#x2713; |
359
381
  | &emsp;&nbsp;_**Unicode**_ | `\uHHHH`, `\u{H+ H+}` | &#x2713; |
360
- | **Unicode Properties** | _<sub>([Unicode 11.0.0](http://www.unicode.org/versions/Unicode11.0.0/))</sub>_ | &#x22f1; |
382
+ | **Unicode Properties** | _<sub>([Unicode 13.0.0])</sub>_ | &#x22f1; |
361
383
  | &emsp;&nbsp;_**Age**_ | `\p{Age=5.2}`, `\P{age=7.0}`, `\p{^age=8.0}` | &#x2713; |
362
384
  | &emsp;&nbsp;_**Blocks**_ | `\p{InArmenian}`, `\P{InKhmer}`, `\p{^InThai}` | &#x2713; |
363
385
  | &emsp;&nbsp;_**Classes**_ | `\p{Alpha}`, `\P{Space}`, `\p{^Alnum}` | &#x2713; |
@@ -366,6 +388,18 @@ _Note that not all of these are available in all versions of Ruby_
366
388
  | &emsp;&nbsp;_**Scripts**_ | `\p{Arabic}`, `\P{Hiragana}`, `\p{^Greek}` | &#x2713; |
367
389
  | &emsp;&nbsp;_**Simple**_ | `\p{Dash}`, `\p{Extender}`, `\p{^Hyphen}` | &#x2713; |
368
390
 
391
+ [Unicode 13.0.0]: https://www.unicode.org/versions/Unicode13.0.0/
392
+
393
+ **\[1\]**: Ruby does not support lazy or possessive interval quantifiers.
394
+ Any `+` or `?` that follows an interval quantifier will be treated as another,
395
+ chained quantifier. See also [#3](https://github.com/ammar/regexp_parser/issues/3),
396
+ [#69](https://github.com/ammar/regexp_parser/pull/69).
397
+
398
+ **\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex
399
+ escapes when used in Regexp literals](https://github.com/ruby/ruby/commit/11ae581),
400
+ so they will only reach the scanner and will only be emitted if a String or a Regexp
401
+ that has been built with the `::new` constructor is scanned.
402
+
369
403
  ##### Inapplicable Features
370
404
 
371
405
  Some modifiers, like `o` and `s`, apply to the **Regexp** object itself and do not
@@ -379,40 +413,29 @@ expressions library (Onigmo). They are not supported by the scanner.
379
413
  - **Quotes**: `\Q...\E` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L499)_
380
414
  - **Capture History**: `(?@...)`, `(?@<name>...)` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L550)_
381
415
 
382
-
383
416
  See something missing? Please submit an [issue](https://github.com/ammar/regexp_parser/issues)
384
417
 
385
- _**Note**: Attempting to process expressions with unsupported syntax features can raise an error,
386
- or incorrectly return tokens/objects as literals._
418
+ _**Note**: Attempting to process expressions with unsupported syntax features can raise
419
+ an error, or incorrectly return tokens/objects as literals._
387
420
 
388
421
 
389
422
  ## Testing
390
- To run the tests simply run rake from the root directory, as 'test' is the default task.
391
-
392
- It generates the scanner's code from the Ragel source files and runs all the tests, thus it requires Ragel to be installed.
393
-
394
- The tests use RSpec. They can also be run with the test runner that whitelists some warnings:
423
+ To run the tests simply run rake from the root directory.
395
424
 
396
- ```
397
- bin/test
398
- ```
425
+ The default task generates the scanner's code from the Ragel source files and runs
426
+ all the specs, thus it requires Ragel to be installed.
399
427
 
400
- You can run a specific test like so:
428
+ Note that changes to Ragel files will not be reflected when running `rspec` on its own,
429
+ so to run individual tests you might want to run:
401
430
 
402
431
  ```
403
- bin/test spec/scanner/properties_spec.rb
404
- ```
405
-
406
- Note that changes to Ragel files will not be reflected when running `rspec` or `bin/test`, so you might want to run:
407
-
408
- ```
409
- rake ragel:rb && bin/test spec/scanner/properties_spec.rb
432
+ rake ragel:rb && rspec spec/scanner/properties_spec.rb
410
433
  ```
411
434
 
412
435
  ## Building
413
- Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/) to be
414
- installed. The build tasks will automatically invoke the 'ragel:rb' task to generate the
415
- Ruby scanner code.
436
+ Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/)
437
+ to be installed. The build tasks will automatically invoke the 'ragel:rb' task to generate
438
+ the Ruby scanner code.
416
439
 
417
440
 
418
441
  The project uses the standard rubygems package tasks, so:
@@ -432,13 +455,26 @@ rake install
432
455
  ## Example Projects
433
456
  Projects using regexp_parser.
434
457
 
435
- - [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor with alias support.
458
+ - [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool
459
+ that uses regexp_parser to convert Regexps to css/xpath selectors.
460
+
461
+ - [js_regex](https://github.com/jaynetics/js_regex) converts Ruby regular expressions
462
+ to JavaScript-compatible regular expressions.
463
+
464
+ - [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor
465
+ with alias support.
466
+
467
+ - [mutant](https://github.com/mbj/mutant) manipulates your regular expressions
468
+ (amongst others) to see if your tests cover their behavior.
436
469
 
437
- - [mutant](https://github.com/mbj/mutant) (before v0.9.0) manipulates your regular expressions (amongst others) to see if your tests cover their behavior.
470
+ - [repper](https://github.com/jaynetics/repper) is a regular expression
471
+ pretty-printer and formatter for Ruby.
438
472
 
439
- - [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) uses regexp_parser to generate examples of postal codes.
473
+ - [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that
474
+ uses regexp_parser to lint Regexps.
440
475
 
441
- - [js_regex](https://github.com/janosch-x/js_regex) converts Ruby regular expressions to JavaScript-compatible regular expressions.
476
+ - [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) is a localization helper
477
+ that uses regexp_parser to generate examples of postal codes.
442
478
 
443
479
 
444
480
  ## References
@@ -467,4 +503,4 @@ Documentation and books used while working on this project.
467
503
 
468
504
  ---
469
505
  ##### Copyright
470
- _Copyright (c) 2010-2019 Ammar Ali. See LICENSE file for details._
506
+ _Copyright (c) 2010-2023 Ammar Ali. See LICENSE file for details._
data/Rakefile CHANGED
@@ -1,87 +1,23 @@
1
+ require 'bundler'
1
2
  require 'rubygems'
2
-
3
+ require 'rubygems/package_task'
3
4
  require 'rake'
4
5
  require 'rake/testtask'
6
+ require 'rspec/core/rake_task'
5
7
 
6
- require 'bundler'
7
- require 'rubygems/package_task'
8
-
9
-
10
- RAGEL_SOURCE_DIR = File.expand_path '../lib/regexp_parser/scanner', __FILE__
11
- RAGEL_OUTPUT_DIR = File.expand_path '../lib/regexp_parser', __FILE__
12
- RAGEL_SOURCE_FILES = %w{scanner} # scanner.rl includes property.rl
13
-
8
+ Dir['tasks/**/*.rake'].each { |file| load(file) }
14
9
 
15
10
  Bundler::GemHelper.install_tasks
16
11
 
12
+ RSpec::Core::RakeTask.new(:spec)
17
13
 
18
14
  task :default => [:'test:full']
19
15
 
20
16
  namespace :test do
21
- task full: :'ragel:rb' do
22
- sh 'bin/test'
23
- end
17
+ task full: [:'ragel:rb', :spec]
24
18
  end
25
19
 
26
- namespace :ragel do
27
- desc "Process the ragel source files and output ruby code"
28
- task :rb do |t|
29
- RAGEL_SOURCE_FILES.each do |file|
30
- output_file = "#{RAGEL_OUTPUT_DIR}/#{file}.rb"
31
- # using faster flat table driven FSM, about 25% larger code, but about 30% faster
32
- sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{file}.rl -o #{output_file}"
33
-
34
- contents = File.read(output_file)
35
-
36
- File.open(output_file, 'r+') do |file|
37
- contents = "# -*- warn-indent:false; -*-\n" + contents
38
-
39
- file.write(contents)
40
- end
41
- end
42
- end
43
-
44
- desc "Delete the ragel generated source file(s)"
45
- task :clean do |t|
46
- RAGEL_SOURCE_FILES.each do |file|
47
- sh "rm -f #{RAGEL_OUTPUT_DIR}/#{file}.rb"
48
- end
49
- end
50
- end
51
-
52
-
53
20
  # Add ragel task as a prerequisite for building the gem to ensure that the
54
21
  # latest scanner code is generated and included in the build.
55
22
  desc "Runs ragel:rb before building the gem"
56
23
  task :build => ['ragel:rb']
57
-
58
-
59
- namespace :props do
60
- desc 'Write new property value hashes for the properties scanner'
61
- task :update do
62
- require 'regexp_property_values'
63
- RegexpPropertyValues.update
64
- dir = File.expand_path('../lib/regexp_parser/scanner/properties', __FILE__)
65
-
66
- require 'psych'
67
- write_hash_to_file = ->(hash, path) do
68
- File.open(path, 'w') do |f|
69
- f.puts '#',
70
- "# THIS FILE IS AUTO-GENERATED BY `rake props:update`, DO NOT EDIT",
71
- '#',
72
- hash.sort.to_h.to_yaml
73
- end
74
- puts "Wrote #{hash.count} aliases to `#{path}`"
75
- end
76
-
77
- long_names_to_tokens = RegexpPropertyValues.all.map do |val|
78
- [val.identifier, val.full_name.downcase]
79
- end
80
- write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.yml")
81
-
82
- short_names_to_tokens = RegexpPropertyValues.alias_hash.map do |k, v|
83
- [k.identifier, v.full_name.downcase]
84
- end
85
- write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.yml")
86
- end
87
- end
@@ -0,0 +1,4 @@
1
+ class Regexp::Parser
2
+ # base class for all gem-specific errors
3
+ class Error < StandardError; end
4
+ end
@@ -0,0 +1,76 @@
1
+ module Regexp::Expression
2
+ class Base
3
+ include Regexp::Expression::Shared
4
+
5
+ def initialize(token, options = {})
6
+ init_from_token_and_options(token, options)
7
+ end
8
+
9
+ def to_re(format = :full)
10
+ if set_level > 0
11
+ warn "Calling #to_re on character set members is deprecated - "\
12
+ "their behavior might not be equivalent outside of the set."
13
+ end
14
+ ::Regexp.new(to_s(format))
15
+ end
16
+
17
+ def quantify(*args)
18
+ self.quantifier = Quantifier.new(*args)
19
+ end
20
+
21
+ def unquantified_clone
22
+ clone.tap { |exp| exp.quantifier = nil }
23
+ end
24
+
25
+ # Deprecated. Prefer `#repetitions` which has a more uniform interface.
26
+ def quantity
27
+ return [nil,nil] unless quantified?
28
+ [quantifier.min, quantifier.max]
29
+ end
30
+
31
+ def repetitions
32
+ @repetitions ||=
33
+ if quantified?
34
+ min = quantifier.min
35
+ max = quantifier.max < 0 ? Float::INFINITY : quantifier.max
36
+ range = min..max
37
+ # fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807
38
+ if RUBY_VERSION.to_f < 2.7
39
+ range.define_singleton_method(:minmax) { [min, max] }
40
+ end
41
+ range
42
+ else
43
+ 1..1
44
+ end
45
+ end
46
+
47
+ def greedy?
48
+ quantified? and quantifier.greedy?
49
+ end
50
+
51
+ def reluctant?
52
+ quantified? and quantifier.reluctant?
53
+ end
54
+ alias :lazy? :reluctant?
55
+
56
+ def possessive?
57
+ quantified? and quantifier.possessive?
58
+ end
59
+
60
+ def to_h
61
+ {
62
+ type: type,
63
+ token: token,
64
+ text: to_s(:base),
65
+ starts_at: ts,
66
+ length: full_length,
67
+ level: level,
68
+ set_level: set_level,
69
+ conditional_level: conditional_level,
70
+ options: options,
71
+ quantifier: quantified? ? quantifier.to_h : nil,
72
+ }
73
+ end
74
+ alias :attributes :to_h
75
+ end
76
+ end
@@ -1,5 +1,5 @@
1
1
  module Regexp::Expression
2
- # A sequence of expressions, used by Alternation as one of its alternative.
2
+ # A sequence of expressions, used by Alternation as one of its alternatives.
3
3
  class Alternative < Regexp::Expression::Sequence; end
4
4
 
5
5
  class Alternation < Regexp::Expression::SequenceOperation
@@ -1,5 +1,4 @@
1
1
  module Regexp::Expression
2
-
3
2
  module Anchor
4
3
  class Base < Regexp::Expression::Base; end
5
4
 
@@ -22,5 +21,4 @@ module Regexp::Expression
22
21
  EOS = EndOfString
23
22
  EOSobEOL = EndOfStringOrBeforeEndOfLine
24
23
  end
25
-
26
24
  end
@@ -2,6 +2,23 @@ module Regexp::Expression
2
2
  module Backreference
3
3
  class Base < Regexp::Expression::Base
4
4
  attr_accessor :referenced_expression
5
+
6
+ def initialize_copy(orig)
7
+ exp_id = [self.class, self.starts_at]
8
+
9
+ # prevent infinite recursion for recursive subexp calls
10
+ copied = @@copied ||= {}
11
+ self.referenced_expression =
12
+ if copied[exp_id]
13
+ orig.referenced_expression
14
+ else
15
+ copied[exp_id] = true
16
+ orig.referenced_expression.dup
17
+ end
18
+ copied.clear
19
+
20
+ super
21
+ end
5
22
  end
6
23
 
7
24
  class Number < Backreference::Base
@@ -9,7 +26,7 @@ module Regexp::Expression
9
26
  alias reference number
10
27
 
11
28
  def initialize(token, options = {})
12
- @number = token.text[token.token.equal?(:number) ? 1..-1 : 3..-2].to_i
29
+ @number = token.text[/-?\d+/].to_i
13
30
  super
14
31
  end
15
32
  end
@@ -33,7 +50,7 @@ module Regexp::Expression
33
50
  class NameCall < Backreference::Name; end
34
51
  class NumberCallRelative < Backreference::NumberRelative; end
35
52
 
36
- class NumberRecursionLevel < Backreference::Number
53
+ class NumberRecursionLevel < Backreference::NumberRelative
37
54
  attr_reader :recursion_level
38
55
 
39
56
  def initialize(token, options = {})
@@ -52,4 +69,7 @@ module Regexp::Expression
52
69
  end
53
70
  end
54
71
  end
72
+
73
+ # alias for symmetry between token symbol and Expression class name
74
+ Backref = Backreference
55
75
  end