regexp_parser 1.7.0 → 2.8.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (165) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +364 -22
  3. data/Gemfile +8 -2
  4. data/LICENSE +1 -1
  5. data/README.md +124 -88
  6. data/Rakefile +6 -70
  7. data/lib/regexp_parser/error.rb +4 -0
  8. data/lib/regexp_parser/expression/base.rb +76 -0
  9. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  10. data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
  11. data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +22 -2
  12. data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +4 -8
  13. data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +3 -4
  14. data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
  15. data/lib/regexp_parser/expression/classes/conditional.rb +11 -5
  16. data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
  17. data/lib/regexp_parser/expression/classes/free_space.rb +5 -5
  18. data/lib/regexp_parser/expression/classes/group.rb +28 -15
  19. data/lib/regexp_parser/expression/classes/keep.rb +2 -0
  20. data/lib/regexp_parser/expression/classes/literal.rb +1 -5
  21. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
  22. data/lib/regexp_parser/expression/classes/root.rb +4 -19
  23. data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +5 -3
  24. data/lib/regexp_parser/expression/methods/construct.rb +41 -0
  25. data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
  26. data/lib/regexp_parser/expression/methods/match_length.rb +11 -7
  27. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  28. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  29. data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
  30. data/lib/regexp_parser/expression/methods/tests.rb +47 -1
  31. data/lib/regexp_parser/expression/methods/traverse.rb +34 -18
  32. data/lib/regexp_parser/expression/quantifier.rb +57 -17
  33. data/lib/regexp_parser/expression/sequence.rb +11 -47
  34. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  35. data/lib/regexp_parser/expression/shared.rb +111 -0
  36. data/lib/regexp_parser/expression/subexpression.rb +27 -19
  37. data/lib/regexp_parser/expression.rb +14 -141
  38. data/lib/regexp_parser/lexer.rb +83 -41
  39. data/lib/regexp_parser/parser.rb +371 -429
  40. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  41. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  42. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  43. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  44. data/lib/regexp_parser/scanner/properties/long.csv +633 -0
  45. data/lib/regexp_parser/scanner/properties/short.csv +248 -0
  46. data/lib/regexp_parser/scanner/property.rl +4 -4
  47. data/lib/regexp_parser/scanner/scanner.rl +295 -368
  48. data/lib/regexp_parser/scanner.rb +1405 -1674
  49. data/lib/regexp_parser/syntax/any.rb +2 -7
  50. data/lib/regexp_parser/syntax/base.rb +92 -67
  51. data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
  52. data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
  53. data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
  54. data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
  55. data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
  56. data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
  57. data/lib/regexp_parser/syntax/token/escape.rb +33 -0
  58. data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
  59. data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
  60. data/lib/regexp_parser/syntax/token/meta.rb +20 -0
  61. data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
  62. data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
  63. data/lib/regexp_parser/syntax/token/unicode_property.rb +733 -0
  64. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  65. data/lib/regexp_parser/syntax/token.rb +45 -0
  66. data/lib/regexp_parser/syntax/version_lookup.rb +19 -36
  67. data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
  68. data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
  69. data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
  70. data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
  71. data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
  72. data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
  73. data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
  74. data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
  75. data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
  76. data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
  77. data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
  78. data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
  79. data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
  80. data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
  81. data/lib/regexp_parser/syntax/versions.rb +3 -1
  82. data/lib/regexp_parser/syntax.rb +8 -6
  83. data/lib/regexp_parser/token.rb +9 -20
  84. data/lib/regexp_parser/version.rb +1 -1
  85. data/lib/regexp_parser.rb +0 -2
  86. data/regexp_parser.gemspec +20 -22
  87. metadata +49 -166
  88. data/lib/regexp_parser/scanner/properties/long.yml +0 -594
  89. data/lib/regexp_parser/scanner/properties/short.yml +0 -237
  90. data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
  91. data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
  92. data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
  93. data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
  94. data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
  95. data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
  96. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
  97. data/lib/regexp_parser/syntax/tokens.rb +0 -45
  98. data/spec/expression/base_spec.rb +0 -94
  99. data/spec/expression/clone_spec.rb +0 -120
  100. data/spec/expression/conditional_spec.rb +0 -89
  101. data/spec/expression/free_space_spec.rb +0 -27
  102. data/spec/expression/methods/match_length_spec.rb +0 -161
  103. data/spec/expression/methods/match_spec.rb +0 -25
  104. data/spec/expression/methods/strfregexp_spec.rb +0 -224
  105. data/spec/expression/methods/tests_spec.rb +0 -99
  106. data/spec/expression/methods/traverse_spec.rb +0 -161
  107. data/spec/expression/options_spec.rb +0 -128
  108. data/spec/expression/root_spec.rb +0 -9
  109. data/spec/expression/sequence_spec.rb +0 -9
  110. data/spec/expression/subexpression_spec.rb +0 -50
  111. data/spec/expression/to_h_spec.rb +0 -26
  112. data/spec/expression/to_s_spec.rb +0 -100
  113. data/spec/lexer/all_spec.rb +0 -22
  114. data/spec/lexer/conditionals_spec.rb +0 -53
  115. data/spec/lexer/escapes_spec.rb +0 -14
  116. data/spec/lexer/keep_spec.rb +0 -10
  117. data/spec/lexer/literals_spec.rb +0 -89
  118. data/spec/lexer/nesting_spec.rb +0 -99
  119. data/spec/lexer/refcalls_spec.rb +0 -55
  120. data/spec/parser/all_spec.rb +0 -43
  121. data/spec/parser/alternation_spec.rb +0 -88
  122. data/spec/parser/anchors_spec.rb +0 -17
  123. data/spec/parser/conditionals_spec.rb +0 -179
  124. data/spec/parser/errors_spec.rb +0 -30
  125. data/spec/parser/escapes_spec.rb +0 -121
  126. data/spec/parser/free_space_spec.rb +0 -130
  127. data/spec/parser/groups_spec.rb +0 -108
  128. data/spec/parser/keep_spec.rb +0 -6
  129. data/spec/parser/posix_classes_spec.rb +0 -8
  130. data/spec/parser/properties_spec.rb +0 -115
  131. data/spec/parser/quantifiers_spec.rb +0 -51
  132. data/spec/parser/refcalls_spec.rb +0 -112
  133. data/spec/parser/set/intersections_spec.rb +0 -127
  134. data/spec/parser/set/ranges_spec.rb +0 -111
  135. data/spec/parser/sets_spec.rb +0 -178
  136. data/spec/parser/types_spec.rb +0 -18
  137. data/spec/scanner/all_spec.rb +0 -18
  138. data/spec/scanner/anchors_spec.rb +0 -21
  139. data/spec/scanner/conditionals_spec.rb +0 -128
  140. data/spec/scanner/errors_spec.rb +0 -68
  141. data/spec/scanner/escapes_spec.rb +0 -53
  142. data/spec/scanner/free_space_spec.rb +0 -133
  143. data/spec/scanner/groups_spec.rb +0 -52
  144. data/spec/scanner/keep_spec.rb +0 -10
  145. data/spec/scanner/literals_spec.rb +0 -49
  146. data/spec/scanner/meta_spec.rb +0 -18
  147. data/spec/scanner/properties_spec.rb +0 -64
  148. data/spec/scanner/quantifiers_spec.rb +0 -20
  149. data/spec/scanner/refcalls_spec.rb +0 -36
  150. data/spec/scanner/sets_spec.rb +0 -102
  151. data/spec/scanner/types_spec.rb +0 -14
  152. data/spec/spec_helper.rb +0 -15
  153. data/spec/support/runner.rb +0 -42
  154. data/spec/support/shared_examples.rb +0 -77
  155. data/spec/support/warning_extractor.rb +0 -60
  156. data/spec/syntax/syntax_spec.rb +0 -48
  157. data/spec/syntax/syntax_token_map_spec.rb +0 -23
  158. data/spec/syntax/versions/1.8.6_spec.rb +0 -17
  159. data/spec/syntax/versions/1.9.1_spec.rb +0 -10
  160. data/spec/syntax/versions/1.9.3_spec.rb +0 -9
  161. data/spec/syntax/versions/2.0.0_spec.rb +0 -13
  162. data/spec/syntax/versions/2.2.0_spec.rb +0 -9
  163. data/spec/syntax/versions/aliases_spec.rb +0 -37
  164. data/spec/token/token_spec.rb +0 -85
  165. data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
data/README.md CHANGED
@@ -1,15 +1,18 @@
1
1
  # Regexp::Parser
2
2
 
3
- [![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser) [![Build Status](https://secure.travis-ci.org/ammar/regexp_parser.svg?branch=master)](http://travis-ci.org/ammar/regexp_parser) [![Code Climate](https://codeclimate.com/github/ammar/regexp_parser.svg)](https://codeclimate.com/github/ammar/regexp_parser/badges)
3
+ [![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser)
4
+ [![Build Status](https://github.com/ammar/regexp_parser/workflows/tests/badge.svg)](https://github.com/ammar/regexp_parser/actions)
5
+ [![Build Status](https://github.com/ammar/regexp_parser/workflows/gouteur/badge.svg)](https://github.com/ammar/regexp_parser/actions)
6
+ [![Code Climate](https://codeclimate.com/github/ammar/regexp_parser.svg)](https://codeclimate.com/github/ammar/regexp_parser/badges)
4
7
 
5
8
  A Ruby gem for tokenizing, parsing, and transforming regular expressions.
6
9
 
7
10
  * Multilayered
8
11
  * A scanner/tokenizer based on [Ragel](http://www.colm.net/open-source/ragel/)
9
- * A lexer that produces a "stream" of token objects.
10
- * A parser that produces a "tree" of Expression objects (OO API)
11
- * Runs on Ruby 1.9, 2.x, and JRuby (1.9 mode) runtimes.
12
- * Recognizes Ruby 1.8, 1.9, and 2.x regular expressions [See Supported Syntax](#supported-syntax)
12
+ * A lexer that produces a "stream" of [Token objects](https://github.com/ammar/regexp_parser/wiki/Token-Objects)
13
+ * A parser that produces a "tree" of [Expression objects (OO API)](https://github.com/ammar/regexp_parser/wiki/Expression-Objects)
14
+ * Runs on Ruby 2.x, 3.x and JRuby runtimes
15
+ * Recognizes Ruby 1.8, 1.9, 2.x and 3.x regular expressions [See Supported Syntax](#supported-syntax)
13
16
 
14
17
 
15
18
  _For examples of regexp_parser in use, see [Example Projects](#example-projects)._
@@ -18,13 +21,10 @@ _For examples of regexp_parser in use, see [Example Projects](#example-projects)
18
21
  ---
19
22
  ## Requirements
20
23
 
21
- * Ruby >= 1.9
24
+ * Ruby >= 2.0
22
25
  * Ragel >= 6.0, but only if you want to build the gem or work on the scanner.
23
26
 
24
27
 
25
- _Note: See the .travis.yml file for covered versions._
26
-
27
-
28
28
  ---
29
29
  ## Install
30
30
 
@@ -36,14 +36,15 @@ Or, add it to your project's `Gemfile`:
36
36
 
37
37
  ```gem 'regexp_parser', '~> X.Y.Z'```
38
38
 
39
- See rubygems for the the [latest version number](https://rubygems.org/gems/regexp_parser)
39
+ See the badge at the top of this README or [rubygems](https://rubygems.org/gems/regexp_parser)
40
+ for the latest version number.
40
41
 
41
42
 
42
43
  ---
43
44
  ## Usage
44
45
 
45
46
  The three main modules are **Scanner**, **Lexer**, and **Parser**. Each of them
46
- provides a single method that takes a regular expression (as a RegExp object or
47
+ provides a single method that takes a regular expression (as a Regexp object or
47
48
  a string) and returns its results. The **Lexer** and the **Parser** accept an
48
49
  optional second argument that specifies the syntax version, like 'ruby/2.0',
49
50
  which defaults to the host Ruby version (using RUBY_VERSION).
@@ -66,12 +67,23 @@ called with the results as follows:
66
67
  * **Scanner**: the block gets passed the results as they are scanned. See the
67
68
  example in the next section for details.
68
69
 
69
- * **Lexer**: after completion, the block gets passed the tokens one by one.
70
+ * **Lexer**: the block gets passed the tokens one by one as they are scanned.
70
71
  _The result of the block is returned._
71
72
 
72
73
  * **Parser**: after completion, the block gets passed the root expression.
73
74
  _The result of the block is returned._
74
75
 
76
+ All three methods accept either a `Regexp` or `String` (containing the pattern)
77
+ - if a String is passed, `options` can be supplied:
78
+
79
+ ```ruby
80
+ require 'regexp_parser'
81
+
82
+ Regexp::Parser.parse(
83
+ "a+ # Recognizes a and A...",
84
+ options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE
85
+ )
86
+ ```
75
87
 
76
88
  ---
77
89
  ## Components
@@ -90,7 +102,7 @@ start/end offsets for each token found.
90
102
  ```ruby
91
103
  require 'regexp_parser'
92
104
 
93
- Regexp::Scanner.scan /(ab?(cd)*[e-h]+)/ do |type, token, text, ts, te|
105
+ Regexp::Scanner.scan(/(ab?(cd)*[e-h]+)/) do |type, token, text, ts, te|
94
106
  puts "type: #{type}, token: #{token}, text: '#{text}' [#{ts}..#{te}]"
95
107
  end
96
108
 
@@ -113,8 +125,8 @@ A one-liner that uses map on the result of the scan to return the textual
113
125
  parts of the pattern:
114
126
 
115
127
  ```ruby
116
- Regexp::Scanner.scan( /(cat?([bhm]at)){3,5}/ ).map {|token| token[2]}
117
- #=> ["(", "cat", "?", "(", "[", "b", "h", "m", "]", "at", ")", ")", "{3,5}"]
128
+ Regexp::Scanner.scan(/(cat?([bhm]at)){3,5}/).map { |token| token[2] }
129
+ # => ["(", "cat", "?", "(", "[", "b", "h", "m", "]", "at", ")", ")", "{3,5}"]
118
130
  ```
119
131
 
120
132
 
@@ -136,11 +148,8 @@ Regexp::Scanner.scan( /(cat?([bhm]at)){3,5}/ ).map {|token| token[2]}
136
148
  to the lexer.
137
149
 
138
150
  * The MRI implementation may accept expressions that either conflict with
139
- the documentation or are undocumented. The scanner does not support such
140
- implementation quirks.
141
- _(See issues [#3](https://github.com/ammar/regexp_parser/issues/3) and
142
- [#15](https://github.com/ammar/regexp_parser/issues/15) for examples)_
143
-
151
+ the documentation or are undocumented, like `{}` and `]` _(unescaped)_.
152
+ The scanner will try to support as many of these cases as possible.
144
153
 
145
154
  ---
146
155
  ### Syntax
@@ -149,31 +158,41 @@ flavor). Syntax classes act as lookup tables, and are layered to create
149
158
  flavor variations. Syntax only comes into play in the lexer.
150
159
 
151
160
  #### Example
152
- The following instantiates syntax objects for Ruby 2.0, 1.9, 1.8, and
161
+ The following fetches syntax objects for Ruby 2.0, 1.9, 1.8, and
153
162
  checks a few of their implementation features.
154
163
 
155
164
  ```ruby
156
165
  require 'regexp_parser'
157
166
 
158
- ruby_20 = Regexp::Syntax.new 'ruby/2.0'
167
+ ruby_20 = Regexp::Syntax.for 'ruby/2.0'
159
168
  ruby_20.implements? :quantifier, :zero_or_one # => true
160
169
  ruby_20.implements? :quantifier, :zero_or_one_reluctant # => true
161
170
  ruby_20.implements? :quantifier, :zero_or_one_possessive # => true
162
171
  ruby_20.implements? :conditional, :condition # => true
163
172
 
164
- ruby_19 = Regexp::Syntax.new 'ruby/1.9'
173
+ ruby_19 = Regexp::Syntax.for 'ruby/1.9'
165
174
  ruby_19.implements? :quantifier, :zero_or_one # => true
166
175
  ruby_19.implements? :quantifier, :zero_or_one_reluctant # => true
167
176
  ruby_19.implements? :quantifier, :zero_or_one_possessive # => true
168
177
  ruby_19.implements? :conditional, :condition # => false
169
178
 
170
- ruby_18 = Regexp::Syntax.new 'ruby/1.8'
179
+ ruby_18 = Regexp::Syntax.for 'ruby/1.8'
171
180
  ruby_18.implements? :quantifier, :zero_or_one # => true
172
181
  ruby_18.implements? :quantifier, :zero_or_one_reluctant # => true
173
182
  ruby_18.implements? :quantifier, :zero_or_one_possessive # => false
174
183
  ruby_18.implements? :conditional, :condition # => false
175
184
  ```
176
185
 
186
+ Syntax objects can also be queried about their complete and relative feature sets.
187
+
188
+ ```ruby
189
+ require 'regexp_parser'
190
+
191
+ ruby_20 = Regexp::Syntax.for 'ruby/2.0' # => Regexp::Syntax::V2_0_0
192
+ ruby_20.added_features # => { conditional: [...], ... }
193
+ ruby_20.removed_features # => { property: [:newline], ... }
194
+ ruby_20.features # => { anchor: [...], ... }
195
+ ```
177
196
 
178
197
  #### Notes
179
198
  * Variations on a token, for example a named group with angle brackets (< and >)
@@ -202,7 +221,7 @@ syntax, and prints the token objects' text indented to their level.
202
221
  ```ruby
203
222
  require 'regexp_parser'
204
223
 
205
- Regexp::Lexer.lex /a?(b(c))*[d]+/, 'ruby/1.9' do |token|
224
+ Regexp::Lexer.lex(/a?(b(c))*[d]+/, 'ruby/1.9') do |token|
206
225
  puts "#{' ' * token.level}#{token.text}"
207
226
  end
208
227
 
@@ -228,8 +247,8 @@ how the sequence 'cat' is treated. The 't' is separated because it's followed
228
247
  by a quantifier that only applies to it.
229
248
 
230
249
  ```ruby
231
- Regexp::Lexer.scan( /(cat?([b]at)){3,5}/ ).map {|token| token.text}
232
- #=> ["(", "ca", "t", "?", "(", "[", "b", "]", "at", ")", ")", "{3,5}"]
250
+ Regexp::Lexer.scan(/(cat?([b]at)){3,5}/).map { |token| token.text }
251
+ # => ["(", "ca", "t", "?", "(", "[", "b", "]", "at", ")", ")", "{3,5}"]
233
252
  ```
234
253
 
235
254
  #### Notes
@@ -243,7 +262,7 @@ Regexp::Lexer.scan( /(cat?([b]at)){3,5}/ ).map {|token| token.text}
243
262
  ### Parser
244
263
  Sits on top of the lexer and transforms the "stream" of Token objects emitted
245
264
  by it into a tree of Expression objects represented by an instance of the
246
- Expression::Root class.
265
+ `Expression::Root` class.
247
266
 
248
267
  See the [Expression Objects](https://github.com/ammar/regexp_parser/wiki/Expression-Objects)
249
268
  wiki page for attributes and methods.
@@ -251,12 +270,40 @@ wiki page for attributes and methods.
251
270
 
252
271
  #### Example
253
272
 
273
+ This example uses the tree traversal method `#each_expression`
274
+ and the method `#strfregexp` to print each object in the tree.
275
+
276
+ ```ruby
277
+ include_root = true
278
+ indent_offset = include_root ? 1 : 0
279
+
280
+ tree.each_expression(include_root) do |exp|
281
+ puts exp.strfregexp("%>> %c", indent_offset)
282
+ end
283
+
284
+ # Output
285
+ # > Regexp::Expression::Root
286
+ # > Regexp::Expression::Literal
287
+ # > Regexp::Expression::Group::Capture
288
+ # > Regexp::Expression::Literal
289
+ # > Regexp::Expression::Group::Capture
290
+ # > Regexp::Expression::Literal
291
+ # > Regexp::Expression::Literal
292
+ # > Regexp::Expression::Group::Named
293
+ # > Regexp::Expression::CharacterSet
294
+ ```
295
+
296
+ _Note: quantifiers do not appear in the output because they are members of the
297
+ Expression class. See the next section for details._
298
+
299
+ Another example, using `#traverse` for a more fine-grained tree traversal:
300
+
254
301
  ```ruby
255
302
  require 'regexp_parser'
256
303
 
257
304
  regex = /a?(b+(c)d)*(?<name>[0-9]+)/
258
305
 
259
- tree = Regexp::Parser.parse( regex, 'ruby/2.1' )
306
+ tree = Regexp::Parser.parse(regex, 'ruby/2.1')
260
307
 
261
308
  tree.traverse do |event, exp|
262
309
  puts "#{event}: #{exp.type} `#{exp.to_s}`"
@@ -276,40 +323,15 @@ end
276
323
  # exit: group `(?<name>[0-9]+)`
277
324
  ```
278
325
 
279
- Another example, using each_expression and strfregexp to print the object tree.
280
326
  _See the traverse.rb and strfregexp.rb files under `lib/regexp_parser/expression/methods`
281
327
  for more information on these methods._
282
328
 
283
- ```ruby
284
- include_root = true
285
- indent_offset = include_root ? 1 : 0
286
-
287
- tree.each_expression(include_root) do |exp, level_index|
288
- puts exp.strfregexp("%>> %c", indent_offset)
289
- end
290
-
291
- # Output
292
- # > Regexp::Expression::Root
293
- # > Regexp::Expression::Literal
294
- # > Regexp::Expression::Group::Capture
295
- # > Regexp::Expression::Literal
296
- # > Regexp::Expression::Group::Capture
297
- # > Regexp::Expression::Literal
298
- # > Regexp::Expression::Literal
299
- # > Regexp::Expression::Group::Named
300
- # > Regexp::Expression::CharacterSet
301
- ```
302
-
303
- _Note: quantifiers do not appear in the output because they are members of the
304
- Expression class. See the next section for details._
305
-
306
-
307
329
  ---
308
330
 
309
331
 
310
332
  ## Supported Syntax
311
333
  The three modules support all the regular expression syntax features of Ruby 1.8,
312
- 1.9, and 2.x:
334
+ 1.9, 2.x and 3.x:
313
335
 
314
336
  _Note that not all of these are available in all versions of Ruby_
315
337
 
@@ -337,7 +359,7 @@ _Note that not all of these are available in all versions of Ruby_
337
359
  | &emsp;&emsp;_Nest Level_ | `\k<n-1>` | &#x2713; |
338
360
  | &emsp;&emsp;_Numbered_ | `\k<1>` | &#x2713; |
339
361
  | &emsp;&emsp;_Relative_ | `\k<-2>` | &#x2713; |
340
- | &emsp;&emsp;_Traditional_ | `\1` thru `\9` | &#x2713; |
362
+ | &emsp;&emsp;_Traditional_ | `\1` through `\9` | &#x2713; |
341
363
  | &emsp;&nbsp;_**Capturing**_ | `(abc)` | &#x2713; |
342
364
  | &emsp;&nbsp;_**Comments**_ | `(?# comment text)` | &#x2713; |
343
365
  | &emsp;&nbsp;_**Named**_ | `(?<name>abc)`, `(?'name'abc)` | &#x2713; |
@@ -349,15 +371,15 @@ _Note that not all of these are available in all versions of Ruby_
349
371
  | **POSIX Classes** | `[:alpha:]`, `[:^digit:]` | &#x2713; |
350
372
  | **Quantifiers** | | &#x22f1; |
351
373
  | &emsp;&nbsp;_**Greedy**_ | `?`, `*`, `+`, `{m,M}` | &#x2713; |
352
- | &emsp;&nbsp;_**Reluctant** (Lazy)_ | `??`, `*?`, `+?`, `{m,M}?` | &#x2713; |
353
- | &emsp;&nbsp;_**Possessive**_ | `?+`, `*+`, `++`, `{m,M}+` | &#x2713; |
374
+ | &emsp;&nbsp;_**Reluctant** (Lazy)_ | `??`, `*?`, `+?` \[1\] | &#x2713; |
375
+ | &emsp;&nbsp;_**Possessive**_ | `?+`, `*+`, `++` \[1\] | &#x2713; |
354
376
  | **String Escapes** | | &#x22f1; |
355
- | &emsp;&nbsp;_**Control**_ | `\C-C`, `\cD` | &#x2713; |
377
+ | &emsp;&nbsp;_**Control** \[2\]_ | `\C-C`, `\cD` | &#x2713; |
356
378
  | &emsp;&nbsp;_**Hex**_ | `\x20`, `\x{701230}` | &#x2713; |
357
- | &emsp;&nbsp;_**Meta**_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | &#x2713; |
379
+ | &emsp;&nbsp;_**Meta** \[2\]_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | &#x2713; |
358
380
  | &emsp;&nbsp;_**Octal**_ | `\0`, `\01`, `\012` | &#x2713; |
359
381
  | &emsp;&nbsp;_**Unicode**_ | `\uHHHH`, `\u{H+ H+}` | &#x2713; |
360
- | **Unicode Properties** | _<sub>([Unicode 11.0.0](http://www.unicode.org/versions/Unicode11.0.0/))</sub>_ | &#x22f1; |
382
+ | **Unicode Properties** | _<sub>([Unicode 13.0.0])</sub>_ | &#x22f1; |
361
383
  | &emsp;&nbsp;_**Age**_ | `\p{Age=5.2}`, `\P{age=7.0}`, `\p{^age=8.0}` | &#x2713; |
362
384
  | &emsp;&nbsp;_**Blocks**_ | `\p{InArmenian}`, `\P{InKhmer}`, `\p{^InThai}` | &#x2713; |
363
385
  | &emsp;&nbsp;_**Classes**_ | `\p{Alpha}`, `\P{Space}`, `\p{^Alnum}` | &#x2713; |
@@ -366,6 +388,18 @@ _Note that not all of these are available in all versions of Ruby_
366
388
  | &emsp;&nbsp;_**Scripts**_ | `\p{Arabic}`, `\P{Hiragana}`, `\p{^Greek}` | &#x2713; |
367
389
  | &emsp;&nbsp;_**Simple**_ | `\p{Dash}`, `\p{Extender}`, `\p{^Hyphen}` | &#x2713; |
368
390
 
391
+ [Unicode 13.0.0]: https://www.unicode.org/versions/Unicode13.0.0/
392
+
393
+ **\[1\]**: Ruby does not support lazy or possessive interval quantifiers.
394
+ Any `+` or `?` that follows an interval quantifier will be treated as another,
395
+ chained quantifier. See also [#3](https://github.com/ammar/regexp_parser/issues/3),
396
+ [#69](https://github.com/ammar/regexp_parser/pull/69).
397
+
398
+ **\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex
399
+ escapes when used in Regexp literals](https://github.com/ruby/ruby/commit/11ae581),
400
+ so they will only reach the scanner and will only be emitted if a String or a Regexp
401
+ that has been built with the `::new` constructor is scanned.
402
+
369
403
  ##### Inapplicable Features
370
404
 
371
405
  Some modifiers, like `o` and `s`, apply to the **Regexp** object itself and do not
@@ -379,40 +413,29 @@ expressions library (Onigmo). They are not supported by the scanner.
379
413
  - **Quotes**: `\Q...\E` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L499)_
380
414
  - **Capture History**: `(?@...)`, `(?@<name>...)` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L550)_
381
415
 
382
-
383
416
  See something missing? Please submit an [issue](https://github.com/ammar/regexp_parser/issues)
384
417
 
385
- _**Note**: Attempting to process expressions with unsupported syntax features can raise an error,
386
- or incorrectly return tokens/objects as literals._
418
+ _**Note**: Attempting to process expressions with unsupported syntax features can raise
419
+ an error, or incorrectly return tokens/objects as literals._
387
420
 
388
421
 
389
422
  ## Testing
390
- To run the tests simply run rake from the root directory, as 'test' is the default task.
391
-
392
- It generates the scanner's code from the Ragel source files and runs all the tests, thus it requires Ragel to be installed.
393
-
394
- The tests use RSpec. They can also be run with the test runner that whitelists some warnings:
423
+ To run the tests simply run rake from the root directory.
395
424
 
396
- ```
397
- bin/test
398
- ```
425
+ The default task generates the scanner's code from the Ragel source files and runs
426
+ all the specs, thus it requires Ragel to be installed.
399
427
 
400
- You can run a specific test like so:
428
+ Note that changes to Ragel files will not be reflected when running `rspec` on its own,
429
+ so to run individual tests you might want to run:
401
430
 
402
431
  ```
403
- bin/test spec/scanner/properties_spec.rb
404
- ```
405
-
406
- Note that changes to Ragel files will not be reflected when running `rspec` or `bin/test`, so you might want to run:
407
-
408
- ```
409
- rake ragel:rb && bin/test spec/scanner/properties_spec.rb
432
+ rake ragel:rb && rspec spec/scanner/properties_spec.rb
410
433
  ```
411
434
 
412
435
  ## Building
413
- Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/) to be
414
- installed. The build tasks will automatically invoke the 'ragel:rb' task to generate the
415
- Ruby scanner code.
436
+ Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/)
437
+ to be installed. The build tasks will automatically invoke the 'ragel:rb' task to generate
438
+ the Ruby scanner code.
416
439
 
417
440
 
418
441
  The project uses the standard rubygems package tasks, so:
@@ -432,13 +455,26 @@ rake install
432
455
  ## Example Projects
433
456
  Projects using regexp_parser.
434
457
 
435
- - [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor with alias support.
458
+ - [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool
459
+ that uses regexp_parser to convert Regexps to css/xpath selectors.
460
+
461
+ - [js_regex](https://github.com/jaynetics/js_regex) converts Ruby regular expressions
462
+ to JavaScript-compatible regular expressions.
463
+
464
+ - [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor
465
+ with alias support.
466
+
467
+ - [mutant](https://github.com/mbj/mutant) manipulates your regular expressions
468
+ (amongst others) to see if your tests cover their behavior.
436
469
 
437
- - [mutant](https://github.com/mbj/mutant) (before v0.9.0) manipulates your regular expressions (amongst others) to see if your tests cover their behavior.
470
+ - [repper](https://github.com/jaynetics/repper) is a regular expression
471
+ pretty-printer and formatter for Ruby.
438
472
 
439
- - [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) uses regexp_parser to generate examples of postal codes.
473
+ - [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that
474
+ uses regexp_parser to lint Regexps.
440
475
 
441
- - [js_regex](https://github.com/janosch-x/js_regex) converts Ruby regular expressions to JavaScript-compatible regular expressions.
476
+ - [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) is a localization helper
477
+ that uses regexp_parser to generate examples of postal codes.
442
478
 
443
479
 
444
480
  ## References
@@ -467,4 +503,4 @@ Documentation and books used while working on this project.
467
503
 
468
504
  ---
469
505
  ##### Copyright
470
- _Copyright (c) 2010-2019 Ammar Ali. See LICENSE file for details._
506
+ _Copyright (c) 2010-2023 Ammar Ali. See LICENSE file for details._
data/Rakefile CHANGED
@@ -1,87 +1,23 @@
1
+ require 'bundler'
1
2
  require 'rubygems'
2
-
3
+ require 'rubygems/package_task'
3
4
  require 'rake'
4
5
  require 'rake/testtask'
6
+ require 'rspec/core/rake_task'
5
7
 
6
- require 'bundler'
7
- require 'rubygems/package_task'
8
-
9
-
10
- RAGEL_SOURCE_DIR = File.expand_path '../lib/regexp_parser/scanner', __FILE__
11
- RAGEL_OUTPUT_DIR = File.expand_path '../lib/regexp_parser', __FILE__
12
- RAGEL_SOURCE_FILES = %w{scanner} # scanner.rl includes property.rl
13
-
8
+ Dir['tasks/**/*.rake'].each { |file| load(file) }
14
9
 
15
10
  Bundler::GemHelper.install_tasks
16
11
 
12
+ RSpec::Core::RakeTask.new(:spec)
17
13
 
18
14
  task :default => [:'test:full']
19
15
 
20
16
  namespace :test do
21
- task full: :'ragel:rb' do
22
- sh 'bin/test'
23
- end
17
+ task full: [:'ragel:rb', :spec]
24
18
  end
25
19
 
26
- namespace :ragel do
27
- desc "Process the ragel source files and output ruby code"
28
- task :rb do |t|
29
- RAGEL_SOURCE_FILES.each do |file|
30
- output_file = "#{RAGEL_OUTPUT_DIR}/#{file}.rb"
31
- # using faster flat table driven FSM, about 25% larger code, but about 30% faster
32
- sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{file}.rl -o #{output_file}"
33
-
34
- contents = File.read(output_file)
35
-
36
- File.open(output_file, 'r+') do |file|
37
- contents = "# -*- warn-indent:false; -*-\n" + contents
38
-
39
- file.write(contents)
40
- end
41
- end
42
- end
43
-
44
- desc "Delete the ragel generated source file(s)"
45
- task :clean do |t|
46
- RAGEL_SOURCE_FILES.each do |file|
47
- sh "rm -f #{RAGEL_OUTPUT_DIR}/#{file}.rb"
48
- end
49
- end
50
- end
51
-
52
-
53
20
  # Add ragel task as a prerequisite for building the gem to ensure that the
54
21
  # latest scanner code is generated and included in the build.
55
22
  desc "Runs ragel:rb before building the gem"
56
23
  task :build => ['ragel:rb']
57
-
58
-
59
- namespace :props do
60
- desc 'Write new property value hashes for the properties scanner'
61
- task :update do
62
- require 'regexp_property_values'
63
- RegexpPropertyValues.update
64
- dir = File.expand_path('../lib/regexp_parser/scanner/properties', __FILE__)
65
-
66
- require 'psych'
67
- write_hash_to_file = ->(hash, path) do
68
- File.open(path, 'w') do |f|
69
- f.puts '#',
70
- "# THIS FILE IS AUTO-GENERATED BY `rake props:update`, DO NOT EDIT",
71
- '#',
72
- hash.sort.to_h.to_yaml
73
- end
74
- puts "Wrote #{hash.count} aliases to `#{path}`"
75
- end
76
-
77
- long_names_to_tokens = RegexpPropertyValues.all.map do |val|
78
- [val.identifier, val.full_name.downcase]
79
- end
80
- write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.yml")
81
-
82
- short_names_to_tokens = RegexpPropertyValues.alias_hash.map do |k, v|
83
- [k.identifier, v.full_name.downcase]
84
- end
85
- write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.yml")
86
- end
87
- end
@@ -0,0 +1,4 @@
1
+ class Regexp::Parser
2
+ # base class for all gem-specific errors
3
+ class Error < StandardError; end
4
+ end
@@ -0,0 +1,76 @@
1
+ module Regexp::Expression
2
+ class Base
3
+ include Regexp::Expression::Shared
4
+
5
+ def initialize(token, options = {})
6
+ init_from_token_and_options(token, options)
7
+ end
8
+
9
+ def to_re(format = :full)
10
+ if set_level > 0
11
+ warn "Calling #to_re on character set members is deprecated - "\
12
+ "their behavior might not be equivalent outside of the set."
13
+ end
14
+ ::Regexp.new(to_s(format))
15
+ end
16
+
17
+ def quantify(*args)
18
+ self.quantifier = Quantifier.new(*args)
19
+ end
20
+
21
+ def unquantified_clone
22
+ clone.tap { |exp| exp.quantifier = nil }
23
+ end
24
+
25
+ # Deprecated. Prefer `#repetitions` which has a more uniform interface.
26
+ def quantity
27
+ return [nil,nil] unless quantified?
28
+ [quantifier.min, quantifier.max]
29
+ end
30
+
31
+ def repetitions
32
+ @repetitions ||=
33
+ if quantified?
34
+ min = quantifier.min
35
+ max = quantifier.max < 0 ? Float::INFINITY : quantifier.max
36
+ range = min..max
37
+ # fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807
38
+ if RUBY_VERSION.to_f < 2.7
39
+ range.define_singleton_method(:minmax) { [min, max] }
40
+ end
41
+ range
42
+ else
43
+ 1..1
44
+ end
45
+ end
46
+
47
+ def greedy?
48
+ quantified? and quantifier.greedy?
49
+ end
50
+
51
+ def reluctant?
52
+ quantified? and quantifier.reluctant?
53
+ end
54
+ alias :lazy? :reluctant?
55
+
56
+ def possessive?
57
+ quantified? and quantifier.possessive?
58
+ end
59
+
60
+ def to_h
61
+ {
62
+ type: type,
63
+ token: token,
64
+ text: to_s(:base),
65
+ starts_at: ts,
66
+ length: full_length,
67
+ level: level,
68
+ set_level: set_level,
69
+ conditional_level: conditional_level,
70
+ options: options,
71
+ quantifier: quantified? ? quantifier.to_h : nil,
72
+ }
73
+ end
74
+ alias :attributes :to_h
75
+ end
76
+ end
@@ -1,5 +1,5 @@
1
1
  module Regexp::Expression
2
- # A sequence of expressions, used by Alternation as one of its alternative.
2
+ # A sequence of expressions, used by Alternation as one of its alternatives.
3
3
  class Alternative < Regexp::Expression::Sequence; end
4
4
 
5
5
  class Alternation < Regexp::Expression::SequenceOperation
@@ -1,5 +1,4 @@
1
1
  module Regexp::Expression
2
-
3
2
  module Anchor
4
3
  class Base < Regexp::Expression::Base; end
5
4
 
@@ -22,5 +21,4 @@ module Regexp::Expression
22
21
  EOS = EndOfString
23
22
  EOSobEOL = EndOfStringOrBeforeEndOfLine
24
23
  end
25
-
26
24
  end
@@ -2,6 +2,23 @@ module Regexp::Expression
2
2
  module Backreference
3
3
  class Base < Regexp::Expression::Base
4
4
  attr_accessor :referenced_expression
5
+
6
+ def initialize_copy(orig)
7
+ exp_id = [self.class, self.starts_at]
8
+
9
+ # prevent infinite recursion for recursive subexp calls
10
+ copied = @@copied ||= {}
11
+ self.referenced_expression =
12
+ if copied[exp_id]
13
+ orig.referenced_expression
14
+ else
15
+ copied[exp_id] = true
16
+ orig.referenced_expression.dup
17
+ end
18
+ copied.clear
19
+
20
+ super
21
+ end
5
22
  end
6
23
 
7
24
  class Number < Backreference::Base
@@ -9,7 +26,7 @@ module Regexp::Expression
9
26
  alias reference number
10
27
 
11
28
  def initialize(token, options = {})
12
- @number = token.text[token.token.equal?(:number) ? 1..-1 : 3..-2].to_i
29
+ @number = token.text[/-?\d+/].to_i
13
30
  super
14
31
  end
15
32
  end
@@ -33,7 +50,7 @@ module Regexp::Expression
33
50
  class NameCall < Backreference::Name; end
34
51
  class NumberCallRelative < Backreference::NumberRelative; end
35
52
 
36
- class NumberRecursionLevel < Backreference::Number
53
+ class NumberRecursionLevel < Backreference::NumberRelative
37
54
  attr_reader :recursion_level
38
55
 
39
56
  def initialize(token, options = {})
@@ -52,4 +69,7 @@ module Regexp::Expression
52
69
  end
53
70
  end
54
71
  end
72
+
73
+ # alias for symmetry between token symbol and Expression class name
74
+ Backref = Backreference
55
75
  end