regexp_parser 1.7.0 → 2.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +364 -22
- data/Gemfile +8 -2
- data/LICENSE +1 -1
- data/README.md +124 -88
- data/Rakefile +6 -70
- data/lib/regexp_parser/error.rb +4 -0
- data/lib/regexp_parser/expression/base.rb +76 -0
- data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
- data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
- data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +22 -2
- data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +4 -8
- data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +3 -4
- data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
- data/lib/regexp_parser/expression/classes/conditional.rb +11 -5
- data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
- data/lib/regexp_parser/expression/classes/free_space.rb +5 -5
- data/lib/regexp_parser/expression/classes/group.rb +28 -15
- data/lib/regexp_parser/expression/classes/keep.rb +2 -0
- data/lib/regexp_parser/expression/classes/literal.rb +1 -5
- data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
- data/lib/regexp_parser/expression/classes/root.rb +4 -19
- data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +5 -3
- data/lib/regexp_parser/expression/methods/construct.rb +41 -0
- data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +11 -7
- data/lib/regexp_parser/expression/methods/parts.rb +23 -0
- data/lib/regexp_parser/expression/methods/printing.rb +26 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
- data/lib/regexp_parser/expression/methods/tests.rb +47 -1
- data/lib/regexp_parser/expression/methods/traverse.rb +34 -18
- data/lib/regexp_parser/expression/quantifier.rb +57 -17
- data/lib/regexp_parser/expression/sequence.rb +11 -47
- data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
- data/lib/regexp_parser/expression/shared.rb +111 -0
- data/lib/regexp_parser/expression/subexpression.rb +27 -19
- data/lib/regexp_parser/expression.rb +14 -141
- data/lib/regexp_parser/lexer.rb +83 -41
- data/lib/regexp_parser/parser.rb +371 -429
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
- data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
- data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
- data/lib/regexp_parser/scanner/properties/long.csv +633 -0
- data/lib/regexp_parser/scanner/properties/short.csv +248 -0
- data/lib/regexp_parser/scanner/property.rl +4 -4
- data/lib/regexp_parser/scanner/scanner.rl +295 -368
- data/lib/regexp_parser/scanner.rb +1405 -1674
- data/lib/regexp_parser/syntax/any.rb +2 -7
- data/lib/regexp_parser/syntax/base.rb +92 -67
- data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
- data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
- data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
- data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
- data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
- data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
- data/lib/regexp_parser/syntax/token/escape.rb +33 -0
- data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
- data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
- data/lib/regexp_parser/syntax/token/meta.rb +20 -0
- data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
- data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
- data/lib/regexp_parser/syntax/token/unicode_property.rb +733 -0
- data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
- data/lib/regexp_parser/syntax/token.rb +45 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +19 -36
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
- data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
- data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
- data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions.rb +3 -1
- data/lib/regexp_parser/syntax.rb +8 -6
- data/lib/regexp_parser/token.rb +9 -20
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +0 -2
- data/regexp_parser.gemspec +20 -22
- metadata +49 -166
- data/lib/regexp_parser/scanner/properties/long.yml +0 -594
- data/lib/regexp_parser/scanner/properties/short.yml +0 -237
- data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
- data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
- data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
- data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
- data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
- data/lib/regexp_parser/syntax/tokens.rb +0 -45
- data/spec/expression/base_spec.rb +0 -94
- data/spec/expression/clone_spec.rb +0 -120
- data/spec/expression/conditional_spec.rb +0 -89
- data/spec/expression/free_space_spec.rb +0 -27
- data/spec/expression/methods/match_length_spec.rb +0 -161
- data/spec/expression/methods/match_spec.rb +0 -25
- data/spec/expression/methods/strfregexp_spec.rb +0 -224
- data/spec/expression/methods/tests_spec.rb +0 -99
- data/spec/expression/methods/traverse_spec.rb +0 -161
- data/spec/expression/options_spec.rb +0 -128
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
- data/spec/expression/subexpression_spec.rb +0 -50
- data/spec/expression/to_h_spec.rb +0 -26
- data/spec/expression/to_s_spec.rb +0 -100
- data/spec/lexer/all_spec.rb +0 -22
- data/spec/lexer/conditionals_spec.rb +0 -53
- data/spec/lexer/escapes_spec.rb +0 -14
- data/spec/lexer/keep_spec.rb +0 -10
- data/spec/lexer/literals_spec.rb +0 -89
- data/spec/lexer/nesting_spec.rb +0 -99
- data/spec/lexer/refcalls_spec.rb +0 -55
- data/spec/parser/all_spec.rb +0 -43
- data/spec/parser/alternation_spec.rb +0 -88
- data/spec/parser/anchors_spec.rb +0 -17
- data/spec/parser/conditionals_spec.rb +0 -179
- data/spec/parser/errors_spec.rb +0 -30
- data/spec/parser/escapes_spec.rb +0 -121
- data/spec/parser/free_space_spec.rb +0 -130
- data/spec/parser/groups_spec.rb +0 -108
- data/spec/parser/keep_spec.rb +0 -6
- data/spec/parser/posix_classes_spec.rb +0 -8
- data/spec/parser/properties_spec.rb +0 -115
- data/spec/parser/quantifiers_spec.rb +0 -51
- data/spec/parser/refcalls_spec.rb +0 -112
- data/spec/parser/set/intersections_spec.rb +0 -127
- data/spec/parser/set/ranges_spec.rb +0 -111
- data/spec/parser/sets_spec.rb +0 -178
- data/spec/parser/types_spec.rb +0 -18
- data/spec/scanner/all_spec.rb +0 -18
- data/spec/scanner/anchors_spec.rb +0 -21
- data/spec/scanner/conditionals_spec.rb +0 -128
- data/spec/scanner/errors_spec.rb +0 -68
- data/spec/scanner/escapes_spec.rb +0 -53
- data/spec/scanner/free_space_spec.rb +0 -133
- data/spec/scanner/groups_spec.rb +0 -52
- data/spec/scanner/keep_spec.rb +0 -10
- data/spec/scanner/literals_spec.rb +0 -49
- data/spec/scanner/meta_spec.rb +0 -18
- data/spec/scanner/properties_spec.rb +0 -64
- data/spec/scanner/quantifiers_spec.rb +0 -20
- data/spec/scanner/refcalls_spec.rb +0 -36
- data/spec/scanner/sets_spec.rb +0 -102
- data/spec/scanner/types_spec.rb +0 -14
- data/spec/spec_helper.rb +0 -15
- data/spec/support/runner.rb +0 -42
- data/spec/support/shared_examples.rb +0 -77
- data/spec/support/warning_extractor.rb +0 -60
- data/spec/syntax/syntax_spec.rb +0 -48
- data/spec/syntax/syntax_token_map_spec.rb +0 -23
- data/spec/syntax/versions/1.8.6_spec.rb +0 -17
- data/spec/syntax/versions/1.9.1_spec.rb +0 -10
- data/spec/syntax/versions/1.9.3_spec.rb +0 -9
- data/spec/syntax/versions/2.0.0_spec.rb +0 -13
- data/spec/syntax/versions/2.2.0_spec.rb +0 -9
- data/spec/syntax/versions/aliases_spec.rb +0 -37
- data/spec/token/token_spec.rb +0 -85
- /data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
data/README.md
CHANGED
@@ -1,15 +1,18 @@
|
|
1
1
|
# Regexp::Parser
|
2
2
|
|
3
|
-
[](http://badge.fury.io/rb/regexp_parser)
|
3
|
+
[](http://badge.fury.io/rb/regexp_parser)
|
4
|
+
[](https://github.com/ammar/regexp_parser/actions)
|
5
|
+
[](https://github.com/ammar/regexp_parser/actions)
|
6
|
+
[](https://codeclimate.com/github/ammar/regexp_parser/badges)
|
4
7
|
|
5
8
|
A Ruby gem for tokenizing, parsing, and transforming regular expressions.
|
6
9
|
|
7
10
|
* Multilayered
|
8
11
|
* A scanner/tokenizer based on [Ragel](http://www.colm.net/open-source/ragel/)
|
9
|
-
* A lexer that produces a "stream" of
|
10
|
-
* A parser that produces a "tree" of Expression objects (OO API)
|
11
|
-
* Runs on Ruby
|
12
|
-
* Recognizes Ruby 1.8, 1.9, and
|
12
|
+
* A lexer that produces a "stream" of [Token objects](https://github.com/ammar/regexp_parser/wiki/Token-Objects)
|
13
|
+
* A parser that produces a "tree" of [Expression objects (OO API)](https://github.com/ammar/regexp_parser/wiki/Expression-Objects)
|
14
|
+
* Runs on Ruby 2.x, 3.x and JRuby runtimes
|
15
|
+
* Recognizes Ruby 1.8, 1.9, 2.x and 3.x regular expressions [See Supported Syntax](#supported-syntax)
|
13
16
|
|
14
17
|
|
15
18
|
_For examples of regexp_parser in use, see [Example Projects](#example-projects)._
|
@@ -18,13 +21,10 @@ _For examples of regexp_parser in use, see [Example Projects](#example-projects)
|
|
18
21
|
---
|
19
22
|
## Requirements
|
20
23
|
|
21
|
-
* Ruby >=
|
24
|
+
* Ruby >= 2.0
|
22
25
|
* Ragel >= 6.0, but only if you want to build the gem or work on the scanner.
|
23
26
|
|
24
27
|
|
25
|
-
_Note: See the .travis.yml file for covered versions._
|
26
|
-
|
27
|
-
|
28
28
|
---
|
29
29
|
## Install
|
30
30
|
|
@@ -36,14 +36,15 @@ Or, add it to your project's `Gemfile`:
|
|
36
36
|
|
37
37
|
```gem 'regexp_parser', '~> X.Y.Z'```
|
38
38
|
|
39
|
-
See
|
39
|
+
See the badge at the top of this README or [rubygems](https://rubygems.org/gems/regexp_parser)
|
40
|
+
for the latest version number.
|
40
41
|
|
41
42
|
|
42
43
|
---
|
43
44
|
## Usage
|
44
45
|
|
45
46
|
The three main modules are **Scanner**, **Lexer**, and **Parser**. Each of them
|
46
|
-
provides a single method that takes a regular expression (as a
|
47
|
+
provides a single method that takes a regular expression (as a Regexp object or
|
47
48
|
a string) and returns its results. The **Lexer** and the **Parser** accept an
|
48
49
|
optional second argument that specifies the syntax version, like 'ruby/2.0',
|
49
50
|
which defaults to the host Ruby version (using RUBY_VERSION).
|
@@ -66,12 +67,23 @@ called with the results as follows:
|
|
66
67
|
* **Scanner**: the block gets passed the results as they are scanned. See the
|
67
68
|
example in the next section for details.
|
68
69
|
|
69
|
-
* **Lexer**:
|
70
|
+
* **Lexer**: the block gets passed the tokens one by one as they are scanned.
|
70
71
|
_The result of the block is returned._
|
71
72
|
|
72
73
|
* **Parser**: after completion, the block gets passed the root expression.
|
73
74
|
_The result of the block is returned._
|
74
75
|
|
76
|
+
All three methods accept either a `Regexp` or `String` (containing the pattern)
|
77
|
+
- if a String is passed, `options` can be supplied:
|
78
|
+
|
79
|
+
```ruby
|
80
|
+
require 'regexp_parser'
|
81
|
+
|
82
|
+
Regexp::Parser.parse(
|
83
|
+
"a+ # Recognizes a and A...",
|
84
|
+
options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE
|
85
|
+
)
|
86
|
+
```
|
75
87
|
|
76
88
|
---
|
77
89
|
## Components
|
@@ -90,7 +102,7 @@ start/end offsets for each token found.
|
|
90
102
|
```ruby
|
91
103
|
require 'regexp_parser'
|
92
104
|
|
93
|
-
Regexp::Scanner.scan
|
105
|
+
Regexp::Scanner.scan(/(ab?(cd)*[e-h]+)/) do |type, token, text, ts, te|
|
94
106
|
puts "type: #{type}, token: #{token}, text: '#{text}' [#{ts}..#{te}]"
|
95
107
|
end
|
96
108
|
|
@@ -113,8 +125,8 @@ A one-liner that uses map on the result of the scan to return the textual
|
|
113
125
|
parts of the pattern:
|
114
126
|
|
115
127
|
```ruby
|
116
|
-
Regexp::Scanner.scan(
|
117
|
-
|
128
|
+
Regexp::Scanner.scan(/(cat?([bhm]at)){3,5}/).map { |token| token[2] }
|
129
|
+
# => ["(", "cat", "?", "(", "[", "b", "h", "m", "]", "at", ")", ")", "{3,5}"]
|
118
130
|
```
|
119
131
|
|
120
132
|
|
@@ -136,11 +148,8 @@ Regexp::Scanner.scan( /(cat?([bhm]at)){3,5}/ ).map {|token| token[2]}
|
|
136
148
|
to the lexer.
|
137
149
|
|
138
150
|
* The MRI implementation may accept expressions that either conflict with
|
139
|
-
the documentation or are undocumented
|
140
|
-
|
141
|
-
_(See issues [#3](https://github.com/ammar/regexp_parser/issues/3) and
|
142
|
-
[#15](https://github.com/ammar/regexp_parser/issues/15) for examples)_
|
143
|
-
|
151
|
+
the documentation or are undocumented, like `{}` and `]` _(unescaped)_.
|
152
|
+
The scanner will try to support as many of these cases as possible.
|
144
153
|
|
145
154
|
---
|
146
155
|
### Syntax
|
@@ -149,31 +158,41 @@ flavor). Syntax classes act as lookup tables, and are layered to create
|
|
149
158
|
flavor variations. Syntax only comes into play in the lexer.
|
150
159
|
|
151
160
|
#### Example
|
152
|
-
The following
|
161
|
+
The following fetches syntax objects for Ruby 2.0, 1.9, 1.8, and
|
153
162
|
checks a few of their implementation features.
|
154
163
|
|
155
164
|
```ruby
|
156
165
|
require 'regexp_parser'
|
157
166
|
|
158
|
-
ruby_20 = Regexp::Syntax.
|
167
|
+
ruby_20 = Regexp::Syntax.for 'ruby/2.0'
|
159
168
|
ruby_20.implements? :quantifier, :zero_or_one # => true
|
160
169
|
ruby_20.implements? :quantifier, :zero_or_one_reluctant # => true
|
161
170
|
ruby_20.implements? :quantifier, :zero_or_one_possessive # => true
|
162
171
|
ruby_20.implements? :conditional, :condition # => true
|
163
172
|
|
164
|
-
ruby_19 = Regexp::Syntax.
|
173
|
+
ruby_19 = Regexp::Syntax.for 'ruby/1.9'
|
165
174
|
ruby_19.implements? :quantifier, :zero_or_one # => true
|
166
175
|
ruby_19.implements? :quantifier, :zero_or_one_reluctant # => true
|
167
176
|
ruby_19.implements? :quantifier, :zero_or_one_possessive # => true
|
168
177
|
ruby_19.implements? :conditional, :condition # => false
|
169
178
|
|
170
|
-
ruby_18 = Regexp::Syntax.
|
179
|
+
ruby_18 = Regexp::Syntax.for 'ruby/1.8'
|
171
180
|
ruby_18.implements? :quantifier, :zero_or_one # => true
|
172
181
|
ruby_18.implements? :quantifier, :zero_or_one_reluctant # => true
|
173
182
|
ruby_18.implements? :quantifier, :zero_or_one_possessive # => false
|
174
183
|
ruby_18.implements? :conditional, :condition # => false
|
175
184
|
```
|
176
185
|
|
186
|
+
Syntax objects can also be queried about their complete and relative feature sets.
|
187
|
+
|
188
|
+
```ruby
|
189
|
+
require 'regexp_parser'
|
190
|
+
|
191
|
+
ruby_20 = Regexp::Syntax.for 'ruby/2.0' # => Regexp::Syntax::V2_0_0
|
192
|
+
ruby_20.added_features # => { conditional: [...], ... }
|
193
|
+
ruby_20.removed_features # => { property: [:newline], ... }
|
194
|
+
ruby_20.features # => { anchor: [...], ... }
|
195
|
+
```
|
177
196
|
|
178
197
|
#### Notes
|
179
198
|
* Variations on a token, for example a named group with angle brackets (< and >)
|
@@ -202,7 +221,7 @@ syntax, and prints the token objects' text indented to their level.
|
|
202
221
|
```ruby
|
203
222
|
require 'regexp_parser'
|
204
223
|
|
205
|
-
Regexp::Lexer.lex
|
224
|
+
Regexp::Lexer.lex(/a?(b(c))*[d]+/, 'ruby/1.9') do |token|
|
206
225
|
puts "#{' ' * token.level}#{token.text}"
|
207
226
|
end
|
208
227
|
|
@@ -228,8 +247,8 @@ how the sequence 'cat' is treated. The 't' is separated because it's followed
|
|
228
247
|
by a quantifier that only applies to it.
|
229
248
|
|
230
249
|
```ruby
|
231
|
-
Regexp::Lexer.scan(
|
232
|
-
|
250
|
+
Regexp::Lexer.scan(/(cat?([b]at)){3,5}/).map { |token| token.text }
|
251
|
+
# => ["(", "ca", "t", "?", "(", "[", "b", "]", "at", ")", ")", "{3,5}"]
|
233
252
|
```
|
234
253
|
|
235
254
|
#### Notes
|
@@ -243,7 +262,7 @@ Regexp::Lexer.scan( /(cat?([b]at)){3,5}/ ).map {|token| token.text}
|
|
243
262
|
### Parser
|
244
263
|
Sits on top of the lexer and transforms the "stream" of Token objects emitted
|
245
264
|
by it into a tree of Expression objects represented by an instance of the
|
246
|
-
Expression::Root class.
|
265
|
+
`Expression::Root` class.
|
247
266
|
|
248
267
|
See the [Expression Objects](https://github.com/ammar/regexp_parser/wiki/Expression-Objects)
|
249
268
|
wiki page for attributes and methods.
|
@@ -251,12 +270,40 @@ wiki page for attributes and methods.
|
|
251
270
|
|
252
271
|
#### Example
|
253
272
|
|
273
|
+
This example uses the tree traversal method `#each_expression`
|
274
|
+
and the method `#strfregexp` to print each object in the tree.
|
275
|
+
|
276
|
+
```ruby
|
277
|
+
include_root = true
|
278
|
+
indent_offset = include_root ? 1 : 0
|
279
|
+
|
280
|
+
tree.each_expression(include_root) do |exp|
|
281
|
+
puts exp.strfregexp("%>> %c", indent_offset)
|
282
|
+
end
|
283
|
+
|
284
|
+
# Output
|
285
|
+
# > Regexp::Expression::Root
|
286
|
+
# > Regexp::Expression::Literal
|
287
|
+
# > Regexp::Expression::Group::Capture
|
288
|
+
# > Regexp::Expression::Literal
|
289
|
+
# > Regexp::Expression::Group::Capture
|
290
|
+
# > Regexp::Expression::Literal
|
291
|
+
# > Regexp::Expression::Literal
|
292
|
+
# > Regexp::Expression::Group::Named
|
293
|
+
# > Regexp::Expression::CharacterSet
|
294
|
+
```
|
295
|
+
|
296
|
+
_Note: quantifiers do not appear in the output because they are members of the
|
297
|
+
Expression class. See the next section for details._
|
298
|
+
|
299
|
+
Another example, using `#traverse` for a more fine-grained tree traversal:
|
300
|
+
|
254
301
|
```ruby
|
255
302
|
require 'regexp_parser'
|
256
303
|
|
257
304
|
regex = /a?(b+(c)d)*(?<name>[0-9]+)/
|
258
305
|
|
259
|
-
tree = Regexp::Parser.parse(
|
306
|
+
tree = Regexp::Parser.parse(regex, 'ruby/2.1')
|
260
307
|
|
261
308
|
tree.traverse do |event, exp|
|
262
309
|
puts "#{event}: #{exp.type} `#{exp.to_s}`"
|
@@ -276,40 +323,15 @@ end
|
|
276
323
|
# exit: group `(?<name>[0-9]+)`
|
277
324
|
```
|
278
325
|
|
279
|
-
Another example, using each_expression and strfregexp to print the object tree.
|
280
326
|
_See the traverse.rb and strfregexp.rb files under `lib/regexp_parser/expression/methods`
|
281
327
|
for more information on these methods._
|
282
328
|
|
283
|
-
```ruby
|
284
|
-
include_root = true
|
285
|
-
indent_offset = include_root ? 1 : 0
|
286
|
-
|
287
|
-
tree.each_expression(include_root) do |exp, level_index|
|
288
|
-
puts exp.strfregexp("%>> %c", indent_offset)
|
289
|
-
end
|
290
|
-
|
291
|
-
# Output
|
292
|
-
# > Regexp::Expression::Root
|
293
|
-
# > Regexp::Expression::Literal
|
294
|
-
# > Regexp::Expression::Group::Capture
|
295
|
-
# > Regexp::Expression::Literal
|
296
|
-
# > Regexp::Expression::Group::Capture
|
297
|
-
# > Regexp::Expression::Literal
|
298
|
-
# > Regexp::Expression::Literal
|
299
|
-
# > Regexp::Expression::Group::Named
|
300
|
-
# > Regexp::Expression::CharacterSet
|
301
|
-
```
|
302
|
-
|
303
|
-
_Note: quantifiers do not appear in the output because they are members of the
|
304
|
-
Expression class. See the next section for details._
|
305
|
-
|
306
|
-
|
307
329
|
---
|
308
330
|
|
309
331
|
|
310
332
|
## Supported Syntax
|
311
333
|
The three modules support all the regular expression syntax features of Ruby 1.8,
|
312
|
-
1.9, and
|
334
|
+
1.9, 2.x and 3.x:
|
313
335
|
|
314
336
|
_Note that not all of these are available in all versions of Ruby_
|
315
337
|
|
@@ -337,7 +359,7 @@ _Note that not all of these are available in all versions of Ruby_
|
|
337
359
|
|   _Nest Level_ | `\k<n-1>` | ✓ |
|
338
360
|
|   _Numbered_ | `\k<1>` | ✓ |
|
339
361
|
|   _Relative_ | `\k<-2>` | ✓ |
|
340
|
-
|   _Traditional_ | `\1`
|
362
|
+
|   _Traditional_ | `\1` through `\9` | ✓ |
|
341
363
|
|   _**Capturing**_ | `(abc)` | ✓ |
|
342
364
|
|   _**Comments**_ | `(?# comment text)` | ✓ |
|
343
365
|
|   _**Named**_ | `(?<name>abc)`, `(?'name'abc)` | ✓ |
|
@@ -349,15 +371,15 @@ _Note that not all of these are available in all versions of Ruby_
|
|
349
371
|
| **POSIX Classes** | `[:alpha:]`, `[:^digit:]` | ✓ |
|
350
372
|
| **Quantifiers** | | ⋱ |
|
351
373
|
|   _**Greedy**_ | `?`, `*`, `+`, `{m,M}` | ✓ |
|
352
|
-
|   _**Reluctant** (Lazy)_ | `??`, `*?`,
|
353
|
-
|   _**Possessive**_ | `?+`, `*+`,
|
374
|
+
|   _**Reluctant** (Lazy)_ | `??`, `*?`, `+?` \[1\] | ✓ |
|
375
|
+
|   _**Possessive**_ | `?+`, `*+`, `++` \[1\] | ✓ |
|
354
376
|
| **String Escapes** | | ⋱ |
|
355
|
-
|   _**Control**_
|
377
|
+
|   _**Control** \[2\]_ | `\C-C`, `\cD` | ✓ |
|
356
378
|
|   _**Hex**_ | `\x20`, `\x{701230}` | ✓ |
|
357
|
-
|   _**Meta**_
|
379
|
+
|   _**Meta** \[2\]_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | ✓ |
|
358
380
|
|   _**Octal**_ | `\0`, `\01`, `\012` | ✓ |
|
359
381
|
|   _**Unicode**_ | `\uHHHH`, `\u{H+ H+}` | ✓ |
|
360
|
-
| **Unicode Properties** | _<sub>([Unicode
|
382
|
+
| **Unicode Properties** | _<sub>([Unicode 13.0.0])</sub>_ | ⋱ |
|
361
383
|
|   _**Age**_ | `\p{Age=5.2}`, `\P{age=7.0}`, `\p{^age=8.0}` | ✓ |
|
362
384
|
|   _**Blocks**_ | `\p{InArmenian}`, `\P{InKhmer}`, `\p{^InThai}` | ✓ |
|
363
385
|
|   _**Classes**_ | `\p{Alpha}`, `\P{Space}`, `\p{^Alnum}` | ✓ |
|
@@ -366,6 +388,18 @@ _Note that not all of these are available in all versions of Ruby_
|
|
366
388
|
|   _**Scripts**_ | `\p{Arabic}`, `\P{Hiragana}`, `\p{^Greek}` | ✓ |
|
367
389
|
|   _**Simple**_ | `\p{Dash}`, `\p{Extender}`, `\p{^Hyphen}` | ✓ |
|
368
390
|
|
391
|
+
[Unicode 13.0.0]: https://www.unicode.org/versions/Unicode13.0.0/
|
392
|
+
|
393
|
+
**\[1\]**: Ruby does not support lazy or possessive interval quantifiers.
|
394
|
+
Any `+` or `?` that follows an interval quantifier will be treated as another,
|
395
|
+
chained quantifier. See also [#3](https://github.com/ammar/regexp_parser/issues/3),
|
396
|
+
[#69](https://github.com/ammar/regexp_parser/pull/69).
|
397
|
+
|
398
|
+
**\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex
|
399
|
+
escapes when used in Regexp literals](https://github.com/ruby/ruby/commit/11ae581),
|
400
|
+
so they will only reach the scanner and will only be emitted if a String or a Regexp
|
401
|
+
that has been built with the `::new` constructor is scanned.
|
402
|
+
|
369
403
|
##### Inapplicable Features
|
370
404
|
|
371
405
|
Some modifiers, like `o` and `s`, apply to the **Regexp** object itself and do not
|
@@ -379,40 +413,29 @@ expressions library (Onigmo). They are not supported by the scanner.
|
|
379
413
|
- **Quotes**: `\Q...\E` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L499)_
|
380
414
|
- **Capture History**: `(?@...)`, `(?@<name>...)` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L550)_
|
381
415
|
|
382
|
-
|
383
416
|
See something missing? Please submit an [issue](https://github.com/ammar/regexp_parser/issues)
|
384
417
|
|
385
|
-
_**Note**: Attempting to process expressions with unsupported syntax features can raise
|
386
|
-
or incorrectly return tokens/objects as literals._
|
418
|
+
_**Note**: Attempting to process expressions with unsupported syntax features can raise
|
419
|
+
an error, or incorrectly return tokens/objects as literals._
|
387
420
|
|
388
421
|
|
389
422
|
## Testing
|
390
|
-
To run the tests simply run rake from the root directory
|
391
|
-
|
392
|
-
It generates the scanner's code from the Ragel source files and runs all the tests, thus it requires Ragel to be installed.
|
393
|
-
|
394
|
-
The tests use RSpec. They can also be run with the test runner that whitelists some warnings:
|
423
|
+
To run the tests simply run rake from the root directory.
|
395
424
|
|
396
|
-
|
397
|
-
|
398
|
-
```
|
425
|
+
The default task generates the scanner's code from the Ragel source files and runs
|
426
|
+
all the specs, thus it requires Ragel to be installed.
|
399
427
|
|
400
|
-
|
428
|
+
Note that changes to Ragel files will not be reflected when running `rspec` on its own,
|
429
|
+
so to run individual tests you might want to run:
|
401
430
|
|
402
431
|
```
|
403
|
-
|
404
|
-
```
|
405
|
-
|
406
|
-
Note that changes to Ragel files will not be reflected when running `rspec` or `bin/test`, so you might want to run:
|
407
|
-
|
408
|
-
```
|
409
|
-
rake ragel:rb && bin/test spec/scanner/properties_spec.rb
|
432
|
+
rake ragel:rb && rspec spec/scanner/properties_spec.rb
|
410
433
|
```
|
411
434
|
|
412
435
|
## Building
|
413
|
-
Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/)
|
414
|
-
installed. The build tasks will automatically invoke the 'ragel:rb' task to generate
|
415
|
-
Ruby scanner code.
|
436
|
+
Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/)
|
437
|
+
to be installed. The build tasks will automatically invoke the 'ragel:rb' task to generate
|
438
|
+
the Ruby scanner code.
|
416
439
|
|
417
440
|
|
418
441
|
The project uses the standard rubygems package tasks, so:
|
@@ -432,13 +455,26 @@ rake install
|
|
432
455
|
## Example Projects
|
433
456
|
Projects using regexp_parser.
|
434
457
|
|
435
|
-
- [
|
458
|
+
- [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool
|
459
|
+
that uses regexp_parser to convert Regexps to css/xpath selectors.
|
460
|
+
|
461
|
+
- [js_regex](https://github.com/jaynetics/js_regex) converts Ruby regular expressions
|
462
|
+
to JavaScript-compatible regular expressions.
|
463
|
+
|
464
|
+
- [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor
|
465
|
+
with alias support.
|
466
|
+
|
467
|
+
- [mutant](https://github.com/mbj/mutant) manipulates your regular expressions
|
468
|
+
(amongst others) to see if your tests cover their behavior.
|
436
469
|
|
437
|
-
- [
|
470
|
+
- [repper](https://github.com/jaynetics/repper) is a regular expression
|
471
|
+
pretty-printer and formatter for Ruby.
|
438
472
|
|
439
|
-
- [
|
473
|
+
- [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that
|
474
|
+
uses regexp_parser to lint Regexps.
|
440
475
|
|
441
|
-
- [
|
476
|
+
- [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) is a localization helper
|
477
|
+
that uses regexp_parser to generate examples of postal codes.
|
442
478
|
|
443
479
|
|
444
480
|
## References
|
@@ -467,4 +503,4 @@ Documentation and books used while working on this project.
|
|
467
503
|
|
468
504
|
---
|
469
505
|
##### Copyright
|
470
|
-
_Copyright (c) 2010-
|
506
|
+
_Copyright (c) 2010-2023 Ammar Ali. See LICENSE file for details._
|
data/Rakefile
CHANGED
@@ -1,87 +1,23 @@
|
|
1
|
+
require 'bundler'
|
1
2
|
require 'rubygems'
|
2
|
-
|
3
|
+
require 'rubygems/package_task'
|
3
4
|
require 'rake'
|
4
5
|
require 'rake/testtask'
|
6
|
+
require 'rspec/core/rake_task'
|
5
7
|
|
6
|
-
|
7
|
-
require 'rubygems/package_task'
|
8
|
-
|
9
|
-
|
10
|
-
RAGEL_SOURCE_DIR = File.expand_path '../lib/regexp_parser/scanner', __FILE__
|
11
|
-
RAGEL_OUTPUT_DIR = File.expand_path '../lib/regexp_parser', __FILE__
|
12
|
-
RAGEL_SOURCE_FILES = %w{scanner} # scanner.rl includes property.rl
|
13
|
-
|
8
|
+
Dir['tasks/**/*.rake'].each { |file| load(file) }
|
14
9
|
|
15
10
|
Bundler::GemHelper.install_tasks
|
16
11
|
|
12
|
+
RSpec::Core::RakeTask.new(:spec)
|
17
13
|
|
18
14
|
task :default => [:'test:full']
|
19
15
|
|
20
16
|
namespace :test do
|
21
|
-
task full: :'ragel:rb'
|
22
|
-
sh 'bin/test'
|
23
|
-
end
|
17
|
+
task full: [:'ragel:rb', :spec]
|
24
18
|
end
|
25
19
|
|
26
|
-
namespace :ragel do
|
27
|
-
desc "Process the ragel source files and output ruby code"
|
28
|
-
task :rb do |t|
|
29
|
-
RAGEL_SOURCE_FILES.each do |file|
|
30
|
-
output_file = "#{RAGEL_OUTPUT_DIR}/#{file}.rb"
|
31
|
-
# using faster flat table driven FSM, about 25% larger code, but about 30% faster
|
32
|
-
sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{file}.rl -o #{output_file}"
|
33
|
-
|
34
|
-
contents = File.read(output_file)
|
35
|
-
|
36
|
-
File.open(output_file, 'r+') do |file|
|
37
|
-
contents = "# -*- warn-indent:false; -*-\n" + contents
|
38
|
-
|
39
|
-
file.write(contents)
|
40
|
-
end
|
41
|
-
end
|
42
|
-
end
|
43
|
-
|
44
|
-
desc "Delete the ragel generated source file(s)"
|
45
|
-
task :clean do |t|
|
46
|
-
RAGEL_SOURCE_FILES.each do |file|
|
47
|
-
sh "rm -f #{RAGEL_OUTPUT_DIR}/#{file}.rb"
|
48
|
-
end
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
|
53
20
|
# Add ragel task as a prerequisite for building the gem to ensure that the
|
54
21
|
# latest scanner code is generated and included in the build.
|
55
22
|
desc "Runs ragel:rb before building the gem"
|
56
23
|
task :build => ['ragel:rb']
|
57
|
-
|
58
|
-
|
59
|
-
namespace :props do
|
60
|
-
desc 'Write new property value hashes for the properties scanner'
|
61
|
-
task :update do
|
62
|
-
require 'regexp_property_values'
|
63
|
-
RegexpPropertyValues.update
|
64
|
-
dir = File.expand_path('../lib/regexp_parser/scanner/properties', __FILE__)
|
65
|
-
|
66
|
-
require 'psych'
|
67
|
-
write_hash_to_file = ->(hash, path) do
|
68
|
-
File.open(path, 'w') do |f|
|
69
|
-
f.puts '#',
|
70
|
-
"# THIS FILE IS AUTO-GENERATED BY `rake props:update`, DO NOT EDIT",
|
71
|
-
'#',
|
72
|
-
hash.sort.to_h.to_yaml
|
73
|
-
end
|
74
|
-
puts "Wrote #{hash.count} aliases to `#{path}`"
|
75
|
-
end
|
76
|
-
|
77
|
-
long_names_to_tokens = RegexpPropertyValues.all.map do |val|
|
78
|
-
[val.identifier, val.full_name.downcase]
|
79
|
-
end
|
80
|
-
write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.yml")
|
81
|
-
|
82
|
-
short_names_to_tokens = RegexpPropertyValues.alias_hash.map do |k, v|
|
83
|
-
[k.identifier, v.full_name.downcase]
|
84
|
-
end
|
85
|
-
write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.yml")
|
86
|
-
end
|
87
|
-
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
module Regexp::Expression
|
2
|
+
class Base
|
3
|
+
include Regexp::Expression::Shared
|
4
|
+
|
5
|
+
def initialize(token, options = {})
|
6
|
+
init_from_token_and_options(token, options)
|
7
|
+
end
|
8
|
+
|
9
|
+
def to_re(format = :full)
|
10
|
+
if set_level > 0
|
11
|
+
warn "Calling #to_re on character set members is deprecated - "\
|
12
|
+
"their behavior might not be equivalent outside of the set."
|
13
|
+
end
|
14
|
+
::Regexp.new(to_s(format))
|
15
|
+
end
|
16
|
+
|
17
|
+
def quantify(*args)
|
18
|
+
self.quantifier = Quantifier.new(*args)
|
19
|
+
end
|
20
|
+
|
21
|
+
def unquantified_clone
|
22
|
+
clone.tap { |exp| exp.quantifier = nil }
|
23
|
+
end
|
24
|
+
|
25
|
+
# Deprecated. Prefer `#repetitions` which has a more uniform interface.
|
26
|
+
def quantity
|
27
|
+
return [nil,nil] unless quantified?
|
28
|
+
[quantifier.min, quantifier.max]
|
29
|
+
end
|
30
|
+
|
31
|
+
def repetitions
|
32
|
+
@repetitions ||=
|
33
|
+
if quantified?
|
34
|
+
min = quantifier.min
|
35
|
+
max = quantifier.max < 0 ? Float::INFINITY : quantifier.max
|
36
|
+
range = min..max
|
37
|
+
# fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807
|
38
|
+
if RUBY_VERSION.to_f < 2.7
|
39
|
+
range.define_singleton_method(:minmax) { [min, max] }
|
40
|
+
end
|
41
|
+
range
|
42
|
+
else
|
43
|
+
1..1
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def greedy?
|
48
|
+
quantified? and quantifier.greedy?
|
49
|
+
end
|
50
|
+
|
51
|
+
def reluctant?
|
52
|
+
quantified? and quantifier.reluctant?
|
53
|
+
end
|
54
|
+
alias :lazy? :reluctant?
|
55
|
+
|
56
|
+
def possessive?
|
57
|
+
quantified? and quantifier.possessive?
|
58
|
+
end
|
59
|
+
|
60
|
+
def to_h
|
61
|
+
{
|
62
|
+
type: type,
|
63
|
+
token: token,
|
64
|
+
text: to_s(:base),
|
65
|
+
starts_at: ts,
|
66
|
+
length: full_length,
|
67
|
+
level: level,
|
68
|
+
set_level: set_level,
|
69
|
+
conditional_level: conditional_level,
|
70
|
+
options: options,
|
71
|
+
quantifier: quantified? ? quantifier.to_h : nil,
|
72
|
+
}
|
73
|
+
end
|
74
|
+
alias :attributes :to_h
|
75
|
+
end
|
76
|
+
end
|
@@ -1,5 +1,5 @@
|
|
1
1
|
module Regexp::Expression
|
2
|
-
# A sequence of expressions, used by Alternation as one of its
|
2
|
+
# A sequence of expressions, used by Alternation as one of its alternatives.
|
3
3
|
class Alternative < Regexp::Expression::Sequence; end
|
4
4
|
|
5
5
|
class Alternation < Regexp::Expression::SequenceOperation
|
@@ -2,6 +2,23 @@ module Regexp::Expression
|
|
2
2
|
module Backreference
|
3
3
|
class Base < Regexp::Expression::Base
|
4
4
|
attr_accessor :referenced_expression
|
5
|
+
|
6
|
+
def initialize_copy(orig)
|
7
|
+
exp_id = [self.class, self.starts_at]
|
8
|
+
|
9
|
+
# prevent infinite recursion for recursive subexp calls
|
10
|
+
copied = @@copied ||= {}
|
11
|
+
self.referenced_expression =
|
12
|
+
if copied[exp_id]
|
13
|
+
orig.referenced_expression
|
14
|
+
else
|
15
|
+
copied[exp_id] = true
|
16
|
+
orig.referenced_expression.dup
|
17
|
+
end
|
18
|
+
copied.clear
|
19
|
+
|
20
|
+
super
|
21
|
+
end
|
5
22
|
end
|
6
23
|
|
7
24
|
class Number < Backreference::Base
|
@@ -9,7 +26,7 @@ module Regexp::Expression
|
|
9
26
|
alias reference number
|
10
27
|
|
11
28
|
def initialize(token, options = {})
|
12
|
-
@number = token.text[
|
29
|
+
@number = token.text[/-?\d+/].to_i
|
13
30
|
super
|
14
31
|
end
|
15
32
|
end
|
@@ -33,7 +50,7 @@ module Regexp::Expression
|
|
33
50
|
class NameCall < Backreference::Name; end
|
34
51
|
class NumberCallRelative < Backreference::NumberRelative; end
|
35
52
|
|
36
|
-
class NumberRecursionLevel < Backreference::
|
53
|
+
class NumberRecursionLevel < Backreference::NumberRelative
|
37
54
|
attr_reader :recursion_level
|
38
55
|
|
39
56
|
def initialize(token, options = {})
|
@@ -52,4 +69,7 @@ module Regexp::Expression
|
|
52
69
|
end
|
53
70
|
end
|
54
71
|
end
|
72
|
+
|
73
|
+
# alias for symmetry between token symbol and Expression class name
|
74
|
+
Backref = Backreference
|
55
75
|
end
|