regexp_parser 1.7.0 → 2.8.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +364 -22
- data/Gemfile +8 -2
- data/LICENSE +1 -1
- data/README.md +124 -88
- data/Rakefile +6 -70
- data/lib/regexp_parser/error.rb +4 -0
- data/lib/regexp_parser/expression/base.rb +76 -0
- data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
- data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
- data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +22 -2
- data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +4 -8
- data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +3 -4
- data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
- data/lib/regexp_parser/expression/classes/conditional.rb +11 -5
- data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
- data/lib/regexp_parser/expression/classes/free_space.rb +5 -5
- data/lib/regexp_parser/expression/classes/group.rb +28 -15
- data/lib/regexp_parser/expression/classes/keep.rb +2 -0
- data/lib/regexp_parser/expression/classes/literal.rb +1 -5
- data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
- data/lib/regexp_parser/expression/classes/root.rb +4 -19
- data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +5 -3
- data/lib/regexp_parser/expression/methods/construct.rb +41 -0
- data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +11 -7
- data/lib/regexp_parser/expression/methods/parts.rb +23 -0
- data/lib/regexp_parser/expression/methods/printing.rb +26 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
- data/lib/regexp_parser/expression/methods/tests.rb +47 -1
- data/lib/regexp_parser/expression/methods/traverse.rb +34 -18
- data/lib/regexp_parser/expression/quantifier.rb +57 -17
- data/lib/regexp_parser/expression/sequence.rb +11 -47
- data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
- data/lib/regexp_parser/expression/shared.rb +111 -0
- data/lib/regexp_parser/expression/subexpression.rb +27 -19
- data/lib/regexp_parser/expression.rb +14 -141
- data/lib/regexp_parser/lexer.rb +83 -41
- data/lib/regexp_parser/parser.rb +371 -429
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
- data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
- data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
- data/lib/regexp_parser/scanner/properties/long.csv +633 -0
- data/lib/regexp_parser/scanner/properties/short.csv +248 -0
- data/lib/regexp_parser/scanner/property.rl +4 -4
- data/lib/regexp_parser/scanner/scanner.rl +295 -368
- data/lib/regexp_parser/scanner.rb +1405 -1674
- data/lib/regexp_parser/syntax/any.rb +2 -7
- data/lib/regexp_parser/syntax/base.rb +92 -67
- data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
- data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
- data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
- data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
- data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
- data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
- data/lib/regexp_parser/syntax/token/escape.rb +33 -0
- data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
- data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
- data/lib/regexp_parser/syntax/token/meta.rb +20 -0
- data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
- data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
- data/lib/regexp_parser/syntax/token/unicode_property.rb +733 -0
- data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
- data/lib/regexp_parser/syntax/token.rb +45 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +19 -36
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
- data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
- data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
- data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions.rb +3 -1
- data/lib/regexp_parser/syntax.rb +8 -6
- data/lib/regexp_parser/token.rb +9 -20
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +0 -2
- data/regexp_parser.gemspec +20 -22
- metadata +49 -166
- data/lib/regexp_parser/scanner/properties/long.yml +0 -594
- data/lib/regexp_parser/scanner/properties/short.yml +0 -237
- data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
- data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
- data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
- data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
- data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
- data/lib/regexp_parser/syntax/tokens.rb +0 -45
- data/spec/expression/base_spec.rb +0 -94
- data/spec/expression/clone_spec.rb +0 -120
- data/spec/expression/conditional_spec.rb +0 -89
- data/spec/expression/free_space_spec.rb +0 -27
- data/spec/expression/methods/match_length_spec.rb +0 -161
- data/spec/expression/methods/match_spec.rb +0 -25
- data/spec/expression/methods/strfregexp_spec.rb +0 -224
- data/spec/expression/methods/tests_spec.rb +0 -99
- data/spec/expression/methods/traverse_spec.rb +0 -161
- data/spec/expression/options_spec.rb +0 -128
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
- data/spec/expression/subexpression_spec.rb +0 -50
- data/spec/expression/to_h_spec.rb +0 -26
- data/spec/expression/to_s_spec.rb +0 -100
- data/spec/lexer/all_spec.rb +0 -22
- data/spec/lexer/conditionals_spec.rb +0 -53
- data/spec/lexer/escapes_spec.rb +0 -14
- data/spec/lexer/keep_spec.rb +0 -10
- data/spec/lexer/literals_spec.rb +0 -89
- data/spec/lexer/nesting_spec.rb +0 -99
- data/spec/lexer/refcalls_spec.rb +0 -55
- data/spec/parser/all_spec.rb +0 -43
- data/spec/parser/alternation_spec.rb +0 -88
- data/spec/parser/anchors_spec.rb +0 -17
- data/spec/parser/conditionals_spec.rb +0 -179
- data/spec/parser/errors_spec.rb +0 -30
- data/spec/parser/escapes_spec.rb +0 -121
- data/spec/parser/free_space_spec.rb +0 -130
- data/spec/parser/groups_spec.rb +0 -108
- data/spec/parser/keep_spec.rb +0 -6
- data/spec/parser/posix_classes_spec.rb +0 -8
- data/spec/parser/properties_spec.rb +0 -115
- data/spec/parser/quantifiers_spec.rb +0 -51
- data/spec/parser/refcalls_spec.rb +0 -112
- data/spec/parser/set/intersections_spec.rb +0 -127
- data/spec/parser/set/ranges_spec.rb +0 -111
- data/spec/parser/sets_spec.rb +0 -178
- data/spec/parser/types_spec.rb +0 -18
- data/spec/scanner/all_spec.rb +0 -18
- data/spec/scanner/anchors_spec.rb +0 -21
- data/spec/scanner/conditionals_spec.rb +0 -128
- data/spec/scanner/errors_spec.rb +0 -68
- data/spec/scanner/escapes_spec.rb +0 -53
- data/spec/scanner/free_space_spec.rb +0 -133
- data/spec/scanner/groups_spec.rb +0 -52
- data/spec/scanner/keep_spec.rb +0 -10
- data/spec/scanner/literals_spec.rb +0 -49
- data/spec/scanner/meta_spec.rb +0 -18
- data/spec/scanner/properties_spec.rb +0 -64
- data/spec/scanner/quantifiers_spec.rb +0 -20
- data/spec/scanner/refcalls_spec.rb +0 -36
- data/spec/scanner/sets_spec.rb +0 -102
- data/spec/scanner/types_spec.rb +0 -14
- data/spec/spec_helper.rb +0 -15
- data/spec/support/runner.rb +0 -42
- data/spec/support/shared_examples.rb +0 -77
- data/spec/support/warning_extractor.rb +0 -60
- data/spec/syntax/syntax_spec.rb +0 -48
- data/spec/syntax/syntax_token_map_spec.rb +0 -23
- data/spec/syntax/versions/1.8.6_spec.rb +0 -17
- data/spec/syntax/versions/1.9.1_spec.rb +0 -10
- data/spec/syntax/versions/1.9.3_spec.rb +0 -9
- data/spec/syntax/versions/2.0.0_spec.rb +0 -13
- data/spec/syntax/versions/2.2.0_spec.rb +0 -9
- data/spec/syntax/versions/aliases_spec.rb +0 -37
- data/spec/token/token_spec.rb +0 -85
- /data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
data/README.md
CHANGED
@@ -1,15 +1,18 @@
|
|
1
1
|
# Regexp::Parser
|
2
2
|
|
3
|
-
[![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser)
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser)
|
4
|
+
[![Build Status](https://github.com/ammar/regexp_parser/workflows/tests/badge.svg)](https://github.com/ammar/regexp_parser/actions)
|
5
|
+
[![Build Status](https://github.com/ammar/regexp_parser/workflows/gouteur/badge.svg)](https://github.com/ammar/regexp_parser/actions)
|
6
|
+
[![Code Climate](https://codeclimate.com/github/ammar/regexp_parser.svg)](https://codeclimate.com/github/ammar/regexp_parser/badges)
|
4
7
|
|
5
8
|
A Ruby gem for tokenizing, parsing, and transforming regular expressions.
|
6
9
|
|
7
10
|
* Multilayered
|
8
11
|
* A scanner/tokenizer based on [Ragel](http://www.colm.net/open-source/ragel/)
|
9
|
-
* A lexer that produces a "stream" of
|
10
|
-
* A parser that produces a "tree" of Expression objects (OO API)
|
11
|
-
* Runs on Ruby
|
12
|
-
* Recognizes Ruby 1.8, 1.9, and
|
12
|
+
* A lexer that produces a "stream" of [Token objects](https://github.com/ammar/regexp_parser/wiki/Token-Objects)
|
13
|
+
* A parser that produces a "tree" of [Expression objects (OO API)](https://github.com/ammar/regexp_parser/wiki/Expression-Objects)
|
14
|
+
* Runs on Ruby 2.x, 3.x and JRuby runtimes
|
15
|
+
* Recognizes Ruby 1.8, 1.9, 2.x and 3.x regular expressions [See Supported Syntax](#supported-syntax)
|
13
16
|
|
14
17
|
|
15
18
|
_For examples of regexp_parser in use, see [Example Projects](#example-projects)._
|
@@ -18,13 +21,10 @@ _For examples of regexp_parser in use, see [Example Projects](#example-projects)
|
|
18
21
|
---
|
19
22
|
## Requirements
|
20
23
|
|
21
|
-
* Ruby >=
|
24
|
+
* Ruby >= 2.0
|
22
25
|
* Ragel >= 6.0, but only if you want to build the gem or work on the scanner.
|
23
26
|
|
24
27
|
|
25
|
-
_Note: See the .travis.yml file for covered versions._
|
26
|
-
|
27
|
-
|
28
28
|
---
|
29
29
|
## Install
|
30
30
|
|
@@ -36,14 +36,15 @@ Or, add it to your project's `Gemfile`:
|
|
36
36
|
|
37
37
|
```gem 'regexp_parser', '~> X.Y.Z'```
|
38
38
|
|
39
|
-
See
|
39
|
+
See the badge at the top of this README or [rubygems](https://rubygems.org/gems/regexp_parser)
|
40
|
+
for the latest version number.
|
40
41
|
|
41
42
|
|
42
43
|
---
|
43
44
|
## Usage
|
44
45
|
|
45
46
|
The three main modules are **Scanner**, **Lexer**, and **Parser**. Each of them
|
46
|
-
provides a single method that takes a regular expression (as a
|
47
|
+
provides a single method that takes a regular expression (as a Regexp object or
|
47
48
|
a string) and returns its results. The **Lexer** and the **Parser** accept an
|
48
49
|
optional second argument that specifies the syntax version, like 'ruby/2.0',
|
49
50
|
which defaults to the host Ruby version (using RUBY_VERSION).
|
@@ -66,12 +67,23 @@ called with the results as follows:
|
|
66
67
|
* **Scanner**: the block gets passed the results as they are scanned. See the
|
67
68
|
example in the next section for details.
|
68
69
|
|
69
|
-
* **Lexer**:
|
70
|
+
* **Lexer**: the block gets passed the tokens one by one as they are scanned.
|
70
71
|
_The result of the block is returned._
|
71
72
|
|
72
73
|
* **Parser**: after completion, the block gets passed the root expression.
|
73
74
|
_The result of the block is returned._
|
74
75
|
|
76
|
+
All three methods accept either a `Regexp` or `String` (containing the pattern)
|
77
|
+
- if a String is passed, `options` can be supplied:
|
78
|
+
|
79
|
+
```ruby
|
80
|
+
require 'regexp_parser'
|
81
|
+
|
82
|
+
Regexp::Parser.parse(
|
83
|
+
"a+ # Recognizes a and A...",
|
84
|
+
options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE
|
85
|
+
)
|
86
|
+
```
|
75
87
|
|
76
88
|
---
|
77
89
|
## Components
|
@@ -90,7 +102,7 @@ start/end offsets for each token found.
|
|
90
102
|
```ruby
|
91
103
|
require 'regexp_parser'
|
92
104
|
|
93
|
-
Regexp::Scanner.scan
|
105
|
+
Regexp::Scanner.scan(/(ab?(cd)*[e-h]+)/) do |type, token, text, ts, te|
|
94
106
|
puts "type: #{type}, token: #{token}, text: '#{text}' [#{ts}..#{te}]"
|
95
107
|
end
|
96
108
|
|
@@ -113,8 +125,8 @@ A one-liner that uses map on the result of the scan to return the textual
|
|
113
125
|
parts of the pattern:
|
114
126
|
|
115
127
|
```ruby
|
116
|
-
Regexp::Scanner.scan(
|
117
|
-
|
128
|
+
Regexp::Scanner.scan(/(cat?([bhm]at)){3,5}/).map { |token| token[2] }
|
129
|
+
# => ["(", "cat", "?", "(", "[", "b", "h", "m", "]", "at", ")", ")", "{3,5}"]
|
118
130
|
```
|
119
131
|
|
120
132
|
|
@@ -136,11 +148,8 @@ Regexp::Scanner.scan( /(cat?([bhm]at)){3,5}/ ).map {|token| token[2]}
|
|
136
148
|
to the lexer.
|
137
149
|
|
138
150
|
* The MRI implementation may accept expressions that either conflict with
|
139
|
-
the documentation or are undocumented
|
140
|
-
|
141
|
-
_(See issues [#3](https://github.com/ammar/regexp_parser/issues/3) and
|
142
|
-
[#15](https://github.com/ammar/regexp_parser/issues/15) for examples)_
|
143
|
-
|
151
|
+
the documentation or are undocumented, like `{}` and `]` _(unescaped)_.
|
152
|
+
The scanner will try to support as many of these cases as possible.
|
144
153
|
|
145
154
|
---
|
146
155
|
### Syntax
|
@@ -149,31 +158,41 @@ flavor). Syntax classes act as lookup tables, and are layered to create
|
|
149
158
|
flavor variations. Syntax only comes into play in the lexer.
|
150
159
|
|
151
160
|
#### Example
|
152
|
-
The following
|
161
|
+
The following fetches syntax objects for Ruby 2.0, 1.9, 1.8, and
|
153
162
|
checks a few of their implementation features.
|
154
163
|
|
155
164
|
```ruby
|
156
165
|
require 'regexp_parser'
|
157
166
|
|
158
|
-
ruby_20 = Regexp::Syntax.
|
167
|
+
ruby_20 = Regexp::Syntax.for 'ruby/2.0'
|
159
168
|
ruby_20.implements? :quantifier, :zero_or_one # => true
|
160
169
|
ruby_20.implements? :quantifier, :zero_or_one_reluctant # => true
|
161
170
|
ruby_20.implements? :quantifier, :zero_or_one_possessive # => true
|
162
171
|
ruby_20.implements? :conditional, :condition # => true
|
163
172
|
|
164
|
-
ruby_19 = Regexp::Syntax.
|
173
|
+
ruby_19 = Regexp::Syntax.for 'ruby/1.9'
|
165
174
|
ruby_19.implements? :quantifier, :zero_or_one # => true
|
166
175
|
ruby_19.implements? :quantifier, :zero_or_one_reluctant # => true
|
167
176
|
ruby_19.implements? :quantifier, :zero_or_one_possessive # => true
|
168
177
|
ruby_19.implements? :conditional, :condition # => false
|
169
178
|
|
170
|
-
ruby_18 = Regexp::Syntax.
|
179
|
+
ruby_18 = Regexp::Syntax.for 'ruby/1.8'
|
171
180
|
ruby_18.implements? :quantifier, :zero_or_one # => true
|
172
181
|
ruby_18.implements? :quantifier, :zero_or_one_reluctant # => true
|
173
182
|
ruby_18.implements? :quantifier, :zero_or_one_possessive # => false
|
174
183
|
ruby_18.implements? :conditional, :condition # => false
|
175
184
|
```
|
176
185
|
|
186
|
+
Syntax objects can also be queried about their complete and relative feature sets.
|
187
|
+
|
188
|
+
```ruby
|
189
|
+
require 'regexp_parser'
|
190
|
+
|
191
|
+
ruby_20 = Regexp::Syntax.for 'ruby/2.0' # => Regexp::Syntax::V2_0_0
|
192
|
+
ruby_20.added_features # => { conditional: [...], ... }
|
193
|
+
ruby_20.removed_features # => { property: [:newline], ... }
|
194
|
+
ruby_20.features # => { anchor: [...], ... }
|
195
|
+
```
|
177
196
|
|
178
197
|
#### Notes
|
179
198
|
* Variations on a token, for example a named group with angle brackets (< and >)
|
@@ -202,7 +221,7 @@ syntax, and prints the token objects' text indented to their level.
|
|
202
221
|
```ruby
|
203
222
|
require 'regexp_parser'
|
204
223
|
|
205
|
-
Regexp::Lexer.lex
|
224
|
+
Regexp::Lexer.lex(/a?(b(c))*[d]+/, 'ruby/1.9') do |token|
|
206
225
|
puts "#{' ' * token.level}#{token.text}"
|
207
226
|
end
|
208
227
|
|
@@ -228,8 +247,8 @@ how the sequence 'cat' is treated. The 't' is separated because it's followed
|
|
228
247
|
by a quantifier that only applies to it.
|
229
248
|
|
230
249
|
```ruby
|
231
|
-
Regexp::Lexer.scan(
|
232
|
-
|
250
|
+
Regexp::Lexer.scan(/(cat?([b]at)){3,5}/).map { |token| token.text }
|
251
|
+
# => ["(", "ca", "t", "?", "(", "[", "b", "]", "at", ")", ")", "{3,5}"]
|
233
252
|
```
|
234
253
|
|
235
254
|
#### Notes
|
@@ -243,7 +262,7 @@ Regexp::Lexer.scan( /(cat?([b]at)){3,5}/ ).map {|token| token.text}
|
|
243
262
|
### Parser
|
244
263
|
Sits on top of the lexer and transforms the "stream" of Token objects emitted
|
245
264
|
by it into a tree of Expression objects represented by an instance of the
|
246
|
-
Expression::Root class.
|
265
|
+
`Expression::Root` class.
|
247
266
|
|
248
267
|
See the [Expression Objects](https://github.com/ammar/regexp_parser/wiki/Expression-Objects)
|
249
268
|
wiki page for attributes and methods.
|
@@ -251,12 +270,40 @@ wiki page for attributes and methods.
|
|
251
270
|
|
252
271
|
#### Example
|
253
272
|
|
273
|
+
This example uses the tree traversal method `#each_expression`
|
274
|
+
and the method `#strfregexp` to print each object in the tree.
|
275
|
+
|
276
|
+
```ruby
|
277
|
+
include_root = true
|
278
|
+
indent_offset = include_root ? 1 : 0
|
279
|
+
|
280
|
+
tree.each_expression(include_root) do |exp|
|
281
|
+
puts exp.strfregexp("%>> %c", indent_offset)
|
282
|
+
end
|
283
|
+
|
284
|
+
# Output
|
285
|
+
# > Regexp::Expression::Root
|
286
|
+
# > Regexp::Expression::Literal
|
287
|
+
# > Regexp::Expression::Group::Capture
|
288
|
+
# > Regexp::Expression::Literal
|
289
|
+
# > Regexp::Expression::Group::Capture
|
290
|
+
# > Regexp::Expression::Literal
|
291
|
+
# > Regexp::Expression::Literal
|
292
|
+
# > Regexp::Expression::Group::Named
|
293
|
+
# > Regexp::Expression::CharacterSet
|
294
|
+
```
|
295
|
+
|
296
|
+
_Note: quantifiers do not appear in the output because they are members of the
|
297
|
+
Expression class. See the next section for details._
|
298
|
+
|
299
|
+
Another example, using `#traverse` for a more fine-grained tree traversal:
|
300
|
+
|
254
301
|
```ruby
|
255
302
|
require 'regexp_parser'
|
256
303
|
|
257
304
|
regex = /a?(b+(c)d)*(?<name>[0-9]+)/
|
258
305
|
|
259
|
-
tree = Regexp::Parser.parse(
|
306
|
+
tree = Regexp::Parser.parse(regex, 'ruby/2.1')
|
260
307
|
|
261
308
|
tree.traverse do |event, exp|
|
262
309
|
puts "#{event}: #{exp.type} `#{exp.to_s}`"
|
@@ -276,40 +323,15 @@ end
|
|
276
323
|
# exit: group `(?<name>[0-9]+)`
|
277
324
|
```
|
278
325
|
|
279
|
-
Another example, using each_expression and strfregexp to print the object tree.
|
280
326
|
_See the traverse.rb and strfregexp.rb files under `lib/regexp_parser/expression/methods`
|
281
327
|
for more information on these methods._
|
282
328
|
|
283
|
-
```ruby
|
284
|
-
include_root = true
|
285
|
-
indent_offset = include_root ? 1 : 0
|
286
|
-
|
287
|
-
tree.each_expression(include_root) do |exp, level_index|
|
288
|
-
puts exp.strfregexp("%>> %c", indent_offset)
|
289
|
-
end
|
290
|
-
|
291
|
-
# Output
|
292
|
-
# > Regexp::Expression::Root
|
293
|
-
# > Regexp::Expression::Literal
|
294
|
-
# > Regexp::Expression::Group::Capture
|
295
|
-
# > Regexp::Expression::Literal
|
296
|
-
# > Regexp::Expression::Group::Capture
|
297
|
-
# > Regexp::Expression::Literal
|
298
|
-
# > Regexp::Expression::Literal
|
299
|
-
# > Regexp::Expression::Group::Named
|
300
|
-
# > Regexp::Expression::CharacterSet
|
301
|
-
```
|
302
|
-
|
303
|
-
_Note: quantifiers do not appear in the output because they are members of the
|
304
|
-
Expression class. See the next section for details._
|
305
|
-
|
306
|
-
|
307
329
|
---
|
308
330
|
|
309
331
|
|
310
332
|
## Supported Syntax
|
311
333
|
The three modules support all the regular expression syntax features of Ruby 1.8,
|
312
|
-
1.9, and
|
334
|
+
1.9, 2.x and 3.x:
|
313
335
|
|
314
336
|
_Note that not all of these are available in all versions of Ruby_
|
315
337
|
|
@@ -337,7 +359,7 @@ _Note that not all of these are available in all versions of Ruby_
|
|
337
359
|
|   _Nest Level_ | `\k<n-1>` | ✓ |
|
338
360
|
|   _Numbered_ | `\k<1>` | ✓ |
|
339
361
|
|   _Relative_ | `\k<-2>` | ✓ |
|
340
|
-
|   _Traditional_ | `\1`
|
362
|
+
|   _Traditional_ | `\1` through `\9` | ✓ |
|
341
363
|
|   _**Capturing**_ | `(abc)` | ✓ |
|
342
364
|
|   _**Comments**_ | `(?# comment text)` | ✓ |
|
343
365
|
|   _**Named**_ | `(?<name>abc)`, `(?'name'abc)` | ✓ |
|
@@ -349,15 +371,15 @@ _Note that not all of these are available in all versions of Ruby_
|
|
349
371
|
| **POSIX Classes** | `[:alpha:]`, `[:^digit:]` | ✓ |
|
350
372
|
| **Quantifiers** | | ⋱ |
|
351
373
|
|   _**Greedy**_ | `?`, `*`, `+`, `{m,M}` | ✓ |
|
352
|
-
|   _**Reluctant** (Lazy)_ | `??`, `*?`,
|
353
|
-
|   _**Possessive**_ | `?+`, `*+`,
|
374
|
+
|   _**Reluctant** (Lazy)_ | `??`, `*?`, `+?` \[1\] | ✓ |
|
375
|
+
|   _**Possessive**_ | `?+`, `*+`, `++` \[1\] | ✓ |
|
354
376
|
| **String Escapes** | | ⋱ |
|
355
|
-
|   _**Control**_
|
377
|
+
|   _**Control** \[2\]_ | `\C-C`, `\cD` | ✓ |
|
356
378
|
|   _**Hex**_ | `\x20`, `\x{701230}` | ✓ |
|
357
|
-
|   _**Meta**_
|
379
|
+
|   _**Meta** \[2\]_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | ✓ |
|
358
380
|
|   _**Octal**_ | `\0`, `\01`, `\012` | ✓ |
|
359
381
|
|   _**Unicode**_ | `\uHHHH`, `\u{H+ H+}` | ✓ |
|
360
|
-
| **Unicode Properties** | _<sub>([Unicode
|
382
|
+
| **Unicode Properties** | _<sub>([Unicode 13.0.0])</sub>_ | ⋱ |
|
361
383
|
|   _**Age**_ | `\p{Age=5.2}`, `\P{age=7.0}`, `\p{^age=8.0}` | ✓ |
|
362
384
|
|   _**Blocks**_ | `\p{InArmenian}`, `\P{InKhmer}`, `\p{^InThai}` | ✓ |
|
363
385
|
|   _**Classes**_ | `\p{Alpha}`, `\P{Space}`, `\p{^Alnum}` | ✓ |
|
@@ -366,6 +388,18 @@ _Note that not all of these are available in all versions of Ruby_
|
|
366
388
|
|   _**Scripts**_ | `\p{Arabic}`, `\P{Hiragana}`, `\p{^Greek}` | ✓ |
|
367
389
|
|   _**Simple**_ | `\p{Dash}`, `\p{Extender}`, `\p{^Hyphen}` | ✓ |
|
368
390
|
|
391
|
+
[Unicode 13.0.0]: https://www.unicode.org/versions/Unicode13.0.0/
|
392
|
+
|
393
|
+
**\[1\]**: Ruby does not support lazy or possessive interval quantifiers.
|
394
|
+
Any `+` or `?` that follows an interval quantifier will be treated as another,
|
395
|
+
chained quantifier. See also [#3](https://github.com/ammar/regexp_parser/issues/3),
|
396
|
+
[#69](https://github.com/ammar/regexp_parser/pull/69).
|
397
|
+
|
398
|
+
**\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex
|
399
|
+
escapes when used in Regexp literals](https://github.com/ruby/ruby/commit/11ae581),
|
400
|
+
so they will only reach the scanner and will only be emitted if a String or a Regexp
|
401
|
+
that has been built with the `::new` constructor is scanned.
|
402
|
+
|
369
403
|
##### Inapplicable Features
|
370
404
|
|
371
405
|
Some modifiers, like `o` and `s`, apply to the **Regexp** object itself and do not
|
@@ -379,40 +413,29 @@ expressions library (Onigmo). They are not supported by the scanner.
|
|
379
413
|
- **Quotes**: `\Q...\E` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L499)_
|
380
414
|
- **Capture History**: `(?@...)`, `(?@<name>...)` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L550)_
|
381
415
|
|
382
|
-
|
383
416
|
See something missing? Please submit an [issue](https://github.com/ammar/regexp_parser/issues)
|
384
417
|
|
385
|
-
_**Note**: Attempting to process expressions with unsupported syntax features can raise
|
386
|
-
or incorrectly return tokens/objects as literals._
|
418
|
+
_**Note**: Attempting to process expressions with unsupported syntax features can raise
|
419
|
+
an error, or incorrectly return tokens/objects as literals._
|
387
420
|
|
388
421
|
|
389
422
|
## Testing
|
390
|
-
To run the tests simply run rake from the root directory
|
391
|
-
|
392
|
-
It generates the scanner's code from the Ragel source files and runs all the tests, thus it requires Ragel to be installed.
|
393
|
-
|
394
|
-
The tests use RSpec. They can also be run with the test runner that whitelists some warnings:
|
423
|
+
To run the tests simply run rake from the root directory.
|
395
424
|
|
396
|
-
|
397
|
-
|
398
|
-
```
|
425
|
+
The default task generates the scanner's code from the Ragel source files and runs
|
426
|
+
all the specs, thus it requires Ragel to be installed.
|
399
427
|
|
400
|
-
|
428
|
+
Note that changes to Ragel files will not be reflected when running `rspec` on its own,
|
429
|
+
so to run individual tests you might want to run:
|
401
430
|
|
402
431
|
```
|
403
|
-
|
404
|
-
```
|
405
|
-
|
406
|
-
Note that changes to Ragel files will not be reflected when running `rspec` or `bin/test`, so you might want to run:
|
407
|
-
|
408
|
-
```
|
409
|
-
rake ragel:rb && bin/test spec/scanner/properties_spec.rb
|
432
|
+
rake ragel:rb && rspec spec/scanner/properties_spec.rb
|
410
433
|
```
|
411
434
|
|
412
435
|
## Building
|
413
|
-
Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/)
|
414
|
-
installed. The build tasks will automatically invoke the 'ragel:rb' task to generate
|
415
|
-
Ruby scanner code.
|
436
|
+
Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/)
|
437
|
+
to be installed. The build tasks will automatically invoke the 'ragel:rb' task to generate
|
438
|
+
the Ruby scanner code.
|
416
439
|
|
417
440
|
|
418
441
|
The project uses the standard rubygems package tasks, so:
|
@@ -432,13 +455,26 @@ rake install
|
|
432
455
|
## Example Projects
|
433
456
|
Projects using regexp_parser.
|
434
457
|
|
435
|
-
- [
|
458
|
+
- [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool
|
459
|
+
that uses regexp_parser to convert Regexps to css/xpath selectors.
|
460
|
+
|
461
|
+
- [js_regex](https://github.com/jaynetics/js_regex) converts Ruby regular expressions
|
462
|
+
to JavaScript-compatible regular expressions.
|
463
|
+
|
464
|
+
- [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor
|
465
|
+
with alias support.
|
466
|
+
|
467
|
+
- [mutant](https://github.com/mbj/mutant) manipulates your regular expressions
|
468
|
+
(amongst others) to see if your tests cover their behavior.
|
436
469
|
|
437
|
-
- [
|
470
|
+
- [repper](https://github.com/jaynetics/repper) is a regular expression
|
471
|
+
pretty-printer and formatter for Ruby.
|
438
472
|
|
439
|
-
- [
|
473
|
+
- [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that
|
474
|
+
uses regexp_parser to lint Regexps.
|
440
475
|
|
441
|
-
- [
|
476
|
+
- [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) is a localization helper
|
477
|
+
that uses regexp_parser to generate examples of postal codes.
|
442
478
|
|
443
479
|
|
444
480
|
## References
|
@@ -467,4 +503,4 @@ Documentation and books used while working on this project.
|
|
467
503
|
|
468
504
|
---
|
469
505
|
##### Copyright
|
470
|
-
_Copyright (c) 2010-
|
506
|
+
_Copyright (c) 2010-2023 Ammar Ali. See LICENSE file for details._
|
data/Rakefile
CHANGED
@@ -1,87 +1,23 @@
|
|
1
|
+
require 'bundler'
|
1
2
|
require 'rubygems'
|
2
|
-
|
3
|
+
require 'rubygems/package_task'
|
3
4
|
require 'rake'
|
4
5
|
require 'rake/testtask'
|
6
|
+
require 'rspec/core/rake_task'
|
5
7
|
|
6
|
-
|
7
|
-
require 'rubygems/package_task'
|
8
|
-
|
9
|
-
|
10
|
-
RAGEL_SOURCE_DIR = File.expand_path '../lib/regexp_parser/scanner', __FILE__
|
11
|
-
RAGEL_OUTPUT_DIR = File.expand_path '../lib/regexp_parser', __FILE__
|
12
|
-
RAGEL_SOURCE_FILES = %w{scanner} # scanner.rl includes property.rl
|
13
|
-
|
8
|
+
Dir['tasks/**/*.rake'].each { |file| load(file) }
|
14
9
|
|
15
10
|
Bundler::GemHelper.install_tasks
|
16
11
|
|
12
|
+
RSpec::Core::RakeTask.new(:spec)
|
17
13
|
|
18
14
|
task :default => [:'test:full']
|
19
15
|
|
20
16
|
namespace :test do
|
21
|
-
task full: :'ragel:rb'
|
22
|
-
sh 'bin/test'
|
23
|
-
end
|
17
|
+
task full: [:'ragel:rb', :spec]
|
24
18
|
end
|
25
19
|
|
26
|
-
namespace :ragel do
|
27
|
-
desc "Process the ragel source files and output ruby code"
|
28
|
-
task :rb do |t|
|
29
|
-
RAGEL_SOURCE_FILES.each do |file|
|
30
|
-
output_file = "#{RAGEL_OUTPUT_DIR}/#{file}.rb"
|
31
|
-
# using faster flat table driven FSM, about 25% larger code, but about 30% faster
|
32
|
-
sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{file}.rl -o #{output_file}"
|
33
|
-
|
34
|
-
contents = File.read(output_file)
|
35
|
-
|
36
|
-
File.open(output_file, 'r+') do |file|
|
37
|
-
contents = "# -*- warn-indent:false; -*-\n" + contents
|
38
|
-
|
39
|
-
file.write(contents)
|
40
|
-
end
|
41
|
-
end
|
42
|
-
end
|
43
|
-
|
44
|
-
desc "Delete the ragel generated source file(s)"
|
45
|
-
task :clean do |t|
|
46
|
-
RAGEL_SOURCE_FILES.each do |file|
|
47
|
-
sh "rm -f #{RAGEL_OUTPUT_DIR}/#{file}.rb"
|
48
|
-
end
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
|
53
20
|
# Add ragel task as a prerequisite for building the gem to ensure that the
|
54
21
|
# latest scanner code is generated and included in the build.
|
55
22
|
desc "Runs ragel:rb before building the gem"
|
56
23
|
task :build => ['ragel:rb']
|
57
|
-
|
58
|
-
|
59
|
-
namespace :props do
|
60
|
-
desc 'Write new property value hashes for the properties scanner'
|
61
|
-
task :update do
|
62
|
-
require 'regexp_property_values'
|
63
|
-
RegexpPropertyValues.update
|
64
|
-
dir = File.expand_path('../lib/regexp_parser/scanner/properties', __FILE__)
|
65
|
-
|
66
|
-
require 'psych'
|
67
|
-
write_hash_to_file = ->(hash, path) do
|
68
|
-
File.open(path, 'w') do |f|
|
69
|
-
f.puts '#',
|
70
|
-
"# THIS FILE IS AUTO-GENERATED BY `rake props:update`, DO NOT EDIT",
|
71
|
-
'#',
|
72
|
-
hash.sort.to_h.to_yaml
|
73
|
-
end
|
74
|
-
puts "Wrote #{hash.count} aliases to `#{path}`"
|
75
|
-
end
|
76
|
-
|
77
|
-
long_names_to_tokens = RegexpPropertyValues.all.map do |val|
|
78
|
-
[val.identifier, val.full_name.downcase]
|
79
|
-
end
|
80
|
-
write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.yml")
|
81
|
-
|
82
|
-
short_names_to_tokens = RegexpPropertyValues.alias_hash.map do |k, v|
|
83
|
-
[k.identifier, v.full_name.downcase]
|
84
|
-
end
|
85
|
-
write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.yml")
|
86
|
-
end
|
87
|
-
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
module Regexp::Expression
|
2
|
+
class Base
|
3
|
+
include Regexp::Expression::Shared
|
4
|
+
|
5
|
+
def initialize(token, options = {})
|
6
|
+
init_from_token_and_options(token, options)
|
7
|
+
end
|
8
|
+
|
9
|
+
def to_re(format = :full)
|
10
|
+
if set_level > 0
|
11
|
+
warn "Calling #to_re on character set members is deprecated - "\
|
12
|
+
"their behavior might not be equivalent outside of the set."
|
13
|
+
end
|
14
|
+
::Regexp.new(to_s(format))
|
15
|
+
end
|
16
|
+
|
17
|
+
def quantify(*args)
|
18
|
+
self.quantifier = Quantifier.new(*args)
|
19
|
+
end
|
20
|
+
|
21
|
+
def unquantified_clone
|
22
|
+
clone.tap { |exp| exp.quantifier = nil }
|
23
|
+
end
|
24
|
+
|
25
|
+
# Deprecated. Prefer `#repetitions` which has a more uniform interface.
|
26
|
+
def quantity
|
27
|
+
return [nil,nil] unless quantified?
|
28
|
+
[quantifier.min, quantifier.max]
|
29
|
+
end
|
30
|
+
|
31
|
+
def repetitions
|
32
|
+
@repetitions ||=
|
33
|
+
if quantified?
|
34
|
+
min = quantifier.min
|
35
|
+
max = quantifier.max < 0 ? Float::INFINITY : quantifier.max
|
36
|
+
range = min..max
|
37
|
+
# fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807
|
38
|
+
if RUBY_VERSION.to_f < 2.7
|
39
|
+
range.define_singleton_method(:minmax) { [min, max] }
|
40
|
+
end
|
41
|
+
range
|
42
|
+
else
|
43
|
+
1..1
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def greedy?
|
48
|
+
quantified? and quantifier.greedy?
|
49
|
+
end
|
50
|
+
|
51
|
+
def reluctant?
|
52
|
+
quantified? and quantifier.reluctant?
|
53
|
+
end
|
54
|
+
alias :lazy? :reluctant?
|
55
|
+
|
56
|
+
def possessive?
|
57
|
+
quantified? and quantifier.possessive?
|
58
|
+
end
|
59
|
+
|
60
|
+
def to_h
|
61
|
+
{
|
62
|
+
type: type,
|
63
|
+
token: token,
|
64
|
+
text: to_s(:base),
|
65
|
+
starts_at: ts,
|
66
|
+
length: full_length,
|
67
|
+
level: level,
|
68
|
+
set_level: set_level,
|
69
|
+
conditional_level: conditional_level,
|
70
|
+
options: options,
|
71
|
+
quantifier: quantified? ? quantifier.to_h : nil,
|
72
|
+
}
|
73
|
+
end
|
74
|
+
alias :attributes :to_h
|
75
|
+
end
|
76
|
+
end
|
@@ -1,5 +1,5 @@
|
|
1
1
|
module Regexp::Expression
|
2
|
-
# A sequence of expressions, used by Alternation as one of its
|
2
|
+
# A sequence of expressions, used by Alternation as one of its alternatives.
|
3
3
|
class Alternative < Regexp::Expression::Sequence; end
|
4
4
|
|
5
5
|
class Alternation < Regexp::Expression::SequenceOperation
|
@@ -2,6 +2,23 @@ module Regexp::Expression
|
|
2
2
|
module Backreference
|
3
3
|
class Base < Regexp::Expression::Base
|
4
4
|
attr_accessor :referenced_expression
|
5
|
+
|
6
|
+
def initialize_copy(orig)
|
7
|
+
exp_id = [self.class, self.starts_at]
|
8
|
+
|
9
|
+
# prevent infinite recursion for recursive subexp calls
|
10
|
+
copied = @@copied ||= {}
|
11
|
+
self.referenced_expression =
|
12
|
+
if copied[exp_id]
|
13
|
+
orig.referenced_expression
|
14
|
+
else
|
15
|
+
copied[exp_id] = true
|
16
|
+
orig.referenced_expression.dup
|
17
|
+
end
|
18
|
+
copied.clear
|
19
|
+
|
20
|
+
super
|
21
|
+
end
|
5
22
|
end
|
6
23
|
|
7
24
|
class Number < Backreference::Base
|
@@ -9,7 +26,7 @@ module Regexp::Expression
|
|
9
26
|
alias reference number
|
10
27
|
|
11
28
|
def initialize(token, options = {})
|
12
|
-
@number = token.text[
|
29
|
+
@number = token.text[/-?\d+/].to_i
|
13
30
|
super
|
14
31
|
end
|
15
32
|
end
|
@@ -33,7 +50,7 @@ module Regexp::Expression
|
|
33
50
|
class NameCall < Backreference::Name; end
|
34
51
|
class NumberCallRelative < Backreference::NumberRelative; end
|
35
52
|
|
36
|
-
class NumberRecursionLevel < Backreference::
|
53
|
+
class NumberRecursionLevel < Backreference::NumberRelative
|
37
54
|
attr_reader :recursion_level
|
38
55
|
|
39
56
|
def initialize(token, options = {})
|
@@ -52,4 +69,7 @@ module Regexp::Expression
|
|
52
69
|
end
|
53
70
|
end
|
54
71
|
end
|
72
|
+
|
73
|
+
# alias for symmetry between token symbol and Expression class name
|
74
|
+
Backref = Backreference
|
55
75
|
end
|