regexp_parser 2.1.1 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +94 -6
- data/Gemfile +2 -1
- data/LICENSE +1 -1
- data/README.md +40 -30
- data/Rakefile +6 -70
- data/lib/regexp_parser/error.rb +1 -1
- data/lib/regexp_parser/expression/base.rb +75 -0
- data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
- data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +1 -0
- data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
- data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -2
- data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +2 -2
- data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
- data/lib/regexp_parser/expression/classes/conditional.rb +2 -2
- data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +13 -7
- data/lib/regexp_parser/expression/classes/free_space.rb +1 -3
- data/lib/regexp_parser/expression/classes/group.rb +6 -6
- data/lib/regexp_parser/expression/classes/keep.rb +2 -0
- data/lib/regexp_parser/expression/classes/literal.rb +1 -5
- data/lib/regexp_parser/expression/classes/root.rb +3 -6
- data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +1 -2
- data/lib/regexp_parser/expression/methods/construct.rb +43 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +1 -1
- data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
- data/lib/regexp_parser/expression/methods/tests.rb +10 -1
- data/lib/regexp_parser/expression/quantifier.rb +41 -23
- data/lib/regexp_parser/expression/sequence.rb +9 -24
- data/lib/regexp_parser/expression/sequence_operation.rb +2 -2
- data/lib/regexp_parser/expression/shared.rb +85 -0
- data/lib/regexp_parser/expression/subexpression.rb +11 -8
- data/lib/regexp_parser/expression.rb +10 -132
- data/lib/regexp_parser/lexer.rb +8 -6
- data/lib/regexp_parser/parser.rb +21 -72
- data/lib/regexp_parser/scanner/properties/long.csv +622 -0
- data/lib/regexp_parser/scanner/properties/short.csv +246 -0
- data/lib/regexp_parser/scanner/property.rl +1 -1
- data/lib/regexp_parser/scanner/scanner.rl +48 -35
- data/lib/regexp_parser/scanner.rb +735 -801
- data/lib/regexp_parser/syntax/any.rb +2 -7
- data/lib/regexp_parser/syntax/base.rb +91 -66
- data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
- data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
- data/lib/regexp_parser/syntax/token/backreference.rb +30 -0
- data/lib/regexp_parser/syntax/{tokens → token}/character_set.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
- data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
- data/lib/regexp_parser/syntax/token/escape.rb +31 -0
- data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
- data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
- data/lib/regexp_parser/syntax/{tokens → token}/meta.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
- data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
- data/lib/regexp_parser/syntax/token/unicode_property.rb +717 -0
- data/lib/regexp_parser/syntax/token.rb +45 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +20 -29
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
- data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
- data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
- data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions.rb +1 -1
- data/lib/regexp_parser/syntax.rb +1 -1
- data/lib/regexp_parser/token.rb +9 -20
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +0 -2
- data/regexp_parser.gemspec +20 -22
- metadata +37 -166
- data/lib/regexp_parser/scanner/properties/long.yml +0 -594
- data/lib/regexp_parser/scanner/properties/short.yml +0 -237
- data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
- data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
- data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
- data/lib/regexp_parser/syntax/tokens.rb +0 -45
- data/spec/expression/base_spec.rb +0 -104
- data/spec/expression/clone_spec.rb +0 -152
- data/spec/expression/conditional_spec.rb +0 -89
- data/spec/expression/free_space_spec.rb +0 -27
- data/spec/expression/methods/match_length_spec.rb +0 -161
- data/spec/expression/methods/match_spec.rb +0 -25
- data/spec/expression/methods/strfregexp_spec.rb +0 -224
- data/spec/expression/methods/tests_spec.rb +0 -99
- data/spec/expression/methods/traverse_spec.rb +0 -161
- data/spec/expression/options_spec.rb +0 -128
- data/spec/expression/subexpression_spec.rb +0 -50
- data/spec/expression/to_h_spec.rb +0 -26
- data/spec/expression/to_s_spec.rb +0 -108
- data/spec/lexer/all_spec.rb +0 -22
- data/spec/lexer/conditionals_spec.rb +0 -53
- data/spec/lexer/delimiters_spec.rb +0 -68
- data/spec/lexer/escapes_spec.rb +0 -14
- data/spec/lexer/keep_spec.rb +0 -10
- data/spec/lexer/literals_spec.rb +0 -64
- data/spec/lexer/nesting_spec.rb +0 -99
- data/spec/lexer/refcalls_spec.rb +0 -60
- data/spec/parser/all_spec.rb +0 -43
- data/spec/parser/alternation_spec.rb +0 -88
- data/spec/parser/anchors_spec.rb +0 -17
- data/spec/parser/conditionals_spec.rb +0 -179
- data/spec/parser/errors_spec.rb +0 -30
- data/spec/parser/escapes_spec.rb +0 -121
- data/spec/parser/free_space_spec.rb +0 -130
- data/spec/parser/groups_spec.rb +0 -108
- data/spec/parser/keep_spec.rb +0 -6
- data/spec/parser/options_spec.rb +0 -28
- data/spec/parser/posix_classes_spec.rb +0 -8
- data/spec/parser/properties_spec.rb +0 -115
- data/spec/parser/quantifiers_spec.rb +0 -68
- data/spec/parser/refcalls_spec.rb +0 -117
- data/spec/parser/set/intersections_spec.rb +0 -127
- data/spec/parser/set/ranges_spec.rb +0 -111
- data/spec/parser/sets_spec.rb +0 -178
- data/spec/parser/types_spec.rb +0 -18
- data/spec/scanner/all_spec.rb +0 -18
- data/spec/scanner/anchors_spec.rb +0 -21
- data/spec/scanner/conditionals_spec.rb +0 -128
- data/spec/scanner/delimiters_spec.rb +0 -52
- data/spec/scanner/errors_spec.rb +0 -67
- data/spec/scanner/escapes_spec.rb +0 -64
- data/spec/scanner/free_space_spec.rb +0 -165
- data/spec/scanner/groups_spec.rb +0 -61
- data/spec/scanner/keep_spec.rb +0 -10
- data/spec/scanner/literals_spec.rb +0 -39
- data/spec/scanner/meta_spec.rb +0 -18
- data/spec/scanner/options_spec.rb +0 -36
- data/spec/scanner/properties_spec.rb +0 -64
- data/spec/scanner/quantifiers_spec.rb +0 -25
- data/spec/scanner/refcalls_spec.rb +0 -55
- data/spec/scanner/sets_spec.rb +0 -151
- data/spec/scanner/types_spec.rb +0 -14
- data/spec/spec_helper.rb +0 -16
- data/spec/support/runner.rb +0 -42
- data/spec/support/shared_examples.rb +0 -77
- data/spec/support/warning_extractor.rb +0 -60
- data/spec/syntax/syntax_spec.rb +0 -48
- data/spec/syntax/syntax_token_map_spec.rb +0 -23
- data/spec/syntax/versions/1.8.6_spec.rb +0 -17
- data/spec/syntax/versions/1.9.1_spec.rb +0 -10
- data/spec/syntax/versions/1.9.3_spec.rb +0 -9
- data/spec/syntax/versions/2.0.0_spec.rb +0 -13
- data/spec/syntax/versions/2.2.0_spec.rb +0 -9
- data/spec/syntax/versions/aliases_spec.rb +0 -37
- data/spec/token/token_spec.rb +0 -85
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f871ec3cdea5a594f72f5386f1b344710e6204f7307ba40d966653197f526be8
|
4
|
+
data.tar.gz: dd93c880f29ec77531faa2379fbfc8e34a9b67680664c6a3477d38afeaa1809a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 45e52ab0ce7bec3e4a275efa3828532778c49e8d36eec1ea82a43755a87abc9eee97e986027aa8f5c64fd604f15164d2ad4f37e5d6e22a5a1e3e9da6788271b9
|
7
|
+
data.tar.gz: 1f5514f3252294d9fe0877cff1d8b0db0400838c97ed78d15bbb794b94595c20d081681e4b1fe9bb6c89be7749514d8b2b8cf385360d002cd89e2a76ce6d2e63
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,93 @@
|
|
1
1
|
## [Unreleased]
|
2
2
|
|
3
|
+
### Added
|
4
|
+
|
5
|
+
- `Regexp::Expression::Base.construct` and `.token_class` methods
|
6
|
+
|
7
|
+
## [2.4.0] - 2022-05-09 - [Janosch Müller](mailto:janosch84@gmail.com)
|
8
|
+
|
9
|
+
### Fixed
|
10
|
+
|
11
|
+
- fixed interpretation of `+` and `?` after interval quantifiers (`{n,n}`)
|
12
|
+
- they used to be treated as reluctant or possessive mode indicators
|
13
|
+
- however, Ruby does not support these modes for interval quantifiers
|
14
|
+
- they are now treated as chained quantifiers instead, as Ruby does it
|
15
|
+
- c.f. [#3](https://github.com/ammar/regexp_parser/issues/3)
|
16
|
+
- fixed `Expression::Base#nesting_level` for some tree rewrite cases
|
17
|
+
- e.g. the alternatives in `/a|[b]/` had an inconsistent nesting_level
|
18
|
+
- fixed `Scanner` accepting invalid posix classes, e.g. `[[:foo:]]`
|
19
|
+
- they raise a `SyntaxError` when used in a Regexp, so they could only be passed as a String
|
20
|
+
- they now raise a `Regexp::Scanner::ValidationError` in the `Scanner`
|
21
|
+
|
22
|
+
### Added
|
23
|
+
|
24
|
+
- added `Expression::Base#==` for (deep) comparison of expressions
|
25
|
+
- added `Expression::Base#parts`
|
26
|
+
- returns the text elements and subexpressions of an expression
|
27
|
+
- e.g. `parse(/(a)/)[0].parts # => ["(", #<Literal @text="a"...>, ")"]`
|
28
|
+
- added `Expression::Base#te` (a.k.a. token end index)
|
29
|
+
- `Expression::Subexpression` always had `#te`, only terminal nodes lacked it so far
|
30
|
+
- made some `Expression::Base` methods available on `Quantifier` instances, too
|
31
|
+
- `#type`, `#type?`, `#is?`, `#one_of?`, `#options`, `#terminal?`
|
32
|
+
- `#base_length`, `#full_length`, `#starts_at`, `#te`, `#ts`, `#offset`
|
33
|
+
- `#conditional_level`, `#level`, `#nesting_level`, `#set_level`
|
34
|
+
- this allows a more unified handling with `Expression::Base` instances
|
35
|
+
- allowed `Quantifier#initialize` to take a token and options Hash like other nodes
|
36
|
+
- added a deprecation warning for initializing Quantifiers with 4+ arguments:
|
37
|
+
|
38
|
+
Calling `Expression::Base#quantify` or `Quantifier.new` with 4+ arguments
|
39
|
+
is deprecated.
|
40
|
+
|
41
|
+
It will no longer be supported in regexp_parser v3.0.0.
|
42
|
+
|
43
|
+
Please pass a Regexp::Token instead, e.g. replace `token, text, min, max, mode`
|
44
|
+
with `::Regexp::Token.new(:quantifier, token, text)`. min, max, and mode
|
45
|
+
will be derived automatically.
|
46
|
+
|
47
|
+
Or do `exp.quantifier = Quantifier.construct(token: token, text: str)`.
|
48
|
+
|
49
|
+
This is consistent with how Expression::Base instances are created.
|
50
|
+
|
51
|
+
|
52
|
+
## [2.3.1] - 2022-04-24 - [Janosch Müller](mailto:janosch84@gmail.com)
|
53
|
+
|
54
|
+
### Fixed
|
55
|
+
|
56
|
+
- removed five inexistent unicode properties from `Syntax#features`
|
57
|
+
- these were never supported by Ruby or the `Regexp::Scanner`
|
58
|
+
- thanks to [Markus Schirp](https://github.com/mbj) for the report
|
59
|
+
|
60
|
+
## [2.3.0] - 2022-04-08 - [Janosch Müller](mailto:janosch84@gmail.com)
|
61
|
+
|
62
|
+
### Added
|
63
|
+
|
64
|
+
- improved parsing performance through `Syntax` refactoring
|
65
|
+
- instead of fresh `Syntax` instances, pre-loaded constants are now re-used
|
66
|
+
- this approximately doubles the parsing speed for simple regexps
|
67
|
+
- added methods to `Syntax` classes to show relative feature sets
|
68
|
+
- e.g. `Regexp::Syntax::V3_2_0.added_features`
|
69
|
+
- support for new unicode properties of Ruby 3.2 / Unicode 14.0
|
70
|
+
|
71
|
+
## [2.2.1] - 2022-02-11 - [Janosch Müller](mailto:janosch84@gmail.com)
|
72
|
+
|
73
|
+
### Fixed
|
74
|
+
|
75
|
+
- fixed Syntax version of absence groups (`(?~...)`)
|
76
|
+
- the lexer accepted them for any Ruby version
|
77
|
+
- now they are only recognized for Ruby >= 2.4.1 in which they were introduced
|
78
|
+
- reduced gem size by excluding specs from package
|
79
|
+
- removed deprecated `test_files` gemspec setting
|
80
|
+
- no longer depend on `yaml`/`psych` (except for Ruby <= 2.4)
|
81
|
+
- no longer depend on `set`
|
82
|
+
- `set` was removed from the stdlib and made a standalone gem as of Ruby 3
|
83
|
+
- this made it a hidden/undeclared dependency of `regexp_parser`
|
84
|
+
|
85
|
+
## [2.2.0] - 2021-12-04 - [Janosch Müller](mailto:janosch84@gmail.com)
|
86
|
+
|
87
|
+
### Added
|
88
|
+
|
89
|
+
- added support for 13 new unicode properties introduced in Ruby 3.1.0
|
90
|
+
|
3
91
|
## [2.1.1] - 2021-02-23 - [Janosch Müller](mailto:janosch84@gmail.com)
|
4
92
|
|
5
93
|
### Fixed
|
@@ -149,7 +237,7 @@
|
|
149
237
|
|
150
238
|
### Added
|
151
239
|
|
152
|
-
- `Expression#each_expression` and `#traverse` can now be called without a block
|
240
|
+
- `Expression::Base#each_expression` and `#traverse` can now be called without a block
|
153
241
|
* this returns an `Enumerator` and allows chaining, e.g. `each_expression.select`
|
154
242
|
* thanks to [Masataka Kuwabara](https://github.com/pocke)
|
155
243
|
|
@@ -175,7 +263,7 @@
|
|
175
263
|
- Fixed `Group#option_changes` not accounting for indirectly disabled (overridden) encoding flags
|
176
264
|
- Fixed `Scanner` allowing negative encoding options if there were no positive options, e.g. '(?-u)'
|
177
265
|
- Fixed `ScannerError` for some valid meta/control sequences such as '\\C-\\\\'
|
178
|
-
- Fixed `Expression#match` and `#=~` not working with a single argument
|
266
|
+
- Fixed `Expression::Base#match` and `#=~` not working with a single argument
|
179
267
|
|
180
268
|
### [1.5.0] - 2019-05-14 - [Janosch Müller](mailto:janosch84@gmail.com)
|
181
269
|
|
@@ -183,15 +271,15 @@
|
|
183
271
|
|
184
272
|
- Added `#referenced_expression` for backrefs, subexp calls and conditionals
|
185
273
|
* returns the `Group` expression that is being referenced via name or number
|
186
|
-
- Added `Expression#repetitions`
|
274
|
+
- Added `Expression::Base#repetitions`
|
187
275
|
* returns a `Range` of allowed repetitions (`1..1` if there is no quantifier)
|
188
276
|
* like `#quantity` but with a more uniform interface
|
189
|
-
- Added `Expression#match_length`
|
277
|
+
- Added `Expression::Base#match_length`
|
190
278
|
* allows to inspect and iterate over String lengths matched by the Expression
|
191
279
|
|
192
280
|
### Fixed
|
193
281
|
|
194
|
-
- Fixed `Expression#clone` "direction"
|
282
|
+
- Fixed `Expression::Base#clone` "direction"
|
195
283
|
* it used to dup ivars onto the callee, leaving only the clone referencing the original objects
|
196
284
|
* this will affect you if you call `#eql?`/`#equal?` on expressions or use them as Hash keys
|
197
285
|
- Fixed `#clone` results for `Sequences`, e.g. alternations and conditionals
|
@@ -353,7 +441,7 @@ This release includes several breaking changes, mostly to character sets, #map a
|
|
353
441
|
- Fixed a thread safety issue (issue #45)
|
354
442
|
- Some public class methods that were only reliable for
|
355
443
|
internal use are now private instance methods (PR #46)
|
356
|
-
- Improved the usefulness of Expression#options (issue #43) -
|
444
|
+
- Improved the usefulness of Expression::Base#options (issue #43) -
|
357
445
|
#options and derived methods such as #i?, #m? and #x? are now
|
358
446
|
defined for all Expressions that are affected by such flags.
|
359
447
|
- Fixed scanning of whitespace following (?x) (commit 5c94bd2)
|
data/Gemfile
CHANGED
@@ -5,9 +5,10 @@ gemspec
|
|
5
5
|
group :development, :test do
|
6
6
|
gem 'ice_nine', '~> 0.11.2'
|
7
7
|
gem 'rake', '~> 13.0'
|
8
|
-
gem 'regexp_property_values', '~> 1.
|
8
|
+
gem 'regexp_property_values', '~> 1.3'
|
9
9
|
gem 'rspec', '~> 3.10'
|
10
10
|
if RUBY_VERSION.to_f >= 2.7
|
11
|
+
gem 'benchmark-ips', '~> 2.1'
|
11
12
|
gem 'gouteur'
|
12
13
|
gem 'rubocop', '~> 1.7'
|
13
14
|
end
|
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
# Regexp::Parser
|
2
2
|
|
3
|
-
[](http://badge.fury.io/rb/regexp_parser)
|
3
|
+
[](http://badge.fury.io/rb/regexp_parser)
|
4
|
+
[](https://github.com/ammar/regexp_parser/actions)
|
5
|
+
[](https://github.com/ammar/regexp_parser/actions)
|
6
|
+
[](https://codeclimate.com/github/ammar/regexp_parser/badges)
|
4
7
|
|
5
8
|
A Ruby gem for tokenizing, parsing, and transforming regular expressions.
|
6
9
|
|
@@ -154,31 +157,41 @@ flavor). Syntax classes act as lookup tables, and are layered to create
|
|
154
157
|
flavor variations. Syntax only comes into play in the lexer.
|
155
158
|
|
156
159
|
#### Example
|
157
|
-
The following
|
160
|
+
The following fetches syntax objects for Ruby 2.0, 1.9, 1.8, and
|
158
161
|
checks a few of their implementation features.
|
159
162
|
|
160
163
|
```ruby
|
161
164
|
require 'regexp_parser'
|
162
165
|
|
163
|
-
ruby_20 = Regexp::Syntax.
|
166
|
+
ruby_20 = Regexp::Syntax.for 'ruby/2.0'
|
164
167
|
ruby_20.implements? :quantifier, :zero_or_one # => true
|
165
168
|
ruby_20.implements? :quantifier, :zero_or_one_reluctant # => true
|
166
169
|
ruby_20.implements? :quantifier, :zero_or_one_possessive # => true
|
167
170
|
ruby_20.implements? :conditional, :condition # => true
|
168
171
|
|
169
|
-
ruby_19 = Regexp::Syntax.
|
172
|
+
ruby_19 = Regexp::Syntax.for 'ruby/1.9'
|
170
173
|
ruby_19.implements? :quantifier, :zero_or_one # => true
|
171
174
|
ruby_19.implements? :quantifier, :zero_or_one_reluctant # => true
|
172
175
|
ruby_19.implements? :quantifier, :zero_or_one_possessive # => true
|
173
176
|
ruby_19.implements? :conditional, :condition # => false
|
174
177
|
|
175
|
-
ruby_18 = Regexp::Syntax.
|
178
|
+
ruby_18 = Regexp::Syntax.for 'ruby/1.8'
|
176
179
|
ruby_18.implements? :quantifier, :zero_or_one # => true
|
177
180
|
ruby_18.implements? :quantifier, :zero_or_one_reluctant # => true
|
178
181
|
ruby_18.implements? :quantifier, :zero_or_one_possessive # => false
|
179
182
|
ruby_18.implements? :conditional, :condition # => false
|
180
183
|
```
|
181
184
|
|
185
|
+
Syntax objects can also be queried about their complete and relative feature sets.
|
186
|
+
|
187
|
+
```ruby
|
188
|
+
require 'regexp_parser'
|
189
|
+
|
190
|
+
ruby_20 = Regexp::Syntax.for 'ruby/2.0' # => Regexp::Syntax::V2_0_0
|
191
|
+
ruby_20.added_features # => { conditional: [...], ... }
|
192
|
+
ruby_20.removed_features # => { property: [:newline], ... }
|
193
|
+
ruby_20.features # => { anchor: [...], ... }
|
194
|
+
```
|
182
195
|
|
183
196
|
#### Notes
|
184
197
|
* Variations on a token, for example a named group with angle brackets (< and >)
|
@@ -354,15 +367,15 @@ _Note that not all of these are available in all versions of Ruby_
|
|
354
367
|
| **POSIX Classes** | `[:alpha:]`, `[:^digit:]` | ✓ |
|
355
368
|
| **Quantifiers** | | ⋱ |
|
356
369
|
|   _**Greedy**_ | `?`, `*`, `+`, `{m,M}` | ✓ |
|
357
|
-
|   _**Reluctant** (Lazy)_ | `??`, `*?`,
|
358
|
-
|   _**Possessive**_ | `?+`, `*+`,
|
370
|
+
|   _**Reluctant** (Lazy)_ | `??`, `*?`, `+?` \[1\] | ✓ |
|
371
|
+
|   _**Possessive**_ | `?+`, `*+`, `++` \[1\] | ✓ |
|
359
372
|
| **String Escapes** | | ⋱ |
|
360
|
-
|   _**Control**_
|
373
|
+
|   _**Control** \[2\]_ | `\C-C`, `\cD` | ✓ |
|
361
374
|
|   _**Hex**_ | `\x20`, `\x{701230}` | ✓ |
|
362
|
-
|   _**Meta**_
|
375
|
+
|   _**Meta** \[2\]_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | ✓ |
|
363
376
|
|   _**Octal**_ | `\0`, `\01`, `\012` | ✓ |
|
364
377
|
|   _**Unicode**_ | `\uHHHH`, `\u{H+ H+}` | ✓ |
|
365
|
-
| **Unicode Properties** | _<sub>([Unicode
|
378
|
+
| **Unicode Properties** | _<sub>([Unicode 13.0.0](https://www.unicode.org/versions/Unicode13.0.0/))</sub>_ | ⋱ |
|
366
379
|
|   _**Age**_ | `\p{Age=5.2}`, `\P{age=7.0}`, `\p{^age=8.0}` | ✓ |
|
367
380
|
|   _**Blocks**_ | `\p{InArmenian}`, `\P{InKhmer}`, `\p{^InThai}` | ✓ |
|
368
381
|
|   _**Classes**_ | `\p{Alpha}`, `\P{Space}`, `\p{^Alnum}` | ✓ |
|
@@ -371,6 +384,14 @@ _Note that not all of these are available in all versions of Ruby_
|
|
371
384
|
|   _**Scripts**_ | `\p{Arabic}`, `\P{Hiragana}`, `\p{^Greek}` | ✓ |
|
372
385
|
|   _**Simple**_ | `\p{Dash}`, `\p{Extender}`, `\p{^Hyphen}` | ✓ |
|
373
386
|
|
387
|
+
**\[1\]**: Ruby does not support lazy or possessive interval quantifiers. Any `+` or `?` that follows an interval
|
388
|
+
quantifier will be treated as another, chained quantifier. See also [#3](https://github.com/ammar/regexp_parser/issues/3),
|
389
|
+
[#69](https://github.com/ammar/regexp_parser/pull/69).
|
390
|
+
|
391
|
+
**\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex escapes when used in Regexp literals](
|
392
|
+
https://github.com/ruby/ruby/commit/11ae581a4a7f5d5f5ec6378872eab8f25381b1b9 ), so they will only reach the
|
393
|
+
scanner and will only be emitted if a String or a Regexp that has been built with the `::new` constructor is scanned.
|
394
|
+
|
374
395
|
##### Inapplicable Features
|
375
396
|
|
376
397
|
Some modifiers, like `o` and `s`, apply to the **Regexp** object itself and do not
|
@@ -384,7 +405,6 @@ expressions library (Onigmo). They are not supported by the scanner.
|
|
384
405
|
- **Quotes**: `\Q...\E` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L499)_
|
385
406
|
- **Capture History**: `(?@...)`, `(?@<name>...)` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L550)_
|
386
407
|
|
387
|
-
|
388
408
|
See something missing? Please submit an [issue](https://github.com/ammar/regexp_parser/issues)
|
389
409
|
|
390
410
|
_**Note**: Attempting to process expressions with unsupported syntax features can raise an error,
|
@@ -392,26 +412,14 @@ or incorrectly return tokens/objects as literals._
|
|
392
412
|
|
393
413
|
|
394
414
|
## Testing
|
395
|
-
To run the tests simply run rake from the root directory
|
415
|
+
To run the tests simply run rake from the root directory.
|
396
416
|
|
397
|
-
|
417
|
+
The default task generates the scanner's code from the Ragel source files and runs all the specs, thus it requires Ragel to be installed.
|
398
418
|
|
399
|
-
|
419
|
+
Note that changes to Ragel files will not be reflected when running `rspec` on its own, so to run individual tests you might want to run:
|
400
420
|
|
401
421
|
```
|
402
|
-
|
403
|
-
```
|
404
|
-
|
405
|
-
You can run a specific test like so:
|
406
|
-
|
407
|
-
```
|
408
|
-
bin/test spec/scanner/properties_spec.rb
|
409
|
-
```
|
410
|
-
|
411
|
-
Note that changes to Ragel files will not be reflected when running `rspec` or `bin/test`, so you might want to run:
|
412
|
-
|
413
|
-
```
|
414
|
-
rake ragel:rb && bin/test spec/scanner/properties_spec.rb
|
422
|
+
rake ragel:rb && rspec spec/scanner/properties_spec.rb
|
415
423
|
```
|
416
424
|
|
417
425
|
## Building
|
@@ -439,11 +447,13 @@ Projects using regexp_parser.
|
|
439
447
|
|
440
448
|
- [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool that uses regexp_parser to convert Regexps to css/xpath selectors.
|
441
449
|
|
442
|
-
- [js_regex](https://github.com/
|
450
|
+
- [js_regex](https://github.com/jaynetics/js_regex) converts Ruby regular expressions to JavaScript-compatible regular expressions.
|
443
451
|
|
444
452
|
- [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor with alias support.
|
445
453
|
|
446
|
-
- [mutant](https://github.com/mbj/mutant)
|
454
|
+
- [mutant](https://github.com/mbj/mutant) manipulates your regular expressions (amongst others) to see if your tests cover their behavior.
|
455
|
+
|
456
|
+
- [repper](https://github.com/jaynetics/repper) is a regular expression pretty-printer for Ruby.
|
447
457
|
|
448
458
|
- [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that uses regexp_parser to lint Regexps.
|
449
459
|
|
@@ -476,4 +486,4 @@ Documentation and books used while working on this project.
|
|
476
486
|
|
477
487
|
---
|
478
488
|
##### Copyright
|
479
|
-
_Copyright (c) 2010-
|
489
|
+
_Copyright (c) 2010-2022 Ammar Ali. See LICENSE file for details._
|
data/Rakefile
CHANGED
@@ -1,87 +1,23 @@
|
|
1
|
+
require 'bundler'
|
1
2
|
require 'rubygems'
|
2
|
-
|
3
|
+
require 'rubygems/package_task'
|
3
4
|
require 'rake'
|
4
5
|
require 'rake/testtask'
|
6
|
+
require 'rspec/core/rake_task'
|
5
7
|
|
6
|
-
|
7
|
-
require 'rubygems/package_task'
|
8
|
-
|
9
|
-
|
10
|
-
RAGEL_SOURCE_DIR = File.join(__dir__, 'lib/regexp_parser/scanner')
|
11
|
-
RAGEL_OUTPUT_DIR = File.join(__dir__, 'lib/regexp_parser')
|
12
|
-
RAGEL_SOURCE_FILES = %w{scanner} # scanner.rl includes property.rl
|
13
|
-
|
8
|
+
Dir['tasks/**/*.rake'].each { |file| load(file) }
|
14
9
|
|
15
10
|
Bundler::GemHelper.install_tasks
|
16
11
|
|
12
|
+
RSpec::Core::RakeTask.new(:spec)
|
17
13
|
|
18
14
|
task :default => [:'test:full']
|
19
15
|
|
20
16
|
namespace :test do
|
21
|
-
task full: :'ragel:rb'
|
22
|
-
sh 'bin/test'
|
23
|
-
end
|
17
|
+
task full: [:'ragel:rb', :spec]
|
24
18
|
end
|
25
19
|
|
26
|
-
namespace :ragel do
|
27
|
-
desc "Process the ragel source files and output ruby code"
|
28
|
-
task :rb do
|
29
|
-
RAGEL_SOURCE_FILES.each do |source_file|
|
30
|
-
output_file = "#{RAGEL_OUTPUT_DIR}/#{source_file}.rb"
|
31
|
-
# using faster flat table driven FSM, about 25% larger code, but about 30% faster
|
32
|
-
sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{source_file}.rl -o #{output_file}"
|
33
|
-
|
34
|
-
contents = File.read(output_file)
|
35
|
-
|
36
|
-
File.open(output_file, 'r+') do |file|
|
37
|
-
contents = "# -*- warn-indent:false; -*-\n" + contents
|
38
|
-
|
39
|
-
file.write(contents)
|
40
|
-
end
|
41
|
-
end
|
42
|
-
end
|
43
|
-
|
44
|
-
desc "Delete the ragel generated source file(s)"
|
45
|
-
task :clean do
|
46
|
-
RAGEL_SOURCE_FILES.each do |file|
|
47
|
-
sh "rm -f #{RAGEL_OUTPUT_DIR}/#{file}.rb"
|
48
|
-
end
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
|
53
20
|
# Add ragel task as a prerequisite for building the gem to ensure that the
|
54
21
|
# latest scanner code is generated and included in the build.
|
55
22
|
desc "Runs ragel:rb before building the gem"
|
56
23
|
task :build => ['ragel:rb']
|
57
|
-
|
58
|
-
|
59
|
-
namespace :props do
|
60
|
-
desc 'Write new property value hashes for the properties scanner'
|
61
|
-
task :update do
|
62
|
-
require 'regexp_property_values'
|
63
|
-
RegexpPropertyValues.update
|
64
|
-
dir = File.join(__dir__, 'lib/regexp_parser/scanner/properties')
|
65
|
-
|
66
|
-
require 'psych'
|
67
|
-
write_hash_to_file = ->(hash, path) do
|
68
|
-
File.open(path, 'w') do |f|
|
69
|
-
f.puts '#',
|
70
|
-
"# THIS FILE IS AUTO-GENERATED BY `rake props:update`, DO NOT EDIT",
|
71
|
-
'#',
|
72
|
-
hash.sort.to_h.to_yaml
|
73
|
-
end
|
74
|
-
puts "Wrote #{hash.count} aliases to `#{path}`"
|
75
|
-
end
|
76
|
-
|
77
|
-
long_names_to_tokens = RegexpPropertyValues.all.map do |val|
|
78
|
-
[val.identifier, val.full_name.downcase]
|
79
|
-
end
|
80
|
-
write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.yml")
|
81
|
-
|
82
|
-
short_names_to_tokens = RegexpPropertyValues.alias_hash.map do |k, v|
|
83
|
-
[k.identifier, v.full_name.downcase]
|
84
|
-
end
|
85
|
-
write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.yml")
|
86
|
-
end
|
87
|
-
end
|
data/lib/regexp_parser/error.rb
CHANGED
@@ -0,0 +1,75 @@
|
|
1
|
+
module Regexp::Expression
|
2
|
+
class Base
|
3
|
+
include Regexp::Expression::Shared
|
4
|
+
|
5
|
+
def initialize(token, options = {})
|
6
|
+
init_from_token_and_options(token, options)
|
7
|
+
end
|
8
|
+
|
9
|
+
def initialize_copy(orig)
|
10
|
+
self.text = orig.text.dup if orig.text
|
11
|
+
self.options = orig.options.dup if orig.options
|
12
|
+
self.quantifier = orig.quantifier.clone if orig.quantifier
|
13
|
+
super
|
14
|
+
end
|
15
|
+
|
16
|
+
def to_re(format = :full)
|
17
|
+
::Regexp.new(to_s(format))
|
18
|
+
end
|
19
|
+
|
20
|
+
def quantify(*args)
|
21
|
+
self.quantifier = Quantifier.new(*args)
|
22
|
+
end
|
23
|
+
|
24
|
+
def unquantified_clone
|
25
|
+
clone.tap { |exp| exp.quantifier = nil }
|
26
|
+
end
|
27
|
+
|
28
|
+
# Deprecated. Prefer `#repetitions` which has a more uniform interface.
|
29
|
+
def quantity
|
30
|
+
return [nil,nil] unless quantified?
|
31
|
+
[quantifier.min, quantifier.max]
|
32
|
+
end
|
33
|
+
|
34
|
+
def repetitions
|
35
|
+
return 1..1 unless quantified?
|
36
|
+
min = quantifier.min
|
37
|
+
max = quantifier.max < 0 ? Float::INFINITY : quantifier.max
|
38
|
+
range = min..max
|
39
|
+
# fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807
|
40
|
+
if RUBY_VERSION.to_f < 2.7
|
41
|
+
range.define_singleton_method(:minmax) { [min, max] }
|
42
|
+
end
|
43
|
+
range
|
44
|
+
end
|
45
|
+
|
46
|
+
def greedy?
|
47
|
+
quantified? and quantifier.greedy?
|
48
|
+
end
|
49
|
+
|
50
|
+
def reluctant?
|
51
|
+
quantified? and quantifier.reluctant?
|
52
|
+
end
|
53
|
+
alias :lazy? :reluctant?
|
54
|
+
|
55
|
+
def possessive?
|
56
|
+
quantified? and quantifier.possessive?
|
57
|
+
end
|
58
|
+
|
59
|
+
def to_h
|
60
|
+
{
|
61
|
+
type: type,
|
62
|
+
token: token,
|
63
|
+
text: to_s(:base),
|
64
|
+
starts_at: ts,
|
65
|
+
length: full_length,
|
66
|
+
level: level,
|
67
|
+
set_level: set_level,
|
68
|
+
conditional_level: conditional_level,
|
69
|
+
options: options,
|
70
|
+
quantifier: quantified? ? quantifier.to_h : nil,
|
71
|
+
}
|
72
|
+
end
|
73
|
+
alias :attributes :to_h
|
74
|
+
end
|
75
|
+
end
|
File without changes
|
@@ -20,8 +20,8 @@ module Regexp::Expression
|
|
20
20
|
self.closed = true
|
21
21
|
end
|
22
22
|
|
23
|
-
def
|
24
|
-
"#{text}#{'^' if negated?}
|
23
|
+
def parts
|
24
|
+
["#{text}#{'^' if negated?}", *expressions, ']']
|
25
25
|
end
|
26
26
|
end
|
27
27
|
end # module Regexp::Expression
|
@@ -55,8 +55,8 @@ module Regexp::Expression
|
|
55
55
|
condition.reference
|
56
56
|
end
|
57
57
|
|
58
|
-
def
|
59
|
-
|
58
|
+
def parts
|
59
|
+
[text.dup, condition, *intersperse(branches, '|'), ')']
|
60
60
|
end
|
61
61
|
|
62
62
|
def initialize_copy(orig)
|
@@ -1,16 +1,22 @@
|
|
1
1
|
module Regexp::Expression
|
2
|
+
# TODO: unify naming with Token::Escape, one way or the other, in v3.0.0
|
2
3
|
module EscapeSequence
|
3
4
|
class Base < Regexp::Expression::Base
|
4
|
-
require 'yaml'
|
5
|
-
|
6
|
-
def char
|
7
|
-
# poor man's unescape without using eval
|
8
|
-
YAML.load(%Q(---\n"#{text}"\n))
|
9
|
-
end
|
10
|
-
|
11
5
|
def codepoint
|
12
6
|
char.ord
|
13
7
|
end
|
8
|
+
|
9
|
+
if ''.respond_to?(:undump)
|
10
|
+
def char
|
11
|
+
%("#{text}").undump
|
12
|
+
end
|
13
|
+
else
|
14
|
+
# poor man's unescape without using eval
|
15
|
+
require 'yaml'
|
16
|
+
def char
|
17
|
+
YAML.load(%Q(---\n"#{text}"\n))
|
18
|
+
end
|
19
|
+
end
|
14
20
|
end
|
15
21
|
|
16
22
|
class Literal < EscapeSequence::Base
|
@@ -1,7 +1,6 @@
|
|
1
1
|
module Regexp::Expression
|
2
|
-
|
3
2
|
class FreeSpace < Regexp::Expression::Base
|
4
|
-
def quantify(
|
3
|
+
def quantify(*_args)
|
5
4
|
raise Regexp::Parser::Error, 'Can not quantify a free space object'
|
6
5
|
end
|
7
6
|
end
|
@@ -13,5 +12,4 @@ module Regexp::Expression
|
|
13
12
|
text << exp.text
|
14
13
|
end
|
15
14
|
end
|
16
|
-
|
17
15
|
end
|
@@ -1,8 +1,8 @@
|
|
1
1
|
module Regexp::Expression
|
2
2
|
module Group
|
3
3
|
class Base < Regexp::Expression::Subexpression
|
4
|
-
def
|
5
|
-
|
4
|
+
def parts
|
5
|
+
[text.dup, *expressions, ')']
|
6
6
|
end
|
7
7
|
|
8
8
|
def capturing?; false end
|
@@ -18,9 +18,9 @@ module Regexp::Expression
|
|
18
18
|
super
|
19
19
|
end
|
20
20
|
|
21
|
-
def
|
21
|
+
def parts
|
22
22
|
if implicit?
|
23
|
-
|
23
|
+
expressions
|
24
24
|
else
|
25
25
|
super
|
26
26
|
end
|
@@ -65,8 +65,8 @@ module Regexp::Expression
|
|
65
65
|
end
|
66
66
|
|
67
67
|
class Comment < Group::Base
|
68
|
-
def
|
69
|
-
text.dup
|
68
|
+
def parts
|
69
|
+
[text.dup]
|
70
70
|
end
|
71
71
|
|
72
72
|
def comment?; true end
|