regexp_parser 2.1.1 → 2.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +94 -6
- data/Gemfile +2 -1
- data/LICENSE +1 -1
- data/README.md +40 -30
- data/Rakefile +6 -70
- data/lib/regexp_parser/error.rb +1 -1
- data/lib/regexp_parser/expression/base.rb +75 -0
- data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
- data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +1 -0
- data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
- data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -2
- data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +2 -2
- data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
- data/lib/regexp_parser/expression/classes/conditional.rb +2 -2
- data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +13 -7
- data/lib/regexp_parser/expression/classes/free_space.rb +1 -3
- data/lib/regexp_parser/expression/classes/group.rb +6 -6
- data/lib/regexp_parser/expression/classes/keep.rb +2 -0
- data/lib/regexp_parser/expression/classes/literal.rb +1 -5
- data/lib/regexp_parser/expression/classes/root.rb +3 -6
- data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +1 -2
- data/lib/regexp_parser/expression/methods/construct.rb +43 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +1 -1
- data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
- data/lib/regexp_parser/expression/methods/tests.rb +10 -1
- data/lib/regexp_parser/expression/quantifier.rb +41 -23
- data/lib/regexp_parser/expression/sequence.rb +9 -24
- data/lib/regexp_parser/expression/sequence_operation.rb +2 -2
- data/lib/regexp_parser/expression/shared.rb +85 -0
- data/lib/regexp_parser/expression/subexpression.rb +11 -8
- data/lib/regexp_parser/expression.rb +10 -132
- data/lib/regexp_parser/lexer.rb +8 -6
- data/lib/regexp_parser/parser.rb +21 -72
- data/lib/regexp_parser/scanner/properties/long.csv +622 -0
- data/lib/regexp_parser/scanner/properties/short.csv +246 -0
- data/lib/regexp_parser/scanner/property.rl +1 -1
- data/lib/regexp_parser/scanner/scanner.rl +48 -35
- data/lib/regexp_parser/scanner.rb +735 -801
- data/lib/regexp_parser/syntax/any.rb +2 -7
- data/lib/regexp_parser/syntax/base.rb +91 -66
- data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
- data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
- data/lib/regexp_parser/syntax/token/backreference.rb +30 -0
- data/lib/regexp_parser/syntax/{tokens → token}/character_set.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
- data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
- data/lib/regexp_parser/syntax/token/escape.rb +31 -0
- data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
- data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
- data/lib/regexp_parser/syntax/{tokens → token}/meta.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
- data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
- data/lib/regexp_parser/syntax/token/unicode_property.rb +717 -0
- data/lib/regexp_parser/syntax/token.rb +45 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +20 -29
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
- data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
- data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
- data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions.rb +1 -1
- data/lib/regexp_parser/syntax.rb +1 -1
- data/lib/regexp_parser/token.rb +9 -20
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +0 -2
- data/regexp_parser.gemspec +20 -22
- metadata +37 -166
- data/lib/regexp_parser/scanner/properties/long.yml +0 -594
- data/lib/regexp_parser/scanner/properties/short.yml +0 -237
- data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
- data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
- data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
- data/lib/regexp_parser/syntax/tokens.rb +0 -45
- data/spec/expression/base_spec.rb +0 -104
- data/spec/expression/clone_spec.rb +0 -152
- data/spec/expression/conditional_spec.rb +0 -89
- data/spec/expression/free_space_spec.rb +0 -27
- data/spec/expression/methods/match_length_spec.rb +0 -161
- data/spec/expression/methods/match_spec.rb +0 -25
- data/spec/expression/methods/strfregexp_spec.rb +0 -224
- data/spec/expression/methods/tests_spec.rb +0 -99
- data/spec/expression/methods/traverse_spec.rb +0 -161
- data/spec/expression/options_spec.rb +0 -128
- data/spec/expression/subexpression_spec.rb +0 -50
- data/spec/expression/to_h_spec.rb +0 -26
- data/spec/expression/to_s_spec.rb +0 -108
- data/spec/lexer/all_spec.rb +0 -22
- data/spec/lexer/conditionals_spec.rb +0 -53
- data/spec/lexer/delimiters_spec.rb +0 -68
- data/spec/lexer/escapes_spec.rb +0 -14
- data/spec/lexer/keep_spec.rb +0 -10
- data/spec/lexer/literals_spec.rb +0 -64
- data/spec/lexer/nesting_spec.rb +0 -99
- data/spec/lexer/refcalls_spec.rb +0 -60
- data/spec/parser/all_spec.rb +0 -43
- data/spec/parser/alternation_spec.rb +0 -88
- data/spec/parser/anchors_spec.rb +0 -17
- data/spec/parser/conditionals_spec.rb +0 -179
- data/spec/parser/errors_spec.rb +0 -30
- data/spec/parser/escapes_spec.rb +0 -121
- data/spec/parser/free_space_spec.rb +0 -130
- data/spec/parser/groups_spec.rb +0 -108
- data/spec/parser/keep_spec.rb +0 -6
- data/spec/parser/options_spec.rb +0 -28
- data/spec/parser/posix_classes_spec.rb +0 -8
- data/spec/parser/properties_spec.rb +0 -115
- data/spec/parser/quantifiers_spec.rb +0 -68
- data/spec/parser/refcalls_spec.rb +0 -117
- data/spec/parser/set/intersections_spec.rb +0 -127
- data/spec/parser/set/ranges_spec.rb +0 -111
- data/spec/parser/sets_spec.rb +0 -178
- data/spec/parser/types_spec.rb +0 -18
- data/spec/scanner/all_spec.rb +0 -18
- data/spec/scanner/anchors_spec.rb +0 -21
- data/spec/scanner/conditionals_spec.rb +0 -128
- data/spec/scanner/delimiters_spec.rb +0 -52
- data/spec/scanner/errors_spec.rb +0 -67
- data/spec/scanner/escapes_spec.rb +0 -64
- data/spec/scanner/free_space_spec.rb +0 -165
- data/spec/scanner/groups_spec.rb +0 -61
- data/spec/scanner/keep_spec.rb +0 -10
- data/spec/scanner/literals_spec.rb +0 -39
- data/spec/scanner/meta_spec.rb +0 -18
- data/spec/scanner/options_spec.rb +0 -36
- data/spec/scanner/properties_spec.rb +0 -64
- data/spec/scanner/quantifiers_spec.rb +0 -25
- data/spec/scanner/refcalls_spec.rb +0 -55
- data/spec/scanner/sets_spec.rb +0 -151
- data/spec/scanner/types_spec.rb +0 -14
- data/spec/spec_helper.rb +0 -16
- data/spec/support/runner.rb +0 -42
- data/spec/support/shared_examples.rb +0 -77
- data/spec/support/warning_extractor.rb +0 -60
- data/spec/syntax/syntax_spec.rb +0 -48
- data/spec/syntax/syntax_token_map_spec.rb +0 -23
- data/spec/syntax/versions/1.8.6_spec.rb +0 -17
- data/spec/syntax/versions/1.9.1_spec.rb +0 -10
- data/spec/syntax/versions/1.9.3_spec.rb +0 -9
- data/spec/syntax/versions/2.0.0_spec.rb +0 -13
- data/spec/syntax/versions/2.2.0_spec.rb +0 -9
- data/spec/syntax/versions/aliases_spec.rb +0 -37
- data/spec/token/token_spec.rb +0 -85
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f871ec3cdea5a594f72f5386f1b344710e6204f7307ba40d966653197f526be8
|
4
|
+
data.tar.gz: dd93c880f29ec77531faa2379fbfc8e34a9b67680664c6a3477d38afeaa1809a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 45e52ab0ce7bec3e4a275efa3828532778c49e8d36eec1ea82a43755a87abc9eee97e986027aa8f5c64fd604f15164d2ad4f37e5d6e22a5a1e3e9da6788271b9
|
7
|
+
data.tar.gz: 1f5514f3252294d9fe0877cff1d8b0db0400838c97ed78d15bbb794b94595c20d081681e4b1fe9bb6c89be7749514d8b2b8cf385360d002cd89e2a76ce6d2e63
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,93 @@
|
|
1
1
|
## [Unreleased]
|
2
2
|
|
3
|
+
### Added
|
4
|
+
|
5
|
+
- `Regexp::Expression::Base.construct` and `.token_class` methods
|
6
|
+
|
7
|
+
## [2.4.0] - 2022-05-09 - [Janosch Müller](mailto:janosch84@gmail.com)
|
8
|
+
|
9
|
+
### Fixed
|
10
|
+
|
11
|
+
- fixed interpretation of `+` and `?` after interval quantifiers (`{n,n}`)
|
12
|
+
- they used to be treated as reluctant or possessive mode indicators
|
13
|
+
- however, Ruby does not support these modes for interval quantifiers
|
14
|
+
- they are now treated as chained quantifiers instead, as Ruby does it
|
15
|
+
- c.f. [#3](https://github.com/ammar/regexp_parser/issues/3)
|
16
|
+
- fixed `Expression::Base#nesting_level` for some tree rewrite cases
|
17
|
+
- e.g. the alternatives in `/a|[b]/` had an inconsistent nesting_level
|
18
|
+
- fixed `Scanner` accepting invalid posix classes, e.g. `[[:foo:]]`
|
19
|
+
- they raise a `SyntaxError` when used in a Regexp, so could only be passed as String
|
20
|
+
- they now raise a `Regexp::Scanner::ValidationError` in the `Scanner`
|
21
|
+
|
22
|
+
### Added
|
23
|
+
|
24
|
+
- added `Expression::Base#==` for (deep) comparison of expressions
|
25
|
+
- added `Expression::Base#parts`
|
26
|
+
- returns the text elements and subexpressions of an expression
|
27
|
+
- e.g. `parse(/(a)/)[0].parts # => ["(", #<Literal @text="a"...>, ")"]`
|
28
|
+
- added `Expression::Base#te` (a.k.a. token end index)
|
29
|
+
- `Expression::Subexpression` always had `#te`, only terminal nodes lacked it so far
|
30
|
+
- made some `Expression::Base` methods available on `Quantifier` instances, too
|
31
|
+
- `#type`, `#type?`, `#is?`, `#one_of?`, `#options`, `#terminal?`
|
32
|
+
- `#base_length`, `#full_length`, `#starts_at`, `#te`, `#ts`, `#offset`
|
33
|
+
- `#conditional_level`, `#level`, `#nesting_level`, `#set_level`
|
34
|
+
- this allows a more unified handling with `Expression::Base` instances
|
35
|
+
- allowed `Quantifier#initialize` to take a token and options Hash like other nodes
|
36
|
+
- added a deprecation warning for initializing Quantifiers with 4+ arguments:
|
37
|
+
|
38
|
+
Calling `Expression::Base#quantify` or `Quantifier.new` with 4+ arguments
|
39
|
+
is deprecated.
|
40
|
+
|
41
|
+
It will no longer be supported in regexp_parser v3.0.0.
|
42
|
+
|
43
|
+
Please pass a Regexp::Token instead, e.g. replace `token, text, min, max, mode`
|
44
|
+
with `::Regexp::Token.new(:quantifier, token, text)`. min, max, and mode
|
45
|
+
will be derived automatically.
|
46
|
+
|
47
|
+
Or do `exp.quantifier = Quantifier.construct(token: token, text: str)`.
|
48
|
+
|
49
|
+
This is consistent with how Expression::Base instances are created.
|
50
|
+
|
51
|
+
|
52
|
+
## [2.3.1] - 2022-04-24 - [Janosch Müller](mailto:janosch84@gmail.com)
|
53
|
+
|
54
|
+
### Fixed
|
55
|
+
|
56
|
+
- removed five inexistent unicode properties from `Syntax#features`
|
57
|
+
- these were never supported by Ruby or the `Regexp::Scanner`
|
58
|
+
- thanks to [Markus Schirp](https://github.com/mbj) for the report
|
59
|
+
|
60
|
+
## [2.3.0] - 2022-04-08 - [Janosch Müller](mailto:janosch84@gmail.com)
|
61
|
+
|
62
|
+
### Added
|
63
|
+
|
64
|
+
- improved parsing performance through `Syntax` refactoring
|
65
|
+
- instead of fresh `Syntax` instances, pre-loaded constants are now re-used
|
66
|
+
- this approximately doubles the parsing speed for simple regexps
|
67
|
+
- added methods to `Syntax` classes to show relative feature sets
|
68
|
+
- e.g. `Regexp::Syntax::V3_2_0.added_features`
|
69
|
+
- support for new unicode properties of Ruby 3.2 / Unicode 14.0
|
70
|
+
|
71
|
+
## [2.2.1] - 2022-02-11 - [Janosch Müller](mailto:janosch84@gmail.com)
|
72
|
+
|
73
|
+
### Fixed
|
74
|
+
|
75
|
+
- fixed Syntax version of absence groups (`(?~...)`)
|
76
|
+
- the lexer accepted them for any Ruby version
|
77
|
+
- now they are only recognized for Ruby >= 2.4.1 in which they were introduced
|
78
|
+
- reduced gem size by excluding specs from package
|
79
|
+
- removed deprecated `test_files` gemspec setting
|
80
|
+
- no longer depend on `yaml`/`psych` (except for Ruby <= 2.4)
|
81
|
+
- no longer depend on `set`
|
82
|
+
- `set` was removed from the stdlib and made a standalone gem as of Ruby 3
|
83
|
+
- this made it a hidden/undeclared dependency of `regexp_parser`
|
84
|
+
|
85
|
+
## [2.2.0] - 2021-12-04 - [Janosch Müller](mailto:janosch84@gmail.com)
|
86
|
+
|
87
|
+
### Added
|
88
|
+
|
89
|
+
- added support for 13 new unicode properties introduced in Ruby 3.1.0
|
90
|
+
|
3
91
|
## [2.1.1] - 2021-02-23 - [Janosch Müller](mailto:janosch84@gmail.com)
|
4
92
|
|
5
93
|
### Fixed
|
@@ -149,7 +237,7 @@
|
|
149
237
|
|
150
238
|
### Added
|
151
239
|
|
152
|
-
- `Expression#each_expression` and `#traverse` can now be called without a block
|
240
|
+
- `Expression::Base#each_expression` and `#traverse` can now be called without a block
|
153
241
|
* this returns an `Enumerator` and allows chaining, e.g. `each_expression.select`
|
154
242
|
* thanks to [Masataka Kuwabara](https://github.com/pocke)
|
155
243
|
|
@@ -175,7 +263,7 @@
|
|
175
263
|
- Fixed `Group#option_changes` not accounting for indirectly disabled (overridden) encoding flags
|
176
264
|
- Fixed `Scanner` allowing negative encoding options if there were no positive options, e.g. '(?-u)'
|
177
265
|
- Fixed `ScannerError` for some valid meta/control sequences such as '\\C-\\\\'
|
178
|
-
- Fixed `Expression#match` and `#=~` not working with a single argument
|
266
|
+
- Fixed `Expression::Base#match` and `#=~` not working with a single argument
|
179
267
|
|
180
268
|
### [1.5.0] - 2019-05-14 - [Janosch Müller](mailto:janosch84@gmail.com)
|
181
269
|
|
@@ -183,15 +271,15 @@
|
|
183
271
|
|
184
272
|
- Added `#referenced_expression` for backrefs, subexp calls and conditionals
|
185
273
|
* returns the `Group` expression that is being referenced via name or number
|
186
|
-
- Added `Expression#repetitions`
|
274
|
+
- Added `Expression::Base#repetitions`
|
187
275
|
* returns a `Range` of allowed repetitions (`1..1` if there is no quantifier)
|
188
276
|
* like `#quantity` but with a more uniform interface
|
189
|
-
- Added `Expression#match_length`
|
277
|
+
- Added `Expression::Base#match_length`
|
190
278
|
* allows inspecting and iterating over String lengths matched by the Expression
|
191
279
|
|
192
280
|
### Fixed
|
193
281
|
|
194
|
-
- Fixed `Expression#clone` "direction"
|
282
|
+
- Fixed `Expression::Base#clone` "direction"
|
195
283
|
* it used to dup ivars onto the callee, leaving only the clone referencing the original objects
|
196
284
|
* this will affect you if you call `#eql?`/`#equal?` on expressions or use them as Hash keys
|
197
285
|
- Fixed `#clone` results for `Sequences`, e.g. alternations and conditionals
|
@@ -353,7 +441,7 @@ This release includes several breaking changes, mostly to character sets, #map a
|
|
353
441
|
- Fixed a thread safety issue (issue #45)
|
354
442
|
- Some public class methods that were only reliable for
|
355
443
|
internal use are now private instance methods (PR #46)
|
356
|
-
- Improved the usefulness of Expression#options (issue #43) -
|
444
|
+
- Improved the usefulness of Expression::Base#options (issue #43) -
|
357
445
|
#options and derived methods such as #i?, #m? and #x? are now
|
358
446
|
defined for all Expressions that are affected by such flags.
|
359
447
|
- Fixed scanning of whitespace following (?x) (commit 5c94bd2)
|
data/Gemfile
CHANGED
@@ -5,9 +5,10 @@ gemspec
|
|
5
5
|
group :development, :test do
|
6
6
|
gem 'ice_nine', '~> 0.11.2'
|
7
7
|
gem 'rake', '~> 13.0'
|
8
|
-
gem 'regexp_property_values', '~> 1.
|
8
|
+
gem 'regexp_property_values', '~> 1.3'
|
9
9
|
gem 'rspec', '~> 3.10'
|
10
10
|
if RUBY_VERSION.to_f >= 2.7
|
11
|
+
gem 'benchmark-ips', '~> 2.1'
|
11
12
|
gem 'gouteur'
|
12
13
|
gem 'rubocop', '~> 1.7'
|
13
14
|
end
|
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
# Regexp::Parser
|
2
2
|
|
3
|
-
[![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser)
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser)
|
4
|
+
[![Build Status](https://github.com/ammar/regexp_parser/workflows/tests/badge.svg)](https://github.com/ammar/regexp_parser/actions)
|
5
|
+
[![Build Status](https://github.com/ammar/regexp_parser/workflows/gouteur/badge.svg)](https://github.com/ammar/regexp_parser/actions)
|
6
|
+
[![Code Climate](https://codeclimate.com/github/ammar/regexp_parser.svg)](https://codeclimate.com/github/ammar/regexp_parser/badges)
|
4
7
|
|
5
8
|
A Ruby gem for tokenizing, parsing, and transforming regular expressions.
|
6
9
|
|
@@ -154,31 +157,41 @@ flavor). Syntax classes act as lookup tables, and are layered to create
|
|
154
157
|
flavor variations. Syntax only comes into play in the lexer.
|
155
158
|
|
156
159
|
#### Example
|
157
|
-
The following
|
160
|
+
The following fetches syntax objects for Ruby 2.0, 1.9, 1.8, and
|
158
161
|
checks a few of their implementation features.
|
159
162
|
|
160
163
|
```ruby
|
161
164
|
require 'regexp_parser'
|
162
165
|
|
163
|
-
ruby_20 = Regexp::Syntax.
|
166
|
+
ruby_20 = Regexp::Syntax.for 'ruby/2.0'
|
164
167
|
ruby_20.implements? :quantifier, :zero_or_one # => true
|
165
168
|
ruby_20.implements? :quantifier, :zero_or_one_reluctant # => true
|
166
169
|
ruby_20.implements? :quantifier, :zero_or_one_possessive # => true
|
167
170
|
ruby_20.implements? :conditional, :condition # => true
|
168
171
|
|
169
|
-
ruby_19 = Regexp::Syntax.
|
172
|
+
ruby_19 = Regexp::Syntax.for 'ruby/1.9'
|
170
173
|
ruby_19.implements? :quantifier, :zero_or_one # => true
|
171
174
|
ruby_19.implements? :quantifier, :zero_or_one_reluctant # => true
|
172
175
|
ruby_19.implements? :quantifier, :zero_or_one_possessive # => true
|
173
176
|
ruby_19.implements? :conditional, :condition # => false
|
174
177
|
|
175
|
-
ruby_18 = Regexp::Syntax.
|
178
|
+
ruby_18 = Regexp::Syntax.for 'ruby/1.8'
|
176
179
|
ruby_18.implements? :quantifier, :zero_or_one # => true
|
177
180
|
ruby_18.implements? :quantifier, :zero_or_one_reluctant # => true
|
178
181
|
ruby_18.implements? :quantifier, :zero_or_one_possessive # => false
|
179
182
|
ruby_18.implements? :conditional, :condition # => false
|
180
183
|
```
|
181
184
|
|
185
|
+
Syntax objects can also be queried about their complete and relative feature sets.
|
186
|
+
|
187
|
+
```ruby
|
188
|
+
require 'regexp_parser'
|
189
|
+
|
190
|
+
ruby_20 = Regexp::Syntax.for 'ruby/2.0' # => Regexp::Syntax::V2_0_0
|
191
|
+
ruby_20.added_features # => { conditional: [...], ... }
|
192
|
+
ruby_20.removed_features # => { property: [:newline], ... }
|
193
|
+
ruby_20.features # => { anchor: [...], ... }
|
194
|
+
```
|
182
195
|
|
183
196
|
#### Notes
|
184
197
|
* Variations on a token, for example a named group with angle brackets (< and >)
|
@@ -354,15 +367,15 @@ _Note that not all of these are available in all versions of Ruby_
|
|
354
367
|
| **POSIX Classes** | `[:alpha:]`, `[:^digit:]` | ✓ |
|
355
368
|
| **Quantifiers** | | ⋱ |
|
356
369
|
|   _**Greedy**_ | `?`, `*`, `+`, `{m,M}` | ✓ |
|
357
|
-
|   _**Reluctant** (Lazy)_ | `??`, `*?`,
|
358
|
-
|   _**Possessive**_ | `?+`, `*+`,
|
370
|
+
|   _**Reluctant** (Lazy)_ | `??`, `*?`, `+?` \[1\] | ✓ |
|
371
|
+
|   _**Possessive**_ | `?+`, `*+`, `++` \[1\] | ✓ |
|
359
372
|
| **String Escapes** | | ⋱ |
|
360
|
-
|   _**Control**_
|
373
|
+
|   _**Control** \[2\]_ | `\C-C`, `\cD` | ✓ |
|
361
374
|
|   _**Hex**_ | `\x20`, `\x{701230}` | ✓ |
|
362
|
-
|   _**Meta**_
|
375
|
+
|   _**Meta** \[2\]_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | ✓ |
|
363
376
|
|   _**Octal**_ | `\0`, `\01`, `\012` | ✓ |
|
364
377
|
|   _**Unicode**_ | `\uHHHH`, `\u{H+ H+}` | ✓ |
|
365
|
-
| **Unicode Properties** | _<sub>([Unicode
|
378
|
+
| **Unicode Properties** | _<sub>([Unicode 13.0.0](https://www.unicode.org/versions/Unicode13.0.0/))</sub>_ | ⋱ |
|
366
379
|
|   _**Age**_ | `\p{Age=5.2}`, `\P{age=7.0}`, `\p{^age=8.0}` | ✓ |
|
367
380
|
|   _**Blocks**_ | `\p{InArmenian}`, `\P{InKhmer}`, `\p{^InThai}` | ✓ |
|
368
381
|
|   _**Classes**_ | `\p{Alpha}`, `\P{Space}`, `\p{^Alnum}` | ✓ |
|
@@ -371,6 +384,14 @@ _Note that not all of these are available in all versions of Ruby_
|
|
371
384
|
|   _**Scripts**_ | `\p{Arabic}`, `\P{Hiragana}`, `\p{^Greek}` | ✓ |
|
372
385
|
|   _**Simple**_ | `\p{Dash}`, `\p{Extender}`, `\p{^Hyphen}` | ✓ |
|
373
386
|
|
387
|
+
**\[1\]**: Ruby does not support lazy or possessive interval quantifiers. Any `+` or `?` that follows an interval
|
388
|
+
quantifier will be treated as another, chained quantifier. See also [#3](https://github.com/ammar/regexp_parser/issues/3),
|
389
|
+
[#69](https://github.com/ammar/regexp_parser/pull/69).
|
390
|
+
|
391
|
+
**\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex escapes when used in Regexp literals](
|
392
|
+
https://github.com/ruby/ruby/commit/11ae581a4a7f5d5f5ec6378872eab8f25381b1b9 ), so they will only reach the
|
393
|
+
scanner and will only be emitted if a String or a Regexp that has been built with the `::new` constructor is scanned.
|
394
|
+
|
374
395
|
##### Inapplicable Features
|
375
396
|
|
376
397
|
Some modifiers, like `o` and `s`, apply to the **Regexp** object itself and do not
|
@@ -384,7 +405,6 @@ expressions library (Onigmo). They are not supported by the scanner.
|
|
384
405
|
- **Quotes**: `\Q...\E` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L499)_
|
385
406
|
- **Capture History**: `(?@...)`, `(?@<name>...)` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L550)_
|
386
407
|
|
387
|
-
|
388
408
|
See something missing? Please submit an [issue](https://github.com/ammar/regexp_parser/issues)
|
389
409
|
|
390
410
|
_**Note**: Attempting to process expressions with unsupported syntax features can raise an error,
|
@@ -392,26 +412,14 @@ or incorrectly return tokens/objects as literals._
|
|
392
412
|
|
393
413
|
|
394
414
|
## Testing
|
395
|
-
To run the tests simply run rake from the root directory
|
415
|
+
To run the tests simply run rake from the root directory.
|
396
416
|
|
397
|
-
|
417
|
+
The default task generates the scanner's code from the Ragel source files and runs all the specs, thus it requires Ragel to be installed.
|
398
418
|
|
399
|
-
|
419
|
+
Note that changes to Ragel files will not be reflected when running `rspec` on its own, so to run individual tests you might want to run:
|
400
420
|
|
401
421
|
```
|
402
|
-
|
403
|
-
```
|
404
|
-
|
405
|
-
You can run a specific test like so:
|
406
|
-
|
407
|
-
```
|
408
|
-
bin/test spec/scanner/properties_spec.rb
|
409
|
-
```
|
410
|
-
|
411
|
-
Note that changes to Ragel files will not be reflected when running `rspec` or `bin/test`, so you might want to run:
|
412
|
-
|
413
|
-
```
|
414
|
-
rake ragel:rb && bin/test spec/scanner/properties_spec.rb
|
422
|
+
rake ragel:rb && rspec spec/scanner/properties_spec.rb
|
415
423
|
```
|
416
424
|
|
417
425
|
## Building
|
@@ -439,11 +447,13 @@ Projects using regexp_parser.
|
|
439
447
|
|
440
448
|
- [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool that uses regexp_parser to convert Regexps to css/xpath selectors.
|
441
449
|
|
442
|
-
- [js_regex](https://github.com/
|
450
|
+
- [js_regex](https://github.com/jaynetics/js_regex) converts Ruby regular expressions to JavaScript-compatible regular expressions.
|
443
451
|
|
444
452
|
- [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor with alias support.
|
445
453
|
|
446
|
-
- [mutant](https://github.com/mbj/mutant)
|
454
|
+
- [mutant](https://github.com/mbj/mutant) manipulates your regular expressions (amongst others) to see if your tests cover their behavior.
|
455
|
+
|
456
|
+
- [repper](https://github.com/jaynetics/repper) is a regular expression pretty-printer for Ruby.
|
447
457
|
|
448
458
|
- [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that uses regexp_parser to lint Regexps.
|
449
459
|
|
@@ -476,4 +486,4 @@ Documentation and books used while working on this project.
|
|
476
486
|
|
477
487
|
---
|
478
488
|
##### Copyright
|
479
|
-
_Copyright (c) 2010-
|
489
|
+
_Copyright (c) 2010-2022 Ammar Ali. See LICENSE file for details._
|
data/Rakefile
CHANGED
@@ -1,87 +1,23 @@
|
|
1
|
+
require 'bundler'
|
1
2
|
require 'rubygems'
|
2
|
-
|
3
|
+
require 'rubygems/package_task'
|
3
4
|
require 'rake'
|
4
5
|
require 'rake/testtask'
|
6
|
+
require 'rspec/core/rake_task'
|
5
7
|
|
6
|
-
|
7
|
-
require 'rubygems/package_task'
|
8
|
-
|
9
|
-
|
10
|
-
RAGEL_SOURCE_DIR = File.join(__dir__, 'lib/regexp_parser/scanner')
|
11
|
-
RAGEL_OUTPUT_DIR = File.join(__dir__, 'lib/regexp_parser')
|
12
|
-
RAGEL_SOURCE_FILES = %w{scanner} # scanner.rl includes property.rl
|
13
|
-
|
8
|
+
Dir['tasks/**/*.rake'].each { |file| load(file) }
|
14
9
|
|
15
10
|
Bundler::GemHelper.install_tasks
|
16
11
|
|
12
|
+
RSpec::Core::RakeTask.new(:spec)
|
17
13
|
|
18
14
|
task :default => [:'test:full']
|
19
15
|
|
20
16
|
namespace :test do
|
21
|
-
task full: :'ragel:rb'
|
22
|
-
sh 'bin/test'
|
23
|
-
end
|
17
|
+
task full: [:'ragel:rb', :spec]
|
24
18
|
end
|
25
19
|
|
26
|
-
namespace :ragel do
|
27
|
-
desc "Process the ragel source files and output ruby code"
|
28
|
-
task :rb do
|
29
|
-
RAGEL_SOURCE_FILES.each do |source_file|
|
30
|
-
output_file = "#{RAGEL_OUTPUT_DIR}/#{source_file}.rb"
|
31
|
-
# using faster flat table driven FSM, about 25% larger code, but about 30% faster
|
32
|
-
sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{source_file}.rl -o #{output_file}"
|
33
|
-
|
34
|
-
contents = File.read(output_file)
|
35
|
-
|
36
|
-
File.open(output_file, 'r+') do |file|
|
37
|
-
contents = "# -*- warn-indent:false; -*-\n" + contents
|
38
|
-
|
39
|
-
file.write(contents)
|
40
|
-
end
|
41
|
-
end
|
42
|
-
end
|
43
|
-
|
44
|
-
desc "Delete the ragel generated source file(s)"
|
45
|
-
task :clean do
|
46
|
-
RAGEL_SOURCE_FILES.each do |file|
|
47
|
-
sh "rm -f #{RAGEL_OUTPUT_DIR}/#{file}.rb"
|
48
|
-
end
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
|
53
20
|
# Add ragel task as a prerequisite for building the gem to ensure that the
|
54
21
|
# latest scanner code is generated and included in the build.
|
55
22
|
desc "Runs ragel:rb before building the gem"
|
56
23
|
task :build => ['ragel:rb']
|
57
|
-
|
58
|
-
|
59
|
-
namespace :props do
|
60
|
-
desc 'Write new property value hashes for the properties scanner'
|
61
|
-
task :update do
|
62
|
-
require 'regexp_property_values'
|
63
|
-
RegexpPropertyValues.update
|
64
|
-
dir = File.join(__dir__, 'lib/regexp_parser/scanner/properties')
|
65
|
-
|
66
|
-
require 'psych'
|
67
|
-
write_hash_to_file = ->(hash, path) do
|
68
|
-
File.open(path, 'w') do |f|
|
69
|
-
f.puts '#',
|
70
|
-
"# THIS FILE IS AUTO-GENERATED BY `rake props:update`, DO NOT EDIT",
|
71
|
-
'#',
|
72
|
-
hash.sort.to_h.to_yaml
|
73
|
-
end
|
74
|
-
puts "Wrote #{hash.count} aliases to `#{path}`"
|
75
|
-
end
|
76
|
-
|
77
|
-
long_names_to_tokens = RegexpPropertyValues.all.map do |val|
|
78
|
-
[val.identifier, val.full_name.downcase]
|
79
|
-
end
|
80
|
-
write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.yml")
|
81
|
-
|
82
|
-
short_names_to_tokens = RegexpPropertyValues.alias_hash.map do |k, v|
|
83
|
-
[k.identifier, v.full_name.downcase]
|
84
|
-
end
|
85
|
-
write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.yml")
|
86
|
-
end
|
87
|
-
end
|
data/lib/regexp_parser/error.rb
CHANGED
@@ -0,0 +1,75 @@
|
|
1
|
+
module Regexp::Expression
|
2
|
+
class Base
|
3
|
+
include Regexp::Expression::Shared
|
4
|
+
|
5
|
+
def initialize(token, options = {})
|
6
|
+
init_from_token_and_options(token, options)
|
7
|
+
end
|
8
|
+
|
9
|
+
def initialize_copy(orig)
|
10
|
+
self.text = orig.text.dup if orig.text
|
11
|
+
self.options = orig.options.dup if orig.options
|
12
|
+
self.quantifier = orig.quantifier.clone if orig.quantifier
|
13
|
+
super
|
14
|
+
end
|
15
|
+
|
16
|
+
def to_re(format = :full)
|
17
|
+
::Regexp.new(to_s(format))
|
18
|
+
end
|
19
|
+
|
20
|
+
def quantify(*args)
|
21
|
+
self.quantifier = Quantifier.new(*args)
|
22
|
+
end
|
23
|
+
|
24
|
+
def unquantified_clone
|
25
|
+
clone.tap { |exp| exp.quantifier = nil }
|
26
|
+
end
|
27
|
+
|
28
|
+
# Deprecated. Prefer `#repetitions` which has a more uniform interface.
|
29
|
+
def quantity
|
30
|
+
return [nil,nil] unless quantified?
|
31
|
+
[quantifier.min, quantifier.max]
|
32
|
+
end
|
33
|
+
|
34
|
+
def repetitions
|
35
|
+
return 1..1 unless quantified?
|
36
|
+
min = quantifier.min
|
37
|
+
max = quantifier.max < 0 ? Float::INFINITY : quantifier.max
|
38
|
+
range = min..max
|
39
|
+
# fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807
|
40
|
+
if RUBY_VERSION.to_f < 2.7
|
41
|
+
range.define_singleton_method(:minmax) { [min, max] }
|
42
|
+
end
|
43
|
+
range
|
44
|
+
end
|
45
|
+
|
46
|
+
def greedy?
|
47
|
+
quantified? and quantifier.greedy?
|
48
|
+
end
|
49
|
+
|
50
|
+
def reluctant?
|
51
|
+
quantified? and quantifier.reluctant?
|
52
|
+
end
|
53
|
+
alias :lazy? :reluctant?
|
54
|
+
|
55
|
+
def possessive?
|
56
|
+
quantified? and quantifier.possessive?
|
57
|
+
end
|
58
|
+
|
59
|
+
def to_h
|
60
|
+
{
|
61
|
+
type: type,
|
62
|
+
token: token,
|
63
|
+
text: to_s(:base),
|
64
|
+
starts_at: ts,
|
65
|
+
length: full_length,
|
66
|
+
level: level,
|
67
|
+
set_level: set_level,
|
68
|
+
conditional_level: conditional_level,
|
69
|
+
options: options,
|
70
|
+
quantifier: quantified? ? quantifier.to_h : nil,
|
71
|
+
}
|
72
|
+
end
|
73
|
+
alias :attributes :to_h
|
74
|
+
end
|
75
|
+
end
|
File without changes
|
@@ -20,8 +20,8 @@ module Regexp::Expression
|
|
20
20
|
self.closed = true
|
21
21
|
end
|
22
22
|
|
23
|
-
def
|
24
|
-
"#{text}#{'^' if negated?}
|
23
|
+
def parts
|
24
|
+
["#{text}#{'^' if negated?}", *expressions, ']']
|
25
25
|
end
|
26
26
|
end
|
27
27
|
end # module Regexp::Expression
|
@@ -55,8 +55,8 @@ module Regexp::Expression
|
|
55
55
|
condition.reference
|
56
56
|
end
|
57
57
|
|
58
|
-
def
|
59
|
-
|
58
|
+
def parts
|
59
|
+
[text.dup, condition, *intersperse(branches, '|'), ')']
|
60
60
|
end
|
61
61
|
|
62
62
|
def initialize_copy(orig)
|
@@ -1,16 +1,22 @@
|
|
1
1
|
module Regexp::Expression
|
2
|
+
# TODO: unify naming with Token::Escape, one way or the other, in v3.0.0
|
2
3
|
module EscapeSequence
|
3
4
|
class Base < Regexp::Expression::Base
|
4
|
-
require 'yaml'
|
5
|
-
|
6
|
-
def char
|
7
|
-
# poor man's unescape without using eval
|
8
|
-
YAML.load(%Q(---\n"#{text}"\n))
|
9
|
-
end
|
10
|
-
|
11
5
|
def codepoint
|
12
6
|
char.ord
|
13
7
|
end
|
8
|
+
|
9
|
+
if ''.respond_to?(:undump)
|
10
|
+
def char
|
11
|
+
%("#{text}").undump
|
12
|
+
end
|
13
|
+
else
|
14
|
+
# poor man's unescape without using eval
|
15
|
+
require 'yaml'
|
16
|
+
def char
|
17
|
+
YAML.load(%Q(---\n"#{text}"\n))
|
18
|
+
end
|
19
|
+
end
|
14
20
|
end
|
15
21
|
|
16
22
|
class Literal < EscapeSequence::Base
|
@@ -1,7 +1,6 @@
|
|
1
1
|
module Regexp::Expression
|
2
|
-
|
3
2
|
class FreeSpace < Regexp::Expression::Base
|
4
|
-
def quantify(
|
3
|
+
def quantify(*_args)
|
5
4
|
raise Regexp::Parser::Error, 'Can not quantify a free space object'
|
6
5
|
end
|
7
6
|
end
|
@@ -13,5 +12,4 @@ module Regexp::Expression
|
|
13
12
|
text << exp.text
|
14
13
|
end
|
15
14
|
end
|
16
|
-
|
17
15
|
end
|
@@ -1,8 +1,8 @@
|
|
1
1
|
module Regexp::Expression
|
2
2
|
module Group
|
3
3
|
class Base < Regexp::Expression::Subexpression
|
4
|
-
def
|
5
|
-
|
4
|
+
def parts
|
5
|
+
[text.dup, *expressions, ')']
|
6
6
|
end
|
7
7
|
|
8
8
|
def capturing?; false end
|
@@ -18,9 +18,9 @@ module Regexp::Expression
|
|
18
18
|
super
|
19
19
|
end
|
20
20
|
|
21
|
-
def
|
21
|
+
def parts
|
22
22
|
if implicit?
|
23
|
-
|
23
|
+
expressions
|
24
24
|
else
|
25
25
|
super
|
26
26
|
end
|
@@ -65,8 +65,8 @@ module Regexp::Expression
|
|
65
65
|
end
|
66
66
|
|
67
67
|
class Comment < Group::Base
|
68
|
-
def
|
69
|
-
text.dup
|
68
|
+
def parts
|
69
|
+
[text.dup]
|
70
70
|
end
|
71
71
|
|
72
72
|
def comment?; true end
|