regexp_parser 1.7.1 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +157 -1
- data/Gemfile +6 -1
- data/LICENSE +1 -1
- data/README.md +38 -32
- data/Rakefile +18 -27
- data/lib/regexp_parser/error.rb +4 -0
- data/lib/regexp_parser/expression/base.rb +123 -0
- data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
- data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +5 -0
- data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
- data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -1
- data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +0 -0
- data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
- data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +13 -7
- data/lib/regexp_parser/expression/classes/free_space.rb +2 -4
- data/lib/regexp_parser/expression/classes/group.rb +28 -3
- data/lib/regexp_parser/expression/classes/literal.rb +1 -5
- data/lib/regexp_parser/expression/classes/property.rb +1 -3
- data/lib/regexp_parser/expression/classes/root.rb +4 -17
- data/lib/regexp_parser/expression/classes/type.rb +0 -2
- data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
- data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
- data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
- data/lib/regexp_parser/expression/quantifier.rb +11 -2
- data/lib/regexp_parser/expression/sequence.rb +3 -20
- data/lib/regexp_parser/expression/subexpression.rb +1 -2
- data/lib/regexp_parser/expression.rb +7 -139
- data/lib/regexp_parser/lexer.rb +13 -11
- data/lib/regexp_parser/parser.rb +325 -344
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/properties/long.csv +604 -0
- data/lib/regexp_parser/scanner/properties/short.csv +242 -0
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +235 -255
- data/lib/regexp_parser/scanner.rb +1324 -1387
- data/lib/regexp_parser/syntax/any.rb +4 -6
- data/lib/regexp_parser/syntax/base.rb +13 -15
- data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
- data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
- data/lib/regexp_parser/syntax/token/backreference.rb +30 -0
- data/lib/regexp_parser/syntax/{tokens → token}/character_set.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
- data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
- data/lib/regexp_parser/syntax/token/escape.rb +31 -0
- data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
- data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
- data/lib/regexp_parser/syntax/{tokens → token}/meta.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
- data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
- data/lib/regexp_parser/syntax/token/unicode_property.rb +696 -0
- data/lib/regexp_parser/syntax/token.rb +45 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +4 -4
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -2
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +1 -1
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +10 -0
- data/lib/regexp_parser/syntax.rb +8 -6
- data/lib/regexp_parser/token.rb +9 -20
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +0 -2
- data/regexp_parser.gemspec +20 -22
- metadata +34 -165
- data/lib/regexp_parser/scanner/properties/long.yml +0 -594
- data/lib/regexp_parser/scanner/properties/short.yml +0 -237
- data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
- data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
- data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
- data/lib/regexp_parser/syntax/tokens.rb +0 -45
- data/spec/expression/base_spec.rb +0 -94
- data/spec/expression/clone_spec.rb +0 -120
- data/spec/expression/conditional_spec.rb +0 -89
- data/spec/expression/free_space_spec.rb +0 -27
- data/spec/expression/methods/match_length_spec.rb +0 -161
- data/spec/expression/methods/match_spec.rb +0 -25
- data/spec/expression/methods/strfregexp_spec.rb +0 -224
- data/spec/expression/methods/tests_spec.rb +0 -99
- data/spec/expression/methods/traverse_spec.rb +0 -161
- data/spec/expression/options_spec.rb +0 -128
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
- data/spec/expression/subexpression_spec.rb +0 -50
- data/spec/expression/to_h_spec.rb +0 -26
- data/spec/expression/to_s_spec.rb +0 -100
- data/spec/lexer/all_spec.rb +0 -22
- data/spec/lexer/conditionals_spec.rb +0 -53
- data/spec/lexer/delimiters_spec.rb +0 -68
- data/spec/lexer/escapes_spec.rb +0 -14
- data/spec/lexer/keep_spec.rb +0 -10
- data/spec/lexer/literals_spec.rb +0 -89
- data/spec/lexer/nesting_spec.rb +0 -99
- data/spec/lexer/refcalls_spec.rb +0 -55
- data/spec/parser/all_spec.rb +0 -43
- data/spec/parser/alternation_spec.rb +0 -88
- data/spec/parser/anchors_spec.rb +0 -17
- data/spec/parser/conditionals_spec.rb +0 -179
- data/spec/parser/errors_spec.rb +0 -30
- data/spec/parser/escapes_spec.rb +0 -121
- data/spec/parser/free_space_spec.rb +0 -130
- data/spec/parser/groups_spec.rb +0 -108
- data/spec/parser/keep_spec.rb +0 -6
- data/spec/parser/posix_classes_spec.rb +0 -8
- data/spec/parser/properties_spec.rb +0 -115
- data/spec/parser/quantifiers_spec.rb +0 -52
- data/spec/parser/refcalls_spec.rb +0 -112
- data/spec/parser/set/intersections_spec.rb +0 -127
- data/spec/parser/set/ranges_spec.rb +0 -111
- data/spec/parser/sets_spec.rb +0 -178
- data/spec/parser/types_spec.rb +0 -18
- data/spec/scanner/all_spec.rb +0 -18
- data/spec/scanner/anchors_spec.rb +0 -21
- data/spec/scanner/conditionals_spec.rb +0 -128
- data/spec/scanner/delimiters_spec.rb +0 -52
- data/spec/scanner/errors_spec.rb +0 -67
- data/spec/scanner/escapes_spec.rb +0 -53
- data/spec/scanner/free_space_spec.rb +0 -133
- data/spec/scanner/groups_spec.rb +0 -52
- data/spec/scanner/keep_spec.rb +0 -10
- data/spec/scanner/literals_spec.rb +0 -49
- data/spec/scanner/meta_spec.rb +0 -18
- data/spec/scanner/properties_spec.rb +0 -64
- data/spec/scanner/quantifiers_spec.rb +0 -20
- data/spec/scanner/refcalls_spec.rb +0 -36
- data/spec/scanner/sets_spec.rb +0 -102
- data/spec/scanner/types_spec.rb +0 -14
- data/spec/spec_helper.rb +0 -15
- data/spec/support/runner.rb +0 -42
- data/spec/support/shared_examples.rb +0 -77
- data/spec/support/warning_extractor.rb +0 -60
- data/spec/syntax/syntax_spec.rb +0 -48
- data/spec/syntax/syntax_token_map_spec.rb +0 -23
- data/spec/syntax/versions/1.8.6_spec.rb +0 -17
- data/spec/syntax/versions/1.9.1_spec.rb +0 -10
- data/spec/syntax/versions/1.9.3_spec.rb +0 -9
- data/spec/syntax/versions/2.0.0_spec.rb +0 -13
- data/spec/syntax/versions/2.2.0_spec.rb +0 -9
- data/spec/syntax/versions/aliases_spec.rb +0 -37
- data/spec/token/token_spec.rb +0 -85
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 381a794200168f95ff6329cc8a01330d21a05e02b75e0b06dcc6bd8f763c111d
|
4
|
+
data.tar.gz: bd7617cb3763e6d759c8e1364aed037ae2fff85af3cf28823476cadd14ff080e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0a039012013e9b57329fd685aaf29386d8b848071e514f59df0acc3437a1dae5c76b6bf94158cc3deece08f3a1fec9437ac84590d97f8590d8dcee1e0dc6c726
|
7
|
+
data.tar.gz: 4d67da41fbef9b9336ccfd02e3a742286bf4ef96d469c8aa2bbb9a6a55ed4aa6027a28b10ba6c9993b15937e3fe51a349632bcf5808f6237cf77a1d29ceb74f2
|
data/CHANGELOG.md
CHANGED
@@ -1,4 +1,160 @@
|
|
1
|
-
## [
|
1
|
+
## [2.2.1] - 2022-02-11 - [Janosch Müller](mailto:janosch84@gmail.com)
|
2
|
+
|
3
|
+
### Fixed
|
4
|
+
|
5
|
+
- fixed Syntax version of absence groups (`(?~...)`)
|
6
|
+
- the lexer accepted them for any Ruby version
|
7
|
+
- now they are only recognized for Ruby >= 2.4.1 in which they were introduced
|
8
|
+
- reduced gem size by excluding specs from package
|
9
|
+
- removed deprecated `test_files` gemspec setting
|
10
|
+
- no longer depend on `yaml`/`psych` (except for Ruby <= 2.4)
|
11
|
+
- no longer depend on `set`
|
12
|
+
- `set` was removed from the stdlib and made a standalone gem as of Ruby 3
|
13
|
+
- this made it a hidden/undeclared dependency of `regexp_parser`
|
14
|
+
|
15
|
+
## [2.2.0] - 2021-12-04 - [Janosch Müller](mailto:janosch84@gmail.com)
|
16
|
+
|
17
|
+
### Added
|
18
|
+
|
19
|
+
- added support for 13 new unicode properties introduced in Ruby 3.1.0
|
20
|
+
|
21
|
+
## [2.1.1] - 2021-02-23 - [Janosch Müller](mailto:janosch84@gmail.com)
|
22
|
+
|
23
|
+
### Fixed
|
24
|
+
|
25
|
+
- fixed `NameError` when requiring only `'regexp_parser/scanner'` in v2.1.0
|
26
|
+
* thanks to [Jared White and Sam Ruby](https://github.com/ruby2js/ruby2js) for the report
|
27
|
+
|
28
|
+
## [2.1.0] - 2021-02-22 - [Janosch Müller](mailto:janosch84@gmail.com)
|
29
|
+
|
30
|
+
### Added
|
31
|
+
|
32
|
+
- common ancestor for all scanning/parsing/lexing errors
|
33
|
+
* `Regexp::Parser::Error` can now be rescued as a catch-all
|
34
|
+
* the following errors (and their many descendants) now inherit from it:
|
35
|
+
- `Regexp::Expression::Conditional::TooManyBranches`
|
36
|
+
- `Regexp::Parser::ParserError`
|
37
|
+
- `Regexp::Scanner::ScannerError`
|
38
|
+
- `Regexp::Scanner::ValidationError`
|
39
|
+
- `Regexp::Syntax::SyntaxError`
|
40
|
+
* it replaces `ArgumentError` in some rare cases (`Regexp::Parser.parse('?')`)
|
41
|
+
* thanks to [sandstrom](https://github.com/sandstrom) for the cue
|
42
|
+
|
43
|
+
### Fixed
|
44
|
+
|
45
|
+
- fixed scanning of whole-pattern recursion calls `\g<0>` and `\g'0'`
|
46
|
+
* a regression in v2.0.1 had caused them to be scanned as literals
|
47
|
+
- fixed scanning of some backreference and subexpression call edge cases
|
48
|
+
* e.g. `\k<+1>`, `\g<x-1>`
|
49
|
+
- fixed tokenization of some escapes in character sets
|
50
|
+
* `.`, `|`, `{`, `}`, `(`, `)`, `^`, `$`, `?`, `+`, `*`
|
51
|
+
* all of these correctly emitted `#type` `:literal` and `#token` `:literal` if *not* escaped
|
52
|
+
* if escaped, they emitted e.g. `#type` `:escape` and `#token` `:group_open` for `[\(]`
|
53
|
+
* the escaped versions now correctly emit `#type` `:escape` and `#token` `:literal`
|
54
|
+
- fixed handling of control/metacontrol escapes in character sets
|
55
|
+
* e.g. `[\cX]`, `[\M-\C-X]`
|
56
|
+
* they were misread as a bunch of individual literals, escapes, and ranges
|
57
|
+
- fixed some cases where calling `#dup`/`#clone` on expressions led to shared state
|
58
|
+
|
59
|
+
## [2.0.3] - 2020-12-28 - [Janosch Müller](mailto:janosch84@gmail.com)
|
60
|
+
|
61
|
+
### Fixed
|
62
|
+
|
63
|
+
- fixed error when scanning some unlikely and redundant but valid charset patterns
|
64
|
+
* e.g. `/[[.a-b.]]/`, `/[[=e=]]/`
|
65
|
+
- fixed ancestry of some error classes related to syntax version lookup
|
66
|
+
* `NotImplementedError`, `InvalidVersionNameError`, `UnknownSyntaxNameError`
|
67
|
+
* they now correctly inherit from `Regexp::Syntax::SyntaxError` instead of Ruby's `::SyntaxError`
|
68
|
+
|
69
|
+
## [2.0.2] - 2020-12-25 - [Janosch Müller](mailto:janosch84@gmail.com)
|
70
|
+
|
71
|
+
### Fixed
|
72
|
+
|
73
|
+
- fixed `FrozenError` when calling `#to_s` on a frozen `Group::Passive`
|
74
|
+
* thanks to [Daniel Gollahon](https://github.com/dgollahon)
|
75
|
+
|
76
|
+
## [2.0.1] - 2020-12-20 - [Janosch Müller](mailto:janosch84@gmail.com)
|
77
|
+
|
78
|
+
### Fixed
|
79
|
+
|
80
|
+
- fixed error when scanning some group names
|
81
|
+
* this affected names containing hyphens, digits or multibyte chars, e.g. `/(?<a1>a)/`
|
82
|
+
* thanks to [Daniel Gollahon](https://github.com/dgollahon) for the report
|
83
|
+
- fixed error when scanning hex escapes with just one hex digit
|
84
|
+
* e.g. `/\x0A/` was scanned correctly, but the equivalent `/\xA/` was not
|
85
|
+
* thanks to [Daniel Gollahon](https://github.com/dgollahon) for the report
|
86
|
+
|
87
|
+
## [2.0.0] - 2020-11-25 - [Janosch Müller](mailto:janosch84@gmail.com)
|
88
|
+
|
89
|
+
### Changed
|
90
|
+
|
91
|
+
- some methods that used to return byte-based indices now return char-based indices
|
92
|
+
* the returned values have only changed for Regexps that contain multibyte chars
|
93
|
+
* this is only a breaking change if you used such methods directly AND relied on them pointing to bytes
|
94
|
+
* affected methods:
|
95
|
+
* `Regexp::Token` `#length`, `#offset`, `#te`, `#ts`
|
96
|
+
* `Regexp::Expression::Base` `#full_length`, `#offset`, `#starts_at`, `#te`, `#ts`
|
97
|
+
* thanks to [Akinori MUSHA](https://github.com/knu) for the report
|
98
|
+
- removed some deprecated methods/signatures
|
99
|
+
* these are rarely used and have been showing deprecation warnings for a long time
|
100
|
+
* `Regexp::Expression::Subexpression.new` with 3 arguments
|
101
|
+
* `Regexp::Expression::Root.new` without a token argument
|
102
|
+
* `Regexp::Expression.parsed`
|
103
|
+
|
104
|
+
### Added
|
105
|
+
|
106
|
+
- `Regexp::Expression::Base#base_length`
|
107
|
+
* returns the character count of an expression body, ignoring any quantifier
|
108
|
+
- pragmatic, experimental support for chained quantifiers
|
109
|
+
* e.g.: `/^a{10}{4,6}$/` matches exactly 40, 50 or 60 `a`s
|
110
|
+
* successive quantifiers used to be silently dropped by the parser
|
111
|
+
* they are now wrapped with passive groups as if they were written `(?:a{10}){4,6}`
|
112
|
+
* thanks to [calfeld](https://github.com/calfeld) for reporting this a while back
|
113
|
+
|
114
|
+
### Fixed
|
115
|
+
|
116
|
+
- incorrect encoding output for non-ascii comments
|
117
|
+
* this led to a crash when calling `#to_s` on parse results containing such comments
|
118
|
+
* thanks to [Michael Glass](https://github.com/michaelglass) for the report
|
119
|
+
- some crashes when scanning contrived patterns such as `'\😋'`
|
120
|
+
|
121
|
+
### [1.8.2] - 2020-10-11 - [Janosch Müller](mailto:janosch84@gmail.com)
|
122
|
+
|
123
|
+
### Fixed
|
124
|
+
|
125
|
+
- fix `FrozenError` in `Expression::Base#repetitions` on Ruby 3.0
|
126
|
+
* thanks to [Thomas Walpole](https://github.com/twalpole)
|
127
|
+
- removed "unknown future version" warning on Ruby 3.0
|
128
|
+
|
129
|
+
### [1.8.1] - 2020-09-28 - [Janosch Müller](mailto:janosch84@gmail.com)
|
130
|
+
|
131
|
+
### Fixed
|
132
|
+
|
133
|
+
- fixed scanning of comment-like text in normal mode
|
134
|
+
* this was an old bug, but had become more prevalent in v1.8.0
|
135
|
+
* thanks to [Tietew](https://github.com/Tietew) for the report
|
136
|
+
- specified correct minimum Ruby version in gemspec
|
137
|
+
* it said 1.9 but really required 2.0 as of v1.8.0
|
138
|
+
|
139
|
+
### [1.8.0] - 2020-09-20 - [Janosch Müller](mailto:janosch84@gmail.com)
|
140
|
+
|
141
|
+
### Changed
|
142
|
+
|
143
|
+
- dropped support for running on Ruby 1.9.x
|
144
|
+
|
145
|
+
### Added
|
146
|
+
|
147
|
+
- regexp flags can now be passed when parsing a `String` as regexp body
|
148
|
+
* see the [README](/README.md#usage) for details
|
149
|
+
* thanks to [Owen Stephens](https://github.com/owst)
|
150
|
+
- bare occurrences of `\g` and `\k` are now allowed and scanned as literal escapes
|
151
|
+
* matches Onigmo behavior
|
152
|
+
* thanks to [Marc-André Lafortune](https://github.com/marcandre) for the report
|
153
|
+
|
154
|
+
### Fixed
|
155
|
+
|
156
|
+
- fixed parsing comments without preceding space or trailing newline in x-mode
|
157
|
+
* thanks to [Owen Stephens](https://github.com/owst)
|
2
158
|
|
3
159
|
### [1.7.1] - 2020-06-07 - [Ammar Ali](mailto:ammarabuali@gmail.com)
|
4
160
|
|
data/Gemfile
CHANGED
@@ -3,7 +3,12 @@ source 'https://rubygems.org'
|
|
3
3
|
gemspec
|
4
4
|
|
5
5
|
group :development, :test do
|
6
|
+
gem 'ice_nine', '~> 0.11.2'
|
6
7
|
gem 'rake', '~> 13.0'
|
7
8
|
gem 'regexp_property_values', '~> 1.0'
|
8
|
-
gem 'rspec', '~> 3.
|
9
|
+
gem 'rspec', '~> 3.10'
|
10
|
+
if RUBY_VERSION.to_f >= 2.7
|
11
|
+
gem 'gouteur'
|
12
|
+
gem 'rubocop', '~> 1.7'
|
13
|
+
end
|
9
14
|
end
|
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
# Regexp::Parser
|
2
2
|
|
3
|
-
[](http://badge.fury.io/rb/regexp_parser)
|
3
|
+
[](http://badge.fury.io/rb/regexp_parser)
|
4
|
+
[](https://github.com/ammar/regexp_parser/actions)
|
5
|
+
[](https://github.com/ammar/regexp_parser/actions)
|
6
|
+
[](https://codeclimate.com/github/ammar/regexp_parser/badges)
|
4
7
|
|
5
8
|
A Ruby gem for tokenizing, parsing, and transforming regular expressions.
|
6
9
|
|
@@ -8,8 +11,8 @@ A Ruby gem for tokenizing, parsing, and transforming regular expressions.
|
|
8
11
|
* A scanner/tokenizer based on [Ragel](http://www.colm.net/open-source/ragel/)
|
9
12
|
* A lexer that produces a "stream" of token objects.
|
10
13
|
* A parser that produces a "tree" of Expression objects (OO API)
|
11
|
-
* Runs on Ruby
|
12
|
-
* Recognizes Ruby 1.8, 1.9, and
|
14
|
+
* Runs on Ruby 2.x, 3.x and JRuby runtimes
|
15
|
+
* Recognizes Ruby 1.8, 1.9, 2.x and 3.x regular expressions (see [Supported Syntax](#supported-syntax))
|
13
16
|
|
14
17
|
|
15
18
|
_For examples of regexp_parser in use, see [Example Projects](#example-projects)._
|
@@ -18,13 +21,10 @@ _For examples of regexp_parser in use, see [Example Projects](#example-projects)
|
|
18
21
|
---
|
19
22
|
## Requirements
|
20
23
|
|
21
|
-
* Ruby >=
|
24
|
+
* Ruby >= 2.0
|
22
25
|
* Ragel >= 6.0, but only if you want to build the gem or work on the scanner.
|
23
26
|
|
24
27
|
|
25
|
-
_Note: See the .travis.yml file for covered versions._
|
26
|
-
|
27
|
-
|
28
28
|
---
|
29
29
|
## Install
|
30
30
|
|
@@ -72,6 +72,17 @@ called with the results as follows:
|
|
72
72
|
* **Parser**: after completion, the block gets passed the root expression.
|
73
73
|
_The result of the block is returned._
|
74
74
|
|
75
|
+
All three methods accept either a `Regexp` or `String` (containing the pattern)
|
76
|
+
- if a String is passed, `options` can be supplied:
|
77
|
+
|
78
|
+
```ruby
|
79
|
+
require 'regexp_parser'
|
80
|
+
|
81
|
+
Regexp::Parser.parse(
|
82
|
+
"a+ # Recognises a and A...",
|
83
|
+
options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE
|
84
|
+
)
|
85
|
+
```
|
75
86
|
|
76
87
|
---
|
77
88
|
## Components
|
@@ -306,7 +317,7 @@ Expression class. See the next section for details._
|
|
306
317
|
|
307
318
|
## Supported Syntax
|
308
319
|
The three modules support all the regular expression syntax features of Ruby 1.8,
|
309
|
-
1.9, and
|
320
|
+
1.9, 2.x and 3.x:
|
310
321
|
|
311
322
|
_Note that not all of these are available in all versions of Ruby_
|
312
323
|
|
@@ -349,12 +360,12 @@ _Note that not all of these are available in all versions of Ruby_
|
|
349
360
|
|   _**Reluctant** (Lazy)_ | `??`, `*?`, `+?`, `{m,M}?` | ✓ |
|
350
361
|
|   _**Possessive**_ | `?+`, `*+`, `++`, `{m,M}+` | ✓ |
|
351
362
|
| **String Escapes** | | ⋱ |
|
352
|
-
|   _**Control**_
|
363
|
+
|   _**Control** \[1\]_ | `\C-C`, `\cD` | ✓ |
|
353
364
|
|   _**Hex**_ | `\x20`, `\x{701230}` | ✓ |
|
354
|
-
|   _**Meta**_
|
365
|
+
|   _**Meta** \[1\]_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | ✓ |
|
355
366
|
|   _**Octal**_ | `\0`, `\01`, `\012` | ✓ |
|
356
367
|
|   _**Unicode**_ | `\uHHHH`, `\u{H+ H+}` | ✓ |
|
357
|
-
| **Unicode Properties** | _<sub>([Unicode
|
368
|
+
| **Unicode Properties** | _<sub>([Unicode 13.0.0](https://www.unicode.org/versions/Unicode13.0.0/))</sub>_ | ⋱ |
|
358
369
|
|   _**Age**_ | `\p{Age=5.2}`, `\P{age=7.0}`, `\p{^age=8.0}` | ✓ |
|
359
370
|
|   _**Blocks**_ | `\p{InArmenian}`, `\P{InKhmer}`, `\p{^InThai}` | ✓ |
|
360
371
|
|   _**Classes**_ | `\p{Alpha}`, `\P{Space}`, `\p{^Alnum}` | ✓ |
|
@@ -363,6 +374,10 @@ _Note that not all of these are available in all versions of Ruby_
|
|
363
374
|
|   _**Scripts**_ | `\p{Arabic}`, `\P{Hiragana}`, `\p{^Greek}` | ✓ |
|
364
375
|
|   _**Simple**_ | `\p{Dash}`, `\p{Extender}`, `\p{^Hyphen}` | ✓ |
|
365
376
|
|
377
|
+
**\[1\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex escapes when used in Regexp literals](
|
378
|
+
https://github.com/ruby/ruby/commit/11ae581a4a7f5d5f5ec6378872eab8f25381b1b9 ), so they will only reach the
|
379
|
+
scanner and will only be emitted if a String or a Regexp that has been built with the `::new` constructor is scanned.
|
380
|
+
|
366
381
|
##### Inapplicable Features
|
367
382
|
|
368
383
|
Some modifiers, like `o` and `s`, apply to the **Regexp** object itself and do not
|
@@ -376,7 +391,6 @@ expressions library (Onigmo). They are not supported by the scanner.
|
|
376
391
|
- **Quotes**: `\Q...\E` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L499)_
|
377
392
|
- **Capture History**: `(?@...)`, `(?@<name>...)` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L550)_
|
378
393
|
|
379
|
-
|
380
394
|
See something missing? Please submit an [issue](https://github.com/ammar/regexp_parser/issues)
|
381
395
|
|
382
396
|
_**Note**: Attempting to process expressions with unsupported syntax features can raise an error,
|
@@ -384,26 +398,14 @@ or incorrectly return tokens/objects as literals._
|
|
384
398
|
|
385
399
|
|
386
400
|
## Testing
|
387
|
-
To run the tests simply run rake from the root directory
|
401
|
+
To run the tests simply run rake from the root directory.
|
388
402
|
|
389
|
-
|
403
|
+
The default task generates the scanner's code from the Ragel source files and runs all the specs, thus it requires Ragel to be installed.
|
390
404
|
|
391
|
-
|
405
|
+
Note that changes to Ragel files will not be reflected when running `rspec` on its own, so to run individual tests you might want to run:
|
392
406
|
|
393
407
|
```
|
394
|
-
|
395
|
-
```
|
396
|
-
|
397
|
-
You can run a specific test like so:
|
398
|
-
|
399
|
-
```
|
400
|
-
bin/test spec/scanner/properties_spec.rb
|
401
|
-
```
|
402
|
-
|
403
|
-
Note that changes to Ragel files will not be reflected when running `rspec` or `bin/test`, so you might want to run:
|
404
|
-
|
405
|
-
```
|
406
|
-
rake ragel:rb && bin/test spec/scanner/properties_spec.rb
|
408
|
+
rake ragel:rb && rspec spec/scanner/properties_spec.rb
|
407
409
|
```
|
408
410
|
|
409
411
|
## Building
|
@@ -429,13 +431,17 @@ rake install
|
|
429
431
|
## Example Projects
|
430
432
|
Projects using regexp_parser.
|
431
433
|
|
434
|
+
- [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool that uses regexp_parser to convert Regexps to css/xpath selectors.
|
435
|
+
|
436
|
+
- [js_regex](https://github.com/janosch-x/js_regex) converts Ruby regular expressions to JavaScript-compatible regular expressions.
|
437
|
+
|
432
438
|
- [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor with alias support.
|
433
439
|
|
434
|
-
- [mutant](https://github.com/mbj/mutant)
|
440
|
+
- [mutant](https://github.com/mbj/mutant) manipulates your regular expressions (amongst others) to see if your tests cover their behavior.
|
435
441
|
|
436
|
-
- [
|
442
|
+
- [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that uses regexp_parser to lint Regexps.
|
437
443
|
|
438
|
-
- [
|
444
|
+
- [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) is a localization helper that uses regexp_parser to generate examples of postal codes.
|
439
445
|
|
440
446
|
|
441
447
|
## References
|
@@ -464,4 +470,4 @@ Documentation and books used while working on this project.
|
|
464
470
|
|
465
471
|
---
|
466
472
|
##### Copyright
|
467
|
-
_Copyright (c) 2010-
|
473
|
+
_Copyright (c) 2010-2022 Ammar Ali. See LICENSE file for details._
|
data/Rakefile
CHANGED
@@ -1,35 +1,31 @@
|
|
1
|
+
require 'bundler'
|
1
2
|
require 'rubygems'
|
2
|
-
|
3
|
+
require 'rubygems/package_task'
|
3
4
|
require 'rake'
|
4
5
|
require 'rake/testtask'
|
6
|
+
require 'rspec/core/rake_task'
|
5
7
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
RAGEL_SOURCE_DIR = File.expand_path '../lib/regexp_parser/scanner', __FILE__
|
11
|
-
RAGEL_OUTPUT_DIR = File.expand_path '../lib/regexp_parser', __FILE__
|
12
|
-
RAGEL_SOURCE_FILES = %w{scanner} # scanner.rl includes property.rl
|
13
|
-
|
8
|
+
RAGEL_SOURCE_DIR = File.join(__dir__, 'lib/regexp_parser/scanner')
|
9
|
+
RAGEL_OUTPUT_DIR = File.join(__dir__, 'lib/regexp_parser')
|
10
|
+
RAGEL_SOURCE_FILES = %w[scanner] # scanner.rl imports the other files
|
14
11
|
|
15
12
|
Bundler::GemHelper.install_tasks
|
16
13
|
|
14
|
+
RSpec::Core::RakeTask.new(:spec)
|
17
15
|
|
18
16
|
task :default => [:'test:full']
|
19
17
|
|
20
18
|
namespace :test do
|
21
|
-
task full: :'ragel:rb'
|
22
|
-
sh 'bin/test'
|
23
|
-
end
|
19
|
+
task full: [:'ragel:rb', :spec]
|
24
20
|
end
|
25
21
|
|
26
22
|
namespace :ragel do
|
27
23
|
desc "Process the ragel source files and output ruby code"
|
28
|
-
task :rb do
|
29
|
-
RAGEL_SOURCE_FILES.each do |
|
30
|
-
output_file = "#{RAGEL_OUTPUT_DIR}/#{
|
24
|
+
task :rb do
|
25
|
+
RAGEL_SOURCE_FILES.each do |source_file|
|
26
|
+
output_file = "#{RAGEL_OUTPUT_DIR}/#{source_file}.rb"
|
31
27
|
# using faster flat table driven FSM, about 25% larger code, but about 30% faster
|
32
|
-
sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{
|
28
|
+
sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{source_file}.rl -o #{output_file}"
|
33
29
|
|
34
30
|
contents = File.read(output_file)
|
35
31
|
|
@@ -42,34 +38,29 @@ namespace :ragel do
|
|
42
38
|
end
|
43
39
|
|
44
40
|
desc "Delete the ragel generated source file(s)"
|
45
|
-
task :clean do
|
41
|
+
task :clean do
|
46
42
|
RAGEL_SOURCE_FILES.each do |file|
|
47
43
|
sh "rm -f #{RAGEL_OUTPUT_DIR}/#{file}.rb"
|
48
44
|
end
|
49
45
|
end
|
50
46
|
end
|
51
47
|
|
52
|
-
|
53
48
|
# Add ragel task as a prerequisite for building the gem to ensure that the
|
54
49
|
# latest scanner code is generated and included in the build.
|
55
50
|
desc "Runs ragel:rb before building the gem"
|
56
51
|
task :build => ['ragel:rb']
|
57
52
|
|
58
|
-
|
59
53
|
namespace :props do
|
60
54
|
desc 'Write new property value hashes for the properties scanner'
|
61
55
|
task :update do
|
62
56
|
require 'regexp_property_values'
|
63
57
|
RegexpPropertyValues.update
|
64
|
-
dir = File.
|
58
|
+
dir = File.join(__dir__, 'lib/regexp_parser/scanner/properties')
|
65
59
|
|
66
|
-
require 'psych'
|
67
60
|
write_hash_to_file = ->(hash, path) do
|
68
61
|
File.open(path, 'w') do |f|
|
69
|
-
f.puts
|
70
|
-
|
71
|
-
'#',
|
72
|
-
hash.sort.to_h.to_yaml
|
62
|
+
f.puts "# THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT",
|
63
|
+
*hash.sort.map { |pair| pair.join(',') }
|
73
64
|
end
|
74
65
|
puts "Wrote #{hash.count} aliases to `#{path}`"
|
75
66
|
end
|
@@ -77,11 +68,11 @@ namespace :props do
|
|
77
68
|
long_names_to_tokens = RegexpPropertyValues.all.map do |val|
|
78
69
|
[val.identifier, val.full_name.downcase]
|
79
70
|
end
|
80
|
-
write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.
|
71
|
+
write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.csv")
|
81
72
|
|
82
73
|
short_names_to_tokens = RegexpPropertyValues.alias_hash.map do |k, v|
|
83
74
|
[k.identifier, v.full_name.downcase]
|
84
75
|
end
|
85
|
-
write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.
|
76
|
+
write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.csv")
|
86
77
|
end
|
87
78
|
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
module Regexp::Expression
|
2
|
+
class Base
|
3
|
+
attr_accessor :type, :token
|
4
|
+
attr_accessor :text, :ts
|
5
|
+
attr_accessor :level, :set_level, :conditional_level, :nesting_level
|
6
|
+
|
7
|
+
attr_accessor :quantifier
|
8
|
+
attr_accessor :options
|
9
|
+
|
10
|
+
def initialize(token, options = {})
|
11
|
+
self.type = token.type
|
12
|
+
self.token = token.token
|
13
|
+
self.text = token.text
|
14
|
+
self.ts = token.ts
|
15
|
+
self.level = token.level
|
16
|
+
self.set_level = token.set_level
|
17
|
+
self.conditional_level = token.conditional_level
|
18
|
+
self.nesting_level = 0
|
19
|
+
self.quantifier = nil
|
20
|
+
self.options = options
|
21
|
+
end
|
22
|
+
|
23
|
+
def initialize_copy(orig)
|
24
|
+
self.text = (orig.text ? orig.text.dup : nil)
|
25
|
+
self.options = (orig.options ? orig.options.dup : nil)
|
26
|
+
self.quantifier = (orig.quantifier ? orig.quantifier.clone : nil)
|
27
|
+
super
|
28
|
+
end
|
29
|
+
|
30
|
+
def to_re(format = :full)
|
31
|
+
::Regexp.new(to_s(format))
|
32
|
+
end
|
33
|
+
|
34
|
+
alias :starts_at :ts
|
35
|
+
|
36
|
+
def base_length
|
37
|
+
to_s(:base).length
|
38
|
+
end
|
39
|
+
|
40
|
+
def full_length
|
41
|
+
to_s.length
|
42
|
+
end
|
43
|
+
|
44
|
+
def offset
|
45
|
+
[starts_at, full_length]
|
46
|
+
end
|
47
|
+
|
48
|
+
def coded_offset
|
49
|
+
'@%d+%d' % offset
|
50
|
+
end
|
51
|
+
|
52
|
+
def to_s(format = :full)
|
53
|
+
"#{text}#{quantifier_affix(format)}"
|
54
|
+
end
|
55
|
+
|
56
|
+
def quantifier_affix(expression_format)
|
57
|
+
quantifier.to_s if quantified? && expression_format != :base
|
58
|
+
end
|
59
|
+
|
60
|
+
def terminal?
|
61
|
+
!respond_to?(:expressions)
|
62
|
+
end
|
63
|
+
|
64
|
+
def quantify(token, text, min = nil, max = nil, mode = :greedy)
|
65
|
+
self.quantifier = Quantifier.new(token, text, min, max, mode)
|
66
|
+
end
|
67
|
+
|
68
|
+
def unquantified_clone
|
69
|
+
clone.tap { |exp| exp.quantifier = nil }
|
70
|
+
end
|
71
|
+
|
72
|
+
def quantified?
|
73
|
+
!quantifier.nil?
|
74
|
+
end
|
75
|
+
|
76
|
+
# Deprecated. Prefer `#repetitions` which has a more uniform interface.
|
77
|
+
def quantity
|
78
|
+
return [nil,nil] unless quantified?
|
79
|
+
[quantifier.min, quantifier.max]
|
80
|
+
end
|
81
|
+
|
82
|
+
def repetitions
|
83
|
+
return 1..1 unless quantified?
|
84
|
+
min = quantifier.min
|
85
|
+
max = quantifier.max < 0 ? Float::INFINITY : quantifier.max
|
86
|
+
range = min..max
|
87
|
+
# fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807
|
88
|
+
if RUBY_VERSION.to_f < 2.7
|
89
|
+
range.define_singleton_method(:minmax) { [min, max] }
|
90
|
+
end
|
91
|
+
range
|
92
|
+
end
|
93
|
+
|
94
|
+
def greedy?
|
95
|
+
quantified? and quantifier.greedy?
|
96
|
+
end
|
97
|
+
|
98
|
+
def reluctant?
|
99
|
+
quantified? and quantifier.reluctant?
|
100
|
+
end
|
101
|
+
alias :lazy? :reluctant?
|
102
|
+
|
103
|
+
def possessive?
|
104
|
+
quantified? and quantifier.possessive?
|
105
|
+
end
|
106
|
+
|
107
|
+
def attributes
|
108
|
+
{
|
109
|
+
type: type,
|
110
|
+
token: token,
|
111
|
+
text: to_s(:base),
|
112
|
+
starts_at: ts,
|
113
|
+
length: full_length,
|
114
|
+
level: level,
|
115
|
+
set_level: set_level,
|
116
|
+
conditional_level: conditional_level,
|
117
|
+
options: options,
|
118
|
+
quantifier: quantified? ? quantifier.to_h : nil,
|
119
|
+
}
|
120
|
+
end
|
121
|
+
alias :to_h :attributes
|
122
|
+
end
|
123
|
+
end
|
@@ -2,6 +2,11 @@ module Regexp::Expression
|
|
2
2
|
module Backreference
|
3
3
|
class Base < Regexp::Expression::Base
|
4
4
|
attr_accessor :referenced_expression
|
5
|
+
|
6
|
+
def initialize_copy(orig)
|
7
|
+
self.referenced_expression = orig.referenced_expression.dup
|
8
|
+
super
|
9
|
+
end
|
5
10
|
end
|
6
11
|
|
7
12
|
class Number < Backreference::Base
|
File without changes
|
File without changes
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Regexp::Expression
|
2
2
|
module Conditional
|
3
|
-
class TooManyBranches <
|
3
|
+
class TooManyBranches < Regexp::Parser::Error
|
4
4
|
def initialize
|
5
5
|
super('The conditional expression has more than 2 branches')
|
6
6
|
end
|
@@ -15,6 +15,11 @@ module Regexp::Expression
|
|
15
15
|
ref = text.tr("'<>()", "")
|
16
16
|
ref =~ /\D/ ? ref : Integer(ref)
|
17
17
|
end
|
18
|
+
|
19
|
+
def initialize_copy(orig)
|
20
|
+
self.referenced_expression = orig.referenced_expression.dup
|
21
|
+
super
|
22
|
+
end
|
18
23
|
end
|
19
24
|
|
20
25
|
class Branch < Regexp::Expression::Sequence; end
|
@@ -53,6 +58,11 @@ module Regexp::Expression
|
|
53
58
|
def to_s(format = :full)
|
54
59
|
"#{text}#{condition}#{branches.join('|')})#{quantifier_affix(format)}"
|
55
60
|
end
|
61
|
+
|
62
|
+
def initialize_copy(orig)
|
63
|
+
self.referenced_expression = orig.referenced_expression.dup
|
64
|
+
super
|
65
|
+
end
|
56
66
|
end
|
57
67
|
end
|
58
68
|
end
|