regexp_parser 1.7.1 → 2.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +157 -1
- data/Gemfile +6 -1
- data/LICENSE +1 -1
- data/README.md +38 -32
- data/Rakefile +18 -27
- data/lib/regexp_parser/error.rb +4 -0
- data/lib/regexp_parser/expression/base.rb +123 -0
- data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
- data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +5 -0
- data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
- data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -1
- data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +0 -0
- data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
- data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +13 -7
- data/lib/regexp_parser/expression/classes/free_space.rb +2 -4
- data/lib/regexp_parser/expression/classes/group.rb +28 -3
- data/lib/regexp_parser/expression/classes/literal.rb +1 -5
- data/lib/regexp_parser/expression/classes/property.rb +1 -3
- data/lib/regexp_parser/expression/classes/root.rb +4 -17
- data/lib/regexp_parser/expression/classes/type.rb +0 -2
- data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
- data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
- data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
- data/lib/regexp_parser/expression/quantifier.rb +11 -2
- data/lib/regexp_parser/expression/sequence.rb +3 -20
- data/lib/regexp_parser/expression/subexpression.rb +1 -2
- data/lib/regexp_parser/expression.rb +7 -139
- data/lib/regexp_parser/lexer.rb +13 -11
- data/lib/regexp_parser/parser.rb +325 -344
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/properties/long.csv +604 -0
- data/lib/regexp_parser/scanner/properties/short.csv +242 -0
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +235 -255
- data/lib/regexp_parser/scanner.rb +1324 -1387
- data/lib/regexp_parser/syntax/any.rb +4 -6
- data/lib/regexp_parser/syntax/base.rb +13 -15
- data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
- data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
- data/lib/regexp_parser/syntax/token/backreference.rb +30 -0
- data/lib/regexp_parser/syntax/{tokens → token}/character_set.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
- data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
- data/lib/regexp_parser/syntax/token/escape.rb +31 -0
- data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
- data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
- data/lib/regexp_parser/syntax/{tokens → token}/meta.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
- data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
- data/lib/regexp_parser/syntax/token/unicode_property.rb +696 -0
- data/lib/regexp_parser/syntax/token.rb +45 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +4 -4
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -2
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +1 -1
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +10 -0
- data/lib/regexp_parser/syntax.rb +8 -6
- data/lib/regexp_parser/token.rb +9 -20
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +0 -2
- data/regexp_parser.gemspec +20 -22
- metadata +34 -165
- data/lib/regexp_parser/scanner/properties/long.yml +0 -594
- data/lib/regexp_parser/scanner/properties/short.yml +0 -237
- data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
- data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
- data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
- data/lib/regexp_parser/syntax/tokens.rb +0 -45
- data/spec/expression/base_spec.rb +0 -94
- data/spec/expression/clone_spec.rb +0 -120
- data/spec/expression/conditional_spec.rb +0 -89
- data/spec/expression/free_space_spec.rb +0 -27
- data/spec/expression/methods/match_length_spec.rb +0 -161
- data/spec/expression/methods/match_spec.rb +0 -25
- data/spec/expression/methods/strfregexp_spec.rb +0 -224
- data/spec/expression/methods/tests_spec.rb +0 -99
- data/spec/expression/methods/traverse_spec.rb +0 -161
- data/spec/expression/options_spec.rb +0 -128
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
- data/spec/expression/subexpression_spec.rb +0 -50
- data/spec/expression/to_h_spec.rb +0 -26
- data/spec/expression/to_s_spec.rb +0 -100
- data/spec/lexer/all_spec.rb +0 -22
- data/spec/lexer/conditionals_spec.rb +0 -53
- data/spec/lexer/delimiters_spec.rb +0 -68
- data/spec/lexer/escapes_spec.rb +0 -14
- data/spec/lexer/keep_spec.rb +0 -10
- data/spec/lexer/literals_spec.rb +0 -89
- data/spec/lexer/nesting_spec.rb +0 -99
- data/spec/lexer/refcalls_spec.rb +0 -55
- data/spec/parser/all_spec.rb +0 -43
- data/spec/parser/alternation_spec.rb +0 -88
- data/spec/parser/anchors_spec.rb +0 -17
- data/spec/parser/conditionals_spec.rb +0 -179
- data/spec/parser/errors_spec.rb +0 -30
- data/spec/parser/escapes_spec.rb +0 -121
- data/spec/parser/free_space_spec.rb +0 -130
- data/spec/parser/groups_spec.rb +0 -108
- data/spec/parser/keep_spec.rb +0 -6
- data/spec/parser/posix_classes_spec.rb +0 -8
- data/spec/parser/properties_spec.rb +0 -115
- data/spec/parser/quantifiers_spec.rb +0 -52
- data/spec/parser/refcalls_spec.rb +0 -112
- data/spec/parser/set/intersections_spec.rb +0 -127
- data/spec/parser/set/ranges_spec.rb +0 -111
- data/spec/parser/sets_spec.rb +0 -178
- data/spec/parser/types_spec.rb +0 -18
- data/spec/scanner/all_spec.rb +0 -18
- data/spec/scanner/anchors_spec.rb +0 -21
- data/spec/scanner/conditionals_spec.rb +0 -128
- data/spec/scanner/delimiters_spec.rb +0 -52
- data/spec/scanner/errors_spec.rb +0 -67
- data/spec/scanner/escapes_spec.rb +0 -53
- data/spec/scanner/free_space_spec.rb +0 -133
- data/spec/scanner/groups_spec.rb +0 -52
- data/spec/scanner/keep_spec.rb +0 -10
- data/spec/scanner/literals_spec.rb +0 -49
- data/spec/scanner/meta_spec.rb +0 -18
- data/spec/scanner/properties_spec.rb +0 -64
- data/spec/scanner/quantifiers_spec.rb +0 -20
- data/spec/scanner/refcalls_spec.rb +0 -36
- data/spec/scanner/sets_spec.rb +0 -102
- data/spec/scanner/types_spec.rb +0 -14
- data/spec/spec_helper.rb +0 -15
- data/spec/support/runner.rb +0 -42
- data/spec/support/shared_examples.rb +0 -77
- data/spec/support/warning_extractor.rb +0 -60
- data/spec/syntax/syntax_spec.rb +0 -48
- data/spec/syntax/syntax_token_map_spec.rb +0 -23
- data/spec/syntax/versions/1.8.6_spec.rb +0 -17
- data/spec/syntax/versions/1.9.1_spec.rb +0 -10
- data/spec/syntax/versions/1.9.3_spec.rb +0 -9
- data/spec/syntax/versions/2.0.0_spec.rb +0 -13
- data/spec/syntax/versions/2.2.0_spec.rb +0 -9
- data/spec/syntax/versions/aliases_spec.rb +0 -37
- data/spec/token/token_spec.rb +0 -85
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 381a794200168f95ff6329cc8a01330d21a05e02b75e0b06dcc6bd8f763c111d
|
4
|
+
data.tar.gz: bd7617cb3763e6d759c8e1364aed037ae2fff85af3cf28823476cadd14ff080e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0a039012013e9b57329fd685aaf29386d8b848071e514f59df0acc3437a1dae5c76b6bf94158cc3deece08f3a1fec9437ac84590d97f8590d8dcee1e0dc6c726
|
7
|
+
data.tar.gz: 4d67da41fbef9b9336ccfd02e3a742286bf4ef96d469c8aa2bbb9a6a55ed4aa6027a28b10ba6c9993b15937e3fe51a349632bcf5808f6237cf77a1d29ceb74f2
|
data/CHANGELOG.md
CHANGED
@@ -1,4 +1,160 @@
|
|
1
|
-
## [
|
1
|
+
## [2.2.1] - 2022-02-11 - [Janosch Müller](mailto:janosch84@gmail.com)
|
2
|
+
|
3
|
+
### Fixed
|
4
|
+
|
5
|
+
- fixed Syntax version of absence groups (`(?~...)`)
|
6
|
+
- the lexer accepted them for any Ruby version
|
7
|
+
- now they are only recognized for Ruby >= 2.4.1 in which they were introduced
|
8
|
+
- reduced gem size by excluding specs from package
|
9
|
+
- removed deprecated `test_files` gemspec setting
|
10
|
+
- no longer depend on `yaml`/`psych` (except for Ruby <= 2.4)
|
11
|
+
- no longer depend on `set`
|
12
|
+
- `set` was removed from the stdlib and made a standalone gem as of Ruby 3
|
13
|
+
- this made it a hidden/undeclared dependency of `regexp_parser`
|
14
|
+
|
15
|
+
## [2.2.0] - 2021-12-04 - [Janosch Müller](mailto:janosch84@gmail.com)
|
16
|
+
|
17
|
+
### Added
|
18
|
+
|
19
|
+
- added support for 13 new unicode properties introduced in Ruby 3.1.0
|
20
|
+
|
21
|
+
## [2.1.1] - 2021-02-23 - [Janosch Müller](mailto:janosch84@gmail.com)
|
22
|
+
|
23
|
+
### Fixed
|
24
|
+
|
25
|
+
- fixed `NameError` when requiring only `'regexp_parser/scanner'` in v2.1.0
|
26
|
+
* thanks to [Jared White and Sam Ruby](https://github.com/ruby2js/ruby2js) for the report
|
27
|
+
|
28
|
+
## [2.1.0] - 2021-02-22 - [Janosch Müller](mailto:janosch84@gmail.com)
|
29
|
+
|
30
|
+
### Added
|
31
|
+
|
32
|
+
- common ancestor for all scanning/parsing/lexing errors
|
33
|
+
* `Regexp::Parser::Error` can now be rescued as a catch-all
|
34
|
+
* the following errors (and their many descendants) now inherit from it:
|
35
|
+
- `Regexp::Expression::Conditional::TooManyBranches`
|
36
|
+
- `Regexp::Parser::ParserError`
|
37
|
+
- `Regexp::Scanner::ScannerError`
|
38
|
+
- `Regexp::Scanner::ValidationError`
|
39
|
+
- `Regexp::Syntax::SyntaxError`
|
40
|
+
* it replaces `ArgumentError` in some rare cases (`Regexp::Parser.parse('?')`)
|
41
|
+
* thanks to [sandstrom](https://github.com/sandstrom) for the cue
|
42
|
+
|
43
|
+
### Fixed
|
44
|
+
|
45
|
+
- fixed scanning of whole-pattern recursion calls `\g<0>` and `\g'0'`
|
46
|
+
* a regression in v2.0.1 had caused them to be scanned as literals
|
47
|
+
- fixed scanning of some backreference and subexpression call edge cases
|
48
|
+
* e.g. `\k<+1>`, `\g<x-1>`
|
49
|
+
- fixed tokenization of some escapes in character sets
|
50
|
+
* `.`, `|`, `{`, `}`, `(`, `)`, `^`, `$`, `?`, `+`, `*`
|
51
|
+
* all of these correctly emitted `#type` `:literal` and `#token` `:literal` if *not* escaped
|
52
|
+
* if escaped, they emitted e.g. `#type` `:escape` and `#token` `:group_open` for `[\(]`
|
53
|
+
* the escaped versions now correctly emit `#type` `:escape` and `#token` `:literal`
|
54
|
+
- fixed handling of control/metacontrol escapes in character sets
|
55
|
+
* e.g. `[\cX]`, `[\M-\C-X]`
|
56
|
+
* they were misread as a bunch of individual literals, escapes, and ranges
|
57
|
+
- fixed some cases where calling `#dup`/`#clone` on expressions led to shared state
|
58
|
+
|
59
|
+
## [2.0.3] - 2020-12-28 - [Janosch Müller](mailto:janosch84@gmail.com)
|
60
|
+
|
61
|
+
### Fixed
|
62
|
+
|
63
|
+
- fixed error when scanning some unlikely and redundant but valid charset patterns
|
64
|
+
* e.g. `/[[.a-b.]]/`, `/[[=e=]]/`
|
65
|
+
- fixed ancestry of some error classes related to syntax version lookup
|
66
|
+
* `NotImplementedError`, `InvalidVersionNameError`, `UnknownSyntaxNameError`
|
67
|
+
* they now correctly inherit from `Regexp::Syntax::SyntaxError` instead of Ruby's `::SyntaxError`
|
68
|
+
|
69
|
+
## [2.0.2] - 2020-12-25 - [Janosch Müller](mailto:janosch84@gmail.com)
|
70
|
+
|
71
|
+
### Fixed
|
72
|
+
|
73
|
+
- fixed `FrozenError` when calling `#to_s` on a frozen `Group::Passive`
|
74
|
+
* thanks to [Daniel Gollahon](https://github.com/dgollahon)
|
75
|
+
|
76
|
+
## [2.0.1] - 2020-12-20 - [Janosch Müller](mailto:janosch84@gmail.com)
|
77
|
+
|
78
|
+
### Fixed
|
79
|
+
|
80
|
+
- fixed error when scanning some group names
|
81
|
+
* this affected names containing hyphens, digits or multibyte chars, e.g. `/(?<a1>a)/`
|
82
|
+
* thanks to [Daniel Gollahon](https://github.com/dgollahon) for the report
|
83
|
+
- fixed error when scanning hex escapes with just one hex digit
|
84
|
+
* e.g. `/\x0A/` was scanned correctly, but the equivalent `/\xA/` was not
|
85
|
+
* thanks to [Daniel Gollahon](https://github.com/dgollahon) for the report
|
86
|
+
|
87
|
+
## [2.0.0] - 2020-11-25 - [Janosch Müller](mailto:janosch84@gmail.com)
|
88
|
+
|
89
|
+
### Changed
|
90
|
+
|
91
|
+
- some methods that used to return byte-based indices now return char-based indices
|
92
|
+
* the returned values have only changed for Regexps that contain multibyte chars
|
93
|
+
* this is only a breaking change if you used such methods directly AND relied on them pointing to bytes
|
94
|
+
* affected methods:
|
95
|
+
* `Regexp::Token` `#length`, `#offset`, `#te`, `#ts`
|
96
|
+
* `Regexp::Expression::Base` `#full_length`, `#offset`, `#starts_at`, `#te`, `#ts`
|
97
|
+
* thanks to [Akinori MUSHA](https://github.com/knu) for the report
|
98
|
+
- removed some deprecated methods/signatures
|
99
|
+
* these are rarely used and have been showing deprecation warnings for a long time
|
100
|
+
* `Regexp::Expression::Subexpression.new` with 3 arguments
|
101
|
+
* `Regexp::Expression::Root.new` without a token argument
|
102
|
+
* `Regexp::Expression.parsed`
|
103
|
+
|
104
|
+
### Added
|
105
|
+
|
106
|
+
- `Regexp::Expression::Base#base_length`
|
107
|
+
* returns the character count of an expression body, ignoring any quantifier
|
108
|
+
- pragmatic, experimental support for chained quantifiers
|
109
|
+
* e.g.: `/^a{10}{4,6}$/` matches exactly 40, 50 or 60 `a`s
|
110
|
+
* successive quantifiers used to be silently dropped by the parser
|
111
|
+
* they are now wrapped with passive groups as if they were written `(?:a{10}){4,6}`
|
112
|
+
* thanks to [calfeld](https://github.com/calfeld) for reporting this a while back
|
113
|
+
|
114
|
+
### Fixed
|
115
|
+
|
116
|
+
- incorrect encoding output for non-ascii comments
|
117
|
+
* this led to a crash when calling `#to_s` on parse results containing such comments
|
118
|
+
* thanks to [Michael Glass](https://github.com/michaelglass) for the report
|
119
|
+
- some crashes when scanning contrived patterns such as `'\😋'`
|
120
|
+
|
121
|
+
### [1.8.2] - 2020-10-11 - [Janosch Müller](mailto:janosch84@gmail.com)
|
122
|
+
|
123
|
+
### Fixed
|
124
|
+
|
125
|
+
- fix `FrozenError` in `Expression::Base#repetitions` on Ruby 3.0
|
126
|
+
* thanks to [Thomas Walpole](https://github.com/twalpole)
|
127
|
+
- removed "unknown future version" warning on Ruby 3.0
|
128
|
+
|
129
|
+
### [1.8.1] - 2020-09-28 - [Janosch Müller](mailto:janosch84@gmail.com)
|
130
|
+
|
131
|
+
### Fixed
|
132
|
+
|
133
|
+
- fixed scanning of comment-like text in normal mode
|
134
|
+
* this was an old bug, but had become more prevalent in v1.8.0
|
135
|
+
* thanks to [Tietew](https://github.com/Tietew) for the report
|
136
|
+
- specified correct minimum Ruby version in gemspec
|
137
|
+
* it said 1.9 but really required 2.0 as of v1.8.0
|
138
|
+
|
139
|
+
### [1.8.0] - 2020-09-20 - [Janosch Müller](mailto:janosch84@gmail.com)
|
140
|
+
|
141
|
+
### Changed
|
142
|
+
|
143
|
+
- dropped support for running on Ruby 1.9.x
|
144
|
+
|
145
|
+
### Added
|
146
|
+
|
147
|
+
- regexp flags can now be passed when parsing a `String` as regexp body
|
148
|
+
* see the [README](/README.md#usage) for details
|
149
|
+
* thanks to [Owen Stephens](https://github.com/owst)
|
150
|
+
- bare occurrences of `\g` and `\k` are now allowed and scanned as literal escapes
|
151
|
+
* matches Onigmo behavior
|
152
|
+
* thanks to [Marc-André Lafortune](https://github.com/marcandre) for the report
|
153
|
+
|
154
|
+
### Fixed
|
155
|
+
|
156
|
+
- fixed parsing comments without preceding space or trailing newline in x-mode
|
157
|
+
* thanks to [Owen Stephens](https://github.com/owst)
|
2
158
|
|
3
159
|
### [1.7.1] - 2020-06-07 - [Ammar Ali](mailto:ammarabuali@gmail.com)
|
4
160
|
|
data/Gemfile
CHANGED
@@ -3,7 +3,12 @@ source 'https://rubygems.org'
|
|
3
3
|
gemspec
|
4
4
|
|
5
5
|
group :development, :test do
|
6
|
+
gem 'ice_nine', '~> 0.11.2'
|
6
7
|
gem 'rake', '~> 13.0'
|
7
8
|
gem 'regexp_property_values', '~> 1.0'
|
8
|
-
gem 'rspec', '~> 3.
|
9
|
+
gem 'rspec', '~> 3.10'
|
10
|
+
if RUBY_VERSION.to_f >= 2.7
|
11
|
+
gem 'gouteur'
|
12
|
+
gem 'rubocop', '~> 1.7'
|
13
|
+
end
|
9
14
|
end
|
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
# Regexp::Parser
|
2
2
|
|
3
|
-
[![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser)
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser)
|
4
|
+
[![Build Status](https://github.com/ammar/regexp_parser/workflows/tests/badge.svg)](https://github.com/ammar/regexp_parser/actions)
|
5
|
+
[![Build Status](https://github.com/ammar/regexp_parser/workflows/gouteur/badge.svg)](https://github.com/ammar/regexp_parser/actions)
|
6
|
+
[![Code Climate](https://codeclimate.com/github/ammar/regexp_parser.svg)](https://codeclimate.com/github/ammar/regexp_parser/badges)
|
4
7
|
|
5
8
|
A Ruby gem for tokenizing, parsing, and transforming regular expressions.
|
6
9
|
|
@@ -8,8 +11,8 @@ A Ruby gem for tokenizing, parsing, and transforming regular expressions.
|
|
8
11
|
* A scanner/tokenizer based on [Ragel](http://www.colm.net/open-source/ragel/)
|
9
12
|
* A lexer that produces a "stream" of token objects.
|
10
13
|
* A parser that produces a "tree" of Expression objects (OO API)
|
11
|
-
* Runs on Ruby
|
12
|
-
* Recognizes Ruby 1.8, 1.9, and
|
14
|
+
* Runs on Ruby 2.x, 3.x and JRuby runtimes
|
15
|
+
* Recognizes Ruby 1.8, 1.9, 2.x and 3.x regular expressions [See Supported Syntax](#supported-syntax)
|
13
16
|
|
14
17
|
|
15
18
|
_For examples of regexp_parser in use, see [Example Projects](#example-projects)._
|
@@ -18,13 +21,10 @@ _For examples of regexp_parser in use, see [Example Projects](#example-projects)
|
|
18
21
|
---
|
19
22
|
## Requirements
|
20
23
|
|
21
|
-
* Ruby >=
|
24
|
+
* Ruby >= 2.0
|
22
25
|
* Ragel >= 6.0, but only if you want to build the gem or work on the scanner.
|
23
26
|
|
24
27
|
|
25
|
-
_Note: See the .travis.yml file for covered versions._
|
26
|
-
|
27
|
-
|
28
28
|
---
|
29
29
|
## Install
|
30
30
|
|
@@ -72,6 +72,17 @@ called with the results as follows:
|
|
72
72
|
* **Parser**: after completion, the block gets passed the root expression.
|
73
73
|
_The result of the block is returned._
|
74
74
|
|
75
|
+
All three methods accept either a `Regexp` or a `String` (containing the pattern)
|
76
|
+
- if a String is passed, `options` can be supplied:
|
77
|
+
|
78
|
+
```ruby
|
79
|
+
require 'regexp_parser'
|
80
|
+
|
81
|
+
Regexp::Parser.parse(
|
82
|
+
"a+ # Recognises a and A...",
|
83
|
+
options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE
|
84
|
+
)
|
85
|
+
```
|
75
86
|
|
76
87
|
---
|
77
88
|
## Components
|
@@ -306,7 +317,7 @@ Expression class. See the next section for details._
|
|
306
317
|
|
307
318
|
## Supported Syntax
|
308
319
|
The three modules support all the regular expression syntax features of Ruby 1.8,
|
309
|
-
1.9, and
|
320
|
+
1.9, 2.x and 3.x:
|
310
321
|
|
311
322
|
_Note that not all of these are available in all versions of Ruby_
|
312
323
|
|
@@ -349,12 +360,12 @@ _Note that not all of these are available in all versions of Ruby_
|
|
349
360
|
|   _**Reluctant** (Lazy)_ | `??`, `*?`, `+?`, `{m,M}?` | ✓ |
|
350
361
|
|   _**Possessive**_ | `?+`, `*+`, `++`, `{m,M}+` | ✓ |
|
351
362
|
| **String Escapes** | | ⋱ |
|
352
|
-
|   _**Control**_
|
363
|
+
|   _**Control** \[1\]_ | `\C-C`, `\cD` | ✓ |
|
353
364
|
|   _**Hex**_ | `\x20`, `\x{701230}` | ✓ |
|
354
|
-
|   _**Meta**_
|
365
|
+
|   _**Meta** \[1\]_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | ✓ |
|
355
366
|
|   _**Octal**_ | `\0`, `\01`, `\012` | ✓ |
|
356
367
|
|   _**Unicode**_ | `\uHHHH`, `\u{H+ H+}` | ✓ |
|
357
|
-
| **Unicode Properties** | _<sub>([Unicode
|
368
|
+
| **Unicode Properties** | _<sub>([Unicode 13.0.0](https://www.unicode.org/versions/Unicode13.0.0/))</sub>_ | ⋱ |
|
358
369
|
|   _**Age**_ | `\p{Age=5.2}`, `\P{age=7.0}`, `\p{^age=8.0}` | ✓ |
|
359
370
|
|   _**Blocks**_ | `\p{InArmenian}`, `\P{InKhmer}`, `\p{^InThai}` | ✓ |
|
360
371
|
|   _**Classes**_ | `\p{Alpha}`, `\P{Space}`, `\p{^Alnum}` | ✓ |
|
@@ -363,6 +374,10 @@ _Note that not all of these are available in all versions of Ruby_
|
|
363
374
|
|   _**Scripts**_ | `\p{Arabic}`, `\P{Hiragana}`, `\p{^Greek}` | ✓ |
|
364
375
|
|   _**Simple**_ | `\p{Dash}`, `\p{Extender}`, `\p{^Hyphen}` | ✓ |
|
365
376
|
|
377
|
+
**\[1\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex escapes when used in Regexp literals](
|
378
|
+
https://github.com/ruby/ruby/commit/11ae581a4a7f5d5f5ec6378872eab8f25381b1b9 ), so they will only reach the
|
379
|
+
scanner and will only be emitted if a String or a Regexp that has been built with the `::new` constructor is scanned.
|
380
|
+
|
366
381
|
##### Inapplicable Features
|
367
382
|
|
368
383
|
Some modifiers, like `o` and `s`, apply to the **Regexp** object itself and do not
|
@@ -376,7 +391,6 @@ expressions library (Onigmo). They are not supported by the scanner.
|
|
376
391
|
- **Quotes**: `\Q...\E` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L499)_
|
377
392
|
- **Capture History**: `(?@...)`, `(?@<name>...)` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L550)_
|
378
393
|
|
379
|
-
|
380
394
|
See something missing? Please submit an [issue](https://github.com/ammar/regexp_parser/issues)
|
381
395
|
|
382
396
|
_**Note**: Attempting to process expressions with unsupported syntax features can raise an error,
|
@@ -384,26 +398,14 @@ or incorrectly return tokens/objects as literals._
|
|
384
398
|
|
385
399
|
|
386
400
|
## Testing
|
387
|
-
To run the tests simply run rake from the root directory
|
401
|
+
To run the tests simply run rake from the root directory.
|
388
402
|
|
389
|
-
|
403
|
+
The default task generates the scanner's code from the Ragel source files and runs all the specs, so it requires Ragel to be installed.
|
390
404
|
|
391
|
-
|
405
|
+
Note that changes to Ragel files will not be reflected when running `rspec` on its own, so to run individual tests you might want to run:
|
392
406
|
|
393
407
|
```
|
394
|
-
|
395
|
-
```
|
396
|
-
|
397
|
-
You can run a specific test like so:
|
398
|
-
|
399
|
-
```
|
400
|
-
bin/test spec/scanner/properties_spec.rb
|
401
|
-
```
|
402
|
-
|
403
|
-
Note that changes to Ragel files will not be reflected when running `rspec` or `bin/test`, so you might want to run:
|
404
|
-
|
405
|
-
```
|
406
|
-
rake ragel:rb && bin/test spec/scanner/properties_spec.rb
|
408
|
+
rake ragel:rb && rspec spec/scanner/properties_spec.rb
|
407
409
|
```
|
408
410
|
|
409
411
|
## Building
|
@@ -429,13 +431,17 @@ rake install
|
|
429
431
|
## Example Projects
|
430
432
|
Projects using regexp_parser.
|
431
433
|
|
434
|
+
- [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool that uses regexp_parser to convert Regexps to css/xpath selectors.
|
435
|
+
|
436
|
+
- [js_regex](https://github.com/janosch-x/js_regex) converts Ruby regular expressions to JavaScript-compatible regular expressions.
|
437
|
+
|
432
438
|
- [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor with alias support.
|
433
439
|
|
434
|
-
- [mutant](https://github.com/mbj/mutant)
|
440
|
+
- [mutant](https://github.com/mbj/mutant) manipulates your regular expressions (amongst others) to see if your tests cover their behavior.
|
435
441
|
|
436
|
-
- [
|
442
|
+
- [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that uses regexp_parser to lint Regexps.
|
437
443
|
|
438
|
-
- [
|
444
|
+
- [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) is a localization helper that uses regexp_parser to generate examples of postal codes.
|
439
445
|
|
440
446
|
|
441
447
|
## References
|
@@ -464,4 +470,4 @@ Documentation and books used while working on this project.
|
|
464
470
|
|
465
471
|
---
|
466
472
|
##### Copyright
|
467
|
-
_Copyright (c) 2010-
|
473
|
+
_Copyright (c) 2010-2022 Ammar Ali. See LICENSE file for details._
|
data/Rakefile
CHANGED
@@ -1,35 +1,31 @@
|
|
1
|
+
require 'bundler'
|
1
2
|
require 'rubygems'
|
2
|
-
|
3
|
+
require 'rubygems/package_task'
|
3
4
|
require 'rake'
|
4
5
|
require 'rake/testtask'
|
6
|
+
require 'rspec/core/rake_task'
|
5
7
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
RAGEL_SOURCE_DIR = File.expand_path '../lib/regexp_parser/scanner', __FILE__
|
11
|
-
RAGEL_OUTPUT_DIR = File.expand_path '../lib/regexp_parser', __FILE__
|
12
|
-
RAGEL_SOURCE_FILES = %w{scanner} # scanner.rl includes property.rl
|
13
|
-
|
8
|
+
RAGEL_SOURCE_DIR = File.join(__dir__, 'lib/regexp_parser/scanner')
|
9
|
+
RAGEL_OUTPUT_DIR = File.join(__dir__, 'lib/regexp_parser')
|
10
|
+
RAGEL_SOURCE_FILES = %w[scanner] # scanner.rl imports the other files
|
14
11
|
|
15
12
|
Bundler::GemHelper.install_tasks
|
16
13
|
|
14
|
+
RSpec::Core::RakeTask.new(:spec)
|
17
15
|
|
18
16
|
task :default => [:'test:full']
|
19
17
|
|
20
18
|
namespace :test do
|
21
|
-
task full: :'ragel:rb'
|
22
|
-
sh 'bin/test'
|
23
|
-
end
|
19
|
+
task full: [:'ragel:rb', :spec]
|
24
20
|
end
|
25
21
|
|
26
22
|
namespace :ragel do
|
27
23
|
desc "Process the ragel source files and output ruby code"
|
28
|
-
task :rb do
|
29
|
-
RAGEL_SOURCE_FILES.each do |
|
30
|
-
output_file = "#{RAGEL_OUTPUT_DIR}/#{
|
24
|
+
task :rb do
|
25
|
+
RAGEL_SOURCE_FILES.each do |source_file|
|
26
|
+
output_file = "#{RAGEL_OUTPUT_DIR}/#{source_file}.rb"
|
31
27
|
# using faster flat table driven FSM, about 25% larger code, but about 30% faster
|
32
|
-
sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{
|
28
|
+
sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{source_file}.rl -o #{output_file}"
|
33
29
|
|
34
30
|
contents = File.read(output_file)
|
35
31
|
|
@@ -42,34 +38,29 @@ namespace :ragel do
|
|
42
38
|
end
|
43
39
|
|
44
40
|
desc "Delete the ragel generated source file(s)"
|
45
|
-
task :clean do
|
41
|
+
task :clean do
|
46
42
|
RAGEL_SOURCE_FILES.each do |file|
|
47
43
|
sh "rm -f #{RAGEL_OUTPUT_DIR}/#{file}.rb"
|
48
44
|
end
|
49
45
|
end
|
50
46
|
end
|
51
47
|
|
52
|
-
|
53
48
|
# Add ragel task as a prerequisite for building the gem to ensure that the
|
54
49
|
# latest scanner code is generated and included in the build.
|
55
50
|
desc "Runs ragel:rb before building the gem"
|
56
51
|
task :build => ['ragel:rb']
|
57
52
|
|
58
|
-
|
59
53
|
namespace :props do
|
60
54
|
desc 'Write new property value hashes for the properties scanner'
|
61
55
|
task :update do
|
62
56
|
require 'regexp_property_values'
|
63
57
|
RegexpPropertyValues.update
|
64
|
-
dir = File.
|
58
|
+
dir = File.join(__dir__, 'lib/regexp_parser/scanner/properties')
|
65
59
|
|
66
|
-
require 'psych'
|
67
60
|
write_hash_to_file = ->(hash, path) do
|
68
61
|
File.open(path, 'w') do |f|
|
69
|
-
f.puts
|
70
|
-
|
71
|
-
'#',
|
72
|
-
hash.sort.to_h.to_yaml
|
62
|
+
f.puts "# THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT",
|
63
|
+
*hash.sort.map { |pair| pair.join(',') }
|
73
64
|
end
|
74
65
|
puts "Wrote #{hash.count} aliases to `#{path}`"
|
75
66
|
end
|
@@ -77,11 +68,11 @@ namespace :props do
|
|
77
68
|
long_names_to_tokens = RegexpPropertyValues.all.map do |val|
|
78
69
|
[val.identifier, val.full_name.downcase]
|
79
70
|
end
|
80
|
-
write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.
|
71
|
+
write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.csv")
|
81
72
|
|
82
73
|
short_names_to_tokens = RegexpPropertyValues.alias_hash.map do |k, v|
|
83
74
|
[k.identifier, v.full_name.downcase]
|
84
75
|
end
|
85
|
-
write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.
|
76
|
+
write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.csv")
|
86
77
|
end
|
87
78
|
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
module Regexp::Expression
|
2
|
+
class Base
|
3
|
+
attr_accessor :type, :token
|
4
|
+
attr_accessor :text, :ts
|
5
|
+
attr_accessor :level, :set_level, :conditional_level, :nesting_level
|
6
|
+
|
7
|
+
attr_accessor :quantifier
|
8
|
+
attr_accessor :options
|
9
|
+
|
10
|
+
def initialize(token, options = {})
|
11
|
+
self.type = token.type
|
12
|
+
self.token = token.token
|
13
|
+
self.text = token.text
|
14
|
+
self.ts = token.ts
|
15
|
+
self.level = token.level
|
16
|
+
self.set_level = token.set_level
|
17
|
+
self.conditional_level = token.conditional_level
|
18
|
+
self.nesting_level = 0
|
19
|
+
self.quantifier = nil
|
20
|
+
self.options = options
|
21
|
+
end
|
22
|
+
|
23
|
+
def initialize_copy(orig)
|
24
|
+
self.text = (orig.text ? orig.text.dup : nil)
|
25
|
+
self.options = (orig.options ? orig.options.dup : nil)
|
26
|
+
self.quantifier = (orig.quantifier ? orig.quantifier.clone : nil)
|
27
|
+
super
|
28
|
+
end
|
29
|
+
|
30
|
+
def to_re(format = :full)
|
31
|
+
::Regexp.new(to_s(format))
|
32
|
+
end
|
33
|
+
|
34
|
+
alias :starts_at :ts
|
35
|
+
|
36
|
+
def base_length
|
37
|
+
to_s(:base).length
|
38
|
+
end
|
39
|
+
|
40
|
+
def full_length
|
41
|
+
to_s.length
|
42
|
+
end
|
43
|
+
|
44
|
+
def offset
|
45
|
+
[starts_at, full_length]
|
46
|
+
end
|
47
|
+
|
48
|
+
def coded_offset
|
49
|
+
'@%d+%d' % offset
|
50
|
+
end
|
51
|
+
|
52
|
+
def to_s(format = :full)
|
53
|
+
"#{text}#{quantifier_affix(format)}"
|
54
|
+
end
|
55
|
+
|
56
|
+
def quantifier_affix(expression_format)
|
57
|
+
quantifier.to_s if quantified? && expression_format != :base
|
58
|
+
end
|
59
|
+
|
60
|
+
def terminal?
|
61
|
+
!respond_to?(:expressions)
|
62
|
+
end
|
63
|
+
|
64
|
+
def quantify(token, text, min = nil, max = nil, mode = :greedy)
|
65
|
+
self.quantifier = Quantifier.new(token, text, min, max, mode)
|
66
|
+
end
|
67
|
+
|
68
|
+
def unquantified_clone
|
69
|
+
clone.tap { |exp| exp.quantifier = nil }
|
70
|
+
end
|
71
|
+
|
72
|
+
def quantified?
|
73
|
+
!quantifier.nil?
|
74
|
+
end
|
75
|
+
|
76
|
+
# Deprecated. Prefer `#repetitions` which has a more uniform interface.
|
77
|
+
def quantity
|
78
|
+
return [nil,nil] unless quantified?
|
79
|
+
[quantifier.min, quantifier.max]
|
80
|
+
end
|
81
|
+
|
82
|
+
def repetitions
|
83
|
+
return 1..1 unless quantified?
|
84
|
+
min = quantifier.min
|
85
|
+
max = quantifier.max < 0 ? Float::INFINITY : quantifier.max
|
86
|
+
range = min..max
|
87
|
+
# fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807
|
88
|
+
if RUBY_VERSION.to_f < 2.7
|
89
|
+
range.define_singleton_method(:minmax) { [min, max] }
|
90
|
+
end
|
91
|
+
range
|
92
|
+
end
|
93
|
+
|
94
|
+
def greedy?
|
95
|
+
quantified? and quantifier.greedy?
|
96
|
+
end
|
97
|
+
|
98
|
+
def reluctant?
|
99
|
+
quantified? and quantifier.reluctant?
|
100
|
+
end
|
101
|
+
alias :lazy? :reluctant?
|
102
|
+
|
103
|
+
def possessive?
|
104
|
+
quantified? and quantifier.possessive?
|
105
|
+
end
|
106
|
+
|
107
|
+
def attributes
|
108
|
+
{
|
109
|
+
type: type,
|
110
|
+
token: token,
|
111
|
+
text: to_s(:base),
|
112
|
+
starts_at: ts,
|
113
|
+
length: full_length,
|
114
|
+
level: level,
|
115
|
+
set_level: set_level,
|
116
|
+
conditional_level: conditional_level,
|
117
|
+
options: options,
|
118
|
+
quantifier: quantified? ? quantifier.to_h : nil,
|
119
|
+
}
|
120
|
+
end
|
121
|
+
alias :to_h :attributes
|
122
|
+
end
|
123
|
+
end
|
@@ -2,6 +2,11 @@ module Regexp::Expression
|
|
2
2
|
module Backreference
|
3
3
|
class Base < Regexp::Expression::Base
|
4
4
|
attr_accessor :referenced_expression
|
5
|
+
|
6
|
+
def initialize_copy(orig)
|
7
|
+
self.referenced_expression = orig.referenced_expression.dup
|
8
|
+
super
|
9
|
+
end
|
5
10
|
end
|
6
11
|
|
7
12
|
class Number < Backreference::Base
|
File without changes
|
File without changes
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Regexp::Expression
|
2
2
|
module Conditional
|
3
|
-
class TooManyBranches <
|
3
|
+
class TooManyBranches < Regexp::Parser::Error
|
4
4
|
def initialize
|
5
5
|
super('The conditional expression has more than 2 branches')
|
6
6
|
end
|
@@ -15,6 +15,11 @@ module Regexp::Expression
|
|
15
15
|
ref = text.tr("'<>()", "")
|
16
16
|
ref =~ /\D/ ? ref : Integer(ref)
|
17
17
|
end
|
18
|
+
|
19
|
+
def initialize_copy(orig)
|
20
|
+
self.referenced_expression = orig.referenced_expression.dup
|
21
|
+
super
|
22
|
+
end
|
18
23
|
end
|
19
24
|
|
20
25
|
class Branch < Regexp::Expression::Sequence; end
|
@@ -53,6 +58,11 @@ module Regexp::Expression
|
|
53
58
|
def to_s(format = :full)
|
54
59
|
"#{text}#{condition}#{branches.join('|')})#{quantifier_affix(format)}"
|
55
60
|
end
|
61
|
+
|
62
|
+
def initialize_copy(orig)
|
63
|
+
self.referenced_expression = orig.referenced_expression.dup
|
64
|
+
super
|
65
|
+
end
|
56
66
|
end
|
57
67
|
end
|
58
68
|
end
|