regexp_parser 2.4.0 → 2.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +98 -42
- data/README.md +46 -30
- data/lib/regexp_parser/expression/base.rb +17 -9
- data/lib/regexp_parser/expression/classes/backreference.rb +19 -2
- data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -0
- data/lib/regexp_parser/expression/classes/conditional.rb +8 -0
- data/lib/regexp_parser/expression/classes/escape_sequence.rb +1 -1
- data/lib/regexp_parser/expression/classes/group.rb +10 -0
- data/lib/regexp_parser/expression/classes/keep.rb +2 -0
- data/lib/regexp_parser/expression/classes/root.rb +3 -5
- data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +1 -0
- data/lib/regexp_parser/expression/methods/construct.rb +43 -0
- data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +9 -5
- data/lib/regexp_parser/expression/methods/traverse.rb +6 -3
- data/lib/regexp_parser/expression/quantifier.rb +6 -5
- data/lib/regexp_parser/expression/sequence.rb +6 -21
- data/lib/regexp_parser/expression/shared.rb +20 -3
- data/lib/regexp_parser/expression/subexpression.rb +4 -1
- data/lib/regexp_parser/expression.rb +4 -2
- data/lib/regexp_parser/lexer.rb +61 -29
- data/lib/regexp_parser/parser.rb +36 -26
- data/lib/regexp_parser/scanner/property.rl +1 -1
- data/lib/regexp_parser/scanner/scanner.rl +57 -42
- data/lib/regexp_parser/scanner.rb +873 -823
- data/lib/regexp_parser/syntax/token/escape.rb +1 -1
- data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
- data/lib/regexp_parser/syntax/versions.rb +2 -0
- data/lib/regexp_parser/version.rb +1 -1
- metadata +7 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 04af46818e9d560362fea9b3fd24802b557ac145ed95f6e02580dd7cf5e8ddfc
|
4
|
+
data.tar.gz: 75b7d30241f48ddf90c8cd68228fa928904ab6055ea755f4bdcf28361e645a4b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 407025a9b14af76463260fca2a48f9fef4ab863e3dddf3f7f54101c1348611afa49d9973e850d9e1c84d6e5faf8f1a9d3d2da5dceaefe8dc4fefe7069ecd9280
|
7
|
+
data.tar.gz: 9f3d2eb4264318511a82e9034c4c4a8a8e73e67e427945f0c9f745fd37b2f2f0ae8e30ba942f0920da3109b59436a5518dfc5e2f7669317de0214a0deb6f0e07
|
data/CHANGELOG.md
CHANGED
@@ -1,33 +1,99 @@
|
|
1
|
-
|
1
|
+
# Changelog
|
2
|
+
|
3
|
+
All notable changes to this project will be documented in this file.
|
4
|
+
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
7
|
+
|
8
|
+
## [2.7.0] - 2023-02-08 - [Janosch Müller](mailto:janosch84@gmail.com)
|
9
|
+
|
10
|
+
### Added
|
11
|
+
|
12
|
+
- `Regexp::Lexer.lex` now streams tokens when called with a block
|
13
|
+
- it can now take arbitrarily large input, just like `Regexp::Scanner`
|
14
|
+
- this also slightly improves `Regexp::Parser.parse` performance
|
15
|
+
- note: `Regexp::Parser.parse` still does not and will not support streaming
|
16
|
+
- improved performance of `Subexpression#each_expression`
|
17
|
+
- minor improvements to `Regexp::Scanner` performance
|
18
|
+
- overall improvement of parse performance: about 10% for large Regexps
|
19
|
+
|
20
|
+
### Fixed
|
21
|
+
|
22
|
+
- parsing of octal escape sequences in sets, e.g. `[\141]`
|
23
|
+
* thanks to [Randy Stauner](https://github.com/rwstauner) for the report
|
24
|
+
|
25
|
+
## [2.6.2] - 2023-01-19 - [Janosch Müller](mailto:janosch84@gmail.com)
|
26
|
+
|
27
|
+
### Fixed
|
28
|
+
|
29
|
+
- fixed `SystemStackError` when cloning recursive subexpression calls
|
30
|
+
* e.g. `Regexp::Parser.parse(/a|b\g<0>/).dup`
|
31
|
+
|
32
|
+
## [2.6.1] - 2022-11-16 - [Janosch Müller](mailto:janosch84@gmail.com)
|
33
|
+
|
34
|
+
### Fixed
|
35
|
+
|
36
|
+
- fixed scanning of two negative lookbehind edge cases
|
37
|
+
* `(?<!x)y>` used to raise a ScannerError
|
38
|
+
* `(?<!x>)y` used to be misinterpreted as a named group
|
39
|
+
* thanks to [Sergio Medina](https://github.com/serch) for the report
|
40
|
+
|
41
|
+
## [2.6.0] - 2022-09-26 - [Janosch Müller](mailto:janosch84@gmail.com)
|
42
|
+
|
43
|
+
### Fixed
|
44
|
+
|
45
|
+
- fixed `#referenced_expression` for `\g<0>` (was `nil`, is now the `Root` exp)
|
46
|
+
- fixed `#reference`, `#referenced_expression` for recursion level backrefs
|
47
|
+
* e.g. `(a)(b)\k<-1+1>`
|
48
|
+
* `#referenced_expression` was `nil`, now it is the correct `Group` exp
|
49
|
+
- detect and raise for two more syntax errors when parsing String input
|
50
|
+
* quantification of option switches (e.g. `(?i)+`)
|
51
|
+
* invalid references (e.g. `/\k<1>/`)
|
52
|
+
* these are a `SyntaxError` in Ruby, so could only be passed as a String
|
53
|
+
|
54
|
+
### Added
|
55
|
+
|
56
|
+
- `Regexp::Expression::Base#human_name`
|
57
|
+
* returns a nice, human-readable description of the expression
|
58
|
+
- `Regexp::Expression::Base#optional?`
|
59
|
+
* returns `true` if the expression is quantified accordingly (e.g. with `*`, `{,n}`)
|
60
|
+
- added a deprecation warning when calling `#to_re` on set members
|
61
|
+
|
62
|
+
## [2.5.0] - 2022-05-27 - [Janosch Müller](mailto:janosch84@gmail.com)
|
63
|
+
|
64
|
+
### Added
|
65
|
+
|
66
|
+
- `Regexp::Expression::Base.construct` and `.token_class` methods
|
67
|
+
* see the [wiki](https://github.com/ammar/regexp_parser/wiki) for details
|
2
68
|
|
3
69
|
## [2.4.0] - 2022-05-09 - [Janosch Müller](mailto:janosch84@gmail.com)
|
4
70
|
|
5
71
|
### Fixed
|
6
72
|
|
7
73
|
- fixed interpretation of `+` and `?` after interval quantifiers (`{n,n}`)
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
74
|
+
* they used to be treated as reluctant or possessive mode indicators
|
75
|
+
* however, Ruby does not support these modes for interval quantifiers
|
76
|
+
* they are now treated as chained quantifiers instead, as Ruby does it
|
77
|
+
* cf. [#3](https://github.com/ammar/regexp_parser/issues/3)
|
12
78
|
- fixed `Expression::Base#nesting_level` for some tree rewrite cases
|
13
|
-
|
79
|
+
* e.g. the alternatives in `/a|[b]/` had an inconsistent nesting_level
|
14
80
|
- fixed `Scanner` accepting invalid posix classes, e.g. `[[:foo:]]`
|
15
|
-
|
16
|
-
|
81
|
+
* they raise a `SyntaxError` when used in a Regexp, so could only be passed as String
|
82
|
+
* they now raise a `Regexp::Scanner::ValidationError` in the `Scanner`
|
17
83
|
|
18
84
|
### Added
|
19
85
|
|
20
86
|
- added `Expression::Base#==` for (deep) comparison of expressions
|
21
87
|
- added `Expression::Base#parts`
|
22
|
-
|
23
|
-
|
88
|
+
* returns the text elements and subexpressions of an expression
|
89
|
+
* e.g. `parse(/(a)/)[0].parts # => ["(", #<Literal @text="a"...>, ")"]`
|
24
90
|
- added `Expression::Base#te` (a.k.a. token end index)
|
25
|
-
|
91
|
+
* `Expression::Subexpression` always had `#te`, only terminal nodes lacked it so far
|
26
92
|
- made some `Expression::Base` methods available on `Quantifier` instances, too
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
93
|
+
* `#type`, `#type?`, `#is?`, `#one_of?`, `#options`, `#terminal?`
|
94
|
+
* `#base_length`, `#full_length`, `#starts_at`, `#te`, `#ts`, `#offset`
|
95
|
+
* `#conditional_level`, `#level`, `#nesting_level`, `#set_level`
|
96
|
+
* this allows a more unified handling with `Expression::Base` instances
|
31
97
|
- allowed `Quantifier#initialize` to take a token and options Hash like other nodes
|
32
98
|
- added a deprecation warning for initializing Quantifiers with 4+ arguments:
|
33
99
|
|
@@ -36,10 +102,12 @@
|
|
36
102
|
|
37
103
|
It will no longer be supported in regexp_parser v3.0.0.
|
38
104
|
|
39
|
-
Please pass a Regexp::Token instead, e.g. replace `
|
40
|
-
with `::Regexp::Token.new(:quantifier,
|
105
|
+
Please pass a Regexp::Token instead, e.g. replace `token, text, min, max, mode`
|
106
|
+
with `::Regexp::Token.new(:quantifier, token, text)`. min, max, and mode
|
41
107
|
will be derived automatically.
|
42
108
|
|
109
|
+
Or do `exp.quantifier = Quantifier.construct(token: token, text: str)`.
|
110
|
+
|
43
111
|
This is consistent with how Expression::Base instances are created.
|
44
112
|
|
45
113
|
|
@@ -48,18 +116,18 @@
|
|
48
116
|
### Fixed
|
49
117
|
|
50
118
|
- removed five inexistent unicode properties from `Syntax#features`
|
51
|
-
|
52
|
-
|
119
|
+
* these were never supported by Ruby or the `Regexp::Scanner`
|
120
|
+
* thanks to [Markus Schirp](https://github.com/mbj) for the report
|
53
121
|
|
54
122
|
## [2.3.0] - 2022-04-08 - [Janosch Müller](mailto:janosch84@gmail.com)
|
55
123
|
|
56
124
|
### Added
|
57
125
|
|
58
126
|
- improved parsing performance through `Syntax` refactoring
|
59
|
-
|
60
|
-
|
127
|
+
* instead of fresh `Syntax` instances, pre-loaded constants are now re-used
|
128
|
+
* this approximately doubles the parsing speed for simple regexps
|
61
129
|
- added methods to `Syntax` classes to show relative feature sets
|
62
|
-
|
130
|
+
* e.g. `Regexp::Syntax::V3_2_0.added_features`
|
63
131
|
- support for new unicode properties of Ruby 3.2 / Unicode 14.0
|
64
132
|
|
65
133
|
## [2.2.1] - 2022-02-11 - [Janosch Müller](mailto:janosch84@gmail.com)
|
@@ -67,14 +135,14 @@
|
|
67
135
|
### Fixed
|
68
136
|
|
69
137
|
- fixed Syntax version of absence groups (`(?~...)`)
|
70
|
-
|
71
|
-
|
138
|
+
* the lexer accepted them for any Ruby version
|
139
|
+
* now they are only recognized for Ruby >= 2.4.1 in which they were introduced
|
72
140
|
- reduced gem size by excluding specs from package
|
73
141
|
- removed deprecated `test_files` gemspec setting
|
74
142
|
- no longer depend on `yaml`/`psych` (except for Ruby <= 2.4)
|
75
143
|
- no longer depend on `set`
|
76
|
-
|
77
|
-
|
144
|
+
* `set` was removed from the stdlib and made a standalone gem as of Ruby 3
|
145
|
+
* this made it a hidden/undeclared dependency of `regexp_parser`
|
78
146
|
|
79
147
|
## [2.2.0] - 2021-12-04 - [Janosch Müller](mailto:janosch84@gmail.com)
|
80
148
|
|
@@ -312,8 +380,8 @@
|
|
312
380
|
|
313
381
|
- Fixed missing quantifier in `Conditional::Expression` methods `#to_s`, `#to_re`
|
314
382
|
- `Conditional::Condition` no longer lives outside the recursive `#expressions` tree
|
315
|
-
|
316
|
-
|
383
|
+
* it used to be the only expression stored in a custom ivar, complicating traversal
|
384
|
+
* its setter and getter (`#condition=`, `#condition`) still work as before
|
317
385
|
|
318
386
|
## [1.1.0] - 2018-09-17 - [Janosch Müller](mailto:janosch84@gmail.com)
|
319
387
|
|
@@ -321,8 +389,8 @@
|
|
321
389
|
|
322
390
|
- Added `Quantifier` methods `#greedy?`, `#possessive?`, `#reluctant?`/`#lazy?`
|
323
391
|
- Added `Group::Options#option_changes`
|
324
|
-
|
325
|
-
|
392
|
+
* shows the options enabled or disabled by the given options group
|
393
|
+
* as with all other expressions, `#options` shows the overall active options
|
326
394
|
- Added `Conditional#reference` and `Condition#reference`, indicating the determinative group
|
327
395
|
- Added `Subexpression#dig`, acts like [`Array#dig`](http://ruby-doc.org/core-2.5.0/Array.html#method-i-dig)
|
328
396
|
|
@@ -506,7 +574,6 @@ This release includes several breaking changes, mostly to character sets, #map a
|
|
506
574
|
* Fixed scanning of zero length comments (PR #12)
|
507
575
|
* Fixed missing escape:codepoint_list syntax token (PR #14)
|
508
576
|
* Fixed to_s for modified interval quantifiers (PR #17)
|
509
|
-
- Added a note about MRI implementation quirks to Scanner section
|
510
577
|
|
511
578
|
## [0.3.2] - 2016-01-01 - [Ammar Ali](mailto:ammarabuali@gmail.com)
|
512
579
|
|
@@ -532,7 +599,6 @@ This release includes several breaking changes, mostly to character sets, #map a
|
|
532
599
|
- Renamed Lexer's method to lex, added an alias to the old name (scan)
|
533
600
|
- Use #map instead of #each to run the block in Lexer.lex.
|
534
601
|
- Replaced VERSION.yml file with a constant.
|
535
|
-
- Updated README
|
536
602
|
- Update tokens and scanner with new additions in Unicode 7.0.
|
537
603
|
|
538
604
|
## [0.1.6] - 2014-10-06 - [Ammar Ali](mailto:ammarabuali@gmail.com)
|
@@ -542,20 +608,11 @@ This release includes several breaking changes, mostly to character sets, #map a
|
|
542
608
|
- Added syntax files for missing ruby 2.x versions. These do not add
|
543
609
|
extra syntax support, they just make the gem work with the newer
|
544
610
|
ruby versions.
|
545
|
-
- Added .travis.yml to project root.
|
546
|
-
- README:
|
547
|
-
- Removed note purporting runtime support for ruby 1.8.6.
|
548
|
-
- Added a section identifying the main unsupported syntax features.
|
549
|
-
- Added sections for Testing and Building
|
550
|
-
- Added badges for gem version, Travis CI, and code climate.
|
551
|
-
- Updated README, fixing broken examples, and converting it from a rdoc file to Github's flavor of Markdown.
|
552
611
|
- Fixed a parser bug where an alternation sequence that contained nested expressions was incorrectly being appended to the parent expression when the nesting was exited. e.g. in /a|(b)c/, c was appended to the root.
|
553
|
-
|
554
612
|
- Fixed a bug where character types were not being correctly scanned within character sets. e.g. in [\d], two tokens were scanned; one for the backslash '\' and one for the 'd'
|
555
613
|
|
556
614
|
## [0.1.5] - 2014-01-14 - [Ammar Ali](mailto:ammarabuali@gmail.com)
|
557
615
|
|
558
|
-
- Correct ChangeLog.
|
559
616
|
- Added syntax stubs for ruby versions 2.0 and 2.1
|
560
617
|
- Added clone methods for deep copying expressions.
|
561
618
|
- Added optional format argument for to_s on expressions to return the text of the expression with (:full, the default) or without (:base) its quantifier.
|
@@ -564,7 +621,6 @@ This release includes several breaking changes, mostly to character sets, #map a
|
|
564
621
|
- Improved EOF handling in general and especially from sequences like hex and control escapes.
|
565
622
|
- Fixed a bug where named groups with an empty name would return a blank token [].
|
566
623
|
- Fixed a bug where members of a parent set were being added to its last subset.
|
567
|
-
- Various code cleanups in scanner.rl
|
568
624
|
- Fixed a few mutable string bugs by calling dup on the originals.
|
569
625
|
- Made ruby 1.8.6 the base for all 1.8 syntax, and the 1.8 name a pointer to the latest (1.8.7 at this time)
|
570
626
|
- Removed look-behind assertions (positive and negative) from 1.8 syntax
|
data/README.md
CHANGED
@@ -9,8 +9,8 @@ A Ruby gem for tokenizing, parsing, and transforming regular expressions.
|
|
9
9
|
|
10
10
|
* Multilayered
|
11
11
|
* A scanner/tokenizer based on [Ragel](http://www.colm.net/open-source/ragel/)
|
12
|
-
* A lexer that produces a "stream" of
|
13
|
-
* A parser that produces a "tree" of Expression objects (OO API)
|
12
|
+
* A lexer that produces a "stream" of [Token objects](https://github.com/ammar/regexp_parser/wiki/Token-Objects)
|
13
|
+
* A parser that produces a "tree" of [Expression objects (OO API)](https://github.com/ammar/regexp_parser/wiki/Expression-Objects)
|
14
14
|
* Runs on Ruby 2.x, 3.x and JRuby runtimes
|
15
15
|
* Recognizes Ruby 1.8, 1.9, 2.x and 3.x regular expressions [See Supported Syntax](#supported-syntax)
|
16
16
|
|
@@ -36,14 +36,15 @@ Or, add it to your project's `Gemfile`:
|
|
36
36
|
|
37
37
|
```gem 'regexp_parser', '~> X.Y.Z'```
|
38
38
|
|
39
|
-
See
|
39
|
+
See the badge at the top of this README or [rubygems](https://rubygems.org/gems/regexp_parser)
|
40
|
+
for the latest version number.
|
40
41
|
|
41
42
|
|
42
43
|
---
|
43
44
|
## Usage
|
44
45
|
|
45
46
|
The three main modules are **Scanner**, **Lexer**, and **Parser**. Each of them
|
46
|
-
provides a single method that takes a regular expression (as a
|
47
|
+
provides a single method that takes a regular expression (as a Regexp object or
|
47
48
|
a string) and returns its results. The **Lexer** and the **Parser** accept an
|
48
49
|
optional second argument that specifies the syntax version, like 'ruby/2.0',
|
49
50
|
which defaults to the host Ruby version (using RUBY_VERSION).
|
@@ -79,7 +80,7 @@ All three methods accept either a `Regexp` or `String` (containing the pattern)
|
|
79
80
|
require 'regexp_parser'
|
80
81
|
|
81
82
|
Regexp::Parser.parse(
|
82
|
-
"a+ #
|
83
|
+
"a+ # Recognizes a and A...",
|
83
84
|
options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE
|
84
85
|
)
|
85
86
|
```
|
@@ -101,7 +102,7 @@ start/end offsets for each token found.
|
|
101
102
|
```ruby
|
102
103
|
require 'regexp_parser'
|
103
104
|
|
104
|
-
Regexp::Scanner.scan
|
105
|
+
Regexp::Scanner.scan(/(ab?(cd)*[e-h]+)/) do |type, token, text, ts, te|
|
105
106
|
puts "type: #{type}, token: #{token}, text: '#{text}' [#{ts}..#{te}]"
|
106
107
|
end
|
107
108
|
|
@@ -124,7 +125,7 @@ A one-liner that uses map on the result of the scan to return the textual
|
|
124
125
|
parts of the pattern:
|
125
126
|
|
126
127
|
```ruby
|
127
|
-
Regexp::Scanner.scan(
|
128
|
+
Regexp::Scanner.scan(/(cat?([bhm]at)){3,5}/).map { |token| token[2] }
|
128
129
|
#=> ["(", "cat", "?", "(", "[", "b", "h", "m", "]", "at", ")", ")", "{3,5}"]
|
129
130
|
```
|
130
131
|
|
@@ -220,7 +221,7 @@ syntax, and prints the token objects' text indented to their level.
|
|
220
221
|
```ruby
|
221
222
|
require 'regexp_parser'
|
222
223
|
|
223
|
-
Regexp::Lexer.lex
|
224
|
+
Regexp::Lexer.lex(/a?(b(c))*[d]+/, 'ruby/1.9') do |token|
|
224
225
|
puts "#{' ' * token.level}#{token.text}"
|
225
226
|
end
|
226
227
|
|
@@ -246,7 +247,7 @@ how the sequence 'cat' is treated. The 't' is separated because it's followed
|
|
246
247
|
by a quantifier that only applies to it.
|
247
248
|
|
248
249
|
```ruby
|
249
|
-
Regexp::Lexer.scan(
|
250
|
+
Regexp::Lexer.scan(/(cat?([b]at)){3,5}/).map { |token| token.text }
|
250
251
|
#=> ["(", "ca", "t", "?", "(", "[", "b", "]", "at", ")", ")", "{3,5}"]
|
251
252
|
```
|
252
253
|
|
@@ -274,7 +275,7 @@ require 'regexp_parser'
|
|
274
275
|
|
275
276
|
regex = /a?(b+(c)d)*(?<name>[0-9]+)/
|
276
277
|
|
277
|
-
tree = Regexp::Parser.parse(
|
278
|
+
tree = Regexp::Parser.parse(regex, 'ruby/2.1')
|
278
279
|
|
279
280
|
tree.traverse do |event, exp|
|
280
281
|
puts "#{event}: #{exp.type} `#{exp.to_s}`"
|
@@ -355,7 +356,7 @@ _Note that not all of these are available in all versions of Ruby_
|
|
355
356
|
|   _Nest Level_ | `\k<n-1>` | ✓ |
|
356
357
|
|   _Numbered_ | `\k<1>` | ✓ |
|
357
358
|
|   _Relative_ | `\k<-2>` | ✓ |
|
358
|
-
|   _Traditional_ | `\1`
|
359
|
+
|   _Traditional_ | `\1` through `\9` | ✓ |
|
359
360
|
|   _**Capturing**_ | `(abc)` | ✓ |
|
360
361
|
|   _**Comments**_ | `(?# comment text)` | ✓ |
|
361
362
|
|   _**Named**_ | `(?<name>abc)`, `(?'name'abc)` | ✓ |
|
@@ -375,7 +376,7 @@ _Note that not all of these are available in all versions of Ruby_
|
|
375
376
|
|   _**Meta** \[2\]_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | ✓ |
|
376
377
|
|   _**Octal**_ | `\0`, `\01`, `\012` | ✓ |
|
377
378
|
|   _**Unicode**_ | `\uHHHH`, `\u{H+ H+}` | ✓ |
|
378
|
-
| **Unicode Properties** | _<sub>([Unicode 13.0.0]
|
379
|
+
| **Unicode Properties** | _<sub>([Unicode 13.0.0])</sub>_ | ⋱ |
|
379
380
|
|   _**Age**_ | `\p{Age=5.2}`, `\P{age=7.0}`, `\p{^age=8.0}` | ✓ |
|
380
381
|
|   _**Blocks**_ | `\p{InArmenian}`, `\P{InKhmer}`, `\p{^InThai}` | ✓ |
|
381
382
|
|   _**Classes**_ | `\p{Alpha}`, `\P{Space}`, `\p{^Alnum}` | ✓ |
|
@@ -384,13 +385,17 @@ _Note that not all of these are available in all versions of Ruby_
|
|
384
385
|
|   _**Scripts**_ | `\p{Arabic}`, `\P{Hiragana}`, `\p{^Greek}` | ✓ |
|
385
386
|
|   _**Simple**_ | `\p{Dash}`, `\p{Extender}`, `\p{^Hyphen}` | ✓ |
|
386
387
|
|
387
|
-
|
388
|
-
|
388
|
+
[Unicode 13.0.0]: https://www.unicode.org/versions/Unicode13.0.0/
|
389
|
+
|
390
|
+
**\[1\]**: Ruby does not support lazy or possessive interval quantifiers.
|
391
|
+
Any `+` or `?` that follows an interval quantifier will be treated as another,
|
392
|
+
chained quantifier. See also [#3](https://github.com/ammar/regexp_parser/issues/3),
|
389
393
|
[#69](https://github.com/ammar/regexp_parser/pull/69).
|
390
394
|
|
391
|
-
**\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex
|
392
|
-
https://github.com/ruby/ruby/commit/
|
393
|
-
scanner and will only be emitted if a String or a Regexp
|
395
|
+
**\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex
|
396
|
+
escapes when used in Regexp literals](https://github.com/ruby/ruby/commit/11ae581),
|
397
|
+
so they will only reach the scanner and will only be emitted if a String or a Regexp
|
398
|
+
that has been built with the `::new` constructor is scanned.
|
394
399
|
|
395
400
|
##### Inapplicable Features
|
396
401
|
|
@@ -407,25 +412,27 @@ expressions library (Onigmo). They are not supported by the scanner.
|
|
407
412
|
|
408
413
|
See something missing? Please submit an [issue](https://github.com/ammar/regexp_parser/issues)
|
409
414
|
|
410
|
-
_**Note**: Attempting to process expressions with unsupported syntax features can raise
|
411
|
-
or incorrectly return tokens/objects as literals._
|
415
|
+
_**Note**: Attempting to process expressions with unsupported syntax features can raise
|
416
|
+
an error, or incorrectly return tokens/objects as literals._
|
412
417
|
|
413
418
|
|
414
419
|
## Testing
|
415
420
|
To run the tests simply run rake from the root directory.
|
416
421
|
|
417
|
-
The default task generates the scanner's code from the Ragel source files and runs
|
422
|
+
The default task generates the scanner's code from the Ragel source files and runs
|
423
|
+
all the specs, thus it requires Ragel to be installed.
|
418
424
|
|
419
|
-
Note that changes to Ragel files will not be reflected when running `rspec` on its own,
|
425
|
+
Note that changes to Ragel files will not be reflected when running `rspec` on its own,
|
426
|
+
so to run individual tests you might want to run:
|
420
427
|
|
421
428
|
```
|
422
429
|
rake ragel:rb && rspec spec/scanner/properties_spec.rb
|
423
430
|
```
|
424
431
|
|
425
432
|
## Building
|
426
|
-
Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/)
|
427
|
-
installed. The build tasks will automatically invoke the 'ragel:rb' task to generate
|
428
|
-
Ruby scanner code.
|
433
|
+
Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/)
|
434
|
+
to be installed. The build tasks will automatically invoke the 'ragel:rb' task to generate
|
435
|
+
the Ruby scanner code.
|
429
436
|
|
430
437
|
|
431
438
|
The project uses the standard rubygems package tasks, so:
|
@@ -445,17 +452,26 @@ rake install
|
|
445
452
|
## Example Projects
|
446
453
|
Projects using regexp_parser.
|
447
454
|
|
448
|
-
- [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool
|
455
|
+
- [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool
|
456
|
+
that uses regexp_parser to convert Regexps to css/xpath selectors.
|
457
|
+
|
458
|
+
- [js_regex](https://github.com/jaynetics/js_regex) converts Ruby regular expressions
|
459
|
+
to JavaScript-compatible regular expressions.
|
449
460
|
|
450
|
-
- [
|
461
|
+
- [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor
|
462
|
+
with alias support.
|
451
463
|
|
452
|
-
- [
|
464
|
+
- [mutant](https://github.com/mbj/mutant) manipulates your regular expressions
|
465
|
+
(amongst others) to see if your tests cover their behavior.
|
453
466
|
|
454
|
-
- [
|
467
|
+
- [repper](https://github.com/jaynetics/repper) is a regular expression
|
468
|
+
pretty-printer and formatter for Ruby.
|
455
469
|
|
456
|
-
- [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that
|
470
|
+
- [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that
|
471
|
+
uses regexp_parser to lint Regexps.
|
457
472
|
|
458
|
-
- [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) is a localization helper
|
473
|
+
- [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) is a localization helper
|
474
|
+
that uses regexp_parser to generate examples of postal codes.
|
459
475
|
|
460
476
|
|
461
477
|
## References
|
@@ -14,6 +14,10 @@ module Regexp::Expression
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def to_re(format = :full)
|
17
|
+
if set_level > 0
|
18
|
+
warn "Calling #to_re on character set members is deprecated - "\
|
19
|
+
"their behavior might not be equivalent outside of the set."
|
20
|
+
end
|
17
21
|
::Regexp.new(to_s(format))
|
18
22
|
end
|
19
23
|
|
@@ -32,15 +36,19 @@ module Regexp::Expression
|
|
32
36
|
end
|
33
37
|
|
34
38
|
def repetitions
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
39
|
+
@repetitions ||=
|
40
|
+
if quantified?
|
41
|
+
min = quantifier.min
|
42
|
+
max = quantifier.max < 0 ? Float::INFINITY : quantifier.max
|
43
|
+
range = min..max
|
44
|
+
# fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807
|
45
|
+
if RUBY_VERSION.to_f < 2.7
|
46
|
+
range.define_singleton_method(:minmax) { [min, max] }
|
47
|
+
end
|
48
|
+
range
|
49
|
+
else
|
50
|
+
1..1
|
51
|
+
end
|
44
52
|
end
|
45
53
|
|
46
54
|
def greedy?
|
@@ -1,12 +1,29 @@
|
|
1
1
|
module Regexp::Expression
|
2
|
+
# TODO: unify name with token :backref, one way or the other, in v3.0.0
|
2
3
|
module Backreference
|
3
4
|
class Base < Regexp::Expression::Base
|
4
5
|
attr_accessor :referenced_expression
|
5
6
|
|
6
7
|
def initialize_copy(orig)
|
7
|
-
|
8
|
+
exp_id = [self.class, self.starts_at]
|
9
|
+
|
10
|
+
# prevent infinite recursion for recursive subexp calls
|
11
|
+
copied = @@copied ||= {}
|
12
|
+
self.referenced_expression =
|
13
|
+
if copied[exp_id]
|
14
|
+
orig.referenced_expression
|
15
|
+
else
|
16
|
+
copied[exp_id] = true
|
17
|
+
orig.referenced_expression.dup
|
18
|
+
end
|
19
|
+
copied.clear
|
20
|
+
|
8
21
|
super
|
9
22
|
end
|
23
|
+
|
24
|
+
def referential?
|
25
|
+
true
|
26
|
+
end
|
10
27
|
end
|
11
28
|
|
12
29
|
class Number < Backreference::Base
|
@@ -38,7 +55,7 @@ module Regexp::Expression
|
|
38
55
|
class NameCall < Backreference::Name; end
|
39
56
|
class NumberCallRelative < Backreference::NumberRelative; end
|
40
57
|
|
41
|
-
class NumberRecursionLevel < Backreference::
|
58
|
+
class NumberRecursionLevel < Backreference::NumberRelative
|
42
59
|
attr_reader :recursion_level
|
43
60
|
|
44
61
|
def initialize(token, options = {})
|
File without changes
|
@@ -20,6 +20,10 @@ module Regexp::Expression
|
|
20
20
|
self.referenced_expression = orig.referenced_expression.dup
|
21
21
|
super
|
22
22
|
end
|
23
|
+
|
24
|
+
def referential?
|
25
|
+
true
|
26
|
+
end
|
23
27
|
end
|
24
28
|
|
25
29
|
class Branch < Regexp::Expression::Sequence; end
|
@@ -55,6 +59,10 @@ module Regexp::Expression
|
|
55
59
|
condition.reference
|
56
60
|
end
|
57
61
|
|
62
|
+
def referential?
|
63
|
+
true
|
64
|
+
end
|
65
|
+
|
58
66
|
def parts
|
59
67
|
[text.dup, condition, *intersperse(branches, '|'), ')']
|
60
68
|
end
|
@@ -33,6 +33,8 @@ module Regexp::Expression
|
|
33
33
|
|
34
34
|
class Absence < Group::Base; end
|
35
35
|
class Atomic < Group::Base; end
|
36
|
+
# TODO: should split off OptionsSwitch in v3.0.0. Maybe even make it no
|
37
|
+
# longer inherit from Group because it is effectively a terminal expression.
|
36
38
|
class Options < Group::Base
|
37
39
|
attr_accessor :option_changes
|
38
40
|
|
@@ -40,6 +42,14 @@ module Regexp::Expression
|
|
40
42
|
self.option_changes = orig.option_changes.dup
|
41
43
|
super
|
42
44
|
end
|
45
|
+
|
46
|
+
def quantify(*args)
|
47
|
+
if token == :options_switch
|
48
|
+
raise Regexp::Parser::Error, 'Can not quantify an option switch'
|
49
|
+
else
|
50
|
+
super
|
51
|
+
end
|
52
|
+
end
|
43
53
|
end
|
44
54
|
|
45
55
|
class Capture < Group::Base
|
@@ -1,11 +1,9 @@
|
|
1
1
|
module Regexp::Expression
|
2
2
|
class Root < Regexp::Expression::Subexpression
|
3
3
|
def self.build(options = {})
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
def self.build_token
|
8
|
-
Regexp::Token.new(:expression, :root, '', 0)
|
4
|
+
warn "`#{self.class}.build(options)` is deprecated and will raise in "\
|
5
|
+
"regexp_parser v3.0.0. Please use `.construct(options: options)`."
|
6
|
+
construct(options: options)
|
9
7
|
end
|
10
8
|
end
|
11
9
|
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Regexp::Expression
|
2
|
+
module Shared
|
3
|
+
module ClassMethods
|
4
|
+
# Convenience method to init a valid Expression without a Regexp::Token
|
5
|
+
def construct(params = {})
|
6
|
+
attrs = construct_defaults.merge(params)
|
7
|
+
options = attrs.delete(:options)
|
8
|
+
token_args = Regexp::TOKEN_KEYS.map { |k| attrs.delete(k) }
|
9
|
+
token = Regexp::Token.new(*token_args)
|
10
|
+
raise ArgumentError, "unsupported attribute(s): #{attrs}" if attrs.any?
|
11
|
+
|
12
|
+
new(token, options)
|
13
|
+
end
|
14
|
+
|
15
|
+
def construct_defaults
|
16
|
+
if self == Root
|
17
|
+
{ type: :expression, token: :root, ts: 0 }
|
18
|
+
elsif self < Sequence
|
19
|
+
{ type: :expression, token: :sequence }
|
20
|
+
else
|
21
|
+
{ type: token_class::Type }
|
22
|
+
end.merge(level: 0, set_level: 0, conditional_level: 0, text: '')
|
23
|
+
end
|
24
|
+
|
25
|
+
def token_class
|
26
|
+
if self == Root || self < Sequence
|
27
|
+
nil # no token class because these objects are Parser-generated
|
28
|
+
# TODO: synch exp & token class names for alt., dot, escapes in v3.0.0
|
29
|
+
elsif self == Alternation || self == CharacterType::Any
|
30
|
+
Regexp::Syntax::Token::Meta
|
31
|
+
elsif self <= EscapeSequence::Base
|
32
|
+
Regexp::Syntax::Token::Escape
|
33
|
+
else
|
34
|
+
Regexp::Syntax::Token.const_get(name.split('::')[2])
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def token_class
|
40
|
+
self.class.token_class
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|