regexp_parser 2.4.0 → 2.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +98 -42
- data/README.md +46 -30
- data/lib/regexp_parser/expression/base.rb +17 -9
- data/lib/regexp_parser/expression/classes/backreference.rb +19 -2
- data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -0
- data/lib/regexp_parser/expression/classes/conditional.rb +8 -0
- data/lib/regexp_parser/expression/classes/escape_sequence.rb +1 -1
- data/lib/regexp_parser/expression/classes/group.rb +10 -0
- data/lib/regexp_parser/expression/classes/keep.rb +2 -0
- data/lib/regexp_parser/expression/classes/root.rb +3 -5
- data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +1 -0
- data/lib/regexp_parser/expression/methods/construct.rb +43 -0
- data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +9 -5
- data/lib/regexp_parser/expression/methods/traverse.rb +6 -3
- data/lib/regexp_parser/expression/quantifier.rb +6 -5
- data/lib/regexp_parser/expression/sequence.rb +6 -21
- data/lib/regexp_parser/expression/shared.rb +20 -3
- data/lib/regexp_parser/expression/subexpression.rb +4 -1
- data/lib/regexp_parser/expression.rb +4 -2
- data/lib/regexp_parser/lexer.rb +61 -29
- data/lib/regexp_parser/parser.rb +36 -26
- data/lib/regexp_parser/scanner/property.rl +1 -1
- data/lib/regexp_parser/scanner/scanner.rl +57 -42
- data/lib/regexp_parser/scanner.rb +873 -823
- data/lib/regexp_parser/syntax/token/escape.rb +1 -1
- data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
- data/lib/regexp_parser/syntax/versions.rb +2 -0
- data/lib/regexp_parser/version.rb +1 -1
- metadata +7 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 04af46818e9d560362fea9b3fd24802b557ac145ed95f6e02580dd7cf5e8ddfc
|
4
|
+
data.tar.gz: 75b7d30241f48ddf90c8cd68228fa928904ab6055ea755f4bdcf28361e645a4b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 407025a9b14af76463260fca2a48f9fef4ab863e3dddf3f7f54101c1348611afa49d9973e850d9e1c84d6e5faf8f1a9d3d2da5dceaefe8dc4fefe7069ecd9280
|
7
|
+
data.tar.gz: 9f3d2eb4264318511a82e9034c4c4a8a8e73e67e427945f0c9f745fd37b2f2f0ae8e30ba942f0920da3109b59436a5518dfc5e2f7669317de0214a0deb6f0e07
|
data/CHANGELOG.md
CHANGED
@@ -1,33 +1,99 @@
|
|
1
|
-
|
1
|
+
# Changelog
|
2
|
+
|
3
|
+
All notable changes to this project will be documented in this file.
|
4
|
+
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
7
|
+
|
8
|
+
## [2.7.0] - 2023-02-08 - [Janosch Müller](mailto:janosch84@gmail.com)
|
9
|
+
|
10
|
+
### Added
|
11
|
+
|
12
|
+
- `Regexp::Lexer.lex` now streams tokens when called with a block
|
13
|
+
- it can now take arbitrarily large input, just like `Regexp::Scanner`
|
14
|
+
- this also slightly improves `Regexp::Parser.parse` performance
|
15
|
+
- note: `Regexp::Parser.parse` still does not and will not support streaming
|
16
|
+
- improved performance of `Subexpression#each_expression`
|
17
|
+
- minor improvements to `Regexp::Scanner` performance
|
18
|
+
- overall improvement of parse performance: about 10% for large Regexps
|
19
|
+
|
20
|
+
### Fixed
|
21
|
+
|
22
|
+
- parsing of octal escape sequences in sets, e.g. `[\141]`
|
23
|
+
* thanks to [Randy Stauner](https://github.com/rwstauner) for the report
|
24
|
+
|
25
|
+
## [2.6.2] - 2023-01-19 - [Janosch Müller](mailto:janosch84@gmail.com)
|
26
|
+
|
27
|
+
### Fixed
|
28
|
+
|
29
|
+
- fixed `SystemStackError` when cloning recursive subexpression calls
|
30
|
+
* e.g. `Regexp::Parser.parse(/a|b\g<0>/).dup`
|
31
|
+
|
32
|
+
## [2.6.1] - 2022-11-16 - [Janosch Müller](mailto:janosch84@gmail.com)
|
33
|
+
|
34
|
+
### Fixed
|
35
|
+
|
36
|
+
- fixed scanning of two negative lookbehind edge cases
|
37
|
+
* `(?<!x)y>` used to raise a ScannerError
|
38
|
+
* `(?<!x>)y` used to be misinterpreted as a named group
|
39
|
+
* thanks to [Sergio Medina](https://github.com/serch) for the report
|
40
|
+
|
41
|
+
## [2.6.0] - 2022-09-26 - [Janosch Müller](mailto:janosch84@gmail.com)
|
42
|
+
|
43
|
+
### Fixed
|
44
|
+
|
45
|
+
- fixed `#referenced_expression` for `\g<0>` (was `nil`, is now the `Root` exp)
|
46
|
+
- fixed `#reference`, `#referenced_expression` for recursion level backrefs
|
47
|
+
* e.g. `(a)(b)\k<-1+1>`
|
48
|
+
* `#referenced_expression` was `nil`, now it is the correct `Group` exp
|
49
|
+
- detect and raise for two more syntax errors when parsing String input
|
50
|
+
* quantification of option switches (e.g. `(?i)+`)
|
51
|
+
* invalid references (e.g. `/\k<1>/`)
|
52
|
+
* these are a `SyntaxError` in Ruby, so could only be passed as a String
|
53
|
+
|
54
|
+
### Added
|
55
|
+
|
56
|
+
- `Regexp::Expression::Base#human_name`
|
57
|
+
* returns a nice, human-readable description of the expression
|
58
|
+
- `Regexp::Expression::Base#optional?`
|
59
|
+
* returns `true` if the expression is quantified accordingly (e.g. with `*`, `{,n}`)
|
60
|
+
- added a deprecation warning when calling `#to_re` on set members
|
61
|
+
|
62
|
+
## [2.5.0] - 2022-05-27 - [Janosch Müller](mailto:janosch84@gmail.com)
|
63
|
+
|
64
|
+
### Added
|
65
|
+
|
66
|
+
- `Regexp::Expression::Base.construct` and `.token_class` methods
|
67
|
+
* see the [wiki](https://github.com/ammar/regexp_parser/wiki) for details
|
2
68
|
|
3
69
|
## [2.4.0] - 2022-05-09 - [Janosch Müller](mailto:janosch84@gmail.com)
|
4
70
|
|
5
71
|
### Fixed
|
6
72
|
|
7
73
|
- fixed interpretation of `+` and `?` after interval quantifiers (`{n,n}`)
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
74
|
+
* they used to be treated as reluctant or possessive mode indicators
|
75
|
+
* however, Ruby does not support these modes for interval quantifiers
|
76
|
+
* they are now treated as chained quantifiers instead, as Ruby does it
|
77
|
+
* c.f. [#3](https://github.com/ammar/regexp_parser/issues/3)
|
12
78
|
- fixed `Expression::Base#nesting_level` for some tree rewrite cases
|
13
|
-
|
79
|
+
* e.g. the alternatives in `/a|[b]/` had an inconsistent nesting_level
|
14
80
|
- fixed `Scanner` accepting invalid posix classes, e.g. `[[:foo:]]`
|
15
|
-
|
16
|
-
|
81
|
+
* they raise a `SyntaxError` when used in a Regexp, so could only be passed as String
|
82
|
+
* they now raise a `Regexp::Scanner::ValidationError` in the `Scanner`
|
17
83
|
|
18
84
|
### Added
|
19
85
|
|
20
86
|
- added `Expression::Base#==` for (deep) comparison of expressions
|
21
87
|
- added `Expression::Base#parts`
|
22
|
-
|
23
|
-
|
88
|
+
* returns the text elements and subexpressions of an expression
|
89
|
+
* e.g. `parse(/(a)/)[0].parts # => ["(", #<Literal @text="a"...>, ")"]`
|
24
90
|
- added `Expression::Base#te` (a.k.a. token end index)
|
25
|
-
|
91
|
+
* `Expression::Subexpression` always had `#te`, only terminal nodes lacked it so far
|
26
92
|
- made some `Expression::Base` methods available on `Quantifier` instances, too
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
93
|
+
* `#type`, `#type?`, `#is?`, `#one_of?`, `#options`, `#terminal?`
|
94
|
+
* `#base_length`, `#full_length`, `#starts_at`, `#te`, `#ts`, `#offset`
|
95
|
+
* `#conditional_level`, `#level`, `#nesting_level` , `#set_level`
|
96
|
+
* this allows a more unified handling with `Expression::Base` instances
|
31
97
|
- allowed `Quantifier#initialize` to take a token and options Hash like other nodes
|
32
98
|
- added a deprecation warning for initializing Quantifiers with 4+ arguments:
|
33
99
|
|
@@ -36,10 +102,12 @@
|
|
36
102
|
|
37
103
|
It will no longer be supported in regexp_parser v3.0.0.
|
38
104
|
|
39
|
-
Please pass a Regexp::Token instead, e.g. replace `
|
40
|
-
with `::Regexp::Token.new(:quantifier,
|
105
|
+
Please pass a Regexp::Token instead, e.g. replace `token, text, min, max, mode`
|
106
|
+
with `::Regexp::Token.new(:quantifier, token, text)`. min, max, and mode
|
41
107
|
will be derived automatically.
|
42
108
|
|
109
|
+
Or do `exp.quantifier = Quantifier.construct(token: token, text: str)`.
|
110
|
+
|
43
111
|
This is consistent with how Expression::Base instances are created.
|
44
112
|
|
45
113
|
|
@@ -48,18 +116,18 @@
|
|
48
116
|
### Fixed
|
49
117
|
|
50
118
|
- removed five inexistent unicode properties from `Syntax#features`
|
51
|
-
|
52
|
-
|
119
|
+
* these were never supported by Ruby or the `Regexp::Scanner`
|
120
|
+
* thanks to [Markus Schirp](https://github.com/mbj) for the report
|
53
121
|
|
54
122
|
## [2.3.0] - 2022-04-08 - [Janosch Müller](mailto:janosch84@gmail.com)
|
55
123
|
|
56
124
|
### Added
|
57
125
|
|
58
126
|
- improved parsing performance through `Syntax` refactoring
|
59
|
-
|
60
|
-
|
127
|
+
* instead of fresh `Syntax` instances, pre-loaded constants are now re-used
|
128
|
+
* this approximately doubles the parsing speed for simple regexps
|
61
129
|
- added methods to `Syntax` classes to show relative feature sets
|
62
|
-
|
130
|
+
* e.g. `Regexp::Syntax::V3_2_0.added_features`
|
63
131
|
- support for new unicode properties of Ruby 3.2 / Unicode 14.0
|
64
132
|
|
65
133
|
## [2.2.1] - 2022-02-11 - [Janosch Müller](mailto:janosch84@gmail.com)
|
@@ -67,14 +135,14 @@
|
|
67
135
|
### Fixed
|
68
136
|
|
69
137
|
- fixed Syntax version of absence groups (`(?~...)`)
|
70
|
-
|
71
|
-
|
138
|
+
* the lexer accepted them for any Ruby version
|
139
|
+
* now they are only recognized for Ruby >= 2.4.1 in which they were introduced
|
72
140
|
- reduced gem size by excluding specs from package
|
73
141
|
- removed deprecated `test_files` gemspec setting
|
74
142
|
- no longer depend on `yaml`/`psych` (except for Ruby <= 2.4)
|
75
143
|
- no longer depend on `set`
|
76
|
-
|
77
|
-
|
144
|
+
* `set` was removed from the stdlib and made a standalone gem as of Ruby 3
|
145
|
+
* this made it a hidden/undeclared dependency of `regexp_parser`
|
78
146
|
|
79
147
|
## [2.2.0] - 2021-12-04 - [Janosch Müller](mailto:janosch84@gmail.com)
|
80
148
|
|
@@ -312,8 +380,8 @@
|
|
312
380
|
|
313
381
|
- Fixed missing quantifier in `Conditional::Expression` methods `#to_s`, `#to_re`
|
314
382
|
- `Conditional::Condition` no longer lives outside the recursive `#expressions` tree
|
315
|
-
|
316
|
-
|
383
|
+
* it used to be the only expression stored in a custom ivar, complicating traversal
|
384
|
+
* its setter and getter (`#condition=`, `#condition`) still work as before
|
317
385
|
|
318
386
|
## [1.1.0] - 2018-09-17 - [Janosch Müller](mailto:janosch84@gmail.com)
|
319
387
|
|
@@ -321,8 +389,8 @@
|
|
321
389
|
|
322
390
|
- Added `Quantifier` methods `#greedy?`, `#possessive?`, `#reluctant?`/`#lazy?`
|
323
391
|
- Added `Group::Options#option_changes`
|
324
|
-
|
325
|
-
|
392
|
+
* shows the options enabled or disabled by the given options group
|
393
|
+
* as with all other expressions, `#options` shows the overall active options
|
326
394
|
- Added `Conditional#reference` and `Condition#reference`, indicating the determinative group
|
327
395
|
- Added `Subexpression#dig`, acts like [`Array#dig`](http://ruby-doc.org/core-2.5.0/Array.html#method-i-dig)
|
328
396
|
|
@@ -506,7 +574,6 @@ This release includes several breaking changes, mostly to character sets, #map a
|
|
506
574
|
* Fixed scanning of zero length comments (PR #12)
|
507
575
|
* Fixed missing escape:codepoint_list syntax token (PR #14)
|
508
576
|
* Fixed to_s for modified interval quantifiers (PR #17)
|
509
|
-
- Added a note about MRI implementation quirks to Scanner section
|
510
577
|
|
511
578
|
## [0.3.2] - 2016-01-01 - [Ammar Ali](mailto:ammarabuali@gmail.com)
|
512
579
|
|
@@ -532,7 +599,6 @@ This release includes several breaking changes, mostly to character sets, #map a
|
|
532
599
|
- Renamed Lexer's method to lex, added an alias to the old name (scan)
|
533
600
|
- Use #map instead of #each to run the block in Lexer.lex.
|
534
601
|
- Replaced VERSION.yml file with a constant.
|
535
|
-
- Updated README
|
536
602
|
- Update tokens and scanner with new additions in Unicode 7.0.
|
537
603
|
|
538
604
|
## [0.1.6] - 2014-10-06 - [Ammar Ali](mailto:ammarabuali@gmail.com)
|
@@ -542,20 +608,11 @@ This release includes several breaking changes, mostly to character sets, #map a
|
|
542
608
|
- Added syntax files for missing ruby 2.x versions. These do not add
|
543
609
|
extra syntax support, they just make the gem work with the newer
|
544
610
|
ruby versions.
|
545
|
-
- Added .travis.yml to project root.
|
546
|
-
- README:
|
547
|
-
- Removed note purporting runtime support for ruby 1.8.6.
|
548
|
-
- Added a section identifying the main unsupported syntax features.
|
549
|
-
- Added sections for Testing and Building
|
550
|
-
- Added badges for gem version, Travis CI, and code climate.
|
551
|
-
- Updated README, fixing broken examples, and converting it from a rdoc file to Github's flavor of Markdown.
|
552
611
|
- Fixed a parser bug where an alternation sequence that contained nested expressions was incorrectly being appended to the parent expression when the nesting was exited. e.g. in /a|(b)c/, c was appended to the root.
|
553
|
-
|
554
612
|
- Fixed a bug where character types were not being correctly scanned within character sets. e.g. in [\d], two tokens were scanned; one for the backslash '\' and one for the 'd'
|
555
613
|
|
556
614
|
## [0.1.5] - 2014-01-14 - [Ammar Ali](mailto:ammarabuali@gmail.com)
|
557
615
|
|
558
|
-
- Correct ChangeLog.
|
559
616
|
- Added syntax stubs for ruby versions 2.0 and 2.1
|
560
617
|
- Added clone methods for deep copying expressions.
|
561
618
|
- Added optional format argument for to_s on expressions to return the text of the expression with (:full, the default) or without (:base) its quantifier.
|
@@ -564,7 +621,6 @@ This release includes several breaking changes, mostly to character sets, #map a
|
|
564
621
|
- Improved EOF handling in general and especially from sequences like hex and control escapes.
|
565
622
|
- Fixed a bug where named groups with an empty name would return a blank token [].
|
566
623
|
- Fixed a bug where members of a parent set were being added to its last subset.
|
567
|
-
- Various code cleanups in scanner.rl
|
568
624
|
- Fixed a few mutable string bugs by calling dup on the originals.
|
569
625
|
- Made ruby 1.8.6 the base for all 1.8 syntax, and the 1.8 name a pointer to the latest (1.8.7 at this time)
|
570
626
|
- Removed look-behind assertions (positive and negative) from 1.8 syntax
|
data/README.md
CHANGED
@@ -9,8 +9,8 @@ A Ruby gem for tokenizing, parsing, and transforming regular expressions.
|
|
9
9
|
|
10
10
|
* Multilayered
|
11
11
|
* A scanner/tokenizer based on [Ragel](http://www.colm.net/open-source/ragel/)
|
12
|
-
* A lexer that produces a "stream" of
|
13
|
-
* A parser that produces a "tree" of Expression objects (OO API)
|
12
|
+
* A lexer that produces a "stream" of [Token objects](https://github.com/ammar/regexp_parser/wiki/Token-Objects)
|
13
|
+
* A parser that produces a "tree" of [Expression objects (OO API)](https://github.com/ammar/regexp_parser/wiki/Expression-Objects)
|
14
14
|
* Runs on Ruby 2.x, 3.x and JRuby runtimes
|
15
15
|
* Recognizes Ruby 1.8, 1.9, 2.x and 3.x regular expressions [See Supported Syntax](#supported-syntax)
|
16
16
|
|
@@ -36,14 +36,15 @@ Or, add it to your project's `Gemfile`:
|
|
36
36
|
|
37
37
|
```gem 'regexp_parser', '~> X.Y.Z'```
|
38
38
|
|
39
|
-
See
|
39
|
+
See the badge at the top of this README or [rubygems](https://rubygems.org/gems/regexp_parser)
|
40
|
+
for the latest version number.
|
40
41
|
|
41
42
|
|
42
43
|
---
|
43
44
|
## Usage
|
44
45
|
|
45
46
|
The three main modules are **Scanner**, **Lexer**, and **Parser**. Each of them
|
46
|
-
provides a single method that takes a regular expression (as a
|
47
|
+
provides a single method that takes a regular expression (as a Regexp object or
|
47
48
|
a string) and returns its results. The **Lexer** and the **Parser** accept an
|
48
49
|
optional second argument that specifies the syntax version, like 'ruby/2.0',
|
49
50
|
which defaults to the host Ruby version (using RUBY_VERSION).
|
@@ -79,7 +80,7 @@ All three methods accept either a `Regexp` or `String` (containing the pattern)
|
|
79
80
|
require 'regexp_parser'
|
80
81
|
|
81
82
|
Regexp::Parser.parse(
|
82
|
-
"a+ #
|
83
|
+
"a+ # Recognizes a and A...",
|
83
84
|
options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE
|
84
85
|
)
|
85
86
|
```
|
@@ -101,7 +102,7 @@ start/end offsets for each token found.
|
|
101
102
|
```ruby
|
102
103
|
require 'regexp_parser'
|
103
104
|
|
104
|
-
Regexp::Scanner.scan
|
105
|
+
Regexp::Scanner.scan(/(ab?(cd)*[e-h]+)/) do |type, token, text, ts, te|
|
105
106
|
puts "type: #{type}, token: #{token}, text: '#{text}' [#{ts}..#{te}]"
|
106
107
|
end
|
107
108
|
|
@@ -124,7 +125,7 @@ A one-liner that uses map on the result of the scan to return the textual
|
|
124
125
|
parts of the pattern:
|
125
126
|
|
126
127
|
```ruby
|
127
|
-
Regexp::Scanner.scan(
|
128
|
+
Regexp::Scanner.scan(/(cat?([bhm]at)){3,5}/).map { |token| token[2] }
|
128
129
|
#=> ["(", "cat", "?", "(", "[", "b", "h", "m", "]", "at", ")", ")", "{3,5}"]
|
129
130
|
```
|
130
131
|
|
@@ -220,7 +221,7 @@ syntax, and prints the token objects' text indented to their level.
|
|
220
221
|
```ruby
|
221
222
|
require 'regexp_parser'
|
222
223
|
|
223
|
-
Regexp::Lexer.lex
|
224
|
+
Regexp::Lexer.lex(/a?(b(c))*[d]+/, 'ruby/1.9') do |token|
|
224
225
|
puts "#{' ' * token.level}#{token.text}"
|
225
226
|
end
|
226
227
|
|
@@ -246,7 +247,7 @@ how the sequence 'cat' is treated. The 't' is separated because it's followed
|
|
246
247
|
by a quantifier that only applies to it.
|
247
248
|
|
248
249
|
```ruby
|
249
|
-
Regexp::Lexer.scan(
|
250
|
+
Regexp::Lexer.scan(/(cat?([b]at)){3,5}/).map { |token| token.text }
|
250
251
|
#=> ["(", "ca", "t", "?", "(", "[", "b", "]", "at", ")", ")", "{3,5}"]
|
251
252
|
```
|
252
253
|
|
@@ -274,7 +275,7 @@ require 'regexp_parser'
|
|
274
275
|
|
275
276
|
regex = /a?(b+(c)d)*(?<name>[0-9]+)/
|
276
277
|
|
277
|
-
tree = Regexp::Parser.parse(
|
278
|
+
tree = Regexp::Parser.parse(regex, 'ruby/2.1')
|
278
279
|
|
279
280
|
tree.traverse do |event, exp|
|
280
281
|
puts "#{event}: #{exp.type} `#{exp.to_s}`"
|
@@ -355,7 +356,7 @@ _Note that not all of these are available in all versions of Ruby_
|
|
355
356
|
|   _Nest Level_ | `\k<n-1>` | ✓ |
|
356
357
|
|   _Numbered_ | `\k<1>` | ✓ |
|
357
358
|
|   _Relative_ | `\k<-2>` | ✓ |
|
358
|
-
|   _Traditional_ | `\1`
|
359
|
+
|   _Traditional_ | `\1` through `\9` | ✓ |
|
359
360
|
|   _**Capturing**_ | `(abc)` | ✓ |
|
360
361
|
|   _**Comments**_ | `(?# comment text)` | ✓ |
|
361
362
|
|   _**Named**_ | `(?<name>abc)`, `(?'name'abc)` | ✓ |
|
@@ -375,7 +376,7 @@ _Note that not all of these are available in all versions of Ruby_
|
|
375
376
|
|   _**Meta** \[2\]_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | ✓ |
|
376
377
|
|   _**Octal**_ | `\0`, `\01`, `\012` | ✓ |
|
377
378
|
|   _**Unicode**_ | `\uHHHH`, `\u{H+ H+}` | ✓ |
|
378
|
-
| **Unicode Properties** | _<sub>([Unicode 13.0.0]
|
379
|
+
| **Unicode Properties** | _<sub>([Unicode 13.0.0])</sub>_ | ⋱ |
|
379
380
|
|   _**Age**_ | `\p{Age=5.2}`, `\P{age=7.0}`, `\p{^age=8.0}` | ✓ |
|
380
381
|
|   _**Blocks**_ | `\p{InArmenian}`, `\P{InKhmer}`, `\p{^InThai}` | ✓ |
|
381
382
|
|   _**Classes**_ | `\p{Alpha}`, `\P{Space}`, `\p{^Alnum}` | ✓ |
|
@@ -384,13 +385,17 @@ _Note that not all of these are available in all versions of Ruby_
|
|
384
385
|
|   _**Scripts**_ | `\p{Arabic}`, `\P{Hiragana}`, `\p{^Greek}` | ✓ |
|
385
386
|
|   _**Simple**_ | `\p{Dash}`, `\p{Extender}`, `\p{^Hyphen}` | ✓ |
|
386
387
|
|
387
|
-
|
388
|
-
|
388
|
+
[Unicode 13.0.0]: https://www.unicode.org/versions/Unicode13.0.0/
|
389
|
+
|
390
|
+
**\[1\]**: Ruby does not support lazy or possessive interval quantifiers.
|
391
|
+
Any `+` or `?` that follows an interval quantifier will be treated as another,
|
392
|
+
chained quantifier. See also [#3](https://github.com/ammar/regexp_parser/issues/3),
|
389
393
|
[#69](https://github.com/ammar/regexp_parser/pull/69).
|
390
394
|
|
391
|
-
**\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex
|
392
|
-
https://github.com/ruby/ruby/commit/
|
393
|
-
scanner and will only be emitted if a String or a Regexp
|
395
|
+
**\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex
|
396
|
+
escapes when used in Regexp literals](https://github.com/ruby/ruby/commit/11ae581),
|
397
|
+
so they will only reach the scanner and will only be emitted if a String or a Regexp
|
398
|
+
that has been built with the `::new` constructor is scanned.
|
394
399
|
|
395
400
|
##### Inapplicable Features
|
396
401
|
|
@@ -407,25 +412,27 @@ expressions library (Onigmo). They are not supported by the scanner.
|
|
407
412
|
|
408
413
|
See something missing? Please submit an [issue](https://github.com/ammar/regexp_parser/issues)
|
409
414
|
|
410
|
-
_**Note**: Attempting to process expressions with unsupported syntax features can raise
|
411
|
-
or incorrectly return tokens/objects as literals._
|
415
|
+
_**Note**: Attempting to process expressions with unsupported syntax features can raise
|
416
|
+
an error, or incorrectly return tokens/objects as literals._
|
412
417
|
|
413
418
|
|
414
419
|
## Testing
|
415
420
|
To run the tests simply run rake from the root directory.
|
416
421
|
|
417
|
-
The default task generates the scanner's code from the Ragel source files and runs
|
422
|
+
The default task generates the scanner's code from the Ragel source files and runs
|
423
|
+
all the specs, thus it requires Ragel to be installed.
|
418
424
|
|
419
|
-
Note that changes to Ragel files will not be reflected when running `rspec` on its own,
|
425
|
+
Note that changes to Ragel files will not be reflected when running `rspec` on its own,
|
426
|
+
so to run individual tests you might want to run:
|
420
427
|
|
421
428
|
```
|
422
429
|
rake ragel:rb && rspec spec/scanner/properties_spec.rb
|
423
430
|
```
|
424
431
|
|
425
432
|
## Building
|
426
|
-
Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/)
|
427
|
-
installed. The build tasks will automatically invoke the 'ragel:rb' task to generate
|
428
|
-
Ruby scanner code.
|
433
|
+
Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/)
|
434
|
+
to be installed. The build tasks will automatically invoke the 'ragel:rb' task to generate
|
435
|
+
the Ruby scanner code.
|
429
436
|
|
430
437
|
|
431
438
|
The project uses the standard rubygems package tasks, so:
|
@@ -445,17 +452,26 @@ rake install
|
|
445
452
|
## Example Projects
|
446
453
|
Projects using regexp_parser.
|
447
454
|
|
448
|
-
- [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool
|
455
|
+
- [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool
|
456
|
+
that uses regexp_parser to convert Regexps to css/xpath selectors.
|
457
|
+
|
458
|
+
- [js_regex](https://github.com/jaynetics/js_regex) converts Ruby regular expressions
|
459
|
+
to JavaScript-compatible regular expressions.
|
449
460
|
|
450
|
-
- [
|
461
|
+
- [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor
|
462
|
+
with alias support.
|
451
463
|
|
452
|
-
- [
|
464
|
+
- [mutant](https://github.com/mbj/mutant) manipulates your regular expressions
|
465
|
+
(amongst others) to see if your tests cover their behavior.
|
453
466
|
|
454
|
-
- [
|
467
|
+
- [repper](https://github.com/jaynetics/repper) is a regular expression
|
468
|
+
pretty-printer and formatter for Ruby.
|
455
469
|
|
456
|
-
- [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that
|
470
|
+
- [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that
|
471
|
+
uses regexp_parser to lint Regexps.
|
457
472
|
|
458
|
-
- [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) is a localization helper
|
473
|
+
- [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) is a localization helper
|
474
|
+
that uses regexp_parser to generate examples of postal codes.
|
459
475
|
|
460
476
|
|
461
477
|
## References
|
@@ -14,6 +14,10 @@ module Regexp::Expression
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def to_re(format = :full)
|
17
|
+
if set_level > 0
|
18
|
+
warn "Calling #to_re on character set members is deprecated - "\
|
19
|
+
"their behavior might not be equivalent outside of the set."
|
20
|
+
end
|
17
21
|
::Regexp.new(to_s(format))
|
18
22
|
end
|
19
23
|
|
@@ -32,15 +36,19 @@ module Regexp::Expression
|
|
32
36
|
end
|
33
37
|
|
34
38
|
def repetitions
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
39
|
+
@repetitions ||=
|
40
|
+
if quantified?
|
41
|
+
min = quantifier.min
|
42
|
+
max = quantifier.max < 0 ? Float::INFINITY : quantifier.max
|
43
|
+
range = min..max
|
44
|
+
# fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807
|
45
|
+
if RUBY_VERSION.to_f < 2.7
|
46
|
+
range.define_singleton_method(:minmax) { [min, max] }
|
47
|
+
end
|
48
|
+
range
|
49
|
+
else
|
50
|
+
1..1
|
51
|
+
end
|
44
52
|
end
|
45
53
|
|
46
54
|
def greedy?
|
@@ -1,12 +1,29 @@
|
|
1
1
|
module Regexp::Expression
|
2
|
+
# TODO: unify name with token :backref, one way or the other, in v3.0.0
|
2
3
|
module Backreference
|
3
4
|
class Base < Regexp::Expression::Base
|
4
5
|
attr_accessor :referenced_expression
|
5
6
|
|
6
7
|
def initialize_copy(orig)
|
7
|
-
|
8
|
+
exp_id = [self.class, self.starts_at]
|
9
|
+
|
10
|
+
# prevent infinite recursion for recursive subexp calls
|
11
|
+
copied = @@copied ||= {}
|
12
|
+
self.referenced_expression =
|
13
|
+
if copied[exp_id]
|
14
|
+
orig.referenced_expression
|
15
|
+
else
|
16
|
+
copied[exp_id] = true
|
17
|
+
orig.referenced_expression.dup
|
18
|
+
end
|
19
|
+
copied.clear
|
20
|
+
|
8
21
|
super
|
9
22
|
end
|
23
|
+
|
24
|
+
def referential?
|
25
|
+
true
|
26
|
+
end
|
10
27
|
end
|
11
28
|
|
12
29
|
class Number < Backreference::Base
|
@@ -38,7 +55,7 @@ module Regexp::Expression
|
|
38
55
|
class NameCall < Backreference::Name; end
|
39
56
|
class NumberCallRelative < Backreference::NumberRelative; end
|
40
57
|
|
41
|
-
class NumberRecursionLevel < Backreference::
|
58
|
+
class NumberRecursionLevel < Backreference::NumberRelative
|
42
59
|
attr_reader :recursion_level
|
43
60
|
|
44
61
|
def initialize(token, options = {})
|
File without changes
|
@@ -20,6 +20,10 @@ module Regexp::Expression
|
|
20
20
|
self.referenced_expression = orig.referenced_expression.dup
|
21
21
|
super
|
22
22
|
end
|
23
|
+
|
24
|
+
def referential?
|
25
|
+
true
|
26
|
+
end
|
23
27
|
end
|
24
28
|
|
25
29
|
class Branch < Regexp::Expression::Sequence; end
|
@@ -55,6 +59,10 @@ module Regexp::Expression
|
|
55
59
|
condition.reference
|
56
60
|
end
|
57
61
|
|
62
|
+
def referential?
|
63
|
+
true
|
64
|
+
end
|
65
|
+
|
58
66
|
def parts
|
59
67
|
[text.dup, condition, *intersperse(branches, '|'), ')']
|
60
68
|
end
|
@@ -33,6 +33,8 @@ module Regexp::Expression
|
|
33
33
|
|
34
34
|
class Absence < Group::Base; end
|
35
35
|
class Atomic < Group::Base; end
|
36
|
+
# TODO: should split off OptionsSwitch in v3.0.0. Maybe even make it no
|
37
|
+
# longer inherit from Group because it is effectively a terminal expression.
|
36
38
|
class Options < Group::Base
|
37
39
|
attr_accessor :option_changes
|
38
40
|
|
@@ -40,6 +42,14 @@ module Regexp::Expression
|
|
40
42
|
self.option_changes = orig.option_changes.dup
|
41
43
|
super
|
42
44
|
end
|
45
|
+
|
46
|
+
def quantify(*args)
|
47
|
+
if token == :options_switch
|
48
|
+
raise Regexp::Parser::Error, 'Can not quantify an option switch'
|
49
|
+
else
|
50
|
+
super
|
51
|
+
end
|
52
|
+
end
|
43
53
|
end
|
44
54
|
|
45
55
|
class Capture < Group::Base
|
@@ -1,11 +1,9 @@
|
|
1
1
|
module Regexp::Expression
|
2
2
|
class Root < Regexp::Expression::Subexpression
|
3
3
|
def self.build(options = {})
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
def self.build_token
|
8
|
-
Regexp::Token.new(:expression, :root, '', 0)
|
4
|
+
warn "`#{self.class}.build(options)` is deprecated and will raise in "\
|
5
|
+
"regexp_parser v3.0.0. Please use `.construct(options: options)`."
|
6
|
+
construct(options: options)
|
9
7
|
end
|
10
8
|
end
|
11
9
|
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Regexp::Expression
|
2
|
+
module Shared
|
3
|
+
module ClassMethods
|
4
|
+
# Convenience method to init a valid Expression without a Regexp::Token
|
5
|
+
def construct(params = {})
|
6
|
+
attrs = construct_defaults.merge(params)
|
7
|
+
options = attrs.delete(:options)
|
8
|
+
token_args = Regexp::TOKEN_KEYS.map { |k| attrs.delete(k) }
|
9
|
+
token = Regexp::Token.new(*token_args)
|
10
|
+
raise ArgumentError, "unsupported attribute(s): #{attrs}" if attrs.any?
|
11
|
+
|
12
|
+
new(token, options)
|
13
|
+
end
|
14
|
+
|
15
|
+
def construct_defaults
|
16
|
+
if self == Root
|
17
|
+
{ type: :expression, token: :root, ts: 0 }
|
18
|
+
elsif self < Sequence
|
19
|
+
{ type: :expression, token: :sequence }
|
20
|
+
else
|
21
|
+
{ type: token_class::Type }
|
22
|
+
end.merge(level: 0, set_level: 0, conditional_level: 0, text: '')
|
23
|
+
end
|
24
|
+
|
25
|
+
def token_class
|
26
|
+
if self == Root || self < Sequence
|
27
|
+
nil # no token class because these objects are Parser-generated
|
28
|
+
# TODO: synch exp & token class names for alt., dot, escapes in v3.0.0
|
29
|
+
elsif self == Alternation || self == CharacterType::Any
|
30
|
+
Regexp::Syntax::Token::Meta
|
31
|
+
elsif self <= EscapeSequence::Base
|
32
|
+
Regexp::Syntax::Token::Escape
|
33
|
+
else
|
34
|
+
Regexp::Syntax::Token.const_get(name.split('::')[2])
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def token_class
|
40
|
+
self.class.token_class
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|