regexp_parser 2.5.0 → 2.6.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +67 -39
- data/README.md +45 -31
- data/lib/regexp_parser/expression/base.rb +17 -9
- data/lib/regexp_parser/expression/classes/backreference.rb +1 -1
- data/lib/regexp_parser/expression/classes/escape_sequence.rb +1 -1
- data/lib/regexp_parser/expression/classes/group.rb +10 -0
- data/lib/regexp_parser/expression/classes/unicode_property.rb +1 -1
- data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
- data/lib/regexp_parser/expression/shared.rb +11 -2
- data/lib/regexp_parser/expression.rb +1 -0
- data/lib/regexp_parser/parser.rb +16 -4
- data/lib/regexp_parser/scanner/scanner.rl +2 -2
- data/lib/regexp_parser/scanner.rb +582 -578
- data/lib/regexp_parser/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a468f97c0fecc8b90781d4d6775f82423fd5e7f15561a419be849b1d24fe05d9
|
4
|
+
data.tar.gz: c5c78beabe6ebe360b4f7cdede3c62149f4eba3c1556fd55cf02e3300cdb38b7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a3b86a8f66154804b49d227ad4653cb969f1c337d4dc90de09e116e39cd87f608a12d29cc0422e4b1b4201234bc2b5b6467b065d94c274674fb1c555a04518d8
|
7
|
+
data.tar.gz: fb26d224504f71645645013ee3dd5a07066b0323f9c97f8c0a716e75ea0d4fdffbf41c0526eafdf19c6d7fe1772d6616aec71541dc46d51123640cfc76b703f6
|
data/CHANGELOG.md
CHANGED
@@ -1,37 +1,77 @@
|
|
1
|
+
# Changelog
|
2
|
+
|
3
|
+
All notable changes to this project will be documented in this file.
|
4
|
+
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
7
|
+
|
1
8
|
## [Unreleased]
|
2
9
|
|
10
|
+
## [2.6.1] - 2022-11-16 - [Janosch Müller](mailto:janosch84@gmail.com)
|
11
|
+
|
12
|
+
### Fixed
|
13
|
+
|
14
|
+
- fixed scanning of two negative lookbehind edge cases
|
15
|
+
* `(?<!x)y>` used to raise a ScannerError
|
16
|
+
* `(?<!x>)y` used to be misinterpreted as a named group
|
17
|
+
* thanks to [Sergio Medina](https://github.com/serch) for the report
|
18
|
+
|
19
|
+
## [2.6.0] - 2022-09-26 - [Janosch Müller](mailto:janosch84@gmail.com)
|
20
|
+
|
21
|
+
### Fixed
|
22
|
+
|
23
|
+
- fixed `#referenced_expression` for `\g<0>` (was `nil`, is now the `Root` exp)
|
24
|
+
- fixed `#reference`, `#referenced_expression` for recursion level backrefs
|
25
|
+
* e.g. `(a)(b)\k<-1+1>`
|
26
|
+
* `#referenced_expression` was `nil`, now it is the correct `Group` exp
|
27
|
+
- detect and raise for two more syntax errors when parsing String input
|
28
|
+
* quantification of option switches (e.g. `(?i)+`)
|
29
|
+
* invalid references (e.g. `/\k<1>/`)
|
30
|
+
* these are a `SyntaxError` in Ruby, so could only be passed as a String
|
31
|
+
|
32
|
+
### Added
|
33
|
+
|
34
|
+
- `Regexp::Expression::Base#human_name`
|
35
|
+
* returns a nice, human-readable description of the expression
|
36
|
+
- `Regexp::Expression::Base#optional?`
|
37
|
+
* returns `true` if the expression is quantified accordingly (e.g. with `*`, `{,n}`)
|
38
|
+
- added a deprecation warning when calling `#to_re` on set members
|
39
|
+
|
40
|
+
## [2.5.0] - 2022-05-27 - [Janosch Müller](mailto:janosch84@gmail.com)
|
41
|
+
|
3
42
|
### Added
|
4
43
|
|
5
44
|
- `Regexp::Expression::Base.construct` and `.token_class` methods
|
45
|
+
* see the [wiki](https://github.com/ammar/regexp_parser/wiki) for details
|
6
46
|
|
7
47
|
## [2.4.0] - 2022-05-09 - [Janosch Müller](mailto:janosch84@gmail.com)
|
8
48
|
|
9
49
|
### Fixed
|
10
50
|
|
11
51
|
- fixed interpretation of `+` and `?` after interval quantifiers (`{n,n}`)
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
52
|
+
* they used to be treated as reluctant or possessive mode indicators
|
53
|
+
* however, Ruby does not support these modes for interval quantifiers
|
54
|
+
* they are now treated as chained quantifiers instead, as Ruby does it
|
55
|
+
* cf. [#3](https://github.com/ammar/regexp_parser/issues/3)
|
16
56
|
- fixed `Expression::Base#nesting_level` for some tree rewrite cases
|
17
|
-
|
57
|
+
* e.g. the alternatives in `/a|[b]/` had an inconsistent nesting_level
|
18
58
|
- fixed `Scanner` accepting invalid posix classes, e.g. `[[:foo:]]`
|
19
|
-
|
20
|
-
|
59
|
+
* they raise a `SyntaxError` when used in a Regexp, so could only be passed as String
|
60
|
+
* they now raise a `Regexp::Scanner::ValidationError` in the `Scanner`
|
21
61
|
|
22
62
|
### Added
|
23
63
|
|
24
64
|
- added `Expression::Base#==` for (deep) comparison of expressions
|
25
65
|
- added `Expression::Base#parts`
|
26
|
-
|
27
|
-
|
66
|
+
* returns the text elements and subexpressions of an expression
|
67
|
+
* e.g. `parse(/(a)/)[0].parts # => ["(", #<Literal @text="a"...>, ")"]`
|
28
68
|
- added `Expression::Base#te` (a.k.a. token end index)
|
29
|
-
|
69
|
+
* `Expression::Subexpression` always had `#te`, only terminal nodes lacked it so far
|
30
70
|
- made some `Expression::Base` methods available on `Quantifier` instances, too
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
71
|
+
* `#type`, `#type?`, `#is?`, `#one_of?`, `#options`, `#terminal?`
|
72
|
+
* `#base_length`, `#full_length`, `#starts_at`, `#te`, `#ts`, `#offset`
|
73
|
+
* `#conditional_level`, `#level`, `#nesting_level`, `#set_level`
|
74
|
+
* this allows a more unified handling with `Expression::Base` instances
|
35
75
|
- allowed `Quantifier#initialize` to take a token and options Hash like other nodes
|
36
76
|
- added a deprecation warning for initializing Quantifiers with 4+ arguments:
|
37
77
|
|
@@ -54,18 +94,18 @@
|
|
54
94
|
### Fixed
|
55
95
|
|
56
96
|
- removed five inexistent unicode properties from `Syntax#features`
|
57
|
-
|
58
|
-
|
97
|
+
* these were never supported by Ruby or the `Regexp::Scanner`
|
98
|
+
* thanks to [Markus Schirp](https://github.com/mbj) for the report
|
59
99
|
|
60
100
|
## [2.3.0] - 2022-04-08 - [Janosch Müller](mailto:janosch84@gmail.com)
|
61
101
|
|
62
102
|
### Added
|
63
103
|
|
64
104
|
- improved parsing performance through `Syntax` refactoring
|
65
|
-
|
66
|
-
|
105
|
+
* instead of fresh `Syntax` instances, pre-loaded constants are now re-used
|
106
|
+
* this approximately doubles the parsing speed for simple regexps
|
67
107
|
- added methods to `Syntax` classes to show relative feature sets
|
68
|
-
|
108
|
+
* e.g. `Regexp::Syntax::V3_2_0.added_features`
|
69
109
|
- support for new unicode properties of Ruby 3.2 / Unicode 14.0
|
70
110
|
|
71
111
|
## [2.2.1] - 2022-02-11 - [Janosch Müller](mailto:janosch84@gmail.com)
|
@@ -73,14 +113,14 @@
|
|
73
113
|
### Fixed
|
74
114
|
|
75
115
|
- fixed Syntax version of absence groups (`(?~...)`)
|
76
|
-
|
77
|
-
|
116
|
+
* the lexer accepted them for any Ruby version
|
117
|
+
* now they are only recognized for Ruby >= 2.4.1 in which they were introduced
|
78
118
|
- reduced gem size by excluding specs from package
|
79
119
|
- removed deprecated `test_files` gemspec setting
|
80
120
|
- no longer depend on `yaml`/`psych` (except for Ruby <= 2.4)
|
81
121
|
- no longer depend on `set`
|
82
|
-
|
83
|
-
|
122
|
+
* `set` was removed from the stdlib and made a standalone gem as of Ruby 3
|
123
|
+
* this made it a hidden/undeclared dependency of `regexp_parser`
|
84
124
|
|
85
125
|
## [2.2.0] - 2021-12-04 - [Janosch Müller](mailto:janosch84@gmail.com)
|
86
126
|
|
@@ -318,8 +358,8 @@
|
|
318
358
|
|
319
359
|
- Fixed missing quantifier in `Conditional::Expression` methods `#to_s`, `#to_re`
|
320
360
|
- `Conditional::Condition` no longer lives outside the recursive `#expressions` tree
|
321
|
-
|
322
|
-
|
361
|
+
* it used to be the only expression stored in a custom ivar, complicating traversal
|
362
|
+
* its setter and getter (`#condition=`, `#condition`) still work as before
|
323
363
|
|
324
364
|
## [1.1.0] - 2018-09-17 - [Janosch Müller](mailto:janosch84@gmail.com)
|
325
365
|
|
@@ -327,8 +367,8 @@
|
|
327
367
|
|
328
368
|
- Added `Quantifier` methods `#greedy?`, `#possessive?`, `#reluctant?`/`#lazy?`
|
329
369
|
- Added `Group::Options#option_changes`
|
330
|
-
|
331
|
-
|
370
|
+
* shows the options enabled or disabled by the given options group
|
371
|
+
* as with all other expressions, `#options` shows the overall active options
|
332
372
|
- Added `Conditional#reference` and `Condition#reference`, indicating the determinative group
|
333
373
|
- Added `Subexpression#dig`, acts like [`Array#dig`](http://ruby-doc.org/core-2.5.0/Array.html#method-i-dig)
|
334
374
|
|
@@ -512,7 +552,6 @@ This release includes several breaking changes, mostly to character sets, #map a
|
|
512
552
|
* Fixed scanning of zero length comments (PR #12)
|
513
553
|
* Fixed missing escape:codepoint_list syntax token (PR #14)
|
514
554
|
* Fixed to_s for modified interval quantifiers (PR #17)
|
515
|
-
- Added a note about MRI implementation quirks to Scanner section
|
516
555
|
|
517
556
|
## [0.3.2] - 2016-01-01 - [Ammar Ali](mailto:ammarabuali@gmail.com)
|
518
557
|
|
@@ -538,7 +577,6 @@ This release includes several breaking changes, mostly to character sets, #map a
|
|
538
577
|
- Renamed Lexer's method to lex, added an alias to the old name (scan)
|
539
578
|
- Use #map instead of #each to run the block in Lexer.lex.
|
540
579
|
- Replaced VERSION.yml file with a constant.
|
541
|
-
- Updated README
|
542
580
|
- Update tokens and scanner with new additions in Unicode 7.0.
|
543
581
|
|
544
582
|
## [0.1.6] - 2014-10-06 - [Ammar Ali](mailto:ammarabuali@gmail.com)
|
@@ -548,20 +586,11 @@ This release includes several breaking changes, mostly to character sets, #map a
|
|
548
586
|
- Added syntax files for missing ruby 2.x versions. These do not add
|
549
587
|
extra syntax support, they just make the gem work with the newer
|
550
588
|
ruby versions.
|
551
|
-
- Added .travis.yml to project root.
|
552
|
-
- README:
|
553
|
-
- Removed note purporting runtime support for ruby 1.8.6.
|
554
|
-
- Added a section identifying the main unsupported syntax features.
|
555
|
-
- Added sections for Testing and Building
|
556
|
-
- Added badges for gem version, Travis CI, and code climate.
|
557
|
-
- Updated README, fixing broken examples, and converting it from a rdoc file to Github's flavor of Markdown.
|
558
589
|
- Fixed a parser bug where an alternation sequence that contained nested expressions was incorrectly being appended to the parent expression when the nesting was exited. e.g. in /a|(b)c/, c was appended to the root.
|
559
|
-
|
560
590
|
- Fixed a bug where character types were not being correctly scanned within character sets. e.g. in [\d], two tokens were scanned; one for the backslash '\' and one for the 'd'
|
561
591
|
|
562
592
|
## [0.1.5] - 2014-01-14 - [Ammar Ali](mailto:ammarabuali@gmail.com)
|
563
593
|
|
564
|
-
- Correct ChangeLog.
|
565
594
|
- Added syntax stubs for ruby versions 2.0 and 2.1
|
566
595
|
- Added clone methods for deep copying expressions.
|
567
596
|
- Added optional format argument for to_s on expressions to return the text of the expression with (:full, the default) or without (:base) its quantifier.
|
@@ -570,7 +599,6 @@ This release includes several breaking changes, mostly to character sets, #map a
|
|
570
599
|
- Improved EOF handling in general and especially from sequences like hex and control escapes.
|
571
600
|
- Fixed a bug where named groups with an empty name would return a blank token [].
|
572
601
|
- Fixed a bug where members of a parent set were being added to its last subset.
|
573
|
-
- Various code cleanups in scanner.rl
|
574
602
|
- Fixed a few mutable string bugs by calling dup on the originals.
|
575
603
|
- Made ruby 1.8.6 the base for all 1.8 syntax, and the 1.8 name a pointer to the latest (1.8.7 at this time)
|
576
604
|
- Removed look-behind assertions (positive and negative) from 1.8 syntax
|
data/README.md
CHANGED
@@ -9,8 +9,8 @@ A Ruby gem for tokenizing, parsing, and transforming regular expressions.
|
|
9
9
|
|
10
10
|
* Multilayered
|
11
11
|
* A scanner/tokenizer based on [Ragel](http://www.colm.net/open-source/ragel/)
|
12
|
-
* A lexer that produces a "stream" of
|
13
|
-
* A parser that produces a "tree" of Expression objects (OO API)
|
12
|
+
* A lexer that produces a "stream" of [Token objects](https://github.com/ammar/regexp_parser/wiki/Token-Objects)
|
13
|
+
* A parser that produces a "tree" of [Expression objects (OO API)](https://github.com/ammar/regexp_parser/wiki/Expression-Objects)
|
14
14
|
* Runs on Ruby 2.x, 3.x and JRuby runtimes
|
15
15
|
* Recognizes Ruby 1.8, 1.9, 2.x and 3.x regular expressions [See Supported Syntax](#supported-syntax)
|
16
16
|
|
@@ -36,14 +36,15 @@ Or, add it to your project's `Gemfile`:
|
|
36
36
|
|
37
37
|
```gem 'regexp_parser', '~> X.Y.Z'```
|
38
38
|
|
39
|
-
See
|
39
|
+
See the badge at the top of this README or [rubygems](https://rubygems.org/gems/regexp_parser)
|
40
|
+
for the latest version number.
|
40
41
|
|
41
42
|
|
42
43
|
---
|
43
44
|
## Usage
|
44
45
|
|
45
46
|
The three main modules are **Scanner**, **Lexer**, and **Parser**. Each of them
|
46
|
-
provides a single method that takes a regular expression (as a
|
47
|
+
provides a single method that takes a regular expression (as a Regexp object or
|
47
48
|
a string) and returns its results. The **Lexer** and the **Parser** accept an
|
48
49
|
optional second argument that specifies the syntax version, like 'ruby/2.0',
|
49
50
|
which defaults to the host Ruby version (using RUBY_VERSION).
|
@@ -79,7 +80,7 @@ All three methods accept either a `Regexp` or `String` (containing the pattern)
|
|
79
80
|
require 'regexp_parser'
|
80
81
|
|
81
82
|
Regexp::Parser.parse(
|
82
|
-
"a+ #
|
83
|
+
"a+ # Recognizes a and A...",
|
83
84
|
options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE
|
84
85
|
)
|
85
86
|
```
|
@@ -101,7 +102,7 @@ start/end offsets for each token found.
|
|
101
102
|
```ruby
|
102
103
|
require 'regexp_parser'
|
103
104
|
|
104
|
-
Regexp::Scanner.scan
|
105
|
+
Regexp::Scanner.scan(/(ab?(cd)*[e-h]+)/) do |type, token, text, ts, te|
|
105
106
|
puts "type: #{type}, token: #{token}, text: '#{text}' [#{ts}..#{te}]"
|
106
107
|
end
|
107
108
|
|
@@ -124,7 +125,7 @@ A one-liner that uses map on the result of the scan to return the textual
|
|
124
125
|
parts of the pattern:
|
125
126
|
|
126
127
|
```ruby
|
127
|
-
Regexp::Scanner.scan(
|
128
|
+
Regexp::Scanner.scan(/(cat?([bhm]at)){3,5}/).map { |token| token[2] }
|
128
129
|
#=> ["(", "cat", "?", "(", "[", "b", "h", "m", "]", "at", ")", ")", "{3,5}"]
|
129
130
|
```
|
130
131
|
|
@@ -220,7 +221,7 @@ syntax, and prints the token objects' text indented to their level.
|
|
220
221
|
```ruby
|
221
222
|
require 'regexp_parser'
|
222
223
|
|
223
|
-
Regexp::Lexer.lex
|
224
|
+
Regexp::Lexer.lex(/a?(b(c))*[d]+/, 'ruby/1.9') do |token|
|
224
225
|
puts "#{' ' * token.level}#{token.text}"
|
225
226
|
end
|
226
227
|
|
@@ -246,7 +247,7 @@ how the sequence 'cat' is treated. The 't' is separated because it's followed
|
|
246
247
|
by a quantifier that only applies to it.
|
247
248
|
|
248
249
|
```ruby
|
249
|
-
Regexp::Lexer.scan(
|
250
|
+
Regexp::Lexer.scan(/(cat?([b]at)){3,5}/).map { |token| token.text }
|
250
251
|
#=> ["(", "ca", "t", "?", "(", "[", "b", "]", "at", ")", ")", "{3,5}"]
|
251
252
|
```
|
252
253
|
|
@@ -274,7 +275,7 @@ require 'regexp_parser'
|
|
274
275
|
|
275
276
|
regex = /a?(b+(c)d)*(?<name>[0-9]+)/
|
276
277
|
|
277
|
-
tree = Regexp::Parser.parse(
|
278
|
+
tree = Regexp::Parser.parse(regex, 'ruby/2.1')
|
278
279
|
|
279
280
|
tree.traverse do |event, exp|
|
280
281
|
puts "#{event}: #{exp.type} `#{exp.to_s}`"
|
@@ -355,7 +356,7 @@ _Note that not all of these are available in all versions of Ruby_
|
|
355
356
|
|   _Nest Level_ | `\k<n-1>` | ✓ |
|
356
357
|
|   _Numbered_ | `\k<1>` | ✓ |
|
357
358
|
|   _Relative_ | `\k<-2>` | ✓ |
|
358
|
-
|   _Traditional_ | `\1`
|
359
|
+
|   _Traditional_ | `\1` through `\9` | ✓ |
|
359
360
|
|   _**Capturing**_ | `(abc)` | ✓ |
|
360
361
|
|   _**Comments**_ | `(?# comment text)` | ✓ |
|
361
362
|
|   _**Named**_ | `(?<name>abc)`, `(?'name'abc)` | ✓ |
|
@@ -375,7 +376,7 @@ _Note that not all of these are available in all versions of Ruby_
|
|
375
376
|
|   _**Meta** \[2\]_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | ✓ |
|
376
377
|
|   _**Octal**_ | `\0`, `\01`, `\012` | ✓ |
|
377
378
|
|   _**Unicode**_ | `\uHHHH`, `\u{H+ H+}` | ✓ |
|
378
|
-
| **Unicode Properties** | _<sub>([Unicode 13.0.0]
|
379
|
+
| **Unicode Properties** | _<sub>([Unicode 13.0.0])</sub>_ | ⋱ |
|
379
380
|
|   _**Age**_ | `\p{Age=5.2}`, `\P{age=7.0}`, `\p{^age=8.0}` | ✓ |
|
380
381
|
|   _**Blocks**_ | `\p{InArmenian}`, `\P{InKhmer}`, `\p{^InThai}` | ✓ |
|
381
382
|
|   _**Classes**_ | `\p{Alpha}`, `\P{Space}`, `\p{^Alnum}` | ✓ |
|
@@ -384,13 +385,17 @@ _Note that not all of these are available in all versions of Ruby_
|
|
384
385
|
|   _**Scripts**_ | `\p{Arabic}`, `\P{Hiragana}`, `\p{^Greek}` | ✓ |
|
385
386
|
|   _**Simple**_ | `\p{Dash}`, `\p{Extender}`, `\p{^Hyphen}` | ✓ |
|
386
387
|
|
387
|
-
|
388
|
-
|
388
|
+
[Unicode 13.0.0]: https://www.unicode.org/versions/Unicode13.0.0/
|
389
|
+
|
390
|
+
**\[1\]**: Ruby does not support lazy or possessive interval quantifiers.
|
391
|
+
Any `+` or `?` that follows an interval quantifier will be treated as another,
|
392
|
+
chained quantifier. See also [#3](https://github.com/ammar/regexp_parser/issues/3),
|
389
393
|
[#69](https://github.com/ammar/regexp_parser/pull/69).
|
390
394
|
|
391
|
-
**\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex
|
392
|
-
https://github.com/ruby/ruby/commit/
|
393
|
-
scanner and will only be emitted if a String or a Regexp
|
395
|
+
**\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex
|
396
|
+
escapes when used in Regexp literals](https://github.com/ruby/ruby/commit/11ae581),
|
397
|
+
so they will only reach the scanner and will only be emitted if a String or a Regexp
|
398
|
+
that has been built with the `::new` constructor is scanned.
|
394
399
|
|
395
400
|
##### Inapplicable Features
|
396
401
|
|
@@ -407,25 +412,27 @@ expressions library (Onigmo). They are not supported by the scanner.
|
|
407
412
|
|
408
413
|
See something missing? Please submit an [issue](https://github.com/ammar/regexp_parser/issues)
|
409
414
|
|
410
|
-
_**Note**: Attempting to process expressions with unsupported syntax features can raise
|
411
|
-
or incorrectly return tokens/objects as literals._
|
415
|
+
_**Note**: Attempting to process expressions with unsupported syntax features can raise
|
416
|
+
an error, or incorrectly return tokens/objects as literals._
|
412
417
|
|
413
418
|
|
414
419
|
## Testing
|
415
420
|
To run the tests simply run rake from the root directory.
|
416
421
|
|
417
|
-
The default task generates the scanner's code from the Ragel source files and runs
|
422
|
+
The default task generates the scanner's code from the Ragel source files and runs
|
423
|
+
all the specs, thus it requires Ragel to be installed.
|
418
424
|
|
419
|
-
Note that changes to Ragel files will not be reflected when running `rspec` on its own,
|
425
|
+
Note that changes to Ragel files will not be reflected when running `rspec` on its own,
|
426
|
+
so to run individual tests you might want to run:
|
420
427
|
|
421
428
|
```
|
422
429
|
rake ragel:rb && rspec spec/scanner/properties_spec.rb
|
423
430
|
```
|
424
431
|
|
425
432
|
## Building
|
426
|
-
Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/)
|
427
|
-
installed. The build tasks will automatically invoke the 'ragel:rb' task to generate
|
428
|
-
Ruby scanner code.
|
433
|
+
Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/)
|
434
|
+
to be installed. The build tasks will automatically invoke the 'ragel:rb' task to generate
|
435
|
+
the Ruby scanner code.
|
429
436
|
|
430
437
|
|
431
438
|
The project uses the standard rubygems package tasks, so:
|
@@ -445,19 +452,26 @@ rake install
|
|
445
452
|
## Example Projects
|
446
453
|
Projects using regexp_parser.
|
447
454
|
|
448
|
-
- [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool
|
455
|
+
- [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool
|
456
|
+
that uses regexp_parser to convert Regexps to css/xpath selectors.
|
449
457
|
|
450
|
-
- [js_regex](https://github.com/jaynetics/js_regex) converts Ruby regular expressions
|
458
|
+
- [js_regex](https://github.com/jaynetics/js_regex) converts Ruby regular expressions
|
459
|
+
to JavaScript-compatible regular expressions.
|
451
460
|
|
452
|
-
- [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor
|
461
|
+
- [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor
|
462
|
+
with alias support.
|
453
463
|
|
454
|
-
- [mutant](https://github.com/mbj/mutant) manipulates your regular expressions
|
464
|
+
- [mutant](https://github.com/mbj/mutant) manipulates your regular expressions
|
465
|
+
(amongst others) to see if your tests cover their behavior.
|
455
466
|
|
456
|
-
- [repper](https://github.com/jaynetics/repper) is a regular expression
|
467
|
+
- [repper](https://github.com/jaynetics/repper) is a regular expression
|
468
|
+
pretty-printer and formatter for Ruby.
|
457
469
|
|
458
|
-
- [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that
|
470
|
+
- [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that
|
471
|
+
uses regexp_parser to lint Regexps.
|
459
472
|
|
460
|
-
- [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) is a localization helper
|
473
|
+
- [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) is a localization helper
|
474
|
+
that uses regexp_parser to generate examples of postal codes.
|
461
475
|
|
462
476
|
|
463
477
|
## References
|
@@ -14,6 +14,10 @@ module Regexp::Expression
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def to_re(format = :full)
|
17
|
+
if set_level > 0
|
18
|
+
warn "Calling #to_re on character set members is deprecated - "\
|
19
|
+
"their behavior might not be equivalent outside of the set."
|
20
|
+
end
|
17
21
|
::Regexp.new(to_s(format))
|
18
22
|
end
|
19
23
|
|
@@ -32,15 +36,19 @@ module Regexp::Expression
|
|
32
36
|
end
|
33
37
|
|
34
38
|
def repetitions
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
39
|
+
@repetitions ||=
|
40
|
+
if quantified?
|
41
|
+
min = quantifier.min
|
42
|
+
max = quantifier.max < 0 ? Float::INFINITY : quantifier.max
|
43
|
+
range = min..max
|
44
|
+
# fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807
|
45
|
+
if RUBY_VERSION.to_f < 2.7
|
46
|
+
range.define_singleton_method(:minmax) { [min, max] }
|
47
|
+
end
|
48
|
+
range
|
49
|
+
else
|
50
|
+
1..1
|
51
|
+
end
|
44
52
|
end
|
45
53
|
|
46
54
|
def greedy?
|
@@ -39,7 +39,7 @@ module Regexp::Expression
|
|
39
39
|
class NameCall < Backreference::Name; end
|
40
40
|
class NumberCallRelative < Backreference::NumberRelative; end
|
41
41
|
|
42
|
-
class NumberRecursionLevel < Backreference::
|
42
|
+
class NumberRecursionLevel < Backreference::NumberRelative
|
43
43
|
attr_reader :recursion_level
|
44
44
|
|
45
45
|
def initialize(token, options = {})
|
@@ -33,6 +33,8 @@ module Regexp::Expression
|
|
33
33
|
|
34
34
|
class Absence < Group::Base; end
|
35
35
|
class Atomic < Group::Base; end
|
36
|
+
# TODO: should split off OptionsSwitch in v3.0.0. Maybe even make it no
|
37
|
+
# longer inherit from Group because it is effectively a terminal expression.
|
36
38
|
class Options < Group::Base
|
37
39
|
attr_accessor :option_changes
|
38
40
|
|
@@ -40,6 +42,14 @@ module Regexp::Expression
|
|
40
42
|
self.option_changes = orig.option_changes.dup
|
41
43
|
super
|
42
44
|
end
|
45
|
+
|
46
|
+
def quantify(*args)
|
47
|
+
if token == :options_switch
|
48
|
+
raise Regexp::Parser::Error, 'Can not quantify an option switch'
|
49
|
+
else
|
50
|
+
super
|
51
|
+
end
|
52
|
+
end
|
43
53
|
end
|
44
54
|
|
45
55
|
class Capture < Group::Base
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Regexp::Expression
|
2
|
+
module Shared
|
3
|
+
# default implementation, e.g. "atomic group", "hex escape", "word type", ..
|
4
|
+
def human_name
|
5
|
+
[token, type].compact.join(' ').tr('_', ' ')
|
6
|
+
end
|
7
|
+
end
|
8
|
+
|
9
|
+
Alternation.class_eval { def human_name; 'alternation' end }
|
10
|
+
Alternative.class_eval { def human_name; 'alternative' end }
|
11
|
+
Anchor::BOL.class_eval { def human_name; 'beginning of line' end }
|
12
|
+
Anchor::BOS.class_eval { def human_name; 'beginning of string' end }
|
13
|
+
Anchor::EOL.class_eval { def human_name; 'end of line' end }
|
14
|
+
Anchor::EOS.class_eval { def human_name; 'end of string' end }
|
15
|
+
Anchor::EOSobEOL.class_eval { def human_name; 'newline-ready end of string' end }
|
16
|
+
Anchor::MatchStart.class_eval { def human_name; 'match start' end }
|
17
|
+
Anchor::NonWordBoundary.class_eval { def human_name; 'no word boundary' end }
|
18
|
+
Anchor::WordBoundary.class_eval { def human_name; 'word boundary' end }
|
19
|
+
Assertion::Lookahead.class_eval { def human_name; 'lookahead' end }
|
20
|
+
Assertion::Lookbehind.class_eval { def human_name; 'lookbehind' end }
|
21
|
+
Assertion::NegativeLookahead.class_eval { def human_name; 'negative lookahead' end }
|
22
|
+
Assertion::NegativeLookbehind.class_eval { def human_name; 'negative lookbehind' end }
|
23
|
+
Backreference::Name.class_eval { def human_name; 'backreference by name' end }
|
24
|
+
Backreference::NameCall.class_eval { def human_name; 'subexpression call by name' end }
|
25
|
+
Backreference::Number.class_eval { def human_name; 'backreference' end }
|
26
|
+
Backreference::NumberRelative.class_eval { def human_name; 'relative backreference' end }
|
27
|
+
Backreference::NumberCall.class_eval { def human_name; 'subexpression call' end }
|
28
|
+
Backreference::NumberCallRelative.class_eval { def human_name; 'relative subexpression call' end }
|
29
|
+
CharacterSet::IntersectedSequence.class_eval { def human_name; 'intersected sequence' end }
|
30
|
+
CharacterSet::Intersection.class_eval { def human_name; 'intersection' end }
|
31
|
+
CharacterSet::Range.class_eval { def human_name; 'character range' end }
|
32
|
+
CharacterType::Any.class_eval { def human_name; 'match-all' end }
|
33
|
+
Comment.class_eval { def human_name; 'comment' end }
|
34
|
+
Conditional::Branch.class_eval { def human_name; 'conditional branch' end }
|
35
|
+
Conditional::Condition.class_eval { def human_name; 'condition' end }
|
36
|
+
Conditional::Expression.class_eval { def human_name; 'conditional' end }
|
37
|
+
Group::Capture.class_eval { def human_name; "capture group #{number}" end }
|
38
|
+
Group::Named.class_eval { def human_name; 'named capture group' end }
|
39
|
+
Keep::Mark.class_eval { def human_name; 'keep-mark lookbehind' end }
|
40
|
+
Literal.class_eval { def human_name; 'literal' end }
|
41
|
+
Root.class_eval { def human_name; 'root' end }
|
42
|
+
WhiteSpace.class_eval { def human_name; 'free space' end }
|
43
|
+
end
|
@@ -8,9 +8,9 @@ module Regexp::Expression
|
|
8
8
|
|
9
9
|
attr_accessor :type, :token, :text, :ts, :te,
|
10
10
|
:level, :set_level, :conditional_level,
|
11
|
-
:options
|
11
|
+
:options
|
12
12
|
|
13
|
-
attr_reader :nesting_level
|
13
|
+
attr_reader :nesting_level, :quantifier
|
14
14
|
end
|
15
15
|
end
|
16
16
|
|
@@ -64,6 +64,10 @@ module Regexp::Expression
|
|
64
64
|
!quantifier.nil?
|
65
65
|
end
|
66
66
|
|
67
|
+
def optional?
|
68
|
+
quantified? && quantifier.min == 0
|
69
|
+
end
|
70
|
+
|
67
71
|
def offset
|
68
72
|
[starts_at, full_length]
|
69
73
|
end
|
@@ -81,5 +85,10 @@ module Regexp::Expression
|
|
81
85
|
quantifier && quantifier.nesting_level = lvl
|
82
86
|
terminal? || each { |subexp| subexp.nesting_level = lvl + 1 }
|
83
87
|
end
|
88
|
+
|
89
|
+
def quantifier=(qtf)
|
90
|
+
@quantifier = qtf
|
91
|
+
@repetitions = nil # clear memoized value
|
92
|
+
end
|
84
93
|
end
|
85
94
|
end
|
@@ -25,6 +25,7 @@ require 'regexp_parser/expression/classes/root'
|
|
25
25
|
require 'regexp_parser/expression/classes/unicode_property'
|
26
26
|
|
27
27
|
require 'regexp_parser/expression/methods/construct'
|
28
|
+
require 'regexp_parser/expression/methods/human_name'
|
28
29
|
require 'regexp_parser/expression/methods/match'
|
29
30
|
require 'regexp_parser/expression/methods/match_length'
|
30
31
|
require 'regexp_parser/expression/methods/options'
|
data/lib/regexp_parser/parser.rb
CHANGED
@@ -235,7 +235,15 @@ class Regexp::Parser
|
|
235
235
|
when :number, :number_ref
|
236
236
|
node << Backreference::Number.new(token, active_opts)
|
237
237
|
when :number_recursion_ref
|
238
|
-
node << Backreference::NumberRecursionLevel.new(token, active_opts)
|
238
|
+
node << Backreference::NumberRecursionLevel.new(token, active_opts).tap do |exp|
|
239
|
+
# TODO: should split off new token number_recursion_rel_ref and new
|
240
|
+
# class NumberRelativeRecursionLevel in v3.0.0 to get rid of this
|
241
|
+
if exp.text =~ /[<'][+-]/
|
242
|
+
assign_effective_number(exp)
|
243
|
+
else
|
244
|
+
exp.effective_number = exp.number
|
245
|
+
end
|
246
|
+
end
|
239
247
|
when :number_call
|
240
248
|
node << Backreference::NumberCall.new(token, active_opts)
|
241
249
|
when :number_rel_ref
|
@@ -254,6 +262,8 @@ class Regexp::Parser
|
|
254
262
|
def assign_effective_number(exp)
|
255
263
|
exp.effective_number =
|
256
264
|
exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
|
265
|
+
exp.effective_number > 0 ||
|
266
|
+
raise(ParserError, "Invalid reference: #{exp.reference}")
|
257
267
|
end
|
258
268
|
|
259
269
|
def conditional(token)
|
@@ -569,15 +579,17 @@ class Regexp::Parser
|
|
569
579
|
# an instance of Backreference::Number, its #referenced_expression is set to
|
570
580
|
# the instance of Group::Capture that it refers to via its number.
|
571
581
|
def assign_referenced_expressions
|
572
|
-
targets = {}
|
573
582
|
# find all referencable expressions
|
583
|
+
targets = { 0 => root }
|
574
584
|
root.each_expression do |exp|
|
575
585
|
exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
|
576
586
|
end
|
577
587
|
# assign them to any refering expressions
|
578
588
|
root.each_expression do |exp|
|
579
|
-
exp.respond_to?(:reference)
|
580
|
-
|
589
|
+
next unless exp.respond_to?(:reference)
|
590
|
+
|
591
|
+
exp.referenced_expression = targets[exp.reference] ||
|
592
|
+
raise(ParserError, "Invalid reference: #{exp.reference}")
|
581
593
|
end
|
582
594
|
end
|
583
595
|
end # module Regexp::Parser
|
@@ -90,8 +90,8 @@
|
|
90
90
|
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
91
91
|
|
92
92
|
group_ref = [gk];
|
93
|
-
group_name_id_ab = ([
|
94
|
-
group_name_id_sq = ([^0-9\-']
|
93
|
+
group_name_id_ab = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
|
94
|
+
group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
|
95
95
|
group_number = '-'? . [1-9] . [0-9]*;
|
96
96
|
group_level = [+\-] . [0-9]+;
|
97
97
|
|