regexp_parser 2.5.0 → 2.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +74 -39
- data/README.md +45 -31
- data/lib/regexp_parser/expression/base.rb +17 -9
- data/lib/regexp_parser/expression/classes/backreference.rb +14 -2
- data/lib/regexp_parser/expression/classes/escape_sequence.rb +1 -1
- data/lib/regexp_parser/expression/classes/group.rb +10 -0
- data/lib/regexp_parser/expression/classes/unicode_property.rb +1 -1
- data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +8 -4
- data/lib/regexp_parser/expression/shared.rb +11 -2
- data/lib/regexp_parser/expression.rb +1 -0
- data/lib/regexp_parser/parser.rb +16 -4
- data/lib/regexp_parser/scanner/scanner.rl +2 -2
- data/lib/regexp_parser/scanner.rb +582 -578
- data/lib/regexp_parser/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 66568005494b517613155277c6be4731eb8a26bb9b48a692a9430507286ce583
|
4
|
+
data.tar.gz: d1fc6c6f1a0c7f939c51703ac844c2dbb134f96e0e55780646cb7e3e87d7a652
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b955b2215b71c94497e52841142fab8c2b9930d0d6cea6ea2b3eeb8ed9fe84575e2f34aae3a6051af2b56429f98cf070b9151805f2cb93ddb511ec1e0e50dd7c
|
7
|
+
data.tar.gz: 3a4f083942b66ddb4b67ab33f14bb1c0b724a60c2b30605059d32ce3648e9cb46e31e797b7a526a2028c1e018d73365f5ef955256de4e63397d6ea105714ff12
|
data/CHANGELOG.md
CHANGED
@@ -1,37 +1,84 @@
|
|
1
|
+
# Changelog
|
2
|
+
|
3
|
+
All notable changes to this project will be documented in this file.
|
4
|
+
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
7
|
+
|
1
8
|
## [Unreleased]
|
2
9
|
|
10
|
+
## [2.6.2] - 2023-01-19 - [Janosch Müller](mailto:janosch84@gmail.com)
|
11
|
+
|
12
|
+
### Fixed
|
13
|
+
|
14
|
+
- fixed `SystemStackError` when cloning recursive subexpression calls
|
15
|
+
* e.g. `Regexp::Parser.parse(/a|b\g<0>/).dup`
|
16
|
+
|
17
|
+
## [2.6.1] - 2022-11-16 - [Janosch Müller](mailto:janosch84@gmail.com)
|
18
|
+
|
19
|
+
### Fixed
|
20
|
+
|
21
|
+
- fixed scanning of two negative lookbehind edge cases
|
22
|
+
* `(?<!x)y>` used to raise a ScannerError
|
23
|
+
* `(?<!x>)y` used to be misinterpreted as a named group
|
24
|
+
* thanks to [Sergio Medina](https://github.com/serch) for the report
|
25
|
+
|
26
|
+
## [2.6.0] - 2022-09-26 - [Janosch Müller](mailto:janosch84@gmail.com)
|
27
|
+
|
28
|
+
### Fixed
|
29
|
+
|
30
|
+
- fixed `#referenced_expression` for `\g<0>` (was `nil`, is now the `Root` exp)
|
31
|
+
- fixed `#reference`, `#referenced_expression` for recursion level backrefs
|
32
|
+
* e.g. `(a)(b)\k<-1+1>`
|
33
|
+
* `#referenced_expression` was `nil`, now it is the correct `Group` exp
|
34
|
+
- detect and raise for two more syntax errors when parsing String input
|
35
|
+
* quantification of option switches (e.g. `(?i)+`)
|
36
|
+
* invalid references (e.g. `/\k<1>/`)
|
37
|
+
* these are a `SyntaxError` in Ruby, so could only be passed as a String
|
38
|
+
|
39
|
+
### Added
|
40
|
+
|
41
|
+
- `Regexp::Expression::Base#human_name`
|
42
|
+
* returns a nice, human-readable description of the expression
|
43
|
+
- `Regexp::Expression::Base#optional?`
|
44
|
+
* returns `true` if the expression is quantified accordingly (e.g. with `*`, `{,n}`)
|
45
|
+
- added a deprecation warning when calling `#to_re` on set members
|
46
|
+
|
47
|
+
## [2.5.0] - 2022-05-27 - [Janosch Müller](mailto:janosch84@gmail.com)
|
48
|
+
|
3
49
|
### Added
|
4
50
|
|
5
51
|
- `Regexp::Expression::Base.construct` and `.token_class` methods
|
52
|
+
* see the [wiki](https://github.com/ammar/regexp_parser/wiki) for details
|
6
53
|
|
7
54
|
## [2.4.0] - 2022-05-09 - [Janosch Müller](mailto:janosch84@gmail.com)
|
8
55
|
|
9
56
|
### Fixed
|
10
57
|
|
11
58
|
- fixed interpretation of `+` and `?` after interval quantifiers (`{n,n}`)
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
59
|
+
* they used to be treated as reluctant or possessive mode indicators
|
60
|
+
* however, Ruby does not support these modes for interval quantifiers
|
61
|
+
* they are now treated as chained quantifiers instead, as Ruby does it
|
62
|
+
* c.f. [#3](https://github.com/ammar/regexp_parser/issues/3)
|
16
63
|
- fixed `Expression::Base#nesting_level` for some tree rewrite cases
|
17
|
-
|
64
|
+
* e.g. the alternatives in `/a|[b]/` had an inconsistent nesting_level
|
18
65
|
- fixed `Scanner` accepting invalid posix classes, e.g. `[[:foo:]]`
|
19
|
-
|
20
|
-
|
66
|
+
* they raise a `SyntaxError` when used in a Regexp, so could only be passed as String
|
67
|
+
* they now raise a `Regexp::Scanner::ValidationError` in the `Scanner`
|
21
68
|
|
22
69
|
### Added
|
23
70
|
|
24
71
|
- added `Expression::Base#==` for (deep) comparison of expressions
|
25
72
|
- added `Expression::Base#parts`
|
26
|
-
|
27
|
-
|
73
|
+
* returns the text elements and subexpressions of an expression
|
74
|
+
* e.g. `parse(/(a)/)[0].parts # => ["(", #<Literal @text="a"...>, ")"]`
|
28
75
|
- added `Expression::Base#te` (a.k.a. token end index)
|
29
|
-
|
76
|
+
* `Expression::Subexpression` always had `#te`, only terminal nodes lacked it so far
|
30
77
|
- made some `Expression::Base` methods available on `Quantifier` instances, too
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
78
|
+
* `#type`, `#type?`, `#is?`, `#one_of?`, `#options`, `#terminal?`
|
79
|
+
* `#base_length`, `#full_length`, `#starts_at`, `#te`, `#ts`, `#offset`
|
80
|
+
* `#conditional_level`, `#level`, `#nesting_level`, `#set_level`
|
81
|
+
* this allows a more unified handling with `Expression::Base` instances
|
35
82
|
- allowed `Quantifier#initialize` to take a token and options Hash like other nodes
|
36
83
|
- added a deprecation warning for initializing Quantifiers with 4+ arguments:
|
37
84
|
|
@@ -54,18 +101,18 @@
|
|
54
101
|
### Fixed
|
55
102
|
|
56
103
|
- removed five inexistent unicode properties from `Syntax#features`
|
57
|
-
|
58
|
-
|
104
|
+
* these were never supported by Ruby or the `Regexp::Scanner`
|
105
|
+
* thanks to [Markus Schirp](https://github.com/mbj) for the report
|
59
106
|
|
60
107
|
## [2.3.0] - 2022-04-08 - [Janosch Müller](mailto:janosch84@gmail.com)
|
61
108
|
|
62
109
|
### Added
|
63
110
|
|
64
111
|
- improved parsing performance through `Syntax` refactoring
|
65
|
-
|
66
|
-
|
112
|
+
* instead of fresh `Syntax` instances, pre-loaded constants are now re-used
|
113
|
+
* this approximately doubles the parsing speed for simple regexps
|
67
114
|
- added methods to `Syntax` classes to show relative feature sets
|
68
|
-
|
115
|
+
* e.g. `Regexp::Syntax::V3_2_0.added_features`
|
69
116
|
- support for new unicode properties of Ruby 3.2 / Unicode 14.0
|
70
117
|
|
71
118
|
## [2.2.1] - 2022-02-11 - [Janosch Müller](mailto:janosch84@gmail.com)
|
@@ -73,14 +120,14 @@
|
|
73
120
|
### Fixed
|
74
121
|
|
75
122
|
- fixed Syntax version of absence groups (`(?~...)`)
|
76
|
-
|
77
|
-
|
123
|
+
* the lexer accepted them for any Ruby version
|
124
|
+
* now they are only recognized for Ruby >= 2.4.1 in which they were introduced
|
78
125
|
- reduced gem size by excluding specs from package
|
79
126
|
- removed deprecated `test_files` gemspec setting
|
80
127
|
- no longer depend on `yaml`/`psych` (except for Ruby <= 2.4)
|
81
128
|
- no longer depend on `set`
|
82
|
-
|
83
|
-
|
129
|
+
* `set` was removed from the stdlib and made a standalone gem as of Ruby 3
|
130
|
+
* this made it a hidden/undeclared dependency of `regexp_parser`
|
84
131
|
|
85
132
|
## [2.2.0] - 2021-12-04 - [Janosch Müller](mailto:janosch84@gmail.com)
|
86
133
|
|
@@ -318,8 +365,8 @@
|
|
318
365
|
|
319
366
|
- Fixed missing quantifier in `Conditional::Expression` methods `#to_s`, `#to_re`
|
320
367
|
- `Conditional::Condition` no longer lives outside the recursive `#expressions` tree
|
321
|
-
|
322
|
-
|
368
|
+
* it used to be the only expression stored in a custom ivar, complicating traversal
|
369
|
+
* its setter and getter (`#condition=`, `#condition`) still work as before
|
323
370
|
|
324
371
|
## [1.1.0] - 2018-09-17 - [Janosch Müller](mailto:janosch84@gmail.com)
|
325
372
|
|
@@ -327,8 +374,8 @@
|
|
327
374
|
|
328
375
|
- Added `Quantifier` methods `#greedy?`, `#possessive?`, `#reluctant?`/`#lazy?`
|
329
376
|
- Added `Group::Options#option_changes`
|
330
|
-
|
331
|
-
|
377
|
+
* shows the options enabled or disabled by the given options group
|
378
|
+
* as with all other expressions, `#options` shows the overall active options
|
332
379
|
- Added `Conditional#reference` and `Condition#reference`, indicating the determinative group
|
333
380
|
- Added `Subexpression#dig`, acts like [`Array#dig`](http://ruby-doc.org/core-2.5.0/Array.html#method-i-dig)
|
334
381
|
|
@@ -512,7 +559,6 @@ This release includes several breaking changes, mostly to character sets, #map a
|
|
512
559
|
* Fixed scanning of zero length comments (PR #12)
|
513
560
|
* Fixed missing escape:codepoint_list syntax token (PR #14)
|
514
561
|
* Fixed to_s for modified interval quantifiers (PR #17)
|
515
|
-
- Added a note about MRI implementation quirks to Scanner section
|
516
562
|
|
517
563
|
## [0.3.2] - 2016-01-01 - [Ammar Ali](mailto:ammarabuali@gmail.com)
|
518
564
|
|
@@ -538,7 +584,6 @@ This release includes several breaking changes, mostly to character sets, #map a
|
|
538
584
|
- Renamed Lexer's method to lex, added an alias to the old name (scan)
|
539
585
|
- Use #map instead of #each to run the block in Lexer.lex.
|
540
586
|
- Replaced VERSION.yml file with a constant.
|
541
|
-
- Updated README
|
542
587
|
- Update tokens and scanner with new additions in Unicode 7.0.
|
543
588
|
|
544
589
|
## [0.1.6] - 2014-10-06 - [Ammar Ali](mailto:ammarabuali@gmail.com)
|
@@ -548,20 +593,11 @@ This release includes several breaking changes, mostly to character sets, #map a
|
|
548
593
|
- Added syntax files for missing ruby 2.x versions. These do not add
|
549
594
|
extra syntax support, they just make the gem work with the newer
|
550
595
|
ruby versions.
|
551
|
-
- Added .travis.yml to project root.
|
552
|
-
- README:
|
553
|
-
- Removed note purporting runtime support for ruby 1.8.6.
|
554
|
-
- Added a section identifying the main unsupported syntax features.
|
555
|
-
- Added sections for Testing and Building
|
556
|
-
- Added badges for gem version, Travis CI, and code climate.
|
557
|
-
- Updated README, fixing broken examples, and converting it from a rdoc file to Github's flavor of Markdown.
|
558
596
|
- Fixed a parser bug where an alternation sequence that contained nested expressions was incorrectly being appended to the parent expression when the nesting was exited. e.g. in /a|(b)c/, c was appended to the root.
|
559
|
-
|
560
597
|
- Fixed a bug where character types were not being correctly scanned within character sets. e.g. in [\d], two tokens were scanned; one for the backslash '\' and one for the 'd'
|
561
598
|
|
562
599
|
## [0.1.5] - 2014-01-14 - [Ammar Ali](mailto:ammarabuali@gmail.com)
|
563
600
|
|
564
|
-
- Correct ChangeLog.
|
565
601
|
- Added syntax stubs for ruby versions 2.0 and 2.1
|
566
602
|
- Added clone methods for deep copying expressions.
|
567
603
|
- Added optional format argument for to_s on expressions to return the text of the expression with (:full, the default) or without (:base) its quantifier.
|
@@ -570,7 +606,6 @@ This release includes several breaking changes, mostly to character sets, #map a
|
|
570
606
|
- Improved EOF handling in general and especially from sequences like hex and control escapes.
|
571
607
|
- Fixed a bug where named groups with an empty name would return a blank token [].
|
572
608
|
- Fixed a bug where members of a parent set were being added to its last subset.
|
573
|
-
- Various code cleanups in scanner.rl
|
574
609
|
- Fixed a few mutable string bugs by calling dup on the originals.
|
575
610
|
- Made ruby 1.8.6 the base for all 1.8 syntax, and the 1.8 name a pointer to the latest (1.8.7 at this time)
|
576
611
|
- Removed look-behind assertions (positive and negative) from 1.8 syntax
|
data/README.md
CHANGED
@@ -9,8 +9,8 @@ A Ruby gem for tokenizing, parsing, and transforming regular expressions.
|
|
9
9
|
|
10
10
|
* Multilayered
|
11
11
|
* A scanner/tokenizer based on [Ragel](http://www.colm.net/open-source/ragel/)
|
12
|
-
* A lexer that produces a "stream" of
|
13
|
-
* A parser that produces a "tree" of Expression objects (OO API)
|
12
|
+
* A lexer that produces a "stream" of [Token objects](https://github.com/ammar/regexp_parser/wiki/Token-Objects)
|
13
|
+
* A parser that produces a "tree" of [Expression objects (OO API)](https://github.com/ammar/regexp_parser/wiki/Expression-Objects)
|
14
14
|
* Runs on Ruby 2.x, 3.x and JRuby runtimes
|
15
15
|
* Recognizes Ruby 1.8, 1.9, 2.x and 3.x regular expressions [See Supported Syntax](#supported-syntax)
|
16
16
|
|
@@ -36,14 +36,15 @@ Or, add it to your project's `Gemfile`:
|
|
36
36
|
|
37
37
|
```gem 'regexp_parser', '~> X.Y.Z'```
|
38
38
|
|
39
|
-
See
|
39
|
+
See the badge at the top of this README or [rubygems](https://rubygems.org/gems/regexp_parser)
|
40
|
+
for the latest version number.
|
40
41
|
|
41
42
|
|
42
43
|
---
|
43
44
|
## Usage
|
44
45
|
|
45
46
|
The three main modules are **Scanner**, **Lexer**, and **Parser**. Each of them
|
46
|
-
provides a single method that takes a regular expression (as a
|
47
|
+
provides a single method that takes a regular expression (as a Regexp object or
|
47
48
|
a string) and returns its results. The **Lexer** and the **Parser** accept an
|
48
49
|
optional second argument that specifies the syntax version, like 'ruby/2.0',
|
49
50
|
which defaults to the host Ruby version (using RUBY_VERSION).
|
@@ -79,7 +80,7 @@ All three methods accept either a `Regexp` or `String` (containing the pattern)
|
|
79
80
|
require 'regexp_parser'
|
80
81
|
|
81
82
|
Regexp::Parser.parse(
|
82
|
-
"a+ #
|
83
|
+
"a+ # Recognizes a and A...",
|
83
84
|
options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE
|
84
85
|
)
|
85
86
|
```
|
@@ -101,7 +102,7 @@ start/end offsets for each token found.
|
|
101
102
|
```ruby
|
102
103
|
require 'regexp_parser'
|
103
104
|
|
104
|
-
Regexp::Scanner.scan
|
105
|
+
Regexp::Scanner.scan(/(ab?(cd)*[e-h]+)/) do |type, token, text, ts, te|
|
105
106
|
puts "type: #{type}, token: #{token}, text: '#{text}' [#{ts}..#{te}]"
|
106
107
|
end
|
107
108
|
|
@@ -124,7 +125,7 @@ A one-liner that uses map on the result of the scan to return the textual
|
|
124
125
|
parts of the pattern:
|
125
126
|
|
126
127
|
```ruby
|
127
|
-
Regexp::Scanner.scan(
|
128
|
+
Regexp::Scanner.scan(/(cat?([bhm]at)){3,5}/).map { |token| token[2] }
|
128
129
|
#=> ["(", "cat", "?", "(", "[", "b", "h", "m", "]", "at", ")", ")", "{3,5}"]
|
129
130
|
```
|
130
131
|
|
@@ -220,7 +221,7 @@ syntax, and prints the token objects' text indented to their level.
|
|
220
221
|
```ruby
|
221
222
|
require 'regexp_parser'
|
222
223
|
|
223
|
-
Regexp::Lexer.lex
|
224
|
+
Regexp::Lexer.lex(/a?(b(c))*[d]+/, 'ruby/1.9') do |token|
|
224
225
|
puts "#{' ' * token.level}#{token.text}"
|
225
226
|
end
|
226
227
|
|
@@ -246,7 +247,7 @@ how the sequence 'cat' is treated. The 't' is separated because it's followed
|
|
246
247
|
by a quantifier that only applies to it.
|
247
248
|
|
248
249
|
```ruby
|
249
|
-
Regexp::Lexer.scan(
|
250
|
+
Regexp::Lexer.scan(/(cat?([b]at)){3,5}/).map { |token| token.text }
|
250
251
|
#=> ["(", "ca", "t", "?", "(", "[", "b", "]", "at", ")", ")", "{3,5}"]
|
251
252
|
```
|
252
253
|
|
@@ -274,7 +275,7 @@ require 'regexp_parser'
|
|
274
275
|
|
275
276
|
regex = /a?(b+(c)d)*(?<name>[0-9]+)/
|
276
277
|
|
277
|
-
tree = Regexp::Parser.parse(
|
278
|
+
tree = Regexp::Parser.parse(regex, 'ruby/2.1')
|
278
279
|
|
279
280
|
tree.traverse do |event, exp|
|
280
281
|
puts "#{event}: #{exp.type} `#{exp.to_s}`"
|
@@ -355,7 +356,7 @@ _Note that not all of these are available in all versions of Ruby_
|
|
355
356
|
|   _Nest Level_ | `\k<n-1>` | ✓ |
|
356
357
|
|   _Numbered_ | `\k<1>` | ✓ |
|
357
358
|
|   _Relative_ | `\k<-2>` | ✓ |
|
358
|
-
|   _Traditional_ | `\1`
|
359
|
+
|   _Traditional_ | `\1` through `\9` | ✓ |
|
359
360
|
|   _**Capturing**_ | `(abc)` | ✓ |
|
360
361
|
|   _**Comments**_ | `(?# comment text)` | ✓ |
|
361
362
|
|   _**Named**_ | `(?<name>abc)`, `(?'name'abc)` | ✓ |
|
@@ -375,7 +376,7 @@ _Note that not all of these are available in all versions of Ruby_
|
|
375
376
|
|   _**Meta** \[2\]_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | ✓ |
|
376
377
|
|   _**Octal**_ | `\0`, `\01`, `\012` | ✓ |
|
377
378
|
|   _**Unicode**_ | `\uHHHH`, `\u{H+ H+}` | ✓ |
|
378
|
-
| **Unicode Properties** | _<sub>([Unicode 13.0.0]
|
379
|
+
| **Unicode Properties** | _<sub>([Unicode 13.0.0])</sub>_ | ⋱ |
|
379
380
|
|   _**Age**_ | `\p{Age=5.2}`, `\P{age=7.0}`, `\p{^age=8.0}` | ✓ |
|
380
381
|
|   _**Blocks**_ | `\p{InArmenian}`, `\P{InKhmer}`, `\p{^InThai}` | ✓ |
|
381
382
|
|   _**Classes**_ | `\p{Alpha}`, `\P{Space}`, `\p{^Alnum}` | ✓ |
|
@@ -384,13 +385,17 @@ _Note that not all of these are available in all versions of Ruby_
|
|
384
385
|
|   _**Scripts**_ | `\p{Arabic}`, `\P{Hiragana}`, `\p{^Greek}` | ✓ |
|
385
386
|
|   _**Simple**_ | `\p{Dash}`, `\p{Extender}`, `\p{^Hyphen}` | ✓ |
|
386
387
|
|
387
|
-
|
388
|
-
|
388
|
+
[Unicode 13.0.0]: https://www.unicode.org/versions/Unicode13.0.0/
|
389
|
+
|
390
|
+
**\[1\]**: Ruby does not support lazy or possessive interval quantifiers.
|
391
|
+
Any `+` or `?` that follows an interval quantifier will be treated as another,
|
392
|
+
chained quantifier. See also [#3](https://github.com/ammar/regexp_parser/issues/3),
|
389
393
|
[#69](https://github.com/ammar/regexp_parser/pull/69).
|
390
394
|
|
391
|
-
**\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex
|
392
|
-
https://github.com/ruby/ruby/commit/
|
393
|
-
scanner and will only be emitted if a String or a Regexp
|
395
|
+
**\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex
|
396
|
+
escapes when used in Regexp literals](https://github.com/ruby/ruby/commit/11ae581),
|
397
|
+
so they will only reach the scanner and will only be emitted if a String or a Regexp
|
398
|
+
that has been built with the `::new` constructor is scanned.
|
394
399
|
|
395
400
|
##### Inapplicable Features
|
396
401
|
|
@@ -407,25 +412,27 @@ expressions library (Onigmo). They are not supported by the scanner.
|
|
407
412
|
|
408
413
|
See something missing? Please submit an [issue](https://github.com/ammar/regexp_parser/issues)
|
409
414
|
|
410
|
-
_**Note**: Attempting to process expressions with unsupported syntax features can raise
|
411
|
-
or incorrectly return tokens/objects as literals._
|
415
|
+
_**Note**: Attempting to process expressions with unsupported syntax features can raise
|
416
|
+
an error, or incorrectly return tokens/objects as literals._
|
412
417
|
|
413
418
|
|
414
419
|
## Testing
|
415
420
|
To run the tests simply run rake from the root directory.
|
416
421
|
|
417
|
-
The default task generates the scanner's code from the Ragel source files and runs
|
422
|
+
The default task generates the scanner's code from the Ragel source files and runs
|
423
|
+
all the specs, thus it requires Ragel to be installed.
|
418
424
|
|
419
|
-
Note that changes to Ragel files will not be reflected when running `rspec` on its own,
|
425
|
+
Note that changes to Ragel files will not be reflected when running `rspec` on its own,
|
426
|
+
so to run individual tests you might want to run:
|
420
427
|
|
421
428
|
```
|
422
429
|
rake ragel:rb && rspec spec/scanner/properties_spec.rb
|
423
430
|
```
|
424
431
|
|
425
432
|
## Building
|
426
|
-
Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/)
|
427
|
-
installed. The build tasks will automatically invoke the 'ragel:rb' task to generate
|
428
|
-
Ruby scanner code.
|
433
|
+
Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/)
|
434
|
+
to be installed. The build tasks will automatically invoke the 'ragel:rb' task to generate
|
435
|
+
the Ruby scanner code.
|
429
436
|
|
430
437
|
|
431
438
|
The project uses the standard rubygems package tasks, so:
|
@@ -445,19 +452,26 @@ rake install
|
|
445
452
|
## Example Projects
|
446
453
|
Projects using regexp_parser.
|
447
454
|
|
448
|
-
- [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool
|
455
|
+
- [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool
|
456
|
+
that uses regexp_parser to convert Regexps to css/xpath selectors.
|
449
457
|
|
450
|
-
- [js_regex](https://github.com/jaynetics/js_regex) converts Ruby regular expressions
|
458
|
+
- [js_regex](https://github.com/jaynetics/js_regex) converts Ruby regular expressions
|
459
|
+
to JavaScript-compatible regular expressions.
|
451
460
|
|
452
|
-
- [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor
|
461
|
+
- [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor
|
462
|
+
with alias support.
|
453
463
|
|
454
|
-
- [mutant](https://github.com/mbj/mutant) manipulates your regular expressions
|
464
|
+
- [mutant](https://github.com/mbj/mutant) manipulates your regular expressions
|
465
|
+
(amongst others) to see if your tests cover their behavior.
|
455
466
|
|
456
|
-
- [repper](https://github.com/jaynetics/repper) is a regular expression
|
467
|
+
- [repper](https://github.com/jaynetics/repper) is a regular expression
|
468
|
+
pretty-printer and formatter for Ruby.
|
457
469
|
|
458
|
-
- [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that
|
470
|
+
- [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that
|
471
|
+
uses regexp_parser to lint Regexps.
|
459
472
|
|
460
|
-
- [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) is a localization helper
|
473
|
+
- [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) is a localization helper
|
474
|
+
that uses regexp_parser to generate examples of postal codes.
|
461
475
|
|
462
476
|
|
463
477
|
## References
|
@@ -14,6 +14,10 @@ module Regexp::Expression
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def to_re(format = :full)
|
17
|
+
if set_level > 0
|
18
|
+
warn "Calling #to_re on character set members is deprecated - "\
|
19
|
+
"their behavior might not be equivalent outside of the set."
|
20
|
+
end
|
17
21
|
::Regexp.new(to_s(format))
|
18
22
|
end
|
19
23
|
|
@@ -32,15 +36,19 @@ module Regexp::Expression
|
|
32
36
|
end
|
33
37
|
|
34
38
|
def repetitions
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
39
|
+
@repetitions ||=
|
40
|
+
if quantified?
|
41
|
+
min = quantifier.min
|
42
|
+
max = quantifier.max < 0 ? Float::INFINITY : quantifier.max
|
43
|
+
range = min..max
|
44
|
+
# fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807
|
45
|
+
if RUBY_VERSION.to_f < 2.7
|
46
|
+
range.define_singleton_method(:minmax) { [min, max] }
|
47
|
+
end
|
48
|
+
range
|
49
|
+
else
|
50
|
+
1..1
|
51
|
+
end
|
44
52
|
end
|
45
53
|
|
46
54
|
def greedy?
|
@@ -5,7 +5,19 @@ module Regexp::Expression
|
|
5
5
|
attr_accessor :referenced_expression
|
6
6
|
|
7
7
|
def initialize_copy(orig)
|
8
|
-
|
8
|
+
exp_id = [self.class, self.starts_at]
|
9
|
+
|
10
|
+
# prevent infinite recursion for recursive subexp calls
|
11
|
+
copied = @@copied ||= {}
|
12
|
+
self.referenced_expression =
|
13
|
+
if copied[exp_id]
|
14
|
+
orig.referenced_expression
|
15
|
+
else
|
16
|
+
copied[exp_id] = true
|
17
|
+
orig.referenced_expression.dup
|
18
|
+
end
|
19
|
+
copied.clear
|
20
|
+
|
9
21
|
super
|
10
22
|
end
|
11
23
|
end
|
@@ -39,7 +51,7 @@ module Regexp::Expression
|
|
39
51
|
class NameCall < Backreference::Name; end
|
40
52
|
class NumberCallRelative < Backreference::NumberRelative; end
|
41
53
|
|
42
|
-
class NumberRecursionLevel < Backreference::
|
54
|
+
class NumberRecursionLevel < Backreference::NumberRelative
|
43
55
|
attr_reader :recursion_level
|
44
56
|
|
45
57
|
def initialize(token, options = {})
|
@@ -33,6 +33,8 @@ module Regexp::Expression
|
|
33
33
|
|
34
34
|
class Absence < Group::Base; end
|
35
35
|
class Atomic < Group::Base; end
|
36
|
+
# TODO: should split off OptionsSwitch in v3.0.0. Maybe even make it no
|
37
|
+
# longer inherit from Group because it is effectively a terminal expression.
|
36
38
|
class Options < Group::Base
|
37
39
|
attr_accessor :option_changes
|
38
40
|
|
@@ -40,6 +42,14 @@ module Regexp::Expression
|
|
40
42
|
self.option_changes = orig.option_changes.dup
|
41
43
|
super
|
42
44
|
end
|
45
|
+
|
46
|
+
def quantify(*args)
|
47
|
+
if token == :options_switch
|
48
|
+
raise Regexp::Parser::Error, 'Can not quantify an option switch'
|
49
|
+
else
|
50
|
+
super
|
51
|
+
end
|
52
|
+
end
|
43
53
|
end
|
44
54
|
|
45
55
|
class Capture < Group::Base
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Regexp::Expression
|
2
|
+
module Shared
|
3
|
+
# default implementation, e.g. "atomic group", "hex escape", "word type", ..
|
4
|
+
def human_name
|
5
|
+
[token, type].compact.join(' ').tr('_', ' ')
|
6
|
+
end
|
7
|
+
end
|
8
|
+
|
9
|
+
Alternation.class_eval { def human_name; 'alternation' end }
|
10
|
+
Alternative.class_eval { def human_name; 'alternative' end }
|
11
|
+
Anchor::BOL.class_eval { def human_name; 'beginning of line' end }
|
12
|
+
Anchor::BOS.class_eval { def human_name; 'beginning of string' end }
|
13
|
+
Anchor::EOL.class_eval { def human_name; 'end of line' end }
|
14
|
+
Anchor::EOS.class_eval { def human_name; 'end of string' end }
|
15
|
+
Anchor::EOSobEOL.class_eval { def human_name; 'newline-ready end of string' end }
|
16
|
+
Anchor::MatchStart.class_eval { def human_name; 'match start' end }
|
17
|
+
Anchor::NonWordBoundary.class_eval { def human_name; 'no word boundary' end }
|
18
|
+
Anchor::WordBoundary.class_eval { def human_name; 'word boundary' end }
|
19
|
+
Assertion::Lookahead.class_eval { def human_name; 'lookahead' end }
|
20
|
+
Assertion::Lookbehind.class_eval { def human_name; 'lookbehind' end }
|
21
|
+
Assertion::NegativeLookahead.class_eval { def human_name; 'negative lookahead' end }
|
22
|
+
Assertion::NegativeLookbehind.class_eval { def human_name; 'negative lookbehind' end }
|
23
|
+
Backreference::Name.class_eval { def human_name; 'backreference by name' end }
|
24
|
+
Backreference::NameCall.class_eval { def human_name; 'subexpression call by name' end }
|
25
|
+
Backreference::Number.class_eval { def human_name; 'backreference' end }
|
26
|
+
Backreference::NumberRelative.class_eval { def human_name; 'relative backreference' end }
|
27
|
+
Backreference::NumberCall.class_eval { def human_name; 'subexpression call' end }
|
28
|
+
Backreference::NumberCallRelative.class_eval { def human_name; 'relative subexpression call' end }
|
29
|
+
CharacterSet::IntersectedSequence.class_eval { def human_name; 'intersected sequence' end }
|
30
|
+
CharacterSet::Intersection.class_eval { def human_name; 'intersection' end }
|
31
|
+
CharacterSet::Range.class_eval { def human_name; 'character range' end }
|
32
|
+
CharacterType::Any.class_eval { def human_name; 'match-all' end }
|
33
|
+
Comment.class_eval { def human_name; 'comment' end }
|
34
|
+
Conditional::Branch.class_eval { def human_name; 'conditional branch' end }
|
35
|
+
Conditional::Condition.class_eval { def human_name; 'condition' end }
|
36
|
+
Conditional::Expression.class_eval { def human_name; 'conditional' end }
|
37
|
+
Group::Capture.class_eval { def human_name; "capture group #{number}" end }
|
38
|
+
Group::Named.class_eval { def human_name; 'named capture group' end }
|
39
|
+
Keep::Mark.class_eval { def human_name; 'keep-mark lookbehind' end }
|
40
|
+
Literal.class_eval { def human_name; 'literal' end }
|
41
|
+
Root.class_eval { def human_name; 'root' end }
|
42
|
+
WhiteSpace.class_eval { def human_name; 'free space' end }
|
43
|
+
end
|
@@ -63,16 +63,20 @@ class Regexp::MatchLength
|
|
63
63
|
end
|
64
64
|
|
65
65
|
def to_re
|
66
|
-
|
66
|
+
/(?:#{reify.call}){#{min_rep},#{max_rep unless max_rep == Float::INFINITY}}/
|
67
67
|
end
|
68
68
|
|
69
69
|
private
|
70
70
|
|
71
71
|
attr_accessor :base_min, :base_max, :min_rep, :max_rep, :exp_class, :reify
|
72
72
|
|
73
|
-
|
74
|
-
|
75
|
-
|
73
|
+
if Regexp.method_defined?(:match?) # ruby >= 2.4
|
74
|
+
def test_regexp
|
75
|
+
@test_regexp ||= /^#{to_re}$/
|
76
|
+
end
|
77
|
+
else
|
78
|
+
def test_regexp
|
79
|
+
@test_regexp ||= /^#{to_re}$/.tap { |r| def r.match?(s); !!match(s) end }
|
76
80
|
end
|
77
81
|
end
|
78
82
|
end
|
@@ -8,9 +8,9 @@ module Regexp::Expression
|
|
8
8
|
|
9
9
|
attr_accessor :type, :token, :text, :ts, :te,
|
10
10
|
:level, :set_level, :conditional_level,
|
11
|
-
:options
|
11
|
+
:options
|
12
12
|
|
13
|
-
attr_reader :nesting_level
|
13
|
+
attr_reader :nesting_level, :quantifier
|
14
14
|
end
|
15
15
|
end
|
16
16
|
|
@@ -64,6 +64,10 @@ module Regexp::Expression
|
|
64
64
|
!quantifier.nil?
|
65
65
|
end
|
66
66
|
|
67
|
+
def optional?
|
68
|
+
quantified? && quantifier.min == 0
|
69
|
+
end
|
70
|
+
|
67
71
|
def offset
|
68
72
|
[starts_at, full_length]
|
69
73
|
end
|
@@ -81,5 +85,10 @@ module Regexp::Expression
|
|
81
85
|
quantifier && quantifier.nesting_level = lvl
|
82
86
|
terminal? || each { |subexp| subexp.nesting_level = lvl + 1 }
|
83
87
|
end
|
88
|
+
|
89
|
+
def quantifier=(qtf)
|
90
|
+
@quantifier = qtf
|
91
|
+
@repetitions = nil # clear memoized value
|
92
|
+
end
|
84
93
|
end
|
85
94
|
end
|
@@ -25,6 +25,7 @@ require 'regexp_parser/expression/classes/root'
|
|
25
25
|
require 'regexp_parser/expression/classes/unicode_property'
|
26
26
|
|
27
27
|
require 'regexp_parser/expression/methods/construct'
|
28
|
+
require 'regexp_parser/expression/methods/human_name'
|
28
29
|
require 'regexp_parser/expression/methods/match'
|
29
30
|
require 'regexp_parser/expression/methods/match_length'
|
30
31
|
require 'regexp_parser/expression/methods/options'
|