regexp_parser 2.5.0 → 2.6.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f871ec3cdea5a594f72f5386f1b344710e6204f7307ba40d966653197f526be8
4
- data.tar.gz: dd93c880f29ec77531faa2379fbfc8e34a9b67680664c6a3477d38afeaa1809a
3
+ metadata.gz: 66568005494b517613155277c6be4731eb8a26bb9b48a692a9430507286ce583
4
+ data.tar.gz: d1fc6c6f1a0c7f939c51703ac844c2dbb134f96e0e55780646cb7e3e87d7a652
5
5
  SHA512:
6
- metadata.gz: 45e52ab0ce7bec3e4a275efa3828532778c49e8d36eec1ea82a43755a87abc9eee97e986027aa8f5c64fd604f15164d2ad4f37e5d6e22a5a1e3e9da6788271b9
7
- data.tar.gz: 1f5514f3252294d9fe0877cff1d8b0db0400838c97ed78d15bbb794b94595c20d081681e4b1fe9bb6c89be7749514d8b2b8cf385360d002cd89e2a76ce6d2e63
6
+ metadata.gz: b955b2215b71c94497e52841142fab8c2b9930d0d6cea6ea2b3eeb8ed9fe84575e2f34aae3a6051af2b56429f98cf070b9151805f2cb93ddb511ec1e0e50dd7c
7
+ data.tar.gz: 3a4f083942b66ddb4b67ab33f14bb1c0b724a60c2b30605059d32ce3648e9cb46e31e797b7a526a2028c1e018d73365f5ef955256de4e63397d6ea105714ff12
data/CHANGELOG.md CHANGED
@@ -1,37 +1,84 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
1
8
  ## [Unreleased]
2
9
 
10
+ ## [2.6.2] - 2023-01-19 - [Janosch Müller](mailto:janosch84@gmail.com)
11
+
12
+ ### Fixed
13
+
14
+ - fixed `SystemStackError` when cloning recursive subexpression calls
15
+ * e.g. `Regexp::Parser.parse(/a|b\g<0>/).dup`
16
+
17
+ ## [2.6.1] - 2022-11-16 - [Janosch Müller](mailto:janosch84@gmail.com)
18
+
19
+ ### Fixed
20
+
21
+ - fixed scanning of two negative lookbehind edge cases
22
+ * `(?<!x)y>` used to raise a ScannerError
23
+ * `(?<!x>)y` used to be misinterpreted as a named group
24
+ * thanks to [Sergio Medina](https://github.com/serch) for the report
25
+
26
+ ## [2.6.0] - 2022-09-26 - [Janosch Müller](mailto:janosch84@gmail.com)
27
+
28
+ ### Fixed
29
+
30
+ - fixed `#referenced_expression` for `\g<0>` (was `nil`, is now the `Root` exp)
31
+ - fixed `#reference`, `#referenced_expression` for recursion level backrefs
32
+ * e.g. `(a)(b)\k<-1+1>`
33
+ * `#referenced_expression` was `nil`, now it is the correct `Group` exp
34
+ - detect and raise for two more syntax errors when parsing String input
35
+ * quantification of option switches (e.g. `(?i)+`)
36
+ * invalid references (e.g. `/\k<1>/`)
37
+ * these are a `SyntaxError` in Ruby, so could only be passed as a String
38
+
39
+ ### Added
40
+
41
+ - `Regexp::Expression::Base#human_name`
42
+ * returns a nice, human-readable description of the expression
43
+ - `Regexp::Expression::Base#optional?`
44
+ * returns `true` if the expression is quantified accordingly (e.g. with `*`, `{,n}`)
45
+ - added a deprecation warning when calling `#to_re` on set members
46
+
47
+ ## [2.5.0] - 2022-05-27 - [Janosch Müller](mailto:janosch84@gmail.com)
48
+
3
49
  ### Added
4
50
 
5
51
  - `Regexp::Expression::Base.construct` and `.token_class` methods
52
+ * see the [wiki](https://github.com/ammar/regexp_parser/wiki) for details
6
53
 
7
54
  ## [2.4.0] - 2022-05-09 - [Janosch Müller](mailto:janosch84@gmail.com)
8
55
 
9
56
  ### Fixed
10
57
 
11
58
  - fixed interpretation of `+` and `?` after interval quantifiers (`{n,n}`)
12
- - they used to be treated as reluctant or possessive mode indicators
13
- - however, Ruby does not support these modes for interval quantifiers
14
- - they are now treated as chained quantifiers instead, as Ruby does it
15
- - c.f. [#3](https://github.com/ammar/regexp_parser/issues/3)
59
+ * they used to be treated as reluctant or possessive mode indicators
60
+ * however, Ruby does not support these modes for interval quantifiers
61
+ * they are now treated as chained quantifiers instead, as Ruby does it
62
+ * c.f. [#3](https://github.com/ammar/regexp_parser/issues/3)
16
63
  - fixed `Expression::Base#nesting_level` for some tree rewrite cases
17
- - e.g. the alternatives in `/a|[b]/` had an inconsistent nesting_level
64
+ * e.g. the alternatives in `/a|[b]/` had an inconsistent nesting_level
18
65
  - fixed `Scanner` accepting invalid posix classes, e.g. `[[:foo:]]`
19
- - they raise a `SyntaxError` when used in a Regexp, so could only be passed as String
20
- - they now raise a `Regexp::Scanner::ValidationError` in the `Scanner`
66
+ * they raise a `SyntaxError` when used in a Regexp, so could only be passed as String
67
+ * they now raise a `Regexp::Scanner::ValidationError` in the `Scanner`
21
68
 
22
69
  ### Added
23
70
 
24
71
  - added `Expression::Base#==` for (deep) comparison of expressions
25
72
  - added `Expression::Base#parts`
26
- - returns the text elements and subexpressions of an expression
27
- - e.g. `parse(/(a)/)[0].parts # => ["(", #<Literal @text="a"...>, ")"]`
73
+ * returns the text elements and subexpressions of an expression
74
+ * e.g. `parse(/(a)/)[0].parts # => ["(", #<Literal @text="a"...>, ")"]`
28
75
  - added `Expression::Base#te` (a.k.a. token end index)
29
- - `Expression::Subexpression` always had `#te`, only terminal nodes lacked it so far
76
+ * `Expression::Subexpression` always had `#te`, only terminal nodes lacked it so far
30
77
  - made some `Expression::Base` methods available on `Quantifier` instances, too
31
- - `#type`, `#type?`, `#is?`, `#one_of?`, `#options`, `#terminal?`
32
- - `#base_length`, `#full_length`, `#starts_at`, `#te`, `#ts`, `#offset`
33
- - `#conditional_level`, `#level`, `#nesting_level` , `#set_level`
34
- - this allows a more unified handling with `Expression::Base` instances
78
+ * `#type`, `#type?`, `#is?`, `#one_of?`, `#options`, `#terminal?`
79
+ * `#base_length`, `#full_length`, `#starts_at`, `#te`, `#ts`, `#offset`
80
+ * `#conditional_level`, `#level`, `#nesting_level` , `#set_level`
81
+ * this allows a more unified handling with `Expression::Base` instances
35
82
  - allowed `Quantifier#initialize` to take a token and options Hash like other nodes
36
83
  - added a deprecation warning for initializing Quantifiers with 4+ arguments:
37
84
 
@@ -54,18 +101,18 @@
54
101
  ### Fixed
55
102
 
56
103
  - removed five inexistent unicode properties from `Syntax#features`
57
- - these were never supported by Ruby or the `Regexp::Scanner`
58
- - thanks to [Markus Schirp](https://github.com/mbj) for the report
104
+ * these were never supported by Ruby or the `Regexp::Scanner`
105
+ * thanks to [Markus Schirp](https://github.com/mbj) for the report
59
106
 
60
107
  ## [2.3.0] - 2022-04-08 - [Janosch Müller](mailto:janosch84@gmail.com)
61
108
 
62
109
  ### Added
63
110
 
64
111
  - improved parsing performance through `Syntax` refactoring
65
- - instead of fresh `Syntax` instances, pre-loaded constants are now re-used
66
- - this approximately doubles the parsing speed for simple regexps
112
+ * instead of fresh `Syntax` instances, pre-loaded constants are now re-used
113
+ * this approximately doubles the parsing speed for simple regexps
67
114
  - added methods to `Syntax` classes to show relative feature sets
68
- - e.g. `Regexp::Syntax::V3_2_0.added_features`
115
+ * e.g. `Regexp::Syntax::V3_2_0.added_features`
69
116
  - support for new unicode properties of Ruby 3.2 / Unicode 14.0
70
117
 
71
118
  ## [2.2.1] - 2022-02-11 - [Janosch Müller](mailto:janosch84@gmail.com)
@@ -73,14 +120,14 @@
73
120
  ### Fixed
74
121
 
75
122
  - fixed Syntax version of absence groups (`(?~...)`)
76
- - the lexer accepted them for any Ruby version
77
- - now they are only recognized for Ruby >= 2.4.1 in which they were introduced
123
+ * the lexer accepted them for any Ruby version
124
+ * now they are only recognized for Ruby >= 2.4.1 in which they were introduced
78
125
  - reduced gem size by excluding specs from package
79
126
  - removed deprecated `test_files` gemspec setting
80
127
  - no longer depend on `yaml`/`psych` (except for Ruby <= 2.4)
81
128
  - no longer depend on `set`
82
- - `set` was removed from the stdlib and made a standalone gem as of Ruby 3
83
- - this made it a hidden/undeclared dependency of `regexp_parser`
129
+ * `set` was removed from the stdlib and made a standalone gem as of Ruby 3
130
+ * this made it a hidden/undeclared dependency of `regexp_parser`
84
131
 
85
132
  ## [2.2.0] - 2021-12-04 - [Janosch Müller](mailto:janosch84@gmail.com)
86
133
 
@@ -318,8 +365,8 @@
318
365
 
319
366
  - Fixed missing quantifier in `Conditional::Expression` methods `#to_s`, `#to_re`
320
367
  - `Conditional::Condition` no longer lives outside the recursive `#expressions` tree
321
- - it used to be the only expression stored in a custom ivar, complicating traversal
322
- - its setter and getter (`#condition=`, `#condition`) still work as before
368
+ * it used to be the only expression stored in a custom ivar, complicating traversal
369
+ * its setter and getter (`#condition=`, `#condition`) still work as before
323
370
 
324
371
  ## [1.1.0] - 2018-09-17 - [Janosch Müller](mailto:janosch84@gmail.com)
325
372
 
@@ -327,8 +374,8 @@
327
374
 
328
375
  - Added `Quantifier` methods `#greedy?`, `#possessive?`, `#reluctant?`/`#lazy?`
329
376
  - Added `Group::Options#option_changes`
330
- - shows the options enabled or disabled by the given options group
331
- - as with all other expressions, `#options` shows the overall active options
377
+ * shows the options enabled or disabled by the given options group
378
+ * as with all other expressions, `#options` shows the overall active options
332
379
  - Added `Conditional#reference` and `Condition#reference`, indicating the determinative group
333
380
  - Added `Subexpression#dig`, acts like [`Array#dig`](http://ruby-doc.org/core-2.5.0/Array.html#method-i-dig)
334
381
 
@@ -512,7 +559,6 @@ This release includes several breaking changes, mostly to character sets, #map a
512
559
  * Fixed scanning of zero length comments (PR #12)
513
560
  * Fixed missing escape:codepoint_list syntax token (PR #14)
514
561
  * Fixed to_s for modified interval quantifiers (PR #17)
515
- - Added a note about MRI implementation quirks to Scanner section
516
562
 
517
563
  ## [0.3.2] - 2016-01-01 - [Ammar Ali](mailto:ammarabuali@gmail.com)
518
564
 
@@ -538,7 +584,6 @@ This release includes several breaking changes, mostly to character sets, #map a
538
584
  - Renamed Lexer's method to lex, added an alias to the old name (scan)
539
585
  - Use #map instead of #each to run the block in Lexer.lex.
540
586
  - Replaced VERSION.yml file with a constant.
541
- - Updated README
542
587
  - Update tokens and scanner with new additions in Unicode 7.0.
543
588
 
544
589
  ## [0.1.6] - 2014-10-06 - [Ammar Ali](mailto:ammarabuali@gmail.com)
@@ -548,20 +593,11 @@ This release includes several breaking changes, mostly to character sets, #map a
548
593
  - Added syntax files for missing ruby 2.x versions. These do not add
549
594
  extra syntax support, they just make the gem work with the newer
550
595
  ruby versions.
551
- - Added .travis.yml to project root.
552
- - README:
553
- - Removed note purporting runtime support for ruby 1.8.6.
554
- - Added a section identifying the main unsupported syntax features.
555
- - Added sections for Testing and Building
556
- - Added badges for gem version, Travis CI, and code climate.
557
- - Updated README, fixing broken examples, and converting it from a rdoc file to Github's flavor of Markdown.
558
596
  - Fixed a parser bug where an alternation sequence that contained nested expressions was incorrectly being appended to the parent expression when the nesting was exited. e.g. in /a|(b)c/, c was appended to the root.
559
-
560
597
  - Fixed a bug where character types were not being correctly scanned within character sets. e.g. in [\d], two tokens were scanned; one for the backslash '\' and one for the 'd'
561
598
 
562
599
  ## [0.1.5] - 2014-01-14 - [Ammar Ali](mailto:ammarabuali@gmail.com)
563
600
 
564
- - Correct ChangeLog.
565
601
  - Added syntax stubs for ruby versions 2.0 and 2.1
566
602
  - Added clone methods for deep copying expressions.
567
603
  - Added optional format argument for to_s on expressions to return the text of the expression with (:full, the default) or without (:base) its quantifier.
@@ -570,7 +606,6 @@ This release includes several breaking changes, mostly to character sets, #map a
570
606
  - Improved EOF handling in general and especially from sequences like hex and control escapes.
571
607
  - Fixed a bug where named groups with an empty name would return a blank token [].
572
608
  - Fixed a bug where members of a parent set were being added to its last subset.
573
- - Various code cleanups in scanner.rl
574
609
  - Fixed a few mutable string bugs by calling dup on the originals.
575
610
  - Made ruby 1.8.6 the base for all 1.8 syntax, and the 1.8 name a pointer to the latest (1.8.7 at this time)
576
611
  - Removed look-behind assertions (positive and negative) from 1.8 syntax
data/README.md CHANGED
@@ -9,8 +9,8 @@ A Ruby gem for tokenizing, parsing, and transforming regular expressions.
9
9
 
10
10
  * Multilayered
11
11
  * A scanner/tokenizer based on [Ragel](http://www.colm.net/open-source/ragel/)
12
- * A lexer that produces a "stream" of token objects.
13
- * A parser that produces a "tree" of Expression objects (OO API)
12
+ * A lexer that produces a "stream" of [Token objects](https://github.com/ammar/regexp_parser/wiki/Token-Objects)
13
+ * A parser that produces a "tree" of [Expression objects (OO API)](https://github.com/ammar/regexp_parser/wiki/Expression-Objects)
14
14
  * Runs on Ruby 2.x, 3.x and JRuby runtimes
15
15
  * Recognizes Ruby 1.8, 1.9, 2.x and 3.x regular expressions [See Supported Syntax](#supported-syntax)
16
16
 
@@ -36,14 +36,15 @@ Or, add it to your project's `Gemfile`:
36
36
 
37
37
  ```gem 'regexp_parser', '~> X.Y.Z'```
38
38
 
39
- See rubygems for the the [latest version number](https://rubygems.org/gems/regexp_parser)
39
+ See the badge at the top of this README or [rubygems](https://rubygems.org/gems/regexp_parser)
40
+ for the latest version number.
40
41
 
41
42
 
42
43
  ---
43
44
  ## Usage
44
45
 
45
46
  The three main modules are **Scanner**, **Lexer**, and **Parser**. Each of them
46
- provides a single method that takes a regular expression (as a RegExp object or
47
+ provides a single method that takes a regular expression (as a Regexp object or
47
48
  a string) and returns its results. The **Lexer** and the **Parser** accept an
48
49
  optional second argument that specifies the syntax version, like 'ruby/2.0',
49
50
  which defaults to the host Ruby version (using RUBY_VERSION).
@@ -79,7 +80,7 @@ All three methods accept either a `Regexp` or `String` (containing the pattern)
79
80
  require 'regexp_parser'
80
81
 
81
82
  Regexp::Parser.parse(
82
- "a+ # Recognises a and A...",
83
+ "a+ # Recognizes a and A...",
83
84
  options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE
84
85
  )
85
86
  ```
@@ -101,7 +102,7 @@ start/end offsets for each token found.
101
102
  ```ruby
102
103
  require 'regexp_parser'
103
104
 
104
- Regexp::Scanner.scan /(ab?(cd)*[e-h]+)/ do |type, token, text, ts, te|
105
+ Regexp::Scanner.scan(/(ab?(cd)*[e-h]+)/) do |type, token, text, ts, te|
105
106
  puts "type: #{type}, token: #{token}, text: '#{text}' [#{ts}..#{te}]"
106
107
  end
107
108
 
@@ -124,7 +125,7 @@ A one-liner that uses map on the result of the scan to return the textual
124
125
  parts of the pattern:
125
126
 
126
127
  ```ruby
127
- Regexp::Scanner.scan( /(cat?([bhm]at)){3,5}/ ).map {|token| token[2]}
128
+ Regexp::Scanner.scan(/(cat?([bhm]at)){3,5}/).map { |token| token[2] }
128
129
  #=> ["(", "cat", "?", "(", "[", "b", "h", "m", "]", "at", ")", ")", "{3,5}"]
129
130
  ```
130
131
 
@@ -220,7 +221,7 @@ syntax, and prints the token objects' text indented to their level.
220
221
  ```ruby
221
222
  require 'regexp_parser'
222
223
 
223
- Regexp::Lexer.lex /a?(b(c))*[d]+/, 'ruby/1.9' do |token|
224
+ Regexp::Lexer.lex(/a?(b(c))*[d]+/, 'ruby/1.9') do |token|
224
225
  puts "#{' ' * token.level}#{token.text}"
225
226
  end
226
227
 
@@ -246,7 +247,7 @@ how the sequence 'cat' is treated. The 't' is separated because it's followed
246
247
  by a quantifier that only applies to it.
247
248
 
248
249
  ```ruby
249
- Regexp::Lexer.scan( /(cat?([b]at)){3,5}/ ).map {|token| token.text}
250
+ Regexp::Lexer.scan(/(cat?([b]at)){3,5}/).map { |token| token.text }
250
251
  #=> ["(", "ca", "t", "?", "(", "[", "b", "]", "at", ")", ")", "{3,5}"]
251
252
  ```
252
253
 
@@ -274,7 +275,7 @@ require 'regexp_parser'
274
275
 
275
276
  regex = /a?(b+(c)d)*(?<name>[0-9]+)/
276
277
 
277
- tree = Regexp::Parser.parse( regex, 'ruby/2.1' )
278
+ tree = Regexp::Parser.parse(regex, 'ruby/2.1')
278
279
 
279
280
  tree.traverse do |event, exp|
280
281
  puts "#{event}: #{exp.type} `#{exp.to_s}`"
@@ -355,7 +356,7 @@ _Note that not all of these are available in all versions of Ruby_
355
356
  | &emsp;&emsp;_Nest Level_ | `\k<n-1>` | &#x2713; |
356
357
  | &emsp;&emsp;_Numbered_ | `\k<1>` | &#x2713; |
357
358
  | &emsp;&emsp;_Relative_ | `\k<-2>` | &#x2713; |
358
- | &emsp;&emsp;_Traditional_ | `\1` thru `\9` | &#x2713; |
359
+ | &emsp;&emsp;_Traditional_ | `\1` through `\9` | &#x2713; |
359
360
  | &emsp;&nbsp;_**Capturing**_ | `(abc)` | &#x2713; |
360
361
  | &emsp;&nbsp;_**Comments**_ | `(?# comment text)` | &#x2713; |
361
362
  | &emsp;&nbsp;_**Named**_ | `(?<name>abc)`, `(?'name'abc)` | &#x2713; |
@@ -375,7 +376,7 @@ _Note that not all of these are available in all versions of Ruby_
375
376
  | &emsp;&nbsp;_**Meta** \[2\]_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | &#x2713; |
376
377
  | &emsp;&nbsp;_**Octal**_ | `\0`, `\01`, `\012` | &#x2713; |
377
378
  | &emsp;&nbsp;_**Unicode**_ | `\uHHHH`, `\u{H+ H+}` | &#x2713; |
378
- | **Unicode Properties** | _<sub>([Unicode 13.0.0](https://www.unicode.org/versions/Unicode13.0.0/))</sub>_ | &#x22f1; |
379
+ | **Unicode Properties** | _<sub>([Unicode 13.0.0])</sub>_ | &#x22f1; |
379
380
  | &emsp;&nbsp;_**Age**_ | `\p{Age=5.2}`, `\P{age=7.0}`, `\p{^age=8.0}` | &#x2713; |
380
381
  | &emsp;&nbsp;_**Blocks**_ | `\p{InArmenian}`, `\P{InKhmer}`, `\p{^InThai}` | &#x2713; |
381
382
  | &emsp;&nbsp;_**Classes**_ | `\p{Alpha}`, `\P{Space}`, `\p{^Alnum}` | &#x2713; |
@@ -384,13 +385,17 @@ _Note that not all of these are available in all versions of Ruby_
384
385
  | &emsp;&nbsp;_**Scripts**_ | `\p{Arabic}`, `\P{Hiragana}`, `\p{^Greek}` | &#x2713; |
385
386
  | &emsp;&nbsp;_**Simple**_ | `\p{Dash}`, `\p{Extender}`, `\p{^Hyphen}` | &#x2713; |
386
387
 
387
- **\[1\]**: Ruby does not support lazy or possessive interval quantifiers. Any `+` or `?` that follows an interval
388
- quantifier will be treated as another, chained quantifier. See also [#3](https://github.com/ammar/regexp_parser/issue/3),
388
+ [Unicode 13.0.0]: https://www.unicode.org/versions/Unicode13.0.0/
389
+
390
+ **\[1\]**: Ruby does not support lazy or possessive interval quantifiers.
391
+ Any `+` or `?` that follows an interval quantifier will be treated as another,
392
+ chained quantifier. See also [#3](https://github.com/ammar/regexp_parser/issues/3),
389
393
  [#69](https://github.com/ammar/regexp_parser/pull/69).
390
394
 
391
- **\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex escapes when used in Regexp literals](
392
- https://github.com/ruby/ruby/commit/11ae581a4a7f5d5f5ec6378872eab8f25381b1b9 ), so they will only reach the
393
- scanner and will only be emitted if a String or a Regexp that has been built with the `::new` constructor is scanned.
395
+ **\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex
396
+ escapes when used in Regexp literals](https://github.com/ruby/ruby/commit/11ae581),
397
+ so they will only reach the scanner and will only be emitted if a String or a Regexp
398
+ that has been built with the `::new` constructor is scanned.
394
399
 
395
400
  ##### Inapplicable Features
396
401
 
@@ -407,25 +412,27 @@ expressions library (Onigmo). They are not supported by the scanner.
407
412
 
408
413
  See something missing? Please submit an [issue](https://github.com/ammar/regexp_parser/issues)
409
414
 
410
- _**Note**: Attempting to process expressions with unsupported syntax features can raise an error,
411
- or incorrectly return tokens/objects as literals._
415
+ _**Note**: Attempting to process expressions with unsupported syntax features can raise
416
+ an error, or incorrectly return tokens/objects as literals._
412
417
 
413
418
 
414
419
  ## Testing
415
420
  To run the tests simply run rake from the root directory.
416
421
 
417
- The default task generates the scanner's code from the Ragel source files and runs all the specs, thus it requires Ragel to be installed.
422
+ The default task generates the scanner's code from the Ragel source files and runs
423
+ all the specs, thus it requires Ragel to be installed.
418
424
 
419
- Note that changes to Ragel files will not be reflected when running `rspec` on its own, so to run individual tests you might want to run:
425
+ Note that changes to Ragel files will not be reflected when running `rspec` on its own,
426
+ so to run individual tests you might want to run:
420
427
 
421
428
  ```
422
429
  rake ragel:rb && rspec spec/scanner/properties_spec.rb
423
430
  ```
424
431
 
425
432
  ## Building
426
- Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/) to be
427
- installed. The build tasks will automatically invoke the 'ragel:rb' task to generate the
428
- Ruby scanner code.
433
+ Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/)
434
+ to be installed. The build tasks will automatically invoke the 'ragel:rb' task to generate
435
+ the Ruby scanner code.
429
436
 
430
437
 
431
438
  The project uses the standard rubygems package tasks, so:
@@ -445,19 +452,26 @@ rake install
445
452
  ## Example Projects
446
453
  Projects using regexp_parser.
447
454
 
448
- - [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool that uses regexp_parser to convert Regexps to css/xpath selectors.
455
+ - [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool
456
+ that uses regexp_parser to convert Regexps to css/xpath selectors.
449
457
 
450
- - [js_regex](https://github.com/jaynetics/js_regex) converts Ruby regular expressions to JavaScript-compatible regular expressions.
458
+ - [js_regex](https://github.com/jaynetics/js_regex) converts Ruby regular expressions
459
+ to JavaScript-compatible regular expressions.
451
460
 
452
- - [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor with alias support.
461
+ - [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor
462
+ with alias support.
453
463
 
454
- - [mutant](https://github.com/mbj/mutant) manipulates your regular expressions (amongst others) to see if your tests cover their behavior.
464
+ - [mutant](https://github.com/mbj/mutant) manipulates your regular expressions
465
+ (amongst others) to see if your tests cover their behavior.
455
466
 
456
- - [repper](https://github.com/jaynetics/repper) is a regular expression pretty-printer for Ruby.
467
+ - [repper](https://github.com/jaynetics/repper) is a regular expression
468
+ pretty-printer and formatter for Ruby.
457
469
 
458
- - [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that uses regexp_parser to lint Regexps.
470
+ - [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that
471
+ uses regexp_parser to lint Regexps.
459
472
 
460
- - [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) is a localization helper that uses regexp_parser to generate examples of postal codes.
473
+ - [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) is a localization helper
474
+ that uses regexp_parser to generate examples of postal codes.
461
475
 
462
476
 
463
477
  ## References
@@ -14,6 +14,10 @@ module Regexp::Expression
14
14
  end
15
15
 
16
16
  def to_re(format = :full)
17
+ if set_level > 0
18
+ warn "Calling #to_re on character set members is deprecated - "\
19
+ "their behavior might not be equivalent outside of the set."
20
+ end
17
21
  ::Regexp.new(to_s(format))
18
22
  end
19
23
 
@@ -32,15 +36,19 @@ module Regexp::Expression
32
36
  end
33
37
 
34
38
  def repetitions
35
- return 1..1 unless quantified?
36
- min = quantifier.min
37
- max = quantifier.max < 0 ? Float::INFINITY : quantifier.max
38
- range = min..max
39
- # fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807
40
- if RUBY_VERSION.to_f < 2.7
41
- range.define_singleton_method(:minmax) { [min, max] }
42
- end
43
- range
39
+ @repetitions ||=
40
+ if quantified?
41
+ min = quantifier.min
42
+ max = quantifier.max < 0 ? Float::INFINITY : quantifier.max
43
+ range = min..max
44
+ # fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807
45
+ if RUBY_VERSION.to_f < 2.7
46
+ range.define_singleton_method(:minmax) { [min, max] }
47
+ end
48
+ range
49
+ else
50
+ 1..1
51
+ end
44
52
  end
45
53
 
46
54
  def greedy?
@@ -5,7 +5,19 @@ module Regexp::Expression
5
5
  attr_accessor :referenced_expression
6
6
 
7
7
  def initialize_copy(orig)
8
- self.referenced_expression = orig.referenced_expression.dup
8
+ exp_id = [self.class, self.starts_at]
9
+
10
+ # prevent infinite recursion for recursive subexp calls
11
+ copied = @@copied ||= {}
12
+ self.referenced_expression =
13
+ if copied[exp_id]
14
+ orig.referenced_expression
15
+ else
16
+ copied[exp_id] = true
17
+ orig.referenced_expression.dup
18
+ end
19
+ copied.clear
20
+
9
21
  super
10
22
  end
11
23
  end
@@ -39,7 +51,7 @@ module Regexp::Expression
39
51
  class NameCall < Backreference::Name; end
40
52
  class NumberCallRelative < Backreference::NumberRelative; end
41
53
 
42
- class NumberRecursionLevel < Backreference::Number
54
+ class NumberRecursionLevel < Backreference::NumberRelative
43
55
  attr_reader :recursion_level
44
56
 
45
57
  def initialize(token, options = {})
@@ -1,5 +1,5 @@
1
1
  module Regexp::Expression
2
- # TODO: unify naming with Token::Escape, on way or the other, in v3.0.0
2
+ # TODO: unify naming with Token::Escape, one way or the other, in v3.0.0
3
3
  module EscapeSequence
4
4
  class Base < Regexp::Expression::Base
5
5
  def codepoint
@@ -33,6 +33,8 @@ module Regexp::Expression
33
33
 
34
34
  class Absence < Group::Base; end
35
35
  class Atomic < Group::Base; end
36
+ # TODO: should split off OptionsSwitch in v3.0.0. Maybe even make it no
37
+ # longer inherit from Group because it is effectively a terminal expression.
36
38
  class Options < Group::Base
37
39
  attr_accessor :option_changes
38
40
 
@@ -40,6 +42,14 @@ module Regexp::Expression
40
42
  self.option_changes = orig.option_changes.dup
41
43
  super
42
44
  end
45
+
46
+ def quantify(*args)
47
+ if token == :options_switch
48
+ raise Regexp::Parser::Error, 'Can not quantify an option switch'
49
+ else
50
+ super
51
+ end
52
+ end
43
53
  end
44
54
 
45
55
  class Capture < Group::Base
@@ -1,5 +1,5 @@
1
1
  module Regexp::Expression
2
- # TODO: unify name with token :property, on way or the other, in v3.0.0
2
+ # TODO: unify name with token :property, one way or the other, in v3.0.0
3
3
  module UnicodeProperty
4
4
  class Base < Regexp::Expression::Base
5
5
  def negative?
@@ -0,0 +1,43 @@
1
+ module Regexp::Expression
2
+ module Shared
3
+ # default implementation, e.g. "atomic group", "hex escape", "word type", ..
4
+ def human_name
5
+ [token, type].compact.join(' ').tr('_', ' ')
6
+ end
7
+ end
8
+
9
+ Alternation.class_eval { def human_name; 'alternation' end }
10
+ Alternative.class_eval { def human_name; 'alternative' end }
11
+ Anchor::BOL.class_eval { def human_name; 'beginning of line' end }
12
+ Anchor::BOS.class_eval { def human_name; 'beginning of string' end }
13
+ Anchor::EOL.class_eval { def human_name; 'end of line' end }
14
+ Anchor::EOS.class_eval { def human_name; 'end of string' end }
15
+ Anchor::EOSobEOL.class_eval { def human_name; 'newline-ready end of string' end }
16
+ Anchor::MatchStart.class_eval { def human_name; 'match start' end }
17
+ Anchor::NonWordBoundary.class_eval { def human_name; 'no word boundary' end }
18
+ Anchor::WordBoundary.class_eval { def human_name; 'word boundary' end }
19
+ Assertion::Lookahead.class_eval { def human_name; 'lookahead' end }
20
+ Assertion::Lookbehind.class_eval { def human_name; 'lookbehind' end }
21
+ Assertion::NegativeLookahead.class_eval { def human_name; 'negative lookahead' end }
22
+ Assertion::NegativeLookbehind.class_eval { def human_name; 'negative lookbehind' end }
23
+ Backreference::Name.class_eval { def human_name; 'backreference by name' end }
24
+ Backreference::NameCall.class_eval { def human_name; 'subexpression call by name' end }
25
+ Backreference::Number.class_eval { def human_name; 'backreference' end }
26
+ Backreference::NumberRelative.class_eval { def human_name; 'relative backreference' end }
27
+ Backreference::NumberCall.class_eval { def human_name; 'subexpression call' end }
28
+ Backreference::NumberCallRelative.class_eval { def human_name; 'relative subexpression call' end }
29
+ CharacterSet::IntersectedSequence.class_eval { def human_name; 'intersected sequence' end }
30
+ CharacterSet::Intersection.class_eval { def human_name; 'intersection' end }
31
+ CharacterSet::Range.class_eval { def human_name; 'character range' end }
32
+ CharacterType::Any.class_eval { def human_name; 'match-all' end }
33
+ Comment.class_eval { def human_name; 'comment' end }
34
+ Conditional::Branch.class_eval { def human_name; 'conditional branch' end }
35
+ Conditional::Condition.class_eval { def human_name; 'condition' end }
36
+ Conditional::Expression.class_eval { def human_name; 'conditional' end }
37
+ Group::Capture.class_eval { def human_name; "capture group #{number}" end }
38
+ Group::Named.class_eval { def human_name; 'named capture group' end }
39
+ Keep::Mark.class_eval { def human_name; 'keep-mark lookbehind' end }
40
+ Literal.class_eval { def human_name; 'literal' end }
41
+ Root.class_eval { def human_name; 'root' end }
42
+ WhiteSpace.class_eval { def human_name; 'free space' end }
43
+ end
@@ -63,16 +63,20 @@ class Regexp::MatchLength
63
63
  end
64
64
 
65
65
  def to_re
66
- "(?:#{reify.call}){#{min_rep},#{max_rep unless max_rep == Float::INFINITY}}"
66
+ /(?:#{reify.call}){#{min_rep},#{max_rep unless max_rep == Float::INFINITY}}/
67
67
  end
68
68
 
69
69
  private
70
70
 
71
71
  attr_accessor :base_min, :base_max, :min_rep, :max_rep, :exp_class, :reify
72
72
 
73
- def test_regexp
74
- @test_regexp ||= Regexp.new("^#{to_re}$").tap do |regexp|
75
- regexp.respond_to?(:match?) || def regexp.match?(str); !!match(str) end
73
+ if Regexp.method_defined?(:match?) # ruby >= 2.4
74
+ def test_regexp
75
+ @test_regexp ||= /^#{to_re}$/
76
+ end
77
+ else
78
+ def test_regexp
79
+ @test_regexp ||= /^#{to_re}$/.tap { |r| def r.match?(s); !!match(s) end }
76
80
  end
77
81
  end
78
82
  end
@@ -8,9 +8,9 @@ module Regexp::Expression
8
8
 
9
9
  attr_accessor :type, :token, :text, :ts, :te,
10
10
  :level, :set_level, :conditional_level,
11
- :options, :quantifier
11
+ :options
12
12
 
13
- attr_reader :nesting_level
13
+ attr_reader :nesting_level, :quantifier
14
14
  end
15
15
  end
16
16
 
@@ -64,6 +64,10 @@ module Regexp::Expression
64
64
  !quantifier.nil?
65
65
  end
66
66
 
67
+ def optional?
68
+ quantified? && quantifier.min == 0
69
+ end
70
+
67
71
  def offset
68
72
  [starts_at, full_length]
69
73
  end
@@ -81,5 +85,10 @@ module Regexp::Expression
81
85
  quantifier && quantifier.nesting_level = lvl
82
86
  terminal? || each { |subexp| subexp.nesting_level = lvl + 1 }
83
87
  end
88
+
89
+ def quantifier=(qtf)
90
+ @quantifier = qtf
91
+ @repetitions = nil # clear memoized value
92
+ end
84
93
  end
85
94
  end
@@ -25,6 +25,7 @@ require 'regexp_parser/expression/classes/root'
25
25
  require 'regexp_parser/expression/classes/unicode_property'
26
26
 
27
27
  require 'regexp_parser/expression/methods/construct'
28
+ require 'regexp_parser/expression/methods/human_name'
28
29
  require 'regexp_parser/expression/methods/match'
29
30
  require 'regexp_parser/expression/methods/match_length'
30
31
  require 'regexp_parser/expression/methods/options'