regexp_parser 1.7.1 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +157 -1
  3. data/Gemfile +6 -1
  4. data/LICENSE +1 -1
  5. data/README.md +38 -32
  6. data/Rakefile +18 -27
  7. data/lib/regexp_parser/error.rb +4 -0
  8. data/lib/regexp_parser/expression/base.rb +123 -0
  9. data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
  10. data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +5 -0
  11. data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
  12. data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -1
  13. data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +0 -0
  14. data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
  15. data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +13 -7
  16. data/lib/regexp_parser/expression/classes/free_space.rb +2 -4
  17. data/lib/regexp_parser/expression/classes/group.rb +28 -3
  18. data/lib/regexp_parser/expression/classes/literal.rb +1 -5
  19. data/lib/regexp_parser/expression/classes/property.rb +1 -3
  20. data/lib/regexp_parser/expression/classes/root.rb +4 -17
  21. data/lib/regexp_parser/expression/classes/type.rb +0 -2
  22. data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
  23. data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
  24. data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
  25. data/lib/regexp_parser/expression/quantifier.rb +11 -2
  26. data/lib/regexp_parser/expression/sequence.rb +3 -20
  27. data/lib/regexp_parser/expression/subexpression.rb +1 -2
  28. data/lib/regexp_parser/expression.rb +7 -139
  29. data/lib/regexp_parser/lexer.rb +13 -11
  30. data/lib/regexp_parser/parser.rb +325 -344
  31. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  32. data/lib/regexp_parser/scanner/properties/long.csv +604 -0
  33. data/lib/regexp_parser/scanner/properties/short.csv +242 -0
  34. data/lib/regexp_parser/scanner/property.rl +2 -2
  35. data/lib/regexp_parser/scanner/scanner.rl +235 -255
  36. data/lib/regexp_parser/scanner.rb +1324 -1387
  37. data/lib/regexp_parser/syntax/any.rb +4 -6
  38. data/lib/regexp_parser/syntax/base.rb +13 -15
  39. data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
  40. data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
  41. data/lib/regexp_parser/syntax/token/backreference.rb +30 -0
  42. data/lib/regexp_parser/syntax/{tokens → token}/character_set.rb +2 -2
  43. data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
  44. data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
  45. data/lib/regexp_parser/syntax/token/escape.rb +31 -0
  46. data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
  47. data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
  48. data/lib/regexp_parser/syntax/{tokens → token}/meta.rb +2 -2
  49. data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
  50. data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
  51. data/lib/regexp_parser/syntax/token/unicode_property.rb +696 -0
  52. data/lib/regexp_parser/syntax/token.rb +45 -0
  53. data/lib/regexp_parser/syntax/version_lookup.rb +4 -4
  54. data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -2
  55. data/lib/regexp_parser/syntax/versions/1.9.1.rb +1 -1
  56. data/lib/regexp_parser/syntax/versions/3.1.0.rb +10 -0
  57. data/lib/regexp_parser/syntax.rb +8 -6
  58. data/lib/regexp_parser/token.rb +9 -20
  59. data/lib/regexp_parser/version.rb +1 -1
  60. data/lib/regexp_parser.rb +0 -2
  61. data/regexp_parser.gemspec +20 -22
  62. metadata +34 -165
  63. data/lib/regexp_parser/scanner/properties/long.yml +0 -594
  64. data/lib/regexp_parser/scanner/properties/short.yml +0 -237
  65. data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
  66. data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
  67. data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
  68. data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
  69. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
  70. data/lib/regexp_parser/syntax/tokens.rb +0 -45
  71. data/spec/expression/base_spec.rb +0 -94
  72. data/spec/expression/clone_spec.rb +0 -120
  73. data/spec/expression/conditional_spec.rb +0 -89
  74. data/spec/expression/free_space_spec.rb +0 -27
  75. data/spec/expression/methods/match_length_spec.rb +0 -161
  76. data/spec/expression/methods/match_spec.rb +0 -25
  77. data/spec/expression/methods/strfregexp_spec.rb +0 -224
  78. data/spec/expression/methods/tests_spec.rb +0 -99
  79. data/spec/expression/methods/traverse_spec.rb +0 -161
  80. data/spec/expression/options_spec.rb +0 -128
  81. data/spec/expression/root_spec.rb +0 -9
  82. data/spec/expression/sequence_spec.rb +0 -9
  83. data/spec/expression/subexpression_spec.rb +0 -50
  84. data/spec/expression/to_h_spec.rb +0 -26
  85. data/spec/expression/to_s_spec.rb +0 -100
  86. data/spec/lexer/all_spec.rb +0 -22
  87. data/spec/lexer/conditionals_spec.rb +0 -53
  88. data/spec/lexer/delimiters_spec.rb +0 -68
  89. data/spec/lexer/escapes_spec.rb +0 -14
  90. data/spec/lexer/keep_spec.rb +0 -10
  91. data/spec/lexer/literals_spec.rb +0 -89
  92. data/spec/lexer/nesting_spec.rb +0 -99
  93. data/spec/lexer/refcalls_spec.rb +0 -55
  94. data/spec/parser/all_spec.rb +0 -43
  95. data/spec/parser/alternation_spec.rb +0 -88
  96. data/spec/parser/anchors_spec.rb +0 -17
  97. data/spec/parser/conditionals_spec.rb +0 -179
  98. data/spec/parser/errors_spec.rb +0 -30
  99. data/spec/parser/escapes_spec.rb +0 -121
  100. data/spec/parser/free_space_spec.rb +0 -130
  101. data/spec/parser/groups_spec.rb +0 -108
  102. data/spec/parser/keep_spec.rb +0 -6
  103. data/spec/parser/posix_classes_spec.rb +0 -8
  104. data/spec/parser/properties_spec.rb +0 -115
  105. data/spec/parser/quantifiers_spec.rb +0 -52
  106. data/spec/parser/refcalls_spec.rb +0 -112
  107. data/spec/parser/set/intersections_spec.rb +0 -127
  108. data/spec/parser/set/ranges_spec.rb +0 -111
  109. data/spec/parser/sets_spec.rb +0 -178
  110. data/spec/parser/types_spec.rb +0 -18
  111. data/spec/scanner/all_spec.rb +0 -18
  112. data/spec/scanner/anchors_spec.rb +0 -21
  113. data/spec/scanner/conditionals_spec.rb +0 -128
  114. data/spec/scanner/delimiters_spec.rb +0 -52
  115. data/spec/scanner/errors_spec.rb +0 -67
  116. data/spec/scanner/escapes_spec.rb +0 -53
  117. data/spec/scanner/free_space_spec.rb +0 -133
  118. data/spec/scanner/groups_spec.rb +0 -52
  119. data/spec/scanner/keep_spec.rb +0 -10
  120. data/spec/scanner/literals_spec.rb +0 -49
  121. data/spec/scanner/meta_spec.rb +0 -18
  122. data/spec/scanner/properties_spec.rb +0 -64
  123. data/spec/scanner/quantifiers_spec.rb +0 -20
  124. data/spec/scanner/refcalls_spec.rb +0 -36
  125. data/spec/scanner/sets_spec.rb +0 -102
  126. data/spec/scanner/types_spec.rb +0 -14
  127. data/spec/spec_helper.rb +0 -15
  128. data/spec/support/runner.rb +0 -42
  129. data/spec/support/shared_examples.rb +0 -77
  130. data/spec/support/warning_extractor.rb +0 -60
  131. data/spec/syntax/syntax_spec.rb +0 -48
  132. data/spec/syntax/syntax_token_map_spec.rb +0 -23
  133. data/spec/syntax/versions/1.8.6_spec.rb +0 -17
  134. data/spec/syntax/versions/1.9.1_spec.rb +0 -10
  135. data/spec/syntax/versions/1.9.3_spec.rb +0 -9
  136. data/spec/syntax/versions/2.0.0_spec.rb +0 -13
  137. data/spec/syntax/versions/2.2.0_spec.rb +0 -9
  138. data/spec/syntax/versions/aliases_spec.rb +0 -37
  139. data/spec/token/token_spec.rb +0 -85
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dd872b22bf04a288790ef0f73df9041f14fb88a08c2a03852d9dbbc238b452d6
4
- data.tar.gz: 4641097a24b5fa0f7b0c8e5aacc152587fe8b15d30f3f78bbec8157887b8b897
3
+ metadata.gz: 381a794200168f95ff6329cc8a01330d21a05e02b75e0b06dcc6bd8f763c111d
4
+ data.tar.gz: bd7617cb3763e6d759c8e1364aed037ae2fff85af3cf28823476cadd14ff080e
5
5
  SHA512:
6
- metadata.gz: 858570df4a7047a2d8b09555b56de28a66ca4f8022e596c249900f5312f8e7fb9376384ca816bc3c08f3e324930702ad410a28b5be680adea6867e1f8075441e
7
- data.tar.gz: 0d70e7b4f18739826bb334fb305e335e44a354ae302214ca3c1884f66ace8680e48a9e4c64b890b220b82056da761084413c8b9b8c5e363382f5cf165b3d3448
6
+ metadata.gz: 0a039012013e9b57329fd685aaf29386d8b848071e514f59df0acc3437a1dae5c76b6bf94158cc3deece08f3a1fec9437ac84590d97f8590d8dcee1e0dc6c726
7
+ data.tar.gz: 4d67da41fbef9b9336ccfd02e3a742286bf4ef96d469c8aa2bbb9a6a55ed4aa6027a28b10ba6c9993b15937e3fe51a349632bcf5808f6237cf77a1d29ceb74f2
data/CHANGELOG.md CHANGED
@@ -1,4 +1,160 @@
1
- ## [Unreleased]
1
+ ## [2.2.1] - 2022-02-11 - [Janosch Müller](mailto:janosch84@gmail.com)
2
+
3
+ ### Fixed
4
+
5
+ - fixed Syntax version of absence groups (`(?~...)`)
6
+ - the lexer accepted them for any Ruby version
7
+ - now they are only recognized for Ruby >= 2.4.1 in which they were introduced
8
+ - reduced gem size by excluding specs from package
9
+ - removed deprecated `test_files` gemspec setting
10
+ - no longer depend on `yaml`/`psych` (except for Ruby <= 2.4)
11
+ - no longer depend on `set`
12
+ - `set` was removed from the stdlib and made a standalone gem as of Ruby 3
13
+ - this made it a hidden/undeclared dependency of `regexp_parser`
14
+
15
+ ## [2.2.0] - 2021-12-04 - [Janosch Müller](mailto:janosch84@gmail.com)
16
+
17
+ ### Added
18
+
19
+ - added support for 13 new unicode properties introduced in Ruby 3.1.0
20
+
21
+ ## [2.1.1] - 2021-02-23 - [Janosch Müller](mailto:janosch84@gmail.com)
22
+
23
+ ### Fixed
24
+
25
+ - fixed `NameError` when requiring only `'regexp_parser/scanner'` in v2.1.0
26
+ * thanks to [Jared White and Sam Ruby](https://github.com/ruby2js/ruby2js) for the report
27
+
28
+ ## [2.1.0] - 2021-02-22 - [Janosch Müller](mailto:janosch84@gmail.com)
29
+
30
+ ### Added
31
+
32
+ - common ancestor for all scanning/parsing/lexing errors
33
+ * `Regexp::Parser::Error` can now be rescued as a catch-all
34
+ * the following errors (and their many descendants) now inherit from it:
35
+ - `Regexp::Expression::Conditional::TooManyBranches`
36
+ - `Regexp::Parser::ParserError`
37
+ - `Regexp::Scanner::ScannerError`
38
+ - `Regexp::Scanner::ValidationError`
39
+ - `Regexp::Syntax::SyntaxError`
40
+ * it replaces `ArgumentError` in some rare cases (`Regexp::Parser.parse('?')`)
41
+ * thanks to [sandstrom](https://github.com/sandstrom) for the cue
42
+
43
+ ### Fixed
44
+
45
+ - fixed scanning of whole-pattern recursion calls `\g<0>` and `\g'0'`
46
+ * a regression in v2.0.1 had caused them to be scanned as literals
47
+ - fixed scanning of some backreference and subexpression call edge cases
48
+ * e.g. `\k<+1>`, `\g<x-1>`
49
+ - fixed tokenization of some escapes in character sets
50
+ * `.`, `|`, `{`, `}`, `(`, `)`, `^`, `$`, `?`, `+`, `*`
51
+ * all of these correctly emitted `#type` `:literal` and `#token` `:literal` if *not* escaped
52
+ * if escaped, they emitted e.g. `#type` `:escape` and `#token` `:group_open` for `[\(]`
53
+ * the escaped versions now correctly emit `#type` `:escape` and `#token` `:literal`
54
+ - fixed handling of control/metacontrol escapes in character sets
55
+ * e.g. `[\cX]`, `[\M-\C-X]`
56
+ * they were misread as a bunch of individual literals, escapes, and ranges
57
+ - fixed some cases where calling `#dup`/`#clone` on expressions led to shared state
58
+
59
+ ## [2.0.3] - 2020-12-28 - [Janosch Müller](mailto:janosch84@gmail.com)
60
+
61
+ ### Fixed
62
+
63
+ - fixed error when scanning some unlikely and redundant but valid charset patterns
64
+ * e.g. `/[[.a-b.]]/`, `/[[=e=]]/`
65
+ - fixed ancestry of some error classes related to syntax version lookup
66
+ * `NotImplementedError`, `InvalidVersionNameError`, `UnknownSyntaxNameError`
67
+ * they now correctly inherit from `Regexp::Syntax::SyntaxError` instead of Ruby's `::SyntaxError`
68
+
69
+ ## [2.0.2] - 2020-12-25 - [Janosch Müller](mailto:janosch84@gmail.com)
70
+
71
+ ### Fixed
72
+
73
+ - fixed `FrozenError` when calling `#to_s` on a frozen `Group::Passive`
74
+ * thanks to [Daniel Gollahon](https://github.com/dgollahon)
75
+
76
+ ## [2.0.1] - 2020-12-20 - [Janosch Müller](mailto:janosch84@gmail.com)
77
+
78
+ ### Fixed
79
+
80
+ - fixed error when scanning some group names
81
+ * this affected names containing hyphens, digits or multibyte chars, e.g. `/(?<a1>a)/`
82
+ * thanks to [Daniel Gollahon](https://github.com/dgollahon) for the report
83
+ - fixed error when scanning hex escapes with just one hex digit
84
+ * e.g. `/\x0A/` was scanned correctly, but the equivalent `/\xA/` was not
85
+ * thanks to [Daniel Gollahon](https://github.com/dgollahon) for the report
86
+
87
+ ## [2.0.0] - 2020-11-25 - [Janosch Müller](mailto:janosch84@gmail.com)
88
+
89
+ ### Changed
90
+
91
+ - some methods that used to return byte-based indices now return char-based indices
92
+ * the returned values have only changed for Regexps that contain multibyte chars
93
+ * this is only a breaking change if you used such methods directly AND relied on them pointing to bytes
94
+ * affected methods:
95
+ * `Regexp::Token` `#length`, `#offset`, `#te`, `#ts`
96
+ * `Regexp::Expression::Base` `#full_length`, `#offset`, `#starts_at`, `#te`, `#ts`
97
+ * thanks to [Akinori MUSHA](https://github.com/knu) for the report
98
+ - removed some deprecated methods/signatures
99
+ * these are rarely used and have been showing deprecation warnings for a long time
100
+ * `Regexp::Expression::Subexpression.new` with 3 arguments
101
+ * `Regexp::Expression::Root.new` without a token argument
102
+ * `Regexp::Expression.parsed`
103
+
104
+ ### Added
105
+
106
+ - `Regexp::Expression::Base#base_length`
107
+ * returns the character count of an expression body, ignoring any quantifier
108
+ - pragmatic, experimental support for chained quantifiers
109
+ * e.g.: `/^a{10}{4,6}$/` matches exactly 40, 50 or 60 `a`s
110
+ * successive quantifiers used to be silently dropped by the parser
111
+ * they are now wrapped with passive groups as if they were written `(?:a{10}){4,6}`
112
+ * thanks to [calfeld](https://github.com/calfeld) for reporting this a while back
113
+
114
+ ### Fixed
115
+
116
+ - incorrect encoding output for non-ascii comments
117
+ * this led to a crash when calling `#to_s` on parse results containing such comments
118
+ * thanks to [Michael Glass](https://github.com/michaelglass) for the report
119
+ - some crashes when scanning contrived patterns such as `'\😋'`
120
+
121
+ ### [1.8.2] - 2020-10-11 - [Janosch Müller](mailto:janosch84@gmail.com)
122
+
123
+ ### Fixed
124
+
125
+ - fix `FrozenError` in `Expression::Base#repetitions` on Ruby 3.0
126
+ * thanks to [Thomas Walpole](https://github.com/twalpole)
127
+ - removed "unknown future version" warning on Ruby 3.0
128
+
129
+ ### [1.8.1] - 2020-09-28 - [Janosch Müller](mailto:janosch84@gmail.com)
130
+
131
+ ### Fixed
132
+
133
+ - fixed scanning of comment-like text in normal mode
134
+ * this was an old bug, but had become more prevalent in v1.8.0
135
+ * thanks to [Tietew](https://github.com/Tietew) for the report
136
+ - specified correct minimum Ruby version in gemspec
137
+ * it said 1.9 but really required 2.0 as of v1.8.0
138
+
139
+ ### [1.8.0] - 2020-09-20 - [Janosch Müller](mailto:janosch84@gmail.com)
140
+
141
+ ### Changed
142
+
143
+ - dropped support for running on Ruby 1.9.x
144
+
145
+ ### Added
146
+
147
+ - regexp flags can now be passed when parsing a `String` as regexp body
148
+ * see the [README](/README.md#usage) for details
149
+ * thanks to [Owen Stephens](https://github.com/owst)
150
+ - bare occurrences of `\g` and `\k` are now allowed and scanned as literal escapes
151
+ * matches Onigmo behavior
152
+ * thanks for the report to [Marc-André Lafortune](https://github.com/marcandre)
153
+
154
+ ### Fixed
155
+
156
+ - fixed parsing comments without preceding space or trailing newline in x-mode
157
+ * thanks to [Owen Stephens](https://github.com/owst)
2
158
 
3
159
  ### [1.7.1] - 2020-06-07 - [Ammar Ali](mailto:ammarabuali@gmail.com)
4
160
 
data/Gemfile CHANGED
@@ -3,7 +3,12 @@ source 'https://rubygems.org'
3
3
  gemspec
4
4
 
5
5
  group :development, :test do
6
+ gem 'ice_nine', '~> 0.11.2'
6
7
  gem 'rake', '~> 13.0'
7
8
  gem 'regexp_property_values', '~> 1.0'
8
- gem 'rspec', '~> 3.8'
9
+ gem 'rspec', '~> 3.10'
10
+ if RUBY_VERSION.to_f >= 2.7
11
+ gem 'gouteur'
12
+ gem 'rubocop', '~> 1.7'
13
+ end
9
14
  end
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2010, 2012-2015, Ammar Ali
1
+ Copyright (c) 2010, 2012-2022, Ammar Ali
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person
4
4
  obtaining a copy of this software and associated documentation
data/README.md CHANGED
@@ -1,6 +1,9 @@
1
1
  # Regexp::Parser
2
2
 
3
- [![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser) [![Build Status](https://secure.travis-ci.org/ammar/regexp_parser.svg?branch=master)](http://travis-ci.org/ammar/regexp_parser) [![Code Climate](https://codeclimate.com/github/ammar/regexp_parser.svg)](https://codeclimate.com/github/ammar/regexp_parser/badges)
3
+ [![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser)
4
+ [![Build Status](https://github.com/ammar/regexp_parser/workflows/tests/badge.svg)](https://github.com/ammar/regexp_parser/actions)
5
+ [![Build Status](https://github.com/ammar/regexp_parser/workflows/gouteur/badge.svg)](https://github.com/ammar/regexp_parser/actions)
6
+ [![Code Climate](https://codeclimate.com/github/ammar/regexp_parser.svg)](https://codeclimate.com/github/ammar/regexp_parser/badges)
4
7
 
5
8
  A Ruby gem for tokenizing, parsing, and transforming regular expressions.
6
9
 
@@ -8,8 +11,8 @@ A Ruby gem for tokenizing, parsing, and transforming regular expressions.
8
11
  * A scanner/tokenizer based on [Ragel](http://www.colm.net/open-source/ragel/)
9
12
  * A lexer that produces a "stream" of token objects.
10
13
  * A parser that produces a "tree" of Expression objects (OO API)
11
- * Runs on Ruby 1.9, 2.x, and JRuby (1.9 mode) runtimes.
12
- * Recognizes Ruby 1.8, 1.9, and 2.x regular expressions [See Supported Syntax](#supported-syntax)
14
+ * Runs on Ruby 2.x, 3.x and JRuby runtimes
15
+ * Recognizes Ruby 1.8, 1.9, 2.x and 3.x regular expressions [See Supported Syntax](#supported-syntax)
13
16
 
14
17
 
15
18
  _For examples of regexp_parser in use, see [Example Projects](#example-projects)._
@@ -18,13 +21,10 @@ _For examples of regexp_parser in use, see [Example Projects](#example-projects)
18
21
  ---
19
22
  ## Requirements
20
23
 
21
- * Ruby >= 1.9
24
+ * Ruby >= 2.0
22
25
  * Ragel >= 6.0, but only if you want to build the gem or work on the scanner.
23
26
 
24
27
 
25
- _Note: See the .travis.yml file for covered versions._
26
-
27
-
28
28
  ---
29
29
  ## Install
30
30
 
@@ -72,6 +72,17 @@ called with the results as follows:
72
72
  * **Parser**: after completion, the block gets passed the root expression.
73
73
  _The result of the block is returned._
74
74
 
75
+ All three methods accept either a `Regexp` or `String` (containing the pattern)
76
+ - if a String is passed, `options` can be supplied:
77
+
78
+ ```ruby
79
+ require 'regexp_parser'
80
+
81
+ Regexp::Parser.parse(
82
+ "a+ # Recognises a and A...",
83
+ options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE
84
+ )
85
+ ```
75
86
 
76
87
  ---
77
88
  ## Components
@@ -306,7 +317,7 @@ Expression class. See the next section for details._
306
317
 
307
318
  ## Supported Syntax
308
319
  The three modules support all the regular expression syntax features of Ruby 1.8,
309
- 1.9, and 2.x:
320
+ 1.9, 2.x and 3.x:
310
321
 
311
322
  _Note that not all of these are available in all versions of Ruby_
312
323
 
@@ -349,12 +360,12 @@ _Note that not all of these are available in all versions of Ruby_
349
360
  | &emsp;&nbsp;_**Reluctant** (Lazy)_ | `??`, `*?`, `+?`, `{m,M}?` | &#x2713; |
350
361
  | &emsp;&nbsp;_**Possessive**_ | `?+`, `*+`, `++`, `{m,M}+` | &#x2713; |
351
362
  | **String Escapes** | | &#x22f1; |
352
- | &emsp;&nbsp;_**Control**_ | `\C-C`, `\cD` | &#x2713; |
363
+ | &emsp;&nbsp;_**Control** \[1\]_ | `\C-C`, `\cD` | &#x2713; |
353
364
  | &emsp;&nbsp;_**Hex**_ | `\x20`, `\x{701230}` | &#x2713; |
354
- | &emsp;&nbsp;_**Meta**_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | &#x2713; |
365
+ | &emsp;&nbsp;_**Meta** \[1\]_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | &#x2713; |
355
366
  | &emsp;&nbsp;_**Octal**_ | `\0`, `\01`, `\012` | &#x2713; |
356
367
  | &emsp;&nbsp;_**Unicode**_ | `\uHHHH`, `\u{H+ H+}` | &#x2713; |
357
- | **Unicode Properties** | _<sub>([Unicode 11.0.0](http://www.unicode.org/versions/Unicode11.0.0/))</sub>_ | &#x22f1; |
368
+ | **Unicode Properties** | _<sub>([Unicode 13.0.0](https://www.unicode.org/versions/Unicode13.0.0/))</sub>_ | &#x22f1; |
358
369
  | &emsp;&nbsp;_**Age**_ | `\p{Age=5.2}`, `\P{age=7.0}`, `\p{^age=8.0}` | &#x2713; |
359
370
  | &emsp;&nbsp;_**Blocks**_ | `\p{InArmenian}`, `\P{InKhmer}`, `\p{^InThai}` | &#x2713; |
360
371
  | &emsp;&nbsp;_**Classes**_ | `\p{Alpha}`, `\P{Space}`, `\p{^Alnum}` | &#x2713; |
@@ -363,6 +374,10 @@ _Note that not all of these are available in all versions of Ruby_
363
374
  | &emsp;&nbsp;_**Scripts**_ | `\p{Arabic}`, `\P{Hiragana}`, `\p{^Greek}` | &#x2713; |
364
375
  | &emsp;&nbsp;_**Simple**_ | `\p{Dash}`, `\p{Extender}`, `\p{^Hyphen}` | &#x2713; |
365
376
 
377
+ **\[1\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex escapes when used in Regexp literals](
378
+ https://github.com/ruby/ruby/commit/11ae581a4a7f5d5f5ec6378872eab8f25381b1b9 ), so they will only reach the
379
+ scanner and will only be emitted if a String or a Regexp that has been built with the `::new` constructor is scanned.
380
+
366
381
  ##### Inapplicable Features
367
382
 
368
383
  Some modifiers, like `o` and `s`, apply to the **Regexp** object itself and do not
@@ -376,7 +391,6 @@ expressions library (Onigmo). They are not supported by the scanner.
376
391
  - **Quotes**: `\Q...\E` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L499)_
377
392
  - **Capture History**: `(?@...)`, `(?@<name>...)` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L550)_
378
393
 
379
-
380
394
  See something missing? Please submit an [issue](https://github.com/ammar/regexp_parser/issues)
381
395
 
382
396
  _**Note**: Attempting to process expressions with unsupported syntax features can raise an error,
@@ -384,26 +398,14 @@ or incorrectly return tokens/objects as literals._
384
398
 
385
399
 
386
400
  ## Testing
387
- To run the tests simply run rake from the root directory, as 'test' is the default task.
401
+ To run the tests simply run rake from the root directory.
388
402
 
389
- It generates the scanner's code from the Ragel source files and runs all the tests, thus it requires Ragel to be installed.
403
+ The default task generates the scanner's code from the Ragel source files and runs all the specs, thus it requires Ragel to be installed.
390
404
 
391
- The tests use RSpec. They can also be run with the test runner that whitelists some warnings:
405
+ Note that changes to Ragel files will not be reflected when running `rspec` on its own, so to run individual tests you might want to run:
392
406
 
393
407
  ```
394
- bin/test
395
- ```
396
-
397
- You can run a specific test like so:
398
-
399
- ```
400
- bin/test spec/scanner/properties_spec.rb
401
- ```
402
-
403
- Note that changes to Ragel files will not be reflected when running `rspec` or `bin/test`, so you might want to run:
404
-
405
- ```
406
- rake ragel:rb && bin/test spec/scanner/properties_spec.rb
408
+ rake ragel:rb && rspec spec/scanner/properties_spec.rb
407
409
  ```
408
410
 
409
411
  ## Building
@@ -429,13 +431,17 @@ rake install
429
431
  ## Example Projects
430
432
  Projects using regexp_parser.
431
433
 
434
+ - [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool that uses regexp_parser to convert Regexps to css/xpath selectors.
435
+
436
+ - [js_regex](https://github.com/janosch-x/js_regex) converts Ruby regular expressions to JavaScript-compatible regular expressions.
437
+
432
438
  - [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor with alias support.
433
439
 
434
- - [mutant](https://github.com/mbj/mutant) (before v0.9.0) manipulates your regular expressions (amongst others) to see if your tests cover their behavior.
440
+ - [mutant](https://github.com/mbj/mutant) manipulates your regular expressions (amongst others) to see if your tests cover their behavior.
435
441
 
436
- - [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) uses regexp_parser to generate examples of postal codes.
442
+ - [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that uses regexp_parser to lint Regexps.
437
443
 
438
- - [js_regex](https://github.com/janosch-x/js_regex) converts Ruby regular expressions to JavaScript-compatible regular expressions.
444
+ - [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) is a localization helper that uses regexp_parser to generate examples of postal codes.
439
445
 
440
446
 
441
447
  ## References
@@ -464,4 +470,4 @@ Documentation and books used while working on this project.
464
470
 
465
471
  ---
466
472
  ##### Copyright
467
- _Copyright (c) 2010-2019 Ammar Ali. See LICENSE file for details._
473
+ _Copyright (c) 2010-2022 Ammar Ali. See LICENSE file for details._
data/Rakefile CHANGED
@@ -1,35 +1,31 @@
1
+ require 'bundler'
1
2
  require 'rubygems'
2
-
3
+ require 'rubygems/package_task'
3
4
  require 'rake'
4
5
  require 'rake/testtask'
6
+ require 'rspec/core/rake_task'
5
7
 
6
- require 'bundler'
7
- require 'rubygems/package_task'
8
-
9
-
10
- RAGEL_SOURCE_DIR = File.expand_path '../lib/regexp_parser/scanner', __FILE__
11
- RAGEL_OUTPUT_DIR = File.expand_path '../lib/regexp_parser', __FILE__
12
- RAGEL_SOURCE_FILES = %w{scanner} # scanner.rl includes property.rl
13
-
8
+ RAGEL_SOURCE_DIR = File.join(__dir__, 'lib/regexp_parser/scanner')
9
+ RAGEL_OUTPUT_DIR = File.join(__dir__, 'lib/regexp_parser')
10
+ RAGEL_SOURCE_FILES = %w[scanner] # scanner.rl imports the other files
14
11
 
15
12
  Bundler::GemHelper.install_tasks
16
13
 
14
+ RSpec::Core::RakeTask.new(:spec)
17
15
 
18
16
  task :default => [:'test:full']
19
17
 
20
18
  namespace :test do
21
- task full: :'ragel:rb' do
22
- sh 'bin/test'
23
- end
19
+ task full: [:'ragel:rb', :spec]
24
20
  end
25
21
 
26
22
  namespace :ragel do
27
23
  desc "Process the ragel source files and output ruby code"
28
- task :rb do |t|
29
- RAGEL_SOURCE_FILES.each do |file|
30
- output_file = "#{RAGEL_OUTPUT_DIR}/#{file}.rb"
24
+ task :rb do
25
+ RAGEL_SOURCE_FILES.each do |source_file|
26
+ output_file = "#{RAGEL_OUTPUT_DIR}/#{source_file}.rb"
31
27
  # using faster flat table driven FSM, about 25% larger code, but about 30% faster
32
- sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{file}.rl -o #{output_file}"
28
+ sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{source_file}.rl -o #{output_file}"
33
29
 
34
30
  contents = File.read(output_file)
35
31
 
@@ -42,34 +38,29 @@ namespace :ragel do
42
38
  end
43
39
 
44
40
  desc "Delete the ragel generated source file(s)"
45
- task :clean do |t|
41
+ task :clean do
46
42
  RAGEL_SOURCE_FILES.each do |file|
47
43
  sh "rm -f #{RAGEL_OUTPUT_DIR}/#{file}.rb"
48
44
  end
49
45
  end
50
46
  end
51
47
 
52
-
53
48
  # Add ragel task as a prerequisite for building the gem to ensure that the
54
49
  # latest scanner code is generated and included in the build.
55
50
  desc "Runs ragel:rb before building the gem"
56
51
  task :build => ['ragel:rb']
57
52
 
58
-
59
53
  namespace :props do
60
54
  desc 'Write new property value hashes for the properties scanner'
61
55
  task :update do
62
56
  require 'regexp_property_values'
63
57
  RegexpPropertyValues.update
64
- dir = File.expand_path('../lib/regexp_parser/scanner/properties', __FILE__)
58
+ dir = File.join(__dir__, 'lib/regexp_parser/scanner/properties')
65
59
 
66
- require 'psych'
67
60
  write_hash_to_file = ->(hash, path) do
68
61
  File.open(path, 'w') do |f|
69
- f.puts '#',
70
- "# THIS FILE IS AUTO-GENERATED BY `rake props:update`, DO NOT EDIT",
71
- '#',
72
- hash.sort.to_h.to_yaml
62
+ f.puts "# THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT",
63
+ *hash.sort.map { |pair| pair.join(',') }
73
64
  end
74
65
  puts "Wrote #{hash.count} aliases to `#{path}`"
75
66
  end
@@ -77,11 +68,11 @@ namespace :props do
77
68
  long_names_to_tokens = RegexpPropertyValues.all.map do |val|
78
69
  [val.identifier, val.full_name.downcase]
79
70
  end
80
- write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.yml")
71
+ write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.csv")
81
72
 
82
73
  short_names_to_tokens = RegexpPropertyValues.alias_hash.map do |k, v|
83
74
  [k.identifier, v.full_name.downcase]
84
75
  end
85
- write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.yml")
76
+ write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.csv")
86
77
  end
87
78
  end
@@ -0,0 +1,4 @@
1
+ class Regexp::Parser
2
+ # base class for all gem-specific errors (rescuable as a catch-all; also raised directly in a few places)
3
+ class Error < StandardError; end
4
+ end
@@ -0,0 +1,123 @@
1
+ module Regexp::Expression
2
+ class Base
3
+ attr_accessor :type, :token
4
+ attr_accessor :text, :ts
5
+ attr_accessor :level, :set_level, :conditional_level, :nesting_level
6
+
7
+ attr_accessor :quantifier
8
+ attr_accessor :options
9
+
10
+ def initialize(token, options = {})
11
+ self.type = token.type
12
+ self.token = token.token
13
+ self.text = token.text
14
+ self.ts = token.ts
15
+ self.level = token.level
16
+ self.set_level = token.set_level
17
+ self.conditional_level = token.conditional_level
18
+ self.nesting_level = 0
19
+ self.quantifier = nil
20
+ self.options = options
21
+ end
22
+
23
+ def initialize_copy(orig)
24
+ self.text = (orig.text ? orig.text.dup : nil)
25
+ self.options = (orig.options ? orig.options.dup : nil)
26
+ self.quantifier = (orig.quantifier ? orig.quantifier.clone : nil)
27
+ super
28
+ end
29
+
30
+ def to_re(format = :full)
31
+ ::Regexp.new(to_s(format))
32
+ end
33
+
34
+ alias :starts_at :ts
35
+
36
+ def base_length
37
+ to_s(:base).length
38
+ end
39
+
40
+ def full_length
41
+ to_s.length
42
+ end
43
+
44
+ def offset
45
+ [starts_at, full_length]
46
+ end
47
+
48
+ def coded_offset
49
+ '@%d+%d' % offset
50
+ end
51
+
52
+ def to_s(format = :full)
53
+ "#{text}#{quantifier_affix(format)}"
54
+ end
55
+
56
+ def quantifier_affix(expression_format)
57
+ quantifier.to_s if quantified? && expression_format != :base
58
+ end
59
+
60
+ def terminal?
61
+ !respond_to?(:expressions)
62
+ end
63
+
64
+ def quantify(token, text, min = nil, max = nil, mode = :greedy)
65
+ self.quantifier = Quantifier.new(token, text, min, max, mode)
66
+ end
67
+
68
+ def unquantified_clone
69
+ clone.tap { |exp| exp.quantifier = nil }
70
+ end
71
+
72
+ def quantified?
73
+ !quantifier.nil?
74
+ end
75
+
76
+ # Deprecated. Prefer `#repetitions` which has a more uniform interface.
77
+ def quantity
78
+ return [nil,nil] unless quantified?
79
+ [quantifier.min, quantifier.max]
80
+ end
81
+
82
+ def repetitions
83
+ return 1..1 unless quantified?
84
+ min = quantifier.min
85
+ max = quantifier.max < 0 ? Float::INFINITY : quantifier.max
86
+ range = min..max
87
+ # fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807
88
+ if RUBY_VERSION.to_f < 2.7
89
+ range.define_singleton_method(:minmax) { [min, max] }
90
+ end
91
+ range
92
+ end
93
+
94
+ def greedy?
95
+ quantified? and quantifier.greedy?
96
+ end
97
+
98
+ def reluctant?
99
+ quantified? and quantifier.reluctant?
100
+ end
101
+ alias :lazy? :reluctant?
102
+
103
+ def possessive?
104
+ quantified? and quantifier.possessive?
105
+ end
106
+
107
+ def attributes
108
+ {
109
+ type: type,
110
+ token: token,
111
+ text: to_s(:base),
112
+ starts_at: ts,
113
+ length: full_length,
114
+ level: level,
115
+ set_level: set_level,
116
+ conditional_level: conditional_level,
117
+ options: options,
118
+ quantifier: quantified? ? quantifier.to_h : nil,
119
+ }
120
+ end
121
+ alias :to_h :attributes
122
+ end
123
+ end
@@ -1,5 +1,4 @@
1
1
  module Regexp::Expression
2
-
3
2
  module Anchor
4
3
  class Base < Regexp::Expression::Base; end
5
4
 
@@ -22,5 +21,4 @@ module Regexp::Expression
22
21
  EOS = EndOfString
23
22
  EOSobEOL = EndOfStringOrBeforeEndOfLine
24
23
  end
25
-
26
24
  end
@@ -2,6 +2,11 @@ module Regexp::Expression
2
2
  module Backreference
3
3
  class Base < Regexp::Expression::Base
4
4
  attr_accessor :referenced_expression
5
+
6
+ def initialize_copy(orig)
7
+ self.referenced_expression = orig.referenced_expression.dup
8
+ super
9
+ end
5
10
  end
6
11
 
7
12
  class Number < Backreference::Base
@@ -7,7 +7,8 @@ module Regexp::Expression
7
7
  alias :ts :starts_at
8
8
 
9
9
  def <<(exp)
10
- complete? && raise("Can't add more than 2 expressions to a Range")
10
+ complete? and raise Regexp::Parser::Error,
11
+ "Can't add more than 2 expressions to a Range"
11
12
  super
12
13
  end
13
14
 
@@ -1,6 +1,6 @@
1
1
  module Regexp::Expression
2
2
  module Conditional
3
- class TooManyBranches < StandardError
3
+ class TooManyBranches < Regexp::Parser::Error
4
4
  def initialize
5
5
  super('The conditional expression has more than 2 branches')
6
6
  end
@@ -15,6 +15,11 @@ module Regexp::Expression
15
15
  ref = text.tr("'<>()", "")
16
16
  ref =~ /\D/ ? ref : Integer(ref)
17
17
  end
18
+
19
+ def initialize_copy(orig)
20
+ self.referenced_expression = orig.referenced_expression.dup
21
+ super
22
+ end
18
23
  end
19
24
 
20
25
  class Branch < Regexp::Expression::Sequence; end
@@ -53,6 +58,11 @@ module Regexp::Expression
53
58
  def to_s(format = :full)
54
59
  "#{text}#{condition}#{branches.join('|')})#{quantifier_affix(format)}"
55
60
  end
61
+
62
+ def initialize_copy(orig)
63
+ self.referenced_expression = orig.referenced_expression.dup
64
+ super
65
+ end
56
66
  end
57
67
  end
58
68
  end