regexp_parser 1.7.1 → 2.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (139) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +157 -1
  3. data/Gemfile +6 -1
  4. data/LICENSE +1 -1
  5. data/README.md +38 -32
  6. data/Rakefile +18 -27
  7. data/lib/regexp_parser/error.rb +4 -0
  8. data/lib/regexp_parser/expression/base.rb +123 -0
  9. data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
  10. data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +5 -0
  11. data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
  12. data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -1
  13. data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +0 -0
  14. data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
  15. data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +13 -7
  16. data/lib/regexp_parser/expression/classes/free_space.rb +2 -4
  17. data/lib/regexp_parser/expression/classes/group.rb +28 -3
  18. data/lib/regexp_parser/expression/classes/literal.rb +1 -5
  19. data/lib/regexp_parser/expression/classes/property.rb +1 -3
  20. data/lib/regexp_parser/expression/classes/root.rb +4 -17
  21. data/lib/regexp_parser/expression/classes/type.rb +0 -2
  22. data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
  23. data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
  24. data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
  25. data/lib/regexp_parser/expression/quantifier.rb +11 -2
  26. data/lib/regexp_parser/expression/sequence.rb +3 -20
  27. data/lib/regexp_parser/expression/subexpression.rb +1 -2
  28. data/lib/regexp_parser/expression.rb +7 -139
  29. data/lib/regexp_parser/lexer.rb +13 -11
  30. data/lib/regexp_parser/parser.rb +325 -344
  31. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  32. data/lib/regexp_parser/scanner/properties/long.csv +604 -0
  33. data/lib/regexp_parser/scanner/properties/short.csv +242 -0
  34. data/lib/regexp_parser/scanner/property.rl +2 -2
  35. data/lib/regexp_parser/scanner/scanner.rl +235 -255
  36. data/lib/regexp_parser/scanner.rb +1324 -1387
  37. data/lib/regexp_parser/syntax/any.rb +4 -6
  38. data/lib/regexp_parser/syntax/base.rb +13 -15
  39. data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
  40. data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
  41. data/lib/regexp_parser/syntax/token/backreference.rb +30 -0
  42. data/lib/regexp_parser/syntax/{tokens → token}/character_set.rb +2 -2
  43. data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
  44. data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
  45. data/lib/regexp_parser/syntax/token/escape.rb +31 -0
  46. data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
  47. data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
  48. data/lib/regexp_parser/syntax/{tokens → token}/meta.rb +2 -2
  49. data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
  50. data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
  51. data/lib/regexp_parser/syntax/token/unicode_property.rb +696 -0
  52. data/lib/regexp_parser/syntax/token.rb +45 -0
  53. data/lib/regexp_parser/syntax/version_lookup.rb +4 -4
  54. data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -2
  55. data/lib/regexp_parser/syntax/versions/1.9.1.rb +1 -1
  56. data/lib/regexp_parser/syntax/versions/3.1.0.rb +10 -0
  57. data/lib/regexp_parser/syntax.rb +8 -6
  58. data/lib/regexp_parser/token.rb +9 -20
  59. data/lib/regexp_parser/version.rb +1 -1
  60. data/lib/regexp_parser.rb +0 -2
  61. data/regexp_parser.gemspec +20 -22
  62. metadata +34 -165
  63. data/lib/regexp_parser/scanner/properties/long.yml +0 -594
  64. data/lib/regexp_parser/scanner/properties/short.yml +0 -237
  65. data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
  66. data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
  67. data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
  68. data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
  69. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
  70. data/lib/regexp_parser/syntax/tokens.rb +0 -45
  71. data/spec/expression/base_spec.rb +0 -94
  72. data/spec/expression/clone_spec.rb +0 -120
  73. data/spec/expression/conditional_spec.rb +0 -89
  74. data/spec/expression/free_space_spec.rb +0 -27
  75. data/spec/expression/methods/match_length_spec.rb +0 -161
  76. data/spec/expression/methods/match_spec.rb +0 -25
  77. data/spec/expression/methods/strfregexp_spec.rb +0 -224
  78. data/spec/expression/methods/tests_spec.rb +0 -99
  79. data/spec/expression/methods/traverse_spec.rb +0 -161
  80. data/spec/expression/options_spec.rb +0 -128
  81. data/spec/expression/root_spec.rb +0 -9
  82. data/spec/expression/sequence_spec.rb +0 -9
  83. data/spec/expression/subexpression_spec.rb +0 -50
  84. data/spec/expression/to_h_spec.rb +0 -26
  85. data/spec/expression/to_s_spec.rb +0 -100
  86. data/spec/lexer/all_spec.rb +0 -22
  87. data/spec/lexer/conditionals_spec.rb +0 -53
  88. data/spec/lexer/delimiters_spec.rb +0 -68
  89. data/spec/lexer/escapes_spec.rb +0 -14
  90. data/spec/lexer/keep_spec.rb +0 -10
  91. data/spec/lexer/literals_spec.rb +0 -89
  92. data/spec/lexer/nesting_spec.rb +0 -99
  93. data/spec/lexer/refcalls_spec.rb +0 -55
  94. data/spec/parser/all_spec.rb +0 -43
  95. data/spec/parser/alternation_spec.rb +0 -88
  96. data/spec/parser/anchors_spec.rb +0 -17
  97. data/spec/parser/conditionals_spec.rb +0 -179
  98. data/spec/parser/errors_spec.rb +0 -30
  99. data/spec/parser/escapes_spec.rb +0 -121
  100. data/spec/parser/free_space_spec.rb +0 -130
  101. data/spec/parser/groups_spec.rb +0 -108
  102. data/spec/parser/keep_spec.rb +0 -6
  103. data/spec/parser/posix_classes_spec.rb +0 -8
  104. data/spec/parser/properties_spec.rb +0 -115
  105. data/spec/parser/quantifiers_spec.rb +0 -52
  106. data/spec/parser/refcalls_spec.rb +0 -112
  107. data/spec/parser/set/intersections_spec.rb +0 -127
  108. data/spec/parser/set/ranges_spec.rb +0 -111
  109. data/spec/parser/sets_spec.rb +0 -178
  110. data/spec/parser/types_spec.rb +0 -18
  111. data/spec/scanner/all_spec.rb +0 -18
  112. data/spec/scanner/anchors_spec.rb +0 -21
  113. data/spec/scanner/conditionals_spec.rb +0 -128
  114. data/spec/scanner/delimiters_spec.rb +0 -52
  115. data/spec/scanner/errors_spec.rb +0 -67
  116. data/spec/scanner/escapes_spec.rb +0 -53
  117. data/spec/scanner/free_space_spec.rb +0 -133
  118. data/spec/scanner/groups_spec.rb +0 -52
  119. data/spec/scanner/keep_spec.rb +0 -10
  120. data/spec/scanner/literals_spec.rb +0 -49
  121. data/spec/scanner/meta_spec.rb +0 -18
  122. data/spec/scanner/properties_spec.rb +0 -64
  123. data/spec/scanner/quantifiers_spec.rb +0 -20
  124. data/spec/scanner/refcalls_spec.rb +0 -36
  125. data/spec/scanner/sets_spec.rb +0 -102
  126. data/spec/scanner/types_spec.rb +0 -14
  127. data/spec/spec_helper.rb +0 -15
  128. data/spec/support/runner.rb +0 -42
  129. data/spec/support/shared_examples.rb +0 -77
  130. data/spec/support/warning_extractor.rb +0 -60
  131. data/spec/syntax/syntax_spec.rb +0 -48
  132. data/spec/syntax/syntax_token_map_spec.rb +0 -23
  133. data/spec/syntax/versions/1.8.6_spec.rb +0 -17
  134. data/spec/syntax/versions/1.9.1_spec.rb +0 -10
  135. data/spec/syntax/versions/1.9.3_spec.rb +0 -9
  136. data/spec/syntax/versions/2.0.0_spec.rb +0 -13
  137. data/spec/syntax/versions/2.2.0_spec.rb +0 -9
  138. data/spec/syntax/versions/aliases_spec.rb +0 -37
  139. data/spec/token/token_spec.rb +0 -85
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dd872b22bf04a288790ef0f73df9041f14fb88a08c2a03852d9dbbc238b452d6
4
- data.tar.gz: 4641097a24b5fa0f7b0c8e5aacc152587fe8b15d30f3f78bbec8157887b8b897
3
+ metadata.gz: 381a794200168f95ff6329cc8a01330d21a05e02b75e0b06dcc6bd8f763c111d
4
+ data.tar.gz: bd7617cb3763e6d759c8e1364aed037ae2fff85af3cf28823476cadd14ff080e
5
5
  SHA512:
6
- metadata.gz: 858570df4a7047a2d8b09555b56de28a66ca4f8022e596c249900f5312f8e7fb9376384ca816bc3c08f3e324930702ad410a28b5be680adea6867e1f8075441e
7
- data.tar.gz: 0d70e7b4f18739826bb334fb305e335e44a354ae302214ca3c1884f66ace8680e48a9e4c64b890b220b82056da761084413c8b9b8c5e363382f5cf165b3d3448
6
+ metadata.gz: 0a039012013e9b57329fd685aaf29386d8b848071e514f59df0acc3437a1dae5c76b6bf94158cc3deece08f3a1fec9437ac84590d97f8590d8dcee1e0dc6c726
7
+ data.tar.gz: 4d67da41fbef9b9336ccfd02e3a742286bf4ef96d469c8aa2bbb9a6a55ed4aa6027a28b10ba6c9993b15937e3fe51a349632bcf5808f6237cf77a1d29ceb74f2
data/CHANGELOG.md CHANGED
@@ -1,4 +1,160 @@
1
- ## [Unreleased]
1
+ ## [2.2.1] - 2022-02-11 - [Janosch Müller](mailto:janosch84@gmail.com)
2
+
3
+ ### Fixed
4
+
5
+ - fixed Syntax version of absence groups (`(?~...)`)
6
+ - the lexer accepted them for any Ruby version
7
+ - now they are only recognized for Ruby >= 2.4.1 in which they were introduced
8
+ - reduced gem size by excluding specs from package
9
+ - removed deprecated `test_files` gemspec setting
10
+ - no longer depend on `yaml`/`psych` (except for Ruby <= 2.4)
11
+ - no longer depend on `set`
12
+ - `set` was removed from the stdlib and made a standalone gem as of Ruby 3
13
+ - this made it a hidden/undeclared dependency of `regexp_parser`
14
+
15
+ ## [2.2.0] - 2021-12-04 - [Janosch Müller](mailto:janosch84@gmail.com)
16
+
17
+ ### Added
18
+
19
+ - added support for 13 new unicode properties introduced in Ruby 3.1.0
20
+
21
+ ## [2.1.1] - 2021-02-23 - [Janosch Müller](mailto:janosch84@gmail.com)
22
+
23
+ ### Fixed
24
+
25
+ - fixed `NameError` when requiring only `'regexp_parser/scanner'` in v2.1.0
26
+ * thanks to [Jared White and Sam Ruby](https://github.com/ruby2js/ruby2js) for the report
27
+
28
+ ## [2.1.0] - 2021-02-22 - [Janosch Müller](mailto:janosch84@gmail.com)
29
+
30
+ ### Added
31
+
32
+ - common ancestor for all scanning/parsing/lexing errors
33
+ * `Regexp::Parser::Error` can now be rescued as a catch-all
34
+ * the following errors (and their many descendants) now inherit from it:
35
+ - `Regexp::Expression::Conditional::TooManyBranches`
36
+ - `Regexp::Parser::ParserError`
37
+ - `Regexp::Scanner::ScannerError`
38
+ - `Regexp::Scanner::ValidationError`
39
+ - `Regexp::Syntax::SyntaxError`
40
+ * it replaces `ArgumentError` in some rare cases (`Regexp::Parser.parse('?')`)
41
+ * thanks to [sandstrom](https://github.com/sandstrom) for the cue
42
+
43
+ ### Fixed
44
+
45
+ - fixed scanning of whole-pattern recursion calls `\g<0>` and `\g'0'`
46
+ * a regression in v2.0.1 had caused them to be scanned as literals
47
+ - fixed scanning of some backreference and subexpression call edge cases
48
+ * e.g. `\k<+1>`, `\g<x-1>`
49
+ - fixed tokenization of some escapes in character sets
50
+ * `.`, `|`, `{`, `}`, `(`, `)`, `^`, `$`, `?`, `+`, `*`
51
+ * all of these correctly emitted `#type` `:literal` and `#token` `:literal` if *not* escaped
52
+ * if escaped, they emitted e.g. `#type` `:escape` and `#token` `:group_open` for `[\(]`
53
+ * the escaped versions now correctly emit `#type` `:escape` and `#token` `:literal`
54
+ - fixed handling of control/metacontrol escapes in character sets
55
+ * e.g. `[\cX]`, `[\M-\C-X]`
56
+ * they were misread as a bunch of individual literals, escapes, and ranges
57
+ - fixed some cases where calling `#dup`/`#clone` on expressions led to shared state
58
+
59
+ ## [2.0.3] - 2020-12-28 - [Janosch Müller](mailto:janosch84@gmail.com)
60
+
61
+ ### Fixed
62
+
63
+ - fixed error when scanning some unlikely and redundant but valid charset patterns
64
+ * e.g. `/[[.a-b.]]/`, `/[[=e=]]/`
65
+ - fixed ancestry of some error classes related to syntax version lookup
66
+ * `NotImplementedError`, `InvalidVersionNameError`, `UnknownSyntaxNameError`
67
+ * they now correctly inherit from `Regexp::Syntax::SyntaxError` instead of Ruby's `::SyntaxError`
68
+
69
+ ## [2.0.2] - 2020-12-25 - [Janosch Müller](mailto:janosch84@gmail.com)
70
+
71
+ ### Fixed
72
+
73
+ - fixed `FrozenError` when calling `#to_s` on a frozen `Group::Passive`
74
+ * thanks to [Daniel Gollahon](https://github.com/dgollahon)
75
+
76
+ ## [2.0.1] - 2020-12-20 - [Janosch Müller](mailto:janosch84@gmail.com)
77
+
78
+ ### Fixed
79
+
80
+ - fixed error when scanning some group names
81
+ * this affected names containing hyphens, digits or multibyte chars, e.g. `/(?<a1>a)/`
82
+ * thanks to [Daniel Gollahon](https://github.com/dgollahon) for the report
83
+ - fixed error when scanning hex escapes with just one hex digit
84
+ * e.g. `/\x0A/` was scanned correctly, but the equivalent `/\xA/` was not
85
+ * thanks to [Daniel Gollahon](https://github.com/dgollahon) for the report
86
+
87
+ ## [2.0.0] - 2020-11-25 - [Janosch Müller](mailto:janosch84@gmail.com)
88
+
89
+ ### Changed
90
+
91
+ - some methods that used to return byte-based indices now return char-based indices
92
+ * the returned values have only changed for Regexps that contain multibyte chars
93
+ * this is only a breaking change if you used such methods directly AND relied on them pointing to bytes
94
+ * affected methods:
95
+ * `Regexp::Token` `#length`, `#offset`, `#te`, `#ts`
96
+ * `Regexp::Expression::Base` `#full_length`, `#offset`, `#starts_at`, `#te`, `#ts`
97
+ * thanks to [Akinori MUSHA](https://github.com/knu) for the report
98
+ - removed some deprecated methods/signatures
99
+ * these are rarely used and have been showing deprecation warnings for a long time
100
+ * `Regexp::Expression::Subexpression.new` with 3 arguments
101
+ * `Regexp::Expression::Root.new` without a token argument
102
+ * `Regexp::Expression.parsed`
103
+
104
+ ### Added
105
+
106
+ - `Regexp::Expression::Base#base_length`
107
+ * returns the character count of an expression body, ignoring any quantifier
108
+ - pragmatic, experimental support for chained quantifiers
109
+ * e.g.: `/^a{10}{4,6}$/` matches exactly 40, 50 or 60 `a`s
110
+ * successive quantifiers used to be silently dropped by the parser
111
+ * they are now wrapped with passive groups as if they were written `(?:a{10}){4,6}`
112
+ * thanks to [calfeld](https://github.com/calfeld) for reporting this a while back
113
+
114
+ ### Fixed
115
+
116
+ - incorrect encoding output for non-ascii comments
117
+ * this led to a crash when calling `#to_s` on parse results containing such comments
118
+ * thanks to [Michael Glass](https://github.com/michaelglass) for the report
119
+ - some crashes when scanning contrived patterns such as `'\😋'`
120
+
121
+ ### [1.8.2] - 2020-10-11 - [Janosch Müller](mailto:janosch84@gmail.com)
122
+
123
+ ### Fixed
124
+
125
+ - fix `FrozenError` in `Expression::Base#repetitions` on Ruby 3.0
126
+ * thanks to [Thomas Walpole](https://github.com/twalpole)
127
+ - removed "unknown future version" warning on Ruby 3.0
128
+
129
+ ### [1.8.1] - 2020-09-28 - [Janosch Müller](mailto:janosch84@gmail.com)
130
+
131
+ ### Fixed
132
+
133
+ - fixed scanning of comment-like text in normal mode
134
+ * this was an old bug, but had become more prevalent in v1.8.0
135
+ * thanks to [Tietew](https://github.com/Tietew) for the report
136
+ - specified correct minimum Ruby version in gemspec
137
+ * it said 1.9 but really required 2.0 as of v1.8.0
138
+
139
+ ### [1.8.0] - 2020-09-20 - [Janosch Müller](mailto:janosch84@gmail.com)
140
+
141
+ ### Changed
142
+
143
+ - dropped support for running on Ruby 1.9.x
144
+
145
+ ### Added
146
+
147
+ - regexp flags can now be passed when parsing a `String` as regexp body
148
+ * see the [README](/README.md#usage) for details
149
+ * thanks to [Owen Stephens](https://github.com/owst)
150
+ - bare occurrences of `\g` and `\k` are now allowed and scanned as literal escapes
151
+ * matches Onigmo behavior
152
+ * thanks to [Marc-André Lafortune](https://github.com/marcandre) for the report
153
+
154
+ ### Fixed
155
+
156
+ - fixed parsing comments without preceding space or trailing newline in x-mode
157
+ * thanks to [Owen Stephens](https://github.com/owst)
2
158
 
3
159
  ### [1.7.1] - 2020-06-07 - [Ammar Ali](mailto:ammarabuali@gmail.com)
4
160
 
data/Gemfile CHANGED
@@ -3,7 +3,12 @@ source 'https://rubygems.org'
3
3
  gemspec
4
4
 
5
5
  group :development, :test do
6
+ gem 'ice_nine', '~> 0.11.2'
6
7
  gem 'rake', '~> 13.0'
7
8
  gem 'regexp_property_values', '~> 1.0'
8
- gem 'rspec', '~> 3.8'
9
+ gem 'rspec', '~> 3.10'
10
+ if RUBY_VERSION.to_f >= 2.7
11
+ gem 'gouteur'
12
+ gem 'rubocop', '~> 1.7'
13
+ end
9
14
  end
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2010, 2012-2015, Ammar Ali
1
+ Copyright (c) 2010, 2012-2022, Ammar Ali
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person
4
4
  obtaining a copy of this software and associated documentation
data/README.md CHANGED
@@ -1,6 +1,9 @@
1
1
  # Regexp::Parser
2
2
 
3
- [![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser) [![Build Status](https://secure.travis-ci.org/ammar/regexp_parser.svg?branch=master)](http://travis-ci.org/ammar/regexp_parser) [![Code Climate](https://codeclimate.com/github/ammar/regexp_parser.svg)](https://codeclimate.com/github/ammar/regexp_parser/badges)
3
+ [![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser)
4
+ [![Build Status](https://github.com/ammar/regexp_parser/workflows/tests/badge.svg)](https://github.com/ammar/regexp_parser/actions)
5
+ [![Build Status](https://github.com/ammar/regexp_parser/workflows/gouteur/badge.svg)](https://github.com/ammar/regexp_parser/actions)
6
+ [![Code Climate](https://codeclimate.com/github/ammar/regexp_parser.svg)](https://codeclimate.com/github/ammar/regexp_parser/badges)
4
7
 
5
8
  A Ruby gem for tokenizing, parsing, and transforming regular expressions.
6
9
 
@@ -8,8 +11,8 @@ A Ruby gem for tokenizing, parsing, and transforming regular expressions.
8
11
  * A scanner/tokenizer based on [Ragel](http://www.colm.net/open-source/ragel/)
9
12
  * A lexer that produces a "stream" of token objects.
10
13
  * A parser that produces a "tree" of Expression objects (OO API)
11
- * Runs on Ruby 1.9, 2.x, and JRuby (1.9 mode) runtimes.
12
- * Recognizes Ruby 1.8, 1.9, and 2.x regular expressions [See Supported Syntax](#supported-syntax)
14
+ * Runs on Ruby 2.x, 3.x and JRuby runtimes
15
+ * Recognizes Ruby 1.8, 1.9, 2.x and 3.x regular expressions [See Supported Syntax](#supported-syntax)
13
16
 
14
17
 
15
18
  _For examples of regexp_parser in use, see [Example Projects](#example-projects)._
@@ -18,13 +21,10 @@ _For examples of regexp_parser in use, see [Example Projects](#example-projects)
18
21
  ---
19
22
  ## Requirements
20
23
 
21
- * Ruby >= 1.9
24
+ * Ruby >= 2.0
22
25
  * Ragel >= 6.0, but only if you want to build the gem or work on the scanner.
23
26
 
24
27
 
25
- _Note: See the .travis.yml file for covered versions._
26
-
27
-
28
28
  ---
29
29
  ## Install
30
30
 
@@ -72,6 +72,17 @@ called with the results as follows:
72
72
  * **Parser**: after completion, the block gets passed the root expression.
73
73
  _The result of the block is returned._
74
74
 
75
+ All three methods accept either a `Regexp` or `String` (containing the pattern)
76
+ - if a String is passed, `options` can be supplied:
77
+
78
+ ```ruby
79
+ require 'regexp_parser'
80
+
81
+ Regexp::Parser.parse(
82
+ "a+ # Recognises a and A...",
83
+ options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE
84
+ )
85
+ ```
75
86
 
76
87
  ---
77
88
  ## Components
@@ -306,7 +317,7 @@ Expression class. See the next section for details._
306
317
 
307
318
  ## Supported Syntax
308
319
  The three modules support all the regular expression syntax features of Ruby 1.8,
309
- 1.9, and 2.x:
320
+ 1.9, 2.x and 3.x:
310
321
 
311
322
  _Note that not all of these are available in all versions of Ruby_
312
323
 
@@ -349,12 +360,12 @@ _Note that not all of these are available in all versions of Ruby_
349
360
  | &emsp;&nbsp;_**Reluctant** (Lazy)_ | `??`, `*?`, `+?`, `{m,M}?` | &#x2713; |
350
361
  | &emsp;&nbsp;_**Possessive**_ | `?+`, `*+`, `++`, `{m,M}+` | &#x2713; |
351
362
  | **String Escapes** | | &#x22f1; |
352
- | &emsp;&nbsp;_**Control**_ | `\C-C`, `\cD` | &#x2713; |
363
+ | &emsp;&nbsp;_**Control** \[1\]_ | `\C-C`, `\cD` | &#x2713; |
353
364
  | &emsp;&nbsp;_**Hex**_ | `\x20`, `\x{701230}` | &#x2713; |
354
- | &emsp;&nbsp;_**Meta**_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | &#x2713; |
365
+ | &emsp;&nbsp;_**Meta** \[1\]_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | &#x2713; |
355
366
  | &emsp;&nbsp;_**Octal**_ | `\0`, `\01`, `\012` | &#x2713; |
356
367
  | &emsp;&nbsp;_**Unicode**_ | `\uHHHH`, `\u{H+ H+}` | &#x2713; |
357
- | **Unicode Properties** | _<sub>([Unicode 11.0.0](http://www.unicode.org/versions/Unicode11.0.0/))</sub>_ | &#x22f1; |
368
+ | **Unicode Properties** | _<sub>([Unicode 13.0.0](https://www.unicode.org/versions/Unicode13.0.0/))</sub>_ | &#x22f1; |
358
369
  | &emsp;&nbsp;_**Age**_ | `\p{Age=5.2}`, `\P{age=7.0}`, `\p{^age=8.0}` | &#x2713; |
359
370
  | &emsp;&nbsp;_**Blocks**_ | `\p{InArmenian}`, `\P{InKhmer}`, `\p{^InThai}` | &#x2713; |
360
371
  | &emsp;&nbsp;_**Classes**_ | `\p{Alpha}`, `\P{Space}`, `\p{^Alnum}` | &#x2713; |
@@ -363,6 +374,10 @@ _Note that not all of these are available in all versions of Ruby_
363
374
  | &emsp;&nbsp;_**Scripts**_ | `\p{Arabic}`, `\P{Hiragana}`, `\p{^Greek}` | &#x2713; |
364
375
  | &emsp;&nbsp;_**Simple**_ | `\p{Dash}`, `\p{Extender}`, `\p{^Hyphen}` | &#x2713; |
365
376
 
377
+ **\[1\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex escapes when used in Regexp literals](
378
+ https://github.com/ruby/ruby/commit/11ae581a4a7f5d5f5ec6378872eab8f25381b1b9 ), so they will only reach the
379
+ scanner and will only be emitted if a String or a Regexp that has been built with the `::new` constructor is scanned.
380
+
366
381
  ##### Inapplicable Features
367
382
 
368
383
  Some modifiers, like `o` and `s`, apply to the **Regexp** object itself and do not
@@ -376,7 +391,6 @@ expressions library (Onigmo). They are not supported by the scanner.
376
391
  - **Quotes**: `\Q...\E` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L499)_
377
392
  - **Capture History**: `(?@...)`, `(?@<name>...)` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L550)_
378
393
 
379
-
380
394
  See something missing? Please submit an [issue](https://github.com/ammar/regexp_parser/issues)
381
395
 
382
396
  _**Note**: Attempting to process expressions with unsupported syntax features can raise an error,
@@ -384,26 +398,14 @@ or incorrectly return tokens/objects as literals._
384
398
 
385
399
 
386
400
  ## Testing
387
- To run the tests simply run rake from the root directory, as 'test' is the default task.
401
+ To run the tests simply run rake from the root directory.
388
402
 
389
- It generates the scanner's code from the Ragel source files and runs all the tests, thus it requires Ragel to be installed.
403
+ The default task generates the scanner's code from the Ragel source files and runs all the specs, thus it requires Ragel to be installed.
390
404
 
391
- The tests use RSpec. They can also be run with the test runner that whitelists some warnings:
405
+ Note that changes to Ragel files will not be reflected when running `rspec` on its own, so to run individual tests you might want to run:
392
406
 
393
407
  ```
394
- bin/test
395
- ```
396
-
397
- You can run a specific test like so:
398
-
399
- ```
400
- bin/test spec/scanner/properties_spec.rb
401
- ```
402
-
403
- Note that changes to Ragel files will not be reflected when running `rspec` or `bin/test`, so you might want to run:
404
-
405
- ```
406
- rake ragel:rb && bin/test spec/scanner/properties_spec.rb
408
+ rake ragel:rb && rspec spec/scanner/properties_spec.rb
407
409
  ```
408
410
 
409
411
  ## Building
@@ -429,13 +431,17 @@ rake install
429
431
  ## Example Projects
430
432
  Projects using regexp_parser.
431
433
 
434
+ - [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool that uses regexp_parser to convert Regexps to css/xpath selectors.
435
+
436
+ - [js_regex](https://github.com/janosch-x/js_regex) converts Ruby regular expressions to JavaScript-compatible regular expressions.
437
+
432
438
  - [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor with alias support.
433
439
 
434
- - [mutant](https://github.com/mbj/mutant) (before v0.9.0) manipulates your regular expressions (amongst others) to see if your tests cover their behavior.
440
+ - [mutant](https://github.com/mbj/mutant) manipulates your regular expressions (amongst others) to see if your tests cover their behavior.
435
441
 
436
- - [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) uses regexp_parser to generate examples of postal codes.
442
+ - [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that uses regexp_parser to lint Regexps.
437
443
 
438
- - [js_regex](https://github.com/janosch-x/js_regex) converts Ruby regular expressions to JavaScript-compatible regular expressions.
444
+ - [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) is a localization helper that uses regexp_parser to generate examples of postal codes.
439
445
 
440
446
 
441
447
  ## References
@@ -464,4 +470,4 @@ Documentation and books used while working on this project.
464
470
 
465
471
  ---
466
472
  ##### Copyright
467
- _Copyright (c) 2010-2019 Ammar Ali. See LICENSE file for details._
473
+ _Copyright (c) 2010-2022 Ammar Ali. See LICENSE file for details._
data/Rakefile CHANGED
@@ -1,35 +1,31 @@
1
+ require 'bundler'
1
2
  require 'rubygems'
2
-
3
+ require 'rubygems/package_task'
3
4
  require 'rake'
4
5
  require 'rake/testtask'
6
+ require 'rspec/core/rake_task'
5
7
 
6
- require 'bundler'
7
- require 'rubygems/package_task'
8
-
9
-
10
- RAGEL_SOURCE_DIR = File.expand_path '../lib/regexp_parser/scanner', __FILE__
11
- RAGEL_OUTPUT_DIR = File.expand_path '../lib/regexp_parser', __FILE__
12
- RAGEL_SOURCE_FILES = %w{scanner} # scanner.rl includes property.rl
13
-
8
+ RAGEL_SOURCE_DIR = File.join(__dir__, 'lib/regexp_parser/scanner')
9
+ RAGEL_OUTPUT_DIR = File.join(__dir__, 'lib/regexp_parser')
10
+ RAGEL_SOURCE_FILES = %w[scanner] # scanner.rl imports the other files
14
11
 
15
12
  Bundler::GemHelper.install_tasks
16
13
 
14
+ RSpec::Core::RakeTask.new(:spec)
17
15
 
18
16
  task :default => [:'test:full']
19
17
 
20
18
  namespace :test do
21
- task full: :'ragel:rb' do
22
- sh 'bin/test'
23
- end
19
+ task full: [:'ragel:rb', :spec]
24
20
  end
25
21
 
26
22
  namespace :ragel do
27
23
  desc "Process the ragel source files and output ruby code"
28
- task :rb do |t|
29
- RAGEL_SOURCE_FILES.each do |file|
30
- output_file = "#{RAGEL_OUTPUT_DIR}/#{file}.rb"
24
+ task :rb do
25
+ RAGEL_SOURCE_FILES.each do |source_file|
26
+ output_file = "#{RAGEL_OUTPUT_DIR}/#{source_file}.rb"
31
27
  # using faster flat table driven FSM, about 25% larger code, but about 30% faster
32
- sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{file}.rl -o #{output_file}"
28
+ sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{source_file}.rl -o #{output_file}"
33
29
 
34
30
  contents = File.read(output_file)
35
31
 
@@ -42,34 +38,29 @@ namespace :ragel do
42
38
  end
43
39
 
44
40
  desc "Delete the ragel generated source file(s)"
45
- task :clean do |t|
41
+ task :clean do
46
42
  RAGEL_SOURCE_FILES.each do |file|
47
43
  sh "rm -f #{RAGEL_OUTPUT_DIR}/#{file}.rb"
48
44
  end
49
45
  end
50
46
  end
51
47
 
52
-
53
48
  # Add ragel task as a prerequisite for building the gem to ensure that the
54
49
  # latest scanner code is generated and included in the build.
55
50
  desc "Runs ragel:rb before building the gem"
56
51
  task :build => ['ragel:rb']
57
52
 
58
-
59
53
  namespace :props do
60
54
  desc 'Write new property value hashes for the properties scanner'
61
55
  task :update do
62
56
  require 'regexp_property_values'
63
57
  RegexpPropertyValues.update
64
- dir = File.expand_path('../lib/regexp_parser/scanner/properties', __FILE__)
58
+ dir = File.join(__dir__, 'lib/regexp_parser/scanner/properties')
65
59
 
66
- require 'psych'
67
60
  write_hash_to_file = ->(hash, path) do
68
61
  File.open(path, 'w') do |f|
69
- f.puts '#',
70
- "# THIS FILE IS AUTO-GENERATED BY `rake props:update`, DO NOT EDIT",
71
- '#',
72
- hash.sort.to_h.to_yaml
62
+ f.puts "# THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT",
63
+ *hash.sort.map { |pair| pair.join(',') }
73
64
  end
74
65
  puts "Wrote #{hash.count} aliases to `#{path}`"
75
66
  end
@@ -77,11 +68,11 @@ namespace :props do
77
68
  long_names_to_tokens = RegexpPropertyValues.all.map do |val|
78
69
  [val.identifier, val.full_name.downcase]
79
70
  end
80
- write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.yml")
71
+ write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.csv")
81
72
 
82
73
  short_names_to_tokens = RegexpPropertyValues.alias_hash.map do |k, v|
83
74
  [k.identifier, v.full_name.downcase]
84
75
  end
85
- write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.yml")
76
+ write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.csv")
86
77
  end
87
78
  end
@@ -0,0 +1,4 @@
1
+ class Regexp::Parser
2
+ # base class for all gem-specific errors (inherited but never raised itself)
3
+ class Error < StandardError; end
4
+ end
@@ -0,0 +1,123 @@
1
+ module Regexp::Expression
2
+ class Base
3
+ attr_accessor :type, :token
4
+ attr_accessor :text, :ts
5
+ attr_accessor :level, :set_level, :conditional_level, :nesting_level
6
+
7
+ attr_accessor :quantifier
8
+ attr_accessor :options
9
+
10
+ def initialize(token, options = {})
11
+ self.type = token.type
12
+ self.token = token.token
13
+ self.text = token.text
14
+ self.ts = token.ts
15
+ self.level = token.level
16
+ self.set_level = token.set_level
17
+ self.conditional_level = token.conditional_level
18
+ self.nesting_level = 0
19
+ self.quantifier = nil
20
+ self.options = options
21
+ end
22
+
23
+ def initialize_copy(orig)
24
+ self.text = (orig.text ? orig.text.dup : nil)
25
+ self.options = (orig.options ? orig.options.dup : nil)
26
+ self.quantifier = (orig.quantifier ? orig.quantifier.clone : nil)
27
+ super
28
+ end
29
+
30
+ def to_re(format = :full)
31
+ ::Regexp.new(to_s(format))
32
+ end
33
+
34
+ alias :starts_at :ts
35
+
36
+ def base_length
37
+ to_s(:base).length
38
+ end
39
+
40
+ def full_length
41
+ to_s.length
42
+ end
43
+
44
+ def offset
45
+ [starts_at, full_length]
46
+ end
47
+
48
+ def coded_offset
49
+ '@%d+%d' % offset
50
+ end
51
+
52
+ def to_s(format = :full)
53
+ "#{text}#{quantifier_affix(format)}"
54
+ end
55
+
56
+ def quantifier_affix(expression_format)
57
+ quantifier.to_s if quantified? && expression_format != :base
58
+ end
59
+
60
+ def terminal?
61
+ !respond_to?(:expressions)
62
+ end
63
+
64
+ def quantify(token, text, min = nil, max = nil, mode = :greedy)
65
+ self.quantifier = Quantifier.new(token, text, min, max, mode)
66
+ end
67
+
68
+ def unquantified_clone
69
+ clone.tap { |exp| exp.quantifier = nil }
70
+ end
71
+
72
+ def quantified?
73
+ !quantifier.nil?
74
+ end
75
+
76
+ # Deprecated. Prefer `#repetitions` which has a more uniform interface.
77
+ def quantity
78
+ return [nil,nil] unless quantified?
79
+ [quantifier.min, quantifier.max]
80
+ end
81
+
82
+ def repetitions
83
+ return 1..1 unless quantified?
84
+ min = quantifier.min
85
+ max = quantifier.max < 0 ? Float::INFINITY : quantifier.max
86
+ range = min..max
87
+ # fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807
88
+ if RUBY_VERSION.to_f < 2.7
89
+ range.define_singleton_method(:minmax) { [min, max] }
90
+ end
91
+ range
92
+ end
93
+
94
+ def greedy?
95
+ quantified? and quantifier.greedy?
96
+ end
97
+
98
+ def reluctant?
99
+ quantified? and quantifier.reluctant?
100
+ end
101
+ alias :lazy? :reluctant?
102
+
103
+ def possessive?
104
+ quantified? and quantifier.possessive?
105
+ end
106
+
107
+ def attributes
108
+ {
109
+ type: type,
110
+ token: token,
111
+ text: to_s(:base),
112
+ starts_at: ts,
113
+ length: full_length,
114
+ level: level,
115
+ set_level: set_level,
116
+ conditional_level: conditional_level,
117
+ options: options,
118
+ quantifier: quantified? ? quantifier.to_h : nil,
119
+ }
120
+ end
121
+ alias :to_h :attributes
122
+ end
123
+ end
@@ -1,5 +1,4 @@
1
1
  module Regexp::Expression
2
-
3
2
  module Anchor
4
3
  class Base < Regexp::Expression::Base; end
5
4
 
@@ -22,5 +21,4 @@ module Regexp::Expression
22
21
  EOS = EndOfString
23
22
  EOSobEOL = EndOfStringOrBeforeEndOfLine
24
23
  end
25
-
26
24
  end
@@ -2,6 +2,11 @@ module Regexp::Expression
2
2
  module Backreference
3
3
  class Base < Regexp::Expression::Base
4
4
  attr_accessor :referenced_expression
5
+
6
+ def initialize_copy(orig)
7
+ self.referenced_expression = orig.referenced_expression.dup
8
+ super
9
+ end
5
10
  end
6
11
 
7
12
  class Number < Backreference::Base
@@ -7,7 +7,8 @@ module Regexp::Expression
7
7
  alias :ts :starts_at
8
8
 
9
9
  def <<(exp)
10
- complete? && raise("Can't add more than 2 expressions to a Range")
10
+ complete? and raise Regexp::Parser::Error,
11
+ "Can't add more than 2 expressions to a Range"
11
12
  super
12
13
  end
13
14
 
@@ -1,6 +1,6 @@
1
1
  module Regexp::Expression
2
2
  module Conditional
3
- class TooManyBranches < StandardError
3
+ class TooManyBranches < Regexp::Parser::Error
4
4
  def initialize
5
5
  super('The conditional expression has more than 2 branches')
6
6
  end
@@ -15,6 +15,11 @@ module Regexp::Expression
15
15
  ref = text.tr("'<>()", "")
16
16
  ref =~ /\D/ ? ref : Integer(ref)
17
17
  end
18
+
19
+ def initialize_copy(orig)
20
+ self.referenced_expression = orig.referenced_expression.dup
21
+ super
22
+ end
18
23
  end
19
24
 
20
25
  class Branch < Regexp::Expression::Sequence; end
@@ -53,6 +58,11 @@ module Regexp::Expression
53
58
  def to_s(format = :full)
54
59
  "#{text}#{condition}#{branches.join('|')})#{quantifier_affix(format)}"
55
60
  end
61
+
62
+ def initialize_copy(orig)
63
+ self.referenced_expression = orig.referenced_expression.dup
64
+ super
65
+ end
56
66
  end
57
67
  end
58
68
  end