regexp_parser 2.0.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +34 -3
- data/Gemfile +5 -1
- data/README.md +1 -1
- data/Rakefile +6 -6
- data/lib/regexp_parser.rb +1 -0
- data/lib/regexp_parser/error.rb +4 -0
- data/lib/regexp_parser/expression.rb +1 -1
- data/lib/regexp_parser/expression/classes/backref.rb +5 -0
- data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
- data/lib/regexp_parser/expression/classes/free_space.rb +1 -1
- data/lib/regexp_parser/expression/classes/group.rb +6 -1
- data/lib/regexp_parser/expression/classes/property.rb +1 -1
- data/lib/regexp_parser/expression/classes/set/range.rb +2 -1
- data/lib/regexp_parser/expression/quantifier.rb +1 -1
- data/lib/regexp_parser/expression/sequence.rb +3 -9
- data/lib/regexp_parser/expression/subexpression.rb +1 -1
- data/lib/regexp_parser/parser.rb +281 -332
- data/lib/regexp_parser/scanner.rb +1015 -1003
- data/lib/regexp_parser/scanner/scanner.rl +53 -77
- data/lib/regexp_parser/syntax.rb +6 -6
- data/lib/regexp_parser/syntax/any.rb +1 -1
- data/lib/regexp_parser/syntax/versions.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- data/spec/expression/clone_spec.rb +36 -4
- data/spec/expression/free_space_spec.rb +2 -2
- data/spec/expression/methods/match_length_spec.rb +2 -2
- data/spec/lexer/refcalls_spec.rb +5 -0
- data/spec/parser/all_spec.rb +2 -2
- data/spec/parser/refcalls_spec.rb +5 -0
- data/spec/scanner/escapes_spec.rb +1 -1
- data/spec/scanner/refcalls_spec.rb +19 -0
- data/spec/scanner/sets_spec.rb +42 -11
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 79c8b7838ef53335c9d0fbd21ffdf6815473ee560380a3687e8fab514d031d53
|
4
|
+
data.tar.gz: 2a91f7c7640fc5f2d304c2cbf240886d8e8642994861a9c092f1d4db2ae6b77a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3559a8c7af9c0087ab7a54862c9913e40a3703ffa23f62e6919eec50042523424c2aa4c99b3de9d28d03fc0edd14af37e0dcd0eab7bf822b9af73113be468b59
|
7
|
+
data.tar.gz: 31ed468565bd41fe2d0bd7b82d53d64e213a15e1ade2108ddf813637c228c18f6f7b456725c7e359a08754188ee19c90d06e013be90775ee6a64723b04fa25f0
|
data/CHANGELOG.md
CHANGED
@@ -1,14 +1,45 @@
|
|
1
1
|
## [Unreleased]
|
2
2
|
|
3
|
+
## [2.1.0] - 2021-02-22 - [Janosch Müller](mailto:janosch84@gmail.com)
|
4
|
+
|
5
|
+
### Added
|
6
|
+
|
7
|
+
- common ancestor for all scanning/parsing/lexing errors
|
8
|
+
* `Regexp::Parser::Error` can now be rescued as a catch-all
|
9
|
+
* the following errors (and their many descendants) now inherit from it:
|
10
|
+
- `Regexp::Expression::Conditional::TooManyBranches`
|
11
|
+
- `Regexp::Parser::ParserError`
|
12
|
+
- `Regexp::Scanner::ScannerError`
|
13
|
+
- `Regexp::Scanner::ValidationError`
|
14
|
+
- `Regexp::Syntax::SyntaxError`
|
15
|
+
* it replaces `ArgumentError` in some rare cases (`Regexp::Parser.parse('?')`)
|
16
|
+
* thanks to [sandstrom](https://github.com/sandstrom) for the cue
|
17
|
+
|
18
|
+
### Fixed
|
19
|
+
|
20
|
+
- fixed scanning of whole-pattern recursion calls `\g<0>` and `\g'0'`
|
21
|
+
* a regression in v2.0.1 had caused them to be scanned as literals
|
22
|
+
- fixed scanning of some backreference and subexpression call edge cases
|
23
|
+
* e.g. `\k<+1>`, `\g<x-1>`
|
24
|
+
- fixed tokenization of some escapes in character sets
|
25
|
+
* `.`, `|`, `{`, `}`, `(`, `)`, `^`, `$`, `?`, `+`, `*`
|
26
|
+
* all of these correctly emitted `#type` `:literal` and `#token` `:literal` if *not* escaped
|
27
|
+
* if escaped, they emitted e.g. `#type` `:escape` and `#token` `:group_open` for `[\(]`
|
28
|
+
* the escaped versions now correctly emit `#type` `:escape` and `#token` `:literal`
|
29
|
+
- fixed handling of control/metacontrol escapes in character sets
|
30
|
+
* e.g. `[\cX]`, `[\M-\C-X]`
|
31
|
+
* they were misread as a bunch of individual literals, escapes, and ranges
|
32
|
+
- fixed some cases where calling `#dup`/`#clone` on expressions led to shared state
|
33
|
+
|
3
34
|
## [2.0.3] - 2020-12-28 - [Janosch Müller](mailto:janosch84@gmail.com)
|
4
35
|
|
5
36
|
### Fixed
|
6
37
|
|
7
38
|
- fixed error when scanning some unlikely and redundant but valid charset patterns
|
8
|
-
|
39
|
+
* e.g. `/[[.a-b.]]/`, `/[[=e=]]/`
|
9
40
|
- fixed ancestry of some error classes related to syntax version lookup
|
10
|
-
|
11
|
-
|
41
|
+
* `NotImplementedError`, `InvalidVersionNameError`, `UnknownSyntaxNameError`
|
42
|
+
* they now correctly inherit from `Regexp::Syntax::SyntaxError` instead of Ruby's `::SyntaxError`
|
12
43
|
|
13
44
|
## [2.0.2] - 2020-12-25 - [Janosch Müller](mailto:janosch84@gmail.com)
|
14
45
|
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# Regexp::Parser
|
2
2
|
|
3
|
-
[](http://badge.fury.io/rb/regexp_parser) [](https://github.com/ammar/regexp_parser/actions) [](https://codeclimate.com/github/ammar/regexp_parser/badges)
|
3
|
+
[](http://badge.fury.io/rb/regexp_parser) [](https://github.com/ammar/regexp_parser/actions) [](https://github.com/ammar/regexp_parser/actions) [](https://codeclimate.com/github/ammar/regexp_parser/badges)
|
4
4
|
|
5
5
|
A Ruby gem for tokenizing, parsing, and transforming regular expressions.
|
6
6
|
|
data/Rakefile
CHANGED
@@ -7,8 +7,8 @@ require 'bundler'
|
|
7
7
|
require 'rubygems/package_task'
|
8
8
|
|
9
9
|
|
10
|
-
RAGEL_SOURCE_DIR = File.
|
11
|
-
RAGEL_OUTPUT_DIR = File.
|
10
|
+
RAGEL_SOURCE_DIR = File.join(__dir__, 'lib/regexp_parser/scanner')
|
11
|
+
RAGEL_OUTPUT_DIR = File.join(__dir__, 'lib/regexp_parser')
|
12
12
|
RAGEL_SOURCE_FILES = %w{scanner} # scanner.rl includes property.rl
|
13
13
|
|
14
14
|
|
@@ -26,10 +26,10 @@ end
|
|
26
26
|
namespace :ragel do
|
27
27
|
desc "Process the ragel source files and output ruby code"
|
28
28
|
task :rb do
|
29
|
-
RAGEL_SOURCE_FILES.each do |
|
30
|
-
output_file = "#{RAGEL_OUTPUT_DIR}/#{
|
29
|
+
RAGEL_SOURCE_FILES.each do |source_file|
|
30
|
+
output_file = "#{RAGEL_OUTPUT_DIR}/#{source_file}.rb"
|
31
31
|
# using faster flat table driven FSM, about 25% larger code, but about 30% faster
|
32
|
-
sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{
|
32
|
+
sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{source_file}.rl -o #{output_file}"
|
33
33
|
|
34
34
|
contents = File.read(output_file)
|
35
35
|
|
@@ -61,7 +61,7 @@ namespace :props do
|
|
61
61
|
task :update do
|
62
62
|
require 'regexp_property_values'
|
63
63
|
RegexpPropertyValues.update
|
64
|
-
dir = File.
|
64
|
+
dir = File.join(__dir__, 'lib/regexp_parser/scanner/properties')
|
65
65
|
|
66
66
|
require 'psych'
|
67
67
|
write_hash_to_file = ->(hash, path) do
|
data/lib/regexp_parser.rb
CHANGED
@@ -21,7 +21,7 @@ module Regexp::Expression
|
|
21
21
|
self.options = options
|
22
22
|
end
|
23
23
|
|
24
|
-
def
|
24
|
+
def initialize_copy(orig)
|
25
25
|
self.text = (orig.text ? orig.text.dup : nil)
|
26
26
|
self.options = (orig.options ? orig.options.dup : nil)
|
27
27
|
self.quantifier = (orig.quantifier ? orig.quantifier.clone : nil)
|
@@ -2,6 +2,11 @@ module Regexp::Expression
|
|
2
2
|
module Backreference
|
3
3
|
class Base < Regexp::Expression::Base
|
4
4
|
attr_accessor :referenced_expression
|
5
|
+
|
6
|
+
def initialize_copy(orig)
|
7
|
+
self.referenced_expression = orig.referenced_expression.dup
|
8
|
+
super
|
9
|
+
end
|
5
10
|
end
|
6
11
|
|
7
12
|
class Number < Backreference::Base
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Regexp::Expression
|
2
2
|
module Conditional
|
3
|
-
class TooManyBranches <
|
3
|
+
class TooManyBranches < Regexp::Parser::Error
|
4
4
|
def initialize
|
5
5
|
super('The conditional expression has more than 2 branches')
|
6
6
|
end
|
@@ -15,6 +15,11 @@ module Regexp::Expression
|
|
15
15
|
ref = text.tr("'<>()", "")
|
16
16
|
ref =~ /\D/ ? ref : Integer(ref)
|
17
17
|
end
|
18
|
+
|
19
|
+
def initialize_copy(orig)
|
20
|
+
self.referenced_expression = orig.referenced_expression.dup
|
21
|
+
super
|
22
|
+
end
|
18
23
|
end
|
19
24
|
|
20
25
|
class Branch < Regexp::Expression::Sequence; end
|
@@ -53,6 +58,11 @@ module Regexp::Expression
|
|
53
58
|
def to_s(format = :full)
|
54
59
|
"#{text}#{condition}#{branches.join('|')})#{quantifier_affix(format)}"
|
55
60
|
end
|
61
|
+
|
62
|
+
def initialize_copy(orig)
|
63
|
+
self.referenced_expression = orig.referenced_expression.dup
|
64
|
+
super
|
65
|
+
end
|
56
66
|
end
|
57
67
|
end
|
58
68
|
end
|
@@ -2,7 +2,7 @@ module Regexp::Expression
|
|
2
2
|
|
3
3
|
class FreeSpace < Regexp::Expression::Base
|
4
4
|
def quantify(_token, _text, _min = nil, _max = nil, _mode = :greedy)
|
5
|
-
raise
|
5
|
+
raise Regexp::Parser::Error, 'Can not quantify a free space object'
|
6
6
|
end
|
7
7
|
end
|
8
8
|
|
@@ -35,6 +35,11 @@ module Regexp::Expression
|
|
35
35
|
class Atomic < Group::Base; end
|
36
36
|
class Options < Group::Base
|
37
37
|
attr_accessor :option_changes
|
38
|
+
|
39
|
+
def initialize_copy(orig)
|
40
|
+
self.option_changes = orig.option_changes.dup
|
41
|
+
super
|
42
|
+
end
|
38
43
|
end
|
39
44
|
|
40
45
|
class Capture < Group::Base
|
@@ -53,7 +58,7 @@ module Regexp::Expression
|
|
53
58
|
super
|
54
59
|
end
|
55
60
|
|
56
|
-
def
|
61
|
+
def initialize_copy(orig)
|
57
62
|
@name = orig.name.dup
|
58
63
|
super
|
59
64
|
end
|
@@ -41,17 +41,11 @@ module Regexp::Expression
|
|
41
41
|
alias :ts :starts_at
|
42
42
|
|
43
43
|
def quantify(token, text, min = nil, max = nil, mode = :greedy)
|
44
|
-
|
45
|
-
target
|
46
|
-
|
47
|
-
target = expressions[offset -= 1]
|
48
|
-
end
|
49
|
-
|
50
|
-
target || raise(ArgumentError, "No valid target found for '#{text}' "\
|
51
|
-
'quantifier')
|
44
|
+
target = expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
|
45
|
+
target or raise Regexp::Parser::Error,
|
46
|
+
"No valid target found for '#{text}' quantifier"
|
52
47
|
|
53
48
|
target.quantify(token, text, min, max, mode)
|
54
49
|
end
|
55
50
|
end
|
56
|
-
|
57
51
|
end
|
data/lib/regexp_parser/parser.rb
CHANGED
@@ -2,9 +2,8 @@ require 'regexp_parser/expression'
|
|
2
2
|
|
3
3
|
class Regexp::Parser
|
4
4
|
include Regexp::Expression
|
5
|
-
include Regexp::Syntax
|
6
5
|
|
7
|
-
class ParserError <
|
6
|
+
class ParserError < Regexp::Parser::Error; end
|
8
7
|
|
9
8
|
class UnknownTokenTypeError < ParserError
|
10
9
|
def initialize(type, token)
|
@@ -70,93 +69,155 @@ class Regexp::Parser
|
|
70
69
|
enabled_options
|
71
70
|
end
|
72
71
|
|
73
|
-
def
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
72
|
+
def parse_token(token)
|
73
|
+
case token.type
|
74
|
+
when :anchor; anchor(token)
|
75
|
+
when :assertion, :group; group(token)
|
76
|
+
when :backref; backref(token)
|
77
|
+
when :conditional; conditional(token)
|
78
|
+
when :escape; escape(token)
|
79
|
+
when :free_space; free_space(token)
|
80
|
+
when :keep; keep(token)
|
81
|
+
when :literal; literal(token)
|
82
|
+
when :meta; meta(token)
|
83
|
+
when :posixclass, :nonposixclass; posixclass(token)
|
84
|
+
when :property, :nonproperty; property(token)
|
85
|
+
when :quantifier; quantifier(token)
|
86
|
+
when :set; set(token)
|
87
|
+
when :type; type(token)
|
88
|
+
else
|
89
|
+
raise UnknownTokenTypeError.new(token.type, token)
|
90
|
+
end
|
79
91
|
|
80
|
-
|
81
|
-
def update_transplanted_subtree(exp, new_parent)
|
82
|
-
exp.nesting_level = new_parent.nesting_level + 1
|
83
|
-
exp.respond_to?(:each) &&
|
84
|
-
exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
|
92
|
+
close_completed_character_set_range
|
85
93
|
end
|
86
94
|
|
87
|
-
def
|
88
|
-
|
89
|
-
|
90
|
-
|
95
|
+
def anchor(token)
|
96
|
+
case token.token
|
97
|
+
when :bol; node << Anchor::BeginningOfLine.new(token, active_opts)
|
98
|
+
when :bos; node << Anchor::BOS.new(token, active_opts)
|
99
|
+
when :eol; node << Anchor::EndOfLine.new(token, active_opts)
|
100
|
+
when :eos; node << Anchor::EOS.new(token, active_opts)
|
101
|
+
when :eos_ob_eol; node << Anchor::EOSobEOL.new(token, active_opts)
|
102
|
+
when :match_start; node << Anchor::MatchStart.new(token, active_opts)
|
103
|
+
when :nonword_boundary; node << Anchor::NonWordBoundary.new(token, active_opts)
|
104
|
+
when :word_boundary; node << Anchor::WordBoundary.new(token, active_opts)
|
105
|
+
else
|
106
|
+
raise UnknownTokenError.new('Anchor', token)
|
91
107
|
end
|
92
|
-
nesting.pop
|
93
|
-
yield(node) if block_given?
|
94
|
-
self.node = nesting.last
|
95
|
-
self.node = node.last if node.last.is_a?(SequenceOperation)
|
96
108
|
end
|
97
109
|
|
98
|
-
def
|
99
|
-
|
100
|
-
|
110
|
+
def group(token)
|
111
|
+
case token.token
|
112
|
+
when :options, :options_switch
|
113
|
+
options_group(token)
|
114
|
+
when :close
|
115
|
+
close_group
|
116
|
+
when :comment
|
117
|
+
node << Group::Comment.new(token, active_opts)
|
118
|
+
else
|
119
|
+
open_group(token)
|
120
|
+
end
|
101
121
|
end
|
102
122
|
|
103
|
-
|
104
|
-
|
123
|
+
MOD_FLAGS = %w[i m x].map(&:to_sym)
|
124
|
+
ENC_FLAGS = %w[a d u].map(&:to_sym)
|
105
125
|
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
when :escape; escape(token)
|
111
|
-
when :group; group(token)
|
112
|
-
when :assertion; group(token)
|
113
|
-
when :set; set(token)
|
114
|
-
when :type; type(token)
|
115
|
-
when :backref; backref(token)
|
116
|
-
when :conditional; conditional(token)
|
117
|
-
when :keep; keep(token)
|
118
|
-
|
119
|
-
when :posixclass, :nonposixclass
|
120
|
-
posixclass(token)
|
121
|
-
when :property, :nonproperty
|
122
|
-
property(token)
|
123
|
-
|
124
|
-
when :literal
|
125
|
-
node << Literal.new(token, active_opts)
|
126
|
-
when :free_space
|
127
|
-
free_space(token)
|
126
|
+
def options_group(token)
|
127
|
+
positive, negative = token.text.split('-', 2)
|
128
|
+
negative ||= ''
|
129
|
+
self.switching_options = token.token.equal?(:options_switch)
|
128
130
|
|
129
|
-
|
130
|
-
|
131
|
+
opt_changes = {}
|
132
|
+
new_active_opts = active_opts.dup
|
133
|
+
|
134
|
+
MOD_FLAGS.each do |flag|
|
135
|
+
if positive.include?(flag.to_s)
|
136
|
+
opt_changes[flag] = new_active_opts[flag] = true
|
137
|
+
end
|
138
|
+
if negative.include?(flag.to_s)
|
139
|
+
opt_changes[flag] = false
|
140
|
+
new_active_opts.delete(flag)
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
144
|
+
if (enc_flag = positive.reverse[/[adu]/])
|
145
|
+
enc_flag = enc_flag.to_sym
|
146
|
+
(ENC_FLAGS - [enc_flag]).each do |other|
|
147
|
+
opt_changes[other] = false if new_active_opts[other]
|
148
|
+
new_active_opts.delete(other)
|
149
|
+
end
|
150
|
+
opt_changes[enc_flag] = new_active_opts[enc_flag] = true
|
131
151
|
end
|
152
|
+
|
153
|
+
options_stack << new_active_opts
|
154
|
+
|
155
|
+
options_group = Group::Options.new(token, active_opts)
|
156
|
+
options_group.option_changes = opt_changes
|
157
|
+
|
158
|
+
nest(options_group)
|
132
159
|
end
|
133
160
|
|
134
|
-
def
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
161
|
+
def open_group(token)
|
162
|
+
group_class =
|
163
|
+
case token.token
|
164
|
+
when :absence; Group::Absence
|
165
|
+
when :atomic; Group::Atomic
|
166
|
+
when :capture; Group::Capture
|
167
|
+
when :named; Group::Named
|
168
|
+
when :passive; Group::Passive
|
169
|
+
|
170
|
+
when :lookahead; Assertion::Lookahead
|
171
|
+
when :lookbehind; Assertion::Lookbehind
|
172
|
+
when :nlookahead; Assertion::NegativeLookahead
|
173
|
+
when :nlookbehind; Assertion::NegativeLookbehind
|
174
|
+
|
175
|
+
else
|
176
|
+
raise UnknownTokenError.new('Group type open', token)
|
177
|
+
end
|
178
|
+
|
179
|
+
group = group_class.new(token, active_opts)
|
180
|
+
|
181
|
+
if group.capturing?
|
182
|
+
group.number = total_captured_group_count + 1
|
183
|
+
group.number_at_level = captured_group_count_at_level + 1
|
184
|
+
count_captured_group
|
148
185
|
end
|
186
|
+
|
187
|
+
# Push the active options to the stack again. This way we can simply pop the
|
188
|
+
# stack for any group we close, no matter if it had its own options or not.
|
189
|
+
options_stack << active_opts
|
190
|
+
|
191
|
+
nest(group)
|
149
192
|
end
|
150
193
|
|
151
|
-
def
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
194
|
+
def total_captured_group_count
|
195
|
+
captured_group_counts.values.reduce(0, :+)
|
196
|
+
end
|
197
|
+
|
198
|
+
def captured_group_count_at_level
|
199
|
+
captured_group_counts[node.level]
|
200
|
+
end
|
201
|
+
|
202
|
+
def count_captured_group
|
203
|
+
captured_group_counts[node.level] += 1
|
204
|
+
end
|
205
|
+
|
206
|
+
def close_group
|
207
|
+
options_stack.pop unless switching_options
|
208
|
+
self.switching_options = false
|
209
|
+
decrease_nesting
|
210
|
+
end
|
211
|
+
|
212
|
+
def decrease_nesting
|
213
|
+
while nesting.last.is_a?(SequenceOperation)
|
214
|
+
nesting.pop
|
215
|
+
self.node = nesting.last
|
159
216
|
end
|
217
|
+
nesting.pop
|
218
|
+
yield(node) if block_given?
|
219
|
+
self.node = nesting.last
|
220
|
+
self.node = node.last if node.last.is_a?(SequenceOperation)
|
160
221
|
end
|
161
222
|
|
162
223
|
def backref(token)
|
@@ -186,31 +247,9 @@ class Regexp::Parser
|
|
186
247
|
end
|
187
248
|
end
|
188
249
|
|
189
|
-
def
|
190
|
-
|
191
|
-
|
192
|
-
node << CharacterType::Digit.new(token, active_opts)
|
193
|
-
when :nondigit
|
194
|
-
node << CharacterType::NonDigit.new(token, active_opts)
|
195
|
-
when :hex
|
196
|
-
node << CharacterType::Hex.new(token, active_opts)
|
197
|
-
when :nonhex
|
198
|
-
node << CharacterType::NonHex.new(token, active_opts)
|
199
|
-
when :space
|
200
|
-
node << CharacterType::Space.new(token, active_opts)
|
201
|
-
when :nonspace
|
202
|
-
node << CharacterType::NonSpace.new(token, active_opts)
|
203
|
-
when :word
|
204
|
-
node << CharacterType::Word.new(token, active_opts)
|
205
|
-
when :nonword
|
206
|
-
node << CharacterType::NonWord.new(token, active_opts)
|
207
|
-
when :linebreak
|
208
|
-
node << CharacterType::Linebreak.new(token, active_opts)
|
209
|
-
when :xgrapheme
|
210
|
-
node << CharacterType::ExtendedGrapheme.new(token, active_opts)
|
211
|
-
else
|
212
|
-
raise UnknownTokenError.new('CharacterType', token)
|
213
|
-
end
|
250
|
+
def assign_effective_number(exp)
|
251
|
+
exp.effective_number =
|
252
|
+
exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
|
214
253
|
end
|
215
254
|
|
216
255
|
def conditional(token)
|
@@ -238,11 +277,118 @@ class Regexp::Parser
|
|
238
277
|
end
|
239
278
|
end
|
240
279
|
|
280
|
+
def nest_conditional(exp)
|
281
|
+
conditional_nesting.push(exp)
|
282
|
+
nest(exp)
|
283
|
+
end
|
284
|
+
|
285
|
+
def nest(exp)
|
286
|
+
nesting.push(exp)
|
287
|
+
node << exp
|
288
|
+
update_transplanted_subtree(exp, node)
|
289
|
+
self.node = exp
|
290
|
+
end
|
291
|
+
|
292
|
+
# subtrees are transplanted to build Alternations, Intersections, Ranges
|
293
|
+
def update_transplanted_subtree(exp, new_parent)
|
294
|
+
exp.nesting_level = new_parent.nesting_level + 1
|
295
|
+
exp.respond_to?(:each) &&
|
296
|
+
exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
|
297
|
+
end
|
298
|
+
|
299
|
+
def escape(token)
|
300
|
+
case token.token
|
301
|
+
|
302
|
+
when :backspace; node << EscapeSequence::Backspace.new(token, active_opts)
|
303
|
+
|
304
|
+
when :escape; node << EscapeSequence::AsciiEscape.new(token, active_opts)
|
305
|
+
when :bell; node << EscapeSequence::Bell.new(token, active_opts)
|
306
|
+
when :form_feed; node << EscapeSequence::FormFeed.new(token, active_opts)
|
307
|
+
when :newline; node << EscapeSequence::Newline.new(token, active_opts)
|
308
|
+
when :carriage; node << EscapeSequence::Return.new(token, active_opts)
|
309
|
+
when :tab; node << EscapeSequence::Tab.new(token, active_opts)
|
310
|
+
when :vertical_tab; node << EscapeSequence::VerticalTab.new(token, active_opts)
|
311
|
+
|
312
|
+
when :codepoint; node << EscapeSequence::Codepoint.new(token, active_opts)
|
313
|
+
when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
|
314
|
+
when :hex; node << EscapeSequence::Hex.new(token, active_opts)
|
315
|
+
when :octal; node << EscapeSequence::Octal.new(token, active_opts)
|
316
|
+
|
317
|
+
when :control
|
318
|
+
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
319
|
+
node << EscapeSequence::MetaControl.new(token, active_opts)
|
320
|
+
else
|
321
|
+
node << EscapeSequence::Control.new(token, active_opts)
|
322
|
+
end
|
323
|
+
|
324
|
+
when :meta_sequence
|
325
|
+
if token.text =~ /\A\\M-\\[Cc]/
|
326
|
+
node << EscapeSequence::MetaControl.new(token, active_opts)
|
327
|
+
else
|
328
|
+
node << EscapeSequence::Meta.new(token, active_opts)
|
329
|
+
end
|
330
|
+
|
331
|
+
else
|
332
|
+
# treating everything else as a literal
|
333
|
+
# TODO: maybe split this up a bit more in v3.0.0?
|
334
|
+
# E.g. escaped quantifiers or set meta chars are not the same
|
335
|
+
# as stuff that would be a literal even without the backslash.
|
336
|
+
# Right now, they all end up here.
|
337
|
+
node << EscapeSequence::Literal.new(token, active_opts)
|
338
|
+
end
|
339
|
+
end
|
340
|
+
|
341
|
+
def free_space(token)
|
342
|
+
case token.token
|
343
|
+
when :comment
|
344
|
+
node << Comment.new(token, active_opts)
|
345
|
+
when :whitespace
|
346
|
+
if node.last.is_a?(WhiteSpace)
|
347
|
+
node.last.merge(WhiteSpace.new(token, active_opts))
|
348
|
+
else
|
349
|
+
node << WhiteSpace.new(token, active_opts)
|
350
|
+
end
|
351
|
+
else
|
352
|
+
raise UnknownTokenError.new('FreeSpace', token)
|
353
|
+
end
|
354
|
+
end
|
355
|
+
|
356
|
+
def keep(token)
|
357
|
+
node << Keep::Mark.new(token, active_opts)
|
358
|
+
end
|
359
|
+
|
360
|
+
def literal(token)
|
361
|
+
node << Literal.new(token, active_opts)
|
362
|
+
end
|
363
|
+
|
364
|
+
def meta(token)
|
365
|
+
case token.token
|
366
|
+
when :dot
|
367
|
+
node << CharacterType::Any.new(token, active_opts)
|
368
|
+
when :alternation
|
369
|
+
sequence_operation(Alternation, token)
|
370
|
+
else
|
371
|
+
raise UnknownTokenError.new('Meta', token)
|
372
|
+
end
|
373
|
+
end
|
374
|
+
|
375
|
+
def sequence_operation(klass, token)
|
376
|
+
unless node.is_a?(klass)
|
377
|
+
operator = klass.new(token, active_opts)
|
378
|
+
sequence = operator.add_sequence(active_opts)
|
379
|
+
sequence.expressions = node.expressions
|
380
|
+
node.expressions = []
|
381
|
+
nest(operator)
|
382
|
+
end
|
383
|
+
node.add_sequence(active_opts)
|
384
|
+
end
|
385
|
+
|
241
386
|
def posixclass(token)
|
242
387
|
node << PosixClass.new(token, active_opts)
|
243
388
|
end
|
244
389
|
|
245
390
|
include Regexp::Expression::UnicodeProperty
|
391
|
+
UPTokens = Regexp::Syntax::Token::UnicodeProperty
|
246
392
|
|
247
393
|
def property(token)
|
248
394
|
case token.token
|
@@ -314,127 +460,20 @@ class Regexp::Parser
|
|
314
460
|
when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
|
315
461
|
when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
|
316
462
|
|
317
|
-
when *
|
318
|
-
node <<
|
319
|
-
|
320
|
-
when *
|
321
|
-
|
322
|
-
|
323
|
-
when *Token::UnicodeProperty::Emoji
|
324
|
-
node << Emoji.new(token, active_opts)
|
325
|
-
|
326
|
-
when *Token::UnicodeProperty::Script
|
327
|
-
node << Script.new(token, active_opts)
|
328
|
-
|
329
|
-
when *Token::UnicodeProperty::UnicodeBlock
|
330
|
-
node << Block.new(token, active_opts)
|
463
|
+
when *UPTokens::Age; node << Age.new(token, active_opts)
|
464
|
+
when *UPTokens::Derived; node << Derived.new(token, active_opts)
|
465
|
+
when *UPTokens::Emoji; node << Emoji.new(token, active_opts)
|
466
|
+
when *UPTokens::Script; node << Script.new(token, active_opts)
|
467
|
+
when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts)
|
331
468
|
|
332
469
|
else
|
333
470
|
raise UnknownTokenError.new('UnicodeProperty', token)
|
334
471
|
end
|
335
472
|
end
|
336
473
|
|
337
|
-
def anchor(token)
|
338
|
-
case token.token
|
339
|
-
when :bol
|
340
|
-
node << Anchor::BeginningOfLine.new(token, active_opts)
|
341
|
-
when :eol
|
342
|
-
node << Anchor::EndOfLine.new(token, active_opts)
|
343
|
-
when :bos
|
344
|
-
node << Anchor::BOS.new(token, active_opts)
|
345
|
-
when :eos
|
346
|
-
node << Anchor::EOS.new(token, active_opts)
|
347
|
-
when :eos_ob_eol
|
348
|
-
node << Anchor::EOSobEOL.new(token, active_opts)
|
349
|
-
when :word_boundary
|
350
|
-
node << Anchor::WordBoundary.new(token, active_opts)
|
351
|
-
when :nonword_boundary
|
352
|
-
node << Anchor::NonWordBoundary.new(token, active_opts)
|
353
|
-
when :match_start
|
354
|
-
node << Anchor::MatchStart.new(token, active_opts)
|
355
|
-
else
|
356
|
-
raise UnknownTokenError.new('Anchor', token)
|
357
|
-
end
|
358
|
-
end
|
359
|
-
|
360
|
-
def escape(token)
|
361
|
-
case token.token
|
362
|
-
|
363
|
-
when :backspace
|
364
|
-
node << EscapeSequence::Backspace.new(token, active_opts)
|
365
|
-
|
366
|
-
when :escape
|
367
|
-
node << EscapeSequence::AsciiEscape.new(token, active_opts)
|
368
|
-
when :bell
|
369
|
-
node << EscapeSequence::Bell.new(token, active_opts)
|
370
|
-
when :form_feed
|
371
|
-
node << EscapeSequence::FormFeed.new(token, active_opts)
|
372
|
-
when :newline
|
373
|
-
node << EscapeSequence::Newline.new(token, active_opts)
|
374
|
-
when :carriage
|
375
|
-
node << EscapeSequence::Return.new(token, active_opts)
|
376
|
-
when :tab
|
377
|
-
node << EscapeSequence::Tab.new(token, active_opts)
|
378
|
-
when :vertical_tab
|
379
|
-
node << EscapeSequence::VerticalTab.new(token, active_opts)
|
380
|
-
|
381
|
-
when :hex
|
382
|
-
node << EscapeSequence::Hex.new(token, active_opts)
|
383
|
-
when :octal
|
384
|
-
node << EscapeSequence::Octal.new(token, active_opts)
|
385
|
-
when :codepoint
|
386
|
-
node << EscapeSequence::Codepoint.new(token, active_opts)
|
387
|
-
when :codepoint_list
|
388
|
-
node << EscapeSequence::CodepointList.new(token, active_opts)
|
389
|
-
|
390
|
-
when :control
|
391
|
-
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
392
|
-
node << EscapeSequence::MetaControl.new(token, active_opts)
|
393
|
-
else
|
394
|
-
node << EscapeSequence::Control.new(token, active_opts)
|
395
|
-
end
|
396
|
-
|
397
|
-
when :meta_sequence
|
398
|
-
if token.text =~ /\A\\M-\\[Cc]/
|
399
|
-
node << EscapeSequence::MetaControl.new(token, active_opts)
|
400
|
-
else
|
401
|
-
node << EscapeSequence::Meta.new(token, active_opts)
|
402
|
-
end
|
403
|
-
|
404
|
-
else
|
405
|
-
# treating everything else as a literal
|
406
|
-
node << EscapeSequence::Literal.new(token, active_opts)
|
407
|
-
end
|
408
|
-
end
|
409
|
-
|
410
|
-
def keep(token)
|
411
|
-
node << Keep::Mark.new(token, active_opts)
|
412
|
-
end
|
413
|
-
|
414
|
-
def free_space(token)
|
415
|
-
case token.token
|
416
|
-
when :comment
|
417
|
-
node << Comment.new(token, active_opts)
|
418
|
-
when :whitespace
|
419
|
-
if node.last.is_a?(WhiteSpace)
|
420
|
-
node.last.merge(WhiteSpace.new(token, active_opts))
|
421
|
-
else
|
422
|
-
node << WhiteSpace.new(token, active_opts)
|
423
|
-
end
|
424
|
-
else
|
425
|
-
raise UnknownTokenError.new('FreeSpace', token)
|
426
|
-
end
|
427
|
-
end
|
428
|
-
|
429
474
|
def quantifier(token)
|
430
|
-
|
431
|
-
target_node
|
432
|
-
while target_node.is_a?(FreeSpace)
|
433
|
-
target_node = node.expressions[offset -= 1]
|
434
|
-
end
|
435
|
-
|
436
|
-
target_node || raise(ArgumentError, 'No valid target found for '\
|
437
|
-
"'#{token.text}' ")
|
475
|
+
target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
|
476
|
+
target_node or raise ParserError, "No valid target found for '#{token.text}'"
|
438
477
|
|
439
478
|
# in case of chained quantifiers, wrap target in an implicit passive group
|
440
479
|
# description of the problem: https://github.com/ammar/regexp_parser/issues/3
|
@@ -454,7 +493,7 @@ class Regexp::Parser
|
|
454
493
|
new_group.implicit = true
|
455
494
|
new_group << target_node
|
456
495
|
increase_level(target_node)
|
457
|
-
node.expressions[
|
496
|
+
node.expressions[node.expressions.index(target_node)] = new_group
|
458
497
|
target_node = new_group
|
459
498
|
end
|
460
499
|
|
@@ -515,100 +554,16 @@ class Regexp::Parser
|
|
515
554
|
target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
|
516
555
|
end
|
517
556
|
|
518
|
-
def
|
519
|
-
case token.token
|
520
|
-
when :options, :options_switch
|
521
|
-
options_group(token)
|
522
|
-
when :close
|
523
|
-
close_group
|
524
|
-
when :comment
|
525
|
-
node << Group::Comment.new(token, active_opts)
|
526
|
-
else
|
527
|
-
open_group(token)
|
528
|
-
end
|
529
|
-
end
|
530
|
-
|
531
|
-
MOD_FLAGS = %w[i m x].map(&:to_sym)
|
532
|
-
ENC_FLAGS = %w[a d u].map(&:to_sym)
|
533
|
-
|
534
|
-
def options_group(token)
|
535
|
-
positive, negative = token.text.split('-', 2)
|
536
|
-
negative ||= ''
|
537
|
-
self.switching_options = token.token.equal?(:options_switch)
|
538
|
-
|
539
|
-
opt_changes = {}
|
540
|
-
new_active_opts = active_opts.dup
|
541
|
-
|
542
|
-
MOD_FLAGS.each do |flag|
|
543
|
-
if positive.include?(flag.to_s)
|
544
|
-
opt_changes[flag] = new_active_opts[flag] = true
|
545
|
-
end
|
546
|
-
if negative.include?(flag.to_s)
|
547
|
-
opt_changes[flag] = false
|
548
|
-
new_active_opts.delete(flag)
|
549
|
-
end
|
550
|
-
end
|
551
|
-
|
552
|
-
if (enc_flag = positive.reverse[/[adu]/])
|
553
|
-
enc_flag = enc_flag.to_sym
|
554
|
-
(ENC_FLAGS - [enc_flag]).each do |other|
|
555
|
-
opt_changes[other] = false if new_active_opts[other]
|
556
|
-
new_active_opts.delete(other)
|
557
|
-
end
|
558
|
-
opt_changes[enc_flag] = new_active_opts[enc_flag] = true
|
559
|
-
end
|
560
|
-
|
561
|
-
options_stack << new_active_opts
|
562
|
-
|
563
|
-
options_group = Group::Options.new(token, active_opts)
|
564
|
-
options_group.option_changes = opt_changes
|
565
|
-
|
566
|
-
nest(options_group)
|
567
|
-
end
|
568
|
-
|
569
|
-
def open_group(token)
|
557
|
+
def set(token)
|
570
558
|
case token.token
|
571
|
-
when :
|
572
|
-
|
573
|
-
when :
|
574
|
-
|
575
|
-
when :
|
576
|
-
exp = Group::Named.new(token, active_opts)
|
577
|
-
when :capture
|
578
|
-
exp = Group::Capture.new(token, active_opts)
|
579
|
-
when :absence
|
580
|
-
exp = Group::Absence.new(token, active_opts)
|
581
|
-
|
582
|
-
when :lookahead
|
583
|
-
exp = Assertion::Lookahead.new(token, active_opts)
|
584
|
-
when :nlookahead
|
585
|
-
exp = Assertion::NegativeLookahead.new(token, active_opts)
|
586
|
-
when :lookbehind
|
587
|
-
exp = Assertion::Lookbehind.new(token, active_opts)
|
588
|
-
when :nlookbehind
|
589
|
-
exp = Assertion::NegativeLookbehind.new(token, active_opts)
|
590
|
-
|
559
|
+
when :open; open_set(token)
|
560
|
+
when :close; close_set
|
561
|
+
when :negate; negate_set
|
562
|
+
when :range; range(token)
|
563
|
+
when :intersection; intersection(token)
|
591
564
|
else
|
592
|
-
raise UnknownTokenError.new('
|
593
|
-
end
|
594
|
-
|
595
|
-
if exp.capturing?
|
596
|
-
exp.number = total_captured_group_count + 1
|
597
|
-
exp.number_at_level = captured_group_count_at_level + 1
|
598
|
-
count_captured_group
|
565
|
+
raise UnknownTokenError.new('CharacterSet', token)
|
599
566
|
end
|
600
|
-
|
601
|
-
# Push the active options to the stack again. This way we can simply pop the
|
602
|
-
# stack for any group we close, no matter if it had its own options or not.
|
603
|
-
options_stack << active_opts
|
604
|
-
|
605
|
-
nest(exp)
|
606
|
-
end
|
607
|
-
|
608
|
-
def close_group
|
609
|
-
options_stack.pop unless switching_options
|
610
|
-
self.switching_options = false
|
611
|
-
decrease_nesting
|
612
567
|
end
|
613
568
|
|
614
569
|
def open_set(token)
|
@@ -631,51 +586,45 @@ class Regexp::Parser
|
|
631
586
|
nest(exp)
|
632
587
|
end
|
633
588
|
|
634
|
-
def close_completed_character_set_range
|
635
|
-
decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
|
636
|
-
end
|
637
|
-
|
638
589
|
def intersection(token)
|
639
590
|
sequence_operation(CharacterSet::Intersection, token)
|
640
591
|
end
|
641
592
|
|
642
|
-
def
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
593
|
+
def type(token)
|
594
|
+
case token.token
|
595
|
+
when :digit; node << CharacterType::Digit.new(token, active_opts)
|
596
|
+
when :hex; node << CharacterType::Hex.new(token, active_opts)
|
597
|
+
when :linebreak; node << CharacterType::Linebreak.new(token, active_opts)
|
598
|
+
when :nondigit; node << CharacterType::NonDigit.new(token, active_opts)
|
599
|
+
when :nonhex; node << CharacterType::NonHex.new(token, active_opts)
|
600
|
+
when :nonspace; node << CharacterType::NonSpace.new(token, active_opts)
|
601
|
+
when :nonword; node << CharacterType::NonWord.new(token, active_opts)
|
602
|
+
when :space; node << CharacterType::Space.new(token, active_opts)
|
603
|
+
when :word; node << CharacterType::Word.new(token, active_opts)
|
604
|
+
when :xgrapheme; node << CharacterType::ExtendedGrapheme.new(token, active_opts)
|
605
|
+
else
|
606
|
+
raise UnknownTokenError.new('CharacterType', token)
|
649
607
|
end
|
650
|
-
node.add_sequence(active_opts)
|
651
|
-
end
|
652
|
-
|
653
|
-
def active_opts
|
654
|
-
options_stack.last
|
655
|
-
end
|
656
|
-
|
657
|
-
def total_captured_group_count
|
658
|
-
captured_group_counts.values.reduce(0, :+)
|
659
|
-
end
|
660
|
-
|
661
|
-
def captured_group_count_at_level
|
662
|
-
captured_group_counts[node.level]
|
663
608
|
end
|
664
609
|
|
665
|
-
def
|
666
|
-
|
610
|
+
def close_completed_character_set_range
|
611
|
+
decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
|
667
612
|
end
|
668
613
|
|
669
|
-
def
|
670
|
-
|
671
|
-
exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
|
614
|
+
def active_opts
|
615
|
+
options_stack.last
|
672
616
|
end
|
673
617
|
|
618
|
+
# Assigns referenced expressions to referring expressions, e.g. if there is
|
619
|
+
# an instance of Backreference::Number, its #referenced_expression is set to
|
620
|
+
# the instance of Group::Capture that it refers to via its number.
|
674
621
|
def assign_referenced_expressions
|
675
622
|
targets = {}
|
623
|
+
# find all referenceable expressions
|
676
624
|
root.each_expression do |exp|
|
677
625
|
exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
|
678
626
|
end
|
627
|
+
# assign them to any referring expressions
|
679
628
|
root.each_expression do |exp|
|
680
629
|
exp.respond_to?(:reference) &&
|
681
630
|
exp.referenced_expression = targets[exp.reference]
|