regexp_parser 1.7.1 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +157 -1
- data/Gemfile +6 -1
- data/LICENSE +1 -1
- data/README.md +38 -32
- data/Rakefile +18 -27
- data/lib/regexp_parser/error.rb +4 -0
- data/lib/regexp_parser/expression/base.rb +123 -0
- data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
- data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +5 -0
- data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
- data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -1
- data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +0 -0
- data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
- data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +13 -7
- data/lib/regexp_parser/expression/classes/free_space.rb +2 -4
- data/lib/regexp_parser/expression/classes/group.rb +28 -3
- data/lib/regexp_parser/expression/classes/literal.rb +1 -5
- data/lib/regexp_parser/expression/classes/property.rb +1 -3
- data/lib/regexp_parser/expression/classes/root.rb +4 -17
- data/lib/regexp_parser/expression/classes/type.rb +0 -2
- data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
- data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
- data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
- data/lib/regexp_parser/expression/quantifier.rb +11 -2
- data/lib/regexp_parser/expression/sequence.rb +3 -20
- data/lib/regexp_parser/expression/subexpression.rb +1 -2
- data/lib/regexp_parser/expression.rb +7 -139
- data/lib/regexp_parser/lexer.rb +13 -11
- data/lib/regexp_parser/parser.rb +325 -344
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/properties/long.csv +604 -0
- data/lib/regexp_parser/scanner/properties/short.csv +242 -0
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +235 -255
- data/lib/regexp_parser/scanner.rb +1324 -1387
- data/lib/regexp_parser/syntax/any.rb +4 -6
- data/lib/regexp_parser/syntax/base.rb +13 -15
- data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
- data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
- data/lib/regexp_parser/syntax/token/backreference.rb +30 -0
- data/lib/regexp_parser/syntax/{tokens → token}/character_set.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
- data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
- data/lib/regexp_parser/syntax/token/escape.rb +31 -0
- data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
- data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
- data/lib/regexp_parser/syntax/{tokens → token}/meta.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
- data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
- data/lib/regexp_parser/syntax/token/unicode_property.rb +696 -0
- data/lib/regexp_parser/syntax/token.rb +45 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +4 -4
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -2
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +1 -1
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +10 -0
- data/lib/regexp_parser/syntax.rb +8 -6
- data/lib/regexp_parser/token.rb +9 -20
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +0 -2
- data/regexp_parser.gemspec +20 -22
- metadata +34 -165
- data/lib/regexp_parser/scanner/properties/long.yml +0 -594
- data/lib/regexp_parser/scanner/properties/short.yml +0 -237
- data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
- data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
- data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
- data/lib/regexp_parser/syntax/tokens.rb +0 -45
- data/spec/expression/base_spec.rb +0 -94
- data/spec/expression/clone_spec.rb +0 -120
- data/spec/expression/conditional_spec.rb +0 -89
- data/spec/expression/free_space_spec.rb +0 -27
- data/spec/expression/methods/match_length_spec.rb +0 -161
- data/spec/expression/methods/match_spec.rb +0 -25
- data/spec/expression/methods/strfregexp_spec.rb +0 -224
- data/spec/expression/methods/tests_spec.rb +0 -99
- data/spec/expression/methods/traverse_spec.rb +0 -161
- data/spec/expression/options_spec.rb +0 -128
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
- data/spec/expression/subexpression_spec.rb +0 -50
- data/spec/expression/to_h_spec.rb +0 -26
- data/spec/expression/to_s_spec.rb +0 -100
- data/spec/lexer/all_spec.rb +0 -22
- data/spec/lexer/conditionals_spec.rb +0 -53
- data/spec/lexer/delimiters_spec.rb +0 -68
- data/spec/lexer/escapes_spec.rb +0 -14
- data/spec/lexer/keep_spec.rb +0 -10
- data/spec/lexer/literals_spec.rb +0 -89
- data/spec/lexer/nesting_spec.rb +0 -99
- data/spec/lexer/refcalls_spec.rb +0 -55
- data/spec/parser/all_spec.rb +0 -43
- data/spec/parser/alternation_spec.rb +0 -88
- data/spec/parser/anchors_spec.rb +0 -17
- data/spec/parser/conditionals_spec.rb +0 -179
- data/spec/parser/errors_spec.rb +0 -30
- data/spec/parser/escapes_spec.rb +0 -121
- data/spec/parser/free_space_spec.rb +0 -130
- data/spec/parser/groups_spec.rb +0 -108
- data/spec/parser/keep_spec.rb +0 -6
- data/spec/parser/posix_classes_spec.rb +0 -8
- data/spec/parser/properties_spec.rb +0 -115
- data/spec/parser/quantifiers_spec.rb +0 -52
- data/spec/parser/refcalls_spec.rb +0 -112
- data/spec/parser/set/intersections_spec.rb +0 -127
- data/spec/parser/set/ranges_spec.rb +0 -111
- data/spec/parser/sets_spec.rb +0 -178
- data/spec/parser/types_spec.rb +0 -18
- data/spec/scanner/all_spec.rb +0 -18
- data/spec/scanner/anchors_spec.rb +0 -21
- data/spec/scanner/conditionals_spec.rb +0 -128
- data/spec/scanner/delimiters_spec.rb +0 -52
- data/spec/scanner/errors_spec.rb +0 -67
- data/spec/scanner/escapes_spec.rb +0 -53
- data/spec/scanner/free_space_spec.rb +0 -133
- data/spec/scanner/groups_spec.rb +0 -52
- data/spec/scanner/keep_spec.rb +0 -10
- data/spec/scanner/literals_spec.rb +0 -49
- data/spec/scanner/meta_spec.rb +0 -18
- data/spec/scanner/properties_spec.rb +0 -64
- data/spec/scanner/quantifiers_spec.rb +0 -20
- data/spec/scanner/refcalls_spec.rb +0 -36
- data/spec/scanner/sets_spec.rb +0 -102
- data/spec/scanner/types_spec.rb +0 -14
- data/spec/spec_helper.rb +0 -15
- data/spec/support/runner.rb +0 -42
- data/spec/support/shared_examples.rb +0 -77
- data/spec/support/warning_extractor.rb +0 -60
- data/spec/syntax/syntax_spec.rb +0 -48
- data/spec/syntax/syntax_token_map_spec.rb +0 -23
- data/spec/syntax/versions/1.8.6_spec.rb +0 -17
- data/spec/syntax/versions/1.9.1_spec.rb +0 -10
- data/spec/syntax/versions/1.9.3_spec.rb +0 -9
- data/spec/syntax/versions/2.0.0_spec.rb +0 -13
- data/spec/syntax/versions/2.2.0_spec.rb +0 -9
- data/spec/syntax/versions/aliases_spec.rb +0 -37
- data/spec/token/token_spec.rb +0 -85
data/lib/regexp_parser/parser.rb
CHANGED
@@ -1,10 +1,10 @@
|
|
1
|
+
require 'regexp_parser/error'
|
1
2
|
require 'regexp_parser/expression'
|
2
3
|
|
3
4
|
class Regexp::Parser
|
4
5
|
include Regexp::Expression
|
5
|
-
include Regexp::Syntax
|
6
6
|
|
7
|
-
class ParserError <
|
7
|
+
class ParserError < Regexp::Parser::Error; end
|
8
8
|
|
9
9
|
class UnknownTokenTypeError < ParserError
|
10
10
|
def initialize(type, token)
|
@@ -18,12 +18,12 @@ class Regexp::Parser
|
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
21
|
-
def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
|
22
|
-
new.parse(input, syntax, &block)
|
21
|
+
def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
|
22
|
+
new.parse(input, syntax, options: options, &block)
|
23
23
|
end
|
24
24
|
|
25
|
-
def parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
|
26
|
-
root = Root.build(
|
25
|
+
def parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
|
26
|
+
root = Root.build(extract_options(input, options))
|
27
27
|
|
28
28
|
self.root = root
|
29
29
|
self.node = root
|
@@ -35,7 +35,7 @@ class Regexp::Parser
|
|
35
35
|
|
36
36
|
self.captured_group_counts = Hash.new(0)
|
37
37
|
|
38
|
-
Regexp::Lexer.scan(input, syntax) do |token|
|
38
|
+
Regexp::Lexer.scan(input, syntax, options: options) do |token|
|
39
39
|
parse_token(token)
|
40
40
|
end
|
41
41
|
|
@@ -54,105 +54,171 @@ class Regexp::Parser
|
|
54
54
|
:options_stack, :switching_options, :conditional_nesting,
|
55
55
|
:captured_group_counts
|
56
56
|
|
57
|
-
def
|
58
|
-
|
57
|
+
def extract_options(input, options)
|
58
|
+
if options && !input.is_a?(String)
|
59
|
+
raise ArgumentError, 'options cannot be supplied unless parsing a String'
|
60
|
+
end
|
59
61
|
|
60
|
-
options =
|
61
|
-
options[:i] = true if input.options & ::Regexp::IGNORECASE != 0
|
62
|
-
options[:m] = true if input.options & ::Regexp::MULTILINE != 0
|
63
|
-
options[:x] = true if input.options & ::Regexp::EXTENDED != 0
|
64
|
-
options
|
65
|
-
end
|
62
|
+
options = input.options if input.is_a?(::Regexp)
|
66
63
|
|
67
|
-
|
68
|
-
nesting.push(exp)
|
69
|
-
node << exp
|
70
|
-
update_transplanted_subtree(exp, node)
|
71
|
-
self.node = exp
|
72
|
-
end
|
73
|
-
|
74
|
-
# subtrees are transplanted to build Alternations, Intersections, Ranges
|
75
|
-
def update_transplanted_subtree(exp, new_parent)
|
76
|
-
exp.nesting_level = new_parent.nesting_level + 1
|
77
|
-
exp.respond_to?(:each) &&
|
78
|
-
exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
|
79
|
-
end
|
80
|
-
|
81
|
-
def decrease_nesting
|
82
|
-
while nesting.last.is_a?(SequenceOperation)
|
83
|
-
nesting.pop
|
84
|
-
self.node = nesting.last
|
85
|
-
end
|
86
|
-
nesting.pop
|
87
|
-
yield(node) if block_given?
|
88
|
-
self.node = nesting.last
|
89
|
-
self.node = node.last if node.last.is_a?(SequenceOperation)
|
90
|
-
end
|
64
|
+
return {} unless options
|
91
65
|
|
92
|
-
|
93
|
-
|
94
|
-
|
66
|
+
enabled_options = {}
|
67
|
+
enabled_options[:i] = true if options & ::Regexp::IGNORECASE != 0
|
68
|
+
enabled_options[:m] = true if options & ::Regexp::MULTILINE != 0
|
69
|
+
enabled_options[:x] = true if options & ::Regexp::EXTENDED != 0
|
70
|
+
enabled_options
|
95
71
|
end
|
96
72
|
|
97
73
|
def parse_token(token)
|
98
|
-
close_completed_character_set_range
|
99
|
-
|
100
74
|
case token.type
|
101
|
-
when :
|
102
|
-
when :
|
103
|
-
when :
|
104
|
-
when :
|
105
|
-
when :
|
106
|
-
when :
|
107
|
-
when :
|
108
|
-
when :
|
109
|
-
when :
|
110
|
-
when :
|
111
|
-
when :
|
112
|
-
|
113
|
-
when :
|
114
|
-
|
115
|
-
when :property, :nonproperty
|
116
|
-
property(token)
|
117
|
-
|
118
|
-
when :literal
|
119
|
-
node << Literal.new(token, active_opts)
|
120
|
-
when :free_space
|
121
|
-
free_space(token)
|
122
|
-
|
75
|
+
when :anchor; anchor(token)
|
76
|
+
when :assertion, :group; group(token)
|
77
|
+
when :backref; backref(token)
|
78
|
+
when :conditional; conditional(token)
|
79
|
+
when :escape; escape(token)
|
80
|
+
when :free_space; free_space(token)
|
81
|
+
when :keep; keep(token)
|
82
|
+
when :literal; literal(token)
|
83
|
+
when :meta; meta(token)
|
84
|
+
when :posixclass, :nonposixclass; posixclass(token)
|
85
|
+
when :property, :nonproperty; property(token)
|
86
|
+
when :quantifier; quantifier(token)
|
87
|
+
when :set; set(token)
|
88
|
+
when :type; type(token)
|
123
89
|
else
|
124
90
|
raise UnknownTokenTypeError.new(token.type, token)
|
125
91
|
end
|
92
|
+
|
93
|
+
close_completed_character_set_range
|
126
94
|
end
|
127
95
|
|
128
|
-
def
|
96
|
+
def anchor(token)
|
129
97
|
case token.token
|
130
|
-
when :
|
131
|
-
|
132
|
-
when :
|
133
|
-
|
134
|
-
when :
|
135
|
-
|
136
|
-
when :
|
137
|
-
|
138
|
-
when :intersection
|
139
|
-
intersection(token)
|
140
|
-
when :collation, :equivalent
|
141
|
-
node << Literal.new(token, active_opts)
|
98
|
+
when :bol; node << Anchor::BeginningOfLine.new(token, active_opts)
|
99
|
+
when :bos; node << Anchor::BOS.new(token, active_opts)
|
100
|
+
when :eol; node << Anchor::EndOfLine.new(token, active_opts)
|
101
|
+
when :eos; node << Anchor::EOS.new(token, active_opts)
|
102
|
+
when :eos_ob_eol; node << Anchor::EOSobEOL.new(token, active_opts)
|
103
|
+
when :match_start; node << Anchor::MatchStart.new(token, active_opts)
|
104
|
+
when :nonword_boundary; node << Anchor::NonWordBoundary.new(token, active_opts)
|
105
|
+
when :word_boundary; node << Anchor::WordBoundary.new(token, active_opts)
|
142
106
|
else
|
143
|
-
raise UnknownTokenError.new('
|
107
|
+
raise UnknownTokenError.new('Anchor', token)
|
144
108
|
end
|
145
109
|
end
|
146
110
|
|
147
|
-
def
|
111
|
+
def group(token)
|
148
112
|
case token.token
|
149
|
-
when :
|
150
|
-
|
151
|
-
when :
|
152
|
-
|
113
|
+
when :options, :options_switch
|
114
|
+
options_group(token)
|
115
|
+
when :close
|
116
|
+
close_group
|
117
|
+
when :comment
|
118
|
+
node << Group::Comment.new(token, active_opts)
|
153
119
|
else
|
154
|
-
|
120
|
+
open_group(token)
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
MOD_FLAGS = %w[i m x].map(&:to_sym)
|
125
|
+
ENC_FLAGS = %w[a d u].map(&:to_sym)
|
126
|
+
|
127
|
+
def options_group(token)
|
128
|
+
positive, negative = token.text.split('-', 2)
|
129
|
+
negative ||= ''
|
130
|
+
self.switching_options = token.token.equal?(:options_switch)
|
131
|
+
|
132
|
+
opt_changes = {}
|
133
|
+
new_active_opts = active_opts.dup
|
134
|
+
|
135
|
+
MOD_FLAGS.each do |flag|
|
136
|
+
if positive.include?(flag.to_s)
|
137
|
+
opt_changes[flag] = new_active_opts[flag] = true
|
138
|
+
end
|
139
|
+
if negative.include?(flag.to_s)
|
140
|
+
opt_changes[flag] = false
|
141
|
+
new_active_opts.delete(flag)
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
145
|
+
if (enc_flag = positive.reverse[/[adu]/])
|
146
|
+
enc_flag = enc_flag.to_sym
|
147
|
+
(ENC_FLAGS - [enc_flag]).each do |other|
|
148
|
+
opt_changes[other] = false if new_active_opts[other]
|
149
|
+
new_active_opts.delete(other)
|
150
|
+
end
|
151
|
+
opt_changes[enc_flag] = new_active_opts[enc_flag] = true
|
152
|
+
end
|
153
|
+
|
154
|
+
options_stack << new_active_opts
|
155
|
+
|
156
|
+
options_group = Group::Options.new(token, active_opts)
|
157
|
+
options_group.option_changes = opt_changes
|
158
|
+
|
159
|
+
nest(options_group)
|
160
|
+
end
|
161
|
+
|
162
|
+
def open_group(token)
|
163
|
+
group_class =
|
164
|
+
case token.token
|
165
|
+
when :absence; Group::Absence
|
166
|
+
when :atomic; Group::Atomic
|
167
|
+
when :capture; Group::Capture
|
168
|
+
when :named; Group::Named
|
169
|
+
when :passive; Group::Passive
|
170
|
+
|
171
|
+
when :lookahead; Assertion::Lookahead
|
172
|
+
when :lookbehind; Assertion::Lookbehind
|
173
|
+
when :nlookahead; Assertion::NegativeLookahead
|
174
|
+
when :nlookbehind; Assertion::NegativeLookbehind
|
175
|
+
|
176
|
+
else
|
177
|
+
raise UnknownTokenError.new('Group type open', token)
|
178
|
+
end
|
179
|
+
|
180
|
+
group = group_class.new(token, active_opts)
|
181
|
+
|
182
|
+
if group.capturing?
|
183
|
+
group.number = total_captured_group_count + 1
|
184
|
+
group.number_at_level = captured_group_count_at_level + 1
|
185
|
+
count_captured_group
|
186
|
+
end
|
187
|
+
|
188
|
+
# Push the active options to the stack again. This way we can simply pop the
|
189
|
+
# stack for any group we close, no matter if it had its own options or not.
|
190
|
+
options_stack << active_opts
|
191
|
+
|
192
|
+
nest(group)
|
193
|
+
end
|
194
|
+
|
195
|
+
def total_captured_group_count
|
196
|
+
captured_group_counts.values.reduce(0, :+)
|
197
|
+
end
|
198
|
+
|
199
|
+
def captured_group_count_at_level
|
200
|
+
captured_group_counts[node.level]
|
201
|
+
end
|
202
|
+
|
203
|
+
def count_captured_group
|
204
|
+
captured_group_counts[node.level] += 1
|
205
|
+
end
|
206
|
+
|
207
|
+
def close_group
|
208
|
+
options_stack.pop unless switching_options
|
209
|
+
self.switching_options = false
|
210
|
+
decrease_nesting
|
211
|
+
end
|
212
|
+
|
213
|
+
def decrease_nesting
|
214
|
+
while nesting.last.is_a?(SequenceOperation)
|
215
|
+
nesting.pop
|
216
|
+
self.node = nesting.last
|
155
217
|
end
|
218
|
+
nesting.pop
|
219
|
+
yield(node) if block_given?
|
220
|
+
self.node = nesting.last
|
221
|
+
self.node = node.last if node.last.is_a?(SequenceOperation)
|
156
222
|
end
|
157
223
|
|
158
224
|
def backref(token)
|
@@ -182,31 +248,9 @@ class Regexp::Parser
|
|
182
248
|
end
|
183
249
|
end
|
184
250
|
|
185
|
-
def
|
186
|
-
|
187
|
-
|
188
|
-
node << CharacterType::Digit.new(token, active_opts)
|
189
|
-
when :nondigit
|
190
|
-
node << CharacterType::NonDigit.new(token, active_opts)
|
191
|
-
when :hex
|
192
|
-
node << CharacterType::Hex.new(token, active_opts)
|
193
|
-
when :nonhex
|
194
|
-
node << CharacterType::NonHex.new(token, active_opts)
|
195
|
-
when :space
|
196
|
-
node << CharacterType::Space.new(token, active_opts)
|
197
|
-
when :nonspace
|
198
|
-
node << CharacterType::NonSpace.new(token, active_opts)
|
199
|
-
when :word
|
200
|
-
node << CharacterType::Word.new(token, active_opts)
|
201
|
-
when :nonword
|
202
|
-
node << CharacterType::NonWord.new(token, active_opts)
|
203
|
-
when :linebreak
|
204
|
-
node << CharacterType::Linebreak.new(token, active_opts)
|
205
|
-
when :xgrapheme
|
206
|
-
node << CharacterType::ExtendedGrapheme.new(token, active_opts)
|
207
|
-
else
|
208
|
-
raise UnknownTokenError.new('CharacterType', token)
|
209
|
-
end
|
251
|
+
def assign_effective_number(exp)
|
252
|
+
exp.effective_number =
|
253
|
+
exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
|
210
254
|
end
|
211
255
|
|
212
256
|
def conditional(token)
|
@@ -234,11 +278,118 @@ class Regexp::Parser
|
|
234
278
|
end
|
235
279
|
end
|
236
280
|
|
281
|
+
def nest_conditional(exp)
|
282
|
+
conditional_nesting.push(exp)
|
283
|
+
nest(exp)
|
284
|
+
end
|
285
|
+
|
286
|
+
def nest(exp)
|
287
|
+
nesting.push(exp)
|
288
|
+
node << exp
|
289
|
+
update_transplanted_subtree(exp, node)
|
290
|
+
self.node = exp
|
291
|
+
end
|
292
|
+
|
293
|
+
# subtrees are transplanted to build Alternations, Intersections, Ranges
|
294
|
+
def update_transplanted_subtree(exp, new_parent)
|
295
|
+
exp.nesting_level = new_parent.nesting_level + 1
|
296
|
+
exp.respond_to?(:each) &&
|
297
|
+
exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
|
298
|
+
end
|
299
|
+
|
300
|
+
def escape(token)
|
301
|
+
case token.token
|
302
|
+
|
303
|
+
when :backspace; node << EscapeSequence::Backspace.new(token, active_opts)
|
304
|
+
|
305
|
+
when :escape; node << EscapeSequence::AsciiEscape.new(token, active_opts)
|
306
|
+
when :bell; node << EscapeSequence::Bell.new(token, active_opts)
|
307
|
+
when :form_feed; node << EscapeSequence::FormFeed.new(token, active_opts)
|
308
|
+
when :newline; node << EscapeSequence::Newline.new(token, active_opts)
|
309
|
+
when :carriage; node << EscapeSequence::Return.new(token, active_opts)
|
310
|
+
when :tab; node << EscapeSequence::Tab.new(token, active_opts)
|
311
|
+
when :vertical_tab; node << EscapeSequence::VerticalTab.new(token, active_opts)
|
312
|
+
|
313
|
+
when :codepoint; node << EscapeSequence::Codepoint.new(token, active_opts)
|
314
|
+
when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
|
315
|
+
when :hex; node << EscapeSequence::Hex.new(token, active_opts)
|
316
|
+
when :octal; node << EscapeSequence::Octal.new(token, active_opts)
|
317
|
+
|
318
|
+
when :control
|
319
|
+
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
320
|
+
node << EscapeSequence::MetaControl.new(token, active_opts)
|
321
|
+
else
|
322
|
+
node << EscapeSequence::Control.new(token, active_opts)
|
323
|
+
end
|
324
|
+
|
325
|
+
when :meta_sequence
|
326
|
+
if token.text =~ /\A\\M-\\[Cc]/
|
327
|
+
node << EscapeSequence::MetaControl.new(token, active_opts)
|
328
|
+
else
|
329
|
+
node << EscapeSequence::Meta.new(token, active_opts)
|
330
|
+
end
|
331
|
+
|
332
|
+
else
|
333
|
+
# treating everything else as a literal
|
334
|
+
# TODO: maybe split this up a bit more in v3.0.0?
|
335
|
+
# E.g. escaped quantifiers or set meta chars are not the same
|
336
|
+
# as stuff that would be a literal even without the backslash.
|
337
|
+
# Right now, they all end up here.
|
338
|
+
node << EscapeSequence::Literal.new(token, active_opts)
|
339
|
+
end
|
340
|
+
end
|
341
|
+
|
342
|
+
def free_space(token)
|
343
|
+
case token.token
|
344
|
+
when :comment
|
345
|
+
node << Comment.new(token, active_opts)
|
346
|
+
when :whitespace
|
347
|
+
if node.last.is_a?(WhiteSpace)
|
348
|
+
node.last.merge(WhiteSpace.new(token, active_opts))
|
349
|
+
else
|
350
|
+
node << WhiteSpace.new(token, active_opts)
|
351
|
+
end
|
352
|
+
else
|
353
|
+
raise UnknownTokenError.new('FreeSpace', token)
|
354
|
+
end
|
355
|
+
end
|
356
|
+
|
357
|
+
def keep(token)
|
358
|
+
node << Keep::Mark.new(token, active_opts)
|
359
|
+
end
|
360
|
+
|
361
|
+
def literal(token)
|
362
|
+
node << Literal.new(token, active_opts)
|
363
|
+
end
|
364
|
+
|
365
|
+
def meta(token)
|
366
|
+
case token.token
|
367
|
+
when :dot
|
368
|
+
node << CharacterType::Any.new(token, active_opts)
|
369
|
+
when :alternation
|
370
|
+
sequence_operation(Alternation, token)
|
371
|
+
else
|
372
|
+
raise UnknownTokenError.new('Meta', token)
|
373
|
+
end
|
374
|
+
end
|
375
|
+
|
376
|
+
def sequence_operation(klass, token)
|
377
|
+
unless node.is_a?(klass)
|
378
|
+
operator = klass.new(token, active_opts)
|
379
|
+
sequence = operator.add_sequence(active_opts)
|
380
|
+
sequence.expressions = node.expressions
|
381
|
+
node.expressions = []
|
382
|
+
nest(operator)
|
383
|
+
end
|
384
|
+
node.add_sequence(active_opts)
|
385
|
+
end
|
386
|
+
|
237
387
|
def posixclass(token)
|
238
388
|
node << PosixClass.new(token, active_opts)
|
239
389
|
end
|
240
390
|
|
241
391
|
include Regexp::Expression::UnicodeProperty
|
392
|
+
UPTokens = Regexp::Syntax::Token::UnicodeProperty
|
242
393
|
|
243
394
|
def property(token)
|
244
395
|
case token.token
|
@@ -310,128 +461,43 @@ class Regexp::Parser
|
|
310
461
|
when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
|
311
462
|
when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
|
312
463
|
|
313
|
-
when *
|
314
|
-
node <<
|
315
|
-
|
316
|
-
when *
|
317
|
-
|
318
|
-
|
319
|
-
when *Token::UnicodeProperty::Emoji
|
320
|
-
node << Emoji.new(token, active_opts)
|
321
|
-
|
322
|
-
when *Token::UnicodeProperty::Script
|
323
|
-
node << Script.new(token, active_opts)
|
324
|
-
|
325
|
-
when *Token::UnicodeProperty::UnicodeBlock
|
326
|
-
node << Block.new(token, active_opts)
|
464
|
+
when *UPTokens::Age; node << Age.new(token, active_opts)
|
465
|
+
when *UPTokens::Derived; node << Derived.new(token, active_opts)
|
466
|
+
when *UPTokens::Emoji; node << Emoji.new(token, active_opts)
|
467
|
+
when *UPTokens::Script; node << Script.new(token, active_opts)
|
468
|
+
when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts)
|
327
469
|
|
328
470
|
else
|
329
471
|
raise UnknownTokenError.new('UnicodeProperty', token)
|
330
472
|
end
|
331
473
|
end
|
332
474
|
|
333
|
-
def anchor(token)
|
334
|
-
case token.token
|
335
|
-
when :bol
|
336
|
-
node << Anchor::BeginningOfLine.new(token, active_opts)
|
337
|
-
when :eol
|
338
|
-
node << Anchor::EndOfLine.new(token, active_opts)
|
339
|
-
when :bos
|
340
|
-
node << Anchor::BOS.new(token, active_opts)
|
341
|
-
when :eos
|
342
|
-
node << Anchor::EOS.new(token, active_opts)
|
343
|
-
when :eos_ob_eol
|
344
|
-
node << Anchor::EOSobEOL.new(token, active_opts)
|
345
|
-
when :word_boundary
|
346
|
-
node << Anchor::WordBoundary.new(token, active_opts)
|
347
|
-
when :nonword_boundary
|
348
|
-
node << Anchor::NonWordBoundary.new(token, active_opts)
|
349
|
-
when :match_start
|
350
|
-
node << Anchor::MatchStart.new(token, active_opts)
|
351
|
-
else
|
352
|
-
raise UnknownTokenError.new('Anchor', token)
|
353
|
-
end
|
354
|
-
end
|
355
|
-
|
356
|
-
def escape(token)
|
357
|
-
case token.token
|
358
|
-
|
359
|
-
when :backspace
|
360
|
-
node << EscapeSequence::Backspace.new(token, active_opts)
|
361
|
-
|
362
|
-
when :escape
|
363
|
-
node << EscapeSequence::AsciiEscape.new(token, active_opts)
|
364
|
-
when :bell
|
365
|
-
node << EscapeSequence::Bell.new(token, active_opts)
|
366
|
-
when :form_feed
|
367
|
-
node << EscapeSequence::FormFeed.new(token, active_opts)
|
368
|
-
when :newline
|
369
|
-
node << EscapeSequence::Newline.new(token, active_opts)
|
370
|
-
when :carriage
|
371
|
-
node << EscapeSequence::Return.new(token, active_opts)
|
372
|
-
when :tab
|
373
|
-
node << EscapeSequence::Tab.new(token, active_opts)
|
374
|
-
when :vertical_tab
|
375
|
-
node << EscapeSequence::VerticalTab.new(token, active_opts)
|
376
|
-
|
377
|
-
when :hex
|
378
|
-
node << EscapeSequence::Hex.new(token, active_opts)
|
379
|
-
when :octal
|
380
|
-
node << EscapeSequence::Octal.new(token, active_opts)
|
381
|
-
when :codepoint
|
382
|
-
node << EscapeSequence::Codepoint.new(token, active_opts)
|
383
|
-
when :codepoint_list
|
384
|
-
node << EscapeSequence::CodepointList.new(token, active_opts)
|
385
|
-
|
386
|
-
when :control
|
387
|
-
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
388
|
-
node << EscapeSequence::MetaControl.new(token, active_opts)
|
389
|
-
else
|
390
|
-
node << EscapeSequence::Control.new(token, active_opts)
|
391
|
-
end
|
392
|
-
|
393
|
-
when :meta_sequence
|
394
|
-
if token.text =~ /\A\\M-\\[Cc]/
|
395
|
-
node << EscapeSequence::MetaControl.new(token, active_opts)
|
396
|
-
else
|
397
|
-
node << EscapeSequence::Meta.new(token, active_opts)
|
398
|
-
end
|
399
|
-
|
400
|
-
else
|
401
|
-
# treating everything else as a literal
|
402
|
-
node << EscapeSequence::Literal.new(token, active_opts)
|
403
|
-
end
|
404
|
-
end
|
405
|
-
|
406
|
-
def keep(token)
|
407
|
-
node << Keep::Mark.new(token, active_opts)
|
408
|
-
end
|
409
|
-
|
410
|
-
def free_space(token)
|
411
|
-
case token.token
|
412
|
-
when :comment
|
413
|
-
node << Comment.new(token, active_opts)
|
414
|
-
when :whitespace
|
415
|
-
if node.last.is_a?(WhiteSpace)
|
416
|
-
node.last.merge(WhiteSpace.new(token, active_opts))
|
417
|
-
else
|
418
|
-
node << WhiteSpace.new(token, active_opts)
|
419
|
-
end
|
420
|
-
else
|
421
|
-
raise UnknownTokenError.new('FreeSpace', token)
|
422
|
-
end
|
423
|
-
end
|
424
|
-
|
425
475
|
def quantifier(token)
|
426
|
-
|
427
|
-
target_node
|
428
|
-
|
429
|
-
|
476
|
+
target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
|
477
|
+
target_node or raise ParserError, "No valid target found for '#{token.text}'"
|
478
|
+
|
479
|
+
# in case of chained quantifiers, wrap target in an implicit passive group
|
480
|
+
# description of the problem: https://github.com/ammar/regexp_parser/issues/3
|
481
|
+
# rationale for this solution: https://github.com/ammar/regexp_parser/pull/69
|
482
|
+
if target_node.quantified?
|
483
|
+
new_token = Regexp::Token.new(
|
484
|
+
:group,
|
485
|
+
:passive,
|
486
|
+
'', # text
|
487
|
+
target_node.ts,
|
488
|
+
nil, # te (unused)
|
489
|
+
target_node.level,
|
490
|
+
target_node.set_level,
|
491
|
+
target_node.conditional_level
|
492
|
+
)
|
493
|
+
new_group = Group::Passive.new(new_token, active_opts)
|
494
|
+
new_group.implicit = true
|
495
|
+
new_group << target_node
|
496
|
+
increase_level(target_node)
|
497
|
+
node.expressions[node.expressions.index(target_node)] = new_group
|
498
|
+
target_node = new_group
|
430
499
|
end
|
431
500
|
|
432
|
-
target_node || raise(ArgumentError, 'No valid target found for '\
|
433
|
-
"'#{token.text}' ")
|
434
|
-
|
435
501
|
case token.token
|
436
502
|
when :zero_or_one
|
437
503
|
target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
|
@@ -462,6 +528,11 @@ class Regexp::Parser
|
|
462
528
|
end
|
463
529
|
end
|
464
530
|
|
531
|
+
def increase_level(exp)
|
532
|
+
exp.level += 1
|
533
|
+
exp.respond_to?(:each) && exp.each { |subexp| increase_level(subexp) }
|
534
|
+
end
|
535
|
+
|
465
536
|
def interval(target_node, token)
|
466
537
|
text = token.text
|
467
538
|
mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
|
@@ -484,100 +555,16 @@ class Regexp::Parser
|
|
484
555
|
target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
|
485
556
|
end
|
486
557
|
|
487
|
-
def
|
488
|
-
case token.token
|
489
|
-
when :options, :options_switch
|
490
|
-
options_group(token)
|
491
|
-
when :close
|
492
|
-
close_group
|
493
|
-
when :comment
|
494
|
-
node << Group::Comment.new(token, active_opts)
|
495
|
-
else
|
496
|
-
open_group(token)
|
497
|
-
end
|
498
|
-
end
|
499
|
-
|
500
|
-
MOD_FLAGS = %w[i m x].map(&:to_sym)
|
501
|
-
ENC_FLAGS = %w[a d u].map(&:to_sym)
|
502
|
-
|
503
|
-
def options_group(token)
|
504
|
-
positive, negative = token.text.split('-', 2)
|
505
|
-
negative ||= ''
|
506
|
-
self.switching_options = token.token.equal?(:options_switch)
|
507
|
-
|
508
|
-
opt_changes = {}
|
509
|
-
new_active_opts = active_opts.dup
|
510
|
-
|
511
|
-
MOD_FLAGS.each do |flag|
|
512
|
-
if positive.include?(flag.to_s)
|
513
|
-
opt_changes[flag] = new_active_opts[flag] = true
|
514
|
-
end
|
515
|
-
if negative.include?(flag.to_s)
|
516
|
-
opt_changes[flag] = false
|
517
|
-
new_active_opts.delete(flag)
|
518
|
-
end
|
519
|
-
end
|
520
|
-
|
521
|
-
if (enc_flag = positive.reverse[/[adu]/])
|
522
|
-
enc_flag = enc_flag.to_sym
|
523
|
-
(ENC_FLAGS - [enc_flag]).each do |other|
|
524
|
-
opt_changes[other] = false if new_active_opts[other]
|
525
|
-
new_active_opts.delete(other)
|
526
|
-
end
|
527
|
-
opt_changes[enc_flag] = new_active_opts[enc_flag] = true
|
528
|
-
end
|
529
|
-
|
530
|
-
options_stack << new_active_opts
|
531
|
-
|
532
|
-
options_group = Group::Options.new(token, active_opts)
|
533
|
-
options_group.option_changes = opt_changes
|
534
|
-
|
535
|
-
nest(options_group)
|
536
|
-
end
|
537
|
-
|
538
|
-
def open_group(token)
|
558
|
+
def set(token)
|
539
559
|
case token.token
|
540
|
-
when :
|
541
|
-
|
542
|
-
when :
|
543
|
-
|
544
|
-
when :
|
545
|
-
exp = Group::Named.new(token, active_opts)
|
546
|
-
when :capture
|
547
|
-
exp = Group::Capture.new(token, active_opts)
|
548
|
-
when :absence
|
549
|
-
exp = Group::Absence.new(token, active_opts)
|
550
|
-
|
551
|
-
when :lookahead
|
552
|
-
exp = Assertion::Lookahead.new(token, active_opts)
|
553
|
-
when :nlookahead
|
554
|
-
exp = Assertion::NegativeLookahead.new(token, active_opts)
|
555
|
-
when :lookbehind
|
556
|
-
exp = Assertion::Lookbehind.new(token, active_opts)
|
557
|
-
when :nlookbehind
|
558
|
-
exp = Assertion::NegativeLookbehind.new(token, active_opts)
|
559
|
-
|
560
|
+
when :open; open_set(token)
|
561
|
+
when :close; close_set
|
562
|
+
when :negate; negate_set
|
563
|
+
when :range; range(token)
|
564
|
+
when :intersection; intersection(token)
|
560
565
|
else
|
561
|
-
raise UnknownTokenError.new('
|
562
|
-
end
|
563
|
-
|
564
|
-
if exp.capturing?
|
565
|
-
exp.number = total_captured_group_count + 1
|
566
|
-
exp.number_at_level = captured_group_count_at_level + 1
|
567
|
-
count_captured_group
|
566
|
+
raise UnknownTokenError.new('CharacterSet', token)
|
568
567
|
end
|
569
|
-
|
570
|
-
# Push the active options to the stack again. This way we can simply pop the
|
571
|
-
# stack for any group we close, no matter if it had its own options or not.
|
572
|
-
options_stack << active_opts
|
573
|
-
|
574
|
-
nest(exp)
|
575
|
-
end
|
576
|
-
|
577
|
-
def close_group
|
578
|
-
options_stack.pop unless switching_options
|
579
|
-
self.switching_options = false
|
580
|
-
decrease_nesting
|
581
568
|
end
|
582
569
|
|
583
570
|
def open_set(token)
|
@@ -600,51 +587,45 @@ class Regexp::Parser
|
|
600
587
|
nest(exp)
|
601
588
|
end
|
602
589
|
|
603
|
-
def close_completed_character_set_range
|
604
|
-
decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
|
605
|
-
end
|
606
|
-
|
607
590
|
def intersection(token)
|
608
591
|
sequence_operation(CharacterSet::Intersection, token)
|
609
592
|
end
|
610
593
|
|
611
|
-
def
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
594
|
+
def type(token)
|
595
|
+
case token.token
|
596
|
+
when :digit; node << CharacterType::Digit.new(token, active_opts)
|
597
|
+
when :hex; node << CharacterType::Hex.new(token, active_opts)
|
598
|
+
when :linebreak; node << CharacterType::Linebreak.new(token, active_opts)
|
599
|
+
when :nondigit; node << CharacterType::NonDigit.new(token, active_opts)
|
600
|
+
when :nonhex; node << CharacterType::NonHex.new(token, active_opts)
|
601
|
+
when :nonspace; node << CharacterType::NonSpace.new(token, active_opts)
|
602
|
+
when :nonword; node << CharacterType::NonWord.new(token, active_opts)
|
603
|
+
when :space; node << CharacterType::Space.new(token, active_opts)
|
604
|
+
when :word; node << CharacterType::Word.new(token, active_opts)
|
605
|
+
when :xgrapheme; node << CharacterType::ExtendedGrapheme.new(token, active_opts)
|
606
|
+
else
|
607
|
+
raise UnknownTokenError.new('CharacterType', token)
|
618
608
|
end
|
619
|
-
node.add_sequence(active_opts)
|
620
|
-
end
|
621
|
-
|
622
|
-
def active_opts
|
623
|
-
options_stack.last
|
624
|
-
end
|
625
|
-
|
626
|
-
def total_captured_group_count
|
627
|
-
captured_group_counts.values.reduce(0, :+)
|
628
|
-
end
|
629
|
-
|
630
|
-
def captured_group_count_at_level
|
631
|
-
captured_group_counts[node.level]
|
632
609
|
end
|
633
610
|
|
634
|
-
def
|
635
|
-
|
611
|
+
def close_completed_character_set_range
|
612
|
+
decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
|
636
613
|
end
|
637
614
|
|
638
|
-
def
|
639
|
-
|
640
|
-
exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
|
615
|
+
def active_opts
|
616
|
+
options_stack.last
|
641
617
|
end
|
642
618
|
|
619
|
+
# Assigns referenced expressions to referring expressions, e.g. if there is
|
620
|
+
# an instance of Backreference::Number, its #referenced_expression is set to
|
621
|
+
# the instance of Group::Capture that it refers to via its number.
|
643
622
|
def assign_referenced_expressions
|
644
623
|
targets = {}
|
624
|
+
# find all referencable expressions
|
645
625
|
root.each_expression do |exp|
|
646
626
|
exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
|
647
627
|
end
|
628
|
+
# assign them to any referring expressions
|
648
629
|
root.each_expression do |exp|
|
649
630
|
exp.respond_to?(:reference) &&
|
650
631
|
exp.referenced_expression = targets[exp.reference]
|