regexp_parser 2.0.2 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +54 -0
- data/Gemfile +5 -1
- data/README.md +15 -21
- data/Rakefile +11 -17
- data/lib/regexp_parser/error.rb +4 -0
- data/lib/regexp_parser/expression/base.rb +123 -0
- data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
- data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +5 -0
- data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
- data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -1
- data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +0 -0
- data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
- data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +1 -0
- data/lib/regexp_parser/expression/classes/free_space.rb +1 -3
- data/lib/regexp_parser/expression/classes/group.rb +6 -1
- data/lib/regexp_parser/expression/classes/literal.rb +1 -5
- data/lib/regexp_parser/expression/classes/property.rb +1 -3
- data/lib/regexp_parser/expression/classes/root.rb +0 -1
- data/lib/regexp_parser/expression/classes/type.rb +0 -2
- data/lib/regexp_parser/expression/quantifier.rb +2 -2
- data/lib/regexp_parser/expression/sequence.rb +3 -10
- data/lib/regexp_parser/expression/subexpression.rb +1 -2
- data/lib/regexp_parser/expression.rb +7 -130
- data/lib/regexp_parser/lexer.rb +7 -5
- data/lib/regexp_parser/parser.rb +282 -334
- data/lib/regexp_parser/scanner/properties/long.yml +13 -0
- data/lib/regexp_parser/scanner/properties/short.yml +9 -1
- data/lib/regexp_parser/scanner/scanner.rl +64 -87
- data/lib/regexp_parser/scanner.rb +1024 -1073
- data/lib/regexp_parser/syntax/any.rb +2 -4
- data/lib/regexp_parser/syntax/base.rb +10 -10
- data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
- data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens/backref.rb → token/backreference.rb} +6 -5
- data/lib/regexp_parser/syntax/{tokens → token}/character_set.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
- data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
- data/lib/regexp_parser/syntax/token/escape.rb +31 -0
- data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
- data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
- data/lib/regexp_parser/syntax/{tokens → token}/meta.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
- data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
- data/lib/regexp_parser/syntax/token/unicode_property.rb +696 -0
- data/lib/regexp_parser/syntax/token.rb +45 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +1 -1
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +10 -0
- data/lib/regexp_parser/syntax.rb +8 -6
- data/lib/regexp_parser/token.rb +9 -20
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +0 -2
- data/spec/expression/clone_spec.rb +36 -4
- data/spec/expression/free_space_spec.rb +2 -2
- data/spec/expression/methods/match_length_spec.rb +2 -2
- data/spec/lexer/nesting_spec.rb +2 -2
- data/spec/lexer/refcalls_spec.rb +5 -0
- data/spec/parser/all_spec.rb +2 -2
- data/spec/parser/escapes_spec.rb +43 -31
- data/spec/parser/properties_spec.rb +6 -4
- data/spec/parser/refcalls_spec.rb +5 -0
- data/spec/parser/set/ranges_spec.rb +26 -16
- data/spec/scanner/escapes_spec.rb +29 -20
- data/spec/scanner/refcalls_spec.rb +19 -0
- data/spec/scanner/sets_spec.rb +66 -23
- data/spec/spec_helper.rb +13 -1
- data/spec/support/capturing_stderr.rb +9 -0
- data/spec/syntax/versions/1.8.6_spec.rb +2 -2
- data/spec/syntax/versions/2.0.0_spec.rb +2 -2
- data/spec/syntax/versions/aliases_spec.rb +1 -0
- metadata +27 -26
- data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
- data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
- data/lib/regexp_parser/syntax/tokens.rb +0 -45
- data/spec/support/runner.rb +0 -42
- data/spec/support/warning_extractor.rb +0 -60
data/lib/regexp_parser/parser.rb
CHANGED
@@ -1,10 +1,10 @@
|
|
1
|
+
require 'regexp_parser/error'
|
1
2
|
require 'regexp_parser/expression'
|
2
3
|
|
3
4
|
class Regexp::Parser
|
4
5
|
include Regexp::Expression
|
5
|
-
include Regexp::Syntax
|
6
6
|
|
7
|
-
class ParserError <
|
7
|
+
class ParserError < Regexp::Parser::Error; end
|
8
8
|
|
9
9
|
class UnknownTokenTypeError < ParserError
|
10
10
|
def initialize(type, token)
|
@@ -70,95 +70,155 @@ class Regexp::Parser
|
|
70
70
|
enabled_options
|
71
71
|
end
|
72
72
|
|
73
|
-
def
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
73
|
+
def parse_token(token)
|
74
|
+
case token.type
|
75
|
+
when :anchor; anchor(token)
|
76
|
+
when :assertion, :group; group(token)
|
77
|
+
when :backref; backref(token)
|
78
|
+
when :conditional; conditional(token)
|
79
|
+
when :escape; escape(token)
|
80
|
+
when :free_space; free_space(token)
|
81
|
+
when :keep; keep(token)
|
82
|
+
when :literal; literal(token)
|
83
|
+
when :meta; meta(token)
|
84
|
+
when :posixclass, :nonposixclass; posixclass(token)
|
85
|
+
when :property, :nonproperty; property(token)
|
86
|
+
when :quantifier; quantifier(token)
|
87
|
+
when :set; set(token)
|
88
|
+
when :type; type(token)
|
89
|
+
else
|
90
|
+
raise UnknownTokenTypeError.new(token.type, token)
|
91
|
+
end
|
79
92
|
|
80
|
-
|
81
|
-
def update_transplanted_subtree(exp, new_parent)
|
82
|
-
exp.nesting_level = new_parent.nesting_level + 1
|
83
|
-
exp.respond_to?(:each) &&
|
84
|
-
exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
|
93
|
+
close_completed_character_set_range
|
85
94
|
end
|
86
95
|
|
87
|
-
def
|
88
|
-
|
89
|
-
|
90
|
-
|
96
|
+
def anchor(token)
|
97
|
+
case token.token
|
98
|
+
when :bol; node << Anchor::BeginningOfLine.new(token, active_opts)
|
99
|
+
when :bos; node << Anchor::BOS.new(token, active_opts)
|
100
|
+
when :eol; node << Anchor::EndOfLine.new(token, active_opts)
|
101
|
+
when :eos; node << Anchor::EOS.new(token, active_opts)
|
102
|
+
when :eos_ob_eol; node << Anchor::EOSobEOL.new(token, active_opts)
|
103
|
+
when :match_start; node << Anchor::MatchStart.new(token, active_opts)
|
104
|
+
when :nonword_boundary; node << Anchor::NonWordBoundary.new(token, active_opts)
|
105
|
+
when :word_boundary; node << Anchor::WordBoundary.new(token, active_opts)
|
106
|
+
else
|
107
|
+
raise UnknownTokenError.new('Anchor', token)
|
91
108
|
end
|
92
|
-
nesting.pop
|
93
|
-
yield(node) if block_given?
|
94
|
-
self.node = nesting.last
|
95
|
-
self.node = node.last if node.last.is_a?(SequenceOperation)
|
96
109
|
end
|
97
110
|
|
98
|
-
def
|
99
|
-
|
100
|
-
|
111
|
+
def group(token)
|
112
|
+
case token.token
|
113
|
+
when :options, :options_switch
|
114
|
+
options_group(token)
|
115
|
+
when :close
|
116
|
+
close_group
|
117
|
+
when :comment
|
118
|
+
node << Group::Comment.new(token, active_opts)
|
119
|
+
else
|
120
|
+
open_group(token)
|
121
|
+
end
|
101
122
|
end
|
102
123
|
|
103
|
-
|
104
|
-
|
124
|
+
MOD_FLAGS = %w[i m x].map(&:to_sym)
|
125
|
+
ENC_FLAGS = %w[a d u].map(&:to_sym)
|
105
126
|
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
when :escape; escape(token)
|
111
|
-
when :group; group(token)
|
112
|
-
when :assertion; group(token)
|
113
|
-
when :set; set(token)
|
114
|
-
when :type; type(token)
|
115
|
-
when :backref; backref(token)
|
116
|
-
when :conditional; conditional(token)
|
117
|
-
when :keep; keep(token)
|
118
|
-
|
119
|
-
when :posixclass, :nonposixclass
|
120
|
-
posixclass(token)
|
121
|
-
when :property, :nonproperty
|
122
|
-
property(token)
|
123
|
-
|
124
|
-
when :literal
|
125
|
-
node << Literal.new(token, active_opts)
|
126
|
-
when :free_space
|
127
|
-
free_space(token)
|
127
|
+
def options_group(token)
|
128
|
+
positive, negative = token.text.split('-', 2)
|
129
|
+
negative ||= ''
|
130
|
+
self.switching_options = token.token.equal?(:options_switch)
|
128
131
|
|
129
|
-
|
130
|
-
|
132
|
+
opt_changes = {}
|
133
|
+
new_active_opts = active_opts.dup
|
134
|
+
|
135
|
+
MOD_FLAGS.each do |flag|
|
136
|
+
if positive.include?(flag.to_s)
|
137
|
+
opt_changes[flag] = new_active_opts[flag] = true
|
138
|
+
end
|
139
|
+
if negative.include?(flag.to_s)
|
140
|
+
opt_changes[flag] = false
|
141
|
+
new_active_opts.delete(flag)
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
145
|
+
if (enc_flag = positive.reverse[/[adu]/])
|
146
|
+
enc_flag = enc_flag.to_sym
|
147
|
+
(ENC_FLAGS - [enc_flag]).each do |other|
|
148
|
+
opt_changes[other] = false if new_active_opts[other]
|
149
|
+
new_active_opts.delete(other)
|
150
|
+
end
|
151
|
+
opt_changes[enc_flag] = new_active_opts[enc_flag] = true
|
131
152
|
end
|
153
|
+
|
154
|
+
options_stack << new_active_opts
|
155
|
+
|
156
|
+
options_group = Group::Options.new(token, active_opts)
|
157
|
+
options_group.option_changes = opt_changes
|
158
|
+
|
159
|
+
nest(options_group)
|
132
160
|
end
|
133
161
|
|
134
|
-
def
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
162
|
+
def open_group(token)
|
163
|
+
group_class =
|
164
|
+
case token.token
|
165
|
+
when :absence; Group::Absence
|
166
|
+
when :atomic; Group::Atomic
|
167
|
+
when :capture; Group::Capture
|
168
|
+
when :named; Group::Named
|
169
|
+
when :passive; Group::Passive
|
170
|
+
|
171
|
+
when :lookahead; Assertion::Lookahead
|
172
|
+
when :lookbehind; Assertion::Lookbehind
|
173
|
+
when :nlookahead; Assertion::NegativeLookahead
|
174
|
+
when :nlookbehind; Assertion::NegativeLookbehind
|
175
|
+
|
176
|
+
else
|
177
|
+
raise UnknownTokenError.new('Group type open', token)
|
178
|
+
end
|
179
|
+
|
180
|
+
group = group_class.new(token, active_opts)
|
181
|
+
|
182
|
+
if group.capturing?
|
183
|
+
group.number = total_captured_group_count + 1
|
184
|
+
group.number_at_level = captured_group_count_at_level + 1
|
185
|
+
count_captured_group
|
150
186
|
end
|
187
|
+
|
188
|
+
# Push the active options to the stack again. This way we can simply pop the
|
189
|
+
# stack for any group we close, no matter if it had its own options or not.
|
190
|
+
options_stack << active_opts
|
191
|
+
|
192
|
+
nest(group)
|
151
193
|
end
|
152
194
|
|
153
|
-
def
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
195
|
+
def total_captured_group_count
|
196
|
+
captured_group_counts.values.reduce(0, :+)
|
197
|
+
end
|
198
|
+
|
199
|
+
def captured_group_count_at_level
|
200
|
+
captured_group_counts[node.level]
|
201
|
+
end
|
202
|
+
|
203
|
+
def count_captured_group
|
204
|
+
captured_group_counts[node.level] += 1
|
205
|
+
end
|
206
|
+
|
207
|
+
def close_group
|
208
|
+
options_stack.pop unless switching_options
|
209
|
+
self.switching_options = false
|
210
|
+
decrease_nesting
|
211
|
+
end
|
212
|
+
|
213
|
+
def decrease_nesting
|
214
|
+
while nesting.last.is_a?(SequenceOperation)
|
215
|
+
nesting.pop
|
216
|
+
self.node = nesting.last
|
161
217
|
end
|
218
|
+
nesting.pop
|
219
|
+
yield(node) if block_given?
|
220
|
+
self.node = nesting.last
|
221
|
+
self.node = node.last if node.last.is_a?(SequenceOperation)
|
162
222
|
end
|
163
223
|
|
164
224
|
def backref(token)
|
@@ -188,31 +248,9 @@ class Regexp::Parser
|
|
188
248
|
end
|
189
249
|
end
|
190
250
|
|
191
|
-
def
|
192
|
-
|
193
|
-
|
194
|
-
node << CharacterType::Digit.new(token, active_opts)
|
195
|
-
when :nondigit
|
196
|
-
node << CharacterType::NonDigit.new(token, active_opts)
|
197
|
-
when :hex
|
198
|
-
node << CharacterType::Hex.new(token, active_opts)
|
199
|
-
when :nonhex
|
200
|
-
node << CharacterType::NonHex.new(token, active_opts)
|
201
|
-
when :space
|
202
|
-
node << CharacterType::Space.new(token, active_opts)
|
203
|
-
when :nonspace
|
204
|
-
node << CharacterType::NonSpace.new(token, active_opts)
|
205
|
-
when :word
|
206
|
-
node << CharacterType::Word.new(token, active_opts)
|
207
|
-
when :nonword
|
208
|
-
node << CharacterType::NonWord.new(token, active_opts)
|
209
|
-
when :linebreak
|
210
|
-
node << CharacterType::Linebreak.new(token, active_opts)
|
211
|
-
when :xgrapheme
|
212
|
-
node << CharacterType::ExtendedGrapheme.new(token, active_opts)
|
213
|
-
else
|
214
|
-
raise UnknownTokenError.new('CharacterType', token)
|
215
|
-
end
|
251
|
+
def assign_effective_number(exp)
|
252
|
+
exp.effective_number =
|
253
|
+
exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
|
216
254
|
end
|
217
255
|
|
218
256
|
def conditional(token)
|
@@ -240,11 +278,118 @@ class Regexp::Parser
|
|
240
278
|
end
|
241
279
|
end
|
242
280
|
|
281
|
+
def nest_conditional(exp)
|
282
|
+
conditional_nesting.push(exp)
|
283
|
+
nest(exp)
|
284
|
+
end
|
285
|
+
|
286
|
+
def nest(exp)
|
287
|
+
nesting.push(exp)
|
288
|
+
node << exp
|
289
|
+
update_transplanted_subtree(exp, node)
|
290
|
+
self.node = exp
|
291
|
+
end
|
292
|
+
|
293
|
+
# subtrees are transplanted to build Alternations, Intersections, Ranges
|
294
|
+
def update_transplanted_subtree(exp, new_parent)
|
295
|
+
exp.nesting_level = new_parent.nesting_level + 1
|
296
|
+
exp.respond_to?(:each) &&
|
297
|
+
exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
|
298
|
+
end
|
299
|
+
|
300
|
+
def escape(token)
|
301
|
+
case token.token
|
302
|
+
|
303
|
+
when :backspace; node << EscapeSequence::Backspace.new(token, active_opts)
|
304
|
+
|
305
|
+
when :escape; node << EscapeSequence::AsciiEscape.new(token, active_opts)
|
306
|
+
when :bell; node << EscapeSequence::Bell.new(token, active_opts)
|
307
|
+
when :form_feed; node << EscapeSequence::FormFeed.new(token, active_opts)
|
308
|
+
when :newline; node << EscapeSequence::Newline.new(token, active_opts)
|
309
|
+
when :carriage; node << EscapeSequence::Return.new(token, active_opts)
|
310
|
+
when :tab; node << EscapeSequence::Tab.new(token, active_opts)
|
311
|
+
when :vertical_tab; node << EscapeSequence::VerticalTab.new(token, active_opts)
|
312
|
+
|
313
|
+
when :codepoint; node << EscapeSequence::Codepoint.new(token, active_opts)
|
314
|
+
when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
|
315
|
+
when :hex; node << EscapeSequence::Hex.new(token, active_opts)
|
316
|
+
when :octal; node << EscapeSequence::Octal.new(token, active_opts)
|
317
|
+
|
318
|
+
when :control
|
319
|
+
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
320
|
+
node << EscapeSequence::MetaControl.new(token, active_opts)
|
321
|
+
else
|
322
|
+
node << EscapeSequence::Control.new(token, active_opts)
|
323
|
+
end
|
324
|
+
|
325
|
+
when :meta_sequence
|
326
|
+
if token.text =~ /\A\\M-\\[Cc]/
|
327
|
+
node << EscapeSequence::MetaControl.new(token, active_opts)
|
328
|
+
else
|
329
|
+
node << EscapeSequence::Meta.new(token, active_opts)
|
330
|
+
end
|
331
|
+
|
332
|
+
else
|
333
|
+
# treating everything else as a literal
|
334
|
+
# TODO: maybe split this up a bit more in v3.0.0?
|
335
|
+
# E.g. escaped quantifiers or set meta chars are not the same
|
336
|
+
# as stuff that would be a literal even without the backslash.
|
337
|
+
# Right now, they all end up here.
|
338
|
+
node << EscapeSequence::Literal.new(token, active_opts)
|
339
|
+
end
|
340
|
+
end
|
341
|
+
|
342
|
+
def free_space(token)
|
343
|
+
case token.token
|
344
|
+
when :comment
|
345
|
+
node << Comment.new(token, active_opts)
|
346
|
+
when :whitespace
|
347
|
+
if node.last.is_a?(WhiteSpace)
|
348
|
+
node.last.merge(WhiteSpace.new(token, active_opts))
|
349
|
+
else
|
350
|
+
node << WhiteSpace.new(token, active_opts)
|
351
|
+
end
|
352
|
+
else
|
353
|
+
raise UnknownTokenError.new('FreeSpace', token)
|
354
|
+
end
|
355
|
+
end
|
356
|
+
|
357
|
+
def keep(token)
|
358
|
+
node << Keep::Mark.new(token, active_opts)
|
359
|
+
end
|
360
|
+
|
361
|
+
def literal(token)
|
362
|
+
node << Literal.new(token, active_opts)
|
363
|
+
end
|
364
|
+
|
365
|
+
def meta(token)
|
366
|
+
case token.token
|
367
|
+
when :dot
|
368
|
+
node << CharacterType::Any.new(token, active_opts)
|
369
|
+
when :alternation
|
370
|
+
sequence_operation(Alternation, token)
|
371
|
+
else
|
372
|
+
raise UnknownTokenError.new('Meta', token)
|
373
|
+
end
|
374
|
+
end
|
375
|
+
|
376
|
+
def sequence_operation(klass, token)
|
377
|
+
unless node.is_a?(klass)
|
378
|
+
operator = klass.new(token, active_opts)
|
379
|
+
sequence = operator.add_sequence(active_opts)
|
380
|
+
sequence.expressions = node.expressions
|
381
|
+
node.expressions = []
|
382
|
+
nest(operator)
|
383
|
+
end
|
384
|
+
node.add_sequence(active_opts)
|
385
|
+
end
|
386
|
+
|
243
387
|
def posixclass(token)
|
244
388
|
node << PosixClass.new(token, active_opts)
|
245
389
|
end
|
246
390
|
|
247
391
|
include Regexp::Expression::UnicodeProperty
|
392
|
+
UPTokens = Regexp::Syntax::Token::UnicodeProperty
|
248
393
|
|
249
394
|
def property(token)
|
250
395
|
case token.token
|
@@ -316,127 +461,20 @@ class Regexp::Parser
|
|
316
461
|
when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
|
317
462
|
when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
|
318
463
|
|
319
|
-
when *
|
320
|
-
node <<
|
321
|
-
|
322
|
-
when *
|
323
|
-
|
324
|
-
|
325
|
-
when *Token::UnicodeProperty::Emoji
|
326
|
-
node << Emoji.new(token, active_opts)
|
327
|
-
|
328
|
-
when *Token::UnicodeProperty::Script
|
329
|
-
node << Script.new(token, active_opts)
|
330
|
-
|
331
|
-
when *Token::UnicodeProperty::UnicodeBlock
|
332
|
-
node << Block.new(token, active_opts)
|
464
|
+
when *UPTokens::Age; node << Age.new(token, active_opts)
|
465
|
+
when *UPTokens::Derived; node << Derived.new(token, active_opts)
|
466
|
+
when *UPTokens::Emoji; node << Emoji.new(token, active_opts)
|
467
|
+
when *UPTokens::Script; node << Script.new(token, active_opts)
|
468
|
+
when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts)
|
333
469
|
|
334
470
|
else
|
335
471
|
raise UnknownTokenError.new('UnicodeProperty', token)
|
336
472
|
end
|
337
473
|
end
|
338
474
|
|
339
|
-
def anchor(token)
|
340
|
-
case token.token
|
341
|
-
when :bol
|
342
|
-
node << Anchor::BeginningOfLine.new(token, active_opts)
|
343
|
-
when :eol
|
344
|
-
node << Anchor::EndOfLine.new(token, active_opts)
|
345
|
-
when :bos
|
346
|
-
node << Anchor::BOS.new(token, active_opts)
|
347
|
-
when :eos
|
348
|
-
node << Anchor::EOS.new(token, active_opts)
|
349
|
-
when :eos_ob_eol
|
350
|
-
node << Anchor::EOSobEOL.new(token, active_opts)
|
351
|
-
when :word_boundary
|
352
|
-
node << Anchor::WordBoundary.new(token, active_opts)
|
353
|
-
when :nonword_boundary
|
354
|
-
node << Anchor::NonWordBoundary.new(token, active_opts)
|
355
|
-
when :match_start
|
356
|
-
node << Anchor::MatchStart.new(token, active_opts)
|
357
|
-
else
|
358
|
-
raise UnknownTokenError.new('Anchor', token)
|
359
|
-
end
|
360
|
-
end
|
361
|
-
|
362
|
-
def escape(token)
|
363
|
-
case token.token
|
364
|
-
|
365
|
-
when :backspace
|
366
|
-
node << EscapeSequence::Backspace.new(token, active_opts)
|
367
|
-
|
368
|
-
when :escape
|
369
|
-
node << EscapeSequence::AsciiEscape.new(token, active_opts)
|
370
|
-
when :bell
|
371
|
-
node << EscapeSequence::Bell.new(token, active_opts)
|
372
|
-
when :form_feed
|
373
|
-
node << EscapeSequence::FormFeed.new(token, active_opts)
|
374
|
-
when :newline
|
375
|
-
node << EscapeSequence::Newline.new(token, active_opts)
|
376
|
-
when :carriage
|
377
|
-
node << EscapeSequence::Return.new(token, active_opts)
|
378
|
-
when :tab
|
379
|
-
node << EscapeSequence::Tab.new(token, active_opts)
|
380
|
-
when :vertical_tab
|
381
|
-
node << EscapeSequence::VerticalTab.new(token, active_opts)
|
382
|
-
|
383
|
-
when :hex
|
384
|
-
node << EscapeSequence::Hex.new(token, active_opts)
|
385
|
-
when :octal
|
386
|
-
node << EscapeSequence::Octal.new(token, active_opts)
|
387
|
-
when :codepoint
|
388
|
-
node << EscapeSequence::Codepoint.new(token, active_opts)
|
389
|
-
when :codepoint_list
|
390
|
-
node << EscapeSequence::CodepointList.new(token, active_opts)
|
391
|
-
|
392
|
-
when :control
|
393
|
-
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
394
|
-
node << EscapeSequence::MetaControl.new(token, active_opts)
|
395
|
-
else
|
396
|
-
node << EscapeSequence::Control.new(token, active_opts)
|
397
|
-
end
|
398
|
-
|
399
|
-
when :meta_sequence
|
400
|
-
if token.text =~ /\A\\M-\\[Cc]/
|
401
|
-
node << EscapeSequence::MetaControl.new(token, active_opts)
|
402
|
-
else
|
403
|
-
node << EscapeSequence::Meta.new(token, active_opts)
|
404
|
-
end
|
405
|
-
|
406
|
-
else
|
407
|
-
# treating everything else as a literal
|
408
|
-
node << EscapeSequence::Literal.new(token, active_opts)
|
409
|
-
end
|
410
|
-
end
|
411
|
-
|
412
|
-
def keep(token)
|
413
|
-
node << Keep::Mark.new(token, active_opts)
|
414
|
-
end
|
415
|
-
|
416
|
-
def free_space(token)
|
417
|
-
case token.token
|
418
|
-
when :comment
|
419
|
-
node << Comment.new(token, active_opts)
|
420
|
-
when :whitespace
|
421
|
-
if node.last.is_a?(WhiteSpace)
|
422
|
-
node.last.merge(WhiteSpace.new(token, active_opts))
|
423
|
-
else
|
424
|
-
node << WhiteSpace.new(token, active_opts)
|
425
|
-
end
|
426
|
-
else
|
427
|
-
raise UnknownTokenError.new('FreeSpace', token)
|
428
|
-
end
|
429
|
-
end
|
430
|
-
|
431
475
|
def quantifier(token)
|
432
|
-
|
433
|
-
target_node
|
434
|
-
while target_node.is_a?(FreeSpace)
|
435
|
-
target_node = node.expressions[offset -= 1]
|
436
|
-
end
|
437
|
-
|
438
|
-
target_node || raise(ArgumentError, 'No valid target found for '\
|
439
|
-
"'#{token.text}' ")
|
476
|
+
target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
|
477
|
+
target_node or raise ParserError, "No valid target found for '#{token.text}'"
|
440
478
|
|
441
479
|
# in case of chained quantifiers, wrap target in an implicit passive group
|
442
480
|
# description of the problem: https://github.com/ammar/regexp_parser/issues/3
|
@@ -456,7 +494,7 @@ class Regexp::Parser
|
|
456
494
|
new_group.implicit = true
|
457
495
|
new_group << target_node
|
458
496
|
increase_level(target_node)
|
459
|
-
node.expressions[
|
497
|
+
node.expressions[node.expressions.index(target_node)] = new_group
|
460
498
|
target_node = new_group
|
461
499
|
end
|
462
500
|
|
@@ -517,100 +555,16 @@ class Regexp::Parser
|
|
517
555
|
target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
|
518
556
|
end
|
519
557
|
|
520
|
-
def
|
521
|
-
case token.token
|
522
|
-
when :options, :options_switch
|
523
|
-
options_group(token)
|
524
|
-
when :close
|
525
|
-
close_group
|
526
|
-
when :comment
|
527
|
-
node << Group::Comment.new(token, active_opts)
|
528
|
-
else
|
529
|
-
open_group(token)
|
530
|
-
end
|
531
|
-
end
|
532
|
-
|
533
|
-
MOD_FLAGS = %w[i m x].map(&:to_sym)
|
534
|
-
ENC_FLAGS = %w[a d u].map(&:to_sym)
|
535
|
-
|
536
|
-
def options_group(token)
|
537
|
-
positive, negative = token.text.split('-', 2)
|
538
|
-
negative ||= ''
|
539
|
-
self.switching_options = token.token.equal?(:options_switch)
|
540
|
-
|
541
|
-
opt_changes = {}
|
542
|
-
new_active_opts = active_opts.dup
|
543
|
-
|
544
|
-
MOD_FLAGS.each do |flag|
|
545
|
-
if positive.include?(flag.to_s)
|
546
|
-
opt_changes[flag] = new_active_opts[flag] = true
|
547
|
-
end
|
548
|
-
if negative.include?(flag.to_s)
|
549
|
-
opt_changes[flag] = false
|
550
|
-
new_active_opts.delete(flag)
|
551
|
-
end
|
552
|
-
end
|
553
|
-
|
554
|
-
if (enc_flag = positive.reverse[/[adu]/])
|
555
|
-
enc_flag = enc_flag.to_sym
|
556
|
-
(ENC_FLAGS - [enc_flag]).each do |other|
|
557
|
-
opt_changes[other] = false if new_active_opts[other]
|
558
|
-
new_active_opts.delete(other)
|
559
|
-
end
|
560
|
-
opt_changes[enc_flag] = new_active_opts[enc_flag] = true
|
561
|
-
end
|
562
|
-
|
563
|
-
options_stack << new_active_opts
|
564
|
-
|
565
|
-
options_group = Group::Options.new(token, active_opts)
|
566
|
-
options_group.option_changes = opt_changes
|
567
|
-
|
568
|
-
nest(options_group)
|
569
|
-
end
|
570
|
-
|
571
|
-
def open_group(token)
|
558
|
+
def set(token)
|
572
559
|
case token.token
|
573
|
-
when :
|
574
|
-
|
575
|
-
when :
|
576
|
-
|
577
|
-
when :
|
578
|
-
exp = Group::Named.new(token, active_opts)
|
579
|
-
when :capture
|
580
|
-
exp = Group::Capture.new(token, active_opts)
|
581
|
-
when :absence
|
582
|
-
exp = Group::Absence.new(token, active_opts)
|
583
|
-
|
584
|
-
when :lookahead
|
585
|
-
exp = Assertion::Lookahead.new(token, active_opts)
|
586
|
-
when :nlookahead
|
587
|
-
exp = Assertion::NegativeLookahead.new(token, active_opts)
|
588
|
-
when :lookbehind
|
589
|
-
exp = Assertion::Lookbehind.new(token, active_opts)
|
590
|
-
when :nlookbehind
|
591
|
-
exp = Assertion::NegativeLookbehind.new(token, active_opts)
|
592
|
-
|
560
|
+
when :open; open_set(token)
|
561
|
+
when :close; close_set
|
562
|
+
when :negate; negate_set
|
563
|
+
when :range; range(token)
|
564
|
+
when :intersection; intersection(token)
|
593
565
|
else
|
594
|
-
raise UnknownTokenError.new('
|
595
|
-
end
|
596
|
-
|
597
|
-
if exp.capturing?
|
598
|
-
exp.number = total_captured_group_count + 1
|
599
|
-
exp.number_at_level = captured_group_count_at_level + 1
|
600
|
-
count_captured_group
|
566
|
+
raise UnknownTokenError.new('CharacterSet', token)
|
601
567
|
end
|
602
|
-
|
603
|
-
# Push the active options to the stack again. This way we can simply pop the
|
604
|
-
# stack for any group we close, no matter if it had its own options or not.
|
605
|
-
options_stack << active_opts
|
606
|
-
|
607
|
-
nest(exp)
|
608
|
-
end
|
609
|
-
|
610
|
-
def close_group
|
611
|
-
options_stack.pop unless switching_options
|
612
|
-
self.switching_options = false
|
613
|
-
decrease_nesting
|
614
568
|
end
|
615
569
|
|
616
570
|
def open_set(token)
|
@@ -633,51 +587,45 @@ class Regexp::Parser
|
|
633
587
|
nest(exp)
|
634
588
|
end
|
635
589
|
|
636
|
-
def close_completed_character_set_range
|
637
|
-
decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
|
638
|
-
end
|
639
|
-
|
640
590
|
def intersection(token)
|
641
591
|
sequence_operation(CharacterSet::Intersection, token)
|
642
592
|
end
|
643
593
|
|
644
|
-
def
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
594
|
+
def type(token)
|
595
|
+
case token.token
|
596
|
+
when :digit; node << CharacterType::Digit.new(token, active_opts)
|
597
|
+
when :hex; node << CharacterType::Hex.new(token, active_opts)
|
598
|
+
when :linebreak; node << CharacterType::Linebreak.new(token, active_opts)
|
599
|
+
when :nondigit; node << CharacterType::NonDigit.new(token, active_opts)
|
600
|
+
when :nonhex; node << CharacterType::NonHex.new(token, active_opts)
|
601
|
+
when :nonspace; node << CharacterType::NonSpace.new(token, active_opts)
|
602
|
+
when :nonword; node << CharacterType::NonWord.new(token, active_opts)
|
603
|
+
when :space; node << CharacterType::Space.new(token, active_opts)
|
604
|
+
when :word; node << CharacterType::Word.new(token, active_opts)
|
605
|
+
when :xgrapheme; node << CharacterType::ExtendedGrapheme.new(token, active_opts)
|
606
|
+
else
|
607
|
+
raise UnknownTokenError.new('CharacterType', token)
|
651
608
|
end
|
652
|
-
node.add_sequence(active_opts)
|
653
|
-
end
|
654
|
-
|
655
|
-
def active_opts
|
656
|
-
options_stack.last
|
657
|
-
end
|
658
|
-
|
659
|
-
def total_captured_group_count
|
660
|
-
captured_group_counts.values.reduce(0, :+)
|
661
|
-
end
|
662
|
-
|
663
|
-
def captured_group_count_at_level
|
664
|
-
captured_group_counts[node.level]
|
665
609
|
end
|
666
610
|
|
667
|
-
def
|
668
|
-
|
611
|
+
def close_completed_character_set_range
|
612
|
+
decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
|
669
613
|
end
|
670
614
|
|
671
|
-
def
|
672
|
-
|
673
|
-
exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
|
615
|
+
def active_opts
|
616
|
+
options_stack.last
|
674
617
|
end
|
675
618
|
|
619
|
+
# Assigns referenced expressions to refering expressions, e.g. if there is
|
620
|
+
# an instance of Backreference::Number, its #referenced_expression is set to
|
621
|
+
# the instance of Group::Capture that it refers to via its number.
|
676
622
|
def assign_referenced_expressions
|
677
623
|
targets = {}
|
624
|
+
# find all referencable expressions
|
678
625
|
root.each_expression do |exp|
|
679
626
|
exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
|
680
627
|
end
|
628
|
+
# assign them to any refering expressions
|
681
629
|
root.each_expression do |exp|
|
682
630
|
exp.respond_to?(:reference) &&
|
683
631
|
exp.referenced_expression = targets[exp.reference]
|