regexp_parser 1.8.2 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +93 -0
- data/Gemfile +6 -1
- data/README.md +1 -4
- data/Rakefile +8 -8
- data/lib/regexp_parser.rb +1 -0
- data/lib/regexp_parser/error.rb +4 -0
- data/lib/regexp_parser/expression.rb +5 -18
- data/lib/regexp_parser/expression/classes/backref.rb +5 -0
- data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
- data/lib/regexp_parser/expression/classes/free_space.rb +2 -2
- data/lib/regexp_parser/expression/classes/group.rb +28 -3
- data/lib/regexp_parser/expression/classes/property.rb +1 -1
- data/lib/regexp_parser/expression/classes/root.rb +4 -16
- data/lib/regexp_parser/expression/classes/set/range.rb +2 -1
- data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
- data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
- data/lib/regexp_parser/expression/quantifier.rb +10 -1
- data/lib/regexp_parser/expression/sequence.rb +3 -19
- data/lib/regexp_parser/expression/subexpression.rb +1 -1
- data/lib/regexp_parser/lexer.rb +2 -2
- data/lib/regexp_parser/parser.rb +306 -332
- data/lib/regexp_parser/scanner.rb +1272 -1338
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +206 -238
- data/lib/regexp_parser/syntax.rb +7 -7
- data/lib/regexp_parser/syntax/any.rb +3 -3
- data/lib/regexp_parser/syntax/base.rb +1 -1
- data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
- data/lib/regexp_parser/syntax/versions.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- data/spec/expression/base_spec.rb +10 -0
- data/spec/expression/clone_spec.rb +36 -4
- data/spec/expression/free_space_spec.rb +2 -2
- data/spec/expression/methods/match_length_spec.rb +2 -2
- data/spec/expression/subexpression_spec.rb +1 -1
- data/spec/expression/to_s_spec.rb +39 -31
- data/spec/lexer/literals_spec.rb +24 -49
- data/spec/lexer/refcalls_spec.rb +5 -0
- data/spec/parser/all_spec.rb +2 -2
- data/spec/parser/errors_spec.rb +1 -1
- data/spec/parser/escapes_spec.rb +1 -1
- data/spec/parser/quantifiers_spec.rb +16 -0
- data/spec/parser/refcalls_spec.rb +5 -0
- data/spec/parser/set/ranges_spec.rb +3 -3
- data/spec/scanner/escapes_spec.rb +8 -1
- data/spec/scanner/groups_spec.rb +10 -1
- data/spec/scanner/literals_spec.rb +28 -38
- data/spec/scanner/quantifiers_spec.rb +18 -13
- data/spec/scanner/refcalls_spec.rb +19 -0
- data/spec/scanner/sets_spec.rb +65 -16
- data/spec/spec_helper.rb +1 -0
- metadata +4 -7
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
@@ -7,16 +7,6 @@ module Regexp::Expression
|
|
7
7
|
# Used as the base class for the Alternation alternatives, Conditional
|
8
8
|
# branches, and CharacterSet::Intersection intersected sequences.
|
9
9
|
class Sequence < Regexp::Expression::Subexpression
|
10
|
-
# TODO: this override is here for backwards compatibility, remove in 2.0.0
|
11
|
-
def initialize(*args)
|
12
|
-
if args.count == 3
|
13
|
-
warn('WARNING: Sequence.new without a Regexp::Token argument is '\
|
14
|
-
'deprecated and will be removed in 2.0.0.')
|
15
|
-
return self.class.at_levels(*args)
|
16
|
-
end
|
17
|
-
super
|
18
|
-
end
|
19
|
-
|
20
10
|
class << self
|
21
11
|
def add_to(subexpression, params = {}, active_opts = {})
|
22
12
|
sequence = at_levels(
|
@@ -51,17 +41,11 @@ module Regexp::Expression
|
|
51
41
|
alias :ts :starts_at
|
52
42
|
|
53
43
|
def quantify(token, text, min = nil, max = nil, mode = :greedy)
|
54
|
-
|
55
|
-
target
|
56
|
-
|
57
|
-
target = expressions[offset -= 1]
|
58
|
-
end
|
59
|
-
|
60
|
-
target || raise(ArgumentError, "No valid target found for '#{text}' "\
|
61
|
-
'quantifier')
|
44
|
+
target = expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
|
45
|
+
target or raise Regexp::Parser::Error,
|
46
|
+
"No valid target found for '#{text}' quantifier"
|
62
47
|
|
63
48
|
target.quantify(token, text, min, max, mode)
|
64
49
|
end
|
65
50
|
end
|
66
|
-
|
67
51
|
end
|
data/lib/regexp_parser/lexer.rb
CHANGED
@@ -96,10 +96,10 @@ class Regexp::Lexer
|
|
96
96
|
|
97
97
|
tokens.pop
|
98
98
|
tokens << Regexp::Token.new(:literal, :literal, lead,
|
99
|
-
token.ts, (token.te - last.
|
99
|
+
token.ts, (token.te - last.length),
|
100
100
|
nesting, set_nesting, conditional_nesting)
|
101
101
|
tokens << Regexp::Token.new(:literal, :literal, last,
|
102
|
-
(token.ts + lead.
|
102
|
+
(token.ts + lead.length), token.te,
|
103
103
|
nesting, set_nesting, conditional_nesting)
|
104
104
|
end
|
105
105
|
|
data/lib/regexp_parser/parser.rb
CHANGED
@@ -2,9 +2,8 @@ require 'regexp_parser/expression'
|
|
2
2
|
|
3
3
|
class Regexp::Parser
|
4
4
|
include Regexp::Expression
|
5
|
-
include Regexp::Syntax
|
6
5
|
|
7
|
-
class ParserError <
|
6
|
+
class ParserError < Regexp::Parser::Error; end
|
8
7
|
|
9
8
|
class UnknownTokenTypeError < ParserError
|
10
9
|
def initialize(type, token)
|
@@ -70,95 +69,155 @@ class Regexp::Parser
|
|
70
69
|
enabled_options
|
71
70
|
end
|
72
71
|
|
73
|
-
def
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
72
|
+
def parse_token(token)
|
73
|
+
case token.type
|
74
|
+
when :anchor; anchor(token)
|
75
|
+
when :assertion, :group; group(token)
|
76
|
+
when :backref; backref(token)
|
77
|
+
when :conditional; conditional(token)
|
78
|
+
when :escape; escape(token)
|
79
|
+
when :free_space; free_space(token)
|
80
|
+
when :keep; keep(token)
|
81
|
+
when :literal; literal(token)
|
82
|
+
when :meta; meta(token)
|
83
|
+
when :posixclass, :nonposixclass; posixclass(token)
|
84
|
+
when :property, :nonproperty; property(token)
|
85
|
+
when :quantifier; quantifier(token)
|
86
|
+
when :set; set(token)
|
87
|
+
when :type; type(token)
|
88
|
+
else
|
89
|
+
raise UnknownTokenTypeError.new(token.type, token)
|
90
|
+
end
|
79
91
|
|
80
|
-
|
81
|
-
def update_transplanted_subtree(exp, new_parent)
|
82
|
-
exp.nesting_level = new_parent.nesting_level + 1
|
83
|
-
exp.respond_to?(:each) &&
|
84
|
-
exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
|
92
|
+
close_completed_character_set_range
|
85
93
|
end
|
86
94
|
|
87
|
-
def
|
88
|
-
|
89
|
-
|
90
|
-
|
95
|
+
def anchor(token)
|
96
|
+
case token.token
|
97
|
+
when :bol; node << Anchor::BeginningOfLine.new(token, active_opts)
|
98
|
+
when :bos; node << Anchor::BOS.new(token, active_opts)
|
99
|
+
when :eol; node << Anchor::EndOfLine.new(token, active_opts)
|
100
|
+
when :eos; node << Anchor::EOS.new(token, active_opts)
|
101
|
+
when :eos_ob_eol; node << Anchor::EOSobEOL.new(token, active_opts)
|
102
|
+
when :match_start; node << Anchor::MatchStart.new(token, active_opts)
|
103
|
+
when :nonword_boundary; node << Anchor::NonWordBoundary.new(token, active_opts)
|
104
|
+
when :word_boundary; node << Anchor::WordBoundary.new(token, active_opts)
|
105
|
+
else
|
106
|
+
raise UnknownTokenError.new('Anchor', token)
|
91
107
|
end
|
92
|
-
nesting.pop
|
93
|
-
yield(node) if block_given?
|
94
|
-
self.node = nesting.last
|
95
|
-
self.node = node.last if node.last.is_a?(SequenceOperation)
|
96
108
|
end
|
97
109
|
|
98
|
-
def
|
99
|
-
|
100
|
-
|
110
|
+
def group(token)
|
111
|
+
case token.token
|
112
|
+
when :options, :options_switch
|
113
|
+
options_group(token)
|
114
|
+
when :close
|
115
|
+
close_group
|
116
|
+
when :comment
|
117
|
+
node << Group::Comment.new(token, active_opts)
|
118
|
+
else
|
119
|
+
open_group(token)
|
120
|
+
end
|
101
121
|
end
|
102
122
|
|
103
|
-
|
104
|
-
|
123
|
+
MOD_FLAGS = %w[i m x].map(&:to_sym)
|
124
|
+
ENC_FLAGS = %w[a d u].map(&:to_sym)
|
105
125
|
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
when :escape; escape(token)
|
111
|
-
when :group; group(token)
|
112
|
-
when :assertion; group(token)
|
113
|
-
when :set; set(token)
|
114
|
-
when :type; type(token)
|
115
|
-
when :backref; backref(token)
|
116
|
-
when :conditional; conditional(token)
|
117
|
-
when :keep; keep(token)
|
118
|
-
|
119
|
-
when :posixclass, :nonposixclass
|
120
|
-
posixclass(token)
|
121
|
-
when :property, :nonproperty
|
122
|
-
property(token)
|
123
|
-
|
124
|
-
when :literal
|
125
|
-
node << Literal.new(token, active_opts)
|
126
|
-
when :free_space
|
127
|
-
free_space(token)
|
126
|
+
def options_group(token)
|
127
|
+
positive, negative = token.text.split('-', 2)
|
128
|
+
negative ||= ''
|
129
|
+
self.switching_options = token.token.equal?(:options_switch)
|
128
130
|
|
129
|
-
|
130
|
-
|
131
|
+
opt_changes = {}
|
132
|
+
new_active_opts = active_opts.dup
|
133
|
+
|
134
|
+
MOD_FLAGS.each do |flag|
|
135
|
+
if positive.include?(flag.to_s)
|
136
|
+
opt_changes[flag] = new_active_opts[flag] = true
|
137
|
+
end
|
138
|
+
if negative.include?(flag.to_s)
|
139
|
+
opt_changes[flag] = false
|
140
|
+
new_active_opts.delete(flag)
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
144
|
+
if (enc_flag = positive.reverse[/[adu]/])
|
145
|
+
enc_flag = enc_flag.to_sym
|
146
|
+
(ENC_FLAGS - [enc_flag]).each do |other|
|
147
|
+
opt_changes[other] = false if new_active_opts[other]
|
148
|
+
new_active_opts.delete(other)
|
149
|
+
end
|
150
|
+
opt_changes[enc_flag] = new_active_opts[enc_flag] = true
|
131
151
|
end
|
152
|
+
|
153
|
+
options_stack << new_active_opts
|
154
|
+
|
155
|
+
options_group = Group::Options.new(token, active_opts)
|
156
|
+
options_group.option_changes = opt_changes
|
157
|
+
|
158
|
+
nest(options_group)
|
132
159
|
end
|
133
160
|
|
134
|
-
def
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
161
|
+
def open_group(token)
|
162
|
+
group_class =
|
163
|
+
case token.token
|
164
|
+
when :absence; Group::Absence
|
165
|
+
when :atomic; Group::Atomic
|
166
|
+
when :capture; Group::Capture
|
167
|
+
when :named; Group::Named
|
168
|
+
when :passive; Group::Passive
|
169
|
+
|
170
|
+
when :lookahead; Assertion::Lookahead
|
171
|
+
when :lookbehind; Assertion::Lookbehind
|
172
|
+
when :nlookahead; Assertion::NegativeLookahead
|
173
|
+
when :nlookbehind; Assertion::NegativeLookbehind
|
174
|
+
|
175
|
+
else
|
176
|
+
raise UnknownTokenError.new('Group type open', token)
|
177
|
+
end
|
178
|
+
|
179
|
+
group = group_class.new(token, active_opts)
|
180
|
+
|
181
|
+
if group.capturing?
|
182
|
+
group.number = total_captured_group_count + 1
|
183
|
+
group.number_at_level = captured_group_count_at_level + 1
|
184
|
+
count_captured_group
|
150
185
|
end
|
186
|
+
|
187
|
+
# Push the active options to the stack again. This way we can simply pop the
|
188
|
+
# stack for any group we close, no matter if it had its own options or not.
|
189
|
+
options_stack << active_opts
|
190
|
+
|
191
|
+
nest(group)
|
151
192
|
end
|
152
193
|
|
153
|
-
def
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
194
|
+
def total_captured_group_count
|
195
|
+
captured_group_counts.values.reduce(0, :+)
|
196
|
+
end
|
197
|
+
|
198
|
+
def captured_group_count_at_level
|
199
|
+
captured_group_counts[node.level]
|
200
|
+
end
|
201
|
+
|
202
|
+
def count_captured_group
|
203
|
+
captured_group_counts[node.level] += 1
|
204
|
+
end
|
205
|
+
|
206
|
+
def close_group
|
207
|
+
options_stack.pop unless switching_options
|
208
|
+
self.switching_options = false
|
209
|
+
decrease_nesting
|
210
|
+
end
|
211
|
+
|
212
|
+
def decrease_nesting
|
213
|
+
while nesting.last.is_a?(SequenceOperation)
|
214
|
+
nesting.pop
|
215
|
+
self.node = nesting.last
|
161
216
|
end
|
217
|
+
nesting.pop
|
218
|
+
yield(node) if block_given?
|
219
|
+
self.node = nesting.last
|
220
|
+
self.node = node.last if node.last.is_a?(SequenceOperation)
|
162
221
|
end
|
163
222
|
|
164
223
|
def backref(token)
|
@@ -188,31 +247,9 @@ class Regexp::Parser
|
|
188
247
|
end
|
189
248
|
end
|
190
249
|
|
191
|
-
def
|
192
|
-
|
193
|
-
|
194
|
-
node << CharacterType::Digit.new(token, active_opts)
|
195
|
-
when :nondigit
|
196
|
-
node << CharacterType::NonDigit.new(token, active_opts)
|
197
|
-
when :hex
|
198
|
-
node << CharacterType::Hex.new(token, active_opts)
|
199
|
-
when :nonhex
|
200
|
-
node << CharacterType::NonHex.new(token, active_opts)
|
201
|
-
when :space
|
202
|
-
node << CharacterType::Space.new(token, active_opts)
|
203
|
-
when :nonspace
|
204
|
-
node << CharacterType::NonSpace.new(token, active_opts)
|
205
|
-
when :word
|
206
|
-
node << CharacterType::Word.new(token, active_opts)
|
207
|
-
when :nonword
|
208
|
-
node << CharacterType::NonWord.new(token, active_opts)
|
209
|
-
when :linebreak
|
210
|
-
node << CharacterType::Linebreak.new(token, active_opts)
|
211
|
-
when :xgrapheme
|
212
|
-
node << CharacterType::ExtendedGrapheme.new(token, active_opts)
|
213
|
-
else
|
214
|
-
raise UnknownTokenError.new('CharacterType', token)
|
215
|
-
end
|
250
|
+
def assign_effective_number(exp)
|
251
|
+
exp.effective_number =
|
252
|
+
exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
|
216
253
|
end
|
217
254
|
|
218
255
|
def conditional(token)
|
@@ -240,11 +277,118 @@ class Regexp::Parser
|
|
240
277
|
end
|
241
278
|
end
|
242
279
|
|
280
|
+
def nest_conditional(exp)
|
281
|
+
conditional_nesting.push(exp)
|
282
|
+
nest(exp)
|
283
|
+
end
|
284
|
+
|
285
|
+
def nest(exp)
|
286
|
+
nesting.push(exp)
|
287
|
+
node << exp
|
288
|
+
update_transplanted_subtree(exp, node)
|
289
|
+
self.node = exp
|
290
|
+
end
|
291
|
+
|
292
|
+
# subtrees are transplanted to build Alternations, Intersections, Ranges
|
293
|
+
def update_transplanted_subtree(exp, new_parent)
|
294
|
+
exp.nesting_level = new_parent.nesting_level + 1
|
295
|
+
exp.respond_to?(:each) &&
|
296
|
+
exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
|
297
|
+
end
|
298
|
+
|
299
|
+
def escape(token)
|
300
|
+
case token.token
|
301
|
+
|
302
|
+
when :backspace; node << EscapeSequence::Backspace.new(token, active_opts)
|
303
|
+
|
304
|
+
when :escape; node << EscapeSequence::AsciiEscape.new(token, active_opts)
|
305
|
+
when :bell; node << EscapeSequence::Bell.new(token, active_opts)
|
306
|
+
when :form_feed; node << EscapeSequence::FormFeed.new(token, active_opts)
|
307
|
+
when :newline; node << EscapeSequence::Newline.new(token, active_opts)
|
308
|
+
when :carriage; node << EscapeSequence::Return.new(token, active_opts)
|
309
|
+
when :tab; node << EscapeSequence::Tab.new(token, active_opts)
|
310
|
+
when :vertical_tab; node << EscapeSequence::VerticalTab.new(token, active_opts)
|
311
|
+
|
312
|
+
when :codepoint; node << EscapeSequence::Codepoint.new(token, active_opts)
|
313
|
+
when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
|
314
|
+
when :hex; node << EscapeSequence::Hex.new(token, active_opts)
|
315
|
+
when :octal; node << EscapeSequence::Octal.new(token, active_opts)
|
316
|
+
|
317
|
+
when :control
|
318
|
+
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
319
|
+
node << EscapeSequence::MetaControl.new(token, active_opts)
|
320
|
+
else
|
321
|
+
node << EscapeSequence::Control.new(token, active_opts)
|
322
|
+
end
|
323
|
+
|
324
|
+
when :meta_sequence
|
325
|
+
if token.text =~ /\A\\M-\\[Cc]/
|
326
|
+
node << EscapeSequence::MetaControl.new(token, active_opts)
|
327
|
+
else
|
328
|
+
node << EscapeSequence::Meta.new(token, active_opts)
|
329
|
+
end
|
330
|
+
|
331
|
+
else
|
332
|
+
# treating everything else as a literal
|
333
|
+
# TODO: maybe split this up a bit more in v3.0.0?
|
334
|
+
# E.g. escaped quantifiers or set meta chars are not the same
|
335
|
+
# as stuff that would be a literal even without the backslash.
|
336
|
+
# Right now, they all end up here.
|
337
|
+
node << EscapeSequence::Literal.new(token, active_opts)
|
338
|
+
end
|
339
|
+
end
|
340
|
+
|
341
|
+
def free_space(token)
|
342
|
+
case token.token
|
343
|
+
when :comment
|
344
|
+
node << Comment.new(token, active_opts)
|
345
|
+
when :whitespace
|
346
|
+
if node.last.is_a?(WhiteSpace)
|
347
|
+
node.last.merge(WhiteSpace.new(token, active_opts))
|
348
|
+
else
|
349
|
+
node << WhiteSpace.new(token, active_opts)
|
350
|
+
end
|
351
|
+
else
|
352
|
+
raise UnknownTokenError.new('FreeSpace', token)
|
353
|
+
end
|
354
|
+
end
|
355
|
+
|
356
|
+
def keep(token)
|
357
|
+
node << Keep::Mark.new(token, active_opts)
|
358
|
+
end
|
359
|
+
|
360
|
+
def literal(token)
|
361
|
+
node << Literal.new(token, active_opts)
|
362
|
+
end
|
363
|
+
|
364
|
+
def meta(token)
|
365
|
+
case token.token
|
366
|
+
when :dot
|
367
|
+
node << CharacterType::Any.new(token, active_opts)
|
368
|
+
when :alternation
|
369
|
+
sequence_operation(Alternation, token)
|
370
|
+
else
|
371
|
+
raise UnknownTokenError.new('Meta', token)
|
372
|
+
end
|
373
|
+
end
|
374
|
+
|
375
|
+
def sequence_operation(klass, token)
|
376
|
+
unless node.is_a?(klass)
|
377
|
+
operator = klass.new(token, active_opts)
|
378
|
+
sequence = operator.add_sequence(active_opts)
|
379
|
+
sequence.expressions = node.expressions
|
380
|
+
node.expressions = []
|
381
|
+
nest(operator)
|
382
|
+
end
|
383
|
+
node.add_sequence(active_opts)
|
384
|
+
end
|
385
|
+
|
243
386
|
def posixclass(token)
|
244
387
|
node << PosixClass.new(token, active_opts)
|
245
388
|
end
|
246
389
|
|
247
390
|
include Regexp::Expression::UnicodeProperty
|
391
|
+
UPTokens = Regexp::Syntax::Token::UnicodeProperty
|
248
392
|
|
249
393
|
def property(token)
|
250
394
|
case token.token
|
@@ -316,128 +460,43 @@ class Regexp::Parser
|
|
316
460
|
when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
|
317
461
|
when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
|
318
462
|
|
319
|
-
when *
|
320
|
-
node <<
|
321
|
-
|
322
|
-
when *
|
323
|
-
|
324
|
-
|
325
|
-
when *Token::UnicodeProperty::Emoji
|
326
|
-
node << Emoji.new(token, active_opts)
|
327
|
-
|
328
|
-
when *Token::UnicodeProperty::Script
|
329
|
-
node << Script.new(token, active_opts)
|
330
|
-
|
331
|
-
when *Token::UnicodeProperty::UnicodeBlock
|
332
|
-
node << Block.new(token, active_opts)
|
463
|
+
when *UPTokens::Age; node << Age.new(token, active_opts)
|
464
|
+
when *UPTokens::Derived; node << Derived.new(token, active_opts)
|
465
|
+
when *UPTokens::Emoji; node << Emoji.new(token, active_opts)
|
466
|
+
when *UPTokens::Script; node << Script.new(token, active_opts)
|
467
|
+
when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts)
|
333
468
|
|
334
469
|
else
|
335
470
|
raise UnknownTokenError.new('UnicodeProperty', token)
|
336
471
|
end
|
337
472
|
end
|
338
473
|
|
339
|
-
def anchor(token)
|
340
|
-
case token.token
|
341
|
-
when :bol
|
342
|
-
node << Anchor::BeginningOfLine.new(token, active_opts)
|
343
|
-
when :eol
|
344
|
-
node << Anchor::EndOfLine.new(token, active_opts)
|
345
|
-
when :bos
|
346
|
-
node << Anchor::BOS.new(token, active_opts)
|
347
|
-
when :eos
|
348
|
-
node << Anchor::EOS.new(token, active_opts)
|
349
|
-
when :eos_ob_eol
|
350
|
-
node << Anchor::EOSobEOL.new(token, active_opts)
|
351
|
-
when :word_boundary
|
352
|
-
node << Anchor::WordBoundary.new(token, active_opts)
|
353
|
-
when :nonword_boundary
|
354
|
-
node << Anchor::NonWordBoundary.new(token, active_opts)
|
355
|
-
when :match_start
|
356
|
-
node << Anchor::MatchStart.new(token, active_opts)
|
357
|
-
else
|
358
|
-
raise UnknownTokenError.new('Anchor', token)
|
359
|
-
end
|
360
|
-
end
|
361
|
-
|
362
|
-
def escape(token)
|
363
|
-
case token.token
|
364
|
-
|
365
|
-
when :backspace
|
366
|
-
node << EscapeSequence::Backspace.new(token, active_opts)
|
367
|
-
|
368
|
-
when :escape
|
369
|
-
node << EscapeSequence::AsciiEscape.new(token, active_opts)
|
370
|
-
when :bell
|
371
|
-
node << EscapeSequence::Bell.new(token, active_opts)
|
372
|
-
when :form_feed
|
373
|
-
node << EscapeSequence::FormFeed.new(token, active_opts)
|
374
|
-
when :newline
|
375
|
-
node << EscapeSequence::Newline.new(token, active_opts)
|
376
|
-
when :carriage
|
377
|
-
node << EscapeSequence::Return.new(token, active_opts)
|
378
|
-
when :tab
|
379
|
-
node << EscapeSequence::Tab.new(token, active_opts)
|
380
|
-
when :vertical_tab
|
381
|
-
node << EscapeSequence::VerticalTab.new(token, active_opts)
|
382
|
-
|
383
|
-
when :hex
|
384
|
-
node << EscapeSequence::Hex.new(token, active_opts)
|
385
|
-
when :octal
|
386
|
-
node << EscapeSequence::Octal.new(token, active_opts)
|
387
|
-
when :codepoint
|
388
|
-
node << EscapeSequence::Codepoint.new(token, active_opts)
|
389
|
-
when :codepoint_list
|
390
|
-
node << EscapeSequence::CodepointList.new(token, active_opts)
|
391
|
-
|
392
|
-
when :control
|
393
|
-
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
394
|
-
node << EscapeSequence::MetaControl.new(token, active_opts)
|
395
|
-
else
|
396
|
-
node << EscapeSequence::Control.new(token, active_opts)
|
397
|
-
end
|
398
|
-
|
399
|
-
when :meta_sequence
|
400
|
-
if token.text =~ /\A\\M-\\[Cc]/
|
401
|
-
node << EscapeSequence::MetaControl.new(token, active_opts)
|
402
|
-
else
|
403
|
-
node << EscapeSequence::Meta.new(token, active_opts)
|
404
|
-
end
|
405
|
-
|
406
|
-
else
|
407
|
-
# treating everything else as a literal
|
408
|
-
node << EscapeSequence::Literal.new(token, active_opts)
|
409
|
-
end
|
410
|
-
end
|
411
|
-
|
412
|
-
def keep(token)
|
413
|
-
node << Keep::Mark.new(token, active_opts)
|
414
|
-
end
|
415
|
-
|
416
|
-
def free_space(token)
|
417
|
-
case token.token
|
418
|
-
when :comment
|
419
|
-
node << Comment.new(token, active_opts)
|
420
|
-
when :whitespace
|
421
|
-
if node.last.is_a?(WhiteSpace)
|
422
|
-
node.last.merge(WhiteSpace.new(token, active_opts))
|
423
|
-
else
|
424
|
-
node << WhiteSpace.new(token, active_opts)
|
425
|
-
end
|
426
|
-
else
|
427
|
-
raise UnknownTokenError.new('FreeSpace', token)
|
428
|
-
end
|
429
|
-
end
|
430
|
-
|
431
474
|
def quantifier(token)
|
432
|
-
|
433
|
-
target_node
|
434
|
-
|
435
|
-
|
475
|
+
target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
|
476
|
+
target_node or raise ParserError, "No valid target found for '#{token.text}'"
|
477
|
+
|
478
|
+
# in case of chained quantifiers, wrap target in an implicit passive group
|
479
|
+
# description of the problem: https://github.com/ammar/regexp_parser/issues/3
|
480
|
+
# rationale for this solution: https://github.com/ammar/regexp_parser/pull/69
|
481
|
+
if target_node.quantified?
|
482
|
+
new_token = Regexp::Token.new(
|
483
|
+
:group,
|
484
|
+
:passive,
|
485
|
+
'', # text
|
486
|
+
target_node.ts,
|
487
|
+
nil, # te (unused)
|
488
|
+
target_node.level,
|
489
|
+
target_node.set_level,
|
490
|
+
target_node.conditional_level
|
491
|
+
)
|
492
|
+
new_group = Group::Passive.new(new_token, active_opts)
|
493
|
+
new_group.implicit = true
|
494
|
+
new_group << target_node
|
495
|
+
increase_level(target_node)
|
496
|
+
node.expressions[node.expressions.index(target_node)] = new_group
|
497
|
+
target_node = new_group
|
436
498
|
end
|
437
499
|
|
438
|
-
target_node || raise(ArgumentError, 'No valid target found for '\
|
439
|
-
"'#{token.text}' ")
|
440
|
-
|
441
500
|
case token.token
|
442
501
|
when :zero_or_one
|
443
502
|
target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
|
@@ -468,6 +527,11 @@ class Regexp::Parser
|
|
468
527
|
end
|
469
528
|
end
|
470
529
|
|
530
|
+
def increase_level(exp)
|
531
|
+
exp.level += 1
|
532
|
+
exp.respond_to?(:each) && exp.each { |subexp| increase_level(subexp) }
|
533
|
+
end
|
534
|
+
|
471
535
|
def interval(target_node, token)
|
472
536
|
text = token.text
|
473
537
|
mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
|
@@ -490,100 +554,16 @@ class Regexp::Parser
|
|
490
554
|
target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
|
491
555
|
end
|
492
556
|
|
493
|
-
def
|
494
|
-
case token.token
|
495
|
-
when :options, :options_switch
|
496
|
-
options_group(token)
|
497
|
-
when :close
|
498
|
-
close_group
|
499
|
-
when :comment
|
500
|
-
node << Group::Comment.new(token, active_opts)
|
501
|
-
else
|
502
|
-
open_group(token)
|
503
|
-
end
|
504
|
-
end
|
505
|
-
|
506
|
-
MOD_FLAGS = %w[i m x].map(&:to_sym)
|
507
|
-
ENC_FLAGS = %w[a d u].map(&:to_sym)
|
508
|
-
|
509
|
-
def options_group(token)
|
510
|
-
positive, negative = token.text.split('-', 2)
|
511
|
-
negative ||= ''
|
512
|
-
self.switching_options = token.token.equal?(:options_switch)
|
513
|
-
|
514
|
-
opt_changes = {}
|
515
|
-
new_active_opts = active_opts.dup
|
516
|
-
|
517
|
-
MOD_FLAGS.each do |flag|
|
518
|
-
if positive.include?(flag.to_s)
|
519
|
-
opt_changes[flag] = new_active_opts[flag] = true
|
520
|
-
end
|
521
|
-
if negative.include?(flag.to_s)
|
522
|
-
opt_changes[flag] = false
|
523
|
-
new_active_opts.delete(flag)
|
524
|
-
end
|
525
|
-
end
|
526
|
-
|
527
|
-
if (enc_flag = positive.reverse[/[adu]/])
|
528
|
-
enc_flag = enc_flag.to_sym
|
529
|
-
(ENC_FLAGS - [enc_flag]).each do |other|
|
530
|
-
opt_changes[other] = false if new_active_opts[other]
|
531
|
-
new_active_opts.delete(other)
|
532
|
-
end
|
533
|
-
opt_changes[enc_flag] = new_active_opts[enc_flag] = true
|
534
|
-
end
|
535
|
-
|
536
|
-
options_stack << new_active_opts
|
537
|
-
|
538
|
-
options_group = Group::Options.new(token, active_opts)
|
539
|
-
options_group.option_changes = opt_changes
|
540
|
-
|
541
|
-
nest(options_group)
|
542
|
-
end
|
543
|
-
|
544
|
-
def open_group(token)
|
557
|
+
def set(token)
|
545
558
|
case token.token
|
546
|
-
when :
|
547
|
-
|
548
|
-
when :
|
549
|
-
|
550
|
-
when :
|
551
|
-
exp = Group::Named.new(token, active_opts)
|
552
|
-
when :capture
|
553
|
-
exp = Group::Capture.new(token, active_opts)
|
554
|
-
when :absence
|
555
|
-
exp = Group::Absence.new(token, active_opts)
|
556
|
-
|
557
|
-
when :lookahead
|
558
|
-
exp = Assertion::Lookahead.new(token, active_opts)
|
559
|
-
when :nlookahead
|
560
|
-
exp = Assertion::NegativeLookahead.new(token, active_opts)
|
561
|
-
when :lookbehind
|
562
|
-
exp = Assertion::Lookbehind.new(token, active_opts)
|
563
|
-
when :nlookbehind
|
564
|
-
exp = Assertion::NegativeLookbehind.new(token, active_opts)
|
565
|
-
|
559
|
+
when :open; open_set(token)
|
560
|
+
when :close; close_set
|
561
|
+
when :negate; negate_set
|
562
|
+
when :range; range(token)
|
563
|
+
when :intersection; intersection(token)
|
566
564
|
else
|
567
|
-
raise UnknownTokenError.new('
|
568
|
-
end
|
569
|
-
|
570
|
-
if exp.capturing?
|
571
|
-
exp.number = total_captured_group_count + 1
|
572
|
-
exp.number_at_level = captured_group_count_at_level + 1
|
573
|
-
count_captured_group
|
565
|
+
raise UnknownTokenError.new('CharacterSet', token)
|
574
566
|
end
|
575
|
-
|
576
|
-
# Push the active options to the stack again. This way we can simply pop the
|
577
|
-
# stack for any group we close, no matter if it had its own options or not.
|
578
|
-
options_stack << active_opts
|
579
|
-
|
580
|
-
nest(exp)
|
581
|
-
end
|
582
|
-
|
583
|
-
def close_group
|
584
|
-
options_stack.pop unless switching_options
|
585
|
-
self.switching_options = false
|
586
|
-
decrease_nesting
|
587
567
|
end
|
588
568
|
|
589
569
|
def open_set(token)
|
@@ -606,51 +586,45 @@ class Regexp::Parser
|
|
606
586
|
nest(exp)
|
607
587
|
end
|
608
588
|
|
609
|
-
def close_completed_character_set_range
|
610
|
-
decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
|
611
|
-
end
|
612
|
-
|
613
589
|
def intersection(token)
|
614
590
|
sequence_operation(CharacterSet::Intersection, token)
|
615
591
|
end
|
616
592
|
|
617
|
-
def
|
618
|
-
|
619
|
-
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
593
|
+
def type(token)
|
594
|
+
case token.token
|
595
|
+
when :digit; node << CharacterType::Digit.new(token, active_opts)
|
596
|
+
when :hex; node << CharacterType::Hex.new(token, active_opts)
|
597
|
+
when :linebreak; node << CharacterType::Linebreak.new(token, active_opts)
|
598
|
+
when :nondigit; node << CharacterType::NonDigit.new(token, active_opts)
|
599
|
+
when :nonhex; node << CharacterType::NonHex.new(token, active_opts)
|
600
|
+
when :nonspace; node << CharacterType::NonSpace.new(token, active_opts)
|
601
|
+
when :nonword; node << CharacterType::NonWord.new(token, active_opts)
|
602
|
+
when :space; node << CharacterType::Space.new(token, active_opts)
|
603
|
+
when :word; node << CharacterType::Word.new(token, active_opts)
|
604
|
+
when :xgrapheme; node << CharacterType::ExtendedGrapheme.new(token, active_opts)
|
605
|
+
else
|
606
|
+
raise UnknownTokenError.new('CharacterType', token)
|
624
607
|
end
|
625
|
-
node.add_sequence(active_opts)
|
626
|
-
end
|
627
|
-
|
628
|
-
def active_opts
|
629
|
-
options_stack.last
|
630
|
-
end
|
631
|
-
|
632
|
-
def total_captured_group_count
|
633
|
-
captured_group_counts.values.reduce(0, :+)
|
634
|
-
end
|
635
|
-
|
636
|
-
def captured_group_count_at_level
|
637
|
-
captured_group_counts[node.level]
|
638
608
|
end
|
639
609
|
|
640
|
-
def
|
641
|
-
|
610
|
+
def close_completed_character_set_range
|
611
|
+
decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
|
642
612
|
end
|
643
613
|
|
644
|
-
def
|
645
|
-
|
646
|
-
exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
|
614
|
+
def active_opts
|
615
|
+
options_stack.last
|
647
616
|
end
|
648
617
|
|
618
|
+
# Assigns referenced expressions to refering expressions, e.g. if there is
|
619
|
+
# an instance of Backreference::Number, its #referenced_expression is set to
|
620
|
+
# the instance of Group::Capture that it refers to via its number.
|
649
621
|
def assign_referenced_expressions
|
650
622
|
targets = {}
|
623
|
+
# find all referencable expressions
|
651
624
|
root.each_expression do |exp|
|
652
625
|
exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
|
653
626
|
end
|
627
|
+
# assign them to any refering expressions
|
654
628
|
root.each_expression do |exp|
|
655
629
|
exp.respond_to?(:reference) &&
|
656
630
|
exp.referenced_expression = targets[exp.reference]
|