regexp_parser 1.8.2 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +100 -0
- data/Gemfile +6 -1
- data/README.md +1 -4
- data/Rakefile +8 -8
- data/lib/regexp_parser/error.rb +4 -0
- data/lib/regexp_parser/expression/classes/backref.rb +5 -0
- data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
- data/lib/regexp_parser/expression/classes/free_space.rb +2 -2
- data/lib/regexp_parser/expression/classes/group.rb +28 -3
- data/lib/regexp_parser/expression/classes/property.rb +1 -1
- data/lib/regexp_parser/expression/classes/root.rb +4 -16
- data/lib/regexp_parser/expression/classes/set/range.rb +2 -1
- data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
- data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
- data/lib/regexp_parser/expression/quantifier.rb +10 -1
- data/lib/regexp_parser/expression/sequence.rb +3 -19
- data/lib/regexp_parser/expression/subexpression.rb +1 -1
- data/lib/regexp_parser/expression.rb +7 -19
- data/lib/regexp_parser/lexer.rb +2 -2
- data/lib/regexp_parser/parser.rb +307 -332
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +209 -240
- data/lib/regexp_parser/scanner.rb +1275 -1340
- data/lib/regexp_parser/syntax/any.rb +3 -3
- data/lib/regexp_parser/syntax/base.rb +1 -1
- data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
- data/lib/regexp_parser/syntax.rb +8 -6
- data/lib/regexp_parser/version.rb +1 -1
- data/spec/expression/base_spec.rb +10 -0
- data/spec/expression/clone_spec.rb +36 -4
- data/spec/expression/free_space_spec.rb +2 -2
- data/spec/expression/methods/match_length_spec.rb +2 -2
- data/spec/expression/subexpression_spec.rb +1 -1
- data/spec/expression/to_s_spec.rb +39 -31
- data/spec/lexer/literals_spec.rb +24 -49
- data/spec/lexer/refcalls_spec.rb +5 -0
- data/spec/parser/all_spec.rb +2 -2
- data/spec/parser/errors_spec.rb +1 -1
- data/spec/parser/escapes_spec.rb +1 -1
- data/spec/parser/quantifiers_spec.rb +16 -0
- data/spec/parser/refcalls_spec.rb +5 -0
- data/spec/parser/set/ranges_spec.rb +3 -3
- data/spec/scanner/escapes_spec.rb +8 -1
- data/spec/scanner/groups_spec.rb +10 -1
- data/spec/scanner/literals_spec.rb +28 -38
- data/spec/scanner/quantifiers_spec.rb +18 -13
- data/spec/scanner/refcalls_spec.rb +19 -0
- data/spec/scanner/sets_spec.rb +65 -16
- data/spec/spec_helper.rb +1 -0
- metadata +4 -7
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
@@ -1,5 +1,6 @@
|
|
1
|
-
|
1
|
+
require 'regexp_parser/error'
|
2
2
|
|
3
|
+
module Regexp::Expression
|
3
4
|
class Base
|
4
5
|
attr_accessor :type, :token
|
5
6
|
attr_accessor :text, :ts
|
@@ -21,7 +22,7 @@ module Regexp::Expression
|
|
21
22
|
self.options = options
|
22
23
|
end
|
23
24
|
|
24
|
-
def
|
25
|
+
def initialize_copy(orig)
|
25
26
|
self.text = (orig.text ? orig.text.dup : nil)
|
26
27
|
self.options = (orig.options ? orig.options.dup : nil)
|
27
28
|
self.quantifier = (orig.quantifier ? orig.quantifier.clone : nil)
|
@@ -34,6 +35,10 @@ module Regexp::Expression
|
|
34
35
|
|
35
36
|
alias :starts_at :ts
|
36
37
|
|
38
|
+
def base_length
|
39
|
+
to_s(:base).length
|
40
|
+
end
|
41
|
+
|
37
42
|
def full_length
|
38
43
|
to_s.length
|
39
44
|
end
|
@@ -118,23 +123,6 @@ module Regexp::Expression
|
|
118
123
|
alias :to_h :attributes
|
119
124
|
end
|
120
125
|
|
121
|
-
def self.parsed(exp)
|
122
|
-
warn('WARNING: Regexp::Expression::Base.parsed is buggy and '\
|
123
|
-
'will be removed in 2.0.0. Use Regexp::Parser.parse instead.')
|
124
|
-
case exp
|
125
|
-
when String
|
126
|
-
Regexp::Parser.parse(exp)
|
127
|
-
when Regexp
|
128
|
-
Regexp::Parser.parse(exp.source) # <- causes loss of root options
|
129
|
-
when Regexp::Expression # <- never triggers
|
130
|
-
exp
|
131
|
-
else
|
132
|
-
raise ArgumentError, 'Expression.parsed accepts a String, Regexp, or '\
|
133
|
-
'a Regexp::Expression as a value for exp, but it '\
|
134
|
-
"was given #{exp.class.name}."
|
135
|
-
end
|
136
|
-
end
|
137
|
-
|
138
126
|
end # module Regexp::Expression
|
139
127
|
|
140
128
|
require 'regexp_parser/expression/quantifier'
|
data/lib/regexp_parser/lexer.rb
CHANGED
@@ -96,10 +96,10 @@ class Regexp::Lexer
|
|
96
96
|
|
97
97
|
tokens.pop
|
98
98
|
tokens << Regexp::Token.new(:literal, :literal, lead,
|
99
|
-
token.ts, (token.te - last.
|
99
|
+
token.ts, (token.te - last.length),
|
100
100
|
nesting, set_nesting, conditional_nesting)
|
101
101
|
tokens << Regexp::Token.new(:literal, :literal, last,
|
102
|
-
(token.ts + lead.
|
102
|
+
(token.ts + lead.length), token.te,
|
103
103
|
nesting, set_nesting, conditional_nesting)
|
104
104
|
end
|
105
105
|
|
data/lib/regexp_parser/parser.rb
CHANGED
@@ -1,10 +1,10 @@
|
|
1
|
+
require 'regexp_parser/error'
|
1
2
|
require 'regexp_parser/expression'
|
2
3
|
|
3
4
|
class Regexp::Parser
|
4
5
|
include Regexp::Expression
|
5
|
-
include Regexp::Syntax
|
6
6
|
|
7
|
-
class ParserError <
|
7
|
+
class ParserError < Regexp::Parser::Error; end
|
8
8
|
|
9
9
|
class UnknownTokenTypeError < ParserError
|
10
10
|
def initialize(type, token)
|
@@ -70,95 +70,155 @@ class Regexp::Parser
|
|
70
70
|
enabled_options
|
71
71
|
end
|
72
72
|
|
73
|
-
def
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
73
|
+
def parse_token(token)
|
74
|
+
case token.type
|
75
|
+
when :anchor; anchor(token)
|
76
|
+
when :assertion, :group; group(token)
|
77
|
+
when :backref; backref(token)
|
78
|
+
when :conditional; conditional(token)
|
79
|
+
when :escape; escape(token)
|
80
|
+
when :free_space; free_space(token)
|
81
|
+
when :keep; keep(token)
|
82
|
+
when :literal; literal(token)
|
83
|
+
when :meta; meta(token)
|
84
|
+
when :posixclass, :nonposixclass; posixclass(token)
|
85
|
+
when :property, :nonproperty; property(token)
|
86
|
+
when :quantifier; quantifier(token)
|
87
|
+
when :set; set(token)
|
88
|
+
when :type; type(token)
|
89
|
+
else
|
90
|
+
raise UnknownTokenTypeError.new(token.type, token)
|
91
|
+
end
|
79
92
|
|
80
|
-
|
81
|
-
def update_transplanted_subtree(exp, new_parent)
|
82
|
-
exp.nesting_level = new_parent.nesting_level + 1
|
83
|
-
exp.respond_to?(:each) &&
|
84
|
-
exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
|
93
|
+
close_completed_character_set_range
|
85
94
|
end
|
86
95
|
|
87
|
-
def
|
88
|
-
|
89
|
-
|
90
|
-
|
96
|
+
def anchor(token)
|
97
|
+
case token.token
|
98
|
+
when :bol; node << Anchor::BeginningOfLine.new(token, active_opts)
|
99
|
+
when :bos; node << Anchor::BOS.new(token, active_opts)
|
100
|
+
when :eol; node << Anchor::EndOfLine.new(token, active_opts)
|
101
|
+
when :eos; node << Anchor::EOS.new(token, active_opts)
|
102
|
+
when :eos_ob_eol; node << Anchor::EOSobEOL.new(token, active_opts)
|
103
|
+
when :match_start; node << Anchor::MatchStart.new(token, active_opts)
|
104
|
+
when :nonword_boundary; node << Anchor::NonWordBoundary.new(token, active_opts)
|
105
|
+
when :word_boundary; node << Anchor::WordBoundary.new(token, active_opts)
|
106
|
+
else
|
107
|
+
raise UnknownTokenError.new('Anchor', token)
|
91
108
|
end
|
92
|
-
nesting.pop
|
93
|
-
yield(node) if block_given?
|
94
|
-
self.node = nesting.last
|
95
|
-
self.node = node.last if node.last.is_a?(SequenceOperation)
|
96
109
|
end
|
97
110
|
|
98
|
-
def
|
99
|
-
|
100
|
-
|
111
|
+
def group(token)
|
112
|
+
case token.token
|
113
|
+
when :options, :options_switch
|
114
|
+
options_group(token)
|
115
|
+
when :close
|
116
|
+
close_group
|
117
|
+
when :comment
|
118
|
+
node << Group::Comment.new(token, active_opts)
|
119
|
+
else
|
120
|
+
open_group(token)
|
121
|
+
end
|
101
122
|
end
|
102
123
|
|
103
|
-
|
104
|
-
|
124
|
+
MOD_FLAGS = %w[i m x].map(&:to_sym)
|
125
|
+
ENC_FLAGS = %w[a d u].map(&:to_sym)
|
105
126
|
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
when :escape; escape(token)
|
111
|
-
when :group; group(token)
|
112
|
-
when :assertion; group(token)
|
113
|
-
when :set; set(token)
|
114
|
-
when :type; type(token)
|
115
|
-
when :backref; backref(token)
|
116
|
-
when :conditional; conditional(token)
|
117
|
-
when :keep; keep(token)
|
118
|
-
|
119
|
-
when :posixclass, :nonposixclass
|
120
|
-
posixclass(token)
|
121
|
-
when :property, :nonproperty
|
122
|
-
property(token)
|
123
|
-
|
124
|
-
when :literal
|
125
|
-
node << Literal.new(token, active_opts)
|
126
|
-
when :free_space
|
127
|
-
free_space(token)
|
127
|
+
def options_group(token)
|
128
|
+
positive, negative = token.text.split('-', 2)
|
129
|
+
negative ||= ''
|
130
|
+
self.switching_options = token.token.equal?(:options_switch)
|
128
131
|
|
129
|
-
|
130
|
-
|
132
|
+
opt_changes = {}
|
133
|
+
new_active_opts = active_opts.dup
|
134
|
+
|
135
|
+
MOD_FLAGS.each do |flag|
|
136
|
+
if positive.include?(flag.to_s)
|
137
|
+
opt_changes[flag] = new_active_opts[flag] = true
|
138
|
+
end
|
139
|
+
if negative.include?(flag.to_s)
|
140
|
+
opt_changes[flag] = false
|
141
|
+
new_active_opts.delete(flag)
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
145
|
+
if (enc_flag = positive.reverse[/[adu]/])
|
146
|
+
enc_flag = enc_flag.to_sym
|
147
|
+
(ENC_FLAGS - [enc_flag]).each do |other|
|
148
|
+
opt_changes[other] = false if new_active_opts[other]
|
149
|
+
new_active_opts.delete(other)
|
150
|
+
end
|
151
|
+
opt_changes[enc_flag] = new_active_opts[enc_flag] = true
|
131
152
|
end
|
153
|
+
|
154
|
+
options_stack << new_active_opts
|
155
|
+
|
156
|
+
options_group = Group::Options.new(token, active_opts)
|
157
|
+
options_group.option_changes = opt_changes
|
158
|
+
|
159
|
+
nest(options_group)
|
132
160
|
end
|
133
161
|
|
134
|
-
def
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
162
|
+
def open_group(token)
|
163
|
+
group_class =
|
164
|
+
case token.token
|
165
|
+
when :absence; Group::Absence
|
166
|
+
when :atomic; Group::Atomic
|
167
|
+
when :capture; Group::Capture
|
168
|
+
when :named; Group::Named
|
169
|
+
when :passive; Group::Passive
|
170
|
+
|
171
|
+
when :lookahead; Assertion::Lookahead
|
172
|
+
when :lookbehind; Assertion::Lookbehind
|
173
|
+
when :nlookahead; Assertion::NegativeLookahead
|
174
|
+
when :nlookbehind; Assertion::NegativeLookbehind
|
175
|
+
|
176
|
+
else
|
177
|
+
raise UnknownTokenError.new('Group type open', token)
|
178
|
+
end
|
179
|
+
|
180
|
+
group = group_class.new(token, active_opts)
|
181
|
+
|
182
|
+
if group.capturing?
|
183
|
+
group.number = total_captured_group_count + 1
|
184
|
+
group.number_at_level = captured_group_count_at_level + 1
|
185
|
+
count_captured_group
|
150
186
|
end
|
187
|
+
|
188
|
+
# Push the active options to the stack again. This way we can simply pop the
|
189
|
+
# stack for any group we close, no matter if it had its own options or not.
|
190
|
+
options_stack << active_opts
|
191
|
+
|
192
|
+
nest(group)
|
151
193
|
end
|
152
194
|
|
153
|
-
def
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
195
|
+
def total_captured_group_count
|
196
|
+
captured_group_counts.values.reduce(0, :+)
|
197
|
+
end
|
198
|
+
|
199
|
+
def captured_group_count_at_level
|
200
|
+
captured_group_counts[node.level]
|
201
|
+
end
|
202
|
+
|
203
|
+
def count_captured_group
|
204
|
+
captured_group_counts[node.level] += 1
|
205
|
+
end
|
206
|
+
|
207
|
+
def close_group
|
208
|
+
options_stack.pop unless switching_options
|
209
|
+
self.switching_options = false
|
210
|
+
decrease_nesting
|
211
|
+
end
|
212
|
+
|
213
|
+
def decrease_nesting
|
214
|
+
while nesting.last.is_a?(SequenceOperation)
|
215
|
+
nesting.pop
|
216
|
+
self.node = nesting.last
|
161
217
|
end
|
218
|
+
nesting.pop
|
219
|
+
yield(node) if block_given?
|
220
|
+
self.node = nesting.last
|
221
|
+
self.node = node.last if node.last.is_a?(SequenceOperation)
|
162
222
|
end
|
163
223
|
|
164
224
|
def backref(token)
|
@@ -188,31 +248,9 @@ class Regexp::Parser
|
|
188
248
|
end
|
189
249
|
end
|
190
250
|
|
191
|
-
def
|
192
|
-
|
193
|
-
|
194
|
-
node << CharacterType::Digit.new(token, active_opts)
|
195
|
-
when :nondigit
|
196
|
-
node << CharacterType::NonDigit.new(token, active_opts)
|
197
|
-
when :hex
|
198
|
-
node << CharacterType::Hex.new(token, active_opts)
|
199
|
-
when :nonhex
|
200
|
-
node << CharacterType::NonHex.new(token, active_opts)
|
201
|
-
when :space
|
202
|
-
node << CharacterType::Space.new(token, active_opts)
|
203
|
-
when :nonspace
|
204
|
-
node << CharacterType::NonSpace.new(token, active_opts)
|
205
|
-
when :word
|
206
|
-
node << CharacterType::Word.new(token, active_opts)
|
207
|
-
when :nonword
|
208
|
-
node << CharacterType::NonWord.new(token, active_opts)
|
209
|
-
when :linebreak
|
210
|
-
node << CharacterType::Linebreak.new(token, active_opts)
|
211
|
-
when :xgrapheme
|
212
|
-
node << CharacterType::ExtendedGrapheme.new(token, active_opts)
|
213
|
-
else
|
214
|
-
raise UnknownTokenError.new('CharacterType', token)
|
215
|
-
end
|
251
|
+
def assign_effective_number(exp)
|
252
|
+
exp.effective_number =
|
253
|
+
exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
|
216
254
|
end
|
217
255
|
|
218
256
|
def conditional(token)
|
@@ -240,11 +278,118 @@ class Regexp::Parser
|
|
240
278
|
end
|
241
279
|
end
|
242
280
|
|
281
|
+
def nest_conditional(exp)
|
282
|
+
conditional_nesting.push(exp)
|
283
|
+
nest(exp)
|
284
|
+
end
|
285
|
+
|
286
|
+
def nest(exp)
|
287
|
+
nesting.push(exp)
|
288
|
+
node << exp
|
289
|
+
update_transplanted_subtree(exp, node)
|
290
|
+
self.node = exp
|
291
|
+
end
|
292
|
+
|
293
|
+
# subtrees are transplanted to build Alternations, Intersections, Ranges
|
294
|
+
def update_transplanted_subtree(exp, new_parent)
|
295
|
+
exp.nesting_level = new_parent.nesting_level + 1
|
296
|
+
exp.respond_to?(:each) &&
|
297
|
+
exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
|
298
|
+
end
|
299
|
+
|
300
|
+
def escape(token)
|
301
|
+
case token.token
|
302
|
+
|
303
|
+
when :backspace; node << EscapeSequence::Backspace.new(token, active_opts)
|
304
|
+
|
305
|
+
when :escape; node << EscapeSequence::AsciiEscape.new(token, active_opts)
|
306
|
+
when :bell; node << EscapeSequence::Bell.new(token, active_opts)
|
307
|
+
when :form_feed; node << EscapeSequence::FormFeed.new(token, active_opts)
|
308
|
+
when :newline; node << EscapeSequence::Newline.new(token, active_opts)
|
309
|
+
when :carriage; node << EscapeSequence::Return.new(token, active_opts)
|
310
|
+
when :tab; node << EscapeSequence::Tab.new(token, active_opts)
|
311
|
+
when :vertical_tab; node << EscapeSequence::VerticalTab.new(token, active_opts)
|
312
|
+
|
313
|
+
when :codepoint; node << EscapeSequence::Codepoint.new(token, active_opts)
|
314
|
+
when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
|
315
|
+
when :hex; node << EscapeSequence::Hex.new(token, active_opts)
|
316
|
+
when :octal; node << EscapeSequence::Octal.new(token, active_opts)
|
317
|
+
|
318
|
+
when :control
|
319
|
+
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
320
|
+
node << EscapeSequence::MetaControl.new(token, active_opts)
|
321
|
+
else
|
322
|
+
node << EscapeSequence::Control.new(token, active_opts)
|
323
|
+
end
|
324
|
+
|
325
|
+
when :meta_sequence
|
326
|
+
if token.text =~ /\A\\M-\\[Cc]/
|
327
|
+
node << EscapeSequence::MetaControl.new(token, active_opts)
|
328
|
+
else
|
329
|
+
node << EscapeSequence::Meta.new(token, active_opts)
|
330
|
+
end
|
331
|
+
|
332
|
+
else
|
333
|
+
# treating everything else as a literal
|
334
|
+
# TODO: maybe split this up a bit more in v3.0.0?
|
335
|
+
# E.g. escaped quantifiers or set meta chars are not the same
|
336
|
+
# as stuff that would be a literal even without the backslash.
|
337
|
+
# Right now, they all end up here.
|
338
|
+
node << EscapeSequence::Literal.new(token, active_opts)
|
339
|
+
end
|
340
|
+
end
|
341
|
+
|
342
|
+
def free_space(token)
|
343
|
+
case token.token
|
344
|
+
when :comment
|
345
|
+
node << Comment.new(token, active_opts)
|
346
|
+
when :whitespace
|
347
|
+
if node.last.is_a?(WhiteSpace)
|
348
|
+
node.last.merge(WhiteSpace.new(token, active_opts))
|
349
|
+
else
|
350
|
+
node << WhiteSpace.new(token, active_opts)
|
351
|
+
end
|
352
|
+
else
|
353
|
+
raise UnknownTokenError.new('FreeSpace', token)
|
354
|
+
end
|
355
|
+
end
|
356
|
+
|
357
|
+
def keep(token)
|
358
|
+
node << Keep::Mark.new(token, active_opts)
|
359
|
+
end
|
360
|
+
|
361
|
+
def literal(token)
|
362
|
+
node << Literal.new(token, active_opts)
|
363
|
+
end
|
364
|
+
|
365
|
+
def meta(token)
|
366
|
+
case token.token
|
367
|
+
when :dot
|
368
|
+
node << CharacterType::Any.new(token, active_opts)
|
369
|
+
when :alternation
|
370
|
+
sequence_operation(Alternation, token)
|
371
|
+
else
|
372
|
+
raise UnknownTokenError.new('Meta', token)
|
373
|
+
end
|
374
|
+
end
|
375
|
+
|
376
|
+
def sequence_operation(klass, token)
|
377
|
+
unless node.is_a?(klass)
|
378
|
+
operator = klass.new(token, active_opts)
|
379
|
+
sequence = operator.add_sequence(active_opts)
|
380
|
+
sequence.expressions = node.expressions
|
381
|
+
node.expressions = []
|
382
|
+
nest(operator)
|
383
|
+
end
|
384
|
+
node.add_sequence(active_opts)
|
385
|
+
end
|
386
|
+
|
243
387
|
def posixclass(token)
|
244
388
|
node << PosixClass.new(token, active_opts)
|
245
389
|
end
|
246
390
|
|
247
391
|
include Regexp::Expression::UnicodeProperty
|
392
|
+
UPTokens = Regexp::Syntax::Token::UnicodeProperty
|
248
393
|
|
249
394
|
def property(token)
|
250
395
|
case token.token
|
@@ -316,128 +461,43 @@ class Regexp::Parser
|
|
316
461
|
when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
|
317
462
|
when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
|
318
463
|
|
319
|
-
when *
|
320
|
-
node <<
|
321
|
-
|
322
|
-
when *
|
323
|
-
|
324
|
-
|
325
|
-
when *Token::UnicodeProperty::Emoji
|
326
|
-
node << Emoji.new(token, active_opts)
|
327
|
-
|
328
|
-
when *Token::UnicodeProperty::Script
|
329
|
-
node << Script.new(token, active_opts)
|
330
|
-
|
331
|
-
when *Token::UnicodeProperty::UnicodeBlock
|
332
|
-
node << Block.new(token, active_opts)
|
464
|
+
when *UPTokens::Age; node << Age.new(token, active_opts)
|
465
|
+
when *UPTokens::Derived; node << Derived.new(token, active_opts)
|
466
|
+
when *UPTokens::Emoji; node << Emoji.new(token, active_opts)
|
467
|
+
when *UPTokens::Script; node << Script.new(token, active_opts)
|
468
|
+
when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts)
|
333
469
|
|
334
470
|
else
|
335
471
|
raise UnknownTokenError.new('UnicodeProperty', token)
|
336
472
|
end
|
337
473
|
end
|
338
474
|
|
339
|
-
def anchor(token)
|
340
|
-
case token.token
|
341
|
-
when :bol
|
342
|
-
node << Anchor::BeginningOfLine.new(token, active_opts)
|
343
|
-
when :eol
|
344
|
-
node << Anchor::EndOfLine.new(token, active_opts)
|
345
|
-
when :bos
|
346
|
-
node << Anchor::BOS.new(token, active_opts)
|
347
|
-
when :eos
|
348
|
-
node << Anchor::EOS.new(token, active_opts)
|
349
|
-
when :eos_ob_eol
|
350
|
-
node << Anchor::EOSobEOL.new(token, active_opts)
|
351
|
-
when :word_boundary
|
352
|
-
node << Anchor::WordBoundary.new(token, active_opts)
|
353
|
-
when :nonword_boundary
|
354
|
-
node << Anchor::NonWordBoundary.new(token, active_opts)
|
355
|
-
when :match_start
|
356
|
-
node << Anchor::MatchStart.new(token, active_opts)
|
357
|
-
else
|
358
|
-
raise UnknownTokenError.new('Anchor', token)
|
359
|
-
end
|
360
|
-
end
|
361
|
-
|
362
|
-
def escape(token)
|
363
|
-
case token.token
|
364
|
-
|
365
|
-
when :backspace
|
366
|
-
node << EscapeSequence::Backspace.new(token, active_opts)
|
367
|
-
|
368
|
-
when :escape
|
369
|
-
node << EscapeSequence::AsciiEscape.new(token, active_opts)
|
370
|
-
when :bell
|
371
|
-
node << EscapeSequence::Bell.new(token, active_opts)
|
372
|
-
when :form_feed
|
373
|
-
node << EscapeSequence::FormFeed.new(token, active_opts)
|
374
|
-
when :newline
|
375
|
-
node << EscapeSequence::Newline.new(token, active_opts)
|
376
|
-
when :carriage
|
377
|
-
node << EscapeSequence::Return.new(token, active_opts)
|
378
|
-
when :tab
|
379
|
-
node << EscapeSequence::Tab.new(token, active_opts)
|
380
|
-
when :vertical_tab
|
381
|
-
node << EscapeSequence::VerticalTab.new(token, active_opts)
|
382
|
-
|
383
|
-
when :hex
|
384
|
-
node << EscapeSequence::Hex.new(token, active_opts)
|
385
|
-
when :octal
|
386
|
-
node << EscapeSequence::Octal.new(token, active_opts)
|
387
|
-
when :codepoint
|
388
|
-
node << EscapeSequence::Codepoint.new(token, active_opts)
|
389
|
-
when :codepoint_list
|
390
|
-
node << EscapeSequence::CodepointList.new(token, active_opts)
|
391
|
-
|
392
|
-
when :control
|
393
|
-
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
394
|
-
node << EscapeSequence::MetaControl.new(token, active_opts)
|
395
|
-
else
|
396
|
-
node << EscapeSequence::Control.new(token, active_opts)
|
397
|
-
end
|
398
|
-
|
399
|
-
when :meta_sequence
|
400
|
-
if token.text =~ /\A\\M-\\[Cc]/
|
401
|
-
node << EscapeSequence::MetaControl.new(token, active_opts)
|
402
|
-
else
|
403
|
-
node << EscapeSequence::Meta.new(token, active_opts)
|
404
|
-
end
|
405
|
-
|
406
|
-
else
|
407
|
-
# treating everything else as a literal
|
408
|
-
node << EscapeSequence::Literal.new(token, active_opts)
|
409
|
-
end
|
410
|
-
end
|
411
|
-
|
412
|
-
def keep(token)
|
413
|
-
node << Keep::Mark.new(token, active_opts)
|
414
|
-
end
|
415
|
-
|
416
|
-
def free_space(token)
|
417
|
-
case token.token
|
418
|
-
when :comment
|
419
|
-
node << Comment.new(token, active_opts)
|
420
|
-
when :whitespace
|
421
|
-
if node.last.is_a?(WhiteSpace)
|
422
|
-
node.last.merge(WhiteSpace.new(token, active_opts))
|
423
|
-
else
|
424
|
-
node << WhiteSpace.new(token, active_opts)
|
425
|
-
end
|
426
|
-
else
|
427
|
-
raise UnknownTokenError.new('FreeSpace', token)
|
428
|
-
end
|
429
|
-
end
|
430
|
-
|
431
475
|
def quantifier(token)
|
432
|
-
|
433
|
-
target_node
|
434
|
-
|
435
|
-
|
476
|
+
target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
|
477
|
+
target_node or raise ParserError, "No valid target found for '#{token.text}'"
|
478
|
+
|
479
|
+
# in case of chained quantifiers, wrap target in an implicit passive group
|
480
|
+
# description of the problem: https://github.com/ammar/regexp_parser/issues/3
|
481
|
+
# rationale for this solution: https://github.com/ammar/regexp_parser/pull/69
|
482
|
+
if target_node.quantified?
|
483
|
+
new_token = Regexp::Token.new(
|
484
|
+
:group,
|
485
|
+
:passive,
|
486
|
+
'', # text
|
487
|
+
target_node.ts,
|
488
|
+
nil, # te (unused)
|
489
|
+
target_node.level,
|
490
|
+
target_node.set_level,
|
491
|
+
target_node.conditional_level
|
492
|
+
)
|
493
|
+
new_group = Group::Passive.new(new_token, active_opts)
|
494
|
+
new_group.implicit = true
|
495
|
+
new_group << target_node
|
496
|
+
increase_level(target_node)
|
497
|
+
node.expressions[node.expressions.index(target_node)] = new_group
|
498
|
+
target_node = new_group
|
436
499
|
end
|
437
500
|
|
438
|
-
target_node || raise(ArgumentError, 'No valid target found for '\
|
439
|
-
"'#{token.text}' ")
|
440
|
-
|
441
501
|
case token.token
|
442
502
|
when :zero_or_one
|
443
503
|
target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
|
@@ -468,6 +528,11 @@ class Regexp::Parser
|
|
468
528
|
end
|
469
529
|
end
|
470
530
|
|
531
|
+
def increase_level(exp)
|
532
|
+
exp.level += 1
|
533
|
+
exp.respond_to?(:each) && exp.each { |subexp| increase_level(subexp) }
|
534
|
+
end
|
535
|
+
|
471
536
|
def interval(target_node, token)
|
472
537
|
text = token.text
|
473
538
|
mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
|
@@ -490,100 +555,16 @@ class Regexp::Parser
|
|
490
555
|
target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
|
491
556
|
end
|
492
557
|
|
493
|
-
def
|
494
|
-
case token.token
|
495
|
-
when :options, :options_switch
|
496
|
-
options_group(token)
|
497
|
-
when :close
|
498
|
-
close_group
|
499
|
-
when :comment
|
500
|
-
node << Group::Comment.new(token, active_opts)
|
501
|
-
else
|
502
|
-
open_group(token)
|
503
|
-
end
|
504
|
-
end
|
505
|
-
|
506
|
-
MOD_FLAGS = %w[i m x].map(&:to_sym)
|
507
|
-
ENC_FLAGS = %w[a d u].map(&:to_sym)
|
508
|
-
|
509
|
-
def options_group(token)
|
510
|
-
positive, negative = token.text.split('-', 2)
|
511
|
-
negative ||= ''
|
512
|
-
self.switching_options = token.token.equal?(:options_switch)
|
513
|
-
|
514
|
-
opt_changes = {}
|
515
|
-
new_active_opts = active_opts.dup
|
516
|
-
|
517
|
-
MOD_FLAGS.each do |flag|
|
518
|
-
if positive.include?(flag.to_s)
|
519
|
-
opt_changes[flag] = new_active_opts[flag] = true
|
520
|
-
end
|
521
|
-
if negative.include?(flag.to_s)
|
522
|
-
opt_changes[flag] = false
|
523
|
-
new_active_opts.delete(flag)
|
524
|
-
end
|
525
|
-
end
|
526
|
-
|
527
|
-
if (enc_flag = positive.reverse[/[adu]/])
|
528
|
-
enc_flag = enc_flag.to_sym
|
529
|
-
(ENC_FLAGS - [enc_flag]).each do |other|
|
530
|
-
opt_changes[other] = false if new_active_opts[other]
|
531
|
-
new_active_opts.delete(other)
|
532
|
-
end
|
533
|
-
opt_changes[enc_flag] = new_active_opts[enc_flag] = true
|
534
|
-
end
|
535
|
-
|
536
|
-
options_stack << new_active_opts
|
537
|
-
|
538
|
-
options_group = Group::Options.new(token, active_opts)
|
539
|
-
options_group.option_changes = opt_changes
|
540
|
-
|
541
|
-
nest(options_group)
|
542
|
-
end
|
543
|
-
|
544
|
-
def open_group(token)
|
558
|
+
def set(token)
|
545
559
|
case token.token
|
546
|
-
when :
|
547
|
-
|
548
|
-
when :
|
549
|
-
|
550
|
-
when :
|
551
|
-
exp = Group::Named.new(token, active_opts)
|
552
|
-
when :capture
|
553
|
-
exp = Group::Capture.new(token, active_opts)
|
554
|
-
when :absence
|
555
|
-
exp = Group::Absence.new(token, active_opts)
|
556
|
-
|
557
|
-
when :lookahead
|
558
|
-
exp = Assertion::Lookahead.new(token, active_opts)
|
559
|
-
when :nlookahead
|
560
|
-
exp = Assertion::NegativeLookahead.new(token, active_opts)
|
561
|
-
when :lookbehind
|
562
|
-
exp = Assertion::Lookbehind.new(token, active_opts)
|
563
|
-
when :nlookbehind
|
564
|
-
exp = Assertion::NegativeLookbehind.new(token, active_opts)
|
565
|
-
|
560
|
+
when :open; open_set(token)
|
561
|
+
when :close; close_set
|
562
|
+
when :negate; negate_set
|
563
|
+
when :range; range(token)
|
564
|
+
when :intersection; intersection(token)
|
566
565
|
else
|
567
|
-
raise UnknownTokenError.new('
|
568
|
-
end
|
569
|
-
|
570
|
-
if exp.capturing?
|
571
|
-
exp.number = total_captured_group_count + 1
|
572
|
-
exp.number_at_level = captured_group_count_at_level + 1
|
573
|
-
count_captured_group
|
566
|
+
raise UnknownTokenError.new('CharacterSet', token)
|
574
567
|
end
|
575
|
-
|
576
|
-
# Push the active options to the stack again. This way we can simply pop the
|
577
|
-
# stack for any group we close, no matter if it had its own options or not.
|
578
|
-
options_stack << active_opts
|
579
|
-
|
580
|
-
nest(exp)
|
581
|
-
end
|
582
|
-
|
583
|
-
def close_group
|
584
|
-
options_stack.pop unless switching_options
|
585
|
-
self.switching_options = false
|
586
|
-
decrease_nesting
|
587
568
|
end
|
588
569
|
|
589
570
|
def open_set(token)
|
@@ -606,51 +587,45 @@ class Regexp::Parser
|
|
606
587
|
nest(exp)
|
607
588
|
end
|
608
589
|
|
609
|
-
def close_completed_character_set_range
|
610
|
-
decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
|
611
|
-
end
|
612
|
-
|
613
590
|
def intersection(token)
|
614
591
|
sequence_operation(CharacterSet::Intersection, token)
|
615
592
|
end
|
616
593
|
|
617
|
-
def
|
618
|
-
|
619
|
-
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
594
|
+
def type(token)
|
595
|
+
case token.token
|
596
|
+
when :digit; node << CharacterType::Digit.new(token, active_opts)
|
597
|
+
when :hex; node << CharacterType::Hex.new(token, active_opts)
|
598
|
+
when :linebreak; node << CharacterType::Linebreak.new(token, active_opts)
|
599
|
+
when :nondigit; node << CharacterType::NonDigit.new(token, active_opts)
|
600
|
+
when :nonhex; node << CharacterType::NonHex.new(token, active_opts)
|
601
|
+
when :nonspace; node << CharacterType::NonSpace.new(token, active_opts)
|
602
|
+
when :nonword; node << CharacterType::NonWord.new(token, active_opts)
|
603
|
+
when :space; node << CharacterType::Space.new(token, active_opts)
|
604
|
+
when :word; node << CharacterType::Word.new(token, active_opts)
|
605
|
+
when :xgrapheme; node << CharacterType::ExtendedGrapheme.new(token, active_opts)
|
606
|
+
else
|
607
|
+
raise UnknownTokenError.new('CharacterType', token)
|
624
608
|
end
|
625
|
-
node.add_sequence(active_opts)
|
626
|
-
end
|
627
|
-
|
628
|
-
def active_opts
|
629
|
-
options_stack.last
|
630
|
-
end
|
631
|
-
|
632
|
-
def total_captured_group_count
|
633
|
-
captured_group_counts.values.reduce(0, :+)
|
634
|
-
end
|
635
|
-
|
636
|
-
def captured_group_count_at_level
|
637
|
-
captured_group_counts[node.level]
|
638
609
|
end
|
639
610
|
|
640
|
-
def
|
641
|
-
|
611
|
+
def close_completed_character_set_range
|
612
|
+
decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
|
642
613
|
end
|
643
614
|
|
644
|
-
def
|
645
|
-
|
646
|
-
exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
|
615
|
+
def active_opts
|
616
|
+
options_stack.last
|
647
617
|
end
|
648
618
|
|
619
|
+
# Assigns referenced expressions to refering expressions, e.g. if there is
|
620
|
+
# an instance of Backreference::Number, its #referenced_expression is set to
|
621
|
+
# the instance of Group::Capture that it refers to via its number.
|
649
622
|
def assign_referenced_expressions
|
650
623
|
targets = {}
|
624
|
+
# find all referencable expressions
|
651
625
|
root.each_expression do |exp|
|
652
626
|
exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
|
653
627
|
end
|
628
|
+
# assign them to any refering expressions
|
654
629
|
root.each_expression do |exp|
|
655
630
|
exp.respond_to?(:reference) &&
|
656
631
|
exp.referenced_expression = targets[exp.reference]
|