regexp_parser 1.7.1 → 2.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +138 -0
- data/Gemfile +6 -1
- data/README.md +23 -11
- data/Rakefile +8 -8
- data/lib/regexp_parser/error.rb +4 -0
- data/lib/regexp_parser/expression.rb +13 -21
- data/lib/regexp_parser/expression/classes/backref.rb +5 -0
- data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
- data/lib/regexp_parser/expression/classes/free_space.rb +2 -2
- data/lib/regexp_parser/expression/classes/group.rb +28 -3
- data/lib/regexp_parser/expression/classes/property.rb +1 -1
- data/lib/regexp_parser/expression/classes/root.rb +4 -16
- data/lib/regexp_parser/expression/classes/set/range.rb +2 -1
- data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
- data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
- data/lib/regexp_parser/expression/quantifier.rb +10 -1
- data/lib/regexp_parser/expression/sequence.rb +3 -19
- data/lib/regexp_parser/expression/subexpression.rb +1 -1
- data/lib/regexp_parser/lexer.rb +6 -6
- data/lib/regexp_parser/parser.rb +325 -344
- data/lib/regexp_parser/scanner.rb +1320 -1385
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +231 -253
- data/lib/regexp_parser/syntax.rb +8 -6
- data/lib/regexp_parser/syntax/any.rb +3 -3
- data/lib/regexp_parser/syntax/base.rb +1 -1
- data/lib/regexp_parser/syntax/version_lookup.rb +4 -4
- data/lib/regexp_parser/version.rb +1 -1
- data/regexp_parser.gemspec +1 -1
- data/spec/expression/base_spec.rb +10 -0
- data/spec/expression/clone_spec.rb +36 -4
- data/spec/expression/free_space_spec.rb +2 -2
- data/spec/expression/methods/match_length_spec.rb +2 -2
- data/spec/expression/subexpression_spec.rb +1 -1
- data/spec/expression/to_s_spec.rb +39 -31
- data/spec/lexer/literals_spec.rb +24 -49
- data/spec/lexer/refcalls_spec.rb +5 -0
- data/spec/parser/all_spec.rb +2 -2
- data/spec/parser/errors_spec.rb +1 -1
- data/spec/parser/escapes_spec.rb +1 -1
- data/spec/parser/options_spec.rb +28 -0
- data/spec/parser/quantifiers_spec.rb +16 -0
- data/spec/parser/refcalls_spec.rb +5 -0
- data/spec/parser/set/ranges_spec.rb +3 -3
- data/spec/scanner/escapes_spec.rb +12 -1
- data/spec/scanner/free_space_spec.rb +32 -0
- data/spec/scanner/groups_spec.rb +10 -1
- data/spec/scanner/literals_spec.rb +28 -38
- data/spec/scanner/options_spec.rb +36 -0
- data/spec/scanner/quantifiers_spec.rb +18 -13
- data/spec/scanner/refcalls_spec.rb +19 -0
- data/spec/scanner/sets_spec.rb +65 -16
- data/spec/spec_helper.rb +1 -0
- metadata +61 -60
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
@@ -1,24 +1,12 @@
|
|
1
1
|
module Regexp::Expression
|
2
2
|
|
3
3
|
class Root < Regexp::Expression::Subexpression
|
4
|
-
|
5
|
-
|
6
|
-
unless args.first.is_a?(Regexp::Token)
|
7
|
-
warn('WARNING: Root.new without a Token argument is deprecated and '\
|
8
|
-
'will be removed in 2.0.0. Use Root.build for the old behavior.')
|
9
|
-
return super(self.class.build_token, *args)
|
10
|
-
end
|
11
|
-
super
|
4
|
+
def self.build(options = {})
|
5
|
+
new(build_token, options)
|
12
6
|
end
|
13
7
|
|
14
|
-
|
15
|
-
|
16
|
-
new(build_token, options)
|
17
|
-
end
|
18
|
-
|
19
|
-
def build_token
|
20
|
-
Regexp::Token.new(:expression, :root, '', 0)
|
21
|
-
end
|
8
|
+
def self.build_token
|
9
|
+
Regexp::Token.new(:expression, :root, '', 0)
|
22
10
|
end
|
23
11
|
end
|
24
12
|
end
|
@@ -10,7 +10,7 @@ class Regexp::MatchLength
|
|
10
10
|
self.exp_class = exp.class
|
11
11
|
self.min_rep = exp.repetitions.min
|
12
12
|
self.max_rep = exp.repetitions.max
|
13
|
-
if base = opts[:base]
|
13
|
+
if (base = opts[:base])
|
14
14
|
self.base_min = base
|
15
15
|
self.base_max = base
|
16
16
|
self.reify = ->{ '.' * base }
|
@@ -32,7 +32,7 @@ class Regexp::MatchLength
|
|
32
32
|
end
|
33
33
|
end
|
34
34
|
|
35
|
-
def endless_each
|
35
|
+
def endless_each
|
36
36
|
return enum_for(__method__) unless block_given?
|
37
37
|
(min..max).each { |num| yield(num) if include?(num) }
|
38
38
|
end
|
@@ -36,7 +36,7 @@ module Regexp::Expression
|
|
36
36
|
|
37
37
|
# Iterates over the expressions of this expression as an array, passing
|
38
38
|
# the expression and its index within its parent to the given block.
|
39
|
-
def each_expression(include_self = false
|
39
|
+
def each_expression(include_self = false)
|
40
40
|
return enum_for(__method__, include_self) unless block_given?
|
41
41
|
|
42
42
|
traverse(include_self) do |event, exp, index|
|
@@ -47,7 +47,7 @@ module Regexp::Expression
|
|
47
47
|
# Returns a new array with the results of calling the given block once
|
48
48
|
# for every expression. If a block is not given, returns an array with
|
49
49
|
# each expression and its level index as an array.
|
50
|
-
def flat_map(include_self = false
|
50
|
+
def flat_map(include_self = false)
|
51
51
|
result = []
|
52
52
|
|
53
53
|
each_expression(include_self) do |exp, index|
|
@@ -12,7 +12,7 @@ module Regexp::Expression
|
|
12
12
|
@max = max
|
13
13
|
end
|
14
14
|
|
15
|
-
def
|
15
|
+
def initialize_copy(orig)
|
16
16
|
@text = orig.text.dup
|
17
17
|
super
|
18
18
|
end
|
@@ -40,5 +40,14 @@ module Regexp::Expression
|
|
40
40
|
RUBY
|
41
41
|
end
|
42
42
|
alias :lazy? :reluctant?
|
43
|
+
|
44
|
+
def ==(other)
|
45
|
+
other.class == self.class &&
|
46
|
+
other.token == token &&
|
47
|
+
other.mode == mode &&
|
48
|
+
other.min == min &&
|
49
|
+
other.max == max
|
50
|
+
end
|
51
|
+
alias :eq :==
|
43
52
|
end
|
44
53
|
end
|
@@ -7,16 +7,6 @@ module Regexp::Expression
|
|
7
7
|
# Used as the base class for the Alternation alternatives, Conditional
|
8
8
|
# branches, and CharacterSet::Intersection intersected sequences.
|
9
9
|
class Sequence < Regexp::Expression::Subexpression
|
10
|
-
# TODO: this override is here for backwards compatibility, remove in 2.0.0
|
11
|
-
def initialize(*args)
|
12
|
-
if args.count == 3
|
13
|
-
warn('WARNING: Sequence.new without a Regexp::Token argument is '\
|
14
|
-
'deprecated and will be removed in 2.0.0.')
|
15
|
-
return self.class.at_levels(*args)
|
16
|
-
end
|
17
|
-
super
|
18
|
-
end
|
19
|
-
|
20
10
|
class << self
|
21
11
|
def add_to(subexpression, params = {}, active_opts = {})
|
22
12
|
sequence = at_levels(
|
@@ -51,17 +41,11 @@ module Regexp::Expression
|
|
51
41
|
alias :ts :starts_at
|
52
42
|
|
53
43
|
def quantify(token, text, min = nil, max = nil, mode = :greedy)
|
54
|
-
|
55
|
-
target
|
56
|
-
|
57
|
-
target = expressions[offset -= 1]
|
58
|
-
end
|
59
|
-
|
60
|
-
target || raise(ArgumentError, "No valid target found for '#{text}' "\
|
61
|
-
'quantifier')
|
44
|
+
target = expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
|
45
|
+
target or raise Regexp::Parser::Error,
|
46
|
+
"No valid target found for '#{text}' quantifier"
|
62
47
|
|
63
48
|
target.quantify(token, text, min, max, mode)
|
64
49
|
end
|
65
50
|
end
|
66
|
-
|
67
51
|
end
|
data/lib/regexp_parser/lexer.rb
CHANGED
@@ -11,11 +11,11 @@ class Regexp::Lexer
|
|
11
11
|
|
12
12
|
CLOSING_TOKENS = [:close].freeze
|
13
13
|
|
14
|
-
def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", &block)
|
15
|
-
new.lex(input, syntax, &block)
|
14
|
+
def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
|
15
|
+
new.lex(input, syntax, options: options, &block)
|
16
16
|
end
|
17
17
|
|
18
|
-
def lex(input, syntax = "ruby/#{RUBY_VERSION}", &block)
|
18
|
+
def lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
|
19
19
|
syntax = Regexp::Syntax.new(syntax)
|
20
20
|
|
21
21
|
self.tokens = []
|
@@ -25,7 +25,7 @@ class Regexp::Lexer
|
|
25
25
|
self.shift = 0
|
26
26
|
|
27
27
|
last = nil
|
28
|
-
Regexp::Scanner.scan(input) do |type, token, text, ts, te|
|
28
|
+
Regexp::Scanner.scan(input, options: options) do |type, token, text, ts, te|
|
29
29
|
type, token = *syntax.normalize(type, token)
|
30
30
|
syntax.check! type, token
|
31
31
|
|
@@ -96,10 +96,10 @@ class Regexp::Lexer
|
|
96
96
|
|
97
97
|
tokens.pop
|
98
98
|
tokens << Regexp::Token.new(:literal, :literal, lead,
|
99
|
-
token.ts, (token.te - last.
|
99
|
+
token.ts, (token.te - last.length),
|
100
100
|
nesting, set_nesting, conditional_nesting)
|
101
101
|
tokens << Regexp::Token.new(:literal, :literal, last,
|
102
|
-
(token.ts + lead.
|
102
|
+
(token.ts + lead.length), token.te,
|
103
103
|
nesting, set_nesting, conditional_nesting)
|
104
104
|
end
|
105
105
|
|
data/lib/regexp_parser/parser.rb
CHANGED
@@ -1,10 +1,10 @@
|
|
1
|
+
require 'regexp_parser/error'
|
1
2
|
require 'regexp_parser/expression'
|
2
3
|
|
3
4
|
class Regexp::Parser
|
4
5
|
include Regexp::Expression
|
5
|
-
include Regexp::Syntax
|
6
6
|
|
7
|
-
class ParserError <
|
7
|
+
class ParserError < Regexp::Parser::Error; end
|
8
8
|
|
9
9
|
class UnknownTokenTypeError < ParserError
|
10
10
|
def initialize(type, token)
|
@@ -18,12 +18,12 @@ class Regexp::Parser
|
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
21
|
-
def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
|
22
|
-
new.parse(input, syntax, &block)
|
21
|
+
def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
|
22
|
+
new.parse(input, syntax, options: options, &block)
|
23
23
|
end
|
24
24
|
|
25
|
-
def parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
|
26
|
-
root = Root.build(
|
25
|
+
def parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
|
26
|
+
root = Root.build(extract_options(input, options))
|
27
27
|
|
28
28
|
self.root = root
|
29
29
|
self.node = root
|
@@ -35,7 +35,7 @@ class Regexp::Parser
|
|
35
35
|
|
36
36
|
self.captured_group_counts = Hash.new(0)
|
37
37
|
|
38
|
-
Regexp::Lexer.scan(input, syntax) do |token|
|
38
|
+
Regexp::Lexer.scan(input, syntax, options: options) do |token|
|
39
39
|
parse_token(token)
|
40
40
|
end
|
41
41
|
|
@@ -54,105 +54,171 @@ class Regexp::Parser
|
|
54
54
|
:options_stack, :switching_options, :conditional_nesting,
|
55
55
|
:captured_group_counts
|
56
56
|
|
57
|
-
def
|
58
|
-
|
57
|
+
def extract_options(input, options)
|
58
|
+
if options && !input.is_a?(String)
|
59
|
+
raise ArgumentError, 'options cannot be supplied unless parsing a String'
|
60
|
+
end
|
59
61
|
|
60
|
-
options =
|
61
|
-
options[:i] = true if input.options & ::Regexp::IGNORECASE != 0
|
62
|
-
options[:m] = true if input.options & ::Regexp::MULTILINE != 0
|
63
|
-
options[:x] = true if input.options & ::Regexp::EXTENDED != 0
|
64
|
-
options
|
65
|
-
end
|
62
|
+
options = input.options if input.is_a?(::Regexp)
|
66
63
|
|
67
|
-
|
68
|
-
nesting.push(exp)
|
69
|
-
node << exp
|
70
|
-
update_transplanted_subtree(exp, node)
|
71
|
-
self.node = exp
|
72
|
-
end
|
73
|
-
|
74
|
-
# subtrees are transplanted to build Alternations, Intersections, Ranges
|
75
|
-
def update_transplanted_subtree(exp, new_parent)
|
76
|
-
exp.nesting_level = new_parent.nesting_level + 1
|
77
|
-
exp.respond_to?(:each) &&
|
78
|
-
exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
|
79
|
-
end
|
80
|
-
|
81
|
-
def decrease_nesting
|
82
|
-
while nesting.last.is_a?(SequenceOperation)
|
83
|
-
nesting.pop
|
84
|
-
self.node = nesting.last
|
85
|
-
end
|
86
|
-
nesting.pop
|
87
|
-
yield(node) if block_given?
|
88
|
-
self.node = nesting.last
|
89
|
-
self.node = node.last if node.last.is_a?(SequenceOperation)
|
90
|
-
end
|
64
|
+
return {} unless options
|
91
65
|
|
92
|
-
|
93
|
-
|
94
|
-
|
66
|
+
enabled_options = {}
|
67
|
+
enabled_options[:i] = true if options & ::Regexp::IGNORECASE != 0
|
68
|
+
enabled_options[:m] = true if options & ::Regexp::MULTILINE != 0
|
69
|
+
enabled_options[:x] = true if options & ::Regexp::EXTENDED != 0
|
70
|
+
enabled_options
|
95
71
|
end
|
96
72
|
|
97
73
|
def parse_token(token)
|
98
|
-
close_completed_character_set_range
|
99
|
-
|
100
74
|
case token.type
|
101
|
-
when :
|
102
|
-
when :
|
103
|
-
when :
|
104
|
-
when :
|
105
|
-
when :
|
106
|
-
when :
|
107
|
-
when :
|
108
|
-
when :
|
109
|
-
when :
|
110
|
-
when :
|
111
|
-
when :
|
112
|
-
|
113
|
-
when :
|
114
|
-
|
115
|
-
when :property, :nonproperty
|
116
|
-
property(token)
|
117
|
-
|
118
|
-
when :literal
|
119
|
-
node << Literal.new(token, active_opts)
|
120
|
-
when :free_space
|
121
|
-
free_space(token)
|
122
|
-
|
75
|
+
when :anchor; anchor(token)
|
76
|
+
when :assertion, :group; group(token)
|
77
|
+
when :backref; backref(token)
|
78
|
+
when :conditional; conditional(token)
|
79
|
+
when :escape; escape(token)
|
80
|
+
when :free_space; free_space(token)
|
81
|
+
when :keep; keep(token)
|
82
|
+
when :literal; literal(token)
|
83
|
+
when :meta; meta(token)
|
84
|
+
when :posixclass, :nonposixclass; posixclass(token)
|
85
|
+
when :property, :nonproperty; property(token)
|
86
|
+
when :quantifier; quantifier(token)
|
87
|
+
when :set; set(token)
|
88
|
+
when :type; type(token)
|
123
89
|
else
|
124
90
|
raise UnknownTokenTypeError.new(token.type, token)
|
125
91
|
end
|
92
|
+
|
93
|
+
close_completed_character_set_range
|
126
94
|
end
|
127
95
|
|
128
|
-
def
|
96
|
+
def anchor(token)
|
129
97
|
case token.token
|
130
|
-
when :
|
131
|
-
|
132
|
-
when :
|
133
|
-
|
134
|
-
when :
|
135
|
-
|
136
|
-
when :
|
137
|
-
|
138
|
-
when :intersection
|
139
|
-
intersection(token)
|
140
|
-
when :collation, :equivalent
|
141
|
-
node << Literal.new(token, active_opts)
|
98
|
+
when :bol; node << Anchor::BeginningOfLine.new(token, active_opts)
|
99
|
+
when :bos; node << Anchor::BOS.new(token, active_opts)
|
100
|
+
when :eol; node << Anchor::EndOfLine.new(token, active_opts)
|
101
|
+
when :eos; node << Anchor::EOS.new(token, active_opts)
|
102
|
+
when :eos_ob_eol; node << Anchor::EOSobEOL.new(token, active_opts)
|
103
|
+
when :match_start; node << Anchor::MatchStart.new(token, active_opts)
|
104
|
+
when :nonword_boundary; node << Anchor::NonWordBoundary.new(token, active_opts)
|
105
|
+
when :word_boundary; node << Anchor::WordBoundary.new(token, active_opts)
|
142
106
|
else
|
143
|
-
raise UnknownTokenError.new('
|
107
|
+
raise UnknownTokenError.new('Anchor', token)
|
144
108
|
end
|
145
109
|
end
|
146
110
|
|
147
|
-
def
|
111
|
+
def group(token)
|
148
112
|
case token.token
|
149
|
-
when :
|
150
|
-
|
151
|
-
when :
|
152
|
-
|
113
|
+
when :options, :options_switch
|
114
|
+
options_group(token)
|
115
|
+
when :close
|
116
|
+
close_group
|
117
|
+
when :comment
|
118
|
+
node << Group::Comment.new(token, active_opts)
|
153
119
|
else
|
154
|
-
|
120
|
+
open_group(token)
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
MOD_FLAGS = %w[i m x].map(&:to_sym)
|
125
|
+
ENC_FLAGS = %w[a d u].map(&:to_sym)
|
126
|
+
|
127
|
+
def options_group(token)
|
128
|
+
positive, negative = token.text.split('-', 2)
|
129
|
+
negative ||= ''
|
130
|
+
self.switching_options = token.token.equal?(:options_switch)
|
131
|
+
|
132
|
+
opt_changes = {}
|
133
|
+
new_active_opts = active_opts.dup
|
134
|
+
|
135
|
+
MOD_FLAGS.each do |flag|
|
136
|
+
if positive.include?(flag.to_s)
|
137
|
+
opt_changes[flag] = new_active_opts[flag] = true
|
138
|
+
end
|
139
|
+
if negative.include?(flag.to_s)
|
140
|
+
opt_changes[flag] = false
|
141
|
+
new_active_opts.delete(flag)
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
145
|
+
if (enc_flag = positive.reverse[/[adu]/])
|
146
|
+
enc_flag = enc_flag.to_sym
|
147
|
+
(ENC_FLAGS - [enc_flag]).each do |other|
|
148
|
+
opt_changes[other] = false if new_active_opts[other]
|
149
|
+
new_active_opts.delete(other)
|
150
|
+
end
|
151
|
+
opt_changes[enc_flag] = new_active_opts[enc_flag] = true
|
152
|
+
end
|
153
|
+
|
154
|
+
options_stack << new_active_opts
|
155
|
+
|
156
|
+
options_group = Group::Options.new(token, active_opts)
|
157
|
+
options_group.option_changes = opt_changes
|
158
|
+
|
159
|
+
nest(options_group)
|
160
|
+
end
|
161
|
+
|
162
|
+
def open_group(token)
|
163
|
+
group_class =
|
164
|
+
case token.token
|
165
|
+
when :absence; Group::Absence
|
166
|
+
when :atomic; Group::Atomic
|
167
|
+
when :capture; Group::Capture
|
168
|
+
when :named; Group::Named
|
169
|
+
when :passive; Group::Passive
|
170
|
+
|
171
|
+
when :lookahead; Assertion::Lookahead
|
172
|
+
when :lookbehind; Assertion::Lookbehind
|
173
|
+
when :nlookahead; Assertion::NegativeLookahead
|
174
|
+
when :nlookbehind; Assertion::NegativeLookbehind
|
175
|
+
|
176
|
+
else
|
177
|
+
raise UnknownTokenError.new('Group type open', token)
|
178
|
+
end
|
179
|
+
|
180
|
+
group = group_class.new(token, active_opts)
|
181
|
+
|
182
|
+
if group.capturing?
|
183
|
+
group.number = total_captured_group_count + 1
|
184
|
+
group.number_at_level = captured_group_count_at_level + 1
|
185
|
+
count_captured_group
|
186
|
+
end
|
187
|
+
|
188
|
+
# Push the active options to the stack again. This way we can simply pop the
|
189
|
+
# stack for any group we close, no matter if it had its own options or not.
|
190
|
+
options_stack << active_opts
|
191
|
+
|
192
|
+
nest(group)
|
193
|
+
end
|
194
|
+
|
195
|
+
def total_captured_group_count
|
196
|
+
captured_group_counts.values.reduce(0, :+)
|
197
|
+
end
|
198
|
+
|
199
|
+
def captured_group_count_at_level
|
200
|
+
captured_group_counts[node.level]
|
201
|
+
end
|
202
|
+
|
203
|
+
def count_captured_group
|
204
|
+
captured_group_counts[node.level] += 1
|
205
|
+
end
|
206
|
+
|
207
|
+
def close_group
|
208
|
+
options_stack.pop unless switching_options
|
209
|
+
self.switching_options = false
|
210
|
+
decrease_nesting
|
211
|
+
end
|
212
|
+
|
213
|
+
def decrease_nesting
|
214
|
+
while nesting.last.is_a?(SequenceOperation)
|
215
|
+
nesting.pop
|
216
|
+
self.node = nesting.last
|
155
217
|
end
|
218
|
+
nesting.pop
|
219
|
+
yield(node) if block_given?
|
220
|
+
self.node = nesting.last
|
221
|
+
self.node = node.last if node.last.is_a?(SequenceOperation)
|
156
222
|
end
|
157
223
|
|
158
224
|
def backref(token)
|
@@ -182,31 +248,9 @@ class Regexp::Parser
|
|
182
248
|
end
|
183
249
|
end
|
184
250
|
|
185
|
-
def
|
186
|
-
|
187
|
-
|
188
|
-
node << CharacterType::Digit.new(token, active_opts)
|
189
|
-
when :nondigit
|
190
|
-
node << CharacterType::NonDigit.new(token, active_opts)
|
191
|
-
when :hex
|
192
|
-
node << CharacterType::Hex.new(token, active_opts)
|
193
|
-
when :nonhex
|
194
|
-
node << CharacterType::NonHex.new(token, active_opts)
|
195
|
-
when :space
|
196
|
-
node << CharacterType::Space.new(token, active_opts)
|
197
|
-
when :nonspace
|
198
|
-
node << CharacterType::NonSpace.new(token, active_opts)
|
199
|
-
when :word
|
200
|
-
node << CharacterType::Word.new(token, active_opts)
|
201
|
-
when :nonword
|
202
|
-
node << CharacterType::NonWord.new(token, active_opts)
|
203
|
-
when :linebreak
|
204
|
-
node << CharacterType::Linebreak.new(token, active_opts)
|
205
|
-
when :xgrapheme
|
206
|
-
node << CharacterType::ExtendedGrapheme.new(token, active_opts)
|
207
|
-
else
|
208
|
-
raise UnknownTokenError.new('CharacterType', token)
|
209
|
-
end
|
251
|
+
def assign_effective_number(exp)
|
252
|
+
exp.effective_number =
|
253
|
+
exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
|
210
254
|
end
|
211
255
|
|
212
256
|
def conditional(token)
|
@@ -234,11 +278,118 @@ class Regexp::Parser
|
|
234
278
|
end
|
235
279
|
end
|
236
280
|
|
281
|
+
def nest_conditional(exp)
|
282
|
+
conditional_nesting.push(exp)
|
283
|
+
nest(exp)
|
284
|
+
end
|
285
|
+
|
286
|
+
def nest(exp)
|
287
|
+
nesting.push(exp)
|
288
|
+
node << exp
|
289
|
+
update_transplanted_subtree(exp, node)
|
290
|
+
self.node = exp
|
291
|
+
end
|
292
|
+
|
293
|
+
# subtrees are transplanted to build Alternations, Intersections, Ranges
|
294
|
+
def update_transplanted_subtree(exp, new_parent)
|
295
|
+
exp.nesting_level = new_parent.nesting_level + 1
|
296
|
+
exp.respond_to?(:each) &&
|
297
|
+
exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
|
298
|
+
end
|
299
|
+
|
300
|
+
def escape(token)
|
301
|
+
case token.token
|
302
|
+
|
303
|
+
when :backspace; node << EscapeSequence::Backspace.new(token, active_opts)
|
304
|
+
|
305
|
+
when :escape; node << EscapeSequence::AsciiEscape.new(token, active_opts)
|
306
|
+
when :bell; node << EscapeSequence::Bell.new(token, active_opts)
|
307
|
+
when :form_feed; node << EscapeSequence::FormFeed.new(token, active_opts)
|
308
|
+
when :newline; node << EscapeSequence::Newline.new(token, active_opts)
|
309
|
+
when :carriage; node << EscapeSequence::Return.new(token, active_opts)
|
310
|
+
when :tab; node << EscapeSequence::Tab.new(token, active_opts)
|
311
|
+
when :vertical_tab; node << EscapeSequence::VerticalTab.new(token, active_opts)
|
312
|
+
|
313
|
+
when :codepoint; node << EscapeSequence::Codepoint.new(token, active_opts)
|
314
|
+
when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
|
315
|
+
when :hex; node << EscapeSequence::Hex.new(token, active_opts)
|
316
|
+
when :octal; node << EscapeSequence::Octal.new(token, active_opts)
|
317
|
+
|
318
|
+
when :control
|
319
|
+
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
320
|
+
node << EscapeSequence::MetaControl.new(token, active_opts)
|
321
|
+
else
|
322
|
+
node << EscapeSequence::Control.new(token, active_opts)
|
323
|
+
end
|
324
|
+
|
325
|
+
when :meta_sequence
|
326
|
+
if token.text =~ /\A\\M-\\[Cc]/
|
327
|
+
node << EscapeSequence::MetaControl.new(token, active_opts)
|
328
|
+
else
|
329
|
+
node << EscapeSequence::Meta.new(token, active_opts)
|
330
|
+
end
|
331
|
+
|
332
|
+
else
|
333
|
+
# treating everything else as a literal
|
334
|
+
# TODO: maybe split this up a bit more in v3.0.0?
|
335
|
+
# E.g. escaped quantifiers or set meta chars are not the same
|
336
|
+
# as stuff that would be a literal even without the backslash.
|
337
|
+
# Right now, they all end up here.
|
338
|
+
node << EscapeSequence::Literal.new(token, active_opts)
|
339
|
+
end
|
340
|
+
end
|
341
|
+
|
342
|
+
def free_space(token)
|
343
|
+
case token.token
|
344
|
+
when :comment
|
345
|
+
node << Comment.new(token, active_opts)
|
346
|
+
when :whitespace
|
347
|
+
if node.last.is_a?(WhiteSpace)
|
348
|
+
node.last.merge(WhiteSpace.new(token, active_opts))
|
349
|
+
else
|
350
|
+
node << WhiteSpace.new(token, active_opts)
|
351
|
+
end
|
352
|
+
else
|
353
|
+
raise UnknownTokenError.new('FreeSpace', token)
|
354
|
+
end
|
355
|
+
end
|
356
|
+
|
357
|
+
def keep(token)
|
358
|
+
node << Keep::Mark.new(token, active_opts)
|
359
|
+
end
|
360
|
+
|
361
|
+
def literal(token)
|
362
|
+
node << Literal.new(token, active_opts)
|
363
|
+
end
|
364
|
+
|
365
|
+
def meta(token)
|
366
|
+
case token.token
|
367
|
+
when :dot
|
368
|
+
node << CharacterType::Any.new(token, active_opts)
|
369
|
+
when :alternation
|
370
|
+
sequence_operation(Alternation, token)
|
371
|
+
else
|
372
|
+
raise UnknownTokenError.new('Meta', token)
|
373
|
+
end
|
374
|
+
end
|
375
|
+
|
376
|
+
def sequence_operation(klass, token)
|
377
|
+
unless node.is_a?(klass)
|
378
|
+
operator = klass.new(token, active_opts)
|
379
|
+
sequence = operator.add_sequence(active_opts)
|
380
|
+
sequence.expressions = node.expressions
|
381
|
+
node.expressions = []
|
382
|
+
nest(operator)
|
383
|
+
end
|
384
|
+
node.add_sequence(active_opts)
|
385
|
+
end
|
386
|
+
|
237
387
|
def posixclass(token)
|
238
388
|
node << PosixClass.new(token, active_opts)
|
239
389
|
end
|
240
390
|
|
241
391
|
include Regexp::Expression::UnicodeProperty
|
392
|
+
UPTokens = Regexp::Syntax::Token::UnicodeProperty
|
242
393
|
|
243
394
|
def property(token)
|
244
395
|
case token.token
|
@@ -310,128 +461,43 @@ class Regexp::Parser
|
|
310
461
|
when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
|
311
462
|
when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
|
312
463
|
|
313
|
-
when *
|
314
|
-
node <<
|
315
|
-
|
316
|
-
when *
|
317
|
-
|
318
|
-
|
319
|
-
when *Token::UnicodeProperty::Emoji
|
320
|
-
node << Emoji.new(token, active_opts)
|
321
|
-
|
322
|
-
when *Token::UnicodeProperty::Script
|
323
|
-
node << Script.new(token, active_opts)
|
324
|
-
|
325
|
-
when *Token::UnicodeProperty::UnicodeBlock
|
326
|
-
node << Block.new(token, active_opts)
|
464
|
+
when *UPTokens::Age; node << Age.new(token, active_opts)
|
465
|
+
when *UPTokens::Derived; node << Derived.new(token, active_opts)
|
466
|
+
when *UPTokens::Emoji; node << Emoji.new(token, active_opts)
|
467
|
+
when *UPTokens::Script; node << Script.new(token, active_opts)
|
468
|
+
when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts)
|
327
469
|
|
328
470
|
else
|
329
471
|
raise UnknownTokenError.new('UnicodeProperty', token)
|
330
472
|
end
|
331
473
|
end
|
332
474
|
|
333
|
-
def anchor(token)
|
334
|
-
case token.token
|
335
|
-
when :bol
|
336
|
-
node << Anchor::BeginningOfLine.new(token, active_opts)
|
337
|
-
when :eol
|
338
|
-
node << Anchor::EndOfLine.new(token, active_opts)
|
339
|
-
when :bos
|
340
|
-
node << Anchor::BOS.new(token, active_opts)
|
341
|
-
when :eos
|
342
|
-
node << Anchor::EOS.new(token, active_opts)
|
343
|
-
when :eos_ob_eol
|
344
|
-
node << Anchor::EOSobEOL.new(token, active_opts)
|
345
|
-
when :word_boundary
|
346
|
-
node << Anchor::WordBoundary.new(token, active_opts)
|
347
|
-
when :nonword_boundary
|
348
|
-
node << Anchor::NonWordBoundary.new(token, active_opts)
|
349
|
-
when :match_start
|
350
|
-
node << Anchor::MatchStart.new(token, active_opts)
|
351
|
-
else
|
352
|
-
raise UnknownTokenError.new('Anchor', token)
|
353
|
-
end
|
354
|
-
end
|
355
|
-
|
356
|
-
def escape(token)
|
357
|
-
case token.token
|
358
|
-
|
359
|
-
when :backspace
|
360
|
-
node << EscapeSequence::Backspace.new(token, active_opts)
|
361
|
-
|
362
|
-
when :escape
|
363
|
-
node << EscapeSequence::AsciiEscape.new(token, active_opts)
|
364
|
-
when :bell
|
365
|
-
node << EscapeSequence::Bell.new(token, active_opts)
|
366
|
-
when :form_feed
|
367
|
-
node << EscapeSequence::FormFeed.new(token, active_opts)
|
368
|
-
when :newline
|
369
|
-
node << EscapeSequence::Newline.new(token, active_opts)
|
370
|
-
when :carriage
|
371
|
-
node << EscapeSequence::Return.new(token, active_opts)
|
372
|
-
when :tab
|
373
|
-
node << EscapeSequence::Tab.new(token, active_opts)
|
374
|
-
when :vertical_tab
|
375
|
-
node << EscapeSequence::VerticalTab.new(token, active_opts)
|
376
|
-
|
377
|
-
when :hex
|
378
|
-
node << EscapeSequence::Hex.new(token, active_opts)
|
379
|
-
when :octal
|
380
|
-
node << EscapeSequence::Octal.new(token, active_opts)
|
381
|
-
when :codepoint
|
382
|
-
node << EscapeSequence::Codepoint.new(token, active_opts)
|
383
|
-
when :codepoint_list
|
384
|
-
node << EscapeSequence::CodepointList.new(token, active_opts)
|
385
|
-
|
386
|
-
when :control
|
387
|
-
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
388
|
-
node << EscapeSequence::MetaControl.new(token, active_opts)
|
389
|
-
else
|
390
|
-
node << EscapeSequence::Control.new(token, active_opts)
|
391
|
-
end
|
392
|
-
|
393
|
-
when :meta_sequence
|
394
|
-
if token.text =~ /\A\\M-\\[Cc]/
|
395
|
-
node << EscapeSequence::MetaControl.new(token, active_opts)
|
396
|
-
else
|
397
|
-
node << EscapeSequence::Meta.new(token, active_opts)
|
398
|
-
end
|
399
|
-
|
400
|
-
else
|
401
|
-
# treating everything else as a literal
|
402
|
-
node << EscapeSequence::Literal.new(token, active_opts)
|
403
|
-
end
|
404
|
-
end
|
405
|
-
|
406
|
-
def keep(token)
|
407
|
-
node << Keep::Mark.new(token, active_opts)
|
408
|
-
end
|
409
|
-
|
410
|
-
def free_space(token)
|
411
|
-
case token.token
|
412
|
-
when :comment
|
413
|
-
node << Comment.new(token, active_opts)
|
414
|
-
when :whitespace
|
415
|
-
if node.last.is_a?(WhiteSpace)
|
416
|
-
node.last.merge(WhiteSpace.new(token, active_opts))
|
417
|
-
else
|
418
|
-
node << WhiteSpace.new(token, active_opts)
|
419
|
-
end
|
420
|
-
else
|
421
|
-
raise UnknownTokenError.new('FreeSpace', token)
|
422
|
-
end
|
423
|
-
end
|
424
|
-
|
425
475
|
def quantifier(token)
|
426
|
-
|
427
|
-
target_node
|
428
|
-
|
429
|
-
|
476
|
+
target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
|
477
|
+
target_node or raise ParserError, "No valid target found for '#{token.text}'"
|
478
|
+
|
479
|
+
# in case of chained quantifiers, wrap target in an implicit passive group
|
480
|
+
# description of the problem: https://github.com/ammar/regexp_parser/issues/3
|
481
|
+
# rationale for this solution: https://github.com/ammar/regexp_parser/pull/69
|
482
|
+
if target_node.quantified?
|
483
|
+
new_token = Regexp::Token.new(
|
484
|
+
:group,
|
485
|
+
:passive,
|
486
|
+
'', # text
|
487
|
+
target_node.ts,
|
488
|
+
nil, # te (unused)
|
489
|
+
target_node.level,
|
490
|
+
target_node.set_level,
|
491
|
+
target_node.conditional_level
|
492
|
+
)
|
493
|
+
new_group = Group::Passive.new(new_token, active_opts)
|
494
|
+
new_group.implicit = true
|
495
|
+
new_group << target_node
|
496
|
+
increase_level(target_node)
|
497
|
+
node.expressions[node.expressions.index(target_node)] = new_group
|
498
|
+
target_node = new_group
|
430
499
|
end
|
431
500
|
|
432
|
-
target_node || raise(ArgumentError, 'No valid target found for '\
|
433
|
-
"'#{token.text}' ")
|
434
|
-
|
435
501
|
case token.token
|
436
502
|
when :zero_or_one
|
437
503
|
target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
|
@@ -462,6 +528,11 @@ class Regexp::Parser
|
|
462
528
|
end
|
463
529
|
end
|
464
530
|
|
531
|
+
def increase_level(exp)
|
532
|
+
exp.level += 1
|
533
|
+
exp.respond_to?(:each) && exp.each { |subexp| increase_level(subexp) }
|
534
|
+
end
|
535
|
+
|
465
536
|
def interval(target_node, token)
|
466
537
|
text = token.text
|
467
538
|
mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
|
@@ -484,100 +555,16 @@ class Regexp::Parser
|
|
484
555
|
target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
|
485
556
|
end
|
486
557
|
|
487
|
-
def
|
488
|
-
case token.token
|
489
|
-
when :options, :options_switch
|
490
|
-
options_group(token)
|
491
|
-
when :close
|
492
|
-
close_group
|
493
|
-
when :comment
|
494
|
-
node << Group::Comment.new(token, active_opts)
|
495
|
-
else
|
496
|
-
open_group(token)
|
497
|
-
end
|
498
|
-
end
|
499
|
-
|
500
|
-
MOD_FLAGS = %w[i m x].map(&:to_sym)
|
501
|
-
ENC_FLAGS = %w[a d u].map(&:to_sym)
|
502
|
-
|
503
|
-
def options_group(token)
|
504
|
-
positive, negative = token.text.split('-', 2)
|
505
|
-
negative ||= ''
|
506
|
-
self.switching_options = token.token.equal?(:options_switch)
|
507
|
-
|
508
|
-
opt_changes = {}
|
509
|
-
new_active_opts = active_opts.dup
|
510
|
-
|
511
|
-
MOD_FLAGS.each do |flag|
|
512
|
-
if positive.include?(flag.to_s)
|
513
|
-
opt_changes[flag] = new_active_opts[flag] = true
|
514
|
-
end
|
515
|
-
if negative.include?(flag.to_s)
|
516
|
-
opt_changes[flag] = false
|
517
|
-
new_active_opts.delete(flag)
|
518
|
-
end
|
519
|
-
end
|
520
|
-
|
521
|
-
if (enc_flag = positive.reverse[/[adu]/])
|
522
|
-
enc_flag = enc_flag.to_sym
|
523
|
-
(ENC_FLAGS - [enc_flag]).each do |other|
|
524
|
-
opt_changes[other] = false if new_active_opts[other]
|
525
|
-
new_active_opts.delete(other)
|
526
|
-
end
|
527
|
-
opt_changes[enc_flag] = new_active_opts[enc_flag] = true
|
528
|
-
end
|
529
|
-
|
530
|
-
options_stack << new_active_opts
|
531
|
-
|
532
|
-
options_group = Group::Options.new(token, active_opts)
|
533
|
-
options_group.option_changes = opt_changes
|
534
|
-
|
535
|
-
nest(options_group)
|
536
|
-
end
|
537
|
-
|
538
|
-
def open_group(token)
|
558
|
+
def set(token)
|
539
559
|
case token.token
|
540
|
-
when :
|
541
|
-
|
542
|
-
when :
|
543
|
-
|
544
|
-
when :
|
545
|
-
exp = Group::Named.new(token, active_opts)
|
546
|
-
when :capture
|
547
|
-
exp = Group::Capture.new(token, active_opts)
|
548
|
-
when :absence
|
549
|
-
exp = Group::Absence.new(token, active_opts)
|
550
|
-
|
551
|
-
when :lookahead
|
552
|
-
exp = Assertion::Lookahead.new(token, active_opts)
|
553
|
-
when :nlookahead
|
554
|
-
exp = Assertion::NegativeLookahead.new(token, active_opts)
|
555
|
-
when :lookbehind
|
556
|
-
exp = Assertion::Lookbehind.new(token, active_opts)
|
557
|
-
when :nlookbehind
|
558
|
-
exp = Assertion::NegativeLookbehind.new(token, active_opts)
|
559
|
-
|
560
|
+
when :open; open_set(token)
|
561
|
+
when :close; close_set
|
562
|
+
when :negate; negate_set
|
563
|
+
when :range; range(token)
|
564
|
+
when :intersection; intersection(token)
|
560
565
|
else
|
561
|
-
raise UnknownTokenError.new('
|
562
|
-
end
|
563
|
-
|
564
|
-
if exp.capturing?
|
565
|
-
exp.number = total_captured_group_count + 1
|
566
|
-
exp.number_at_level = captured_group_count_at_level + 1
|
567
|
-
count_captured_group
|
566
|
+
raise UnknownTokenError.new('CharacterSet', token)
|
568
567
|
end
|
569
|
-
|
570
|
-
# Push the active options to the stack again. This way we can simply pop the
|
571
|
-
# stack for any group we close, no matter if it had its own options or not.
|
572
|
-
options_stack << active_opts
|
573
|
-
|
574
|
-
nest(exp)
|
575
|
-
end
|
576
|
-
|
577
|
-
def close_group
|
578
|
-
options_stack.pop unless switching_options
|
579
|
-
self.switching_options = false
|
580
|
-
decrease_nesting
|
581
568
|
end
|
582
569
|
|
583
570
|
def open_set(token)
|
@@ -600,51 +587,45 @@ class Regexp::Parser
|
|
600
587
|
nest(exp)
|
601
588
|
end
|
602
589
|
|
603
|
-
def close_completed_character_set_range
|
604
|
-
decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
|
605
|
-
end
|
606
|
-
|
607
590
|
def intersection(token)
|
608
591
|
sequence_operation(CharacterSet::Intersection, token)
|
609
592
|
end
|
610
593
|
|
611
|
-
def
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
594
|
+
def type(token)
|
595
|
+
case token.token
|
596
|
+
when :digit; node << CharacterType::Digit.new(token, active_opts)
|
597
|
+
when :hex; node << CharacterType::Hex.new(token, active_opts)
|
598
|
+
when :linebreak; node << CharacterType::Linebreak.new(token, active_opts)
|
599
|
+
when :nondigit; node << CharacterType::NonDigit.new(token, active_opts)
|
600
|
+
when :nonhex; node << CharacterType::NonHex.new(token, active_opts)
|
601
|
+
when :nonspace; node << CharacterType::NonSpace.new(token, active_opts)
|
602
|
+
when :nonword; node << CharacterType::NonWord.new(token, active_opts)
|
603
|
+
when :space; node << CharacterType::Space.new(token, active_opts)
|
604
|
+
when :word; node << CharacterType::Word.new(token, active_opts)
|
605
|
+
when :xgrapheme; node << CharacterType::ExtendedGrapheme.new(token, active_opts)
|
606
|
+
else
|
607
|
+
raise UnknownTokenError.new('CharacterType', token)
|
618
608
|
end
|
619
|
-
node.add_sequence(active_opts)
|
620
|
-
end
|
621
|
-
|
622
|
-
def active_opts
|
623
|
-
options_stack.last
|
624
|
-
end
|
625
|
-
|
626
|
-
def total_captured_group_count
|
627
|
-
captured_group_counts.values.reduce(0, :+)
|
628
|
-
end
|
629
|
-
|
630
|
-
def captured_group_count_at_level
|
631
|
-
captured_group_counts[node.level]
|
632
609
|
end
|
633
610
|
|
634
|
-
def
|
635
|
-
|
611
|
+
def close_completed_character_set_range
|
612
|
+
decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
|
636
613
|
end
|
637
614
|
|
638
|
-
def
|
639
|
-
|
640
|
-
exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
|
615
|
+
def active_opts
|
616
|
+
options_stack.last
|
641
617
|
end
|
642
618
|
|
619
|
+
# Assigns referenced expressions to refering expressions, e.g. if there is
|
620
|
+
# an instance of Backreference::Number, its #referenced_expression is set to
|
621
|
+
# the instance of Group::Capture that it refers to via its number.
|
643
622
|
def assign_referenced_expressions
|
644
623
|
targets = {}
|
624
|
+
# find all referencable expressions
|
645
625
|
root.each_expression do |exp|
|
646
626
|
exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
|
647
627
|
end
|
628
|
+
# assign them to any refering expressions
|
648
629
|
root.each_expression do |exp|
|
649
630
|
exp.respond_to?(:reference) &&
|
650
631
|
exp.referenced_expression = targets[exp.reference]
|