regexp_parser 1.7.0 → 2.8.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +364 -22
- data/Gemfile +8 -2
- data/LICENSE +1 -1
- data/README.md +124 -88
- data/Rakefile +6 -70
- data/lib/regexp_parser/error.rb +4 -0
- data/lib/regexp_parser/expression/base.rb +76 -0
- data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
- data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
- data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +22 -2
- data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +4 -8
- data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +3 -4
- data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
- data/lib/regexp_parser/expression/classes/conditional.rb +11 -5
- data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
- data/lib/regexp_parser/expression/classes/free_space.rb +5 -5
- data/lib/regexp_parser/expression/classes/group.rb +28 -15
- data/lib/regexp_parser/expression/classes/keep.rb +2 -0
- data/lib/regexp_parser/expression/classes/literal.rb +1 -5
- data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
- data/lib/regexp_parser/expression/classes/root.rb +4 -19
- data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +5 -3
- data/lib/regexp_parser/expression/methods/construct.rb +41 -0
- data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +11 -7
- data/lib/regexp_parser/expression/methods/parts.rb +23 -0
- data/lib/regexp_parser/expression/methods/printing.rb +26 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
- data/lib/regexp_parser/expression/methods/tests.rb +47 -1
- data/lib/regexp_parser/expression/methods/traverse.rb +34 -18
- data/lib/regexp_parser/expression/quantifier.rb +57 -17
- data/lib/regexp_parser/expression/sequence.rb +11 -47
- data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
- data/lib/regexp_parser/expression/shared.rb +111 -0
- data/lib/regexp_parser/expression/subexpression.rb +27 -19
- data/lib/regexp_parser/expression.rb +14 -141
- data/lib/regexp_parser/lexer.rb +83 -41
- data/lib/regexp_parser/parser.rb +371 -429
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
- data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
- data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
- data/lib/regexp_parser/scanner/properties/long.csv +633 -0
- data/lib/regexp_parser/scanner/properties/short.csv +248 -0
- data/lib/regexp_parser/scanner/property.rl +4 -4
- data/lib/regexp_parser/scanner/scanner.rl +295 -368
- data/lib/regexp_parser/scanner.rb +1405 -1674
- data/lib/regexp_parser/syntax/any.rb +2 -7
- data/lib/regexp_parser/syntax/base.rb +92 -67
- data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
- data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
- data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
- data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
- data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
- data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
- data/lib/regexp_parser/syntax/token/escape.rb +33 -0
- data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
- data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
- data/lib/regexp_parser/syntax/token/meta.rb +20 -0
- data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
- data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
- data/lib/regexp_parser/syntax/token/unicode_property.rb +733 -0
- data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
- data/lib/regexp_parser/syntax/token.rb +45 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +19 -36
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
- data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
- data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
- data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions.rb +3 -1
- data/lib/regexp_parser/syntax.rb +8 -6
- data/lib/regexp_parser/token.rb +9 -20
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +0 -2
- data/regexp_parser.gemspec +20 -22
- metadata +49 -166
- data/lib/regexp_parser/scanner/properties/long.yml +0 -594
- data/lib/regexp_parser/scanner/properties/short.yml +0 -237
- data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
- data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
- data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
- data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
- data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
- data/lib/regexp_parser/syntax/tokens.rb +0 -45
- data/spec/expression/base_spec.rb +0 -94
- data/spec/expression/clone_spec.rb +0 -120
- data/spec/expression/conditional_spec.rb +0 -89
- data/spec/expression/free_space_spec.rb +0 -27
- data/spec/expression/methods/match_length_spec.rb +0 -161
- data/spec/expression/methods/match_spec.rb +0 -25
- data/spec/expression/methods/strfregexp_spec.rb +0 -224
- data/spec/expression/methods/tests_spec.rb +0 -99
- data/spec/expression/methods/traverse_spec.rb +0 -161
- data/spec/expression/options_spec.rb +0 -128
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
- data/spec/expression/subexpression_spec.rb +0 -50
- data/spec/expression/to_h_spec.rb +0 -26
- data/spec/expression/to_s_spec.rb +0 -100
- data/spec/lexer/all_spec.rb +0 -22
- data/spec/lexer/conditionals_spec.rb +0 -53
- data/spec/lexer/escapes_spec.rb +0 -14
- data/spec/lexer/keep_spec.rb +0 -10
- data/spec/lexer/literals_spec.rb +0 -89
- data/spec/lexer/nesting_spec.rb +0 -99
- data/spec/lexer/refcalls_spec.rb +0 -55
- data/spec/parser/all_spec.rb +0 -43
- data/spec/parser/alternation_spec.rb +0 -88
- data/spec/parser/anchors_spec.rb +0 -17
- data/spec/parser/conditionals_spec.rb +0 -179
- data/spec/parser/errors_spec.rb +0 -30
- data/spec/parser/escapes_spec.rb +0 -121
- data/spec/parser/free_space_spec.rb +0 -130
- data/spec/parser/groups_spec.rb +0 -108
- data/spec/parser/keep_spec.rb +0 -6
- data/spec/parser/posix_classes_spec.rb +0 -8
- data/spec/parser/properties_spec.rb +0 -115
- data/spec/parser/quantifiers_spec.rb +0 -51
- data/spec/parser/refcalls_spec.rb +0 -112
- data/spec/parser/set/intersections_spec.rb +0 -127
- data/spec/parser/set/ranges_spec.rb +0 -111
- data/spec/parser/sets_spec.rb +0 -178
- data/spec/parser/types_spec.rb +0 -18
- data/spec/scanner/all_spec.rb +0 -18
- data/spec/scanner/anchors_spec.rb +0 -21
- data/spec/scanner/conditionals_spec.rb +0 -128
- data/spec/scanner/errors_spec.rb +0 -68
- data/spec/scanner/escapes_spec.rb +0 -53
- data/spec/scanner/free_space_spec.rb +0 -133
- data/spec/scanner/groups_spec.rb +0 -52
- data/spec/scanner/keep_spec.rb +0 -10
- data/spec/scanner/literals_spec.rb +0 -49
- data/spec/scanner/meta_spec.rb +0 -18
- data/spec/scanner/properties_spec.rb +0 -64
- data/spec/scanner/quantifiers_spec.rb +0 -20
- data/spec/scanner/refcalls_spec.rb +0 -36
- data/spec/scanner/sets_spec.rb +0 -102
- data/spec/scanner/types_spec.rb +0 -14
- data/spec/spec_helper.rb +0 -15
- data/spec/support/runner.rb +0 -42
- data/spec/support/shared_examples.rb +0 -77
- data/spec/support/warning_extractor.rb +0 -60
- data/spec/syntax/syntax_spec.rb +0 -48
- data/spec/syntax/syntax_token_map_spec.rb +0 -23
- data/spec/syntax/versions/1.8.6_spec.rb +0 -17
- data/spec/syntax/versions/1.9.1_spec.rb +0 -10
- data/spec/syntax/versions/1.9.3_spec.rb +0 -9
- data/spec/syntax/versions/2.0.0_spec.rb +0 -13
- data/spec/syntax/versions/2.2.0_spec.rb +0 -9
- data/spec/syntax/versions/aliases_spec.rb +0 -37
- data/spec/token/token_spec.rb +0 -85
- /data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
data/lib/regexp_parser/parser.rb
CHANGED
@@ -1,10 +1,10 @@
|
|
1
|
+
require 'regexp_parser/error'
|
1
2
|
require 'regexp_parser/expression'
|
2
3
|
|
3
4
|
class Regexp::Parser
|
4
5
|
include Regexp::Expression
|
5
|
-
include Regexp::Syntax
|
6
6
|
|
7
|
-
class ParserError <
|
7
|
+
class ParserError < Regexp::Parser::Error; end
|
8
8
|
|
9
9
|
class UnknownTokenTypeError < ParserError
|
10
10
|
def initialize(type, token)
|
@@ -18,12 +18,12 @@ class Regexp::Parser
|
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
21
|
-
def self.parse(input, syntax =
|
22
|
-
new.parse(input, syntax, &block)
|
21
|
+
def self.parse(input, syntax = nil, options: nil, &block)
|
22
|
+
new.parse(input, syntax, options: options, &block)
|
23
23
|
end
|
24
24
|
|
25
|
-
def parse(input, syntax =
|
26
|
-
root = Root.
|
25
|
+
def parse(input, syntax = nil, options: nil, &block)
|
26
|
+
root = Root.construct(options: extract_options(input, options))
|
27
27
|
|
28
28
|
self.root = root
|
29
29
|
self.node = root
|
@@ -35,10 +35,13 @@ class Regexp::Parser
|
|
35
35
|
|
36
36
|
self.captured_group_counts = Hash.new(0)
|
37
37
|
|
38
|
-
Regexp::Lexer.scan(input, syntax) do |token|
|
38
|
+
Regexp::Lexer.scan(input, syntax, options: options, collect_tokens: false) do |token|
|
39
39
|
parse_token(token)
|
40
40
|
end
|
41
41
|
|
42
|
+
# Trigger recursive setting of #nesting_level, which reflects how deep
|
43
|
+
# a node is in the tree. Do this at the end to account for tree rewrites.
|
44
|
+
root.nesting_level = 0
|
42
45
|
assign_referenced_expressions
|
43
46
|
|
44
47
|
if block_given?
|
@@ -54,107 +57,173 @@ class Regexp::Parser
|
|
54
57
|
:options_stack, :switching_options, :conditional_nesting,
|
55
58
|
:captured_group_counts
|
56
59
|
|
57
|
-
def
|
58
|
-
|
60
|
+
def extract_options(input, options)
|
61
|
+
if options && !input.is_a?(String)
|
62
|
+
raise ArgumentError, 'options cannot be supplied unless parsing a String'
|
63
|
+
end
|
59
64
|
|
60
|
-
options =
|
61
|
-
options[:i] = true if input.options & ::Regexp::IGNORECASE != 0
|
62
|
-
options[:m] = true if input.options & ::Regexp::MULTILINE != 0
|
63
|
-
options[:x] = true if input.options & ::Regexp::EXTENDED != 0
|
64
|
-
options
|
65
|
-
end
|
65
|
+
options = input.options if input.is_a?(::Regexp)
|
66
66
|
|
67
|
-
|
68
|
-
nesting.push(exp)
|
69
|
-
node << exp
|
70
|
-
update_transplanted_subtree(exp, node)
|
71
|
-
self.node = exp
|
72
|
-
end
|
67
|
+
return {} unless options
|
73
68
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
end
|
80
|
-
|
81
|
-
def decrease_nesting
|
82
|
-
while nesting.last.is_a?(SequenceOperation)
|
83
|
-
nesting.pop
|
84
|
-
self.node = nesting.last
|
85
|
-
end
|
86
|
-
nesting.pop
|
87
|
-
yield(node) if block_given?
|
88
|
-
self.node = nesting.last
|
89
|
-
self.node = node.last if node.last.is_a?(SequenceOperation)
|
90
|
-
end
|
91
|
-
|
92
|
-
def nest_conditional(exp)
|
93
|
-
conditional_nesting.push(exp)
|
94
|
-
nest(exp)
|
69
|
+
enabled_options = {}
|
70
|
+
enabled_options[:i] = true if options & ::Regexp::IGNORECASE != 0
|
71
|
+
enabled_options[:m] = true if options & ::Regexp::MULTILINE != 0
|
72
|
+
enabled_options[:x] = true if options & ::Regexp::EXTENDED != 0
|
73
|
+
enabled_options
|
95
74
|
end
|
96
75
|
|
97
76
|
def parse_token(token)
|
98
|
-
close_completed_character_set_range
|
99
|
-
|
100
77
|
case token.type
|
101
|
-
when :
|
102
|
-
when :
|
103
|
-
when :
|
104
|
-
when :
|
105
|
-
when :
|
106
|
-
when :
|
107
|
-
when :
|
108
|
-
when :
|
109
|
-
when :
|
110
|
-
when :
|
111
|
-
when :
|
112
|
-
|
113
|
-
when :
|
114
|
-
|
115
|
-
when :property, :nonproperty
|
116
|
-
property(token)
|
117
|
-
|
118
|
-
when :literal
|
119
|
-
node << Literal.new(token, active_opts)
|
120
|
-
when :free_space
|
121
|
-
free_space(token)
|
122
|
-
|
78
|
+
when :anchor; anchor(token)
|
79
|
+
when :assertion, :group; group(token)
|
80
|
+
when :backref; backref(token)
|
81
|
+
when :conditional; conditional(token)
|
82
|
+
when :escape; escape(token)
|
83
|
+
when :free_space; free_space(token)
|
84
|
+
when :keep; keep(token)
|
85
|
+
when :literal; literal(token)
|
86
|
+
when :meta; meta(token)
|
87
|
+
when :posixclass, :nonposixclass; posixclass(token)
|
88
|
+
when :property, :nonproperty; property(token)
|
89
|
+
when :quantifier; quantifier(token)
|
90
|
+
when :set; set(token)
|
91
|
+
when :type; type(token)
|
123
92
|
else
|
124
93
|
raise UnknownTokenTypeError.new(token.type, token)
|
125
94
|
end
|
95
|
+
|
96
|
+
close_completed_character_set_range
|
126
97
|
end
|
127
98
|
|
128
|
-
def
|
99
|
+
def anchor(token)
|
129
100
|
case token.token
|
130
|
-
when :
|
131
|
-
|
132
|
-
when :
|
133
|
-
|
134
|
-
when :
|
135
|
-
|
136
|
-
when :
|
137
|
-
|
138
|
-
when :intersection
|
139
|
-
intersection(token)
|
140
|
-
when :collation, :equivalent
|
141
|
-
node << Literal.new(token, active_opts)
|
101
|
+
when :bol; node << Anchor::BeginningOfLine.new(token, active_opts)
|
102
|
+
when :bos; node << Anchor::BOS.new(token, active_opts)
|
103
|
+
when :eol; node << Anchor::EndOfLine.new(token, active_opts)
|
104
|
+
when :eos; node << Anchor::EOS.new(token, active_opts)
|
105
|
+
when :eos_ob_eol; node << Anchor::EOSobEOL.new(token, active_opts)
|
106
|
+
when :match_start; node << Anchor::MatchStart.new(token, active_opts)
|
107
|
+
when :nonword_boundary; node << Anchor::NonWordBoundary.new(token, active_opts)
|
108
|
+
when :word_boundary; node << Anchor::WordBoundary.new(token, active_opts)
|
142
109
|
else
|
143
|
-
raise UnknownTokenError.new('
|
110
|
+
raise UnknownTokenError.new('Anchor', token)
|
144
111
|
end
|
145
112
|
end
|
146
113
|
|
147
|
-
def
|
114
|
+
def group(token)
|
148
115
|
case token.token
|
149
|
-
when :
|
150
|
-
|
151
|
-
when :
|
152
|
-
|
116
|
+
when :options, :options_switch
|
117
|
+
options_group(token)
|
118
|
+
when :close
|
119
|
+
close_group
|
120
|
+
when :comment
|
121
|
+
node << Group::Comment.new(token, active_opts)
|
153
122
|
else
|
154
|
-
|
123
|
+
open_group(token)
|
155
124
|
end
|
156
125
|
end
|
157
126
|
|
127
|
+
MOD_FLAGS = %w[i m x].map(&:to_sym)
|
128
|
+
ENC_FLAGS = %w[a d u].map(&:to_sym)
|
129
|
+
|
130
|
+
def options_group(token)
|
131
|
+
positive, negative = token.text.split('-', 2)
|
132
|
+
negative ||= ''
|
133
|
+
self.switching_options = token.token.equal?(:options_switch)
|
134
|
+
|
135
|
+
opt_changes = {}
|
136
|
+
new_active_opts = active_opts.dup
|
137
|
+
|
138
|
+
MOD_FLAGS.each do |flag|
|
139
|
+
if positive.include?(flag.to_s)
|
140
|
+
opt_changes[flag] = new_active_opts[flag] = true
|
141
|
+
end
|
142
|
+
if negative.include?(flag.to_s)
|
143
|
+
opt_changes[flag] = false
|
144
|
+
new_active_opts.delete(flag)
|
145
|
+
end
|
146
|
+
end
|
147
|
+
|
148
|
+
if (enc_flag = positive.reverse[/[adu]/])
|
149
|
+
enc_flag = enc_flag.to_sym
|
150
|
+
(ENC_FLAGS - [enc_flag]).each do |other|
|
151
|
+
opt_changes[other] = false if new_active_opts[other]
|
152
|
+
new_active_opts.delete(other)
|
153
|
+
end
|
154
|
+
opt_changes[enc_flag] = new_active_opts[enc_flag] = true
|
155
|
+
end
|
156
|
+
|
157
|
+
options_stack << new_active_opts
|
158
|
+
|
159
|
+
options_group = Group::Options.new(token, active_opts)
|
160
|
+
options_group.option_changes = opt_changes
|
161
|
+
|
162
|
+
nest(options_group)
|
163
|
+
end
|
164
|
+
|
165
|
+
def open_group(token)
|
166
|
+
group_class =
|
167
|
+
case token.token
|
168
|
+
when :absence; Group::Absence
|
169
|
+
when :atomic; Group::Atomic
|
170
|
+
when :capture; Group::Capture
|
171
|
+
when :named; Group::Named
|
172
|
+
when :passive; Group::Passive
|
173
|
+
|
174
|
+
when :lookahead; Assertion::Lookahead
|
175
|
+
when :lookbehind; Assertion::Lookbehind
|
176
|
+
when :nlookahead; Assertion::NegativeLookahead
|
177
|
+
when :nlookbehind; Assertion::NegativeLookbehind
|
178
|
+
|
179
|
+
else
|
180
|
+
raise UnknownTokenError.new('Group type open', token)
|
181
|
+
end
|
182
|
+
|
183
|
+
group = group_class.new(token, active_opts)
|
184
|
+
|
185
|
+
if group.capturing?
|
186
|
+
group.number = total_captured_group_count + 1
|
187
|
+
group.number_at_level = captured_group_count_at_level + 1
|
188
|
+
count_captured_group
|
189
|
+
end
|
190
|
+
|
191
|
+
# Push the active options to the stack again. This way we can simply pop the
|
192
|
+
# stack for any group we close, no matter if it had its own options or not.
|
193
|
+
options_stack << active_opts
|
194
|
+
|
195
|
+
nest(group)
|
196
|
+
end
|
197
|
+
|
198
|
+
def total_captured_group_count
|
199
|
+
captured_group_counts.values.reduce(0, :+)
|
200
|
+
end
|
201
|
+
|
202
|
+
def captured_group_count_at_level
|
203
|
+
captured_group_counts[node]
|
204
|
+
end
|
205
|
+
|
206
|
+
def count_captured_group
|
207
|
+
captured_group_counts[node] += 1
|
208
|
+
end
|
209
|
+
|
210
|
+
def close_group
|
211
|
+
options_stack.pop unless switching_options
|
212
|
+
self.switching_options = false
|
213
|
+
decrease_nesting
|
214
|
+
end
|
215
|
+
|
216
|
+
def decrease_nesting
|
217
|
+
while nesting.last.is_a?(SequenceOperation)
|
218
|
+
nesting.pop
|
219
|
+
self.node = nesting.last
|
220
|
+
end
|
221
|
+
nesting.pop
|
222
|
+
yield(node) if block_given?
|
223
|
+
self.node = nesting.last
|
224
|
+
self.node = node.last if node.last.is_a?(SequenceOperation)
|
225
|
+
end
|
226
|
+
|
158
227
|
def backref(token)
|
159
228
|
case token.token
|
160
229
|
when :name_ref
|
@@ -163,10 +232,18 @@ class Regexp::Parser
|
|
163
232
|
node << Backreference::NameRecursionLevel.new(token, active_opts)
|
164
233
|
when :name_call
|
165
234
|
node << Backreference::NameCall.new(token, active_opts)
|
166
|
-
when :number, :number_ref
|
235
|
+
when :number, :number_ref # TODO: split in v3.0.0
|
167
236
|
node << Backreference::Number.new(token, active_opts)
|
168
237
|
when :number_recursion_ref
|
169
|
-
node << Backreference::NumberRecursionLevel.new(token, active_opts)
|
238
|
+
node << Backreference::NumberRecursionLevel.new(token, active_opts).tap do |exp|
|
239
|
+
# TODO: should split off new token number_recursion_rel_ref and new
|
240
|
+
# class NumberRelativeRecursionLevel in v3.0.0 to get rid of this
|
241
|
+
if exp.text =~ /[<'][+-]/
|
242
|
+
assign_effective_number(exp)
|
243
|
+
else
|
244
|
+
exp.effective_number = exp.number
|
245
|
+
end
|
246
|
+
end
|
170
247
|
when :number_call
|
171
248
|
node << Backreference::NumberCall.new(token, active_opts)
|
172
249
|
when :number_rel_ref
|
@@ -182,31 +259,11 @@ class Regexp::Parser
|
|
182
259
|
end
|
183
260
|
end
|
184
261
|
|
185
|
-
def
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
node << CharacterType::NonDigit.new(token, active_opts)
|
191
|
-
when :hex
|
192
|
-
node << CharacterType::Hex.new(token, active_opts)
|
193
|
-
when :nonhex
|
194
|
-
node << CharacterType::NonHex.new(token, active_opts)
|
195
|
-
when :space
|
196
|
-
node << CharacterType::Space.new(token, active_opts)
|
197
|
-
when :nonspace
|
198
|
-
node << CharacterType::NonSpace.new(token, active_opts)
|
199
|
-
when :word
|
200
|
-
node << CharacterType::Word.new(token, active_opts)
|
201
|
-
when :nonword
|
202
|
-
node << CharacterType::NonWord.new(token, active_opts)
|
203
|
-
when :linebreak
|
204
|
-
node << CharacterType::Linebreak.new(token, active_opts)
|
205
|
-
when :xgrapheme
|
206
|
-
node << CharacterType::ExtendedGrapheme.new(token, active_opts)
|
207
|
-
else
|
208
|
-
raise UnknownTokenError.new('CharacterType', token)
|
209
|
-
end
|
262
|
+
def assign_effective_number(exp)
|
263
|
+
exp.effective_number =
|
264
|
+
exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
|
265
|
+
exp.effective_number > 0 ||
|
266
|
+
raise(ParserError, "Invalid reference: #{exp.reference}")
|
210
267
|
end
|
211
268
|
|
212
269
|
def conditional(token)
|
@@ -215,9 +272,9 @@ class Regexp::Parser
|
|
215
272
|
nest_conditional(Conditional::Expression.new(token, active_opts))
|
216
273
|
when :condition
|
217
274
|
conditional_nesting.last.condition = Conditional::Condition.new(token, active_opts)
|
218
|
-
conditional_nesting.last.add_sequence(active_opts)
|
275
|
+
conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
|
219
276
|
when :separator
|
220
|
-
conditional_nesting.last.add_sequence(active_opts)
|
277
|
+
conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
|
221
278
|
self.node = conditional_nesting.last.branches.last
|
222
279
|
when :close
|
223
280
|
conditional_nesting.pop
|
@@ -234,157 +291,38 @@ class Regexp::Parser
|
|
234
291
|
end
|
235
292
|
end
|
236
293
|
|
237
|
-
def
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
include Regexp::Expression::UnicodeProperty
|
242
|
-
|
243
|
-
def property(token)
|
244
|
-
case token.token
|
245
|
-
when :alnum; node << Alnum.new(token, active_opts)
|
246
|
-
when :alpha; node << Alpha.new(token, active_opts)
|
247
|
-
when :ascii; node << Ascii.new(token, active_opts)
|
248
|
-
when :blank; node << Blank.new(token, active_opts)
|
249
|
-
when :cntrl; node << Cntrl.new(token, active_opts)
|
250
|
-
when :digit; node << Digit.new(token, active_opts)
|
251
|
-
when :graph; node << Graph.new(token, active_opts)
|
252
|
-
when :lower; node << Lower.new(token, active_opts)
|
253
|
-
when :print; node << Print.new(token, active_opts)
|
254
|
-
when :punct; node << Punct.new(token, active_opts)
|
255
|
-
when :space; node << Space.new(token, active_opts)
|
256
|
-
when :upper; node << Upper.new(token, active_opts)
|
257
|
-
when :word; node << Word.new(token, active_opts)
|
258
|
-
when :xdigit; node << Xdigit.new(token, active_opts)
|
259
|
-
when :xposixpunct; node << XPosixPunct.new(token, active_opts)
|
260
|
-
|
261
|
-
# only in Oniguruma (old rubies)
|
262
|
-
when :newline; node << Newline.new(token, active_opts)
|
263
|
-
|
264
|
-
when :any; node << Any.new(token, active_opts)
|
265
|
-
when :assigned; node << Assigned.new(token, active_opts)
|
266
|
-
|
267
|
-
when :letter; node << Letter::Any.new(token, active_opts)
|
268
|
-
when :cased_letter; node << Letter::Cased.new(token, active_opts)
|
269
|
-
when :uppercase_letter; node << Letter::Uppercase.new(token, active_opts)
|
270
|
-
when :lowercase_letter; node << Letter::Lowercase.new(token, active_opts)
|
271
|
-
when :titlecase_letter; node << Letter::Titlecase.new(token, active_opts)
|
272
|
-
when :modifier_letter; node << Letter::Modifier.new(token, active_opts)
|
273
|
-
when :other_letter; node << Letter::Other.new(token, active_opts)
|
274
|
-
|
275
|
-
when :mark; node << Mark::Any.new(token, active_opts)
|
276
|
-
when :combining_mark; node << Mark::Combining.new(token, active_opts)
|
277
|
-
when :nonspacing_mark; node << Mark::Nonspacing.new(token, active_opts)
|
278
|
-
when :spacing_mark; node << Mark::Spacing.new(token, active_opts)
|
279
|
-
when :enclosing_mark; node << Mark::Enclosing.new(token, active_opts)
|
280
|
-
|
281
|
-
when :number; node << Number::Any.new(token, active_opts)
|
282
|
-
when :decimal_number; node << Number::Decimal.new(token, active_opts)
|
283
|
-
when :letter_number; node << Number::Letter.new(token, active_opts)
|
284
|
-
when :other_number; node << Number::Other.new(token, active_opts)
|
285
|
-
|
286
|
-
when :punctuation; node << Punctuation::Any.new(token, active_opts)
|
287
|
-
when :connector_punctuation; node << Punctuation::Connector.new(token, active_opts)
|
288
|
-
when :dash_punctuation; node << Punctuation::Dash.new(token, active_opts)
|
289
|
-
when :open_punctuation; node << Punctuation::Open.new(token, active_opts)
|
290
|
-
when :close_punctuation; node << Punctuation::Close.new(token, active_opts)
|
291
|
-
when :initial_punctuation; node << Punctuation::Initial.new(token, active_opts)
|
292
|
-
when :final_punctuation; node << Punctuation::Final.new(token, active_opts)
|
293
|
-
when :other_punctuation; node << Punctuation::Other.new(token, active_opts)
|
294
|
-
|
295
|
-
when :separator; node << Separator::Any.new(token, active_opts)
|
296
|
-
when :space_separator; node << Separator::Space.new(token, active_opts)
|
297
|
-
when :line_separator; node << Separator::Line.new(token, active_opts)
|
298
|
-
when :paragraph_separator; node << Separator::Paragraph.new(token, active_opts)
|
299
|
-
|
300
|
-
when :symbol; node << Symbol::Any.new(token, active_opts)
|
301
|
-
when :math_symbol; node << Symbol::Math.new(token, active_opts)
|
302
|
-
when :currency_symbol; node << Symbol::Currency.new(token, active_opts)
|
303
|
-
when :modifier_symbol; node << Symbol::Modifier.new(token, active_opts)
|
304
|
-
when :other_symbol; node << Symbol::Other.new(token, active_opts)
|
305
|
-
|
306
|
-
when :other; node << Codepoint::Any.new(token, active_opts)
|
307
|
-
when :control; node << Codepoint::Control.new(token, active_opts)
|
308
|
-
when :format; node << Codepoint::Format.new(token, active_opts)
|
309
|
-
when :surrogate; node << Codepoint::Surrogate.new(token, active_opts)
|
310
|
-
when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
|
311
|
-
when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
|
312
|
-
|
313
|
-
when *Token::UnicodeProperty::Age
|
314
|
-
node << Age.new(token, active_opts)
|
315
|
-
|
316
|
-
when *Token::UnicodeProperty::Derived
|
317
|
-
node << Derived.new(token, active_opts)
|
318
|
-
|
319
|
-
when *Token::UnicodeProperty::Emoji
|
320
|
-
node << Emoji.new(token, active_opts)
|
321
|
-
|
322
|
-
when *Token::UnicodeProperty::Script
|
323
|
-
node << Script.new(token, active_opts)
|
324
|
-
|
325
|
-
when *Token::UnicodeProperty::UnicodeBlock
|
326
|
-
node << Block.new(token, active_opts)
|
327
|
-
|
328
|
-
else
|
329
|
-
raise UnknownTokenError.new('UnicodeProperty', token)
|
330
|
-
end
|
294
|
+
def nest_conditional(exp)
|
295
|
+
conditional_nesting.push(exp)
|
296
|
+
nest(exp)
|
331
297
|
end
|
332
298
|
|
333
|
-
def
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
when :eol
|
338
|
-
node << Anchor::EndOfLine.new(token, active_opts)
|
339
|
-
when :bos
|
340
|
-
node << Anchor::BOS.new(token, active_opts)
|
341
|
-
when :eos
|
342
|
-
node << Anchor::EOS.new(token, active_opts)
|
343
|
-
when :eos_ob_eol
|
344
|
-
node << Anchor::EOSobEOL.new(token, active_opts)
|
345
|
-
when :word_boundary
|
346
|
-
node << Anchor::WordBoundary.new(token, active_opts)
|
347
|
-
when :nonword_boundary
|
348
|
-
node << Anchor::NonWordBoundary.new(token, active_opts)
|
349
|
-
when :match_start
|
350
|
-
node << Anchor::MatchStart.new(token, active_opts)
|
351
|
-
else
|
352
|
-
raise UnknownTokenError.new('Anchor', token)
|
353
|
-
end
|
299
|
+
def nest(exp)
|
300
|
+
nesting.push(exp)
|
301
|
+
node << exp
|
302
|
+
self.node = exp
|
354
303
|
end
|
355
304
|
|
356
305
|
def escape(token)
|
357
306
|
case token.token
|
358
307
|
|
359
|
-
when :backspace
|
360
|
-
|
361
|
-
|
362
|
-
when :
|
363
|
-
node << EscapeSequence::
|
364
|
-
when :
|
365
|
-
|
366
|
-
when :
|
367
|
-
|
368
|
-
|
369
|
-
node << EscapeSequence::
|
370
|
-
when :
|
371
|
-
|
372
|
-
when :
|
373
|
-
node << EscapeSequence::Tab.new(token, active_opts)
|
374
|
-
when :vertical_tab
|
375
|
-
node << EscapeSequence::VerticalTab.new(token, active_opts)
|
376
|
-
|
377
|
-
when :hex
|
378
|
-
node << EscapeSequence::Hex.new(token, active_opts)
|
379
|
-
when :octal
|
380
|
-
node << EscapeSequence::Octal.new(token, active_opts)
|
381
|
-
when :codepoint
|
382
|
-
node << EscapeSequence::Codepoint.new(token, active_opts)
|
383
|
-
when :codepoint_list
|
384
|
-
node << EscapeSequence::CodepointList.new(token, active_opts)
|
308
|
+
when :backspace; node << EscapeSequence::Backspace.new(token, active_opts)
|
309
|
+
|
310
|
+
when :escape; node << EscapeSequence::AsciiEscape.new(token, active_opts)
|
311
|
+
when :bell; node << EscapeSequence::Bell.new(token, active_opts)
|
312
|
+
when :form_feed; node << EscapeSequence::FormFeed.new(token, active_opts)
|
313
|
+
when :newline; node << EscapeSequence::Newline.new(token, active_opts)
|
314
|
+
when :carriage; node << EscapeSequence::Return.new(token, active_opts)
|
315
|
+
when :tab; node << EscapeSequence::Tab.new(token, active_opts)
|
316
|
+
when :vertical_tab; node << EscapeSequence::VerticalTab.new(token, active_opts)
|
317
|
+
|
318
|
+
when :codepoint; node << EscapeSequence::Codepoint.new(token, active_opts)
|
319
|
+
when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
|
320
|
+
when :hex; node << EscapeSequence::Hex.new(token, active_opts)
|
321
|
+
when :octal; node << EscapeSequence::Octal.new(token, active_opts)
|
385
322
|
|
386
323
|
when :control
|
387
324
|
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
325
|
+
# TODO: emit :meta_control_sequence token in v3.0.0
|
388
326
|
node << EscapeSequence::MetaControl.new(token, active_opts)
|
389
327
|
else
|
390
328
|
node << EscapeSequence::Control.new(token, active_opts)
|
@@ -392,6 +330,7 @@ class Regexp::Parser
|
|
392
330
|
|
393
331
|
when :meta_sequence
|
394
332
|
if token.text =~ /\A\\M-\\[Cc]/
|
333
|
+
# TODO: emit :meta_control_sequence token in v3.0.0:
|
395
334
|
node << EscapeSequence::MetaControl.new(token, active_opts)
|
396
335
|
else
|
397
336
|
node << EscapeSequence::Meta.new(token, active_opts)
|
@@ -399,188 +338,194 @@ class Regexp::Parser
|
|
399
338
|
|
400
339
|
else
|
401
340
|
# treating everything else as a literal
|
341
|
+
# TODO: maybe split this up a bit more in v3.0.0?
|
342
|
+
# E.g. escaped quantifiers or set meta chars are not the same
|
343
|
+
# as stuff that would be a literal even without the backslash.
|
344
|
+
# Right now, they all end up here.
|
402
345
|
node << EscapeSequence::Literal.new(token, active_opts)
|
403
346
|
end
|
404
347
|
end
|
405
348
|
|
406
|
-
def keep(token)
|
407
|
-
node << Keep::Mark.new(token, active_opts)
|
408
|
-
end
|
409
|
-
|
410
349
|
def free_space(token)
|
411
350
|
case token.token
|
412
351
|
when :comment
|
413
352
|
node << Comment.new(token, active_opts)
|
414
353
|
when :whitespace
|
415
|
-
|
416
|
-
node.last.merge(WhiteSpace.new(token, active_opts))
|
417
|
-
else
|
418
|
-
node << WhiteSpace.new(token, active_opts)
|
419
|
-
end
|
354
|
+
node << WhiteSpace.new(token, active_opts)
|
420
355
|
else
|
421
356
|
raise UnknownTokenError.new('FreeSpace', token)
|
422
357
|
end
|
423
358
|
end
|
424
359
|
|
425
|
-
def
|
426
|
-
|
427
|
-
|
428
|
-
while target_node.is_a?(FreeSpace)
|
429
|
-
target_node = node.expressions[offset -= 1]
|
430
|
-
end
|
360
|
+
def keep(token)
|
361
|
+
node << Keep::Mark.new(token, active_opts)
|
362
|
+
end
|
431
363
|
|
432
|
-
|
433
|
-
|
364
|
+
def literal(token)
|
365
|
+
node << Literal.new(token, active_opts)
|
366
|
+
end
|
434
367
|
|
368
|
+
def meta(token)
|
435
369
|
case token.token
|
436
|
-
when :
|
437
|
-
|
438
|
-
when :
|
439
|
-
|
440
|
-
when :zero_or_one_possessive
|
441
|
-
target_node.quantify(:zero_or_one, token.text, 0, 1, :possessive)
|
442
|
-
|
443
|
-
when :zero_or_more
|
444
|
-
target_node.quantify(:zero_or_more, token.text, 0, -1, :greedy)
|
445
|
-
when :zero_or_more_reluctant
|
446
|
-
target_node.quantify(:zero_or_more, token.text, 0, -1, :reluctant)
|
447
|
-
when :zero_or_more_possessive
|
448
|
-
target_node.quantify(:zero_or_more, token.text, 0, -1, :possessive)
|
449
|
-
|
450
|
-
when :one_or_more
|
451
|
-
target_node.quantify(:one_or_more, token.text, 1, -1, :greedy)
|
452
|
-
when :one_or_more_reluctant
|
453
|
-
target_node.quantify(:one_or_more, token.text, 1, -1, :reluctant)
|
454
|
-
when :one_or_more_possessive
|
455
|
-
target_node.quantify(:one_or_more, token.text, 1, -1, :possessive)
|
456
|
-
|
457
|
-
when :interval
|
458
|
-
interval(target_node, token)
|
459
|
-
|
370
|
+
when :dot
|
371
|
+
node << CharacterType::Any.new(token, active_opts)
|
372
|
+
when :alternation
|
373
|
+
sequence_operation(Alternation, token)
|
460
374
|
else
|
461
|
-
raise UnknownTokenError.new('
|
375
|
+
raise UnknownTokenError.new('Meta', token)
|
462
376
|
end
|
463
377
|
end
|
464
378
|
|
465
|
-
def
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
when '+'
|
473
|
-
range_text = text[0...-1]
|
474
|
-
mode = :possessive
|
475
|
-
else
|
476
|
-
range_text = text
|
477
|
-
mode = :greedy
|
379
|
+
def sequence_operation(klass, token)
|
380
|
+
unless node.instance_of?(klass)
|
381
|
+
operator = klass.new(token, active_opts)
|
382
|
+
sequence = operator.add_sequence(active_opts, { ts: token.ts })
|
383
|
+
sequence.expressions = node.expressions
|
384
|
+
node.expressions = []
|
385
|
+
nest(operator)
|
478
386
|
end
|
479
|
-
|
480
|
-
range = range_text.gsub(/\{|\}/, '').split(',', 2)
|
481
|
-
min = range[0].empty? ? 0 : range[0]
|
482
|
-
max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
|
483
|
-
|
484
|
-
target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
|
387
|
+
node.add_sequence(active_opts, { ts: token.te })
|
485
388
|
end
|
486
389
|
|
487
|
-
def
|
488
|
-
|
489
|
-
when :options, :options_switch
|
490
|
-
options_group(token)
|
491
|
-
when :close
|
492
|
-
close_group
|
493
|
-
when :comment
|
494
|
-
node << Group::Comment.new(token, active_opts)
|
495
|
-
else
|
496
|
-
open_group(token)
|
497
|
-
end
|
390
|
+
def posixclass(token)
|
391
|
+
node << PosixClass.new(token, active_opts)
|
498
392
|
end
|
499
393
|
|
500
|
-
|
501
|
-
|
394
|
+
UP = Regexp::Expression::Property
|
395
|
+
UPTokens = Regexp::Syntax::Token::Property
|
502
396
|
|
503
|
-
def
|
504
|
-
|
505
|
-
|
506
|
-
|
397
|
+
def property(token)
|
398
|
+
case token.token
|
399
|
+
when :alnum; node << UP::Alnum.new(token, active_opts)
|
400
|
+
when :alpha; node << UP::Alpha.new(token, active_opts)
|
401
|
+
when :ascii; node << UP::Ascii.new(token, active_opts)
|
402
|
+
when :blank; node << UP::Blank.new(token, active_opts)
|
403
|
+
when :cntrl; node << UP::Cntrl.new(token, active_opts)
|
404
|
+
when :digit; node << UP::Digit.new(token, active_opts)
|
405
|
+
when :graph; node << UP::Graph.new(token, active_opts)
|
406
|
+
when :lower; node << UP::Lower.new(token, active_opts)
|
407
|
+
when :print; node << UP::Print.new(token, active_opts)
|
408
|
+
when :punct; node << UP::Punct.new(token, active_opts)
|
409
|
+
when :space; node << UP::Space.new(token, active_opts)
|
410
|
+
when :upper; node << UP::Upper.new(token, active_opts)
|
411
|
+
when :word; node << UP::Word.new(token, active_opts)
|
412
|
+
when :xdigit; node << UP::Xdigit.new(token, active_opts)
|
413
|
+
when :xposixpunct; node << UP::XPosixPunct.new(token, active_opts)
|
507
414
|
|
508
|
-
|
509
|
-
|
415
|
+
# only in Oniguruma (old rubies)
|
416
|
+
when :newline; node << UP::Newline.new(token, active_opts)
|
417
|
+
|
418
|
+
when :any; node << UP::Any.new(token, active_opts)
|
419
|
+
when :assigned; node << UP::Assigned.new(token, active_opts)
|
420
|
+
|
421
|
+
when :letter; node << UP::Letter::Any.new(token, active_opts)
|
422
|
+
when :cased_letter; node << UP::Letter::Cased.new(token, active_opts)
|
423
|
+
when :uppercase_letter; node << UP::Letter::Uppercase.new(token, active_opts)
|
424
|
+
when :lowercase_letter; node << UP::Letter::Lowercase.new(token, active_opts)
|
425
|
+
when :titlecase_letter; node << UP::Letter::Titlecase.new(token, active_opts)
|
426
|
+
when :modifier_letter; node << UP::Letter::Modifier.new(token, active_opts)
|
427
|
+
when :other_letter; node << UP::Letter::Other.new(token, active_opts)
|
428
|
+
|
429
|
+
when :mark; node << UP::Mark::Any.new(token, active_opts)
|
430
|
+
when :combining_mark; node << UP::Mark::Combining.new(token, active_opts)
|
431
|
+
when :nonspacing_mark; node << UP::Mark::Nonspacing.new(token, active_opts)
|
432
|
+
when :spacing_mark; node << UP::Mark::Spacing.new(token, active_opts)
|
433
|
+
when :enclosing_mark; node << UP::Mark::Enclosing.new(token, active_opts)
|
434
|
+
|
435
|
+
when :number; node << UP::Number::Any.new(token, active_opts)
|
436
|
+
when :decimal_number; node << UP::Number::Decimal.new(token, active_opts)
|
437
|
+
when :letter_number; node << UP::Number::Letter.new(token, active_opts)
|
438
|
+
when :other_number; node << UP::Number::Other.new(token, active_opts)
|
439
|
+
|
440
|
+
when :punctuation; node << UP::Punctuation::Any.new(token, active_opts)
|
441
|
+
when :connector_punctuation; node << UP::Punctuation::Connector.new(token, active_opts)
|
442
|
+
when :dash_punctuation; node << UP::Punctuation::Dash.new(token, active_opts)
|
443
|
+
when :open_punctuation; node << UP::Punctuation::Open.new(token, active_opts)
|
444
|
+
when :close_punctuation; node << UP::Punctuation::Close.new(token, active_opts)
|
445
|
+
when :initial_punctuation; node << UP::Punctuation::Initial.new(token, active_opts)
|
446
|
+
when :final_punctuation; node << UP::Punctuation::Final.new(token, active_opts)
|
447
|
+
when :other_punctuation; node << UP::Punctuation::Other.new(token, active_opts)
|
448
|
+
|
449
|
+
when :separator; node << UP::Separator::Any.new(token, active_opts)
|
450
|
+
when :space_separator; node << UP::Separator::Space.new(token, active_opts)
|
451
|
+
when :line_separator; node << UP::Separator::Line.new(token, active_opts)
|
452
|
+
when :paragraph_separator; node << UP::Separator::Paragraph.new(token, active_opts)
|
453
|
+
|
454
|
+
when :symbol; node << UP::Symbol::Any.new(token, active_opts)
|
455
|
+
when :math_symbol; node << UP::Symbol::Math.new(token, active_opts)
|
456
|
+
when :currency_symbol; node << UP::Symbol::Currency.new(token, active_opts)
|
457
|
+
when :modifier_symbol; node << UP::Symbol::Modifier.new(token, active_opts)
|
458
|
+
when :other_symbol; node << UP::Symbol::Other.new(token, active_opts)
|
459
|
+
|
460
|
+
when :other; node << UP::Codepoint::Any.new(token, active_opts)
|
461
|
+
when :control; node << UP::Codepoint::Control.new(token, active_opts)
|
462
|
+
when :format; node << UP::Codepoint::Format.new(token, active_opts)
|
463
|
+
when :surrogate; node << UP::Codepoint::Surrogate.new(token, active_opts)
|
464
|
+
when :private_use; node << UP::Codepoint::PrivateUse.new(token, active_opts)
|
465
|
+
when :unassigned; node << UP::Codepoint::Unassigned.new(token, active_opts)
|
466
|
+
|
467
|
+
when *UPTokens::Age; node << UP::Age.new(token, active_opts)
|
468
|
+
when *UPTokens::Derived; node << UP::Derived.new(token, active_opts)
|
469
|
+
when *UPTokens::Emoji; node << UP::Emoji.new(token, active_opts)
|
470
|
+
when *UPTokens::Script; node << UP::Script.new(token, active_opts)
|
471
|
+
when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts)
|
510
472
|
|
511
|
-
|
512
|
-
|
513
|
-
opt_changes[flag] = new_active_opts[flag] = true
|
514
|
-
end
|
515
|
-
if negative.include?(flag.to_s)
|
516
|
-
opt_changes[flag] = false
|
517
|
-
new_active_opts.delete(flag)
|
518
|
-
end
|
473
|
+
else
|
474
|
+
raise UnknownTokenError.new('UnicodeProperty', token)
|
519
475
|
end
|
476
|
+
end
|
520
477
|
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
478
|
+
def quantifier(token)
|
479
|
+
target_node = node.extract_quantifier_target(token.text)
|
480
|
+
|
481
|
+
# in case of chained quantifiers, wrap target in an implicit passive group
|
482
|
+
# description of the problem: https://github.com/ammar/regexp_parser/issues/3
|
483
|
+
# rationale for this solution: https://github.com/ammar/regexp_parser/pull/69
|
484
|
+
if target_node.quantified?
|
485
|
+
new_group = Group::Passive.construct(
|
486
|
+
token: :passive,
|
487
|
+
ts: target_node.ts,
|
488
|
+
level: target_node.level,
|
489
|
+
set_level: target_node.set_level,
|
490
|
+
conditional_level: target_node.conditional_level,
|
491
|
+
options: active_opts,
|
492
|
+
)
|
493
|
+
new_group.implicit = true
|
494
|
+
new_group << target_node
|
495
|
+
increase_group_level(target_node)
|
496
|
+
node.expressions[node.expressions.index(target_node)] = new_group
|
497
|
+
target_node = new_group
|
528
498
|
end
|
529
499
|
|
530
|
-
|
500
|
+
unless token.token =~ /\A(?:zero_or_one|zero_or_more|one_or_more|interval)
|
501
|
+
(?:_greedy|_reluctant|_possessive)?\z/x
|
502
|
+
raise UnknownTokenError.new('Quantifier', token)
|
503
|
+
end
|
531
504
|
|
532
|
-
|
533
|
-
|
505
|
+
target_node.quantify(token, active_opts)
|
506
|
+
end
|
534
507
|
|
535
|
-
|
508
|
+
def increase_group_level(exp)
|
509
|
+
exp.level += 1
|
510
|
+
exp.quantifier.level += 1 if exp.quantifier
|
511
|
+
exp.terminal? || exp.each { |subexp| increase_group_level(subexp) }
|
536
512
|
end
|
537
513
|
|
538
|
-
def
|
514
|
+
def set(token)
|
539
515
|
case token.token
|
540
|
-
when :
|
541
|
-
|
542
|
-
when :
|
543
|
-
|
544
|
-
when :
|
545
|
-
exp = Group::Named.new(token, active_opts)
|
546
|
-
when :capture
|
547
|
-
exp = Group::Capture.new(token, active_opts)
|
548
|
-
when :absence
|
549
|
-
exp = Group::Absence.new(token, active_opts)
|
550
|
-
|
551
|
-
when :lookahead
|
552
|
-
exp = Assertion::Lookahead.new(token, active_opts)
|
553
|
-
when :nlookahead
|
554
|
-
exp = Assertion::NegativeLookahead.new(token, active_opts)
|
555
|
-
when :lookbehind
|
556
|
-
exp = Assertion::Lookbehind.new(token, active_opts)
|
557
|
-
when :nlookbehind
|
558
|
-
exp = Assertion::NegativeLookbehind.new(token, active_opts)
|
559
|
-
|
516
|
+
when :open; open_set(token)
|
517
|
+
when :close; close_set
|
518
|
+
when :negate; negate_set
|
519
|
+
when :range; range(token)
|
520
|
+
when :intersection; intersection(token)
|
560
521
|
else
|
561
|
-
raise UnknownTokenError.new('
|
562
|
-
end
|
563
|
-
|
564
|
-
if exp.capturing?
|
565
|
-
exp.number = total_captured_group_count + 1
|
566
|
-
exp.number_at_level = captured_group_count_at_level + 1
|
567
|
-
count_captured_group
|
522
|
+
raise UnknownTokenError.new('CharacterSet', token)
|
568
523
|
end
|
569
|
-
|
570
|
-
# Push the active options to the stack again. This way we can simply pop the
|
571
|
-
# stack for any group we close, no matter if it had its own options or not.
|
572
|
-
options_stack << active_opts
|
573
|
-
|
574
|
-
nest(exp)
|
575
|
-
end
|
576
|
-
|
577
|
-
def close_group
|
578
|
-
options_stack.pop unless switching_options
|
579
|
-
self.switching_options = false
|
580
|
-
decrease_nesting
|
581
524
|
end
|
582
525
|
|
583
526
|
def open_set(token)
|
527
|
+
# TODO: this and Quantifier are the only cases where Expression#token
|
528
|
+
# does not match the scanner/lexer output. Fix in v3.0.0.
|
584
529
|
token.token = :character
|
585
530
|
nest(CharacterSet.new(token, active_opts))
|
586
531
|
end
|
@@ -595,59 +540,56 @@ class Regexp::Parser
|
|
595
540
|
|
596
541
|
def range(token)
|
597
542
|
exp = CharacterSet::Range.new(token, active_opts)
|
598
|
-
scope = node.last.
|
543
|
+
scope = node.last.instance_of?(CharacterSet::IntersectedSequence) ? node.last : node
|
599
544
|
exp << scope.expressions.pop
|
600
545
|
nest(exp)
|
601
546
|
end
|
602
547
|
|
603
|
-
def close_completed_character_set_range
|
604
|
-
decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
|
605
|
-
end
|
606
|
-
|
607
548
|
def intersection(token)
|
608
549
|
sequence_operation(CharacterSet::Intersection, token)
|
609
550
|
end
|
610
551
|
|
611
|
-
def
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
552
|
+
def type(token)
|
553
|
+
case token.token
|
554
|
+
when :digit; node << CharacterType::Digit.new(token, active_opts)
|
555
|
+
when :hex; node << CharacterType::Hex.new(token, active_opts)
|
556
|
+
when :linebreak; node << CharacterType::Linebreak.new(token, active_opts)
|
557
|
+
when :nondigit; node << CharacterType::NonDigit.new(token, active_opts)
|
558
|
+
when :nonhex; node << CharacterType::NonHex.new(token, active_opts)
|
559
|
+
when :nonspace; node << CharacterType::NonSpace.new(token, active_opts)
|
560
|
+
when :nonword; node << CharacterType::NonWord.new(token, active_opts)
|
561
|
+
when :space; node << CharacterType::Space.new(token, active_opts)
|
562
|
+
when :word; node << CharacterType::Word.new(token, active_opts)
|
563
|
+
when :xgrapheme; node << CharacterType::ExtendedGrapheme.new(token, active_opts)
|
564
|
+
else
|
565
|
+
raise UnknownTokenError.new('CharacterType', token)
|
618
566
|
end
|
619
|
-
node.add_sequence(active_opts)
|
620
|
-
end
|
621
|
-
|
622
|
-
def active_opts
|
623
|
-
options_stack.last
|
624
|
-
end
|
625
|
-
|
626
|
-
def total_captured_group_count
|
627
|
-
captured_group_counts.values.reduce(0, :+)
|
628
|
-
end
|
629
|
-
|
630
|
-
def captured_group_count_at_level
|
631
|
-
captured_group_counts[node.level]
|
632
567
|
end
|
633
568
|
|
634
|
-
def
|
635
|
-
|
569
|
+
def close_completed_character_set_range
|
570
|
+
decrease_nesting if node.instance_of?(CharacterSet::Range) && node.complete?
|
636
571
|
end
|
637
572
|
|
638
|
-
def
|
639
|
-
|
640
|
-
exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
|
573
|
+
def active_opts
|
574
|
+
options_stack.last
|
641
575
|
end
|
642
576
|
|
577
|
+
# Assigns referenced expressions to referring expressions, e.g. if there is
|
578
|
+
# an instance of Backreference::Number, its #referenced_expression is set to
|
579
|
+
# the instance of Group::Capture that it refers to via its number.
|
643
580
|
def assign_referenced_expressions
|
644
|
-
|
581
|
+
# find all referenceable and referring expressions
|
582
|
+
targets = { 0 => root }
|
583
|
+
referrers = []
|
645
584
|
root.each_expression do |exp|
|
646
585
|
exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
|
586
|
+
referrers << exp if exp.referential?
|
647
587
|
end
|
648
|
-
|
649
|
-
|
650
|
-
|
588
|
+
# assign referenced expression to referring expressions
|
589
|
+
# (in a second iteration because there might be forward references)
|
590
|
+
referrers.each do |exp|
|
591
|
+
exp.referenced_expression = targets[exp.reference] ||
|
592
|
+
raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}")
|
651
593
|
end
|
652
594
|
end
|
653
595
|
end # module Regexp::Parser
|