regexp_parser 1.7.0 → 2.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +8 -2
- data/LICENSE +1 -1
- data/Rakefile +6 -70
- data/lib/regexp_parser/error.rb +4 -0
- data/lib/regexp_parser/expression/base.rb +76 -0
- data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
- data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
- data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +22 -2
- data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +4 -8
- data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +3 -4
- data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
- data/lib/regexp_parser/expression/classes/conditional.rb +11 -5
- data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
- data/lib/regexp_parser/expression/classes/free_space.rb +5 -5
- data/lib/regexp_parser/expression/classes/group.rb +28 -15
- data/lib/regexp_parser/expression/classes/keep.rb +2 -0
- data/lib/regexp_parser/expression/classes/literal.rb +1 -5
- data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
- data/lib/regexp_parser/expression/classes/root.rb +4 -19
- data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +5 -3
- data/lib/regexp_parser/expression/methods/construct.rb +41 -0
- data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +11 -7
- data/lib/regexp_parser/expression/methods/parts.rb +23 -0
- data/lib/regexp_parser/expression/methods/printing.rb +26 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
- data/lib/regexp_parser/expression/methods/tests.rb +47 -1
- data/lib/regexp_parser/expression/methods/traverse.rb +34 -18
- data/lib/regexp_parser/expression/quantifier.rb +57 -17
- data/lib/regexp_parser/expression/sequence.rb +11 -47
- data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
- data/lib/regexp_parser/expression/shared.rb +111 -0
- data/lib/regexp_parser/expression/subexpression.rb +27 -19
- data/lib/regexp_parser/expression.rb +14 -141
- data/lib/regexp_parser/lexer.rb +83 -41
- data/lib/regexp_parser/parser.rb +371 -429
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
- data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
- data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
- data/lib/regexp_parser/scanner/properties/long.csv +633 -0
- data/lib/regexp_parser/scanner/properties/short.csv +248 -0
- data/lib/regexp_parser/scanner/property.rl +4 -4
- data/lib/regexp_parser/scanner/scanner.rl +303 -368
- data/lib/regexp_parser/scanner.rb +1423 -1674
- data/lib/regexp_parser/syntax/any.rb +2 -7
- data/lib/regexp_parser/syntax/base.rb +92 -67
- data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
- data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
- data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
- data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
- data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
- data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
- data/lib/regexp_parser/syntax/token/escape.rb +33 -0
- data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
- data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
- data/lib/regexp_parser/syntax/token/meta.rb +20 -0
- data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
- data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
- data/lib/regexp_parser/syntax/token/unicode_property.rb +733 -0
- data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
- data/lib/regexp_parser/syntax/token.rb +45 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +19 -36
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
- data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
- data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
- data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions.rb +3 -1
- data/lib/regexp_parser/syntax.rb +8 -6
- data/lib/regexp_parser/token.rb +9 -20
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +0 -2
- data/regexp_parser.gemspec +19 -23
- metadata +52 -171
- data/CHANGELOG.md +0 -349
- data/README.md +0 -470
- data/lib/regexp_parser/scanner/properties/long.yml +0 -594
- data/lib/regexp_parser/scanner/properties/short.yml +0 -237
- data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
- data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
- data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
- data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
- data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
- data/lib/regexp_parser/syntax/tokens.rb +0 -45
- data/spec/expression/base_spec.rb +0 -94
- data/spec/expression/clone_spec.rb +0 -120
- data/spec/expression/conditional_spec.rb +0 -89
- data/spec/expression/free_space_spec.rb +0 -27
- data/spec/expression/methods/match_length_spec.rb +0 -161
- data/spec/expression/methods/match_spec.rb +0 -25
- data/spec/expression/methods/strfregexp_spec.rb +0 -224
- data/spec/expression/methods/tests_spec.rb +0 -99
- data/spec/expression/methods/traverse_spec.rb +0 -161
- data/spec/expression/options_spec.rb +0 -128
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
- data/spec/expression/subexpression_spec.rb +0 -50
- data/spec/expression/to_h_spec.rb +0 -26
- data/spec/expression/to_s_spec.rb +0 -100
- data/spec/lexer/all_spec.rb +0 -22
- data/spec/lexer/conditionals_spec.rb +0 -53
- data/spec/lexer/escapes_spec.rb +0 -14
- data/spec/lexer/keep_spec.rb +0 -10
- data/spec/lexer/literals_spec.rb +0 -89
- data/spec/lexer/nesting_spec.rb +0 -99
- data/spec/lexer/refcalls_spec.rb +0 -55
- data/spec/parser/all_spec.rb +0 -43
- data/spec/parser/alternation_spec.rb +0 -88
- data/spec/parser/anchors_spec.rb +0 -17
- data/spec/parser/conditionals_spec.rb +0 -179
- data/spec/parser/errors_spec.rb +0 -30
- data/spec/parser/escapes_spec.rb +0 -121
- data/spec/parser/free_space_spec.rb +0 -130
- data/spec/parser/groups_spec.rb +0 -108
- data/spec/parser/keep_spec.rb +0 -6
- data/spec/parser/posix_classes_spec.rb +0 -8
- data/spec/parser/properties_spec.rb +0 -115
- data/spec/parser/quantifiers_spec.rb +0 -51
- data/spec/parser/refcalls_spec.rb +0 -112
- data/spec/parser/set/intersections_spec.rb +0 -127
- data/spec/parser/set/ranges_spec.rb +0 -111
- data/spec/parser/sets_spec.rb +0 -178
- data/spec/parser/types_spec.rb +0 -18
- data/spec/scanner/all_spec.rb +0 -18
- data/spec/scanner/anchors_spec.rb +0 -21
- data/spec/scanner/conditionals_spec.rb +0 -128
- data/spec/scanner/errors_spec.rb +0 -68
- data/spec/scanner/escapes_spec.rb +0 -53
- data/spec/scanner/free_space_spec.rb +0 -133
- data/spec/scanner/groups_spec.rb +0 -52
- data/spec/scanner/keep_spec.rb +0 -10
- data/spec/scanner/literals_spec.rb +0 -49
- data/spec/scanner/meta_spec.rb +0 -18
- data/spec/scanner/properties_spec.rb +0 -64
- data/spec/scanner/quantifiers_spec.rb +0 -20
- data/spec/scanner/refcalls_spec.rb +0 -36
- data/spec/scanner/sets_spec.rb +0 -102
- data/spec/scanner/types_spec.rb +0 -14
- data/spec/spec_helper.rb +0 -15
- data/spec/support/runner.rb +0 -42
- data/spec/support/shared_examples.rb +0 -77
- data/spec/support/warning_extractor.rb +0 -60
- data/spec/syntax/syntax_spec.rb +0 -48
- data/spec/syntax/syntax_token_map_spec.rb +0 -23
- data/spec/syntax/versions/1.8.6_spec.rb +0 -17
- data/spec/syntax/versions/1.9.1_spec.rb +0 -10
- data/spec/syntax/versions/1.9.3_spec.rb +0 -9
- data/spec/syntax/versions/2.0.0_spec.rb +0 -13
- data/spec/syntax/versions/2.2.0_spec.rb +0 -9
- data/spec/syntax/versions/aliases_spec.rb +0 -37
- data/spec/token/token_spec.rb +0 -85
- /data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
data/lib/regexp_parser/parser.rb
CHANGED
@@ -1,10 +1,10 @@
|
|
1
|
+
require 'regexp_parser/error'
|
1
2
|
require 'regexp_parser/expression'
|
2
3
|
|
3
4
|
class Regexp::Parser
|
4
5
|
include Regexp::Expression
|
5
|
-
include Regexp::Syntax
|
6
6
|
|
7
|
-
class ParserError <
|
7
|
+
class ParserError < Regexp::Parser::Error; end
|
8
8
|
|
9
9
|
class UnknownTokenTypeError < ParserError
|
10
10
|
def initialize(type, token)
|
@@ -18,12 +18,12 @@ class Regexp::Parser
|
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
21
|
-
def self.parse(input, syntax =
|
22
|
-
new.parse(input, syntax, &block)
|
21
|
+
def self.parse(input, syntax = nil, options: nil, &block)
|
22
|
+
new.parse(input, syntax, options: options, &block)
|
23
23
|
end
|
24
24
|
|
25
|
-
def parse(input, syntax =
|
26
|
-
root = Root.
|
25
|
+
def parse(input, syntax = nil, options: nil, &block)
|
26
|
+
root = Root.construct(options: extract_options(input, options))
|
27
27
|
|
28
28
|
self.root = root
|
29
29
|
self.node = root
|
@@ -35,10 +35,13 @@ class Regexp::Parser
|
|
35
35
|
|
36
36
|
self.captured_group_counts = Hash.new(0)
|
37
37
|
|
38
|
-
Regexp::Lexer.scan(input, syntax) do |token|
|
38
|
+
Regexp::Lexer.scan(input, syntax, options: options, collect_tokens: false) do |token|
|
39
39
|
parse_token(token)
|
40
40
|
end
|
41
41
|
|
42
|
+
# Trigger recursive setting of #nesting_level, which reflects how deep
|
43
|
+
# a node is in the tree. Do this at the end to account for tree rewrites.
|
44
|
+
root.nesting_level = 0
|
42
45
|
assign_referenced_expressions
|
43
46
|
|
44
47
|
if block_given?
|
@@ -54,107 +57,173 @@ class Regexp::Parser
|
|
54
57
|
:options_stack, :switching_options, :conditional_nesting,
|
55
58
|
:captured_group_counts
|
56
59
|
|
57
|
-
def
|
58
|
-
|
60
|
+
def extract_options(input, options)
|
61
|
+
if options && !input.is_a?(String)
|
62
|
+
raise ArgumentError, 'options cannot be supplied unless parsing a String'
|
63
|
+
end
|
59
64
|
|
60
|
-
options =
|
61
|
-
options[:i] = true if input.options & ::Regexp::IGNORECASE != 0
|
62
|
-
options[:m] = true if input.options & ::Regexp::MULTILINE != 0
|
63
|
-
options[:x] = true if input.options & ::Regexp::EXTENDED != 0
|
64
|
-
options
|
65
|
-
end
|
65
|
+
options = input.options if input.is_a?(::Regexp)
|
66
66
|
|
67
|
-
|
68
|
-
nesting.push(exp)
|
69
|
-
node << exp
|
70
|
-
update_transplanted_subtree(exp, node)
|
71
|
-
self.node = exp
|
72
|
-
end
|
67
|
+
return {} unless options
|
73
68
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
end
|
80
|
-
|
81
|
-
def decrease_nesting
|
82
|
-
while nesting.last.is_a?(SequenceOperation)
|
83
|
-
nesting.pop
|
84
|
-
self.node = nesting.last
|
85
|
-
end
|
86
|
-
nesting.pop
|
87
|
-
yield(node) if block_given?
|
88
|
-
self.node = nesting.last
|
89
|
-
self.node = node.last if node.last.is_a?(SequenceOperation)
|
90
|
-
end
|
91
|
-
|
92
|
-
def nest_conditional(exp)
|
93
|
-
conditional_nesting.push(exp)
|
94
|
-
nest(exp)
|
69
|
+
enabled_options = {}
|
70
|
+
enabled_options[:i] = true if options & ::Regexp::IGNORECASE != 0
|
71
|
+
enabled_options[:m] = true if options & ::Regexp::MULTILINE != 0
|
72
|
+
enabled_options[:x] = true if options & ::Regexp::EXTENDED != 0
|
73
|
+
enabled_options
|
95
74
|
end
|
96
75
|
|
97
76
|
def parse_token(token)
|
98
|
-
close_completed_character_set_range
|
99
|
-
|
100
77
|
case token.type
|
101
|
-
when :
|
102
|
-
when :
|
103
|
-
when :
|
104
|
-
when :
|
105
|
-
when :
|
106
|
-
when :
|
107
|
-
when :
|
108
|
-
when :
|
109
|
-
when :
|
110
|
-
when :
|
111
|
-
when :
|
112
|
-
|
113
|
-
when :
|
114
|
-
|
115
|
-
when :property, :nonproperty
|
116
|
-
property(token)
|
117
|
-
|
118
|
-
when :literal
|
119
|
-
node << Literal.new(token, active_opts)
|
120
|
-
when :free_space
|
121
|
-
free_space(token)
|
122
|
-
|
78
|
+
when :anchor; anchor(token)
|
79
|
+
when :assertion, :group; group(token)
|
80
|
+
when :backref; backref(token)
|
81
|
+
when :conditional; conditional(token)
|
82
|
+
when :escape; escape(token)
|
83
|
+
when :free_space; free_space(token)
|
84
|
+
when :keep; keep(token)
|
85
|
+
when :literal; literal(token)
|
86
|
+
when :meta; meta(token)
|
87
|
+
when :posixclass, :nonposixclass; posixclass(token)
|
88
|
+
when :property, :nonproperty; property(token)
|
89
|
+
when :quantifier; quantifier(token)
|
90
|
+
when :set; set(token)
|
91
|
+
when :type; type(token)
|
123
92
|
else
|
124
93
|
raise UnknownTokenTypeError.new(token.type, token)
|
125
94
|
end
|
95
|
+
|
96
|
+
close_completed_character_set_range
|
126
97
|
end
|
127
98
|
|
128
|
-
def
|
99
|
+
def anchor(token)
|
129
100
|
case token.token
|
130
|
-
when :
|
131
|
-
|
132
|
-
when :
|
133
|
-
|
134
|
-
when :
|
135
|
-
|
136
|
-
when :
|
137
|
-
|
138
|
-
when :intersection
|
139
|
-
intersection(token)
|
140
|
-
when :collation, :equivalent
|
141
|
-
node << Literal.new(token, active_opts)
|
101
|
+
when :bol; node << Anchor::BeginningOfLine.new(token, active_opts)
|
102
|
+
when :bos; node << Anchor::BOS.new(token, active_opts)
|
103
|
+
when :eol; node << Anchor::EndOfLine.new(token, active_opts)
|
104
|
+
when :eos; node << Anchor::EOS.new(token, active_opts)
|
105
|
+
when :eos_ob_eol; node << Anchor::EOSobEOL.new(token, active_opts)
|
106
|
+
when :match_start; node << Anchor::MatchStart.new(token, active_opts)
|
107
|
+
when :nonword_boundary; node << Anchor::NonWordBoundary.new(token, active_opts)
|
108
|
+
when :word_boundary; node << Anchor::WordBoundary.new(token, active_opts)
|
142
109
|
else
|
143
|
-
raise UnknownTokenError.new('
|
110
|
+
raise UnknownTokenError.new('Anchor', token)
|
144
111
|
end
|
145
112
|
end
|
146
113
|
|
147
|
-
def
|
114
|
+
def group(token)
|
148
115
|
case token.token
|
149
|
-
when :
|
150
|
-
|
151
|
-
when :
|
152
|
-
|
116
|
+
when :options, :options_switch
|
117
|
+
options_group(token)
|
118
|
+
when :close
|
119
|
+
close_group
|
120
|
+
when :comment
|
121
|
+
node << Group::Comment.new(token, active_opts)
|
153
122
|
else
|
154
|
-
|
123
|
+
open_group(token)
|
155
124
|
end
|
156
125
|
end
|
157
126
|
|
127
|
+
MOD_FLAGS = %w[i m x].map(&:to_sym)
|
128
|
+
ENC_FLAGS = %w[a d u].map(&:to_sym)
|
129
|
+
|
130
|
+
def options_group(token)
|
131
|
+
positive, negative = token.text.split('-', 2)
|
132
|
+
negative ||= ''
|
133
|
+
self.switching_options = token.token.equal?(:options_switch)
|
134
|
+
|
135
|
+
opt_changes = {}
|
136
|
+
new_active_opts = active_opts.dup
|
137
|
+
|
138
|
+
MOD_FLAGS.each do |flag|
|
139
|
+
if positive.include?(flag.to_s)
|
140
|
+
opt_changes[flag] = new_active_opts[flag] = true
|
141
|
+
end
|
142
|
+
if negative.include?(flag.to_s)
|
143
|
+
opt_changes[flag] = false
|
144
|
+
new_active_opts.delete(flag)
|
145
|
+
end
|
146
|
+
end
|
147
|
+
|
148
|
+
if (enc_flag = positive.reverse[/[adu]/])
|
149
|
+
enc_flag = enc_flag.to_sym
|
150
|
+
(ENC_FLAGS - [enc_flag]).each do |other|
|
151
|
+
opt_changes[other] = false if new_active_opts[other]
|
152
|
+
new_active_opts.delete(other)
|
153
|
+
end
|
154
|
+
opt_changes[enc_flag] = new_active_opts[enc_flag] = true
|
155
|
+
end
|
156
|
+
|
157
|
+
options_stack << new_active_opts
|
158
|
+
|
159
|
+
options_group = Group::Options.new(token, active_opts)
|
160
|
+
options_group.option_changes = opt_changes
|
161
|
+
|
162
|
+
nest(options_group)
|
163
|
+
end
|
164
|
+
|
165
|
+
def open_group(token)
|
166
|
+
group_class =
|
167
|
+
case token.token
|
168
|
+
when :absence; Group::Absence
|
169
|
+
when :atomic; Group::Atomic
|
170
|
+
when :capture; Group::Capture
|
171
|
+
when :named; Group::Named
|
172
|
+
when :passive; Group::Passive
|
173
|
+
|
174
|
+
when :lookahead; Assertion::Lookahead
|
175
|
+
when :lookbehind; Assertion::Lookbehind
|
176
|
+
when :nlookahead; Assertion::NegativeLookahead
|
177
|
+
when :nlookbehind; Assertion::NegativeLookbehind
|
178
|
+
|
179
|
+
else
|
180
|
+
raise UnknownTokenError.new('Group type open', token)
|
181
|
+
end
|
182
|
+
|
183
|
+
group = group_class.new(token, active_opts)
|
184
|
+
|
185
|
+
if group.capturing?
|
186
|
+
group.number = total_captured_group_count + 1
|
187
|
+
group.number_at_level = captured_group_count_at_level + 1
|
188
|
+
count_captured_group
|
189
|
+
end
|
190
|
+
|
191
|
+
# Push the active options to the stack again. This way we can simply pop the
|
192
|
+
# stack for any group we close, no matter if it had its own options or not.
|
193
|
+
options_stack << active_opts
|
194
|
+
|
195
|
+
nest(group)
|
196
|
+
end
|
197
|
+
|
198
|
+
def total_captured_group_count
|
199
|
+
captured_group_counts.values.reduce(0, :+)
|
200
|
+
end
|
201
|
+
|
202
|
+
def captured_group_count_at_level
|
203
|
+
captured_group_counts[node]
|
204
|
+
end
|
205
|
+
|
206
|
+
def count_captured_group
|
207
|
+
captured_group_counts[node] += 1
|
208
|
+
end
|
209
|
+
|
210
|
+
def close_group
|
211
|
+
options_stack.pop unless switching_options
|
212
|
+
self.switching_options = false
|
213
|
+
decrease_nesting
|
214
|
+
end
|
215
|
+
|
216
|
+
def decrease_nesting
|
217
|
+
while nesting.last.is_a?(SequenceOperation)
|
218
|
+
nesting.pop
|
219
|
+
self.node = nesting.last
|
220
|
+
end
|
221
|
+
nesting.pop
|
222
|
+
yield(node) if block_given?
|
223
|
+
self.node = nesting.last
|
224
|
+
self.node = node.last if node.last.is_a?(SequenceOperation)
|
225
|
+
end
|
226
|
+
|
158
227
|
def backref(token)
|
159
228
|
case token.token
|
160
229
|
when :name_ref
|
@@ -163,10 +232,18 @@ class Regexp::Parser
|
|
163
232
|
node << Backreference::NameRecursionLevel.new(token, active_opts)
|
164
233
|
when :name_call
|
165
234
|
node << Backreference::NameCall.new(token, active_opts)
|
166
|
-
when :number, :number_ref
|
235
|
+
when :number, :number_ref # TODO: split in v3.0.0
|
167
236
|
node << Backreference::Number.new(token, active_opts)
|
168
237
|
when :number_recursion_ref
|
169
|
-
node << Backreference::NumberRecursionLevel.new(token, active_opts)
|
238
|
+
node << Backreference::NumberRecursionLevel.new(token, active_opts).tap do |exp|
|
239
|
+
# TODO: should split off new token number_recursion_rel_ref and new
|
240
|
+
# class NumberRelativeRecursionLevel in v3.0.0 to get rid of this
|
241
|
+
if exp.text =~ /[<'][+-]/
|
242
|
+
assign_effective_number(exp)
|
243
|
+
else
|
244
|
+
exp.effective_number = exp.number
|
245
|
+
end
|
246
|
+
end
|
170
247
|
when :number_call
|
171
248
|
node << Backreference::NumberCall.new(token, active_opts)
|
172
249
|
when :number_rel_ref
|
@@ -182,31 +259,11 @@ class Regexp::Parser
|
|
182
259
|
end
|
183
260
|
end
|
184
261
|
|
185
|
-
def
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
node << CharacterType::NonDigit.new(token, active_opts)
|
191
|
-
when :hex
|
192
|
-
node << CharacterType::Hex.new(token, active_opts)
|
193
|
-
when :nonhex
|
194
|
-
node << CharacterType::NonHex.new(token, active_opts)
|
195
|
-
when :space
|
196
|
-
node << CharacterType::Space.new(token, active_opts)
|
197
|
-
when :nonspace
|
198
|
-
node << CharacterType::NonSpace.new(token, active_opts)
|
199
|
-
when :word
|
200
|
-
node << CharacterType::Word.new(token, active_opts)
|
201
|
-
when :nonword
|
202
|
-
node << CharacterType::NonWord.new(token, active_opts)
|
203
|
-
when :linebreak
|
204
|
-
node << CharacterType::Linebreak.new(token, active_opts)
|
205
|
-
when :xgrapheme
|
206
|
-
node << CharacterType::ExtendedGrapheme.new(token, active_opts)
|
207
|
-
else
|
208
|
-
raise UnknownTokenError.new('CharacterType', token)
|
209
|
-
end
|
262
|
+
def assign_effective_number(exp)
|
263
|
+
exp.effective_number =
|
264
|
+
exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
|
265
|
+
exp.effective_number > 0 ||
|
266
|
+
raise(ParserError, "Invalid reference: #{exp.reference}")
|
210
267
|
end
|
211
268
|
|
212
269
|
def conditional(token)
|
@@ -215,9 +272,9 @@ class Regexp::Parser
|
|
215
272
|
nest_conditional(Conditional::Expression.new(token, active_opts))
|
216
273
|
when :condition
|
217
274
|
conditional_nesting.last.condition = Conditional::Condition.new(token, active_opts)
|
218
|
-
conditional_nesting.last.add_sequence(active_opts)
|
275
|
+
conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
|
219
276
|
when :separator
|
220
|
-
conditional_nesting.last.add_sequence(active_opts)
|
277
|
+
conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
|
221
278
|
self.node = conditional_nesting.last.branches.last
|
222
279
|
when :close
|
223
280
|
conditional_nesting.pop
|
@@ -234,157 +291,38 @@ class Regexp::Parser
|
|
234
291
|
end
|
235
292
|
end
|
236
293
|
|
237
|
-
def
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
include Regexp::Expression::UnicodeProperty
|
242
|
-
|
243
|
-
def property(token)
|
244
|
-
case token.token
|
245
|
-
when :alnum; node << Alnum.new(token, active_opts)
|
246
|
-
when :alpha; node << Alpha.new(token, active_opts)
|
247
|
-
when :ascii; node << Ascii.new(token, active_opts)
|
248
|
-
when :blank; node << Blank.new(token, active_opts)
|
249
|
-
when :cntrl; node << Cntrl.new(token, active_opts)
|
250
|
-
when :digit; node << Digit.new(token, active_opts)
|
251
|
-
when :graph; node << Graph.new(token, active_opts)
|
252
|
-
when :lower; node << Lower.new(token, active_opts)
|
253
|
-
when :print; node << Print.new(token, active_opts)
|
254
|
-
when :punct; node << Punct.new(token, active_opts)
|
255
|
-
when :space; node << Space.new(token, active_opts)
|
256
|
-
when :upper; node << Upper.new(token, active_opts)
|
257
|
-
when :word; node << Word.new(token, active_opts)
|
258
|
-
when :xdigit; node << Xdigit.new(token, active_opts)
|
259
|
-
when :xposixpunct; node << XPosixPunct.new(token, active_opts)
|
260
|
-
|
261
|
-
# only in Oniguruma (old rubies)
|
262
|
-
when :newline; node << Newline.new(token, active_opts)
|
263
|
-
|
264
|
-
when :any; node << Any.new(token, active_opts)
|
265
|
-
when :assigned; node << Assigned.new(token, active_opts)
|
266
|
-
|
267
|
-
when :letter; node << Letter::Any.new(token, active_opts)
|
268
|
-
when :cased_letter; node << Letter::Cased.new(token, active_opts)
|
269
|
-
when :uppercase_letter; node << Letter::Uppercase.new(token, active_opts)
|
270
|
-
when :lowercase_letter; node << Letter::Lowercase.new(token, active_opts)
|
271
|
-
when :titlecase_letter; node << Letter::Titlecase.new(token, active_opts)
|
272
|
-
when :modifier_letter; node << Letter::Modifier.new(token, active_opts)
|
273
|
-
when :other_letter; node << Letter::Other.new(token, active_opts)
|
274
|
-
|
275
|
-
when :mark; node << Mark::Any.new(token, active_opts)
|
276
|
-
when :combining_mark; node << Mark::Combining.new(token, active_opts)
|
277
|
-
when :nonspacing_mark; node << Mark::Nonspacing.new(token, active_opts)
|
278
|
-
when :spacing_mark; node << Mark::Spacing.new(token, active_opts)
|
279
|
-
when :enclosing_mark; node << Mark::Enclosing.new(token, active_opts)
|
280
|
-
|
281
|
-
when :number; node << Number::Any.new(token, active_opts)
|
282
|
-
when :decimal_number; node << Number::Decimal.new(token, active_opts)
|
283
|
-
when :letter_number; node << Number::Letter.new(token, active_opts)
|
284
|
-
when :other_number; node << Number::Other.new(token, active_opts)
|
285
|
-
|
286
|
-
when :punctuation; node << Punctuation::Any.new(token, active_opts)
|
287
|
-
when :connector_punctuation; node << Punctuation::Connector.new(token, active_opts)
|
288
|
-
when :dash_punctuation; node << Punctuation::Dash.new(token, active_opts)
|
289
|
-
when :open_punctuation; node << Punctuation::Open.new(token, active_opts)
|
290
|
-
when :close_punctuation; node << Punctuation::Close.new(token, active_opts)
|
291
|
-
when :initial_punctuation; node << Punctuation::Initial.new(token, active_opts)
|
292
|
-
when :final_punctuation; node << Punctuation::Final.new(token, active_opts)
|
293
|
-
when :other_punctuation; node << Punctuation::Other.new(token, active_opts)
|
294
|
-
|
295
|
-
when :separator; node << Separator::Any.new(token, active_opts)
|
296
|
-
when :space_separator; node << Separator::Space.new(token, active_opts)
|
297
|
-
when :line_separator; node << Separator::Line.new(token, active_opts)
|
298
|
-
when :paragraph_separator; node << Separator::Paragraph.new(token, active_opts)
|
299
|
-
|
300
|
-
when :symbol; node << Symbol::Any.new(token, active_opts)
|
301
|
-
when :math_symbol; node << Symbol::Math.new(token, active_opts)
|
302
|
-
when :currency_symbol; node << Symbol::Currency.new(token, active_opts)
|
303
|
-
when :modifier_symbol; node << Symbol::Modifier.new(token, active_opts)
|
304
|
-
when :other_symbol; node << Symbol::Other.new(token, active_opts)
|
305
|
-
|
306
|
-
when :other; node << Codepoint::Any.new(token, active_opts)
|
307
|
-
when :control; node << Codepoint::Control.new(token, active_opts)
|
308
|
-
when :format; node << Codepoint::Format.new(token, active_opts)
|
309
|
-
when :surrogate; node << Codepoint::Surrogate.new(token, active_opts)
|
310
|
-
when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
|
311
|
-
when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
|
312
|
-
|
313
|
-
when *Token::UnicodeProperty::Age
|
314
|
-
node << Age.new(token, active_opts)
|
315
|
-
|
316
|
-
when *Token::UnicodeProperty::Derived
|
317
|
-
node << Derived.new(token, active_opts)
|
318
|
-
|
319
|
-
when *Token::UnicodeProperty::Emoji
|
320
|
-
node << Emoji.new(token, active_opts)
|
321
|
-
|
322
|
-
when *Token::UnicodeProperty::Script
|
323
|
-
node << Script.new(token, active_opts)
|
324
|
-
|
325
|
-
when *Token::UnicodeProperty::UnicodeBlock
|
326
|
-
node << Block.new(token, active_opts)
|
327
|
-
|
328
|
-
else
|
329
|
-
raise UnknownTokenError.new('UnicodeProperty', token)
|
330
|
-
end
|
294
|
+
def nest_conditional(exp)
|
295
|
+
conditional_nesting.push(exp)
|
296
|
+
nest(exp)
|
331
297
|
end
|
332
298
|
|
333
|
-
def
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
when :eol
|
338
|
-
node << Anchor::EndOfLine.new(token, active_opts)
|
339
|
-
when :bos
|
340
|
-
node << Anchor::BOS.new(token, active_opts)
|
341
|
-
when :eos
|
342
|
-
node << Anchor::EOS.new(token, active_opts)
|
343
|
-
when :eos_ob_eol
|
344
|
-
node << Anchor::EOSobEOL.new(token, active_opts)
|
345
|
-
when :word_boundary
|
346
|
-
node << Anchor::WordBoundary.new(token, active_opts)
|
347
|
-
when :nonword_boundary
|
348
|
-
node << Anchor::NonWordBoundary.new(token, active_opts)
|
349
|
-
when :match_start
|
350
|
-
node << Anchor::MatchStart.new(token, active_opts)
|
351
|
-
else
|
352
|
-
raise UnknownTokenError.new('Anchor', token)
|
353
|
-
end
|
299
|
+
def nest(exp)
|
300
|
+
nesting.push(exp)
|
301
|
+
node << exp
|
302
|
+
self.node = exp
|
354
303
|
end
|
355
304
|
|
356
305
|
def escape(token)
|
357
306
|
case token.token
|
358
307
|
|
359
|
-
when :backspace
|
360
|
-
|
361
|
-
|
362
|
-
when :
|
363
|
-
node << EscapeSequence::
|
364
|
-
when :
|
365
|
-
|
366
|
-
when :
|
367
|
-
|
368
|
-
|
369
|
-
node << EscapeSequence::
|
370
|
-
when :
|
371
|
-
|
372
|
-
when :
|
373
|
-
node << EscapeSequence::Tab.new(token, active_opts)
|
374
|
-
when :vertical_tab
|
375
|
-
node << EscapeSequence::VerticalTab.new(token, active_opts)
|
376
|
-
|
377
|
-
when :hex
|
378
|
-
node << EscapeSequence::Hex.new(token, active_opts)
|
379
|
-
when :octal
|
380
|
-
node << EscapeSequence::Octal.new(token, active_opts)
|
381
|
-
when :codepoint
|
382
|
-
node << EscapeSequence::Codepoint.new(token, active_opts)
|
383
|
-
when :codepoint_list
|
384
|
-
node << EscapeSequence::CodepointList.new(token, active_opts)
|
308
|
+
when :backspace; node << EscapeSequence::Backspace.new(token, active_opts)
|
309
|
+
|
310
|
+
when :escape; node << EscapeSequence::AsciiEscape.new(token, active_opts)
|
311
|
+
when :bell; node << EscapeSequence::Bell.new(token, active_opts)
|
312
|
+
when :form_feed; node << EscapeSequence::FormFeed.new(token, active_opts)
|
313
|
+
when :newline; node << EscapeSequence::Newline.new(token, active_opts)
|
314
|
+
when :carriage; node << EscapeSequence::Return.new(token, active_opts)
|
315
|
+
when :tab; node << EscapeSequence::Tab.new(token, active_opts)
|
316
|
+
when :vertical_tab; node << EscapeSequence::VerticalTab.new(token, active_opts)
|
317
|
+
|
318
|
+
when :codepoint; node << EscapeSequence::Codepoint.new(token, active_opts)
|
319
|
+
when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
|
320
|
+
when :hex; node << EscapeSequence::Hex.new(token, active_opts)
|
321
|
+
when :octal; node << EscapeSequence::Octal.new(token, active_opts)
|
385
322
|
|
386
323
|
when :control
|
387
324
|
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
325
|
+
# TODO: emit :meta_control_sequence token in v3.0.0
|
388
326
|
node << EscapeSequence::MetaControl.new(token, active_opts)
|
389
327
|
else
|
390
328
|
node << EscapeSequence::Control.new(token, active_opts)
|
@@ -392,6 +330,7 @@ class Regexp::Parser
|
|
392
330
|
|
393
331
|
when :meta_sequence
|
394
332
|
if token.text =~ /\A\\M-\\[Cc]/
|
333
|
+
# TODO: emit :meta_control_sequence token in v3.0.0:
|
395
334
|
node << EscapeSequence::MetaControl.new(token, active_opts)
|
396
335
|
else
|
397
336
|
node << EscapeSequence::Meta.new(token, active_opts)
|
@@ -399,188 +338,194 @@ class Regexp::Parser
|
|
399
338
|
|
400
339
|
else
|
401
340
|
# treating everything else as a literal
|
341
|
+
# TODO: maybe split this up a bit more in v3.0.0?
|
342
|
+
# E.g. escaped quantifiers or set meta chars are not the same
|
343
|
+
# as stuff that would be a literal even without the backslash.
|
344
|
+
# Right now, they all end up here.
|
402
345
|
node << EscapeSequence::Literal.new(token, active_opts)
|
403
346
|
end
|
404
347
|
end
|
405
348
|
|
406
|
-
def keep(token)
|
407
|
-
node << Keep::Mark.new(token, active_opts)
|
408
|
-
end
|
409
|
-
|
410
349
|
def free_space(token)
|
411
350
|
case token.token
|
412
351
|
when :comment
|
413
352
|
node << Comment.new(token, active_opts)
|
414
353
|
when :whitespace
|
415
|
-
|
416
|
-
node.last.merge(WhiteSpace.new(token, active_opts))
|
417
|
-
else
|
418
|
-
node << WhiteSpace.new(token, active_opts)
|
419
|
-
end
|
354
|
+
node << WhiteSpace.new(token, active_opts)
|
420
355
|
else
|
421
356
|
raise UnknownTokenError.new('FreeSpace', token)
|
422
357
|
end
|
423
358
|
end
|
424
359
|
|
425
|
-
def
|
426
|
-
|
427
|
-
|
428
|
-
while target_node.is_a?(FreeSpace)
|
429
|
-
target_node = node.expressions[offset -= 1]
|
430
|
-
end
|
360
|
+
def keep(token)
|
361
|
+
node << Keep::Mark.new(token, active_opts)
|
362
|
+
end
|
431
363
|
|
432
|
-
|
433
|
-
|
364
|
+
def literal(token)
|
365
|
+
node << Literal.new(token, active_opts)
|
366
|
+
end
|
434
367
|
|
368
|
+
def meta(token)
|
435
369
|
case token.token
|
436
|
-
when :
|
437
|
-
|
438
|
-
when :
|
439
|
-
|
440
|
-
when :zero_or_one_possessive
|
441
|
-
target_node.quantify(:zero_or_one, token.text, 0, 1, :possessive)
|
442
|
-
|
443
|
-
when :zero_or_more
|
444
|
-
target_node.quantify(:zero_or_more, token.text, 0, -1, :greedy)
|
445
|
-
when :zero_or_more_reluctant
|
446
|
-
target_node.quantify(:zero_or_more, token.text, 0, -1, :reluctant)
|
447
|
-
when :zero_or_more_possessive
|
448
|
-
target_node.quantify(:zero_or_more, token.text, 0, -1, :possessive)
|
449
|
-
|
450
|
-
when :one_or_more
|
451
|
-
target_node.quantify(:one_or_more, token.text, 1, -1, :greedy)
|
452
|
-
when :one_or_more_reluctant
|
453
|
-
target_node.quantify(:one_or_more, token.text, 1, -1, :reluctant)
|
454
|
-
when :one_or_more_possessive
|
455
|
-
target_node.quantify(:one_or_more, token.text, 1, -1, :possessive)
|
456
|
-
|
457
|
-
when :interval
|
458
|
-
interval(target_node, token)
|
459
|
-
|
370
|
+
when :dot
|
371
|
+
node << CharacterType::Any.new(token, active_opts)
|
372
|
+
when :alternation
|
373
|
+
sequence_operation(Alternation, token)
|
460
374
|
else
|
461
|
-
raise UnknownTokenError.new('
|
375
|
+
raise UnknownTokenError.new('Meta', token)
|
462
376
|
end
|
463
377
|
end
|
464
378
|
|
465
|
-
def
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
when '+'
|
473
|
-
range_text = text[0...-1]
|
474
|
-
mode = :possessive
|
475
|
-
else
|
476
|
-
range_text = text
|
477
|
-
mode = :greedy
|
379
|
+
def sequence_operation(klass, token)
|
380
|
+
unless node.instance_of?(klass)
|
381
|
+
operator = klass.new(token, active_opts)
|
382
|
+
sequence = operator.add_sequence(active_opts, { ts: token.ts })
|
383
|
+
sequence.expressions = node.expressions
|
384
|
+
node.expressions = []
|
385
|
+
nest(operator)
|
478
386
|
end
|
479
|
-
|
480
|
-
range = range_text.gsub(/\{|\}/, '').split(',', 2)
|
481
|
-
min = range[0].empty? ? 0 : range[0]
|
482
|
-
max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
|
483
|
-
|
484
|
-
target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
|
387
|
+
node.add_sequence(active_opts, { ts: token.te })
|
485
388
|
end
|
486
389
|
|
487
|
-
def
|
488
|
-
|
489
|
-
when :options, :options_switch
|
490
|
-
options_group(token)
|
491
|
-
when :close
|
492
|
-
close_group
|
493
|
-
when :comment
|
494
|
-
node << Group::Comment.new(token, active_opts)
|
495
|
-
else
|
496
|
-
open_group(token)
|
497
|
-
end
|
390
|
+
def posixclass(token)
|
391
|
+
node << PosixClass.new(token, active_opts)
|
498
392
|
end
|
499
393
|
|
500
|
-
|
501
|
-
|
394
|
+
UP = Regexp::Expression::Property
|
395
|
+
UPTokens = Regexp::Syntax::Token::Property
|
502
396
|
|
503
|
-
def
|
504
|
-
|
505
|
-
|
506
|
-
|
397
|
+
def property(token)
|
398
|
+
case token.token
|
399
|
+
when :alnum; node << UP::Alnum.new(token, active_opts)
|
400
|
+
when :alpha; node << UP::Alpha.new(token, active_opts)
|
401
|
+
when :ascii; node << UP::Ascii.new(token, active_opts)
|
402
|
+
when :blank; node << UP::Blank.new(token, active_opts)
|
403
|
+
when :cntrl; node << UP::Cntrl.new(token, active_opts)
|
404
|
+
when :digit; node << UP::Digit.new(token, active_opts)
|
405
|
+
when :graph; node << UP::Graph.new(token, active_opts)
|
406
|
+
when :lower; node << UP::Lower.new(token, active_opts)
|
407
|
+
when :print; node << UP::Print.new(token, active_opts)
|
408
|
+
when :punct; node << UP::Punct.new(token, active_opts)
|
409
|
+
when :space; node << UP::Space.new(token, active_opts)
|
410
|
+
when :upper; node << UP::Upper.new(token, active_opts)
|
411
|
+
when :word; node << UP::Word.new(token, active_opts)
|
412
|
+
when :xdigit; node << UP::Xdigit.new(token, active_opts)
|
413
|
+
when :xposixpunct; node << UP::XPosixPunct.new(token, active_opts)
|
507
414
|
|
508
|
-
|
509
|
-
|
415
|
+
# only in Oniguruma (old rubies)
|
416
|
+
when :newline; node << UP::Newline.new(token, active_opts)
|
417
|
+
|
418
|
+
when :any; node << UP::Any.new(token, active_opts)
|
419
|
+
when :assigned; node << UP::Assigned.new(token, active_opts)
|
420
|
+
|
421
|
+
when :letter; node << UP::Letter::Any.new(token, active_opts)
|
422
|
+
when :cased_letter; node << UP::Letter::Cased.new(token, active_opts)
|
423
|
+
when :uppercase_letter; node << UP::Letter::Uppercase.new(token, active_opts)
|
424
|
+
when :lowercase_letter; node << UP::Letter::Lowercase.new(token, active_opts)
|
425
|
+
when :titlecase_letter; node << UP::Letter::Titlecase.new(token, active_opts)
|
426
|
+
when :modifier_letter; node << UP::Letter::Modifier.new(token, active_opts)
|
427
|
+
when :other_letter; node << UP::Letter::Other.new(token, active_opts)
|
428
|
+
|
429
|
+
when :mark; node << UP::Mark::Any.new(token, active_opts)
|
430
|
+
when :combining_mark; node << UP::Mark::Combining.new(token, active_opts)
|
431
|
+
when :nonspacing_mark; node << UP::Mark::Nonspacing.new(token, active_opts)
|
432
|
+
when :spacing_mark; node << UP::Mark::Spacing.new(token, active_opts)
|
433
|
+
when :enclosing_mark; node << UP::Mark::Enclosing.new(token, active_opts)
|
434
|
+
|
435
|
+
when :number; node << UP::Number::Any.new(token, active_opts)
|
436
|
+
when :decimal_number; node << UP::Number::Decimal.new(token, active_opts)
|
437
|
+
when :letter_number; node << UP::Number::Letter.new(token, active_opts)
|
438
|
+
when :other_number; node << UP::Number::Other.new(token, active_opts)
|
439
|
+
|
440
|
+
when :punctuation; node << UP::Punctuation::Any.new(token, active_opts)
|
441
|
+
when :connector_punctuation; node << UP::Punctuation::Connector.new(token, active_opts)
|
442
|
+
when :dash_punctuation; node << UP::Punctuation::Dash.new(token, active_opts)
|
443
|
+
when :open_punctuation; node << UP::Punctuation::Open.new(token, active_opts)
|
444
|
+
when :close_punctuation; node << UP::Punctuation::Close.new(token, active_opts)
|
445
|
+
when :initial_punctuation; node << UP::Punctuation::Initial.new(token, active_opts)
|
446
|
+
when :final_punctuation; node << UP::Punctuation::Final.new(token, active_opts)
|
447
|
+
when :other_punctuation; node << UP::Punctuation::Other.new(token, active_opts)
|
448
|
+
|
449
|
+
when :separator; node << UP::Separator::Any.new(token, active_opts)
|
450
|
+
when :space_separator; node << UP::Separator::Space.new(token, active_opts)
|
451
|
+
when :line_separator; node << UP::Separator::Line.new(token, active_opts)
|
452
|
+
when :paragraph_separator; node << UP::Separator::Paragraph.new(token, active_opts)
|
453
|
+
|
454
|
+
when :symbol; node << UP::Symbol::Any.new(token, active_opts)
|
455
|
+
when :math_symbol; node << UP::Symbol::Math.new(token, active_opts)
|
456
|
+
when :currency_symbol; node << UP::Symbol::Currency.new(token, active_opts)
|
457
|
+
when :modifier_symbol; node << UP::Symbol::Modifier.new(token, active_opts)
|
458
|
+
when :other_symbol; node << UP::Symbol::Other.new(token, active_opts)
|
459
|
+
|
460
|
+
when :other; node << UP::Codepoint::Any.new(token, active_opts)
|
461
|
+
when :control; node << UP::Codepoint::Control.new(token, active_opts)
|
462
|
+
when :format; node << UP::Codepoint::Format.new(token, active_opts)
|
463
|
+
when :surrogate; node << UP::Codepoint::Surrogate.new(token, active_opts)
|
464
|
+
when :private_use; node << UP::Codepoint::PrivateUse.new(token, active_opts)
|
465
|
+
when :unassigned; node << UP::Codepoint::Unassigned.new(token, active_opts)
|
466
|
+
|
467
|
+
when *UPTokens::Age; node << UP::Age.new(token, active_opts)
|
468
|
+
when *UPTokens::Derived; node << UP::Derived.new(token, active_opts)
|
469
|
+
when *UPTokens::Emoji; node << UP::Emoji.new(token, active_opts)
|
470
|
+
when *UPTokens::Script; node << UP::Script.new(token, active_opts)
|
471
|
+
when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts)
|
510
472
|
|
511
|
-
|
512
|
-
|
513
|
-
opt_changes[flag] = new_active_opts[flag] = true
|
514
|
-
end
|
515
|
-
if negative.include?(flag.to_s)
|
516
|
-
opt_changes[flag] = false
|
517
|
-
new_active_opts.delete(flag)
|
518
|
-
end
|
473
|
+
else
|
474
|
+
raise UnknownTokenError.new('UnicodeProperty', token)
|
519
475
|
end
|
476
|
+
end
|
520
477
|
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
478
|
+
def quantifier(token)
|
479
|
+
target_node = node.extract_quantifier_target(token.text)
|
480
|
+
|
481
|
+
# in case of chained quantifiers, wrap target in an implicit passive group
|
482
|
+
# description of the problem: https://github.com/ammar/regexp_parser/issues/3
|
483
|
+
# rationale for this solution: https://github.com/ammar/regexp_parser/pull/69
|
484
|
+
if target_node.quantified?
|
485
|
+
new_group = Group::Passive.construct(
|
486
|
+
token: :passive,
|
487
|
+
ts: target_node.ts,
|
488
|
+
level: target_node.level,
|
489
|
+
set_level: target_node.set_level,
|
490
|
+
conditional_level: target_node.conditional_level,
|
491
|
+
options: active_opts,
|
492
|
+
)
|
493
|
+
new_group.implicit = true
|
494
|
+
new_group << target_node
|
495
|
+
increase_group_level(target_node)
|
496
|
+
node.expressions[node.expressions.index(target_node)] = new_group
|
497
|
+
target_node = new_group
|
528
498
|
end
|
529
499
|
|
530
|
-
|
500
|
+
unless token.token =~ /\A(?:zero_or_one|zero_or_more|one_or_more|interval)
|
501
|
+
(?:_greedy|_reluctant|_possessive)?\z/x
|
502
|
+
raise UnknownTokenError.new('Quantifier', token)
|
503
|
+
end
|
531
504
|
|
532
|
-
|
533
|
-
|
505
|
+
target_node.quantify(token, active_opts)
|
506
|
+
end
|
534
507
|
|
535
|
-
|
508
|
+
def increase_group_level(exp)
|
509
|
+
exp.level += 1
|
510
|
+
exp.quantifier.level += 1 if exp.quantifier
|
511
|
+
exp.terminal? || exp.each { |subexp| increase_group_level(subexp) }
|
536
512
|
end
|
537
513
|
|
538
|
-
def
|
514
|
+
def set(token)
|
539
515
|
case token.token
|
540
|
-
when :
|
541
|
-
|
542
|
-
when :
|
543
|
-
|
544
|
-
when :
|
545
|
-
exp = Group::Named.new(token, active_opts)
|
546
|
-
when :capture
|
547
|
-
exp = Group::Capture.new(token, active_opts)
|
548
|
-
when :absence
|
549
|
-
exp = Group::Absence.new(token, active_opts)
|
550
|
-
|
551
|
-
when :lookahead
|
552
|
-
exp = Assertion::Lookahead.new(token, active_opts)
|
553
|
-
when :nlookahead
|
554
|
-
exp = Assertion::NegativeLookahead.new(token, active_opts)
|
555
|
-
when :lookbehind
|
556
|
-
exp = Assertion::Lookbehind.new(token, active_opts)
|
557
|
-
when :nlookbehind
|
558
|
-
exp = Assertion::NegativeLookbehind.new(token, active_opts)
|
559
|
-
|
516
|
+
when :open; open_set(token)
|
517
|
+
when :close; close_set
|
518
|
+
when :negate; negate_set
|
519
|
+
when :range; range(token)
|
520
|
+
when :intersection; intersection(token)
|
560
521
|
else
|
561
|
-
raise UnknownTokenError.new('
|
562
|
-
end
|
563
|
-
|
564
|
-
if exp.capturing?
|
565
|
-
exp.number = total_captured_group_count + 1
|
566
|
-
exp.number_at_level = captured_group_count_at_level + 1
|
567
|
-
count_captured_group
|
522
|
+
raise UnknownTokenError.new('CharacterSet', token)
|
568
523
|
end
|
569
|
-
|
570
|
-
# Push the active options to the stack again. This way we can simply pop the
|
571
|
-
# stack for any group we close, no matter if it had its own options or not.
|
572
|
-
options_stack << active_opts
|
573
|
-
|
574
|
-
nest(exp)
|
575
|
-
end
|
576
|
-
|
577
|
-
def close_group
|
578
|
-
options_stack.pop unless switching_options
|
579
|
-
self.switching_options = false
|
580
|
-
decrease_nesting
|
581
524
|
end
|
582
525
|
|
583
526
|
def open_set(token)
|
527
|
+
# TODO: this and Quantifier are the only cases where Expression#token
|
528
|
+
# does not match the scanner/lexer output. Fix in v3.0.0.
|
584
529
|
token.token = :character
|
585
530
|
nest(CharacterSet.new(token, active_opts))
|
586
531
|
end
|
@@ -595,59 +540,56 @@ class Regexp::Parser
|
|
595
540
|
|
596
541
|
def range(token)
|
597
542
|
exp = CharacterSet::Range.new(token, active_opts)
|
598
|
-
scope = node.last.
|
543
|
+
scope = node.last.instance_of?(CharacterSet::IntersectedSequence) ? node.last : node
|
599
544
|
exp << scope.expressions.pop
|
600
545
|
nest(exp)
|
601
546
|
end
|
602
547
|
|
603
|
-
def close_completed_character_set_range
|
604
|
-
decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
|
605
|
-
end
|
606
|
-
|
607
548
|
def intersection(token)
|
608
549
|
sequence_operation(CharacterSet::Intersection, token)
|
609
550
|
end
|
610
551
|
|
611
|
-
def
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
552
|
+
def type(token)
|
553
|
+
case token.token
|
554
|
+
when :digit; node << CharacterType::Digit.new(token, active_opts)
|
555
|
+
when :hex; node << CharacterType::Hex.new(token, active_opts)
|
556
|
+
when :linebreak; node << CharacterType::Linebreak.new(token, active_opts)
|
557
|
+
when :nondigit; node << CharacterType::NonDigit.new(token, active_opts)
|
558
|
+
when :nonhex; node << CharacterType::NonHex.new(token, active_opts)
|
559
|
+
when :nonspace; node << CharacterType::NonSpace.new(token, active_opts)
|
560
|
+
when :nonword; node << CharacterType::NonWord.new(token, active_opts)
|
561
|
+
when :space; node << CharacterType::Space.new(token, active_opts)
|
562
|
+
when :word; node << CharacterType::Word.new(token, active_opts)
|
563
|
+
when :xgrapheme; node << CharacterType::ExtendedGrapheme.new(token, active_opts)
|
564
|
+
else
|
565
|
+
raise UnknownTokenError.new('CharacterType', token)
|
618
566
|
end
|
619
|
-
node.add_sequence(active_opts)
|
620
|
-
end
|
621
|
-
|
622
|
-
def active_opts
|
623
|
-
options_stack.last
|
624
|
-
end
|
625
|
-
|
626
|
-
def total_captured_group_count
|
627
|
-
captured_group_counts.values.reduce(0, :+)
|
628
|
-
end
|
629
|
-
|
630
|
-
def captured_group_count_at_level
|
631
|
-
captured_group_counts[node.level]
|
632
567
|
end
|
633
568
|
|
634
|
-
def
|
635
|
-
|
569
|
+
def close_completed_character_set_range
|
570
|
+
decrease_nesting if node.instance_of?(CharacterSet::Range) && node.complete?
|
636
571
|
end
|
637
572
|
|
638
|
-
def
|
639
|
-
|
640
|
-
exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
|
573
|
+
def active_opts
|
574
|
+
options_stack.last
|
641
575
|
end
|
642
576
|
|
577
|
+
# Assigns referenced expressions to referring expressions, e.g. if there is
|
578
|
+
# an instance of Backreference::Number, its #referenced_expression is set to
|
579
|
+
# the instance of Group::Capture that it refers to via its number.
|
643
580
|
def assign_referenced_expressions
|
644
|
-
|
581
|
+
# find all referenceable and referring expressions
|
582
|
+
targets = { 0 => root }
|
583
|
+
referrers = []
|
645
584
|
root.each_expression do |exp|
|
646
585
|
exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
|
586
|
+
referrers << exp if exp.referential?
|
647
587
|
end
|
648
|
-
|
649
|
-
|
650
|
-
|
588
|
+
# assign the referenced expression to each referring expression
|
589
|
+
# (in a second iteration because there might be forward references)
|
590
|
+
referrers.each do |exp|
|
591
|
+
exp.referenced_expression = targets[exp.reference] ||
|
592
|
+
raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}")
|
651
593
|
end
|
652
594
|
end
|
653
595
|
end # module Regexp::Parser
|