regexp_parser 0.5.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +242 -0
- data/Gemfile +1 -0
- data/README.md +21 -17
- data/Rakefile +31 -0
- data/lib/regexp_parser/expression.rb +11 -9
- data/lib/regexp_parser/expression/classes/alternation.rb +5 -28
- data/lib/regexp_parser/expression/classes/backref.rb +21 -16
- data/lib/regexp_parser/expression/classes/escape.rb +81 -10
- data/lib/regexp_parser/expression/classes/group.rb +20 -20
- data/lib/regexp_parser/expression/classes/{character_class.rb → posix_class.rb} +2 -2
- data/lib/regexp_parser/expression/classes/property.rb +6 -0
- data/lib/regexp_parser/expression/classes/set.rb +10 -93
- data/lib/regexp_parser/expression/classes/set/intersection.rb +9 -0
- data/lib/regexp_parser/expression/classes/set/range.rb +23 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +6 -4
- data/lib/regexp_parser/expression/methods/tests.rb +4 -14
- data/lib/regexp_parser/expression/methods/traverse.rb +1 -1
- data/lib/regexp_parser/expression/quantifier.rb +3 -4
- data/lib/regexp_parser/expression/sequence_operation.rb +34 -0
- data/lib/regexp_parser/expression/subexpression.rb +6 -10
- data/lib/regexp_parser/lexer.rb +13 -17
- data/lib/regexp_parser/parser.rb +170 -116
- data/lib/regexp_parser/scanner.rb +952 -2431
- data/lib/regexp_parser/scanner/char_type.rl +31 -0
- data/lib/regexp_parser/scanner/properties/long.yml +561 -0
- data/lib/regexp_parser/scanner/properties/short.yml +225 -0
- data/lib/regexp_parser/scanner/property.rl +7 -806
- data/lib/regexp_parser/scanner/scanner.rl +112 -154
- data/lib/regexp_parser/syntax/base.rb +4 -4
- data/lib/regexp_parser/syntax/tokens.rb +1 -0
- data/lib/regexp_parser/syntax/tokens/backref.rb +2 -2
- data/lib/regexp_parser/syntax/tokens/character_set.rb +3 -38
- data/lib/regexp_parser/syntax/tokens/escape.rb +2 -3
- data/lib/regexp_parser/syntax/tokens/group.rb +5 -4
- data/lib/regexp_parser/syntax/tokens/{character_class.rb → posix_class.rb} +5 -5
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +519 -266
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -4
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +4 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +0 -2
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- data/regexp_parser.gemspec +2 -1
- data/test/expression/test_base.rb +2 -1
- data/test/expression/test_clone.rb +0 -57
- data/test/expression/test_set.rb +31 -8
- data/test/expression/test_strfregexp.rb +13 -4
- data/test/expression/test_subexpression.rb +25 -0
- data/test/expression/test_traverse.rb +25 -25
- data/test/helpers.rb +1 -0
- data/test/lexer/test_all.rb +1 -1
- data/test/lexer/test_conditionals.rb +9 -7
- data/test/lexer/test_nesting.rb +39 -21
- data/test/lexer/test_refcalls.rb +4 -4
- data/test/parser/set/test_intersections.rb +127 -0
- data/test/parser/set/test_ranges.rb +111 -0
- data/test/parser/test_all.rb +4 -1
- data/test/parser/test_escapes.rb +41 -9
- data/test/parser/test_groups.rb +22 -3
- data/test/parser/test_posix_classes.rb +27 -0
- data/test/parser/test_properties.rb +17 -290
- data/test/parser/test_refcalls.rb +66 -26
- data/test/parser/test_sets.rb +132 -129
- data/test/scanner/test_all.rb +1 -7
- data/test/scanner/test_conditionals.rb +16 -16
- data/test/scanner/test_errors.rb +0 -30
- data/test/scanner/test_escapes.rb +1 -2
- data/test/scanner/test_free_space.rb +28 -28
- data/test/scanner/test_groups.rb +35 -35
- data/test/scanner/test_meta.rb +1 -1
- data/test/scanner/test_properties.rb +87 -114
- data/test/scanner/test_refcalls.rb +18 -18
- data/test/scanner/test_scripts.rb +19 -351
- data/test/scanner/test_sets.rb +87 -60
- data/test/scanner/test_unicode_blocks.rb +4 -105
- data/test/support/warning_extractor.rb +1 -1
- data/test/syntax/test_syntax.rb +7 -0
- data/test/syntax/versions/test_1.8.rb +2 -4
- metadata +17 -7
- data/ChangeLog +0 -325
- data/test/scanner/test_emojis.rb +0 -31
@@ -7,22 +7,12 @@ module Regexp::Expression
|
|
7
7
|
# # is it a :group expression
|
8
8
|
# exp.type? :group
|
9
9
|
#
|
10
|
-
# # is it a :set,
|
11
|
-
# exp.type? [:set, :
|
10
|
+
# # is it a :set, or :meta
|
11
|
+
# exp.type? [:set, :meta]
|
12
12
|
#
|
13
13
|
def type?(test_type)
|
14
|
-
|
15
|
-
|
16
|
-
if test_type.include?(:*)
|
17
|
-
return (test_type.include?(type) or test_type.include?(:*))
|
18
|
-
else
|
19
|
-
return test_type.include?(type)
|
20
|
-
end
|
21
|
-
when Symbol
|
22
|
-
return (type == test_type or test_type == :*)
|
23
|
-
else
|
24
|
-
raise "Array or Symbol expected, #{test_type.class.name} given"
|
25
|
-
end
|
14
|
+
test_types = Array(test_type).map(&:to_sym)
|
15
|
+
test_types.include?(:*) || test_types.include?(type)
|
26
16
|
end
|
27
17
|
|
28
18
|
# Test if this expression has the given test_token, and optionally a given
|
@@ -45,7 +45,7 @@ module Regexp::Expression
|
|
45
45
|
# Returns a new array with the results of calling the given block once
|
46
46
|
# for every expression. If a block is not given, returns an array with
|
47
47
|
# each expression and its level index as an array.
|
48
|
-
def
|
48
|
+
def flat_map(include_self = false, &block)
|
49
49
|
result = []
|
50
50
|
|
51
51
|
each_expression(include_self) do |exp, index|
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Regexp::Expression
|
2
|
+
# abstract class
|
3
|
+
class SequenceOperation < Regexp::Expression::Subexpression
|
4
|
+
alias :sequences :expressions
|
5
|
+
alias :operands :expressions
|
6
|
+
alias :operator :text
|
7
|
+
|
8
|
+
def starts_at
|
9
|
+
expressions.first.starts_at
|
10
|
+
end
|
11
|
+
alias :ts :starts_at
|
12
|
+
|
13
|
+
def <<(exp)
|
14
|
+
expressions.last << exp
|
15
|
+
end
|
16
|
+
|
17
|
+
def add_sequence
|
18
|
+
exp = self.class::OPERAND.new(level, set_level, conditional_level)
|
19
|
+
exp.nesting_level = nesting_level + 1
|
20
|
+
expressions << exp
|
21
|
+
exp
|
22
|
+
end
|
23
|
+
|
24
|
+
def quantify(token, text, min = nil, max = nil, mode = :greedy)
|
25
|
+
sequences.last.last.quantify(token, text, min, max, mode)
|
26
|
+
sequences.last.last.quantify(token, text, min, max, mode)
|
27
|
+
end
|
28
|
+
|
29
|
+
def to_s(format = :full)
|
30
|
+
sequences.map { |e| e.to_s(format) }.join(text)
|
31
|
+
sequences.map { |e| e.to_s(format) }.join(text)
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
@@ -10,26 +10,22 @@ module Regexp::Expression
|
|
10
10
|
end
|
11
11
|
|
12
12
|
# Override base method to clone the expressions as well.
|
13
|
-
def
|
14
|
-
|
15
|
-
|
16
|
-
copy
|
13
|
+
def initialize_clone(other)
|
14
|
+
other.expressions = expressions.map(&:clone)
|
15
|
+
super
|
17
16
|
end
|
18
17
|
|
19
18
|
def <<(exp)
|
20
19
|
if exp.is_a?(WhiteSpace) && last && last.is_a?(WhiteSpace)
|
21
20
|
last.merge(exp)
|
22
21
|
else
|
22
|
+
exp.nesting_level = nesting_level + 1
|
23
23
|
expressions << exp
|
24
24
|
end
|
25
25
|
end
|
26
26
|
|
27
|
-
|
28
|
-
|
29
|
-
end
|
30
|
-
|
31
|
-
%w[[] all? any? at count each each_with_index empty?
|
32
|
-
fetch find first index join last length values_at].each do |m|
|
27
|
+
%w[[] all? any? at collect count each each_with_index empty?
|
28
|
+
fetch find first index join last length map values_at].each do |m|
|
33
29
|
define_method(m) { |*args, &block| expressions.send(m, *args, &block) }
|
34
30
|
end
|
35
31
|
|
data/lib/regexp_parser/lexer.rb
CHANGED
@@ -4,9 +4,10 @@
|
|
4
4
|
# given syntax flavor.
|
5
5
|
class Regexp::Lexer
|
6
6
|
|
7
|
-
OPENING_TOKENS = [
|
8
|
-
|
9
|
-
|
7
|
+
OPENING_TOKENS = [
|
8
|
+
:capture, :passive, :lookahead, :nlookahead, :lookbehind, :nlookbehind,
|
9
|
+
:atomic, :options, :options_switch, :named, :absence
|
10
|
+
].freeze
|
10
11
|
|
11
12
|
CLOSING_TOKENS = [:close].freeze
|
12
13
|
|
@@ -36,6 +37,7 @@ class Regexp::Lexer
|
|
36
37
|
nesting, set_nesting, conditional_nesting)
|
37
38
|
|
38
39
|
current = merge_literal(current) if type == :literal and
|
40
|
+
set_nesting == 0 and
|
39
41
|
last and last.type == :literal
|
40
42
|
|
41
43
|
current = merge_condition(current) if type == :conditional and
|
@@ -66,29 +68,23 @@ class Regexp::Lexer
|
|
66
68
|
attr_accessor :tokens, :nesting, :set_nesting, :conditional_nesting
|
67
69
|
|
68
70
|
def ascend(type, token)
|
69
|
-
|
71
|
+
case type
|
72
|
+
when :group, :assertion
|
70
73
|
self.nesting = nesting - 1 if CLOSING_TOKENS.include?(token)
|
71
|
-
|
72
|
-
|
73
|
-
if type == :set or type == :subset
|
74
|
+
when :set
|
74
75
|
self.set_nesting = set_nesting - 1 if token == :close
|
75
|
-
|
76
|
-
|
77
|
-
if type == :conditional
|
76
|
+
when :conditional
|
78
77
|
self.conditional_nesting = conditional_nesting - 1 if token == :close
|
79
78
|
end
|
80
79
|
end
|
81
80
|
|
82
81
|
def descend(type, token)
|
83
|
-
|
82
|
+
case type
|
83
|
+
when :group, :assertion
|
84
84
|
self.nesting = nesting + 1 if OPENING_TOKENS.include?(token)
|
85
|
-
|
86
|
-
|
87
|
-
if type == :set or type == :subset
|
85
|
+
when :set
|
88
86
|
self.set_nesting = set_nesting + 1 if token == :open
|
89
|
-
|
90
|
-
|
91
|
-
if type == :conditional
|
87
|
+
when :conditional
|
92
88
|
self.conditional_nesting = conditional_nesting + 1 if token == :open
|
93
89
|
end
|
94
90
|
end
|
data/lib/regexp_parser/parser.rb
CHANGED
@@ -33,6 +33,8 @@ class Regexp::Parser
|
|
33
33
|
self.switching_options = false
|
34
34
|
self.conditional_nesting = []
|
35
35
|
|
36
|
+
self.captured_group_counts = Hash.new(0)
|
37
|
+
|
36
38
|
Regexp::Lexer.scan(input, syntax) do |token|
|
37
39
|
parse_token(token)
|
38
40
|
end
|
@@ -48,7 +50,7 @@ class Regexp::Parser
|
|
48
50
|
|
49
51
|
attr_accessor :root, :node, :nesting,
|
50
52
|
:options_stack, :switching_options, :conditional_nesting,
|
51
|
-
:
|
53
|
+
:captured_group_counts
|
52
54
|
|
53
55
|
def options_from_input(input)
|
54
56
|
return {} unless input.is_a?(::Regexp)
|
@@ -63,9 +65,28 @@ class Regexp::Parser
|
|
63
65
|
def nest(exp)
|
64
66
|
nesting.push(exp)
|
65
67
|
node << exp
|
68
|
+
update_transplanted_subtree(exp, node)
|
66
69
|
self.node = exp
|
67
70
|
end
|
68
71
|
|
72
|
+
# subtrees are transplanted to build Alternations, Intersections, Ranges
|
73
|
+
def update_transplanted_subtree(exp, new_parent)
|
74
|
+
exp.nesting_level = new_parent.nesting_level + 1
|
75
|
+
exp.respond_to?(:each) &&
|
76
|
+
exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
|
77
|
+
end
|
78
|
+
|
79
|
+
def decrease_nesting
|
80
|
+
while nesting.last.is_a?(SequenceOperation)
|
81
|
+
nesting.pop
|
82
|
+
self.node = nesting.last
|
83
|
+
end
|
84
|
+
nesting.pop
|
85
|
+
yield(node) if block_given?
|
86
|
+
self.node = nesting.last
|
87
|
+
self.node = node.last if node.last.is_a?(SequenceOperation)
|
88
|
+
end
|
89
|
+
|
69
90
|
def nest_conditional(exp)
|
70
91
|
conditional_nesting.push(exp)
|
71
92
|
node << exp
|
@@ -73,6 +94,8 @@ class Regexp::Parser
|
|
73
94
|
end
|
74
95
|
|
75
96
|
def parse_token(token)
|
97
|
+
close_completed_character_set_range
|
98
|
+
|
76
99
|
case token.type
|
77
100
|
when :meta; meta(token)
|
78
101
|
when :quantifier; quantifier(token)
|
@@ -80,12 +103,14 @@ class Regexp::Parser
|
|
80
103
|
when :escape; escape(token)
|
81
104
|
when :group; group(token)
|
82
105
|
when :assertion; group(token)
|
83
|
-
when :set
|
106
|
+
when :set; set(token)
|
84
107
|
when :type; type(token)
|
85
108
|
when :backref; backref(token)
|
86
109
|
when :conditional; conditional(token)
|
87
110
|
when :keep; keep(token)
|
88
111
|
|
112
|
+
when :posixclass, :nonposixclass
|
113
|
+
posixclass(token)
|
89
114
|
when :property, :nonproperty
|
90
115
|
property(token)
|
91
116
|
|
@@ -104,17 +129,15 @@ class Regexp::Parser
|
|
104
129
|
when :open
|
105
130
|
open_set(token)
|
106
131
|
when :close
|
107
|
-
close_set
|
132
|
+
close_set
|
108
133
|
when :negate
|
109
134
|
negate_set
|
110
|
-
when :
|
111
|
-
|
112
|
-
when
|
113
|
-
|
114
|
-
when
|
115
|
-
|
116
|
-
when *Token::UnicodeProperty::All
|
117
|
-
append_set(token)
|
135
|
+
when :range
|
136
|
+
range(token)
|
137
|
+
when :intersection
|
138
|
+
intersection(token)
|
139
|
+
when :collation, :equivalent
|
140
|
+
node << Literal.new(token, active_opts)
|
118
141
|
else
|
119
142
|
raise UnknownTokenError.new('CharacterSet', token)
|
120
143
|
end
|
@@ -125,19 +148,7 @@ class Regexp::Parser
|
|
125
148
|
when :dot
|
126
149
|
node << CharacterType::Any.new(token, active_opts)
|
127
150
|
when :alternation
|
128
|
-
|
129
|
-
elsif node.last.is_a?(Alternation)
|
130
|
-
self.node = node.last
|
131
|
-
else
|
132
|
-
alt = Alternation.new(token, active_opts)
|
133
|
-
seq = Alternative.new(alt.level, alt.set_level, alt.conditional_level)
|
134
|
-
node.expressions.count.times { seq.insert(node.expressions.pop) }
|
135
|
-
alt.alternative(seq)
|
136
|
-
|
137
|
-
node << alt
|
138
|
-
self.node = alt
|
139
|
-
end
|
140
|
-
node.alternative
|
151
|
+
sequence_operation(Alternation, token)
|
141
152
|
else
|
142
153
|
raise UnknownTokenError.new('Meta', token)
|
143
154
|
end
|
@@ -147,16 +158,16 @@ class Regexp::Parser
|
|
147
158
|
case token.token
|
148
159
|
when :name_ref
|
149
160
|
node << Backreference::Name.new(token, active_opts)
|
150
|
-
when :
|
151
|
-
node << Backreference::
|
161
|
+
when :name_recursion_ref
|
162
|
+
node << Backreference::NameRecursionLevel.new(token, active_opts)
|
152
163
|
when :name_call
|
153
164
|
node << Backreference::NameCall.new(token, active_opts)
|
154
165
|
when :number, :number_ref
|
155
166
|
node << Backreference::Number.new(token, active_opts)
|
156
167
|
when :number_rel_ref
|
157
168
|
node << Backreference::NumberRelative.new(token, active_opts)
|
158
|
-
when :
|
159
|
-
node << Backreference::
|
169
|
+
when :number_recursion_ref
|
170
|
+
node << Backreference::NumberRecursionLevel.new(token, active_opts)
|
160
171
|
when :number_call
|
161
172
|
node << Backreference::NumberCall.new(token, active_opts)
|
162
173
|
when :number_rel_call
|
@@ -217,75 +228,81 @@ class Regexp::Parser
|
|
217
228
|
end
|
218
229
|
end
|
219
230
|
|
231
|
+
def posixclass(token)
|
232
|
+
node << PosixClass.new(token)
|
233
|
+
end
|
234
|
+
|
220
235
|
include Regexp::Expression::UnicodeProperty
|
221
236
|
|
222
237
|
def property(token)
|
223
238
|
case token.token
|
224
|
-
when :alnum;
|
225
|
-
when :alpha;
|
226
|
-
when :ascii;
|
227
|
-
when :blank;
|
228
|
-
when :cntrl;
|
229
|
-
when :digit;
|
230
|
-
when :graph;
|
231
|
-
when :lower;
|
232
|
-
when :print;
|
233
|
-
when :punct;
|
234
|
-
when :space;
|
235
|
-
when :upper;
|
236
|
-
when :word;
|
237
|
-
when :xdigit;
|
238
|
-
when :xposixpunct;
|
239
|
+
when :alnum; node << Alnum.new(token, active_opts)
|
240
|
+
when :alpha; node << Alpha.new(token, active_opts)
|
241
|
+
when :ascii; node << Ascii.new(token, active_opts)
|
242
|
+
when :blank; node << Blank.new(token, active_opts)
|
243
|
+
when :cntrl; node << Cntrl.new(token, active_opts)
|
244
|
+
when :digit; node << Digit.new(token, active_opts)
|
245
|
+
when :graph; node << Graph.new(token, active_opts)
|
246
|
+
when :lower; node << Lower.new(token, active_opts)
|
247
|
+
when :print; node << Print.new(token, active_opts)
|
248
|
+
when :punct; node << Punct.new(token, active_opts)
|
249
|
+
when :space; node << Space.new(token, active_opts)
|
250
|
+
when :upper; node << Upper.new(token, active_opts)
|
251
|
+
when :word; node << Word.new(token, active_opts)
|
252
|
+
when :xdigit; node << Xdigit.new(token, active_opts)
|
253
|
+
when :xposixpunct; node << XPosixPunct.new(token, active_opts)
|
239
254
|
|
240
255
|
# only in Oniguruma (old rubies)
|
241
|
-
when :newline;
|
242
|
-
|
243
|
-
when :any;
|
244
|
-
when :assigned;
|
245
|
-
|
246
|
-
when :
|
247
|
-
when :
|
248
|
-
when :
|
249
|
-
when :
|
250
|
-
when :
|
251
|
-
when :
|
252
|
-
|
253
|
-
|
254
|
-
when :
|
255
|
-
when :
|
256
|
-
when :
|
257
|
-
|
258
|
-
when :
|
259
|
-
|
260
|
-
when :
|
261
|
-
when :
|
262
|
-
|
263
|
-
when :
|
264
|
-
|
265
|
-
when :
|
266
|
-
when :
|
267
|
-
when :
|
268
|
-
when :
|
269
|
-
when :
|
270
|
-
when :
|
271
|
-
|
272
|
-
when :
|
273
|
-
|
274
|
-
when :
|
275
|
-
when :
|
276
|
-
|
277
|
-
when :
|
278
|
-
|
279
|
-
when :
|
280
|
-
when :
|
281
|
-
when :
|
282
|
-
|
283
|
-
when :
|
284
|
-
|
285
|
-
when :
|
286
|
-
when :
|
287
|
-
when :
|
288
|
-
when :
|
256
|
+
when :newline; node << Newline.new(token, active_opts)
|
257
|
+
|
258
|
+
when :any; node << Any.new(token, active_opts)
|
259
|
+
when :assigned; node << Assigned.new(token, active_opts)
|
260
|
+
|
261
|
+
when :letter; node << Letter::Any.new(token, active_opts)
|
262
|
+
when :cased_letter; node << Letter::Cased.new(token, active_opts)
|
263
|
+
when :uppercase_letter; node << Letter::Uppercase.new(token, active_opts)
|
264
|
+
when :lowercase_letter; node << Letter::Lowercase.new(token, active_opts)
|
265
|
+
when :titlecase_letter; node << Letter::Titlecase.new(token, active_opts)
|
266
|
+
when :modifier_letter; node << Letter::Modifier.new(token, active_opts)
|
267
|
+
when :other_letter; node << Letter::Other.new(token, active_opts)
|
268
|
+
|
269
|
+
when :mark; node << Mark::Any.new(token, active_opts)
|
270
|
+
when :combining_mark; node << Mark::Combining.new(token, active_opts)
|
271
|
+
when :nonspacing_mark; node << Mark::Nonspacing.new(token, active_opts)
|
272
|
+
when :spacing_mark; node << Mark::Spacing.new(token, active_opts)
|
273
|
+
when :enclosing_mark; node << Mark::Enclosing.new(token, active_opts)
|
274
|
+
|
275
|
+
when :number; node << Number::Any.new(token, active_opts)
|
276
|
+
when :decimal_number; node << Number::Decimal.new(token, active_opts)
|
277
|
+
when :letter_number; node << Number::Letter.new(token, active_opts)
|
278
|
+
when :other_number; node << Number::Other.new(token, active_opts)
|
279
|
+
|
280
|
+
when :punctuation; node << Punctuation::Any.new(token, active_opts)
|
281
|
+
when :connector_punctuation; node << Punctuation::Connector.new(token, active_opts)
|
282
|
+
when :dash_punctuation; node << Punctuation::Dash.new(token, active_opts)
|
283
|
+
when :open_punctuation; node << Punctuation::Open.new(token, active_opts)
|
284
|
+
when :close_punctuation; node << Punctuation::Close.new(token, active_opts)
|
285
|
+
when :initial_punctuation; node << Punctuation::Initial.new(token, active_opts)
|
286
|
+
when :final_punctuation; node << Punctuation::Final.new(token, active_opts)
|
287
|
+
when :other_punctuation; node << Punctuation::Other.new(token, active_opts)
|
288
|
+
|
289
|
+
when :separator; node << Separator::Any.new(token, active_opts)
|
290
|
+
when :space_separator; node << Separator::Space.new(token, active_opts)
|
291
|
+
when :line_separator; node << Separator::Line.new(token, active_opts)
|
292
|
+
when :paragraph_separator; node << Separator::Paragraph.new(token, active_opts)
|
293
|
+
|
294
|
+
when :symbol; node << Symbol::Any.new(token, active_opts)
|
295
|
+
when :math_symbol; node << Symbol::Math.new(token, active_opts)
|
296
|
+
when :currency_symbol; node << Symbol::Currency.new(token, active_opts)
|
297
|
+
when :modifier_symbol; node << Symbol::Modifier.new(token, active_opts)
|
298
|
+
when :other_symbol; node << Symbol::Other.new(token, active_opts)
|
299
|
+
|
300
|
+
when :other; node << Codepoint::Any.new(token, active_opts)
|
301
|
+
when :control; node << Codepoint::Control.new(token, active_opts)
|
302
|
+
when :format; node << Codepoint::Format.new(token, active_opts)
|
303
|
+
when :surrogate; node << Codepoint::Surrogate.new(token, active_opts)
|
304
|
+
when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
|
305
|
+
when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
|
289
306
|
|
290
307
|
when *Token::UnicodeProperty::Age
|
291
308
|
node << Age.new(token, active_opts)
|
@@ -346,13 +363,20 @@ class Regexp::Parser
|
|
346
363
|
node << EscapeSequence::Newline.new(token, active_opts)
|
347
364
|
when :carriage
|
348
365
|
node << EscapeSequence::Return.new(token, active_opts)
|
349
|
-
when :space
|
350
|
-
node << EscapeSequence::Space.new(token, active_opts)
|
351
366
|
when :tab
|
352
367
|
node << EscapeSequence::Tab.new(token, active_opts)
|
353
368
|
when :vertical_tab
|
354
369
|
node << EscapeSequence::VerticalTab.new(token, active_opts)
|
355
370
|
|
371
|
+
when :hex
|
372
|
+
node << EscapeSequence::Hex.new(token, active_opts)
|
373
|
+
when :octal
|
374
|
+
node << EscapeSequence::Octal.new(token, active_opts)
|
375
|
+
when :codepoint
|
376
|
+
node << EscapeSequence::Codepoint.new(token, active_opts)
|
377
|
+
when :codepoint_list
|
378
|
+
node << EscapeSequence::CodepointList.new(token, active_opts)
|
379
|
+
|
356
380
|
when :control
|
357
381
|
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
358
382
|
node << EscapeSequence::MetaControl.new(token, active_opts)
|
@@ -447,7 +471,7 @@ class Regexp::Parser
|
|
447
471
|
mode = :greedy
|
448
472
|
end
|
449
473
|
|
450
|
-
range = range_text.gsub(/\{|\}/, '').split(',', 2)
|
474
|
+
range = range_text.gsub(/\{|\}/, '').split(',', 2)
|
451
475
|
min = range[0].empty? ? 0 : range[0]
|
452
476
|
max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
|
453
477
|
|
@@ -456,7 +480,7 @@ class Regexp::Parser
|
|
456
480
|
|
457
481
|
def group(token)
|
458
482
|
case token.token
|
459
|
-
when :options
|
483
|
+
when :options, :options_switch
|
460
484
|
options_group(token)
|
461
485
|
when :close
|
462
486
|
close_group
|
@@ -470,8 +494,7 @@ class Regexp::Parser
|
|
470
494
|
def options_group(token)
|
471
495
|
positive, negative = token.text.split('-', 2)
|
472
496
|
negative ||= ''
|
473
|
-
self.switching_options =
|
474
|
-
# TODO: change this -^ to token.type == :options_switch in v1.0.0
|
497
|
+
self.switching_options = token.token.equal?(:options_switch)
|
475
498
|
|
476
499
|
new_options = active_opts.dup
|
477
500
|
|
@@ -491,9 +514,7 @@ class Regexp::Parser
|
|
491
514
|
|
492
515
|
options_stack << new_options
|
493
516
|
|
494
|
-
|
495
|
-
|
496
|
-
nest(exp)
|
517
|
+
nest(Group::Options.new(token, active_opts))
|
497
518
|
end
|
498
519
|
|
499
520
|
def open_group(token)
|
@@ -522,6 +543,12 @@ class Regexp::Parser
|
|
522
543
|
raise UnknownTokenError.new('Group type open', token)
|
523
544
|
end
|
524
545
|
|
546
|
+
if exp.capturing?
|
547
|
+
exp.number = total_captured_group_count + 1
|
548
|
+
exp.number_at_level = captured_group_count_at_level + 1
|
549
|
+
count_captured_group
|
550
|
+
end
|
551
|
+
|
525
552
|
# Push the active options to the stack again. This way we can simply pop the
|
526
553
|
# stack for any group we close, no matter if it had its own options or not.
|
527
554
|
options_stack << active_opts
|
@@ -530,38 +557,65 @@ class Regexp::Parser
|
|
530
557
|
end
|
531
558
|
|
532
559
|
def close_group
|
533
|
-
nesting.pop
|
534
560
|
options_stack.pop unless switching_options
|
535
561
|
self.switching_options = false
|
536
|
-
|
537
|
-
self.node = nesting.last
|
538
|
-
self.node = node.last if node.last and node.last.is_a?(Alternation)
|
562
|
+
decrease_nesting
|
539
563
|
end
|
540
564
|
|
541
565
|
def open_set(token)
|
542
566
|
token.token = :character
|
543
|
-
|
544
|
-
if token.type == :subset
|
545
|
-
current_set << CharacterSubSet.new(token, active_opts)
|
546
|
-
else
|
547
|
-
self.current_set = CharacterSet.new(token, active_opts)
|
548
|
-
node << current_set
|
549
|
-
end
|
567
|
+
nest(CharacterSet.new(token, active_opts))
|
550
568
|
end
|
551
569
|
|
552
570
|
def negate_set
|
553
|
-
|
571
|
+
node.negate
|
554
572
|
end
|
555
573
|
|
556
|
-
def
|
557
|
-
|
574
|
+
def close_set
|
575
|
+
decrease_nesting(&:close)
|
558
576
|
end
|
559
577
|
|
560
|
-
def
|
561
|
-
|
578
|
+
def range(token)
|
579
|
+
exp = CharacterSet::Range.new(token, active_opts)
|
580
|
+
scope = node.last.is_a?(CharacterSet::IntersectedSequence) ? node.last : node
|
581
|
+
exp << scope.expressions.pop
|
582
|
+
nest(exp)
|
583
|
+
end
|
584
|
+
|
585
|
+
def close_completed_character_set_range
|
586
|
+
decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
|
587
|
+
end
|
588
|
+
|
589
|
+
def intersection(token)
|
590
|
+
sequence_operation(CharacterSet::Intersection, token)
|
591
|
+
end
|
592
|
+
|
593
|
+
def sequence_operation(klass, token)
|
594
|
+
if node.last.is_a?(klass)
|
595
|
+
self.node = node.last
|
596
|
+
elsif !node.is_a?(klass)
|
597
|
+
operator = klass.new(token, active_opts)
|
598
|
+
sequence = operator.add_sequence
|
599
|
+
sequence.expressions = node.expressions
|
600
|
+
node.expressions = []
|
601
|
+
nest(operator)
|
602
|
+
end
|
603
|
+
node.add_sequence
|
562
604
|
end
|
563
605
|
|
564
606
|
def active_opts
|
565
607
|
options_stack.last
|
566
608
|
end
|
609
|
+
|
610
|
+
def total_captured_group_count
|
611
|
+
captured_group_counts.values.reduce(0, :+)
|
612
|
+
end
|
613
|
+
|
614
|
+
def captured_group_count_at_level
|
615
|
+
captured_group_counts[node.level]
|
616
|
+
end
|
617
|
+
|
618
|
+
def count_captured_group
|
619
|
+
captured_group_counts[node.level] += 1
|
620
|
+
end
|
567
621
|
end # module Regexp::Parser
|