regexp_parser 1.8.2 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +93 -0
- data/Gemfile +6 -1
- data/README.md +1 -4
- data/Rakefile +8 -8
- data/lib/regexp_parser.rb +1 -0
- data/lib/regexp_parser/error.rb +4 -0
- data/lib/regexp_parser/expression.rb +5 -18
- data/lib/regexp_parser/expression/classes/backref.rb +5 -0
- data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
- data/lib/regexp_parser/expression/classes/free_space.rb +2 -2
- data/lib/regexp_parser/expression/classes/group.rb +28 -3
- data/lib/regexp_parser/expression/classes/property.rb +1 -1
- data/lib/regexp_parser/expression/classes/root.rb +4 -16
- data/lib/regexp_parser/expression/classes/set/range.rb +2 -1
- data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
- data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
- data/lib/regexp_parser/expression/quantifier.rb +10 -1
- data/lib/regexp_parser/expression/sequence.rb +3 -19
- data/lib/regexp_parser/expression/subexpression.rb +1 -1
- data/lib/regexp_parser/lexer.rb +2 -2
- data/lib/regexp_parser/parser.rb +306 -332
- data/lib/regexp_parser/scanner.rb +1272 -1338
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +206 -238
- data/lib/regexp_parser/syntax.rb +7 -7
- data/lib/regexp_parser/syntax/any.rb +3 -3
- data/lib/regexp_parser/syntax/base.rb +1 -1
- data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
- data/lib/regexp_parser/syntax/versions.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- data/spec/expression/base_spec.rb +10 -0
- data/spec/expression/clone_spec.rb +36 -4
- data/spec/expression/free_space_spec.rb +2 -2
- data/spec/expression/methods/match_length_spec.rb +2 -2
- data/spec/expression/subexpression_spec.rb +1 -1
- data/spec/expression/to_s_spec.rb +39 -31
- data/spec/lexer/literals_spec.rb +24 -49
- data/spec/lexer/refcalls_spec.rb +5 -0
- data/spec/parser/all_spec.rb +2 -2
- data/spec/parser/errors_spec.rb +1 -1
- data/spec/parser/escapes_spec.rb +1 -1
- data/spec/parser/quantifiers_spec.rb +16 -0
- data/spec/parser/refcalls_spec.rb +5 -0
- data/spec/parser/set/ranges_spec.rb +3 -3
- data/spec/scanner/escapes_spec.rb +8 -1
- data/spec/scanner/groups_spec.rb +10 -1
- data/spec/scanner/literals_spec.rb +28 -38
- data/spec/scanner/quantifiers_spec.rb +18 -13
- data/spec/scanner/refcalls_spec.rb +19 -0
- data/spec/scanner/sets_spec.rb +65 -16
- data/spec/spec_helper.rb +1 -0
- metadata +4 -7
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
@@ -10,17 +10,17 @@
|
|
10
10
|
# --------------------------------------------------------------------------
|
11
11
|
char_type := |*
|
12
12
|
char_type_char {
|
13
|
-
case text =
|
14
|
-
when '\d'; emit(:type, :digit, text
|
15
|
-
when '\D'; emit(:type, :nondigit, text
|
16
|
-
when '\h'; emit(:type, :hex, text
|
17
|
-
when '\H'; emit(:type, :nonhex, text
|
18
|
-
when '\s'; emit(:type, :space, text
|
19
|
-
when '\S'; emit(:type, :nonspace, text
|
20
|
-
when '\w'; emit(:type, :word, text
|
21
|
-
when '\W'; emit(:type, :nonword, text
|
22
|
-
when '\R'; emit(:type, :linebreak, text
|
23
|
-
when '\X'; emit(:type, :xgrapheme, text
|
13
|
+
case text = copy(data, ts-1, te)
|
14
|
+
when '\d'; emit(:type, :digit, text)
|
15
|
+
when '\D'; emit(:type, :nondigit, text)
|
16
|
+
when '\h'; emit(:type, :hex, text)
|
17
|
+
when '\H'; emit(:type, :nonhex, text)
|
18
|
+
when '\s'; emit(:type, :space, text)
|
19
|
+
when '\S'; emit(:type, :nonspace, text)
|
20
|
+
when '\w'; emit(:type, :word, text)
|
21
|
+
when '\W'; emit(:type, :nonword, text)
|
22
|
+
when '\R'; emit(:type, :linebreak, text)
|
23
|
+
when '\X'; emit(:type, :xgrapheme, text)
|
24
24
|
end
|
25
25
|
fret;
|
26
26
|
};
|
@@ -14,7 +14,7 @@
|
|
14
14
|
unicode_property := |*
|
15
15
|
|
16
16
|
property_sequence < eof(premature_property_end) {
|
17
|
-
text =
|
17
|
+
text = copy(data, ts-1, te)
|
18
18
|
type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
|
19
19
|
|
20
20
|
name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
|
@@ -22,7 +22,7 @@
|
|
22
22
|
token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
|
23
23
|
raise UnknownUnicodePropertyError.new(name) unless token
|
24
24
|
|
25
|
-
self.emit(type, token.to_sym, text
|
25
|
+
self.emit(type, token.to_sym, text)
|
26
26
|
|
27
27
|
fret;
|
28
28
|
};
|
@@ -3,6 +3,11 @@
|
|
3
3
|
include re_char_type "char_type.rl";
|
4
4
|
include re_property "property.rl";
|
5
5
|
|
6
|
+
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
7
|
+
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
|
8
|
+
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
9
|
+
utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
|
10
|
+
|
6
11
|
dot = '.';
|
7
12
|
backslash = '\\';
|
8
13
|
alternation = '|';
|
@@ -15,7 +20,7 @@
|
|
15
20
|
|
16
21
|
group_open = '(';
|
17
22
|
group_close = ')';
|
18
|
-
|
23
|
+
parentheses = group_open | group_close;
|
19
24
|
|
20
25
|
set_open = '[';
|
21
26
|
set_close = ']';
|
@@ -32,7 +37,7 @@
|
|
32
37
|
class_posix = ('[:' . '^'? . class_name_posix . ':]');
|
33
38
|
|
34
39
|
|
35
|
-
# these are not supported in ruby
|
40
|
+
# these are not supported in ruby at the moment
|
36
41
|
collating_sequence = '[.' . (alpha | [\-])+ . '.]';
|
37
42
|
character_equivalent = '[=' . alpha . '=]';
|
38
43
|
|
@@ -53,6 +58,8 @@
|
|
53
58
|
|
54
59
|
meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
|
55
60
|
|
61
|
+
sequence_char = [CMcux];
|
62
|
+
|
56
63
|
zero_or_one = '?' | '??' | '?+';
|
57
64
|
zero_or_more = '*' | '*?' | '*+';
|
58
65
|
one_or_more = '+' | '+?' | '++';
|
@@ -90,21 +97,26 @@
|
|
90
97
|
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
91
98
|
|
92
99
|
group_ref = [gk];
|
93
|
-
|
94
|
-
|
95
|
-
group_number = '-'? . [1-9] .
|
100
|
+
group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
|
101
|
+
group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
|
102
|
+
group_number = '-'? . [1-9] . [0-9]*;
|
96
103
|
group_level = [+\-] . [0-9]+;
|
97
104
|
|
98
|
-
group_name = ('<' .
|
105
|
+
group_name = ('<' . group_name_id_ab? . '>') |
|
106
|
+
("'" . group_name_id_sq? . "'");
|
99
107
|
group_lookup = group_name | group_number;
|
100
108
|
|
101
109
|
group_named = ('?' . group_name );
|
102
110
|
|
103
|
-
|
104
|
-
|
111
|
+
group_name_backref = 'k' . (('<' . group_name_id_ab? . group_level? '>') |
|
112
|
+
("'" . group_name_id_sq? . group_level? "'"));
|
113
|
+
group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
|
114
|
+
("'" . group_name_id_sq? . group_level? "'"));
|
105
115
|
|
106
|
-
|
107
|
-
|
116
|
+
group_number_backref = 'k' . (('<' . group_number . group_level? '>') |
|
117
|
+
("'" . group_number . group_level? "'"));
|
118
|
+
group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
|
119
|
+
("'" . ((group_number . group_level?) | '0') "'"));
|
108
120
|
|
109
121
|
group_type = group_atomic | group_passive | group_absence | group_named;
|
110
122
|
|
@@ -115,7 +127,7 @@
|
|
115
127
|
|
116
128
|
# characters that 'break' a literal
|
117
129
|
meta_char = dot | backslash | alternation |
|
118
|
-
curlies |
|
130
|
+
curlies | parentheses | brackets |
|
119
131
|
line_anchor | quantifier_greedy;
|
120
132
|
|
121
133
|
literal_delimiters = ']' | '}';
|
@@ -123,25 +135,23 @@
|
|
123
135
|
ascii_print = ((0x20..0x7e) - meta_char - '#');
|
124
136
|
ascii_nonprint = (0x01..0x1f | 0x7f);
|
125
137
|
|
126
|
-
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
127
|
-
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
|
128
|
-
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
129
|
-
|
130
138
|
non_literal_escape = char_type_char | anchor_char | escaped_ascii |
|
131
|
-
keep_mark |
|
139
|
+
keep_mark | sequence_char;
|
140
|
+
|
141
|
+
# escapes that also work within a character set
|
142
|
+
set_escape = backslash | brackets | escaped_ascii | property_char |
|
143
|
+
sequence_char | single_codepoint_char_type;
|
132
144
|
|
133
|
-
non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
|
134
|
-
multi_codepoint_char_type | [0-9cCM];
|
135
145
|
|
136
146
|
# EOF error, used where it can be detected
|
137
147
|
action premature_end_error {
|
138
|
-
text =
|
148
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
139
149
|
raise PrematureEndError.new( text )
|
140
150
|
}
|
141
151
|
|
142
152
|
# Invalid sequence error, used from sequences, like escapes and sets
|
143
153
|
action invalid_sequence_error {
|
144
|
-
text =
|
154
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
145
155
|
validation_error(:sequence, 'sequence', text)
|
146
156
|
}
|
147
157
|
|
@@ -156,7 +166,7 @@
|
|
156
166
|
# --------------------------------------------------------------------------
|
157
167
|
character_set := |*
|
158
168
|
set_close > (set_meta, 2) @set_closed {
|
159
|
-
emit(:set, :close,
|
169
|
+
emit(:set, :close, copy(data, ts, te))
|
160
170
|
if in_set?
|
161
171
|
fret;
|
162
172
|
else
|
@@ -165,8 +175,8 @@
|
|
165
175
|
};
|
166
176
|
|
167
177
|
'-]' @set_closed { # special case, emits two tokens
|
168
|
-
emit(:literal, :literal, copy(data, ts
|
169
|
-
emit(:set, :close, copy(data, ts+1
|
178
|
+
emit(:literal, :literal, copy(data, ts, te-1))
|
179
|
+
emit(:set, :close, copy(data, ts+1, te))
|
170
180
|
if in_set?
|
171
181
|
fret;
|
172
182
|
else
|
@@ -175,33 +185,33 @@
|
|
175
185
|
};
|
176
186
|
|
177
187
|
'-&&' { # special case, emits two tokens
|
178
|
-
emit(:literal, :literal, '-'
|
179
|
-
emit(:set, :intersection, '&&'
|
188
|
+
emit(:literal, :literal, '-')
|
189
|
+
emit(:set, :intersection, '&&')
|
180
190
|
};
|
181
191
|
|
182
192
|
'^' {
|
183
|
-
text =
|
193
|
+
text = copy(data, ts, te)
|
184
194
|
if tokens.last[1] == :open
|
185
|
-
emit(:set, :negate, text
|
195
|
+
emit(:set, :negate, text)
|
186
196
|
else
|
187
|
-
emit(:literal, :literal, text
|
197
|
+
emit(:literal, :literal, text)
|
188
198
|
end
|
189
199
|
};
|
190
200
|
|
191
201
|
'-' {
|
192
|
-
text =
|
202
|
+
text = copy(data, ts, te)
|
193
203
|
# ranges can't start with a subset or intersection/negation/range operator
|
194
204
|
if tokens.last[0] == :set
|
195
|
-
emit(:literal, :literal, text
|
205
|
+
emit(:literal, :literal, text)
|
196
206
|
else
|
197
|
-
emit(:set, :range, text
|
207
|
+
emit(:set, :range, text)
|
198
208
|
end
|
199
209
|
};
|
200
210
|
|
201
211
|
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
202
212
|
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
203
213
|
'&&' {
|
204
|
-
emit(:set, :intersection,
|
214
|
+
emit(:set, :intersection, copy(data, ts, te))
|
205
215
|
};
|
206
216
|
|
207
217
|
backslash {
|
@@ -209,12 +219,12 @@
|
|
209
219
|
};
|
210
220
|
|
211
221
|
set_open >(open_bracket, 1) >set_opened {
|
212
|
-
emit(:set, :open,
|
222
|
+
emit(:set, :open, copy(data, ts, te))
|
213
223
|
fcall character_set;
|
214
224
|
};
|
215
225
|
|
216
226
|
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
217
|
-
text =
|
227
|
+
text = copy(data, ts, te)
|
218
228
|
|
219
229
|
type = :posixclass
|
220
230
|
class_name = text[2..-3]
|
@@ -223,45 +233,40 @@
|
|
223
233
|
type = :nonposixclass
|
224
234
|
end
|
225
235
|
|
226
|
-
emit(type, class_name.to_sym, text
|
236
|
+
emit(type, class_name.to_sym, text)
|
227
237
|
};
|
228
238
|
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
234
|
-
|
235
|
-
};
|
239
|
+
# These are not supported in ruby at the moment. Enable them if they are.
|
240
|
+
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
241
|
+
# emit(:set, :collation, copy(data, ts, te))
|
242
|
+
# };
|
243
|
+
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
244
|
+
# emit(:set, :equivalent, copy(data, ts, te))
|
245
|
+
# };
|
236
246
|
|
237
247
|
meta_char > (set_meta, 1) {
|
238
|
-
emit(:literal, :literal,
|
248
|
+
emit(:literal, :literal, copy(data, ts, te))
|
239
249
|
};
|
240
250
|
|
241
|
-
any
|
242
|
-
|
243
|
-
|
244
|
-
utf8_3_byte |
|
245
|
-
utf8_4_byte {
|
246
|
-
char, *rest = *text(data, ts, te)
|
247
|
-
char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
|
248
|
-
emit(:literal, :literal, char, *rest)
|
251
|
+
any | ascii_nonprint | utf8_multibyte {
|
252
|
+
text = copy(data, ts, te)
|
253
|
+
emit(:literal, :literal, text)
|
249
254
|
};
|
250
255
|
*|;
|
251
256
|
|
252
257
|
# set escapes scanner
|
253
258
|
# --------------------------------------------------------------------------
|
254
259
|
set_escape_sequence := |*
|
255
|
-
|
256
|
-
emit(:escape, :literal, *text(data, ts, te, 1))
|
257
|
-
fret;
|
258
|
-
};
|
259
|
-
|
260
|
-
any > (escaped_set_alpha, 1) {
|
260
|
+
set_escape > (escaped_set_alpha, 2) {
|
261
261
|
fhold;
|
262
262
|
fnext character_set;
|
263
263
|
fcall escape_sequence;
|
264
264
|
};
|
265
|
+
|
266
|
+
any > (escaped_set_alpha, 1) {
|
267
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
268
|
+
fret;
|
269
|
+
};
|
265
270
|
*|;
|
266
271
|
|
267
272
|
|
@@ -269,33 +274,33 @@
|
|
269
274
|
# --------------------------------------------------------------------------
|
270
275
|
escape_sequence := |*
|
271
276
|
[1-9] {
|
272
|
-
text =
|
273
|
-
emit(:backref, :number, text
|
277
|
+
text = copy(data, ts-1, te)
|
278
|
+
emit(:backref, :number, text)
|
274
279
|
fret;
|
275
280
|
};
|
276
281
|
|
277
282
|
octal_sequence {
|
278
|
-
emit(:escape, :octal,
|
283
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
279
284
|
fret;
|
280
285
|
};
|
281
286
|
|
282
287
|
meta_char {
|
283
|
-
case text =
|
284
|
-
when '\.'; emit(:escape, :dot, text
|
285
|
-
when '\|'; emit(:escape, :alternation, text
|
286
|
-
when '\^'; emit(:escape, :bol, text
|
287
|
-
when '\$'; emit(:escape, :eol, text
|
288
|
-
when '\?'; emit(:escape, :zero_or_one, text
|
289
|
-
when '\*'; emit(:escape, :zero_or_more, text
|
290
|
-
when '\+'; emit(:escape, :one_or_more, text
|
291
|
-
when '\('; emit(:escape, :group_open, text
|
292
|
-
when '\)'; emit(:escape, :group_close, text
|
293
|
-
when '\{'; emit(:escape, :interval_open, text
|
294
|
-
when '\}'; emit(:escape, :interval_close, text
|
295
|
-
when '\['; emit(:escape, :set_open, text
|
296
|
-
when '\]'; emit(:escape, :set_close, text
|
288
|
+
case text = copy(data, ts-1, te)
|
289
|
+
when '\.'; emit(:escape, :dot, text)
|
290
|
+
when '\|'; emit(:escape, :alternation, text)
|
291
|
+
when '\^'; emit(:escape, :bol, text)
|
292
|
+
when '\$'; emit(:escape, :eol, text)
|
293
|
+
when '\?'; emit(:escape, :zero_or_one, text)
|
294
|
+
when '\*'; emit(:escape, :zero_or_more, text)
|
295
|
+
when '\+'; emit(:escape, :one_or_more, text)
|
296
|
+
when '\('; emit(:escape, :group_open, text)
|
297
|
+
when '\)'; emit(:escape, :group_close, text)
|
298
|
+
when '\{'; emit(:escape, :interval_open, text)
|
299
|
+
when '\}'; emit(:escape, :interval_close, text)
|
300
|
+
when '\['; emit(:escape, :set_open, text)
|
301
|
+
when '\]'; emit(:escape, :set_close, text)
|
297
302
|
when "\\\\";
|
298
|
-
emit(:escape, :backslash, text
|
303
|
+
emit(:escape, :backslash, text)
|
299
304
|
end
|
300
305
|
fret;
|
301
306
|
};
|
@@ -303,31 +308,31 @@
|
|
303
308
|
escaped_ascii > (escaped_alpha, 7) {
|
304
309
|
# \b is emitted as backspace only when inside a character set, otherwise
|
305
310
|
# it is a word boundary anchor. A syntax might "normalize" it if needed.
|
306
|
-
case text =
|
307
|
-
when '\a'; emit(:escape, :bell, text
|
308
|
-
when '\b'; emit(:escape, :backspace, text
|
309
|
-
when '\e'; emit(:escape, :escape, text
|
310
|
-
when '\f'; emit(:escape, :form_feed, text
|
311
|
-
when '\n'; emit(:escape, :newline, text
|
312
|
-
when '\r'; emit(:escape, :carriage, text
|
313
|
-
when '\t'; emit(:escape, :tab, text
|
314
|
-
when '\v'; emit(:escape, :vertical_tab, text
|
311
|
+
case text = copy(data, ts-1, te)
|
312
|
+
when '\a'; emit(:escape, :bell, text)
|
313
|
+
when '\b'; emit(:escape, :backspace, text)
|
314
|
+
when '\e'; emit(:escape, :escape, text)
|
315
|
+
when '\f'; emit(:escape, :form_feed, text)
|
316
|
+
when '\n'; emit(:escape, :newline, text)
|
317
|
+
when '\r'; emit(:escape, :carriage, text)
|
318
|
+
when '\t'; emit(:escape, :tab, text)
|
319
|
+
when '\v'; emit(:escape, :vertical_tab, text)
|
315
320
|
end
|
316
321
|
fret;
|
317
322
|
};
|
318
323
|
|
319
324
|
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
320
|
-
text =
|
325
|
+
text = copy(data, ts-1, te)
|
321
326
|
if text[2].chr == '{'
|
322
|
-
emit(:escape, :codepoint_list, text
|
327
|
+
emit(:escape, :codepoint_list, text)
|
323
328
|
else
|
324
|
-
emit(:escape, :codepoint, text
|
329
|
+
emit(:escape, :codepoint, text)
|
325
330
|
end
|
326
331
|
fret;
|
327
332
|
};
|
328
333
|
|
329
|
-
hex_sequence > (escaped_alpha, 5)
|
330
|
-
emit(:escape, :hex,
|
334
|
+
hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
|
335
|
+
emit(:escape, :hex, copy(data, ts-1, te))
|
331
336
|
fret;
|
332
337
|
};
|
333
338
|
|
@@ -357,8 +362,8 @@
|
|
357
362
|
fcall unicode_property;
|
358
363
|
};
|
359
364
|
|
360
|
-
(any -- non_literal_escape) > (escaped_alpha, 1)
|
361
|
-
emit(:escape, :literal,
|
365
|
+
(any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
|
366
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
362
367
|
fret;
|
363
368
|
};
|
364
369
|
*|;
|
@@ -368,9 +373,9 @@
|
|
368
373
|
# --------------------------------------------------------------------------
|
369
374
|
conditional_expression := |*
|
370
375
|
group_lookup . ')' {
|
371
|
-
text =
|
372
|
-
emit(:conditional, :condition, text
|
373
|
-
emit(:conditional, :condition_close, ')'
|
376
|
+
text = copy(data, ts, te-1)
|
377
|
+
emit(:conditional, :condition, text)
|
378
|
+
emit(:conditional, :condition_close, ')')
|
374
379
|
};
|
375
380
|
|
376
381
|
any {
|
@@ -387,39 +392,39 @@
|
|
387
392
|
# Meta characters
|
388
393
|
# ------------------------------------------------------------------------
|
389
394
|
dot {
|
390
|
-
emit(:meta, :dot,
|
395
|
+
emit(:meta, :dot, copy(data, ts, te))
|
391
396
|
};
|
392
397
|
|
393
398
|
alternation {
|
394
399
|
if conditional_stack.last == group_depth
|
395
|
-
emit(:conditional, :separator,
|
400
|
+
emit(:conditional, :separator, copy(data, ts, te))
|
396
401
|
else
|
397
|
-
emit(:meta, :alternation,
|
402
|
+
emit(:meta, :alternation, copy(data, ts, te))
|
398
403
|
end
|
399
404
|
};
|
400
405
|
|
401
406
|
# Anchors
|
402
407
|
# ------------------------------------------------------------------------
|
403
408
|
beginning_of_line {
|
404
|
-
emit(:anchor, :bol,
|
409
|
+
emit(:anchor, :bol, copy(data, ts, te))
|
405
410
|
};
|
406
411
|
|
407
412
|
end_of_line {
|
408
|
-
emit(:anchor, :eol,
|
413
|
+
emit(:anchor, :eol, copy(data, ts, te))
|
409
414
|
};
|
410
415
|
|
411
416
|
backslash . keep_mark > (backslashed, 4) {
|
412
|
-
emit(:keep, :mark,
|
417
|
+
emit(:keep, :mark, copy(data, ts, te))
|
413
418
|
};
|
414
419
|
|
415
420
|
backslash . anchor_char > (backslashed, 3) {
|
416
|
-
case text =
|
417
|
-
when '\\A'; emit(:anchor, :bos, text
|
418
|
-
when '\\z'; emit(:anchor, :eos, text
|
419
|
-
when '\\Z'; emit(:anchor, :eos_ob_eol, text
|
420
|
-
when '\\b'; emit(:anchor, :word_boundary, text
|
421
|
-
when '\\B'; emit(:anchor, :nonword_boundary, text
|
422
|
-
when '\\G'; emit(:anchor, :match_start, text
|
421
|
+
case text = copy(data, ts, te)
|
422
|
+
when '\\A'; emit(:anchor, :bos, text)
|
423
|
+
when '\\z'; emit(:anchor, :eos, text)
|
424
|
+
when '\\Z'; emit(:anchor, :eos_ob_eol, text)
|
425
|
+
when '\\b'; emit(:anchor, :word_boundary, text)
|
426
|
+
when '\\B'; emit(:anchor, :nonword_boundary, text)
|
427
|
+
when '\\G'; emit(:anchor, :match_start, text)
|
423
428
|
end
|
424
429
|
};
|
425
430
|
|
@@ -430,7 +435,7 @@
|
|
430
435
|
# Character sets
|
431
436
|
# ------------------------------------------------------------------------
|
432
437
|
set_open >set_opened {
|
433
|
-
emit(:set, :open,
|
438
|
+
emit(:set, :open, copy(data, ts, te))
|
434
439
|
fcall character_set;
|
435
440
|
};
|
436
441
|
|
@@ -439,12 +444,12 @@
|
|
439
444
|
# (?(condition)Y|N) conditional expression
|
440
445
|
# ------------------------------------------------------------------------
|
441
446
|
conditional {
|
442
|
-
text =
|
447
|
+
text = copy(data, ts, te)
|
443
448
|
|
444
449
|
conditional_stack << group_depth
|
445
450
|
|
446
|
-
emit(:conditional, :open, text[0..-2]
|
447
|
-
emit(:conditional, :condition_open, '('
|
451
|
+
emit(:conditional, :open, text[0..-2])
|
452
|
+
emit(:conditional, :condition_open, '(')
|
448
453
|
fcall conditional_expression;
|
449
454
|
};
|
450
455
|
|
@@ -455,7 +460,7 @@
|
|
455
460
|
# correct closing count.
|
456
461
|
# ------------------------------------------------------------------------
|
457
462
|
group_open . group_comment $group_closed {
|
458
|
-
emit(:group, :comment,
|
463
|
+
emit(:group, :comment, copy(data, ts, te))
|
459
464
|
};
|
460
465
|
|
461
466
|
# Expression options:
|
@@ -470,11 +475,11 @@
|
|
470
475
|
# (?imxdau-imx:subexp) option on/off for subexp
|
471
476
|
# ------------------------------------------------------------------------
|
472
477
|
group_open . group_options >group_opened {
|
473
|
-
text =
|
478
|
+
text = copy(data, ts, te)
|
474
479
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
475
480
|
raise InvalidGroupOption.new($1 || "-#{$2}", text)
|
476
481
|
end
|
477
|
-
emit_options(text
|
482
|
+
emit_options(text)
|
478
483
|
};
|
479
484
|
|
480
485
|
# Assertions
|
@@ -484,11 +489,11 @@
|
|
484
489
|
# (?<!subexp) negative look-behind
|
485
490
|
# ------------------------------------------------------------------------
|
486
491
|
group_open . assertion_type >group_opened {
|
487
|
-
case text =
|
488
|
-
when '(?='; emit(:assertion, :lookahead, text
|
489
|
-
when '(?!'; emit(:assertion, :nlookahead, text
|
490
|
-
when '(?<='; emit(:assertion, :lookbehind, text
|
491
|
-
when '(?<!'; emit(:assertion, :nlookbehind, text
|
492
|
+
case text = copy(data, ts, te)
|
493
|
+
when '(?='; emit(:assertion, :lookahead, text)
|
494
|
+
when '(?!'; emit(:assertion, :nlookahead, text)
|
495
|
+
when '(?<='; emit(:assertion, :lookbehind, text)
|
496
|
+
when '(?<!'; emit(:assertion, :nlookbehind, text)
|
492
497
|
end
|
493
498
|
};
|
494
499
|
|
@@ -501,32 +506,32 @@
|
|
501
506
|
# (subexp) captured group
|
502
507
|
# ------------------------------------------------------------------------
|
503
508
|
group_open . group_type >group_opened {
|
504
|
-
case text =
|
505
|
-
when '(?:'; emit(:group, :passive, text
|
506
|
-
when '(?>'; emit(:group, :atomic, text
|
507
|
-
when '(?~'; emit(:group, :absence, text
|
509
|
+
case text = copy(data, ts, te)
|
510
|
+
when '(?:'; emit(:group, :passive, text)
|
511
|
+
when '(?>'; emit(:group, :atomic, text)
|
512
|
+
when '(?~'; emit(:group, :absence, text)
|
508
513
|
|
509
514
|
when /^\(\?(?:<>|'')/
|
510
515
|
validation_error(:group, 'named group', 'name is empty')
|
511
516
|
|
512
|
-
when /^\(
|
513
|
-
emit(:group, :named_ab, text
|
517
|
+
when /^\(\?<[^>]+>/
|
518
|
+
emit(:group, :named_ab, text)
|
514
519
|
|
515
|
-
when /^\(\?'
|
516
|
-
emit(:group, :named_sq, text
|
520
|
+
when /^\(\?'[^']+'/
|
521
|
+
emit(:group, :named_sq, text)
|
517
522
|
|
518
523
|
end
|
519
524
|
};
|
520
525
|
|
521
526
|
group_open @group_opened {
|
522
|
-
text =
|
523
|
-
emit(:group, :capture, text
|
527
|
+
text = copy(data, ts, te)
|
528
|
+
emit(:group, :capture, text)
|
524
529
|
};
|
525
530
|
|
526
531
|
group_close @group_closed {
|
527
532
|
if conditional_stack.last == group_depth + 1
|
528
533
|
conditional_stack.pop
|
529
|
-
emit(:conditional, :close,
|
534
|
+
emit(:conditional, :close, copy(data, ts, te))
|
530
535
|
else
|
531
536
|
if spacing_stack.length > 1 &&
|
532
537
|
spacing_stack.last[:depth] == group_depth + 1
|
@@ -534,72 +539,42 @@
|
|
534
539
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
535
540
|
end
|
536
541
|
|
537
|
-
emit(:group, :close,
|
542
|
+
emit(:group, :close, copy(data, ts, te))
|
538
543
|
end
|
539
544
|
};
|
540
545
|
|
541
546
|
|
542
547
|
# Group backreference, named and numbered
|
543
548
|
# ------------------------------------------------------------------------
|
544
|
-
backslash . (
|
545
|
-
case text =
|
546
|
-
when /^\\(
|
547
|
-
validation_error(:backref, '
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
when /^\\(
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
emit(:backref, :name_call_sq, text, ts, te)
|
561
|
-
end
|
562
|
-
|
563
|
-
when /^\\([gk])<\d+>/ # angle-brackets
|
564
|
-
if $1 == 'k'
|
565
|
-
emit(:backref, :number_ref_ab, text, ts, te)
|
566
|
-
else
|
567
|
-
emit(:backref, :number_call_ab, text, ts, te)
|
568
|
-
end
|
569
|
-
|
570
|
-
when /^\\([gk])'\d+'/ # single quotes
|
571
|
-
if $1 == 'k'
|
572
|
-
emit(:backref, :number_ref_sq, text, ts, te)
|
573
|
-
else
|
574
|
-
emit(:backref, :number_call_sq, text, ts, te)
|
575
|
-
end
|
576
|
-
|
577
|
-
when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
|
578
|
-
if $1 == 'k'
|
579
|
-
emit(:backref, :number_rel_ref_ab, text, ts, te)
|
580
|
-
else
|
581
|
-
emit(:backref, :number_rel_call_ab, text, ts, te)
|
582
|
-
end
|
583
|
-
|
584
|
-
when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
|
585
|
-
if $1 == 'k'
|
586
|
-
emit(:backref, :number_rel_ref_sq, text, ts, te)
|
587
|
-
else
|
588
|
-
emit(:backref, :number_rel_call_sq, text, ts, te)
|
589
|
-
end
|
590
|
-
|
591
|
-
when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
|
592
|
-
emit(:backref, :name_recursion_ref_ab, text, ts, te)
|
593
|
-
|
594
|
-
when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
|
595
|
-
emit(:backref, :name_recursion_ref_sq, text, ts, te)
|
596
|
-
|
597
|
-
when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
|
598
|
-
emit(:backref, :number_recursion_ref_ab, text, ts, te)
|
599
|
-
|
600
|
-
when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
|
601
|
-
emit(:backref, :number_recursion_ref_sq, text, ts, te)
|
549
|
+
backslash . (group_name_backref | group_number_backref) > (backslashed, 4) {
|
550
|
+
case text = copy(data, ts, te)
|
551
|
+
when /^\\k(<>|'')/
|
552
|
+
validation_error(:backref, 'backreference', 'ref ID is empty')
|
553
|
+
when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
|
554
|
+
emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
|
555
|
+
when /^\\k(.)\d+\D$/
|
556
|
+
emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
|
557
|
+
when /^\\k(.)-\d+\D$/
|
558
|
+
emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
|
559
|
+
when /^\\k(.)[^\p{digit}\-].*[+\-]\d+\D$/
|
560
|
+
emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
|
561
|
+
when /^\\k(.)-?\d+[+\-]\d+\D$/
|
562
|
+
emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
|
563
|
+
end
|
564
|
+
};
|
602
565
|
|
566
|
+
# Group call, named and numbered
|
567
|
+
# ------------------------------------------------------------------------
|
568
|
+
backslash . (group_name_call | group_number_call) > (backslashed, 4) {
|
569
|
+
case text = copy(data, ts, te)
|
570
|
+
when /^\\g(<>|'')/
|
571
|
+
validation_error(:backref, 'subexpression call', 'ref ID is empty')
|
572
|
+
when /^\\g(.)[^\p{digit}+\->][^+\-]*/
|
573
|
+
emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
|
574
|
+
when /^\\g(.)\d+\D$/
|
575
|
+
emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
|
576
|
+
when /^\\g(.)[+-]\d+/
|
577
|
+
emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
|
603
578
|
end
|
604
579
|
};
|
605
580
|
|
@@ -607,31 +582,31 @@
|
|
607
582
|
# Quantifiers
|
608
583
|
# ------------------------------------------------------------------------
|
609
584
|
zero_or_one {
|
610
|
-
case text =
|
611
|
-
when '?' ; emit(:quantifier, :zero_or_one, text
|
612
|
-
when '??'; emit(:quantifier, :zero_or_one_reluctant, text
|
613
|
-
when '?+'; emit(:quantifier, :zero_or_one_possessive, text
|
585
|
+
case text = copy(data, ts, te)
|
586
|
+
when '?' ; emit(:quantifier, :zero_or_one, text)
|
587
|
+
when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
|
588
|
+
when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
|
614
589
|
end
|
615
590
|
};
|
616
591
|
|
617
592
|
zero_or_more {
|
618
|
-
case text =
|
619
|
-
when '*' ; emit(:quantifier, :zero_or_more, text
|
620
|
-
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text
|
621
|
-
when '*+'; emit(:quantifier, :zero_or_more_possessive, text
|
593
|
+
case text = copy(data, ts, te)
|
594
|
+
when '*' ; emit(:quantifier, :zero_or_more, text)
|
595
|
+
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
|
596
|
+
when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
|
622
597
|
end
|
623
598
|
};
|
624
599
|
|
625
600
|
one_or_more {
|
626
|
-
case text =
|
627
|
-
when '+' ; emit(:quantifier, :one_or_more, text
|
628
|
-
when '+?'; emit(:quantifier, :one_or_more_reluctant, text
|
629
|
-
when '++'; emit(:quantifier, :one_or_more_possessive, text
|
601
|
+
case text = copy(data, ts, te)
|
602
|
+
when '+' ; emit(:quantifier, :one_or_more, text)
|
603
|
+
when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
|
604
|
+
when '++'; emit(:quantifier, :one_or_more_possessive, text)
|
630
605
|
end
|
631
606
|
};
|
632
607
|
|
633
608
|
quantifier_interval {
|
634
|
-
emit(:quantifier, :interval,
|
609
|
+
emit(:quantifier, :interval, copy(data, ts, te))
|
635
610
|
};
|
636
611
|
|
637
612
|
# Catch unmatched curly braces as literals
|
@@ -647,7 +622,7 @@
|
|
647
622
|
|
648
623
|
comment {
|
649
624
|
if free_spacing
|
650
|
-
emit(:free_space, :comment,
|
625
|
+
emit(:free_space, :comment, copy(data, ts, te))
|
651
626
|
else
|
652
627
|
# consume only the pound sign (#) and backtrack to do regular scanning
|
653
628
|
append_literal(data, ts, ts + 1)
|
@@ -657,7 +632,7 @@
|
|
657
632
|
|
658
633
|
space+ {
|
659
634
|
if free_spacing
|
660
|
-
emit(:free_space, :whitespace,
|
635
|
+
emit(:free_space, :whitespace, copy(data, ts, te))
|
661
636
|
else
|
662
637
|
append_literal(data, ts, te)
|
663
638
|
end
|
@@ -666,11 +641,7 @@
|
|
666
641
|
# Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
|
667
642
|
# except meta characters.
|
668
643
|
# ------------------------------------------------------------------------
|
669
|
-
(ascii_print -- space)+
|
670
|
-
ascii_nonprint+ |
|
671
|
-
utf8_2_byte+ |
|
672
|
-
utf8_3_byte+ |
|
673
|
-
utf8_4_byte+ {
|
644
|
+
(ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
|
674
645
|
append_literal(data, ts, te)
|
675
646
|
};
|
676
647
|
|
@@ -682,10 +653,10 @@
|
|
682
653
|
|
683
654
|
class Regexp::Scanner
|
684
655
|
# General scanner error (catch all)
|
685
|
-
class ScannerError <
|
656
|
+
class ScannerError < Regexp::Parser::Error; end
|
686
657
|
|
687
658
|
# Base for all scanner validation errors
|
688
|
-
class ValidationError <
|
659
|
+
class ValidationError < Regexp::Parser::Error
|
689
660
|
def initialize(reason)
|
690
661
|
super reason
|
691
662
|
end
|
@@ -760,6 +731,7 @@ class Regexp::Scanner
|
|
760
731
|
self.set_depth = 0
|
761
732
|
self.group_depth = 0
|
762
733
|
self.conditional_stack = []
|
734
|
+
self.char_pos = 0
|
763
735
|
|
764
736
|
%% write data;
|
765
737
|
%% write init;
|
@@ -769,7 +741,7 @@ class Regexp::Scanner
|
|
769
741
|
testEof = testEof
|
770
742
|
|
771
743
|
if cs == re_scanner_error
|
772
|
-
text =
|
744
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
773
745
|
raise ScannerError.new("Scan error at '#{text}'")
|
774
746
|
end
|
775
747
|
|
@@ -786,7 +758,7 @@ class Regexp::Scanner
|
|
786
758
|
|
787
759
|
# lazy-load property maps when first needed
|
788
760
|
require 'yaml'
|
789
|
-
PROP_MAPS_DIR = File.
|
761
|
+
PROP_MAPS_DIR = File.join(__dir__, 'scanner', 'properties')
|
790
762
|
|
791
763
|
def self.short_prop_map
|
792
764
|
@short_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/short.yml")
|
@@ -797,22 +769,29 @@ class Regexp::Scanner
|
|
797
769
|
end
|
798
770
|
|
799
771
|
# Emits an array with the details of the scanned pattern
|
800
|
-
def emit(type, token, text
|
772
|
+
def emit(type, token, text)
|
801
773
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
802
774
|
|
803
775
|
emit_literal if literal
|
804
776
|
|
777
|
+
# Ragel runs with byte-based indices (ts, te). These are of little value to
|
778
|
+
# end-users, so we keep track of char-based indices and emit those instead.
|
779
|
+
ts_char_pos = char_pos
|
780
|
+
te_char_pos = char_pos + text.length
|
781
|
+
|
805
782
|
if block
|
806
|
-
block.call type, token, text,
|
783
|
+
block.call type, token, text, ts_char_pos, te_char_pos
|
807
784
|
end
|
808
785
|
|
809
|
-
tokens << [type, token, text,
|
786
|
+
tokens << [type, token, text, ts_char_pos, te_char_pos]
|
787
|
+
|
788
|
+
self.char_pos = te_char_pos
|
810
789
|
end
|
811
790
|
|
812
791
|
private
|
813
792
|
|
814
793
|
attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
|
815
|
-
:group_depth, :set_depth, :conditional_stack
|
794
|
+
:group_depth, :set_depth, :conditional_stack, :char_pos
|
816
795
|
|
817
796
|
def free_spacing?(input_object, options)
|
818
797
|
if options && !input_object.is_a?(String)
|
@@ -835,36 +814,25 @@ class Regexp::Scanner
|
|
835
814
|
end
|
836
815
|
|
837
816
|
# Copy from ts to te from data as text
|
838
|
-
def copy(data,
|
839
|
-
data[
|
840
|
-
end
|
841
|
-
|
842
|
-
# Copy from ts to te from data as text, returning an array with the text
|
843
|
-
# and the offsets used to copy it.
|
844
|
-
def text(data, ts, te, soff = 0)
|
845
|
-
[copy(data, ts-soff..te-1), ts-soff, te]
|
817
|
+
def copy(data, ts, te)
|
818
|
+
data[ts...te].pack('c*').force_encoding('utf-8')
|
846
819
|
end
|
847
820
|
|
848
821
|
# Appends one or more characters to the literal buffer, to be emitted later
|
849
|
-
# by a call to emit_literal.
|
822
|
+
# by a call to emit_literal.
|
850
823
|
def append_literal(data, ts, te)
|
851
824
|
self.literal = literal || []
|
852
|
-
literal <<
|
825
|
+
literal << copy(data, ts, te)
|
853
826
|
end
|
854
827
|
|
855
|
-
# Emits the literal run collected by calls to the append_literal method
|
856
|
-
# using the total start (ts) and end (te) offsets of the run.
|
828
|
+
# Emits the literal run collected by calls to the append_literal method.
|
857
829
|
def emit_literal
|
858
|
-
|
859
|
-
text = literal.map {|t| t[0]}.join
|
860
|
-
|
861
|
-
text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
|
862
|
-
|
830
|
+
text = literal.join
|
863
831
|
self.literal = nil
|
864
|
-
emit(:literal, :literal, text
|
832
|
+
emit(:literal, :literal, text)
|
865
833
|
end
|
866
834
|
|
867
|
-
def emit_options(text
|
835
|
+
def emit_options(text)
|
868
836
|
token = nil
|
869
837
|
|
870
838
|
# Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
|
@@ -890,14 +858,14 @@ class Regexp::Scanner
|
|
890
858
|
token = :options_switch
|
891
859
|
end
|
892
860
|
|
893
|
-
emit(:group, token, text
|
861
|
+
emit(:group, token, text)
|
894
862
|
end
|
895
863
|
|
896
864
|
def emit_meta_control_sequence(data, ts, te, token)
|
897
865
|
if data.last < 0x00 || data.last > 0x7F
|
898
866
|
validation_error(:sequence, 'escape', token.to_s)
|
899
867
|
end
|
900
|
-
emit(:escape, token,
|
868
|
+
emit(:escape, token, copy(data, ts-1, te))
|
901
869
|
end
|
902
870
|
|
903
871
|
# Centralizes and unifies the handling of validation related
|