regexp_parser 1.8.1 → 2.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +70 -0
- data/Gemfile +1 -0
- data/README.md +12 -11
- data/Rakefile +2 -2
- data/lib/regexp_parser/expression.rb +10 -19
- data/lib/regexp_parser/expression/classes/free_space.rb +1 -1
- data/lib/regexp_parser/expression/classes/group.rb +22 -2
- data/lib/regexp_parser/expression/classes/root.rb +4 -16
- data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
- data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
- data/lib/regexp_parser/expression/quantifier.rb +9 -0
- data/lib/regexp_parser/expression/sequence.rb +0 -10
- data/lib/regexp_parser/lexer.rb +2 -2
- data/lib/regexp_parser/parser.rb +27 -2
- data/lib/regexp_parser/scanner.rb +1194 -1272
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +178 -186
- data/lib/regexp_parser/syntax.rb +4 -4
- data/lib/regexp_parser/syntax/any.rb +2 -2
- data/lib/regexp_parser/syntax/base.rb +1 -1
- data/lib/regexp_parser/syntax/version_lookup.rb +4 -4
- data/lib/regexp_parser/version.rb +1 -1
- data/spec/expression/base_spec.rb +10 -0
- data/spec/expression/subexpression_spec.rb +1 -1
- data/spec/expression/to_s_spec.rb +39 -31
- data/spec/lexer/literals_spec.rb +24 -49
- data/spec/parser/errors_spec.rb +1 -1
- data/spec/parser/escapes_spec.rb +1 -1
- data/spec/parser/quantifiers_spec.rb +16 -0
- data/spec/parser/set/ranges_spec.rb +3 -3
- data/spec/scanner/escapes_spec.rb +7 -0
- data/spec/scanner/groups_spec.rb +10 -1
- data/spec/scanner/literals_spec.rb +28 -38
- data/spec/scanner/quantifiers_spec.rb +18 -13
- data/spec/scanner/sets_spec.rb +23 -5
- data/spec/spec_helper.rb +1 -0
- metadata +56 -60
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
@@ -10,17 +10,17 @@
|
|
10
10
|
# --------------------------------------------------------------------------
|
11
11
|
char_type := |*
|
12
12
|
char_type_char {
|
13
|
-
case text =
|
14
|
-
when '\d'; emit(:type, :digit, text
|
15
|
-
when '\D'; emit(:type, :nondigit, text
|
16
|
-
when '\h'; emit(:type, :hex, text
|
17
|
-
when '\H'; emit(:type, :nonhex, text
|
18
|
-
when '\s'; emit(:type, :space, text
|
19
|
-
when '\S'; emit(:type, :nonspace, text
|
20
|
-
when '\w'; emit(:type, :word, text
|
21
|
-
when '\W'; emit(:type, :nonword, text
|
22
|
-
when '\R'; emit(:type, :linebreak, text
|
23
|
-
when '\X'; emit(:type, :xgrapheme, text
|
13
|
+
case text = copy(data, ts-1, te)
|
14
|
+
when '\d'; emit(:type, :digit, text)
|
15
|
+
when '\D'; emit(:type, :nondigit, text)
|
16
|
+
when '\h'; emit(:type, :hex, text)
|
17
|
+
when '\H'; emit(:type, :nonhex, text)
|
18
|
+
when '\s'; emit(:type, :space, text)
|
19
|
+
when '\S'; emit(:type, :nonspace, text)
|
20
|
+
when '\w'; emit(:type, :word, text)
|
21
|
+
when '\W'; emit(:type, :nonword, text)
|
22
|
+
when '\R'; emit(:type, :linebreak, text)
|
23
|
+
when '\X'; emit(:type, :xgrapheme, text)
|
24
24
|
end
|
25
25
|
fret;
|
26
26
|
};
|
@@ -14,7 +14,7 @@
|
|
14
14
|
unicode_property := |*
|
15
15
|
|
16
16
|
property_sequence < eof(premature_property_end) {
|
17
|
-
text =
|
17
|
+
text = copy(data, ts-1, te)
|
18
18
|
type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
|
19
19
|
|
20
20
|
name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
|
@@ -22,7 +22,7 @@
|
|
22
22
|
token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
|
23
23
|
raise UnknownUnicodePropertyError.new(name) unless token
|
24
24
|
|
25
|
-
self.emit(type, token.to_sym, text
|
25
|
+
self.emit(type, token.to_sym, text)
|
26
26
|
|
27
27
|
fret;
|
28
28
|
};
|
@@ -3,6 +3,11 @@
|
|
3
3
|
include re_char_type "char_type.rl";
|
4
4
|
include re_property "property.rl";
|
5
5
|
|
6
|
+
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
7
|
+
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
|
8
|
+
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
9
|
+
utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
|
10
|
+
|
6
11
|
dot = '.';
|
7
12
|
backslash = '\\';
|
8
13
|
alternation = '|';
|
@@ -32,7 +37,7 @@
|
|
32
37
|
class_posix = ('[:' . '^'? . class_name_posix . ':]');
|
33
38
|
|
34
39
|
|
35
|
-
# these are not supported in ruby
|
40
|
+
# these are not supported in ruby at the moment
|
36
41
|
collating_sequence = '[.' . (alpha | [\-])+ . '.]';
|
37
42
|
character_equivalent = '[=' . alpha . '=]';
|
38
43
|
|
@@ -90,18 +95,19 @@
|
|
90
95
|
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
91
96
|
|
92
97
|
group_ref = [gk];
|
93
|
-
|
94
|
-
|
95
|
-
group_number = '-'? . [1-9] .
|
98
|
+
group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
|
99
|
+
group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
|
100
|
+
group_number = '-'? . [1-9] . [0-9]*;
|
96
101
|
group_level = [+\-] . [0-9]+;
|
97
102
|
|
98
|
-
group_name = ('<' .
|
103
|
+
group_name = ('<' . group_name_id_ab? . '>') |
|
104
|
+
("'" . group_name_id_sq? . "'");
|
99
105
|
group_lookup = group_name | group_number;
|
100
106
|
|
101
107
|
group_named = ('?' . group_name );
|
102
108
|
|
103
|
-
group_name_ref = group_ref . (('<' .
|
104
|
-
("'" .
|
109
|
+
group_name_ref = group_ref . (('<' . group_name_id_ab? . group_level? '>') |
|
110
|
+
("'" . group_name_id_sq? . group_level? "'"));
|
105
111
|
|
106
112
|
group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
|
107
113
|
("'" . group_number . group_level? "'"));
|
@@ -123,10 +129,6 @@
|
|
123
129
|
ascii_print = ((0x20..0x7e) - meta_char - '#');
|
124
130
|
ascii_nonprint = (0x01..0x1f | 0x7f);
|
125
131
|
|
126
|
-
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
127
|
-
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
|
128
|
-
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
129
|
-
|
130
132
|
non_literal_escape = char_type_char | anchor_char | escaped_ascii |
|
131
133
|
keep_mark | [xucCM];
|
132
134
|
|
@@ -135,13 +137,13 @@
|
|
135
137
|
|
136
138
|
# EOF error, used where it can be detected
|
137
139
|
action premature_end_error {
|
138
|
-
text =
|
140
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
139
141
|
raise PrematureEndError.new( text )
|
140
142
|
}
|
141
143
|
|
142
144
|
# Invalid sequence error, used from sequences, like escapes and sets
|
143
145
|
action invalid_sequence_error {
|
144
|
-
text =
|
146
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
145
147
|
validation_error(:sequence, 'sequence', text)
|
146
148
|
}
|
147
149
|
|
@@ -156,7 +158,7 @@
|
|
156
158
|
# --------------------------------------------------------------------------
|
157
159
|
character_set := |*
|
158
160
|
set_close > (set_meta, 2) @set_closed {
|
159
|
-
emit(:set, :close,
|
161
|
+
emit(:set, :close, copy(data, ts, te))
|
160
162
|
if in_set?
|
161
163
|
fret;
|
162
164
|
else
|
@@ -165,8 +167,8 @@
|
|
165
167
|
};
|
166
168
|
|
167
169
|
'-]' @set_closed { # special case, emits two tokens
|
168
|
-
emit(:literal, :literal, copy(data, ts
|
169
|
-
emit(:set, :close, copy(data, ts+1
|
170
|
+
emit(:literal, :literal, copy(data, ts, te-1))
|
171
|
+
emit(:set, :close, copy(data, ts+1, te))
|
170
172
|
if in_set?
|
171
173
|
fret;
|
172
174
|
else
|
@@ -175,33 +177,33 @@
|
|
175
177
|
};
|
176
178
|
|
177
179
|
'-&&' { # special case, emits two tokens
|
178
|
-
emit(:literal, :literal, '-'
|
179
|
-
emit(:set, :intersection, '&&'
|
180
|
+
emit(:literal, :literal, '-')
|
181
|
+
emit(:set, :intersection, '&&')
|
180
182
|
};
|
181
183
|
|
182
184
|
'^' {
|
183
|
-
text =
|
185
|
+
text = copy(data, ts, te)
|
184
186
|
if tokens.last[1] == :open
|
185
|
-
emit(:set, :negate, text
|
187
|
+
emit(:set, :negate, text)
|
186
188
|
else
|
187
|
-
emit(:literal, :literal, text
|
189
|
+
emit(:literal, :literal, text)
|
188
190
|
end
|
189
191
|
};
|
190
192
|
|
191
193
|
'-' {
|
192
|
-
text =
|
194
|
+
text = copy(data, ts, te)
|
193
195
|
# ranges can't start with a subset or intersection/negation/range operator
|
194
196
|
if tokens.last[0] == :set
|
195
|
-
emit(:literal, :literal, text
|
197
|
+
emit(:literal, :literal, text)
|
196
198
|
else
|
197
|
-
emit(:set, :range, text
|
199
|
+
emit(:set, :range, text)
|
198
200
|
end
|
199
201
|
};
|
200
202
|
|
201
203
|
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
202
204
|
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
203
205
|
'&&' {
|
204
|
-
emit(:set, :intersection,
|
206
|
+
emit(:set, :intersection, copy(data, ts, te))
|
205
207
|
};
|
206
208
|
|
207
209
|
backslash {
|
@@ -209,12 +211,12 @@
|
|
209
211
|
};
|
210
212
|
|
211
213
|
set_open >(open_bracket, 1) >set_opened {
|
212
|
-
emit(:set, :open,
|
214
|
+
emit(:set, :open, copy(data, ts, te))
|
213
215
|
fcall character_set;
|
214
216
|
};
|
215
217
|
|
216
218
|
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
217
|
-
text =
|
219
|
+
text = copy(data, ts, te)
|
218
220
|
|
219
221
|
type = :posixclass
|
220
222
|
class_name = text[2..-3]
|
@@ -223,29 +225,24 @@
|
|
223
225
|
type = :nonposixclass
|
224
226
|
end
|
225
227
|
|
226
|
-
emit(type, class_name.to_sym, text
|
228
|
+
emit(type, class_name.to_sym, text)
|
227
229
|
};
|
228
230
|
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
234
|
-
|
235
|
-
};
|
231
|
+
# These are not supported in ruby at the moment. Enable them if they are.
|
232
|
+
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
233
|
+
# emit(:set, :collation, copy(data, ts, te))
|
234
|
+
# };
|
235
|
+
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
236
|
+
# emit(:set, :equivalent, copy(data, ts, te))
|
237
|
+
# };
|
236
238
|
|
237
239
|
meta_char > (set_meta, 1) {
|
238
|
-
emit(:literal, :literal,
|
240
|
+
emit(:literal, :literal, copy(data, ts, te))
|
239
241
|
};
|
240
242
|
|
241
|
-
any
|
242
|
-
|
243
|
-
|
244
|
-
utf8_3_byte |
|
245
|
-
utf8_4_byte {
|
246
|
-
char, *rest = *text(data, ts, te)
|
247
|
-
char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
|
248
|
-
emit(:literal, :literal, char, *rest)
|
243
|
+
any | ascii_nonprint | utf8_multibyte {
|
244
|
+
text = copy(data, ts, te)
|
245
|
+
emit(:literal, :literal, text)
|
249
246
|
};
|
250
247
|
*|;
|
251
248
|
|
@@ -253,7 +250,7 @@
|
|
253
250
|
# --------------------------------------------------------------------------
|
254
251
|
set_escape_sequence := |*
|
255
252
|
non_set_escape > (escaped_set_alpha, 2) {
|
256
|
-
emit(:escape, :literal,
|
253
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
257
254
|
fret;
|
258
255
|
};
|
259
256
|
|
@@ -269,33 +266,33 @@
|
|
269
266
|
# --------------------------------------------------------------------------
|
270
267
|
escape_sequence := |*
|
271
268
|
[1-9] {
|
272
|
-
text =
|
273
|
-
emit(:backref, :number, text
|
269
|
+
text = copy(data, ts-1, te)
|
270
|
+
emit(:backref, :number, text)
|
274
271
|
fret;
|
275
272
|
};
|
276
273
|
|
277
274
|
octal_sequence {
|
278
|
-
emit(:escape, :octal,
|
275
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
279
276
|
fret;
|
280
277
|
};
|
281
278
|
|
282
279
|
meta_char {
|
283
|
-
case text =
|
284
|
-
when '\.'; emit(:escape, :dot, text
|
285
|
-
when '\|'; emit(:escape, :alternation, text
|
286
|
-
when '\^'; emit(:escape, :bol, text
|
287
|
-
when '\$'; emit(:escape, :eol, text
|
288
|
-
when '\?'; emit(:escape, :zero_or_one, text
|
289
|
-
when '\*'; emit(:escape, :zero_or_more, text
|
290
|
-
when '\+'; emit(:escape, :one_or_more, text
|
291
|
-
when '\('; emit(:escape, :group_open, text
|
292
|
-
when '\)'; emit(:escape, :group_close, text
|
293
|
-
when '\{'; emit(:escape, :interval_open, text
|
294
|
-
when '\}'; emit(:escape, :interval_close, text
|
295
|
-
when '\['; emit(:escape, :set_open, text
|
296
|
-
when '\]'; emit(:escape, :set_close, text
|
280
|
+
case text = copy(data, ts-1, te)
|
281
|
+
when '\.'; emit(:escape, :dot, text)
|
282
|
+
when '\|'; emit(:escape, :alternation, text)
|
283
|
+
when '\^'; emit(:escape, :bol, text)
|
284
|
+
when '\$'; emit(:escape, :eol, text)
|
285
|
+
when '\?'; emit(:escape, :zero_or_one, text)
|
286
|
+
when '\*'; emit(:escape, :zero_or_more, text)
|
287
|
+
when '\+'; emit(:escape, :one_or_more, text)
|
288
|
+
when '\('; emit(:escape, :group_open, text)
|
289
|
+
when '\)'; emit(:escape, :group_close, text)
|
290
|
+
when '\{'; emit(:escape, :interval_open, text)
|
291
|
+
when '\}'; emit(:escape, :interval_close, text)
|
292
|
+
when '\['; emit(:escape, :set_open, text)
|
293
|
+
when '\]'; emit(:escape, :set_close, text)
|
297
294
|
when "\\\\";
|
298
|
-
emit(:escape, :backslash, text
|
295
|
+
emit(:escape, :backslash, text)
|
299
296
|
end
|
300
297
|
fret;
|
301
298
|
};
|
@@ -303,31 +300,31 @@
|
|
303
300
|
escaped_ascii > (escaped_alpha, 7) {
|
304
301
|
# \b is emitted as backspace only when inside a character set, otherwise
|
305
302
|
# it is a word boundary anchor. A syntax might "normalize" it if needed.
|
306
|
-
case text =
|
307
|
-
when '\a'; emit(:escape, :bell, text
|
308
|
-
when '\b'; emit(:escape, :backspace, text
|
309
|
-
when '\e'; emit(:escape, :escape, text
|
310
|
-
when '\f'; emit(:escape, :form_feed, text
|
311
|
-
when '\n'; emit(:escape, :newline, text
|
312
|
-
when '\r'; emit(:escape, :carriage, text
|
313
|
-
when '\t'; emit(:escape, :tab, text
|
314
|
-
when '\v'; emit(:escape, :vertical_tab, text
|
303
|
+
case text = copy(data, ts-1, te)
|
304
|
+
when '\a'; emit(:escape, :bell, text)
|
305
|
+
when '\b'; emit(:escape, :backspace, text)
|
306
|
+
when '\e'; emit(:escape, :escape, text)
|
307
|
+
when '\f'; emit(:escape, :form_feed, text)
|
308
|
+
when '\n'; emit(:escape, :newline, text)
|
309
|
+
when '\r'; emit(:escape, :carriage, text)
|
310
|
+
when '\t'; emit(:escape, :tab, text)
|
311
|
+
when '\v'; emit(:escape, :vertical_tab, text)
|
315
312
|
end
|
316
313
|
fret;
|
317
314
|
};
|
318
315
|
|
319
316
|
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
320
|
-
text =
|
317
|
+
text = copy(data, ts-1, te)
|
321
318
|
if text[2].chr == '{'
|
322
|
-
emit(:escape, :codepoint_list, text
|
319
|
+
emit(:escape, :codepoint_list, text)
|
323
320
|
else
|
324
|
-
emit(:escape, :codepoint, text
|
321
|
+
emit(:escape, :codepoint, text)
|
325
322
|
end
|
326
323
|
fret;
|
327
324
|
};
|
328
325
|
|
329
|
-
hex_sequence > (escaped_alpha, 5)
|
330
|
-
emit(:escape, :hex,
|
326
|
+
hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
|
327
|
+
emit(:escape, :hex, copy(data, ts-1, te))
|
331
328
|
fret;
|
332
329
|
};
|
333
330
|
|
@@ -357,8 +354,8 @@
|
|
357
354
|
fcall unicode_property;
|
358
355
|
};
|
359
356
|
|
360
|
-
(any -- non_literal_escape) > (escaped_alpha, 1)
|
361
|
-
emit(:escape, :literal,
|
357
|
+
(any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
|
358
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
362
359
|
fret;
|
363
360
|
};
|
364
361
|
*|;
|
@@ -368,9 +365,9 @@
|
|
368
365
|
# --------------------------------------------------------------------------
|
369
366
|
conditional_expression := |*
|
370
367
|
group_lookup . ')' {
|
371
|
-
text =
|
372
|
-
emit(:conditional, :condition, text
|
373
|
-
emit(:conditional, :condition_close, ')'
|
368
|
+
text = copy(data, ts, te-1)
|
369
|
+
emit(:conditional, :condition, text)
|
370
|
+
emit(:conditional, :condition_close, ')')
|
374
371
|
};
|
375
372
|
|
376
373
|
any {
|
@@ -387,39 +384,39 @@
|
|
387
384
|
# Meta characters
|
388
385
|
# ------------------------------------------------------------------------
|
389
386
|
dot {
|
390
|
-
emit(:meta, :dot,
|
387
|
+
emit(:meta, :dot, copy(data, ts, te))
|
391
388
|
};
|
392
389
|
|
393
390
|
alternation {
|
394
391
|
if conditional_stack.last == group_depth
|
395
|
-
emit(:conditional, :separator,
|
392
|
+
emit(:conditional, :separator, copy(data, ts, te))
|
396
393
|
else
|
397
|
-
emit(:meta, :alternation,
|
394
|
+
emit(:meta, :alternation, copy(data, ts, te))
|
398
395
|
end
|
399
396
|
};
|
400
397
|
|
401
398
|
# Anchors
|
402
399
|
# ------------------------------------------------------------------------
|
403
400
|
beginning_of_line {
|
404
|
-
emit(:anchor, :bol,
|
401
|
+
emit(:anchor, :bol, copy(data, ts, te))
|
405
402
|
};
|
406
403
|
|
407
404
|
end_of_line {
|
408
|
-
emit(:anchor, :eol,
|
405
|
+
emit(:anchor, :eol, copy(data, ts, te))
|
409
406
|
};
|
410
407
|
|
411
408
|
backslash . keep_mark > (backslashed, 4) {
|
412
|
-
emit(:keep, :mark,
|
409
|
+
emit(:keep, :mark, copy(data, ts, te))
|
413
410
|
};
|
414
411
|
|
415
412
|
backslash . anchor_char > (backslashed, 3) {
|
416
|
-
case text =
|
417
|
-
when '\\A'; emit(:anchor, :bos, text
|
418
|
-
when '\\z'; emit(:anchor, :eos, text
|
419
|
-
when '\\Z'; emit(:anchor, :eos_ob_eol, text
|
420
|
-
when '\\b'; emit(:anchor, :word_boundary, text
|
421
|
-
when '\\B'; emit(:anchor, :nonword_boundary, text
|
422
|
-
when '\\G'; emit(:anchor, :match_start, text
|
413
|
+
case text = copy(data, ts, te)
|
414
|
+
when '\\A'; emit(:anchor, :bos, text)
|
415
|
+
when '\\z'; emit(:anchor, :eos, text)
|
416
|
+
when '\\Z'; emit(:anchor, :eos_ob_eol, text)
|
417
|
+
when '\\b'; emit(:anchor, :word_boundary, text)
|
418
|
+
when '\\B'; emit(:anchor, :nonword_boundary, text)
|
419
|
+
when '\\G'; emit(:anchor, :match_start, text)
|
423
420
|
end
|
424
421
|
};
|
425
422
|
|
@@ -430,7 +427,7 @@
|
|
430
427
|
# Character sets
|
431
428
|
# ------------------------------------------------------------------------
|
432
429
|
set_open >set_opened {
|
433
|
-
emit(:set, :open,
|
430
|
+
emit(:set, :open, copy(data, ts, te))
|
434
431
|
fcall character_set;
|
435
432
|
};
|
436
433
|
|
@@ -439,12 +436,12 @@
|
|
439
436
|
# (?(condition)Y|N) conditional expression
|
440
437
|
# ------------------------------------------------------------------------
|
441
438
|
conditional {
|
442
|
-
text =
|
439
|
+
text = copy(data, ts, te)
|
443
440
|
|
444
441
|
conditional_stack << group_depth
|
445
442
|
|
446
|
-
emit(:conditional, :open, text[0..-2]
|
447
|
-
emit(:conditional, :condition_open, '('
|
443
|
+
emit(:conditional, :open, text[0..-2])
|
444
|
+
emit(:conditional, :condition_open, '(')
|
448
445
|
fcall conditional_expression;
|
449
446
|
};
|
450
447
|
|
@@ -455,7 +452,7 @@
|
|
455
452
|
# correct closing count.
|
456
453
|
# ------------------------------------------------------------------------
|
457
454
|
group_open . group_comment $group_closed {
|
458
|
-
emit(:group, :comment,
|
455
|
+
emit(:group, :comment, copy(data, ts, te))
|
459
456
|
};
|
460
457
|
|
461
458
|
# Expression options:
|
@@ -470,11 +467,11 @@
|
|
470
467
|
# (?imxdau-imx:subexp) option on/off for subexp
|
471
468
|
# ------------------------------------------------------------------------
|
472
469
|
group_open . group_options >group_opened {
|
473
|
-
text =
|
470
|
+
text = copy(data, ts, te)
|
474
471
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
475
472
|
raise InvalidGroupOption.new($1 || "-#{$2}", text)
|
476
473
|
end
|
477
|
-
emit_options(text
|
474
|
+
emit_options(text)
|
478
475
|
};
|
479
476
|
|
480
477
|
# Assertions
|
@@ -484,11 +481,11 @@
|
|
484
481
|
# (?<!subexp) negative look-behind
|
485
482
|
# ------------------------------------------------------------------------
|
486
483
|
group_open . assertion_type >group_opened {
|
487
|
-
case text =
|
488
|
-
when '(?='; emit(:assertion, :lookahead, text
|
489
|
-
when '(?!'; emit(:assertion, :nlookahead, text
|
490
|
-
when '(?<='; emit(:assertion, :lookbehind, text
|
491
|
-
when '(?<!'; emit(:assertion, :nlookbehind, text
|
484
|
+
case text = copy(data, ts, te)
|
485
|
+
when '(?='; emit(:assertion, :lookahead, text)
|
486
|
+
when '(?!'; emit(:assertion, :nlookahead, text)
|
487
|
+
when '(?<='; emit(:assertion, :lookbehind, text)
|
488
|
+
when '(?<!'; emit(:assertion, :nlookbehind, text)
|
492
489
|
end
|
493
490
|
};
|
494
491
|
|
@@ -501,32 +498,32 @@
|
|
501
498
|
# (subexp) captured group
|
502
499
|
# ------------------------------------------------------------------------
|
503
500
|
group_open . group_type >group_opened {
|
504
|
-
case text =
|
505
|
-
when '(?:'; emit(:group, :passive, text
|
506
|
-
when '(?>'; emit(:group, :atomic, text
|
507
|
-
when '(?~'; emit(:group, :absence, text
|
501
|
+
case text = copy(data, ts, te)
|
502
|
+
when '(?:'; emit(:group, :passive, text)
|
503
|
+
when '(?>'; emit(:group, :atomic, text)
|
504
|
+
when '(?~'; emit(:group, :absence, text)
|
508
505
|
|
509
506
|
when /^\(\?(?:<>|'')/
|
510
507
|
validation_error(:group, 'named group', 'name is empty')
|
511
508
|
|
512
|
-
when /^\(
|
513
|
-
emit(:group, :named_ab, text
|
509
|
+
when /^\(\?<[^>]+>/
|
510
|
+
emit(:group, :named_ab, text)
|
514
511
|
|
515
|
-
when /^\(\?'
|
516
|
-
emit(:group, :named_sq, text
|
512
|
+
when /^\(\?'[^']+'/
|
513
|
+
emit(:group, :named_sq, text)
|
517
514
|
|
518
515
|
end
|
519
516
|
};
|
520
517
|
|
521
518
|
group_open @group_opened {
|
522
|
-
text =
|
523
|
-
emit(:group, :capture, text
|
519
|
+
text = copy(data, ts, te)
|
520
|
+
emit(:group, :capture, text)
|
524
521
|
};
|
525
522
|
|
526
523
|
group_close @group_closed {
|
527
524
|
if conditional_stack.last == group_depth + 1
|
528
525
|
conditional_stack.pop
|
529
|
-
emit(:conditional, :close,
|
526
|
+
emit(:conditional, :close, copy(data, ts, te))
|
530
527
|
else
|
531
528
|
if spacing_stack.length > 1 &&
|
532
529
|
spacing_stack.last[:depth] == group_depth + 1
|
@@ -534,7 +531,7 @@
|
|
534
531
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
535
532
|
end
|
536
533
|
|
537
|
-
emit(:group, :close,
|
534
|
+
emit(:group, :close, copy(data, ts, te))
|
538
535
|
end
|
539
536
|
};
|
540
537
|
|
@@ -542,63 +539,65 @@
|
|
542
539
|
# Group backreference, named and numbered
|
543
540
|
# ------------------------------------------------------------------------
|
544
541
|
backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
|
545
|
-
case text =
|
542
|
+
case text = copy(data, ts, te)
|
546
543
|
when /^\\([gk])(<>|'')/ # angle brackets
|
547
544
|
validation_error(:backref, 'ref/call', 'ref ID is empty')
|
548
545
|
|
549
|
-
|
546
|
+
# TODO: finer quirks of choosing recursive or non-recursive refs/calls.
|
547
|
+
# e.g.: `a-1` is a valid group id: 'aa'[/(?<a-1>a)\g<a-1>/] # => 'aa'
|
548
|
+
when /^\\([gk])<[^\p{digit}+\->][^>+\-]*>/ # angle-brackets
|
550
549
|
if $1 == 'k'
|
551
|
-
emit(:backref, :name_ref_ab, text
|
550
|
+
emit(:backref, :name_ref_ab, text)
|
552
551
|
else
|
553
|
-
emit(:backref, :name_call_ab, text
|
552
|
+
emit(:backref, :name_call_ab, text)
|
554
553
|
end
|
555
554
|
|
556
|
-
when /^\\([gk])'[^\
|
555
|
+
when /^\\([gk])'[^\p{digit}+\-'][^'+\-]*'/ # single quotes
|
557
556
|
if $1 == 'k'
|
558
|
-
emit(:backref, :name_ref_sq, text
|
557
|
+
emit(:backref, :name_ref_sq, text)
|
559
558
|
else
|
560
|
-
emit(:backref, :name_call_sq, text
|
559
|
+
emit(:backref, :name_call_sq, text)
|
561
560
|
end
|
562
561
|
|
563
562
|
when /^\\([gk])<\d+>/ # angle-brackets
|
564
563
|
if $1 == 'k'
|
565
|
-
emit(:backref, :number_ref_ab, text
|
564
|
+
emit(:backref, :number_ref_ab, text)
|
566
565
|
else
|
567
|
-
emit(:backref, :number_call_ab, text
|
566
|
+
emit(:backref, :number_call_ab, text)
|
568
567
|
end
|
569
568
|
|
570
569
|
when /^\\([gk])'\d+'/ # single quotes
|
571
570
|
if $1 == 'k'
|
572
|
-
emit(:backref, :number_ref_sq, text
|
571
|
+
emit(:backref, :number_ref_sq, text)
|
573
572
|
else
|
574
|
-
emit(:backref, :number_call_sq, text
|
573
|
+
emit(:backref, :number_call_sq, text)
|
575
574
|
end
|
576
575
|
|
577
576
|
when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
|
578
577
|
if $1 == 'k'
|
579
|
-
emit(:backref, :number_rel_ref_ab, text
|
578
|
+
emit(:backref, :number_rel_ref_ab, text)
|
580
579
|
else
|
581
|
-
emit(:backref, :number_rel_call_ab, text
|
580
|
+
emit(:backref, :number_rel_call_ab, text)
|
582
581
|
end
|
583
582
|
|
584
583
|
when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
|
585
584
|
if $1 == 'k'
|
586
|
-
emit(:backref, :number_rel_ref_sq, text
|
585
|
+
emit(:backref, :number_rel_ref_sq, text)
|
587
586
|
else
|
588
|
-
emit(:backref, :number_rel_call_sq, text
|
587
|
+
emit(:backref, :number_rel_call_sq, text)
|
589
588
|
end
|
590
589
|
|
591
|
-
when /^\\k<[^\
|
592
|
-
emit(:backref, :name_recursion_ref_ab, text
|
590
|
+
when /^\\k<[^\p{digit}+\->][^>]*[+\-]\d+>/ # angle-brackets
|
591
|
+
emit(:backref, :name_recursion_ref_ab, text)
|
593
592
|
|
594
|
-
when /^\\k'[^\
|
595
|
-
emit(:backref, :name_recursion_ref_sq, text
|
593
|
+
when /^\\k'[^\p{digit}+\-'][^']*[+\-]\d+'/ # single-quotes
|
594
|
+
emit(:backref, :name_recursion_ref_sq, text)
|
596
595
|
|
597
596
|
when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
|
598
|
-
emit(:backref, :number_recursion_ref_ab, text
|
597
|
+
emit(:backref, :number_recursion_ref_ab, text)
|
599
598
|
|
600
599
|
when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
|
601
|
-
emit(:backref, :number_recursion_ref_sq, text
|
600
|
+
emit(:backref, :number_recursion_ref_sq, text)
|
602
601
|
|
603
602
|
end
|
604
603
|
};
|
@@ -607,31 +606,31 @@
|
|
607
606
|
# Quantifiers
|
608
607
|
# ------------------------------------------------------------------------
|
609
608
|
zero_or_one {
|
610
|
-
case text =
|
611
|
-
when '?' ; emit(:quantifier, :zero_or_one, text
|
612
|
-
when '??'; emit(:quantifier, :zero_or_one_reluctant, text
|
613
|
-
when '?+'; emit(:quantifier, :zero_or_one_possessive, text
|
609
|
+
case text = copy(data, ts, te)
|
610
|
+
when '?' ; emit(:quantifier, :zero_or_one, text)
|
611
|
+
when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
|
612
|
+
when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
|
614
613
|
end
|
615
614
|
};
|
616
615
|
|
617
616
|
zero_or_more {
|
618
|
-
case text =
|
619
|
-
when '*' ; emit(:quantifier, :zero_or_more, text
|
620
|
-
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text
|
621
|
-
when '*+'; emit(:quantifier, :zero_or_more_possessive, text
|
617
|
+
case text = copy(data, ts, te)
|
618
|
+
when '*' ; emit(:quantifier, :zero_or_more, text)
|
619
|
+
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
|
620
|
+
when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
|
622
621
|
end
|
623
622
|
};
|
624
623
|
|
625
624
|
one_or_more {
|
626
|
-
case text =
|
627
|
-
when '+' ; emit(:quantifier, :one_or_more, text
|
628
|
-
when '+?'; emit(:quantifier, :one_or_more_reluctant, text
|
629
|
-
when '++'; emit(:quantifier, :one_or_more_possessive, text
|
625
|
+
case text = copy(data, ts, te)
|
626
|
+
when '+' ; emit(:quantifier, :one_or_more, text)
|
627
|
+
when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
|
628
|
+
when '++'; emit(:quantifier, :one_or_more_possessive, text)
|
630
629
|
end
|
631
630
|
};
|
632
631
|
|
633
632
|
quantifier_interval {
|
634
|
-
emit(:quantifier, :interval,
|
633
|
+
emit(:quantifier, :interval, copy(data, ts, te))
|
635
634
|
};
|
636
635
|
|
637
636
|
# Catch unmatched curly braces as literals
|
@@ -647,7 +646,7 @@
|
|
647
646
|
|
648
647
|
comment {
|
649
648
|
if free_spacing
|
650
|
-
emit(:free_space, :comment,
|
649
|
+
emit(:free_space, :comment, copy(data, ts, te))
|
651
650
|
else
|
652
651
|
# consume only the pound sign (#) and backtrack to do regular scanning
|
653
652
|
append_literal(data, ts, ts + 1)
|
@@ -657,7 +656,7 @@
|
|
657
656
|
|
658
657
|
space+ {
|
659
658
|
if free_spacing
|
660
|
-
emit(:free_space, :whitespace,
|
659
|
+
emit(:free_space, :whitespace, copy(data, ts, te))
|
661
660
|
else
|
662
661
|
append_literal(data, ts, te)
|
663
662
|
end
|
@@ -666,11 +665,7 @@
|
|
666
665
|
# Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
|
667
666
|
# except meta characters.
|
668
667
|
# ------------------------------------------------------------------------
|
669
|
-
(ascii_print -- space)+
|
670
|
-
ascii_nonprint+ |
|
671
|
-
utf8_2_byte+ |
|
672
|
-
utf8_3_byte+ |
|
673
|
-
utf8_4_byte+ {
|
668
|
+
(ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
|
674
669
|
append_literal(data, ts, te)
|
675
670
|
};
|
676
671
|
|
@@ -760,6 +755,7 @@ class Regexp::Scanner
|
|
760
755
|
self.set_depth = 0
|
761
756
|
self.group_depth = 0
|
762
757
|
self.conditional_stack = []
|
758
|
+
self.char_pos = 0
|
763
759
|
|
764
760
|
%% write data;
|
765
761
|
%% write init;
|
@@ -769,7 +765,7 @@ class Regexp::Scanner
|
|
769
765
|
testEof = testEof
|
770
766
|
|
771
767
|
if cs == re_scanner_error
|
772
|
-
text =
|
768
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
773
769
|
raise ScannerError.new("Scan error at '#{text}'")
|
774
770
|
end
|
775
771
|
|
@@ -797,22 +793,29 @@ class Regexp::Scanner
|
|
797
793
|
end
|
798
794
|
|
799
795
|
# Emits an array with the details of the scanned pattern
|
800
|
-
def emit(type, token, text
|
796
|
+
def emit(type, token, text)
|
801
797
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
802
798
|
|
803
799
|
emit_literal if literal
|
804
800
|
|
801
|
+
# Ragel runs with byte-based indices (ts, te). These are of little value to
|
802
|
+
# end-users, so we keep track of char-based indices and emit those instead.
|
803
|
+
ts_char_pos = char_pos
|
804
|
+
te_char_pos = char_pos + text.length
|
805
|
+
|
805
806
|
if block
|
806
|
-
block.call type, token, text,
|
807
|
+
block.call type, token, text, ts_char_pos, te_char_pos
|
807
808
|
end
|
808
809
|
|
809
|
-
tokens << [type, token, text,
|
810
|
+
tokens << [type, token, text, ts_char_pos, te_char_pos]
|
811
|
+
|
812
|
+
self.char_pos = te_char_pos
|
810
813
|
end
|
811
814
|
|
812
815
|
private
|
813
816
|
|
814
817
|
attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
|
815
|
-
:group_depth, :set_depth, :conditional_stack
|
818
|
+
:group_depth, :set_depth, :conditional_stack, :char_pos
|
816
819
|
|
817
820
|
def free_spacing?(input_object, options)
|
818
821
|
if options && !input_object.is_a?(String)
|
@@ -835,36 +838,25 @@ class Regexp::Scanner
|
|
835
838
|
end
|
836
839
|
|
837
840
|
# Copy from ts to te from data as text
|
838
|
-
def copy(data,
|
839
|
-
data[
|
840
|
-
end
|
841
|
-
|
842
|
-
# Copy from ts to te from data as text, returning an array with the text
|
843
|
-
# and the offsets used to copy it.
|
844
|
-
def text(data, ts, te, soff = 0)
|
845
|
-
[copy(data, ts-soff..te-1), ts-soff, te]
|
841
|
+
def copy(data, ts, te)
|
842
|
+
data[ts...te].pack('c*').force_encoding('utf-8')
|
846
843
|
end
|
847
844
|
|
848
845
|
# Appends one or more characters to the literal buffer, to be emitted later
|
849
|
-
# by a call to emit_literal.
|
846
|
+
# by a call to emit_literal.
|
850
847
|
def append_literal(data, ts, te)
|
851
848
|
self.literal = literal || []
|
852
|
-
literal <<
|
849
|
+
literal << copy(data, ts, te)
|
853
850
|
end
|
854
851
|
|
855
|
-
# Emits the literal run collected by calls to the append_literal method
|
856
|
-
# using the total start (ts) and end (te) offsets of the run.
|
852
|
+
# Emits the literal run collected by calls to the append_literal method.
|
857
853
|
def emit_literal
|
858
|
-
|
859
|
-
text = literal.map {|t| t[0]}.join
|
860
|
-
|
861
|
-
text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
|
862
|
-
|
854
|
+
text = literal.join
|
863
855
|
self.literal = nil
|
864
|
-
emit(:literal, :literal, text
|
856
|
+
emit(:literal, :literal, text)
|
865
857
|
end
|
866
858
|
|
867
|
-
def emit_options(text
|
859
|
+
def emit_options(text)
|
868
860
|
token = nil
|
869
861
|
|
870
862
|
# Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
|
@@ -890,14 +882,14 @@ class Regexp::Scanner
|
|
890
882
|
token = :options_switch
|
891
883
|
end
|
892
884
|
|
893
|
-
emit(:group, token, text
|
885
|
+
emit(:group, token, text)
|
894
886
|
end
|
895
887
|
|
896
888
|
def emit_meta_control_sequence(data, ts, te, token)
|
897
889
|
if data.last < 0x00 || data.last > 0x7F
|
898
890
|
validation_error(:sequence, 'escape', token.to_s)
|
899
891
|
end
|
900
|
-
emit(:escape, token,
|
892
|
+
emit(:escape, token, copy(data, ts-1, te))
|
901
893
|
end
|
902
894
|
|
903
895
|
# Centralizes and unifies the handling of validation related
|