regexp_parser 1.8.1 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +70 -0
- data/Gemfile +1 -0
- data/README.md +12 -11
- data/Rakefile +2 -2
- data/lib/regexp_parser/expression.rb +10 -19
- data/lib/regexp_parser/expression/classes/free_space.rb +1 -1
- data/lib/regexp_parser/expression/classes/group.rb +22 -2
- data/lib/regexp_parser/expression/classes/root.rb +4 -16
- data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
- data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
- data/lib/regexp_parser/expression/quantifier.rb +9 -0
- data/lib/regexp_parser/expression/sequence.rb +0 -10
- data/lib/regexp_parser/lexer.rb +2 -2
- data/lib/regexp_parser/parser.rb +27 -2
- data/lib/regexp_parser/scanner.rb +1194 -1272
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +178 -186
- data/lib/regexp_parser/syntax.rb +4 -4
- data/lib/regexp_parser/syntax/any.rb +2 -2
- data/lib/regexp_parser/syntax/base.rb +1 -1
- data/lib/regexp_parser/syntax/version_lookup.rb +4 -4
- data/lib/regexp_parser/version.rb +1 -1
- data/spec/expression/base_spec.rb +10 -0
- data/spec/expression/subexpression_spec.rb +1 -1
- data/spec/expression/to_s_spec.rb +39 -31
- data/spec/lexer/literals_spec.rb +24 -49
- data/spec/parser/errors_spec.rb +1 -1
- data/spec/parser/escapes_spec.rb +1 -1
- data/spec/parser/quantifiers_spec.rb +16 -0
- data/spec/parser/set/ranges_spec.rb +3 -3
- data/spec/scanner/escapes_spec.rb +7 -0
- data/spec/scanner/groups_spec.rb +10 -1
- data/spec/scanner/literals_spec.rb +28 -38
- data/spec/scanner/quantifiers_spec.rb +18 -13
- data/spec/scanner/sets_spec.rb +23 -5
- data/spec/spec_helper.rb +1 -0
- metadata +56 -60
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
@@ -10,17 +10,17 @@
|
|
10
10
|
# --------------------------------------------------------------------------
|
11
11
|
char_type := |*
|
12
12
|
char_type_char {
|
13
|
-
case text =
|
14
|
-
when '\d'; emit(:type, :digit, text
|
15
|
-
when '\D'; emit(:type, :nondigit, text
|
16
|
-
when '\h'; emit(:type, :hex, text
|
17
|
-
when '\H'; emit(:type, :nonhex, text
|
18
|
-
when '\s'; emit(:type, :space, text
|
19
|
-
when '\S'; emit(:type, :nonspace, text
|
20
|
-
when '\w'; emit(:type, :word, text
|
21
|
-
when '\W'; emit(:type, :nonword, text
|
22
|
-
when '\R'; emit(:type, :linebreak, text
|
23
|
-
when '\X'; emit(:type, :xgrapheme, text
|
13
|
+
case text = copy(data, ts-1, te)
|
14
|
+
when '\d'; emit(:type, :digit, text)
|
15
|
+
when '\D'; emit(:type, :nondigit, text)
|
16
|
+
when '\h'; emit(:type, :hex, text)
|
17
|
+
when '\H'; emit(:type, :nonhex, text)
|
18
|
+
when '\s'; emit(:type, :space, text)
|
19
|
+
when '\S'; emit(:type, :nonspace, text)
|
20
|
+
when '\w'; emit(:type, :word, text)
|
21
|
+
when '\W'; emit(:type, :nonword, text)
|
22
|
+
when '\R'; emit(:type, :linebreak, text)
|
23
|
+
when '\X'; emit(:type, :xgrapheme, text)
|
24
24
|
end
|
25
25
|
fret;
|
26
26
|
};
|
@@ -14,7 +14,7 @@
|
|
14
14
|
unicode_property := |*
|
15
15
|
|
16
16
|
property_sequence < eof(premature_property_end) {
|
17
|
-
text =
|
17
|
+
text = copy(data, ts-1, te)
|
18
18
|
type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
|
19
19
|
|
20
20
|
name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
|
@@ -22,7 +22,7 @@
|
|
22
22
|
token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
|
23
23
|
raise UnknownUnicodePropertyError.new(name) unless token
|
24
24
|
|
25
|
-
self.emit(type, token.to_sym, text
|
25
|
+
self.emit(type, token.to_sym, text)
|
26
26
|
|
27
27
|
fret;
|
28
28
|
};
|
@@ -3,6 +3,11 @@
|
|
3
3
|
include re_char_type "char_type.rl";
|
4
4
|
include re_property "property.rl";
|
5
5
|
|
6
|
+
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
7
|
+
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
|
8
|
+
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
9
|
+
utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
|
10
|
+
|
6
11
|
dot = '.';
|
7
12
|
backslash = '\\';
|
8
13
|
alternation = '|';
|
@@ -32,7 +37,7 @@
|
|
32
37
|
class_posix = ('[:' . '^'? . class_name_posix . ':]');
|
33
38
|
|
34
39
|
|
35
|
-
# these are not supported in ruby
|
40
|
+
# these are not supported in ruby at the moment
|
36
41
|
collating_sequence = '[.' . (alpha | [\-])+ . '.]';
|
37
42
|
character_equivalent = '[=' . alpha . '=]';
|
38
43
|
|
@@ -90,18 +95,19 @@
|
|
90
95
|
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
91
96
|
|
92
97
|
group_ref = [gk];
|
93
|
-
|
94
|
-
|
95
|
-
group_number = '-'? . [1-9] .
|
98
|
+
group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
|
99
|
+
group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
|
100
|
+
group_number = '-'? . [1-9] . [0-9]*;
|
96
101
|
group_level = [+\-] . [0-9]+;
|
97
102
|
|
98
|
-
group_name = ('<' .
|
103
|
+
group_name = ('<' . group_name_id_ab? . '>') |
|
104
|
+
("'" . group_name_id_sq? . "'");
|
99
105
|
group_lookup = group_name | group_number;
|
100
106
|
|
101
107
|
group_named = ('?' . group_name );
|
102
108
|
|
103
|
-
group_name_ref = group_ref . (('<' .
|
104
|
-
("'" .
|
109
|
+
group_name_ref = group_ref . (('<' . group_name_id_ab? . group_level? '>') |
|
110
|
+
("'" . group_name_id_sq? . group_level? "'"));
|
105
111
|
|
106
112
|
group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
|
107
113
|
("'" . group_number . group_level? "'"));
|
@@ -123,10 +129,6 @@
|
|
123
129
|
ascii_print = ((0x20..0x7e) - meta_char - '#');
|
124
130
|
ascii_nonprint = (0x01..0x1f | 0x7f);
|
125
131
|
|
126
|
-
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
127
|
-
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
|
128
|
-
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
129
|
-
|
130
132
|
non_literal_escape = char_type_char | anchor_char | escaped_ascii |
|
131
133
|
keep_mark | [xucCM];
|
132
134
|
|
@@ -135,13 +137,13 @@
|
|
135
137
|
|
136
138
|
# EOF error, used where it can be detected
|
137
139
|
action premature_end_error {
|
138
|
-
text =
|
140
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
139
141
|
raise PrematureEndError.new( text )
|
140
142
|
}
|
141
143
|
|
142
144
|
# Invalid sequence error, used from sequences, like escapes and sets
|
143
145
|
action invalid_sequence_error {
|
144
|
-
text =
|
146
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
145
147
|
validation_error(:sequence, 'sequence', text)
|
146
148
|
}
|
147
149
|
|
@@ -156,7 +158,7 @@
|
|
156
158
|
# --------------------------------------------------------------------------
|
157
159
|
character_set := |*
|
158
160
|
set_close > (set_meta, 2) @set_closed {
|
159
|
-
emit(:set, :close,
|
161
|
+
emit(:set, :close, copy(data, ts, te))
|
160
162
|
if in_set?
|
161
163
|
fret;
|
162
164
|
else
|
@@ -165,8 +167,8 @@
|
|
165
167
|
};
|
166
168
|
|
167
169
|
'-]' @set_closed { # special case, emits two tokens
|
168
|
-
emit(:literal, :literal, copy(data, ts
|
169
|
-
emit(:set, :close, copy(data, ts+1
|
170
|
+
emit(:literal, :literal, copy(data, ts, te-1))
|
171
|
+
emit(:set, :close, copy(data, ts+1, te))
|
170
172
|
if in_set?
|
171
173
|
fret;
|
172
174
|
else
|
@@ -175,33 +177,33 @@
|
|
175
177
|
};
|
176
178
|
|
177
179
|
'-&&' { # special case, emits two tokens
|
178
|
-
emit(:literal, :literal, '-'
|
179
|
-
emit(:set, :intersection, '&&'
|
180
|
+
emit(:literal, :literal, '-')
|
181
|
+
emit(:set, :intersection, '&&')
|
180
182
|
};
|
181
183
|
|
182
184
|
'^' {
|
183
|
-
text =
|
185
|
+
text = copy(data, ts, te)
|
184
186
|
if tokens.last[1] == :open
|
185
|
-
emit(:set, :negate, text
|
187
|
+
emit(:set, :negate, text)
|
186
188
|
else
|
187
|
-
emit(:literal, :literal, text
|
189
|
+
emit(:literal, :literal, text)
|
188
190
|
end
|
189
191
|
};
|
190
192
|
|
191
193
|
'-' {
|
192
|
-
text =
|
194
|
+
text = copy(data, ts, te)
|
193
195
|
# ranges cant start with a subset or intersection/negation/range operator
|
194
196
|
if tokens.last[0] == :set
|
195
|
-
emit(:literal, :literal, text
|
197
|
+
emit(:literal, :literal, text)
|
196
198
|
else
|
197
|
-
emit(:set, :range, text
|
199
|
+
emit(:set, :range, text)
|
198
200
|
end
|
199
201
|
};
|
200
202
|
|
201
203
|
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
202
204
|
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
203
205
|
'&&' {
|
204
|
-
emit(:set, :intersection,
|
206
|
+
emit(:set, :intersection, copy(data, ts, te))
|
205
207
|
};
|
206
208
|
|
207
209
|
backslash {
|
@@ -209,12 +211,12 @@
|
|
209
211
|
};
|
210
212
|
|
211
213
|
set_open >(open_bracket, 1) >set_opened {
|
212
|
-
emit(:set, :open,
|
214
|
+
emit(:set, :open, copy(data, ts, te))
|
213
215
|
fcall character_set;
|
214
216
|
};
|
215
217
|
|
216
218
|
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
217
|
-
text =
|
219
|
+
text = copy(data, ts, te)
|
218
220
|
|
219
221
|
type = :posixclass
|
220
222
|
class_name = text[2..-3]
|
@@ -223,29 +225,24 @@
|
|
223
225
|
type = :nonposixclass
|
224
226
|
end
|
225
227
|
|
226
|
-
emit(type, class_name.to_sym, text
|
228
|
+
emit(type, class_name.to_sym, text)
|
227
229
|
};
|
228
230
|
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
234
|
-
|
235
|
-
};
|
231
|
+
# These are not supported in ruby at the moment. Enable them if they are.
|
232
|
+
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
233
|
+
# emit(:set, :collation, copy(data, ts, te))
|
234
|
+
# };
|
235
|
+
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
236
|
+
# emit(:set, :equivalent, copy(data, ts, te))
|
237
|
+
# };
|
236
238
|
|
237
239
|
meta_char > (set_meta, 1) {
|
238
|
-
emit(:literal, :literal,
|
240
|
+
emit(:literal, :literal, copy(data, ts, te))
|
239
241
|
};
|
240
242
|
|
241
|
-
any
|
242
|
-
|
243
|
-
|
244
|
-
utf8_3_byte |
|
245
|
-
utf8_4_byte {
|
246
|
-
char, *rest = *text(data, ts, te)
|
247
|
-
char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
|
248
|
-
emit(:literal, :literal, char, *rest)
|
243
|
+
any | ascii_nonprint | utf8_multibyte {
|
244
|
+
text = copy(data, ts, te)
|
245
|
+
emit(:literal, :literal, text)
|
249
246
|
};
|
250
247
|
*|;
|
251
248
|
|
@@ -253,7 +250,7 @@
|
|
253
250
|
# --------------------------------------------------------------------------
|
254
251
|
set_escape_sequence := |*
|
255
252
|
non_set_escape > (escaped_set_alpha, 2) {
|
256
|
-
emit(:escape, :literal,
|
253
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
257
254
|
fret;
|
258
255
|
};
|
259
256
|
|
@@ -269,33 +266,33 @@
|
|
269
266
|
# --------------------------------------------------------------------------
|
270
267
|
escape_sequence := |*
|
271
268
|
[1-9] {
|
272
|
-
text =
|
273
|
-
emit(:backref, :number, text
|
269
|
+
text = copy(data, ts-1, te)
|
270
|
+
emit(:backref, :number, text)
|
274
271
|
fret;
|
275
272
|
};
|
276
273
|
|
277
274
|
octal_sequence {
|
278
|
-
emit(:escape, :octal,
|
275
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
279
276
|
fret;
|
280
277
|
};
|
281
278
|
|
282
279
|
meta_char {
|
283
|
-
case text =
|
284
|
-
when '\.'; emit(:escape, :dot, text
|
285
|
-
when '\|'; emit(:escape, :alternation, text
|
286
|
-
when '\^'; emit(:escape, :bol, text
|
287
|
-
when '\$'; emit(:escape, :eol, text
|
288
|
-
when '\?'; emit(:escape, :zero_or_one, text
|
289
|
-
when '\*'; emit(:escape, :zero_or_more, text
|
290
|
-
when '\+'; emit(:escape, :one_or_more, text
|
291
|
-
when '\('; emit(:escape, :group_open, text
|
292
|
-
when '\)'; emit(:escape, :group_close, text
|
293
|
-
when '\{'; emit(:escape, :interval_open, text
|
294
|
-
when '\}'; emit(:escape, :interval_close, text
|
295
|
-
when '\['; emit(:escape, :set_open, text
|
296
|
-
when '\]'; emit(:escape, :set_close, text
|
280
|
+
case text = copy(data, ts-1, te)
|
281
|
+
when '\.'; emit(:escape, :dot, text)
|
282
|
+
when '\|'; emit(:escape, :alternation, text)
|
283
|
+
when '\^'; emit(:escape, :bol, text)
|
284
|
+
when '\$'; emit(:escape, :eol, text)
|
285
|
+
when '\?'; emit(:escape, :zero_or_one, text)
|
286
|
+
when '\*'; emit(:escape, :zero_or_more, text)
|
287
|
+
when '\+'; emit(:escape, :one_or_more, text)
|
288
|
+
when '\('; emit(:escape, :group_open, text)
|
289
|
+
when '\)'; emit(:escape, :group_close, text)
|
290
|
+
when '\{'; emit(:escape, :interval_open, text)
|
291
|
+
when '\}'; emit(:escape, :interval_close, text)
|
292
|
+
when '\['; emit(:escape, :set_open, text)
|
293
|
+
when '\]'; emit(:escape, :set_close, text)
|
297
294
|
when "\\\\";
|
298
|
-
emit(:escape, :backslash, text
|
295
|
+
emit(:escape, :backslash, text)
|
299
296
|
end
|
300
297
|
fret;
|
301
298
|
};
|
@@ -303,31 +300,31 @@
|
|
303
300
|
escaped_ascii > (escaped_alpha, 7) {
|
304
301
|
# \b is emitted as backspace only when inside a character set, otherwise
|
305
302
|
# it is a word boundary anchor. A syntax might "normalize" it if needed.
|
306
|
-
case text =
|
307
|
-
when '\a'; emit(:escape, :bell, text
|
308
|
-
when '\b'; emit(:escape, :backspace, text
|
309
|
-
when '\e'; emit(:escape, :escape, text
|
310
|
-
when '\f'; emit(:escape, :form_feed, text
|
311
|
-
when '\n'; emit(:escape, :newline, text
|
312
|
-
when '\r'; emit(:escape, :carriage, text
|
313
|
-
when '\t'; emit(:escape, :tab, text
|
314
|
-
when '\v'; emit(:escape, :vertical_tab, text
|
303
|
+
case text = copy(data, ts-1, te)
|
304
|
+
when '\a'; emit(:escape, :bell, text)
|
305
|
+
when '\b'; emit(:escape, :backspace, text)
|
306
|
+
when '\e'; emit(:escape, :escape, text)
|
307
|
+
when '\f'; emit(:escape, :form_feed, text)
|
308
|
+
when '\n'; emit(:escape, :newline, text)
|
309
|
+
when '\r'; emit(:escape, :carriage, text)
|
310
|
+
when '\t'; emit(:escape, :tab, text)
|
311
|
+
when '\v'; emit(:escape, :vertical_tab, text)
|
315
312
|
end
|
316
313
|
fret;
|
317
314
|
};
|
318
315
|
|
319
316
|
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
320
|
-
text =
|
317
|
+
text = copy(data, ts-1, te)
|
321
318
|
if text[2].chr == '{'
|
322
|
-
emit(:escape, :codepoint_list, text
|
319
|
+
emit(:escape, :codepoint_list, text)
|
323
320
|
else
|
324
|
-
emit(:escape, :codepoint, text
|
321
|
+
emit(:escape, :codepoint, text)
|
325
322
|
end
|
326
323
|
fret;
|
327
324
|
};
|
328
325
|
|
329
|
-
hex_sequence > (escaped_alpha, 5)
|
330
|
-
emit(:escape, :hex,
|
326
|
+
hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
|
327
|
+
emit(:escape, :hex, copy(data, ts-1, te))
|
331
328
|
fret;
|
332
329
|
};
|
333
330
|
|
@@ -357,8 +354,8 @@
|
|
357
354
|
fcall unicode_property;
|
358
355
|
};
|
359
356
|
|
360
|
-
(any -- non_literal_escape) > (escaped_alpha, 1)
|
361
|
-
emit(:escape, :literal,
|
357
|
+
(any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
|
358
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
362
359
|
fret;
|
363
360
|
};
|
364
361
|
*|;
|
@@ -368,9 +365,9 @@
|
|
368
365
|
# --------------------------------------------------------------------------
|
369
366
|
conditional_expression := |*
|
370
367
|
group_lookup . ')' {
|
371
|
-
text =
|
372
|
-
emit(:conditional, :condition, text
|
373
|
-
emit(:conditional, :condition_close, ')'
|
368
|
+
text = copy(data, ts, te-1)
|
369
|
+
emit(:conditional, :condition, text)
|
370
|
+
emit(:conditional, :condition_close, ')')
|
374
371
|
};
|
375
372
|
|
376
373
|
any {
|
@@ -387,39 +384,39 @@
|
|
387
384
|
# Meta characters
|
388
385
|
# ------------------------------------------------------------------------
|
389
386
|
dot {
|
390
|
-
emit(:meta, :dot,
|
387
|
+
emit(:meta, :dot, copy(data, ts, te))
|
391
388
|
};
|
392
389
|
|
393
390
|
alternation {
|
394
391
|
if conditional_stack.last == group_depth
|
395
|
-
emit(:conditional, :separator,
|
392
|
+
emit(:conditional, :separator, copy(data, ts, te))
|
396
393
|
else
|
397
|
-
emit(:meta, :alternation,
|
394
|
+
emit(:meta, :alternation, copy(data, ts, te))
|
398
395
|
end
|
399
396
|
};
|
400
397
|
|
401
398
|
# Anchors
|
402
399
|
# ------------------------------------------------------------------------
|
403
400
|
beginning_of_line {
|
404
|
-
emit(:anchor, :bol,
|
401
|
+
emit(:anchor, :bol, copy(data, ts, te))
|
405
402
|
};
|
406
403
|
|
407
404
|
end_of_line {
|
408
|
-
emit(:anchor, :eol,
|
405
|
+
emit(:anchor, :eol, copy(data, ts, te))
|
409
406
|
};
|
410
407
|
|
411
408
|
backslash . keep_mark > (backslashed, 4) {
|
412
|
-
emit(:keep, :mark,
|
409
|
+
emit(:keep, :mark, copy(data, ts, te))
|
413
410
|
};
|
414
411
|
|
415
412
|
backslash . anchor_char > (backslashed, 3) {
|
416
|
-
case text =
|
417
|
-
when '\\A'; emit(:anchor, :bos, text
|
418
|
-
when '\\z'; emit(:anchor, :eos, text
|
419
|
-
when '\\Z'; emit(:anchor, :eos_ob_eol, text
|
420
|
-
when '\\b'; emit(:anchor, :word_boundary, text
|
421
|
-
when '\\B'; emit(:anchor, :nonword_boundary, text
|
422
|
-
when '\\G'; emit(:anchor, :match_start, text
|
413
|
+
case text = copy(data, ts, te)
|
414
|
+
when '\\A'; emit(:anchor, :bos, text)
|
415
|
+
when '\\z'; emit(:anchor, :eos, text)
|
416
|
+
when '\\Z'; emit(:anchor, :eos_ob_eol, text)
|
417
|
+
when '\\b'; emit(:anchor, :word_boundary, text)
|
418
|
+
when '\\B'; emit(:anchor, :nonword_boundary, text)
|
419
|
+
when '\\G'; emit(:anchor, :match_start, text)
|
423
420
|
end
|
424
421
|
};
|
425
422
|
|
@@ -430,7 +427,7 @@
|
|
430
427
|
# Character sets
|
431
428
|
# ------------------------------------------------------------------------
|
432
429
|
set_open >set_opened {
|
433
|
-
emit(:set, :open,
|
430
|
+
emit(:set, :open, copy(data, ts, te))
|
434
431
|
fcall character_set;
|
435
432
|
};
|
436
433
|
|
@@ -439,12 +436,12 @@
|
|
439
436
|
# (?(condition)Y|N) conditional expression
|
440
437
|
# ------------------------------------------------------------------------
|
441
438
|
conditional {
|
442
|
-
text =
|
439
|
+
text = copy(data, ts, te)
|
443
440
|
|
444
441
|
conditional_stack << group_depth
|
445
442
|
|
446
|
-
emit(:conditional, :open, text[0..-2]
|
447
|
-
emit(:conditional, :condition_open, '('
|
443
|
+
emit(:conditional, :open, text[0..-2])
|
444
|
+
emit(:conditional, :condition_open, '(')
|
448
445
|
fcall conditional_expression;
|
449
446
|
};
|
450
447
|
|
@@ -455,7 +452,7 @@
|
|
455
452
|
# correct closing count.
|
456
453
|
# ------------------------------------------------------------------------
|
457
454
|
group_open . group_comment $group_closed {
|
458
|
-
emit(:group, :comment,
|
455
|
+
emit(:group, :comment, copy(data, ts, te))
|
459
456
|
};
|
460
457
|
|
461
458
|
# Expression options:
|
@@ -470,11 +467,11 @@
|
|
470
467
|
# (?imxdau-imx:subexp) option on/off for subexp
|
471
468
|
# ------------------------------------------------------------------------
|
472
469
|
group_open . group_options >group_opened {
|
473
|
-
text =
|
470
|
+
text = copy(data, ts, te)
|
474
471
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
475
472
|
raise InvalidGroupOption.new($1 || "-#{$2}", text)
|
476
473
|
end
|
477
|
-
emit_options(text
|
474
|
+
emit_options(text)
|
478
475
|
};
|
479
476
|
|
480
477
|
# Assertions
|
@@ -484,11 +481,11 @@
|
|
484
481
|
# (?<!subexp) negative look-behind
|
485
482
|
# ------------------------------------------------------------------------
|
486
483
|
group_open . assertion_type >group_opened {
|
487
|
-
case text =
|
488
|
-
when '(?='; emit(:assertion, :lookahead, text
|
489
|
-
when '(?!'; emit(:assertion, :nlookahead, text
|
490
|
-
when '(?<='; emit(:assertion, :lookbehind, text
|
491
|
-
when '(?<!'; emit(:assertion, :nlookbehind, text
|
484
|
+
case text = copy(data, ts, te)
|
485
|
+
when '(?='; emit(:assertion, :lookahead, text)
|
486
|
+
when '(?!'; emit(:assertion, :nlookahead, text)
|
487
|
+
when '(?<='; emit(:assertion, :lookbehind, text)
|
488
|
+
when '(?<!'; emit(:assertion, :nlookbehind, text)
|
492
489
|
end
|
493
490
|
};
|
494
491
|
|
@@ -501,32 +498,32 @@
|
|
501
498
|
# (subexp) captured group
|
502
499
|
# ------------------------------------------------------------------------
|
503
500
|
group_open . group_type >group_opened {
|
504
|
-
case text =
|
505
|
-
when '(?:'; emit(:group, :passive, text
|
506
|
-
when '(?>'; emit(:group, :atomic, text
|
507
|
-
when '(?~'; emit(:group, :absence, text
|
501
|
+
case text = copy(data, ts, te)
|
502
|
+
when '(?:'; emit(:group, :passive, text)
|
503
|
+
when '(?>'; emit(:group, :atomic, text)
|
504
|
+
when '(?~'; emit(:group, :absence, text)
|
508
505
|
|
509
506
|
when /^\(\?(?:<>|'')/
|
510
507
|
validation_error(:group, 'named group', 'name is empty')
|
511
508
|
|
512
|
-
when /^\(
|
513
|
-
emit(:group, :named_ab, text
|
509
|
+
when /^\(\?<[^>]+>/
|
510
|
+
emit(:group, :named_ab, text)
|
514
511
|
|
515
|
-
when /^\(\?'
|
516
|
-
emit(:group, :named_sq, text
|
512
|
+
when /^\(\?'[^']+'/
|
513
|
+
emit(:group, :named_sq, text)
|
517
514
|
|
518
515
|
end
|
519
516
|
};
|
520
517
|
|
521
518
|
group_open @group_opened {
|
522
|
-
text =
|
523
|
-
emit(:group, :capture, text
|
519
|
+
text = copy(data, ts, te)
|
520
|
+
emit(:group, :capture, text)
|
524
521
|
};
|
525
522
|
|
526
523
|
group_close @group_closed {
|
527
524
|
if conditional_stack.last == group_depth + 1
|
528
525
|
conditional_stack.pop
|
529
|
-
emit(:conditional, :close,
|
526
|
+
emit(:conditional, :close, copy(data, ts, te))
|
530
527
|
else
|
531
528
|
if spacing_stack.length > 1 &&
|
532
529
|
spacing_stack.last[:depth] == group_depth + 1
|
@@ -534,7 +531,7 @@
|
|
534
531
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
535
532
|
end
|
536
533
|
|
537
|
-
emit(:group, :close,
|
534
|
+
emit(:group, :close, copy(data, ts, te))
|
538
535
|
end
|
539
536
|
};
|
540
537
|
|
@@ -542,63 +539,65 @@
|
|
542
539
|
# Group backreference, named and numbered
|
543
540
|
# ------------------------------------------------------------------------
|
544
541
|
backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
|
545
|
-
case text =
|
542
|
+
case text = copy(data, ts, te)
|
546
543
|
when /^\\([gk])(<>|'')/ # angle brackets
|
547
544
|
validation_error(:backref, 'ref/call', 'ref ID is empty')
|
548
545
|
|
549
|
-
|
546
|
+
# TODO: finer quirks of choosing recursive or non-recursive refs/calls.
|
547
|
+
# e.g.: `a-1` is a valid group id: 'aa'[/(?<a-1>a)\g<a-1>/] # => 'aa'
|
548
|
+
when /^\\([gk])<[^\p{digit}+\->][^>+\-]*>/ # angle-brackets
|
550
549
|
if $1 == 'k'
|
551
|
-
emit(:backref, :name_ref_ab, text
|
550
|
+
emit(:backref, :name_ref_ab, text)
|
552
551
|
else
|
553
|
-
emit(:backref, :name_call_ab, text
|
552
|
+
emit(:backref, :name_call_ab, text)
|
554
553
|
end
|
555
554
|
|
556
|
-
when /^\\([gk])'[^\
|
555
|
+
when /^\\([gk])'[^\p{digit}+\-'][^'+\-]*'/ # single quotes
|
557
556
|
if $1 == 'k'
|
558
|
-
emit(:backref, :name_ref_sq, text
|
557
|
+
emit(:backref, :name_ref_sq, text)
|
559
558
|
else
|
560
|
-
emit(:backref, :name_call_sq, text
|
559
|
+
emit(:backref, :name_call_sq, text)
|
561
560
|
end
|
562
561
|
|
563
562
|
when /^\\([gk])<\d+>/ # angle-brackets
|
564
563
|
if $1 == 'k'
|
565
|
-
emit(:backref, :number_ref_ab, text
|
564
|
+
emit(:backref, :number_ref_ab, text)
|
566
565
|
else
|
567
|
-
emit(:backref, :number_call_ab, text
|
566
|
+
emit(:backref, :number_call_ab, text)
|
568
567
|
end
|
569
568
|
|
570
569
|
when /^\\([gk])'\d+'/ # single quotes
|
571
570
|
if $1 == 'k'
|
572
|
-
emit(:backref, :number_ref_sq, text
|
571
|
+
emit(:backref, :number_ref_sq, text)
|
573
572
|
else
|
574
|
-
emit(:backref, :number_call_sq, text
|
573
|
+
emit(:backref, :number_call_sq, text)
|
575
574
|
end
|
576
575
|
|
577
576
|
when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
|
578
577
|
if $1 == 'k'
|
579
|
-
emit(:backref, :number_rel_ref_ab, text
|
578
|
+
emit(:backref, :number_rel_ref_ab, text)
|
580
579
|
else
|
581
|
-
emit(:backref, :number_rel_call_ab, text
|
580
|
+
emit(:backref, :number_rel_call_ab, text)
|
582
581
|
end
|
583
582
|
|
584
583
|
when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
|
585
584
|
if $1 == 'k'
|
586
|
-
emit(:backref, :number_rel_ref_sq, text
|
585
|
+
emit(:backref, :number_rel_ref_sq, text)
|
587
586
|
else
|
588
|
-
emit(:backref, :number_rel_call_sq, text
|
587
|
+
emit(:backref, :number_rel_call_sq, text)
|
589
588
|
end
|
590
589
|
|
591
|
-
when /^\\k<[^\
|
592
|
-
emit(:backref, :name_recursion_ref_ab, text
|
590
|
+
when /^\\k<[^\p{digit}+\->][^>]*[+\-]\d+>/ # angle-brackets
|
591
|
+
emit(:backref, :name_recursion_ref_ab, text)
|
593
592
|
|
594
|
-
when /^\\k'[^\
|
595
|
-
emit(:backref, :name_recursion_ref_sq, text
|
593
|
+
when /^\\k'[^\p{digit}+\-'][^']*[+\-]\d+'/ # single-quotes
|
594
|
+
emit(:backref, :name_recursion_ref_sq, text)
|
596
595
|
|
597
596
|
when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
|
598
|
-
emit(:backref, :number_recursion_ref_ab, text
|
597
|
+
emit(:backref, :number_recursion_ref_ab, text)
|
599
598
|
|
600
599
|
when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
|
601
|
-
emit(:backref, :number_recursion_ref_sq, text
|
600
|
+
emit(:backref, :number_recursion_ref_sq, text)
|
602
601
|
|
603
602
|
end
|
604
603
|
};
|
@@ -607,31 +606,31 @@
|
|
607
606
|
# Quantifiers
|
608
607
|
# ------------------------------------------------------------------------
|
609
608
|
zero_or_one {
|
610
|
-
case text =
|
611
|
-
when '?' ; emit(:quantifier, :zero_or_one, text
|
612
|
-
when '??'; emit(:quantifier, :zero_or_one_reluctant, text
|
613
|
-
when '?+'; emit(:quantifier, :zero_or_one_possessive, text
|
609
|
+
case text = copy(data, ts, te)
|
610
|
+
when '?' ; emit(:quantifier, :zero_or_one, text)
|
611
|
+
when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
|
612
|
+
when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
|
614
613
|
end
|
615
614
|
};
|
616
615
|
|
617
616
|
zero_or_more {
|
618
|
-
case text =
|
619
|
-
when '*' ; emit(:quantifier, :zero_or_more, text
|
620
|
-
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text
|
621
|
-
when '*+'; emit(:quantifier, :zero_or_more_possessive, text
|
617
|
+
case text = copy(data, ts, te)
|
618
|
+
when '*' ; emit(:quantifier, :zero_or_more, text)
|
619
|
+
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
|
620
|
+
when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
|
622
621
|
end
|
623
622
|
};
|
624
623
|
|
625
624
|
one_or_more {
|
626
|
-
case text =
|
627
|
-
when '+' ; emit(:quantifier, :one_or_more, text
|
628
|
-
when '+?'; emit(:quantifier, :one_or_more_reluctant, text
|
629
|
-
when '++'; emit(:quantifier, :one_or_more_possessive, text
|
625
|
+
case text = copy(data, ts, te)
|
626
|
+
when '+' ; emit(:quantifier, :one_or_more, text)
|
627
|
+
when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
|
628
|
+
when '++'; emit(:quantifier, :one_or_more_possessive, text)
|
630
629
|
end
|
631
630
|
};
|
632
631
|
|
633
632
|
quantifier_interval {
|
634
|
-
emit(:quantifier, :interval,
|
633
|
+
emit(:quantifier, :interval, copy(data, ts, te))
|
635
634
|
};
|
636
635
|
|
637
636
|
# Catch unmatched curly braces as literals
|
@@ -647,7 +646,7 @@
|
|
647
646
|
|
648
647
|
comment {
|
649
648
|
if free_spacing
|
650
|
-
emit(:free_space, :comment,
|
649
|
+
emit(:free_space, :comment, copy(data, ts, te))
|
651
650
|
else
|
652
651
|
# consume only the pound sign (#) and backtrack to do regular scanning
|
653
652
|
append_literal(data, ts, ts + 1)
|
@@ -657,7 +656,7 @@
|
|
657
656
|
|
658
657
|
space+ {
|
659
658
|
if free_spacing
|
660
|
-
emit(:free_space, :whitespace,
|
659
|
+
emit(:free_space, :whitespace, copy(data, ts, te))
|
661
660
|
else
|
662
661
|
append_literal(data, ts, te)
|
663
662
|
end
|
@@ -666,11 +665,7 @@
|
|
666
665
|
# Literal: any run of ASCII (pritable or non-printable), and/or UTF-8,
|
667
666
|
# except meta characters.
|
668
667
|
# ------------------------------------------------------------------------
|
669
|
-
(ascii_print -- space)+
|
670
|
-
ascii_nonprint+ |
|
671
|
-
utf8_2_byte+ |
|
672
|
-
utf8_3_byte+ |
|
673
|
-
utf8_4_byte+ {
|
668
|
+
(ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
|
674
669
|
append_literal(data, ts, te)
|
675
670
|
};
|
676
671
|
|
@@ -760,6 +755,7 @@ class Regexp::Scanner
|
|
760
755
|
self.set_depth = 0
|
761
756
|
self.group_depth = 0
|
762
757
|
self.conditional_stack = []
|
758
|
+
self.char_pos = 0
|
763
759
|
|
764
760
|
%% write data;
|
765
761
|
%% write init;
|
@@ -769,7 +765,7 @@ class Regexp::Scanner
|
|
769
765
|
testEof = testEof
|
770
766
|
|
771
767
|
if cs == re_scanner_error
|
772
|
-
text =
|
768
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
773
769
|
raise ScannerError.new("Scan error at '#{text}'")
|
774
770
|
end
|
775
771
|
|
@@ -797,22 +793,29 @@ class Regexp::Scanner
|
|
797
793
|
end
|
798
794
|
|
799
795
|
# Emits an array with the details of the scanned pattern
|
800
|
-
def emit(type, token, text
|
796
|
+
def emit(type, token, text)
|
801
797
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
802
798
|
|
803
799
|
emit_literal if literal
|
804
800
|
|
801
|
+
# Ragel runs with byte-based indices (ts, te). These are of little value to
|
802
|
+
# end-users, so we keep track of char-based indices and emit those instead.
|
803
|
+
ts_char_pos = char_pos
|
804
|
+
te_char_pos = char_pos + text.length
|
805
|
+
|
805
806
|
if block
|
806
|
-
block.call type, token, text,
|
807
|
+
block.call type, token, text, ts_char_pos, te_char_pos
|
807
808
|
end
|
808
809
|
|
809
|
-
tokens << [type, token, text,
|
810
|
+
tokens << [type, token, text, ts_char_pos, te_char_pos]
|
811
|
+
|
812
|
+
self.char_pos = te_char_pos
|
810
813
|
end
|
811
814
|
|
812
815
|
private
|
813
816
|
|
814
817
|
attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
|
815
|
-
:group_depth, :set_depth, :conditional_stack
|
818
|
+
:group_depth, :set_depth, :conditional_stack, :char_pos
|
816
819
|
|
817
820
|
def free_spacing?(input_object, options)
|
818
821
|
if options && !input_object.is_a?(String)
|
@@ -835,36 +838,25 @@ class Regexp::Scanner
|
|
835
838
|
end
|
836
839
|
|
837
840
|
# Copy from ts to te from data as text
|
838
|
-
def copy(data,
|
839
|
-
data[
|
840
|
-
end
|
841
|
-
|
842
|
-
# Copy from ts to te from data as text, returning an array with the text
|
843
|
-
# and the offsets used to copy it.
|
844
|
-
def text(data, ts, te, soff = 0)
|
845
|
-
[copy(data, ts-soff..te-1), ts-soff, te]
|
841
|
+
def copy(data, ts, te)
|
842
|
+
data[ts...te].pack('c*').force_encoding('utf-8')
|
846
843
|
end
|
847
844
|
|
848
845
|
# Appends one or more characters to the literal buffer, to be emitted later
|
849
|
-
# by a call to emit_literal.
|
846
|
+
# by a call to emit_literal.
|
850
847
|
def append_literal(data, ts, te)
|
851
848
|
self.literal = literal || []
|
852
|
-
literal <<
|
849
|
+
literal << copy(data, ts, te)
|
853
850
|
end
|
854
851
|
|
855
|
-
# Emits the literal run collected by calls to the append_literal method
|
856
|
-
# using the total start (ts) and end (te) offsets of the run.
|
852
|
+
# Emits the literal run collected by calls to the append_literal method.
|
857
853
|
def emit_literal
|
858
|
-
|
859
|
-
text = literal.map {|t| t[0]}.join
|
860
|
-
|
861
|
-
text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
|
862
|
-
|
854
|
+
text = literal.join
|
863
855
|
self.literal = nil
|
864
|
-
emit(:literal, :literal, text
|
856
|
+
emit(:literal, :literal, text)
|
865
857
|
end
|
866
858
|
|
867
|
-
def emit_options(text
|
859
|
+
def emit_options(text)
|
868
860
|
token = nil
|
869
861
|
|
870
862
|
# Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
|
@@ -890,14 +882,14 @@ class Regexp::Scanner
|
|
890
882
|
token = :options_switch
|
891
883
|
end
|
892
884
|
|
893
|
-
emit(:group, token, text
|
885
|
+
emit(:group, token, text)
|
894
886
|
end
|
895
887
|
|
896
888
|
def emit_meta_control_sequence(data, ts, te, token)
|
897
889
|
if data.last < 0x00 || data.last > 0x7F
|
898
890
|
validation_error(:sequence, 'escape', token.to_s)
|
899
891
|
end
|
900
|
-
emit(:escape, token,
|
892
|
+
emit(:escape, token, copy(data, ts-1, te))
|
901
893
|
end
|
902
894
|
|
903
895
|
# Centralizes and unifies the handling of validation related
|