regexp_parser 1.7.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +80 -1
- data/README.md +24 -12
- data/lib/regexp_parser/expression.rb +10 -19
- data/lib/regexp_parser/expression/classes/group.rb +17 -2
- data/lib/regexp_parser/expression/classes/root.rb +4 -16
- data/lib/regexp_parser/expression/quantifier.rb +9 -0
- data/lib/regexp_parser/expression/sequence.rb +0 -10
- data/lib/regexp_parser/lexer.rb +6 -6
- data/lib/regexp_parser/parser.rb +45 -12
- data/lib/regexp_parser/scanner.rb +1305 -1193
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +194 -171
- data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
- data/lib/regexp_parser/version.rb +1 -1
- data/regexp_parser.gemspec +1 -1
- data/spec/expression/base_spec.rb +10 -0
- data/spec/expression/to_s_spec.rb +16 -0
- data/spec/lexer/delimiters_spec.rb +68 -0
- data/spec/lexer/literals_spec.rb +24 -49
- data/spec/parser/escapes_spec.rb +1 -1
- data/spec/parser/options_spec.rb +28 -0
- data/spec/parser/quantifiers_spec.rb +16 -0
- data/spec/parser/set/ranges_spec.rb +3 -3
- data/spec/scanner/delimiters_spec.rb +52 -0
- data/spec/scanner/errors_spec.rb +0 -1
- data/spec/scanner/escapes_spec.rb +10 -0
- data/spec/scanner/free_space_spec.rb +32 -0
- data/spec/scanner/literals_spec.rb +28 -38
- data/spec/scanner/options_spec.rb +36 -0
- data/spec/scanner/quantifiers_spec.rb +18 -13
- data/spec/scanner/sets_spec.rb +8 -2
- metadata +65 -61
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
@@ -10,17 +10,17 @@
|
|
10
10
|
# --------------------------------------------------------------------------
|
11
11
|
char_type := |*
|
12
12
|
char_type_char {
|
13
|
-
case text =
|
14
|
-
when '\d'; emit(:type, :digit, text
|
15
|
-
when '\D'; emit(:type, :nondigit, text
|
16
|
-
when '\h'; emit(:type, :hex, text
|
17
|
-
when '\H'; emit(:type, :nonhex, text
|
18
|
-
when '\s'; emit(:type, :space, text
|
19
|
-
when '\S'; emit(:type, :nonspace, text
|
20
|
-
when '\w'; emit(:type, :word, text
|
21
|
-
when '\W'; emit(:type, :nonword, text
|
22
|
-
when '\R'; emit(:type, :linebreak, text
|
23
|
-
when '\X'; emit(:type, :xgrapheme, text
|
13
|
+
case text = copy(data, ts-1, te)
|
14
|
+
when '\d'; emit(:type, :digit, text)
|
15
|
+
when '\D'; emit(:type, :nondigit, text)
|
16
|
+
when '\h'; emit(:type, :hex, text)
|
17
|
+
when '\H'; emit(:type, :nonhex, text)
|
18
|
+
when '\s'; emit(:type, :space, text)
|
19
|
+
when '\S'; emit(:type, :nonspace, text)
|
20
|
+
when '\w'; emit(:type, :word, text)
|
21
|
+
when '\W'; emit(:type, :nonword, text)
|
22
|
+
when '\R'; emit(:type, :linebreak, text)
|
23
|
+
when '\X'; emit(:type, :xgrapheme, text)
|
24
24
|
end
|
25
25
|
fret;
|
26
26
|
};
|
@@ -14,7 +14,7 @@
|
|
14
14
|
unicode_property := |*
|
15
15
|
|
16
16
|
property_sequence < eof(premature_property_end) {
|
17
|
-
text =
|
17
|
+
text = copy(data, ts-1, te)
|
18
18
|
type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
|
19
19
|
|
20
20
|
name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
|
@@ -22,7 +22,7 @@
|
|
22
22
|
token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
|
23
23
|
raise UnknownUnicodePropertyError.new(name) unless token
|
24
24
|
|
25
|
-
self.emit(type, token.to_sym, text
|
25
|
+
self.emit(type, token.to_sym, text)
|
26
26
|
|
27
27
|
fret;
|
28
28
|
};
|
@@ -21,7 +21,7 @@
|
|
21
21
|
set_close = ']';
|
22
22
|
brackets = set_open | set_close;
|
23
23
|
|
24
|
-
comment = ('#' . [^\n]* . '\n');
|
24
|
+
comment = ('#' . [^\n]* . '\n'?);
|
25
25
|
|
26
26
|
class_name_posix = 'alnum' | 'alpha' | 'blank' |
|
27
27
|
'cntrl' | 'digit' | 'graph' |
|
@@ -62,13 +62,17 @@
|
|
62
62
|
quantifier_possessive = '?+' | '*+' | '++';
|
63
63
|
quantifier_mode = '?' | '+';
|
64
64
|
|
65
|
-
|
66
|
-
|
65
|
+
quantity_exact = (digit+);
|
66
|
+
quantity_minimum = (digit+) . ',';
|
67
|
+
quantity_maximum = ',' . (digit+);
|
68
|
+
quantity_range = (digit+) . ',' . (digit+);
|
69
|
+
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
|
70
|
+
quantity_maximum | quantity_range ) . range_close .
|
71
|
+
quantifier_mode?;
|
67
72
|
|
68
73
|
quantifiers = quantifier_greedy | quantifier_reluctant |
|
69
74
|
quantifier_possessive | quantifier_interval;
|
70
75
|
|
71
|
-
|
72
76
|
conditional = '(?(';
|
73
77
|
|
74
78
|
group_comment = '?#' . [^)]* . group_close;
|
@@ -114,7 +118,9 @@
|
|
114
118
|
curlies | parantheses | brackets |
|
115
119
|
line_anchor | quantifier_greedy;
|
116
120
|
|
117
|
-
|
121
|
+
literal_delimiters = ']' | '}';
|
122
|
+
|
123
|
+
ascii_print = ((0x20..0x7e) - meta_char - '#');
|
118
124
|
ascii_nonprint = (0x01..0x1f | 0x7f);
|
119
125
|
|
120
126
|
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
@@ -122,20 +128,20 @@
|
|
122
128
|
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
123
129
|
|
124
130
|
non_literal_escape = char_type_char | anchor_char | escaped_ascii |
|
125
|
-
|
131
|
+
keep_mark | [xucCM];
|
126
132
|
|
127
133
|
non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
|
128
134
|
multi_codepoint_char_type | [0-9cCM];
|
129
135
|
|
130
136
|
# EOF error, used where it can be detected
|
131
137
|
action premature_end_error {
|
132
|
-
text =
|
138
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
133
139
|
raise PrematureEndError.new( text )
|
134
140
|
}
|
135
141
|
|
136
142
|
# Invalid sequence error, used from sequences, like escapes and sets
|
137
143
|
action invalid_sequence_error {
|
138
|
-
text =
|
144
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
139
145
|
validation_error(:sequence, 'sequence', text)
|
140
146
|
}
|
141
147
|
|
@@ -150,7 +156,7 @@
|
|
150
156
|
# --------------------------------------------------------------------------
|
151
157
|
character_set := |*
|
152
158
|
set_close > (set_meta, 2) @set_closed {
|
153
|
-
emit(:set, :close,
|
159
|
+
emit(:set, :close, copy(data, ts, te))
|
154
160
|
if in_set?
|
155
161
|
fret;
|
156
162
|
else
|
@@ -159,8 +165,8 @@
|
|
159
165
|
};
|
160
166
|
|
161
167
|
'-]' @set_closed { # special case, emits two tokens
|
162
|
-
emit(:literal, :literal, copy(data, ts
|
163
|
-
emit(:set, :close, copy(data, ts+1
|
168
|
+
emit(:literal, :literal, copy(data, ts, te-1))
|
169
|
+
emit(:set, :close, copy(data, ts+1, te))
|
164
170
|
if in_set?
|
165
171
|
fret;
|
166
172
|
else
|
@@ -169,33 +175,33 @@
|
|
169
175
|
};
|
170
176
|
|
171
177
|
'-&&' { # special case, emits two tokens
|
172
|
-
emit(:literal, :literal, '-'
|
173
|
-
emit(:set, :intersection, '&&'
|
178
|
+
emit(:literal, :literal, '-')
|
179
|
+
emit(:set, :intersection, '&&')
|
174
180
|
};
|
175
181
|
|
176
182
|
'^' {
|
177
|
-
text =
|
183
|
+
text = copy(data, ts, te)
|
178
184
|
if tokens.last[1] == :open
|
179
|
-
emit(:set, :negate, text
|
185
|
+
emit(:set, :negate, text)
|
180
186
|
else
|
181
|
-
emit(:literal, :literal, text
|
187
|
+
emit(:literal, :literal, text)
|
182
188
|
end
|
183
189
|
};
|
184
190
|
|
185
191
|
'-' {
|
186
|
-
text =
|
192
|
+
text = copy(data, ts, te)
|
187
193
|
# ranges cant start with a subset or intersection/negation/range operator
|
188
194
|
if tokens.last[0] == :set
|
189
|
-
emit(:literal, :literal, text
|
195
|
+
emit(:literal, :literal, text)
|
190
196
|
else
|
191
|
-
emit(:set, :range, text
|
197
|
+
emit(:set, :range, text)
|
192
198
|
end
|
193
199
|
};
|
194
200
|
|
195
201
|
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
196
202
|
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
197
203
|
'&&' {
|
198
|
-
emit(:set, :intersection,
|
204
|
+
emit(:set, :intersection, copy(data, ts, te))
|
199
205
|
};
|
200
206
|
|
201
207
|
backslash {
|
@@ -203,12 +209,12 @@
|
|
203
209
|
};
|
204
210
|
|
205
211
|
set_open >(open_bracket, 1) >set_opened {
|
206
|
-
emit(:set, :open,
|
212
|
+
emit(:set, :open, copy(data, ts, te))
|
207
213
|
fcall character_set;
|
208
214
|
};
|
209
215
|
|
210
216
|
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
211
|
-
text =
|
217
|
+
text = copy(data, ts, te)
|
212
218
|
|
213
219
|
type = :posixclass
|
214
220
|
class_name = text[2..-3]
|
@@ -217,19 +223,19 @@
|
|
217
223
|
type = :nonposixclass
|
218
224
|
end
|
219
225
|
|
220
|
-
emit(type, class_name.to_sym, text
|
226
|
+
emit(type, class_name.to_sym, text)
|
221
227
|
};
|
222
228
|
|
223
229
|
collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
224
|
-
emit(:set, :collation,
|
230
|
+
emit(:set, :collation, copy(data, ts, te))
|
225
231
|
};
|
226
232
|
|
227
233
|
character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
228
|
-
emit(:set, :equivalent,
|
234
|
+
emit(:set, :equivalent, copy(data, ts, te))
|
229
235
|
};
|
230
236
|
|
231
237
|
meta_char > (set_meta, 1) {
|
232
|
-
emit(:literal, :literal,
|
238
|
+
emit(:literal, :literal, copy(data, ts, te))
|
233
239
|
};
|
234
240
|
|
235
241
|
any |
|
@@ -237,9 +243,8 @@
|
|
237
243
|
utf8_2_byte |
|
238
244
|
utf8_3_byte |
|
239
245
|
utf8_4_byte {
|
240
|
-
|
241
|
-
|
242
|
-
emit(:literal, :literal, char, *rest)
|
246
|
+
text = copy(data, ts, te)
|
247
|
+
emit(:literal, :literal, text)
|
243
248
|
};
|
244
249
|
*|;
|
245
250
|
|
@@ -247,7 +252,7 @@
|
|
247
252
|
# --------------------------------------------------------------------------
|
248
253
|
set_escape_sequence := |*
|
249
254
|
non_set_escape > (escaped_set_alpha, 2) {
|
250
|
-
emit(:escape, :literal,
|
255
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
251
256
|
fret;
|
252
257
|
};
|
253
258
|
|
@@ -263,33 +268,33 @@
|
|
263
268
|
# --------------------------------------------------------------------------
|
264
269
|
escape_sequence := |*
|
265
270
|
[1-9] {
|
266
|
-
text =
|
267
|
-
emit(:backref, :number, text
|
271
|
+
text = copy(data, ts-1, te)
|
272
|
+
emit(:backref, :number, text)
|
268
273
|
fret;
|
269
274
|
};
|
270
275
|
|
271
276
|
octal_sequence {
|
272
|
-
emit(:escape, :octal,
|
277
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
273
278
|
fret;
|
274
279
|
};
|
275
280
|
|
276
281
|
meta_char {
|
277
|
-
case text =
|
278
|
-
when '\.'; emit(:escape, :dot, text
|
279
|
-
when '\|'; emit(:escape, :alternation, text
|
280
|
-
when '\^'; emit(:escape, :bol, text
|
281
|
-
when '\$'; emit(:escape, :eol, text
|
282
|
-
when '\?'; emit(:escape, :zero_or_one, text
|
283
|
-
when '\*'; emit(:escape, :zero_or_more, text
|
284
|
-
when '\+'; emit(:escape, :one_or_more, text
|
285
|
-
when '\('; emit(:escape, :group_open, text
|
286
|
-
when '\)'; emit(:escape, :group_close, text
|
287
|
-
when '\{'; emit(:escape, :interval_open, text
|
288
|
-
when '\}'; emit(:escape, :interval_close, text
|
289
|
-
when '\['; emit(:escape, :set_open, text
|
290
|
-
when '\]'; emit(:escape, :set_close, text
|
282
|
+
case text = copy(data, ts-1, te)
|
283
|
+
when '\.'; emit(:escape, :dot, text)
|
284
|
+
when '\|'; emit(:escape, :alternation, text)
|
285
|
+
when '\^'; emit(:escape, :bol, text)
|
286
|
+
when '\$'; emit(:escape, :eol, text)
|
287
|
+
when '\?'; emit(:escape, :zero_or_one, text)
|
288
|
+
when '\*'; emit(:escape, :zero_or_more, text)
|
289
|
+
when '\+'; emit(:escape, :one_or_more, text)
|
290
|
+
when '\('; emit(:escape, :group_open, text)
|
291
|
+
when '\)'; emit(:escape, :group_close, text)
|
292
|
+
when '\{'; emit(:escape, :interval_open, text)
|
293
|
+
when '\}'; emit(:escape, :interval_close, text)
|
294
|
+
when '\['; emit(:escape, :set_open, text)
|
295
|
+
when '\]'; emit(:escape, :set_close, text)
|
291
296
|
when "\\\\";
|
292
|
-
emit(:escape, :backslash, text
|
297
|
+
emit(:escape, :backslash, text)
|
293
298
|
end
|
294
299
|
fret;
|
295
300
|
};
|
@@ -297,31 +302,31 @@
|
|
297
302
|
escaped_ascii > (escaped_alpha, 7) {
|
298
303
|
# \b is emitted as backspace only when inside a character set, otherwise
|
299
304
|
# it is a word boundary anchor. A syntax might "normalize" it if needed.
|
300
|
-
case text =
|
301
|
-
when '\a'; emit(:escape, :bell, text
|
302
|
-
when '\b'; emit(:escape, :backspace, text
|
303
|
-
when '\e'; emit(:escape, :escape, text
|
304
|
-
when '\f'; emit(:escape, :form_feed, text
|
305
|
-
when '\n'; emit(:escape, :newline, text
|
306
|
-
when '\r'; emit(:escape, :carriage, text
|
307
|
-
when '\t'; emit(:escape, :tab, text
|
308
|
-
when '\v'; emit(:escape, :vertical_tab, text
|
305
|
+
case text = copy(data, ts-1, te)
|
306
|
+
when '\a'; emit(:escape, :bell, text)
|
307
|
+
when '\b'; emit(:escape, :backspace, text)
|
308
|
+
when '\e'; emit(:escape, :escape, text)
|
309
|
+
when '\f'; emit(:escape, :form_feed, text)
|
310
|
+
when '\n'; emit(:escape, :newline, text)
|
311
|
+
when '\r'; emit(:escape, :carriage, text)
|
312
|
+
when '\t'; emit(:escape, :tab, text)
|
313
|
+
when '\v'; emit(:escape, :vertical_tab, text)
|
309
314
|
end
|
310
315
|
fret;
|
311
316
|
};
|
312
317
|
|
313
318
|
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
314
|
-
text =
|
319
|
+
text = copy(data, ts-1, te)
|
315
320
|
if text[2].chr == '{'
|
316
|
-
emit(:escape, :codepoint_list, text
|
321
|
+
emit(:escape, :codepoint_list, text)
|
317
322
|
else
|
318
|
-
emit(:escape, :codepoint, text
|
323
|
+
emit(:escape, :codepoint, text)
|
319
324
|
end
|
320
325
|
fret;
|
321
326
|
};
|
322
327
|
|
323
328
|
hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
|
324
|
-
emit(:escape, :hex,
|
329
|
+
emit(:escape, :hex, copy(data, ts-1, te))
|
325
330
|
fret;
|
326
331
|
};
|
327
332
|
|
@@ -351,8 +356,11 @@
|
|
351
356
|
fcall unicode_property;
|
352
357
|
};
|
353
358
|
|
354
|
-
(any -- non_literal_escape)
|
355
|
-
|
359
|
+
(any -- non_literal_escape) |
|
360
|
+
utf8_2_byte |
|
361
|
+
utf8_3_byte |
|
362
|
+
utf8_4_byte > (escaped_alpha, 1) {
|
363
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
356
364
|
fret;
|
357
365
|
};
|
358
366
|
*|;
|
@@ -362,9 +370,9 @@
|
|
362
370
|
# --------------------------------------------------------------------------
|
363
371
|
conditional_expression := |*
|
364
372
|
group_lookup . ')' {
|
365
|
-
text =
|
366
|
-
emit(:conditional, :condition, text
|
367
|
-
emit(:conditional, :condition_close, ')'
|
373
|
+
text = copy(data, ts, te-1)
|
374
|
+
emit(:conditional, :condition, text)
|
375
|
+
emit(:conditional, :condition_close, ')')
|
368
376
|
};
|
369
377
|
|
370
378
|
any {
|
@@ -381,46 +389,50 @@
|
|
381
389
|
# Meta characters
|
382
390
|
# ------------------------------------------------------------------------
|
383
391
|
dot {
|
384
|
-
emit(:meta, :dot,
|
392
|
+
emit(:meta, :dot, copy(data, ts, te))
|
385
393
|
};
|
386
394
|
|
387
395
|
alternation {
|
388
396
|
if conditional_stack.last == group_depth
|
389
|
-
emit(:conditional, :separator,
|
397
|
+
emit(:conditional, :separator, copy(data, ts, te))
|
390
398
|
else
|
391
|
-
emit(:meta, :alternation,
|
399
|
+
emit(:meta, :alternation, copy(data, ts, te))
|
392
400
|
end
|
393
401
|
};
|
394
402
|
|
395
403
|
# Anchors
|
396
404
|
# ------------------------------------------------------------------------
|
397
405
|
beginning_of_line {
|
398
|
-
emit(:anchor, :bol,
|
406
|
+
emit(:anchor, :bol, copy(data, ts, te))
|
399
407
|
};
|
400
408
|
|
401
409
|
end_of_line {
|
402
|
-
emit(:anchor, :eol,
|
410
|
+
emit(:anchor, :eol, copy(data, ts, te))
|
403
411
|
};
|
404
412
|
|
405
413
|
backslash . keep_mark > (backslashed, 4) {
|
406
|
-
emit(:keep, :mark,
|
414
|
+
emit(:keep, :mark, copy(data, ts, te))
|
407
415
|
};
|
408
416
|
|
409
417
|
backslash . anchor_char > (backslashed, 3) {
|
410
|
-
case text =
|
411
|
-
when '\\A'; emit(:anchor, :bos, text
|
412
|
-
when '\\z'; emit(:anchor, :eos, text
|
413
|
-
when '\\Z'; emit(:anchor, :eos_ob_eol, text
|
414
|
-
when '\\b'; emit(:anchor, :word_boundary, text
|
415
|
-
when '\\B'; emit(:anchor, :nonword_boundary, text
|
416
|
-
when '\\G'; emit(:anchor, :match_start, text
|
418
|
+
case text = copy(data, ts, te)
|
419
|
+
when '\\A'; emit(:anchor, :bos, text)
|
420
|
+
when '\\z'; emit(:anchor, :eos, text)
|
421
|
+
when '\\Z'; emit(:anchor, :eos_ob_eol, text)
|
422
|
+
when '\\b'; emit(:anchor, :word_boundary, text)
|
423
|
+
when '\\B'; emit(:anchor, :nonword_boundary, text)
|
424
|
+
when '\\G'; emit(:anchor, :match_start, text)
|
417
425
|
end
|
418
426
|
};
|
419
427
|
|
428
|
+
literal_delimiters {
|
429
|
+
append_literal(data, ts, te)
|
430
|
+
};
|
431
|
+
|
420
432
|
# Character sets
|
421
433
|
# ------------------------------------------------------------------------
|
422
434
|
set_open >set_opened {
|
423
|
-
emit(:set, :open,
|
435
|
+
emit(:set, :open, copy(data, ts, te))
|
424
436
|
fcall character_set;
|
425
437
|
};
|
426
438
|
|
@@ -429,12 +441,12 @@
|
|
429
441
|
# (?(condition)Y|N) conditional expression
|
430
442
|
# ------------------------------------------------------------------------
|
431
443
|
conditional {
|
432
|
-
text =
|
444
|
+
text = copy(data, ts, te)
|
433
445
|
|
434
446
|
conditional_stack << group_depth
|
435
447
|
|
436
|
-
emit(:conditional, :open, text[0..-2]
|
437
|
-
emit(:conditional, :condition_open, '('
|
448
|
+
emit(:conditional, :open, text[0..-2])
|
449
|
+
emit(:conditional, :condition_open, '(')
|
438
450
|
fcall conditional_expression;
|
439
451
|
};
|
440
452
|
|
@@ -445,7 +457,7 @@
|
|
445
457
|
# correct closing count.
|
446
458
|
# ------------------------------------------------------------------------
|
447
459
|
group_open . group_comment $group_closed {
|
448
|
-
emit(:group, :comment,
|
460
|
+
emit(:group, :comment, copy(data, ts, te))
|
449
461
|
};
|
450
462
|
|
451
463
|
# Expression options:
|
@@ -460,11 +472,11 @@
|
|
460
472
|
# (?imxdau-imx:subexp) option on/off for subexp
|
461
473
|
# ------------------------------------------------------------------------
|
462
474
|
group_open . group_options >group_opened {
|
463
|
-
text =
|
475
|
+
text = copy(data, ts, te)
|
464
476
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
465
477
|
raise InvalidGroupOption.new($1 || "-#{$2}", text)
|
466
478
|
end
|
467
|
-
emit_options(text
|
479
|
+
emit_options(text)
|
468
480
|
};
|
469
481
|
|
470
482
|
# Assertions
|
@@ -474,11 +486,11 @@
|
|
474
486
|
# (?<!subexp) negative look-behind
|
475
487
|
# ------------------------------------------------------------------------
|
476
488
|
group_open . assertion_type >group_opened {
|
477
|
-
case text =
|
478
|
-
when '(?='; emit(:assertion, :lookahead, text
|
479
|
-
when '(?!'; emit(:assertion, :nlookahead, text
|
480
|
-
when '(?<='; emit(:assertion, :lookbehind, text
|
481
|
-
when '(?<!'; emit(:assertion, :nlookbehind, text
|
489
|
+
case text = copy(data, ts, te)
|
490
|
+
when '(?='; emit(:assertion, :lookahead, text)
|
491
|
+
when '(?!'; emit(:assertion, :nlookahead, text)
|
492
|
+
when '(?<='; emit(:assertion, :lookbehind, text)
|
493
|
+
when '(?<!'; emit(:assertion, :nlookbehind, text)
|
482
494
|
end
|
483
495
|
};
|
484
496
|
|
@@ -491,32 +503,32 @@
|
|
491
503
|
# (subexp) captured group
|
492
504
|
# ------------------------------------------------------------------------
|
493
505
|
group_open . group_type >group_opened {
|
494
|
-
case text =
|
495
|
-
when '(?:'; emit(:group, :passive, text
|
496
|
-
when '(?>'; emit(:group, :atomic, text
|
497
|
-
when '(?~'; emit(:group, :absence, text
|
506
|
+
case text = copy(data, ts, te)
|
507
|
+
when '(?:'; emit(:group, :passive, text)
|
508
|
+
when '(?>'; emit(:group, :atomic, text)
|
509
|
+
when '(?~'; emit(:group, :absence, text)
|
498
510
|
|
499
511
|
when /^\(\?(?:<>|'')/
|
500
512
|
validation_error(:group, 'named group', 'name is empty')
|
501
513
|
|
502
514
|
when /^\(\?<\w*>/
|
503
|
-
emit(:group, :named_ab, text
|
515
|
+
emit(:group, :named_ab, text)
|
504
516
|
|
505
517
|
when /^\(\?'\w*'/
|
506
|
-
emit(:group, :named_sq, text
|
518
|
+
emit(:group, :named_sq, text)
|
507
519
|
|
508
520
|
end
|
509
521
|
};
|
510
522
|
|
511
523
|
group_open @group_opened {
|
512
|
-
text =
|
513
|
-
emit(:group, :capture, text
|
524
|
+
text = copy(data, ts, te)
|
525
|
+
emit(:group, :capture, text)
|
514
526
|
};
|
515
527
|
|
516
528
|
group_close @group_closed {
|
517
529
|
if conditional_stack.last == group_depth + 1
|
518
530
|
conditional_stack.pop
|
519
|
-
emit(:conditional, :close,
|
531
|
+
emit(:conditional, :close, copy(data, ts, te))
|
520
532
|
else
|
521
533
|
if spacing_stack.length > 1 &&
|
522
534
|
spacing_stack.last[:depth] == group_depth + 1
|
@@ -524,7 +536,7 @@
|
|
524
536
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
525
537
|
end
|
526
538
|
|
527
|
-
emit(:group, :close,
|
539
|
+
emit(:group, :close, copy(data, ts, te))
|
528
540
|
end
|
529
541
|
};
|
530
542
|
|
@@ -532,63 +544,63 @@
|
|
532
544
|
# Group backreference, named and numbered
|
533
545
|
# ------------------------------------------------------------------------
|
534
546
|
backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
|
535
|
-
case text =
|
547
|
+
case text = copy(data, ts, te)
|
536
548
|
when /^\\([gk])(<>|'')/ # angle brackets
|
537
549
|
validation_error(:backref, 'ref/call', 'ref ID is empty')
|
538
550
|
|
539
551
|
when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
|
540
552
|
if $1 == 'k'
|
541
|
-
emit(:backref, :name_ref_ab, text
|
553
|
+
emit(:backref, :name_ref_ab, text)
|
542
554
|
else
|
543
|
-
emit(:backref, :name_call_ab, text
|
555
|
+
emit(:backref, :name_call_ab, text)
|
544
556
|
end
|
545
557
|
|
546
558
|
when /^\\([gk])'[^\d+-]\w*'/ #single quotes
|
547
559
|
if $1 == 'k'
|
548
|
-
emit(:backref, :name_ref_sq, text
|
560
|
+
emit(:backref, :name_ref_sq, text)
|
549
561
|
else
|
550
|
-
emit(:backref, :name_call_sq, text
|
562
|
+
emit(:backref, :name_call_sq, text)
|
551
563
|
end
|
552
564
|
|
553
565
|
when /^\\([gk])<\d+>/ # angle-brackets
|
554
566
|
if $1 == 'k'
|
555
|
-
emit(:backref, :number_ref_ab, text
|
567
|
+
emit(:backref, :number_ref_ab, text)
|
556
568
|
else
|
557
|
-
emit(:backref, :number_call_ab, text
|
569
|
+
emit(:backref, :number_call_ab, text)
|
558
570
|
end
|
559
571
|
|
560
572
|
when /^\\([gk])'\d+'/ # single quotes
|
561
573
|
if $1 == 'k'
|
562
|
-
emit(:backref, :number_ref_sq, text
|
574
|
+
emit(:backref, :number_ref_sq, text)
|
563
575
|
else
|
564
|
-
emit(:backref, :number_call_sq, text
|
576
|
+
emit(:backref, :number_call_sq, text)
|
565
577
|
end
|
566
578
|
|
567
579
|
when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
|
568
580
|
if $1 == 'k'
|
569
|
-
emit(:backref, :number_rel_ref_ab, text
|
581
|
+
emit(:backref, :number_rel_ref_ab, text)
|
570
582
|
else
|
571
|
-
emit(:backref, :number_rel_call_ab, text
|
583
|
+
emit(:backref, :number_rel_call_ab, text)
|
572
584
|
end
|
573
585
|
|
574
586
|
when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
|
575
587
|
if $1 == 'k'
|
576
|
-
emit(:backref, :number_rel_ref_sq, text
|
588
|
+
emit(:backref, :number_rel_ref_sq, text)
|
577
589
|
else
|
578
|
-
emit(:backref, :number_rel_call_sq, text
|
590
|
+
emit(:backref, :number_rel_call_sq, text)
|
579
591
|
end
|
580
592
|
|
581
593
|
when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
|
582
|
-
emit(:backref, :name_recursion_ref_ab, text
|
594
|
+
emit(:backref, :name_recursion_ref_ab, text)
|
583
595
|
|
584
596
|
when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
|
585
|
-
emit(:backref, :name_recursion_ref_sq, text
|
597
|
+
emit(:backref, :name_recursion_ref_sq, text)
|
586
598
|
|
587
599
|
when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
|
588
|
-
emit(:backref, :number_recursion_ref_ab, text
|
600
|
+
emit(:backref, :number_recursion_ref_ab, text)
|
589
601
|
|
590
602
|
when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
|
591
|
-
emit(:backref, :number_recursion_ref_sq, text
|
603
|
+
emit(:backref, :number_recursion_ref_sq, text)
|
592
604
|
|
593
605
|
end
|
594
606
|
};
|
@@ -597,31 +609,36 @@
|
|
597
609
|
# Quantifiers
|
598
610
|
# ------------------------------------------------------------------------
|
599
611
|
zero_or_one {
|
600
|
-
case text =
|
601
|
-
when '?' ; emit(:quantifier, :zero_or_one, text
|
602
|
-
when '??'; emit(:quantifier, :zero_or_one_reluctant, text
|
603
|
-
when '?+'; emit(:quantifier, :zero_or_one_possessive, text
|
612
|
+
case text = copy(data, ts, te)
|
613
|
+
when '?' ; emit(:quantifier, :zero_or_one, text)
|
614
|
+
when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
|
615
|
+
when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
|
604
616
|
end
|
605
617
|
};
|
606
618
|
|
607
619
|
zero_or_more {
|
608
|
-
case text =
|
609
|
-
when '*' ; emit(:quantifier, :zero_or_more, text
|
610
|
-
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text
|
611
|
-
when '*+'; emit(:quantifier, :zero_or_more_possessive, text
|
620
|
+
case text = copy(data, ts, te)
|
621
|
+
when '*' ; emit(:quantifier, :zero_or_more, text)
|
622
|
+
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
|
623
|
+
when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
|
612
624
|
end
|
613
625
|
};
|
614
626
|
|
615
627
|
one_or_more {
|
616
|
-
case text =
|
617
|
-
when '+' ; emit(:quantifier, :one_or_more, text
|
618
|
-
when '+?'; emit(:quantifier, :one_or_more_reluctant, text
|
619
|
-
when '++'; emit(:quantifier, :one_or_more_possessive, text
|
628
|
+
case text = copy(data, ts, te)
|
629
|
+
when '+' ; emit(:quantifier, :one_or_more, text)
|
630
|
+
when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
|
631
|
+
when '++'; emit(:quantifier, :one_or_more_possessive, text)
|
620
632
|
end
|
621
633
|
};
|
622
634
|
|
623
|
-
quantifier_interval
|
624
|
-
emit(:quantifier, :interval,
|
635
|
+
quantifier_interval {
|
636
|
+
emit(:quantifier, :interval, copy(data, ts, te))
|
637
|
+
};
|
638
|
+
|
639
|
+
# Catch unmatched curly braces as literals
|
640
|
+
range_open {
|
641
|
+
append_literal(data, ts, te)
|
625
642
|
};
|
626
643
|
|
627
644
|
# Escaped sequences
|
@@ -632,15 +649,17 @@
|
|
632
649
|
|
633
650
|
comment {
|
634
651
|
if free_spacing
|
635
|
-
emit(:free_space, :comment,
|
652
|
+
emit(:free_space, :comment, copy(data, ts, te))
|
636
653
|
else
|
637
|
-
|
654
|
+
# consume only the pound sign (#) and backtrack to do regular scanning
|
655
|
+
append_literal(data, ts, ts + 1)
|
656
|
+
fexec ts + 1;
|
638
657
|
end
|
639
658
|
};
|
640
659
|
|
641
660
|
space+ {
|
642
661
|
if free_spacing
|
643
|
-
emit(:free_space, :whitespace,
|
662
|
+
emit(:free_space, :whitespace, copy(data, ts, te))
|
644
663
|
else
|
645
664
|
append_literal(data, ts, te)
|
646
665
|
end
|
@@ -722,21 +741,16 @@ class Regexp::Scanner
|
|
722
741
|
#
|
723
742
|
# This method may raise errors if a syntax error is encountered.
|
724
743
|
# --------------------------------------------------------------------------
|
725
|
-
def self.scan(input_object, &block)
|
726
|
-
new.scan(input_object, &block)
|
744
|
+
def self.scan(input_object, options: nil, &block)
|
745
|
+
new.scan(input_object, options: options, &block)
|
727
746
|
end
|
728
747
|
|
729
|
-
def scan(input_object, &block)
|
748
|
+
def scan(input_object, options: nil, &block)
|
730
749
|
self.literal = nil
|
731
750
|
stack = []
|
732
751
|
|
733
|
-
|
734
|
-
|
735
|
-
self.free_spacing = (input_object.options & Regexp::EXTENDED != 0)
|
736
|
-
else
|
737
|
-
input = input_object
|
738
|
-
self.free_spacing = false
|
739
|
-
end
|
752
|
+
input = input_object.is_a?(Regexp) ? input_object.source : input_object
|
753
|
+
self.free_spacing = free_spacing?(input_object, options)
|
740
754
|
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
741
755
|
|
742
756
|
data = input.unpack("c*") if input.is_a?(String)
|
@@ -748,6 +762,7 @@ class Regexp::Scanner
|
|
748
762
|
self.set_depth = 0
|
749
763
|
self.group_depth = 0
|
750
764
|
self.conditional_stack = []
|
765
|
+
self.char_pos = 0
|
751
766
|
|
752
767
|
%% write data;
|
753
768
|
%% write init;
|
@@ -757,7 +772,7 @@ class Regexp::Scanner
|
|
757
772
|
testEof = testEof
|
758
773
|
|
759
774
|
if cs == re_scanner_error
|
760
|
-
text =
|
775
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
761
776
|
raise ScannerError.new("Scan error at '#{text}'")
|
762
777
|
end
|
763
778
|
|
@@ -785,22 +800,41 @@ class Regexp::Scanner
|
|
785
800
|
end
|
786
801
|
|
787
802
|
# Emits an array with the details of the scanned pattern
|
788
|
-
def emit(type, token, text
|
803
|
+
def emit(type, token, text)
|
789
804
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
790
805
|
|
791
806
|
emit_literal if literal
|
792
807
|
|
808
|
+
# Ragel runs with byte-based indices (ts, te). These are of little value to
|
809
|
+
# end-users, so we keep track of char-based indices and emit those instead.
|
810
|
+
ts_char_pos = char_pos
|
811
|
+
te_char_pos = char_pos + text.length
|
812
|
+
|
793
813
|
if block
|
794
|
-
block.call type, token, text,
|
814
|
+
block.call type, token, text, ts_char_pos, te_char_pos
|
795
815
|
end
|
796
816
|
|
797
|
-
tokens << [type, token, text,
|
817
|
+
tokens << [type, token, text, ts_char_pos, te_char_pos]
|
818
|
+
|
819
|
+
self.char_pos = te_char_pos
|
798
820
|
end
|
799
821
|
|
800
822
|
private
|
801
823
|
|
802
824
|
attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
|
803
|
-
:group_depth, :set_depth, :conditional_stack
|
825
|
+
:group_depth, :set_depth, :conditional_stack, :char_pos
|
826
|
+
|
827
|
+
def free_spacing?(input_object, options)
|
828
|
+
if options && !input_object.is_a?(String)
|
829
|
+
raise ArgumentError, 'options cannot be supplied unless scanning a String'
|
830
|
+
end
|
831
|
+
|
832
|
+
options = input_object.options if input_object.is_a?(::Regexp)
|
833
|
+
|
834
|
+
return false unless options
|
835
|
+
|
836
|
+
options & Regexp::EXTENDED != 0
|
837
|
+
end
|
804
838
|
|
805
839
|
def in_group?
|
806
840
|
group_depth > 0
|
@@ -811,36 +845,25 @@ class Regexp::Scanner
|
|
811
845
|
end
|
812
846
|
|
813
847
|
# Copy from ts to te from data as text
|
814
|
-
def copy(data,
|
815
|
-
data[
|
816
|
-
end
|
817
|
-
|
818
|
-
# Copy from ts to te from data as text, returning an array with the text
|
819
|
-
# and the offsets used to copy it.
|
820
|
-
def text(data, ts, te, soff = 0)
|
821
|
-
[copy(data, ts-soff..te-1), ts-soff, te]
|
848
|
+
def copy(data, ts, te)
|
849
|
+
data[ts...te].pack('c*').force_encoding('utf-8')
|
822
850
|
end
|
823
851
|
|
824
852
|
# Appends one or more characters to the literal buffer, to be emitted later
|
825
|
-
# by a call to emit_literal.
|
853
|
+
# by a call to emit_literal.
|
826
854
|
def append_literal(data, ts, te)
|
827
855
|
self.literal = literal || []
|
828
|
-
literal <<
|
856
|
+
literal << copy(data, ts, te)
|
829
857
|
end
|
830
858
|
|
831
|
-
# Emits the literal run collected by calls to the append_literal method
|
832
|
-
# using the total start (ts) and end (te) offsets of the run.
|
859
|
+
# Emits the literal run collected by calls to the append_literal method.
|
833
860
|
def emit_literal
|
834
|
-
|
835
|
-
text = literal.map {|t| t[0]}.join
|
836
|
-
|
837
|
-
text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
|
838
|
-
|
861
|
+
text = literal.join
|
839
862
|
self.literal = nil
|
840
|
-
emit(:literal, :literal, text
|
863
|
+
emit(:literal, :literal, text)
|
841
864
|
end
|
842
865
|
|
843
|
-
def emit_options(text
|
866
|
+
def emit_options(text)
|
844
867
|
token = nil
|
845
868
|
|
846
869
|
# Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
|
@@ -866,14 +889,14 @@ class Regexp::Scanner
|
|
866
889
|
token = :options_switch
|
867
890
|
end
|
868
891
|
|
869
|
-
emit(:group, token, text
|
892
|
+
emit(:group, token, text)
|
870
893
|
end
|
871
894
|
|
872
895
|
def emit_meta_control_sequence(data, ts, te, token)
|
873
896
|
if data.last < 0x00 || data.last > 0x7F
|
874
897
|
validation_error(:sequence, 'escape', token.to_s)
|
875
898
|
end
|
876
|
-
emit(:escape, token,
|
899
|
+
emit(:escape, token, copy(data, ts-1, te))
|
877
900
|
end
|
878
901
|
|
879
902
|
# Centralizes and unifies the handling of validation related
|