regexp_parser 0.1.1 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/ChangeLog +45 -0
- data/Rakefile +12 -44
- data/VERSION.yml +5 -0
- data/lib/regexp_parser.rb +5 -38
- data/lib/regexp_parser/expression.rb +68 -221
- data/lib/regexp_parser/expression/classes/alternation.rb +47 -0
- data/lib/regexp_parser/expression/classes/anchor.rb +26 -0
- data/lib/regexp_parser/expression/classes/backref.rb +42 -0
- data/lib/regexp_parser/expression/classes/escape.rb +27 -0
- data/lib/regexp_parser/expression/classes/group.rb +67 -0
- data/lib/regexp_parser/expression/classes/literal.rb +7 -0
- data/lib/regexp_parser/expression/{property.rb → classes/property.rb} +1 -1
- data/lib/regexp_parser/expression/classes/root.rb +26 -0
- data/lib/regexp_parser/expression/classes/set.rb +100 -0
- data/lib/regexp_parser/expression/classes/type.rb +17 -0
- data/lib/regexp_parser/expression/quantifier.rb +26 -0
- data/lib/regexp_parser/expression/subexpression.rb +69 -0
- data/lib/regexp_parser/lexer.rb +4 -4
- data/lib/regexp_parser/parser.rb +31 -13
- data/lib/regexp_parser/scanner.rb +1849 -1488
- data/lib/regexp_parser/scanner/property.rl +7 -2
- data/lib/regexp_parser/scanner/scanner.rl +377 -191
- data/lib/regexp_parser/syntax.rb +7 -0
- data/lib/regexp_parser/syntax/ruby/1.8.6.rb +4 -4
- data/lib/regexp_parser/syntax/ruby/1.9.1.rb +9 -9
- data/lib/regexp_parser/syntax/ruby/2.0.0.rb +16 -0
- data/lib/regexp_parser/syntax/ruby/2.1.0.rb +13 -0
- data/lib/regexp_parser/syntax/tokens.rb +21 -320
- data/lib/regexp_parser/syntax/tokens/anchor.rb +17 -0
- data/lib/regexp_parser/syntax/tokens/assertion.rb +15 -0
- data/lib/regexp_parser/syntax/tokens/backref.rb +26 -0
- data/lib/regexp_parser/syntax/tokens/character_set.rb +48 -0
- data/lib/regexp_parser/syntax/tokens/character_type.rb +16 -0
- data/lib/regexp_parser/syntax/tokens/escape.rb +29 -0
- data/lib/regexp_parser/syntax/tokens/group.rb +22 -0
- data/lib/regexp_parser/syntax/tokens/meta.rb +15 -0
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +37 -0
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +204 -0
- data/lib/regexp_parser/token.rb +37 -0
- data/test/expression/test_all.rb +7 -0
- data/test/expression/test_base.rb +72 -0
- data/test/expression/test_clone.rb +144 -0
- data/test/{parser/test_expression.rb → expression/test_to_s.rb} +10 -10
- data/test/helpers.rb +1 -0
- data/test/parser/test_all.rb +1 -1
- data/test/parser/test_alternation.rb +35 -0
- data/test/parser/test_anchors.rb +2 -2
- data/test/parser/test_refcalls.rb +1 -1
- data/test/parser/test_sets.rb +54 -8
- data/test/scanner/test_anchors.rb +2 -2
- data/test/scanner/test_conditionals.rb +31 -0
- data/test/scanner/test_errors.rb +88 -8
- data/test/scanner/test_escapes.rb +4 -4
- data/test/scanner/test_groups.rb +7 -0
- data/test/scanner/test_quoting.rb +29 -0
- data/test/scanner/test_sets.rb +1 -0
- data/test/syntax/ruby/test_1.8.rb +3 -3
- data/test/test_all.rb +1 -1
- metadata +62 -48
- data/lib/regexp_parser/expression/set.rb +0 -59
@@ -58,7 +58,7 @@
|
|
58
58
|
unicode_property := |*
|
59
59
|
|
60
60
|
property_sequence < eof(premature_property_end) {
|
61
|
-
text = data
|
61
|
+
text = text(data, ts, te, 1).first
|
62
62
|
if in_set
|
63
63
|
type = :set
|
64
64
|
else
|
@@ -525,9 +525,14 @@
|
|
525
525
|
self.emit(type, :script_unknown, text, ts-1, te)
|
526
526
|
|
527
527
|
else
|
528
|
-
|
528
|
+
# Should this really be an error? Or would emitting
|
529
|
+
# an :unknown for the property be better?
|
530
|
+
#
|
531
|
+
# self.emit(type, :unknown, text, ts-1, te)
|
529
532
|
|
533
|
+
raise UnknownUnicodePropertyError.new(name)
|
530
534
|
end
|
535
|
+
|
531
536
|
fret;
|
532
537
|
};
|
533
538
|
*|;
|
@@ -28,6 +28,7 @@
|
|
28
28
|
|
29
29
|
class_posix = ('[:' . '^'? . class_name_posix . ':]');
|
30
30
|
|
31
|
+
|
31
32
|
# these are not supported in ruby, and need verification
|
32
33
|
collating_sequence = '[.' . (alpha | [\-])+ . '.]';
|
33
34
|
character_equivalent = '[=' . alpha . '=]';
|
@@ -41,14 +42,21 @@
|
|
41
42
|
octal_sequence = [0-7]{1,3};
|
42
43
|
|
43
44
|
hex_sequence = 'x' . xdigit{1,2};
|
45
|
+
hex_sequence_err = 'x' . [^0-9a-fA-F{];
|
44
46
|
wide_hex_sequence = 'x' . '{' . xdigit{1,8} . '}';
|
45
47
|
|
48
|
+
hex_or_not = (xdigit|[^0-9a-fA-F}]); # note closing curly at end
|
49
|
+
|
50
|
+
wide_hex_seq_invalid = 'x' . '{' . hex_or_not{1,9};
|
51
|
+
wide_hex_seq_empty = 'x' . '{' . (space+)? . '}';
|
52
|
+
|
46
53
|
codepoint_single = 'u' . xdigit{4};
|
47
54
|
codepoint_list = 'u{' . (xdigit{4} . space?)+'}';
|
48
55
|
codepoint_sequence = codepoint_single | codepoint_list;
|
49
56
|
|
50
|
-
control_sequence = ('c' | 'C-')
|
51
|
-
|
57
|
+
control_sequence = ('c' | 'C-');
|
58
|
+
|
59
|
+
meta_sequence = 'M-' . (backslash . control_sequence)?;
|
52
60
|
|
53
61
|
zero_or_one = '?' | '??' | '?+';
|
54
62
|
zero_or_more = '*' | '*?' | '*+';
|
@@ -59,11 +67,11 @@
|
|
59
67
|
quantifier_possessive = '?+' | '*+' | '++';
|
60
68
|
quantifier_mode = '?' | '+';
|
61
69
|
|
62
|
-
|
70
|
+
quantifier_interval = range_open . (digit+)? . ','? . (digit+)? .
|
63
71
|
range_close . quantifier_mode?;
|
64
72
|
|
65
73
|
quantifiers = quantifier_greedy | quantifier_reluctant |
|
66
|
-
quantifier_possessive |
|
74
|
+
quantifier_possessive | quantifier_interval;
|
67
75
|
|
68
76
|
|
69
77
|
group_comment = '?#' . [^)]+ . group_close;
|
@@ -76,10 +84,10 @@
|
|
76
84
|
assertion_lookbehind = '?<=';
|
77
85
|
assertion_nlookbehind = '?<!';
|
78
86
|
|
79
|
-
group_options = '?' .
|
87
|
+
group_options = '?' . [\-mix];
|
80
88
|
|
81
89
|
group_ref = [gk];
|
82
|
-
group_name =
|
90
|
+
group_name = (alnum . (alnum+)?)?;
|
83
91
|
group_number = '-'? . [1-9] . ([0-9]+)?;
|
84
92
|
group_level = [+\-] . [0-9]+;
|
85
93
|
|
@@ -113,7 +121,16 @@
|
|
113
121
|
group_ref | [xucCM];
|
114
122
|
|
115
123
|
# EOF error, used where it can be detected
|
116
|
-
action premature_end_error {
|
124
|
+
action premature_end_error {
|
125
|
+
text = ts ? copy(data, ts-1..-1) : data.pack('c*')
|
126
|
+
raise PrematureEndError.new( text )
|
127
|
+
}
|
128
|
+
|
129
|
+
# Invalid sequence error, used from sequences, like escapes and sets
|
130
|
+
action invalid_sequence_error {
|
131
|
+
text = ts ? copy(data, ts-1..-1) : data.pack('c*')
|
132
|
+
raise InvalidSequenceError.new('sequence', text)
|
133
|
+
}
|
117
134
|
|
118
135
|
# group (nesting) and set open/close actions
|
119
136
|
action group_opened { group_depth += 1; in_group = true }
|
@@ -127,7 +144,7 @@
|
|
127
144
|
set_type = set_depth > 1 ? :subset : :set
|
128
145
|
set_depth -= 1; in_set = set_depth > 0 ? true : false
|
129
146
|
|
130
|
-
|
147
|
+
emit(set_type, :close, *text(data, ts, te))
|
131
148
|
|
132
149
|
if set_depth == 0
|
133
150
|
fgoto main;
|
@@ -140,8 +157,8 @@
|
|
140
157
|
set_type = set_depth > 1 ? :subset : :set
|
141
158
|
set_depth -= 1; in_set = set_depth > 0 ? true : false
|
142
159
|
|
143
|
-
|
144
|
-
|
160
|
+
emit(set_type, :member, copy(data, ts..te-2), ts, te)
|
161
|
+
emit(set_type, :close, copy(data, ts+1..te-1), ts, te)
|
145
162
|
|
146
163
|
if set_depth == 0
|
147
164
|
fgoto main;
|
@@ -151,20 +168,20 @@
|
|
151
168
|
};
|
152
169
|
|
153
170
|
'^' {
|
154
|
-
text = data
|
171
|
+
text = text(data, ts, te).first
|
155
172
|
if @tokens.last[1] == :open
|
156
|
-
|
173
|
+
emit(set_type, :negate, text, ts, te)
|
157
174
|
else
|
158
|
-
|
175
|
+
emit(set_type, :member, text, ts, te)
|
159
176
|
end
|
160
177
|
};
|
161
178
|
|
162
179
|
alnum . '-' . alnum {
|
163
|
-
|
180
|
+
emit(set_type, :range, *text(data, ts, te))
|
164
181
|
};
|
165
182
|
|
166
183
|
'&&' {
|
167
|
-
|
184
|
+
emit(set_type, :intersection, *text(data, ts, te))
|
168
185
|
};
|
169
186
|
|
170
187
|
'\\' {
|
@@ -175,12 +192,12 @@
|
|
175
192
|
set_depth += 1; in_set = true
|
176
193
|
set_type = set_depth > 1 ? :subset : :set
|
177
194
|
|
178
|
-
|
195
|
+
emit(set_type, :open, *text(data, ts, te))
|
179
196
|
fcall character_set;
|
180
197
|
};
|
181
198
|
|
182
199
|
class_posix >(open_bracket, 1) @eof(premature_end_error) {
|
183
|
-
text = data
|
200
|
+
text = text(data, ts, te).first
|
184
201
|
|
185
202
|
class_name = text[2..-3]
|
186
203
|
if class_name[0].chr == '^'
|
@@ -188,21 +205,21 @@
|
|
188
205
|
end
|
189
206
|
|
190
207
|
token_sym = "class_#{class_name}".to_sym
|
191
|
-
|
208
|
+
emit(set_type, token_sym, text, ts, te)
|
192
209
|
};
|
193
210
|
|
194
211
|
collating_sequence >(open_bracket, 1) @eof(premature_end_error) {
|
195
|
-
|
212
|
+
emit(set_type, :collation, *text(data, ts, te))
|
196
213
|
};
|
197
214
|
|
198
215
|
character_equivalent >(open_bracket, 1) @eof(premature_end_error) {
|
199
|
-
|
216
|
+
emit(set_type, :equivalent, *text(data, ts, te))
|
200
217
|
};
|
201
218
|
|
202
219
|
# exclude the closing bracket as a cleaner workaround for dealing with the
|
203
220
|
# ambiguity caused upon exit from the unicode properties machine
|
204
221
|
meta_char -- ']' {
|
205
|
-
|
222
|
+
emit(set_type, :member, *text(data, ts, te))
|
206
223
|
};
|
207
224
|
|
208
225
|
any |
|
@@ -210,48 +227,48 @@
|
|
210
227
|
utf8_2_byte |
|
211
228
|
utf8_3_byte |
|
212
229
|
utf8_4_byte {
|
213
|
-
|
230
|
+
emit(set_type, :member, *text(data, ts, te))
|
214
231
|
};
|
215
232
|
*|;
|
216
233
|
|
217
234
|
# set escapes scanner
|
218
235
|
# --------------------------------------------------------------------------
|
219
236
|
set_escape_sequence := |*
|
220
|
-
'b' {
|
221
|
-
|
237
|
+
'b' > (escaped_set_alpha, 2) {
|
238
|
+
emit(set_type, :backspace, *text(data, ts, te, 1))
|
222
239
|
fret;
|
223
240
|
};
|
224
241
|
|
225
242
|
char_type {
|
226
|
-
case text = data
|
227
|
-
when '\d';
|
228
|
-
when '\D';
|
229
|
-
when '\h';
|
230
|
-
when '\H';
|
231
|
-
when '\s';
|
232
|
-
when '\S';
|
233
|
-
when '\w';
|
234
|
-
when '\W';
|
243
|
+
case text = text(data, ts, te, 1).first
|
244
|
+
when '\d'; emit(set_type, :type_digit, text, ts-1, te)
|
245
|
+
when '\D'; emit(set_type, :type_nondigit, text, ts-1, te)
|
246
|
+
when '\h'; emit(set_type, :type_hex, text, ts-1, te)
|
247
|
+
when '\H'; emit(set_type, :type_nonhex, text, ts-1, te)
|
248
|
+
when '\s'; emit(set_type, :type_space, text, ts-1, te)
|
249
|
+
when '\S'; emit(set_type, :type_nonspace, text, ts-1, te)
|
250
|
+
when '\w'; emit(set_type, :type_word, text, ts-1, te)
|
251
|
+
when '\W'; emit(set_type, :type_nonword, text, ts-1, te)
|
235
252
|
end
|
236
253
|
fret;
|
237
254
|
};
|
238
255
|
|
239
256
|
hex_sequence . '-\\' . hex_sequence {
|
240
|
-
|
257
|
+
emit(set_type, :range_hex, *text(data, ts, te, 1))
|
241
258
|
fret;
|
242
259
|
};
|
243
260
|
|
244
261
|
hex_sequence {
|
245
|
-
|
262
|
+
emit(set_type, :member_hex, *text(data, ts, te, 1))
|
246
263
|
fret;
|
247
264
|
};
|
248
265
|
|
249
266
|
meta_char | [\\\]\-\,] {
|
250
|
-
|
267
|
+
emit(set_type, :escape, *text(data, ts, te, 1))
|
251
268
|
fret;
|
252
269
|
};
|
253
270
|
|
254
|
-
property_char > (escaped_set_alpha,
|
271
|
+
property_char > (escaped_set_alpha, 3) {
|
255
272
|
fhold;
|
256
273
|
fnext character_set;
|
257
274
|
fcall unicode_property;
|
@@ -264,7 +281,7 @@
|
|
264
281
|
utf8_2_byte |
|
265
282
|
utf8_3_byte |
|
266
283
|
utf8_4_byte {
|
267
|
-
|
284
|
+
emit(set_type, :escape, *text(data, ts, te, 1))
|
268
285
|
fret;
|
269
286
|
};
|
270
287
|
*|;
|
@@ -274,33 +291,33 @@
|
|
274
291
|
# --------------------------------------------------------------------------
|
275
292
|
escape_sequence := |*
|
276
293
|
[1-9] {
|
277
|
-
text = data
|
278
|
-
|
294
|
+
text = text(data, ts, te, 1).first
|
295
|
+
emit(:backref, :number, text, ts-1, te)
|
279
296
|
fret;
|
280
297
|
};
|
281
298
|
|
282
299
|
octal_sequence {
|
283
|
-
|
300
|
+
emit(:escape, :octal, *text(data, ts, te, 1))
|
284
301
|
fret;
|
285
302
|
};
|
286
303
|
|
287
304
|
meta_char {
|
288
|
-
case text = data
|
289
|
-
when '\.';
|
290
|
-
when '\|';
|
291
|
-
when '\^';
|
292
|
-
when '\$';
|
293
|
-
when '\?';
|
294
|
-
when '\*';
|
295
|
-
when '\+';
|
296
|
-
when '\(';
|
297
|
-
when '\)';
|
298
|
-
when '\{';
|
299
|
-
when '\}';
|
300
|
-
when '\[';
|
301
|
-
when '\]';
|
305
|
+
case text = text(data, ts, te, 1).first
|
306
|
+
when '\.'; emit(:escape, :dot, text, ts-1, te)
|
307
|
+
when '\|'; emit(:escape, :alternation, text, ts-1, te)
|
308
|
+
when '\^'; emit(:escape, :bol, text, ts-1, te)
|
309
|
+
when '\$'; emit(:escape, :eol, text, ts-1, te)
|
310
|
+
when '\?'; emit(:escape, :zero_or_one, text, ts-1, te)
|
311
|
+
when '\*'; emit(:escape, :zero_or_more, text, ts-1, te)
|
312
|
+
when '\+'; emit(:escape, :one_or_more, text, ts-1, te)
|
313
|
+
when '\('; emit(:escape, :group_open, text, ts-1, te)
|
314
|
+
when '\)'; emit(:escape, :group_close, text, ts-1, te)
|
315
|
+
when '\{'; emit(:escape, :interval_open, text, ts-1, te)
|
316
|
+
when '\}'; emit(:escape, :interval_close, text, ts-1, te)
|
317
|
+
when '\['; emit(:escape, :set_open, text, ts-1, te)
|
318
|
+
when '\]'; emit(:escape, :set_close, text, ts-1, te)
|
302
319
|
when "\\\\";
|
303
|
-
|
320
|
+
emit(:escape, :backslash, text, ts-1, te)
|
304
321
|
end
|
305
322
|
fret;
|
306
323
|
};
|
@@ -308,46 +325,76 @@
|
|
308
325
|
escaped_ascii > (escaped_alpha, 7) {
|
309
326
|
# \b is emitted as backspace only when inside a character set, otherwise
|
310
327
|
# it is a word boundary anchor. A syntax might "normalize" it if needed.
|
311
|
-
case text = data
|
312
|
-
when '\a';
|
313
|
-
when '\e';
|
314
|
-
when '\f';
|
315
|
-
when '\n';
|
316
|
-
when '\r';
|
317
|
-
when '\s';
|
318
|
-
when '\t';
|
319
|
-
when '\v';
|
328
|
+
case text = text(data, ts, te, 1).first
|
329
|
+
when '\a'; emit(:escape, :bell, text, ts-1, te)
|
330
|
+
when '\e'; emit(:escape, :escape, text, ts-1, te)
|
331
|
+
when '\f'; emit(:escape, :form_feed, text, ts-1, te)
|
332
|
+
when '\n'; emit(:escape, :newline, text, ts-1, te)
|
333
|
+
when '\r'; emit(:escape, :carriage, text, ts-1, te)
|
334
|
+
when '\s'; emit(:escape, :space, text, ts-1, te)
|
335
|
+
when '\t'; emit(:escape, :tab, text, ts-1, te)
|
336
|
+
when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
|
320
337
|
end
|
321
338
|
fret;
|
322
339
|
};
|
323
340
|
|
324
|
-
codepoint_sequence > (escaped_alpha, 6) {
|
325
|
-
text = data
|
341
|
+
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
342
|
+
text = text(data, ts, te, 1).first
|
326
343
|
if text[2].chr == '{'
|
327
|
-
|
344
|
+
emit(:escape, :codepoint_list, text, ts-1, te)
|
328
345
|
else
|
329
|
-
|
346
|
+
emit(:escape, :codepoint, text, ts-1, te)
|
330
347
|
end
|
331
348
|
fret;
|
332
349
|
};
|
333
350
|
|
334
|
-
hex_sequence > (escaped_alpha, 5) {
|
335
|
-
|
351
|
+
hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
|
352
|
+
emit(:escape, :hex, *text(data, ts, te, 1))
|
353
|
+
fret;
|
354
|
+
};
|
355
|
+
|
356
|
+
wide_hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
|
357
|
+
emit(:escape, :hex_wide, *text(data, ts, te, 1))
|
336
358
|
fret;
|
337
359
|
};
|
338
360
|
|
339
|
-
|
340
|
-
self.emit(:escape, :hex_wide, data[ts-1..te-1].pack('c*'), ts-1, te)
|
361
|
+
hex_sequence_err @invalid_sequence_error {
|
341
362
|
fret;
|
342
363
|
};
|
343
364
|
|
344
|
-
|
345
|
-
|
365
|
+
(wide_hex_seq_invalid | wide_hex_seq_empty) {
|
366
|
+
raise InvalidSequenceError.new("wide hex sequence")
|
346
367
|
fret;
|
347
368
|
};
|
348
369
|
|
349
|
-
|
350
|
-
|
370
|
+
control_sequence >(escaped_alpha, 4) $eof(premature_end_error) {
|
371
|
+
if data[te]
|
372
|
+
c = data[te].chr
|
373
|
+
if c =~ /[\x00-\x7F]/
|
374
|
+
emit(:escape, :control, copy(data, ts-1..te), ts-1, te+1)
|
375
|
+
p += 1
|
376
|
+
else
|
377
|
+
raise InvalidSequenceError.new("control sequence")
|
378
|
+
end
|
379
|
+
else
|
380
|
+
raise PrematureEndError.new("control sequence")
|
381
|
+
end
|
382
|
+
fret;
|
383
|
+
};
|
384
|
+
|
385
|
+
meta_sequence >(backslashed, 3) $eof(premature_end_error) {
|
386
|
+
if data[te]
|
387
|
+
c = data[te].chr
|
388
|
+
if c =~ /[\x00-\x7F]/
|
389
|
+
emit(:escape, :meta_sequence, copy(data, ts-1..te), ts-1, te+1)
|
390
|
+
p += 1
|
391
|
+
else
|
392
|
+
raise InvalidSequenceError.new("meta sequence")
|
393
|
+
end
|
394
|
+
else
|
395
|
+
raise PrematureEndError.new("meta sequence")
|
396
|
+
end
|
397
|
+
fret;
|
351
398
|
};
|
352
399
|
|
353
400
|
property_char > (escaped_alpha, 2) {
|
@@ -357,7 +404,7 @@
|
|
357
404
|
};
|
358
405
|
|
359
406
|
(any -- non_literal_escape) > (escaped_alpha, 1) {
|
360
|
-
|
407
|
+
emit(:escape, :literal, *text(data, ts, te, 1))
|
361
408
|
fret;
|
362
409
|
};
|
363
410
|
*|;
|
@@ -370,32 +417,34 @@
|
|
370
417
|
# Meta characters
|
371
418
|
# ------------------------------------------------------------------------
|
372
419
|
dot {
|
373
|
-
|
420
|
+
emit(:meta, :dot, *text(data, ts, te))
|
374
421
|
};
|
375
422
|
|
376
423
|
alternation {
|
377
|
-
|
424
|
+
emit(:meta, :alternation, *text(data, ts, te))
|
378
425
|
};
|
379
426
|
|
380
427
|
# Anchors
|
381
428
|
# ------------------------------------------------------------------------
|
382
429
|
beginning_of_line {
|
383
|
-
|
430
|
+
emit(:anchor, :bol, *text(data, ts, te))
|
384
431
|
};
|
385
432
|
|
386
433
|
end_of_line {
|
387
|
-
|
434
|
+
emit(:anchor, :eol, *text(data, ts, te))
|
388
435
|
};
|
389
436
|
|
390
437
|
backslash . anchor_char > (backslashed, 3) {
|
391
|
-
case text = data
|
392
|
-
when '\\A';
|
393
|
-
when '\\z';
|
394
|
-
when '\\Z';
|
395
|
-
when '\\b';
|
396
|
-
when '\\B';
|
397
|
-
when '\\G';
|
398
|
-
else
|
438
|
+
case text = text(data, ts, te).first
|
439
|
+
when '\\A'; emit(:anchor, :bos, text, ts, te)
|
440
|
+
when '\\z'; emit(:anchor, :eos, text, ts, te)
|
441
|
+
when '\\Z'; emit(:anchor, :eos_ob_eol, text, ts, te)
|
442
|
+
when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
|
443
|
+
when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
|
444
|
+
when '\\G'; emit(:anchor, :match_start, text, ts, te)
|
445
|
+
else
|
446
|
+
raise ScannerError.new(
|
447
|
+
"Unexpected character in anchor at #{text} (char #{ts})")
|
399
448
|
end
|
400
449
|
};
|
401
450
|
|
@@ -406,15 +455,18 @@
|
|
406
455
|
# \w, \W word, non-word
|
407
456
|
# ------------------------------------------------------------------------
|
408
457
|
backslash . char_type > (backslashed, 2) {
|
409
|
-
case text = data
|
410
|
-
when '\\d';
|
411
|
-
when '\\D';
|
412
|
-
when '\\h';
|
413
|
-
when '\\H';
|
414
|
-
when '\\s';
|
415
|
-
when '\\S';
|
416
|
-
when '\\w';
|
417
|
-
when '\\W';
|
458
|
+
case text = text(data, ts, te).first
|
459
|
+
when '\\d'; emit(:type, :digit, text, ts, te)
|
460
|
+
when '\\D'; emit(:type, :nondigit, text, ts, te)
|
461
|
+
when '\\h'; emit(:type, :hex, text, ts, te)
|
462
|
+
when '\\H'; emit(:type, :nonhex, text, ts, te)
|
463
|
+
when '\\s'; emit(:type, :space, text, ts, te)
|
464
|
+
when '\\S'; emit(:type, :nonspace, text, ts, te)
|
465
|
+
when '\\w'; emit(:type, :word, text, ts, te)
|
466
|
+
when '\\W'; emit(:type, :nonword, text, ts, te)
|
467
|
+
else
|
468
|
+
raise ScannerError.new(
|
469
|
+
"Unexpected character in type at #{text} (char #{ts})")
|
418
470
|
end
|
419
471
|
};
|
420
472
|
|
@@ -425,7 +477,7 @@
|
|
425
477
|
set_depth += 1; in_set = true
|
426
478
|
set_type = set_depth > 1 ? :subset : :set
|
427
479
|
|
428
|
-
|
480
|
+
emit(set_type, :open, *text(data, ts, te))
|
429
481
|
fcall character_set;
|
430
482
|
};
|
431
483
|
|
@@ -435,7 +487,7 @@
|
|
435
487
|
# correct closing count.
|
436
488
|
# ------------------------------------------------------------------------
|
437
489
|
group_open . group_comment $group_closed {
|
438
|
-
|
490
|
+
emit(:group, :comment, *text(data, ts, te))
|
439
491
|
};
|
440
492
|
|
441
493
|
# Expression options:
|
@@ -447,21 +499,7 @@
|
|
447
499
|
# (?imx-imx:subexp) option on/off for subexp
|
448
500
|
# ------------------------------------------------------------------------
|
449
501
|
group_open . group_options >group_opened {
|
450
|
-
|
451
|
-
if data[te]
|
452
|
-
c = data[te].chr
|
453
|
-
if c == ':' # include the ':'
|
454
|
-
self.emit(:group, :options, data[ts..te].pack('c*'), ts, te+1)
|
455
|
-
p += 1
|
456
|
-
elsif c == ')' # just options by themselves
|
457
|
-
self.emit(:group, :options, data[ts..te-1].pack('c*'), ts, te)
|
458
|
-
else
|
459
|
-
raise ScannerError.new(
|
460
|
-
"Unexpected '#{c}' in options sequence, ':' or ')' expected")
|
461
|
-
end
|
462
|
-
else
|
463
|
-
raise PrematureEndError.new("options") unless data[te]
|
464
|
-
end
|
502
|
+
p = scan_options(p, data, ts, te)
|
465
503
|
};
|
466
504
|
|
467
505
|
# Assertions
|
@@ -471,11 +509,11 @@
|
|
471
509
|
# (?<!subexp) negative look-behind
|
472
510
|
# ------------------------------------------------------------------------
|
473
511
|
group_open . assertion_type >group_opened {
|
474
|
-
case text =
|
475
|
-
when '(?=';
|
476
|
-
when '(?!';
|
477
|
-
when '(?<=';
|
478
|
-
when '(?<!';
|
512
|
+
case text = text(data, ts, te).first
|
513
|
+
when '(?='; emit(:assertion, :lookahead, text, ts, te)
|
514
|
+
when '(?!'; emit(:assertion, :nlookahead, text, ts, te)
|
515
|
+
when '(?<='; emit(:assertion, :lookbehind, text, ts, te)
|
516
|
+
when '(?<!'; emit(:assertion, :nlookbehind, text, ts, te)
|
479
517
|
end
|
480
518
|
};
|
481
519
|
|
@@ -487,85 +525,103 @@
|
|
487
525
|
# (subexp) captured group
|
488
526
|
# ------------------------------------------------------------------------
|
489
527
|
group_open . group_type >group_opened {
|
490
|
-
case text =
|
491
|
-
when '(?:';
|
492
|
-
when '(?>';
|
493
|
-
|
494
|
-
when
|
495
|
-
|
496
|
-
|
497
|
-
|
528
|
+
case text = text(data, ts, te).first
|
529
|
+
when '(?:'; emit(:group, :passive, text, ts, te)
|
530
|
+
when '(?>'; emit(:group, :atomic, text, ts, te)
|
531
|
+
|
532
|
+
when /^\(\?<(\w*)>/
|
533
|
+
empty_name_error(:group, 'named group (ab)') if $1.empty?
|
534
|
+
|
535
|
+
emit(:group, :named_ab, text, ts, te)
|
536
|
+
|
537
|
+
when /^\(\?'(\w*)'/
|
538
|
+
empty_name_error(:group, 'named group (sq)') if $1.empty?
|
539
|
+
|
540
|
+
emit(:group, :named_sq, text, ts, te)
|
541
|
+
|
542
|
+
else
|
543
|
+
raise ScannerError.new(
|
544
|
+
"Unknown subexpression group format '#{text}'")
|
498
545
|
end
|
499
546
|
};
|
500
547
|
|
501
548
|
group_open @group_opened {
|
502
|
-
text =
|
503
|
-
|
549
|
+
text = text(data, ts, te).first
|
550
|
+
emit(:group, :capture, text, ts, te)
|
504
551
|
};
|
505
552
|
|
506
553
|
group_close @group_closed {
|
507
|
-
|
554
|
+
emit(:group, :close, *text(data, ts, te))
|
508
555
|
};
|
509
556
|
|
510
557
|
|
511
|
-
# Group
|
558
|
+
# Group backreference, named and numbered
|
512
559
|
# ------------------------------------------------------------------------
|
513
560
|
backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
|
514
|
-
case text = data
|
515
|
-
when
|
561
|
+
case text = text(data, ts, te).first
|
562
|
+
when /^\\([gk])<>/ # angle brackets
|
563
|
+
empty_backref_error("ref/call (ab)")
|
564
|
+
|
565
|
+
when /^\\([gk])''/ # single quotes
|
566
|
+
empty_backref_error("ref/call (sq)")
|
567
|
+
|
568
|
+
when /^\\([gk])<[^\d-](\w+)?>/ # angle-brackets
|
516
569
|
if $1 == 'k'
|
517
|
-
|
570
|
+
emit(:backref, :name_ref_ab, text, ts, te)
|
518
571
|
else
|
519
|
-
|
572
|
+
emit(:backref, :name_call_ab, text, ts, te)
|
520
573
|
end
|
521
574
|
|
522
|
-
when
|
575
|
+
when /^\\([gk])'[^\d-](\w+)?'/ #single quotes
|
523
576
|
if $1 == 'k'
|
524
|
-
|
577
|
+
emit(:backref, :name_ref_sq, text, ts, te)
|
525
578
|
else
|
526
|
-
|
579
|
+
emit(:backref, :name_call_sq, text, ts, te)
|
527
580
|
end
|
528
581
|
|
529
|
-
when
|
582
|
+
when /^\\([gk])<\d+>/ # angle-brackets
|
530
583
|
if $1 == 'k'
|
531
|
-
|
584
|
+
emit(:backref, :number_ref_ab, text, ts, te)
|
532
585
|
else
|
533
|
-
|
586
|
+
emit(:backref, :number_call_ab, text, ts, te)
|
534
587
|
end
|
535
588
|
|
536
|
-
when
|
589
|
+
when /^\\([gk])'\d+'/ # single quotes
|
537
590
|
if $1 == 'k'
|
538
|
-
|
591
|
+
emit(:backref, :number_ref_sq, text, ts, te)
|
539
592
|
else
|
540
|
-
|
593
|
+
emit(:backref, :number_call_sq, text, ts, te)
|
541
594
|
end
|
542
595
|
|
543
|
-
when
|
596
|
+
when /^\\([gk])<-\d+>/ # angle-brackets
|
544
597
|
if $1 == 'k'
|
545
|
-
|
598
|
+
emit(:backref, :number_rel_ref_ab, text, ts, te)
|
546
599
|
else
|
547
|
-
|
600
|
+
emit(:backref, :number_rel_call_ab, text, ts, te)
|
548
601
|
end
|
549
602
|
|
550
|
-
when
|
603
|
+
when /^\\([gk])'-\d+'/ # single quotes
|
551
604
|
if $1 == 'k'
|
552
|
-
|
605
|
+
emit(:backref, :number_rel_ref_sq, text, ts, te)
|
553
606
|
else
|
554
|
-
|
607
|
+
emit(:backref, :number_rel_call_sq, text, ts, te)
|
555
608
|
end
|
556
609
|
|
557
|
-
when
|
558
|
-
|
610
|
+
when /^\\k<[^\d-](\w+)?[+\-]\d+>/ # angle-brackets
|
611
|
+
emit(:backref, :name_nest_ref_ab, text, ts, te)
|
559
612
|
|
560
|
-
when
|
561
|
-
|
613
|
+
when /^\\k'[^\d-](\w+)?[+\-]\d+'/ # single-quotes
|
614
|
+
emit(:backref, :name_nest_ref_sq, text, ts, te)
|
562
615
|
|
563
|
-
when
|
564
|
-
|
616
|
+
when /^\\([gk])<\d+[+\-]\d+>/ # angle-brackets
|
617
|
+
emit(:backref, :number_nest_ref_ab, text, ts, te)
|
565
618
|
|
566
|
-
when
|
567
|
-
|
619
|
+
when /^\\([gk])'\d+[+\-]\d+'/ # single-quotes
|
620
|
+
emit(:backref, :number_nest_ref_sq, text, ts, te)
|
568
621
|
|
622
|
+
else
|
623
|
+
raise ScannerError.new(
|
624
|
+
"Unknown backreference format '#{text}'")
|
569
625
|
end
|
570
626
|
};
|
571
627
|
|
@@ -573,31 +629,31 @@
|
|
573
629
|
# Quantifiers
|
574
630
|
# ------------------------------------------------------------------------
|
575
631
|
zero_or_one {
|
576
|
-
case text =
|
577
|
-
when '?' ;
|
578
|
-
when '??';
|
579
|
-
when '?+';
|
632
|
+
case text = text(data, ts, te).first
|
633
|
+
when '?' ; emit(:quantifier, :zero_or_one, text, ts, te)
|
634
|
+
when '??'; emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
|
635
|
+
when '?+'; emit(:quantifier, :zero_or_one_possessive, text, ts, te)
|
580
636
|
end
|
581
637
|
};
|
582
|
-
|
638
|
+
|
583
639
|
zero_or_more {
|
584
|
-
case text =
|
585
|
-
when '*' ;
|
586
|
-
when '*?';
|
587
|
-
when '*+';
|
640
|
+
case text = text(data, ts, te).first
|
641
|
+
when '*' ; emit(:quantifier, :zero_or_more, text, ts, te)
|
642
|
+
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
|
643
|
+
when '*+'; emit(:quantifier, :zero_or_more_possessive, text, ts, te)
|
588
644
|
end
|
589
645
|
};
|
590
|
-
|
646
|
+
|
591
647
|
one_or_more {
|
592
|
-
case text =
|
593
|
-
when '+' ;
|
594
|
-
when '+?';
|
595
|
-
when '++';
|
648
|
+
case text = text(data, ts, te).first
|
649
|
+
when '+' ; emit(:quantifier, :one_or_more, text, ts, te)
|
650
|
+
when '+?'; emit(:quantifier, :one_or_more_reluctant, text, ts, te)
|
651
|
+
when '++'; emit(:quantifier, :one_or_more_possessive, text, ts, te)
|
596
652
|
end
|
597
653
|
};
|
598
654
|
|
599
|
-
|
600
|
-
|
655
|
+
quantifier_interval @err(premature_end_error) {
|
656
|
+
emit(:quantifier, :interval, *text(data, ts, te))
|
601
657
|
};
|
602
658
|
|
603
659
|
# Escaped sequences
|
@@ -614,35 +670,67 @@
|
|
614
670
|
utf8_2_byte+ |
|
615
671
|
utf8_3_byte+ |
|
616
672
|
utf8_4_byte+ {
|
617
|
-
|
673
|
+
append_literal(data, ts, te)
|
618
674
|
};
|
619
675
|
|
620
676
|
*|;
|
621
677
|
}%%
|
622
678
|
|
679
|
+
# THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
|
680
|
+
# This file was generated from scanner.rl
|
623
681
|
|
624
682
|
module Regexp::Scanner
|
625
683
|
%% write data;
|
626
684
|
|
685
|
+
# General scanner error (catch all)
|
627
686
|
class ScannerError < StandardError
|
628
687
|
def initialize(what)
|
629
688
|
super what
|
630
689
|
end
|
631
690
|
end
|
632
691
|
|
692
|
+
# Base for all scanner validation errors
|
693
|
+
class ValidationError < StandardError
|
694
|
+
def initialize(reason)
|
695
|
+
super reason
|
696
|
+
end
|
697
|
+
end
|
698
|
+
|
699
|
+
# Unexpected end of pattern
|
633
700
|
class PrematureEndError < ScannerError
|
634
701
|
def initialize(where = '')
|
635
|
-
super "Premature end of pattern
|
702
|
+
super "Premature end of pattern at #{where}"
|
703
|
+
end
|
704
|
+
end
|
705
|
+
|
706
|
+
# Invalid sequence format. Used for escape sequences, mainly.
|
707
|
+
class InvalidSequenceError < ValidationError
  # Builds the error message for an invalid sequence.
  #
  # what  - String naming the kind of sequence (defaults to 'sequence').
  # where - String with the offending pattern text. Optional; several
  #         callers only pass +what+.
  def initialize(what = 'sequence', where = '')
    message = "Invalid #{what}"
    # Only mention the location when one was given, so the message does
    # not end with a dangling " at ".
    message += " at #{where}" unless where.to_s.empty?
    super message
  end
end
|
712
|
+
|
713
|
+
# Invalid group. Used for named groups.
|
714
|
+
class InvalidGroupError < ValidationError
|
715
|
+
def initialize(what, reason)
|
716
|
+
super "Invalid #{what}, #{reason}."
|
717
|
+
end
|
718
|
+
end
|
719
|
+
|
720
|
+
# Invalid back reference. Used for name and number refs/calls.
|
721
|
+
class InvalidBackrefError < ValidationError
|
722
|
+
def initialize(what, reason)
|
723
|
+
super "Invalid back reference #{what}, #{reason}"
|
636
724
|
end
|
637
725
|
end
|
638
726
|
|
639
|
-
|
727
|
+
# The property name was not recognized by the scanner.
|
728
|
+
class UnknownUnicodePropertyError < ValidationError
|
640
729
|
def initialize(name)
|
641
730
|
super "Unknown unicode character property name #{name}"
|
642
731
|
end
|
643
732
|
end
|
644
733
|
|
645
|
-
|
646
734
|
# Scans the given regular expression text, or Regexp object and collects the
|
647
735
|
# emitted token into an array that gets returned at the end. If a block is
|
648
736
|
# given, it gets called for each emitted token.
|
@@ -665,42 +753,107 @@ module Regexp::Scanner
|
|
665
753
|
%% write init;
|
666
754
|
%% write exec;
|
667
755
|
|
756
|
+
if cs == re_scanner_error
|
757
|
+
text = ts ? copy(data, ts-1..-1) : data.pack('c*')
|
758
|
+
raise ScannerError.new("Scan error at '#{text}'")
|
759
|
+
end
|
760
|
+
|
668
761
|
raise PrematureEndError.new("(missing group closing paranthesis) "+
|
669
762
|
"[#{in_group}:#{group_depth}]") if in_group
|
670
763
|
raise PrematureEndError.new("(missing set closing bracket) "+
|
671
764
|
"[#{in_set}:#{set_depth}]") if in_set
|
672
765
|
|
673
766
|
# when the entire expression is a literal run
|
674
|
-
|
767
|
+
emit_literal if @literal
|
675
768
|
|
676
769
|
@tokens
|
677
770
|
end
|
678
771
|
|
679
|
-
|
680
|
-
|
772
|
+
private
|
773
|
+
|
774
|
+
# Ragel's regex-based scan of the group options introduced a lot of
|
775
|
+
# ambiguity, so we just ask it to find the beginning of what looks
|
776
|
+
# like an options run and handle the rest in here.
|
777
|
+
def self.scan_options(p, data, ts, te)
|
778
|
+
text = text(data, ts, te).first
|
779
|
+
|
780
|
+
options_char, options_length = true, 0
|
781
|
+
|
782
|
+
# Copy while we have option characters, the maximum is 7, for (?mix-mix,
|
783
|
+
# even though it doesn't make sense it is possible.
|
784
|
+
while options_char and options_length < 7
|
785
|
+
if data[te + options_length]
|
786
|
+
c = data[te + options_length].chr
|
787
|
+
|
788
|
+
if c =~ /[-mix]/
|
789
|
+
text << c ; p += 1 ; options_length += 1
|
790
|
+
else
|
791
|
+
options_char = false
|
792
|
+
end
|
793
|
+
else
|
794
|
+
raise PrematureEndError.new("expression options `#{text}'")
|
795
|
+
end
|
796
|
+
end
|
797
|
+
|
798
|
+
if data[te + options_length]
|
799
|
+
c = data[te + options_length].chr
|
800
|
+
|
801
|
+
if c == ':'
|
802
|
+
# Include the ':' in the options text
|
803
|
+
text << c ; p += 1 ; options_length += 1
|
804
|
+
emit(:group, :options, text, ts, te + options_length)
|
805
|
+
|
806
|
+
elsif c == ')'
|
807
|
+
# Don't include the closing ')', let group_close handle it.
|
808
|
+
emit(:group, :options, text, ts, te + options_length)
|
809
|
+
|
810
|
+
else
|
811
|
+
# Plain Regexp reports this as 'undefined group option'
|
812
|
+
raise ScannerError.new(
|
813
|
+
"Unexpected `#{c}' in options sequence, ':' or ')' expected")
|
814
|
+
end
|
815
|
+
else
|
816
|
+
raise PrematureEndError.new("expression options `#{text}'")
|
817
|
+
end
|
818
|
+
|
819
|
+
p # return the new value of the data pointer
|
820
|
+
end
|
821
|
+
|
822
|
+
# Copy the given range from data as text
|
823
|
+
def self.copy(data, range)
|
824
|
+
data[range].pack('c*')
|
825
|
+
end
|
826
|
+
|
827
|
+
# Copy from ts to te from data as text, returning an array with the text
|
828
|
+
# and the offsets used to copy it.
|
829
|
+
def self.text(data, ts, te, soff = 0)
|
830
|
+
[copy(data, ts-soff..te-1), ts-soff, te]
|
831
|
+
end
|
832
|
+
|
833
|
+
# Appends one or more characters to the literal buffer, to be emitted later
|
834
|
+
# by a call to emit_literal. Contents can be a mix of ASCII and UTF-8.
|
681
835
|
def self.append_literal(data, ts, te)
|
682
836
|
@literal ||= []
|
683
|
-
@literal <<
|
837
|
+
@literal << text(data, ts, te)
|
684
838
|
end
|
685
839
|
|
686
|
-
#
|
687
|
-
#
|
840
|
+
# Emits the literal run collected by calls to the append_literal method,
|
841
|
+
# using the total start (ts) and end (te) offsets of the run.
|
688
842
|
def self.emit_literal
|
689
843
|
ts, te = @literal.first[1], @literal.last[2]
|
690
844
|
text = @literal.map {|t| t[0]}.join
|
691
845
|
|
692
846
|
text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
|
693
847
|
|
694
|
-
self.emit(:literal, :literal, text, ts, te)
|
695
848
|
@literal = nil
|
849
|
+
emit(:literal, :literal, text, ts, te)
|
696
850
|
end
|
697
851
|
|
852
|
+
# Emits an array with the details of the scanned pattern
|
698
853
|
def self.emit(type, token, text, ts, te)
|
699
|
-
#puts "
|
854
|
+
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
700
855
|
|
701
|
-
if @literal
|
702
|
-
self.emit_literal
|
703
|
-
end
|
856
|
+
emit_literal if @literal
|
704
857
|
|
705
858
|
if @block
|
706
859
|
@block.call type, token, text, ts, te
|
@@ -709,4 +862,37 @@ module Regexp::Scanner
|
|
709
862
|
@tokens << [type, token, text, ts, te]
|
710
863
|
end
|
711
864
|
|
865
|
+
# Centralizes and unifies the handling of validation related
|
866
|
+
# errors.
|
867
|
+
# Builds the validation error matching +type+ and raises it.
#
# type   - Symbol selecting the error class: :group, :backref or :sequence.
#          Any other value falls back to a plain ValidationError.
# what   - String naming the construct that failed validation.
# reason - String explaining why it is invalid.
#
# Raises InvalidGroupError, InvalidBackrefError, InvalidSequenceError or
# ValidationError; it never returns normally.
def self.validation_error(type, what, reason)
  error =
    case type
    when :group    then InvalidGroupError.new(what, reason)
    when :backref  then InvalidBackrefError.new(what, reason)
    when :sequence then InvalidSequenceError.new(what, reason)
    else
      # Keep the details in the fallback message instead of discarding
      # them in favour of the bare word 'expression'.
      ValidationError.new("Invalid #{what}, #{reason}")
    end

  # TODO: configuration option to treat scanner level validation
  # errors as warnings or ignore them. Until that exists the error is
  # always raised.
  raise error
end
|
887
|
+
|
888
|
+
# Used for references with an empty name or number
|
889
|
+
# Reports a back reference or subexpression call whose ID is empty.
#
# The scanner calls this with a single argument, the description of the
# reference (e.g. "ref/call (ab)"). The older two-argument form
# (type, what) is still accepted; its first argument is ignored, as
# before.
#
# type - When called with one argument, the description String;
#        otherwise unused.
# what - Description String when called with two arguments.
#
# Delegates to validation_error with the :backref type.
def self.empty_backref_error(type, what = nil)
  what ||= type
  validation_error(:backref, what, 'ref ID is empty')
end
|
892
|
+
|
893
|
+
# Used for named expressions with an empty name
|
894
|
+
def self.empty_name_error(type, what)
|
895
|
+
validation_error(type, what, 'name is empty')
|
896
|
+
end
|
897
|
+
|
712
898
|
end # module Regexp::Scanner
|