regexp_parser 0.1.1 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ChangeLog +45 -0
- data/Rakefile +12 -44
- data/VERSION.yml +5 -0
- data/lib/regexp_parser.rb +5 -38
- data/lib/regexp_parser/expression.rb +68 -221
- data/lib/regexp_parser/expression/classes/alternation.rb +47 -0
- data/lib/regexp_parser/expression/classes/anchor.rb +26 -0
- data/lib/regexp_parser/expression/classes/backref.rb +42 -0
- data/lib/regexp_parser/expression/classes/escape.rb +27 -0
- data/lib/regexp_parser/expression/classes/group.rb +67 -0
- data/lib/regexp_parser/expression/classes/literal.rb +7 -0
- data/lib/regexp_parser/expression/{property.rb → classes/property.rb} +1 -1
- data/lib/regexp_parser/expression/classes/root.rb +26 -0
- data/lib/regexp_parser/expression/classes/set.rb +100 -0
- data/lib/regexp_parser/expression/classes/type.rb +17 -0
- data/lib/regexp_parser/expression/quantifier.rb +26 -0
- data/lib/regexp_parser/expression/subexpression.rb +69 -0
- data/lib/regexp_parser/lexer.rb +4 -4
- data/lib/regexp_parser/parser.rb +31 -13
- data/lib/regexp_parser/scanner.rb +1849 -1488
- data/lib/regexp_parser/scanner/property.rl +7 -2
- data/lib/regexp_parser/scanner/scanner.rl +377 -191
- data/lib/regexp_parser/syntax.rb +7 -0
- data/lib/regexp_parser/syntax/ruby/1.8.6.rb +4 -4
- data/lib/regexp_parser/syntax/ruby/1.9.1.rb +9 -9
- data/lib/regexp_parser/syntax/ruby/2.0.0.rb +16 -0
- data/lib/regexp_parser/syntax/ruby/2.1.0.rb +13 -0
- data/lib/regexp_parser/syntax/tokens.rb +21 -320
- data/lib/regexp_parser/syntax/tokens/anchor.rb +17 -0
- data/lib/regexp_parser/syntax/tokens/assertion.rb +15 -0
- data/lib/regexp_parser/syntax/tokens/backref.rb +26 -0
- data/lib/regexp_parser/syntax/tokens/character_set.rb +48 -0
- data/lib/regexp_parser/syntax/tokens/character_type.rb +16 -0
- data/lib/regexp_parser/syntax/tokens/escape.rb +29 -0
- data/lib/regexp_parser/syntax/tokens/group.rb +22 -0
- data/lib/regexp_parser/syntax/tokens/meta.rb +15 -0
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +37 -0
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +204 -0
- data/lib/regexp_parser/token.rb +37 -0
- data/test/expression/test_all.rb +7 -0
- data/test/expression/test_base.rb +72 -0
- data/test/expression/test_clone.rb +144 -0
- data/test/{parser/test_expression.rb → expression/test_to_s.rb} +10 -10
- data/test/helpers.rb +1 -0
- data/test/parser/test_all.rb +1 -1
- data/test/parser/test_alternation.rb +35 -0
- data/test/parser/test_anchors.rb +2 -2
- data/test/parser/test_refcalls.rb +1 -1
- data/test/parser/test_sets.rb +54 -8
- data/test/scanner/test_anchors.rb +2 -2
- data/test/scanner/test_conditionals.rb +31 -0
- data/test/scanner/test_errors.rb +88 -8
- data/test/scanner/test_escapes.rb +4 -4
- data/test/scanner/test_groups.rb +7 -0
- data/test/scanner/test_quoting.rb +29 -0
- data/test/scanner/test_sets.rb +1 -0
- data/test/syntax/ruby/test_1.8.rb +3 -3
- data/test/test_all.rb +1 -1
- metadata +62 -48
- data/lib/regexp_parser/expression/set.rb +0 -59
@@ -58,7 +58,7 @@
|
|
58
58
|
unicode_property := |*
|
59
59
|
|
60
60
|
property_sequence < eof(premature_property_end) {
|
61
|
-
text = data
|
61
|
+
text = text(data, ts, te, 1).first
|
62
62
|
if in_set
|
63
63
|
type = :set
|
64
64
|
else
|
@@ -525,9 +525,14 @@
|
|
525
525
|
self.emit(type, :script_unknown, text, ts-1, te)
|
526
526
|
|
527
527
|
else
|
528
|
-
|
528
|
+
# Should this really be an error? Or would emitting
|
529
|
+
# an :unknown for the property be better?
|
530
|
+
#
|
531
|
+
# self.emit(type, :unknown, text, ts-1, te)
|
529
532
|
|
533
|
+
raise UnknownUnicodePropertyError.new(name)
|
530
534
|
end
|
535
|
+
|
531
536
|
fret;
|
532
537
|
};
|
533
538
|
*|;
|
@@ -28,6 +28,7 @@
|
|
28
28
|
|
29
29
|
class_posix = ('[:' . '^'? . class_name_posix . ':]');
|
30
30
|
|
31
|
+
|
31
32
|
# these are not supported in ruby, and need verification
|
32
33
|
collating_sequence = '[.' . (alpha | [\-])+ . '.]';
|
33
34
|
character_equivalent = '[=' . alpha . '=]';
|
@@ -41,14 +42,21 @@
|
|
41
42
|
octal_sequence = [0-7]{1,3};
|
42
43
|
|
43
44
|
hex_sequence = 'x' . xdigit{1,2};
|
45
|
+
hex_sequence_err = 'x' . [^0-9a-fA-F{];
|
44
46
|
wide_hex_sequence = 'x' . '{' . xdigit{1,8} . '}';
|
45
47
|
|
48
|
+
hex_or_not = (xdigit|[^0-9a-fA-F}]); # note closing curly at end
|
49
|
+
|
50
|
+
wide_hex_seq_invalid = 'x' . '{' . hex_or_not{1,9};
|
51
|
+
wide_hex_seq_empty = 'x' . '{' . (space+)? . '}';
|
52
|
+
|
46
53
|
codepoint_single = 'u' . xdigit{4};
|
47
54
|
codepoint_list = 'u{' . (xdigit{4} . space?)+'}';
|
48
55
|
codepoint_sequence = codepoint_single | codepoint_list;
|
49
56
|
|
50
|
-
control_sequence = ('c' | 'C-')
|
51
|
-
|
57
|
+
control_sequence = ('c' | 'C-');
|
58
|
+
|
59
|
+
meta_sequence = 'M-' . (backslash . control_sequence)?;
|
52
60
|
|
53
61
|
zero_or_one = '?' | '??' | '?+';
|
54
62
|
zero_or_more = '*' | '*?' | '*+';
|
@@ -59,11 +67,11 @@
|
|
59
67
|
quantifier_possessive = '?+' | '*+' | '++';
|
60
68
|
quantifier_mode = '?' | '+';
|
61
69
|
|
62
|
-
|
70
|
+
quantifier_interval = range_open . (digit+)? . ','? . (digit+)? .
|
63
71
|
range_close . quantifier_mode?;
|
64
72
|
|
65
73
|
quantifiers = quantifier_greedy | quantifier_reluctant |
|
66
|
-
quantifier_possessive |
|
74
|
+
quantifier_possessive | quantifier_interval;
|
67
75
|
|
68
76
|
|
69
77
|
group_comment = '?#' . [^)]+ . group_close;
|
@@ -76,10 +84,10 @@
|
|
76
84
|
assertion_lookbehind = '?<=';
|
77
85
|
assertion_nlookbehind = '?<!';
|
78
86
|
|
79
|
-
group_options = '?' .
|
87
|
+
group_options = '?' . [\-mix];
|
80
88
|
|
81
89
|
group_ref = [gk];
|
82
|
-
group_name =
|
90
|
+
group_name = (alnum . (alnum+)?)?;
|
83
91
|
group_number = '-'? . [1-9] . ([0-9]+)?;
|
84
92
|
group_level = [+\-] . [0-9]+;
|
85
93
|
|
@@ -113,7 +121,16 @@
|
|
113
121
|
group_ref | [xucCM];
|
114
122
|
|
115
123
|
# EOF error, used where it can be detected
|
116
|
-
action premature_end_error {
|
124
|
+
action premature_end_error {
|
125
|
+
text = ts ? copy(data, ts-1..-1) : data.pack('c*')
|
126
|
+
raise PrematureEndError.new( text )
|
127
|
+
}
|
128
|
+
|
129
|
+
# Invalid sequence error, used from sequences, like escapes and sets
|
130
|
+
action invalid_sequence_error {
|
131
|
+
text = ts ? copy(data, ts-1..-1) : data.pack('c*')
|
132
|
+
raise InvalidSequenceError.new('sequence', text)
|
133
|
+
}
|
117
134
|
|
118
135
|
# group (nesting) and set open/close actions
|
119
136
|
action group_opened { group_depth += 1; in_group = true }
|
@@ -127,7 +144,7 @@
|
|
127
144
|
set_type = set_depth > 1 ? :subset : :set
|
128
145
|
set_depth -= 1; in_set = set_depth > 0 ? true : false
|
129
146
|
|
130
|
-
|
147
|
+
emit(set_type, :close, *text(data, ts, te))
|
131
148
|
|
132
149
|
if set_depth == 0
|
133
150
|
fgoto main;
|
@@ -140,8 +157,8 @@
|
|
140
157
|
set_type = set_depth > 1 ? :subset : :set
|
141
158
|
set_depth -= 1; in_set = set_depth > 0 ? true : false
|
142
159
|
|
143
|
-
|
144
|
-
|
160
|
+
emit(set_type, :member, copy(data, ts..te-2), ts, te)
|
161
|
+
emit(set_type, :close, copy(data, ts+1..te-1), ts, te)
|
145
162
|
|
146
163
|
if set_depth == 0
|
147
164
|
fgoto main;
|
@@ -151,20 +168,20 @@
|
|
151
168
|
};
|
152
169
|
|
153
170
|
'^' {
|
154
|
-
text = data
|
171
|
+
text = text(data, ts, te).first
|
155
172
|
if @tokens.last[1] == :open
|
156
|
-
|
173
|
+
emit(set_type, :negate, text, ts, te)
|
157
174
|
else
|
158
|
-
|
175
|
+
emit(set_type, :member, text, ts, te)
|
159
176
|
end
|
160
177
|
};
|
161
178
|
|
162
179
|
alnum . '-' . alnum {
|
163
|
-
|
180
|
+
emit(set_type, :range, *text(data, ts, te))
|
164
181
|
};
|
165
182
|
|
166
183
|
'&&' {
|
167
|
-
|
184
|
+
emit(set_type, :intersection, *text(data, ts, te))
|
168
185
|
};
|
169
186
|
|
170
187
|
'\\' {
|
@@ -175,12 +192,12 @@
|
|
175
192
|
set_depth += 1; in_set = true
|
176
193
|
set_type = set_depth > 1 ? :subset : :set
|
177
194
|
|
178
|
-
|
195
|
+
emit(set_type, :open, *text(data, ts, te))
|
179
196
|
fcall character_set;
|
180
197
|
};
|
181
198
|
|
182
199
|
class_posix >(open_bracket, 1) @eof(premature_end_error) {
|
183
|
-
text = data
|
200
|
+
text = text(data, ts, te).first
|
184
201
|
|
185
202
|
class_name = text[2..-3]
|
186
203
|
if class_name[0].chr == '^'
|
@@ -188,21 +205,21 @@
|
|
188
205
|
end
|
189
206
|
|
190
207
|
token_sym = "class_#{class_name}".to_sym
|
191
|
-
|
208
|
+
emit(set_type, token_sym, text, ts, te)
|
192
209
|
};
|
193
210
|
|
194
211
|
collating_sequence >(open_bracket, 1) @eof(premature_end_error) {
|
195
|
-
|
212
|
+
emit(set_type, :collation, *text(data, ts, te))
|
196
213
|
};
|
197
214
|
|
198
215
|
character_equivalent >(open_bracket, 1) @eof(premature_end_error) {
|
199
|
-
|
216
|
+
emit(set_type, :equivalent, *text(data, ts, te))
|
200
217
|
};
|
201
218
|
|
202
219
|
# exclude the closing bracket as a cleaner workaround for dealing with the
|
203
220
|
# ambiguity caused upon exit from the unicode properties machine
|
204
221
|
meta_char -- ']' {
|
205
|
-
|
222
|
+
emit(set_type, :member, *text(data, ts, te))
|
206
223
|
};
|
207
224
|
|
208
225
|
any |
|
@@ -210,48 +227,48 @@
|
|
210
227
|
utf8_2_byte |
|
211
228
|
utf8_3_byte |
|
212
229
|
utf8_4_byte {
|
213
|
-
|
230
|
+
emit(set_type, :member, *text(data, ts, te))
|
214
231
|
};
|
215
232
|
*|;
|
216
233
|
|
217
234
|
# set escapes scanner
|
218
235
|
# --------------------------------------------------------------------------
|
219
236
|
set_escape_sequence := |*
|
220
|
-
'b' {
|
221
|
-
|
237
|
+
'b' > (escaped_set_alpha, 2) {
|
238
|
+
emit(set_type, :backspace, *text(data, ts, te, 1))
|
222
239
|
fret;
|
223
240
|
};
|
224
241
|
|
225
242
|
char_type {
|
226
|
-
case text = data
|
227
|
-
when '\d';
|
228
|
-
when '\D';
|
229
|
-
when '\h';
|
230
|
-
when '\H';
|
231
|
-
when '\s';
|
232
|
-
when '\S';
|
233
|
-
when '\w';
|
234
|
-
when '\W';
|
243
|
+
case text = text(data, ts, te, 1).first
|
244
|
+
when '\d'; emit(set_type, :type_digit, text, ts-1, te)
|
245
|
+
when '\D'; emit(set_type, :type_nondigit, text, ts-1, te)
|
246
|
+
when '\h'; emit(set_type, :type_hex, text, ts-1, te)
|
247
|
+
when '\H'; emit(set_type, :type_nonhex, text, ts-1, te)
|
248
|
+
when '\s'; emit(set_type, :type_space, text, ts-1, te)
|
249
|
+
when '\S'; emit(set_type, :type_nonspace, text, ts-1, te)
|
250
|
+
when '\w'; emit(set_type, :type_word, text, ts-1, te)
|
251
|
+
when '\W'; emit(set_type, :type_nonword, text, ts-1, te)
|
235
252
|
end
|
236
253
|
fret;
|
237
254
|
};
|
238
255
|
|
239
256
|
hex_sequence . '-\\' . hex_sequence {
|
240
|
-
|
257
|
+
emit(set_type, :range_hex, *text(data, ts, te, 1))
|
241
258
|
fret;
|
242
259
|
};
|
243
260
|
|
244
261
|
hex_sequence {
|
245
|
-
|
262
|
+
emit(set_type, :member_hex, *text(data, ts, te, 1))
|
246
263
|
fret;
|
247
264
|
};
|
248
265
|
|
249
266
|
meta_char | [\\\]\-\,] {
|
250
|
-
|
267
|
+
emit(set_type, :escape, *text(data, ts, te, 1))
|
251
268
|
fret;
|
252
269
|
};
|
253
270
|
|
254
|
-
property_char > (escaped_set_alpha,
|
271
|
+
property_char > (escaped_set_alpha, 3) {
|
255
272
|
fhold;
|
256
273
|
fnext character_set;
|
257
274
|
fcall unicode_property;
|
@@ -264,7 +281,7 @@
|
|
264
281
|
utf8_2_byte |
|
265
282
|
utf8_3_byte |
|
266
283
|
utf8_4_byte {
|
267
|
-
|
284
|
+
emit(set_type, :escape, *text(data, ts, te, 1))
|
268
285
|
fret;
|
269
286
|
};
|
270
287
|
*|;
|
@@ -274,33 +291,33 @@
|
|
274
291
|
# --------------------------------------------------------------------------
|
275
292
|
escape_sequence := |*
|
276
293
|
[1-9] {
|
277
|
-
text = data
|
278
|
-
|
294
|
+
text = text(data, ts, te, 1).first
|
295
|
+
emit(:backref, :number, text, ts-1, te)
|
279
296
|
fret;
|
280
297
|
};
|
281
298
|
|
282
299
|
octal_sequence {
|
283
|
-
|
300
|
+
emit(:escape, :octal, *text(data, ts, te, 1))
|
284
301
|
fret;
|
285
302
|
};
|
286
303
|
|
287
304
|
meta_char {
|
288
|
-
case text = data
|
289
|
-
when '\.';
|
290
|
-
when '\|';
|
291
|
-
when '\^';
|
292
|
-
when '\$';
|
293
|
-
when '\?';
|
294
|
-
when '\*';
|
295
|
-
when '\+';
|
296
|
-
when '\(';
|
297
|
-
when '\)';
|
298
|
-
when '\{';
|
299
|
-
when '\}';
|
300
|
-
when '\[';
|
301
|
-
when '\]';
|
305
|
+
case text = text(data, ts, te, 1).first
|
306
|
+
when '\.'; emit(:escape, :dot, text, ts-1, te)
|
307
|
+
when '\|'; emit(:escape, :alternation, text, ts-1, te)
|
308
|
+
when '\^'; emit(:escape, :bol, text, ts-1, te)
|
309
|
+
when '\$'; emit(:escape, :eol, text, ts-1, te)
|
310
|
+
when '\?'; emit(:escape, :zero_or_one, text, ts-1, te)
|
311
|
+
when '\*'; emit(:escape, :zero_or_more, text, ts-1, te)
|
312
|
+
when '\+'; emit(:escape, :one_or_more, text, ts-1, te)
|
313
|
+
when '\('; emit(:escape, :group_open, text, ts-1, te)
|
314
|
+
when '\)'; emit(:escape, :group_close, text, ts-1, te)
|
315
|
+
when '\{'; emit(:escape, :interval_open, text, ts-1, te)
|
316
|
+
when '\}'; emit(:escape, :interval_close, text, ts-1, te)
|
317
|
+
when '\['; emit(:escape, :set_open, text, ts-1, te)
|
318
|
+
when '\]'; emit(:escape, :set_close, text, ts-1, te)
|
302
319
|
when "\\\\";
|
303
|
-
|
320
|
+
emit(:escape, :backslash, text, ts-1, te)
|
304
321
|
end
|
305
322
|
fret;
|
306
323
|
};
|
@@ -308,46 +325,76 @@
|
|
308
325
|
escaped_ascii > (escaped_alpha, 7) {
|
309
326
|
# \b is emitted as backspace only when inside a character set, otherwise
|
310
327
|
# it is a word boundary anchor. A syntax might "normalize" it if needed.
|
311
|
-
case text = data
|
312
|
-
when '\a';
|
313
|
-
when '\e';
|
314
|
-
when '\f';
|
315
|
-
when '\n';
|
316
|
-
when '\r';
|
317
|
-
when '\s';
|
318
|
-
when '\t';
|
319
|
-
when '\v';
|
328
|
+
case text = text(data, ts, te, 1).first
|
329
|
+
when '\a'; emit(:escape, :bell, text, ts-1, te)
|
330
|
+
when '\e'; emit(:escape, :escape, text, ts-1, te)
|
331
|
+
when '\f'; emit(:escape, :form_feed, text, ts-1, te)
|
332
|
+
when '\n'; emit(:escape, :newline, text, ts-1, te)
|
333
|
+
when '\r'; emit(:escape, :carriage, text, ts-1, te)
|
334
|
+
when '\s'; emit(:escape, :space, text, ts-1, te)
|
335
|
+
when '\t'; emit(:escape, :tab, text, ts-1, te)
|
336
|
+
when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
|
320
337
|
end
|
321
338
|
fret;
|
322
339
|
};
|
323
340
|
|
324
|
-
codepoint_sequence > (escaped_alpha, 6) {
|
325
|
-
text = data
|
341
|
+
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
342
|
+
text = text(data, ts, te, 1).first
|
326
343
|
if text[2].chr == '{'
|
327
|
-
|
344
|
+
emit(:escape, :codepoint_list, text, ts-1, te)
|
328
345
|
else
|
329
|
-
|
346
|
+
emit(:escape, :codepoint, text, ts-1, te)
|
330
347
|
end
|
331
348
|
fret;
|
332
349
|
};
|
333
350
|
|
334
|
-
hex_sequence > (escaped_alpha, 5) {
|
335
|
-
|
351
|
+
hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
|
352
|
+
emit(:escape, :hex, *text(data, ts, te, 1))
|
353
|
+
fret;
|
354
|
+
};
|
355
|
+
|
356
|
+
wide_hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
|
357
|
+
emit(:escape, :hex_wide, *text(data, ts, te, 1))
|
336
358
|
fret;
|
337
359
|
};
|
338
360
|
|
339
|
-
|
340
|
-
self.emit(:escape, :hex_wide, data[ts-1..te-1].pack('c*'), ts-1, te)
|
361
|
+
hex_sequence_err @invalid_sequence_error {
|
341
362
|
fret;
|
342
363
|
};
|
343
364
|
|
344
|
-
|
345
|
-
|
365
|
+
(wide_hex_seq_invalid | wide_hex_seq_empty) {
|
366
|
+
raise InvalidSequenceError.new("wide hex sequence")
|
346
367
|
fret;
|
347
368
|
};
|
348
369
|
|
349
|
-
|
350
|
-
|
370
|
+
control_sequence >(escaped_alpha, 4) $eof(premature_end_error) {
|
371
|
+
if data[te]
|
372
|
+
c = data[te].chr
|
373
|
+
if c =~ /[\x00-\x7F]/
|
374
|
+
emit(:escape, :control, copy(data, ts-1..te), ts-1, te+1)
|
375
|
+
p += 1
|
376
|
+
else
|
377
|
+
raise InvalidSequenceError.new("control sequence")
|
378
|
+
end
|
379
|
+
else
|
380
|
+
raise PrematureEndError.new("control sequence")
|
381
|
+
end
|
382
|
+
fret;
|
383
|
+
};
|
384
|
+
|
385
|
+
meta_sequence >(backslashed, 3) $eof(premature_end_error) {
|
386
|
+
if data[te]
|
387
|
+
c = data[te].chr
|
388
|
+
if c =~ /[\x00-\x7F]/
|
389
|
+
emit(:escape, :meta_sequence, copy(data, ts-1..te), ts-1, te+1)
|
390
|
+
p += 1
|
391
|
+
else
|
392
|
+
raise InvalidSequenceError.new("meta sequence")
|
393
|
+
end
|
394
|
+
else
|
395
|
+
raise PrematureEndError.new("meta sequence")
|
396
|
+
end
|
397
|
+
fret;
|
351
398
|
};
|
352
399
|
|
353
400
|
property_char > (escaped_alpha, 2) {
|
@@ -357,7 +404,7 @@
|
|
357
404
|
};
|
358
405
|
|
359
406
|
(any -- non_literal_escape) > (escaped_alpha, 1) {
|
360
|
-
|
407
|
+
emit(:escape, :literal, *text(data, ts, te, 1))
|
361
408
|
fret;
|
362
409
|
};
|
363
410
|
*|;
|
@@ -370,32 +417,34 @@
|
|
370
417
|
# Meta characters
|
371
418
|
# ------------------------------------------------------------------------
|
372
419
|
dot {
|
373
|
-
|
420
|
+
emit(:meta, :dot, *text(data, ts, te))
|
374
421
|
};
|
375
422
|
|
376
423
|
alternation {
|
377
|
-
|
424
|
+
emit(:meta, :alternation, *text(data, ts, te))
|
378
425
|
};
|
379
426
|
|
380
427
|
# Anchors
|
381
428
|
# ------------------------------------------------------------------------
|
382
429
|
beginning_of_line {
|
383
|
-
|
430
|
+
emit(:anchor, :bol, *text(data, ts, te))
|
384
431
|
};
|
385
432
|
|
386
433
|
end_of_line {
|
387
|
-
|
434
|
+
emit(:anchor, :eol, *text(data, ts, te))
|
388
435
|
};
|
389
436
|
|
390
437
|
backslash . anchor_char > (backslashed, 3) {
|
391
|
-
case text = data
|
392
|
-
when '\\A';
|
393
|
-
when '\\z';
|
394
|
-
when '\\Z';
|
395
|
-
when '\\b';
|
396
|
-
when '\\B';
|
397
|
-
when '\\G';
|
398
|
-
else
|
438
|
+
case text = text(data, ts, te).first
|
439
|
+
when '\\A'; emit(:anchor, :bos, text, ts, te)
|
440
|
+
when '\\z'; emit(:anchor, :eos, text, ts, te)
|
441
|
+
when '\\Z'; emit(:anchor, :eos_ob_eol, text, ts, te)
|
442
|
+
when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
|
443
|
+
when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
|
444
|
+
when '\\G'; emit(:anchor, :match_start, text, ts, te)
|
445
|
+
else
|
446
|
+
raise ScannerError.new(
|
447
|
+
"Unexpected character in anchor at #{text} (char #{ts})")
|
399
448
|
end
|
400
449
|
};
|
401
450
|
|
@@ -406,15 +455,18 @@
|
|
406
455
|
# \w, \W word, non-word
|
407
456
|
# ------------------------------------------------------------------------
|
408
457
|
backslash . char_type > (backslashed, 2) {
|
409
|
-
case text = data
|
410
|
-
when '\\d';
|
411
|
-
when '\\D';
|
412
|
-
when '\\h';
|
413
|
-
when '\\H';
|
414
|
-
when '\\s';
|
415
|
-
when '\\S';
|
416
|
-
when '\\w';
|
417
|
-
when '\\W';
|
458
|
+
case text = text(data, ts, te).first
|
459
|
+
when '\\d'; emit(:type, :digit, text, ts, te)
|
460
|
+
when '\\D'; emit(:type, :nondigit, text, ts, te)
|
461
|
+
when '\\h'; emit(:type, :hex, text, ts, te)
|
462
|
+
when '\\H'; emit(:type, :nonhex, text, ts, te)
|
463
|
+
when '\\s'; emit(:type, :space, text, ts, te)
|
464
|
+
when '\\S'; emit(:type, :nonspace, text, ts, te)
|
465
|
+
when '\\w'; emit(:type, :word, text, ts, te)
|
466
|
+
when '\\W'; emit(:type, :nonword, text, ts, te)
|
467
|
+
else
|
468
|
+
raise ScannerError.new(
|
469
|
+
"Unexpected character in type at #{text} (char #{ts})")
|
418
470
|
end
|
419
471
|
};
|
420
472
|
|
@@ -425,7 +477,7 @@
|
|
425
477
|
set_depth += 1; in_set = true
|
426
478
|
set_type = set_depth > 1 ? :subset : :set
|
427
479
|
|
428
|
-
|
480
|
+
emit(set_type, :open, *text(data, ts, te))
|
429
481
|
fcall character_set;
|
430
482
|
};
|
431
483
|
|
@@ -435,7 +487,7 @@
|
|
435
487
|
# correct closing count.
|
436
488
|
# ------------------------------------------------------------------------
|
437
489
|
group_open . group_comment $group_closed {
|
438
|
-
|
490
|
+
emit(:group, :comment, *text(data, ts, te))
|
439
491
|
};
|
440
492
|
|
441
493
|
# Expression options:
|
@@ -447,21 +499,7 @@
|
|
447
499
|
# (?imx-imx:subexp) option on/off for subexp
|
448
500
|
# ------------------------------------------------------------------------
|
449
501
|
group_open . group_options >group_opened {
|
450
|
-
|
451
|
-
if data[te]
|
452
|
-
c = data[te].chr
|
453
|
-
if c == ':' # include the ':'
|
454
|
-
self.emit(:group, :options, data[ts..te].pack('c*'), ts, te+1)
|
455
|
-
p += 1
|
456
|
-
elsif c == ')' # just options by themselves
|
457
|
-
self.emit(:group, :options, data[ts..te-1].pack('c*'), ts, te)
|
458
|
-
else
|
459
|
-
raise ScannerError.new(
|
460
|
-
"Unexpected '#{c}' in options sequence, ':' or ')' expected")
|
461
|
-
end
|
462
|
-
else
|
463
|
-
raise PrematureEndError.new("options") unless data[te]
|
464
|
-
end
|
502
|
+
p = scan_options(p, data, ts, te)
|
465
503
|
};
|
466
504
|
|
467
505
|
# Assertions
|
@@ -471,11 +509,11 @@
|
|
471
509
|
# (?<!subexp) negative look-behind
|
472
510
|
# ------------------------------------------------------------------------
|
473
511
|
group_open . assertion_type >group_opened {
|
474
|
-
case text =
|
475
|
-
when '(?=';
|
476
|
-
when '(?!';
|
477
|
-
when '(?<=';
|
478
|
-
when '(?<!';
|
512
|
+
case text = text(data, ts, te).first
|
513
|
+
when '(?='; emit(:assertion, :lookahead, text, ts, te)
|
514
|
+
when '(?!'; emit(:assertion, :nlookahead, text, ts, te)
|
515
|
+
when '(?<='; emit(:assertion, :lookbehind, text, ts, te)
|
516
|
+
when '(?<!'; emit(:assertion, :nlookbehind, text, ts, te)
|
479
517
|
end
|
480
518
|
};
|
481
519
|
|
@@ -487,85 +525,103 @@
|
|
487
525
|
# (subexp) captured group
|
488
526
|
# ------------------------------------------------------------------------
|
489
527
|
group_open . group_type >group_opened {
|
490
|
-
case text =
|
491
|
-
when '(?:';
|
492
|
-
when '(?>';
|
493
|
-
|
494
|
-
when
|
495
|
-
|
496
|
-
|
497
|
-
|
528
|
+
case text = text(data, ts, te).first
|
529
|
+
when '(?:'; emit(:group, :passive, text, ts, te)
|
530
|
+
when '(?>'; emit(:group, :atomic, text, ts, te)
|
531
|
+
|
532
|
+
when /^\(\?<(\w*)>/
|
533
|
+
empty_name_error(:group, 'named group (ab)') if $1.empty?
|
534
|
+
|
535
|
+
emit(:group, :named_ab, text, ts, te)
|
536
|
+
|
537
|
+
when /^\(\?'(\w*)'/
|
538
|
+
empty_name_error(:group, 'named group (sq)') if $1.empty?
|
539
|
+
|
540
|
+
emit(:group, :named_sq, text, ts, te)
|
541
|
+
|
542
|
+
else
|
543
|
+
raise ScannerError.new(
|
544
|
+
"Unknown subexpression group format '#{text}'")
|
498
545
|
end
|
499
546
|
};
|
500
547
|
|
501
548
|
group_open @group_opened {
|
502
|
-
text =
|
503
|
-
|
549
|
+
text = text(data, ts, te).first
|
550
|
+
emit(:group, :capture, text, ts, te)
|
504
551
|
};
|
505
552
|
|
506
553
|
group_close @group_closed {
|
507
|
-
|
554
|
+
emit(:group, :close, *text(data, ts, te))
|
508
555
|
};
|
509
556
|
|
510
557
|
|
511
|
-
# Group
|
558
|
+
# Group backreference, named and numbered
|
512
559
|
# ------------------------------------------------------------------------
|
513
560
|
backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
|
514
|
-
case text = data
|
515
|
-
when
|
561
|
+
case text = text(data, ts, te).first
|
562
|
+
when /^\\([gk])<>/ # angle brackets
|
563
|
+
empty_backref_error("ref/call (ab)")
|
564
|
+
|
565
|
+
when /^\\([gk])''/ # single quotes
|
566
|
+
empty_backref_error("ref/call (sq)")
|
567
|
+
|
568
|
+
when /^\\([gk])<[^\d-](\w+)?>/ # angle-brackets
|
516
569
|
if $1 == 'k'
|
517
|
-
|
570
|
+
emit(:backref, :name_ref_ab, text, ts, te)
|
518
571
|
else
|
519
|
-
|
572
|
+
emit(:backref, :name_call_ab, text, ts, te)
|
520
573
|
end
|
521
574
|
|
522
|
-
when
|
575
|
+
when /^\\([gk])'[^\d-](\w+)?'/ #single quotes
|
523
576
|
if $1 == 'k'
|
524
|
-
|
577
|
+
emit(:backref, :name_ref_sq, text, ts, te)
|
525
578
|
else
|
526
|
-
|
579
|
+
emit(:backref, :name_call_sq, text, ts, te)
|
527
580
|
end
|
528
581
|
|
529
|
-
when
|
582
|
+
when /^\\([gk])<\d+>/ # angle-brackets
|
530
583
|
if $1 == 'k'
|
531
|
-
|
584
|
+
emit(:backref, :number_ref_ab, text, ts, te)
|
532
585
|
else
|
533
|
-
|
586
|
+
emit(:backref, :number_call_ab, text, ts, te)
|
534
587
|
end
|
535
588
|
|
536
|
-
when
|
589
|
+
when /^\\([gk])'\d+'/ # single quotes
|
537
590
|
if $1 == 'k'
|
538
|
-
|
591
|
+
emit(:backref, :number_ref_sq, text, ts, te)
|
539
592
|
else
|
540
|
-
|
593
|
+
emit(:backref, :number_call_sq, text, ts, te)
|
541
594
|
end
|
542
595
|
|
543
|
-
when
|
596
|
+
when /^\\([gk])<-\d+>/ # angle-brackets
|
544
597
|
if $1 == 'k'
|
545
|
-
|
598
|
+
emit(:backref, :number_rel_ref_ab, text, ts, te)
|
546
599
|
else
|
547
|
-
|
600
|
+
emit(:backref, :number_rel_call_ab, text, ts, te)
|
548
601
|
end
|
549
602
|
|
550
|
-
when
|
603
|
+
when /^\\([gk])'-\d+'/ # single quotes
|
551
604
|
if $1 == 'k'
|
552
|
-
|
605
|
+
emit(:backref, :number_rel_ref_sq, text, ts, te)
|
553
606
|
else
|
554
|
-
|
607
|
+
emit(:backref, :number_rel_call_sq, text, ts, te)
|
555
608
|
end
|
556
609
|
|
557
|
-
when
|
558
|
-
|
610
|
+
when /^\\k<[^\d-](\w+)?[+\-]\d+>/ # angle-brackets
|
611
|
+
emit(:backref, :name_nest_ref_ab, text, ts, te)
|
559
612
|
|
560
|
-
when
|
561
|
-
|
613
|
+
when /^\\k'[^\d-](\w+)?[+\-]\d+'/ # single-quotes
|
614
|
+
emit(:backref, :name_nest_ref_sq, text, ts, te)
|
562
615
|
|
563
|
-
when
|
564
|
-
|
616
|
+
when /^\\([gk])<\d+[+\-]\d+>/ # angle-brackets
|
617
|
+
emit(:backref, :number_nest_ref_ab, text, ts, te)
|
565
618
|
|
566
|
-
when
|
567
|
-
|
619
|
+
when /^\\([gk])'\d+[+\-]\d+'/ # single-quotes
|
620
|
+
emit(:backref, :number_nest_ref_sq, text, ts, te)
|
568
621
|
|
622
|
+
else
|
623
|
+
raise ScannerError.new(
|
624
|
+
"Unknown backreference format '#{text}'")
|
569
625
|
end
|
570
626
|
};
|
571
627
|
|
@@ -573,31 +629,31 @@
|
|
573
629
|
# Quantifiers
|
574
630
|
# ------------------------------------------------------------------------
|
575
631
|
zero_or_one {
|
576
|
-
case text =
|
577
|
-
when '?' ;
|
578
|
-
when '??';
|
579
|
-
when '?+';
|
632
|
+
case text = text(data, ts, te).first
|
633
|
+
when '?' ; emit(:quantifier, :zero_or_one, text, ts, te)
|
634
|
+
when '??'; emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
|
635
|
+
when '?+'; emit(:quantifier, :zero_or_one_possessive, text, ts, te)
|
580
636
|
end
|
581
637
|
};
|
582
|
-
|
638
|
+
|
583
639
|
zero_or_more {
|
584
|
-
case text =
|
585
|
-
when '*' ;
|
586
|
-
when '*?';
|
587
|
-
when '*+';
|
640
|
+
case text = text(data, ts, te).first
|
641
|
+
when '*' ; emit(:quantifier, :zero_or_more, text, ts, te)
|
642
|
+
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
|
643
|
+
when '*+'; emit(:quantifier, :zero_or_more_possessive, text, ts, te)
|
588
644
|
end
|
589
645
|
};
|
590
|
-
|
646
|
+
|
591
647
|
one_or_more {
|
592
|
-
case text =
|
593
|
-
when '+' ;
|
594
|
-
when '+?';
|
595
|
-
when '++';
|
648
|
+
case text = text(data, ts, te).first
|
649
|
+
when '+' ; emit(:quantifier, :one_or_more, text, ts, te)
|
650
|
+
when '+?'; emit(:quantifier, :one_or_more_reluctant, text, ts, te)
|
651
|
+
when '++'; emit(:quantifier, :one_or_more_possessive, text, ts, te)
|
596
652
|
end
|
597
653
|
};
|
598
654
|
|
599
|
-
|
600
|
-
|
655
|
+
quantifier_interval @err(premature_end_error) {
|
656
|
+
emit(:quantifier, :interval, *text(data, ts, te))
|
601
657
|
};
|
602
658
|
|
603
659
|
# Escaped sequences
|
@@ -614,35 +670,67 @@
|
|
614
670
|
utf8_2_byte+ |
|
615
671
|
utf8_3_byte+ |
|
616
672
|
utf8_4_byte+ {
|
617
|
-
|
673
|
+
append_literal(data, ts, te)
|
618
674
|
};
|
619
675
|
|
620
676
|
*|;
|
621
677
|
}%%
|
622
678
|
|
679
|
+
# THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
|
680
|
+
# This file was generated from scanner.rl
|
623
681
|
|
624
682
|
module Regexp::Scanner
|
625
683
|
%% write data;
|
626
684
|
|
685
|
+
# General scanner error (catch all)
|
627
686
|
class ScannerError < StandardError
|
628
687
|
def initialize(what)
|
629
688
|
super what
|
630
689
|
end
|
631
690
|
end
|
632
691
|
|
692
|
+
# Base for all scanner validation errors
|
693
|
+
class ValidationError < StandardError
|
694
|
+
def initialize(reason)
|
695
|
+
super reason
|
696
|
+
end
|
697
|
+
end
|
698
|
+
|
699
|
+
# Unexpected end of pattern
|
633
700
|
class PrematureEndError < ScannerError
|
634
701
|
def initialize(where = '')
|
635
|
-
super "Premature end of pattern
|
702
|
+
super "Premature end of pattern at #{where}"
|
703
|
+
end
|
704
|
+
end
|
705
|
+
|
706
|
+
# Invalid sequence format. Used for escape sequences, mainly.
|
707
|
+
class InvalidSequenceError < ValidationError
|
708
|
+
def initialize(what = 'sequence', where = '')
|
709
|
+
super "Invalid #{what} at #{where}"
|
710
|
+
end
|
711
|
+
end
|
712
|
+
|
713
|
+
# Invalid group. Used for named groups.
|
714
|
+
class InvalidGroupError < ValidationError
|
715
|
+
def initialize(what, reason)
|
716
|
+
super "Invalid #{what}, #{reason}."
|
717
|
+
end
|
718
|
+
end
|
719
|
+
|
720
|
+
# Invalid back reference. Used for named and numbered refs/calls.
|
721
|
+
class InvalidBackrefError < ValidationError
|
722
|
+
def initialize(what, reason)
|
723
|
+
super "Invalid back reference #{what}, #{reason}"
|
636
724
|
end
|
637
725
|
end
|
638
726
|
|
639
|
-
|
727
|
+
# The property name was not recognized by the scanner.
|
728
|
+
class UnknownUnicodePropertyError < ValidationError
|
640
729
|
def initialize(name)
|
641
730
|
super "Unknown unicode character property name #{name}"
|
642
731
|
end
|
643
732
|
end
|
644
733
|
|
645
|
-
|
646
734
|
# Scans the given regular expression text, or Regexp object and collects the
|
647
735
|
# emitted token into an array that gets returned at the end. If a block is
|
648
736
|
# given, it gets called for each emitted token.
|
@@ -665,42 +753,107 @@ module Regexp::Scanner
|
|
665
753
|
%% write init;
|
666
754
|
%% write exec;
|
667
755
|
|
756
|
+
if cs == re_scanner_error
|
757
|
+
text = ts ? copy(data, ts-1..-1) : data.pack('c*')
|
758
|
+
raise ScannerError.new("Scan error at '#{text}'")
|
759
|
+
end
|
760
|
+
|
668
761
|
raise PrematureEndError.new("(missing group closing paranthesis) "+
|
669
762
|
"[#{in_group}:#{group_depth}]") if in_group
|
670
763
|
raise PrematureEndError.new("(missing set closing bracket) "+
|
671
764
|
"[#{in_set}:#{set_depth}]") if in_set
|
672
765
|
|
673
766
|
# when the entire expression is a literal run
|
674
|
-
|
767
|
+
emit_literal if @literal
|
675
768
|
|
676
769
|
@tokens
|
677
770
|
end
|
678
771
|
|
679
|
-
|
680
|
-
|
772
|
+
private
|
773
|
+
|
774
|
+
# Ragel's regex-based scan of the group options introduced a lot of
|
775
|
+
# ambiguity, so we just ask it to find the beginning of what looks
|
776
|
+
# like an options run and handle the rest in here.
|
777
|
+
def self.scan_options(p, data, ts, te)
|
778
|
+
text = text(data, ts, te).first
|
779
|
+
|
780
|
+
options_char, options_length = true, 0
|
781
|
+
|
782
|
+
# Copy while we have option characters, the maximum is 7, for (?mix-mix,
|
783
|
+
# even though it doesn't make sense it is possible.
|
784
|
+
while options_char and options_length < 7
|
785
|
+
if data[te + options_length]
|
786
|
+
c = data[te + options_length].chr
|
787
|
+
|
788
|
+
if c =~ /[-mix]/
|
789
|
+
text << c ; p += 1 ; options_length += 1
|
790
|
+
else
|
791
|
+
options_char = false
|
792
|
+
end
|
793
|
+
else
|
794
|
+
raise PrematureEndError.new("expression options `#{text}'")
|
795
|
+
end
|
796
|
+
end
|
797
|
+
|
798
|
+
if data[te + options_length]
|
799
|
+
c = data[te + options_length].chr
|
800
|
+
|
801
|
+
if c == ':'
|
802
|
+
# Include the ':' in the options text
|
803
|
+
text << c ; p += 1 ; options_length += 1
|
804
|
+
emit(:group, :options, text, ts, te + options_length)
|
805
|
+
|
806
|
+
elsif c == ')'
|
807
|
+
# Don't include the closing ')', let group_close handle it.
|
808
|
+
emit(:group, :options, text, ts, te + options_length)
|
809
|
+
|
810
|
+
else
|
811
|
+
# Plain Regexp reports this as 'undefined group option'
|
812
|
+
raise ScannerError.new(
|
813
|
+
"Unexpected `#{c}' in options sequence, ':' or ')' expected")
|
814
|
+
end
|
815
|
+
else
|
816
|
+
raise PrematureEndError.new("expression options `#{text}'")
|
817
|
+
end
|
818
|
+
|
819
|
+
p # return the new value of the data pointer
|
820
|
+
end
|
821
|
+
|
822
|
+
# Copy the given range of data as text
|
823
|
+
def self.copy(data, range)
|
824
|
+
data[range].pack('c*')
|
825
|
+
end
|
826
|
+
|
827
|
+
# Copy from ts to te from data as text, returning an array with the text
|
828
|
+
# and the offsets used to copy it.
|
829
|
+
def self.text(data, ts, te, soff = 0)
|
830
|
+
[copy(data, ts-soff..te-1), ts-soff, te]
|
831
|
+
end
|
832
|
+
|
833
|
+
# Appends one or more characters to the literal buffer, to be emitted later
|
834
|
+
# by a call to emit_literal. Contents can be a mix of ASCII and UTF-8.
|
681
835
|
def self.append_literal(data, ts, te)
|
682
836
|
@literal ||= []
|
683
|
-
@literal <<
|
837
|
+
@literal << text(data, ts, te)
|
684
838
|
end
|
685
839
|
|
686
|
-
#
|
687
|
-
#
|
840
|
+
# Emits the literal run collected by calls to the append_literal method,
|
841
|
+
# using the total start (ts) and end (te) offsets of the run.
|
688
842
|
def self.emit_literal
|
689
843
|
ts, te = @literal.first[1], @literal.last[2]
|
690
844
|
text = @literal.map {|t| t[0]}.join
|
691
845
|
|
692
846
|
text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
|
693
847
|
|
694
|
-
self.emit(:literal, :literal, text, ts, te)
|
695
848
|
@literal = nil
|
849
|
+
emit(:literal, :literal, text, ts, te)
|
696
850
|
end
|
697
851
|
|
852
|
+
# Emits an array with the details of the scanned pattern
|
698
853
|
def self.emit(type, token, text, ts, te)
|
699
|
-
#puts "
|
854
|
+
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
700
855
|
|
701
|
-
if @literal
|
702
|
-
self.emit_literal
|
703
|
-
end
|
856
|
+
emit_literal if @literal
|
704
857
|
|
705
858
|
if @block
|
706
859
|
@block.call type, token, text, ts, te
|
@@ -709,4 +862,37 @@ module Regexp::Scanner
|
|
709
862
|
@tokens << [type, token, text, ts, te]
|
710
863
|
end
|
711
864
|
|
865
|
+
# Centralizes and unifies the handling of validation related
|
866
|
+
# errors.
|
867
|
+
def self.validation_error(type, what, reason)
|
868
|
+
case type
|
869
|
+
when :group
|
870
|
+
error = InvalidGroupError.new(what, reason)
|
871
|
+
when :backref
|
872
|
+
error = InvalidBackrefError.new(what, reason)
|
873
|
+
when :sequence
|
874
|
+
error = InvalidSequenceError.new(what, reason)
|
875
|
+
else
|
876
|
+
error = ValidationError.new('expression')
|
877
|
+
end
|
878
|
+
|
879
|
+
# TODO: configuration option to treat scanner level validation
|
880
|
+
# errors as warnings or ignore them
|
881
|
+
if false # @@config.validation_warn
|
882
|
+
$stderr.puts error.to_s # unless @@config.validation_ignore
|
883
|
+
else
|
884
|
+
raise error # unless @@config.validation_ignore
|
885
|
+
end
|
886
|
+
end
|
887
|
+
|
888
|
+
# Used for references with an empty name or number
|
889
|
+
def self.empty_backref_error(type, what)
|
890
|
+
validation_error(:backref, what, 'ref ID is empty')
|
891
|
+
end
|
892
|
+
|
893
|
+
# Used for named expressions with an empty name
|
894
|
+
def self.empty_name_error(type, what)
|
895
|
+
validation_error(type, what, 'name is empty')
|
896
|
+
end
|
897
|
+
|
712
898
|
end # module Regexp::Scanner
|