regexp_parser 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +4 -0
- data/LICENSE +22 -0
- data/README.rdoc +307 -0
- data/Rakefile +91 -0
- data/lib/regexp_parser/ctype.rb +48 -0
- data/lib/regexp_parser/expression/property.rb +108 -0
- data/lib/regexp_parser/expression/set.rb +59 -0
- data/lib/regexp_parser/expression.rb +287 -0
- data/lib/regexp_parser/lexer.rb +105 -0
- data/lib/regexp_parser/parser.rb +417 -0
- data/lib/regexp_parser/scanner/property.rl +534 -0
- data/lib/regexp_parser/scanner/scanner.rl +712 -0
- data/lib/regexp_parser/scanner.rb +3325 -0
- data/lib/regexp_parser/syntax/ruby/1.8.6.rb +14 -0
- data/lib/regexp_parser/syntax/ruby/1.8.7.rb +14 -0
- data/lib/regexp_parser/syntax/ruby/1.8.rb +39 -0
- data/lib/regexp_parser/syntax/ruby/1.9.1.rb +39 -0
- data/lib/regexp_parser/syntax/ruby/1.9.2.rb +10 -0
- data/lib/regexp_parser/syntax/ruby/1.9.3.rb +24 -0
- data/lib/regexp_parser/syntax/ruby/1.9.rb +8 -0
- data/lib/regexp_parser/syntax/tokens.rb +332 -0
- data/lib/regexp_parser/syntax.rb +172 -0
- data/lib/regexp_parser.rb +45 -0
- data/test/helpers.rb +8 -0
- data/test/lexer/test_all.rb +26 -0
- data/test/lexer/test_literals.rb +120 -0
- data/test/lexer/test_nesting.rb +107 -0
- data/test/lexer/test_refcalls.rb +45 -0
- data/test/parser/test_all.rb +44 -0
- data/test/parser/test_alternation.rb +46 -0
- data/test/parser/test_anchors.rb +35 -0
- data/test/parser/test_errors.rb +59 -0
- data/test/parser/test_escapes.rb +48 -0
- data/test/parser/test_expression.rb +51 -0
- data/test/parser/test_groups.rb +69 -0
- data/test/parser/test_properties.rb +346 -0
- data/test/parser/test_quantifiers.rb +236 -0
- data/test/parser/test_refcalls.rb +101 -0
- data/test/parser/test_sets.rb +99 -0
- data/test/scanner/test_all.rb +30 -0
- data/test/scanner/test_anchors.rb +35 -0
- data/test/scanner/test_errors.rb +36 -0
- data/test/scanner/test_escapes.rb +49 -0
- data/test/scanner/test_groups.rb +41 -0
- data/test/scanner/test_literals.rb +85 -0
- data/test/scanner/test_meta.rb +36 -0
- data/test/scanner/test_properties.rb +315 -0
- data/test/scanner/test_quantifiers.rb +38 -0
- data/test/scanner/test_refcalls.rb +45 -0
- data/test/scanner/test_scripts.rb +314 -0
- data/test/scanner/test_sets.rb +80 -0
- data/test/scanner/test_types.rb +30 -0
- data/test/syntax/ruby/test_1.8.rb +57 -0
- data/test/syntax/ruby/test_1.9.1.rb +39 -0
- data/test/syntax/ruby/test_1.9.3.rb +38 -0
- data/test/syntax/ruby/test_all.rb +12 -0
- data/test/syntax/test_all.rb +19 -0
- data/test/test_all.rb +4 -0
- metadata +160 -0
@@ -0,0 +1,712 @@
|
|
1
|
+
%%{
|
2
|
+
machine re_scanner;
|
3
|
+
include re_property "property.rl";
|
4
|
+
|
5
|
+
dot = '.';
|
6
|
+
backslash = '\\';
|
7
|
+
alternation = '|';
|
8
|
+
beginning_of_line = '^';
|
9
|
+
end_of_line = '$';
|
10
|
+
|
11
|
+
range_open = '{';
|
12
|
+
range_close = '}';
|
13
|
+
curlies = range_open | range_close;
|
14
|
+
|
15
|
+
group_open = '(';
|
16
|
+
group_close = ')';
|
17
|
+
parantheses = group_open | group_close;
|
18
|
+
|
19
|
+
set_open = '[';
|
20
|
+
set_close = ']';
|
21
|
+
brackets = set_open | set_close;
|
22
|
+
|
23
|
+
class_name_posix = 'alnum' | 'alpha' | 'blank' |
|
24
|
+
'cntrl' | 'digit' | 'graph' |
|
25
|
+
'lower' | 'print' | 'punct' |
|
26
|
+
'space' | 'upper' | 'xdigit' |
|
27
|
+
'word' | 'ascii';
|
28
|
+
|
29
|
+
class_posix = ('[:' . '^'? . class_name_posix . ':]');
|
30
|
+
|
31
|
+
# these are not supported in ruby, and need verification
|
32
|
+
collating_sequence = '[.' . (alpha | [\-])+ . '.]';
|
33
|
+
character_equivalent = '[=' . alpha . '=]';
|
34
|
+
|
35
|
+
char_type = [dDhHsSwW];
|
36
|
+
|
37
|
+
line_anchor = beginning_of_line | end_of_line;
|
38
|
+
anchor_char = [AbBzZG];
|
39
|
+
|
40
|
+
escaped_ascii = [abefnrstv];
|
41
|
+
octal_sequence = [0-7]{1,3};
|
42
|
+
|
43
|
+
hex_sequence = 'x' . xdigit{1,2};
|
44
|
+
wide_hex_sequence = 'x' . '{' . xdigit{1,8} . '}';
|
45
|
+
|
46
|
+
codepoint_single = 'u' . xdigit{4};
|
47
|
+
codepoint_list = 'u{' . (xdigit{4} . space?)+'}';
|
48
|
+
codepoint_sequence = codepoint_single | codepoint_list;
|
49
|
+
|
50
|
+
control_sequence = ('c' | 'C-') . alpha;
|
51
|
+
meta_sequence = 'M-' . ((backslash . control_sequence) | alpha);
|
52
|
+
|
53
|
+
zero_or_one = '?' | '??' | '?+';
|
54
|
+
zero_or_more = '*' | '*?' | '*+';
|
55
|
+
one_or_more = '+' | '+?' | '++';
|
56
|
+
|
57
|
+
quantifier_greedy = '?' | '*' | '+';
|
58
|
+
quantifier_reluctant = '??' | '*?' | '+?';
|
59
|
+
quantifier_possessive = '?+' | '*+' | '++';
|
60
|
+
quantifier_mode = '?' | '+';
|
61
|
+
|
62
|
+
quantifier_range = range_open . (digit+)? . ','? . (digit+)? .
|
63
|
+
range_close . quantifier_mode?;
|
64
|
+
|
65
|
+
quantifiers = quantifier_greedy | quantifier_reluctant |
|
66
|
+
quantifier_possessive | quantifier_range;
|
67
|
+
|
68
|
+
|
69
|
+
group_comment = '?#' . [^)]+ . group_close;
|
70
|
+
|
71
|
+
group_atomic = '?>';
|
72
|
+
group_passive = '?:';
|
73
|
+
|
74
|
+
assertion_lookahead = '?=';
|
75
|
+
assertion_nlookahead = '?!';
|
76
|
+
assertion_lookbehind = '?<=';
|
77
|
+
assertion_nlookbehind = '?<!';
|
78
|
+
|
79
|
+
group_options = '?' . ([mix]{1,3})? . '-'? . ([mix]{1,3})?;
|
80
|
+
|
81
|
+
group_ref = [gk];
|
82
|
+
group_name = alpha . (alnum+)?;
|
83
|
+
group_number = '-'? . [1-9] . ([0-9]+)?;
|
84
|
+
group_level = [+\-] . [0-9]+;
|
85
|
+
|
86
|
+
group_named = ('?<' . group_name . '>') | ("?'" . group_name . "'");
|
87
|
+
|
88
|
+
group_name_ref = group_ref . (('<' . group_name . group_level? '>') |
|
89
|
+
("'" . group_name . group_level? "'"));
|
90
|
+
|
91
|
+
group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
|
92
|
+
("'" . group_number . group_level? "'"));
|
93
|
+
|
94
|
+
group_type = group_atomic | group_passive | group_named;
|
95
|
+
|
96
|
+
assertion_type = assertion_lookahead | assertion_nlookahead |
|
97
|
+
assertion_lookbehind | assertion_nlookbehind;
|
98
|
+
|
99
|
+
# characters that 'break' a literal
|
100
|
+
meta_char = dot | backslash | alternation |
|
101
|
+
curlies | parantheses | brackets |
|
102
|
+
line_anchor | quantifier_greedy;
|
103
|
+
|
104
|
+
ascii_print = ((0x20..0x7e) - meta_char)+;
|
105
|
+
ascii_nonprint = (0x01..0x1f | 0x7f)+;
|
106
|
+
|
107
|
+
utf8_2_byte = (0xc2..0xdf 0x80..0xbf)+;
|
108
|
+
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf)+;
|
109
|
+
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf)+;
|
110
|
+
utf8_byte_sequence = utf8_2_byte | utf8_3_byte | utf8_4_byte;
|
111
|
+
|
112
|
+
non_literal_escape = char_type | anchor_char | escaped_ascii |
|
113
|
+
group_ref | [xucCM];
|
114
|
+
|
115
|
+
# EOF error, used where it can be detected
|
116
|
+
action premature_end_error { raise PrematureEndError }
|
117
|
+
|
118
|
+
# group (nesting) and set open/close actions
|
119
|
+
action group_opened { group_depth += 1; in_group = true }
|
120
|
+
action group_closed { group_depth -= 1; in_group = group_depth > 0 ? true : false }
|
121
|
+
|
122
|
+
# Character set scanner, continues consuming characters until it meets the
|
123
|
+
# closing bracket of the set.
|
124
|
+
# --------------------------------------------------------------------------
|
125
|
+
character_set := |*
|
126
|
+
']' {
|
127
|
+
set_type = set_depth > 1 ? :subset : :set
|
128
|
+
set_depth -= 1; in_set = set_depth > 0 ? true : false
|
129
|
+
|
130
|
+
self.emit(set_type, :close, data[ts..te-1].pack('c*'), ts, te)
|
131
|
+
|
132
|
+
if set_depth == 0
|
133
|
+
fgoto main;
|
134
|
+
else
|
135
|
+
fret;
|
136
|
+
end
|
137
|
+
};
|
138
|
+
|
139
|
+
'-]' { # special case, emits two tokens
  # A dash directly before the closing bracket is a plain member, not a
  # range operator, so it is emitted as a member followed by the close.
  # The type is computed before the depth is decremented: the bracket being
  # closed belongs to a subset only while more than one set is open.
  set_type  = set_depth > 1 ? :subset : :set
  set_depth -= 1; in_set = set_depth > 0 ? true : false

  # Each token gets the span of its own character: the dash sits at ts and
  # the bracket at ts+1. Both used to be reported with the span of the
  # whole two character match.
  self.emit(set_type, :member, data[ts..te-2].pack('c*'), ts, te-1)
  self.emit(set_type, :close,  data[ts+1..te-1].pack('c*'), ts+1, te)

  if set_depth == 0
    fgoto main;
  else
    fret;
  end
};
|
152
|
+
|
153
|
+
'^' {
|
154
|
+
text = data[ts..te-1].pack('c*')
|
155
|
+
if @tokens.last[1] == :open
|
156
|
+
self.emit(set_type, :negate, text, ts, te)
|
157
|
+
else
|
158
|
+
self.emit(set_type, :member, text, ts, te)
|
159
|
+
end
|
160
|
+
};
|
161
|
+
|
162
|
+
alnum . '-' . alnum {
|
163
|
+
self.emit(set_type, :range, data[ts..te-1].pack('c*'), ts, te)
|
164
|
+
};
|
165
|
+
|
166
|
+
'&&' {
|
167
|
+
self.emit(set_type, :intersection, data[ts..te-1].pack('c*'), ts, te)
|
168
|
+
};
|
169
|
+
|
170
|
+
'\\' {
|
171
|
+
fcall set_escape_sequence;
|
172
|
+
};
|
173
|
+
|
174
|
+
'[' >(open_bracket, 1) {
|
175
|
+
set_depth += 1; in_set = true
|
176
|
+
set_type = set_depth > 1 ? :subset : :set
|
177
|
+
|
178
|
+
self.emit(set_type, :open, data[ts..te-1].pack('c*'), ts, te)
|
179
|
+
fcall character_set;
|
180
|
+
};
|
181
|
+
|
182
|
+
class_posix >(open_bracket, 1) @eof(premature_end_error) {
|
183
|
+
text = data[ts..te-1].pack('c*')
|
184
|
+
|
185
|
+
class_name = text[2..-3]
|
186
|
+
if class_name[0].chr == '^'
|
187
|
+
class_name = "non#{class_name[1..-1]}"
|
188
|
+
end
|
189
|
+
|
190
|
+
token_sym = "class_#{class_name}".to_sym
|
191
|
+
self.emit(set_type, token_sym, text, ts, te)
|
192
|
+
};
|
193
|
+
|
194
|
+
collating_sequence >(open_bracket, 1) @eof(premature_end_error) {
|
195
|
+
self.emit(set_type, :collation, data[ts..te-1].pack('c*'), ts, te)
|
196
|
+
};
|
197
|
+
|
198
|
+
character_equivalent >(open_bracket, 1) @eof(premature_end_error) {
|
199
|
+
self.emit(set_type, :equivalent, data[ts..te-1].pack('c*'), ts, te)
|
200
|
+
};
|
201
|
+
|
202
|
+
# exclude the closing bracket as a cleaner workaround for dealing with the
|
203
|
+
# ambiguity caused upon exit from the unicode properties machine
|
204
|
+
meta_char -- ']' {
|
205
|
+
self.emit(set_type, :member, data[ts..te-1].pack('c*'), ts, te)
|
206
|
+
};
|
207
|
+
|
208
|
+
any |
|
209
|
+
ascii_nonprint |
|
210
|
+
utf8_2_byte |
|
211
|
+
utf8_3_byte |
|
212
|
+
utf8_4_byte {
|
213
|
+
self.emit(set_type, :member, data[ts..te-1].pack('c*'), ts, te)
|
214
|
+
};
|
215
|
+
*|;
|
216
|
+
|
217
|
+
# set escapes scanner
|
218
|
+
# --------------------------------------------------------------------------
|
219
|
+
set_escape_sequence := |*
|
220
|
+
'b' {
|
221
|
+
self.emit(set_type, :backspace, data[ts-1..te-1].pack('c*'), ts-1, te)
|
222
|
+
fret;
|
223
|
+
};
|
224
|
+
|
225
|
+
char_type {
|
226
|
+
case text = data[ts-1..te-1].pack('c*')
|
227
|
+
when '\d'; self.emit(set_type, :type_digit, text, ts-1, te)
|
228
|
+
when '\D'; self.emit(set_type, :type_nondigit, text, ts-1, te)
|
229
|
+
when '\h'; self.emit(set_type, :type_hex, text, ts-1, te)
|
230
|
+
when '\H'; self.emit(set_type, :type_nonhex, text, ts-1, te)
|
231
|
+
when '\s'; self.emit(set_type, :type_space, text, ts-1, te)
|
232
|
+
when '\S'; self.emit(set_type, :type_nonspace, text, ts-1, te)
|
233
|
+
when '\w'; self.emit(set_type, :type_word, text, ts-1, te)
|
234
|
+
when '\W'; self.emit(set_type, :type_nonword, text, ts-1, te)
|
235
|
+
end
|
236
|
+
fret;
|
237
|
+
};
|
238
|
+
|
239
|
+
hex_sequence . '-\\' . hex_sequence {
|
240
|
+
self.emit(set_type, :range_hex, data[ts-1..te-1].pack('c*'), ts-1, te)
|
241
|
+
fret;
|
242
|
+
};
|
243
|
+
|
244
|
+
hex_sequence {
|
245
|
+
self.emit(set_type, :member_hex, data[ts-1..te-1].pack('c*'), ts-1, te)
|
246
|
+
fret;
|
247
|
+
};
|
248
|
+
|
249
|
+
meta_char | [\\\]\-\,] {
|
250
|
+
self.emit(set_type, :escape, data[ts-1..te-1].pack('c*'), ts-1, te)
|
251
|
+
fret;
|
252
|
+
};
|
253
|
+
|
254
|
+
property_char > (escaped_set_alpha, 2) {
|
255
|
+
fhold;
|
256
|
+
fnext character_set;
|
257
|
+
fcall unicode_property;
|
258
|
+
fret;
|
259
|
+
};
|
260
|
+
|
261
|
+
# special case exclusion of escaped dash, could be cleaner.
|
262
|
+
(ascii_print - char_type -- [\-}]) > (escaped_set_alpha, 1) |
|
263
|
+
ascii_nonprint |
|
264
|
+
utf8_2_byte |
|
265
|
+
utf8_3_byte |
|
266
|
+
utf8_4_byte {
|
267
|
+
self.emit(set_type, :escape, data[ts-1..te-1].pack('c*'), ts-1, te)
|
268
|
+
fret;
|
269
|
+
};
|
270
|
+
*|;
|
271
|
+
|
272
|
+
|
273
|
+
# escape sequence scanner
|
274
|
+
# --------------------------------------------------------------------------
|
275
|
+
escape_sequence := |*
|
276
|
+
[1-9] {
|
277
|
+
text = data[ts-1..te-1].pack('c*')
|
278
|
+
self.emit(:backref, :number, text, ts-1, te)
|
279
|
+
fret;
|
280
|
+
};
|
281
|
+
|
282
|
+
octal_sequence {
|
283
|
+
self.emit(:escape, :octal, data[ts-1..te-1].pack('c*'), ts-1, te)
|
284
|
+
fret;
|
285
|
+
};
|
286
|
+
|
287
|
+
meta_char {
|
288
|
+
case text = data[ts-1..te-1].pack('c*')
|
289
|
+
when '\.'; self.emit(:escape, :dot, text, ts-1, te)
|
290
|
+
when '\|'; self.emit(:escape, :alternation, text, ts-1, te)
|
291
|
+
when '\^'; self.emit(:escape, :beginning_of_line, text, ts-1, te)
|
292
|
+
when '\$'; self.emit(:escape, :end_of_line, text, ts-1, te)
|
293
|
+
when '\?'; self.emit(:escape, :zero_or_one, text, ts-1, te)
|
294
|
+
when '\*'; self.emit(:escape, :zero_or_more, text, ts-1, te)
|
295
|
+
when '\+'; self.emit(:escape, :one_or_more, text, ts-1, te)
|
296
|
+
when '\('; self.emit(:escape, :group_open, text, ts-1, te)
|
297
|
+
when '\)'; self.emit(:escape, :group_close, text, ts-1, te)
|
298
|
+
when '\{'; self.emit(:escape, :interval_open, text, ts-1, te)
|
299
|
+
when '\}'; self.emit(:escape, :interval_close, text, ts-1, te)
|
300
|
+
when '\['; self.emit(:escape, :set_open, text, ts-1, te)
|
301
|
+
when '\]'; self.emit(:escape, :set_close, text, ts-1, te)
|
302
|
+
when "\\\\";
|
303
|
+
self.emit(:escape, :backslash, text, ts-1, te)
|
304
|
+
end
|
305
|
+
fret;
|
306
|
+
};
|
307
|
+
|
308
|
+
escaped_ascii > (escaped_alpha, 7) {
|
309
|
+
# \b is emitted as backspace only when inside a character set, otherwise
|
310
|
+
# it is a word boundary anchor. A syntax might "normalize" it if needed.
|
311
|
+
case text = data[ts-1..te-1].pack('c*')
|
312
|
+
when '\a'; self.emit(:escape, :bell, text, ts-1, te)
|
313
|
+
when '\e'; self.emit(:escape, :escape, text, ts-1, te)
|
314
|
+
when '\f'; self.emit(:escape, :form_feed, text, ts-1, te)
|
315
|
+
when '\n'; self.emit(:escape, :newline, text, ts-1, te)
|
316
|
+
when '\r'; self.emit(:escape, :carriage, text, ts-1, te)
|
317
|
+
when '\s'; self.emit(:escape, :space, text, ts-1, te)
|
318
|
+
when '\t'; self.emit(:escape, :tab, text, ts-1, te)
|
319
|
+
when '\v'; self.emit(:escape, :vertical_tab, text, ts-1, te)
|
320
|
+
end
|
321
|
+
fret;
|
322
|
+
};
|
323
|
+
|
324
|
+
codepoint_sequence > (escaped_alpha, 6) {
|
325
|
+
text = data[ts-1..te-1].pack('c*')
|
326
|
+
if text[2].chr == '{'
|
327
|
+
self.emit(:escape, :codepoint_list, text, ts-1, te)
|
328
|
+
else
|
329
|
+
self.emit(:escape, :codepoint, text, ts-1, te)
|
330
|
+
end
|
331
|
+
fret;
|
332
|
+
};
|
333
|
+
|
334
|
+
hex_sequence > (escaped_alpha, 5) {
|
335
|
+
self.emit(:escape, :hex, data[ts-1..te-1].pack('c*'), ts-1, te)
|
336
|
+
fret;
|
337
|
+
};
|
338
|
+
|
339
|
+
wide_hex_sequence > (escaped_alpha, 5) {
|
340
|
+
self.emit(:escape, :hex_wide, data[ts-1..te-1].pack('c*'), ts-1, te)
|
341
|
+
fret;
|
342
|
+
};
|
343
|
+
|
344
|
+
control_sequence > (escaped_alpha, 4) {
|
345
|
+
self.emit(:escape, :control, data[ts-1..te-1].pack('c*'), ts-1, te)
|
346
|
+
fret;
|
347
|
+
};
|
348
|
+
|
349
|
+
meta_sequence > (backslashed, 3) {
  self.emit(:escape, :meta_sequence, data[ts-1..te-1].pack('c*'), ts-1, te)
  # This machine is entered with fcall, so hand control back to the caller
  # like every other action here does. Without it the scanner kept reading
  # the text after the meta sequence as if it were still part of an escape.
  fret;
};
|
352
|
+
|
353
|
+
property_char > (escaped_alpha, 2) {
|
354
|
+
fhold;
|
355
|
+
fnext main;
|
356
|
+
fcall unicode_property; fret;
|
357
|
+
};
|
358
|
+
|
359
|
+
(any -- non_literal_escape) > (escaped_alpha, 1) {
|
360
|
+
self.emit(:escape, :literal, data[ts-1..te-1].pack('c*'), ts-1, te)
|
361
|
+
fret;
|
362
|
+
};
|
363
|
+
*|;
|
364
|
+
|
365
|
+
|
366
|
+
# Main scanner
|
367
|
+
# --------------------------------------------------------------------------
|
368
|
+
main := |*
|
369
|
+
|
370
|
+
# Meta characters
|
371
|
+
# ------------------------------------------------------------------------
|
372
|
+
dot {
|
373
|
+
self.emit(:meta, :dot, data[ts..te-1].pack('c*'), ts, te)
|
374
|
+
};
|
375
|
+
|
376
|
+
alternation {
|
377
|
+
self.emit(:meta, :alternation, data[ts..te-1].pack('c*'), ts, te)
|
378
|
+
};
|
379
|
+
|
380
|
+
# Anchors
|
381
|
+
# ------------------------------------------------------------------------
|
382
|
+
beginning_of_line {
|
383
|
+
self.emit(:anchor, :beginning_of_line, data[ts..te-1].pack('c*'), ts, te)
|
384
|
+
};
|
385
|
+
|
386
|
+
end_of_line {
|
387
|
+
self.emit(:anchor, :end_of_line, data[ts..te-1].pack('c*'), ts, te)
|
388
|
+
};
|
389
|
+
|
390
|
+
backslash . anchor_char > (backslashed, 3) {
|
391
|
+
case text = data[ts..te-1].pack('c*')
|
392
|
+
when '\\A'; self.emit(:anchor, :bos, text, ts, te)
|
393
|
+
when '\\z'; self.emit(:anchor, :eos, text, ts, te)
|
394
|
+
when '\\Z'; self.emit(:anchor, :eos_ob_eol, text, ts, te)
|
395
|
+
when '\\b'; self.emit(:anchor, :word_boundary, text, ts, te)
|
396
|
+
when '\\B'; self.emit(:anchor, :nonword_boundary, text, ts, te)
|
397
|
+
when '\\G'; self.emit(:anchor, :match_start, text, ts, te)
|
398
|
+
else raise ScannerError.new("Unsupported anchor at #{text} (char #{ts})")
|
399
|
+
end
|
400
|
+
};
|
401
|
+
|
402
|
+
# Character types
|
403
|
+
# \d, \D digit, non-digit
|
404
|
+
# \h, \H hex, non-hex
|
405
|
+
# \s, \S space, non-space
|
406
|
+
# \w, \W word, non-word
|
407
|
+
# ------------------------------------------------------------------------
|
408
|
+
backslash . char_type > (backslashed, 2) {
|
409
|
+
case text = data[ts..te-1].pack('c*')
|
410
|
+
when '\\d'; self.emit(:type, :digit, text, ts, te)
|
411
|
+
when '\\D'; self.emit(:type, :nondigit, text, ts, te)
|
412
|
+
when '\\h'; self.emit(:type, :hex, text, ts, te)
|
413
|
+
when '\\H'; self.emit(:type, :nonhex, text, ts, te)
|
414
|
+
when '\\s'; self.emit(:type, :space, text, ts, te)
|
415
|
+
when '\\S'; self.emit(:type, :nonspace, text, ts, te)
|
416
|
+
when '\\w'; self.emit(:type, :word, text, ts, te)
|
417
|
+
when '\\W'; self.emit(:type, :nonword, text, ts, te)
|
418
|
+
end
|
419
|
+
};
|
420
|
+
|
421
|
+
|
422
|
+
# Character sets
|
423
|
+
# ------------------------------------------------------------------------
|
424
|
+
set_open {
|
425
|
+
set_depth += 1; in_set = true
|
426
|
+
set_type = set_depth > 1 ? :subset : :set
|
427
|
+
|
428
|
+
self.emit(set_type, :open, data[ts..te-1].pack('c*'), ts, te)
|
429
|
+
fcall character_set;
|
430
|
+
};
|
431
|
+
|
432
|
+
# (?#...) comments: parsed as a single expression, without introducing a
|
433
|
+
# new nesting level. Comments may not include parentheses, escaped or not.
|
434
|
+
# special case for close, action performed on all transitions to get the
|
435
|
+
# correct closing count.
|
436
|
+
# ------------------------------------------------------------------------
|
437
|
+
group_open . group_comment $group_closed {
|
438
|
+
self.emit(:group, :comment, data[ts..te-1].pack('c*'), ts, te)
|
439
|
+
};
|
440
|
+
|
441
|
+
# Expression options:
|
442
|
+
# (?imx-imx) option on/off
|
443
|
+
# i: ignore case
|
444
|
+
# m: multi-line (dot(.) match newline)
|
445
|
+
# x: extended form
|
446
|
+
#
|
447
|
+
# (?imx-imx:subexp) option on/off for subexp
|
448
|
+
# ------------------------------------------------------------------------
|
449
|
+
group_open . group_options >group_opened {
  # An options sequence is ambiguous with a passive group, so the character
  # right after the match decides how the token ends.
  following = data[te]
  raise PrematureEndError.new("options") unless following

  case c = following.chr
  when ':' # options for a subexpression, the colon becomes part of the token
    self.emit(:group, :options, data[ts..te].pack('c*'), ts, te+1)
    p += 1
  when ')' # just options by themselves
    self.emit(:group, :options, data[ts..te-1].pack('c*'), ts, te)
  else
    raise ScannerError.new(
      "Unexpected '#{c}' in options sequence, ':' or ')' expected")
  end
};
|
466
|
+
|
467
|
+
# Assertions
|
468
|
+
# (?=subexp) look-ahead
|
469
|
+
# (?!subexp) negative look-ahead
|
470
|
+
# (?<=subexp) look-behind
|
471
|
+
# (?<!subexp) negative look-behind
|
472
|
+
# ------------------------------------------------------------------------
|
473
|
+
group_open . assertion_type >group_opened {
|
474
|
+
case text = data[ts..te-1].pack('c*')
|
475
|
+
when '(?='; self.emit(:assertion, :lookahead, text, ts, te)
|
476
|
+
when '(?!'; self.emit(:assertion, :nlookahead, text, ts, te)
|
477
|
+
when '(?<='; self.emit(:assertion, :lookbehind, text, ts, te)
|
478
|
+
when '(?<!'; self.emit(:assertion, :nlookbehind, text, ts, te)
|
479
|
+
end
|
480
|
+
};
|
481
|
+
|
482
|
+
# Groups
|
483
|
+
# (?:subexp) passive (non-captured) group
|
484
|
+
# (?>subexp) atomic group, don't backtrack in subexp.
|
485
|
+
# (?<name>subexp) named group
|
486
|
+
# (?'name'subexp) named group (single quoted version)
|
487
|
+
# (subexp) captured group
|
488
|
+
# ------------------------------------------------------------------------
|
489
|
+
group_open . group_type >group_opened {
|
490
|
+
case text = data[ts..te-1].pack('c*')
|
491
|
+
when '(?:'; self.emit(:group, :passive, text, ts, te)
|
492
|
+
when '(?>'; self.emit(:group, :atomic, text, ts, te)
|
493
|
+
|
494
|
+
when /\(\?<\w+>/
|
495
|
+
self.emit(:group, :named_ab, text, ts, te)
|
496
|
+
when /\(\?'\w+'/
|
497
|
+
self.emit(:group, :named_sq, text, ts, te)
|
498
|
+
end
|
499
|
+
};
|
500
|
+
|
501
|
+
group_open @group_opened {
|
502
|
+
text = data[ts..te-1].pack('c*')
|
503
|
+
self.emit(:group, :capture, text, ts, te)
|
504
|
+
};
|
505
|
+
|
506
|
+
group_close @group_closed {
|
507
|
+
self.emit(:group, :close, data[ts..te-1].pack('c*'), ts, te)
|
508
|
+
};
|
509
|
+
|
510
|
+
|
511
|
+
# Group back-reference, named and numbered
|
512
|
+
# ------------------------------------------------------------------------
|
513
|
+
backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
|
514
|
+
case text = data[ts..te-1].pack('c*')
|
515
|
+
when /\\([gk])<[^\d-](\w+)?>/ # angle-brackets
|
516
|
+
if $1 == 'k'
|
517
|
+
self.emit(:backref, :name_ref_ab, text, ts, te)
|
518
|
+
else
|
519
|
+
self.emit(:backref, :name_call_ab, text, ts, te)
|
520
|
+
end
|
521
|
+
|
522
|
+
when /\\([gk])'[^\d-](\w+)?'/ #single quotes
|
523
|
+
if $1 == 'k'
|
524
|
+
self.emit(:backref, :name_ref_sq, text, ts, te)
|
525
|
+
else
|
526
|
+
self.emit(:backref, :name_call_sq, text, ts, te)
|
527
|
+
end
|
528
|
+
|
529
|
+
when /\\([gk])<\d+>/ # angle-brackets
|
530
|
+
if $1 == 'k'
|
531
|
+
self.emit(:backref, :number_ref_ab, text, ts, te)
|
532
|
+
else
|
533
|
+
self.emit(:backref, :number_call_ab, text, ts, te)
|
534
|
+
end
|
535
|
+
|
536
|
+
when /\\([gk])'\d+'/ # single quotes
|
537
|
+
if $1 == 'k'
|
538
|
+
self.emit(:backref, :number_ref_sq, text, ts, te)
|
539
|
+
else
|
540
|
+
self.emit(:backref, :number_call_sq, text, ts, te)
|
541
|
+
end
|
542
|
+
|
543
|
+
when /\\([gk])<-\d+>/ # angle-brackets
|
544
|
+
if $1 == 'k'
|
545
|
+
self.emit(:backref, :number_rel_ref_ab, text, ts, te)
|
546
|
+
else
|
547
|
+
self.emit(:backref, :number_rel_call_ab, text, ts, te)
|
548
|
+
end
|
549
|
+
|
550
|
+
when /\\([gk])'-\d+'/ # single quotes
|
551
|
+
if $1 == 'k'
|
552
|
+
self.emit(:backref, :number_rel_ref_sq, text, ts, te)
|
553
|
+
else
|
554
|
+
self.emit(:backref, :number_rel_call_sq, text, ts, te)
|
555
|
+
end
|
556
|
+
|
557
|
+
when /\\k<[^\d-](\w+)?[+\-]\d+>/ # angle-brackets
|
558
|
+
self.emit(:backref, :name_nest_ref_ab, text, ts, te)
|
559
|
+
|
560
|
+
when /\\k'[^\d-](\w+)?[+\-]\d+'/ # single-quotes
|
561
|
+
self.emit(:backref, :name_nest_ref_sq, text, ts, te)
|
562
|
+
|
563
|
+
when /\\([gk])<\d+[+\-]\d+>/ # angle-brackets
|
564
|
+
self.emit(:backref, :number_nest_ref_ab, text, ts, te)
|
565
|
+
|
566
|
+
when /\\([gk])'\d+[+\-]\d+'/ # single-quotes
|
567
|
+
self.emit(:backref, :number_nest_ref_sq, text, ts, te)
|
568
|
+
|
569
|
+
end
|
570
|
+
};
|
571
|
+
|
572
|
+
|
573
|
+
# Quantifiers
|
574
|
+
# ------------------------------------------------------------------------
|
575
|
+
zero_or_one {
|
576
|
+
case text = data[ts..te-1].pack('c*')
|
577
|
+
when '?' ; self.emit(:quantifier, :zero_or_one, text, ts, te)
|
578
|
+
when '??'; self.emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
|
579
|
+
when '?+'; self.emit(:quantifier, :zero_or_one_possessive, text, ts, te)
|
580
|
+
end
|
581
|
+
};
|
582
|
+
|
583
|
+
zero_or_more {
|
584
|
+
case text = data[ts..te-1].pack('c*')
|
585
|
+
when '*' ; self.emit(:quantifier, :zero_or_more, text, ts, te)
|
586
|
+
when '*?'; self.emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
|
587
|
+
when '*+'; self.emit(:quantifier, :zero_or_more_possessive, text, ts, te)
|
588
|
+
end
|
589
|
+
};
|
590
|
+
|
591
|
+
one_or_more {
|
592
|
+
case text = data[ts..te-1].pack('c*')
|
593
|
+
when '+' ; self.emit(:quantifier, :one_or_more, text, ts, te)
|
594
|
+
when '+?'; self.emit(:quantifier, :one_or_more_reluctant, text, ts, te)
|
595
|
+
when '++'; self.emit(:quantifier, :one_or_more_possessive, text, ts, te)
|
596
|
+
end
|
597
|
+
};
|
598
|
+
|
599
|
+
quantifier_range @err(premature_end_error) {
|
600
|
+
self.emit(:quantifier, :interval, data[ts..te-1].pack('c*'), ts, te)
|
601
|
+
};
|
602
|
+
|
603
|
+
# Escaped sequences
|
604
|
+
# ------------------------------------------------------------------------
|
605
|
+
backslash > (backslashed, 1) {
|
606
|
+
fcall escape_sequence;
|
607
|
+
};
|
608
|
+
|
609
|
+
# Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
|
610
|
+
# except meta characters.
|
611
|
+
# ------------------------------------------------------------------------
|
612
|
+
ascii_print+ |
|
613
|
+
ascii_nonprint+ |
|
614
|
+
utf8_2_byte+ |
|
615
|
+
utf8_3_byte+ |
|
616
|
+
utf8_4_byte+ {
|
617
|
+
self.append_literal(data, ts, te)
|
618
|
+
};
|
619
|
+
|
620
|
+
*|;
|
621
|
+
}%%
|
622
|
+
|
623
|
+
|
624
|
+
module Regexp::Scanner
|
625
|
+
%% write data;
|
626
|
+
|
627
|
+
# Base class of the errors raised by the scanner. The message is required.
class ScannerError < StandardError
  def initialize(what)
    super(what)
  end
end

# Raised when the pattern ends before an open construct is complete. The
# optional +where+ text is appended to the fixed message prefix.
class PrematureEndError < ScannerError
  def initialize(where = '')
    super("Premature end of pattern: #{where}")
  end
end

# Error carrying the name of an unrecognized unicode property. It is not
# raised anywhere in this file; presumably the included property machine
# (property.rl) does so -- TODO confirm.
class UnknownUnicodePropertyError < ScannerError
  def initialize(name)
    super("Unknown unicode character property name #{name}")
  end
end
|
644
|
+
|
645
|
+
|
646
|
+
# Scans the given regular expression text, or Regexp object and collects the
|
647
|
+
# emitted token into an array that gets returned at the end. If a block is
|
648
|
+
# given, it gets called for each emitted token.
|
649
|
+
#
|
650
|
+
# This method may raise errors if a syntax error is encountered.
|
651
|
+
# --------------------------------------------------------------------------
|
652
|
+
def self.scan(input, &block)
|
653
|
+
top, stack = 0, []
|
654
|
+
|
655
|
+
input = input.source if input.is_a?(Regexp)
|
656
|
+
data = input.unpack("c*") if input.is_a?(String)
|
657
|
+
eof = data.length
|
658
|
+
|
659
|
+
@tokens = []
|
660
|
+
@block = block_given? ? block : nil
|
661
|
+
|
662
|
+
in_group, group_depth = false, 0
|
663
|
+
in_set, set_depth, set_type = false, 0, :set
|
664
|
+
|
665
|
+
%% write init;
|
666
|
+
%% write exec;
|
667
|
+
|
668
|
+
raise PrematureEndError.new("(missing group closing paranthesis) "+
|
669
|
+
"[#{in_group}:#{group_depth}]") if in_group
|
670
|
+
raise PrematureEndError.new("(missing set closing bracket) "+
|
671
|
+
"[#{in_set}:#{set_depth}]") if in_set
|
672
|
+
|
673
|
+
# when the entire expression is a literal run
|
674
|
+
self.emit_literal if @literal
|
675
|
+
|
676
|
+
@tokens
|
677
|
+
end
|
678
|
+
|
679
|
+
# Buffers one chunk of literal text, along with its start and end offsets, to
# be emitted later by a call to emit_literal. The chunk is packed from the
# byte values in data[ts..te-1] and may be ASCII or UTF-8. Returns the buffer.
def self.append_literal(data, ts, te)
  chunk = data[ts..te-1].pack('c*')
  (@literal ||= []) << [chunk, ts, te]
end
|
685
|
+
|
686
|
+
# Emits the literal run gathered by one or more calls to append_literal as a
# single :literal token. The token spans from the start of the first chunk to
# the end of the last one. The buffer is cleared afterwards.
def self.emit_literal
  first_chunk, last_chunk = @literal.first, @literal.last
  text = @literal.map {|chunk| chunk.first }.join

  # the chunks were packed from raw byte values; tag the joined text as UTF-8
  # when the running ruby has string encodings
  text.force_encoding('utf-8') if text.respond_to?(:force_encoding)

  self.emit(:literal, :literal, text, first_chunk[1], last_chunk[2])
  @literal = nil
end
|
697
|
+
|
698
|
+
# Records one token. A pending literal run is flushed first, so that tokens
# keep the order in which they appear in the pattern. The token is then
# passed to the block stored by scan, if there is one, and appended to the
# token list, which is returned.
def self.emit(type, token, text, ts, te)
  #puts " > emit: #{type}:#{token} '#{text}' [#{ts}..#{te}]"

  # emit_literal calls back into this method with a :literal token, which
  # must not trigger another flush
  self.emit_literal if @literal && type != :literal

  @block.call(type, token, text, ts, te) if @block

  @tokens << [type, token, text, ts, te]
end
|
711
|
+
|
712
|
+
end # module Regexp::Scanner
|