regexp_parser 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +4 -0
- data/LICENSE +22 -0
- data/README.rdoc +307 -0
- data/Rakefile +91 -0
- data/lib/regexp_parser/ctype.rb +48 -0
- data/lib/regexp_parser/expression/property.rb +108 -0
- data/lib/regexp_parser/expression/set.rb +59 -0
- data/lib/regexp_parser/expression.rb +287 -0
- data/lib/regexp_parser/lexer.rb +105 -0
- data/lib/regexp_parser/parser.rb +417 -0
- data/lib/regexp_parser/scanner/property.rl +534 -0
- data/lib/regexp_parser/scanner/scanner.rl +712 -0
- data/lib/regexp_parser/scanner.rb +3325 -0
- data/lib/regexp_parser/syntax/ruby/1.8.6.rb +14 -0
- data/lib/regexp_parser/syntax/ruby/1.8.7.rb +14 -0
- data/lib/regexp_parser/syntax/ruby/1.8.rb +39 -0
- data/lib/regexp_parser/syntax/ruby/1.9.1.rb +39 -0
- data/lib/regexp_parser/syntax/ruby/1.9.2.rb +10 -0
- data/lib/regexp_parser/syntax/ruby/1.9.3.rb +24 -0
- data/lib/regexp_parser/syntax/ruby/1.9.rb +8 -0
- data/lib/regexp_parser/syntax/tokens.rb +332 -0
- data/lib/regexp_parser/syntax.rb +172 -0
- data/lib/regexp_parser.rb +45 -0
- data/test/helpers.rb +8 -0
- data/test/lexer/test_all.rb +26 -0
- data/test/lexer/test_literals.rb +120 -0
- data/test/lexer/test_nesting.rb +107 -0
- data/test/lexer/test_refcalls.rb +45 -0
- data/test/parser/test_all.rb +44 -0
- data/test/parser/test_alternation.rb +46 -0
- data/test/parser/test_anchors.rb +35 -0
- data/test/parser/test_errors.rb +59 -0
- data/test/parser/test_escapes.rb +48 -0
- data/test/parser/test_expression.rb +51 -0
- data/test/parser/test_groups.rb +69 -0
- data/test/parser/test_properties.rb +346 -0
- data/test/parser/test_quantifiers.rb +236 -0
- data/test/parser/test_refcalls.rb +101 -0
- data/test/parser/test_sets.rb +99 -0
- data/test/scanner/test_all.rb +30 -0
- data/test/scanner/test_anchors.rb +35 -0
- data/test/scanner/test_errors.rb +36 -0
- data/test/scanner/test_escapes.rb +49 -0
- data/test/scanner/test_groups.rb +41 -0
- data/test/scanner/test_literals.rb +85 -0
- data/test/scanner/test_meta.rb +36 -0
- data/test/scanner/test_properties.rb +315 -0
- data/test/scanner/test_quantifiers.rb +38 -0
- data/test/scanner/test_refcalls.rb +45 -0
- data/test/scanner/test_scripts.rb +314 -0
- data/test/scanner/test_sets.rb +80 -0
- data/test/scanner/test_types.rb +30 -0
- data/test/syntax/ruby/test_1.8.rb +57 -0
- data/test/syntax/ruby/test_1.9.1.rb +39 -0
- data/test/syntax/ruby/test_1.9.3.rb +38 -0
- data/test/syntax/ruby/test_all.rb +12 -0
- data/test/syntax/test_all.rb +19 -0
- data/test/test_all.rb +4 -0
- metadata +160 -0
@@ -0,0 +1,712 @@
|
|
1
|
+
%%{
|
2
|
+
machine re_scanner;
|
3
|
+
include re_property "property.rl";
|
4
|
+
|
5
|
+
dot = '.';
|
6
|
+
backslash = '\\';
|
7
|
+
alternation = '|';
|
8
|
+
beginning_of_line = '^';
|
9
|
+
end_of_line = '$';
|
10
|
+
|
11
|
+
range_open = '{';
|
12
|
+
range_close = '}';
|
13
|
+
curlies = range_open | range_close;
|
14
|
+
|
15
|
+
group_open = '(';
|
16
|
+
group_close = ')';
|
17
|
+
parantheses = group_open | group_close;
|
18
|
+
|
19
|
+
set_open = '[';
|
20
|
+
set_close = ']';
|
21
|
+
brackets = set_open | set_close;
|
22
|
+
|
23
|
+
class_name_posix = 'alnum' | 'alpha' | 'blank' |
|
24
|
+
'cntrl' | 'digit' | 'graph' |
|
25
|
+
'lower' | 'print' | 'punct' |
|
26
|
+
'space' | 'upper' | 'xdigit' |
|
27
|
+
'word' | 'ascii';
|
28
|
+
|
29
|
+
class_posix = ('[:' . '^'? . class_name_posix . ':]');
|
30
|
+
|
31
|
+
# these are not supported in ruby, and need verification
|
32
|
+
collating_sequence = '[.' . (alpha | [\-])+ . '.]';
|
33
|
+
character_equivalent = '[=' . alpha . '=]';
|
34
|
+
|
35
|
+
char_type = [dDhHsSwW];
|
36
|
+
|
37
|
+
line_anchor = beginning_of_line | end_of_line;
|
38
|
+
anchor_char = [AbBzZG];
|
39
|
+
|
40
|
+
escaped_ascii = [abefnrstv];
|
41
|
+
octal_sequence = [0-7]{1,3};
|
42
|
+
|
43
|
+
hex_sequence = 'x' . xdigit{1,2};
|
44
|
+
wide_hex_sequence = 'x' . '{' . xdigit{1,8} . '}';
|
45
|
+
|
46
|
+
codepoint_single = 'u' . xdigit{4};
|
47
|
+
codepoint_list = 'u{' . (xdigit{4} . space?)+'}';
|
48
|
+
codepoint_sequence = codepoint_single | codepoint_list;
|
49
|
+
|
50
|
+
control_sequence = ('c' | 'C-') . alpha;
|
51
|
+
meta_sequence = 'M-' . ((backslash . control_sequence) | alpha);
|
52
|
+
|
53
|
+
zero_or_one = '?' | '??' | '?+';
|
54
|
+
zero_or_more = '*' | '*?' | '*+';
|
55
|
+
one_or_more = '+' | '+?' | '++';
|
56
|
+
|
57
|
+
quantifier_greedy = '?' | '*' | '+';
|
58
|
+
quantifier_reluctant = '??' | '*?' | '+?';
|
59
|
+
quantifier_possessive = '?+' | '*+' | '++';
|
60
|
+
quantifier_mode = '?' | '+';
|
61
|
+
|
62
|
+
quantifier_range = range_open . (digit+)? . ','? . (digit+)? .
|
63
|
+
range_close . quantifier_mode?;
|
64
|
+
|
65
|
+
quantifiers = quantifier_greedy | quantifier_reluctant |
|
66
|
+
quantifier_possessive | quantifier_range;
|
67
|
+
|
68
|
+
|
69
|
+
group_comment = '?#' . [^)]+ . group_close;
|
70
|
+
|
71
|
+
group_atomic = '?>';
|
72
|
+
group_passive = '?:';
|
73
|
+
|
74
|
+
assertion_lookahead = '?=';
|
75
|
+
assertion_nlookahead = '?!';
|
76
|
+
assertion_lookbehind = '?<=';
|
77
|
+
assertion_nlookbehind = '?<!';
|
78
|
+
|
79
|
+
group_options = '?' . ([mix]{1,3})? . '-'? . ([mix]{1,3})?;
|
80
|
+
|
81
|
+
group_ref = [gk];
|
82
|
+
group_name = alpha . (alnum+)?;
|
83
|
+
group_number = '-'? . [1-9] . ([0-9]+)?;
|
84
|
+
group_level = [+\-] . [0-9]+;
|
85
|
+
|
86
|
+
group_named = ('?<' . group_name . '>') | ("?'" . group_name . "'");
|
87
|
+
|
88
|
+
group_name_ref = group_ref . (('<' . group_name . group_level? '>') |
|
89
|
+
("'" . group_name . group_level? "'"));
|
90
|
+
|
91
|
+
group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
|
92
|
+
("'" . group_number . group_level? "'"));
|
93
|
+
|
94
|
+
group_type = group_atomic | group_passive | group_named;
|
95
|
+
|
96
|
+
assertion_type = assertion_lookahead | assertion_nlookahead |
|
97
|
+
assertion_lookbehind | assertion_nlookbehind;
|
98
|
+
|
99
|
+
# characters that 'break' a literal
|
100
|
+
meta_char = dot | backslash | alternation |
|
101
|
+
curlies | parantheses | brackets |
|
102
|
+
line_anchor | quantifier_greedy;
|
103
|
+
|
104
|
+
ascii_print = ((0x20..0x7e) - meta_char)+;
|
105
|
+
ascii_nonprint = (0x01..0x1f | 0x7f)+;
|
106
|
+
|
107
|
+
utf8_2_byte = (0xc2..0xdf 0x80..0xbf)+;
|
108
|
+
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf)+;
|
109
|
+
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf)+;
|
110
|
+
utf8_byte_sequence = utf8_2_byte | utf8_3_byte | utf8_4_byte;
|
111
|
+
|
112
|
+
non_literal_escape = char_type | anchor_char | escaped_ascii |
|
113
|
+
group_ref | [xucCM];
|
114
|
+
|
115
|
+
# EOF error, used where it can be detected
|
116
|
+
# raises the bare error class, so the exception is built with its default arguments
action premature_end_error { raise PrematureEndError }
|
117
|
+
|
118
|
+
# group (nesting) and set open/close actions
|
119
|
+
# entering a group: go one nesting level deeper and flag that a group is open
action group_opened { group_depth += 1; in_group = true }
|
120
|
+
# leaving a group: go one nesting level up, a group is still open only while
# the depth stays positive (the comparison already yields true or false)
action group_closed { group_depth -= 1; in_group = group_depth > 0 }
|
121
|
+
|
122
|
+
# Character set scanner, continues consuming characters until it meets the
|
123
|
+
# closing bracket of the set.
|
124
|
+
# --------------------------------------------------------------------------
|
125
|
+
character_set := |*
|
126
|
+
']' {
|
127
|
+
set_type = set_depth > 1 ? :subset : :set
|
128
|
+
set_depth -= 1; in_set = set_depth > 0 ? true : false
|
129
|
+
|
130
|
+
self.emit(set_type, :close, data[ts..te-1].pack('c*'), ts, te)
|
131
|
+
|
132
|
+
if set_depth == 0
|
133
|
+
fgoto main;
|
134
|
+
else
|
135
|
+
fret;
|
136
|
+
end
|
137
|
+
};
|
138
|
+
|
139
|
+
'-]' { # special case, emits two tokens
  set_type = set_depth > 1 ? :subset : :set
  set_depth -= 1; in_set = set_depth > 0 ? true : false

  # each token gets its own one-character span: the dash starts at ts and
  # the closing bracket right after it, so that te - ts always equals the
  # length of the emitted text, as it does for every other token
  self.emit(set_type, :member, data[ts..te-2].pack('c*'), ts, te-1)
  self.emit(set_type, :close,  data[ts+1..te-1].pack('c*'), ts+1, te)

  # leaving the outermost set goes back to the main scanner, leaving a
  # nested set returns to the scanner that called into it
  if set_depth == 0
    fgoto main;
  else
    fret;
  end
};
|
152
|
+
|
153
|
+
'^' {
|
154
|
+
text = data[ts..te-1].pack('c*')
|
155
|
+
if @tokens.last[1] == :open
|
156
|
+
self.emit(set_type, :negate, text, ts, te)
|
157
|
+
else
|
158
|
+
self.emit(set_type, :member, text, ts, te)
|
159
|
+
end
|
160
|
+
};
|
161
|
+
|
162
|
+
alnum . '-' . alnum {
|
163
|
+
self.emit(set_type, :range, data[ts..te-1].pack('c*'), ts, te)
|
164
|
+
};
|
165
|
+
|
166
|
+
'&&' {
|
167
|
+
self.emit(set_type, :intersection, data[ts..te-1].pack('c*'), ts, te)
|
168
|
+
};
|
169
|
+
|
170
|
+
'\\' {
|
171
|
+
fcall set_escape_sequence;
|
172
|
+
};
|
173
|
+
|
174
|
+
'[' >(open_bracket, 1) {
|
175
|
+
set_depth += 1; in_set = true
|
176
|
+
set_type = set_depth > 1 ? :subset : :set
|
177
|
+
|
178
|
+
self.emit(set_type, :open, data[ts..te-1].pack('c*'), ts, te)
|
179
|
+
fcall character_set;
|
180
|
+
};
|
181
|
+
|
182
|
+
class_posix >(open_bracket, 1) @eof(premature_end_error) {
|
183
|
+
text = data[ts..te-1].pack('c*')
|
184
|
+
|
185
|
+
class_name = text[2..-3]
|
186
|
+
if class_name[0].chr == '^'
|
187
|
+
class_name = "non#{class_name[1..-1]}"
|
188
|
+
end
|
189
|
+
|
190
|
+
token_sym = "class_#{class_name}".to_sym
|
191
|
+
self.emit(set_type, token_sym, text, ts, te)
|
192
|
+
};
|
193
|
+
|
194
|
+
collating_sequence >(open_bracket, 1) @eof(premature_end_error) {
|
195
|
+
self.emit(set_type, :collation, data[ts..te-1].pack('c*'), ts, te)
|
196
|
+
};
|
197
|
+
|
198
|
+
character_equivalent >(open_bracket, 1) @eof(premature_end_error) {
|
199
|
+
self.emit(set_type, :equivalent, data[ts..te-1].pack('c*'), ts, te)
|
200
|
+
};
|
201
|
+
|
202
|
+
# exclude the closing bracket as a cleaner workaround for dealing with the
|
203
|
+
# ambiguity caused upon exit from the unicode properties machine
|
204
|
+
meta_char -- ']' {
|
205
|
+
self.emit(set_type, :member, data[ts..te-1].pack('c*'), ts, te)
|
206
|
+
};
|
207
|
+
|
208
|
+
any |
|
209
|
+
ascii_nonprint |
|
210
|
+
utf8_2_byte |
|
211
|
+
utf8_3_byte |
|
212
|
+
utf8_4_byte {
|
213
|
+
self.emit(set_type, :member, data[ts..te-1].pack('c*'), ts, te)
|
214
|
+
};
|
215
|
+
*|;
|
216
|
+
|
217
|
+
# set escapes scanner
|
218
|
+
# --------------------------------------------------------------------------
|
219
|
+
set_escape_sequence := |*
|
220
|
+
'b' {
|
221
|
+
self.emit(set_type, :backspace, data[ts-1..te-1].pack('c*'), ts-1, te)
|
222
|
+
fret;
|
223
|
+
};
|
224
|
+
|
225
|
+
char_type {
|
226
|
+
case text = data[ts-1..te-1].pack('c*')
|
227
|
+
when '\d'; self.emit(set_type, :type_digit, text, ts-1, te)
|
228
|
+
when '\D'; self.emit(set_type, :type_nondigit, text, ts-1, te)
|
229
|
+
when '\h'; self.emit(set_type, :type_hex, text, ts-1, te)
|
230
|
+
when '\H'; self.emit(set_type, :type_nonhex, text, ts-1, te)
|
231
|
+
when '\s'; self.emit(set_type, :type_space, text, ts-1, te)
|
232
|
+
when '\S'; self.emit(set_type, :type_nonspace, text, ts-1, te)
|
233
|
+
when '\w'; self.emit(set_type, :type_word, text, ts-1, te)
|
234
|
+
when '\W'; self.emit(set_type, :type_nonword, text, ts-1, te)
|
235
|
+
end
|
236
|
+
fret;
|
237
|
+
};
|
238
|
+
|
239
|
+
hex_sequence . '-\\' . hex_sequence {
|
240
|
+
self.emit(set_type, :range_hex, data[ts-1..te-1].pack('c*'), ts-1, te)
|
241
|
+
fret;
|
242
|
+
};
|
243
|
+
|
244
|
+
hex_sequence {
|
245
|
+
self.emit(set_type, :member_hex, data[ts-1..te-1].pack('c*'), ts-1, te)
|
246
|
+
fret;
|
247
|
+
};
|
248
|
+
|
249
|
+
meta_char | [\\\]\-\,] {
|
250
|
+
self.emit(set_type, :escape, data[ts-1..te-1].pack('c*'), ts-1, te)
|
251
|
+
fret;
|
252
|
+
};
|
253
|
+
|
254
|
+
property_char > (escaped_set_alpha, 2) {
|
255
|
+
fhold;
|
256
|
+
fnext character_set;
|
257
|
+
fcall unicode_property;
|
258
|
+
fret;
|
259
|
+
};
|
260
|
+
|
261
|
+
# special case exclusion of escaped dash, could be cleaner.
|
262
|
+
(ascii_print - char_type -- [\-}]) > (escaped_set_alpha, 1) |
|
263
|
+
ascii_nonprint |
|
264
|
+
utf8_2_byte |
|
265
|
+
utf8_3_byte |
|
266
|
+
utf8_4_byte {
|
267
|
+
self.emit(set_type, :escape, data[ts-1..te-1].pack('c*'), ts-1, te)
|
268
|
+
fret;
|
269
|
+
};
|
270
|
+
*|;
|
271
|
+
|
272
|
+
|
273
|
+
# escape sequence scanner
|
274
|
+
# --------------------------------------------------------------------------
|
275
|
+
escape_sequence := |*
|
276
|
+
[1-9] {
|
277
|
+
text = data[ts-1..te-1].pack('c*')
|
278
|
+
self.emit(:backref, :number, text, ts-1, te)
|
279
|
+
fret;
|
280
|
+
};
|
281
|
+
|
282
|
+
octal_sequence {
|
283
|
+
self.emit(:escape, :octal, data[ts-1..te-1].pack('c*'), ts-1, te)
|
284
|
+
fret;
|
285
|
+
};
|
286
|
+
|
287
|
+
meta_char {
|
288
|
+
case text = data[ts-1..te-1].pack('c*')
|
289
|
+
when '\.'; self.emit(:escape, :dot, text, ts-1, te)
|
290
|
+
when '\|'; self.emit(:escape, :alternation, text, ts-1, te)
|
291
|
+
when '\^'; self.emit(:escape, :beginning_of_line, text, ts-1, te)
|
292
|
+
when '\$'; self.emit(:escape, :end_of_line, text, ts-1, te)
|
293
|
+
when '\?'; self.emit(:escape, :zero_or_one, text, ts-1, te)
|
294
|
+
when '\*'; self.emit(:escape, :zero_or_more, text, ts-1, te)
|
295
|
+
when '\+'; self.emit(:escape, :one_or_more, text, ts-1, te)
|
296
|
+
when '\('; self.emit(:escape, :group_open, text, ts-1, te)
|
297
|
+
when '\)'; self.emit(:escape, :group_close, text, ts-1, te)
|
298
|
+
when '\{'; self.emit(:escape, :interval_open, text, ts-1, te)
|
299
|
+
when '\}'; self.emit(:escape, :interval_close, text, ts-1, te)
|
300
|
+
when '\['; self.emit(:escape, :set_open, text, ts-1, te)
|
301
|
+
when '\]'; self.emit(:escape, :set_close, text, ts-1, te)
|
302
|
+
when "\\\\";
|
303
|
+
self.emit(:escape, :backslash, text, ts-1, te)
|
304
|
+
end
|
305
|
+
fret;
|
306
|
+
};
|
307
|
+
|
308
|
+
escaped_ascii > (escaped_alpha, 7) {
|
309
|
+
# \b is emitted as backspace only when inside a character set, otherwise
|
310
|
+
# it is a word boundary anchor. A syntax might "normalize" it if needed.
|
311
|
+
case text = data[ts-1..te-1].pack('c*')
|
312
|
+
when '\a'; self.emit(:escape, :bell, text, ts-1, te)
|
313
|
+
when '\e'; self.emit(:escape, :escape, text, ts-1, te)
|
314
|
+
when '\f'; self.emit(:escape, :form_feed, text, ts-1, te)
|
315
|
+
when '\n'; self.emit(:escape, :newline, text, ts-1, te)
|
316
|
+
when '\r'; self.emit(:escape, :carriage, text, ts-1, te)
|
317
|
+
when '\s'; self.emit(:escape, :space, text, ts-1, te)
|
318
|
+
when '\t'; self.emit(:escape, :tab, text, ts-1, te)
|
319
|
+
when '\v'; self.emit(:escape, :vertical_tab, text, ts-1, te)
|
320
|
+
end
|
321
|
+
fret;
|
322
|
+
};
|
323
|
+
|
324
|
+
codepoint_sequence > (escaped_alpha, 6) {
|
325
|
+
text = data[ts-1..te-1].pack('c*')
|
326
|
+
if text[2].chr == '{'
|
327
|
+
self.emit(:escape, :codepoint_list, text, ts-1, te)
|
328
|
+
else
|
329
|
+
self.emit(:escape, :codepoint, text, ts-1, te)
|
330
|
+
end
|
331
|
+
fret;
|
332
|
+
};
|
333
|
+
|
334
|
+
hex_sequence > (escaped_alpha, 5) {
|
335
|
+
self.emit(:escape, :hex, data[ts-1..te-1].pack('c*'), ts-1, te)
|
336
|
+
fret;
|
337
|
+
};
|
338
|
+
|
339
|
+
wide_hex_sequence > (escaped_alpha, 5) {
|
340
|
+
self.emit(:escape, :hex_wide, data[ts-1..te-1].pack('c*'), ts-1, te)
|
341
|
+
fret;
|
342
|
+
};
|
343
|
+
|
344
|
+
control_sequence > (escaped_alpha, 4) {
|
345
|
+
self.emit(:escape, :control, data[ts-1..te-1].pack('c*'), ts-1, te)
|
346
|
+
fret;
|
347
|
+
};
|
348
|
+
|
349
|
+
meta_sequence > (backslashed, 3) {
|
350
|
+
self.emit(:escape, :meta_sequence, data[ts-1..te-1].pack('c*'), ts-1, te)
|
351
|
+
};
|
352
|
+
|
353
|
+
property_char > (escaped_alpha, 2) {
|
354
|
+
fhold;
|
355
|
+
fnext main;
|
356
|
+
fcall unicode_property; fret;
|
357
|
+
};
|
358
|
+
|
359
|
+
(any -- non_literal_escape) > (escaped_alpha, 1) {
|
360
|
+
self.emit(:escape, :literal, data[ts-1..te-1].pack('c*'), ts-1, te)
|
361
|
+
fret;
|
362
|
+
};
|
363
|
+
*|;
|
364
|
+
|
365
|
+
|
366
|
+
# Main scanner
|
367
|
+
# --------------------------------------------------------------------------
|
368
|
+
main := |*
|
369
|
+
|
370
|
+
# Meta characters
|
371
|
+
# ------------------------------------------------------------------------
|
372
|
+
dot {
|
373
|
+
self.emit(:meta, :dot, data[ts..te-1].pack('c*'), ts, te)
|
374
|
+
};
|
375
|
+
|
376
|
+
alternation {
|
377
|
+
self.emit(:meta, :alternation, data[ts..te-1].pack('c*'), ts, te)
|
378
|
+
};
|
379
|
+
|
380
|
+
# Anchors
|
381
|
+
# ------------------------------------------------------------------------
|
382
|
+
beginning_of_line {
|
383
|
+
self.emit(:anchor, :beginning_of_line, data[ts..te-1].pack('c*'), ts, te)
|
384
|
+
};
|
385
|
+
|
386
|
+
end_of_line {
|
387
|
+
self.emit(:anchor, :end_of_line, data[ts..te-1].pack('c*'), ts, te)
|
388
|
+
};
|
389
|
+
|
390
|
+
backslash . anchor_char > (backslashed, 3) {
|
391
|
+
case text = data[ts..te-1].pack('c*')
|
392
|
+
when '\\A'; self.emit(:anchor, :bos, text, ts, te)
|
393
|
+
when '\\z'; self.emit(:anchor, :eos, text, ts, te)
|
394
|
+
when '\\Z'; self.emit(:anchor, :eos_ob_eol, text, ts, te)
|
395
|
+
when '\\b'; self.emit(:anchor, :word_boundary, text, ts, te)
|
396
|
+
when '\\B'; self.emit(:anchor, :nonword_boundary, text, ts, te)
|
397
|
+
when '\\G'; self.emit(:anchor, :match_start, text, ts, te)
|
398
|
+
else raise ScannerError.new("Unsupported anchor at #{text} (char #{ts})")
|
399
|
+
end
|
400
|
+
};
|
401
|
+
|
402
|
+
# Character types
|
403
|
+
# \d, \D digit, non-digit
|
404
|
+
# \h, \H hex, non-hex
|
405
|
+
# \s, \S space, non-space
|
406
|
+
# \w, \W word, non-word
|
407
|
+
# ------------------------------------------------------------------------
|
408
|
+
backslash . char_type > (backslashed, 2) {
|
409
|
+
case text = data[ts..te-1].pack('c*')
|
410
|
+
when '\\d'; self.emit(:type, :digit, text, ts, te)
|
411
|
+
when '\\D'; self.emit(:type, :nondigit, text, ts, te)
|
412
|
+
when '\\h'; self.emit(:type, :hex, text, ts, te)
|
413
|
+
when '\\H'; self.emit(:type, :nonhex, text, ts, te)
|
414
|
+
when '\\s'; self.emit(:type, :space, text, ts, te)
|
415
|
+
when '\\S'; self.emit(:type, :nonspace, text, ts, te)
|
416
|
+
when '\\w'; self.emit(:type, :word, text, ts, te)
|
417
|
+
when '\\W'; self.emit(:type, :nonword, text, ts, te)
|
418
|
+
end
|
419
|
+
};
|
420
|
+
|
421
|
+
|
422
|
+
# Character sets
|
423
|
+
# ------------------------------------------------------------------------
|
424
|
+
set_open {
|
425
|
+
set_depth += 1; in_set = true
|
426
|
+
set_type = set_depth > 1 ? :subset : :set
|
427
|
+
|
428
|
+
self.emit(set_type, :open, data[ts..te-1].pack('c*'), ts, te)
|
429
|
+
fcall character_set;
|
430
|
+
};
|
431
|
+
|
432
|
+
# (?#...) comments: parsed as a single expression, without introducing a
|
433
|
+
# new nesting level. Comments may not include parentheses, escaped or not.
|
434
|
+
# special case for close, action performed on all transitions to get the
|
435
|
+
# correct closing count.
|
436
|
+
# ------------------------------------------------------------------------
|
437
|
+
group_open . group_comment $group_closed {
|
438
|
+
self.emit(:group, :comment, data[ts..te-1].pack('c*'), ts, te)
|
439
|
+
};
|
440
|
+
|
441
|
+
# Expression options:
|
442
|
+
# (?imx-imx) option on/off
|
443
|
+
# i: ignore case
|
444
|
+
# m: multi-line (dot(.) match newline)
|
445
|
+
# x: extended form
|
446
|
+
#
|
447
|
+
# (?imx-imx:subexp) option on/off for subexp
|
448
|
+
# ------------------------------------------------------------------------
|
449
|
+
group_open . group_options >group_opened {
|
450
|
+
# special handling to resolve ambiguity with passive groups
|
451
|
+
if data[te]
|
452
|
+
c = data[te].chr
|
453
|
+
if c == ':' # include the ':'
|
454
|
+
self.emit(:group, :options, data[ts..te].pack('c*'), ts, te+1)
|
455
|
+
p += 1
|
456
|
+
elsif c == ')' # just options by themselves
|
457
|
+
self.emit(:group, :options, data[ts..te-1].pack('c*'), ts, te)
|
458
|
+
else
|
459
|
+
raise ScannerError.new(
|
460
|
+
"Unexpected '#{c}' in options sequence, ':' or ')' expected")
|
461
|
+
end
|
462
|
+
else
|
463
|
+
raise PrematureEndError.new("options") unless data[te]
|
464
|
+
end
|
465
|
+
};
|
466
|
+
|
467
|
+
# Assertions
|
468
|
+
# (?=subexp) look-ahead
|
469
|
+
# (?!subexp) negative look-ahead
|
470
|
+
# (?<=subexp) look-behind
|
471
|
+
# (?<!subexp) negative look-behind
|
472
|
+
# ------------------------------------------------------------------------
|
473
|
+
group_open . assertion_type >group_opened {
|
474
|
+
case text = data[ts..te-1].pack('c*')
|
475
|
+
when '(?='; self.emit(:assertion, :lookahead, text, ts, te)
|
476
|
+
when '(?!'; self.emit(:assertion, :nlookahead, text, ts, te)
|
477
|
+
when '(?<='; self.emit(:assertion, :lookbehind, text, ts, te)
|
478
|
+
when '(?<!'; self.emit(:assertion, :nlookbehind, text, ts, te)
|
479
|
+
end
|
480
|
+
};
|
481
|
+
|
482
|
+
# Groups
|
483
|
+
# (?:subexp) passive (non-captured) group
|
484
|
+
# (?>subexp) atomic group, don't backtrack in subexp.
|
485
|
+
# (?<name>subexp) named group
|
486
|
+
# (?'name'subexp) named group (single quoted version)
|
487
|
+
# (subexp) captured group
|
488
|
+
# ------------------------------------------------------------------------
|
489
|
+
group_open . group_type >group_opened {
|
490
|
+
case text = data[ts..te-1].pack('c*')
|
491
|
+
when '(?:'; self.emit(:group, :passive, text, ts, te)
|
492
|
+
when '(?>'; self.emit(:group, :atomic, text, ts, te)
|
493
|
+
|
494
|
+
when /\(\?<\w+>/
|
495
|
+
self.emit(:group, :named_ab, text, ts, te)
|
496
|
+
when /\(\?'\w+'/
|
497
|
+
self.emit(:group, :named_sq, text, ts, te)
|
498
|
+
end
|
499
|
+
};
|
500
|
+
|
501
|
+
group_open @group_opened {
|
502
|
+
text = data[ts..te-1].pack('c*')
|
503
|
+
self.emit(:group, :capture, text, ts, te)
|
504
|
+
};
|
505
|
+
|
506
|
+
group_close @group_closed {
|
507
|
+
self.emit(:group, :close, data[ts..te-1].pack('c*'), ts, te)
|
508
|
+
};
|
509
|
+
|
510
|
+
|
511
|
+
# Group back-reference, named and numbered
|
512
|
+
# ------------------------------------------------------------------------
|
513
|
+
backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
|
514
|
+
case text = data[ts..te-1].pack('c*')
|
515
|
+
when /\\([gk])<[^\d-](\w+)?>/ # angle-brackets
|
516
|
+
if $1 == 'k'
|
517
|
+
self.emit(:backref, :name_ref_ab, text, ts, te)
|
518
|
+
else
|
519
|
+
self.emit(:backref, :name_call_ab, text, ts, te)
|
520
|
+
end
|
521
|
+
|
522
|
+
when /\\([gk])'[^\d-](\w+)?'/ #single quotes
|
523
|
+
if $1 == 'k'
|
524
|
+
self.emit(:backref, :name_ref_sq, text, ts, te)
|
525
|
+
else
|
526
|
+
self.emit(:backref, :name_call_sq, text, ts, te)
|
527
|
+
end
|
528
|
+
|
529
|
+
when /\\([gk])<\d+>/ # angle-brackets
|
530
|
+
if $1 == 'k'
|
531
|
+
self.emit(:backref, :number_ref_ab, text, ts, te)
|
532
|
+
else
|
533
|
+
self.emit(:backref, :number_call_ab, text, ts, te)
|
534
|
+
end
|
535
|
+
|
536
|
+
when /\\([gk])'\d+'/ # single quotes
|
537
|
+
if $1 == 'k'
|
538
|
+
self.emit(:backref, :number_ref_sq, text, ts, te)
|
539
|
+
else
|
540
|
+
self.emit(:backref, :number_call_sq, text, ts, te)
|
541
|
+
end
|
542
|
+
|
543
|
+
when /\\([gk])<-\d+>/ # angle-brackets
|
544
|
+
if $1 == 'k'
|
545
|
+
self.emit(:backref, :number_rel_ref_ab, text, ts, te)
|
546
|
+
else
|
547
|
+
self.emit(:backref, :number_rel_call_ab, text, ts, te)
|
548
|
+
end
|
549
|
+
|
550
|
+
when /\\([gk])'-\d+'/ # single quotes
|
551
|
+
if $1 == 'k'
|
552
|
+
self.emit(:backref, :number_rel_ref_sq, text, ts, te)
|
553
|
+
else
|
554
|
+
self.emit(:backref, :number_rel_call_sq, text, ts, te)
|
555
|
+
end
|
556
|
+
|
557
|
+
when /\\k<[^\d-](\w+)?[+\-]\d+>/ # angle-brackets
|
558
|
+
self.emit(:backref, :name_nest_ref_ab, text, ts, te)
|
559
|
+
|
560
|
+
when /\\k'[^\d-](\w+)?[+\-]\d+'/ # single-quotes
|
561
|
+
self.emit(:backref, :name_nest_ref_sq, text, ts, te)
|
562
|
+
|
563
|
+
when /\\([gk])<\d+[+\-]\d+>/ # angle-brackets
|
564
|
+
self.emit(:backref, :number_nest_ref_ab, text, ts, te)
|
565
|
+
|
566
|
+
when /\\([gk])'\d+[+\-]\d+'/ # single-quotes
|
567
|
+
self.emit(:backref, :number_nest_ref_sq, text, ts, te)
|
568
|
+
|
569
|
+
end
|
570
|
+
};
|
571
|
+
|
572
|
+
|
573
|
+
# Quantifiers
|
574
|
+
# ------------------------------------------------------------------------
|
575
|
+
zero_or_one {
|
576
|
+
case text = data[ts..te-1].pack('c*')
|
577
|
+
when '?' ; self.emit(:quantifier, :zero_or_one, text, ts, te)
|
578
|
+
when '??'; self.emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
|
579
|
+
when '?+'; self.emit(:quantifier, :zero_or_one_possessive, text, ts, te)
|
580
|
+
end
|
581
|
+
};
|
582
|
+
|
583
|
+
zero_or_more {
|
584
|
+
case text = data[ts..te-1].pack('c*')
|
585
|
+
when '*' ; self.emit(:quantifier, :zero_or_more, text, ts, te)
|
586
|
+
when '*?'; self.emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
|
587
|
+
when '*+'; self.emit(:quantifier, :zero_or_more_possessive, text, ts, te)
|
588
|
+
end
|
589
|
+
};
|
590
|
+
|
591
|
+
one_or_more {
|
592
|
+
case text = data[ts..te-1].pack('c*')
|
593
|
+
when '+' ; self.emit(:quantifier, :one_or_more, text, ts, te)
|
594
|
+
when '+?'; self.emit(:quantifier, :one_or_more_reluctant, text, ts, te)
|
595
|
+
when '++'; self.emit(:quantifier, :one_or_more_possessive, text, ts, te)
|
596
|
+
end
|
597
|
+
};
|
598
|
+
|
599
|
+
quantifier_range @err(premature_end_error) {
|
600
|
+
self.emit(:quantifier, :interval, data[ts..te-1].pack('c*'), ts, te)
|
601
|
+
};
|
602
|
+
|
603
|
+
# Escaped sequences
|
604
|
+
# ------------------------------------------------------------------------
|
605
|
+
backslash > (backslashed, 1) {
|
606
|
+
fcall escape_sequence;
|
607
|
+
};
|
608
|
+
|
609
|
+
# Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
|
610
|
+
# except meta characters.
|
611
|
+
# ------------------------------------------------------------------------
|
612
|
+
ascii_print+ |
|
613
|
+
ascii_nonprint+ |
|
614
|
+
utf8_2_byte+ |
|
615
|
+
utf8_3_byte+ |
|
616
|
+
utf8_4_byte+ {
|
617
|
+
self.append_literal(data, ts, te)
|
618
|
+
};
|
619
|
+
|
620
|
+
*|;
|
621
|
+
}%%
|
622
|
+
|
623
|
+
|
624
|
+
module Regexp::Scanner
|
625
|
+
%% write data;
|
626
|
+
|
627
|
+
# Base class of the errors raised by the scanner. The message given to the
# constructor is handed to StandardError as is, which the inherited
# initializer already does, so no custom initializer is defined here.
class ScannerError < StandardError
end
|
632
|
+
|
633
|
+
# Scanner error signalling that the pattern ended too early.
class PrematureEndError < ScannerError
  # where: optional text appended to the fixed message prefix; it defaults
  # to an empty string, so the error can also be raised without arguments.
  def initialize(where = '')
    super("Premature end of pattern: #{where}")
  end
end
|
638
|
+
|
639
|
+
# Scanner error for a unicode character property name that is not known.
# NOTE(review): nothing in this part of the file raises it, presumably the
# included property machine does -- confirm.
class UnknownUnicodePropertyError < ScannerError
  # name: the offending property name, appended to the error message.
  def initialize(name)
    super("Unknown unicode character property name #{name}")
  end
end
|
644
|
+
|
645
|
+
|
646
|
+
# Scans the given regular expression text, or Regexp object and collects the
|
647
|
+
# emitted token into an array that gets returned at the end. If a block is
|
648
|
+
# given, it gets called for each emitted token.
|
649
|
+
#
|
650
|
+
# This method may raise errors if a syntax error is encountered.
|
651
|
+
# --------------------------------------------------------------------------
|
652
|
+
def self.scan(input, &block)
|
653
|
+
top, stack = 0, []
|
654
|
+
|
655
|
+
input = input.source if input.is_a?(Regexp)
|
656
|
+
data = input.unpack("c*") if input.is_a?(String)
|
657
|
+
eof = data.length
|
658
|
+
|
659
|
+
@tokens = []
|
660
|
+
@block = block_given? ? block : nil
|
661
|
+
|
662
|
+
in_group, group_depth = false, 0
|
663
|
+
in_set, set_depth, set_type = false, 0, :set
|
664
|
+
|
665
|
+
%% write init;
|
666
|
+
%% write exec;
|
667
|
+
|
668
|
+
raise PrematureEndError.new("(missing group closing paranthesis) "+
|
669
|
+
"[#{in_group}:#{group_depth}]") if in_group
|
670
|
+
raise PrematureEndError.new("(missing set closing bracket) "+
|
671
|
+
"[#{in_set}:#{set_depth}]") if in_set
|
672
|
+
|
673
|
+
# when the entire expression is a literal run
|
674
|
+
self.emit_literal if @literal
|
675
|
+
|
676
|
+
@tokens
|
677
|
+
end
|
678
|
+
|
679
|
+
# Appends one or more characters to the literal buffer, to be emitted later
# by a call to emit_literal. The contents may be a mix of ASCII and UTF-8.
#
# data: array of byte values, ts/te: start (inclusive) and end (exclusive)
# offsets of the run within data. Returns the literal buffer.
def self.append_literal(data, ts, te)
  @literal ||= []

  # exclusive range instead of ts..te-1: with te == 0 the latter turns into
  # ts..-1 and would copy everything up to the end of data
  @literal << [data[ts...te].pack('c*'), ts, te]
end
|
685
|
+
|
686
|
+
# Emits the literal run gathered by one or more calls to append_literal as
# a single :literal token that spans from the start of the first piece to
# the end of the last one, and empties the buffer. Returns nil.
def self.emit_literal
  # detach the buffer before emitting, so it is cleared even when emit (or
  # the block it calls) raises
  run, @literal = @literal, nil

  ts, te = run.first[1], run.last[2]
  text = run.map { |piece| piece[0] }.join

  # tag the joined bytes as utf-8, only when the string supports it
  text.force_encoding('utf-8') if text.respond_to?(:force_encoding)

  self.emit(:literal, :literal, text, ts, te)
  nil
end
|
697
|
+
|
698
|
+
# Records one token. A pending literal run is flushed first, unless the
# token being emitted is that literal itself, so tokens keep their order.
# The token is passed to the block given to scan, if any, as
# (type, token, text, ts, te) and then appended to the token list, which
# is returned.
def self.emit(type, token, text, ts, te)
  self.emit_literal if @literal && type != :literal

  @block.call(type, token, text, ts, te) if @block

  @tokens << [type, token, text, ts, te]
end
|
711
|
+
|
712
|
+
end # module Regexp::Scanner
|