regexp_parser 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/ChangeLog +4 -0
  2. data/LICENSE +22 -0
  3. data/README.rdoc +307 -0
  4. data/Rakefile +91 -0
  5. data/lib/regexp_parser/ctype.rb +48 -0
  6. data/lib/regexp_parser/expression/property.rb +108 -0
  7. data/lib/regexp_parser/expression/set.rb +59 -0
  8. data/lib/regexp_parser/expression.rb +287 -0
  9. data/lib/regexp_parser/lexer.rb +105 -0
  10. data/lib/regexp_parser/parser.rb +417 -0
  11. data/lib/regexp_parser/scanner/property.rl +534 -0
  12. data/lib/regexp_parser/scanner/scanner.rl +712 -0
  13. data/lib/regexp_parser/scanner.rb +3325 -0
  14. data/lib/regexp_parser/syntax/ruby/1.8.6.rb +14 -0
  15. data/lib/regexp_parser/syntax/ruby/1.8.7.rb +14 -0
  16. data/lib/regexp_parser/syntax/ruby/1.8.rb +39 -0
  17. data/lib/regexp_parser/syntax/ruby/1.9.1.rb +39 -0
  18. data/lib/regexp_parser/syntax/ruby/1.9.2.rb +10 -0
  19. data/lib/regexp_parser/syntax/ruby/1.9.3.rb +24 -0
  20. data/lib/regexp_parser/syntax/ruby/1.9.rb +8 -0
  21. data/lib/regexp_parser/syntax/tokens.rb +332 -0
  22. data/lib/regexp_parser/syntax.rb +172 -0
  23. data/lib/regexp_parser.rb +45 -0
  24. data/test/helpers.rb +8 -0
  25. data/test/lexer/test_all.rb +26 -0
  26. data/test/lexer/test_literals.rb +120 -0
  27. data/test/lexer/test_nesting.rb +107 -0
  28. data/test/lexer/test_refcalls.rb +45 -0
  29. data/test/parser/test_all.rb +44 -0
  30. data/test/parser/test_alternation.rb +46 -0
  31. data/test/parser/test_anchors.rb +35 -0
  32. data/test/parser/test_errors.rb +59 -0
  33. data/test/parser/test_escapes.rb +48 -0
  34. data/test/parser/test_expression.rb +51 -0
  35. data/test/parser/test_groups.rb +69 -0
  36. data/test/parser/test_properties.rb +346 -0
  37. data/test/parser/test_quantifiers.rb +236 -0
  38. data/test/parser/test_refcalls.rb +101 -0
  39. data/test/parser/test_sets.rb +99 -0
  40. data/test/scanner/test_all.rb +30 -0
  41. data/test/scanner/test_anchors.rb +35 -0
  42. data/test/scanner/test_errors.rb +36 -0
  43. data/test/scanner/test_escapes.rb +49 -0
  44. data/test/scanner/test_groups.rb +41 -0
  45. data/test/scanner/test_literals.rb +85 -0
  46. data/test/scanner/test_meta.rb +36 -0
  47. data/test/scanner/test_properties.rb +315 -0
  48. data/test/scanner/test_quantifiers.rb +38 -0
  49. data/test/scanner/test_refcalls.rb +45 -0
  50. data/test/scanner/test_scripts.rb +314 -0
  51. data/test/scanner/test_sets.rb +80 -0
  52. data/test/scanner/test_types.rb +30 -0
  53. data/test/syntax/ruby/test_1.8.rb +57 -0
  54. data/test/syntax/ruby/test_1.9.1.rb +39 -0
  55. data/test/syntax/ruby/test_1.9.3.rb +38 -0
  56. data/test/syntax/ruby/test_all.rb +12 -0
  57. data/test/syntax/test_all.rb +19 -0
  58. data/test/test_all.rb +4 -0
  59. metadata +160 -0
@@ -0,0 +1,712 @@
1
+ %%{
2
+ machine re_scanner;
3
+ include re_property "property.rl";
4
+
5
+ dot = '.';
6
+ backslash = '\\';
7
+ alternation = '|';
8
+ beginning_of_line = '^';
9
+ end_of_line = '$';
10
+
11
+ range_open = '{';
12
+ range_close = '}';
13
+ curlies = range_open | range_close;
14
+
15
+ group_open = '(';
16
+ group_close = ')';
17
+ parantheses = group_open | group_close;
18
+
19
+ set_open = '[';
20
+ set_close = ']';
21
+ brackets = set_open | set_close;
22
+
23
+ class_name_posix = 'alnum' | 'alpha' | 'blank' |
24
+ 'cntrl' | 'digit' | 'graph' |
25
+ 'lower' | 'print' | 'punct' |
26
+ 'space' | 'upper' | 'xdigit' |
27
+ 'word' | 'ascii';
28
+
29
+ class_posix = ('[:' . '^'? . class_name_posix . ':]');
30
+
31
+ # these are not supported in ruby, and need verification
32
+ collating_sequence = '[.' . (alpha | [\-])+ . '.]';
33
+ character_equivalent = '[=' . alpha . '=]';
34
+
35
+ char_type = [dDhHsSwW];
36
+
37
+ line_anchor = beginning_of_line | end_of_line;
38
+ anchor_char = [AbBzZG];
39
+
40
+ escaped_ascii = [abefnrstv];
41
+ octal_sequence = [0-7]{1,3};
42
+
43
+ hex_sequence = 'x' . xdigit{1,2};
44
+ wide_hex_sequence = 'x' . '{' . xdigit{1,8} . '}';
45
+
46
+ codepoint_single = 'u' . xdigit{4};
47
+ codepoint_list = 'u{' . (xdigit{4} . space?)+'}';
48
+ codepoint_sequence = codepoint_single | codepoint_list;
49
+
50
+ control_sequence = ('c' | 'C-') . alpha;
51
+ meta_sequence = 'M-' . ((backslash . control_sequence) | alpha);
52
+
53
+ zero_or_one = '?' | '??' | '?+';
54
+ zero_or_more = '*' | '*?' | '*+';
55
+ one_or_more = '+' | '+?' | '++';
56
+
57
+ quantifier_greedy = '?' | '*' | '+';
58
+ quantifier_reluctant = '??' | '*?' | '+?';
59
+ quantifier_possessive = '?+' | '*+' | '++';
60
+ quantifier_mode = '?' | '+';
61
+
62
+ quantifier_range = range_open . (digit+)? . ','? . (digit+)? .
63
+ range_close . quantifier_mode?;
64
+
65
+ quantifiers = quantifier_greedy | quantifier_reluctant |
66
+ quantifier_possessive | quantifier_range;
67
+
68
+
69
+ group_comment = '?#' . [^)]+ . group_close;
70
+
71
+ group_atomic = '?>';
72
+ group_passive = '?:';
73
+
74
+ assertion_lookahead = '?=';
75
+ assertion_nlookahead = '?!';
76
+ assertion_lookbehind = '?<=';
77
+ assertion_nlookbehind = '?<!';
78
+
79
+ group_options = '?' . ([mix]{1,3})? . '-'? . ([mix]{1,3})?;
80
+
81
+ group_ref = [gk];
82
+ group_name = alpha . (alnum+)?;
83
+ group_number = '-'? . [1-9] . ([0-9]+)?;
84
+ group_level = [+\-] . [0-9]+;
85
+
86
+ group_named = ('?<' . group_name . '>') | ("?'" . group_name . "'");
87
+
88
+ group_name_ref = group_ref . (('<' . group_name . group_level? '>') |
89
+ ("'" . group_name . group_level? "'"));
90
+
91
+ group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
92
+ ("'" . group_number . group_level? "'"));
93
+
94
+ group_type = group_atomic | group_passive | group_named;
95
+
96
+ assertion_type = assertion_lookahead | assertion_nlookahead |
97
+ assertion_lookbehind | assertion_nlookbehind;
98
+
99
+ # characters that 'break' a literal
100
+ meta_char = dot | backslash | alternation |
101
+ curlies | parantheses | brackets |
102
+ line_anchor | quantifier_greedy;
103
+
104
+ ascii_print = ((0x20..0x7e) - meta_char)+;
105
+ ascii_nonprint = (0x01..0x1f | 0x7f)+;
106
+
107
+ utf8_2_byte = (0xc2..0xdf 0x80..0xbf)+;
108
+ utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf)+;
109
+ utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf)+;
110
+ utf8_byte_sequence = utf8_2_byte | utf8_3_byte | utf8_4_byte;
111
+
112
+ non_literal_escape = char_type | anchor_char | escaped_ascii |
113
+ group_ref | [xucCM];
114
+
115
+ # EOF error, used where it can be detected
116
+ action premature_end_error { raise PrematureEndError }
117
+
118
+ # group (nesting) and set open/close actions
119
+ action group_opened { group_depth += 1; in_group = true }
120
+ action group_closed { group_depth -= 1; in_group = group_depth > 0 ? true : false }
121
+
122
+ # Character set scanner, continues consuming characters until it meets the
123
+ # closing bracket of the set.
124
+ # --------------------------------------------------------------------------
125
+ character_set := |*
126
+ ']' {
127
+ set_type = set_depth > 1 ? :subset : :set
128
+ set_depth -= 1; in_set = set_depth > 0 ? true : false
129
+
130
+ self.emit(set_type, :close, data[ts..te-1].pack('c*'), ts, te)
131
+
132
+ if set_depth == 0
133
+ fgoto main;
134
+ else
135
+ fret;
136
+ end
137
+ };
138
+
139
# Special case: a dash directly before the closing bracket is matched as one
# two-character pattern but emitted as two tokens, a '-' member followed by
# the set close. Each token reports its own one-character span, so that the
# length of the text always equals te - ts like for every other token.
'-]' {
  set_type = set_depth > 1 ? :subset : :set
  set_depth -= 1; in_set = set_depth > 0 ? true : false

  # the dash spans ts up to te-1, the bracket spans ts+1 up to te
  self.emit(set_type, :member, data[ts..te-2].pack('c*'), ts, te-1)
  self.emit(set_type, :close, data[ts+1..te-1].pack('c*'), ts+1, te)

  if set_depth == 0
    fgoto main;
  else
    fret;
  end
};
152
+
153
+ '^' {
154
+ text = data[ts..te-1].pack('c*')
155
+ if @tokens.last[1] == :open
156
+ self.emit(set_type, :negate, text, ts, te)
157
+ else
158
+ self.emit(set_type, :member, text, ts, te)
159
+ end
160
+ };
161
+
162
+ alnum . '-' . alnum {
163
+ self.emit(set_type, :range, data[ts..te-1].pack('c*'), ts, te)
164
+ };
165
+
166
+ '&&' {
167
+ self.emit(set_type, :intersection, data[ts..te-1].pack('c*'), ts, te)
168
+ };
169
+
170
+ '\\' {
171
+ fcall set_escape_sequence;
172
+ };
173
+
174
+ '[' >(open_bracket, 1) {
175
+ set_depth += 1; in_set = true
176
+ set_type = set_depth > 1 ? :subset : :set
177
+
178
+ self.emit(set_type, :open, data[ts..te-1].pack('c*'), ts, te)
179
+ fcall character_set;
180
+ };
181
+
182
+ class_posix >(open_bracket, 1) @eof(premature_end_error) {
183
+ text = data[ts..te-1].pack('c*')
184
+
185
+ class_name = text[2..-3]
186
+ if class_name[0].chr == '^'
187
+ class_name = "non#{class_name[1..-1]}"
188
+ end
189
+
190
+ token_sym = "class_#{class_name}".to_sym
191
+ self.emit(set_type, token_sym, text, ts, te)
192
+ };
193
+
194
+ collating_sequence >(open_bracket, 1) @eof(premature_end_error) {
195
+ self.emit(set_type, :collation, data[ts..te-1].pack('c*'), ts, te)
196
+ };
197
+
198
+ character_equivalent >(open_bracket, 1) @eof(premature_end_error) {
199
+ self.emit(set_type, :equivalent, data[ts..te-1].pack('c*'), ts, te)
200
+ };
201
+
202
+ # exclude the closing bracket as a cleaner workaround for dealing with the
203
+ # ambiguity caused upon exit from the unicode properties machine
204
+ meta_char -- ']' {
205
+ self.emit(set_type, :member, data[ts..te-1].pack('c*'), ts, te)
206
+ };
207
+
208
+ any |
209
+ ascii_nonprint |
210
+ utf8_2_byte |
211
+ utf8_3_byte |
212
+ utf8_4_byte {
213
+ self.emit(set_type, :member, data[ts..te-1].pack('c*'), ts, te)
214
+ };
215
+ *|;
216
+
217
+ # set escapes scanner
218
+ # --------------------------------------------------------------------------
219
+ set_escape_sequence := |*
220
+ 'b' {
221
+ self.emit(set_type, :backspace, data[ts-1..te-1].pack('c*'), ts-1, te)
222
+ fret;
223
+ };
224
+
225
+ char_type {
226
+ case text = data[ts-1..te-1].pack('c*')
227
+ when '\d'; self.emit(set_type, :type_digit, text, ts-1, te)
228
+ when '\D'; self.emit(set_type, :type_nondigit, text, ts-1, te)
229
+ when '\h'; self.emit(set_type, :type_hex, text, ts-1, te)
230
+ when '\H'; self.emit(set_type, :type_nonhex, text, ts-1, te)
231
+ when '\s'; self.emit(set_type, :type_space, text, ts-1, te)
232
+ when '\S'; self.emit(set_type, :type_nonspace, text, ts-1, te)
233
+ when '\w'; self.emit(set_type, :type_word, text, ts-1, te)
234
+ when '\W'; self.emit(set_type, :type_nonword, text, ts-1, te)
235
+ end
236
+ fret;
237
+ };
238
+
239
+ hex_sequence . '-\\' . hex_sequence {
240
+ self.emit(set_type, :range_hex, data[ts-1..te-1].pack('c*'), ts-1, te)
241
+ fret;
242
+ };
243
+
244
+ hex_sequence {
245
+ self.emit(set_type, :member_hex, data[ts-1..te-1].pack('c*'), ts-1, te)
246
+ fret;
247
+ };
248
+
249
+ meta_char | [\\\]\-\,] {
250
+ self.emit(set_type, :escape, data[ts-1..te-1].pack('c*'), ts-1, te)
251
+ fret;
252
+ };
253
+
254
+ property_char > (escaped_set_alpha, 2) {
255
+ fhold;
256
+ fnext character_set;
257
+ fcall unicode_property;
258
+ fret;
259
+ };
260
+
261
+ # special case exclusion of escaped dash, could be cleaner.
262
+ (ascii_print - char_type -- [\-}]) > (escaped_set_alpha, 1) |
263
+ ascii_nonprint |
264
+ utf8_2_byte |
265
+ utf8_3_byte |
266
+ utf8_4_byte {
267
+ self.emit(set_type, :escape, data[ts-1..te-1].pack('c*'), ts-1, te)
268
+ fret;
269
+ };
270
+ *|;
271
+
272
+
273
+ # escape sequence scanner
274
+ # --------------------------------------------------------------------------
275
+ escape_sequence := |*
276
+ [1-9] {
277
+ text = data[ts-1..te-1].pack('c*')
278
+ self.emit(:backref, :number, text, ts-1, te)
279
+ fret;
280
+ };
281
+
282
+ octal_sequence {
283
+ self.emit(:escape, :octal, data[ts-1..te-1].pack('c*'), ts-1, te)
284
+ fret;
285
+ };
286
+
287
+ meta_char {
288
+ case text = data[ts-1..te-1].pack('c*')
289
+ when '\.'; self.emit(:escape, :dot, text, ts-1, te)
290
+ when '\|'; self.emit(:escape, :alternation, text, ts-1, te)
291
+ when '\^'; self.emit(:escape, :beginning_of_line, text, ts-1, te)
292
+ when '\$'; self.emit(:escape, :end_of_line, text, ts-1, te)
293
+ when '\?'; self.emit(:escape, :zero_or_one, text, ts-1, te)
294
+ when '\*'; self.emit(:escape, :zero_or_more, text, ts-1, te)
295
+ when '\+'; self.emit(:escape, :one_or_more, text, ts-1, te)
296
+ when '\('; self.emit(:escape, :group_open, text, ts-1, te)
297
+ when '\)'; self.emit(:escape, :group_close, text, ts-1, te)
298
+ when '\{'; self.emit(:escape, :interval_open, text, ts-1, te)
299
+ when '\}'; self.emit(:escape, :interval_close, text, ts-1, te)
300
+ when '\['; self.emit(:escape, :set_open, text, ts-1, te)
301
+ when '\]'; self.emit(:escape, :set_close, text, ts-1, te)
302
+ when "\\\\";
303
+ self.emit(:escape, :backslash, text, ts-1, te)
304
+ end
305
+ fret;
306
+ };
307
+
308
+ escaped_ascii > (escaped_alpha, 7) {
309
+ # \b is emitted as backspace only when inside a character set, otherwise
310
+ # it is a word boundary anchor. A syntax might "normalize" it if needed.
311
+ case text = data[ts-1..te-1].pack('c*')
312
+ when '\a'; self.emit(:escape, :bell, text, ts-1, te)
313
+ when '\e'; self.emit(:escape, :escape, text, ts-1, te)
314
+ when '\f'; self.emit(:escape, :form_feed, text, ts-1, te)
315
+ when '\n'; self.emit(:escape, :newline, text, ts-1, te)
316
+ when '\r'; self.emit(:escape, :carriage, text, ts-1, te)
317
+ when '\s'; self.emit(:escape, :space, text, ts-1, te)
318
+ when '\t'; self.emit(:escape, :tab, text, ts-1, te)
319
+ when '\v'; self.emit(:escape, :vertical_tab, text, ts-1, te)
320
+ end
321
+ fret;
322
+ };
323
+
324
+ codepoint_sequence > (escaped_alpha, 6) {
325
+ text = data[ts-1..te-1].pack('c*')
326
+ if text[2].chr == '{'
327
+ self.emit(:escape, :codepoint_list, text, ts-1, te)
328
+ else
329
+ self.emit(:escape, :codepoint, text, ts-1, te)
330
+ end
331
+ fret;
332
+ };
333
+
334
+ hex_sequence > (escaped_alpha, 5) {
335
+ self.emit(:escape, :hex, data[ts-1..te-1].pack('c*'), ts-1, te)
336
+ fret;
337
+ };
338
+
339
+ wide_hex_sequence > (escaped_alpha, 5) {
340
+ self.emit(:escape, :hex_wide, data[ts-1..te-1].pack('c*'), ts-1, te)
341
+ fret;
342
+ };
343
+
344
+ control_sequence > (escaped_alpha, 4) {
345
+ self.emit(:escape, :control, data[ts-1..te-1].pack('c*'), ts-1, te)
346
+ fret;
347
+ };
348
+
349
# Meta escapes: M- followed by a letter or by a backslashed control sequence.
# The leading backslash was consumed by the caller, hence ts-1. Like every
# other rule in this machine it must return to the calling scanner once the
# token is emitted, otherwise the input that follows is scanned as escapes.
meta_sequence > (backslashed, 3) {
  self.emit(:escape, :meta_sequence, data[ts-1..te-1].pack('c*'), ts-1, te)
  fret;
};
352
+
353
+ property_char > (escaped_alpha, 2) {
354
+ fhold;
355
+ fnext main;
356
+ fcall unicode_property; fret;
357
+ };
358
+
359
+ (any -- non_literal_escape) > (escaped_alpha, 1) {
360
+ self.emit(:escape, :literal, data[ts-1..te-1].pack('c*'), ts-1, te)
361
+ fret;
362
+ };
363
+ *|;
364
+
365
+
366
+ # Main scanner
367
+ # --------------------------------------------------------------------------
368
+ main := |*
369
+
370
+ # Meta characters
371
+ # ------------------------------------------------------------------------
372
+ dot {
373
+ self.emit(:meta, :dot, data[ts..te-1].pack('c*'), ts, te)
374
+ };
375
+
376
+ alternation {
377
+ self.emit(:meta, :alternation, data[ts..te-1].pack('c*'), ts, te)
378
+ };
379
+
380
+ # Anchors
381
+ # ------------------------------------------------------------------------
382
+ beginning_of_line {
383
+ self.emit(:anchor, :beginning_of_line, data[ts..te-1].pack('c*'), ts, te)
384
+ };
385
+
386
+ end_of_line {
387
+ self.emit(:anchor, :end_of_line, data[ts..te-1].pack('c*'), ts, te)
388
+ };
389
+
390
+ backslash . anchor_char > (backslashed, 3) {
391
+ case text = data[ts..te-1].pack('c*')
392
+ when '\\A'; self.emit(:anchor, :bos, text, ts, te)
393
+ when '\\z'; self.emit(:anchor, :eos, text, ts, te)
394
+ when '\\Z'; self.emit(:anchor, :eos_ob_eol, text, ts, te)
395
+ when '\\b'; self.emit(:anchor, :word_boundary, text, ts, te)
396
+ when '\\B'; self.emit(:anchor, :nonword_boundary, text, ts, te)
397
+ when '\\G'; self.emit(:anchor, :match_start, text, ts, te)
398
+ else raise ScannerError.new("Unsupported anchor at #{text} (char #{ts})")
399
+ end
400
+ };
401
+
402
+ # Character types
403
+ # \d, \D digit, non-digit
404
+ # \h, \H hex, non-hex
405
+ # \s, \S space, non-space
406
+ # \w, \W word, non-word
407
+ # ------------------------------------------------------------------------
408
+ backslash . char_type > (backslashed, 2) {
409
+ case text = data[ts..te-1].pack('c*')
410
+ when '\\d'; self.emit(:type, :digit, text, ts, te)
411
+ when '\\D'; self.emit(:type, :nondigit, text, ts, te)
412
+ when '\\h'; self.emit(:type, :hex, text, ts, te)
413
+ when '\\H'; self.emit(:type, :nonhex, text, ts, te)
414
+ when '\\s'; self.emit(:type, :space, text, ts, te)
415
+ when '\\S'; self.emit(:type, :nonspace, text, ts, te)
416
+ when '\\w'; self.emit(:type, :word, text, ts, te)
417
+ when '\\W'; self.emit(:type, :nonword, text, ts, te)
418
+ end
419
+ };
420
+
421
+
422
+ # Character sets
423
+ # ------------------------------------------------------------------------
424
+ set_open {
425
+ set_depth += 1; in_set = true
426
+ set_type = set_depth > 1 ? :subset : :set
427
+
428
+ self.emit(set_type, :open, data[ts..te-1].pack('c*'), ts, te)
429
+ fcall character_set;
430
+ };
431
+
432
+ # (?#...) comments: parsed as a single expression, without introducing a
433
+ # new nesting level. Comments may not include parentheses, escaped or not.
434
+ # special case for close, action performed on all transitions to get the
435
+ # correct closing count.
436
+ # ------------------------------------------------------------------------
437
+ group_open . group_comment $group_closed {
438
+ self.emit(:group, :comment, data[ts..te-1].pack('c*'), ts, te)
439
+ };
440
+
441
+ # Expression options:
442
+ # (?imx-imx) option on/off
443
+ # i: ignore case
444
+ # m: multi-line (dot(.) match newline)
445
+ # x: extended form
446
+ #
447
+ # (?imx-imx:subexp) option on/off for subexp
448
+ # ------------------------------------------------------------------------
449
+ group_open . group_options >group_opened {
450
+ # special handling to resolve ambiguity with passive groups
451
+ if data[te]
452
+ c = data[te].chr
453
+ if c == ':' # include the ':'
454
+ self.emit(:group, :options, data[ts..te].pack('c*'), ts, te+1)
455
+ p += 1
456
+ elsif c == ')' # just options by themselves
457
+ self.emit(:group, :options, data[ts..te-1].pack('c*'), ts, te)
458
+ else
459
+ raise ScannerError.new(
460
+ "Unexpected '#{c}' in options sequence, ':' or ')' expected")
461
+ end
462
+ else
463
+ raise PrematureEndError.new("options") unless data[te]
464
+ end
465
+ };
466
+
467
+ # Assertions
468
+ # (?=subexp) look-ahead
469
+ # (?!subexp) negative look-ahead
470
+ # (?<=subexp) look-behind
471
+ # (?<!subexp) negative look-behind
472
+ # ------------------------------------------------------------------------
473
+ group_open . assertion_type >group_opened {
474
+ case text = data[ts..te-1].pack('c*')
475
+ when '(?='; self.emit(:assertion, :lookahead, text, ts, te)
476
+ when '(?!'; self.emit(:assertion, :nlookahead, text, ts, te)
477
+ when '(?<='; self.emit(:assertion, :lookbehind, text, ts, te)
478
+ when '(?<!'; self.emit(:assertion, :nlookbehind, text, ts, te)
479
+ end
480
+ };
481
+
482
+ # Groups
483
+ # (?:subexp) passive (non-captured) group
484
+ # (?>subexp) atomic group, don't backtrack in subexp.
485
+ # (?<name>subexp) named group
486
+ # (?'name'subexp) named group (single quoted version)
487
+ # (subexp) captured group
488
+ # ------------------------------------------------------------------------
489
+ group_open . group_type >group_opened {
490
+ case text = data[ts..te-1].pack('c*')
491
+ when '(?:'; self.emit(:group, :passive, text, ts, te)
492
+ when '(?>'; self.emit(:group, :atomic, text, ts, te)
493
+
494
+ when /\(\?<\w+>/
495
+ self.emit(:group, :named_ab, text, ts, te)
496
+ when /\(\?'\w+'/
497
+ self.emit(:group, :named_sq, text, ts, te)
498
+ end
499
+ };
500
+
501
+ group_open @group_opened {
502
+ text = data[ts..te-1].pack('c*')
503
+ self.emit(:group, :capture, text, ts, te)
504
+ };
505
+
506
+ group_close @group_closed {
507
+ self.emit(:group, :close, data[ts..te-1].pack('c*'), ts, te)
508
+ };
509
+
510
+
511
+ # Group back-reference, named and numbered
512
+ # ------------------------------------------------------------------------
513
+ backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
514
+ case text = data[ts..te-1].pack('c*')
515
+ when /\\([gk])<[^\d-](\w+)?>/ # angle-brackets
516
+ if $1 == 'k'
517
+ self.emit(:backref, :name_ref_ab, text, ts, te)
518
+ else
519
+ self.emit(:backref, :name_call_ab, text, ts, te)
520
+ end
521
+
522
+ when /\\([gk])'[^\d-](\w+)?'/ #single quotes
523
+ if $1 == 'k'
524
+ self.emit(:backref, :name_ref_sq, text, ts, te)
525
+ else
526
+ self.emit(:backref, :name_call_sq, text, ts, te)
527
+ end
528
+
529
+ when /\\([gk])<\d+>/ # angle-brackets
530
+ if $1 == 'k'
531
+ self.emit(:backref, :number_ref_ab, text, ts, te)
532
+ else
533
+ self.emit(:backref, :number_call_ab, text, ts, te)
534
+ end
535
+
536
+ when /\\([gk])'\d+'/ # single quotes
537
+ if $1 == 'k'
538
+ self.emit(:backref, :number_ref_sq, text, ts, te)
539
+ else
540
+ self.emit(:backref, :number_call_sq, text, ts, te)
541
+ end
542
+
543
+ when /\\([gk])<-\d+>/ # angle-brackets
544
+ if $1 == 'k'
545
+ self.emit(:backref, :number_rel_ref_ab, text, ts, te)
546
+ else
547
+ self.emit(:backref, :number_rel_call_ab, text, ts, te)
548
+ end
549
+
550
+ when /\\([gk])'-\d+'/ # single quotes
551
+ if $1 == 'k'
552
+ self.emit(:backref, :number_rel_ref_sq, text, ts, te)
553
+ else
554
+ self.emit(:backref, :number_rel_call_sq, text, ts, te)
555
+ end
556
+
557
+ when /\\k<[^\d-](\w+)?[+\-]\d+>/ # angle-brackets
558
+ self.emit(:backref, :name_nest_ref_ab, text, ts, te)
559
+
560
+ when /\\k'[^\d-](\w+)?[+\-]\d+'/ # single-quotes
561
+ self.emit(:backref, :name_nest_ref_sq, text, ts, te)
562
+
563
+ when /\\([gk])<\d+[+\-]\d+>/ # angle-brackets
564
+ self.emit(:backref, :number_nest_ref_ab, text, ts, te)
565
+
566
+ when /\\([gk])'\d+[+\-]\d+'/ # single-quotes
567
+ self.emit(:backref, :number_nest_ref_sq, text, ts, te)
568
+
569
+ end
570
+ };
571
+
572
+
573
+ # Quantifiers
574
+ # ------------------------------------------------------------------------
575
+ zero_or_one {
576
+ case text = data[ts..te-1].pack('c*')
577
+ when '?' ; self.emit(:quantifier, :zero_or_one, text, ts, te)
578
+ when '??'; self.emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
579
+ when '?+'; self.emit(:quantifier, :zero_or_one_possessive, text, ts, te)
580
+ end
581
+ };
582
+
583
+ zero_or_more {
584
+ case text = data[ts..te-1].pack('c*')
585
+ when '*' ; self.emit(:quantifier, :zero_or_more, text, ts, te)
586
+ when '*?'; self.emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
587
+ when '*+'; self.emit(:quantifier, :zero_or_more_possessive, text, ts, te)
588
+ end
589
+ };
590
+
591
+ one_or_more {
592
+ case text = data[ts..te-1].pack('c*')
593
+ when '+' ; self.emit(:quantifier, :one_or_more, text, ts, te)
594
+ when '+?'; self.emit(:quantifier, :one_or_more_reluctant, text, ts, te)
595
+ when '++'; self.emit(:quantifier, :one_or_more_possessive, text, ts, te)
596
+ end
597
+ };
598
+
599
+ quantifier_range @err(premature_end_error) {
600
+ self.emit(:quantifier, :interval, data[ts..te-1].pack('c*'), ts, te)
601
+ };
602
+
603
+ # Escaped sequences
604
+ # ------------------------------------------------------------------------
605
+ backslash > (backslashed, 1) {
606
+ fcall escape_sequence;
607
+ };
608
+
609
+ # Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
610
+ # except meta characters.
611
+ # ------------------------------------------------------------------------
612
+ ascii_print+ |
613
+ ascii_nonprint+ |
614
+ utf8_2_byte+ |
615
+ utf8_3_byte+ |
616
+ utf8_4_byte+ {
617
+ self.append_literal(data, ts, te)
618
+ };
619
+
620
+ *|;
621
+ }%%
622
+
623
+
624
+ module Regexp::Scanner
625
+ %% write data;
626
+
627
# Base class for the scanner's own error types.
class ScannerError < StandardError
  def initialize(what)
    super(what)
  end
end

# Signals that the pattern ended too early. The optional +where+ text is
# appended to the fixed message prefix.
class PrematureEndError < ScannerError
  def initialize(where = '')
    msg = "Premature end of pattern: #{where}"
    super(msg)
  end
end

# Error carrying an "unknown unicode character property name" message for
# the given +name+.
class UnknownUnicodePropertyError < ScannerError
  def initialize(name)
    msg = "Unknown unicode character property name #{name}"
    super(msg)
  end
end
644
+
645
+
646
+ # Scans the given regular expression text, or Regexp object and collects the
647
+ # emitted token into an array that gets returned at the end. If a block is
648
+ # given, it gets called for each emitted token.
649
+ #
650
+ # This method may raise errors if a syntax error is encountered.
651
+ # --------------------------------------------------------------------------
652
+ def self.scan(input, &block)
653
+ top, stack = 0, []
654
+
655
+ input = input.source if input.is_a?(Regexp)
656
+ data = input.unpack("c*") if input.is_a?(String)
657
+ eof = data.length
658
+
659
+ @tokens = []
660
+ @block = block_given? ? block : nil
661
+
662
+ in_group, group_depth = false, 0
663
+ in_set, set_depth, set_type = false, 0, :set
664
+
665
+ %% write init;
666
+ %% write exec;
667
+
668
+ raise PrematureEndError.new("(missing group closing paranthesis) "+
669
+ "[#{in_group}:#{group_depth}]") if in_group
670
+ raise PrematureEndError.new("(missing set closing bracket) "+
671
+ "[#{in_set}:#{set_depth}]") if in_set
672
+
673
+ # when the entire expression is a literal run
674
+ self.emit_literal if @literal
675
+
676
+ @tokens
677
+ end
678
+
679
# Buffers one piece of literal text, to be emitted later by a call to
# emit_literal. The bytes data[ts..te-1] are packed into a string and stored
# together with their start and end offsets. The content may be a mix of
# ASCII and UTF-8. Returns the literal buffer.
def self.append_literal(data, ts, te)
  chunk = data[ts..te - 1].pack('c*')
  (@literal ||= []).push([chunk, ts, te])
end
685
+
686
# Emits the literal run gathered by one or more calls to append_literal as a
# single :literal token, then clears the buffer. The token starts at the
# first chunk's start offset and ends at the last chunk's end offset.
def self.emit_literal
  first_chunk = @literal.first
  last_chunk = @literal.last

  text = @literal.map { |chunk| chunk[0] }.join

  # mark the joined bytes as UTF-8 where strings carry an encoding
  text.force_encoding('utf-8') if text.respond_to?(:force_encoding)

  self.emit(:literal, :literal, text, first_chunk[1], last_chunk[2])
  @literal = nil
end
697
+
698
# Records one token. A pending literal run is flushed first, unless the token
# being emitted is itself the literal. The token is passed to the block given
# to scan, if any, and appended to the token list, which is returned.
def self.emit(type, token, text, ts, te)
  #puts " > emit: #{type}:#{token} '#{text}' [#{ts}..#{te}]"

  self.emit_literal if @literal && type != :literal

  @block.call(type, token, text, ts, te) if @block

  @tokens << [type, token, text, ts, te]
end
711
+
712
+ end # module Regexp::Scanner