regexp_parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. data/ChangeLog +4 -0
  2. data/LICENSE +22 -0
  3. data/README.rdoc +307 -0
  4. data/Rakefile +91 -0
  5. data/lib/regexp_parser/ctype.rb +48 -0
  6. data/lib/regexp_parser/expression/property.rb +108 -0
  7. data/lib/regexp_parser/expression/set.rb +59 -0
  8. data/lib/regexp_parser/expression.rb +287 -0
  9. data/lib/regexp_parser/lexer.rb +105 -0
  10. data/lib/regexp_parser/parser.rb +417 -0
  11. data/lib/regexp_parser/scanner/property.rl +534 -0
  12. data/lib/regexp_parser/scanner/scanner.rl +712 -0
  13. data/lib/regexp_parser/scanner.rb +3325 -0
  14. data/lib/regexp_parser/syntax/ruby/1.8.6.rb +14 -0
  15. data/lib/regexp_parser/syntax/ruby/1.8.7.rb +14 -0
  16. data/lib/regexp_parser/syntax/ruby/1.8.rb +39 -0
  17. data/lib/regexp_parser/syntax/ruby/1.9.1.rb +39 -0
  18. data/lib/regexp_parser/syntax/ruby/1.9.2.rb +10 -0
  19. data/lib/regexp_parser/syntax/ruby/1.9.3.rb +24 -0
  20. data/lib/regexp_parser/syntax/ruby/1.9.rb +8 -0
  21. data/lib/regexp_parser/syntax/tokens.rb +332 -0
  22. data/lib/regexp_parser/syntax.rb +172 -0
  23. data/lib/regexp_parser.rb +45 -0
  24. data/test/helpers.rb +8 -0
  25. data/test/lexer/test_all.rb +26 -0
  26. data/test/lexer/test_literals.rb +120 -0
  27. data/test/lexer/test_nesting.rb +107 -0
  28. data/test/lexer/test_refcalls.rb +45 -0
  29. data/test/parser/test_all.rb +44 -0
  30. data/test/parser/test_alternation.rb +46 -0
  31. data/test/parser/test_anchors.rb +35 -0
  32. data/test/parser/test_errors.rb +59 -0
  33. data/test/parser/test_escapes.rb +48 -0
  34. data/test/parser/test_expression.rb +51 -0
  35. data/test/parser/test_groups.rb +69 -0
  36. data/test/parser/test_properties.rb +346 -0
  37. data/test/parser/test_quantifiers.rb +236 -0
  38. data/test/parser/test_refcalls.rb +101 -0
  39. data/test/parser/test_sets.rb +99 -0
  40. data/test/scanner/test_all.rb +30 -0
  41. data/test/scanner/test_anchors.rb +35 -0
  42. data/test/scanner/test_errors.rb +36 -0
  43. data/test/scanner/test_escapes.rb +49 -0
  44. data/test/scanner/test_groups.rb +41 -0
  45. data/test/scanner/test_literals.rb +85 -0
  46. data/test/scanner/test_meta.rb +36 -0
  47. data/test/scanner/test_properties.rb +315 -0
  48. data/test/scanner/test_quantifiers.rb +38 -0
  49. data/test/scanner/test_refcalls.rb +45 -0
  50. data/test/scanner/test_scripts.rb +314 -0
  51. data/test/scanner/test_sets.rb +80 -0
  52. data/test/scanner/test_types.rb +30 -0
  53. data/test/syntax/ruby/test_1.8.rb +57 -0
  54. data/test/syntax/ruby/test_1.9.1.rb +39 -0
  55. data/test/syntax/ruby/test_1.9.3.rb +38 -0
  56. data/test/syntax/ruby/test_all.rb +12 -0
  57. data/test/syntax/test_all.rb +19 -0
  58. data/test/test_all.rb +4 -0
  59. metadata +160 -0
@@ -0,0 +1,712 @@
1
%%{
machine re_scanner;
include re_property "property.rl";

# Named building blocks shared by the scanners of this machine. Every
# definition is a plain Ragel expression; nothing here emits tokens.

# single characters with a special meaning
dot                   = '.';
backslash             = '\\';
alternation           = '|';
beginning_of_line     = '^';
end_of_line           = '$';

range_open            = '{';
range_close           = '}';
curlies               = range_open | range_close;

group_open            = '(';
group_close           = ')';
parantheses           = group_open | group_close;

set_open              = '[';
set_close             = ']';
brackets              = set_open | set_close;

# POSIX bracket classes, optionally negated: [:alpha:], [:^alpha:]
class_name_posix      = 'alnum' | 'alpha' | 'blank' |
                        'cntrl' | 'digit' | 'graph' |
                        'lower' | 'print' | 'punct' |
                        'space' | 'upper' | 'xdigit' |
                        'word'  | 'ascii';

class_posix           = ('[:' . '^'? . class_name_posix . ':]');

# these are not supported in ruby, and need verification
collating_sequence    = '[.' . (alpha | [\-])+ . '.]';
character_equivalent  = '[=' . alpha . '=]';

# the letter that follows the backslash of a character type or an anchor
char_type             = [dDhHsSwW];

line_anchor           = beginning_of_line | end_of_line;
anchor_char           = [AbBzZG];

# escapes; all of these are matched after the backslash has been consumed
escaped_ascii         = [abefnrstv];
octal_sequence        = [0-7]{1,3};

hex_sequence          = 'x' . xdigit{1,2};
wide_hex_sequence     = 'x' . '{' . xdigit{1,8} . '}';

codepoint_single      = 'u' . xdigit{4};
codepoint_list        = 'u{' . (xdigit{4} . space?)+'}';
codepoint_sequence    = codepoint_single | codepoint_list;

control_sequence      = ('c' | 'C-') . alpha;
meta_sequence         = 'M-' . ((backslash . control_sequence) | alpha);

# quantifiers, grouped by symbol and by mode
zero_or_one           = '?' | '??' | '?+';
zero_or_more          = '*' | '*?' | '*+';
one_or_more           = '+' | '+?' | '++';

quantifier_greedy     = '?' | '*' | '+';
quantifier_reluctant  = '??' | '*?' | '+?';
quantifier_possessive = '?+' | '*+' | '++';
quantifier_mode       = '?' | '+';

quantifier_range      = range_open . (digit+)? . ','? . (digit+)? .
                        range_close . quantifier_mode?;

quantifiers           = quantifier_greedy | quantifier_reluctant |
                        quantifier_possessive | quantifier_range;


# groups and assertions; these follow the opening paranthesis
group_comment         = '?#' . [^)]+ . group_close;

group_atomic          = '?>';
group_passive         = '?:';

assertion_lookahead   = '?=';
assertion_nlookahead  = '?!';
assertion_lookbehind  = '?<=';
assertion_nlookbehind = '?<!';

group_options         = '?' . ([mix]{1,3})? . '-'? . ([mix]{1,3})?;

group_ref             = [gk];
# A name starts with a letter or an underscore and continues with letters,
# digits or underscores. Underscores are accepted here so that the machine
# agrees with the \w based patterns used by the main scanner to classify
# named groups and named references.
group_name            = (alpha | '_') . (alnum | '_')*;
group_number          = '-'? . [1-9] . ([0-9]+)?;
group_level           = [+\-] . [0-9]+;

group_named           = ('?<' . group_name . '>') | ("?'" . group_name . "'");

group_name_ref        = group_ref . (('<' . group_name . group_level? '>') |
                        ("'" . group_name . group_level? "'"));

group_number_ref      = group_ref . (('<' . group_number . group_level? '>') |
                        ("'" . group_number . group_level? "'"));

group_type            = group_atomic | group_passive | group_named;

assertion_type        = assertion_lookahead | assertion_nlookahead |
                        assertion_lookbehind | assertion_nlookbehind;

# characters that 'break' a literal
meta_char             = dot | backslash | alternation |
                        curlies | parantheses | brackets |
                        line_anchor | quantifier_greedy;

# literal runs: printable ASCII minus the meta characters, control
# characters, and multi-byte UTF-8 sequences
ascii_print           = ((0x20..0x7e) - meta_char)+;
ascii_nonprint        = (0x01..0x1f | 0x7f)+;

utf8_2_byte           = (0xc2..0xdf 0x80..0xbf)+;
utf8_3_byte           = (0xe0..0xef 0x80..0xbf 0x80..0xbf)+;
utf8_4_byte           = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf)+;
utf8_byte_sequence    = utf8_2_byte | utf8_3_byte | utf8_4_byte;

# letters that start a non-literal escape when they follow a backslash
non_literal_escape    = char_type | anchor_char | escaped_ascii |
                        group_ref | [xucCM];

# EOF error, used where it can be detected
action premature_end_error { raise PrematureEndError }

# group (nesting) and set open/close actions
action group_opened { group_depth += 1; in_group = true }
action group_closed { group_depth -= 1; in_group = group_depth > 0 }
121
+
122
# Character set scanner, continues consuming characters until it meets the
# closing bracket of the set.
#
# Entered with fcall from the main scanner, and from itself for nested sets.
# Tokens are emitted with the current set_type, which is :set at depth one
# and :subset for anything nested deeper.
#
# NOTE(review): set_type is only recomputed when a set is opened or closed,
# so after a nested set closes the remaining members of the enclosing set
# appear to be emitted as :subset. Confirm whether consumers rely on this.
# --------------------------------------------------------------------------
character_set := |*
  ']' {
    set_type  = set_depth > 1 ? :subset : :set
    set_depth -= 1; in_set = set_depth > 0

    self.emit(set_type, :close, data[ts..te-1].pack('c*'), ts, te)

    if set_depth == 0
      fgoto main;
    else
      fret;
    end
  };

  '-]' { # special case, emits two tokens
    set_type  = set_depth > 1 ? :subset : :set
    set_depth -= 1; in_set = set_depth > 0

    # each token is reported with the span of its own character, so that
    # the offsets agree with the text that is emitted
    self.emit(set_type, :member, data[ts..te-2].pack('c*'),   ts,   te-1)
    self.emit(set_type, :close,  data[ts+1..te-1].pack('c*'), ts+1, te)

    if set_depth == 0
      fgoto main;
    else
      fret;
    end
  };

  '^' {
    # a caret negates the set only when it directly follows the opening
    # bracket, anywhere else it is an ordinary member
    text = data[ts..te-1].pack('c*')
    if @tokens.last[1] == :open
      self.emit(set_type, :negate, text, ts, te)
    else
      self.emit(set_type, :member, text, ts, te)
    end
  };

  alnum . '-' . alnum {
    self.emit(set_type, :range, data[ts..te-1].pack('c*'), ts, te)
  };

  '&&' {
    self.emit(set_type, :intersection, data[ts..te-1].pack('c*'), ts, te)
  };

  '\\' {
    fcall set_escape_sequence;
  };

  '[' >(open_bracket, 1) {
    set_depth += 1; in_set = true
    set_type  = set_depth > 1 ? :subset : :set

    self.emit(set_type, :open, data[ts..te-1].pack('c*'), ts, te)
    fcall character_set;
  };

  class_posix >(open_bracket, 1) @eof(premature_end_error) {
    text = data[ts..te-1].pack('c*')

    # strip the surrounding [: and :], a leading caret turns the class
    # name into its non prefixed counterpart
    class_name = text[2..-3]
    if class_name[0].chr == '^'
      class_name = "non#{class_name[1..-1]}"
    end

    token_sym = "class_#{class_name}".to_sym
    self.emit(set_type, token_sym, text, ts, te)
  };

  collating_sequence >(open_bracket, 1) @eof(premature_end_error) {
    self.emit(set_type, :collation, data[ts..te-1].pack('c*'), ts, te)
  };

  character_equivalent >(open_bracket, 1) @eof(premature_end_error) {
    self.emit(set_type, :equivalent, data[ts..te-1].pack('c*'), ts, te)
  };

  # exclude the closing bracket as a cleaner workaround for dealing with the
  # ambiguity caused upon exit from the unicode properties machine
  meta_char -- ']' {
    self.emit(set_type, :member, data[ts..te-1].pack('c*'), ts, te)
  };

  any            |
  ascii_nonprint |
  utf8_2_byte    |
  utf8_3_byte    |
  utf8_4_byte    {
    self.emit(set_type, :member, data[ts..te-1].pack('c*'), ts, te)
  };
*|;
216
+
217
# set escapes scanner
#
# Entered with fcall from the character set scanner once the backslash has
# been consumed, which is why every token starts at ts-1. Each rule returns
# to the caller with fret after emitting.
# --------------------------------------------------------------------------
set_escape_sequence := |*
  'b' {
    text = data[ts-1..te-1].pack('c*')
    self.emit(set_type, :backspace, text, ts-1, te)
    fret;
  };

  char_type {
    text  = data[ts-1..te-1].pack('c*')
    token = {
      '\d' => :type_digit, '\D' => :type_nondigit,
      '\h' => :type_hex,   '\H' => :type_nonhex,
      '\s' => :type_space, '\S' => :type_nonspace,
      '\w' => :type_word,  '\W' => :type_nonword
    }[text]

    self.emit(set_type, token, text, ts-1, te) if token
    fret;
  };

  hex_sequence . '-\\' . hex_sequence {
    text = data[ts-1..te-1].pack('c*')
    self.emit(set_type, :range_hex, text, ts-1, te)
    fret;
  };

  hex_sequence {
    text = data[ts-1..te-1].pack('c*')
    self.emit(set_type, :member_hex, text, ts-1, te)
    fret;
  };

  meta_char | [\\\]\-\,] {
    text = data[ts-1..te-1].pack('c*')
    self.emit(set_type, :escape, text, ts-1, te)
    fret;
  };

  property_char > (escaped_set_alpha, 2) {
    fhold;
    fnext character_set;
    fcall unicode_property;
    fret;
  };

  # special case exclusion of escaped dash, could be cleaner.
  (ascii_print - char_type -- [\-}]) > (escaped_set_alpha, 1) |
  ascii_nonprint |
  utf8_2_byte    |
  utf8_3_byte    |
  utf8_4_byte    {
    text = data[ts-1..te-1].pack('c*')
    self.emit(set_type, :escape, text, ts-1, te)
    fret;
  };
*|;
271
+
272
+
273
# escape sequence scanner
#
# Entered with fcall from the main scanner once the backslash has been
# consumed, which is why every token starts at ts-1. Each rule hands control
# back to the caller with fret after emitting its token.
# --------------------------------------------------------------------------
escape_sequence := |*
  [1-9] {
    text = data[ts-1..te-1].pack('c*')
    self.emit(:backref, :number, text, ts-1, te)
    fret;
  };

  octal_sequence {
    self.emit(:escape, :octal, data[ts-1..te-1].pack('c*'), ts-1, te)
    fret;
  };

  meta_char {
    case text = data[ts-1..te-1].pack('c*')
    when '\.'; self.emit(:escape, :dot,               text, ts-1, te)
    when '\|'; self.emit(:escape, :alternation,       text, ts-1, te)
    when '\^'; self.emit(:escape, :beginning_of_line, text, ts-1, te)
    when '\$'; self.emit(:escape, :end_of_line,       text, ts-1, te)
    when '\?'; self.emit(:escape, :zero_or_one,       text, ts-1, te)
    when '\*'; self.emit(:escape, :zero_or_more,      text, ts-1, te)
    when '\+'; self.emit(:escape, :one_or_more,       text, ts-1, te)
    when '\('; self.emit(:escape, :group_open,        text, ts-1, te)
    when '\)'; self.emit(:escape, :group_close,       text, ts-1, te)
    when '\{'; self.emit(:escape, :interval_open,     text, ts-1, te)
    when '\}'; self.emit(:escape, :interval_close,    text, ts-1, te)
    when '\['; self.emit(:escape, :set_open,          text, ts-1, te)
    when '\]'; self.emit(:escape, :set_close,         text, ts-1, te)
    when "\\\\";
      self.emit(:escape, :backslash, text, ts-1, te)
    end
    fret;
  };

  escaped_ascii > (escaped_alpha, 7) {
    # \b is emitted as backspace only when inside a character set, otherwise
    # it is a word boundary anchor. A syntax might "normalize" it if needed.
    case text = data[ts-1..te-1].pack('c*')
    when '\a'; self.emit(:escape, :bell,         text, ts-1, te)
    when '\e'; self.emit(:escape, :escape,       text, ts-1, te)
    when '\f'; self.emit(:escape, :form_feed,    text, ts-1, te)
    when '\n'; self.emit(:escape, :newline,      text, ts-1, te)
    when '\r'; self.emit(:escape, :carriage,     text, ts-1, te)
    when '\s'; self.emit(:escape, :space,        text, ts-1, te)
    when '\t'; self.emit(:escape, :tab,          text, ts-1, te)
    when '\v'; self.emit(:escape, :vertical_tab, text, ts-1, te)
    end
    fret;
  };

  codepoint_sequence > (escaped_alpha, 6) {
    text = data[ts-1..te-1].pack('c*')
    # the third character tells the braced list form from a single codepoint
    if text[2].chr == '{'
      self.emit(:escape, :codepoint_list, text, ts-1, te)
    else
      self.emit(:escape, :codepoint,      text, ts-1, te)
    end
    fret;
  };

  hex_sequence > (escaped_alpha, 5) {
    self.emit(:escape, :hex, data[ts-1..te-1].pack('c*'), ts-1, te)
    fret;
  };

  wide_hex_sequence > (escaped_alpha, 5) {
    self.emit(:escape, :hex_wide, data[ts-1..te-1].pack('c*'), ts-1, te)
    fret;
  };

  control_sequence > (escaped_alpha, 4) {
    self.emit(:escape, :control, data[ts-1..te-1].pack('c*'), ts-1, te)
    fret;
  };

  meta_sequence > (backslashed, 3) {
    self.emit(:escape, :meta_sequence, data[ts-1..te-1].pack('c*'), ts-1, te)
    # return to the caller like every other rule of this scanner, without
    # this the rest of the pattern would be scanned as escape sequences
    fret;
  };

  property_char > (escaped_alpha, 2) {
    fhold;
    fnext main;
    fcall unicode_property; fret;
  };

  (any -- non_literal_escape) > (escaped_alpha, 1) {
    self.emit(:escape, :literal, data[ts-1..te-1].pack('c*'), ts-1, te)
    fret;
  };
*|;
364
+
365
+
366
# Main scanner
#
# Entry point of the machine. Emits the top level tokens and hands off to
# the character_set and escape_sequence scanners with fcall.
# --------------------------------------------------------------------------
main := |*

  # Meta characters
  # ------------------------------------------------------------------------
  dot {
    self.emit(:meta, :dot, data[ts..te-1].pack('c*'), ts, te)
  };

  alternation {
    self.emit(:meta, :alternation, data[ts..te-1].pack('c*'), ts, te)
  };

  # Anchors
  # ------------------------------------------------------------------------
  beginning_of_line {
    self.emit(:anchor, :beginning_of_line, data[ts..te-1].pack('c*'), ts, te)
  };

  end_of_line {
    self.emit(:anchor, :end_of_line, data[ts..te-1].pack('c*'), ts, te)
  };

  backslash . anchor_char > (backslashed, 3) {
    case text = data[ts..te-1].pack('c*')
    when '\\A'; self.emit(:anchor, :bos,              text, ts, te)
    when '\\z'; self.emit(:anchor, :eos,              text, ts, te)
    when '\\Z'; self.emit(:anchor, :eos_ob_eol,       text, ts, te)
    when '\\b'; self.emit(:anchor, :word_boundary,    text, ts, te)
    when '\\B'; self.emit(:anchor, :nonword_boundary, text, ts, te)
    when '\\G'; self.emit(:anchor, :match_start,      text, ts, te)
    else raise ScannerError.new("Unsupported anchor at #{text} (char #{ts})")
    end
  };

  # Character types
  #   \d, \D    digit, non-digit
  #   \h, \H    hex, non-hex
  #   \s, \S    space, non-space
  #   \w, \W    word, non-word
  # ------------------------------------------------------------------------
  backslash . char_type > (backslashed, 2) {
    case text = data[ts..te-1].pack('c*')
    when '\\d'; self.emit(:type, :digit,    text, ts, te)
    when '\\D'; self.emit(:type, :nondigit, text, ts, te)
    when '\\h'; self.emit(:type, :hex,      text, ts, te)
    when '\\H'; self.emit(:type, :nonhex,   text, ts, te)
    when '\\s'; self.emit(:type, :space,    text, ts, te)
    when '\\S'; self.emit(:type, :nonspace, text, ts, te)
    when '\\w'; self.emit(:type, :word,     text, ts, te)
    when '\\W'; self.emit(:type, :nonword,  text, ts, te)
    end
  };


  # Character sets
  # ------------------------------------------------------------------------
  set_open {
    set_depth += 1; in_set = true
    set_type  = set_depth > 1 ? :subset : :set

    self.emit(set_type, :open, data[ts..te-1].pack('c*'), ts, te)
    fcall character_set;
  };

  # (?#...) comments: parsed as a single expression, without introducing a
  # new nesting level. Comments may not include parentheses, escaped or not.
  # special case for close, action performed on all transitions to get the
  # correct closing count.
  # ------------------------------------------------------------------------
  group_open . group_comment $group_closed {
    self.emit(:group, :comment, data[ts..te-1].pack('c*'), ts, te)
  };

  # Expression options:
  #   (?imx-imx)          option on/off
  #                         i: ignore case
  #                         m: multi-line (dot(.) match newline)
  #                         x: extended form
  #
  #   (?imx-imx:subexp)   option on/off for subexp
  # ------------------------------------------------------------------------
  group_open . group_options >group_opened {
    # special handling to resolve ambiguity with passive groups
    if data[te]
      c = data[te].chr
      if c == ':' # include the ':'
        self.emit(:group, :options, data[ts..te].pack('c*'), ts, te+1)
        p += 1
      elsif c == ')' # just options by themselves
        self.emit(:group, :options, data[ts..te-1].pack('c*'), ts, te)
      else
        raise ScannerError.new(
          "Unexpected '#{c}' in options sequence, ':' or ')' expected")
      end
    else
      # nothing follows the options, the pattern ended too early
      raise PrematureEndError.new("options")
    end
  };

  # Assertions
  #   (?=subexp)          look-ahead
  #   (?!subexp)          negative look-ahead
  #   (?<=subexp)         look-behind
  #   (?<!subexp)         negative look-behind
  # ------------------------------------------------------------------------
  group_open . assertion_type >group_opened {
    case text = data[ts..te-1].pack('c*')
    when '(?=';  self.emit(:assertion, :lookahead,   text, ts, te)
    when '(?!';  self.emit(:assertion, :nlookahead,  text, ts, te)
    when '(?<='; self.emit(:assertion, :lookbehind,  text, ts, te)
    when '(?<!'; self.emit(:assertion, :nlookbehind, text, ts, te)
    end
  };

  # Groups
  #   (?:subexp)          passive (non-captured) group
  #   (?>subexp)          atomic group, don't backtrack in subexp.
  #   (?<name>subexp)     named group
  #   (?'name'subexp)     named group (single quoted version)
  #   (subexp)            captured group
  # ------------------------------------------------------------------------
  group_open . group_type >group_opened {
    case text = data[ts..te-1].pack('c*')
    when '(?:'; self.emit(:group, :passive, text, ts, te)
    when '(?>'; self.emit(:group, :atomic,  text, ts, te)

    when /\(\?<\w+>/
      self.emit(:group, :named_ab, text, ts, te)
    when /\(\?'\w+'/
      self.emit(:group, :named_sq, text, ts, te)
    end
  };

  group_open @group_opened {
    text = data[ts..te-1].pack('c*')
    self.emit(:group, :capture, text, ts, te)
  };

  group_close @group_closed {
    self.emit(:group, :close, data[ts..te-1].pack('c*'), ts, te)
  };


  # Group back-reference, named and numbered
  # ------------------------------------------------------------------------
  backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
    case text = data[ts..te-1].pack('c*')
    when /\\([gk])<[^\d-](\w+)?>/ # angle-brackets
      if $1 == 'k'
        self.emit(:backref, :name_ref_ab, text, ts, te)
      else
        self.emit(:backref, :name_call_ab, text, ts, te)
      end

    when /\\([gk])'[^\d-](\w+)?'/ #single quotes
      if $1 == 'k'
        self.emit(:backref, :name_ref_sq, text, ts, te)
      else
        self.emit(:backref, :name_call_sq, text, ts, te)
      end

    when /\\([gk])<\d+>/ # angle-brackets
      if $1 == 'k'
        self.emit(:backref, :number_ref_ab, text, ts, te)
      else
        self.emit(:backref, :number_call_ab, text, ts, te)
      end

    when /\\([gk])'\d+'/ # single quotes
      if $1 == 'k'
        self.emit(:backref, :number_ref_sq, text, ts, te)
      else
        self.emit(:backref, :number_call_sq, text, ts, te)
      end

    when /\\([gk])<-\d+>/ # angle-brackets
      if $1 == 'k'
        self.emit(:backref, :number_rel_ref_ab, text, ts, te)
      else
        self.emit(:backref, :number_rel_call_ab, text, ts, te)
      end

    when /\\([gk])'-\d+'/ # single quotes
      if $1 == 'k'
        self.emit(:backref, :number_rel_ref_sq, text, ts, te)
      else
        self.emit(:backref, :number_rel_call_sq, text, ts, te)
      end

    when /\\k<[^\d-](\w+)?[+\-]\d+>/ # angle-brackets
      self.emit(:backref, :name_nest_ref_ab, text, ts, te)

    when /\\k'[^\d-](\w+)?[+\-]\d+'/ # single-quotes
      self.emit(:backref, :name_nest_ref_sq, text, ts, te)

    when /\\([gk])<\d+[+\-]\d+>/ # angle-brackets
      self.emit(:backref, :number_nest_ref_ab, text, ts, te)

    when /\\([gk])'\d+[+\-]\d+'/ # single-quotes
      self.emit(:backref, :number_nest_ref_sq, text, ts, te)

    end
  };


  # Quantifiers
  # ------------------------------------------------------------------------
  zero_or_one {
    case text = data[ts..te-1].pack('c*')
    when '?' ; self.emit(:quantifier, :zero_or_one,            text, ts, te)
    when '??'; self.emit(:quantifier, :zero_or_one_reluctant,  text, ts, te)
    when '?+'; self.emit(:quantifier, :zero_or_one_possessive, text, ts, te)
    end
  };

  zero_or_more {
    case text = data[ts..te-1].pack('c*')
    when '*' ; self.emit(:quantifier, :zero_or_more,            text, ts, te)
    when '*?'; self.emit(:quantifier, :zero_or_more_reluctant,  text, ts, te)
    when '*+'; self.emit(:quantifier, :zero_or_more_possessive, text, ts, te)
    end
  };

  one_or_more {
    case text = data[ts..te-1].pack('c*')
    when '+' ; self.emit(:quantifier, :one_or_more,            text, ts, te)
    when '+?'; self.emit(:quantifier, :one_or_more_reluctant,  text, ts, te)
    when '++'; self.emit(:quantifier, :one_or_more_possessive, text, ts, te)
    end
  };

  quantifier_range @err(premature_end_error) {
    self.emit(:quantifier, :interval, data[ts..te-1].pack('c*'), ts, te)
  };

  # Escaped sequences
  # ------------------------------------------------------------------------
  backslash > (backslashed, 1) {
    fcall escape_sequence;
  };

  # Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
  # except meta characters. Runs are buffered and emitted later as a single
  # literal token.
  # ------------------------------------------------------------------------
  ascii_print+    |
  ascii_nonprint+ |
  utf8_2_byte+    |
  utf8_3_byte+    |
  utf8_4_byte+    {
    self.append_literal(data, ts, te)
  };

*|;
621
+ }%%
622
+
623
+
624
+ module Regexp::Scanner
625
+ %% write data;
626
+
627
# Base class of every error raised by the scanner.
class ScannerError < StandardError
  # what - the message describing the failure.
  def initialize(what)
    super(what)
  end
end

# Raised when the pattern ends before the construct being scanned is
# complete.
class PrematureEndError < ScannerError
  # where - optional text appended to the message to say what was being
  #         scanned (defaults to an empty string).
  def initialize(where = '')
    super("Premature end of pattern: #{where}")
  end
end

# Error for a unicode character property name that is not known.
class UnknownUnicodePropertyError < ScannerError
  # name - the property name that was not recognized.
  def initialize(name)
    super("Unknown unicode character property name #{name}")
  end
end
644
+
645
+
646
+ # Scans the given regular expression text, or Regexp object and collects the
647
+ # emitted token into an array that gets returned at the end. If a block is
648
+ # given, it gets called for each emitted token.
649
+ #
650
+ # This method may raise errors if a syntax error is encountered.
651
+ # --------------------------------------------------------------------------
652
+ def self.scan(input, &block)
653
+ top, stack = 0, []
654
+
655
+ input = input.source if input.is_a?(Regexp)
656
+ data = input.unpack("c*") if input.is_a?(String)
657
+ eof = data.length
658
+
659
+ @tokens = []
660
+ @block = block_given? ? block : nil
661
+
662
+ in_group, group_depth = false, 0
663
+ in_set, set_depth, set_type = false, 0, :set
664
+
665
+ %% write init;
666
+ %% write exec;
667
+
668
+ raise PrematureEndError.new("(missing group closing paranthesis) "+
669
+ "[#{in_group}:#{group_depth}]") if in_group
670
+ raise PrematureEndError.new("(missing set closing bracket) "+
671
+ "[#{in_set}:#{set_depth}]") if in_set
672
+
673
+ # when the entire expression is a literal run
674
+ self.emit_literal if @literal
675
+
676
+ @tokens
677
+ end
678
+
679
# Buffers one chunk of literal text (a mix of ASCII and UTF-8) so that a run
# of adjacent chunks can be emitted later as one token by emit_literal.
#
# data - Array of byte values of the whole pattern.
# ts   - index of the first byte of the chunk.
# te   - index one past the last byte of the chunk.
#
# Returns the literal buffer.
def self.append_literal(data, ts, te)
  chunk = data[ts..te-1].pack('c*')
  (@literal ||= []) << [chunk, ts, te]
end
685
+
686
# Emits the literal run collected by one or more calls to append_literal as
# a single :literal token that spans from the start of the first chunk to
# the end of the last one.
#
# The buffer is cleared before the token is emitted, so a token block that
# raises cannot leave a stale run behind.
#
# Returns nil.
def self.emit_literal
  chunks, @literal = @literal, nil

  ts, te = chunks.first[1], chunks.last[2]
  text   = chunks.map { |chunk| chunk[0] }.join

  # the chunks are packed bytes, mark the joined text as UTF-8 where the
  # running ruby supports encodings
  text.force_encoding('utf-8') if text.respond_to?(:force_encoding)

  self.emit(:literal, :literal, text, ts, te)
  nil
end
697
+
698
# Records one token and passes it to the token block, when one was given.
#
# type  - token type Symbol, e.g. :group.
# token - token Symbol within the type, e.g. :capture.
# text  - the matched text.
# ts    - start offset of the token.
# te    - end offset of the token.
#
# Returns the token Array collected so far.
def self.emit(type, token, text, ts, te)
  # flush a pending literal run first so that tokens keep their source order
  self.emit_literal if @literal && type != :literal

  @block.call(type, token, text, ts, te) if @block

  @tokens << [type, token, text, ts, te]
end
711
+
712
+ end # module Regexp::Scanner