regexp_parser 0.5.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (81) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +242 -0
  3. data/Gemfile +1 -0
  4. data/README.md +21 -17
  5. data/Rakefile +31 -0
  6. data/lib/regexp_parser/expression.rb +11 -9
  7. data/lib/regexp_parser/expression/classes/alternation.rb +5 -28
  8. data/lib/regexp_parser/expression/classes/backref.rb +21 -16
  9. data/lib/regexp_parser/expression/classes/escape.rb +81 -10
  10. data/lib/regexp_parser/expression/classes/group.rb +20 -20
  11. data/lib/regexp_parser/expression/classes/{character_class.rb → posix_class.rb} +2 -2
  12. data/lib/regexp_parser/expression/classes/property.rb +6 -0
  13. data/lib/regexp_parser/expression/classes/set.rb +10 -93
  14. data/lib/regexp_parser/expression/classes/set/intersection.rb +9 -0
  15. data/lib/regexp_parser/expression/classes/set/range.rb +23 -0
  16. data/lib/regexp_parser/expression/methods/strfregexp.rb +6 -4
  17. data/lib/regexp_parser/expression/methods/tests.rb +4 -14
  18. data/lib/regexp_parser/expression/methods/traverse.rb +1 -1
  19. data/lib/regexp_parser/expression/quantifier.rb +3 -4
  20. data/lib/regexp_parser/expression/sequence_operation.rb +34 -0
  21. data/lib/regexp_parser/expression/subexpression.rb +6 -10
  22. data/lib/regexp_parser/lexer.rb +13 -17
  23. data/lib/regexp_parser/parser.rb +170 -116
  24. data/lib/regexp_parser/scanner.rb +952 -2431
  25. data/lib/regexp_parser/scanner/char_type.rl +31 -0
  26. data/lib/regexp_parser/scanner/properties/long.yml +561 -0
  27. data/lib/regexp_parser/scanner/properties/short.yml +225 -0
  28. data/lib/regexp_parser/scanner/property.rl +7 -806
  29. data/lib/regexp_parser/scanner/scanner.rl +112 -154
  30. data/lib/regexp_parser/syntax/base.rb +4 -4
  31. data/lib/regexp_parser/syntax/tokens.rb +1 -0
  32. data/lib/regexp_parser/syntax/tokens/backref.rb +2 -2
  33. data/lib/regexp_parser/syntax/tokens/character_set.rb +3 -38
  34. data/lib/regexp_parser/syntax/tokens/escape.rb +2 -3
  35. data/lib/regexp_parser/syntax/tokens/group.rb +5 -4
  36. data/lib/regexp_parser/syntax/tokens/{character_class.rb → posix_class.rb} +5 -5
  37. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +519 -266
  38. data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -4
  39. data/lib/regexp_parser/syntax/versions/1.9.1.rb +4 -10
  40. data/lib/regexp_parser/syntax/versions/2.0.0.rb +0 -2
  41. data/lib/regexp_parser/syntax/versions/2.4.1.rb +1 -1
  42. data/lib/regexp_parser/version.rb +1 -1
  43. data/regexp_parser.gemspec +2 -1
  44. data/test/expression/test_base.rb +2 -1
  45. data/test/expression/test_clone.rb +0 -57
  46. data/test/expression/test_set.rb +31 -8
  47. data/test/expression/test_strfregexp.rb +13 -4
  48. data/test/expression/test_subexpression.rb +25 -0
  49. data/test/expression/test_traverse.rb +25 -25
  50. data/test/helpers.rb +1 -0
  51. data/test/lexer/test_all.rb +1 -1
  52. data/test/lexer/test_conditionals.rb +9 -7
  53. data/test/lexer/test_nesting.rb +39 -21
  54. data/test/lexer/test_refcalls.rb +4 -4
  55. data/test/parser/set/test_intersections.rb +127 -0
  56. data/test/parser/set/test_ranges.rb +111 -0
  57. data/test/parser/test_all.rb +4 -1
  58. data/test/parser/test_escapes.rb +41 -9
  59. data/test/parser/test_groups.rb +22 -3
  60. data/test/parser/test_posix_classes.rb +27 -0
  61. data/test/parser/test_properties.rb +17 -290
  62. data/test/parser/test_refcalls.rb +66 -26
  63. data/test/parser/test_sets.rb +132 -129
  64. data/test/scanner/test_all.rb +1 -7
  65. data/test/scanner/test_conditionals.rb +16 -16
  66. data/test/scanner/test_errors.rb +0 -30
  67. data/test/scanner/test_escapes.rb +1 -2
  68. data/test/scanner/test_free_space.rb +28 -28
  69. data/test/scanner/test_groups.rb +35 -35
  70. data/test/scanner/test_meta.rb +1 -1
  71. data/test/scanner/test_properties.rb +87 -114
  72. data/test/scanner/test_refcalls.rb +18 -18
  73. data/test/scanner/test_scripts.rb +19 -351
  74. data/test/scanner/test_sets.rb +87 -60
  75. data/test/scanner/test_unicode_blocks.rb +4 -105
  76. data/test/support/warning_extractor.rb +1 -1
  77. data/test/syntax/test_syntax.rb +7 -0
  78. data/test/syntax/versions/test_1.8.rb +2 -4
  79. metadata +17 -7
  80. data/ChangeLog +0 -325
  81. data/test/scanner/test_emojis.rb +0 -31
@@ -1,6 +1,7 @@
1
1
  %%{
2
2
  machine re_scanner;
3
- include re_property "property.rl";
3
+ include re_char_type "char_type.rl";
4
+ include re_property "property.rl";
4
5
 
5
6
  dot = '.';
6
7
  backslash = '\\';
@@ -35,25 +36,17 @@
35
36
  collating_sequence = '[.' . (alpha | [\-])+ . '.]';
36
37
  character_equivalent = '[=' . alpha . '=]';
37
38
 
38
- char_type = [dDhHsSwWRX];
39
-
40
39
  line_anchor = beginning_of_line | end_of_line;
41
40
  anchor_char = [AbBzZG];
42
41
 
43
- escaped_ascii = [abefnrstv];
42
+ escaped_ascii = [abefnrtv];
44
43
  octal_sequence = [0-7]{1,3};
45
44
 
46
45
  hex_sequence = 'x' . xdigit{1,2};
47
46
  hex_sequence_err = 'x' . [^0-9a-fA-F{];
48
- wide_hex_sequence = 'x' . '{' . xdigit{1,8} . '}';
49
-
50
- hex_or_not = (xdigit|[^0-9a-fA-F}]); # note closing curly at end
51
-
52
- wide_hex_seq_invalid = 'x' . '{' . hex_or_not{1,9};
53
- wide_hex_seq_empty = 'x' . '{' . (space+)? . '}';
54
47
 
55
48
  codepoint_single = 'u' . xdigit{4};
56
- codepoint_list = 'u{' . xdigit{1,5} . (space . xdigit{1,5})* . '}';
49
+ codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
57
50
  codepoint_sequence = codepoint_single | codepoint_list;
58
51
 
59
52
  control_sequence = ('c' | 'C-') . (backslash . 'M-')?;
@@ -110,6 +103,7 @@
110
103
 
111
104
  group_type = group_atomic | group_passive | group_absence | group_named;
112
105
 
106
+ keep_mark = 'K';
113
107
 
114
108
  assertion_type = assertion_lookahead | assertion_nlookahead |
115
109
  assertion_lookbehind | assertion_nlookbehind;
@@ -119,16 +113,18 @@
119
113
  curlies | parantheses | brackets |
120
114
  line_anchor | quantifier_greedy;
121
115
 
122
- ascii_print = ((0x20..0x7e) - meta_char)+;
123
- ascii_nonprint = (0x01..0x1f | 0x7f)+;
116
+ ascii_print = ((0x20..0x7e) - meta_char);
117
+ ascii_nonprint = (0x01..0x1f | 0x7f);
118
+
119
+ utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
120
+ utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
121
+ utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
124
122
 
125
- utf8_2_byte = (0xc2..0xdf 0x80..0xbf)+;
126
- utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf)+;
127
- utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf)+;
128
- utf8_byte_sequence = utf8_2_byte | utf8_3_byte | utf8_4_byte;
123
+ non_literal_escape = char_type_char | anchor_char | escaped_ascii |
124
+ group_ref | keep_mark | [xucCM];
129
125
 
130
- non_literal_escape = char_type | anchor_char | escaped_ascii |
131
- group_ref | [xucCM];
126
+ non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
127
+ multi_codepoint_char_type | [0-9cCM];
132
128
 
133
129
  # EOF error, used where it can be detected
134
130
  action premature_end_error {
@@ -150,11 +146,11 @@
150
146
  # closing bracket of the set.
151
147
  # --------------------------------------------------------------------------
152
148
  character_set := |*
153
- ']' {
154
- set_type = set_depth > 1 ? :subset : :set
155
- set_depth -= 1; in_set = set_depth > 0 ? true : false
149
+ set_close > (set_meta, 2) {
150
+ set_depth -= 1
151
+ in_set = set_depth > 0 ? true : false
156
152
 
157
- emit(set_type, :close, *text(data, ts, te))
153
+ emit(:set, :close, *text(data, ts, te))
158
154
 
159
155
  if set_depth == 0
160
156
  fgoto main;
@@ -164,11 +160,11 @@
164
160
  };
165
161
 
166
162
  '-]' { # special case, emits two tokens
167
- set_type = set_depth > 1 ? :subset : :set
168
- set_depth -= 1; in_set = set_depth > 0 ? true : false
163
+ set_depth -= 1
164
+ in_set = set_depth > 0 ? true : false
169
165
 
170
- emit(set_type, :member, copy(data, ts..te-2), ts, te)
171
- emit(set_type, :close, copy(data, ts+1..te-1), ts, te)
166
+ emit(:literal, :literal, copy(data, ts..te-2), ts, te)
167
+ emit(:set, :close, copy(data, ts+1..te-1), ts, te)
172
168
 
173
169
  if set_depth == 0
174
170
  fgoto main;
@@ -177,59 +173,70 @@
177
173
  end
178
174
  };
179
175
 
176
+ '-&&' { # special case, emits two tokens
177
+ emit(:literal, :literal, '-', ts, te)
178
+ emit(:set, :intersection, '&&', ts, te)
179
+ };
180
+
180
181
  '^' {
181
182
  text = text(data, ts, te).first
182
183
  if tokens.last[1] == :open
183
- emit(set_type, :negate, text, ts, te)
184
+ emit(:set, :negate, text, ts, te)
184
185
  else
185
- emit(set_type, :member, text, ts, te)
186
+ emit(:literal, :literal, text, ts, te)
186
187
  end
187
188
  };
188
189
 
189
- alnum . '-' . alnum {
190
- emit(set_type, :range, *text(data, ts, te))
190
+ '-' {
191
+ text = text(data, ts, te).first
192
+ # ranges cant start with a subset or intersection/negation/range operator
193
+ if tokens.last[0] == :set
194
+ emit(:literal, :literal, text, ts, te)
195
+ else
196
+ emit(:set, :range, text, ts, te)
197
+ end
191
198
  };
192
199
 
200
+ # Unlike ranges, intersections can start or end at set boundaries, whereupon
201
+ # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
193
202
  '&&' {
194
- emit(set_type, :intersection, *text(data, ts, te))
203
+ emit(:set, :intersection, *text(data, ts, te))
195
204
  };
196
205
 
197
- '\\' {
206
+ backslash {
198
207
  fcall set_escape_sequence;
199
208
  };
200
209
 
201
- '[' >(open_bracket, 1) {
202
- set_depth += 1; in_set = true
203
- set_type = set_depth > 1 ? :subset : :set
210
+ set_open >(open_bracket, 1) {
211
+ set_depth += 1
204
212
 
205
- emit(set_type, :open, *text(data, ts, te))
213
+ emit(:set, :open, *text(data, ts, te))
206
214
  fcall character_set;
207
215
  };
208
216
 
209
217
  class_posix >(open_bracket, 1) @eof(premature_end_error) {
210
218
  text = text(data, ts, te).first
211
219
 
220
+ type = :posixclass
212
221
  class_name = text[2..-3]
213
222
  if class_name[0].chr == '^'
214
- class_name = "non#{class_name[1..-1]}"
223
+ class_name = class_name[1..-1]
224
+ type = :nonposixclass
215
225
  end
216
226
 
217
- token_sym = "class_#{class_name}".to_sym
218
- emit(set_type, token_sym, text, ts, te)
227
+ emit(type, class_name.to_sym, text, ts, te)
219
228
  };
220
229
 
221
230
  collating_sequence >(open_bracket, 1) @eof(premature_end_error) {
222
- emit(set_type, :collation, *text(data, ts, te))
231
+ emit(:set, :collation, *text(data, ts, te))
223
232
  };
224
233
 
225
234
  character_equivalent >(open_bracket, 1) @eof(premature_end_error) {
226
- emit(set_type, :equivalent, *text(data, ts, te))
235
+ emit(:set, :equivalent, *text(data, ts, te))
227
236
  };
228
237
 
229
- # exclude the closing bracket as a cleaner workaround for dealing with the
230
- # ambiguity caused upon exit from the unicode properties machine
231
- meta_char -- ']' {
232
- emit(set_type, :member, *text(data, ts, te))
238
+ meta_char > (set_meta, 1) {
239
+ emit(:literal, :literal, *text(data, ts, te))
233
240
  };
234
241
 
235
242
  any |
@@ -237,63 +244,24 @@
237
244
  utf8_2_byte |
238
245
  utf8_3_byte |
239
246
  utf8_4_byte {
240
- emit(set_type, :member, *text(data, ts, te))
247
+ char, *rest = *text(data, ts, te)
248
+ char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
249
+ emit(:literal, :literal, char, *rest)
241
250
  };
242
251
  *|;
243
252
 
244
253
  # set escapes scanner
245
254
  # --------------------------------------------------------------------------
246
255
  set_escape_sequence := |*
247
- 'b' > (escaped_set_alpha, 2) {
248
- emit(set_type, :backspace, *text(data, ts, te, 1))
249
- fret;
250
- };
251
-
252
- char_type > (escaped_set_alpha, 4) {
253
- case text = text(data, ts, te, 1).first
254
- when '\d'; emit(set_type, :type_digit, text, ts-1, te)
255
- when '\D'; emit(set_type, :type_nondigit, text, ts-1, te)
256
- when '\h'; emit(set_type, :type_hex, text, ts-1, te)
257
- when '\H'; emit(set_type, :type_nonhex, text, ts-1, te)
258
- when '\s'; emit(set_type, :type_space, text, ts-1, te)
259
- when '\S'; emit(set_type, :type_nonspace, text, ts-1, te)
260
- when '\w'; emit(set_type, :type_word, text, ts-1, te)
261
- when '\W'; emit(set_type, :type_nonword, text, ts-1, te)
262
- when '\R'; emit(set_type, :type_linebreak, text, ts-1, te)
263
- when '\X'; emit(set_type, :type_xgrapheme, text, ts-1, te)
264
- end
265
- fret;
266
- };
267
-
268
- hex_sequence . '-\\' . hex_sequence {
269
- emit(set_type, :range_hex, *text(data, ts, te, 1))
270
- fret;
271
- };
272
-
273
- hex_sequence {
274
- emit(set_type, :member_hex, *text(data, ts, te, 1))
275
- fret;
276
- };
277
-
278
- meta_char | [\\\]\-\,] {
279
- emit(set_type, :escape, *text(data, ts, te, 1))
256
+ non_set_escape > (escaped_set_alpha, 2) {
257
+ emit(:escape, :literal, *text(data, ts, te, 1))
280
258
  fret;
281
259
  };
282
260
 
283
- property_char > (escaped_set_alpha, 3) {
261
+ any > (escaped_set_alpha, 1) {
284
262
  fhold;
285
263
  fnext character_set;
286
- fcall unicode_property;
287
- };
288
-
289
- # special case exclusion of escaped dash, could be cleaner.
290
- (ascii_print - char_type -- [\-}]) > (escaped_set_alpha, 1) |
291
- ascii_nonprint |
292
- utf8_2_byte |
293
- utf8_3_byte |
294
- utf8_4_byte {
295
- emit(set_type, :escape, *text(data, ts, te, 1))
296
- fret;
264
+ fcall escape_sequence;
297
265
  };
298
266
  *|;
299
267
 
@@ -338,11 +306,11 @@
338
306
  # it is a word boundary anchor. A syntax might "normalize" it if needed.
339
307
  case text = text(data, ts, te, 1).first
340
308
  when '\a'; emit(:escape, :bell, text, ts-1, te)
309
+ when '\b'; emit(:escape, :backspace, text, ts-1, te)
341
310
  when '\e'; emit(:escape, :escape, text, ts-1, te)
342
311
  when '\f'; emit(:escape, :form_feed, text, ts-1, te)
343
312
  when '\n'; emit(:escape, :newline, text, ts-1, te)
344
313
  when '\r'; emit(:escape, :carriage, text, ts-1, te)
345
- when '\s'; emit(:escape, :space, text, ts-1, te)
346
314
  when '\t'; emit(:escape, :tab, text, ts-1, te)
347
315
  when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
348
316
  end
@@ -364,20 +332,10 @@
364
332
  fret;
365
333
  };
366
334
 
367
- wide_hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
368
- emit(:escape, :hex_wide, *text(data, ts, te, 1))
369
- fret;
370
- };
371
-
372
335
  hex_sequence_err @invalid_sequence_error {
373
336
  fret;
374
337
  };
375
338
 
376
- (wide_hex_seq_invalid | wide_hex_seq_empty) {
377
- raise InvalidSequenceError.new("wide hex sequence")
378
- fret;
379
- };
380
-
381
339
  control_sequence >(escaped_alpha, 4) $eof(premature_end_error) {
382
340
  if data[te]
383
341
  c = data[te].chr
@@ -408,9 +366,15 @@
408
366
  fret;
409
367
  };
410
368
 
369
+ char_type_char > (escaped_alpha, 2) {
370
+ fhold;
371
+ fnext *(in_set ? fentry(character_set) : fentry(main));
372
+ fcall char_type;
373
+ };
374
+
411
375
  property_char > (escaped_alpha, 2) {
412
376
  fhold;
413
- fnext main;
377
+ fnext *(in_set ? fentry(character_set) : fentry(main));
414
378
  fcall unicode_property;
415
379
  };
416
380
 
@@ -466,7 +430,7 @@
466
430
  emit(:anchor, :eol, *text(data, ts, te))
467
431
  };
468
432
 
469
- backslash . 'K' > (backslashed, 4) {
433
+ backslash . keep_mark > (backslashed, 4) {
470
434
  emit(:keep, :mark, *text(data, ts, te))
471
435
  };
472
436
 
@@ -484,38 +448,13 @@
484
448
  end
485
449
  };
486
450
 
487
- # Character types
488
- # \d, \D digit, non-digit
489
- # \h, \H hex, non-hex
490
- # \s, \S space, non-space
491
- # \w, \W word, non-word
492
- # ------------------------------------------------------------------------
493
- backslash . char_type > (backslashed, 2) {
494
- case text = text(data, ts, te).first
495
- when '\\d'; emit(:type, :digit, text, ts, te)
496
- when '\\D'; emit(:type, :nondigit, text, ts, te)
497
- when '\\h'; emit(:type, :hex, text, ts, te)
498
- when '\\H'; emit(:type, :nonhex, text, ts, te)
499
- when '\\s'; emit(:type, :space, text, ts, te)
500
- when '\\S'; emit(:type, :nonspace, text, ts, te)
501
- when '\\w'; emit(:type, :word, text, ts, te)
502
- when '\\W'; emit(:type, :nonword, text, ts, te)
503
- when '\\R'; emit(:type, :linebreak, text, ts, te)
504
- when '\\X'; emit(:type, :xgrapheme, text, ts, te)
505
- else
506
- raise ScannerError.new(
507
- "Unexpected character in type at #{text} (char #{ts})")
508
- end
509
- };
510
-
511
-
512
451
  # Character sets
513
452
  # ------------------------------------------------------------------------
514
453
  set_open {
515
- set_depth += 1; in_set = true
516
- set_type = set_depth > 1 ? :subset : :set
454
+ set_depth += 1
455
+ in_set = true
517
456
 
518
- emit(set_type, :open, *text(data, ts, te))
457
+ emit(:set, :open, *text(data, ts, te))
519
458
  fcall character_set;
520
459
  };
521
460
 
@@ -645,57 +584,57 @@
645
584
 
646
585
  when /^\\([gk])<[^\d-](\w+)?>/ # angle-brackets
647
586
  if $1 == 'k'
648
- emit(:backref, :name_ref_ab, text, ts, te)
587
+ emit(:backref, :name_ref_ab, text, ts, te)
649
588
  else
650
- emit(:backref, :name_call_ab, text, ts, te)
589
+ emit(:backref, :name_call_ab, text, ts, te)
651
590
  end
652
591
 
653
592
  when /^\\([gk])'[^\d-](\w+)?'/ #single quotes
654
593
  if $1 == 'k'
655
- emit(:backref, :name_ref_sq, text, ts, te)
594
+ emit(:backref, :name_ref_sq, text, ts, te)
656
595
  else
657
- emit(:backref, :name_call_sq, text, ts, te)
596
+ emit(:backref, :name_call_sq, text, ts, te)
658
597
  end
659
598
 
660
599
  when /^\\([gk])<\d+>/ # angle-brackets
661
600
  if $1 == 'k'
662
- emit(:backref, :number_ref_ab, text, ts, te)
601
+ emit(:backref, :number_ref_ab, text, ts, te)
663
602
  else
664
- emit(:backref, :number_call_ab, text, ts, te)
603
+ emit(:backref, :number_call_ab, text, ts, te)
665
604
  end
666
605
 
667
606
  when /^\\([gk])'\d+'/ # single quotes
668
607
  if $1 == 'k'
669
- emit(:backref, :number_ref_sq, text, ts, te)
608
+ emit(:backref, :number_ref_sq, text, ts, te)
670
609
  else
671
- emit(:backref, :number_call_sq, text, ts, te)
610
+ emit(:backref, :number_call_sq, text, ts, te)
672
611
  end
673
612
 
674
613
  when /^\\([gk])<-\d+>/ # angle-brackets
675
614
  if $1 == 'k'
676
- emit(:backref, :number_rel_ref_ab, text, ts, te)
615
+ emit(:backref, :number_rel_ref_ab, text, ts, te)
677
616
  else
678
- emit(:backref, :number_rel_call_ab, text, ts, te)
617
+ emit(:backref, :number_rel_call_ab, text, ts, te)
679
618
  end
680
619
 
681
620
  when /^\\([gk])'-\d+'/ # single quotes
682
621
  if $1 == 'k'
683
- emit(:backref, :number_rel_ref_sq, text, ts, te)
622
+ emit(:backref, :number_rel_ref_sq, text, ts, te)
684
623
  else
685
- emit(:backref, :number_rel_call_sq, text, ts, te)
624
+ emit(:backref, :number_rel_call_sq, text, ts, te)
686
625
  end
687
626
 
688
627
  when /^\\k<[^\d-](\w+)?[+\-]\d+>/ # angle-brackets
689
- emit(:backref, :name_nest_ref_ab, text, ts, te)
628
+ emit(:backref, :name_recursion_ref_ab, text, ts, te)
690
629
 
691
630
  when /^\\k'[^\d-](\w+)?[+\-]\d+'/ # single-quotes
692
- emit(:backref, :name_nest_ref_sq, text, ts, te)
631
+ emit(:backref, :name_recursion_ref_sq, text, ts, te)
693
632
 
694
- when /^\\([gk])<\d+[+\-]\d+>/ # angle-brackets
695
- emit(:backref, :number_nest_ref_ab, text, ts, te)
633
+ when /^\\([gk])<-?\d+[+\-]\d+>/ # angle-brackets
634
+ emit(:backref, :number_recursion_ref_ab, text, ts, te)
696
635
 
697
- when /^\\([gk])'\d+[+\-]\d+'/ # single-quotes
698
- emit(:backref, :number_nest_ref_sq, text, ts, te)
636
+ when /^\\([gk])'-?\d+[+\-]\d+'/ # single-quotes
637
+ emit(:backref, :number_recursion_ref_sq, text, ts, te)
699
638
 
700
639
  else
701
640
  raise ScannerError.new(
@@ -859,8 +798,11 @@ class Regexp::Scanner
859
798
  self.group_depth = 0
860
799
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
861
800
 
862
- in_set, set_depth, set_type = false, 0, :set
863
- in_conditional, conditional_depth, conditional_stack = false, 0, []
801
+ in_set = false
802
+ set_depth = 0
803
+ in_conditional = false
804
+ conditional_depth = 0
805
+ conditional_stack = []
864
806
 
865
807
  %% write data;
866
808
  %% write init;
@@ -882,6 +824,18 @@ class Regexp::Scanner
882
824
  tokens
883
825
  end
884
826
 
827
+ # lazy-load property maps when first needed
828
+ require 'yaml'
829
+ PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
830
+
831
+ def self.short_prop_map
832
+ @short_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/short.yml")
833
+ end
834
+
835
+ def self.long_prop_map
836
+ @long_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/long.yml")
837
+ end
838
+
885
839
  # Emits an array with the details of the scanned pattern
886
840
  def emit(type, token, text, ts, te)
887
841
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
@@ -986,6 +940,8 @@ class Regexp::Scanner
986
940
  end
987
941
 
988
942
  def emit_options(text, ts, te)
943
+ token = nil
944
+
989
945
  if text =~ /\(\?([mixdau]*)-?([mix]*)(:)?/
990
946
  positive, negative, group_local = $1, $2, $3
991
947
 
@@ -1001,13 +957,15 @@ class Regexp::Scanner
1001
957
 
1002
958
  if group_local
1003
959
  spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
960
+ token = :options
1004
961
  else
1005
962
  # switch for parent group level
1006
963
  spacing_stack.last[:free_spacing] = free_spacing
964
+ token = :options_switch
1007
965
  end
1008
966
  end
1009
967
 
1010
- emit(:group, :options, text, ts, te)
968
+ emit(:group, token, text, ts, te)
1011
969
  end
1012
970
 
1013
971
  # Centralizes and unifies the handling of validation related