regexp_parser 0.5.0 → 1.0.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (81):
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +242 -0
  3. data/Gemfile +1 -0
  4. data/README.md +21 -17
  5. data/Rakefile +31 -0
  6. data/lib/regexp_parser/expression.rb +11 -9
  7. data/lib/regexp_parser/expression/classes/alternation.rb +5 -28
  8. data/lib/regexp_parser/expression/classes/backref.rb +21 -16
  9. data/lib/regexp_parser/expression/classes/escape.rb +81 -10
  10. data/lib/regexp_parser/expression/classes/group.rb +20 -20
  11. data/lib/regexp_parser/expression/classes/{character_class.rb → posix_class.rb} +2 -2
  12. data/lib/regexp_parser/expression/classes/property.rb +6 -0
  13. data/lib/regexp_parser/expression/classes/set.rb +10 -93
  14. data/lib/regexp_parser/expression/classes/set/intersection.rb +9 -0
  15. data/lib/regexp_parser/expression/classes/set/range.rb +23 -0
  16. data/lib/regexp_parser/expression/methods/strfregexp.rb +6 -4
  17. data/lib/regexp_parser/expression/methods/tests.rb +4 -14
  18. data/lib/regexp_parser/expression/methods/traverse.rb +1 -1
  19. data/lib/regexp_parser/expression/quantifier.rb +3 -4
  20. data/lib/regexp_parser/expression/sequence_operation.rb +34 -0
  21. data/lib/regexp_parser/expression/subexpression.rb +6 -10
  22. data/lib/regexp_parser/lexer.rb +13 -17
  23. data/lib/regexp_parser/parser.rb +170 -116
  24. data/lib/regexp_parser/scanner.rb +952 -2431
  25. data/lib/regexp_parser/scanner/char_type.rl +31 -0
  26. data/lib/regexp_parser/scanner/properties/long.yml +561 -0
  27. data/lib/regexp_parser/scanner/properties/short.yml +225 -0
  28. data/lib/regexp_parser/scanner/property.rl +7 -806
  29. data/lib/regexp_parser/scanner/scanner.rl +112 -154
  30. data/lib/regexp_parser/syntax/base.rb +4 -4
  31. data/lib/regexp_parser/syntax/tokens.rb +1 -0
  32. data/lib/regexp_parser/syntax/tokens/backref.rb +2 -2
  33. data/lib/regexp_parser/syntax/tokens/character_set.rb +3 -38
  34. data/lib/regexp_parser/syntax/tokens/escape.rb +2 -3
  35. data/lib/regexp_parser/syntax/tokens/group.rb +5 -4
  36. data/lib/regexp_parser/syntax/tokens/{character_class.rb → posix_class.rb} +5 -5
  37. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +519 -266
  38. data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -4
  39. data/lib/regexp_parser/syntax/versions/1.9.1.rb +4 -10
  40. data/lib/regexp_parser/syntax/versions/2.0.0.rb +0 -2
  41. data/lib/regexp_parser/syntax/versions/2.4.1.rb +1 -1
  42. data/lib/regexp_parser/version.rb +1 -1
  43. data/regexp_parser.gemspec +2 -1
  44. data/test/expression/test_base.rb +2 -1
  45. data/test/expression/test_clone.rb +0 -57
  46. data/test/expression/test_set.rb +31 -8
  47. data/test/expression/test_strfregexp.rb +13 -4
  48. data/test/expression/test_subexpression.rb +25 -0
  49. data/test/expression/test_traverse.rb +25 -25
  50. data/test/helpers.rb +1 -0
  51. data/test/lexer/test_all.rb +1 -1
  52. data/test/lexer/test_conditionals.rb +9 -7
  53. data/test/lexer/test_nesting.rb +39 -21
  54. data/test/lexer/test_refcalls.rb +4 -4
  55. data/test/parser/set/test_intersections.rb +127 -0
  56. data/test/parser/set/test_ranges.rb +111 -0
  57. data/test/parser/test_all.rb +4 -1
  58. data/test/parser/test_escapes.rb +41 -9
  59. data/test/parser/test_groups.rb +22 -3
  60. data/test/parser/test_posix_classes.rb +27 -0
  61. data/test/parser/test_properties.rb +17 -290
  62. data/test/parser/test_refcalls.rb +66 -26
  63. data/test/parser/test_sets.rb +132 -129
  64. data/test/scanner/test_all.rb +1 -7
  65. data/test/scanner/test_conditionals.rb +16 -16
  66. data/test/scanner/test_errors.rb +0 -30
  67. data/test/scanner/test_escapes.rb +1 -2
  68. data/test/scanner/test_free_space.rb +28 -28
  69. data/test/scanner/test_groups.rb +35 -35
  70. data/test/scanner/test_meta.rb +1 -1
  71. data/test/scanner/test_properties.rb +87 -114
  72. data/test/scanner/test_refcalls.rb +18 -18
  73. data/test/scanner/test_scripts.rb +19 -351
  74. data/test/scanner/test_sets.rb +87 -60
  75. data/test/scanner/test_unicode_blocks.rb +4 -105
  76. data/test/support/warning_extractor.rb +1 -1
  77. data/test/syntax/test_syntax.rb +7 -0
  78. data/test/syntax/versions/test_1.8.rb +2 -4
  79. metadata +17 -7
  80. data/ChangeLog +0 -325
  81. data/test/scanner/test_emojis.rb +0 -31
@@ -1,6 +1,7 @@
1
1
  %%{
2
2
  machine re_scanner;
3
- include re_property "property.rl";
3
+ include re_char_type "char_type.rl";
4
+ include re_property "property.rl";
4
5
 
5
6
  dot = '.';
6
7
  backslash = '\\';
@@ -35,25 +36,17 @@
35
36
  collating_sequence = '[.' . (alpha | [\-])+ . '.]';
36
37
  character_equivalent = '[=' . alpha . '=]';
37
38
 
38
- char_type = [dDhHsSwWRX];
39
-
40
39
  line_anchor = beginning_of_line | end_of_line;
41
40
  anchor_char = [AbBzZG];
42
41
 
43
- escaped_ascii = [abefnrstv];
42
+ escaped_ascii = [abefnrtv];
44
43
  octal_sequence = [0-7]{1,3};
45
44
 
46
45
  hex_sequence = 'x' . xdigit{1,2};
47
46
  hex_sequence_err = 'x' . [^0-9a-fA-F{];
48
- wide_hex_sequence = 'x' . '{' . xdigit{1,8} . '}';
49
-
50
- hex_or_not = (xdigit|[^0-9a-fA-F}]); # note closing curly at end
51
-
52
- wide_hex_seq_invalid = 'x' . '{' . hex_or_not{1,9};
53
- wide_hex_seq_empty = 'x' . '{' . (space+)? . '}';
54
47
 
55
48
  codepoint_single = 'u' . xdigit{4};
56
- codepoint_list = 'u{' . xdigit{1,5} . (space . xdigit{1,5})* . '}';
49
+ codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
57
50
  codepoint_sequence = codepoint_single | codepoint_list;
58
51
 
59
52
  control_sequence = ('c' | 'C-') . (backslash . 'M-')?;
@@ -110,6 +103,7 @@
110
103
 
111
104
  group_type = group_atomic | group_passive | group_absence | group_named;
112
105
 
106
+ keep_mark = 'K';
113
107
 
114
108
  assertion_type = assertion_lookahead | assertion_nlookahead |
115
109
  assertion_lookbehind | assertion_nlookbehind;
@@ -119,16 +113,18 @@
119
113
  curlies | parantheses | brackets |
120
114
  line_anchor | quantifier_greedy;
121
115
 
122
- ascii_print = ((0x20..0x7e) - meta_char)+;
123
- ascii_nonprint = (0x01..0x1f | 0x7f)+;
116
+ ascii_print = ((0x20..0x7e) - meta_char);
117
+ ascii_nonprint = (0x01..0x1f | 0x7f);
118
+
119
+ utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
120
+ utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
121
+ utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
124
122
 
125
- utf8_2_byte = (0xc2..0xdf 0x80..0xbf)+;
126
- utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf)+;
127
- utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf)+;
128
- utf8_byte_sequence = utf8_2_byte | utf8_3_byte | utf8_4_byte;
123
+ non_literal_escape = char_type_char | anchor_char | escaped_ascii |
124
+ group_ref | keep_mark | [xucCM];
129
125
 
130
- non_literal_escape = char_type | anchor_char | escaped_ascii |
131
- group_ref | [xucCM];
126
+ non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
127
+ multi_codepoint_char_type | [0-9cCM];
132
128
 
133
129
  # EOF error, used where it can be detected
134
130
  action premature_end_error {
@@ -150,11 +146,11 @@
150
146
  # closing bracket of the set.
151
147
  # --------------------------------------------------------------------------
152
148
  character_set := |*
153
- ']' {
154
- set_type = set_depth > 1 ? :subset : :set
155
- set_depth -= 1; in_set = set_depth > 0 ? true : false
149
+ set_close > (set_meta, 2) {
150
+ set_depth -= 1
151
+ in_set = set_depth > 0 ? true : false
156
152
 
157
- emit(set_type, :close, *text(data, ts, te))
153
+ emit(:set, :close, *text(data, ts, te))
158
154
 
159
155
  if set_depth == 0
160
156
  fgoto main;
@@ -164,11 +160,11 @@
164
160
  };
165
161
 
166
162
  '-]' { # special case, emits two tokens
167
- set_type = set_depth > 1 ? :subset : :set
168
- set_depth -= 1; in_set = set_depth > 0 ? true : false
163
+ set_depth -= 1
164
+ in_set = set_depth > 0 ? true : false
169
165
 
170
- emit(set_type, :member, copy(data, ts..te-2), ts, te)
171
- emit(set_type, :close, copy(data, ts+1..te-1), ts, te)
166
+ emit(:literal, :literal, copy(data, ts..te-2), ts, te)
167
+ emit(:set, :close, copy(data, ts+1..te-1), ts, te)
172
168
 
173
169
  if set_depth == 0
174
170
  fgoto main;
@@ -177,59 +173,70 @@
177
173
  end
178
174
  };
179
175
 
176
+ '-&&' { # special case, emits two tokens
177
+ emit(:literal, :literal, '-', ts, te)
178
+ emit(:set, :intersection, '&&', ts, te)
179
+ };
180
+
180
181
  '^' {
181
182
  text = text(data, ts, te).first
182
183
  if tokens.last[1] == :open
183
- emit(set_type, :negate, text, ts, te)
184
+ emit(:set, :negate, text, ts, te)
184
185
  else
185
- emit(set_type, :member, text, ts, te)
186
+ emit(:literal, :literal, text, ts, te)
186
187
  end
187
188
  };
188
189
 
189
- alnum . '-' . alnum {
190
- emit(set_type, :range, *text(data, ts, te))
190
+ '-' {
191
+ text = text(data, ts, te).first
192
+ # ranges cant start with a subset or intersection/negation/range operator
193
+ if tokens.last[0] == :set
194
+ emit(:literal, :literal, text, ts, te)
195
+ else
196
+ emit(:set, :range, text, ts, te)
197
+ end
191
198
  };
192
199
 
200
+ # Unlike ranges, intersections can start or end at set boundaries, whereupon
201
+ # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
193
202
  '&&' {
194
- emit(set_type, :intersection, *text(data, ts, te))
203
+ emit(:set, :intersection, *text(data, ts, te))
195
204
  };
196
205
 
197
- '\\' {
206
+ backslash {
198
207
  fcall set_escape_sequence;
199
208
  };
200
209
 
201
- '[' >(open_bracket, 1) {
202
- set_depth += 1; in_set = true
203
- set_type = set_depth > 1 ? :subset : :set
210
+ set_open >(open_bracket, 1) {
211
+ set_depth += 1
204
212
 
205
- emit(set_type, :open, *text(data, ts, te))
213
+ emit(:set, :open, *text(data, ts, te))
206
214
  fcall character_set;
207
215
  };
208
216
 
209
217
  class_posix >(open_bracket, 1) @eof(premature_end_error) {
210
218
  text = text(data, ts, te).first
211
219
 
220
+ type = :posixclass
212
221
  class_name = text[2..-3]
213
222
  if class_name[0].chr == '^'
214
- class_name = "non#{class_name[1..-1]}"
223
+ class_name = class_name[1..-1]
224
+ type = :nonposixclass
215
225
  end
216
226
 
217
- token_sym = "class_#{class_name}".to_sym
218
- emit(set_type, token_sym, text, ts, te)
227
+ emit(type, class_name.to_sym, text, ts, te)
219
228
  };
220
229
 
221
230
  collating_sequence >(open_bracket, 1) @eof(premature_end_error) {
222
- emit(set_type, :collation, *text(data, ts, te))
231
+ emit(:set, :collation, *text(data, ts, te))
223
232
  };
224
233
 
225
234
  character_equivalent >(open_bracket, 1) @eof(premature_end_error) {
226
- emit(set_type, :equivalent, *text(data, ts, te))
235
+ emit(:set, :equivalent, *text(data, ts, te))
227
236
  };
228
237
 
229
- # exclude the closing bracket as a cleaner workaround for dealing with the
230
- # ambiguity caused upon exit from the unicode properties machine
231
- meta_char -- ']' {
232
- emit(set_type, :member, *text(data, ts, te))
238
+ meta_char > (set_meta, 1) {
239
+ emit(:literal, :literal, *text(data, ts, te))
233
240
  };
234
241
 
235
242
  any |
@@ -237,63 +244,24 @@
237
244
  utf8_2_byte |
238
245
  utf8_3_byte |
239
246
  utf8_4_byte {
240
- emit(set_type, :member, *text(data, ts, te))
247
+ char, *rest = *text(data, ts, te)
248
+ char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
249
+ emit(:literal, :literal, char, *rest)
241
250
  };
242
251
  *|;
243
252
 
244
253
  # set escapes scanner
245
254
  # --------------------------------------------------------------------------
246
255
  set_escape_sequence := |*
247
- 'b' > (escaped_set_alpha, 2) {
248
- emit(set_type, :backspace, *text(data, ts, te, 1))
249
- fret;
250
- };
251
-
252
- char_type > (escaped_set_alpha, 4) {
253
- case text = text(data, ts, te, 1).first
254
- when '\d'; emit(set_type, :type_digit, text, ts-1, te)
255
- when '\D'; emit(set_type, :type_nondigit, text, ts-1, te)
256
- when '\h'; emit(set_type, :type_hex, text, ts-1, te)
257
- when '\H'; emit(set_type, :type_nonhex, text, ts-1, te)
258
- when '\s'; emit(set_type, :type_space, text, ts-1, te)
259
- when '\S'; emit(set_type, :type_nonspace, text, ts-1, te)
260
- when '\w'; emit(set_type, :type_word, text, ts-1, te)
261
- when '\W'; emit(set_type, :type_nonword, text, ts-1, te)
262
- when '\R'; emit(set_type, :type_linebreak, text, ts-1, te)
263
- when '\X'; emit(set_type, :type_xgrapheme, text, ts-1, te)
264
- end
265
- fret;
266
- };
267
-
268
- hex_sequence . '-\\' . hex_sequence {
269
- emit(set_type, :range_hex, *text(data, ts, te, 1))
270
- fret;
271
- };
272
-
273
- hex_sequence {
274
- emit(set_type, :member_hex, *text(data, ts, te, 1))
275
- fret;
276
- };
277
-
278
- meta_char | [\\\]\-\,] {
279
- emit(set_type, :escape, *text(data, ts, te, 1))
256
+ non_set_escape > (escaped_set_alpha, 2) {
257
+ emit(:escape, :literal, *text(data, ts, te, 1))
280
258
  fret;
281
259
  };
282
260
 
283
- property_char > (escaped_set_alpha, 3) {
261
+ any > (escaped_set_alpha, 1) {
284
262
  fhold;
285
263
  fnext character_set;
286
- fcall unicode_property;
287
- };
288
-
289
- # special case exclusion of escaped dash, could be cleaner.
290
- (ascii_print - char_type -- [\-}]) > (escaped_set_alpha, 1) |
291
- ascii_nonprint |
292
- utf8_2_byte |
293
- utf8_3_byte |
294
- utf8_4_byte {
295
- emit(set_type, :escape, *text(data, ts, te, 1))
296
- fret;
264
+ fcall escape_sequence;
297
265
  };
298
266
  *|;
299
267
 
@@ -338,11 +306,11 @@
338
306
  # it is a word boundary anchor. A syntax might "normalize" it if needed.
339
307
  case text = text(data, ts, te, 1).first
340
308
  when '\a'; emit(:escape, :bell, text, ts-1, te)
309
+ when '\b'; emit(:escape, :backspace, text, ts-1, te)
341
310
  when '\e'; emit(:escape, :escape, text, ts-1, te)
342
311
  when '\f'; emit(:escape, :form_feed, text, ts-1, te)
343
312
  when '\n'; emit(:escape, :newline, text, ts-1, te)
344
313
  when '\r'; emit(:escape, :carriage, text, ts-1, te)
345
- when '\s'; emit(:escape, :space, text, ts-1, te)
346
314
  when '\t'; emit(:escape, :tab, text, ts-1, te)
347
315
  when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
348
316
  end
@@ -364,20 +332,10 @@
364
332
  fret;
365
333
  };
366
334
 
367
- wide_hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
368
- emit(:escape, :hex_wide, *text(data, ts, te, 1))
369
- fret;
370
- };
371
-
372
335
  hex_sequence_err @invalid_sequence_error {
373
336
  fret;
374
337
  };
375
338
 
376
- (wide_hex_seq_invalid | wide_hex_seq_empty) {
377
- raise InvalidSequenceError.new("wide hex sequence")
378
- fret;
379
- };
380
-
381
339
  control_sequence >(escaped_alpha, 4) $eof(premature_end_error) {
382
340
  if data[te]
383
341
  c = data[te].chr
@@ -408,9 +366,15 @@
408
366
  fret;
409
367
  };
410
368
 
369
+ char_type_char > (escaped_alpha, 2) {
370
+ fhold;
371
+ fnext *(in_set ? fentry(character_set) : fentry(main));
372
+ fcall char_type;
373
+ };
374
+
411
375
  property_char > (escaped_alpha, 2) {
412
376
  fhold;
413
- fnext main;
377
+ fnext *(in_set ? fentry(character_set) : fentry(main));
414
378
  fcall unicode_property;
415
379
  };
416
380
 
@@ -466,7 +430,7 @@
466
430
  emit(:anchor, :eol, *text(data, ts, te))
467
431
  };
468
432
 
469
- backslash . 'K' > (backslashed, 4) {
433
+ backslash . keep_mark > (backslashed, 4) {
470
434
  emit(:keep, :mark, *text(data, ts, te))
471
435
  };
472
436
 
@@ -484,38 +448,13 @@
484
448
  end
485
449
  };
486
450
 
487
- # Character types
488
- # \d, \D digit, non-digit
489
- # \h, \H hex, non-hex
490
- # \s, \S space, non-space
491
- # \w, \W word, non-word
492
- # ------------------------------------------------------------------------
493
- backslash . char_type > (backslashed, 2) {
494
- case text = text(data, ts, te).first
495
- when '\\d'; emit(:type, :digit, text, ts, te)
496
- when '\\D'; emit(:type, :nondigit, text, ts, te)
497
- when '\\h'; emit(:type, :hex, text, ts, te)
498
- when '\\H'; emit(:type, :nonhex, text, ts, te)
499
- when '\\s'; emit(:type, :space, text, ts, te)
500
- when '\\S'; emit(:type, :nonspace, text, ts, te)
501
- when '\\w'; emit(:type, :word, text, ts, te)
502
- when '\\W'; emit(:type, :nonword, text, ts, te)
503
- when '\\R'; emit(:type, :linebreak, text, ts, te)
504
- when '\\X'; emit(:type, :xgrapheme, text, ts, te)
505
- else
506
- raise ScannerError.new(
507
- "Unexpected character in type at #{text} (char #{ts})")
508
- end
509
- };
510
-
511
-
512
451
  # Character sets
513
452
  # ------------------------------------------------------------------------
514
453
  set_open {
515
- set_depth += 1; in_set = true
516
- set_type = set_depth > 1 ? :subset : :set
454
+ set_depth += 1
455
+ in_set = true
517
456
 
518
- emit(set_type, :open, *text(data, ts, te))
457
+ emit(:set, :open, *text(data, ts, te))
519
458
  fcall character_set;
520
459
  };
521
460
 
@@ -645,57 +584,57 @@
645
584
 
646
585
  when /^\\([gk])<[^\d-](\w+)?>/ # angle-brackets
647
586
  if $1 == 'k'
648
- emit(:backref, :name_ref_ab, text, ts, te)
587
+ emit(:backref, :name_ref_ab, text, ts, te)
649
588
  else
650
- emit(:backref, :name_call_ab, text, ts, te)
589
+ emit(:backref, :name_call_ab, text, ts, te)
651
590
  end
652
591
 
653
592
  when /^\\([gk])'[^\d-](\w+)?'/ #single quotes
654
593
  if $1 == 'k'
655
- emit(:backref, :name_ref_sq, text, ts, te)
594
+ emit(:backref, :name_ref_sq, text, ts, te)
656
595
  else
657
- emit(:backref, :name_call_sq, text, ts, te)
596
+ emit(:backref, :name_call_sq, text, ts, te)
658
597
  end
659
598
 
660
599
  when /^\\([gk])<\d+>/ # angle-brackets
661
600
  if $1 == 'k'
662
- emit(:backref, :number_ref_ab, text, ts, te)
601
+ emit(:backref, :number_ref_ab, text, ts, te)
663
602
  else
664
- emit(:backref, :number_call_ab, text, ts, te)
603
+ emit(:backref, :number_call_ab, text, ts, te)
665
604
  end
666
605
 
667
606
  when /^\\([gk])'\d+'/ # single quotes
668
607
  if $1 == 'k'
669
- emit(:backref, :number_ref_sq, text, ts, te)
608
+ emit(:backref, :number_ref_sq, text, ts, te)
670
609
  else
671
- emit(:backref, :number_call_sq, text, ts, te)
610
+ emit(:backref, :number_call_sq, text, ts, te)
672
611
  end
673
612
 
674
613
  when /^\\([gk])<-\d+>/ # angle-brackets
675
614
  if $1 == 'k'
676
- emit(:backref, :number_rel_ref_ab, text, ts, te)
615
+ emit(:backref, :number_rel_ref_ab, text, ts, te)
677
616
  else
678
- emit(:backref, :number_rel_call_ab, text, ts, te)
617
+ emit(:backref, :number_rel_call_ab, text, ts, te)
679
618
  end
680
619
 
681
620
  when /^\\([gk])'-\d+'/ # single quotes
682
621
  if $1 == 'k'
683
- emit(:backref, :number_rel_ref_sq, text, ts, te)
622
+ emit(:backref, :number_rel_ref_sq, text, ts, te)
684
623
  else
685
- emit(:backref, :number_rel_call_sq, text, ts, te)
624
+ emit(:backref, :number_rel_call_sq, text, ts, te)
686
625
  end
687
626
 
688
627
  when /^\\k<[^\d-](\w+)?[+\-]\d+>/ # angle-brackets
689
- emit(:backref, :name_nest_ref_ab, text, ts, te)
628
+ emit(:backref, :name_recursion_ref_ab, text, ts, te)
690
629
 
691
630
  when /^\\k'[^\d-](\w+)?[+\-]\d+'/ # single-quotes
692
- emit(:backref, :name_nest_ref_sq, text, ts, te)
631
+ emit(:backref, :name_recursion_ref_sq, text, ts, te)
693
632
 
694
- when /^\\([gk])<\d+[+\-]\d+>/ # angle-brackets
695
- emit(:backref, :number_nest_ref_ab, text, ts, te)
633
+ when /^\\([gk])<-?\d+[+\-]\d+>/ # angle-brackets
634
+ emit(:backref, :number_recursion_ref_ab, text, ts, te)
696
635
 
697
- when /^\\([gk])'\d+[+\-]\d+'/ # single-quotes
698
- emit(:backref, :number_nest_ref_sq, text, ts, te)
636
+ when /^\\([gk])'-?\d+[+\-]\d+'/ # single-quotes
637
+ emit(:backref, :number_recursion_ref_sq, text, ts, te)
699
638
 
700
639
  else
701
640
  raise ScannerError.new(
@@ -859,8 +798,11 @@ class Regexp::Scanner
859
798
  self.group_depth = 0
860
799
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
861
800
 
862
- in_set, set_depth, set_type = false, 0, :set
863
- in_conditional, conditional_depth, conditional_stack = false, 0, []
801
+ in_set = false
802
+ set_depth = 0
803
+ in_conditional = false
804
+ conditional_depth = 0
805
+ conditional_stack = []
864
806
 
865
807
  %% write data;
866
808
  %% write init;
@@ -882,6 +824,18 @@ class Regexp::Scanner
882
824
  tokens
883
825
  end
884
826
 
827
+ # lazy-load property maps when first needed
828
+ require 'yaml'
829
+ PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
830
+
831
+ def self.short_prop_map
832
+ @short_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/short.yml")
833
+ end
834
+
835
+ def self.long_prop_map
836
+ @long_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/long.yml")
837
+ end
838
+
885
839
  # Emits an array with the details of the scanned pattern
886
840
  def emit(type, token, text, ts, te)
887
841
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
@@ -986,6 +940,8 @@ class Regexp::Scanner
986
940
  end
987
941
 
988
942
  def emit_options(text, ts, te)
943
+ token = nil
944
+
989
945
  if text =~ /\(\?([mixdau]*)-?([mix]*)(:)?/
990
946
  positive, negative, group_local = $1, $2, $3
991
947
 
@@ -1001,13 +957,15 @@ class Regexp::Scanner
1001
957
 
1002
958
  if group_local
1003
959
  spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
960
+ token = :options
1004
961
  else
1005
962
  # switch for parent group level
1006
963
  spacing_stack.last[:free_spacing] = free_spacing
964
+ token = :options_switch
1007
965
  end
1008
966
  end
1009
967
 
1010
- emit(:group, :options, text, ts, te)
968
+ emit(:group, token, text, ts, te)
1011
969
  end
1012
970
 
1013
971
  # Centralizes and unifies the handling of validation related