regexp_parser 1.5.0 → 1.5.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (71) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +14 -0
  3. data/lib/regexp_parser/expression.rb +6 -43
  4. data/lib/regexp_parser/expression/classes/conditional.rb +3 -2
  5. data/lib/regexp_parser/expression/classes/escape.rb +0 -4
  6. data/lib/regexp_parser/expression/methods/match.rb +13 -0
  7. data/lib/regexp_parser/expression/methods/options.rb +35 -0
  8. data/lib/regexp_parser/expression/methods/strfregexp.rb +0 -1
  9. data/lib/regexp_parser/expression/methods/tests.rb +6 -15
  10. data/lib/regexp_parser/expression/sequence.rb +3 -2
  11. data/lib/regexp_parser/expression/sequence_operation.rb +2 -6
  12. data/lib/regexp_parser/lexer.rb +0 -21
  13. data/lib/regexp_parser/parser.rb +22 -21
  14. data/lib/regexp_parser/scanner.rb +1159 -1329
  15. data/lib/regexp_parser/scanner/char_type.rl +0 -3
  16. data/lib/regexp_parser/scanner/scanner.rl +82 -190
  17. data/lib/regexp_parser/version.rb +1 -1
  18. data/spec/expression/base_spec.rb +14 -0
  19. data/spec/expression/methods/match_length_spec.rb +13 -0
  20. data/spec/expression/methods/match_spec.rb +25 -0
  21. data/spec/expression/methods/tests_spec.rb +2 -0
  22. data/spec/expression/options_spec.rb +128 -0
  23. data/spec/expression/root_spec.rb +9 -0
  24. data/spec/expression/sequence_spec.rb +9 -0
  25. data/spec/lexer/conditionals_spec.rb +49 -119
  26. data/spec/lexer/escapes_spec.rb +8 -32
  27. data/spec/lexer/keep_spec.rb +5 -17
  28. data/spec/lexer/literals_spec.rb +73 -110
  29. data/spec/lexer/nesting_spec.rb +86 -117
  30. data/spec/lexer/refcalls_spec.rb +51 -50
  31. data/spec/parser/all_spec.rb +13 -1
  32. data/spec/parser/anchors_spec.rb +9 -23
  33. data/spec/parser/conditionals_spec.rb +9 -9
  34. data/spec/parser/errors_spec.rb +22 -43
  35. data/spec/parser/escapes_spec.rb +33 -44
  36. data/spec/parser/groups_spec.rb +98 -257
  37. data/spec/parser/keep_spec.rb +2 -15
  38. data/spec/parser/posix_classes_spec.rb +5 -24
  39. data/spec/parser/properties_spec.rb +42 -54
  40. data/spec/parser/quantifiers_spec.rb +41 -283
  41. data/spec/parser/refcalls_spec.rb +60 -185
  42. data/spec/parser/set/intersections_spec.rb +17 -17
  43. data/spec/parser/set/ranges_spec.rb +17 -17
  44. data/spec/parser/sets_spec.rb +5 -5
  45. data/spec/parser/types_spec.rb +11 -36
  46. data/spec/scanner/anchors_spec.rb +13 -28
  47. data/spec/scanner/conditionals_spec.rb +121 -173
  48. data/spec/scanner/errors_spec.rb +65 -87
  49. data/spec/scanner/escapes_spec.rb +49 -50
  50. data/spec/scanner/free_space_spec.rb +102 -165
  51. data/spec/scanner/groups_spec.rb +45 -64
  52. data/spec/scanner/keep_spec.rb +5 -28
  53. data/spec/scanner/literals_spec.rb +45 -81
  54. data/spec/scanner/meta_spec.rb +13 -33
  55. data/spec/scanner/properties_spec.rb +43 -286
  56. data/spec/scanner/quantifiers_spec.rb +13 -28
  57. data/spec/scanner/refcalls_spec.rb +32 -48
  58. data/spec/scanner/sets_spec.rb +88 -102
  59. data/spec/scanner/types_spec.rb +10 -25
  60. data/spec/spec_helper.rb +1 -0
  61. data/spec/support/shared_examples.rb +77 -0
  62. data/spec/syntax/syntax_spec.rb +4 -0
  63. data/spec/syntax/versions/1.8.6_spec.rb +12 -33
  64. data/spec/syntax/versions/1.9.1_spec.rb +5 -18
  65. data/spec/syntax/versions/1.9.3_spec.rb +4 -17
  66. data/spec/syntax/versions/2.0.0_spec.rb +8 -23
  67. data/spec/syntax/versions/2.2.0_spec.rb +4 -17
  68. data/spec/syntax/versions/aliases_spec.rb +25 -109
  69. metadata +14 -6
  70. data/spec/scanner/scripts_spec.rb +0 -49
  71. data/spec/scanner/unicode_blocks_spec.rb +0 -28
@@ -21,9 +21,6 @@
21
21
  when '\W'; emit(:type, :nonword, text, ts - 1, te)
22
22
  when '\R'; emit(:type, :linebreak, text, ts - 1, te)
23
23
  when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
24
- else
25
- raise ScannerError.new(
26
- "Unexpected character in type at #{text} (char #{ts})")
27
24
  end
28
25
  fret;
29
26
  };
@@ -49,9 +49,9 @@
49
49
  codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
50
50
  codepoint_sequence = codepoint_single | codepoint_list;
51
51
 
52
- control_sequence = ('c' | 'C-') . (backslash . 'M-')?;
52
+ control_sequence = ('c' | 'C-') . (backslash . 'M-')? . backslash? . any;
53
53
 
54
- meta_sequence = 'M-' . (backslash . control_sequence)?;
54
+ meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
55
55
 
56
56
  zero_or_one = '?' | '??' | '?+';
57
57
  zero_or_more = '*' | '*?' | '*+';
@@ -82,7 +82,8 @@
82
82
  assertion_lookbehind = '?<=';
83
83
  assertion_nlookbehind = '?<!';
84
84
 
85
- group_options = '?' . [\-mixdau];
85
+ # try to treat every other group head as options group, like Ruby
86
+ group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
86
87
 
87
88
  group_ref = [gk];
88
89
  group_name_char = (alnum | '_');
@@ -135,41 +136,35 @@
135
136
  # Invalid sequence error, used from sequences, like escapes and sets
136
137
  action invalid_sequence_error {
137
138
  text = ts ? copy(data, ts-1..-1) : data.pack('c*')
138
- raise InvalidSequenceError.new('sequence', text)
139
+ validation_error(:sequence, 'sequence', text)
139
140
  }
140
141
 
141
142
  # group (nesting) and set open/close actions
142
- action group_opened { self.group_depth = group_depth + 1; in_group = true }
143
- action group_closed { self.group_depth = group_depth - 1; in_group = group_depth > 0 ? true : false }
143
+ action group_opened { self.group_depth = group_depth + 1 }
144
+ action group_closed { self.group_depth = group_depth - 1 }
145
+ action set_opened { self.set_depth = set_depth + 1 }
146
+ action set_closed { self.set_depth = set_depth - 1 }
144
147
 
145
148
  # Character set scanner, continues consuming characters until it meets the
146
149
  # closing bracket of the set.
147
150
  # --------------------------------------------------------------------------
148
151
  character_set := |*
149
- set_close > (set_meta, 2) {
150
- set_depth -= 1
151
- in_set = set_depth > 0 ? true : false
152
-
152
+ set_close > (set_meta, 2) @set_closed {
153
153
  emit(:set, :close, *text(data, ts, te))
154
-
155
- if set_depth == 0
156
- fgoto main;
157
- else
154
+ if in_set?
158
155
  fret;
156
+ else
157
+ fgoto main;
159
158
  end
160
159
  };
161
160
 
162
- '-]' { # special case, emits two tokens
163
- set_depth -= 1
164
- in_set = set_depth > 0 ? true : false
165
-
166
- emit(:literal, :literal, copy(data, ts..te-2), ts, te)
167
- emit(:set, :close, copy(data, ts+1..te-1), ts, te)
168
-
169
- if set_depth == 0
170
- fgoto main;
171
- else
161
+ '-]' @set_closed { # special case, emits two tokens
162
+ emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
163
+ emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
164
+ if in_set?
172
165
  fret;
166
+ else
167
+ fgoto main;
173
168
  end
174
169
  };
175
170
 
@@ -207,14 +202,12 @@
207
202
  fcall set_escape_sequence;
208
203
  };
209
204
 
210
- set_open >(open_bracket, 1) {
211
- set_depth += 1
212
-
205
+ set_open >(open_bracket, 1) >set_opened {
213
206
  emit(:set, :open, *text(data, ts, te))
214
207
  fcall character_set;
215
208
  };
216
209
 
217
- class_posix >(open_bracket, 1) @eof(premature_end_error) {
210
+ class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
218
211
  text = text(data, ts, te).first
219
212
 
220
213
  type = :posixclass
@@ -227,11 +220,11 @@
227
220
  emit(type, class_name.to_sym, text, ts, te)
228
221
  };
229
222
 
230
- collating_sequence >(open_bracket, 1) @eof(premature_end_error) {
223
+ collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
231
224
  emit(:set, :collation, *text(data, ts, te))
232
225
  };
233
226
 
234
- character_equivalent >(open_bracket, 1) @eof(premature_end_error) {
227
+ character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
235
228
  emit(:set, :equivalent, *text(data, ts, te))
236
229
  };
237
230
 
@@ -337,44 +330,24 @@
337
330
  };
338
331
 
339
332
  control_sequence >(escaped_alpha, 4) $eof(premature_end_error) {
340
- if data[te]
341
- c = data[te].chr
342
- if c =~ /[\x00-\x7F]/
343
- emit(:escape, :control, copy(data, ts-1..te), ts-1, te+1)
344
- p += 1
345
- else
346
- raise InvalidSequenceError.new("control sequence")
347
- end
348
- else
349
- raise PrematureEndError.new("control sequence")
350
- end
333
+ emit_meta_control_sequence(data, ts, te, :control)
351
334
  fret;
352
335
  };
353
336
 
354
337
  meta_sequence >(backslashed, 3) $eof(premature_end_error) {
355
- if data[te]
356
- c = data[te].chr
357
- if c =~ /[\x00-\x7F]/
358
- emit(:escape, :meta_sequence, copy(data, ts-1..te), ts-1, te+1)
359
- p += 1
360
- else
361
- raise InvalidSequenceError.new("meta sequence")
362
- end
363
- else
364
- raise PrematureEndError.new("meta sequence")
365
- end
338
+ emit_meta_control_sequence(data, ts, te, :meta_sequence)
366
339
  fret;
367
340
  };
368
341
 
369
342
  char_type_char > (escaped_alpha, 2) {
370
343
  fhold;
371
- fnext *(in_set ? fentry(character_set) : fentry(main));
344
+ fnext *(in_set? ? fentry(character_set) : fentry(main));
372
345
  fcall char_type;
373
346
  };
374
347
 
375
348
  property_char > (escaped_alpha, 2) {
376
349
  fhold;
377
- fnext *(in_set ? fentry(character_set) : fentry(main));
350
+ fnext *(in_set? ? fentry(character_set) : fentry(main));
378
351
  fcall unicode_property;
379
352
  };
380
353
 
@@ -412,8 +385,7 @@
412
385
  };
413
386
 
414
387
  alternation {
415
- if in_conditional and conditional_stack.length > 0 and
416
- conditional_stack.last[1] == group_depth
388
+ if conditional_stack.last == group_depth
417
389
  emit(:conditional, :separator, *text(data, ts, te))
418
390
  else
419
391
  emit(:meta, :alternation, *text(data, ts, te))
@@ -442,18 +414,12 @@
442
414
  when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
443
415
  when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
444
416
  when '\\G'; emit(:anchor, :match_start, text, ts, te)
445
- else
446
- raise ScannerError.new(
447
- "Unexpected character in anchor at #{text} (char #{ts})")
448
417
  end
449
418
  };
450
419
 
451
420
  # Character sets
452
421
  # ------------------------------------------------------------------------
453
- set_open {
454
- set_depth += 1
455
- in_set = true
456
-
422
+ set_open >set_opened {
457
423
  emit(:set, :open, *text(data, ts, te))
458
424
  fcall character_set;
459
425
  };
@@ -465,9 +431,7 @@
465
431
  conditional {
466
432
  text = text(data, ts, te).first
467
433
 
468
- in_conditional = true unless in_conditional
469
- conditional_depth += 1
470
- conditional_stack << [conditional_depth, group_depth]
434
+ conditional_stack << group_depth
471
435
 
472
436
  emit(:conditional, :open, text[0..-2], ts, te-1)
473
437
  emit(:conditional, :condition_open, '(', te-1, te)
@@ -496,7 +460,11 @@
496
460
  # (?imxdau-imx:subexp) option on/off for subexp
497
461
  # ------------------------------------------------------------------------
498
462
  group_open . group_options >group_opened {
499
- p = scan_options(p, data, ts, te)
463
+ text = text(data, ts, te).first
464
+ if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
465
+ raise InvalidGroupOption.new($1 || "-#{$2}", text)
466
+ end
467
+ emit_options(text, ts, te)
500
468
  };
501
469
 
502
470
  # Assertions
@@ -528,19 +496,15 @@
528
496
  when '(?>'; emit(:group, :atomic, text, ts, te)
529
497
  when '(?~'; emit(:group, :absence, text, ts, te)
530
498
 
531
- when /^\(\?<(\w*)>/
532
- empty_name_error(:group, 'named group (ab)') if $1.empty?
499
+ when /^\(\?(?:<>|'')/
500
+ validation_error(:group, 'named group', 'name is empty')
533
501
 
502
+ when /^\(\?<\w*>/
534
503
  emit(:group, :named_ab, text, ts, te)
535
504
 
536
- when /^\(\?'(\w*)'/
537
- empty_name_error(:group, 'named group (sq)') if $1.empty?
538
-
505
+ when /^\(\?'\w*'/
539
506
  emit(:group, :named_sq, text, ts, te)
540
507
 
541
- else
542
- raise ScannerError.new(
543
- "Unknown subexpression group format '#{text}'")
544
508
  end
545
509
  };
546
510
 
@@ -550,20 +514,13 @@
550
514
  };
551
515
 
552
516
  group_close @group_closed {
553
- if in_conditional and conditional_stack.last and
554
- conditional_stack.last[1] == (group_depth + 1)
555
-
556
- emit(:conditional, :close, *text(data, ts, te))
517
+ if conditional_stack.last == group_depth + 1
557
518
  conditional_stack.pop
558
-
559
- if conditional_stack.length == 0
560
- in_conditional = false
561
- end
519
+ emit(:conditional, :close, *text(data, ts, te))
562
520
  else
563
- if spacing_stack.length > 1 and
564
- spacing_stack.last[:depth] == (group_depth + 1)
521
+ if spacing_stack.length > 1 &&
522
+ spacing_stack.last[:depth] == group_depth + 1
565
523
  spacing_stack.pop
566
-
567
524
  self.free_spacing = spacing_stack.last[:free_spacing]
568
525
  end
569
526
 
@@ -576,11 +533,8 @@
576
533
  # ------------------------------------------------------------------------
577
534
  backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
578
535
  case text = text(data, ts, te).first
579
- when /^\\([gk])<>/ # angle brackets
580
- empty_backref_error("ref/call (ab)")
581
-
582
- when /^\\([gk])''/ # single quotes
583
- empty_backref_error("ref/call (sq)")
536
+ when /^\\([gk])(<>|'')/ # angle brackets
537
+ validation_error(:backref, 'ref/call', 'ref ID is empty')
584
538
 
585
539
  when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
586
540
  if $1 == 'k'
@@ -636,9 +590,6 @@
636
590
  when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
637
591
  emit(:backref, :number_recursion_ref_sq, text, ts, te)
638
592
 
639
- else
640
- raise ScannerError.new(
641
- "Unknown backreference format '#{text}'")
642
593
  end
643
594
  };
644
595
 
@@ -786,7 +737,7 @@ class Regexp::Scanner
786
737
  input = input_object
787
738
  self.free_spacing = false
788
739
  end
789
-
740
+ self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
790
741
 
791
742
  data = input.unpack("c*") if input.is_a?(String)
792
743
  eof = data.length
@@ -794,15 +745,9 @@ class Regexp::Scanner
794
745
  self.tokens = []
795
746
  self.block = block_given? ? block : nil
796
747
 
797
- self.in_group = false
748
+ self.set_depth = 0
798
749
  self.group_depth = 0
799
- self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
800
-
801
- in_set = false
802
- set_depth = 0
803
- in_conditional = false
804
- conditional_depth = 0
805
- conditional_stack = []
750
+ self.conditional_stack = []
806
751
 
807
752
  %% write data;
808
753
  %% write init;
@@ -817,9 +762,9 @@ class Regexp::Scanner
817
762
  end
818
763
 
819
764
  raise PrematureEndError.new("(missing group closing paranthesis) "+
820
- "[#{in_group}:#{group_depth}]") if in_group
765
+ "[#{group_depth}]") if in_group?
821
766
  raise PrematureEndError.new("(missing set closing bracket) "+
822
- "[#{in_set}:#{set_depth}]") if in_set
767
+ "[#{set_depth}]") if in_set?
823
768
 
824
769
  # when the entire expression is a literal run
825
770
  emit_literal if literal
@@ -854,62 +799,15 @@ class Regexp::Scanner
854
799
 
855
800
  private
856
801
 
857
- attr_accessor :tokens, :literal, :block,
858
- :in_group, :group_depth,
859
- :free_spacing, :spacing_stack
860
-
861
- # Ragel's regex-based scan of the group options introduced a lot of
862
- # ambiguity, so we just ask it to find the beginning of what looks
863
- # like an options run and handle the rest in here.
864
- def scan_options(p, data, ts, te)
865
- text = text(data, ts, te).first
866
-
867
- options_char, options_length = true, 0
868
-
869
- # Copy while we have option characters. There is no maximum length,
870
- # as ruby allows things like '(?xxxxxxxxx-xxxxxxxxxxxxx:abc)'.
871
- negative_options = false
872
- while options_char
873
- if data[te + options_length]
874
- c = data[te + options_length].chr
875
-
876
- if c =~ /[-mixdau]/
877
- negative_options = true if c == '-'
802
+ attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
803
+ :group_depth, :set_depth, :conditional_stack
878
804
 
879
- raise InvalidGroupOption.new(c, text) if negative_options and
880
- c =~ /[dau]/
881
-
882
- text << c ; p += 1 ; options_length += 1
883
- else
884
- options_char = false
885
- end
886
- else
887
- raise PrematureEndError.new("expression options `#{text}'")
888
- end
889
- end
890
-
891
- if data[te + options_length]
892
- c = data[te + options_length].chr
893
-
894
- if c == ':'
895
- # Include the ':' in the options text
896
- text << c ; p += 1 ; options_length += 1
897
- emit_options(text, ts, te + options_length)
898
-
899
- elsif c == ')'
900
- # Don't include the closing ')', let group_close handle it.
901
- emit_options(text, ts, te + options_length)
902
-
903
- else
904
- # Plain Regexp reports this as 'undefined group option'
905
- raise ScannerError.new(
906
- "Unexpected `#{c}' in options sequence, ':' or ')' expected")
907
- end
908
- else
909
- raise PrematureEndError.new("expression options `#{text}'")
910
- end
805
+ def in_group?
806
+ group_depth > 0
807
+ end
911
808
 
912
- p # return the new value of the data pointer
809
+ def in_set?
810
+ set_depth > 0
913
811
  end
914
812
 
915
813
  # Copy from ts to te from data as text
@@ -945,32 +843,39 @@ class Regexp::Scanner
945
843
  def emit_options(text, ts, te)
946
844
  token = nil
947
845
 
948
- if text =~ /\(\?([mixdau]*)-?([mix]*)(:)?/
949
- positive, negative, group_local = $1, $2, $3
846
+ # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
847
+ text =~ /\(\?([mixdau]*)(-(?:[mix]*))*(:)?/
848
+ positive, negative, group_local = $1, $2, $3
950
849
 
951
- if positive.include?('x')
952
- self.free_spacing = true
953
- end
850
+ if positive.include?('x')
851
+ self.free_spacing = true
852
+ end
954
853
 
955
- # If the x appears in both, treat it like ruby does, the second cancels
956
- # the first.
957
- if negative.include?('x')
958
- self.free_spacing = false
959
- end
854
+ # If the x appears in both, treat it like ruby does, the second cancels
855
+ # the first.
856
+ if negative && negative.include?('x')
857
+ self.free_spacing = false
858
+ end
960
859
 
961
- if group_local
962
- spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
963
- token = :options
964
- else
965
- # switch for parent group level
966
- spacing_stack.last[:free_spacing] = free_spacing
967
- token = :options_switch
968
- end
860
+ if group_local
861
+ spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
862
+ token = :options
863
+ else
864
+ # switch for parent group level
865
+ spacing_stack.last[:free_spacing] = free_spacing
866
+ token = :options_switch
969
867
  end
970
868
 
971
869
  emit(:group, token, text, ts, te)
972
870
  end
973
871
 
872
+ def emit_meta_control_sequence(data, ts, te, token)
873
+ if data.last < 0x00 || data.last > 0x7F
874
+ validation_error(:sequence, 'escape', token.to_s)
875
+ end
876
+ emit(:escape, token, *text(data, ts, te, 1))
877
+ end
878
+
974
879
  # Centralizes and unifies the handling of validation related
975
880
  # errors.
976
881
  def validation_error(type, what, reason)
@@ -981,21 +886,8 @@ class Regexp::Scanner
981
886
  error = InvalidBackrefError.new(what, reason)
982
887
  when :sequence
983
888
  error = InvalidSequenceError.new(what, reason)
984
- else
985
- error = ValidationError.new('expression')
986
889
  end
987
890
 
988
891
  raise error # unless @@config.validation_ignore
989
892
  end
990
-
991
- # Used for references with an empty name or number
992
- def empty_backref_error(type, what)
993
- validation_error(:backref, what, 'ref ID is empty')
994
- end
995
-
996
- # Used for named expressions with an empty name
997
- def empty_name_error(type, what)
998
- validation_error(type, what, 'name is empty')
999
- end
1000
-
1001
893
  end # module Regexp::Scanner