regexp_parser 1.5.0 → 1.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71):
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +14 -0
  3. data/lib/regexp_parser/expression.rb +6 -43
  4. data/lib/regexp_parser/expression/classes/conditional.rb +3 -2
  5. data/lib/regexp_parser/expression/classes/escape.rb +0 -4
  6. data/lib/regexp_parser/expression/methods/match.rb +13 -0
  7. data/lib/regexp_parser/expression/methods/options.rb +35 -0
  8. data/lib/regexp_parser/expression/methods/strfregexp.rb +0 -1
  9. data/lib/regexp_parser/expression/methods/tests.rb +6 -15
  10. data/lib/regexp_parser/expression/sequence.rb +3 -2
  11. data/lib/regexp_parser/expression/sequence_operation.rb +2 -6
  12. data/lib/regexp_parser/lexer.rb +0 -21
  13. data/lib/regexp_parser/parser.rb +22 -21
  14. data/lib/regexp_parser/scanner.rb +1159 -1329
  15. data/lib/regexp_parser/scanner/char_type.rl +0 -3
  16. data/lib/regexp_parser/scanner/scanner.rl +82 -190
  17. data/lib/regexp_parser/version.rb +1 -1
  18. data/spec/expression/base_spec.rb +14 -0
  19. data/spec/expression/methods/match_length_spec.rb +13 -0
  20. data/spec/expression/methods/match_spec.rb +25 -0
  21. data/spec/expression/methods/tests_spec.rb +2 -0
  22. data/spec/expression/options_spec.rb +128 -0
  23. data/spec/expression/root_spec.rb +9 -0
  24. data/spec/expression/sequence_spec.rb +9 -0
  25. data/spec/lexer/conditionals_spec.rb +49 -119
  26. data/spec/lexer/escapes_spec.rb +8 -32
  27. data/spec/lexer/keep_spec.rb +5 -17
  28. data/spec/lexer/literals_spec.rb +73 -110
  29. data/spec/lexer/nesting_spec.rb +86 -117
  30. data/spec/lexer/refcalls_spec.rb +51 -50
  31. data/spec/parser/all_spec.rb +13 -1
  32. data/spec/parser/anchors_spec.rb +9 -23
  33. data/spec/parser/conditionals_spec.rb +9 -9
  34. data/spec/parser/errors_spec.rb +22 -43
  35. data/spec/parser/escapes_spec.rb +33 -44
  36. data/spec/parser/groups_spec.rb +98 -257
  37. data/spec/parser/keep_spec.rb +2 -15
  38. data/spec/parser/posix_classes_spec.rb +5 -24
  39. data/spec/parser/properties_spec.rb +42 -54
  40. data/spec/parser/quantifiers_spec.rb +41 -283
  41. data/spec/parser/refcalls_spec.rb +60 -185
  42. data/spec/parser/set/intersections_spec.rb +17 -17
  43. data/spec/parser/set/ranges_spec.rb +17 -17
  44. data/spec/parser/sets_spec.rb +5 -5
  45. data/spec/parser/types_spec.rb +11 -36
  46. data/spec/scanner/anchors_spec.rb +13 -28
  47. data/spec/scanner/conditionals_spec.rb +121 -173
  48. data/spec/scanner/errors_spec.rb +65 -87
  49. data/spec/scanner/escapes_spec.rb +49 -50
  50. data/spec/scanner/free_space_spec.rb +102 -165
  51. data/spec/scanner/groups_spec.rb +45 -64
  52. data/spec/scanner/keep_spec.rb +5 -28
  53. data/spec/scanner/literals_spec.rb +45 -81
  54. data/spec/scanner/meta_spec.rb +13 -33
  55. data/spec/scanner/properties_spec.rb +43 -286
  56. data/spec/scanner/quantifiers_spec.rb +13 -28
  57. data/spec/scanner/refcalls_spec.rb +32 -48
  58. data/spec/scanner/sets_spec.rb +88 -102
  59. data/spec/scanner/types_spec.rb +10 -25
  60. data/spec/spec_helper.rb +1 -0
  61. data/spec/support/shared_examples.rb +77 -0
  62. data/spec/syntax/syntax_spec.rb +4 -0
  63. data/spec/syntax/versions/1.8.6_spec.rb +12 -33
  64. data/spec/syntax/versions/1.9.1_spec.rb +5 -18
  65. data/spec/syntax/versions/1.9.3_spec.rb +4 -17
  66. data/spec/syntax/versions/2.0.0_spec.rb +8 -23
  67. data/spec/syntax/versions/2.2.0_spec.rb +4 -17
  68. data/spec/syntax/versions/aliases_spec.rb +25 -109
  69. metadata +14 -6
  70. data/spec/scanner/scripts_spec.rb +0 -49
  71. data/spec/scanner/unicode_blocks_spec.rb +0 -28
@@ -21,9 +21,6 @@
21
21
  when '\W'; emit(:type, :nonword, text, ts - 1, te)
22
22
  when '\R'; emit(:type, :linebreak, text, ts - 1, te)
23
23
  when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
24
- else
25
- raise ScannerError.new(
26
- "Unexpected character in type at #{text} (char #{ts})")
27
24
  end
28
25
  fret;
29
26
  };
@@ -49,9 +49,9 @@
49
49
  codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
50
50
  codepoint_sequence = codepoint_single | codepoint_list;
51
51
 
52
- control_sequence = ('c' | 'C-') . (backslash . 'M-')?;
52
+ control_sequence = ('c' | 'C-') . (backslash . 'M-')? . backslash? . any;
53
53
 
54
- meta_sequence = 'M-' . (backslash . control_sequence)?;
54
+ meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
55
55
 
56
56
  zero_or_one = '?' | '??' | '?+';
57
57
  zero_or_more = '*' | '*?' | '*+';
@@ -82,7 +82,8 @@
82
82
  assertion_lookbehind = '?<=';
83
83
  assertion_nlookbehind = '?<!';
84
84
 
85
- group_options = '?' . [\-mixdau];
85
+ # try to treat every other group head as options group, like Ruby
86
+ group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
86
87
 
87
88
  group_ref = [gk];
88
89
  group_name_char = (alnum | '_');
@@ -135,41 +136,35 @@
135
136
  # Invalid sequence error, used from sequences, like escapes and sets
136
137
  action invalid_sequence_error {
137
138
  text = ts ? copy(data, ts-1..-1) : data.pack('c*')
138
- raise InvalidSequenceError.new('sequence', text)
139
+ validation_error(:sequence, 'sequence', text)
139
140
  }
140
141
 
141
142
  # group (nesting) and set open/close actions
142
- action group_opened { self.group_depth = group_depth + 1; in_group = true }
143
- action group_closed { self.group_depth = group_depth - 1; in_group = group_depth > 0 ? true : false }
143
+ action group_opened { self.group_depth = group_depth + 1 }
144
+ action group_closed { self.group_depth = group_depth - 1 }
145
+ action set_opened { self.set_depth = set_depth + 1 }
146
+ action set_closed { self.set_depth = set_depth - 1 }
144
147
 
145
148
  # Character set scanner, continues consuming characters until it meets the
146
149
  # closing bracket of the set.
147
150
  # --------------------------------------------------------------------------
148
151
  character_set := |*
149
- set_close > (set_meta, 2) {
150
- set_depth -= 1
151
- in_set = set_depth > 0 ? true : false
152
-
152
+ set_close > (set_meta, 2) @set_closed {
153
153
  emit(:set, :close, *text(data, ts, te))
154
-
155
- if set_depth == 0
156
- fgoto main;
157
- else
154
+ if in_set?
158
155
  fret;
156
+ else
157
+ fgoto main;
159
158
  end
160
159
  };
161
160
 
162
- '-]' { # special case, emits two tokens
163
- set_depth -= 1
164
- in_set = set_depth > 0 ? true : false
165
-
166
- emit(:literal, :literal, copy(data, ts..te-2), ts, te)
167
- emit(:set, :close, copy(data, ts+1..te-1), ts, te)
168
-
169
- if set_depth == 0
170
- fgoto main;
171
- else
161
+ '-]' @set_closed { # special case, emits two tokens
162
+ emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
163
+ emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
164
+ if in_set?
172
165
  fret;
166
+ else
167
+ fgoto main;
173
168
  end
174
169
  };
175
170
 
@@ -207,14 +202,12 @@
207
202
  fcall set_escape_sequence;
208
203
  };
209
204
 
210
- set_open >(open_bracket, 1) {
211
- set_depth += 1
212
-
205
+ set_open >(open_bracket, 1) >set_opened {
213
206
  emit(:set, :open, *text(data, ts, te))
214
207
  fcall character_set;
215
208
  };
216
209
 
217
- class_posix >(open_bracket, 1) @eof(premature_end_error) {
210
+ class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
218
211
  text = text(data, ts, te).first
219
212
 
220
213
  type = :posixclass
@@ -227,11 +220,11 @@
227
220
  emit(type, class_name.to_sym, text, ts, te)
228
221
  };
229
222
 
230
- collating_sequence >(open_bracket, 1) @eof(premature_end_error) {
223
+ collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
231
224
  emit(:set, :collation, *text(data, ts, te))
232
225
  };
233
226
 
234
- character_equivalent >(open_bracket, 1) @eof(premature_end_error) {
227
+ character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
235
228
  emit(:set, :equivalent, *text(data, ts, te))
236
229
  };
237
230
 
@@ -337,44 +330,24 @@
337
330
  };
338
331
 
339
332
  control_sequence >(escaped_alpha, 4) $eof(premature_end_error) {
340
- if data[te]
341
- c = data[te].chr
342
- if c =~ /[\x00-\x7F]/
343
- emit(:escape, :control, copy(data, ts-1..te), ts-1, te+1)
344
- p += 1
345
- else
346
- raise InvalidSequenceError.new("control sequence")
347
- end
348
- else
349
- raise PrematureEndError.new("control sequence")
350
- end
333
+ emit_meta_control_sequence(data, ts, te, :control)
351
334
  fret;
352
335
  };
353
336
 
354
337
  meta_sequence >(backslashed, 3) $eof(premature_end_error) {
355
- if data[te]
356
- c = data[te].chr
357
- if c =~ /[\x00-\x7F]/
358
- emit(:escape, :meta_sequence, copy(data, ts-1..te), ts-1, te+1)
359
- p += 1
360
- else
361
- raise InvalidSequenceError.new("meta sequence")
362
- end
363
- else
364
- raise PrematureEndError.new("meta sequence")
365
- end
338
+ emit_meta_control_sequence(data, ts, te, :meta_sequence)
366
339
  fret;
367
340
  };
368
341
 
369
342
  char_type_char > (escaped_alpha, 2) {
370
343
  fhold;
371
- fnext *(in_set ? fentry(character_set) : fentry(main));
344
+ fnext *(in_set? ? fentry(character_set) : fentry(main));
372
345
  fcall char_type;
373
346
  };
374
347
 
375
348
  property_char > (escaped_alpha, 2) {
376
349
  fhold;
377
- fnext *(in_set ? fentry(character_set) : fentry(main));
350
+ fnext *(in_set? ? fentry(character_set) : fentry(main));
378
351
  fcall unicode_property;
379
352
  };
380
353
 
@@ -412,8 +385,7 @@
412
385
  };
413
386
 
414
387
  alternation {
415
- if in_conditional and conditional_stack.length > 0 and
416
- conditional_stack.last[1] == group_depth
388
+ if conditional_stack.last == group_depth
417
389
  emit(:conditional, :separator, *text(data, ts, te))
418
390
  else
419
391
  emit(:meta, :alternation, *text(data, ts, te))
@@ -442,18 +414,12 @@
442
414
  when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
443
415
  when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
444
416
  when '\\G'; emit(:anchor, :match_start, text, ts, te)
445
- else
446
- raise ScannerError.new(
447
- "Unexpected character in anchor at #{text} (char #{ts})")
448
417
  end
449
418
  };
450
419
 
451
420
  # Character sets
452
421
  # ------------------------------------------------------------------------
453
- set_open {
454
- set_depth += 1
455
- in_set = true
456
-
422
+ set_open >set_opened {
457
423
  emit(:set, :open, *text(data, ts, te))
458
424
  fcall character_set;
459
425
  };
@@ -465,9 +431,7 @@
465
431
  conditional {
466
432
  text = text(data, ts, te).first
467
433
 
468
- in_conditional = true unless in_conditional
469
- conditional_depth += 1
470
- conditional_stack << [conditional_depth, group_depth]
434
+ conditional_stack << group_depth
471
435
 
472
436
  emit(:conditional, :open, text[0..-2], ts, te-1)
473
437
  emit(:conditional, :condition_open, '(', te-1, te)
@@ -496,7 +460,11 @@
496
460
  # (?imxdau-imx:subexp) option on/off for subexp
497
461
  # ------------------------------------------------------------------------
498
462
  group_open . group_options >group_opened {
499
- p = scan_options(p, data, ts, te)
463
+ text = text(data, ts, te).first
464
+ if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
465
+ raise InvalidGroupOption.new($1 || "-#{$2}", text)
466
+ end
467
+ emit_options(text, ts, te)
500
468
  };
501
469
 
502
470
  # Assertions
@@ -528,19 +496,15 @@
528
496
  when '(?>'; emit(:group, :atomic, text, ts, te)
529
497
  when '(?~'; emit(:group, :absence, text, ts, te)
530
498
 
531
- when /^\(\?<(\w*)>/
532
- empty_name_error(:group, 'named group (ab)') if $1.empty?
499
+ when /^\(\?(?:<>|'')/
500
+ validation_error(:group, 'named group', 'name is empty')
533
501
 
502
+ when /^\(\?<\w*>/
534
503
  emit(:group, :named_ab, text, ts, te)
535
504
 
536
- when /^\(\?'(\w*)'/
537
- empty_name_error(:group, 'named group (sq)') if $1.empty?
538
-
505
+ when /^\(\?'\w*'/
539
506
  emit(:group, :named_sq, text, ts, te)
540
507
 
541
- else
542
- raise ScannerError.new(
543
- "Unknown subexpression group format '#{text}'")
544
508
  end
545
509
  };
546
510
 
@@ -550,20 +514,13 @@
550
514
  };
551
515
 
552
516
  group_close @group_closed {
553
- if in_conditional and conditional_stack.last and
554
- conditional_stack.last[1] == (group_depth + 1)
555
-
556
- emit(:conditional, :close, *text(data, ts, te))
517
+ if conditional_stack.last == group_depth + 1
557
518
  conditional_stack.pop
558
-
559
- if conditional_stack.length == 0
560
- in_conditional = false
561
- end
519
+ emit(:conditional, :close, *text(data, ts, te))
562
520
  else
563
- if spacing_stack.length > 1 and
564
- spacing_stack.last[:depth] == (group_depth + 1)
521
+ if spacing_stack.length > 1 &&
522
+ spacing_stack.last[:depth] == group_depth + 1
565
523
  spacing_stack.pop
566
-
567
524
  self.free_spacing = spacing_stack.last[:free_spacing]
568
525
  end
569
526
 
@@ -576,11 +533,8 @@
576
533
  # ------------------------------------------------------------------------
577
534
  backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
578
535
  case text = text(data, ts, te).first
579
- when /^\\([gk])<>/ # angle brackets
580
- empty_backref_error("ref/call (ab)")
581
-
582
- when /^\\([gk])''/ # single quotes
583
- empty_backref_error("ref/call (sq)")
536
+ when /^\\([gk])(<>|'')/ # angle brackets
537
+ validation_error(:backref, 'ref/call', 'ref ID is empty')
584
538
 
585
539
  when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
586
540
  if $1 == 'k'
@@ -636,9 +590,6 @@
636
590
  when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
637
591
  emit(:backref, :number_recursion_ref_sq, text, ts, te)
638
592
 
639
- else
640
- raise ScannerError.new(
641
- "Unknown backreference format '#{text}'")
642
593
  end
643
594
  };
644
595
 
@@ -786,7 +737,7 @@ class Regexp::Scanner
786
737
  input = input_object
787
738
  self.free_spacing = false
788
739
  end
789
-
740
+ self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
790
741
 
791
742
  data = input.unpack("c*") if input.is_a?(String)
792
743
  eof = data.length
@@ -794,15 +745,9 @@ class Regexp::Scanner
794
745
  self.tokens = []
795
746
  self.block = block_given? ? block : nil
796
747
 
797
- self.in_group = false
748
+ self.set_depth = 0
798
749
  self.group_depth = 0
799
- self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
800
-
801
- in_set = false
802
- set_depth = 0
803
- in_conditional = false
804
- conditional_depth = 0
805
- conditional_stack = []
750
+ self.conditional_stack = []
806
751
 
807
752
  %% write data;
808
753
  %% write init;
@@ -817,9 +762,9 @@ class Regexp::Scanner
817
762
  end
818
763
 
819
764
  raise PrematureEndError.new("(missing group closing paranthesis) "+
820
- "[#{in_group}:#{group_depth}]") if in_group
765
+ "[#{group_depth}]") if in_group?
821
766
  raise PrematureEndError.new("(missing set closing bracket) "+
822
- "[#{in_set}:#{set_depth}]") if in_set
767
+ "[#{set_depth}]") if in_set?
823
768
 
824
769
  # when the entire expression is a literal run
825
770
  emit_literal if literal
@@ -854,62 +799,15 @@ class Regexp::Scanner
854
799
 
855
800
  private
856
801
 
857
- attr_accessor :tokens, :literal, :block,
858
- :in_group, :group_depth,
859
- :free_spacing, :spacing_stack
860
-
861
- # Ragel's regex-based scan of the group options introduced a lot of
862
- # ambiguity, so we just ask it to find the beginning of what looks
863
- # like an options run and handle the rest in here.
864
- def scan_options(p, data, ts, te)
865
- text = text(data, ts, te).first
866
-
867
- options_char, options_length = true, 0
868
-
869
- # Copy while we have option characters. There is no maximum length,
870
- # as ruby allows things like '(?xxxxxxxxx-xxxxxxxxxxxxx:abc)'.
871
- negative_options = false
872
- while options_char
873
- if data[te + options_length]
874
- c = data[te + options_length].chr
875
-
876
- if c =~ /[-mixdau]/
877
- negative_options = true if c == '-'
802
+ attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
803
+ :group_depth, :set_depth, :conditional_stack
878
804
 
879
- raise InvalidGroupOption.new(c, text) if negative_options and
880
- c =~ /[dau]/
881
-
882
- text << c ; p += 1 ; options_length += 1
883
- else
884
- options_char = false
885
- end
886
- else
887
- raise PrematureEndError.new("expression options `#{text}'")
888
- end
889
- end
890
-
891
- if data[te + options_length]
892
- c = data[te + options_length].chr
893
-
894
- if c == ':'
895
- # Include the ':' in the options text
896
- text << c ; p += 1 ; options_length += 1
897
- emit_options(text, ts, te + options_length)
898
-
899
- elsif c == ')'
900
- # Don't include the closing ')', let group_close handle it.
901
- emit_options(text, ts, te + options_length)
902
-
903
- else
904
- # Plain Regexp reports this as 'undefined group option'
905
- raise ScannerError.new(
906
- "Unexpected `#{c}' in options sequence, ':' or ')' expected")
907
- end
908
- else
909
- raise PrematureEndError.new("expression options `#{text}'")
910
- end
805
+ def in_group?
806
+ group_depth > 0
807
+ end
911
808
 
912
- p # return the new value of the data pointer
809
+ def in_set?
810
+ set_depth > 0
913
811
  end
914
812
 
915
813
  # Copy from ts to te from data as text
@@ -945,32 +843,39 @@ class Regexp::Scanner
945
843
  def emit_options(text, ts, te)
946
844
  token = nil
947
845
 
948
- if text =~ /\(\?([mixdau]*)-?([mix]*)(:)?/
949
- positive, negative, group_local = $1, $2, $3
846
+ # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
847
+ text =~ /\(\?([mixdau]*)(-(?:[mix]*))*(:)?/
848
+ positive, negative, group_local = $1, $2, $3
950
849
 
951
- if positive.include?('x')
952
- self.free_spacing = true
953
- end
850
+ if positive.include?('x')
851
+ self.free_spacing = true
852
+ end
954
853
 
955
- # If the x appears in both, treat it like ruby does, the second cancels
956
- # the first.
957
- if negative.include?('x')
958
- self.free_spacing = false
959
- end
854
+ # If the x appears in both, treat it like ruby does, the second cancels
855
+ # the first.
856
+ if negative && negative.include?('x')
857
+ self.free_spacing = false
858
+ end
960
859
 
961
- if group_local
962
- spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
963
- token = :options
964
- else
965
- # switch for parent group level
966
- spacing_stack.last[:free_spacing] = free_spacing
967
- token = :options_switch
968
- end
860
+ if group_local
861
+ spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
862
+ token = :options
863
+ else
864
+ # switch for parent group level
865
+ spacing_stack.last[:free_spacing] = free_spacing
866
+ token = :options_switch
969
867
  end
970
868
 
971
869
  emit(:group, token, text, ts, te)
972
870
  end
973
871
 
872
+ def emit_meta_control_sequence(data, ts, te, token)
873
+ if data.last < 0x00 || data.last > 0x7F
874
+ validation_error(:sequence, 'escape', token.to_s)
875
+ end
876
+ emit(:escape, token, *text(data, ts, te, 1))
877
+ end
878
+
974
879
  # Centralizes and unifies the handling of validation related
975
880
  # errors.
976
881
  def validation_error(type, what, reason)
@@ -981,21 +886,8 @@ class Regexp::Scanner
981
886
  error = InvalidBackrefError.new(what, reason)
982
887
  when :sequence
983
888
  error = InvalidSequenceError.new(what, reason)
984
- else
985
- error = ValidationError.new('expression')
986
889
  end
987
890
 
988
891
  raise error # unless @@config.validation_ignore
989
892
  end
990
-
991
- # Used for references with an empty name or number
992
- def empty_backref_error(type, what)
993
- validation_error(:backref, what, 'ref ID is empty')
994
- end
995
-
996
- # Used for named expressions with an empty name
997
- def empty_name_error(type, what)
998
- validation_error(type, what, 'name is empty')
999
- end
1000
-
1001
893
  end # module Regexp::Scanner