regexp_parser 2.7.0 → 2.8.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +55 -3
  3. data/Gemfile +2 -2
  4. data/README.md +32 -29
  5. data/lib/regexp_parser/expression/base.rb +0 -7
  6. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  7. data/lib/regexp_parser/expression/classes/backreference.rb +4 -6
  8. data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
  9. data/lib/regexp_parser/expression/classes/character_set.rb +3 -4
  10. data/lib/regexp_parser/expression/classes/conditional.rb +2 -14
  11. data/lib/regexp_parser/expression/classes/escape_sequence.rb +3 -1
  12. data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
  13. data/lib/regexp_parser/expression/classes/group.rb +0 -22
  14. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
  15. data/lib/regexp_parser/expression/classes/unicode_property.rb +5 -2
  16. data/lib/regexp_parser/expression/methods/construct.rb +2 -4
  17. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  18. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  19. data/lib/regexp_parser/expression/methods/tests.rb +40 -3
  20. data/lib/regexp_parser/expression/methods/traverse.rb +33 -20
  21. data/lib/regexp_parser/expression/quantifier.rb +30 -17
  22. data/lib/regexp_parser/expression/sequence.rb +5 -9
  23. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  24. data/lib/regexp_parser/expression/shared.rb +37 -24
  25. data/lib/regexp_parser/expression/subexpression.rb +20 -18
  26. data/lib/regexp_parser/expression.rb +2 -0
  27. data/lib/regexp_parser/lexer.rb +15 -7
  28. data/lib/regexp_parser/parser.rb +85 -86
  29. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  30. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  31. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  32. data/lib/regexp_parser/scanner/mapping.rb +89 -0
  33. data/lib/regexp_parser/scanner/property.rl +1 -1
  34. data/lib/regexp_parser/scanner/scanner.rl +35 -129
  35. data/lib/regexp_parser/scanner.rb +1084 -1303
  36. data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
  37. data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
  38. data/lib/regexp_parser/syntax/token/escape.rb +3 -1
  39. data/lib/regexp_parser/syntax/token/meta.rb +9 -2
  40. data/lib/regexp_parser/syntax/token/unicode_property.rb +3 -0
  41. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  42. data/lib/regexp_parser/version.rb +1 -1
  43. metadata +9 -2
@@ -0,0 +1,63 @@
1
+ class Regexp::Scanner
2
+ # Base for all scanner validation errors
3
+ class ValidationError < ScannerError
4
+ # Centralizes and unifies the handling of validation related errors.
5
+ def self.for(type, problem, reason = nil)
6
+ types.fetch(type).new(problem, reason)
7
+ end
8
+
9
+ def self.types
10
+ @types ||= {
11
+ backref: InvalidBackrefError,
12
+ group: InvalidGroupError,
13
+ group_option: InvalidGroupOption,
14
+ posix_class: UnknownPosixClassError,
15
+ property: UnknownUnicodePropertyError,
16
+ sequence: InvalidSequenceError,
17
+ }
18
+ end
19
+ end
20
+
21
+ # Invalid sequence format. Used for escape sequences, mainly.
22
+ class InvalidSequenceError < ValidationError
23
+ def initialize(what = 'sequence', where = '')
24
+ super "Invalid #{what} at #{where}"
25
+ end
26
+ end
27
+
28
+ # Invalid group. Used for named groups.
29
+ class InvalidGroupError < ValidationError
30
+ def initialize(what, reason)
31
+ super "Invalid #{what}, #{reason}."
32
+ end
33
+ end
34
+
35
+ # Invalid group option. Used for inline options.
36
+ # TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
37
+ class InvalidGroupOption < ValidationError
38
+ def initialize(option, text)
39
+ super "Invalid group option #{option} in #{text}"
40
+ end
41
+ end
42
+
43
+ # Invalid back reference. Used for named and numbered refs/calls.
44
+ class InvalidBackrefError < ValidationError
45
+ def initialize(what, reason)
46
+ super "Invalid back reference #{what}, #{reason}"
47
+ end
48
+ end
49
+
50
+ # The property name was not recognized by the scanner.
51
+ class UnknownUnicodePropertyError < ValidationError
52
+ def initialize(name, _)
53
+ super "Unknown unicode character property name #{name}"
54
+ end
55
+ end
56
+
57
+ # The POSIX class name was not recognized by the scanner.
58
+ class UnknownPosixClassError < ValidationError
59
+ def initialize(text, _)
60
+ super "Unknown POSIX class #{text}"
61
+ end
62
+ end
63
+ end
@@ -0,0 +1,89 @@
1
+ # mapping for simple cases with a 1:1 relation between text and token
2
+ class Regexp::Scanner
3
+ MAPPING = {
4
+ anchor: {
5
+ '\A' => :bos,
6
+ '\B' => :nonword_boundary,
7
+ '\G' => :match_start,
8
+ '\Z' => :eos_ob_eol,
9
+ '\b' => :word_boundary,
10
+ '\z' => :eos,
11
+ },
12
+ assertion: {
13
+ '(?=' => :lookahead,
14
+ '(?!' => :nlookahead,
15
+ '(?<=' => :lookbehind,
16
+ '(?<!' => :nlookbehind,
17
+ },
18
+ conditional: {
19
+ '(?' => :open,
20
+ },
21
+ escape: {
22
+ '\.' => :dot,
23
+ '\|' => :alternation,
24
+ '\^' => :bol,
25
+ '\$' => :eol,
26
+ '\?' => :zero_or_one,
27
+ '\*' => :zero_or_more,
28
+ '\+' => :one_or_more,
29
+ '\(' => :group_open,
30
+ '\)' => :group_close,
31
+ '\{' => :interval_open,
32
+ '\}' => :interval_close,
33
+ '\[' => :set_open,
34
+ '\]' => :set_close,
35
+ '\\\\' => :backslash,
36
+ '\a' => :bell,
37
+ '\b' => :backspace,
38
+ '\e' => :escape,
39
+ '\f' => :form_feed,
40
+ '\n' => :newline,
41
+ '\r' => :carriage,
42
+ '\t' => :tab,
43
+ '\v' => :vertical_tab,
44
+ },
45
+ group: {
46
+ '(?:' => :passive,
47
+ '(?>' => :atomic,
48
+ '(?~' => :absence,
49
+ },
50
+ meta: {
51
+ '|' => :alternation,
52
+ '.' => :dot,
53
+ },
54
+ quantifier: {
55
+ '?' => :zero_or_one,
56
+ '??' => :zero_or_one_reluctant,
57
+ '?+' => :zero_or_one_possessive,
58
+ '*' => :zero_or_more,
59
+ '*?' => :zero_or_more_reluctant,
60
+ '*+' => :zero_or_more_possessive,
61
+ '+' => :one_or_more,
62
+ '+?' => :one_or_more_reluctant,
63
+ '++' => :one_or_more_possessive,
64
+ },
65
+ set: {
66
+ '[' => :character,
67
+ '-' => :range,
68
+ '&&' => :intersection,
69
+ },
70
+ type: {
71
+ '\d' => :digit,
72
+ '\D' => :nondigit,
73
+ '\h' => :hex,
74
+ '\H' => :nonhex,
75
+ '\s' => :space,
76
+ '\S' => :nonspace,
77
+ '\w' => :word,
78
+ '\W' => :nonword,
79
+ '\R' => :linebreak,
80
+ '\X' => :xgrapheme,
81
+ }
82
+ }
83
+ ANCHOR_MAPPING = MAPPING[:anchor]
84
+ ASSERTION_MAPPING = MAPPING[:assertion]
85
+ ESCAPE_MAPPING = MAPPING[:escape]
86
+ GROUP_MAPPING = MAPPING[:group]
87
+ QUANTIFIER_MAPPING = MAPPING[:quantifier]
88
+ TYPE_MAPPING = MAPPING[:type]
89
+ end
@@ -20,7 +20,7 @@
20
20
  name = text[3..-2].gsub(/[\^\s_\-]/, '').downcase
21
21
 
22
22
  token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
23
- validation_error(:property, name) unless token
23
+ raise ValidationError.for(:property, name) unless token
24
24
 
25
25
  self.emit(type, token.to_sym, text)
26
26
 
@@ -30,11 +30,6 @@
30
30
 
31
31
  class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
32
32
 
33
-
34
- # these are not supported in ruby at the moment
35
- collating_sequence = '[.' . (alpha | [\-])+ . '.]';
36
- character_equivalent = '[=' . alpha . '=]';
37
-
38
33
  line_anchor = beginning_of_line | end_of_line;
39
34
  anchor_char = [AbBzZG];
40
35
 
@@ -83,10 +78,9 @@
83
78
  # try to treat every other group head as options group, like Ruby
84
79
  group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
85
80
 
86
- group_ref = [gk];
87
81
  group_name_id_ab = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
88
82
  group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
89
- group_number = '-'? . [1-9] . [0-9]*;
83
+ group_number = '-'? . [0-9]+;
90
84
  group_level = [+\-] . [0-9]+;
91
85
 
92
86
  group_name = ('<' . group_name_id_ab? . '>') |
@@ -95,15 +89,11 @@
95
89
 
96
90
  group_named = ('?' . group_name );
97
91
 
98
- group_name_backref = 'k' . (('<' . group_name_id_ab? . group_level? '>') |
99
- ("'" . group_name_id_sq? . group_level? "'"));
100
- group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
101
- ("'" . group_name_id_sq? . group_level? "'"));
92
+ group_ref_body = (('<' . (group_name_id_ab? | group_number) . group_level? '>') |
93
+ ("'" . (group_name_id_sq? | group_number) . group_level? "'"));
102
94
 
103
- group_number_backref = 'k' . (('<' . group_number . group_level? '>') |
104
- ("'" . group_number . group_level? "'"));
105
- group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
106
- ("'" . ((group_number . group_level?) | '0') "'"));
95
+ group_ref = 'k' . group_ref_body;
96
+ group_call = 'g' . group_ref_body;
107
97
 
108
98
  group_type = group_atomic | group_passive | group_absence | group_named;
109
99
 
@@ -134,13 +124,13 @@
134
124
  # EOF error, used where it can be detected
135
125
  action premature_end_error {
136
126
  text = copy(data, ts ? ts-1 : 0, -1)
137
- raise PrematureEndError.new( text )
127
+ raise PrematureEndError.new(text)
138
128
  }
139
129
 
140
130
  # Invalid sequence error, used from sequences, like escapes and sets
141
131
  action invalid_sequence_error {
142
132
  text = copy(data, ts ? ts-1 : 0, -1)
143
- validation_error(:sequence, 'sequence', text)
133
+ raise ValidationError.for(:sequence, 'sequence', text)
144
134
  }
145
135
 
146
136
  # group (nesting) and set open/close actions
@@ -221,20 +211,12 @@
221
211
  end
222
212
 
223
213
  unless self.class.posix_classes.include?(class_name)
224
- validation_error(:posix_class, text)
214
+ raise ValidationError.for(:posix_class, text)
225
215
  end
226
216
 
227
217
  emit(type, class_name.to_sym, text)
228
218
  };
229
219
 
230
- # These are not supported in ruby at the moment. Enable them if they are.
231
- # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
232
- # emit(:set, :collation, copy(data, ts, te))
233
- # };
234
- # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
235
- # emit(:set, :equivalent, copy(data, ts, te))
236
- # };
237
-
238
220
  meta_char > (set_meta, 1) {
239
221
  emit(:literal, :literal, copy(data, ts, te))
240
222
  };
@@ -457,10 +439,9 @@
457
439
 
458
440
  # (?#...) comments: parsed as a single expression, without introducing a
459
441
  # new nesting level. Comments may not include parentheses, escaped or not.
460
- # special case for close, action performed on all transitions to get the
461
- # correct closing count.
442
+ # special case for close to get the correct closing count.
462
443
  # ------------------------------------------------------------------------
463
- group_open . group_comment $group_closed {
444
+ (group_open . group_comment) @group_closed {
464
445
  emit(:group, :comment, copy(data, ts, te))
465
446
  };
466
447
 
@@ -475,10 +456,10 @@
475
456
  #
476
457
  # (?imxdau-imx:subexp) option on/off for subexp
477
458
  # ------------------------------------------------------------------------
478
- group_open . group_options >group_opened {
459
+ (group_open . group_options) >group_opened {
479
460
  text = copy(data, ts, te)
480
461
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
481
- validation_error(:group_option, $1 || "-#{$2}", text)
462
+ raise ValidationError.for(:group_option, $1 || "-#{$2}", text)
482
463
  end
483
464
  emit_options(text)
484
465
  };
@@ -489,7 +470,7 @@
489
470
  # (?<=subexp) look-behind
490
471
  # (?<!subexp) negative look-behind
491
472
  # ------------------------------------------------------------------------
492
- group_open . assertion_type >group_opened {
473
+ (group_open . assertion_type) >group_opened {
493
474
  case text = copy(data, ts, te)
494
475
  when '(?='; emit(:assertion, :lookahead, text)
495
476
  when '(?!'; emit(:assertion, :nlookahead, text)
@@ -506,14 +487,14 @@
506
487
  # (?'name'subexp) named group (single quoted version)
507
488
  # (subexp) captured group
508
489
  # ------------------------------------------------------------------------
509
- group_open . group_type >group_opened {
490
+ (group_open . group_type) >group_opened {
510
491
  case text = copy(data, ts, te)
511
492
  when '(?:'; emit(:group, :passive, text)
512
493
  when '(?>'; emit(:group, :atomic, text)
513
494
  when '(?~'; emit(:group, :absence, text)
514
495
 
515
496
  when /^\(\?(?:<>|'')/
516
- validation_error(:group, 'named group', 'name is empty')
497
+ raise ValidationError.for(:group, 'named group', 'name is empty')
517
498
 
518
499
  when /^\(\?<[^>]+>/
519
500
  emit(:group, :named_ab, text)
@@ -533,7 +514,7 @@
533
514
  if conditional_stack.last == group_depth + 1
534
515
  conditional_stack.pop
535
516
  emit(:conditional, :close, ')')
536
- else
517
+ elsif group_depth >= 0
537
518
  if spacing_stack.length > 1 &&
538
519
  spacing_stack.last[:depth] == group_depth + 1
539
520
  spacing_stack.pop
@@ -541,41 +522,43 @@
541
522
  end
542
523
 
543
524
  emit(:group, :close, ')')
525
+ else
526
+ raise ValidationError.for(:group, 'group', 'unmatched close parenthesis')
544
527
  end
545
528
  };
546
529
 
547
530
 
548
531
  # Group backreference, named and numbered
549
532
  # ------------------------------------------------------------------------
550
- backslash . (group_name_backref | group_number_backref) > (backslashed, 4) {
533
+ backslash . (group_ref) > (backslashed, 4) {
551
534
  case text = copy(data, ts, te)
552
- when /^\\k(<>|'')/
553
- validation_error(:backref, 'backreference', 'ref ID is empty')
554
- when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
535
+ when /^\\k(.)[^0-9\-][^+\-]*['>]$/
555
536
  emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
556
- when /^\\k(.)\d+\D$/
537
+ when /^\\k(.)[1-9]\d*['>]$/
557
538
  emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
558
- when /^\\k(.)-\d+\D$/
539
+ when /^\\k(.)-[1-9]\d*['>]$/
559
540
  emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
560
- when /^\\k(.)[^\p{digit}\-].*[+\-]\d+\D$/
541
+ when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/
561
542
  emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
562
- when /^\\k(.)-?\d+[+\-]\d+\D$/
543
+ when /^\\k(.)-?[1-9]\d*[+\-]\d+['>]$/
563
544
  emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
545
+ else
546
+ raise ValidationError.for(:backref, 'backreference', 'invalid ref ID')
564
547
  end
565
548
  };
566
549
 
567
550
  # Group call, named and numbered
568
551
  # ------------------------------------------------------------------------
569
- backslash . (group_name_call | group_number_call) > (backslashed, 4) {
552
+ backslash . (group_call) > (backslashed, 4) {
570
553
  case text = copy(data, ts, te)
571
- when /^\\g(<>|'')/
572
- validation_error(:backref, 'subexpression call', 'ref ID is empty')
573
- when /^\\g(.)[^\p{digit}+\->][^+\-]*/
554
+ when /^\\g(.)[^0-9+\-].*['>]$/
574
555
  emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
575
- when /^\\g(.)\d+\D$/
556
+ when /^\\g(.)\d+['>]$/
576
557
  emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
577
558
  when /^\\g(.)[+-]\d+/
578
559
  emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
560
+ else
561
+ raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID')
579
562
  end
580
563
  };
581
564
 
@@ -649,72 +632,11 @@
649
632
  *|;
650
633
  }%%
651
634
 
652
- # THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
653
- # This file was generated from lib/regexp_parser/scanner/scanner.rl
654
-
655
- require 'regexp_parser/error'
635
+ require 'regexp_parser/scanner/errors/scanner_error'
636
+ require 'regexp_parser/scanner/errors/premature_end_error'
637
+ require 'regexp_parser/scanner/errors/validation_error'
656
638
 
657
639
  class Regexp::Scanner
658
- # General scanner error (catch all)
659
- class ScannerError < Regexp::Parser::Error; end
660
-
661
- # Base for all scanner validation errors
662
- class ValidationError < Regexp::Parser::Error
663
- def initialize(reason)
664
- super reason
665
- end
666
- end
667
-
668
- # Unexpected end of pattern
669
- class PrematureEndError < ScannerError
670
- def initialize(where = '')
671
- super "Premature end of pattern at #{where}"
672
- end
673
- end
674
-
675
- # Invalid sequence format. Used for escape sequences, mainly.
676
- class InvalidSequenceError < ValidationError
677
- def initialize(what = 'sequence', where = '')
678
- super "Invalid #{what} at #{where}"
679
- end
680
- end
681
-
682
- # Invalid group. Used for named groups.
683
- class InvalidGroupError < ValidationError
684
- def initialize(what, reason)
685
- super "Invalid #{what}, #{reason}."
686
- end
687
- end
688
-
689
- # Invalid groupOption. Used for inline options.
690
- # TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
691
- class InvalidGroupOption < ValidationError
692
- def initialize(option, text)
693
- super "Invalid group option #{option} in #{text}"
694
- end
695
- end
696
-
697
- # Invalid back reference. Used for name a number refs/calls.
698
- class InvalidBackrefError < ValidationError
699
- def initialize(what, reason)
700
- super "Invalid back reference #{what}, #{reason}"
701
- end
702
- end
703
-
704
- # The property name was not recognized by the scanner.
705
- class UnknownUnicodePropertyError < ValidationError
706
- def initialize(name)
707
- super "Unknown unicode character property name #{name}"
708
- end
709
- end
710
-
711
- # The POSIX class name was not recognized by the scanner.
712
- class UnknownPosixClassError < ValidationError
713
- def initialize(text)
714
- super "Unknown POSIX class #{text}"
715
- end
716
- end
717
-
718
640
  # Scans the given regular expression text, or Regexp object and collects the
719
641
  # emitted token into an array that gets returned at the end. If a block is
720
642
  # given, it gets called for each emitted token.
@@ -891,24 +813,8 @@ class Regexp::Scanner
891
813
 
892
814
  def emit_meta_control_sequence(data, ts, te, token)
893
815
  if data.last < 0x00 || data.last > 0x7F
894
- validation_error(:sequence, 'escape', token.to_s)
816
+ raise ValidationError.for(:sequence, 'escape', token.to_s)
895
817
  end
896
818
  emit(:escape, token, copy(data, ts-1, te))
897
819
  end
898
-
899
- # Centralizes and unifies the handling of validation related
900
- # errors.
901
- def validation_error(type, what, reason = nil)
902
- error =
903
- case type
904
- when :backref then InvalidBackrefError.new(what, reason)
905
- when :group then InvalidGroupError.new(what, reason)
906
- when :group_option then InvalidGroupOption.new(what, reason)
907
- when :posix_class then UnknownPosixClassError.new(what)
908
- when :property then UnknownUnicodePropertyError.new(what)
909
- when :sequence then InvalidSequenceError.new(what, reason)
910
- end
911
-
912
- raise error # unless @@config.validation_ignore
913
- end
914
820
  end # class Regexp::Scanner