regexp_parser 2.0.0 → 2.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +66 -0
  3. data/Gemfile +6 -1
  4. data/README.md +1 -4
  5. data/Rakefile +8 -8
  6. data/lib/regexp_parser/error.rb +4 -0
  7. data/lib/regexp_parser/expression.rb +3 -2
  8. data/lib/regexp_parser/expression/classes/backref.rb +5 -0
  9. data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
  10. data/lib/regexp_parser/expression/classes/free_space.rb +2 -2
  11. data/lib/regexp_parser/expression/classes/group.rb +12 -2
  12. data/lib/regexp_parser/expression/classes/property.rb +1 -1
  13. data/lib/regexp_parser/expression/classes/set/range.rb +2 -1
  14. data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
  15. data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
  16. data/lib/regexp_parser/expression/quantifier.rb +1 -1
  17. data/lib/regexp_parser/expression/sequence.rb +3 -9
  18. data/lib/regexp_parser/expression/subexpression.rb +1 -1
  19. data/lib/regexp_parser/parser.rb +282 -334
  20. data/lib/regexp_parser/scanner.rb +1084 -1230
  21. data/lib/regexp_parser/scanner/scanner.rl +80 -110
  22. data/lib/regexp_parser/syntax.rb +8 -6
  23. data/lib/regexp_parser/syntax/any.rb +3 -3
  24. data/lib/regexp_parser/syntax/base.rb +1 -1
  25. data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
  26. data/lib/regexp_parser/version.rb +1 -1
  27. data/spec/expression/clone_spec.rb +36 -4
  28. data/spec/expression/free_space_spec.rb +2 -2
  29. data/spec/expression/methods/match_length_spec.rb +2 -2
  30. data/spec/expression/subexpression_spec.rb +1 -1
  31. data/spec/expression/to_s_spec.rb +28 -36
  32. data/spec/lexer/refcalls_spec.rb +5 -0
  33. data/spec/parser/all_spec.rb +2 -2
  34. data/spec/parser/errors_spec.rb +1 -1
  35. data/spec/parser/quantifiers_spec.rb +1 -0
  36. data/spec/parser/refcalls_spec.rb +5 -0
  37. data/spec/scanner/escapes_spec.rb +2 -1
  38. data/spec/scanner/groups_spec.rb +10 -1
  39. data/spec/scanner/refcalls_spec.rb +19 -0
  40. data/spec/scanner/sets_spec.rb +57 -14
  41. data/spec/spec_helper.rb +1 -0
  42. metadata +4 -3
@@ -3,6 +3,11 @@
3
3
  include re_char_type "char_type.rl";
4
4
  include re_property "property.rl";
5
5
 
6
+ utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
7
+ utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
8
+ utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
9
+ utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
10
+
6
11
  dot = '.';
7
12
  backslash = '\\';
8
13
  alternation = '|';
@@ -15,7 +20,7 @@
15
20
 
16
21
  group_open = '(';
17
22
  group_close = ')';
18
- parantheses = group_open | group_close;
23
+ parentheses = group_open | group_close;
19
24
 
20
25
  set_open = '[';
21
26
  set_close = ']';
@@ -32,7 +37,7 @@
32
37
  class_posix = ('[:' . '^'? . class_name_posix . ':]');
33
38
 
34
39
 
35
- # these are not supported in ruby, and need verification
40
+ # these are not supported in ruby at the moment
36
41
  collating_sequence = '[.' . (alpha | [\-])+ . '.]';
37
42
  character_equivalent = '[=' . alpha . '=]';
38
43
 
@@ -53,6 +58,8 @@
53
58
 
54
59
  meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
55
60
 
61
+ sequence_char = [CMcux];
62
+
56
63
  zero_or_one = '?' | '??' | '?+';
57
64
  zero_or_more = '*' | '*?' | '*+';
58
65
  one_or_more = '+' | '+?' | '++';
@@ -90,21 +97,26 @@
90
97
  group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
91
98
 
92
99
  group_ref = [gk];
93
- group_name_char = (alnum | '_');
94
- group_name_id = (group_name_char . (group_name_char+)?)?;
95
- group_number = '-'? . [1-9] . ([0-9]+)?;
100
+ group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
101
+ group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
102
+ group_number = '-'? . [1-9] . [0-9]*;
96
103
  group_level = [+\-] . [0-9]+;
97
104
 
98
- group_name = ('<' . group_name_id . '>') | ("'" . group_name_id . "'");
105
+ group_name = ('<' . group_name_id_ab? . '>') |
106
+ ("'" . group_name_id_sq? . "'");
99
107
  group_lookup = group_name | group_number;
100
108
 
101
109
  group_named = ('?' . group_name );
102
110
 
103
- group_name_ref = group_ref . (('<' . group_name_id . group_level? '>') |
104
- ("'" . group_name_id . group_level? "'"));
111
+ group_name_backref = 'k' . (('<' . group_name_id_ab? . group_level? '>') |
112
+ ("'" . group_name_id_sq? . group_level? "'"));
113
+ group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
114
+ ("'" . group_name_id_sq? . group_level? "'"));
105
115
 
106
- group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
107
- ("'" . group_number . group_level? "'"));
116
+ group_number_backref = 'k' . (('<' . group_number . group_level? '>') |
117
+ ("'" . group_number . group_level? "'"));
118
+ group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
119
+ ("'" . ((group_number . group_level?) | '0') "'"));
108
120
 
109
121
  group_type = group_atomic | group_passive | group_absence | group_named;
110
122
 
@@ -115,7 +127,7 @@
115
127
 
116
128
  # characters that 'break' a literal
117
129
  meta_char = dot | backslash | alternation |
118
- curlies | parantheses | brackets |
130
+ curlies | parentheses | brackets |
119
131
  line_anchor | quantifier_greedy;
120
132
 
121
133
  literal_delimiters = ']' | '}';
@@ -123,15 +135,13 @@
123
135
  ascii_print = ((0x20..0x7e) - meta_char - '#');
124
136
  ascii_nonprint = (0x01..0x1f | 0x7f);
125
137
 
126
- utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
127
- utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
128
- utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
129
-
130
138
  non_literal_escape = char_type_char | anchor_char | escaped_ascii |
131
- keep_mark | [xucCM];
139
+ keep_mark | sequence_char;
140
+
141
+ # escapes that also work within a character set
142
+ set_escape = backslash | brackets | escaped_ascii | property_char |
143
+ sequence_char | single_codepoint_char_type;
132
144
 
133
- non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
134
- multi_codepoint_char_type | [0-9cCM];
135
145
 
136
146
  # EOF error, used where it can be detected
137
147
  action premature_end_error {
@@ -226,23 +236,19 @@
226
236
  emit(type, class_name.to_sym, text)
227
237
  };
228
238
 
229
- collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
230
- emit(:set, :collation, copy(data, ts, te))
231
- };
232
-
233
- character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
234
- emit(:set, :equivalent, copy(data, ts, te))
235
- };
239
+ # These are not supported in ruby at the moment. Enable them if they are.
240
+ # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
241
+ # emit(:set, :collation, copy(data, ts, te))
242
+ # };
243
+ # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
244
+ # emit(:set, :equivalent, copy(data, ts, te))
245
+ # };
236
246
 
237
247
  meta_char > (set_meta, 1) {
238
248
  emit(:literal, :literal, copy(data, ts, te))
239
249
  };
240
250
 
241
- any |
242
- ascii_nonprint |
243
- utf8_2_byte |
244
- utf8_3_byte |
245
- utf8_4_byte {
251
+ any | ascii_nonprint | utf8_multibyte {
246
252
  text = copy(data, ts, te)
247
253
  emit(:literal, :literal, text)
248
254
  };
@@ -251,16 +257,16 @@
251
257
  # set escapes scanner
252
258
  # --------------------------------------------------------------------------
253
259
  set_escape_sequence := |*
254
- non_set_escape > (escaped_set_alpha, 2) {
255
- emit(:escape, :literal, copy(data, ts-1, te))
256
- fret;
257
- };
258
-
259
- any > (escaped_set_alpha, 1) {
260
+ set_escape > (escaped_set_alpha, 2) {
260
261
  fhold;
261
262
  fnext character_set;
262
263
  fcall escape_sequence;
263
264
  };
265
+
266
+ any > (escaped_set_alpha, 1) {
267
+ emit(:escape, :literal, copy(data, ts-1, te))
268
+ fret;
269
+ };
264
270
  *|;
265
271
 
266
272
 
@@ -325,7 +331,7 @@
325
331
  fret;
326
332
  };
327
333
 
328
- hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
334
+ hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
329
335
  emit(:escape, :hex, copy(data, ts-1, te))
330
336
  fret;
331
337
  };
@@ -356,10 +362,7 @@
356
362
  fcall unicode_property;
357
363
  };
358
364
 
359
- (any -- non_literal_escape) |
360
- utf8_2_byte |
361
- utf8_3_byte |
362
- utf8_4_byte > (escaped_alpha, 1) {
365
+ (any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
363
366
  emit(:escape, :literal, copy(data, ts-1, te))
364
367
  fret;
365
368
  };
@@ -511,10 +514,10 @@
511
514
  when /^\(\?(?:<>|'')/
512
515
  validation_error(:group, 'named group', 'name is empty')
513
516
 
514
- when /^\(\?<\w*>/
517
+ when /^\(\?<[^>]+>/
515
518
  emit(:group, :named_ab, text)
516
519
 
517
- when /^\(\?'\w*'/
520
+ when /^\(\?'[^']+'/
518
521
  emit(:group, :named_sq, text)
519
522
 
520
523
  end
@@ -543,65 +546,35 @@
543
546
 
544
547
  # Group backreference, named and numbered
545
548
  # ------------------------------------------------------------------------
546
- backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
549
+ backslash . (group_name_backref | group_number_backref) > (backslashed, 4) {
547
550
  case text = copy(data, ts, te)
548
- when /^\\([gk])(<>|'')/ # angle brackets
549
- validation_error(:backref, 'ref/call', 'ref ID is empty')
550
-
551
- when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
552
- if $1 == 'k'
553
- emit(:backref, :name_ref_ab, text)
554
- else
555
- emit(:backref, :name_call_ab, text)
556
- end
557
-
558
- when /^\\([gk])'[^\d+-]\w*'/ #single quotes
559
- if $1 == 'k'
560
- emit(:backref, :name_ref_sq, text)
561
- else
562
- emit(:backref, :name_call_sq, text)
563
- end
564
-
565
- when /^\\([gk])<\d+>/ # angle-brackets
566
- if $1 == 'k'
567
- emit(:backref, :number_ref_ab, text)
568
- else
569
- emit(:backref, :number_call_ab, text)
570
- end
571
-
572
- when /^\\([gk])'\d+'/ # single quotes
573
- if $1 == 'k'
574
- emit(:backref, :number_ref_sq, text)
575
- else
576
- emit(:backref, :number_call_sq, text)
577
- end
578
-
579
- when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
580
- if $1 == 'k'
581
- emit(:backref, :number_rel_ref_ab, text)
582
- else
583
- emit(:backref, :number_rel_call_ab, text)
584
- end
585
-
586
- when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
587
- if $1 == 'k'
588
- emit(:backref, :number_rel_ref_sq, text)
589
- else
590
- emit(:backref, :number_rel_call_sq, text)
591
- end
592
-
593
- when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
594
- emit(:backref, :name_recursion_ref_ab, text)
595
-
596
- when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
597
- emit(:backref, :name_recursion_ref_sq, text)
598
-
599
- when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
600
- emit(:backref, :number_recursion_ref_ab, text)
601
-
602
- when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
603
- emit(:backref, :number_recursion_ref_sq, text)
551
+ when /^\\k(<>|'')/
552
+ validation_error(:backref, 'backreference', 'ref ID is empty')
553
+ when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
554
+ emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
555
+ when /^\\k(.)\d+\D$/
556
+ emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
557
+ when /^\\k(.)-\d+\D$/
558
+ emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
559
+ when /^\\k(.)[^\p{digit}\-].*[+\-]\d+\D$/
560
+ emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
561
+ when /^\\k(.)-?\d+[+\-]\d+\D$/
562
+ emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
563
+ end
564
+ };
604
565
 
566
+ # Group call, named and numbered
567
+ # ------------------------------------------------------------------------
568
+ backslash . (group_name_call | group_number_call) > (backslashed, 4) {
569
+ case text = copy(data, ts, te)
570
+ when /^\\g(<>|'')/
571
+ validation_error(:backref, 'subexpression call', 'ref ID is empty')
572
+ when /^\\g(.)[^\p{digit}+\->][^+\-]*/
573
+ emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
574
+ when /^\\g(.)\d+\D$/
575
+ emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
576
+ when /^\\g(.)[+-]\d+/
577
+ emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
605
578
  end
606
579
  };
607
580
 
@@ -668,11 +641,7 @@
668
641
  # Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
669
642
  # except meta characters.
670
643
  # ------------------------------------------------------------------------
671
- (ascii_print -- space)+ |
672
- ascii_nonprint+ |
673
- utf8_2_byte+ |
674
- utf8_3_byte+ |
675
- utf8_4_byte+ {
644
+ (ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
676
645
  append_literal(data, ts, te)
677
646
  };
678
647
 
@@ -682,12 +651,14 @@
682
651
  # THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
683
652
  # This file was generated from lib/regexp_parser/scanner/scanner.rl
684
653
 
654
+ require 'regexp_parser/error'
655
+
685
656
  class Regexp::Scanner
686
657
  # General scanner error (catch all)
687
- class ScannerError < StandardError; end
658
+ class ScannerError < Regexp::Parser::Error; end
688
659
 
689
660
  # Base for all scanner validation errors
690
- class ValidationError < StandardError
661
+ class ValidationError < Regexp::Parser::Error
691
662
  def initialize(reason)
692
663
  super reason
693
664
  end
@@ -789,14 +760,13 @@ class Regexp::Scanner
789
760
 
790
761
  # lazy-load property maps when first needed
791
762
  require 'yaml'
792
- PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
793
763
 
794
764
  def self.short_prop_map
795
- @short_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/short.yml")
765
+ @short_prop_map ||= YAML.load_file("#{__dir__}/scanner/properties/short.yml")
796
766
  end
797
767
 
798
768
  def self.long_prop_map
799
- @long_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/long.yml")
769
+ @long_prop_map ||= YAML.load_file("#{__dir__}/scanner/properties/long.yml")
800
770
  end
801
771
 
802
772
  # Emits an array with the details of the scanned pattern
@@ -1,9 +1,11 @@
1
- require File.expand_path('../syntax/tokens', __FILE__)
2
- require File.expand_path('../syntax/base', __FILE__)
3
- require File.expand_path('../syntax/any', __FILE__)
4
- require File.expand_path('../syntax/version_lookup', __FILE__)
5
- require File.expand_path('../syntax/versions', __FILE__)
1
+ require 'regexp_parser/error'
6
2
 
7
3
  module Regexp::Syntax
8
- class SyntaxError < StandardError; end
4
+ class SyntaxError < Regexp::Parser::Error; end
9
5
  end
6
+
7
+ require_relative 'syntax/tokens'
8
+ require_relative 'syntax/base'
9
+ require_relative 'syntax/any'
10
+ require_relative 'syntax/version_lookup'
11
+ require_relative 'syntax/versions'
@@ -4,12 +4,12 @@ module Regexp::Syntax
4
4
  # is useful during development, testing, and should be useful for some types
5
5
  # of transformations as well.
6
6
  class Any < Base
7
- def initialize
7
+ def initialize # rubocop:disable Lint/MissingSuper
8
8
  @implements = { :* => [:*] }
9
9
  end
10
10
 
11
- def implements?(type, token) true end
12
- def implements!(type, token) true end
11
+ def implements?(_type, _token) true end
12
+ def implements!(_type, _token) true end
13
13
  end
14
14
 
15
15
  end
@@ -1,7 +1,7 @@
1
1
  require 'set'
2
2
 
3
3
  module Regexp::Syntax
4
- class NotImplementedError < SyntaxError
4
+ class NotImplementedError < Regexp::Syntax::SyntaxError
5
5
  def initialize(syntax, type, token)
6
6
  super "#{syntax.class.name} does not implement: [#{type}:#{token}]"
7
7
  end
@@ -3,13 +3,13 @@ module Regexp::Syntax
3
3
  VERSION_REGEXP = /#{VERSION_FORMAT}/
4
4
  VERSION_CONST_REGEXP = /\AV\d+_\d+(?:_\d+)?\z/
5
5
 
6
- class InvalidVersionNameError < SyntaxError
6
+ class InvalidVersionNameError < Regexp::Syntax::SyntaxError
7
7
  def initialize(name)
8
8
  super "Invalid version name '#{name}'. Expected format is '#{VERSION_FORMAT}'"
9
9
  end
10
10
  end
11
11
 
12
- class UnknownSyntaxNameError < SyntaxError
12
+ class UnknownSyntaxNameError < Regexp::Syntax::SyntaxError
13
13
  def initialize(name)
14
14
  super "Unknown syntax name '#{name}'."
15
15
  end
@@ -1,5 +1,5 @@
1
1
  class Regexp
2
2
  class Parser
3
- VERSION = '2.0.0'
3
+ VERSION = '2.1.1'
4
4
  end
5
5
  end
@@ -27,8 +27,8 @@ RSpec.describe('Expression#clone') do
27
27
  expect(root_2.quantifier.object_id).not_to eq copy_2.quantifier.object_id
28
28
 
29
29
  # regression test
30
- expect { root_2.clone }.not_to change { root_2.quantifier.object_id }
31
- expect { root_2.clone }.not_to change { root_2.quantifier.text.object_id }
30
+ expect { root_2.clone }.not_to(change { root_2.quantifier.object_id })
31
+ expect { root_2.clone }.not_to(change { root_2.quantifier.text.object_id })
32
32
  end
33
33
 
34
34
  specify('Subexpression#clone') do
@@ -48,7 +48,7 @@ RSpec.describe('Expression#clone') do
48
48
  end
49
49
 
50
50
  # regression test
51
- expect { root.clone }.not_to change { root.expressions.object_id }
51
+ expect { root.clone }.not_to(change { root.expressions.object_id })
52
52
  end
53
53
 
54
54
  specify('Group::Named#clone') do
@@ -69,7 +69,39 @@ RSpec.describe('Expression#clone') do
69
69
  end
70
70
 
71
71
  # regression test
72
- expect { root_1.clone }.not_to change { root_1.name.object_id }
72
+ expect { root_1.clone }.not_to(change { root_1.name.object_id })
73
+ end
74
+
75
+ specify('Group::Options#clone') do
76
+ root = RP.parse('foo(?i)bar')
77
+ copy = root.clone
78
+
79
+ expect(copy.to_s).to eq root.to_s
80
+
81
+ root_1 = root[1]
82
+ copy_1 = copy[1]
83
+
84
+ expect(root_1.option_changes).to eq copy_1.option_changes
85
+ expect(root_1.option_changes.object_id).not_to eq copy_1.option_changes.object_id
86
+
87
+ # regression test
88
+ expect { root_1.clone }.not_to(change { root_1.option_changes.object_id })
89
+ end
90
+
91
+ specify('Backreference::Base#clone') do
92
+ root = RP.parse('(foo)\1')
93
+ copy = root.clone
94
+
95
+ expect(copy.to_s).to eq root.to_s
96
+
97
+ root_1 = root[1]
98
+ copy_1 = copy[1]
99
+
100
+ expect(root_1.referenced_expression.to_s).to eq copy_1.referenced_expression.to_s
101
+ expect(root_1.referenced_expression.object_id).not_to eq copy_1.referenced_expression.object_id
102
+
103
+ # regression test
104
+ expect { root_1.clone }.not_to(change { root_1.referenced_expression.object_id })
73
105
  end
74
106
 
75
107
  specify('Sequence#clone') do