regexp_parser 2.0.0 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +66 -0
  3. data/Gemfile +6 -1
  4. data/README.md +1 -4
  5. data/Rakefile +8 -8
  6. data/lib/regexp_parser/error.rb +4 -0
  7. data/lib/regexp_parser/expression.rb +3 -2
  8. data/lib/regexp_parser/expression/classes/backref.rb +5 -0
  9. data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
  10. data/lib/regexp_parser/expression/classes/free_space.rb +2 -2
  11. data/lib/regexp_parser/expression/classes/group.rb +12 -2
  12. data/lib/regexp_parser/expression/classes/property.rb +1 -1
  13. data/lib/regexp_parser/expression/classes/set/range.rb +2 -1
  14. data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
  15. data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
  16. data/lib/regexp_parser/expression/quantifier.rb +1 -1
  17. data/lib/regexp_parser/expression/sequence.rb +3 -9
  18. data/lib/regexp_parser/expression/subexpression.rb +1 -1
  19. data/lib/regexp_parser/parser.rb +282 -334
  20. data/lib/regexp_parser/scanner.rb +1084 -1230
  21. data/lib/regexp_parser/scanner/scanner.rl +80 -110
  22. data/lib/regexp_parser/syntax.rb +8 -6
  23. data/lib/regexp_parser/syntax/any.rb +3 -3
  24. data/lib/regexp_parser/syntax/base.rb +1 -1
  25. data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
  26. data/lib/regexp_parser/version.rb +1 -1
  27. data/spec/expression/clone_spec.rb +36 -4
  28. data/spec/expression/free_space_spec.rb +2 -2
  29. data/spec/expression/methods/match_length_spec.rb +2 -2
  30. data/spec/expression/subexpression_spec.rb +1 -1
  31. data/spec/expression/to_s_spec.rb +28 -36
  32. data/spec/lexer/refcalls_spec.rb +5 -0
  33. data/spec/parser/all_spec.rb +2 -2
  34. data/spec/parser/errors_spec.rb +1 -1
  35. data/spec/parser/quantifiers_spec.rb +1 -0
  36. data/spec/parser/refcalls_spec.rb +5 -0
  37. data/spec/scanner/escapes_spec.rb +2 -1
  38. data/spec/scanner/groups_spec.rb +10 -1
  39. data/spec/scanner/refcalls_spec.rb +19 -0
  40. data/spec/scanner/sets_spec.rb +57 -14
  41. data/spec/spec_helper.rb +1 -0
  42. metadata +4 -3
@@ -3,6 +3,11 @@
3
3
  include re_char_type "char_type.rl";
4
4
  include re_property "property.rl";
5
5
 
6
+ utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
7
+ utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
8
+ utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
9
+ utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
10
+
6
11
  dot = '.';
7
12
  backslash = '\\';
8
13
  alternation = '|';
@@ -15,7 +20,7 @@
15
20
 
16
21
  group_open = '(';
17
22
  group_close = ')';
18
- parantheses = group_open | group_close;
23
+ parentheses = group_open | group_close;
19
24
 
20
25
  set_open = '[';
21
26
  set_close = ']';
@@ -32,7 +37,7 @@
32
37
  class_posix = ('[:' . '^'? . class_name_posix . ':]');
33
38
 
34
39
 
35
- # these are not supported in ruby, and need verification
40
+ # these are not supported in ruby at the moment
36
41
  collating_sequence = '[.' . (alpha | [\-])+ . '.]';
37
42
  character_equivalent = '[=' . alpha . '=]';
38
43
 
@@ -53,6 +58,8 @@
53
58
 
54
59
  meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
55
60
 
61
+ sequence_char = [CMcux];
62
+
56
63
  zero_or_one = '?' | '??' | '?+';
57
64
  zero_or_more = '*' | '*?' | '*+';
58
65
  one_or_more = '+' | '+?' | '++';
@@ -90,21 +97,26 @@
90
97
  group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
91
98
 
92
99
  group_ref = [gk];
93
- group_name_char = (alnum | '_');
94
- group_name_id = (group_name_char . (group_name_char+)?)?;
95
- group_number = '-'? . [1-9] . ([0-9]+)?;
100
+ group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
101
+ group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
102
+ group_number = '-'? . [1-9] . [0-9]*;
96
103
  group_level = [+\-] . [0-9]+;
97
104
 
98
- group_name = ('<' . group_name_id . '>') | ("'" . group_name_id . "'");
105
+ group_name = ('<' . group_name_id_ab? . '>') |
106
+ ("'" . group_name_id_sq? . "'");
99
107
  group_lookup = group_name | group_number;
100
108
 
101
109
  group_named = ('?' . group_name );
102
110
 
103
- group_name_ref = group_ref . (('<' . group_name_id . group_level? '>') |
104
- ("'" . group_name_id . group_level? "'"));
111
+ group_name_backref = 'k' . (('<' . group_name_id_ab? . group_level? '>') |
112
+ ("'" . group_name_id_sq? . group_level? "'"));
113
+ group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
114
+ ("'" . group_name_id_sq? . group_level? "'"));
105
115
 
106
- group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
107
- ("'" . group_number . group_level? "'"));
116
+ group_number_backref = 'k' . (('<' . group_number . group_level? '>') |
117
+ ("'" . group_number . group_level? "'"));
118
+ group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
119
+ ("'" . ((group_number . group_level?) | '0') "'"));
108
120
 
109
121
  group_type = group_atomic | group_passive | group_absence | group_named;
110
122
 
@@ -115,7 +127,7 @@
115
127
 
116
128
  # characters that 'break' a literal
117
129
  meta_char = dot | backslash | alternation |
118
- curlies | parantheses | brackets |
130
+ curlies | parentheses | brackets |
119
131
  line_anchor | quantifier_greedy;
120
132
 
121
133
  literal_delimiters = ']' | '}';
@@ -123,15 +135,13 @@
123
135
  ascii_print = ((0x20..0x7e) - meta_char - '#');
124
136
  ascii_nonprint = (0x01..0x1f | 0x7f);
125
137
 
126
- utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
127
- utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
128
- utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
129
-
130
138
  non_literal_escape = char_type_char | anchor_char | escaped_ascii |
131
- keep_mark | [xucCM];
139
+ keep_mark | sequence_char;
140
+
141
+ # escapes that also work within a character set
142
+ set_escape = backslash | brackets | escaped_ascii | property_char |
143
+ sequence_char | single_codepoint_char_type;
132
144
 
133
- non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
134
- multi_codepoint_char_type | [0-9cCM];
135
145
 
136
146
  # EOF error, used where it can be detected
137
147
  action premature_end_error {
@@ -226,23 +236,19 @@
226
236
  emit(type, class_name.to_sym, text)
227
237
  };
228
238
 
229
- collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
230
- emit(:set, :collation, copy(data, ts, te))
231
- };
232
-
233
- character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
234
- emit(:set, :equivalent, copy(data, ts, te))
235
- };
239
+ # These are not supported in ruby at the moment. Enable them if they are.
240
+ # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
241
+ # emit(:set, :collation, copy(data, ts, te))
242
+ # };
243
+ # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
244
+ # emit(:set, :equivalent, copy(data, ts, te))
245
+ # };
236
246
 
237
247
  meta_char > (set_meta, 1) {
238
248
  emit(:literal, :literal, copy(data, ts, te))
239
249
  };
240
250
 
241
- any |
242
- ascii_nonprint |
243
- utf8_2_byte |
244
- utf8_3_byte |
245
- utf8_4_byte {
251
+ any | ascii_nonprint | utf8_multibyte {
246
252
  text = copy(data, ts, te)
247
253
  emit(:literal, :literal, text)
248
254
  };
@@ -251,16 +257,16 @@
251
257
  # set escapes scanner
252
258
  # --------------------------------------------------------------------------
253
259
  set_escape_sequence := |*
254
- non_set_escape > (escaped_set_alpha, 2) {
255
- emit(:escape, :literal, copy(data, ts-1, te))
256
- fret;
257
- };
258
-
259
- any > (escaped_set_alpha, 1) {
260
+ set_escape > (escaped_set_alpha, 2) {
260
261
  fhold;
261
262
  fnext character_set;
262
263
  fcall escape_sequence;
263
264
  };
265
+
266
+ any > (escaped_set_alpha, 1) {
267
+ emit(:escape, :literal, copy(data, ts-1, te))
268
+ fret;
269
+ };
264
270
  *|;
265
271
 
266
272
 
@@ -325,7 +331,7 @@
325
331
  fret;
326
332
  };
327
333
 
328
- hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
334
+ hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
329
335
  emit(:escape, :hex, copy(data, ts-1, te))
330
336
  fret;
331
337
  };
@@ -356,10 +362,7 @@
356
362
  fcall unicode_property;
357
363
  };
358
364
 
359
- (any -- non_literal_escape) |
360
- utf8_2_byte |
361
- utf8_3_byte |
362
- utf8_4_byte > (escaped_alpha, 1) {
365
+ (any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
363
366
  emit(:escape, :literal, copy(data, ts-1, te))
364
367
  fret;
365
368
  };
@@ -511,10 +514,10 @@
511
514
  when /^\(\?(?:<>|'')/
512
515
  validation_error(:group, 'named group', 'name is empty')
513
516
 
514
- when /^\(\?<\w*>/
517
+ when /^\(\?<[^>]+>/
515
518
  emit(:group, :named_ab, text)
516
519
 
517
- when /^\(\?'\w*'/
520
+ when /^\(\?'[^']+'/
518
521
  emit(:group, :named_sq, text)
519
522
 
520
523
  end
@@ -543,65 +546,35 @@
543
546
 
544
547
  # Group backreference, named and numbered
545
548
  # ------------------------------------------------------------------------
546
- backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
549
+ backslash . (group_name_backref | group_number_backref) > (backslashed, 4) {
547
550
  case text = copy(data, ts, te)
548
- when /^\\([gk])(<>|'')/ # angle brackets
549
- validation_error(:backref, 'ref/call', 'ref ID is empty')
550
-
551
- when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
552
- if $1 == 'k'
553
- emit(:backref, :name_ref_ab, text)
554
- else
555
- emit(:backref, :name_call_ab, text)
556
- end
557
-
558
- when /^\\([gk])'[^\d+-]\w*'/ #single quotes
559
- if $1 == 'k'
560
- emit(:backref, :name_ref_sq, text)
561
- else
562
- emit(:backref, :name_call_sq, text)
563
- end
564
-
565
- when /^\\([gk])<\d+>/ # angle-brackets
566
- if $1 == 'k'
567
- emit(:backref, :number_ref_ab, text)
568
- else
569
- emit(:backref, :number_call_ab, text)
570
- end
571
-
572
- when /^\\([gk])'\d+'/ # single quotes
573
- if $1 == 'k'
574
- emit(:backref, :number_ref_sq, text)
575
- else
576
- emit(:backref, :number_call_sq, text)
577
- end
578
-
579
- when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
580
- if $1 == 'k'
581
- emit(:backref, :number_rel_ref_ab, text)
582
- else
583
- emit(:backref, :number_rel_call_ab, text)
584
- end
585
-
586
- when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
587
- if $1 == 'k'
588
- emit(:backref, :number_rel_ref_sq, text)
589
- else
590
- emit(:backref, :number_rel_call_sq, text)
591
- end
592
-
593
- when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
594
- emit(:backref, :name_recursion_ref_ab, text)
595
-
596
- when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
597
- emit(:backref, :name_recursion_ref_sq, text)
598
-
599
- when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
600
- emit(:backref, :number_recursion_ref_ab, text)
601
-
602
- when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
603
- emit(:backref, :number_recursion_ref_sq, text)
551
+ when /^\\k(<>|'')/
552
+ validation_error(:backref, 'backreference', 'ref ID is empty')
553
+ when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
554
+ emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
555
+ when /^\\k(.)\d+\D$/
556
+ emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
557
+ when /^\\k(.)-\d+\D$/
558
+ emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
559
+ when /^\\k(.)[^\p{digit}\-].*[+\-]\d+\D$/
560
+ emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
561
+ when /^\\k(.)-?\d+[+\-]\d+\D$/
562
+ emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
563
+ end
564
+ };
604
565
 
566
+ # Group call, named and numbered
567
+ # ------------------------------------------------------------------------
568
+ backslash . (group_name_call | group_number_call) > (backslashed, 4) {
569
+ case text = copy(data, ts, te)
570
+ when /^\\g(<>|'')/
571
+ validation_error(:backref, 'subexpression call', 'ref ID is empty')
572
+ when /^\\g(.)[^\p{digit}+\->][^+\-]*/
573
+ emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
574
+ when /^\\g(.)\d+\D$/
575
+ emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
576
+ when /^\\g(.)[+-]\d+/
577
+ emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
605
578
  end
606
579
  };
607
580
 
@@ -668,11 +641,7 @@
668
641
  # Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
669
642
  # except meta characters.
670
643
  # ------------------------------------------------------------------------
671
- (ascii_print -- space)+ |
672
- ascii_nonprint+ |
673
- utf8_2_byte+ |
674
- utf8_3_byte+ |
675
- utf8_4_byte+ {
644
+ (ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
676
645
  append_literal(data, ts, te)
677
646
  };
678
647
 
@@ -682,12 +651,14 @@
682
651
  # THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
683
652
  # This file was generated from lib/regexp_parser/scanner/scanner.rl
684
653
 
654
+ require 'regexp_parser/error'
655
+
685
656
  class Regexp::Scanner
686
657
  # General scanner error (catch all)
687
- class ScannerError < StandardError; end
658
+ class ScannerError < Regexp::Parser::Error; end
688
659
 
689
660
  # Base for all scanner validation errors
690
- class ValidationError < StandardError
661
+ class ValidationError < Regexp::Parser::Error
691
662
  def initialize(reason)
692
663
  super reason
693
664
  end
@@ -789,14 +760,13 @@ class Regexp::Scanner
789
760
 
790
761
  # lazy-load property maps when first needed
791
762
  require 'yaml'
792
- PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
793
763
 
794
764
  def self.short_prop_map
795
- @short_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/short.yml")
765
+ @short_prop_map ||= YAML.load_file("#{__dir__}/scanner/properties/short.yml")
796
766
  end
797
767
 
798
768
  def self.long_prop_map
799
- @long_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/long.yml")
769
+ @long_prop_map ||= YAML.load_file("#{__dir__}/scanner/properties/long.yml")
800
770
  end
801
771
 
802
772
  # Emits an array with the details of the scanned pattern
@@ -1,9 +1,11 @@
1
- require File.expand_path('../syntax/tokens', __FILE__)
2
- require File.expand_path('../syntax/base', __FILE__)
3
- require File.expand_path('../syntax/any', __FILE__)
4
- require File.expand_path('../syntax/version_lookup', __FILE__)
5
- require File.expand_path('../syntax/versions', __FILE__)
1
+ require 'regexp_parser/error'
6
2
 
7
3
  module Regexp::Syntax
8
- class SyntaxError < StandardError; end
4
+ class SyntaxError < Regexp::Parser::Error; end
9
5
  end
6
+
7
+ require_relative 'syntax/tokens'
8
+ require_relative 'syntax/base'
9
+ require_relative 'syntax/any'
10
+ require_relative 'syntax/version_lookup'
11
+ require_relative 'syntax/versions'
@@ -4,12 +4,12 @@ module Regexp::Syntax
4
4
  # is useful during development, testing, and should be useful for some types
5
5
  # of transformations as well.
6
6
  class Any < Base
7
- def initialize
7
+ def initialize # rubocop:disable Lint/MissingSuper
8
8
  @implements = { :* => [:*] }
9
9
  end
10
10
 
11
- def implements?(type, token) true end
12
- def implements!(type, token) true end
11
+ def implements?(_type, _token) true end
12
+ def implements!(_type, _token) true end
13
13
  end
14
14
 
15
15
  end
@@ -1,7 +1,7 @@
1
1
  require 'set'
2
2
 
3
3
  module Regexp::Syntax
4
- class NotImplementedError < SyntaxError
4
+ class NotImplementedError < Regexp::Syntax::SyntaxError
5
5
  def initialize(syntax, type, token)
6
6
  super "#{syntax.class.name} does not implement: [#{type}:#{token}]"
7
7
  end
@@ -3,13 +3,13 @@ module Regexp::Syntax
3
3
  VERSION_REGEXP = /#{VERSION_FORMAT}/
4
4
  VERSION_CONST_REGEXP = /\AV\d+_\d+(?:_\d+)?\z/
5
5
 
6
- class InvalidVersionNameError < SyntaxError
6
+ class InvalidVersionNameError < Regexp::Syntax::SyntaxError
7
7
  def initialize(name)
8
8
  super "Invalid version name '#{name}'. Expected format is '#{VERSION_FORMAT}'"
9
9
  end
10
10
  end
11
11
 
12
- class UnknownSyntaxNameError < SyntaxError
12
+ class UnknownSyntaxNameError < Regexp::Syntax::SyntaxError
13
13
  def initialize(name)
14
14
  super "Unknown syntax name '#{name}'."
15
15
  end
@@ -1,5 +1,5 @@
1
1
  class Regexp
2
2
  class Parser
3
- VERSION = '2.0.0'
3
+ VERSION = '2.1.1'
4
4
  end
5
5
  end
@@ -27,8 +27,8 @@ RSpec.describe('Expression#clone') do
27
27
  expect(root_2.quantifier.object_id).not_to eq copy_2.quantifier.object_id
28
28
 
29
29
  # regression test
30
- expect { root_2.clone }.not_to change { root_2.quantifier.object_id }
31
- expect { root_2.clone }.not_to change { root_2.quantifier.text.object_id }
30
+ expect { root_2.clone }.not_to(change { root_2.quantifier.object_id })
31
+ expect { root_2.clone }.not_to(change { root_2.quantifier.text.object_id })
32
32
  end
33
33
 
34
34
  specify('Subexpression#clone') do
@@ -48,7 +48,7 @@ RSpec.describe('Expression#clone') do
48
48
  end
49
49
 
50
50
  # regression test
51
- expect { root.clone }.not_to change { root.expressions.object_id }
51
+ expect { root.clone }.not_to(change { root.expressions.object_id })
52
52
  end
53
53
 
54
54
  specify('Group::Named#clone') do
@@ -69,7 +69,39 @@ RSpec.describe('Expression#clone') do
69
69
  end
70
70
 
71
71
  # regression test
72
- expect { root_1.clone }.not_to change { root_1.name.object_id }
72
+ expect { root_1.clone }.not_to(change { root_1.name.object_id })
73
+ end
74
+
75
+ specify('Group::Options#clone') do
76
+ root = RP.parse('foo(?i)bar')
77
+ copy = root.clone
78
+
79
+ expect(copy.to_s).to eq root.to_s
80
+
81
+ root_1 = root[1]
82
+ copy_1 = copy[1]
83
+
84
+ expect(root_1.option_changes).to eq copy_1.option_changes
85
+ expect(root_1.option_changes.object_id).not_to eq copy_1.option_changes.object_id
86
+
87
+ # regression test
88
+ expect { root_1.clone }.not_to(change { root_1.option_changes.object_id })
89
+ end
90
+
91
+ specify('Backreference::Base#clone') do
92
+ root = RP.parse('(foo)\1')
93
+ copy = root.clone
94
+
95
+ expect(copy.to_s).to eq root.to_s
96
+
97
+ root_1 = root[1]
98
+ copy_1 = copy[1]
99
+
100
+ expect(root_1.referenced_expression.to_s).to eq copy_1.referenced_expression.to_s
101
+ expect(root_1.referenced_expression.object_id).not_to eq copy_1.referenced_expression.object_id
102
+
103
+ # regression test
104
+ expect { root_1.clone }.not_to(change { root_1.referenced_expression.object_id })
73
105
  end
74
106
 
75
107
  specify('Sequence#clone') do