regexp_parser 2.4.0 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +98 -42
  3. data/README.md +46 -30
  4. data/lib/regexp_parser/expression/base.rb +17 -9
  5. data/lib/regexp_parser/expression/classes/backreference.rb +19 -2
  6. data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -0
  7. data/lib/regexp_parser/expression/classes/conditional.rb +8 -0
  8. data/lib/regexp_parser/expression/classes/escape_sequence.rb +1 -1
  9. data/lib/regexp_parser/expression/classes/group.rb +10 -0
  10. data/lib/regexp_parser/expression/classes/keep.rb +2 -0
  11. data/lib/regexp_parser/expression/classes/root.rb +3 -5
  12. data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +1 -0
  13. data/lib/regexp_parser/expression/methods/construct.rb +43 -0
  14. data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
  15. data/lib/regexp_parser/expression/methods/match_length.rb +9 -5
  16. data/lib/regexp_parser/expression/methods/traverse.rb +6 -3
  17. data/lib/regexp_parser/expression/quantifier.rb +6 -5
  18. data/lib/regexp_parser/expression/sequence.rb +6 -21
  19. data/lib/regexp_parser/expression/shared.rb +20 -3
  20. data/lib/regexp_parser/expression/subexpression.rb +4 -1
  21. data/lib/regexp_parser/expression.rb +4 -2
  22. data/lib/regexp_parser/lexer.rb +61 -29
  23. data/lib/regexp_parser/parser.rb +36 -26
  24. data/lib/regexp_parser/scanner/property.rl +1 -1
  25. data/lib/regexp_parser/scanner/scanner.rl +57 -42
  26. data/lib/regexp_parser/scanner.rb +873 -823
  27. data/lib/regexp_parser/syntax/token/escape.rb +1 -1
  28. data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
  29. data/lib/regexp_parser/syntax/versions.rb +2 -0
  30. data/lib/regexp_parser/version.rb +1 -1
  31. metadata +7 -5
@@ -59,9 +59,6 @@
59
59
  one_or_more = '+' | '+?' | '++';
60
60
 
61
61
  quantifier_greedy = '?' | '*' | '+';
62
- quantifier_reluctant = '??' | '*?' | '+?';
63
- quantifier_possessive = '?+' | '*+' | '++';
64
- quantifier_mode = '?' | '+';
65
62
 
66
63
  quantity_exact = (digit+);
67
64
  quantity_minimum = (digit+) . ',';
@@ -70,9 +67,6 @@
70
67
  quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
71
68
  quantity_maximum | quantity_range ) . range_close;
72
69
 
73
- quantifiers = quantifier_greedy | quantifier_reluctant |
74
- quantifier_possessive | quantifier_interval;
75
-
76
70
  conditional = '(?(';
77
71
 
78
72
  group_comment = '?#' . [^)]* . group_close;
@@ -90,8 +84,8 @@
90
84
  group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
91
85
 
92
86
  group_ref = [gk];
93
- group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
94
- group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
87
+ group_name_id_ab = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
88
+ group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
95
89
  group_number = '-'? . [1-9] . [0-9]*;
96
90
  group_level = [+\-] . [0-9]+;
97
91
 
@@ -132,7 +126,8 @@
132
126
  keep_mark | sequence_char;
133
127
 
134
128
  # escapes that also work within a character set
135
- set_escape = backslash | brackets | escaped_ascii | property_char |
129
+ set_escape = backslash | brackets | escaped_ascii |
130
+ octal_sequence | property_char |
136
131
  sequence_char | single_codepoint_char_type;
137
132
 
138
133
 
@@ -168,8 +163,8 @@
168
163
  };
169
164
 
170
165
  '-]' @set_closed { # special case, emits two tokens
171
- emit(:literal, :literal, copy(data, ts, te-1))
172
- emit(:set, :close, copy(data, ts+1, te))
166
+ emit(:literal, :literal, '-')
167
+ emit(:set, :close, ']')
173
168
  if in_set?
174
169
  fret;
175
170
  else
@@ -183,28 +178,27 @@
183
178
  };
184
179
 
185
180
  '^' {
186
- text = copy(data, ts, te)
187
- if tokens.last[1] == :open
188
- emit(:set, :negate, text)
181
+ if prev_token[1] == :open
182
+ emit(:set, :negate, '^')
189
183
  else
190
- emit(:literal, :literal, text)
184
+ emit(:literal, :literal, '^')
191
185
  end
192
186
  };
193
187
 
194
188
  '-' {
195
- text = copy(data, ts, te)
196
- # ranges cant start with a subset or intersection/negation/range operator
197
- if tokens.last[0] == :set
198
- emit(:literal, :literal, text)
189
+ # ranges cant start with the opening bracket, a subset, or
190
+ # intersection/negation/range operators
191
+ if prev_token[0] == :set
192
+ emit(:literal, :literal, '-')
199
193
  else
200
- emit(:set, :range, text)
194
+ emit(:set, :range, '-')
201
195
  end
202
196
  };
203
197
 
204
198
  # Unlike ranges, intersections can start or end at set boundaries, whereupon
205
199
  # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
206
200
  '&&' {
207
- emit(:set, :intersection, copy(data, ts, te))
201
+ emit(:set, :intersection, '&&')
208
202
  };
209
203
 
210
204
  backslash {
@@ -212,7 +206,7 @@
212
206
  };
213
207
 
214
208
  set_open >(open_bracket, 1) >set_opened {
215
- emit(:set, :open, copy(data, ts, te))
209
+ emit(:set, :open, '[')
216
210
  fcall character_set;
217
211
  };
218
212
 
@@ -254,12 +248,22 @@
254
248
  # set escapes scanner
255
249
  # --------------------------------------------------------------------------
256
250
  set_escape_sequence := |*
251
+ # Special case: in sets, octal sequences have higher priority than backrefs
252
+ octal_sequence {
253
+ emit(:escape, :octal, copy(data, ts-1, te))
254
+ fret;
255
+ };
256
+
257
+ # Scan all other escapes that work in sets with the generic escape scanner
257
258
  set_escape > (escaped_set_alpha, 2) {
258
259
  fhold;
259
260
  fnext character_set;
260
261
  fcall escape_sequence;
261
262
  };
262
263
 
264
+ # Treat all remaining escapes - those not supported in sets - as literal.
265
+ # (This currently includes \^, \-, \&, \:, although these could potentially
266
+ # be meta chars when not escaped, depending on their position in the set.)
263
267
  any > (escaped_set_alpha, 1) {
264
268
  emit(:escape, :literal, copy(data, ts-1, te))
265
269
  fret;
@@ -528,7 +532,7 @@
528
532
  group_close @group_closed {
529
533
  if conditional_stack.last == group_depth + 1
530
534
  conditional_stack.pop
531
- emit(:conditional, :close, copy(data, ts, te))
535
+ emit(:conditional, :close, ')')
532
536
  else
533
537
  if spacing_stack.length > 1 &&
534
538
  spacing_stack.last[:depth] == group_depth + 1
@@ -536,7 +540,7 @@
536
540
  self.free_spacing = spacing_stack.last[:free_spacing]
537
541
  end
538
542
 
539
- emit(:group, :close, copy(data, ts, te))
543
+ emit(:group, :close, ')')
540
544
  end
541
545
  };
542
546
 
@@ -717,23 +721,24 @@ class Regexp::Scanner
717
721
  #
718
722
  # This method may raise errors if a syntax error is encountered.
719
723
  # --------------------------------------------------------------------------
720
- def self.scan(input_object, options: nil, &block)
721
- new.scan(input_object, options: options, &block)
724
+ def self.scan(input_object, options: nil, collect_tokens: true, &block)
725
+ new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
722
726
  end
723
727
 
724
- def scan(input_object, options: nil, &block)
725
- self.literal = nil
728
+ def scan(input_object, options: nil, collect_tokens: true, &block)
729
+ self.collect_tokens = collect_tokens
730
+ self.literal_run = nil
726
731
  stack = []
727
732
 
728
733
  input = input_object.is_a?(Regexp) ? input_object.source : input_object
729
734
  self.free_spacing = free_spacing?(input_object, options)
730
735
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
731
736
 
732
- data = input.unpack("c*") if input.is_a?(String)
737
+ data = input.unpack("c*")
733
738
  eof = data.length
734
739
 
735
740
  self.tokens = []
736
- self.block = block_given? ? block : nil
741
+ self.block = block
737
742
 
738
743
  self.set_depth = 0
739
744
  self.group_depth = 0
@@ -758,7 +763,7 @@ class Regexp::Scanner
758
763
  "[#{set_depth}]") if in_set?
759
764
 
760
765
  # when the entire expression is a literal run
761
- emit_literal if literal
766
+ emit_literal if literal_run
762
767
 
763
768
  tokens
764
769
  end
@@ -785,26 +790,37 @@ class Regexp::Scanner
785
790
  def emit(type, token, text)
786
791
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
787
792
 
788
- emit_literal if literal
793
+ emit_literal if literal_run
789
794
 
790
795
  # Ragel runs with byte-based indices (ts, te). These are of little value to
791
796
  # end-users, so we keep track of char-based indices and emit those instead.
792
797
  ts_char_pos = char_pos
793
798
  te_char_pos = char_pos + text.length
794
799
 
795
- if block
796
- block.call type, token, text, ts_char_pos, te_char_pos
797
- end
800
+ tok = [type, token, text, ts_char_pos, te_char_pos]
798
801
 
799
- tokens << [type, token, text, ts_char_pos, te_char_pos]
802
+ self.prev_token = tok
800
803
 
801
804
  self.char_pos = te_char_pos
805
+
806
+ if block
807
+ block.call type, token, text, ts_char_pos, te_char_pos
808
+ # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
809
+ tokens << tok if collect_tokens
810
+ elsif collect_tokens
811
+ tokens << tok
812
+ end
802
813
  end
803
814
 
815
+ attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
816
+
804
817
  private
805
818
 
806
- attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
807
- :group_depth, :set_depth, :conditional_stack, :char_pos
819
+ attr_accessor :block,
820
+ :collect_tokens, :tokens, :prev_token,
821
+ :free_spacing, :spacing_stack,
822
+ :group_depth, :set_depth, :conditional_stack,
823
+ :char_pos
808
824
 
809
825
  def free_spacing?(input_object, options)
810
826
  if options && !input_object.is_a?(String)
@@ -834,14 +850,13 @@ class Regexp::Scanner
834
850
  # Appends one or more characters to the literal buffer, to be emitted later
835
851
  # by a call to emit_literal.
836
852
  def append_literal(data, ts, te)
837
- self.literal = literal || []
838
- literal << copy(data, ts, te)
853
+ (self.literal_run ||= []) << copy(data, ts, te)
839
854
  end
840
855
 
841
856
  # Emits the literal run collected by calls to the append_literal method.
842
857
  def emit_literal
843
- text = literal.join
844
- self.literal = nil
858
+ text = literal_run.join
859
+ self.literal_run = nil
845
860
  emit(:literal, :literal, text)
846
861
  end
847
862