regexp_parser 2.10.0 → 2.11.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. checksums.yaml +4 -4
  2. data/Gemfile +2 -0
  3. data/LICENSE +1 -1
  4. data/Rakefile +5 -3
  5. data/lib/regexp_parser/error.rb +2 -0
  6. data/lib/regexp_parser/expression/base.rb +2 -0
  7. data/lib/regexp_parser/expression/classes/alternation.rb +2 -0
  8. data/lib/regexp_parser/expression/classes/anchor.rb +2 -0
  9. data/lib/regexp_parser/expression/classes/backreference.rb +2 -0
  10. data/lib/regexp_parser/expression/classes/character_set/intersection.rb +2 -0
  11. data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -0
  12. data/lib/regexp_parser/expression/classes/character_set.rb +2 -0
  13. data/lib/regexp_parser/expression/classes/character_type.rb +2 -0
  14. data/lib/regexp_parser/expression/classes/conditional.rb +2 -0
  15. data/lib/regexp_parser/expression/classes/escape_sequence.rb +3 -0
  16. data/lib/regexp_parser/expression/classes/free_space.rb +2 -0
  17. data/lib/regexp_parser/expression/classes/group.rb +2 -0
  18. data/lib/regexp_parser/expression/classes/keep.rb +2 -0
  19. data/lib/regexp_parser/expression/classes/literal.rb +2 -0
  20. data/lib/regexp_parser/expression/classes/posix_class.rb +2 -0
  21. data/lib/regexp_parser/expression/classes/root.rb +2 -0
  22. data/lib/regexp_parser/expression/classes/unicode_property.rb +2 -0
  23. data/lib/regexp_parser/expression/methods/construct.rb +2 -0
  24. data/lib/regexp_parser/expression/methods/escape_sequence_char.rb +2 -0
  25. data/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb +8 -0
  26. data/lib/regexp_parser/expression/methods/human_name.rb +2 -0
  27. data/lib/regexp_parser/expression/methods/match.rb +2 -0
  28. data/lib/regexp_parser/expression/methods/match_length.rb +2 -0
  29. data/lib/regexp_parser/expression/methods/negative.rb +2 -0
  30. data/lib/regexp_parser/expression/methods/options.rb +2 -0
  31. data/lib/regexp_parser/expression/methods/parts.rb +2 -0
  32. data/lib/regexp_parser/expression/methods/printing.rb +2 -0
  33. data/lib/regexp_parser/expression/methods/referenced_expressions.rb +2 -0
  34. data/lib/regexp_parser/expression/methods/strfregexp.rb +2 -0
  35. data/lib/regexp_parser/expression/methods/tests.rb +2 -0
  36. data/lib/regexp_parser/expression/methods/traverse.rb +2 -0
  37. data/lib/regexp_parser/expression/quantifier.rb +3 -1
  38. data/lib/regexp_parser/expression/sequence.rb +2 -0
  39. data/lib/regexp_parser/expression/sequence_operation.rb +2 -0
  40. data/lib/regexp_parser/expression/shared.rb +6 -3
  41. data/lib/regexp_parser/expression/subexpression.rb +2 -0
  42. data/lib/regexp_parser/expression.rb +2 -0
  43. data/lib/regexp_parser/lexer.rb +2 -0
  44. data/lib/regexp_parser/parser.rb +3 -0
  45. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +2 -0
  46. data/lib/regexp_parser/scanner/errors/scanner_error.rb +2 -0
  47. data/lib/regexp_parser/scanner/errors/validation_error.rb +2 -0
  48. data/lib/regexp_parser/scanner/properties/long.csv +19 -0
  49. data/lib/regexp_parser/scanner/properties/short.csv +8 -0
  50. data/lib/regexp_parser/scanner/scanner.rl +43 -7
  51. data/lib/regexp_parser/scanner.rb +509 -471
  52. data/lib/regexp_parser/syntax/any.rb +2 -0
  53. data/lib/regexp_parser/syntax/base.rb +2 -0
  54. data/lib/regexp_parser/syntax/token/anchor.rb +5 -3
  55. data/lib/regexp_parser/syntax/token/assertion.rb +4 -2
  56. data/lib/regexp_parser/syntax/token/backreference.rb +8 -6
  57. data/lib/regexp_parser/syntax/token/character_set.rb +3 -1
  58. data/lib/regexp_parser/syntax/token/character_type.rb +6 -4
  59. data/lib/regexp_parser/syntax/token/conditional.rb +5 -3
  60. data/lib/regexp_parser/syntax/token/escape.rb +9 -7
  61. data/lib/regexp_parser/syntax/token/group.rb +8 -6
  62. data/lib/regexp_parser/syntax/token/keep.rb +3 -1
  63. data/lib/regexp_parser/syntax/token/meta.rb +4 -2
  64. data/lib/regexp_parser/syntax/token/posix_class.rb +4 -2
  65. data/lib/regexp_parser/syntax/token/quantifier.rb +8 -6
  66. data/lib/regexp_parser/syntax/token/unicode_property.rb +62 -47
  67. data/lib/regexp_parser/syntax/token/virtual.rb +5 -3
  68. data/lib/regexp_parser/syntax/token.rb +5 -3
  69. data/lib/regexp_parser/syntax/version_lookup.rb +4 -2
  70. data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -0
  71. data/lib/regexp_parser/syntax/versions/1.9.1.rb +2 -0
  72. data/lib/regexp_parser/syntax/versions/1.9.3.rb +2 -0
  73. data/lib/regexp_parser/syntax/versions/2.0.0.rb +2 -0
  74. data/lib/regexp_parser/syntax/versions/2.2.0.rb +2 -0
  75. data/lib/regexp_parser/syntax/versions/2.3.0.rb +2 -0
  76. data/lib/regexp_parser/syntax/versions/2.4.0.rb +2 -0
  77. data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -0
  78. data/lib/regexp_parser/syntax/versions/2.5.0.rb +2 -0
  79. data/lib/regexp_parser/syntax/versions/2.6.0.rb +2 -0
  80. data/lib/regexp_parser/syntax/versions/2.6.2.rb +2 -0
  81. data/lib/regexp_parser/syntax/versions/2.6.3.rb +2 -0
  82. data/lib/regexp_parser/syntax/versions/3.1.0.rb +2 -0
  83. data/lib/regexp_parser/syntax/versions/3.2.0.rb +2 -0
  84. data/lib/regexp_parser/syntax/versions/3.5.0.rb +4 -0
  85. data/lib/regexp_parser/syntax/versions.rb +2 -0
  86. data/lib/regexp_parser/syntax.rb +2 -0
  87. data/lib/regexp_parser/token.rb +2 -0
  88. data/lib/regexp_parser/version.rb +3 -1
  89. data/lib/regexp_parser.rb +2 -0
  90. data/regexp_parser.gemspec +2 -0
  91. metadata +4 -6
@@ -37,7 +37,8 @@
37
37
  octal_sequence = [0-7]{1,3};
38
38
 
39
39
  hex_sequence = 'x' . xdigit{1,2};
40
- hex_sequence_err = 'x' . [^0-9a-fA-F{];
40
+ hex_sequence_err = 'x' . [^0-9A-Fa-f];
41
+ high_hex_sequence = 'x' . [89A-Fa-f] . xdigit . ( '\\x' . [89A-Fa-f] . xdigit )*;
41
42
 
42
43
  codepoint_single = 'u' . xdigit{4};
43
44
  codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
@@ -210,7 +211,7 @@
210
211
  type = :nonposixclass
211
212
  end
212
213
 
213
- unless self.class.posix_classes.include?(class_name)
214
+ unless POSIX_CLASSES[class_name]
214
215
  raise ValidationError.for(:posix_class, text)
215
216
  end
216
217
 
@@ -256,9 +257,21 @@
256
257
  # escape sequence scanner
257
258
  # --------------------------------------------------------------------------
258
259
  escape_sequence := |*
259
- [1-9] {
260
+ [1-9] . [0-9]* {
260
261
  text = copy(data, ts-1, te)
261
- emit(:backref, :number, text)
262
+
263
+ # If not enough groups have been opened, there is a fallback to either an
264
+ # octal or literal interpretation for 2+ digit numerical escapes.
265
+ digits = text[1..-1]
266
+ if digits.size == 1 || digits.to_i <= capturing_group_count
267
+ emit(:backref, :number, text)
268
+ elsif digits =~ /\A[0-7]{2,}\z/
269
+ emit(:escape, :octal, text)
270
+ else
271
+ emit(:escape, :literal, text[0..1])
272
+ emit(:literal, :literal, text[2..-1])
273
+ end
274
+
262
275
  fret;
263
276
  };
264
277
 
@@ -321,6 +334,16 @@
321
334
  fret;
322
335
  };
323
336
 
337
+ high_hex_sequence > (escaped_alpha, 5) {
338
+ text = copy(data, ts-1, te)
339
+ if regexp_encoding == Encoding::BINARY
340
+ text.split(/(?=\\)/).each { |part| emit(:escape, :hex, part) }
341
+ else
342
+ emit(:escape, :utf8_hex, text)
343
+ end
344
+ fret;
345
+ };
346
+
324
347
  hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
325
348
  emit(:escape, :hex, copy(data, ts-1, te))
326
349
  fret;
@@ -514,6 +537,7 @@
514
537
  };
515
538
 
516
539
  group_open @group_opened {
540
+ self.capturing_group_count = capturing_group_count + 1
517
541
  text = copy(data, ts, te)
518
542
  emit(:group, :capture, text)
519
543
  };
@@ -662,6 +686,7 @@ class Regexp::Scanner
662
686
 
663
687
  input = input_object.is_a?(Regexp) ? input_object.source : input_object
664
688
  self.free_spacing = free_spacing?(input_object, options)
689
+ self.regexp_encoding = extract_encoding(input_object, options)
665
690
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
666
691
 
667
692
  data = input.unpack("c*")
@@ -672,6 +697,7 @@ class Regexp::Scanner
672
697
 
673
698
  self.set_depth = 0
674
699
  self.group_depth = 0
700
+ self.capturing_group_count = 0
675
701
  self.conditional_stack = []
676
702
  self.char_pos = 0
677
703
 
@@ -711,10 +737,11 @@ class Regexp::Scanner
711
737
  File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
712
738
  end
713
739
 
714
- def self.posix_classes
740
+ # Use each_with_object for required_ruby_version >= 2.2, or #to_h for >= 2.6
741
+ POSIX_CLASSES =
715
742
  %w[alnum alpha ascii blank cntrl digit graph
716
743
  lower print punct space upper word xdigit]
717
- end
744
+ .inject({}) { |o, e| o.merge(e => true) }.freeze
718
745
 
719
746
  # Emits an array with the details of the scanned pattern
720
747
  def emit(type, token, text)
@@ -742,16 +769,25 @@ class Regexp::Scanner
742
769
  end
743
770
  end
744
771
 
745
- attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
772
+ attr_accessor :capturing_group_count, :literal_run # only public for #||= to work on ruby <= 2.5
746
773
 
747
774
  private
748
775
 
749
776
  attr_accessor :block,
750
777
  :collect_tokens, :tokens, :prev_token,
751
778
  :free_spacing, :spacing_stack,
779
+ :regexp_encoding,
752
780
  :group_depth, :set_depth, :conditional_stack,
753
781
  :char_pos
754
782
 
783
# Determines which encoding the scanner should assume for the pattern.
#
# @param input_object [Regexp, String] the pattern handed to the scanner
# @param options [Integer, nil] Regexp option bit flags (combined with
#   bitwise OR), or nil when no options were given
# @return [Encoding, nil] the Regexp's own encoding when a Regexp is given;
#   Encoding::BINARY when the NOENCODING flag is set in +options+;
#   otherwise nil
def extract_encoding(input_object, options)
  return input_object.encoding if input_object.is_a?(::Regexp)

  # Integer#& returns 0 when the flag is absent, and 0 is truthy in Ruby,
  # so the masked value must be compared against zero explicitly. Without
  # the comparison, any non-nil options value would select BINARY.
  Encoding::BINARY if options && (options & Regexp::NOENCODING) != 0
end
790
+
755
791
  def free_spacing?(input_object, options)
756
792
  if options && !input_object.is_a?(String)
757
793
  raise ArgumentError, 'options cannot be supplied unless scanning a String'