regexp_parser 2.10.0 → 2.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f6ed5457d89738fa1076cf3875cd2d009973f02857ea68e055ef3ef74a78dc91
4
- data.tar.gz: d67eb5f0cb37ad106574b2ae327eefcfc13c9d585cddec6661898f4d8166ebcc
3
+ metadata.gz: 4cb66cfbf1c78a46f36cb24a7cbc9e04b0bc96aa1285fe81de79cec4bfd1c2c1
4
+ data.tar.gz: f650a1b30acac1298186dce0818eede9944e3b5117e794801abd0576d7b37b9e
5
5
  SHA512:
6
- metadata.gz: 6b8adbc3c4707fc4c823456ae1d7547f17568802de03008a17fef18a5f95af08b0e42d48ccdfab25a740603a58ab89c036d70cec94405701201e5a5af51ce392
7
- data.tar.gz: 9bea98a42ab64a9b45ddc5564cd077d7eb6d2ddc293844759bb8001aa9fefd8aa26b0e03fff7a286ccde9f7aeacacda9fbb187fe04082749d3c2605e0cece7b9
6
+ metadata.gz: 1b87b74cafd00c2a8a3fe5a44942a005a4756974363c916c650c18e74df719920537bfaecdf21080aed339d24f1988444940d96fd66dc6af847498c04efbc033
7
+ data.tar.gz: 40e7f8357bd2ff7485c7d7105d852b6c615eaf8902b787b611953df637e246a53990acdc6e1b7f7ff2dd350edf749bd012352981b908de2bbcbee0bc59714513
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2010, 2012-2024, Ammar Ali
1
+ Copyright (c) 2010, 2012-2025, Ammar Ali
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person
4
4
  obtaining a copy of this software and associated documentation
@@ -18,6 +18,7 @@ module Regexp::Expression
18
18
  Codepoint = Class.new(Base) # e.g. \u000A
19
19
 
20
20
  CodepointList = Class.new(Base) # e.g. \u{A B}
21
+ UTF8Hex = Class.new(Base) # e.g. \xE2\x82\xAC
21
22
 
22
23
  AbstractMetaControlSequence = Class.new(Base)
23
24
  Control = Class.new(AbstractMetaControlSequence) # e.g. \cB
@@ -15,6 +15,12 @@ module Regexp::Expression::EscapeSequence
15
15
  Hex.class_eval { def codepoint; text[/\h+/].hex end }
16
16
  Codepoint.class_eval { def codepoint; text[/\h+/].hex end }
17
17
 
18
+ UTF8Hex.class_eval do
19
+ def codepoint
20
+ text.scan(/\h+/).map(&:hex).pack('C*').force_encoding('utf-8').ord
21
+ end
22
+ end
23
+
18
24
  CodepointList.class_eval do
19
25
  # Maybe this should be a unique top-level expression class?
20
26
  def char
@@ -319,6 +319,7 @@ class Regexp::Parser
319
319
  when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
320
320
  when :hex; node << EscapeSequence::Hex.new(token, active_opts)
321
321
  when :octal; node << EscapeSequence::Octal.new(token, active_opts)
322
+ when :utf8_hex; node << EscapeSequence::UTF8Hex.new(token, active_opts)
322
323
 
323
324
  when :control
324
325
  if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
@@ -37,7 +37,8 @@
37
37
  octal_sequence = [0-7]{1,3};
38
38
 
39
39
  hex_sequence = 'x' . xdigit{1,2};
40
- hex_sequence_err = 'x' . [^0-9a-fA-F{];
40
+ hex_sequence_err = 'x' . [^0-9A-Fa-f];
41
+ high_hex_sequence = 'x' . [89A-Fa-f] . xdigit . ( '\\x' . [89A-Fa-f] . xdigit )*;
41
42
 
42
43
  codepoint_single = 'u' . xdigit{4};
43
44
  codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
@@ -210,7 +211,7 @@
210
211
  type = :nonposixclass
211
212
  end
212
213
 
213
- unless self.class.posix_classes.include?(class_name)
214
+ unless POSIX_CLASSES[class_name]
214
215
  raise ValidationError.for(:posix_class, text)
215
216
  end
216
217
 
@@ -256,9 +257,21 @@
256
257
  # escape sequence scanner
257
258
  # --------------------------------------------------------------------------
258
259
  escape_sequence := |*
259
- [1-9] {
260
+ [1-9] . [0-9]* {
260
261
  text = copy(data, ts-1, te)
261
- emit(:backref, :number, text)
262
+
263
+ # If not enough groups have been opened, there is a fallback to either an
264
+ # octal or literal interpretation for 2+ digit numerical escapes.
265
+ digits = text[1..-1]
266
+ if digits.size == 1 || digits.to_i <= self.capturing_group_count
267
+ emit(:backref, :number, text)
268
+ elsif digits =~ /\A[0-7]{2,}\z/
269
+ emit(:escape, :octal, text)
270
+ else
271
+ emit(:escape, :literal, text[0..1])
272
+ emit(:literal, :literal, text[2..-1])
273
+ end
274
+
262
275
  fret;
263
276
  };
264
277
 
@@ -321,6 +334,16 @@
321
334
  fret;
322
335
  };
323
336
 
337
+ high_hex_sequence > (escaped_alpha, 5) {
338
+ text = copy(data, ts-1, te)
339
+ if regexp_encoding == Encoding::BINARY
340
+ text.split(/(?=\\)/).each { |part| emit(:escape, :hex, part) }
341
+ else
342
+ emit(:escape, :utf8_hex, text)
343
+ end
344
+ fret;
345
+ };
346
+
324
347
  hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
325
348
  emit(:escape, :hex, copy(data, ts-1, te))
326
349
  fret;
@@ -514,6 +537,7 @@
514
537
  };
515
538
 
516
539
  group_open @group_opened {
540
+ self.capturing_group_count += 1
517
541
  text = copy(data, ts, te)
518
542
  emit(:group, :capture, text)
519
543
  };
@@ -662,6 +686,7 @@ class Regexp::Scanner
662
686
 
663
687
  input = input_object.is_a?(Regexp) ? input_object.source : input_object
664
688
  self.free_spacing = free_spacing?(input_object, options)
689
+ self.regexp_encoding = input_object.encoding if input_object.is_a?(::Regexp)
665
690
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
666
691
 
667
692
  data = input.unpack("c*")
@@ -672,6 +697,7 @@ class Regexp::Scanner
672
697
 
673
698
  self.set_depth = 0
674
699
  self.group_depth = 0
700
+ self.capturing_group_count = 0
675
701
  self.conditional_stack = []
676
702
  self.char_pos = 0
677
703
 
@@ -711,10 +737,9 @@ class Regexp::Scanner
711
737
  File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
712
738
  end
713
739
 
714
- def self.posix_classes
740
+ POSIX_CLASSES =
715
741
  %w[alnum alpha ascii blank cntrl digit graph
716
- lower print punct space upper word xdigit]
717
- end
742
+ lower print punct space upper word xdigit].to_h { |c| [c, true] }.freeze
718
743
 
719
744
  # Emits an array with the details of the scanned pattern
720
745
  def emit(type, token, text)
@@ -749,7 +774,9 @@ class Regexp::Scanner
749
774
  attr_accessor :block,
750
775
  :collect_tokens, :tokens, :prev_token,
751
776
  :free_spacing, :spacing_stack,
777
+ :regexp_encoding,
752
778
  :group_depth, :set_depth, :conditional_stack,
779
+ :capturing_group_count,
753
780
  :char_pos
754
781
 
755
782
  def free_spacing?(input_object, options)