regexp_parser 2.9.3 → 2.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e429c2cc03b2c9c31e3bf5c3dc71ffc15c5032a35f52c3abae9134d02c45496f
4
- data.tar.gz: 6a89f8618748c8ab479c4d81ff44c9fabfb461337993fcc641da23d6c349a1ec
3
+ metadata.gz: 4cb66cfbf1c78a46f36cb24a7cbc9e04b0bc96aa1285fe81de79cec4bfd1c2c1
4
+ data.tar.gz: f650a1b30acac1298186dce0818eede9944e3b5117e794801abd0576d7b37b9e
5
5
  SHA512:
6
- metadata.gz: 8f9cb8133b24db6f8bb2199356101c234960839ae1251a77da6fe4faeaafc2ab1d6f679f5a6e081860d4a9137a91aeb7793dbc617f04c9747b9110d64134d45f
7
- data.tar.gz: bdfe1c9a13fef4f891c28787588c92be37c1c8a61e0d473d05482a67207675be090ec5a74d12b46cf93c8d2565388c6680acd89ad65bc63454514ee720181c82
6
+ metadata.gz: 1b87b74cafd00c2a8a3fe5a44942a005a4756974363c916c650c18e74df719920537bfaecdf21080aed339d24f1988444940d96fd66dc6af847498c04efbc033
7
+ data.tar.gz: 40e7f8357bd2ff7485c7d7105d852b6c615eaf8902b787b611953df637e246a53990acdc6e1b7f7ff2dd350edf749bd012352981b908de2bbcbee0bc59714513
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2010, 2012-2024, Ammar Ali
1
+ Copyright (c) 2010, 2012-2025, Ammar Ali
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person
4
4
  obtaining a copy of this software and associated documentation
@@ -1,25 +1,6 @@
1
1
  module Regexp::Expression
2
2
  module Backreference
3
- class Base < Regexp::Expression::Base
4
- attr_accessor :referenced_expression
5
-
6
- def initialize_copy(orig)
7
- exp_id = [self.class, self.starts_at]
8
-
9
- # prevent infinite recursion for recursive subexp calls
10
- copied = @@copied ||= {}
11
- self.referenced_expression =
12
- if copied[exp_id]
13
- orig.referenced_expression
14
- else
15
- copied[exp_id] = true
16
- orig.referenced_expression.dup
17
- end
18
- copied.clear
19
-
20
- super
21
- end
22
- end
3
+ class Base < Regexp::Expression::Base; end
23
4
 
24
5
  class Number < Backreference::Base
25
6
  attr_reader :number
@@ -7,26 +7,17 @@ module Regexp::Expression
7
7
  end
8
8
 
9
9
  class Condition < Regexp::Expression::Base
10
- attr_accessor :referenced_expression
11
-
12
10
  # Name or number of the referenced capturing group that determines state.
13
11
  # Returns a String if reference is by name, Integer if by number.
14
12
  def reference
15
13
  ref = text.tr("'<>()", "")
16
14
  ref =~ /\D/ ? ref : Integer(ref)
17
15
  end
18
-
19
- def initialize_copy(orig)
20
- self.referenced_expression = orig.referenced_expression.dup
21
- super
22
- end
23
16
  end
24
17
 
25
18
  class Branch < Regexp::Expression::Sequence; end
26
19
 
27
20
  class Expression < Regexp::Expression::Subexpression
28
- attr_accessor :referenced_expression
29
-
30
21
  def <<(exp)
31
22
  expressions.last << exp
32
23
  end
@@ -54,11 +45,6 @@ module Regexp::Expression
54
45
  def reference
55
46
  condition.reference
56
47
  end
57
-
58
- def initialize_copy(orig)
59
- self.referenced_expression = orig.referenced_expression.dup
60
- super
61
- end
62
48
  end
63
49
  end
64
50
  end
@@ -1,100 +1,29 @@
1
1
  module Regexp::Expression
2
2
  module EscapeSequence
3
- class Base < Regexp::Expression::Base
4
- def codepoint
5
- char.ord
6
- end
7
-
8
- if ''.respond_to?(:undump)
9
- def char
10
- %("#{text}").undump
11
- end
12
- else
13
- # poor man's unescape without using eval
14
- require 'yaml'
15
- def char
16
- YAML.load(%Q(---\n"#{text}"\n))
17
- end
18
- end
19
- end
20
-
21
- class Literal < EscapeSequence::Base
22
- def char
23
- text[1..-1]
24
- end
25
- end
26
-
27
- class AsciiEscape < EscapeSequence::Base; end
28
- class Backspace < EscapeSequence::Base; end
29
- class Bell < EscapeSequence::Base; end
30
- class FormFeed < EscapeSequence::Base; end
31
- class Newline < EscapeSequence::Base; end
32
- class Return < EscapeSequence::Base; end
33
- class Tab < EscapeSequence::Base; end
34
- class VerticalTab < EscapeSequence::Base; end
35
-
36
- class Hex < EscapeSequence::Base; end
37
- class Codepoint < EscapeSequence::Base; end
38
-
39
- class CodepointList < EscapeSequence::Base
40
- def char
41
- raise NoMethodError, 'CodepointList responds only to #chars'
42
- end
43
-
44
- def codepoint
45
- raise NoMethodError, 'CodepointList responds only to #codepoints'
46
- end
47
-
48
- def chars
49
- codepoints.map { |cp| cp.chr('utf-8') }
50
- end
51
-
52
- def codepoints
53
- text.scan(/\h+/).map(&:hex)
54
- end
55
- end
56
-
57
- class Octal < EscapeSequence::Base
58
- def char
59
- text[1..-1].to_i(8).chr('utf-8')
60
- end
61
- end
62
-
63
- class AbstractMetaControlSequence < EscapeSequence::Base
64
- def char
65
- codepoint.chr('utf-8')
66
- end
67
-
68
- private
69
-
70
- def control_sequence_to_s(control_sequence)
71
- five_lsb = control_sequence.unpack('B*').first[-5..-1]
72
- ["000#{five_lsb}"].pack('B*')
73
- end
74
-
75
- def meta_char_to_codepoint(meta_char)
76
- byte_value = meta_char.ord
77
- byte_value < 128 ? byte_value + 128 : byte_value
78
- end
79
- end
80
-
81
- class Control < AbstractMetaControlSequence
82
- def codepoint
83
- control_sequence_to_s(text).ord
84
- end
85
- end
86
-
87
- class Meta < AbstractMetaControlSequence
88
- def codepoint
89
- meta_char_to_codepoint(text[-1])
90
- end
91
- end
92
-
93
- class MetaControl < AbstractMetaControlSequence
94
- def codepoint
95
- meta_char_to_codepoint(control_sequence_to_s(text))
96
- end
97
- end
3
+ Base = Class.new(Regexp::Expression::Base)
4
+
5
+ AsciiEscape = Class.new(Base) # \e
6
+ Backspace = Class.new(Base) # \b
7
+ Bell = Class.new(Base) # \a
8
+ FormFeed = Class.new(Base) # \f
9
+ Newline = Class.new(Base) # \n
10
+ Return = Class.new(Base) # \r
11
+ Tab = Class.new(Base) # \t
12
+ VerticalTab = Class.new(Base) # \v
13
+
14
+ Literal = Class.new(Base) # e.g. \j, \@, \😀 (ineffectual escapes)
15
+
16
+ Octal = Class.new(Base) # e.g. \012
17
+ Hex = Class.new(Base) # e.g. \x0A
18
+ Codepoint = Class.new(Base) # e.g. \u000A
19
+
20
+ CodepointList = Class.new(Base) # e.g. \u{A B}
21
+ UTF8Hex = Class.new(Base) # e.g. \xE2\x82\xAC
22
+
23
+ AbstractMetaControlSequence = Class.new(Base)
24
+ Control = Class.new(AbstractMetaControlSequence) # e.g. \cB
25
+ Meta = Class.new(AbstractMetaControlSequence) # e.g. \M-Z
26
+ MetaControl = Class.new(AbstractMetaControlSequence) # e.g. \M-\cX
98
27
  end
99
28
 
100
29
  # alias for symmetry between Token::* and Expression::*
@@ -0,0 +1,5 @@
1
+ Regexp::Expression::EscapeSequence::Base.class_eval do
2
+ def char
3
+ codepoint.chr('utf-8')
4
+ end
5
+ end
@@ -0,0 +1,74 @@
1
+ module Regexp::Expression::EscapeSequence
2
+ AsciiEscape.class_eval { def codepoint; 0x1B end }
3
+ Backspace.class_eval { def codepoint; 0x8 end }
4
+ Bell.class_eval { def codepoint; 0x7 end }
5
+ FormFeed.class_eval { def codepoint; 0xC end }
6
+ Newline.class_eval { def codepoint; 0xA end }
7
+ Return.class_eval { def codepoint; 0xD end }
8
+ Tab.class_eval { def codepoint; 0x9 end }
9
+ VerticalTab.class_eval { def codepoint; 0xB end }
10
+
11
+ Literal.class_eval { def codepoint; text[1].ord end }
12
+
13
+ Octal.class_eval { def codepoint; text[/\d+/].to_i(8) end }
14
+
15
+ Hex.class_eval { def codepoint; text[/\h+/].hex end }
16
+ Codepoint.class_eval { def codepoint; text[/\h+/].hex end }
17
+
18
+ UTF8Hex.class_eval do
19
+ def codepoint
20
+ text.scan(/\h+/).map(&:hex).pack('C*').force_encoding('utf-8').ord
21
+ end
22
+ end
23
+
24
+ CodepointList.class_eval do
25
+ # Maybe this should be a unique top-level expression class?
26
+ def char
27
+ raise NoMethodError, 'CodepointList responds only to #chars'
28
+ end
29
+
30
+ def codepoint
31
+ raise NoMethodError, 'CodepointList responds only to #codepoints'
32
+ end
33
+
34
+ def chars
35
+ codepoints.map { |cp| cp.chr('utf-8') }
36
+ end
37
+
38
+ def codepoints
39
+ text.scan(/\h+/).map(&:hex)
40
+ end
41
+ end
42
+
43
+ AbstractMetaControlSequence.class_eval do
44
+ private
45
+
46
+ def control_sequence_to_s(control_sequence)
47
+ five_lsb = control_sequence.unpack('B*').first[-5..-1]
48
+ ["000#{five_lsb}"].pack('B*')
49
+ end
50
+
51
+ def meta_char_to_codepoint(meta_char)
52
+ byte_value = meta_char.ord
53
+ byte_value < 128 ? byte_value + 128 : byte_value
54
+ end
55
+ end
56
+
57
+ Control.class_eval do
58
+ def codepoint
59
+ control_sequence_to_s(text).ord
60
+ end
61
+ end
62
+
63
+ Meta.class_eval do
64
+ def codepoint
65
+ meta_char_to_codepoint(text[-1])
66
+ end
67
+ end
68
+
69
+ MetaControl.class_eval do
70
+ def codepoint
71
+ meta_char_to_codepoint(control_sequence_to_s(text))
72
+ end
73
+ end
74
+ end
@@ -0,0 +1,28 @@
1
+ module Regexp::Expression
2
+ module ReferencedExpressions
3
+ attr_accessor :referenced_expressions
4
+
5
+ def referenced_expression
6
+ referenced_expressions && referenced_expressions.first
7
+ end
8
+
9
+ def initialize_copy(orig)
10
+ exp_id = [self.class, self.starts_at]
11
+
12
+ # prevent infinite recursion for recursive subexp calls
13
+ copied = self.class.instance_eval { @copied_ref_exps ||= {} }
14
+ self.referenced_expressions =
15
+ if copied[exp_id]
16
+ orig.referenced_expressions
17
+ else
18
+ copied[exp_id] = true
19
+ orig.referenced_expressions && orig.referenced_expressions.map(&:dup)
20
+ end
21
+ copied.clear
22
+
23
+ super
24
+ end
25
+ end
26
+
27
+ Base.include ReferencedExpressions
28
+ end
@@ -25,6 +25,8 @@ require_relative 'expression/classes/root'
25
25
  require_relative 'expression/classes/unicode_property'
26
26
 
27
27
  require_relative 'expression/methods/construct'
28
+ require_relative 'expression/methods/escape_sequence_char'
29
+ require_relative 'expression/methods/escape_sequence_codepoint'
28
30
  require_relative 'expression/methods/human_name'
29
31
  require_relative 'expression/methods/match'
30
32
  require_relative 'expression/methods/match_length'
@@ -32,6 +34,7 @@ require_relative 'expression/methods/negative'
32
34
  require_relative 'expression/methods/options'
33
35
  require_relative 'expression/methods/parts'
34
36
  require_relative 'expression/methods/printing'
37
+ require_relative 'expression/methods/referenced_expressions'
35
38
  require_relative 'expression/methods/strfregexp'
36
39
  require_relative 'expression/methods/tests'
37
40
  require_relative 'expression/methods/traverse'
@@ -319,6 +319,7 @@ class Regexp::Parser
319
319
  when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
320
320
  when :hex; node << EscapeSequence::Hex.new(token, active_opts)
321
321
  when :octal; node << EscapeSequence::Octal.new(token, active_opts)
322
+ when :utf8_hex; node << EscapeSequence::UTF8Hex.new(token, active_opts)
322
323
 
323
324
  when :control
324
325
  if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
@@ -580,16 +581,19 @@ class Regexp::Parser
580
581
  # the instance of Group::Capture that it refers to via its number.
581
582
  def assign_referenced_expressions
582
583
  # find all referenceable and referring expressions
583
- targets = { 0 => root }
584
+ targets = { 0 => [root] }
584
585
  referrers = []
585
586
  root.each_expression do |exp|
586
- exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
587
- referrers << exp if exp.referential?
587
+ if exp.referential?
588
+ referrers << exp
589
+ elsif exp.is_a?(Group::Capture)
590
+ (targets[exp.identifier] ||= []) << exp
591
+ end
588
592
  end
589
- # assign reference expression to referring expressions
593
+ # assign referenced expressions to referring expressions
590
594
  # (in a second iteration because there might be forward references)
591
595
  referrers.each do |exp|
592
- exp.referenced_expression = targets[exp.reference] ||
596
+ exp.referenced_expressions = targets[exp.reference] ||
593
597
  raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}")
594
598
  end
595
599
  end
@@ -37,7 +37,8 @@
37
37
  octal_sequence = [0-7]{1,3};
38
38
 
39
39
  hex_sequence = 'x' . xdigit{1,2};
40
- hex_sequence_err = 'x' . [^0-9a-fA-F{];
40
+ hex_sequence_err = 'x' . [^0-9A-Fa-f];
41
+ high_hex_sequence = 'x' . [89A-Fa-f] . xdigit . ( '\\x' . [89A-Fa-f] . xdigit )*;
41
42
 
42
43
  codepoint_single = 'u' . xdigit{4};
43
44
  codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
@@ -210,7 +211,7 @@
210
211
  type = :nonposixclass
211
212
  end
212
213
 
213
- unless self.class.posix_classes.include?(class_name)
214
+ unless POSIX_CLASSES[class_name]
214
215
  raise ValidationError.for(:posix_class, text)
215
216
  end
216
217
 
@@ -256,9 +257,21 @@
256
257
  # escape sequence scanner
257
258
  # --------------------------------------------------------------------------
258
259
  escape_sequence := |*
259
- [1-9] {
260
+ [1-9] . [0-9]* {
260
261
  text = copy(data, ts-1, te)
261
- emit(:backref, :number, text)
262
+
263
+ # If not enough groups have been opened, there is a fallback to either an
264
+ # octal or literal interpretation for 2+ digit numerical escapes.
265
+ digits = text[1..-1]
266
+ if digits.size == 1 || digits.to_i <= self.capturing_group_count
267
+ emit(:backref, :number, text)
268
+ elsif digits =~ /\A[0-7]{2,}\z/
269
+ emit(:escape, :octal, text)
270
+ else
271
+ emit(:escape, :literal, text[0..1])
272
+ emit(:literal, :literal, text[2..-1])
273
+ end
274
+
262
275
  fret;
263
276
  };
264
277
 
@@ -321,6 +334,16 @@
321
334
  fret;
322
335
  };
323
336
 
337
+ high_hex_sequence > (escaped_alpha, 5) {
338
+ text = copy(data, ts-1, te)
339
+ if regexp_encoding == Encoding::BINARY
340
+ text.split(/(?=\\)/).each { |part| emit(:escape, :hex, part) }
341
+ else
342
+ emit(:escape, :utf8_hex, text)
343
+ end
344
+ fret;
345
+ };
346
+
324
347
  hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
325
348
  emit(:escape, :hex, copy(data, ts-1, te))
326
349
  fret;
@@ -514,6 +537,7 @@
514
537
  };
515
538
 
516
539
  group_open @group_opened {
540
+ self.capturing_group_count += 1
517
541
  text = copy(data, ts, te)
518
542
  emit(:group, :capture, text)
519
543
  };
@@ -662,6 +686,7 @@ class Regexp::Scanner
662
686
 
663
687
  input = input_object.is_a?(Regexp) ? input_object.source : input_object
664
688
  self.free_spacing = free_spacing?(input_object, options)
689
+ self.regexp_encoding = input_object.encoding if input_object.is_a?(::Regexp)
665
690
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
666
691
 
667
692
  data = input.unpack("c*")
@@ -672,6 +697,7 @@ class Regexp::Scanner
672
697
 
673
698
  self.set_depth = 0
674
699
  self.group_depth = 0
700
+ self.capturing_group_count = 0
675
701
  self.conditional_stack = []
676
702
  self.char_pos = 0
677
703
 
@@ -711,10 +737,9 @@ class Regexp::Scanner
711
737
  File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
712
738
  end
713
739
 
714
- def self.posix_classes
740
+ POSIX_CLASSES =
715
741
  %w[alnum alpha ascii blank cntrl digit graph
716
- lower print punct space upper word xdigit]
717
- end
742
+ lower print punct space upper word xdigit].to_h { |c| [c, true] }.freeze
718
743
 
719
744
  # Emits an array with the details of the scanned pattern
720
745
  def emit(type, token, text)
@@ -749,7 +774,9 @@ class Regexp::Scanner
749
774
  attr_accessor :block,
750
775
  :collect_tokens, :tokens, :prev_token,
751
776
  :free_spacing, :spacing_stack,
777
+ :regexp_encoding,
752
778
  :group_depth, :set_depth, :conditional_stack,
779
+ :capturing_group_count,
753
780
  :char_pos
754
781
 
755
782
  def free_spacing?(input_object, options)