regexp_parser 1.3.0 → 1.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (169)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +53 -1
  3. data/Gemfile +3 -3
  4. data/README.md +10 -14
  5. data/Rakefile +3 -4
  6. data/lib/regexp_parser/expression.rb +28 -53
  7. data/lib/regexp_parser/expression/classes/backref.rb +18 -10
  8. data/lib/regexp_parser/expression/classes/conditional.rb +7 -2
  9. data/lib/regexp_parser/expression/classes/escape.rb +0 -4
  10. data/lib/regexp_parser/expression/classes/group.rb +4 -2
  11. data/lib/regexp_parser/expression/classes/keep.rb +1 -3
  12. data/lib/regexp_parser/expression/methods/match.rb +13 -0
  13. data/lib/regexp_parser/expression/methods/match_length.rb +172 -0
  14. data/lib/regexp_parser/expression/methods/options.rb +35 -0
  15. data/lib/regexp_parser/expression/methods/strfregexp.rb +0 -1
  16. data/lib/regexp_parser/expression/methods/tests.rb +6 -15
  17. data/lib/regexp_parser/expression/quantifier.rb +2 -2
  18. data/lib/regexp_parser/expression/sequence.rb +3 -6
  19. data/lib/regexp_parser/expression/sequence_operation.rb +2 -6
  20. data/lib/regexp_parser/expression/subexpression.rb +3 -5
  21. data/lib/regexp_parser/lexer.rb +30 -44
  22. data/lib/regexp_parser/parser.rb +47 -24
  23. data/lib/regexp_parser/scanner.rb +1159 -1329
  24. data/lib/regexp_parser/scanner/char_type.rl +0 -3
  25. data/lib/regexp_parser/scanner/properties/long.yml +34 -1
  26. data/lib/regexp_parser/scanner/properties/short.yml +12 -0
  27. data/lib/regexp_parser/scanner/scanner.rl +82 -190
  28. data/lib/regexp_parser/syntax/tokens.rb +2 -10
  29. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +72 -21
  30. data/lib/regexp_parser/syntax/versions/2.6.0.rb +10 -0
  31. data/lib/regexp_parser/syntax/versions/2.6.2.rb +10 -0
  32. data/lib/regexp_parser/syntax/versions/2.6.3.rb +10 -0
  33. data/lib/regexp_parser/version.rb +1 -1
  34. data/regexp_parser.gemspec +3 -3
  35. data/spec/expression/base_spec.rb +94 -0
  36. data/spec/expression/clone_spec.rb +120 -0
  37. data/spec/expression/conditional_spec.rb +89 -0
  38. data/spec/expression/free_space_spec.rb +27 -0
  39. data/spec/expression/methods/match_length_spec.rb +154 -0
  40. data/spec/expression/methods/match_spec.rb +25 -0
  41. data/spec/expression/methods/strfregexp_spec.rb +224 -0
  42. data/spec/expression/methods/tests_spec.rb +99 -0
  43. data/spec/expression/methods/traverse_spec.rb +140 -0
  44. data/spec/expression/options_spec.rb +128 -0
  45. data/spec/expression/root_spec.rb +9 -0
  46. data/spec/expression/sequence_spec.rb +9 -0
  47. data/spec/expression/subexpression_spec.rb +50 -0
  48. data/spec/expression/to_h_spec.rb +26 -0
  49. data/spec/expression/to_s_spec.rb +100 -0
  50. data/spec/lexer/all_spec.rb +22 -0
  51. data/spec/lexer/conditionals_spec.rb +53 -0
  52. data/spec/lexer/escapes_spec.rb +14 -0
  53. data/spec/lexer/keep_spec.rb +10 -0
  54. data/spec/lexer/literals_spec.rb +89 -0
  55. data/spec/lexer/nesting_spec.rb +99 -0
  56. data/spec/lexer/refcalls_spec.rb +55 -0
  57. data/spec/parser/all_spec.rb +43 -0
  58. data/spec/parser/alternation_spec.rb +88 -0
  59. data/spec/parser/anchors_spec.rb +17 -0
  60. data/spec/parser/conditionals_spec.rb +179 -0
  61. data/spec/parser/errors_spec.rb +30 -0
  62. data/spec/parser/escapes_spec.rb +121 -0
  63. data/spec/parser/free_space_spec.rb +130 -0
  64. data/spec/parser/groups_spec.rb +108 -0
  65. data/spec/parser/keep_spec.rb +6 -0
  66. data/spec/parser/posix_classes_spec.rb +8 -0
  67. data/spec/parser/properties_spec.rb +115 -0
  68. data/spec/parser/quantifiers_spec.rb +51 -0
  69. data/spec/parser/refcalls_spec.rb +112 -0
  70. data/spec/parser/set/intersections_spec.rb +127 -0
  71. data/spec/parser/set/ranges_spec.rb +111 -0
  72. data/spec/parser/sets_spec.rb +178 -0
  73. data/spec/parser/types_spec.rb +18 -0
  74. data/spec/scanner/all_spec.rb +18 -0
  75. data/spec/scanner/anchors_spec.rb +21 -0
  76. data/spec/scanner/conditionals_spec.rb +128 -0
  77. data/spec/scanner/errors_spec.rb +68 -0
  78. data/spec/scanner/escapes_spec.rb +53 -0
  79. data/spec/scanner/free_space_spec.rb +133 -0
  80. data/spec/scanner/groups_spec.rb +52 -0
  81. data/spec/scanner/keep_spec.rb +10 -0
  82. data/spec/scanner/literals_spec.rb +49 -0
  83. data/spec/scanner/meta_spec.rb +18 -0
  84. data/spec/scanner/properties_spec.rb +64 -0
  85. data/spec/scanner/quantifiers_spec.rb +20 -0
  86. data/spec/scanner/refcalls_spec.rb +36 -0
  87. data/spec/scanner/sets_spec.rb +102 -0
  88. data/spec/scanner/types_spec.rb +14 -0
  89. data/spec/spec_helper.rb +15 -0
  90. data/{test → spec}/support/runner.rb +9 -8
  91. data/spec/support/shared_examples.rb +77 -0
  92. data/{test → spec}/support/warning_extractor.rb +5 -7
  93. data/spec/syntax/syntax_spec.rb +48 -0
  94. data/spec/syntax/syntax_token_map_spec.rb +23 -0
  95. data/spec/syntax/versions/1.8.6_spec.rb +17 -0
  96. data/spec/syntax/versions/1.9.1_spec.rb +10 -0
  97. data/spec/syntax/versions/1.9.3_spec.rb +9 -0
  98. data/spec/syntax/versions/2.0.0_spec.rb +13 -0
  99. data/spec/syntax/versions/2.2.0_spec.rb +9 -0
  100. data/spec/syntax/versions/aliases_spec.rb +37 -0
  101. data/spec/token/token_spec.rb +85 -0
  102. metadata +144 -143
  103. data/test/expression/test_all.rb +0 -12
  104. data/test/expression/test_base.rb +0 -90
  105. data/test/expression/test_clone.rb +0 -89
  106. data/test/expression/test_conditionals.rb +0 -113
  107. data/test/expression/test_free_space.rb +0 -35
  108. data/test/expression/test_set.rb +0 -84
  109. data/test/expression/test_strfregexp.rb +0 -230
  110. data/test/expression/test_subexpression.rb +0 -58
  111. data/test/expression/test_tests.rb +0 -99
  112. data/test/expression/test_to_h.rb +0 -59
  113. data/test/expression/test_to_s.rb +0 -104
  114. data/test/expression/test_traverse.rb +0 -161
  115. data/test/helpers.rb +0 -10
  116. data/test/lexer/test_all.rb +0 -41
  117. data/test/lexer/test_conditionals.rb +0 -127
  118. data/test/lexer/test_keep.rb +0 -24
  119. data/test/lexer/test_literals.rb +0 -130
  120. data/test/lexer/test_nesting.rb +0 -132
  121. data/test/lexer/test_refcalls.rb +0 -56
  122. data/test/parser/set/test_intersections.rb +0 -127
  123. data/test/parser/set/test_ranges.rb +0 -111
  124. data/test/parser/test_all.rb +0 -64
  125. data/test/parser/test_alternation.rb +0 -92
  126. data/test/parser/test_anchors.rb +0 -34
  127. data/test/parser/test_conditionals.rb +0 -187
  128. data/test/parser/test_errors.rb +0 -63
  129. data/test/parser/test_escapes.rb +0 -134
  130. data/test/parser/test_free_space.rb +0 -139
  131. data/test/parser/test_groups.rb +0 -289
  132. data/test/parser/test_keep.rb +0 -21
  133. data/test/parser/test_posix_classes.rb +0 -27
  134. data/test/parser/test_properties.rb +0 -133
  135. data/test/parser/test_quantifiers.rb +0 -301
  136. data/test/parser/test_refcalls.rb +0 -186
  137. data/test/parser/test_sets.rb +0 -179
  138. data/test/parser/test_types.rb +0 -50
  139. data/test/scanner/test_all.rb +0 -38
  140. data/test/scanner/test_anchors.rb +0 -38
  141. data/test/scanner/test_conditionals.rb +0 -184
  142. data/test/scanner/test_errors.rb +0 -91
  143. data/test/scanner/test_escapes.rb +0 -56
  144. data/test/scanner/test_free_space.rb +0 -200
  145. data/test/scanner/test_groups.rb +0 -79
  146. data/test/scanner/test_keep.rb +0 -35
  147. data/test/scanner/test_literals.rb +0 -89
  148. data/test/scanner/test_meta.rb +0 -40
  149. data/test/scanner/test_properties.rb +0 -312
  150. data/test/scanner/test_quantifiers.rb +0 -37
  151. data/test/scanner/test_refcalls.rb +0 -52
  152. data/test/scanner/test_scripts.rb +0 -53
  153. data/test/scanner/test_sets.rb +0 -119
  154. data/test/scanner/test_types.rb +0 -35
  155. data/test/scanner/test_unicode_blocks.rb +0 -30
  156. data/test/support/disable_autotest.rb +0 -8
  157. data/test/syntax/test_all.rb +0 -6
  158. data/test/syntax/test_syntax.rb +0 -61
  159. data/test/syntax/test_syntax_token_map.rb +0 -25
  160. data/test/syntax/versions/test_1.8.rb +0 -55
  161. data/test/syntax/versions/test_1.9.1.rb +0 -36
  162. data/test/syntax/versions/test_1.9.3.rb +0 -32
  163. data/test/syntax/versions/test_2.0.0.rb +0 -37
  164. data/test/syntax/versions/test_2.2.0.rb +0 -32
  165. data/test/syntax/versions/test_aliases.rb +0 -129
  166. data/test/syntax/versions/test_all.rb +0 -5
  167. data/test/test_all.rb +0 -5
  168. data/test/token/test_all.rb +0 -2
  169. data/test/token/test_token.rb +0 -107
@@ -0,0 +1,172 @@
1
+ class Regexp::MatchLength
2
+ include Enumerable
3
+
4
+ def self.of(obj)
5
+ exp = obj.is_a?(Regexp::Expression::Base) ? obj : Regexp::Parser.parse(obj)
6
+ exp.match_length
7
+ end
8
+
9
+ def initialize(exp, opts = {})
10
+ self.exp_class = exp.class
11
+ self.min_rep = exp.repetitions.min
12
+ self.max_rep = exp.repetitions.max
13
+ if base = opts[:base]
14
+ self.base_min = base
15
+ self.base_max = base
16
+ self.reify = ->{ '.' * base }
17
+ else
18
+ self.base_min = opts.fetch(:base_min)
19
+ self.base_max = opts.fetch(:base_max)
20
+ self.reify = opts.fetch(:reify)
21
+ end
22
+ end
23
+
24
+ def each(opts = {})
25
+ return enum_for(__method__) unless block_given?
26
+ limit = opts[:limit] || 1000
27
+ yielded = 0
28
+ (min..max).each do |num|
29
+ next unless include?(num)
30
+ yield(num)
31
+ break if (yielded += 1) >= limit
32
+ end
33
+ end
34
+
35
+ def endless_each(&block)
36
+ return enum_for(__method__) unless block_given?
37
+ (min..max).each { |num| yield(num) if include?(num) }
38
+ end
39
+
40
+ def include?(length)
41
+ test_regexp.match?('X' * length)
42
+ end
43
+
44
+ def fixed?
45
+ min == max
46
+ end
47
+
48
+ def min
49
+ min_rep * base_min
50
+ end
51
+
52
+ def max
53
+ max_rep * base_max
54
+ end
55
+
56
+ def minmax
57
+ [min, max]
58
+ end
59
+
60
+ def inspect
61
+ type = exp_class.name.sub('Regexp::Expression::', '')
62
+ "#<#{self.class}<#{type}> min=#{min} max=#{max}>"
63
+ end
64
+
65
+ def to_re
66
+ "(?:#{reify.call}){#{min_rep},#{max_rep unless max_rep == Float::INFINITY}}"
67
+ end
68
+
69
+ private
70
+
71
+ attr_accessor :base_min, :base_max, :min_rep, :max_rep, :exp_class, :reify
72
+
73
+ def test_regexp
74
+ @test_regexp ||= Regexp.new("^#{to_re}$").tap do |regexp|
75
+ regexp.respond_to?(:match?) || def regexp.match?(str); !!match(str) end
76
+ end
77
+ end
78
+ end
79
+
80
+ module Regexp::Expression
81
+ MatchLength = Regexp::MatchLength
82
+
83
+ [
84
+ CharacterSet,
85
+ CharacterSet::Intersection,
86
+ CharacterSet::IntersectedSequence,
87
+ CharacterSet::Range,
88
+ CharacterType::Base,
89
+ EscapeSequence::Base,
90
+ PosixClass,
91
+ UnicodeProperty::Base,
92
+ ].each do |klass|
93
+ klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1
94
+ def match_length
95
+ MatchLength.new(self, base: 1)
96
+ end
97
+ RUBY
98
+ end
99
+
100
+ class Literal
101
+ def match_length
102
+ MatchLength.new(self, base: text.length)
103
+ end
104
+ end
105
+
106
+ class Subexpression
107
+ def match_length
108
+ MatchLength.new(self,
109
+ base_min: map { |exp| exp.match_length.min }.inject(0, :+),
110
+ base_max: map { |exp| exp.match_length.max }.inject(0, :+),
111
+ reify: ->{ map { |exp| exp.match_length.to_re }.join })
112
+ end
113
+
114
+ def inner_match_length
115
+ dummy = Regexp::Expression::Root.build
116
+ dummy.expressions = expressions.map(&:clone)
117
+ dummy.quantifier = quantifier && quantifier.clone
118
+ dummy.match_length
119
+ end
120
+ end
121
+
122
+ [
123
+ Alternation,
124
+ Conditional::Expression,
125
+ ].each do |klass|
126
+ klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1
127
+ def match_length
128
+ MatchLength.new(self,
129
+ base_min: map { |exp| exp.match_length.min }.min,
130
+ base_max: map { |exp| exp.match_length.max }.max,
131
+ reify: ->{ map { |exp| exp.match_length.to_re }.join('|') })
132
+ end
133
+ RUBY
134
+ end
135
+
136
+ [
137
+ Anchor::Base,
138
+ Assertion::Base,
139
+ Conditional::Condition,
140
+ FreeSpace,
141
+ Keep::Mark,
142
+ ].each do |klass|
143
+ klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1
144
+ def match_length
145
+ MatchLength.new(self, base: 0)
146
+ end
147
+ RUBY
148
+ end
149
+
150
+ class Backreference::Base
151
+ def match_length
152
+ if referenced_expression.nil?
153
+ raise ArgumentError, 'Missing referenced_expression - not parsed?'
154
+ end
155
+ referenced_expression.unquantified_clone.match_length
156
+ end
157
+ end
158
+
159
+ class EscapeSequence::CodepointList
160
+ def match_length
161
+ MatchLength.new(self, base: codepoints.count)
162
+ end
163
+ end
164
+
165
+ # Special case. Absence group can match 0.. chars, irrespective of content.
166
+ # TODO: in theory, they *can* exclude match lengths with `.`: `(?~.{3})`
167
+ class Group::Absence
168
+ def match_length
169
+ MatchLength.new(self, base_min: 0, base_max: Float::INFINITY, reify: ->{ '.*' })
170
+ end
171
+ end
172
+ end
@@ -0,0 +1,35 @@
1
+ module Regexp::Expression
2
+ class Base
3
+ def multiline?
4
+ options[:m] == true
5
+ end
6
+ alias :m? :multiline?
7
+
8
+ def case_insensitive?
9
+ options[:i] == true
10
+ end
11
+ alias :i? :case_insensitive?
12
+ alias :ignore_case? :case_insensitive?
13
+
14
+ def free_spacing?
15
+ options[:x] == true
16
+ end
17
+ alias :x? :free_spacing?
18
+ alias :extended? :free_spacing?
19
+
20
+ def default_classes?
21
+ options[:d] == true
22
+ end
23
+ alias :d? :default_classes?
24
+
25
+ def ascii_classes?
26
+ options[:a] == true
27
+ end
28
+ alias :a? :ascii_classes?
29
+
30
+ def unicode_classes?
31
+ options[:u] == true
32
+ end
33
+ alias :u? :unicode_classes?
34
+ end
35
+ end
@@ -1,5 +1,4 @@
1
1
  module Regexp::Expression
2
-
3
2
  class Base
4
3
 
5
4
  # %l Level (depth) of the expression. Returns 'root' for the root
@@ -75,32 +75,23 @@ module Regexp::Expression
75
75
  def one_of?(scope, top = true)
76
76
  case scope
77
77
  when Array
78
- if scope.include?(:*)
79
- return (scope.include?(token) or scope.include?(:*))
80
- else
81
- return scope.include?(token)
82
- end
78
+ scope.include?(:*) || scope.include?(token)
83
79
 
84
80
  when Hash
85
81
  if scope.has_key?(:*)
86
82
  test_type = scope.has_key?(type) ? type : :*
87
- return one_of?(scope[test_type], false)
83
+ one_of?(scope[test_type], false)
88
84
  else
89
- return (scope.has_key?(type) and one_of?(scope[type], false))
85
+ scope.has_key?(type) && one_of?(scope[type], false)
90
86
  end
91
87
 
92
88
  when Symbol
93
- return true if scope == :*
94
-
95
- return is?(scope) unless top
96
- return type?(scope) if top
89
+ scope.equal?(:*) || (top ? type?(scope) : is?(scope))
97
90
 
98
91
  else
99
- raise "Array, Hash, or Symbol expected, #{scope.class.name} given"
92
+ raise ArgumentError,
93
+ "Array, Hash, or Symbol expected, #{scope.class.name} given"
100
94
  end
101
-
102
- false
103
95
  end
104
-
105
96
  end
106
97
  end
@@ -12,8 +12,8 @@ module Regexp::Expression
12
12
  @max = max
13
13
  end
14
14
 
15
- def initialize_clone(other)
16
- other.instance_variable_set(:@text, text.dup)
15
+ def initialize_clone(orig)
16
+ @text = orig.text.dup
17
17
  super
18
18
  end
19
19
 
@@ -18,13 +18,14 @@ module Regexp::Expression
18
18
  end
19
19
 
20
20
  class << self
21
- def add_to(subexpression, options = {})
21
+ def add_to(subexpression, params = {}, active_opts = {})
22
22
  sequence = at_levels(
23
23
  subexpression.level,
24
24
  subexpression.set_level,
25
- options[:conditional_level] || subexpression.conditional_level
25
+ params[:conditional_level] || subexpression.conditional_level
26
26
  )
27
27
  sequence.nesting_level = subexpression.nesting_level + 1
28
+ sequence.options = active_opts
28
29
  subexpression.expressions << sequence
29
30
  sequence
30
31
  end
@@ -44,10 +45,6 @@ module Regexp::Expression
44
45
  end
45
46
  end
46
47
 
47
- def text
48
- to_s
49
- end
50
-
51
48
  def starts_at
52
49
  expressions.first.starts_at
53
50
  end
@@ -14,12 +14,8 @@ module Regexp::Expression
14
14
  expressions.last << exp
15
15
  end
16
16
 
17
- def add_sequence
18
- self.class::OPERAND.add_to(self)
19
- end
20
-
21
- def quantify(token, text, min = nil, max = nil, mode = :greedy)
22
- sequences.last.last.quantify(token, text, min, max, mode)
17
+ def add_sequence(active_opts = {})
18
+ self.class::OPERAND.add_to(self, {}, active_opts)
23
19
  end
24
20
 
25
21
  def to_s(format = :full)
@@ -12,8 +12,8 @@ module Regexp::Expression
12
12
  end
13
13
 
14
14
  # Override base method to clone the expressions as well.
15
- def initialize_clone(other)
16
- other.expressions = expressions.map(&:clone)
15
+ def initialize_clone(orig)
16
+ self.expressions = orig.expressions.map(&:clone)
17
17
  super
18
18
  end
19
19
 
@@ -46,9 +46,7 @@ module Regexp::Expression
46
46
 
47
47
  def to_s(format = :full)
48
48
  # Note: the format does not get passed down to subexpressions.
49
- # Note: cant use #text accessor, b/c it is overriden as def text; to_s end
50
- # in Expression::Sequence, causing infinite recursion. Clean-up needed.
51
- "#{@text}#{expressions.join}#{quantifier_affix(format)}"
49
+ "#{expressions.join}#{quantifier_affix(format)}"
52
50
  end
53
51
 
54
52
  def to_h
@@ -22,6 +22,7 @@ class Regexp::Lexer
22
22
  self.nesting = 0
23
23
  self.set_nesting = 0
24
24
  self.conditional_nesting = 0
25
+ self.shift = 0
25
26
 
26
27
  last = nil
27
28
  Regexp::Scanner.scan(input) do |type, token, text, ts, te|
@@ -30,15 +31,13 @@ class Regexp::Lexer
30
31
 
31
32
  ascend(type, token)
32
33
 
33
- break_literal(last) if type == :quantifier and
34
- last and last.type == :literal
35
-
36
- current = Regexp::Token.new(type, token, text, ts, te,
37
- nesting, set_nesting, conditional_nesting)
34
+ if type == :quantifier and last
35
+ break_literal(last) if last.type == :literal
36
+ break_codepoint_list(last) if last.token == :codepoint_list
37
+ end
38
38
 
39
- current = merge_literal(current) if type == :literal and
40
- set_nesting == 0 and
41
- last and last.type == :literal
39
+ current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
40
+ nesting, set_nesting, conditional_nesting)
42
41
 
43
42
  current = merge_condition(current) if type == :conditional and
44
43
  [:condition, :condition_close].include?(token)
@@ -65,7 +64,7 @@ class Regexp::Lexer
65
64
 
66
65
  private
67
66
 
68
- attr_accessor :tokens, :nesting, :set_nesting, :conditional_nesting
67
+ attr_accessor :tokens, :nesting, :set_nesting, :conditional_nesting, :shift
69
68
 
70
69
  def ascend(type, token)
71
70
  case type
@@ -92,44 +91,31 @@ class Regexp::Lexer
92
91
  # called by scan to break a literal run that is longer than one character
93
92
  # into two separate tokens when it is followed by a quantifier
94
93
  def break_literal(token)
95
- text = token.text
96
- if text.scan(/./mu).length > 1
97
- lead = text.sub(/.\z/mu, "")
98
- last = text[/.\z/mu] || ''
99
-
100
- if RUBY_VERSION >= '1.9'
101
- lead_length = lead.bytesize
102
- last_length = last.bytesize
103
- else
104
- lead_length = lead.length
105
- last_length = last.length
106
- end
107
-
108
- tokens.pop
109
- tokens << Regexp::Token.new(:literal, :literal, lead, token.ts,
110
- (token.te - last_length), nesting, set_nesting, conditional_nesting)
111
-
112
- tokens << Regexp::Token.new(:literal, :literal, last,
113
- (token.ts + lead_length),
114
- token.te, nesting, set_nesting, conditional_nesting)
115
- end
94
+ lead, last, _ = token.text.partition(/.\z/mu)
95
+ return if lead.empty?
96
+
97
+ tokens.pop
98
+ tokens << Regexp::Token.new(:literal, :literal, lead,
99
+ token.ts, (token.te - last.bytesize),
100
+ nesting, set_nesting, conditional_nesting)
101
+ tokens << Regexp::Token.new(:literal, :literal, last,
102
+ (token.ts + lead.bytesize), token.te,
103
+ nesting, set_nesting, conditional_nesting)
116
104
  end
117
105
 
118
- # called by scan to merge two consecutive literals. this happens when tokens
119
- # get normalized (as in the case of posix/bre) and end up becoming literals.
120
- def merge_literal(current)
121
- last = tokens.pop
106
+ def break_codepoint_list(token)
107
+ lead, _, tail = token.text.rpartition(' ')
108
+ return if lead.empty?
109
+
110
+ tokens.pop
111
+ tokens << Regexp::Token.new(:escape, :codepoint_list, lead + '}',
112
+ token.ts, (token.te - tail.length),
113
+ nesting, set_nesting, conditional_nesting)
114
+ tokens << Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
115
+ (token.ts + lead.length + 1), (token.te + 3),
116
+ nesting, set_nesting, conditional_nesting)
122
117
 
123
- Regexp::Token.new(
124
- :literal,
125
- :literal,
126
- last.text + current.text,
127
- last.ts,
128
- current.te,
129
- nesting,
130
- set_nesting,
131
- conditional_nesting,
132
- )
118
+ self.shift = shift + 3 # one space less, but extra \, u, {, and }
133
119
  end
134
120
 
135
121
  def merge_condition(current)
@@ -39,6 +39,8 @@ class Regexp::Parser
39
39
  parse_token(token)
40
40
  end
41
41
 
42
+ assign_referenced_expressions
43
+
42
44
  if block_given?
43
45
  block.call(root)
44
46
  else
@@ -163,14 +165,18 @@ class Regexp::Parser
163
165
  node << Backreference::NameCall.new(token, active_opts)
164
166
  when :number, :number_ref
165
167
  node << Backreference::Number.new(token, active_opts)
166
- when :number_rel_ref
167
- node << Backreference::NumberRelative.new(token, active_opts)
168
168
  when :number_recursion_ref
169
169
  node << Backreference::NumberRecursionLevel.new(token, active_opts)
170
170
  when :number_call
171
171
  node << Backreference::NumberCall.new(token, active_opts)
172
+ when :number_rel_ref
173
+ node << Backreference::NumberRelative.new(token, active_opts).tap do |exp|
174
+ assign_effective_number(exp)
175
+ end
172
176
  when :number_rel_call
173
- node << Backreference::NumberCallRelative.new(token, active_opts)
177
+ node << Backreference::NumberCallRelative.new(token, active_opts).tap do |exp|
178
+ assign_effective_number(exp)
179
+ end
174
180
  else
175
181
  raise UnknownTokenError.new('Backreference', token)
176
182
  end
@@ -209,9 +215,9 @@ class Regexp::Parser
209
215
  nest_conditional(Conditional::Expression.new(token, active_opts))
210
216
  when :condition
211
217
  conditional_nesting.last.condition = Conditional::Condition.new(token, active_opts)
212
- conditional_nesting.last.branch
218
+ conditional_nesting.last.add_sequence(active_opts)
213
219
  when :separator
214
- conditional_nesting.last.branch
220
+ conditional_nesting.last.add_sequence(active_opts)
215
221
  self.node = conditional_nesting.last.branches.last
216
222
  when :close
217
223
  conditional_nesting.pop
@@ -229,7 +235,7 @@ class Regexp::Parser
229
235
  end
230
236
 
231
237
  def posixclass(token)
232
- node << PosixClass.new(token)
238
+ node << PosixClass.new(token, active_opts)
233
239
  end
234
240
 
235
241
  include Regexp::Expression::UnicodeProperty
@@ -491,6 +497,9 @@ class Regexp::Parser
491
497
  end
492
498
  end
493
499
 
500
+ MOD_FLAGS = %w[i m x].map(&:to_sym)
501
+ ENC_FLAGS = %w[a d u].map(&:to_sym)
502
+
494
503
  def options_group(token)
495
504
  positive, negative = token.text.split('-', 2)
496
505
  negative ||= ''
@@ -499,23 +508,23 @@ class Regexp::Parser
499
508
  opt_changes = {}
500
509
  new_active_opts = active_opts.dup
501
510
 
502
- # Negative options have precedence. E.g. /(?i-i)a/ is case-sensitive.
503
- %w[i m x].each do |flag|
504
- if positive.include?(flag)
505
- opt_changes[flag.to_sym] = new_active_opts[flag.to_sym] = true
511
+ MOD_FLAGS.each do |flag|
512
+ if positive.include?(flag.to_s)
513
+ opt_changes[flag] = new_active_opts[flag] = true
506
514
  end
507
- if negative.include?(flag)
508
- opt_changes[flag.to_sym] = false
509
- new_active_opts.delete(flag.to_sym)
515
+ if negative.include?(flag.to_s)
516
+ opt_changes[flag] = false
517
+ new_active_opts.delete(flag)
510
518
  end
511
519
  end
512
520
 
513
- # Any encoding flag overrides all previous encoding flags. If there are
514
- # multiple encoding flags in an options string, the last one wins.
515
- # E.g. /(?dau)\w/ matches UTF8 chars but /(?dua)\w/ only ASCII chars.
516
- if (flag = positive.reverse[/[adu]/])
517
- %w[a d u].each { |key| new_active_opts.delete(key.to_sym) }
518
- opt_changes[flag.to_sym] = new_active_opts[flag.to_sym] = true
521
+ if (enc_flag = positive.reverse[/[adu]/])
522
+ enc_flag = enc_flag.to_sym
523
+ (ENC_FLAGS - [enc_flag]).each do |other|
524
+ opt_changes[other] = false if new_active_opts[other]
525
+ new_active_opts.delete(other)
526
+ end
527
+ opt_changes[enc_flag] = new_active_opts[enc_flag] = true
519
528
  end
520
529
 
521
530
  options_stack << new_active_opts
@@ -600,16 +609,14 @@ class Regexp::Parser
600
609
  end
601
610
 
602
611
  def sequence_operation(klass, token)
603
- if node.last.is_a?(klass)
604
- self.node = node.last
605
- elsif !node.is_a?(klass)
612
+ unless node.is_a?(klass)
606
613
  operator = klass.new(token, active_opts)
607
- sequence = operator.add_sequence
614
+ sequence = operator.add_sequence(active_opts)
608
615
  sequence.expressions = node.expressions
609
616
  node.expressions = []
610
617
  nest(operator)
611
618
  end
612
- node.add_sequence
619
+ node.add_sequence(active_opts)
613
620
  end
614
621
 
615
622
  def active_opts
@@ -627,4 +634,20 @@ class Regexp::Parser
627
634
  def count_captured_group
628
635
  captured_group_counts[node.level] += 1
629
636
  end
637
+
638
+ def assign_effective_number(exp)
639
+ exp.effective_number =
640
+ exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
641
+ end
642
+
643
+ def assign_referenced_expressions
644
+ targets = {}
645
+ root.each_expression do |exp|
646
+ exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
647
+ end
648
+ root.each_expression do |exp|
649
+ exp.respond_to?(:reference) &&
650
+ exp.referenced_expression = targets[exp.reference]
651
+ end
652
+ end
630
653
  end # module Regexp::Parser