regexp_parser 1.3.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +53 -1
  3. data/Gemfile +3 -3
  4. data/README.md +10 -14
  5. data/Rakefile +3 -4
  6. data/lib/regexp_parser/expression.rb +28 -53
  7. data/lib/regexp_parser/expression/classes/backref.rb +18 -10
  8. data/lib/regexp_parser/expression/classes/conditional.rb +7 -2
  9. data/lib/regexp_parser/expression/classes/escape.rb +0 -4
  10. data/lib/regexp_parser/expression/classes/group.rb +4 -2
  11. data/lib/regexp_parser/expression/classes/keep.rb +1 -3
  12. data/lib/regexp_parser/expression/methods/match.rb +13 -0
  13. data/lib/regexp_parser/expression/methods/match_length.rb +172 -0
  14. data/lib/regexp_parser/expression/methods/options.rb +35 -0
  15. data/lib/regexp_parser/expression/methods/strfregexp.rb +0 -1
  16. data/lib/regexp_parser/expression/methods/tests.rb +6 -15
  17. data/lib/regexp_parser/expression/quantifier.rb +2 -2
  18. data/lib/regexp_parser/expression/sequence.rb +3 -6
  19. data/lib/regexp_parser/expression/sequence_operation.rb +2 -6
  20. data/lib/regexp_parser/expression/subexpression.rb +3 -5
  21. data/lib/regexp_parser/lexer.rb +30 -44
  22. data/lib/regexp_parser/parser.rb +47 -24
  23. data/lib/regexp_parser/scanner.rb +1159 -1329
  24. data/lib/regexp_parser/scanner/char_type.rl +0 -3
  25. data/lib/regexp_parser/scanner/properties/long.yml +34 -1
  26. data/lib/regexp_parser/scanner/properties/short.yml +12 -0
  27. data/lib/regexp_parser/scanner/scanner.rl +82 -190
  28. data/lib/regexp_parser/syntax/tokens.rb +2 -10
  29. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +72 -21
  30. data/lib/regexp_parser/syntax/versions/2.6.0.rb +10 -0
  31. data/lib/regexp_parser/syntax/versions/2.6.2.rb +10 -0
  32. data/lib/regexp_parser/syntax/versions/2.6.3.rb +10 -0
  33. data/lib/regexp_parser/version.rb +1 -1
  34. data/regexp_parser.gemspec +3 -3
  35. data/spec/expression/base_spec.rb +94 -0
  36. data/spec/expression/clone_spec.rb +120 -0
  37. data/spec/expression/conditional_spec.rb +89 -0
  38. data/spec/expression/free_space_spec.rb +27 -0
  39. data/spec/expression/methods/match_length_spec.rb +154 -0
  40. data/spec/expression/methods/match_spec.rb +25 -0
  41. data/spec/expression/methods/strfregexp_spec.rb +224 -0
  42. data/spec/expression/methods/tests_spec.rb +99 -0
  43. data/spec/expression/methods/traverse_spec.rb +140 -0
  44. data/spec/expression/options_spec.rb +128 -0
  45. data/spec/expression/root_spec.rb +9 -0
  46. data/spec/expression/sequence_spec.rb +9 -0
  47. data/spec/expression/subexpression_spec.rb +50 -0
  48. data/spec/expression/to_h_spec.rb +26 -0
  49. data/spec/expression/to_s_spec.rb +100 -0
  50. data/spec/lexer/all_spec.rb +22 -0
  51. data/spec/lexer/conditionals_spec.rb +53 -0
  52. data/spec/lexer/escapes_spec.rb +14 -0
  53. data/spec/lexer/keep_spec.rb +10 -0
  54. data/spec/lexer/literals_spec.rb +89 -0
  55. data/spec/lexer/nesting_spec.rb +99 -0
  56. data/spec/lexer/refcalls_spec.rb +55 -0
  57. data/spec/parser/all_spec.rb +43 -0
  58. data/spec/parser/alternation_spec.rb +88 -0
  59. data/spec/parser/anchors_spec.rb +17 -0
  60. data/spec/parser/conditionals_spec.rb +179 -0
  61. data/spec/parser/errors_spec.rb +30 -0
  62. data/spec/parser/escapes_spec.rb +121 -0
  63. data/spec/parser/free_space_spec.rb +130 -0
  64. data/spec/parser/groups_spec.rb +108 -0
  65. data/spec/parser/keep_spec.rb +6 -0
  66. data/spec/parser/posix_classes_spec.rb +8 -0
  67. data/spec/parser/properties_spec.rb +115 -0
  68. data/spec/parser/quantifiers_spec.rb +51 -0
  69. data/spec/parser/refcalls_spec.rb +112 -0
  70. data/spec/parser/set/intersections_spec.rb +127 -0
  71. data/spec/parser/set/ranges_spec.rb +111 -0
  72. data/spec/parser/sets_spec.rb +178 -0
  73. data/spec/parser/types_spec.rb +18 -0
  74. data/spec/scanner/all_spec.rb +18 -0
  75. data/spec/scanner/anchors_spec.rb +21 -0
  76. data/spec/scanner/conditionals_spec.rb +128 -0
  77. data/spec/scanner/errors_spec.rb +68 -0
  78. data/spec/scanner/escapes_spec.rb +53 -0
  79. data/spec/scanner/free_space_spec.rb +133 -0
  80. data/spec/scanner/groups_spec.rb +52 -0
  81. data/spec/scanner/keep_spec.rb +10 -0
  82. data/spec/scanner/literals_spec.rb +49 -0
  83. data/spec/scanner/meta_spec.rb +18 -0
  84. data/spec/scanner/properties_spec.rb +64 -0
  85. data/spec/scanner/quantifiers_spec.rb +20 -0
  86. data/spec/scanner/refcalls_spec.rb +36 -0
  87. data/spec/scanner/sets_spec.rb +102 -0
  88. data/spec/scanner/types_spec.rb +14 -0
  89. data/spec/spec_helper.rb +15 -0
  90. data/{test → spec}/support/runner.rb +9 -8
  91. data/spec/support/shared_examples.rb +77 -0
  92. data/{test → spec}/support/warning_extractor.rb +5 -7
  93. data/spec/syntax/syntax_spec.rb +48 -0
  94. data/spec/syntax/syntax_token_map_spec.rb +23 -0
  95. data/spec/syntax/versions/1.8.6_spec.rb +17 -0
  96. data/spec/syntax/versions/1.9.1_spec.rb +10 -0
  97. data/spec/syntax/versions/1.9.3_spec.rb +9 -0
  98. data/spec/syntax/versions/2.0.0_spec.rb +13 -0
  99. data/spec/syntax/versions/2.2.0_spec.rb +9 -0
  100. data/spec/syntax/versions/aliases_spec.rb +37 -0
  101. data/spec/token/token_spec.rb +85 -0
  102. metadata +144 -143
  103. data/test/expression/test_all.rb +0 -12
  104. data/test/expression/test_base.rb +0 -90
  105. data/test/expression/test_clone.rb +0 -89
  106. data/test/expression/test_conditionals.rb +0 -113
  107. data/test/expression/test_free_space.rb +0 -35
  108. data/test/expression/test_set.rb +0 -84
  109. data/test/expression/test_strfregexp.rb +0 -230
  110. data/test/expression/test_subexpression.rb +0 -58
  111. data/test/expression/test_tests.rb +0 -99
  112. data/test/expression/test_to_h.rb +0 -59
  113. data/test/expression/test_to_s.rb +0 -104
  114. data/test/expression/test_traverse.rb +0 -161
  115. data/test/helpers.rb +0 -10
  116. data/test/lexer/test_all.rb +0 -41
  117. data/test/lexer/test_conditionals.rb +0 -127
  118. data/test/lexer/test_keep.rb +0 -24
  119. data/test/lexer/test_literals.rb +0 -130
  120. data/test/lexer/test_nesting.rb +0 -132
  121. data/test/lexer/test_refcalls.rb +0 -56
  122. data/test/parser/set/test_intersections.rb +0 -127
  123. data/test/parser/set/test_ranges.rb +0 -111
  124. data/test/parser/test_all.rb +0 -64
  125. data/test/parser/test_alternation.rb +0 -92
  126. data/test/parser/test_anchors.rb +0 -34
  127. data/test/parser/test_conditionals.rb +0 -187
  128. data/test/parser/test_errors.rb +0 -63
  129. data/test/parser/test_escapes.rb +0 -134
  130. data/test/parser/test_free_space.rb +0 -139
  131. data/test/parser/test_groups.rb +0 -289
  132. data/test/parser/test_keep.rb +0 -21
  133. data/test/parser/test_posix_classes.rb +0 -27
  134. data/test/parser/test_properties.rb +0 -133
  135. data/test/parser/test_quantifiers.rb +0 -301
  136. data/test/parser/test_refcalls.rb +0 -186
  137. data/test/parser/test_sets.rb +0 -179
  138. data/test/parser/test_types.rb +0 -50
  139. data/test/scanner/test_all.rb +0 -38
  140. data/test/scanner/test_anchors.rb +0 -38
  141. data/test/scanner/test_conditionals.rb +0 -184
  142. data/test/scanner/test_errors.rb +0 -91
  143. data/test/scanner/test_escapes.rb +0 -56
  144. data/test/scanner/test_free_space.rb +0 -200
  145. data/test/scanner/test_groups.rb +0 -79
  146. data/test/scanner/test_keep.rb +0 -35
  147. data/test/scanner/test_literals.rb +0 -89
  148. data/test/scanner/test_meta.rb +0 -40
  149. data/test/scanner/test_properties.rb +0 -312
  150. data/test/scanner/test_quantifiers.rb +0 -37
  151. data/test/scanner/test_refcalls.rb +0 -52
  152. data/test/scanner/test_scripts.rb +0 -53
  153. data/test/scanner/test_sets.rb +0 -119
  154. data/test/scanner/test_types.rb +0 -35
  155. data/test/scanner/test_unicode_blocks.rb +0 -30
  156. data/test/support/disable_autotest.rb +0 -8
  157. data/test/syntax/test_all.rb +0 -6
  158. data/test/syntax/test_syntax.rb +0 -61
  159. data/test/syntax/test_syntax_token_map.rb +0 -25
  160. data/test/syntax/versions/test_1.8.rb +0 -55
  161. data/test/syntax/versions/test_1.9.1.rb +0 -36
  162. data/test/syntax/versions/test_1.9.3.rb +0 -32
  163. data/test/syntax/versions/test_2.0.0.rb +0 -37
  164. data/test/syntax/versions/test_2.2.0.rb +0 -32
  165. data/test/syntax/versions/test_aliases.rb +0 -129
  166. data/test/syntax/versions/test_all.rb +0 -5
  167. data/test/test_all.rb +0 -5
  168. data/test/token/test_all.rb +0 -2
  169. data/test/token/test_token.rb +0 -107
@@ -0,0 +1,172 @@
1
# Describes the set of string lengths an expression can match.
#
# The possible lengths are derived from two ingredients: the length range of
# one unquantified occurrence of the expression (`base_min..base_max`) and the
# repetition range of its quantifier (`min_rep..max_rep`).
#
# Instances are Enumerable over the matchable lengths in ascending order.
class Regexp::MatchLength
  include Enumerable

  # Returns the match length of +obj+.
  #
  # obj - a Regexp::Expression::Base, or anything Regexp::Parser.parse accepts.
  def self.of(obj)
    exp = obj.is_a?(Regexp::Expression::Base) ? obj : Regexp::Parser.parse(obj)
    exp.match_length
  end

  # exp  - object responding to #repetitions (whose #min and #max are read).
  # opts - either `base: n` for a fixed base length of n, or all three of
  #        `base_min:`, `base_max:` and `reify:` (a callable returning a
  #        regexp source String that matches one unquantified occurrence).
  #
  # Raises KeyError if +:base+ is absent and any of the other keys is missing.
  def initialize(exp, opts = {})
    self.exp_class = exp.class
    self.min_rep = exp.repetitions.min
    self.max_rep = exp.repetitions.max
    if (base = opts[:base]) # note: 0 is truthy, so `base: 0` takes this branch
      self.base_min = base
      self.base_max = base
      self.reify = ->{ '.' * base }
    else
      self.base_min = opts.fetch(:base_min)
      self.base_max = opts.fetch(:base_max)
      self.reify = opts.fetch(:reify)
    end
  end

  # Yields each matchable length, stopping after `opts[:limit]` lengths
  # (default 1000) so that unbounded quantifiers terminate.
  #
  # Without a block, returns an Enumerator that honours the same +opts+.
  def each(opts = {})
    # Forward opts, otherwise the enumerator would fall back to the default limit.
    return enum_for(__method__, opts) unless block_given?
    limit = opts[:limit] || 1000
    yielded = 0
    (min..max).each do |num|
      next unless include?(num)
      yield(num)
      break if (yielded += 1) >= limit
    end
  end

  # Like #each but without a limit; does not terminate on its own when #max
  # is infinite. Without a block, returns an Enumerator.
  def endless_each
    return enum_for(__method__) unless block_given?
    (min..max).each { |num| yield(num) if include?(num) }
  end

  # Whether a string of exactly +length+ characters can be matched.
  def include?(length)
    test_regexp.match?('X' * length)
  end

  # Whether only a single length is possible.
  def fixed?
    min == max
  end

  # Smallest matchable length.
  def min
    scaled_length(min_rep, base_min)
  end

  # Largest matchable length; Float::INFINITY if unbounded.
  def max
    scaled_length(max_rep, base_max)
  end

  def minmax
    [min, max]
  end

  def inspect
    type = exp_class.name.sub('Regexp::Expression::', '')
    "#<#{self.class}<#{type}> min=#{min} max=#{max}>"
  end

  # Regexp source (String) that matches any run of characters whose length is
  # a possible match length. The upper repetition bound is left open when
  # max_rep is infinite.
  def to_re
    "(?:#{reify.call}){#{min_rep},#{max_rep unless max_rep == Float::INFINITY}}"
  end

  private

  attr_accessor :base_min, :base_max, :min_rep, :max_rep, :exp_class, :reify

  # Multiplies a repetition count with a base length. A zero on either side
  # yields 0 even if the other side is infinite; plain multiplication would
  # produce NaN for `Float::INFINITY * 0`.
  def scaled_length(repetitions, base)
    return 0 if repetitions.zero? || base.zero?

    repetitions * base
  end

  # Memoized regexp used by #include?. It is only ever applied to strings made
  # of 'X' characters, so whole-string anchors are equivalent to line anchors.
  def test_regexp
    @test_regexp ||= Regexp.new("\\A#{to_re}\\z").tap do |regexp|
      # Regexp#match? is missing on older rubies; add a per-object fallback.
      unless regexp.respond_to?(:match?)
        def regexp.match?(str); !!match(str) end
      end
    end
  end
end
79
+
80
# Adds #match_length to the expression classes. Every implementation returns
# a Regexp::MatchLength describing the string lengths the expression can match.
module Regexp::Expression
  MatchLength = Regexp::MatchLength

  # Expressions whose single occurrence always has the same length:
  # one character for sets, types, escapes, posix classes and properties;
  # zero characters for anchors, assertions, conditions, free space and \K.
  {
    1 => [
      CharacterSet,
      CharacterSet::Intersection,
      CharacterSet::IntersectedSequence,
      CharacterSet::Range,
      CharacterType::Base,
      EscapeSequence::Base,
      PosixClass,
      UnicodeProperty::Base,
    ],
    0 => [
      Anchor::Base,
      Assertion::Base,
      Conditional::Condition,
      FreeSpace,
      Keep::Mark,
    ],
  }.each do |fixed_length, targets|
    targets.each do |target|
      target.class_eval <<-RUBY, __FILE__, __LINE__ + 1
        def match_length
          MatchLength.new(self, base: #{fixed_length})
        end
      RUBY
    end
  end

  # A literal matches exactly as many characters as its text has.
  class Literal
    def match_length
      MatchLength.new(self, base: text.length)
    end
  end

  class Subexpression
    # The children match one after another, so their minimum lengths and
    # maximum lengths are summed and their regexp sources concatenated.
    def match_length
      shortest = inject(0) { |total, child| total + child.match_length.min }
      longest  = inject(0) { |total, child| total + child.match_length.max }
      source   = lambda { map { |child| child.match_length.to_re }.join }

      MatchLength.new(self, base_min: shortest, base_max: longest, reify: source)
    end

    # Match length of the contents only, computed on a throwaway Root that
    # receives clones of this expression's children and of its quantifier.
    def inner_match_length
      stand_in = Regexp::Expression::Root.build
      stand_in.expressions = expressions.map(&:clone)
      stand_in.quantifier = quantifier && quantifier.clone
      stand_in.match_length
    end
  end

  # Only one branch matches at a time: the shortest branch gives the minimum,
  # the longest branch the maximum, and the sources are joined with `|`.
  [Alternation, Conditional::Expression].each do |target|
    target.class_eval <<-RUBY, __FILE__, __LINE__ + 1
      def match_length
        MatchLength.new(self,
                        base_min: map { |branch| branch.match_length.min }.min,
                        base_max: map { |branch| branch.match_length.max }.max,
                        reify: lambda { map { |branch| branch.match_length.to_re }.join('|') })
      end
    RUBY
  end

  # A backreference takes its length from an unquantified clone of the
  # expression it refers to.
  class Backreference::Base
    def match_length
      raise ArgumentError, 'Missing referenced_expression - not parsed?' if referenced_expression.nil?

      referenced_expression.unquantified_clone.match_length
    end
  end

  # A codepoint list matches one character per listed codepoint.
  class EscapeSequence::CodepointList
    def match_length
      MatchLength.new(self, base: codepoints.count)
    end
  end

  # Special case. Absence group can match 0.. chars, irrespective of content.
  # TODO: in theory, they *can* exclude match lengths with `.`: `(?~.{3})`
  class Group::Absence
    def match_length
      MatchLength.new(self, base_min: 0, base_max: Float::INFINITY, reify: lambda { '.*' })
    end
  end
end
@@ -0,0 +1,35 @@
1
# Predicates for the regexp options that are active on an expression.
# Each one is true only when the corresponding entry in #options is exactly
# `true`; any other value (including a missing key) gives false.
module Regexp::Expression
  class Base
    # Is the `m` option set?
    def multiline?
      options[:m] == true
    end
    alias_method :m?, :multiline?

    # Is the `i` option set?
    def case_insensitive?
      options[:i] == true
    end
    alias_method :i?, :case_insensitive?
    alias_method :ignore_case?, :case_insensitive?

    # Is the `x` option set?
    def free_spacing?
      options[:x] == true
    end
    alias_method :x?, :free_spacing?
    alias_method :extended?, :free_spacing?

    # Is the `d` option set?
    def default_classes?
      options[:d] == true
    end
    alias_method :d?, :default_classes?

    # Is the `a` option set?
    def ascii_classes?
      options[:a] == true
    end
    alias_method :a?, :ascii_classes?

    # Is the `u` option set?
    def unicode_classes?
      options[:u] == true
    end
    alias_method :u?, :unicode_classes?
  end
end
@@ -1,5 +1,4 @@
1
1
  module Regexp::Expression
2
-
3
2
  class Base
4
3
 
5
4
  # %l Level (depth) of the expression. Returns 'root' for the root
@@ -75,32 +75,23 @@ module Regexp::Expression
75
75
  def one_of?(scope, top = true)
76
76
  case scope
77
77
  when Array
78
- if scope.include?(:*)
79
- return (scope.include?(token) or scope.include?(:*))
80
- else
81
- return scope.include?(token)
82
- end
78
+ scope.include?(:*) || scope.include?(token)
83
79
 
84
80
  when Hash
85
81
  if scope.has_key?(:*)
86
82
  test_type = scope.has_key?(type) ? type : :*
87
- return one_of?(scope[test_type], false)
83
+ one_of?(scope[test_type], false)
88
84
  else
89
- return (scope.has_key?(type) and one_of?(scope[type], false))
85
+ scope.has_key?(type) && one_of?(scope[type], false)
90
86
  end
91
87
 
92
88
  when Symbol
93
- return true if scope == :*
94
-
95
- return is?(scope) unless top
96
- return type?(scope) if top
89
+ scope.equal?(:*) || (top ? type?(scope) : is?(scope))
97
90
 
98
91
  else
99
- raise "Array, Hash, or Symbol expected, #{scope.class.name} given"
92
+ raise ArgumentError,
93
+ "Array, Hash, or Symbol expected, #{scope.class.name} given"
100
94
  end
101
-
102
- false
103
95
  end
104
-
105
96
  end
106
97
  end
@@ -12,8 +12,8 @@ module Regexp::Expression
12
12
  @max = max
13
13
  end
14
14
 
15
- def initialize_clone(other)
16
- other.instance_variable_set(:@text, text.dup)
15
+ def initialize_clone(orig)
16
+ @text = orig.text.dup
17
17
  super
18
18
  end
19
19
 
@@ -18,13 +18,14 @@ module Regexp::Expression
18
18
  end
19
19
 
20
20
  class << self
21
- def add_to(subexpression, options = {})
21
+ def add_to(subexpression, params = {}, active_opts = {})
22
22
  sequence = at_levels(
23
23
  subexpression.level,
24
24
  subexpression.set_level,
25
- options[:conditional_level] || subexpression.conditional_level
25
+ params[:conditional_level] || subexpression.conditional_level
26
26
  )
27
27
  sequence.nesting_level = subexpression.nesting_level + 1
28
+ sequence.options = active_opts
28
29
  subexpression.expressions << sequence
29
30
  sequence
30
31
  end
@@ -44,10 +45,6 @@ module Regexp::Expression
44
45
  end
45
46
  end
46
47
 
47
- def text
48
- to_s
49
- end
50
-
51
48
  def starts_at
52
49
  expressions.first.starts_at
53
50
  end
@@ -14,12 +14,8 @@ module Regexp::Expression
14
14
  expressions.last << exp
15
15
  end
16
16
 
17
- def add_sequence
18
- self.class::OPERAND.add_to(self)
19
- end
20
-
21
- def quantify(token, text, min = nil, max = nil, mode = :greedy)
22
- sequences.last.last.quantify(token, text, min, max, mode)
17
+ def add_sequence(active_opts = {})
18
+ self.class::OPERAND.add_to(self, {}, active_opts)
23
19
  end
24
20
 
25
21
  def to_s(format = :full)
@@ -12,8 +12,8 @@ module Regexp::Expression
12
12
  end
13
13
 
14
14
  # Override base method to clone the expressions as well.
15
- def initialize_clone(other)
16
- other.expressions = expressions.map(&:clone)
15
+ def initialize_clone(orig)
16
+ self.expressions = orig.expressions.map(&:clone)
17
17
  super
18
18
  end
19
19
 
@@ -46,9 +46,7 @@ module Regexp::Expression
46
46
 
47
47
  def to_s(format = :full)
48
48
  # Note: the format does not get passed down to subexpressions.
49
- # Note: cant use #text accessor, b/c it is overriden as def text; to_s end
50
- # in Expression::Sequence, causing infinite recursion. Clean-up needed.
51
- "#{@text}#{expressions.join}#{quantifier_affix(format)}"
49
+ "#{expressions.join}#{quantifier_affix(format)}"
52
50
  end
53
51
 
54
52
  def to_h
@@ -22,6 +22,7 @@ class Regexp::Lexer
22
22
  self.nesting = 0
23
23
  self.set_nesting = 0
24
24
  self.conditional_nesting = 0
25
+ self.shift = 0
25
26
 
26
27
  last = nil
27
28
  Regexp::Scanner.scan(input) do |type, token, text, ts, te|
@@ -30,15 +31,13 @@ class Regexp::Lexer
30
31
 
31
32
  ascend(type, token)
32
33
 
33
- break_literal(last) if type == :quantifier and
34
- last and last.type == :literal
35
-
36
- current = Regexp::Token.new(type, token, text, ts, te,
37
- nesting, set_nesting, conditional_nesting)
34
+ if type == :quantifier and last
35
+ break_literal(last) if last.type == :literal
36
+ break_codepoint_list(last) if last.token == :codepoint_list
37
+ end
38
38
 
39
- current = merge_literal(current) if type == :literal and
40
- set_nesting == 0 and
41
- last and last.type == :literal
39
+ current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
40
+ nesting, set_nesting, conditional_nesting)
42
41
 
43
42
  current = merge_condition(current) if type == :conditional and
44
43
  [:condition, :condition_close].include?(token)
@@ -65,7 +64,7 @@ class Regexp::Lexer
65
64
 
66
65
  private
67
66
 
68
- attr_accessor :tokens, :nesting, :set_nesting, :conditional_nesting
67
+ attr_accessor :tokens, :nesting, :set_nesting, :conditional_nesting, :shift
69
68
 
70
69
  def ascend(type, token)
71
70
  case type
@@ -92,44 +91,31 @@ class Regexp::Lexer
92
91
  # called by scan to break a literal run that is longer than one character
93
92
  # into two separate tokens when it is followed by a quantifier
94
93
  def break_literal(token)
95
- text = token.text
96
- if text.scan(/./mu).length > 1
97
- lead = text.sub(/.\z/mu, "")
98
- last = text[/.\z/mu] || ''
99
-
100
- if RUBY_VERSION >= '1.9'
101
- lead_length = lead.bytesize
102
- last_length = last.bytesize
103
- else
104
- lead_length = lead.length
105
- last_length = last.length
106
- end
107
-
108
- tokens.pop
109
- tokens << Regexp::Token.new(:literal, :literal, lead, token.ts,
110
- (token.te - last_length), nesting, set_nesting, conditional_nesting)
111
-
112
- tokens << Regexp::Token.new(:literal, :literal, last,
113
- (token.ts + lead_length),
114
- token.te, nesting, set_nesting, conditional_nesting)
115
- end
94
+ lead, last, _ = token.text.partition(/.\z/mu)
95
+ return if lead.empty?
96
+
97
+ tokens.pop
98
+ tokens << Regexp::Token.new(:literal, :literal, lead,
99
+ token.ts, (token.te - last.bytesize),
100
+ nesting, set_nesting, conditional_nesting)
101
+ tokens << Regexp::Token.new(:literal, :literal, last,
102
+ (token.ts + lead.bytesize), token.te,
103
+ nesting, set_nesting, conditional_nesting)
116
104
  end
117
105
 
118
- # called by scan to merge two consecutive literals. this happens when tokens
119
- # get normalized (as in the case of posix/bre) and end up becoming literals.
120
- def merge_literal(current)
121
- last = tokens.pop
106
+ def break_codepoint_list(token)
107
+ lead, _, tail = token.text.rpartition(' ')
108
+ return if lead.empty?
109
+
110
+ tokens.pop
111
+ tokens << Regexp::Token.new(:escape, :codepoint_list, lead + '}',
112
+ token.ts, (token.te - tail.length),
113
+ nesting, set_nesting, conditional_nesting)
114
+ tokens << Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
115
+ (token.ts + lead.length + 1), (token.te + 3),
116
+ nesting, set_nesting, conditional_nesting)
122
117
 
123
- Regexp::Token.new(
124
- :literal,
125
- :literal,
126
- last.text + current.text,
127
- last.ts,
128
- current.te,
129
- nesting,
130
- set_nesting,
131
- conditional_nesting,
132
- )
118
+ self.shift = shift + 3 # one space less, but extra \, u, {, and }
133
119
  end
134
120
 
135
121
  def merge_condition(current)
@@ -39,6 +39,8 @@ class Regexp::Parser
39
39
  parse_token(token)
40
40
  end
41
41
 
42
+ assign_referenced_expressions
43
+
42
44
  if block_given?
43
45
  block.call(root)
44
46
  else
@@ -163,14 +165,18 @@ class Regexp::Parser
163
165
  node << Backreference::NameCall.new(token, active_opts)
164
166
  when :number, :number_ref
165
167
  node << Backreference::Number.new(token, active_opts)
166
- when :number_rel_ref
167
- node << Backreference::NumberRelative.new(token, active_opts)
168
168
  when :number_recursion_ref
169
169
  node << Backreference::NumberRecursionLevel.new(token, active_opts)
170
170
  when :number_call
171
171
  node << Backreference::NumberCall.new(token, active_opts)
172
+ when :number_rel_ref
173
+ node << Backreference::NumberRelative.new(token, active_opts).tap do |exp|
174
+ assign_effective_number(exp)
175
+ end
172
176
  when :number_rel_call
173
- node << Backreference::NumberCallRelative.new(token, active_opts)
177
+ node << Backreference::NumberCallRelative.new(token, active_opts).tap do |exp|
178
+ assign_effective_number(exp)
179
+ end
174
180
  else
175
181
  raise UnknownTokenError.new('Backreference', token)
176
182
  end
@@ -209,9 +215,9 @@ class Regexp::Parser
209
215
  nest_conditional(Conditional::Expression.new(token, active_opts))
210
216
  when :condition
211
217
  conditional_nesting.last.condition = Conditional::Condition.new(token, active_opts)
212
- conditional_nesting.last.branch
218
+ conditional_nesting.last.add_sequence(active_opts)
213
219
  when :separator
214
- conditional_nesting.last.branch
220
+ conditional_nesting.last.add_sequence(active_opts)
215
221
  self.node = conditional_nesting.last.branches.last
216
222
  when :close
217
223
  conditional_nesting.pop
@@ -229,7 +235,7 @@ class Regexp::Parser
229
235
  end
230
236
 
231
237
  def posixclass(token)
232
- node << PosixClass.new(token)
238
+ node << PosixClass.new(token, active_opts)
233
239
  end
234
240
 
235
241
  include Regexp::Expression::UnicodeProperty
@@ -491,6 +497,9 @@ class Regexp::Parser
491
497
  end
492
498
  end
493
499
 
500
+ MOD_FLAGS = %w[i m x].map(&:to_sym)
501
+ ENC_FLAGS = %w[a d u].map(&:to_sym)
502
+
494
503
  def options_group(token)
495
504
  positive, negative = token.text.split('-', 2)
496
505
  negative ||= ''
@@ -499,23 +508,23 @@ class Regexp::Parser
499
508
  opt_changes = {}
500
509
  new_active_opts = active_opts.dup
501
510
 
502
- # Negative options have precedence. E.g. /(?i-i)a/ is case-sensitive.
503
- %w[i m x].each do |flag|
504
- if positive.include?(flag)
505
- opt_changes[flag.to_sym] = new_active_opts[flag.to_sym] = true
511
+ MOD_FLAGS.each do |flag|
512
+ if positive.include?(flag.to_s)
513
+ opt_changes[flag] = new_active_opts[flag] = true
506
514
  end
507
- if negative.include?(flag)
508
- opt_changes[flag.to_sym] = false
509
- new_active_opts.delete(flag.to_sym)
515
+ if negative.include?(flag.to_s)
516
+ opt_changes[flag] = false
517
+ new_active_opts.delete(flag)
510
518
  end
511
519
  end
512
520
 
513
- # Any encoding flag overrides all previous encoding flags. If there are
514
- # multiple encoding flags in an options string, the last one wins.
515
- # E.g. /(?dau)\w/ matches UTF8 chars but /(?dua)\w/ only ASCII chars.
516
- if (flag = positive.reverse[/[adu]/])
517
- %w[a d u].each { |key| new_active_opts.delete(key.to_sym) }
518
- opt_changes[flag.to_sym] = new_active_opts[flag.to_sym] = true
521
+ if (enc_flag = positive.reverse[/[adu]/])
522
+ enc_flag = enc_flag.to_sym
523
+ (ENC_FLAGS - [enc_flag]).each do |other|
524
+ opt_changes[other] = false if new_active_opts[other]
525
+ new_active_opts.delete(other)
526
+ end
527
+ opt_changes[enc_flag] = new_active_opts[enc_flag] = true
519
528
  end
520
529
 
521
530
  options_stack << new_active_opts
@@ -600,16 +609,14 @@ class Regexp::Parser
600
609
  end
601
610
 
602
611
  def sequence_operation(klass, token)
603
- if node.last.is_a?(klass)
604
- self.node = node.last
605
- elsif !node.is_a?(klass)
612
+ unless node.is_a?(klass)
606
613
  operator = klass.new(token, active_opts)
607
- sequence = operator.add_sequence
614
+ sequence = operator.add_sequence(active_opts)
608
615
  sequence.expressions = node.expressions
609
616
  node.expressions = []
610
617
  nest(operator)
611
618
  end
612
- node.add_sequence
619
+ node.add_sequence(active_opts)
613
620
  end
614
621
 
615
622
  def active_opts
@@ -627,4 +634,20 @@ class Regexp::Parser
627
634
  def count_captured_group
628
635
  captured_group_counts[node.level] += 1
629
636
  end
637
+
638
# Stores on +exp+ the number obtained by adding its (relative) number to the
# count of groups captured so far; negative numbers get an extra +1.
def assign_effective_number(exp)
  relative = exp.number
  adjustment = relative < 0 ? 1 : 0
  exp.effective_number = relative + total_captured_group_count + adjustment
end
642
+
643
# Links every referencing expression in the tree to the group it refers to.
#
# Two passes over `root.each_expression` are used so that a reference may
# appear before the group it points at:
# 1. index all Group::Capture expressions by their #identifier,
# 2. for each expression responding to #reference, set its
#    #referenced_expression to the indexed group (nil if there is none).
def assign_referenced_expressions
  targets = {}
  root.each_expression do |exp|
    targets[exp.identifier] = exp if exp.is_a?(Group::Capture)
  end
  root.each_expression do |exp|
    next unless exp.respond_to?(:reference)

    exp.referenced_expression = targets[exp.reference]
  end
end
630
653
  end # module Regexp::Parser