regexp_parser 1.4.0 → 1.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +66 -1
  3. data/Gemfile +3 -3
  4. data/README.md +11 -18
  5. data/Rakefile +3 -4
  6. data/lib/regexp_parser/expression.rb +28 -53
  7. data/lib/regexp_parser/expression/classes/backref.rb +18 -10
  8. data/lib/regexp_parser/expression/classes/conditional.rb +7 -2
  9. data/lib/regexp_parser/expression/classes/escape.rb +0 -4
  10. data/lib/regexp_parser/expression/classes/group.rb +4 -2
  11. data/lib/regexp_parser/expression/classes/keep.rb +1 -3
  12. data/lib/regexp_parser/expression/methods/match.rb +13 -0
  13. data/lib/regexp_parser/expression/methods/match_length.rb +172 -0
  14. data/lib/regexp_parser/expression/methods/options.rb +35 -0
  15. data/lib/regexp_parser/expression/methods/strfregexp.rb +0 -1
  16. data/lib/regexp_parser/expression/methods/tests.rb +6 -15
  17. data/lib/regexp_parser/expression/methods/traverse.rb +3 -1
  18. data/lib/regexp_parser/expression/quantifier.rb +2 -2
  19. data/lib/regexp_parser/expression/sequence.rb +3 -6
  20. data/lib/regexp_parser/expression/sequence_operation.rb +2 -6
  21. data/lib/regexp_parser/expression/subexpression.rb +3 -5
  22. data/lib/regexp_parser/lexer.rb +30 -44
  23. data/lib/regexp_parser/parser.rb +47 -24
  24. data/lib/regexp_parser/scanner.rb +1228 -1367
  25. data/lib/regexp_parser/scanner/char_type.rl +0 -3
  26. data/lib/regexp_parser/scanner/properties/long.yml +15 -1
  27. data/lib/regexp_parser/scanner/properties/short.yml +5 -0
  28. data/lib/regexp_parser/scanner/scanner.rl +101 -194
  29. data/lib/regexp_parser/syntax/tokens.rb +2 -10
  30. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +30 -0
  31. data/lib/regexp_parser/syntax/versions/2.6.2.rb +10 -0
  32. data/lib/regexp_parser/syntax/versions/2.6.3.rb +10 -0
  33. data/lib/regexp_parser/version.rb +1 -1
  34. data/regexp_parser.gemspec +2 -2
  35. data/spec/expression/base_spec.rb +94 -0
  36. data/spec/expression/clone_spec.rb +120 -0
  37. data/spec/expression/conditional_spec.rb +89 -0
  38. data/spec/expression/free_space_spec.rb +27 -0
  39. data/spec/expression/methods/match_length_spec.rb +161 -0
  40. data/spec/expression/methods/match_spec.rb +25 -0
  41. data/spec/expression/methods/strfregexp_spec.rb +224 -0
  42. data/spec/expression/methods/tests_spec.rb +99 -0
  43. data/spec/expression/methods/traverse_spec.rb +161 -0
  44. data/spec/expression/options_spec.rb +128 -0
  45. data/spec/expression/root_spec.rb +9 -0
  46. data/spec/expression/sequence_spec.rb +9 -0
  47. data/spec/expression/subexpression_spec.rb +50 -0
  48. data/spec/expression/to_h_spec.rb +26 -0
  49. data/spec/expression/to_s_spec.rb +100 -0
  50. data/spec/lexer/all_spec.rb +22 -0
  51. data/spec/lexer/conditionals_spec.rb +53 -0
  52. data/spec/lexer/delimiters_spec.rb +68 -0
  53. data/spec/lexer/escapes_spec.rb +14 -0
  54. data/spec/lexer/keep_spec.rb +10 -0
  55. data/spec/lexer/literals_spec.rb +89 -0
  56. data/spec/lexer/nesting_spec.rb +99 -0
  57. data/spec/lexer/refcalls_spec.rb +55 -0
  58. data/spec/parser/all_spec.rb +43 -0
  59. data/spec/parser/alternation_spec.rb +88 -0
  60. data/spec/parser/anchors_spec.rb +17 -0
  61. data/spec/parser/conditionals_spec.rb +179 -0
  62. data/spec/parser/errors_spec.rb +30 -0
  63. data/spec/parser/escapes_spec.rb +121 -0
  64. data/spec/parser/free_space_spec.rb +130 -0
  65. data/spec/parser/groups_spec.rb +108 -0
  66. data/spec/parser/keep_spec.rb +6 -0
  67. data/spec/parser/posix_classes_spec.rb +8 -0
  68. data/spec/parser/properties_spec.rb +115 -0
  69. data/spec/parser/quantifiers_spec.rb +52 -0
  70. data/spec/parser/refcalls_spec.rb +112 -0
  71. data/spec/parser/set/intersections_spec.rb +127 -0
  72. data/spec/parser/set/ranges_spec.rb +111 -0
  73. data/spec/parser/sets_spec.rb +178 -0
  74. data/spec/parser/types_spec.rb +18 -0
  75. data/spec/scanner/all_spec.rb +18 -0
  76. data/spec/scanner/anchors_spec.rb +21 -0
  77. data/spec/scanner/conditionals_spec.rb +128 -0
  78. data/spec/scanner/delimiters_spec.rb +52 -0
  79. data/spec/scanner/errors_spec.rb +67 -0
  80. data/spec/scanner/escapes_spec.rb +53 -0
  81. data/spec/scanner/free_space_spec.rb +133 -0
  82. data/spec/scanner/groups_spec.rb +52 -0
  83. data/spec/scanner/keep_spec.rb +10 -0
  84. data/spec/scanner/literals_spec.rb +49 -0
  85. data/spec/scanner/meta_spec.rb +18 -0
  86. data/spec/scanner/properties_spec.rb +64 -0
  87. data/spec/scanner/quantifiers_spec.rb +20 -0
  88. data/spec/scanner/refcalls_spec.rb +36 -0
  89. data/spec/scanner/sets_spec.rb +102 -0
  90. data/spec/scanner/types_spec.rb +14 -0
  91. data/spec/spec_helper.rb +15 -0
  92. data/{test → spec}/support/runner.rb +9 -8
  93. data/spec/support/shared_examples.rb +77 -0
  94. data/{test → spec}/support/warning_extractor.rb +5 -7
  95. data/spec/syntax/syntax_spec.rb +48 -0
  96. data/spec/syntax/syntax_token_map_spec.rb +23 -0
  97. data/spec/syntax/versions/1.8.6_spec.rb +17 -0
  98. data/spec/syntax/versions/1.9.1_spec.rb +10 -0
  99. data/spec/syntax/versions/1.9.3_spec.rb +9 -0
  100. data/spec/syntax/versions/2.0.0_spec.rb +13 -0
  101. data/spec/syntax/versions/2.2.0_spec.rb +9 -0
  102. data/spec/syntax/versions/aliases_spec.rb +37 -0
  103. data/spec/token/token_spec.rb +85 -0
  104. metadata +149 -144
  105. data/test/expression/test_all.rb +0 -12
  106. data/test/expression/test_base.rb +0 -90
  107. data/test/expression/test_clone.rb +0 -89
  108. data/test/expression/test_conditionals.rb +0 -113
  109. data/test/expression/test_free_space.rb +0 -35
  110. data/test/expression/test_set.rb +0 -84
  111. data/test/expression/test_strfregexp.rb +0 -230
  112. data/test/expression/test_subexpression.rb +0 -58
  113. data/test/expression/test_tests.rb +0 -99
  114. data/test/expression/test_to_h.rb +0 -59
  115. data/test/expression/test_to_s.rb +0 -104
  116. data/test/expression/test_traverse.rb +0 -161
  117. data/test/helpers.rb +0 -10
  118. data/test/lexer/test_all.rb +0 -41
  119. data/test/lexer/test_conditionals.rb +0 -127
  120. data/test/lexer/test_keep.rb +0 -24
  121. data/test/lexer/test_literals.rb +0 -130
  122. data/test/lexer/test_nesting.rb +0 -132
  123. data/test/lexer/test_refcalls.rb +0 -56
  124. data/test/parser/set/test_intersections.rb +0 -127
  125. data/test/parser/set/test_ranges.rb +0 -111
  126. data/test/parser/test_all.rb +0 -64
  127. data/test/parser/test_alternation.rb +0 -92
  128. data/test/parser/test_anchors.rb +0 -34
  129. data/test/parser/test_conditionals.rb +0 -187
  130. data/test/parser/test_errors.rb +0 -63
  131. data/test/parser/test_escapes.rb +0 -134
  132. data/test/parser/test_free_space.rb +0 -139
  133. data/test/parser/test_groups.rb +0 -289
  134. data/test/parser/test_keep.rb +0 -21
  135. data/test/parser/test_posix_classes.rb +0 -27
  136. data/test/parser/test_properties.rb +0 -134
  137. data/test/parser/test_quantifiers.rb +0 -301
  138. data/test/parser/test_refcalls.rb +0 -186
  139. data/test/parser/test_sets.rb +0 -179
  140. data/test/parser/test_types.rb +0 -50
  141. data/test/scanner/test_all.rb +0 -38
  142. data/test/scanner/test_anchors.rb +0 -38
  143. data/test/scanner/test_conditionals.rb +0 -184
  144. data/test/scanner/test_errors.rb +0 -91
  145. data/test/scanner/test_escapes.rb +0 -56
  146. data/test/scanner/test_free_space.rb +0 -200
  147. data/test/scanner/test_groups.rb +0 -79
  148. data/test/scanner/test_keep.rb +0 -35
  149. data/test/scanner/test_literals.rb +0 -89
  150. data/test/scanner/test_meta.rb +0 -40
  151. data/test/scanner/test_properties.rb +0 -312
  152. data/test/scanner/test_quantifiers.rb +0 -37
  153. data/test/scanner/test_refcalls.rb +0 -52
  154. data/test/scanner/test_scripts.rb +0 -53
  155. data/test/scanner/test_sets.rb +0 -119
  156. data/test/scanner/test_types.rb +0 -35
  157. data/test/scanner/test_unicode_blocks.rb +0 -30
  158. data/test/support/disable_autotest.rb +0 -8
  159. data/test/syntax/test_all.rb +0 -6
  160. data/test/syntax/test_syntax.rb +0 -61
  161. data/test/syntax/test_syntax_token_map.rb +0 -25
  162. data/test/syntax/versions/test_1.8.rb +0 -55
  163. data/test/syntax/versions/test_1.9.1.rb +0 -36
  164. data/test/syntax/versions/test_1.9.3.rb +0 -32
  165. data/test/syntax/versions/test_2.0.0.rb +0 -37
  166. data/test/syntax/versions/test_2.2.0.rb +0 -32
  167. data/test/syntax/versions/test_aliases.rb +0 -129
  168. data/test/syntax/versions/test_all.rb +0 -5
  169. data/test/test_all.rb +0 -5
  170. data/test/token/test_all.rb +0 -2
  171. data/test/token/test_token.rb +0 -107
@@ -0,0 +1,13 @@
1
+ module Regexp::Expression
2
+ class Base
3
+ def match?(string)
4
+ !!match(string)
5
+ end
6
+ alias :matches? :match?
7
+
8
+ def match(string, offset = 0)
9
+ Regexp.new(to_s).match(string, offset)
10
+ end
11
+ alias :=~ :match
12
+ end
13
+ end
@@ -0,0 +1,172 @@
1
+ class Regexp::MatchLength
2
+ include Enumerable
3
+
4
+ def self.of(obj)
5
+ exp = obj.is_a?(Regexp::Expression::Base) ? obj : Regexp::Parser.parse(obj)
6
+ exp.match_length
7
+ end
8
+
9
+ def initialize(exp, opts = {})
10
+ self.exp_class = exp.class
11
+ self.min_rep = exp.repetitions.min
12
+ self.max_rep = exp.repetitions.max
13
+ if base = opts[:base]
14
+ self.base_min = base
15
+ self.base_max = base
16
+ self.reify = ->{ '.' * base }
17
+ else
18
+ self.base_min = opts.fetch(:base_min)
19
+ self.base_max = opts.fetch(:base_max)
20
+ self.reify = opts.fetch(:reify)
21
+ end
22
+ end
23
+
24
+ def each(opts = {})
25
+ return enum_for(__method__, opts) unless block_given?
26
+ limit = opts[:limit] || 1000
27
+ yielded = 0
28
+ (min..max).each do |num|
29
+ next unless include?(num)
30
+ yield(num)
31
+ break if (yielded += 1) >= limit
32
+ end
33
+ end
34
+
35
+ def endless_each(&block)
36
+ return enum_for(__method__) unless block_given?
37
+ (min..max).each { |num| yield(num) if include?(num) }
38
+ end
39
+
40
+ def include?(length)
41
+ test_regexp.match?('X' * length)
42
+ end
43
+
44
+ def fixed?
45
+ min == max
46
+ end
47
+
48
+ def min
49
+ min_rep * base_min
50
+ end
51
+
52
+ def max
53
+ max_rep * base_max
54
+ end
55
+
56
+ def minmax
57
+ [min, max]
58
+ end
59
+
60
+ def inspect
61
+ type = exp_class.name.sub('Regexp::Expression::', '')
62
+ "#<#{self.class}<#{type}> min=#{min} max=#{max}>"
63
+ end
64
+
65
+ def to_re
66
+ "(?:#{reify.call}){#{min_rep},#{max_rep unless max_rep == Float::INFINITY}}"
67
+ end
68
+
69
+ private
70
+
71
+ attr_accessor :base_min, :base_max, :min_rep, :max_rep, :exp_class, :reify
72
+
73
+ def test_regexp
74
+ @test_regexp ||= Regexp.new("^#{to_re}$").tap do |regexp|
75
+ regexp.respond_to?(:match?) || def regexp.match?(str); !!match(str) end
76
+ end
77
+ end
78
+ end
79
+
80
+ module Regexp::Expression
81
+ MatchLength = Regexp::MatchLength
82
+
83
+ [
84
+ CharacterSet,
85
+ CharacterSet::Intersection,
86
+ CharacterSet::IntersectedSequence,
87
+ CharacterSet::Range,
88
+ CharacterType::Base,
89
+ EscapeSequence::Base,
90
+ PosixClass,
91
+ UnicodeProperty::Base,
92
+ ].each do |klass|
93
+ klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1
94
+ def match_length
95
+ MatchLength.new(self, base: 1)
96
+ end
97
+ RUBY
98
+ end
99
+
100
+ class Literal
101
+ def match_length
102
+ MatchLength.new(self, base: text.length)
103
+ end
104
+ end
105
+
106
+ class Subexpression
107
+ def match_length
108
+ MatchLength.new(self,
109
+ base_min: map { |exp| exp.match_length.min }.inject(0, :+),
110
+ base_max: map { |exp| exp.match_length.max }.inject(0, :+),
111
+ reify: ->{ map { |exp| exp.match_length.to_re }.join })
112
+ end
113
+
114
+ def inner_match_length
115
+ dummy = Regexp::Expression::Root.build
116
+ dummy.expressions = expressions.map(&:clone)
117
+ dummy.quantifier = quantifier && quantifier.clone
118
+ dummy.match_length
119
+ end
120
+ end
121
+
122
+ [
123
+ Alternation,
124
+ Conditional::Expression,
125
+ ].each do |klass|
126
+ klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1
127
+ def match_length
128
+ MatchLength.new(self,
129
+ base_min: map { |exp| exp.match_length.min }.min,
130
+ base_max: map { |exp| exp.match_length.max }.max,
131
+ reify: ->{ map { |exp| exp.match_length.to_re }.join('|') })
132
+ end
133
+ RUBY
134
+ end
135
+
136
+ [
137
+ Anchor::Base,
138
+ Assertion::Base,
139
+ Conditional::Condition,
140
+ FreeSpace,
141
+ Keep::Mark,
142
+ ].each do |klass|
143
+ klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1
144
+ def match_length
145
+ MatchLength.new(self, base: 0)
146
+ end
147
+ RUBY
148
+ end
149
+
150
+ class Backreference::Base
151
+ def match_length
152
+ if referenced_expression.nil?
153
+ raise ArgumentError, 'Missing referenced_expression - not parsed?'
154
+ end
155
+ referenced_expression.unquantified_clone.match_length
156
+ end
157
+ end
158
+
159
+ class EscapeSequence::CodepointList
160
+ def match_length
161
+ MatchLength.new(self, base: codepoints.count)
162
+ end
163
+ end
164
+
165
+ # Special case. Absence group can match 0.. chars, irrespective of content.
166
+ # TODO: in theory, they *can* exclude match lengths with `.`: `(?~.{3})`
167
+ class Group::Absence
168
+ def match_length
169
+ MatchLength.new(self, base_min: 0, base_max: Float::INFINITY, reify: ->{ '.*' })
170
+ end
171
+ end
172
+ end
@@ -0,0 +1,35 @@
1
+ module Regexp::Expression
2
+ class Base
3
+ def multiline?
4
+ options[:m] == true
5
+ end
6
+ alias :m? :multiline?
7
+
8
+ def case_insensitive?
9
+ options[:i] == true
10
+ end
11
+ alias :i? :case_insensitive?
12
+ alias :ignore_case? :case_insensitive?
13
+
14
+ def free_spacing?
15
+ options[:x] == true
16
+ end
17
+ alias :x? :free_spacing?
18
+ alias :extended? :free_spacing?
19
+
20
+ def default_classes?
21
+ options[:d] == true
22
+ end
23
+ alias :d? :default_classes?
24
+
25
+ def ascii_classes?
26
+ options[:a] == true
27
+ end
28
+ alias :a? :ascii_classes?
29
+
30
+ def unicode_classes?
31
+ options[:u] == true
32
+ end
33
+ alias :u? :unicode_classes?
34
+ end
35
+ end
@@ -1,5 +1,4 @@
1
1
  module Regexp::Expression
2
-
3
2
  class Base
4
3
 
5
4
  # %l Level (depth) of the expression. Returns 'root' for the root
@@ -75,32 +75,23 @@ module Regexp::Expression
75
75
  def one_of?(scope, top = true)
76
76
  case scope
77
77
  when Array
78
- if scope.include?(:*)
79
- return (scope.include?(token) or scope.include?(:*))
80
- else
81
- return scope.include?(token)
82
- end
78
+ scope.include?(:*) || scope.include?(token)
83
79
 
84
80
  when Hash
85
81
  if scope.has_key?(:*)
86
82
  test_type = scope.has_key?(type) ? type : :*
87
- return one_of?(scope[test_type], false)
83
+ one_of?(scope[test_type], false)
88
84
  else
89
- return (scope.has_key?(type) and one_of?(scope[type], false))
85
+ scope.has_key?(type) && one_of?(scope[type], false)
90
86
  end
91
87
 
92
88
  when Symbol
93
- return true if scope == :*
94
-
95
- return is?(scope) unless top
96
- return type?(scope) if top
89
+ scope.equal?(:*) || (top ? type?(scope) : is?(scope))
97
90
 
98
91
  else
99
- raise "Array, Hash, or Symbol expected, #{scope.class.name} given"
92
+ raise ArgumentError,
93
+ "Array, Hash, or Symbol expected, #{scope.class.name} given"
100
94
  end
101
-
102
- false
103
95
  end
104
-
105
96
  end
106
97
  end
@@ -14,7 +14,7 @@ module Regexp::Expression
14
14
  #
15
15
  # Returns self.
16
16
  def traverse(include_self = false, &block)
17
- raise 'traverse requires a block' unless block_given?
17
+ return enum_for(__method__, include_self) unless block_given?
18
18
 
19
19
  block.call(:enter, self, 0) if include_self
20
20
 
@@ -37,6 +37,8 @@ module Regexp::Expression
37
37
  # Iterates over the expressions of this expression as an array, passing
38
38
  # the expression and its index within its parent to the given block.
39
39
  def each_expression(include_self = false, &block)
40
+ return enum_for(__method__, include_self) unless block_given?
41
+
40
42
  traverse(include_self) do |event, exp, index|
41
43
  yield(exp, index) unless event == :exit
42
44
  end
@@ -12,8 +12,8 @@ module Regexp::Expression
12
12
  @max = max
13
13
  end
14
14
 
15
- def initialize_clone(other)
16
- other.instance_variable_set(:@text, text.dup)
15
+ def initialize_clone(orig)
16
+ @text = orig.text.dup
17
17
  super
18
18
  end
19
19
 
@@ -18,13 +18,14 @@ module Regexp::Expression
18
18
  end
19
19
 
20
20
  class << self
21
- def add_to(subexpression, options = {})
21
+ def add_to(subexpression, params = {}, active_opts = {})
22
22
  sequence = at_levels(
23
23
  subexpression.level,
24
24
  subexpression.set_level,
25
- options[:conditional_level] || subexpression.conditional_level
25
+ params[:conditional_level] || subexpression.conditional_level
26
26
  )
27
27
  sequence.nesting_level = subexpression.nesting_level + 1
28
+ sequence.options = active_opts
28
29
  subexpression.expressions << sequence
29
30
  sequence
30
31
  end
@@ -44,10 +45,6 @@ module Regexp::Expression
44
45
  end
45
46
  end
46
47
 
47
- def text
48
- to_s
49
- end
50
-
51
48
  def starts_at
52
49
  expressions.first.starts_at
53
50
  end
@@ -14,12 +14,8 @@ module Regexp::Expression
14
14
  expressions.last << exp
15
15
  end
16
16
 
17
- def add_sequence
18
- self.class::OPERAND.add_to(self)
19
- end
20
-
21
- def quantify(token, text, min = nil, max = nil, mode = :greedy)
22
- sequences.last.last.quantify(token, text, min, max, mode)
17
+ def add_sequence(active_opts = {})
18
+ self.class::OPERAND.add_to(self, {}, active_opts)
23
19
  end
24
20
 
25
21
  def to_s(format = :full)
@@ -12,8 +12,8 @@ module Regexp::Expression
12
12
  end
13
13
 
14
14
  # Override base method to clone the expressions as well.
15
- def initialize_clone(other)
16
- other.expressions = expressions.map(&:clone)
15
+ def initialize_clone(orig)
16
+ self.expressions = orig.expressions.map(&:clone)
17
17
  super
18
18
  end
19
19
 
@@ -46,9 +46,7 @@ module Regexp::Expression
46
46
 
47
47
  def to_s(format = :full)
48
48
  # Note: the format does not get passed down to subexpressions.
49
- # Note: cant use #text accessor, b/c it is overriden as def text; to_s end
50
- # in Expression::Sequence, causing infinite recursion. Clean-up needed.
51
- "#{@text}#{expressions.join}#{quantifier_affix(format)}"
49
+ "#{expressions.join}#{quantifier_affix(format)}"
52
50
  end
53
51
 
54
52
  def to_h
@@ -22,6 +22,7 @@ class Regexp::Lexer
22
22
  self.nesting = 0
23
23
  self.set_nesting = 0
24
24
  self.conditional_nesting = 0
25
+ self.shift = 0
25
26
 
26
27
  last = nil
27
28
  Regexp::Scanner.scan(input) do |type, token, text, ts, te|
@@ -30,15 +31,13 @@ class Regexp::Lexer
30
31
 
31
32
  ascend(type, token)
32
33
 
33
- break_literal(last) if type == :quantifier and
34
- last and last.type == :literal
35
-
36
- current = Regexp::Token.new(type, token, text, ts, te,
37
- nesting, set_nesting, conditional_nesting)
34
+ if type == :quantifier and last
35
+ break_literal(last) if last.type == :literal
36
+ break_codepoint_list(last) if last.token == :codepoint_list
37
+ end
38
38
 
39
- current = merge_literal(current) if type == :literal and
40
- set_nesting == 0 and
41
- last and last.type == :literal
39
+ current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
40
+ nesting, set_nesting, conditional_nesting)
42
41
 
43
42
  current = merge_condition(current) if type == :conditional and
44
43
  [:condition, :condition_close].include?(token)
@@ -65,7 +64,7 @@ class Regexp::Lexer
65
64
 
66
65
  private
67
66
 
68
- attr_accessor :tokens, :nesting, :set_nesting, :conditional_nesting
67
+ attr_accessor :tokens, :nesting, :set_nesting, :conditional_nesting, :shift
69
68
 
70
69
  def ascend(type, token)
71
70
  case type
@@ -92,44 +91,31 @@ class Regexp::Lexer
92
91
  # called by scan to break a literal run that is longer than one character
93
92
  # into two separate tokens when it is followed by a quantifier
94
93
  def break_literal(token)
95
- text = token.text
96
- if text.scan(/./mu).length > 1
97
- lead = text.sub(/.\z/mu, "")
98
- last = text[/.\z/mu] || ''
99
-
100
- if RUBY_VERSION >= '1.9'
101
- lead_length = lead.bytesize
102
- last_length = last.bytesize
103
- else
104
- lead_length = lead.length
105
- last_length = last.length
106
- end
107
-
108
- tokens.pop
109
- tokens << Regexp::Token.new(:literal, :literal, lead, token.ts,
110
- (token.te - last_length), nesting, set_nesting, conditional_nesting)
111
-
112
- tokens << Regexp::Token.new(:literal, :literal, last,
113
- (token.ts + lead_length),
114
- token.te, nesting, set_nesting, conditional_nesting)
115
- end
94
+ lead, last, _ = token.text.partition(/.\z/mu)
95
+ return if lead.empty?
96
+
97
+ tokens.pop
98
+ tokens << Regexp::Token.new(:literal, :literal, lead,
99
+ token.ts, (token.te - last.bytesize),
100
+ nesting, set_nesting, conditional_nesting)
101
+ tokens << Regexp::Token.new(:literal, :literal, last,
102
+ (token.ts + lead.bytesize), token.te,
103
+ nesting, set_nesting, conditional_nesting)
116
104
  end
117
105
 
118
- # called by scan to merge two consecutive literals. this happens when tokens
119
- # get normalized (as in the case of posix/bre) and end up becoming literals.
120
- def merge_literal(current)
121
- last = tokens.pop
106
+ def break_codepoint_list(token)
107
+ lead, _, tail = token.text.rpartition(' ')
108
+ return if lead.empty?
109
+
110
+ tokens.pop
111
+ tokens << Regexp::Token.new(:escape, :codepoint_list, lead + '}',
112
+ token.ts, (token.te - tail.length),
113
+ nesting, set_nesting, conditional_nesting)
114
+ tokens << Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
115
+ (token.ts + lead.length + 1), (token.te + 3),
116
+ nesting, set_nesting, conditional_nesting)
122
117
 
123
- Regexp::Token.new(
124
- :literal,
125
- :literal,
126
- last.text + current.text,
127
- last.ts,
128
- current.te,
129
- nesting,
130
- set_nesting,
131
- conditional_nesting,
132
- )
118
+ self.shift = shift + 3 # one space less, but extra \, u, {, and }
133
119
  end
134
120
 
135
121
  def merge_condition(current)