regexp_parser 1.7.0 → 2.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (166) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +9 -3
  3. data/LICENSE +1 -1
  4. data/Rakefile +6 -70
  5. data/lib/regexp_parser/error.rb +4 -0
  6. data/lib/regexp_parser/expression/base.rb +76 -0
  7. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  8. data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
  9. data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +22 -2
  10. data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +4 -8
  11. data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +4 -8
  12. data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
  13. data/lib/regexp_parser/expression/classes/conditional.rb +11 -5
  14. data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
  15. data/lib/regexp_parser/expression/classes/free_space.rb +5 -5
  16. data/lib/regexp_parser/expression/classes/group.rb +28 -15
  17. data/lib/regexp_parser/expression/classes/keep.rb +2 -0
  18. data/lib/regexp_parser/expression/classes/literal.rb +1 -5
  19. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
  20. data/lib/regexp_parser/expression/classes/root.rb +4 -19
  21. data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +11 -12
  22. data/lib/regexp_parser/expression/methods/construct.rb +41 -0
  23. data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
  24. data/lib/regexp_parser/expression/methods/match_length.rb +11 -7
  25. data/lib/regexp_parser/expression/methods/negative.rb +20 -0
  26. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  27. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  28. data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
  29. data/lib/regexp_parser/expression/methods/tests.rb +47 -1
  30. data/lib/regexp_parser/expression/methods/traverse.rb +34 -18
  31. data/lib/regexp_parser/expression/quantifier.rb +57 -17
  32. data/lib/regexp_parser/expression/sequence.rb +11 -47
  33. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  34. data/lib/regexp_parser/expression/shared.rb +111 -0
  35. data/lib/regexp_parser/expression/subexpression.rb +27 -19
  36. data/lib/regexp_parser/expression.rb +15 -141
  37. data/lib/regexp_parser/lexer.rb +83 -41
  38. data/lib/regexp_parser/parser.rb +372 -429
  39. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  40. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  41. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  42. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  43. data/lib/regexp_parser/scanner/properties/long.csv +651 -0
  44. data/lib/regexp_parser/scanner/properties/short.csv +249 -0
  45. data/lib/regexp_parser/scanner/property.rl +4 -4
  46. data/lib/regexp_parser/scanner/scanner.rl +303 -368
  47. data/lib/regexp_parser/scanner.rb +1423 -1674
  48. data/lib/regexp_parser/syntax/any.rb +2 -7
  49. data/lib/regexp_parser/syntax/base.rb +92 -67
  50. data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
  51. data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
  52. data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
  53. data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
  54. data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
  55. data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
  56. data/lib/regexp_parser/syntax/token/escape.rb +33 -0
  57. data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
  58. data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
  59. data/lib/regexp_parser/syntax/token/meta.rb +20 -0
  60. data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
  61. data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
  62. data/lib/regexp_parser/syntax/token/unicode_property.rb +751 -0
  63. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  64. data/lib/regexp_parser/syntax/token.rb +45 -0
  65. data/lib/regexp_parser/syntax/version_lookup.rb +19 -36
  66. data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
  67. data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
  68. data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
  69. data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
  70. data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
  71. data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
  72. data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
  73. data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
  74. data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
  75. data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
  76. data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
  77. data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
  78. data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
  79. data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
  80. data/lib/regexp_parser/syntax/versions.rb +3 -1
  81. data/lib/regexp_parser/syntax.rb +8 -6
  82. data/lib/regexp_parser/token.rb +9 -20
  83. data/lib/regexp_parser/version.rb +1 -1
  84. data/lib/regexp_parser.rb +0 -2
  85. data/regexp_parser.gemspec +19 -23
  86. metadata +53 -171
  87. data/CHANGELOG.md +0 -349
  88. data/README.md +0 -470
  89. data/lib/regexp_parser/scanner/properties/long.yml +0 -594
  90. data/lib/regexp_parser/scanner/properties/short.yml +0 -237
  91. data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
  92. data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
  93. data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
  94. data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
  95. data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
  96. data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
  97. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
  98. data/lib/regexp_parser/syntax/tokens.rb +0 -45
  99. data/spec/expression/base_spec.rb +0 -94
  100. data/spec/expression/clone_spec.rb +0 -120
  101. data/spec/expression/conditional_spec.rb +0 -89
  102. data/spec/expression/free_space_spec.rb +0 -27
  103. data/spec/expression/methods/match_length_spec.rb +0 -161
  104. data/spec/expression/methods/match_spec.rb +0 -25
  105. data/spec/expression/methods/strfregexp_spec.rb +0 -224
  106. data/spec/expression/methods/tests_spec.rb +0 -99
  107. data/spec/expression/methods/traverse_spec.rb +0 -161
  108. data/spec/expression/options_spec.rb +0 -128
  109. data/spec/expression/root_spec.rb +0 -9
  110. data/spec/expression/sequence_spec.rb +0 -9
  111. data/spec/expression/subexpression_spec.rb +0 -50
  112. data/spec/expression/to_h_spec.rb +0 -26
  113. data/spec/expression/to_s_spec.rb +0 -100
  114. data/spec/lexer/all_spec.rb +0 -22
  115. data/spec/lexer/conditionals_spec.rb +0 -53
  116. data/spec/lexer/escapes_spec.rb +0 -14
  117. data/spec/lexer/keep_spec.rb +0 -10
  118. data/spec/lexer/literals_spec.rb +0 -89
  119. data/spec/lexer/nesting_spec.rb +0 -99
  120. data/spec/lexer/refcalls_spec.rb +0 -55
  121. data/spec/parser/all_spec.rb +0 -43
  122. data/spec/parser/alternation_spec.rb +0 -88
  123. data/spec/parser/anchors_spec.rb +0 -17
  124. data/spec/parser/conditionals_spec.rb +0 -179
  125. data/spec/parser/errors_spec.rb +0 -30
  126. data/spec/parser/escapes_spec.rb +0 -121
  127. data/spec/parser/free_space_spec.rb +0 -130
  128. data/spec/parser/groups_spec.rb +0 -108
  129. data/spec/parser/keep_spec.rb +0 -6
  130. data/spec/parser/posix_classes_spec.rb +0 -8
  131. data/spec/parser/properties_spec.rb +0 -115
  132. data/spec/parser/quantifiers_spec.rb +0 -51
  133. data/spec/parser/refcalls_spec.rb +0 -112
  134. data/spec/parser/set/intersections_spec.rb +0 -127
  135. data/spec/parser/set/ranges_spec.rb +0 -111
  136. data/spec/parser/sets_spec.rb +0 -178
  137. data/spec/parser/types_spec.rb +0 -18
  138. data/spec/scanner/all_spec.rb +0 -18
  139. data/spec/scanner/anchors_spec.rb +0 -21
  140. data/spec/scanner/conditionals_spec.rb +0 -128
  141. data/spec/scanner/errors_spec.rb +0 -68
  142. data/spec/scanner/escapes_spec.rb +0 -53
  143. data/spec/scanner/free_space_spec.rb +0 -133
  144. data/spec/scanner/groups_spec.rb +0 -52
  145. data/spec/scanner/keep_spec.rb +0 -10
  146. data/spec/scanner/literals_spec.rb +0 -49
  147. data/spec/scanner/meta_spec.rb +0 -18
  148. data/spec/scanner/properties_spec.rb +0 -64
  149. data/spec/scanner/quantifiers_spec.rb +0 -20
  150. data/spec/scanner/refcalls_spec.rb +0 -36
  151. data/spec/scanner/sets_spec.rb +0 -102
  152. data/spec/scanner/types_spec.rb +0 -14
  153. data/spec/spec_helper.rb +0 -15
  154. data/spec/support/runner.rb +0 -42
  155. data/spec/support/shared_examples.rb +0 -77
  156. data/spec/support/warning_extractor.rb +0 -60
  157. data/spec/syntax/syntax_spec.rb +0 -48
  158. data/spec/syntax/syntax_token_map_spec.rb +0 -23
  159. data/spec/syntax/versions/1.8.6_spec.rb +0 -17
  160. data/spec/syntax/versions/1.9.1_spec.rb +0 -10
  161. data/spec/syntax/versions/1.9.3_spec.rb +0 -9
  162. data/spec/syntax/versions/2.0.0_spec.rb +0 -13
  163. data/spec/syntax/versions/2.2.0_spec.rb +0 -9
  164. data/spec/syntax/versions/aliases_spec.rb +0 -37
  165. data/spec/token/token_spec.rb +0 -85
  166. /data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
@@ -1,29 +1,25 @@
1
1
  module Regexp::Expression
2
-
3
2
  class Subexpression < Regexp::Expression::Base
4
3
  include Enumerable
5
4
 
6
5
  attr_accessor :expressions
7
6
 
8
7
  def initialize(token, options = {})
9
- super
10
-
11
8
  self.expressions = []
9
+ super
12
10
  end
13
11
 
14
12
  # Override base method to clone the expressions as well.
15
- def initialize_clone(orig)
16
- self.expressions = orig.expressions.map(&:clone)
13
+ def initialize_copy(orig)
14
+ self.expressions = orig.expressions.map do |exp|
15
+ exp.clone.tap { |copy| copy.parent = self }
16
+ end
17
17
  super
18
18
  end
19
19
 
20
20
  def <<(exp)
21
- if exp.is_a?(WhiteSpace) && last && last.is_a?(WhiteSpace)
22
- last.merge(exp)
23
- else
24
- exp.nesting_level = nesting_level + 1
25
- expressions << exp
26
- end
21
+ exp.parent = self
22
+ expressions << exp
27
23
  end
28
24
 
29
25
  %w[[] at each empty? fetch index join last length values_at].each do |method|
@@ -41,19 +37,31 @@ module Regexp::Expression
41
37
  end
42
38
 
43
39
  def te
44
- ts + to_s.length
45
- end
46
-
47
- def to_s(format = :full)
48
- # Note: the format does not get passed down to subexpressions.
49
- "#{expressions.join}#{quantifier_affix(format)}"
40
+ ts + base_length
50
41
  end
51
42
 
52
43
  def to_h
53
- attributes.merge({
44
+ attributes.merge(
54
45
  text: to_s(:base),
55
46
  expressions: expressions.map(&:to_h)
56
- })
47
+ )
48
+ end
49
+
50
+ def extract_quantifier_target(quantifier_description)
51
+ pre_quantifier_decorations = []
52
+ target = expressions.reverse.find do |exp|
53
+ if exp.decorative?
54
+ exp.custom_to_s_handling = true
55
+ pre_quantifier_decorations << exp.text
56
+ next
57
+ end
58
+ exp
59
+ end
60
+ target or raise Regexp::Parser::ParserError,
61
+ "No valid target found for '#{quantifier_description}' quantifier"
62
+
63
+ target.pre_quantifier_decorations = pre_quantifier_decorations
64
+ target
57
65
  end
58
66
  end
59
67
  end
@@ -1,138 +1,7 @@
1
- module Regexp::Expression
2
-
3
- class Base
4
- attr_accessor :type, :token
5
- attr_accessor :text, :ts
6
- attr_accessor :level, :set_level, :conditional_level, :nesting_level
7
-
8
- attr_accessor :quantifier
9
- attr_accessor :options
10
-
11
- def initialize(token, options = {})
12
- self.type = token.type
13
- self.token = token.token
14
- self.text = token.text
15
- self.ts = token.ts
16
- self.level = token.level
17
- self.set_level = token.set_level
18
- self.conditional_level = token.conditional_level
19
- self.nesting_level = 0
20
- self.quantifier = nil
21
- self.options = options
22
- end
23
-
24
- def initialize_clone(orig)
25
- self.text = (orig.text ? orig.text.dup : nil)
26
- self.options = (orig.options ? orig.options.dup : nil)
27
- self.quantifier = (orig.quantifier ? orig.quantifier.clone : nil)
28
- super
29
- end
30
-
31
- def to_re(format = :full)
32
- ::Regexp.new(to_s(format))
33
- end
34
-
35
- alias :starts_at :ts
36
-
37
- def full_length
38
- to_s.length
39
- end
40
-
41
- def offset
42
- [starts_at, full_length]
43
- end
44
-
45
- def coded_offset
46
- '@%d+%d' % offset
47
- end
48
-
49
- def to_s(format = :full)
50
- "#{text}#{quantifier_affix(format)}"
51
- end
52
-
53
- def quantifier_affix(expression_format)
54
- quantifier.to_s if quantified? && expression_format != :base
55
- end
56
-
57
- def terminal?
58
- !respond_to?(:expressions)
59
- end
60
-
61
- def quantify(token, text, min = nil, max = nil, mode = :greedy)
62
- self.quantifier = Quantifier.new(token, text, min, max, mode)
63
- end
64
-
65
- def unquantified_clone
66
- clone.tap { |exp| exp.quantifier = nil }
67
- end
68
-
69
- def quantified?
70
- !quantifier.nil?
71
- end
72
-
73
- # Deprecated. Prefer `#repetitions` which has a more uniform interface.
74
- def quantity
75
- return [nil,nil] unless quantified?
76
- [quantifier.min, quantifier.max]
77
- end
78
-
79
- def repetitions
80
- return 1..1 unless quantified?
81
- min = quantifier.min
82
- max = quantifier.max < 0 ? Float::INFINITY : quantifier.max
83
- # fix Range#minmax - https://bugs.ruby-lang.org/issues/15807
84
- (min..max).tap { |r| r.define_singleton_method(:minmax) { [min, max] } }
85
- end
86
-
87
- def greedy?
88
- quantified? and quantifier.greedy?
89
- end
90
-
91
- def reluctant?
92
- quantified? and quantifier.reluctant?
93
- end
94
- alias :lazy? :reluctant?
95
-
96
- def possessive?
97
- quantified? and quantifier.possessive?
98
- end
99
-
100
- def attributes
101
- {
102
- type: type,
103
- token: token,
104
- text: to_s(:base),
105
- starts_at: ts,
106
- length: full_length,
107
- level: level,
108
- set_level: set_level,
109
- conditional_level: conditional_level,
110
- options: options,
111
- quantifier: quantified? ? quantifier.to_h : nil,
112
- }
113
- end
114
- alias :to_h :attributes
115
- end
116
-
117
- def self.parsed(exp)
118
- warn('WARNING: Regexp::Expression::Base.parsed is buggy and '\
119
- 'will be removed in 2.0.0. Use Regexp::Parser.parse instead.')
120
- case exp
121
- when String
122
- Regexp::Parser.parse(exp)
123
- when Regexp
124
- Regexp::Parser.parse(exp.source) # <- causes loss of root options
125
- when Regexp::Expression # <- never triggers
126
- exp
127
- else
128
- raise ArgumentError, 'Expression.parsed accepts a String, Regexp, or '\
129
- 'a Regexp::Expression as a value for exp, but it '\
130
- "was given #{exp.class.name}."
131
- end
132
- end
133
-
134
- end # module Regexp::Expression
1
+ require 'regexp_parser/error'
135
2
 
3
+ require 'regexp_parser/expression/shared'
4
+ require 'regexp_parser/expression/base'
136
5
  require 'regexp_parser/expression/quantifier'
137
6
  require 'regexp_parser/expression/subexpression'
138
7
  require 'regexp_parser/expression/sequence'
@@ -140,24 +9,29 @@ require 'regexp_parser/expression/sequence_operation'
140
9
 
141
10
  require 'regexp_parser/expression/classes/alternation'
142
11
  require 'regexp_parser/expression/classes/anchor'
143
- require 'regexp_parser/expression/classes/backref'
12
+ require 'regexp_parser/expression/classes/backreference'
13
+ require 'regexp_parser/expression/classes/character_set'
14
+ require 'regexp_parser/expression/classes/character_set/intersection'
15
+ require 'regexp_parser/expression/classes/character_set/range'
16
+ require 'regexp_parser/expression/classes/character_type'
144
17
  require 'regexp_parser/expression/classes/conditional'
145
- require 'regexp_parser/expression/classes/escape'
18
+ require 'regexp_parser/expression/classes/escape_sequence'
146
19
  require 'regexp_parser/expression/classes/free_space'
147
20
  require 'regexp_parser/expression/classes/group'
148
21
  require 'regexp_parser/expression/classes/keep'
149
22
  require 'regexp_parser/expression/classes/literal'
150
23
  require 'regexp_parser/expression/classes/posix_class'
151
- require 'regexp_parser/expression/classes/property'
152
24
  require 'regexp_parser/expression/classes/root'
153
- require 'regexp_parser/expression/classes/set'
154
- require 'regexp_parser/expression/classes/set/intersection'
155
- require 'regexp_parser/expression/classes/set/range'
156
- require 'regexp_parser/expression/classes/type'
25
+ require 'regexp_parser/expression/classes/unicode_property'
157
26
 
27
+ require 'regexp_parser/expression/methods/construct'
28
+ require 'regexp_parser/expression/methods/human_name'
158
29
  require 'regexp_parser/expression/methods/match'
159
30
  require 'regexp_parser/expression/methods/match_length'
31
+ require 'regexp_parser/expression/methods/negative'
160
32
  require 'regexp_parser/expression/methods/options'
33
+ require 'regexp_parser/expression/methods/parts'
34
+ require 'regexp_parser/expression/methods/printing'
161
35
  require 'regexp_parser/expression/methods/strfregexp'
162
36
  require 'regexp_parser/expression/methods/tests'
163
37
  require 'regexp_parser/expression/methods/traverse'
@@ -4,57 +4,77 @@
4
4
  # given syntax flavor.
5
5
  class Regexp::Lexer
6
6
 
7
- OPENING_TOKENS = [
8
- :capture, :passive, :lookahead, :nlookahead, :lookbehind, :nlookbehind,
9
- :atomic, :options, :options_switch, :named, :absence
7
+ OPENING_TOKENS = %i[
8
+ capture passive lookahead nlookahead lookbehind nlookbehind
9
+ atomic options options_switch named absence open
10
10
  ].freeze
11
11
 
12
- CLOSING_TOKENS = [:close].freeze
12
+ CLOSING_TOKENS = %i[close].freeze
13
13
 
14
- def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", &block)
15
- new.lex(input, syntax, &block)
14
+ CONDITION_TOKENS = %i[condition condition_close].freeze
15
+
16
+ def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
17
+ new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
16
18
  end
17
19
 
18
- def lex(input, syntax = "ruby/#{RUBY_VERSION}", &block)
19
- syntax = Regexp::Syntax.new(syntax)
20
+ def lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
21
+ syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT
20
22
 
23
+ self.block = block
24
+ self.collect_tokens = collect_tokens
21
25
  self.tokens = []
26
+ self.prev_token = nil
27
+ self.preprev_token = nil
22
28
  self.nesting = 0
23
29
  self.set_nesting = 0
24
30
  self.conditional_nesting = 0
25
31
  self.shift = 0
26
32
 
27
- last = nil
28
- Regexp::Scanner.scan(input) do |type, token, text, ts, te|
33
+ Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te|
29
34
  type, token = *syntax.normalize(type, token)
30
35
  syntax.check! type, token
31
36
 
32
37
  ascend(type, token)
33
38
 
34
- if type == :quantifier and last
35
- break_literal(last) if last.type == :literal
36
- break_codepoint_list(last) if last.token == :codepoint_list
39
+ if (last = prev_token) &&
40
+ type == :quantifier &&
41
+ (
42
+ (last.type == :literal && (parts = break_literal(last))) ||
43
+ (last.token == :codepoint_list && (parts = break_codepoint_list(last)))
44
+ )
45
+ emit(parts[0])
46
+ last = parts[1]
37
47
  end
38
48
 
39
49
  current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
40
50
  nesting, set_nesting, conditional_nesting)
41
51
 
42
- current = merge_condition(current) if type == :conditional and
43
- [:condition, :condition_close].include?(token)
44
-
45
- last.next = current if last
46
- current.previous = last if last
52
+ if type == :conditional && CONDITION_TOKENS.include?(token)
53
+ current = merge_condition(current, last)
54
+ elsif last
55
+ last.next = current
56
+ current.previous = last
57
+ emit(last)
58
+ end
47
59
 
48
- tokens << current
49
- last = current
60
+ self.preprev_token = last
61
+ self.prev_token = current
50
62
 
51
63
  descend(type, token)
52
64
  end
53
65
 
54
- if block_given?
55
- tokens.map { |t| block.call(t) }
66
+ emit(prev_token) if prev_token
67
+
68
+ collect_tokens ? tokens : nil
69
+ end
70
+
71
+ def emit(token)
72
+ if block
73
+ # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
74
+ res = block.call(token)
75
+ tokens << res if collect_tokens
56
76
  else
57
- tokens
77
+ tokens << token
58
78
  end
59
79
  end
60
80
 
@@ -64,27 +84,37 @@ class Regexp::Lexer
64
84
 
65
85
  private
66
86
 
67
- attr_accessor :tokens, :nesting, :set_nesting, :conditional_nesting, :shift
87
+ attr_accessor :block,
88
+ :collect_tokens, :tokens, :prev_token, :preprev_token,
89
+ :nesting, :set_nesting, :conditional_nesting, :shift
68
90
 
69
91
  def ascend(type, token)
92
+ return unless CLOSING_TOKENS.include?(token)
93
+
70
94
  case type
71
95
  when :group, :assertion
72
- self.nesting = nesting - 1 if CLOSING_TOKENS.include?(token)
96
+ self.nesting = nesting - 1
73
97
  when :set
74
- self.set_nesting = set_nesting - 1 if token == :close
98
+ self.set_nesting = set_nesting - 1
75
99
  when :conditional
76
- self.conditional_nesting = conditional_nesting - 1 if token == :close
100
+ self.conditional_nesting = conditional_nesting - 1
101
+ else
102
+ raise "unhandled nesting type #{type}"
77
103
  end
78
104
  end
79
105
 
80
106
  def descend(type, token)
107
+ return unless OPENING_TOKENS.include?(token)
108
+
81
109
  case type
82
110
  when :group, :assertion
83
- self.nesting = nesting + 1 if OPENING_TOKENS.include?(token)
111
+ self.nesting = nesting + 1
84
112
  when :set
85
- self.set_nesting = set_nesting + 1 if token == :open
113
+ self.set_nesting = set_nesting + 1
86
114
  when :conditional
87
- self.conditional_nesting = conditional_nesting + 1 if token == :open
115
+ self.conditional_nesting = conditional_nesting + 1
116
+ else
117
+ raise "unhandled nesting type #{type}"
88
118
  end
89
119
  end
90
120
 
@@ -94,34 +124,46 @@ class Regexp::Lexer
94
124
  lead, last, _ = token.text.partition(/.\z/mu)
95
125
  return if lead.empty?
96
126
 
97
- tokens.pop
98
- tokens << Regexp::Token.new(:literal, :literal, lead,
99
- token.ts, (token.te - last.bytesize),
127
+ token_1 = Regexp::Token.new(:literal, :literal, lead,
128
+ token.ts, (token.te - last.length),
100
129
  nesting, set_nesting, conditional_nesting)
101
- tokens << Regexp::Token.new(:literal, :literal, last,
102
- (token.ts + lead.bytesize), token.te,
130
+ token_2 = Regexp::Token.new(:literal, :literal, last,
131
+ (token.ts + lead.length), token.te,
103
132
  nesting, set_nesting, conditional_nesting)
133
+
134
+ token_1.previous = preprev_token
135
+ token_1.next = token_2
136
+ token_2.previous = token_1 # .next will be set by #lex
137
+ [token_1, token_2]
104
138
  end
105
139
 
140
+ # if a codepoint list is followed by a quantifier, that quantifier applies
141
+ # to the last codepoint, e.g. /\u{61 62 63}{3}/ =~ 'abccc'
142
+ # c.f. #break_literal.
106
143
  def break_codepoint_list(token)
107
144
  lead, _, tail = token.text.rpartition(' ')
108
145
  return if lead.empty?
109
146
 
110
- tokens.pop
111
- tokens << Regexp::Token.new(:escape, :codepoint_list, lead + '}',
147
+ token_1 = Regexp::Token.new(:escape, :codepoint_list, lead + '}',
112
148
  token.ts, (token.te - tail.length),
113
149
  nesting, set_nesting, conditional_nesting)
114
- tokens << Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
150
+ token_2 = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
115
151
  (token.ts + lead.length + 1), (token.te + 3),
116
152
  nesting, set_nesting, conditional_nesting)
117
153
 
118
154
  self.shift = shift + 3 # one space less, but extra \, u, {, and }
155
+
156
+ token_1.previous = preprev_token
157
+ token_1.next = token_2
158
+ token_2.previous = token_1 # .next will be set by #lex
159
+ [token_1, token_2]
119
160
  end
120
161
 
121
- def merge_condition(current)
122
- last = tokens.pop
123
- Regexp::Token.new(:conditional, :condition, last.text + current.text,
162
+ def merge_condition(current, last)
163
+ token = Regexp::Token.new(:conditional, :condition, last.text + current.text,
124
164
  last.ts, current.te, nesting, set_nesting, conditional_nesting)
165
+ token.previous = preprev_token # .next will be set by #lex
166
+ token
125
167
  end
126
168
 
127
169
  end # module Regexp::Lexer