regexp_parser 2.6.1 → 2.7.0

This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: a468f97c0fecc8b90781d4d6775f82423fd5e7f15561a419be849b1d24fe05d9
-  data.tar.gz: c5c78beabe6ebe360b4f7cdede3c62149f4eba3c1556fd55cf02e3300cdb38b7
+  metadata.gz: 04af46818e9d560362fea9b3fd24802b557ac145ed95f6e02580dd7cf5e8ddfc
+  data.tar.gz: 75b7d30241f48ddf90c8cd68228fa928904ab6055ea755f4bdcf28361e645a4b
 SHA512:
-  metadata.gz: a3b86a8f66154804b49d227ad4653cb969f1c337d4dc90de09e116e39cd87f608a12d29cc0422e4b1b4201234bc2b5b6467b065d94c274674fb1c555a04518d8
-  data.tar.gz: fb26d224504f71645645013ee3dd5a07066b0323f9c97f8c0a716e75ea0d4fdffbf41c0526eafdf19c6d7fe1772d6616aec71541dc46d51123640cfc76b703f6
+  metadata.gz: 407025a9b14af76463260fca2a48f9fef4ab863e3dddf3f7f54101c1348611afa49d9973e850d9e1c84d6e5faf8f1a9d3d2da5dceaefe8dc4fefe7069ecd9280
+  data.tar.gz: 9f3d2eb4264318511a82e9034c4c4a8a8e73e67e427945f0c9f745fd37b2f2f0ae8e30ba942f0920da3109b59436a5518dfc5e2f7669317de0214a0deb6f0e07
data/CHANGELOG.md CHANGED
@@ -5,7 +5,29 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [Unreleased]
+## [2.7.0] - 2023-02-08 - [Janosch Müller](mailto:janosch84@gmail.com)
+
+### Added
+
+- `Regexp::Lexer.lex` now streams tokens when called with a block
+  - it can now take arbitrarily large input, just like `Regexp::Scanner`
+  - this also slightly improves `Regexp::Parser.parse` performance
+  - note: `Regexp::Parser.parse` still does not and will not support streaming
+- improved performance of `Subexpression#each_expression`
+- minor improvements to `Regexp::Scanner` performance
+- overall improvement of parse performance: about 10% for large Regexps
+
+### Fixed
+
+- parsing of octal escape sequences in sets, e.g. `[\141]`
+  * thanks to [Randy Stauner](https://github.com/rwstauner) for the report
+
+## [2.6.2] - 2023-01-19 - [Janosch Müller](mailto:janosch84@gmail.com)
+
+### Fixed
+
+- fixed `SystemStackError` when cloning recursive subexpression calls
+  * e.g. `Regexp::Parser.parse(/a|b\g<0>/).dup`
 
 ## [2.6.1] - 2022-11-16 - [Janosch Müller](mailto:janosch84@gmail.com)
 
@@ -5,9 +5,25 @@ module Regexp::Expression
     attr_accessor :referenced_expression
 
     def initialize_copy(orig)
-      self.referenced_expression = orig.referenced_expression.dup
+      exp_id = [self.class, self.starts_at]
+
+      # prevent infinite recursion for recursive subexp calls
+      copied = @@copied ||= {}
+      self.referenced_expression =
+        if copied[exp_id]
+          orig.referenced_expression
+        else
+          copied[exp_id] = true
+          orig.referenced_expression.dup
+        end
+      copied.clear
+
       super
     end
+
+    def referential?
+      true
+    end
   end
 
   class Number < Backreference::Base
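The `@@copied` guard above is what resolves the `SystemStackError` from the 2.6.2 changelog entry. A minimal sketch of the formerly failing call, assuming the gem is loaded as `regexp_parser`:

```ruby
require 'regexp_parser'

# /a|b\g<0>/ calls the whole pattern recursively, so the \g<0> call's
# referenced_expression is (indirectly) its own ancestor. Duplicating the
# tree used to recurse without end; the guard above lets it terminate.
tree = Regexp::Parser.parse(/a|b\g<0>/)
copy = tree.dup
puts copy.to_s # expected: "a|b\g<0>"
```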
@@ -20,6 +20,10 @@ module Regexp::Expression
       self.referenced_expression = orig.referenced_expression.dup
       super
     end
+
+    def referential?
+      true
+    end
   end
 
   class Branch < Regexp::Expression::Sequence; end
@@ -55,6 +59,10 @@ module Regexp::Expression
       condition.reference
     end
 
+    def referential?
+      true
+    end
+
     def parts
       [text.dup, condition, *intersperse(branches, '|'), ')']
     end
@@ -63,16 +63,20 @@ class Regexp::MatchLength
   end
 
   def to_re
-    "(?:#{reify.call}){#{min_rep},#{max_rep unless max_rep == Float::INFINITY}}"
+    /(?:#{reify.call}){#{min_rep},#{max_rep unless max_rep == Float::INFINITY}}/
   end
 
   private
 
   attr_accessor :base_min, :base_max, :min_rep, :max_rep, :exp_class, :reify
 
-  def test_regexp
-    @test_regexp ||= Regexp.new("^#{to_re}$").tap do |regexp|
-      regexp.respond_to?(:match?) || def regexp.match?(str); !!match(str) end
+  if Regexp.method_defined?(:match?) # ruby >= 2.4
+    def test_regexp
+      @test_regexp ||= /^#{to_re}$/
+    end
+  else
+    def test_regexp
+      @test_regexp ||= /^#{to_re}$/.tap { |r| def r.match?(s); !!match(s) end }
     end
   end
 end
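For context, `Regexp::MatchLength` (whose internals change above) estimates the lengths a pattern can match; `test_regexp` is only an internal helper. A rough usage sketch based on the gem's documented API:

```ruby
require 'regexp_parser'

ml = Regexp::MatchLength.of(/a{2,4}/)
ml.minmax      # expected: [2, 4]
ml.include?(3) # expected: true
ml.include?(5) # expected: false
```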
@@ -36,11 +36,14 @@ module Regexp::Expression
 
   # Iterates over the expressions of this expression as an array, passing
   # the expression and its index within its parent to the given block.
-  def each_expression(include_self = false)
+  def each_expression(include_self = false, &block)
     return enum_for(__method__, include_self) unless block_given?
 
-    traverse(include_self) do |event, exp, index|
-      yield(exp, index) unless event == :exit
+    block.call(self, 0) if include_self
+
+    each_with_index do |exp, index|
+      block.call(exp, index)
+      exp.each_expression(&block) unless exp.terminal?
     end
   end
 
@@ -13,7 +13,6 @@ module Regexp::Expression
         set_level: exp.set_level,
         conditional_level: params[:conditional_level] || exp.conditional_level,
       )
-      sequence.nesting_level = exp.nesting_level + 1
       sequence.options = active_opts
       exp.expressions << sequence
       sequence
@@ -77,7 +77,11 @@ module Regexp::Expression
     end
 
     def terminal?
-      !respond_to?(:expressions)
+      true # overridden to be false in Expression::Subexpression
+    end
+
+    def referential?
+      false # overridden to be true e.g. in Expression::Backreference::Base
     end
 
     def nesting_level=(lvl)
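The new `terminal?`/`referential?` predicates replace the earlier `respond_to?`-based checks and are what the reworked `each_expression` relies on. A short sketch of how they surface in the public API (values shown are expectations, not captured output):

```ruby
require 'regexp_parser'

root = Regexp::Parser.parse(/(a)\1/)

# each_expression recurses into non-terminal nodes and yields each
# expression together with its index within its parent
root.each_expression { |exp, index| puts "#{exp.class} @ #{index}" }

group, backref = root.expressions
group.terminal?      # expected: false - a group holds sub-expressions
backref.terminal?    # expected: true
backref.referential? # expected: true
backref.referenced_expression.equal?(group) # expected: true
```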
@@ -19,7 +19,6 @@ module Regexp::Expression
       if exp.is_a?(WhiteSpace) && last && last.is_a?(WhiteSpace)
         last.merge(exp)
       else
-        exp.nesting_level = nesting_level + 1
         expressions << exp
       end
     end
@@ -53,6 +52,10 @@ module Regexp::Expression
     )
   end
 
+  def terminal?
+    false
+  end
+
   private
 
   def intersperse(expressions, separator)
@@ -13,50 +13,68 @@ class Regexp::Lexer
 
   CONDITION_TOKENS = %i[condition condition_close].freeze
 
-  def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
-    new.lex(input, syntax, options: options, &block)
+  def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
+    new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
   end
 
-  def lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
-    syntax = Regexp::Syntax.for(syntax)
+  def lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
+    syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT
 
+    self.block = block
+    self.collect_tokens = collect_tokens
     self.tokens = []
+    self.prev_token = nil
+    self.preprev_token = nil
     self.nesting = 0
     self.set_nesting = 0
     self.conditional_nesting = 0
     self.shift = 0
 
-    last = nil
-    Regexp::Scanner.scan(input, options: options) do |type, token, text, ts, te|
+    Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te|
       type, token = *syntax.normalize(type, token)
       syntax.check! type, token
 
       ascend(type, token)
 
-      if type == :quantifier and last
-        break_literal(last) if last.type == :literal
-        break_codepoint_list(last) if last.token == :codepoint_list
+      if (last = prev_token) &&
+         type == :quantifier &&
+         (
+           (last.type == :literal && (parts = break_literal(last))) ||
+           (last.token == :codepoint_list && (parts = break_codepoint_list(last)))
+         )
+        emit(parts[0])
+        last = parts[1]
       end
 
       current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
                                   nesting, set_nesting, conditional_nesting)
 
-      current = merge_condition(current) if type == :conditional and
-        CONDITION_TOKENS.include?(token)
-
-      last.next = current if last
-      current.previous = last if last
+      if type == :conditional && CONDITION_TOKENS.include?(token)
+        current = merge_condition(current, last)
+      elsif last
+        last.next = current
+        current.previous = last
+        emit(last)
+      end
 
-      tokens << current
-      last = current
+      self.preprev_token = last
+      self.prev_token = current
 
       descend(type, token)
     end
 
-    if block_given?
-      tokens.map { |t| block.call(t) }
+    emit(prev_token) if prev_token
+
+    collect_tokens ? tokens : nil
+  end
+
+  def emit(token)
+    if block
+      # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
+      res = block.call(token)
+      tokens << res if collect_tokens
     else
-      tokens
+      tokens << token
     end
   end
 
@@ -66,7 +84,9 @@ class Regexp::Lexer
 
   private
 
-  attr_accessor :tokens, :nesting, :set_nesting, :conditional_nesting, :shift
+  attr_accessor :block,
+                :collect_tokens, :tokens, :prev_token, :preprev_token,
+                :nesting, :set_nesting, :conditional_nesting, :shift
 
   def ascend(type, token)
     case type
@@ -96,34 +116,46 @@ class Regexp::Lexer
     lead, last, _ = token.text.partition(/.\z/mu)
     return if lead.empty?
 
-    tokens.pop
-    tokens << Regexp::Token.new(:literal, :literal, lead,
+    token_1 = Regexp::Token.new(:literal, :literal, lead,
                                 token.ts, (token.te - last.length),
                                 nesting, set_nesting, conditional_nesting)
-    tokens << Regexp::Token.new(:literal, :literal, last,
+    token_2 = Regexp::Token.new(:literal, :literal, last,
                                 (token.ts + lead.length), token.te,
                                 nesting, set_nesting, conditional_nesting)
+
+    token_1.previous = preprev_token
+    token_1.next = token_2
+    token_2.previous = token_1 # .next will be set by #lex
+    [token_1, token_2]
   end
 
+  # if a codepoint list is followed by a quantifier, that quantifier applies
+  # to the last codepoint, e.g. /\u{61 62 63}{3}/ =~ 'abccc'
+  # c.f. #break_literal.
   def break_codepoint_list(token)
     lead, _, tail = token.text.rpartition(' ')
     return if lead.empty?
 
-    tokens.pop
-    tokens << Regexp::Token.new(:escape, :codepoint_list, lead + '}',
+    token_1 = Regexp::Token.new(:escape, :codepoint_list, lead + '}',
                                 token.ts, (token.te - tail.length),
                                 nesting, set_nesting, conditional_nesting)
-    tokens << Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
+    token_2 = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
                                 (token.ts + lead.length + 1), (token.te + 3),
                                 nesting, set_nesting, conditional_nesting)
 
     self.shift = shift + 3 # one space less, but extra \, u, {, and }
+
+    token_1.previous = preprev_token
+    token_1.next = token_2
+    token_2.previous = token_1 # .next will be set by #lex
+    [token_1, token_2]
   end
 
-  def merge_condition(current)
-    last = tokens.pop
-    Regexp::Token.new(:conditional, :condition, last.text + current.text,
+  def merge_condition(current, last)
+    token = Regexp::Token.new(:conditional, :condition, last.text + current.text,
       last.ts, current.te, nesting, set_nesting, conditional_nesting)
+    token.previous = preprev_token # .next will be set by #lex
+    token
   end
 
 end # module Regexp::Lexer
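A usage sketch of the reworked lexer: with a block, tokens are emitted as they are produced, and `collect_tokens: false` additionally skips building the result Array, which keeps memory flat for very large inputs (the `count` bookkeeping is only for illustration):

```ruby
require 'regexp_parser'

count = 0
Regexp::Lexer.lex(/(a|b)*c+/, collect_tokens: false) do |token|
  # each token is a Regexp::Token (type, token, text, ts, te, nesting, ...)
  count += 1
end
puts count # number of tokens seen; no token Array was accumulated
```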
@@ -18,11 +18,11 @@ class Regexp::Parser
     end
   end
 
-  def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
+  def self.parse(input, syntax = nil, options: nil, &block)
     new.parse(input, syntax, options: options, &block)
   end
 
-  def parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
+  def parse(input, syntax = nil, options: nil, &block)
     root = Root.construct(options: extract_options(input, options))
 
     self.root = root
@@ -35,7 +35,7 @@ class Regexp::Parser
 
     self.captured_group_counts = Hash.new(0)
 
-    Regexp::Lexer.scan(input, syntax, options: options) do |token|
+    Regexp::Lexer.scan(input, syntax, options: options, collect_tokens: false) do |token|
       parse_token(token)
     end
 
@@ -379,7 +379,7 @@ class Regexp::Parser
   end
 
   def sequence_operation(klass, token)
-    unless node.is_a?(klass)
+    unless node.instance_of?(klass)
       operator = klass.new(token, active_opts)
       sequence = operator.add_sequence(active_opts)
       sequence.expressions = node.expressions
@@ -541,7 +541,7 @@ class Regexp::Parser
 
   def range(token)
     exp = CharacterSet::Range.new(token, active_opts)
-    scope = node.last.is_a?(CharacterSet::IntersectedSequence) ? node.last : node
+    scope = node.last.instance_of?(CharacterSet::IntersectedSequence) ? node.last : node
     exp << scope.expressions.pop
     nest(exp)
   end
@@ -568,7 +568,7 @@ class Regexp::Parser
   end
 
   def close_completed_character_set_range
-    decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
+    decrease_nesting if node.instance_of?(CharacterSet::Range) && node.complete?
   end
 
   def active_opts
@@ -579,15 +579,16 @@ class Regexp::Parser
   # an instance of Backreference::Number, its #referenced_expression is set to
   # the instance of Group::Capture that it refers to via its number.
   def assign_referenced_expressions
-    # find all referencable expressions
+    # find all referencable and refering expressions
     targets = { 0 => root }
+    referrers = []
     root.each_expression do |exp|
       exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
+      referrers << exp if exp.referential?
     end
-    # assign them to any refering expressions
-    root.each_expression do |exp|
-      next unless exp.respond_to?(:reference)
-
+    # assign reference expression to refering expressions
+    # (in a second iteration because there might be forward references)
+    referrers.each do |exp|
       exp.referenced_expression = targets[exp.reference] ||
                                   raise(ParserError, "Invalid reference: #{exp.reference}")
     end
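Referrers are now collected during the single traversal and resolved afterwards, so references that point forward in the pattern (e.g. a `\g<x>` call appearing before its group) are handled as well. A minimal sketch of the resulting links, with expected values in comments:

```ruby
require 'regexp_parser'

root    = Regexp::Parser.parse(/(?<x>a)\k<x>/)
backref = root.expressions.last    # the \k<x> backreference node

backref.referential?               # expected: true
backref.referenced_expression.to_s # expected: "(?<x>a)"
```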
@@ -17,7 +17,7 @@
     text = copy(data, ts-1, te)
     type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
 
-    name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
+    name = text[3..-2].gsub(/[\^\s_\-]/, '').downcase
 
     token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
     validation_error(:property, name) unless token
@@ -59,9 +59,6 @@
   one_or_more = '+' | '+?' | '++';
 
   quantifier_greedy = '?' | '*' | '+';
-  quantifier_reluctant = '??' | '*?' | '+?';
-  quantifier_possessive = '?+' | '*+' | '++';
-  quantifier_mode = '?' | '+';
 
   quantity_exact = (digit+);
   quantity_minimum = (digit+) . ',';
@@ -70,9 +67,6 @@
   quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
                                        quantity_maximum | quantity_range ) . range_close;
 
-  quantifiers = quantifier_greedy | quantifier_reluctant |
-                quantifier_possessive | quantifier_interval;
-
   conditional = '(?(';
 
   group_comment = '?#' . [^)]* . group_close;
@@ -132,7 +126,8 @@
                keep_mark | sequence_char;
 
   # escapes that also work within a character set
-  set_escape = backslash | brackets | escaped_ascii | property_char |
+  set_escape = backslash | brackets | escaped_ascii |
+               octal_sequence | property_char |
                sequence_char | single_codepoint_char_type;
 
 
@@ -168,8 +163,8 @@
   };
 
   '-]' @set_closed { # special case, emits two tokens
-    emit(:literal, :literal, copy(data, ts, te-1))
-    emit(:set, :close, copy(data, ts+1, te))
+    emit(:literal, :literal, '-')
+    emit(:set, :close, ']')
     if in_set?
       fret;
     else
@@ -183,28 +178,27 @@
   };
 
   '^' {
-    text = copy(data, ts, te)
-    if tokens.last[1] == :open
-      emit(:set, :negate, text)
+    if prev_token[1] == :open
+      emit(:set, :negate, '^')
     else
-      emit(:literal, :literal, text)
+      emit(:literal, :literal, '^')
     end
   };
 
   '-' {
-    text = copy(data, ts, te)
-    # ranges cant start with a subset or intersection/negation/range operator
-    if tokens.last[0] == :set
-      emit(:literal, :literal, text)
+    # ranges cant start with the opening bracket, a subset, or
+    # intersection/negation/range operators
+    if prev_token[0] == :set
+      emit(:literal, :literal, '-')
     else
-      emit(:set, :range, text)
+      emit(:set, :range, '-')
     end
   };
 
   # Unlike ranges, intersections can start or end at set boundaries, whereupon
   # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
   '&&' {
-    emit(:set, :intersection, copy(data, ts, te))
+    emit(:set, :intersection, '&&')
   };
 
   backslash {
@@ -212,7 +206,7 @@
   };
 
   set_open >(open_bracket, 1) >set_opened {
-    emit(:set, :open, copy(data, ts, te))
+    emit(:set, :open, '[')
     fcall character_set;
   };
 
@@ -254,12 +248,22 @@
   # set escapes scanner
   # --------------------------------------------------------------------------
   set_escape_sequence := |*
+    # Special case: in sets, octal sequences have higher priority than backrefs
+    octal_sequence {
+      emit(:escape, :octal, copy(data, ts-1, te))
+      fret;
+    };
+
+    # Scan all other escapes that work in sets with the generic escape scanner
     set_escape > (escaped_set_alpha, 2) {
       fhold;
       fnext character_set;
       fcall escape_sequence;
     };
 
+    # Treat all remaining escapes - those not supported in sets - as literal.
+    # (This currently includes \^, \-, \&, \:, although these could potentially
+    # be meta chars when not escaped, depending on their position in the set.)
     any > (escaped_set_alpha, 1) {
       emit(:escape, :literal, copy(data, ts-1, te))
       fret;
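The added `octal_sequence` branch implements the `[\141]` fix from the changelog: inside a set, `\141` now scans as an octal escape rather than being misread. A quick sketch of the observable effect (token values are expectations, not captured output):

```ruby
require 'regexp_parser'

# \141 is octal for 'a'
Regexp::Scanner.scan(/[\141]/) do |type, token, text, ts, te|
  puts [type, token, text].inspect
end
# expected, roughly:
#   [:set, :open, "["]
#   [:escape, :octal, "\\141"]
#   [:set, :close, "]"]
```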
@@ -528,7 +532,7 @@
   group_close @group_closed {
     if conditional_stack.last == group_depth + 1
       conditional_stack.pop
-      emit(:conditional, :close, copy(data, ts, te))
+      emit(:conditional, :close, ')')
     else
       if spacing_stack.length > 1 &&
          spacing_stack.last[:depth] == group_depth + 1
@@ -536,7 +540,7 @@
         self.free_spacing = spacing_stack.last[:free_spacing]
       end
 
-      emit(:group, :close, copy(data, ts, te))
+      emit(:group, :close, ')')
     end
   };
 
@@ -717,23 +721,24 @@ class Regexp::Scanner
   #
   # This method may raise errors if a syntax error is encountered.
   # --------------------------------------------------------------------------
-  def self.scan(input_object, options: nil, &block)
-    new.scan(input_object, options: options, &block)
+  def self.scan(input_object, options: nil, collect_tokens: true, &block)
+    new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
   end
 
-  def scan(input_object, options: nil, &block)
-    self.literal = nil
+  def scan(input_object, options: nil, collect_tokens: true, &block)
+    self.collect_tokens = collect_tokens
+    self.literal_run = nil
     stack = []
 
     input = input_object.is_a?(Regexp) ? input_object.source : input_object
     self.free_spacing = free_spacing?(input_object, options)
     self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
 
-    data = input.unpack("c*") if input.is_a?(String)
+    data = input.unpack("c*")
    eof = data.length
 
     self.tokens = []
-    self.block = block_given? ? block : nil
+    self.block = block
 
     self.set_depth = 0
     self.group_depth = 0
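As with the lexer, the scanner now has a pure streaming mode. A usage sketch, assuming a block is given; each token is yielded as plain positional values:

```ruby
require 'regexp_parser'

# With a block and collect_tokens: false, tokens are only passed to the
# block and no token Array is accumulated by the scanner.
Regexp::Scanner.scan('(a|b)+c', collect_tokens: false) do |type, token, text, ts, te|
  puts "#{type}:#{token} #{text.inspect} [#{ts},#{te}]"
end
```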
@@ -758,7 +763,7 @@ class Regexp::Scanner
                      "[#{set_depth}]") if in_set?
 
     # when the entire expression is a literal run
-    emit_literal if literal
+    emit_literal if literal_run
 
     tokens
   end
@@ -785,26 +790,37 @@ class Regexp::Scanner
   def emit(type, token, text)
     #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
 
-    emit_literal if literal
+    emit_literal if literal_run
 
     # Ragel runs with byte-based indices (ts, te). These are of little value to
     # end-users, so we keep track of char-based indices and emit those instead.
     ts_char_pos = char_pos
     te_char_pos = char_pos + text.length
 
-    if block
-      block.call type, token, text, ts_char_pos, te_char_pos
-    end
+    tok = [type, token, text, ts_char_pos, te_char_pos]
 
-    tokens << [type, token, text, ts_char_pos, te_char_pos]
+    self.prev_token = tok
 
     self.char_pos = te_char_pos
+
+    if block
+      block.call type, token, text, ts_char_pos, te_char_pos
+      # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
+      tokens << tok if collect_tokens
+    elsif collect_tokens
+      tokens << tok
+    end
   end
 
+  attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
+
   private
 
-  attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
-                :group_depth, :set_depth, :conditional_stack, :char_pos
+  attr_accessor :block,
+                :collect_tokens, :tokens, :prev_token,
+                :free_spacing, :spacing_stack,
+                :group_depth, :set_depth, :conditional_stack,
+                :char_pos
 
   def free_spacing?(input_object, options)
     if options && !input_object.is_a?(String)
@@ -834,14 +850,13 @@ class Regexp::Scanner
   # Appends one or more characters to the literal buffer, to be emitted later
   # by a call to emit_literal.
   def append_literal(data, ts, te)
-    self.literal = literal || []
-    literal << copy(data, ts, te)
+    (self.literal_run ||= []) << copy(data, ts, te)
   end
 
   # Emits the literal run collected by calls to the append_literal method.
   def emit_literal
-    text = literal.join
-    self.literal = nil
+    text = literal_run.join
+    self.literal_run = nil
     emit(:literal, :literal, text)
   end