regexp_parser 2.6.2 → 2.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -1
- data/lib/regexp_parser/expression/classes/backreference.rb +4 -0
- data/lib/regexp_parser/expression/classes/conditional.rb +8 -0
- data/lib/regexp_parser/expression/methods/traverse.rb +6 -3
- data/lib/regexp_parser/expression/sequence.rb +0 -1
- data/lib/regexp_parser/expression/shared.rb +5 -1
- data/lib/regexp_parser/expression/subexpression.rb +4 -1
- data/lib/regexp_parser/lexer.rb +61 -29
- data/lib/regexp_parser/parser.rb +12 -11
- data/lib/regexp_parser/scanner/property.rl +1 -1
- data/lib/regexp_parser/scanner/scanner.rl +55 -40
- data/lib/regexp_parser/scanner.rb +344 -298
- data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
- data/lib/regexp_parser/syntax/versions.rb +2 -0
- data/lib/regexp_parser/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 04af46818e9d560362fea9b3fd24802b557ac145ed95f6e02580dd7cf5e8ddfc
|
4
|
+
data.tar.gz: 75b7d30241f48ddf90c8cd68228fa928904ab6055ea755f4bdcf28361e645a4b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 407025a9b14af76463260fca2a48f9fef4ab863e3dddf3f7f54101c1348611afa49d9973e850d9e1c84d6e5faf8f1a9d3d2da5dceaefe8dc4fefe7069ecd9280
|
7
|
+
data.tar.gz: 9f3d2eb4264318511a82e9034c4c4a8a8e73e67e427945f0c9f745fd37b2f2f0ae8e30ba942f0920da3109b59436a5518dfc5e2f7669317de0214a0deb6f0e07
|
data/CHANGELOG.md
CHANGED
@@ -5,7 +5,22 @@ All notable changes to this project will be documented in this file.
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
7
7
|
|
8
|
-
## [
|
8
|
+
## [2.7.0] - 2023-02-08 - [Janosch Müller](mailto:janosch84@gmail.com)
|
9
|
+
|
10
|
+
### Added
|
11
|
+
|
12
|
+
- `Regexp::Lexer.lex` now streams tokens when called with a block
|
13
|
+
- it can now take arbitrarily large input, just like `Regexp::Scanner`
|
14
|
+
- this also slightly improves `Regexp::Parser.parse` performance
|
15
|
+
- note: `Regexp::Parser.parse` still does not and will not support streaming
|
16
|
+
- improved performance of `Subexpression#each_expression`
|
17
|
+
- minor improvements to `Regexp::Scanner` performance
|
18
|
+
- overall improvement of parse performance: about 10% for large Regexps
|
19
|
+
|
20
|
+
### Fixed
|
21
|
+
|
22
|
+
- parsing of octal escape sequences in sets, e.g. `[\141]`
|
23
|
+
* thanks to [Randy Stauner](https://github.com/rwstauner) for the report
|
9
24
|
|
10
25
|
## [2.6.2] - 2023-01-19 - [Janosch Müller](mailto:janosch84@gmail.com)
|
11
26
|
|
@@ -20,6 +20,10 @@ module Regexp::Expression
|
|
20
20
|
self.referenced_expression = orig.referenced_expression.dup
|
21
21
|
super
|
22
22
|
end
|
23
|
+
|
24
|
+
def referential?
|
25
|
+
true
|
26
|
+
end
|
23
27
|
end
|
24
28
|
|
25
29
|
class Branch < Regexp::Expression::Sequence; end
|
@@ -55,6 +59,10 @@ module Regexp::Expression
|
|
55
59
|
condition.reference
|
56
60
|
end
|
57
61
|
|
62
|
+
def referential?
|
63
|
+
true
|
64
|
+
end
|
65
|
+
|
58
66
|
def parts
|
59
67
|
[text.dup, condition, *intersperse(branches, '|'), ')']
|
60
68
|
end
|
@@ -36,11 +36,14 @@ module Regexp::Expression
|
|
36
36
|
|
37
37
|
# Iterates over the expressions of this expression as an array, passing
|
38
38
|
# the expression and its index within its parent to the given block.
|
39
|
-
def each_expression(include_self = false)
|
39
|
+
def each_expression(include_self = false, &block)
|
40
40
|
return enum_for(__method__, include_self) unless block_given?
|
41
41
|
|
42
|
-
|
43
|
-
|
42
|
+
block.call(self, 0) if include_self
|
43
|
+
|
44
|
+
each_with_index do |exp, index|
|
45
|
+
block.call(exp, index)
|
46
|
+
exp.each_expression(&block) unless exp.terminal?
|
44
47
|
end
|
45
48
|
end
|
46
49
|
|
@@ -77,7 +77,11 @@ module Regexp::Expression
|
|
77
77
|
end
|
78
78
|
|
79
79
|
def terminal?
|
80
|
-
|
80
|
+
true # overridden to be false in Expression::Subexpression
|
81
|
+
end
|
82
|
+
|
83
|
+
def referential?
|
84
|
+
false # overridden to be true e.g. in Expression::Backreference::Base
|
81
85
|
end
|
82
86
|
|
83
87
|
def nesting_level=(lvl)
|
@@ -19,7 +19,6 @@ module Regexp::Expression
|
|
19
19
|
if exp.is_a?(WhiteSpace) && last && last.is_a?(WhiteSpace)
|
20
20
|
last.merge(exp)
|
21
21
|
else
|
22
|
-
exp.nesting_level = nesting_level + 1
|
23
22
|
expressions << exp
|
24
23
|
end
|
25
24
|
end
|
@@ -53,6 +52,10 @@ module Regexp::Expression
|
|
53
52
|
)
|
54
53
|
end
|
55
54
|
|
55
|
+
def terminal?
|
56
|
+
false
|
57
|
+
end
|
58
|
+
|
56
59
|
private
|
57
60
|
|
58
61
|
def intersperse(expressions, separator)
|
data/lib/regexp_parser/lexer.rb
CHANGED
@@ -13,50 +13,68 @@ class Regexp::Lexer
|
|
13
13
|
|
14
14
|
CONDITION_TOKENS = %i[condition condition_close].freeze
|
15
15
|
|
16
|
-
def self.lex(input, syntax =
|
17
|
-
new.lex(input, syntax, options: options, &block)
|
16
|
+
def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
|
17
|
+
new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
|
18
18
|
end
|
19
19
|
|
20
|
-
def lex(input, syntax =
|
21
|
-
syntax = Regexp::Syntax.for(syntax)
|
20
|
+
def lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
|
21
|
+
syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT
|
22
22
|
|
23
|
+
self.block = block
|
24
|
+
self.collect_tokens = collect_tokens
|
23
25
|
self.tokens = []
|
26
|
+
self.prev_token = nil
|
27
|
+
self.preprev_token = nil
|
24
28
|
self.nesting = 0
|
25
29
|
self.set_nesting = 0
|
26
30
|
self.conditional_nesting = 0
|
27
31
|
self.shift = 0
|
28
32
|
|
29
|
-
|
30
|
-
Regexp::Scanner.scan(input, options: options) do |type, token, text, ts, te|
|
33
|
+
Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te|
|
31
34
|
type, token = *syntax.normalize(type, token)
|
32
35
|
syntax.check! type, token
|
33
36
|
|
34
37
|
ascend(type, token)
|
35
38
|
|
36
|
-
if
|
37
|
-
|
38
|
-
|
39
|
+
if (last = prev_token) &&
|
40
|
+
type == :quantifier &&
|
41
|
+
(
|
42
|
+
(last.type == :literal && (parts = break_literal(last))) ||
|
43
|
+
(last.token == :codepoint_list && (parts = break_codepoint_list(last)))
|
44
|
+
)
|
45
|
+
emit(parts[0])
|
46
|
+
last = parts[1]
|
39
47
|
end
|
40
48
|
|
41
49
|
current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
|
42
50
|
nesting, set_nesting, conditional_nesting)
|
43
51
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
52
|
+
if type == :conditional && CONDITION_TOKENS.include?(token)
|
53
|
+
current = merge_condition(current, last)
|
54
|
+
elsif last
|
55
|
+
last.next = current
|
56
|
+
current.previous = last
|
57
|
+
emit(last)
|
58
|
+
end
|
49
59
|
|
50
|
-
|
51
|
-
|
60
|
+
self.preprev_token = last
|
61
|
+
self.prev_token = current
|
52
62
|
|
53
63
|
descend(type, token)
|
54
64
|
end
|
55
65
|
|
56
|
-
if
|
57
|
-
|
66
|
+
emit(prev_token) if prev_token
|
67
|
+
|
68
|
+
collect_tokens ? tokens : nil
|
69
|
+
end
|
70
|
+
|
71
|
+
def emit(token)
|
72
|
+
if block
|
73
|
+
# TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
|
74
|
+
res = block.call(token)
|
75
|
+
tokens << res if collect_tokens
|
58
76
|
else
|
59
|
-
tokens
|
77
|
+
tokens << token
|
60
78
|
end
|
61
79
|
end
|
62
80
|
|
@@ -66,7 +84,9 @@ class Regexp::Lexer
|
|
66
84
|
|
67
85
|
private
|
68
86
|
|
69
|
-
attr_accessor :
|
87
|
+
attr_accessor :block,
|
88
|
+
:collect_tokens, :tokens, :prev_token, :preprev_token,
|
89
|
+
:nesting, :set_nesting, :conditional_nesting, :shift
|
70
90
|
|
71
91
|
def ascend(type, token)
|
72
92
|
case type
|
@@ -96,34 +116,46 @@ class Regexp::Lexer
|
|
96
116
|
lead, last, _ = token.text.partition(/.\z/mu)
|
97
117
|
return if lead.empty?
|
98
118
|
|
99
|
-
|
100
|
-
tokens << Regexp::Token.new(:literal, :literal, lead,
|
119
|
+
token_1 = Regexp::Token.new(:literal, :literal, lead,
|
101
120
|
token.ts, (token.te - last.length),
|
102
121
|
nesting, set_nesting, conditional_nesting)
|
103
|
-
|
122
|
+
token_2 = Regexp::Token.new(:literal, :literal, last,
|
104
123
|
(token.ts + lead.length), token.te,
|
105
124
|
nesting, set_nesting, conditional_nesting)
|
125
|
+
|
126
|
+
token_1.previous = preprev_token
|
127
|
+
token_1.next = token_2
|
128
|
+
token_2.previous = token_1 # .next will be set by #lex
|
129
|
+
[token_1, token_2]
|
106
130
|
end
|
107
131
|
|
132
|
+
# if a codepoint list is followed by a quantifier, that quantifier applies
|
133
|
+
# to the last codepoint, e.g. /\u{61 62 63}{3}/ =~ 'abccc'
|
134
|
+
# c.f. #break_literal.
|
108
135
|
def break_codepoint_list(token)
|
109
136
|
lead, _, tail = token.text.rpartition(' ')
|
110
137
|
return if lead.empty?
|
111
138
|
|
112
|
-
|
113
|
-
tokens << Regexp::Token.new(:escape, :codepoint_list, lead + '}',
|
139
|
+
token_1 = Regexp::Token.new(:escape, :codepoint_list, lead + '}',
|
114
140
|
token.ts, (token.te - tail.length),
|
115
141
|
nesting, set_nesting, conditional_nesting)
|
116
|
-
|
142
|
+
token_2 = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
|
117
143
|
(token.ts + lead.length + 1), (token.te + 3),
|
118
144
|
nesting, set_nesting, conditional_nesting)
|
119
145
|
|
120
146
|
self.shift = shift + 3 # one space less, but extra \, u, {, and }
|
147
|
+
|
148
|
+
token_1.previous = preprev_token
|
149
|
+
token_1.next = token_2
|
150
|
+
token_2.previous = token_1 # .next will be set by #lex
|
151
|
+
[token_1, token_2]
|
121
152
|
end
|
122
153
|
|
123
|
-
def merge_condition(current)
|
124
|
-
|
125
|
-
Regexp::Token.new(:conditional, :condition, last.text + current.text,
|
154
|
+
def merge_condition(current, last)
|
155
|
+
token = Regexp::Token.new(:conditional, :condition, last.text + current.text,
|
126
156
|
last.ts, current.te, nesting, set_nesting, conditional_nesting)
|
157
|
+
token.previous = preprev_token # .next will be set by #lex
|
158
|
+
token
|
127
159
|
end
|
128
160
|
|
129
161
|
end # module Regexp::Lexer
|
data/lib/regexp_parser/parser.rb
CHANGED
@@ -18,11 +18,11 @@ class Regexp::Parser
|
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
21
|
-
def self.parse(input, syntax =
|
21
|
+
def self.parse(input, syntax = nil, options: nil, &block)
|
22
22
|
new.parse(input, syntax, options: options, &block)
|
23
23
|
end
|
24
24
|
|
25
|
-
def parse(input, syntax =
|
25
|
+
def parse(input, syntax = nil, options: nil, &block)
|
26
26
|
root = Root.construct(options: extract_options(input, options))
|
27
27
|
|
28
28
|
self.root = root
|
@@ -35,7 +35,7 @@ class Regexp::Parser
|
|
35
35
|
|
36
36
|
self.captured_group_counts = Hash.new(0)
|
37
37
|
|
38
|
-
Regexp::Lexer.scan(input, syntax, options: options) do |token|
|
38
|
+
Regexp::Lexer.scan(input, syntax, options: options, collect_tokens: false) do |token|
|
39
39
|
parse_token(token)
|
40
40
|
end
|
41
41
|
|
@@ -379,7 +379,7 @@ class Regexp::Parser
|
|
379
379
|
end
|
380
380
|
|
381
381
|
def sequence_operation(klass, token)
|
382
|
-
unless node.
|
382
|
+
unless node.instance_of?(klass)
|
383
383
|
operator = klass.new(token, active_opts)
|
384
384
|
sequence = operator.add_sequence(active_opts)
|
385
385
|
sequence.expressions = node.expressions
|
@@ -541,7 +541,7 @@ class Regexp::Parser
|
|
541
541
|
|
542
542
|
def range(token)
|
543
543
|
exp = CharacterSet::Range.new(token, active_opts)
|
544
|
-
scope = node.last.
|
544
|
+
scope = node.last.instance_of?(CharacterSet::IntersectedSequence) ? node.last : node
|
545
545
|
exp << scope.expressions.pop
|
546
546
|
nest(exp)
|
547
547
|
end
|
@@ -568,7 +568,7 @@ class Regexp::Parser
|
|
568
568
|
end
|
569
569
|
|
570
570
|
def close_completed_character_set_range
|
571
|
-
decrease_nesting if node.
|
571
|
+
decrease_nesting if node.instance_of?(CharacterSet::Range) && node.complete?
|
572
572
|
end
|
573
573
|
|
574
574
|
def active_opts
|
@@ -579,15 +579,16 @@ class Regexp::Parser
|
|
579
579
|
# an instance of Backreference::Number, its #referenced_expression is set to
|
580
580
|
# the instance of Group::Capture that it refers to via its number.
|
581
581
|
def assign_referenced_expressions
|
582
|
-
# find all referencable expressions
|
582
|
+
# find all referenceable and referring expressions
|
583
583
|
targets = { 0 => root }
|
584
|
+
referrers = []
|
584
585
|
root.each_expression do |exp|
|
585
586
|
exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
|
587
|
+
referrers << exp if exp.referential?
|
586
588
|
end
|
587
|
-
# assign
|
588
|
-
|
589
|
-
|
590
|
-
|
589
|
+
# assign reference expression to referring expressions
|
590
|
+
# (in a second iteration because there might be forward references)
|
591
|
+
referrers.each do |exp|
|
591
592
|
exp.referenced_expression = targets[exp.reference] ||
|
592
593
|
raise(ParserError, "Invalid reference: #{exp.reference}")
|
593
594
|
end
|
@@ -17,7 +17,7 @@
|
|
17
17
|
text = copy(data, ts-1, te)
|
18
18
|
type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
|
19
19
|
|
20
|
-
name =
|
20
|
+
name = text[3..-2].gsub(/[\^\s_\-]/, '').downcase
|
21
21
|
|
22
22
|
token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
|
23
23
|
validation_error(:property, name) unless token
|
@@ -59,9 +59,6 @@
|
|
59
59
|
one_or_more = '+' | '+?' | '++';
|
60
60
|
|
61
61
|
quantifier_greedy = '?' | '*' | '+';
|
62
|
-
quantifier_reluctant = '??' | '*?' | '+?';
|
63
|
-
quantifier_possessive = '?+' | '*+' | '++';
|
64
|
-
quantifier_mode = '?' | '+';
|
65
62
|
|
66
63
|
quantity_exact = (digit+);
|
67
64
|
quantity_minimum = (digit+) . ',';
|
@@ -70,9 +67,6 @@
|
|
70
67
|
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
|
71
68
|
quantity_maximum | quantity_range ) . range_close;
|
72
69
|
|
73
|
-
quantifiers = quantifier_greedy | quantifier_reluctant |
|
74
|
-
quantifier_possessive | quantifier_interval;
|
75
|
-
|
76
70
|
conditional = '(?(';
|
77
71
|
|
78
72
|
group_comment = '?#' . [^)]* . group_close;
|
@@ -132,7 +126,8 @@
|
|
132
126
|
keep_mark | sequence_char;
|
133
127
|
|
134
128
|
# escapes that also work within a character set
|
135
|
-
set_escape = backslash | brackets | escaped_ascii |
|
129
|
+
set_escape = backslash | brackets | escaped_ascii |
|
130
|
+
octal_sequence | property_char |
|
136
131
|
sequence_char | single_codepoint_char_type;
|
137
132
|
|
138
133
|
|
@@ -168,8 +163,8 @@
|
|
168
163
|
};
|
169
164
|
|
170
165
|
'-]' @set_closed { # special case, emits two tokens
|
171
|
-
emit(:literal, :literal,
|
172
|
-
emit(:set, :close,
|
166
|
+
emit(:literal, :literal, '-')
|
167
|
+
emit(:set, :close, ']')
|
173
168
|
if in_set?
|
174
169
|
fret;
|
175
170
|
else
|
@@ -183,28 +178,27 @@
|
|
183
178
|
};
|
184
179
|
|
185
180
|
'^' {
|
186
|
-
|
187
|
-
|
188
|
-
emit(:set, :negate, text)
|
181
|
+
if prev_token[1] == :open
|
182
|
+
emit(:set, :negate, '^')
|
189
183
|
else
|
190
|
-
emit(:literal, :literal,
|
184
|
+
emit(:literal, :literal, '^')
|
191
185
|
end
|
192
186
|
};
|
193
187
|
|
194
188
|
'-' {
|
195
|
-
|
196
|
-
#
|
197
|
-
if
|
198
|
-
emit(:literal, :literal,
|
189
|
+
# ranges can't start with the opening bracket, a subset, or
|
190
|
+
# intersection/negation/range operators
|
191
|
+
if prev_token[0] == :set
|
192
|
+
emit(:literal, :literal, '-')
|
199
193
|
else
|
200
|
-
emit(:set, :range,
|
194
|
+
emit(:set, :range, '-')
|
201
195
|
end
|
202
196
|
};
|
203
197
|
|
204
198
|
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
205
199
|
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
206
200
|
'&&' {
|
207
|
-
emit(:set, :intersection,
|
201
|
+
emit(:set, :intersection, '&&')
|
208
202
|
};
|
209
203
|
|
210
204
|
backslash {
|
@@ -212,7 +206,7 @@
|
|
212
206
|
};
|
213
207
|
|
214
208
|
set_open >(open_bracket, 1) >set_opened {
|
215
|
-
emit(:set, :open,
|
209
|
+
emit(:set, :open, '[')
|
216
210
|
fcall character_set;
|
217
211
|
};
|
218
212
|
|
@@ -254,12 +248,22 @@
|
|
254
248
|
# set escapes scanner
|
255
249
|
# --------------------------------------------------------------------------
|
256
250
|
set_escape_sequence := |*
|
251
|
+
# Special case: in sets, octal sequences have higher priority than backrefs
|
252
|
+
octal_sequence {
|
253
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
254
|
+
fret;
|
255
|
+
};
|
256
|
+
|
257
|
+
# Scan all other escapes that work in sets with the generic escape scanner
|
257
258
|
set_escape > (escaped_set_alpha, 2) {
|
258
259
|
fhold;
|
259
260
|
fnext character_set;
|
260
261
|
fcall escape_sequence;
|
261
262
|
};
|
262
263
|
|
264
|
+
# Treat all remaining escapes - those not supported in sets - as literal.
|
265
|
+
# (This currently includes \^, \-, \&, \:, although these could potentially
|
266
|
+
# be meta chars when not escaped, depending on their position in the set.)
|
263
267
|
any > (escaped_set_alpha, 1) {
|
264
268
|
emit(:escape, :literal, copy(data, ts-1, te))
|
265
269
|
fret;
|
@@ -528,7 +532,7 @@
|
|
528
532
|
group_close @group_closed {
|
529
533
|
if conditional_stack.last == group_depth + 1
|
530
534
|
conditional_stack.pop
|
531
|
-
emit(:conditional, :close,
|
535
|
+
emit(:conditional, :close, ')')
|
532
536
|
else
|
533
537
|
if spacing_stack.length > 1 &&
|
534
538
|
spacing_stack.last[:depth] == group_depth + 1
|
@@ -536,7 +540,7 @@
|
|
536
540
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
537
541
|
end
|
538
542
|
|
539
|
-
emit(:group, :close,
|
543
|
+
emit(:group, :close, ')')
|
540
544
|
end
|
541
545
|
};
|
542
546
|
|
@@ -717,23 +721,24 @@ class Regexp::Scanner
|
|
717
721
|
#
|
718
722
|
# This method may raise errors if a syntax error is encountered.
|
719
723
|
# --------------------------------------------------------------------------
|
720
|
-
def self.scan(input_object, options: nil, &block)
|
721
|
-
new.scan(input_object, options: options, &block)
|
724
|
+
def self.scan(input_object, options: nil, collect_tokens: true, &block)
|
725
|
+
new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
|
722
726
|
end
|
723
727
|
|
724
|
-
def scan(input_object, options: nil, &block)
|
725
|
-
self.
|
728
|
+
def scan(input_object, options: nil, collect_tokens: true, &block)
|
729
|
+
self.collect_tokens = collect_tokens
|
730
|
+
self.literal_run = nil
|
726
731
|
stack = []
|
727
732
|
|
728
733
|
input = input_object.is_a?(Regexp) ? input_object.source : input_object
|
729
734
|
self.free_spacing = free_spacing?(input_object, options)
|
730
735
|
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
731
736
|
|
732
|
-
data = input.unpack("c*")
|
737
|
+
data = input.unpack("c*")
|
733
738
|
eof = data.length
|
734
739
|
|
735
740
|
self.tokens = []
|
736
|
-
self.block =
|
741
|
+
self.block = block
|
737
742
|
|
738
743
|
self.set_depth = 0
|
739
744
|
self.group_depth = 0
|
@@ -758,7 +763,7 @@ class Regexp::Scanner
|
|
758
763
|
"[#{set_depth}]") if in_set?
|
759
764
|
|
760
765
|
# when the entire expression is a literal run
|
761
|
-
emit_literal if
|
766
|
+
emit_literal if literal_run
|
762
767
|
|
763
768
|
tokens
|
764
769
|
end
|
@@ -785,26 +790,37 @@ class Regexp::Scanner
|
|
785
790
|
def emit(type, token, text)
|
786
791
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
787
792
|
|
788
|
-
emit_literal if
|
793
|
+
emit_literal if literal_run
|
789
794
|
|
790
795
|
# Ragel runs with byte-based indices (ts, te). These are of little value to
|
791
796
|
# end-users, so we keep track of char-based indices and emit those instead.
|
792
797
|
ts_char_pos = char_pos
|
793
798
|
te_char_pos = char_pos + text.length
|
794
799
|
|
795
|
-
|
796
|
-
block.call type, token, text, ts_char_pos, te_char_pos
|
797
|
-
end
|
800
|
+
tok = [type, token, text, ts_char_pos, te_char_pos]
|
798
801
|
|
799
|
-
|
802
|
+
self.prev_token = tok
|
800
803
|
|
801
804
|
self.char_pos = te_char_pos
|
805
|
+
|
806
|
+
if block
|
807
|
+
block.call type, token, text, ts_char_pos, te_char_pos
|
808
|
+
# TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
|
809
|
+
tokens << tok if collect_tokens
|
810
|
+
elsif collect_tokens
|
811
|
+
tokens << tok
|
812
|
+
end
|
802
813
|
end
|
803
814
|
|
815
|
+
attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
|
816
|
+
|
804
817
|
private
|
805
818
|
|
806
|
-
attr_accessor :
|
807
|
-
:
|
819
|
+
attr_accessor :block,
|
820
|
+
:collect_tokens, :tokens, :prev_token,
|
821
|
+
:free_spacing, :spacing_stack,
|
822
|
+
:group_depth, :set_depth, :conditional_stack,
|
823
|
+
:char_pos
|
808
824
|
|
809
825
|
def free_spacing?(input_object, options)
|
810
826
|
if options && !input_object.is_a?(String)
|
@@ -834,14 +850,13 @@ class Regexp::Scanner
|
|
834
850
|
# Appends one or more characters to the literal buffer, to be emitted later
|
835
851
|
# by a call to emit_literal.
|
836
852
|
def append_literal(data, ts, te)
|
837
|
-
self.
|
838
|
-
literal << copy(data, ts, te)
|
853
|
+
(self.literal_run ||= []) << copy(data, ts, te)
|
839
854
|
end
|
840
855
|
|
841
856
|
# Emits the literal run collected by calls to the append_literal method.
|
842
857
|
def emit_literal
|
843
|
-
text =
|
844
|
-
self.
|
858
|
+
text = literal_run.join
|
859
|
+
self.literal_run = nil
|
845
860
|
emit(:literal, :literal, text)
|
846
861
|
end
|
847
862
|
|