regexp_parser 2.6.1 → 2.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +23 -1
- data/lib/regexp_parser/expression/classes/backreference.rb +17 -1
- data/lib/regexp_parser/expression/classes/conditional.rb +8 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +8 -4
- data/lib/regexp_parser/expression/methods/traverse.rb +6 -3
- data/lib/regexp_parser/expression/sequence.rb +0 -1
- data/lib/regexp_parser/expression/shared.rb +5 -1
- data/lib/regexp_parser/expression/subexpression.rb +4 -1
- data/lib/regexp_parser/lexer.rb +61 -29
- data/lib/regexp_parser/parser.rb +12 -11
- data/lib/regexp_parser/scanner/property.rl +1 -1
- data/lib/regexp_parser/scanner/scanner.rl +55 -40
- data/lib/regexp_parser/scanner.rb +344 -298
- data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
- data/lib/regexp_parser/syntax/versions.rb +2 -0
- data/lib/regexp_parser/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 04af46818e9d560362fea9b3fd24802b557ac145ed95f6e02580dd7cf5e8ddfc
|
4
|
+
data.tar.gz: 75b7d30241f48ddf90c8cd68228fa928904ab6055ea755f4bdcf28361e645a4b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 407025a9b14af76463260fca2a48f9fef4ab863e3dddf3f7f54101c1348611afa49d9973e850d9e1c84d6e5faf8f1a9d3d2da5dceaefe8dc4fefe7069ecd9280
|
7
|
+
data.tar.gz: 9f3d2eb4264318511a82e9034c4c4a8a8e73e67e427945f0c9f745fd37b2f2f0ae8e30ba942f0920da3109b59436a5518dfc5e2f7669317de0214a0deb6f0e07
|
data/CHANGELOG.md
CHANGED
@@ -5,7 +5,29 @@ All notable changes to this project will be documented in this file.
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
7
7
|
|
8
|
-
## [
|
8
|
+
## [2.7.0] - 2023-02-08 - [Janosch Müller](mailto:janosch84@gmail.com)
|
9
|
+
|
10
|
+
### Added
|
11
|
+
|
12
|
+
- `Regexp::Lexer.lex` now streams tokens when called with a block
|
13
|
+
- it can now take arbitrarily large input, just like `Regexp::Scanner`
|
14
|
+
- this also slightly improves `Regexp::Parser.parse` performance
|
15
|
+
- note: `Regexp::Parser.parse` still does not and will not support streaming
|
16
|
+
- improved performance of `Subexpression#each_expression`
|
17
|
+
- minor improvements to `Regexp::Scanner` performance
|
18
|
+
- overall improvement of parse performance: about 10% for large Regexps
|
19
|
+
|
20
|
+
### Fixed
|
21
|
+
|
22
|
+
- parsing of octal escape sequences in sets, e.g. `[\141]`
|
23
|
+
* thanks to [Randy Stauner](https://github.com/rwstauner) for the report
|
24
|
+
|
25
|
+
## [2.6.2] - 2023-01-19 - [Janosch Müller](mailto:janosch84@gmail.com)
|
26
|
+
|
27
|
+
### Fixed
|
28
|
+
|
29
|
+
- fixed `SystemStackError` when cloning recursive subexpression calls
|
30
|
+
* e.g. `Regexp::Parser.parse(/a|b\g<0>/).dup`
|
9
31
|
|
10
32
|
## [2.6.1] - 2022-11-16 - [Janosch Müller](mailto:janosch84@gmail.com)
|
11
33
|
|
@@ -5,9 +5,25 @@ module Regexp::Expression
|
|
5
5
|
attr_accessor :referenced_expression
|
6
6
|
|
7
7
|
def initialize_copy(orig)
|
8
|
-
|
8
|
+
exp_id = [self.class, self.starts_at]
|
9
|
+
|
10
|
+
# prevent infinite recursion for recursive subexp calls
|
11
|
+
copied = @@copied ||= {}
|
12
|
+
self.referenced_expression =
|
13
|
+
if copied[exp_id]
|
14
|
+
orig.referenced_expression
|
15
|
+
else
|
16
|
+
copied[exp_id] = true
|
17
|
+
orig.referenced_expression.dup
|
18
|
+
end
|
19
|
+
copied.clear
|
20
|
+
|
9
21
|
super
|
10
22
|
end
|
23
|
+
|
24
|
+
def referential?
|
25
|
+
true
|
26
|
+
end
|
11
27
|
end
|
12
28
|
|
13
29
|
class Number < Backreference::Base
|
@@ -20,6 +20,10 @@ module Regexp::Expression
|
|
20
20
|
self.referenced_expression = orig.referenced_expression.dup
|
21
21
|
super
|
22
22
|
end
|
23
|
+
|
24
|
+
def referential?
|
25
|
+
true
|
26
|
+
end
|
23
27
|
end
|
24
28
|
|
25
29
|
class Branch < Regexp::Expression::Sequence; end
|
@@ -55,6 +59,10 @@ module Regexp::Expression
|
|
55
59
|
condition.reference
|
56
60
|
end
|
57
61
|
|
62
|
+
def referential?
|
63
|
+
true
|
64
|
+
end
|
65
|
+
|
58
66
|
def parts
|
59
67
|
[text.dup, condition, *intersperse(branches, '|'), ')']
|
60
68
|
end
|
@@ -63,16 +63,20 @@ class Regexp::MatchLength
|
|
63
63
|
end
|
64
64
|
|
65
65
|
def to_re
|
66
|
-
|
66
|
+
/(?:#{reify.call}){#{min_rep},#{max_rep unless max_rep == Float::INFINITY}}/
|
67
67
|
end
|
68
68
|
|
69
69
|
private
|
70
70
|
|
71
71
|
attr_accessor :base_min, :base_max, :min_rep, :max_rep, :exp_class, :reify
|
72
72
|
|
73
|
-
|
74
|
-
|
75
|
-
|
73
|
+
if Regexp.method_defined?(:match?) # ruby >= 2.4
|
74
|
+
def test_regexp
|
75
|
+
@test_regexp ||= /^#{to_re}$/
|
76
|
+
end
|
77
|
+
else
|
78
|
+
def test_regexp
|
79
|
+
@test_regexp ||= /^#{to_re}$/.tap { |r| def r.match?(s); !!match(s) end }
|
76
80
|
end
|
77
81
|
end
|
78
82
|
end
|
@@ -36,11 +36,14 @@ module Regexp::Expression
|
|
36
36
|
|
37
37
|
# Iterates over the expressions of this expression as an array, passing
|
38
38
|
# the expression and its index within its parent to the given block.
|
39
|
-
def each_expression(include_self = false)
|
39
|
+
def each_expression(include_self = false, &block)
|
40
40
|
return enum_for(__method__, include_self) unless block_given?
|
41
41
|
|
42
|
-
|
43
|
-
|
42
|
+
block.call(self, 0) if include_self
|
43
|
+
|
44
|
+
each_with_index do |exp, index|
|
45
|
+
block.call(exp, index)
|
46
|
+
exp.each_expression(&block) unless exp.terminal?
|
44
47
|
end
|
45
48
|
end
|
46
49
|
|
@@ -77,7 +77,11 @@ module Regexp::Expression
|
|
77
77
|
end
|
78
78
|
|
79
79
|
def terminal?
|
80
|
-
|
80
|
+
true # overridden to be false in Expression::Subexpression
|
81
|
+
end
|
82
|
+
|
83
|
+
def referential?
|
84
|
+
false # overridden to be true e.g. in Expression::Backreference::Base
|
81
85
|
end
|
82
86
|
|
83
87
|
def nesting_level=(lvl)
|
@@ -19,7 +19,6 @@ module Regexp::Expression
|
|
19
19
|
if exp.is_a?(WhiteSpace) && last && last.is_a?(WhiteSpace)
|
20
20
|
last.merge(exp)
|
21
21
|
else
|
22
|
-
exp.nesting_level = nesting_level + 1
|
23
22
|
expressions << exp
|
24
23
|
end
|
25
24
|
end
|
@@ -53,6 +52,10 @@ module Regexp::Expression
|
|
53
52
|
)
|
54
53
|
end
|
55
54
|
|
55
|
+
def terminal?
|
56
|
+
false
|
57
|
+
end
|
58
|
+
|
56
59
|
private
|
57
60
|
|
58
61
|
def intersperse(expressions, separator)
|
data/lib/regexp_parser/lexer.rb
CHANGED
@@ -13,50 +13,68 @@ class Regexp::Lexer
|
|
13
13
|
|
14
14
|
CONDITION_TOKENS = %i[condition condition_close].freeze
|
15
15
|
|
16
|
-
def self.lex(input, syntax =
|
17
|
-
new.lex(input, syntax, options: options, &block)
|
16
|
+
def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
|
17
|
+
new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
|
18
18
|
end
|
19
19
|
|
20
|
-
def lex(input, syntax =
|
21
|
-
syntax = Regexp::Syntax.for(syntax)
|
20
|
+
def lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
|
21
|
+
syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT
|
22
22
|
|
23
|
+
self.block = block
|
24
|
+
self.collect_tokens = collect_tokens
|
23
25
|
self.tokens = []
|
26
|
+
self.prev_token = nil
|
27
|
+
self.preprev_token = nil
|
24
28
|
self.nesting = 0
|
25
29
|
self.set_nesting = 0
|
26
30
|
self.conditional_nesting = 0
|
27
31
|
self.shift = 0
|
28
32
|
|
29
|
-
|
30
|
-
Regexp::Scanner.scan(input, options: options) do |type, token, text, ts, te|
|
33
|
+
Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te|
|
31
34
|
type, token = *syntax.normalize(type, token)
|
32
35
|
syntax.check! type, token
|
33
36
|
|
34
37
|
ascend(type, token)
|
35
38
|
|
36
|
-
if
|
37
|
-
|
38
|
-
|
39
|
+
if (last = prev_token) &&
|
40
|
+
type == :quantifier &&
|
41
|
+
(
|
42
|
+
(last.type == :literal && (parts = break_literal(last))) ||
|
43
|
+
(last.token == :codepoint_list && (parts = break_codepoint_list(last)))
|
44
|
+
)
|
45
|
+
emit(parts[0])
|
46
|
+
last = parts[1]
|
39
47
|
end
|
40
48
|
|
41
49
|
current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
|
42
50
|
nesting, set_nesting, conditional_nesting)
|
43
51
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
52
|
+
if type == :conditional && CONDITION_TOKENS.include?(token)
|
53
|
+
current = merge_condition(current, last)
|
54
|
+
elsif last
|
55
|
+
last.next = current
|
56
|
+
current.previous = last
|
57
|
+
emit(last)
|
58
|
+
end
|
49
59
|
|
50
|
-
|
51
|
-
|
60
|
+
self.preprev_token = last
|
61
|
+
self.prev_token = current
|
52
62
|
|
53
63
|
descend(type, token)
|
54
64
|
end
|
55
65
|
|
56
|
-
if
|
57
|
-
|
66
|
+
emit(prev_token) if prev_token
|
67
|
+
|
68
|
+
collect_tokens ? tokens : nil
|
69
|
+
end
|
70
|
+
|
71
|
+
def emit(token)
|
72
|
+
if block
|
73
|
+
# TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
|
74
|
+
res = block.call(token)
|
75
|
+
tokens << res if collect_tokens
|
58
76
|
else
|
59
|
-
tokens
|
77
|
+
tokens << token
|
60
78
|
end
|
61
79
|
end
|
62
80
|
|
@@ -66,7 +84,9 @@ class Regexp::Lexer
|
|
66
84
|
|
67
85
|
private
|
68
86
|
|
69
|
-
attr_accessor :
|
87
|
+
attr_accessor :block,
|
88
|
+
:collect_tokens, :tokens, :prev_token, :preprev_token,
|
89
|
+
:nesting, :set_nesting, :conditional_nesting, :shift
|
70
90
|
|
71
91
|
def ascend(type, token)
|
72
92
|
case type
|
@@ -96,34 +116,46 @@ class Regexp::Lexer
|
|
96
116
|
lead, last, _ = token.text.partition(/.\z/mu)
|
97
117
|
return if lead.empty?
|
98
118
|
|
99
|
-
|
100
|
-
tokens << Regexp::Token.new(:literal, :literal, lead,
|
119
|
+
token_1 = Regexp::Token.new(:literal, :literal, lead,
|
101
120
|
token.ts, (token.te - last.length),
|
102
121
|
nesting, set_nesting, conditional_nesting)
|
103
|
-
|
122
|
+
token_2 = Regexp::Token.new(:literal, :literal, last,
|
104
123
|
(token.ts + lead.length), token.te,
|
105
124
|
nesting, set_nesting, conditional_nesting)
|
125
|
+
|
126
|
+
token_1.previous = preprev_token
|
127
|
+
token_1.next = token_2
|
128
|
+
token_2.previous = token_1 # .next will be set by #lex
|
129
|
+
[token_1, token_2]
|
106
130
|
end
|
107
131
|
|
132
|
+
# if a codepoint list is followed by a quantifier, that quantifier applies
|
133
|
+
# to the last codepoint, e.g. /\u{61 62 63}{3}/ =~ 'abccc'
|
134
|
+
# cf. #break_literal.
|
108
135
|
def break_codepoint_list(token)
|
109
136
|
lead, _, tail = token.text.rpartition(' ')
|
110
137
|
return if lead.empty?
|
111
138
|
|
112
|
-
|
113
|
-
tokens << Regexp::Token.new(:escape, :codepoint_list, lead + '}',
|
139
|
+
token_1 = Regexp::Token.new(:escape, :codepoint_list, lead + '}',
|
114
140
|
token.ts, (token.te - tail.length),
|
115
141
|
nesting, set_nesting, conditional_nesting)
|
116
|
-
|
142
|
+
token_2 = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
|
117
143
|
(token.ts + lead.length + 1), (token.te + 3),
|
118
144
|
nesting, set_nesting, conditional_nesting)
|
119
145
|
|
120
146
|
self.shift = shift + 3 # one space less, but extra \, u, {, and }
|
147
|
+
|
148
|
+
token_1.previous = preprev_token
|
149
|
+
token_1.next = token_2
|
150
|
+
token_2.previous = token_1 # .next will be set by #lex
|
151
|
+
[token_1, token_2]
|
121
152
|
end
|
122
153
|
|
123
|
-
def merge_condition(current)
|
124
|
-
|
125
|
-
Regexp::Token.new(:conditional, :condition, last.text + current.text,
|
154
|
+
def merge_condition(current, last)
|
155
|
+
token = Regexp::Token.new(:conditional, :condition, last.text + current.text,
|
126
156
|
last.ts, current.te, nesting, set_nesting, conditional_nesting)
|
157
|
+
token.previous = preprev_token # .next will be set by #lex
|
158
|
+
token
|
127
159
|
end
|
128
160
|
|
129
161
|
end # module Regexp::Lexer
|
data/lib/regexp_parser/parser.rb
CHANGED
@@ -18,11 +18,11 @@ class Regexp::Parser
|
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
21
|
-
def self.parse(input, syntax =
|
21
|
+
def self.parse(input, syntax = nil, options: nil, &block)
|
22
22
|
new.parse(input, syntax, options: options, &block)
|
23
23
|
end
|
24
24
|
|
25
|
-
def parse(input, syntax =
|
25
|
+
def parse(input, syntax = nil, options: nil, &block)
|
26
26
|
root = Root.construct(options: extract_options(input, options))
|
27
27
|
|
28
28
|
self.root = root
|
@@ -35,7 +35,7 @@ class Regexp::Parser
|
|
35
35
|
|
36
36
|
self.captured_group_counts = Hash.new(0)
|
37
37
|
|
38
|
-
Regexp::Lexer.scan(input, syntax, options: options) do |token|
|
38
|
+
Regexp::Lexer.scan(input, syntax, options: options, collect_tokens: false) do |token|
|
39
39
|
parse_token(token)
|
40
40
|
end
|
41
41
|
|
@@ -379,7 +379,7 @@ class Regexp::Parser
|
|
379
379
|
end
|
380
380
|
|
381
381
|
def sequence_operation(klass, token)
|
382
|
-
unless node.
|
382
|
+
unless node.instance_of?(klass)
|
383
383
|
operator = klass.new(token, active_opts)
|
384
384
|
sequence = operator.add_sequence(active_opts)
|
385
385
|
sequence.expressions = node.expressions
|
@@ -541,7 +541,7 @@ class Regexp::Parser
|
|
541
541
|
|
542
542
|
def range(token)
|
543
543
|
exp = CharacterSet::Range.new(token, active_opts)
|
544
|
-
scope = node.last.
|
544
|
+
scope = node.last.instance_of?(CharacterSet::IntersectedSequence) ? node.last : node
|
545
545
|
exp << scope.expressions.pop
|
546
546
|
nest(exp)
|
547
547
|
end
|
@@ -568,7 +568,7 @@ class Regexp::Parser
|
|
568
568
|
end
|
569
569
|
|
570
570
|
def close_completed_character_set_range
|
571
|
-
decrease_nesting if node.
|
571
|
+
decrease_nesting if node.instance_of?(CharacterSet::Range) && node.complete?
|
572
572
|
end
|
573
573
|
|
574
574
|
def active_opts
|
@@ -579,15 +579,16 @@ class Regexp::Parser
|
|
579
579
|
# an instance of Backreference::Number, its #referenced_expression is set to
|
580
580
|
# the instance of Group::Capture that it refers to via its number.
|
581
581
|
def assign_referenced_expressions
|
582
|
-
# find all referencable expressions
|
582
|
+
# find all referencable and referring expressions
|
583
583
|
targets = { 0 => root }
|
584
|
+
referrers = []
|
584
585
|
root.each_expression do |exp|
|
585
586
|
exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
|
587
|
+
referrers << exp if exp.referential?
|
586
588
|
end
|
587
|
-
# assign
|
588
|
-
|
589
|
-
|
590
|
-
|
589
|
+
# assign referenced expressions to referring expressions
|
590
|
+
# (in a second iteration because there might be forward references)
|
591
|
+
referrers.each do |exp|
|
591
592
|
exp.referenced_expression = targets[exp.reference] ||
|
592
593
|
raise(ParserError, "Invalid reference: #{exp.reference}")
|
593
594
|
end
|
@@ -17,7 +17,7 @@
|
|
17
17
|
text = copy(data, ts-1, te)
|
18
18
|
type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
|
19
19
|
|
20
|
-
name =
|
20
|
+
name = text[3..-2].gsub(/[\^\s_\-]/, '').downcase
|
21
21
|
|
22
22
|
token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
|
23
23
|
validation_error(:property, name) unless token
|
@@ -59,9 +59,6 @@
|
|
59
59
|
one_or_more = '+' | '+?' | '++';
|
60
60
|
|
61
61
|
quantifier_greedy = '?' | '*' | '+';
|
62
|
-
quantifier_reluctant = '??' | '*?' | '+?';
|
63
|
-
quantifier_possessive = '?+' | '*+' | '++';
|
64
|
-
quantifier_mode = '?' | '+';
|
65
62
|
|
66
63
|
quantity_exact = (digit+);
|
67
64
|
quantity_minimum = (digit+) . ',';
|
@@ -70,9 +67,6 @@
|
|
70
67
|
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
|
71
68
|
quantity_maximum | quantity_range ) . range_close;
|
72
69
|
|
73
|
-
quantifiers = quantifier_greedy | quantifier_reluctant |
|
74
|
-
quantifier_possessive | quantifier_interval;
|
75
|
-
|
76
70
|
conditional = '(?(';
|
77
71
|
|
78
72
|
group_comment = '?#' . [^)]* . group_close;
|
@@ -132,7 +126,8 @@
|
|
132
126
|
keep_mark | sequence_char;
|
133
127
|
|
134
128
|
# escapes that also work within a character set
|
135
|
-
set_escape = backslash | brackets | escaped_ascii |
|
129
|
+
set_escape = backslash | brackets | escaped_ascii |
|
130
|
+
octal_sequence | property_char |
|
136
131
|
sequence_char | single_codepoint_char_type;
|
137
132
|
|
138
133
|
|
@@ -168,8 +163,8 @@
|
|
168
163
|
};
|
169
164
|
|
170
165
|
'-]' @set_closed { # special case, emits two tokens
|
171
|
-
emit(:literal, :literal,
|
172
|
-
emit(:set, :close,
|
166
|
+
emit(:literal, :literal, '-')
|
167
|
+
emit(:set, :close, ']')
|
173
168
|
if in_set?
|
174
169
|
fret;
|
175
170
|
else
|
@@ -183,28 +178,27 @@
|
|
183
178
|
};
|
184
179
|
|
185
180
|
'^' {
|
186
|
-
|
187
|
-
|
188
|
-
emit(:set, :negate, text)
|
181
|
+
if prev_token[1] == :open
|
182
|
+
emit(:set, :negate, '^')
|
189
183
|
else
|
190
|
-
emit(:literal, :literal,
|
184
|
+
emit(:literal, :literal, '^')
|
191
185
|
end
|
192
186
|
};
|
193
187
|
|
194
188
|
'-' {
|
195
|
-
|
196
|
-
#
|
197
|
-
if
|
198
|
-
emit(:literal, :literal,
|
189
|
+
# ranges can't start with the opening bracket, a subset, or
|
190
|
+
# intersection/negation/range operators
|
191
|
+
if prev_token[0] == :set
|
192
|
+
emit(:literal, :literal, '-')
|
199
193
|
else
|
200
|
-
emit(:set, :range,
|
194
|
+
emit(:set, :range, '-')
|
201
195
|
end
|
202
196
|
};
|
203
197
|
|
204
198
|
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
205
199
|
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
206
200
|
'&&' {
|
207
|
-
emit(:set, :intersection,
|
201
|
+
emit(:set, :intersection, '&&')
|
208
202
|
};
|
209
203
|
|
210
204
|
backslash {
|
@@ -212,7 +206,7 @@
|
|
212
206
|
};
|
213
207
|
|
214
208
|
set_open >(open_bracket, 1) >set_opened {
|
215
|
-
emit(:set, :open,
|
209
|
+
emit(:set, :open, '[')
|
216
210
|
fcall character_set;
|
217
211
|
};
|
218
212
|
|
@@ -254,12 +248,22 @@
|
|
254
248
|
# set escapes scanner
|
255
249
|
# --------------------------------------------------------------------------
|
256
250
|
set_escape_sequence := |*
|
251
|
+
# Special case: in sets, octal sequences have higher priority than backrefs
|
252
|
+
octal_sequence {
|
253
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
254
|
+
fret;
|
255
|
+
};
|
256
|
+
|
257
|
+
# Scan all other escapes that work in sets with the generic escape scanner
|
257
258
|
set_escape > (escaped_set_alpha, 2) {
|
258
259
|
fhold;
|
259
260
|
fnext character_set;
|
260
261
|
fcall escape_sequence;
|
261
262
|
};
|
262
263
|
|
264
|
+
# Treat all remaining escapes - those not supported in sets - as literal.
|
265
|
+
# (This currently includes \^, \-, \&, \:, although these could potentially
|
266
|
+
# be meta chars when not escaped, depending on their position in the set.)
|
263
267
|
any > (escaped_set_alpha, 1) {
|
264
268
|
emit(:escape, :literal, copy(data, ts-1, te))
|
265
269
|
fret;
|
@@ -528,7 +532,7 @@
|
|
528
532
|
group_close @group_closed {
|
529
533
|
if conditional_stack.last == group_depth + 1
|
530
534
|
conditional_stack.pop
|
531
|
-
emit(:conditional, :close,
|
535
|
+
emit(:conditional, :close, ')')
|
532
536
|
else
|
533
537
|
if spacing_stack.length > 1 &&
|
534
538
|
spacing_stack.last[:depth] == group_depth + 1
|
@@ -536,7 +540,7 @@
|
|
536
540
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
537
541
|
end
|
538
542
|
|
539
|
-
emit(:group, :close,
|
543
|
+
emit(:group, :close, ')')
|
540
544
|
end
|
541
545
|
};
|
542
546
|
|
@@ -717,23 +721,24 @@ class Regexp::Scanner
|
|
717
721
|
#
|
718
722
|
# This method may raise errors if a syntax error is encountered.
|
719
723
|
# --------------------------------------------------------------------------
|
720
|
-
def self.scan(input_object, options: nil, &block)
|
721
|
-
new.scan(input_object, options: options, &block)
|
724
|
+
def self.scan(input_object, options: nil, collect_tokens: true, &block)
|
725
|
+
new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
|
722
726
|
end
|
723
727
|
|
724
|
-
def scan(input_object, options: nil, &block)
|
725
|
-
self.
|
728
|
+
def scan(input_object, options: nil, collect_tokens: true, &block)
|
729
|
+
self.collect_tokens = collect_tokens
|
730
|
+
self.literal_run = nil
|
726
731
|
stack = []
|
727
732
|
|
728
733
|
input = input_object.is_a?(Regexp) ? input_object.source : input_object
|
729
734
|
self.free_spacing = free_spacing?(input_object, options)
|
730
735
|
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
731
736
|
|
732
|
-
data = input.unpack("c*")
|
737
|
+
data = input.unpack("c*")
|
733
738
|
eof = data.length
|
734
739
|
|
735
740
|
self.tokens = []
|
736
|
-
self.block =
|
741
|
+
self.block = block
|
737
742
|
|
738
743
|
self.set_depth = 0
|
739
744
|
self.group_depth = 0
|
@@ -758,7 +763,7 @@ class Regexp::Scanner
|
|
758
763
|
"[#{set_depth}]") if in_set?
|
759
764
|
|
760
765
|
# when the entire expression is a literal run
|
761
|
-
emit_literal if
|
766
|
+
emit_literal if literal_run
|
762
767
|
|
763
768
|
tokens
|
764
769
|
end
|
@@ -785,26 +790,37 @@ class Regexp::Scanner
|
|
785
790
|
def emit(type, token, text)
|
786
791
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
787
792
|
|
788
|
-
emit_literal if
|
793
|
+
emit_literal if literal_run
|
789
794
|
|
790
795
|
# Ragel runs with byte-based indices (ts, te). These are of little value to
|
791
796
|
# end-users, so we keep track of char-based indices and emit those instead.
|
792
797
|
ts_char_pos = char_pos
|
793
798
|
te_char_pos = char_pos + text.length
|
794
799
|
|
795
|
-
|
796
|
-
block.call type, token, text, ts_char_pos, te_char_pos
|
797
|
-
end
|
800
|
+
tok = [type, token, text, ts_char_pos, te_char_pos]
|
798
801
|
|
799
|
-
|
802
|
+
self.prev_token = tok
|
800
803
|
|
801
804
|
self.char_pos = te_char_pos
|
805
|
+
|
806
|
+
if block
|
807
|
+
block.call type, token, text, ts_char_pos, te_char_pos
|
808
|
+
# TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
|
809
|
+
tokens << tok if collect_tokens
|
810
|
+
elsif collect_tokens
|
811
|
+
tokens << tok
|
812
|
+
end
|
802
813
|
end
|
803
814
|
|
815
|
+
attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
|
816
|
+
|
804
817
|
private
|
805
818
|
|
806
|
-
attr_accessor :
|
807
|
-
:
|
819
|
+
attr_accessor :block,
|
820
|
+
:collect_tokens, :tokens, :prev_token,
|
821
|
+
:free_spacing, :spacing_stack,
|
822
|
+
:group_depth, :set_depth, :conditional_stack,
|
823
|
+
:char_pos
|
808
824
|
|
809
825
|
def free_spacing?(input_object, options)
|
810
826
|
if options && !input_object.is_a?(String)
|
@@ -834,14 +850,13 @@ class Regexp::Scanner
|
|
834
850
|
# Appends one or more characters to the literal buffer, to be emitted later
|
835
851
|
# by a call to emit_literal.
|
836
852
|
def append_literal(data, ts, te)
|
837
|
-
self.
|
838
|
-
literal << copy(data, ts, te)
|
853
|
+
(self.literal_run ||= []) << copy(data, ts, te)
|
839
854
|
end
|
840
855
|
|
841
856
|
# Emits the literal run collected by calls to the append_literal method.
|
842
857
|
def emit_literal
|
843
|
-
text =
|
844
|
-
self.
|
858
|
+
text = literal_run.join
|
859
|
+
self.literal_run = nil
|
845
860
|
emit(:literal, :literal, text)
|
846
861
|
end
|
847
862
|
|