regexp_parser 0.4.6 → 0.4.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ChangeLog +13 -0
- data/README.md +4 -3
- data/lib/regexp_parser/expression.rb +8 -8
- data/lib/regexp_parser/expression/classes/backref.rb +7 -7
- data/lib/regexp_parser/expression/classes/conditional.rb +2 -2
- data/lib/regexp_parser/expression/classes/group.rb +2 -2
- data/lib/regexp_parser/expression/classes/root.rb +4 -18
- data/lib/regexp_parser/expression/classes/set.rb +1 -1
- data/lib/regexp_parser/expression/subexpression.rb +2 -2
- data/lib/regexp_parser/lexer.rb +10 -6
- data/lib/regexp_parser/parser.rb +202 -172
- data/lib/regexp_parser/scanner.rb +151 -148
- data/lib/regexp_parser/scanner/scanner.rl +44 -41
- data/lib/regexp_parser/syntax/tokens/backref.rb +1 -1
- data/lib/regexp_parser/version.rb +2 -2
- data/test/expression/test_to_h.rb +2 -2
- data/test/lexer/test_refcalls.rb +3 -0
- data/test/parser/test_errors.rb +13 -9
- data/test/parser/test_groups.rb +140 -14
- data/test/parser/test_refcalls.rb +13 -0
- data/test/scanner/test_free_space.rb +43 -0
- data/test/scanner/test_refcalls.rb +3 -0
- data/test/syntax/ruby/test_1.8.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: de01aa2d195d95dd0bee1afd232f85195562a8bf
|
4
|
+
data.tar.gz: b41cb58a4e07d681da7c16473f4f03d96a792ff9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 342d6218d5553f2f2f6975f202cf650cd74c9128379348526981d829188b38836dfd88bc46b8476212d4b10aeb628479109baff075e1297c4cf69aaa4fe8ff03
|
7
|
+
data.tar.gz: 80883d05ff9bb3f5f9f296aeeb5eabde013a362c6d6c82f4eeab87438050f93fea2224f407b170b65ff55d175937c55f56d49077e0ad43b0a55832989e221544
|
data/ChangeLog
CHANGED
@@ -1,3 +1,16 @@
|
|
1
|
+
Sun Oct 15 2017 Janosch Müller <janosch84@gmail.com>
|
2
|
+
|
3
|
+
* Fixed a thread safety issue (issue #45)
|
4
|
+
* Some public class methods that were only reliable for
|
5
|
+
internal use are now private instance methods (PR #46)
|
6
|
+
* Improved the usefulness of Expression#options (issue #43) -
|
7
|
+
#options and derived methods such as #i?, #m? and #x? are now
|
8
|
+
defined for all Expressions that are affected by such flags.
|
9
|
+
* Fixed scanning of whitespace following (?x) (commit 5c94bd2)
|
10
|
+
* Fixed a Parser bug where the #number attribute of traditional
|
11
|
+
numerical backreferences was not set correctly (commit 851b620)
|
12
|
+
* Bumped version to 0.4.7
|
13
|
+
|
1
14
|
Mon Sep 18 2017 Janosch Müller <janosch84@gmail.com>
|
2
15
|
|
3
16
|
* Added Parser support for hex escapes in sets (PR #36)
|
data/README.md
CHANGED
@@ -125,9 +125,10 @@ Regexp::Scanner.scan( /(cat?([bhm]at)){3,5}/ ).map {|token| token[2]}
|
|
125
125
|
|
126
126
|
* If the input is a ruby **Regexp** object, the scanner calls #source on it to
|
127
127
|
get its string representation. #source does not include the options of
|
128
|
-
the expression (m, i, and x) To include the options in the scan, #to_s
|
129
|
-
should be called on the **Regexp** before passing it to the scanner or
|
130
|
-
|
128
|
+
the expression (m, i, and x). To include the options in the scan, #to_s
|
129
|
+
should be called on the **Regexp** before passing it to the scanner or the
|
130
|
+
lexer. For the parser, however, this is not necessary. It automatically
|
131
|
+
exposes the options of a passed **Regexp** in the returned root expression.
|
131
132
|
|
132
133
|
* To keep the scanner simple(r) and fairly reusable for other purposes, it
|
133
134
|
does not perform lexical analysis on the tokens, sticking to the task
|
@@ -8,7 +8,7 @@ module Regexp::Expression
|
|
8
8
|
attr_accessor :quantifier
|
9
9
|
attr_accessor :options
|
10
10
|
|
11
|
-
def initialize(token)
|
11
|
+
def initialize(token, options = {})
|
12
12
|
@type = token.type
|
13
13
|
@token = token.token
|
14
14
|
@text = token.text
|
@@ -17,7 +17,7 @@ module Regexp::Expression
|
|
17
17
|
@set_level = token.set_level
|
18
18
|
@conditional_level = token.conditional_level
|
19
19
|
@quantifier = nil
|
20
|
-
@options =
|
20
|
+
@options = options
|
21
21
|
end
|
22
22
|
|
23
23
|
def clone
|
@@ -95,35 +95,35 @@ module Regexp::Expression
|
|
95
95
|
end
|
96
96
|
|
97
97
|
def multiline?
|
98
|
-
|
98
|
+
@options[:m] == true
|
99
99
|
end
|
100
100
|
alias :m? :multiline?
|
101
101
|
|
102
102
|
def case_insensitive?
|
103
|
-
|
103
|
+
@options[:i] == true
|
104
104
|
end
|
105
105
|
alias :i? :case_insensitive?
|
106
106
|
alias :ignore_case? :case_insensitive?
|
107
107
|
|
108
108
|
def free_spacing?
|
109
|
-
|
109
|
+
@options[:x] == true
|
110
110
|
end
|
111
111
|
alias :x? :free_spacing?
|
112
112
|
alias :extended? :free_spacing?
|
113
113
|
|
114
114
|
if RUBY_VERSION >= '2.0'
|
115
115
|
def default_classes?
|
116
|
-
|
116
|
+
@options[:d] == true
|
117
117
|
end
|
118
118
|
alias :d? :default_classes?
|
119
119
|
|
120
120
|
def ascii_classes?
|
121
|
-
|
121
|
+
@options[:a] == true
|
122
122
|
end
|
123
123
|
alias :a? :ascii_classes?
|
124
124
|
|
125
125
|
def unicode_classes?
|
126
|
-
|
126
|
+
@options[:u] == true
|
127
127
|
end
|
128
128
|
alias :u? :unicode_classes?
|
129
129
|
end
|
@@ -6,18 +6,18 @@ module Regexp::Expression
|
|
6
6
|
class Name < Backreference::Base
|
7
7
|
attr_reader :name
|
8
8
|
|
9
|
-
def initialize(token)
|
9
|
+
def initialize(token, options = {})
|
10
10
|
@name = token.text[3..-2]
|
11
|
-
super
|
11
|
+
super
|
12
12
|
end
|
13
13
|
end
|
14
14
|
|
15
15
|
class Number < Backreference::Base
|
16
16
|
attr_reader :number
|
17
17
|
|
18
|
-
def initialize(token)
|
19
|
-
@number = token.text[3..-2]
|
20
|
-
super
|
18
|
+
def initialize(token, options = {})
|
19
|
+
@number = token.text[token.token.equal?(:number) ? 1..-1 : 3..-2]
|
20
|
+
super
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
@@ -29,9 +29,9 @@ module Regexp::Expression
|
|
29
29
|
class NameCall < Backreference::Base
|
30
30
|
attr_reader :name
|
31
31
|
|
32
|
-
def initialize(token)
|
32
|
+
def initialize(token, options = {})
|
33
33
|
@name = token.text[3..-2]
|
34
|
-
super
|
34
|
+
super
|
35
35
|
end
|
36
36
|
end
|
37
37
|
|
@@ -11,8 +11,8 @@ module Regexp::Expression
|
|
11
11
|
class Branch < Regexp::Expression::Sequence; end
|
12
12
|
|
13
13
|
class Expression < Regexp::Expression::Subexpression
|
14
|
-
def initialize(token)
|
15
|
-
super
|
14
|
+
def initialize(token, options = {})
|
15
|
+
super
|
16
16
|
|
17
17
|
@condition = nil
|
18
18
|
@branches = []
|
@@ -1,26 +1,12 @@
|
|
1
1
|
module Regexp::Expression
|
2
2
|
|
3
3
|
class Root < Regexp::Expression::Subexpression
|
4
|
-
def initialize
|
5
|
-
super
|
4
|
+
def initialize(options = {})
|
5
|
+
super(Regexp::Token.new(:expression, :root, '', 0), options)
|
6
6
|
end
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
end
|
11
|
-
alias :m? :multiline?
|
12
|
-
|
13
|
-
def case_insensitive?
|
14
|
-
@expressions[0].i?
|
15
|
-
end
|
16
|
-
alias :i? :case_insensitive?
|
17
|
-
alias :ignore_case? :case_insensitive?
|
18
|
-
|
19
|
-
def free_spacing?
|
20
|
-
@expressions[0].x?
|
21
|
-
end
|
22
|
-
alias :x? :free_spacing?
|
23
|
-
alias :extended? :free_spacing?
|
8
|
+
alias ignore_case? case_insensitive?
|
9
|
+
alias extended? free_spacing?
|
24
10
|
end
|
25
11
|
|
26
12
|
end
|
data/lib/regexp_parser/lexer.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
# collects emitted tokens into an array, calculates their nesting depth, and
|
3
3
|
# normalizes tokens for the parser, and checks if they are implemented by the
|
4
4
|
# given syntax flavor.
|
5
|
-
|
5
|
+
class Regexp::Lexer
|
6
6
|
|
7
7
|
OPENING_TOKENS = [:capture, :options, :passive, :atomic, :named, :absence,
|
8
8
|
:lookahead, :nlookahead, :lookbehind, :nlookbehind
|
@@ -11,6 +11,10 @@ module Regexp::Lexer
|
|
11
11
|
CLOSING_TOKENS = [:close].freeze
|
12
12
|
|
13
13
|
def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", &block)
|
14
|
+
new.lex(input, syntax, &block)
|
15
|
+
end
|
16
|
+
|
17
|
+
def lex(input, syntax = "ruby/#{RUBY_VERSION}", &block)
|
14
18
|
syntax = Regexp::Syntax.new(syntax)
|
15
19
|
|
16
20
|
@tokens = []
|
@@ -57,7 +61,7 @@ module Regexp::Lexer
|
|
57
61
|
|
58
62
|
protected
|
59
63
|
|
60
|
-
def
|
64
|
+
def ascend(type, token)
|
61
65
|
if type == :group or type == :assertion
|
62
66
|
@nesting -= 1 if CLOSING_TOKENS.include?(token)
|
63
67
|
end
|
@@ -71,7 +75,7 @@ module Regexp::Lexer
|
|
71
75
|
end
|
72
76
|
end
|
73
77
|
|
74
|
-
def
|
78
|
+
def descend(type, token)
|
75
79
|
if type == :group or type == :assertion
|
76
80
|
@nesting += 1 if OPENING_TOKENS.include?(token)
|
77
81
|
end
|
@@ -87,7 +91,7 @@ module Regexp::Lexer
|
|
87
91
|
|
88
92
|
# called by scan to break a literal run that is longer than one character
|
89
93
|
# into two separate tokens when it is followed by a quantifier
|
90
|
-
def
|
94
|
+
def break_literal(token)
|
91
95
|
text = token.text
|
92
96
|
if text.scan(/./mu).length > 1
|
93
97
|
lead = text.sub(/.\z/mu, "")
|
@@ -113,7 +117,7 @@ module Regexp::Lexer
|
|
113
117
|
|
114
118
|
# called by scan to merge two consecutive literals. this happens when tokens
|
115
119
|
# get normalized (as in the case of posix/bre) and end up becoming literals.
|
116
|
-
def
|
120
|
+
def merge_literal(current)
|
117
121
|
last = @tokens.pop
|
118
122
|
|
119
123
|
Regexp::Token.new(
|
@@ -128,7 +132,7 @@ module Regexp::Lexer
|
|
128
132
|
)
|
129
133
|
end
|
130
134
|
|
131
|
-
def
|
135
|
+
def merge_condition(current)
|
132
136
|
last = @tokens.pop
|
133
137
|
Regexp::Token.new(:conditional, :condition, last.text + current.text,
|
134
138
|
last.ts, current.te, @nesting, @set_nesting, @conditional_nesting)
|
data/lib/regexp_parser/parser.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'regexp_parser/expression'
|
2
2
|
|
3
|
-
|
3
|
+
class Regexp::Parser
|
4
4
|
include Regexp::Expression
|
5
5
|
include Regexp::Syntax
|
6
6
|
|
@@ -19,8 +19,14 @@ module Regexp::Parser
|
|
19
19
|
end
|
20
20
|
|
21
21
|
def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
|
22
|
-
|
22
|
+
new.parse(input, syntax, &block)
|
23
|
+
end
|
24
|
+
|
25
|
+
def parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
|
26
|
+
@nesting = [@root = @node = Root.new(options_from_input(input))]
|
23
27
|
|
28
|
+
@options_stack = [@root.options]
|
29
|
+
@switching_options = false
|
24
30
|
@conditional_nesting = []
|
25
31
|
|
26
32
|
Regexp::Lexer.scan(input, syntax) do |token|
|
@@ -34,21 +40,33 @@ module Regexp::Parser
|
|
34
40
|
end
|
35
41
|
end
|
36
42
|
|
37
|
-
|
43
|
+
private
|
44
|
+
|
45
|
+
def options_from_input(input)
|
46
|
+
return {} unless input.is_a?(::Regexp)
|
47
|
+
|
48
|
+
options = {}
|
49
|
+
options[:i] = true if input.options & ::Regexp::IGNORECASE != 0
|
50
|
+
options[:m] = true if input.options & ::Regexp::MULTILINE != 0
|
51
|
+
options[:x] = true if input.options & ::Regexp::EXTENDED != 0
|
52
|
+
options
|
53
|
+
end
|
54
|
+
|
55
|
+
def nest(exp)
|
38
56
|
@nesting.push exp
|
39
57
|
|
40
58
|
@node << exp
|
41
59
|
@node = exp
|
42
60
|
end
|
43
61
|
|
44
|
-
def
|
62
|
+
def nest_conditional(exp)
|
45
63
|
@conditional_nesting.push exp
|
46
64
|
|
47
65
|
@node << exp
|
48
66
|
@node = exp
|
49
67
|
end
|
50
68
|
|
51
|
-
def
|
69
|
+
def parse_token(token)
|
52
70
|
case token.type
|
53
71
|
when :meta; meta(token)
|
54
72
|
when :quantifier; quantifier(token)
|
@@ -66,7 +84,7 @@ module Regexp::Parser
|
|
66
84
|
property(token)
|
67
85
|
|
68
86
|
when :literal
|
69
|
-
@node << Literal.new(token)
|
87
|
+
@node << Literal.new(token, active_opts)
|
70
88
|
when :free_space
|
71
89
|
free_space(token)
|
72
90
|
|
@@ -75,7 +93,7 @@ module Regexp::Parser
|
|
75
93
|
end
|
76
94
|
end
|
77
95
|
|
78
|
-
def
|
96
|
+
def set(token)
|
79
97
|
case token.token
|
80
98
|
when :open
|
81
99
|
open_set(token)
|
@@ -96,14 +114,14 @@ module Regexp::Parser
|
|
96
114
|
end
|
97
115
|
end
|
98
116
|
|
99
|
-
def
|
117
|
+
def meta(token)
|
100
118
|
case token.token
|
101
119
|
when :dot
|
102
|
-
@node << CharacterType::Any.new(token)
|
120
|
+
@node << CharacterType::Any.new(token, active_opts)
|
103
121
|
when :alternation
|
104
122
|
unless @node.token == :alternation
|
105
123
|
unless @node.last.is_a?(Alternation)
|
106
|
-
alt = Alternation.new(token)
|
124
|
+
alt = Alternation.new(token, active_opts)
|
107
125
|
seq = Alternative.new(alt.level, alt.set_level, alt.conditional_level)
|
108
126
|
|
109
127
|
while @node.expressions.last
|
@@ -126,62 +144,62 @@ module Regexp::Parser
|
|
126
144
|
end
|
127
145
|
end
|
128
146
|
|
129
|
-
def
|
147
|
+
def backref(token)
|
130
148
|
case token.token
|
131
149
|
when :name_ref
|
132
|
-
@node << Backreference::Name.new(token)
|
150
|
+
@node << Backreference::Name.new(token, active_opts)
|
133
151
|
when :name_nest_ref
|
134
|
-
@node << Backreference::NameNestLevel.new(token)
|
152
|
+
@node << Backreference::NameNestLevel.new(token, active_opts)
|
135
153
|
when :name_call
|
136
|
-
@node << Backreference::NameCall.new(token)
|
154
|
+
@node << Backreference::NameCall.new(token, active_opts)
|
137
155
|
when :number, :number_ref
|
138
|
-
@node << Backreference::Number.new(token)
|
156
|
+
@node << Backreference::Number.new(token, active_opts)
|
139
157
|
when :number_rel_ref
|
140
|
-
@node << Backreference::NumberRelative.new(token)
|
158
|
+
@node << Backreference::NumberRelative.new(token, active_opts)
|
141
159
|
when :number_nest_ref
|
142
|
-
@node << Backreference::NumberNestLevel.new(token)
|
160
|
+
@node << Backreference::NumberNestLevel.new(token, active_opts)
|
143
161
|
when :number_call
|
144
|
-
@node << Backreference::NumberCall.new(token)
|
162
|
+
@node << Backreference::NumberCall.new(token, active_opts)
|
145
163
|
when :number_rel_call
|
146
|
-
@node << Backreference::NumberCallRelative.new(token)
|
164
|
+
@node << Backreference::NumberCallRelative.new(token, active_opts)
|
147
165
|
else
|
148
166
|
raise UnknownTokenError.new('Backreference', token)
|
149
167
|
end
|
150
168
|
end
|
151
169
|
|
152
|
-
def
|
170
|
+
def type(token)
|
153
171
|
case token.token
|
154
172
|
when :digit
|
155
|
-
@node << CharacterType::Digit.new(token)
|
173
|
+
@node << CharacterType::Digit.new(token, active_opts)
|
156
174
|
when :nondigit
|
157
|
-
@node << CharacterType::NonDigit.new(token)
|
175
|
+
@node << CharacterType::NonDigit.new(token, active_opts)
|
158
176
|
when :hex
|
159
|
-
@node << CharacterType::Hex.new(token)
|
177
|
+
@node << CharacterType::Hex.new(token, active_opts)
|
160
178
|
when :nonhex
|
161
|
-
@node << CharacterType::NonHex.new(token)
|
179
|
+
@node << CharacterType::NonHex.new(token, active_opts)
|
162
180
|
when :space
|
163
|
-
@node << CharacterType::Space.new(token)
|
181
|
+
@node << CharacterType::Space.new(token, active_opts)
|
164
182
|
when :nonspace
|
165
|
-
@node << CharacterType::NonSpace.new(token)
|
183
|
+
@node << CharacterType::NonSpace.new(token, active_opts)
|
166
184
|
when :word
|
167
|
-
@node << CharacterType::Word.new(token)
|
185
|
+
@node << CharacterType::Word.new(token, active_opts)
|
168
186
|
when :nonword
|
169
|
-
@node << CharacterType::NonWord.new(token)
|
187
|
+
@node << CharacterType::NonWord.new(token, active_opts)
|
170
188
|
when :linebreak
|
171
|
-
@node << CharacterType::Linebreak.new(token)
|
189
|
+
@node << CharacterType::Linebreak.new(token, active_opts)
|
172
190
|
when :xgrapheme
|
173
|
-
@node << CharacterType::ExtendedGrapheme.new(token)
|
191
|
+
@node << CharacterType::ExtendedGrapheme.new(token, active_opts)
|
174
192
|
else
|
175
193
|
raise UnknownTokenError.new('CharacterType', token)
|
176
194
|
end
|
177
195
|
end
|
178
196
|
|
179
|
-
def
|
197
|
+
def conditional(token)
|
180
198
|
case token.token
|
181
199
|
when :open
|
182
|
-
nest_conditional(Conditional::Expression.new(token))
|
200
|
+
nest_conditional(Conditional::Expression.new(token, active_opts))
|
183
201
|
when :condition
|
184
|
-
@conditional_nesting.last.condition(Conditional::Condition.new(token))
|
202
|
+
@conditional_nesting.last.condition(Conditional::Condition.new(token, active_opts))
|
185
203
|
@conditional_nesting.last.branch
|
186
204
|
when :separator
|
187
205
|
@conditional_nesting.last.branch
|
@@ -200,175 +218,174 @@ module Regexp::Parser
|
|
200
218
|
end
|
201
219
|
end
|
202
220
|
|
203
|
-
|
204
|
-
include Regexp::Expression::UnicodeProperty
|
221
|
+
include Regexp::Expression::UnicodeProperty
|
205
222
|
|
223
|
+
def property(token)
|
206
224
|
case token.token
|
207
|
-
when :alnum; @node << Alnum.new(token)
|
208
|
-
when :alpha; @node << Alpha.new(token)
|
209
|
-
when :any; @node << Any.new(token)
|
210
|
-
when :ascii; @node << Ascii.new(token)
|
211
|
-
when :blank; @node << Blank.new(token)
|
212
|
-
when :cntrl; @node << Cntrl.new(token)
|
213
|
-
when :digit; @node << Digit.new(token)
|
214
|
-
when :graph; @node << Graph.new(token)
|
215
|
-
when :lower; @node << Lower.new(token)
|
216
|
-
when :print; @node << Print.new(token)
|
217
|
-
when :punct; @node << Punct.new(token)
|
218
|
-
when :space; @node << Space.new(token)
|
219
|
-
when :upper; @node << Upper.new(token)
|
220
|
-
when :word; @node << Word.new(token)
|
221
|
-
when :xdigit; @node << Xdigit.new(token)
|
222
|
-
when :newline; @node << Newline.new(token)
|
223
|
-
|
224
|
-
when :letter_any; @node << Letter::Any.new(token)
|
225
|
-
when :letter_uppercase; @node << Letter::Uppercase.new(token)
|
226
|
-
when :letter_lowercase; @node << Letter::Lowercase.new(token)
|
227
|
-
when :letter_titlecase; @node << Letter::Titlecase.new(token)
|
228
|
-
when :letter_modifier; @node << Letter::Modifier.new(token)
|
229
|
-
when :letter_other; @node << Letter::Other.new(token)
|
230
|
-
|
231
|
-
when :mark_any; @node << Mark::Any.new(token)
|
232
|
-
when :mark_nonspacing; @node << Mark::Nonspacing.new(token)
|
233
|
-
when :mark_spacing; @node << Mark::Spacing.new(token)
|
234
|
-
when :mark_enclosing; @node << Mark::Enclosing.new(token)
|
235
|
-
|
236
|
-
when :number_any; @node << Number::Any.new(token)
|
237
|
-
when :number_decimal; @node << Number::Decimal.new(token)
|
238
|
-
when :number_letter; @node << Number::Letter.new(token)
|
239
|
-
when :number_other; @node << Number::Other.new(token)
|
240
|
-
|
241
|
-
when :punct_any; @node << Punctuation::Any.new(token)
|
242
|
-
when :punct_connector; @node << Punctuation::Connector.new(token)
|
243
|
-
when :punct_dash; @node << Punctuation::Dash.new(token)
|
244
|
-
when :punct_open; @node << Punctuation::Open.new(token)
|
245
|
-
when :punct_close; @node << Punctuation::Close.new(token)
|
246
|
-
when :punct_initial; @node << Punctuation::Initial.new(token)
|
247
|
-
when :punct_final; @node << Punctuation::Final.new(token)
|
248
|
-
when :punct_other; @node << Punctuation::Other.new(token)
|
249
|
-
|
250
|
-
when :separator_any; @node << Separator::Any.new(token)
|
251
|
-
when :separator_space; @node << Separator::Space.new(token)
|
252
|
-
when :separator_line; @node << Separator::Line.new(token)
|
253
|
-
when :separator_para; @node << Separator::Paragraph.new(token)
|
254
|
-
|
255
|
-
when :symbol_any; @node << Symbol::Any.new(token)
|
256
|
-
when :symbol_math; @node << Symbol::Math.new(token)
|
257
|
-
when :symbol_currency; @node << Symbol::Currency.new(token)
|
258
|
-
when :symbol_modifier; @node << Symbol::Modifier.new(token)
|
259
|
-
when :symbol_other; @node << Symbol::Other.new(token)
|
260
|
-
|
261
|
-
when :other; @node << Codepoint::Any.new(token)
|
262
|
-
when :control; @node << Codepoint::Control.new(token)
|
263
|
-
when :format; @node << Codepoint::Format.new(token)
|
264
|
-
when :surrogate; @node << Codepoint::Surrogate.new(token)
|
265
|
-
when :private_use; @node << Codepoint::PrivateUse.new(token)
|
266
|
-
when :unassigned; @node << Codepoint::Unassigned.new(token)
|
225
|
+
when :alnum; @node << Alnum.new(token, active_opts)
|
226
|
+
when :alpha; @node << Alpha.new(token, active_opts)
|
227
|
+
when :any; @node << Any.new(token, active_opts)
|
228
|
+
when :ascii; @node << Ascii.new(token, active_opts)
|
229
|
+
when :blank; @node << Blank.new(token, active_opts)
|
230
|
+
when :cntrl; @node << Cntrl.new(token, active_opts)
|
231
|
+
when :digit; @node << Digit.new(token, active_opts)
|
232
|
+
when :graph; @node << Graph.new(token, active_opts)
|
233
|
+
when :lower; @node << Lower.new(token, active_opts)
|
234
|
+
when :print; @node << Print.new(token, active_opts)
|
235
|
+
when :punct; @node << Punct.new(token, active_opts)
|
236
|
+
when :space; @node << Space.new(token, active_opts)
|
237
|
+
when :upper; @node << Upper.new(token, active_opts)
|
238
|
+
when :word; @node << Word.new(token, active_opts)
|
239
|
+
when :xdigit; @node << Xdigit.new(token, active_opts)
|
240
|
+
when :newline; @node << Newline.new(token, active_opts)
|
241
|
+
|
242
|
+
when :letter_any; @node << Letter::Any.new(token, active_opts)
|
243
|
+
when :letter_uppercase; @node << Letter::Uppercase.new(token, active_opts)
|
244
|
+
when :letter_lowercase; @node << Letter::Lowercase.new(token, active_opts)
|
245
|
+
when :letter_titlecase; @node << Letter::Titlecase.new(token, active_opts)
|
246
|
+
when :letter_modifier; @node << Letter::Modifier.new(token, active_opts)
|
247
|
+
when :letter_other; @node << Letter::Other.new(token, active_opts)
|
248
|
+
|
249
|
+
when :mark_any; @node << Mark::Any.new(token, active_opts)
|
250
|
+
when :mark_nonspacing; @node << Mark::Nonspacing.new(token, active_opts)
|
251
|
+
when :mark_spacing; @node << Mark::Spacing.new(token, active_opts)
|
252
|
+
when :mark_enclosing; @node << Mark::Enclosing.new(token, active_opts)
|
253
|
+
|
254
|
+
when :number_any; @node << Number::Any.new(token, active_opts)
|
255
|
+
when :number_decimal; @node << Number::Decimal.new(token, active_opts)
|
256
|
+
when :number_letter; @node << Number::Letter.new(token, active_opts)
|
257
|
+
when :number_other; @node << Number::Other.new(token, active_opts)
|
258
|
+
|
259
|
+
when :punct_any; @node << Punctuation::Any.new(token, active_opts)
|
260
|
+
when :punct_connector; @node << Punctuation::Connector.new(token, active_opts)
|
261
|
+
when :punct_dash; @node << Punctuation::Dash.new(token, active_opts)
|
262
|
+
when :punct_open; @node << Punctuation::Open.new(token, active_opts)
|
263
|
+
when :punct_close; @node << Punctuation::Close.new(token, active_opts)
|
264
|
+
when :punct_initial; @node << Punctuation::Initial.new(token, active_opts)
|
265
|
+
when :punct_final; @node << Punctuation::Final.new(token, active_opts)
|
266
|
+
when :punct_other; @node << Punctuation::Other.new(token, active_opts)
|
267
|
+
|
268
|
+
when :separator_any; @node << Separator::Any.new(token, active_opts)
|
269
|
+
when :separator_space; @node << Separator::Space.new(token, active_opts)
|
270
|
+
when :separator_line; @node << Separator::Line.new(token, active_opts)
|
271
|
+
when :separator_para; @node << Separator::Paragraph.new(token, active_opts)
|
272
|
+
|
273
|
+
when :symbol_any; @node << Symbol::Any.new(token, active_opts)
|
274
|
+
when :symbol_math; @node << Symbol::Math.new(token, active_opts)
|
275
|
+
when :symbol_currency; @node << Symbol::Currency.new(token, active_opts)
|
276
|
+
when :symbol_modifier; @node << Symbol::Modifier.new(token, active_opts)
|
277
|
+
when :symbol_other; @node << Symbol::Other.new(token, active_opts)
|
278
|
+
|
279
|
+
when :other; @node << Codepoint::Any.new(token, active_opts)
|
280
|
+
when :control; @node << Codepoint::Control.new(token, active_opts)
|
281
|
+
when :format; @node << Codepoint::Format.new(token, active_opts)
|
282
|
+
when :surrogate; @node << Codepoint::Surrogate.new(token, active_opts)
|
283
|
+
when :private_use; @node << Codepoint::PrivateUse.new(token, active_opts)
|
284
|
+
when :unassigned; @node << Codepoint::Unassigned.new(token, active_opts)
|
267
285
|
|
268
286
|
when *Token::UnicodeProperty::Age
|
269
|
-
@node << Age.new(token)
|
287
|
+
@node << Age.new(token, active_opts)
|
270
288
|
|
271
289
|
when *Token::UnicodeProperty::Derived
|
272
|
-
@node << Derived.new(token)
|
290
|
+
@node << Derived.new(token, active_opts)
|
273
291
|
|
274
292
|
when *Regexp::Syntax::Token::UnicodeProperty::Script
|
275
|
-
@node << Script.new(token)
|
293
|
+
@node << Script.new(token, active_opts)
|
276
294
|
|
277
295
|
when *Regexp::Syntax::Token::UnicodeProperty::UnicodeBlock
|
278
|
-
@node << Block.new(token)
|
296
|
+
@node << Block.new(token, active_opts)
|
279
297
|
|
280
298
|
else
|
281
299
|
raise UnknownTokenError.new('UnicodeProperty', token)
|
282
300
|
end
|
283
301
|
end
|
284
302
|
|
285
|
-
def
|
303
|
+
def anchor(token)
|
286
304
|
case token.token
|
287
305
|
when :bol
|
288
|
-
@node << Anchor::BeginningOfLine.new(token)
|
306
|
+
@node << Anchor::BeginningOfLine.new(token, active_opts)
|
289
307
|
when :eol
|
290
|
-
@node << Anchor::EndOfLine.new(token)
|
308
|
+
@node << Anchor::EndOfLine.new(token, active_opts)
|
291
309
|
when :bos
|
292
|
-
@node << Anchor::BOS.new(token)
|
310
|
+
@node << Anchor::BOS.new(token, active_opts)
|
293
311
|
when :eos
|
294
|
-
@node << Anchor::EOS.new(token)
|
312
|
+
@node << Anchor::EOS.new(token, active_opts)
|
295
313
|
when :eos_ob_eol
|
296
|
-
@node << Anchor::EOSobEOL.new(token)
|
314
|
+
@node << Anchor::EOSobEOL.new(token, active_opts)
|
297
315
|
when :word_boundary
|
298
|
-
@node << Anchor::WordBoundary.new(token)
|
316
|
+
@node << Anchor::WordBoundary.new(token, active_opts)
|
299
317
|
when :nonword_boundary
|
300
|
-
@node << Anchor::NonWordBoundary.new(token)
|
318
|
+
@node << Anchor::NonWordBoundary.new(token, active_opts)
|
301
319
|
when :match_start
|
302
|
-
@node << Anchor::MatchStart.new(token)
|
320
|
+
@node << Anchor::MatchStart.new(token, active_opts)
|
303
321
|
else
|
304
322
|
raise UnknownTokenError.new('Anchor', token)
|
305
323
|
end
|
306
324
|
end
|
307
325
|
|
308
|
-
def
|
326
|
+
def escape(token)
|
309
327
|
case token.token
|
310
328
|
|
311
329
|
when :backspace
|
312
|
-
@node << EscapeSequence::Backspace.new(token)
|
330
|
+
@node << EscapeSequence::Backspace.new(token, active_opts)
|
313
331
|
|
314
332
|
when :escape
|
315
|
-
@node << EscapeSequence::AsciiEscape.new(token)
|
333
|
+
@node << EscapeSequence::AsciiEscape.new(token, active_opts)
|
316
334
|
when :bell
|
317
|
-
@node << EscapeSequence::Bell.new(token)
|
335
|
+
@node << EscapeSequence::Bell.new(token, active_opts)
|
318
336
|
when :form_feed
|
319
|
-
@node << EscapeSequence::FormFeed.new(token)
|
337
|
+
@node << EscapeSequence::FormFeed.new(token, active_opts)
|
320
338
|
when :newline
|
321
|
-
@node << EscapeSequence::Newline.new(token)
|
339
|
+
@node << EscapeSequence::Newline.new(token, active_opts)
|
322
340
|
when :carriage
|
323
|
-
@node << EscapeSequence::Return.new(token)
|
341
|
+
@node << EscapeSequence::Return.new(token, active_opts)
|
324
342
|
when :space
|
325
|
-
@node << EscapeSequence::Space.new(token)
|
343
|
+
@node << EscapeSequence::Space.new(token, active_opts)
|
326
344
|
when :tab
|
327
|
-
@node << EscapeSequence::Tab.new(token)
|
345
|
+
@node << EscapeSequence::Tab.new(token, active_opts)
|
328
346
|
when :vertical_tab
|
329
|
-
@node << EscapeSequence::VerticalTab.new(token)
|
347
|
+
@node << EscapeSequence::VerticalTab.new(token, active_opts)
|
330
348
|
|
331
349
|
when :control
|
332
350
|
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
333
|
-
@node << EscapeSequence::MetaControl.new(token)
|
351
|
+
@node << EscapeSequence::MetaControl.new(token, active_opts)
|
334
352
|
else
|
335
|
-
@node << EscapeSequence::Control.new(token)
|
353
|
+
@node << EscapeSequence::Control.new(token, active_opts)
|
336
354
|
end
|
337
355
|
|
338
356
|
when :meta_sequence
|
339
357
|
if token.text =~ /\A\\M-\\[Cc]/
|
340
|
-
@node << EscapeSequence::MetaControl.new(token)
|
358
|
+
@node << EscapeSequence::MetaControl.new(token, active_opts)
|
341
359
|
else
|
342
|
-
@node << EscapeSequence::Meta.new(token)
|
360
|
+
@node << EscapeSequence::Meta.new(token, active_opts)
|
343
361
|
end
|
344
362
|
|
345
363
|
else
|
346
364
|
# treating everything else as a literal
|
347
|
-
@node << EscapeSequence::Literal.new(token)
|
365
|
+
@node << EscapeSequence::Literal.new(token, active_opts)
|
348
366
|
end
|
349
367
|
end
|
350
368
|
|
351
|
-
|
352
|
-
|
353
|
-
@node << Keep::Mark.new(token)
|
369
|
+
def keep(token)
|
370
|
+
@node << Keep::Mark.new(token, active_opts)
|
354
371
|
end
|
355
372
|
|
356
|
-
def
|
373
|
+
def free_space(token)
|
357
374
|
case token.token
|
358
375
|
when :comment
|
359
|
-
@node << Comment.new(token)
|
376
|
+
@node << Comment.new(token, active_opts)
|
360
377
|
when :whitespace
|
361
378
|
if @node.last and @node.last.is_a?(WhiteSpace)
|
362
|
-
@node.last.merge(WhiteSpace.new(token))
|
379
|
+
@node.last.merge(WhiteSpace.new(token, active_opts))
|
363
380
|
else
|
364
|
-
@node << WhiteSpace.new(token)
|
381
|
+
@node << WhiteSpace.new(token, active_opts)
|
365
382
|
end
|
366
383
|
else
|
367
384
|
raise UnknownTokenError.new('FreeSpace', token)
|
368
385
|
end
|
369
386
|
end
|
370
387
|
|
371
|
-
def
|
388
|
+
def quantifier(token)
|
372
389
|
offset = -1
|
373
390
|
target_node = @node.expressions[offset]
|
374
391
|
while target_node and target_node.is_a?(FreeSpace)
|
@@ -378,15 +395,6 @@ module Regexp::Parser
|
|
378
395
|
raise ArgumentError.new("No valid target found for '#{token.text}' "+
|
379
396
|
"quantifier") unless target_node
|
380
397
|
|
381
|
-
unless target_node
|
382
|
-
if token.token == :zero_or_one
|
383
|
-
raise "Quantifier given without a target, or the syntax of the group " +
|
384
|
-
"or its options is incorrect"
|
385
|
-
else
|
386
|
-
raise "Quantifier `#{token.text}' given without a target"
|
387
|
-
end
|
388
|
-
end
|
389
|
-
|
390
398
|
case token.token
|
391
399
|
when :zero_or_one
|
392
400
|
target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
|
@@ -417,7 +425,7 @@ module Regexp::Parser
|
|
417
425
|
end
|
418
426
|
end
|
419
427
|
|
420
|
-
def
|
428
|
+
def interval(target_node, token)
|
421
429
|
text = token.text
|
422
430
|
mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
|
423
431
|
case mchr
|
@@ -439,91 +447,113 @@ module Regexp::Parser
|
|
439
447
|
target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
|
440
448
|
end
|
441
449
|
|
442
|
-
def
|
450
|
+
def group(token)
|
443
451
|
case token.token
|
444
452
|
when :options
|
445
|
-
|
453
|
+
options_group(token)
|
446
454
|
when :close
|
447
455
|
close_group
|
448
456
|
when :comment
|
449
|
-
@node << Group::Comment.new(token)
|
457
|
+
@node << Group::Comment.new(token, active_opts)
|
450
458
|
else
|
451
459
|
open_group(token)
|
452
460
|
end
|
453
461
|
end
|
454
462
|
|
455
|
-
def
|
456
|
-
|
463
|
+
def options_group(token)
|
464
|
+
positive, negative = token.text.split('-', 2)
|
465
|
+
negative ||= ''
|
466
|
+
@switching_options = !token.text.include?(':')
|
467
|
+
# TODO: change this -^ to token.type == :options_switch in v1.0.0
|
468
|
+
|
469
|
+
new_options = active_opts.dup
|
457
470
|
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
471
|
+
# Negative options have precedence. E.g. /(?i-i)a/ is case-sensitive.
|
472
|
+
%w[i m x].each do |flag|
|
473
|
+
new_options[flag.to_sym] = true if positive.include?(flag)
|
474
|
+
new_options.delete(flag.to_sym) if negative.include?(flag)
|
475
|
+
end
|
476
|
+
|
477
|
+
# Any encoding flag overrides all previous encoding flags. If there are
|
478
|
+
# multiple encoding flags in an options string, the last one wins.
|
479
|
+
# E.g. /(?dau)\w/ matches UTF8 chars but /(?dua)\w/ only ASCII chars.
|
480
|
+
if (flag = positive.reverse[/[adu]/])
|
481
|
+
%w[a d u].each { |key| new_options.delete(key.to_sym) }
|
482
|
+
new_options[flag.to_sym] = true
|
483
|
+
end
|
484
|
+
|
485
|
+
@options_stack << new_options
|
486
|
+
|
487
|
+
exp = Group::Options.new(token, active_opts)
|
467
488
|
|
468
489
|
nest(exp)
|
469
490
|
end
|
470
491
|
|
471
|
-
def
|
492
|
+
def open_group(token)
|
472
493
|
case token.token
|
473
494
|
when :passive
|
474
|
-
exp = Group::Passive.new(token)
|
495
|
+
exp = Group::Passive.new(token, active_opts)
|
475
496
|
when :atomic
|
476
|
-
exp = Group::Atomic.new(token)
|
497
|
+
exp = Group::Atomic.new(token, active_opts)
|
477
498
|
when :named
|
478
|
-
exp = Group::Named.new(token)
|
499
|
+
exp = Group::Named.new(token, active_opts)
|
479
500
|
when :capture
|
480
|
-
exp = Group::Capture.new(token)
|
501
|
+
exp = Group::Capture.new(token, active_opts)
|
481
502
|
when :absence
|
482
|
-
exp = Group::Absence.new(token)
|
503
|
+
exp = Group::Absence.new(token, active_opts)
|
483
504
|
|
484
505
|
when :lookahead
|
485
|
-
exp = Assertion::Lookahead.new(token)
|
506
|
+
exp = Assertion::Lookahead.new(token, active_opts)
|
486
507
|
when :nlookahead
|
487
|
-
exp = Assertion::NegativeLookahead.new(token)
|
508
|
+
exp = Assertion::NegativeLookahead.new(token, active_opts)
|
488
509
|
when :lookbehind
|
489
|
-
exp = Assertion::Lookbehind.new(token)
|
510
|
+
exp = Assertion::Lookbehind.new(token, active_opts)
|
490
511
|
when :nlookbehind
|
491
|
-
exp = Assertion::NegativeLookbehind.new(token)
|
512
|
+
exp = Assertion::NegativeLookbehind.new(token, active_opts)
|
492
513
|
|
493
514
|
else
|
494
515
|
raise UnknownTokenError.new('Group type open', token)
|
495
516
|
end
|
496
517
|
|
518
|
+
# Push the active options to the stack again. This way we can simply pop the
|
519
|
+
# stack for any group we close, no matter if it had its own options or not.
|
520
|
+
@options_stack << active_opts
|
521
|
+
|
497
522
|
nest(exp)
|
498
523
|
end
|
499
524
|
|
500
|
-
def
|
525
|
+
def close_group
|
501
526
|
@nesting.pop
|
527
|
+
@options_stack.pop unless @switching_options
|
528
|
+
@switching_options = false
|
502
529
|
|
503
530
|
@node = @nesting.last
|
504
531
|
@node = @node.last if @node.last and @node.last.is_a?(Alternation)
|
505
532
|
end
|
506
533
|
|
507
|
-
def
|
534
|
+
def open_set(token)
|
508
535
|
token.token = :character
|
509
536
|
|
510
537
|
if token.type == :subset
|
511
|
-
@set << CharacterSubSet.new(token)
|
538
|
+
@set << CharacterSubSet.new(token, active_opts)
|
512
539
|
else
|
513
|
-
@node << (@set = CharacterSet.new(token))
|
540
|
+
@node << (@set = CharacterSet.new(token, active_opts))
|
514
541
|
end
|
515
542
|
end
|
516
543
|
|
517
|
-
def
|
544
|
+
def negate_set
|
518
545
|
@set.negate
|
519
546
|
end
|
520
547
|
|
521
|
-
def
|
548
|
+
def append_set(token)
|
522
549
|
@set << token.text
|
523
550
|
end
|
524
551
|
|
525
|
-
def
|
552
|
+
def close_set(token)
|
526
553
|
@set.close
|
527
554
|
end
|
528
555
|
|
556
|
+
def active_opts
|
557
|
+
@options_stack.last
|
558
|
+
end
|
529
559
|
end # module Regexp::Parser
|