regexp_parser 2.6.0 → 2.10.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +5 -5
- data/LICENSE +1 -1
- data/lib/regexp_parser/expression/base.rb +0 -7
- data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
- data/lib/regexp_parser/expression/classes/backreference.rb +5 -10
- data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
- data/lib/regexp_parser/expression/classes/character_set.rb +4 -8
- data/lib/regexp_parser/expression/classes/conditional.rb +2 -20
- data/lib/regexp_parser/expression/classes/escape_sequence.rb +21 -91
- data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
- data/lib/regexp_parser/expression/classes/group.rb +0 -22
- data/lib/regexp_parser/expression/classes/keep.rb +1 -1
- data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
- data/lib/regexp_parser/expression/classes/unicode_property.rb +11 -11
- data/lib/regexp_parser/expression/methods/construct.rb +2 -4
- data/lib/regexp_parser/expression/methods/escape_sequence_char.rb +5 -0
- data/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb +68 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +8 -4
- data/lib/regexp_parser/expression/methods/negative.rb +20 -0
- data/lib/regexp_parser/expression/methods/parts.rb +23 -0
- data/lib/regexp_parser/expression/methods/printing.rb +26 -0
- data/lib/regexp_parser/expression/methods/referenced_expressions.rb +28 -0
- data/lib/regexp_parser/expression/methods/tests.rb +40 -3
- data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
- data/lib/regexp_parser/expression/quantifier.rb +30 -17
- data/lib/regexp_parser/expression/sequence.rb +5 -10
- data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
- data/lib/regexp_parser/expression/shared.rb +37 -20
- data/lib/regexp_parser/expression/subexpression.rb +20 -15
- data/lib/regexp_parser/expression.rb +37 -31
- data/lib/regexp_parser/lexer.rb +76 -36
- data/lib/regexp_parser/parser.rb +107 -103
- data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
- data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
- data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
- data/lib/regexp_parser/scanner/properties/long.csv +29 -0
- data/lib/regexp_parser/scanner/properties/short.csv +3 -0
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +101 -172
- data/lib/regexp_parser/scanner.rb +1171 -1365
- data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
- data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
- data/lib/regexp_parser/syntax/token/escape.rb +3 -1
- data/lib/regexp_parser/syntax/token/meta.rb +9 -2
- data/lib/regexp_parser/syntax/token/unicode_property.rb +35 -1
- data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
- data/lib/regexp_parser/syntax/token.rb +13 -13
- data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
- data/lib/regexp_parser/syntax/versions.rb +3 -1
- data/lib/regexp_parser/syntax.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +6 -6
- data/regexp_parser.gemspec +5 -5
- metadata +17 -8
- data/CHANGELOG.md +0 -601
- data/README.md +0 -503
data/lib/regexp_parser/lexer.rb
CHANGED
@@ -6,57 +6,75 @@ class Regexp::Lexer
|
|
6
6
|
|
7
7
|
OPENING_TOKENS = %i[
|
8
8
|
capture passive lookahead nlookahead lookbehind nlookbehind
|
9
|
-
atomic options options_switch named absence
|
9
|
+
atomic options options_switch named absence open
|
10
10
|
].freeze
|
11
11
|
|
12
12
|
CLOSING_TOKENS = %i[close].freeze
|
13
13
|
|
14
14
|
CONDITION_TOKENS = %i[condition condition_close].freeze
|
15
15
|
|
16
|
-
def self.lex(input, syntax =
|
17
|
-
new.lex(input, syntax, options: options, &block)
|
16
|
+
def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
|
17
|
+
new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
|
18
18
|
end
|
19
19
|
|
20
|
-
def lex(input, syntax =
|
21
|
-
syntax = Regexp::Syntax.for(syntax)
|
20
|
+
def lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
|
21
|
+
syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT
|
22
22
|
|
23
|
+
self.block = block
|
24
|
+
self.collect_tokens = collect_tokens
|
23
25
|
self.tokens = []
|
26
|
+
self.prev_token = nil
|
27
|
+
self.preprev_token = nil
|
24
28
|
self.nesting = 0
|
25
29
|
self.set_nesting = 0
|
26
30
|
self.conditional_nesting = 0
|
27
31
|
self.shift = 0
|
28
32
|
|
29
|
-
|
30
|
-
Regexp::Scanner.scan(input, options: options) do |type, token, text, ts, te|
|
33
|
+
Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te|
|
31
34
|
type, token = *syntax.normalize(type, token)
|
32
35
|
syntax.check! type, token
|
33
36
|
|
34
37
|
ascend(type, token)
|
35
38
|
|
36
|
-
if
|
37
|
-
|
38
|
-
|
39
|
+
if (last = prev_token) &&
|
40
|
+
type == :quantifier &&
|
41
|
+
(
|
42
|
+
(last.type == :literal && (parts = break_literal(last))) ||
|
43
|
+
(last.token == :codepoint_list && (parts = break_codepoint_list(last)))
|
44
|
+
)
|
45
|
+
emit(parts[0])
|
46
|
+
last = parts[1]
|
39
47
|
end
|
40
48
|
|
41
49
|
current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
|
42
50
|
nesting, set_nesting, conditional_nesting)
|
43
51
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
52
|
+
if type == :conditional && CONDITION_TOKENS.include?(token)
|
53
|
+
current = merge_condition(current, last)
|
54
|
+
elsif last
|
55
|
+
last.next = current
|
56
|
+
current.previous = last
|
57
|
+
emit(last)
|
58
|
+
end
|
49
59
|
|
50
|
-
|
51
|
-
|
60
|
+
self.preprev_token = last
|
61
|
+
self.prev_token = current
|
52
62
|
|
53
63
|
descend(type, token)
|
54
64
|
end
|
55
65
|
|
56
|
-
if
|
57
|
-
|
66
|
+
emit(prev_token) if prev_token
|
67
|
+
|
68
|
+
collect_tokens ? tokens : nil
|
69
|
+
end
|
70
|
+
|
71
|
+
def emit(token)
|
72
|
+
if block
|
73
|
+
# TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
|
74
|
+
res = block.call(token)
|
75
|
+
tokens << res if collect_tokens
|
58
76
|
else
|
59
|
-
tokens
|
77
|
+
tokens << token
|
60
78
|
end
|
61
79
|
end
|
62
80
|
|
@@ -66,27 +84,37 @@ class Regexp::Lexer
|
|
66
84
|
|
67
85
|
private
|
68
86
|
|
69
|
-
attr_accessor :
|
87
|
+
attr_accessor :block,
|
88
|
+
:collect_tokens, :tokens, :prev_token, :preprev_token,
|
89
|
+
:nesting, :set_nesting, :conditional_nesting, :shift
|
70
90
|
|
71
91
|
def ascend(type, token)
|
92
|
+
return unless CLOSING_TOKENS.include?(token)
|
93
|
+
|
72
94
|
case type
|
73
95
|
when :group, :assertion
|
74
|
-
self.nesting = nesting - 1
|
96
|
+
self.nesting = nesting - 1
|
75
97
|
when :set
|
76
|
-
self.set_nesting = set_nesting - 1
|
98
|
+
self.set_nesting = set_nesting - 1
|
77
99
|
when :conditional
|
78
|
-
self.conditional_nesting = conditional_nesting - 1
|
100
|
+
self.conditional_nesting = conditional_nesting - 1
|
101
|
+
else
|
102
|
+
raise "unhandled nesting type #{type}"
|
79
103
|
end
|
80
104
|
end
|
81
105
|
|
82
106
|
def descend(type, token)
|
107
|
+
return unless OPENING_TOKENS.include?(token)
|
108
|
+
|
83
109
|
case type
|
84
110
|
when :group, :assertion
|
85
|
-
self.nesting = nesting + 1
|
111
|
+
self.nesting = nesting + 1
|
86
112
|
when :set
|
87
|
-
self.set_nesting = set_nesting + 1
|
113
|
+
self.set_nesting = set_nesting + 1
|
88
114
|
when :conditional
|
89
|
-
self.conditional_nesting = conditional_nesting + 1
|
115
|
+
self.conditional_nesting = conditional_nesting + 1
|
116
|
+
else
|
117
|
+
raise "unhandled nesting type #{type}"
|
90
118
|
end
|
91
119
|
end
|
92
120
|
|
@@ -96,34 +124,46 @@ class Regexp::Lexer
|
|
96
124
|
lead, last, _ = token.text.partition(/.\z/mu)
|
97
125
|
return if lead.empty?
|
98
126
|
|
99
|
-
|
100
|
-
tokens << Regexp::Token.new(:literal, :literal, lead,
|
127
|
+
token_1 = Regexp::Token.new(:literal, :literal, lead,
|
101
128
|
token.ts, (token.te - last.length),
|
102
129
|
nesting, set_nesting, conditional_nesting)
|
103
|
-
|
130
|
+
token_2 = Regexp::Token.new(:literal, :literal, last,
|
104
131
|
(token.ts + lead.length), token.te,
|
105
132
|
nesting, set_nesting, conditional_nesting)
|
133
|
+
|
134
|
+
token_1.previous = preprev_token
|
135
|
+
token_1.next = token_2
|
136
|
+
token_2.previous = token_1 # .next will be set by #lex
|
137
|
+
[token_1, token_2]
|
106
138
|
end
|
107
139
|
|
140
|
+
# if a codepoint list is followed by a quantifier, that quantifier applies
|
141
|
+
# to the last codepoint, e.g. /\u{61 62 63}{3}/ =~ 'abccc'
|
142
|
+
# c.f. #break_literal.
|
108
143
|
def break_codepoint_list(token)
|
109
144
|
lead, _, tail = token.text.rpartition(' ')
|
110
145
|
return if lead.empty?
|
111
146
|
|
112
|
-
|
113
|
-
tokens << Regexp::Token.new(:escape, :codepoint_list, lead + '}',
|
147
|
+
token_1 = Regexp::Token.new(:escape, :codepoint_list, lead + '}',
|
114
148
|
token.ts, (token.te - tail.length),
|
115
149
|
nesting, set_nesting, conditional_nesting)
|
116
|
-
|
150
|
+
token_2 = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
|
117
151
|
(token.ts + lead.length + 1), (token.te + 3),
|
118
152
|
nesting, set_nesting, conditional_nesting)
|
119
153
|
|
120
154
|
self.shift = shift + 3 # one space less, but extra \, u, {, and }
|
155
|
+
|
156
|
+
token_1.previous = preprev_token
|
157
|
+
token_1.next = token_2
|
158
|
+
token_2.previous = token_1 # .next will be set by #lex
|
159
|
+
[token_1, token_2]
|
121
160
|
end
|
122
161
|
|
123
|
-
def merge_condition(current)
|
124
|
-
|
125
|
-
Regexp::Token.new(:conditional, :condition, last.text + current.text,
|
162
|
+
def merge_condition(current, last)
|
163
|
+
token = Regexp::Token.new(:conditional, :condition, last.text + current.text,
|
126
164
|
last.ts, current.te, nesting, set_nesting, conditional_nesting)
|
165
|
+
token.previous = preprev_token # .next will be set by #lex
|
166
|
+
token
|
127
167
|
end
|
128
168
|
|
129
169
|
end # module Regexp::Lexer
|
data/lib/regexp_parser/parser.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
require_relative 'error'
|
2
|
+
require_relative 'expression'
|
3
3
|
|
4
4
|
class Regexp::Parser
|
5
5
|
include Regexp::Expression
|
@@ -18,11 +18,11 @@ class Regexp::Parser
|
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
21
|
-
def self.parse(input, syntax =
|
21
|
+
def self.parse(input, syntax = nil, options: nil, &block)
|
22
22
|
new.parse(input, syntax, options: options, &block)
|
23
23
|
end
|
24
24
|
|
25
|
-
def parse(input, syntax =
|
25
|
+
def parse(input, syntax = nil, options: nil, &block)
|
26
26
|
root = Root.construct(options: extract_options(input, options))
|
27
27
|
|
28
28
|
self.root = root
|
@@ -35,7 +35,7 @@ class Regexp::Parser
|
|
35
35
|
|
36
36
|
self.captured_group_counts = Hash.new(0)
|
37
37
|
|
38
|
-
Regexp::Lexer.scan(input, syntax, options: options) do |token|
|
38
|
+
Regexp::Lexer.scan(input, syntax, options: options, collect_tokens: false) do |token|
|
39
39
|
parse_token(token)
|
40
40
|
end
|
41
41
|
|
@@ -232,7 +232,7 @@ class Regexp::Parser
|
|
232
232
|
node << Backreference::NameRecursionLevel.new(token, active_opts)
|
233
233
|
when :name_call
|
234
234
|
node << Backreference::NameCall.new(token, active_opts)
|
235
|
-
when :number, :number_ref
|
235
|
+
when :number, :number_ref # TODO: split in v3.0.0
|
236
236
|
node << Backreference::Number.new(token, active_opts)
|
237
237
|
when :number_recursion_ref
|
238
238
|
node << Backreference::NumberRecursionLevel.new(token, active_opts).tap do |exp|
|
@@ -272,9 +272,9 @@ class Regexp::Parser
|
|
272
272
|
nest_conditional(Conditional::Expression.new(token, active_opts))
|
273
273
|
when :condition
|
274
274
|
conditional_nesting.last.condition = Conditional::Condition.new(token, active_opts)
|
275
|
-
conditional_nesting.last.add_sequence(active_opts)
|
275
|
+
conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
|
276
276
|
when :separator
|
277
|
-
conditional_nesting.last.add_sequence(active_opts)
|
277
|
+
conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
|
278
278
|
self.node = conditional_nesting.last.branches.last
|
279
279
|
when :close
|
280
280
|
conditional_nesting.pop
|
@@ -322,6 +322,7 @@ class Regexp::Parser
|
|
322
322
|
|
323
323
|
when :control
|
324
324
|
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
325
|
+
# TODO: emit :meta_control_sequence token in v3.0.0
|
325
326
|
node << EscapeSequence::MetaControl.new(token, active_opts)
|
326
327
|
else
|
327
328
|
node << EscapeSequence::Control.new(token, active_opts)
|
@@ -329,6 +330,7 @@ class Regexp::Parser
|
|
329
330
|
|
330
331
|
when :meta_sequence
|
331
332
|
if token.text =~ /\A\\M-\\[Cc]/
|
333
|
+
# TODO: emit :meta_control_sequence token in v3.0.0
|
332
334
|
node << EscapeSequence::MetaControl.new(token, active_opts)
|
333
335
|
else
|
334
336
|
node << EscapeSequence::Meta.new(token, active_opts)
|
@@ -349,11 +351,7 @@ class Regexp::Parser
|
|
349
351
|
when :comment
|
350
352
|
node << Comment.new(token, active_opts)
|
351
353
|
when :whitespace
|
352
|
-
|
353
|
-
node.last.merge(WhiteSpace.new(token, active_opts))
|
354
|
-
else
|
355
|
-
node << WhiteSpace.new(token, active_opts)
|
356
|
-
end
|
354
|
+
node << WhiteSpace.new(token, active_opts)
|
357
355
|
else
|
358
356
|
raise UnknownTokenError.new('FreeSpace', token)
|
359
357
|
end
|
@@ -379,98 +377,99 @@ class Regexp::Parser
|
|
379
377
|
end
|
380
378
|
|
381
379
|
def sequence_operation(klass, token)
|
382
|
-
unless node.
|
380
|
+
unless node.instance_of?(klass)
|
383
381
|
operator = klass.new(token, active_opts)
|
384
|
-
sequence = operator.add_sequence(active_opts)
|
382
|
+
sequence = operator.add_sequence(active_opts, { ts: token.ts })
|
385
383
|
sequence.expressions = node.expressions
|
386
384
|
node.expressions = []
|
387
385
|
nest(operator)
|
388
386
|
end
|
389
|
-
node.add_sequence(active_opts)
|
387
|
+
node.add_sequence(active_opts, { ts: token.te })
|
390
388
|
end
|
391
389
|
|
392
390
|
def posixclass(token)
|
393
391
|
node << PosixClass.new(token, active_opts)
|
394
392
|
end
|
395
393
|
|
396
|
-
|
397
|
-
UPTokens = Regexp::Syntax::Token::
|
394
|
+
UP = Regexp::Expression::Property
|
395
|
+
UPTokens = Regexp::Syntax::Token::Property
|
398
396
|
|
399
397
|
def property(token)
|
400
398
|
case token.token
|
401
|
-
when :alnum; node << Alnum.new(token, active_opts)
|
402
|
-
when :alpha; node << Alpha.new(token, active_opts)
|
403
|
-
when :ascii; node << Ascii.new(token, active_opts)
|
404
|
-
when :blank; node << Blank.new(token, active_opts)
|
405
|
-
when :cntrl; node << Cntrl.new(token, active_opts)
|
406
|
-
when :digit; node << Digit.new(token, active_opts)
|
407
|
-
when :graph; node << Graph.new(token, active_opts)
|
408
|
-
when :lower; node << Lower.new(token, active_opts)
|
409
|
-
when :print; node << Print.new(token, active_opts)
|
410
|
-
when :punct; node << Punct.new(token, active_opts)
|
411
|
-
when :space; node << Space.new(token, active_opts)
|
412
|
-
when :upper; node << Upper.new(token, active_opts)
|
413
|
-
when :word; node << Word.new(token, active_opts)
|
414
|
-
when :xdigit; node << Xdigit.new(token, active_opts)
|
415
|
-
when :xposixpunct; node << XPosixPunct.new(token, active_opts)
|
399
|
+
when :alnum; node << UP::Alnum.new(token, active_opts)
|
400
|
+
when :alpha; node << UP::Alpha.new(token, active_opts)
|
401
|
+
when :ascii; node << UP::Ascii.new(token, active_opts)
|
402
|
+
when :blank; node << UP::Blank.new(token, active_opts)
|
403
|
+
when :cntrl; node << UP::Cntrl.new(token, active_opts)
|
404
|
+
when :digit; node << UP::Digit.new(token, active_opts)
|
405
|
+
when :graph; node << UP::Graph.new(token, active_opts)
|
406
|
+
when :lower; node << UP::Lower.new(token, active_opts)
|
407
|
+
when :print; node << UP::Print.new(token, active_opts)
|
408
|
+
when :punct; node << UP::Punct.new(token, active_opts)
|
409
|
+
when :space; node << UP::Space.new(token, active_opts)
|
410
|
+
when :upper; node << UP::Upper.new(token, active_opts)
|
411
|
+
when :word; node << UP::Word.new(token, active_opts)
|
412
|
+
when :xdigit; node << UP::Xdigit.new(token, active_opts)
|
413
|
+
when :xposixpunct; node << UP::XPosixPunct.new(token, active_opts)
|
416
414
|
|
417
415
|
# only in Oniguruma (old rubies)
|
418
|
-
when :newline; node << Newline.new(token, active_opts)
|
419
|
-
|
420
|
-
when :any; node << Any.new(token, active_opts)
|
421
|
-
when :assigned; node << Assigned.new(token, active_opts)
|
422
|
-
|
423
|
-
when :letter; node << Letter::Any.new(token, active_opts)
|
424
|
-
when :cased_letter; node << Letter::Cased.new(token, active_opts)
|
425
|
-
when :uppercase_letter; node << Letter::Uppercase.new(token, active_opts)
|
426
|
-
when :lowercase_letter; node << Letter::Lowercase.new(token, active_opts)
|
427
|
-
when :titlecase_letter; node << Letter::Titlecase.new(token, active_opts)
|
428
|
-
when :modifier_letter; node << Letter::Modifier.new(token, active_opts)
|
429
|
-
when :other_letter; node << Letter::Other.new(token, active_opts)
|
430
|
-
|
431
|
-
when :mark; node << Mark::Any.new(token, active_opts)
|
432
|
-
when :combining_mark; node << Mark::Combining.new(token, active_opts)
|
433
|
-
when :nonspacing_mark; node << Mark::Nonspacing.new(token, active_opts)
|
434
|
-
when :spacing_mark; node << Mark::Spacing.new(token, active_opts)
|
435
|
-
when :enclosing_mark; node << Mark::Enclosing.new(token, active_opts)
|
436
|
-
|
437
|
-
when :number; node << Number::Any.new(token, active_opts)
|
438
|
-
when :decimal_number; node << Number::Decimal.new(token, active_opts)
|
439
|
-
when :letter_number; node << Number::Letter.new(token, active_opts)
|
440
|
-
when :other_number; node << Number::Other.new(token, active_opts)
|
441
|
-
|
442
|
-
when :punctuation; node << Punctuation::Any.new(token, active_opts)
|
443
|
-
when :connector_punctuation; node << Punctuation::Connector.new(token, active_opts)
|
444
|
-
when :dash_punctuation; node << Punctuation::Dash.new(token, active_opts)
|
445
|
-
when :open_punctuation; node << Punctuation::Open.new(token, active_opts)
|
446
|
-
when :close_punctuation; node << Punctuation::Close.new(token, active_opts)
|
447
|
-
when :initial_punctuation; node << Punctuation::Initial.new(token, active_opts)
|
448
|
-
when :final_punctuation; node << Punctuation::Final.new(token, active_opts)
|
449
|
-
when :other_punctuation; node << Punctuation::Other.new(token, active_opts)
|
450
|
-
|
451
|
-
when :separator; node << Separator::Any.new(token, active_opts)
|
452
|
-
when :space_separator; node << Separator::Space.new(token, active_opts)
|
453
|
-
when :line_separator; node << Separator::Line.new(token, active_opts)
|
454
|
-
when :paragraph_separator; node << Separator::Paragraph.new(token, active_opts)
|
455
|
-
|
456
|
-
when :symbol; node << Symbol::Any.new(token, active_opts)
|
457
|
-
when :math_symbol; node << Symbol::Math.new(token, active_opts)
|
458
|
-
when :currency_symbol; node << Symbol::Currency.new(token, active_opts)
|
459
|
-
when :modifier_symbol; node << Symbol::Modifier.new(token, active_opts)
|
460
|
-
when :other_symbol; node << Symbol::Other.new(token, active_opts)
|
461
|
-
|
462
|
-
when :other; node << Codepoint::Any.new(token, active_opts)
|
463
|
-
when :control; node << Codepoint::Control.new(token, active_opts)
|
464
|
-
when :format; node << Codepoint::Format.new(token, active_opts)
|
465
|
-
when :surrogate; node << Codepoint::Surrogate.new(token, active_opts)
|
466
|
-
when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
|
467
|
-
when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
|
468
|
-
|
469
|
-
when *UPTokens::Age; node << Age.new(token, active_opts)
|
470
|
-
when *UPTokens::Derived; node << Derived.new(token, active_opts)
|
471
|
-
when *UPTokens::Emoji; node << Emoji.new(token, active_opts)
|
472
|
-
when *UPTokens::
|
473
|
-
when *UPTokens::
|
416
|
+
when :newline; node << UP::Newline.new(token, active_opts)
|
417
|
+
|
418
|
+
when :any; node << UP::Any.new(token, active_opts)
|
419
|
+
when :assigned; node << UP::Assigned.new(token, active_opts)
|
420
|
+
|
421
|
+
when :letter; node << UP::Letter::Any.new(token, active_opts)
|
422
|
+
when :cased_letter; node << UP::Letter::Cased.new(token, active_opts)
|
423
|
+
when :uppercase_letter; node << UP::Letter::Uppercase.new(token, active_opts)
|
424
|
+
when :lowercase_letter; node << UP::Letter::Lowercase.new(token, active_opts)
|
425
|
+
when :titlecase_letter; node << UP::Letter::Titlecase.new(token, active_opts)
|
426
|
+
when :modifier_letter; node << UP::Letter::Modifier.new(token, active_opts)
|
427
|
+
when :other_letter; node << UP::Letter::Other.new(token, active_opts)
|
428
|
+
|
429
|
+
when :mark; node << UP::Mark::Any.new(token, active_opts)
|
430
|
+
when :combining_mark; node << UP::Mark::Combining.new(token, active_opts)
|
431
|
+
when :nonspacing_mark; node << UP::Mark::Nonspacing.new(token, active_opts)
|
432
|
+
when :spacing_mark; node << UP::Mark::Spacing.new(token, active_opts)
|
433
|
+
when :enclosing_mark; node << UP::Mark::Enclosing.new(token, active_opts)
|
434
|
+
|
435
|
+
when :number; node << UP::Number::Any.new(token, active_opts)
|
436
|
+
when :decimal_number; node << UP::Number::Decimal.new(token, active_opts)
|
437
|
+
when :letter_number; node << UP::Number::Letter.new(token, active_opts)
|
438
|
+
when :other_number; node << UP::Number::Other.new(token, active_opts)
|
439
|
+
|
440
|
+
when :punctuation; node << UP::Punctuation::Any.new(token, active_opts)
|
441
|
+
when :connector_punctuation; node << UP::Punctuation::Connector.new(token, active_opts)
|
442
|
+
when :dash_punctuation; node << UP::Punctuation::Dash.new(token, active_opts)
|
443
|
+
when :open_punctuation; node << UP::Punctuation::Open.new(token, active_opts)
|
444
|
+
when :close_punctuation; node << UP::Punctuation::Close.new(token, active_opts)
|
445
|
+
when :initial_punctuation; node << UP::Punctuation::Initial.new(token, active_opts)
|
446
|
+
when :final_punctuation; node << UP::Punctuation::Final.new(token, active_opts)
|
447
|
+
when :other_punctuation; node << UP::Punctuation::Other.new(token, active_opts)
|
448
|
+
|
449
|
+
when :separator; node << UP::Separator::Any.new(token, active_opts)
|
450
|
+
when :space_separator; node << UP::Separator::Space.new(token, active_opts)
|
451
|
+
when :line_separator; node << UP::Separator::Line.new(token, active_opts)
|
452
|
+
when :paragraph_separator; node << UP::Separator::Paragraph.new(token, active_opts)
|
453
|
+
|
454
|
+
when :symbol; node << UP::Symbol::Any.new(token, active_opts)
|
455
|
+
when :math_symbol; node << UP::Symbol::Math.new(token, active_opts)
|
456
|
+
when :currency_symbol; node << UP::Symbol::Currency.new(token, active_opts)
|
457
|
+
when :modifier_symbol; node << UP::Symbol::Modifier.new(token, active_opts)
|
458
|
+
when :other_symbol; node << UP::Symbol::Other.new(token, active_opts)
|
459
|
+
|
460
|
+
when :other; node << UP::Codepoint::Any.new(token, active_opts)
|
461
|
+
when :control; node << UP::Codepoint::Control.new(token, active_opts)
|
462
|
+
when :format; node << UP::Codepoint::Format.new(token, active_opts)
|
463
|
+
when :surrogate; node << UP::Codepoint::Surrogate.new(token, active_opts)
|
464
|
+
when :private_use; node << UP::Codepoint::PrivateUse.new(token, active_opts)
|
465
|
+
when :unassigned; node << UP::Codepoint::Unassigned.new(token, active_opts)
|
466
|
+
|
467
|
+
when *UPTokens::Age; node << UP::Age.new(token, active_opts)
|
468
|
+
when *UPTokens::Derived; node << UP::Derived.new(token, active_opts)
|
469
|
+
when *UPTokens::Emoji; node << UP::Emoji.new(token, active_opts)
|
470
|
+
when *UPTokens::Enumerated; node << UP::Enumerated.new(token, active_opts)
|
471
|
+
when *UPTokens::Script; node << UP::Script.new(token, active_opts)
|
472
|
+
when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts)
|
474
473
|
|
475
474
|
else
|
476
475
|
raise UnknownTokenError.new('UnicodeProperty', token)
|
@@ -478,8 +477,7 @@ class Regexp::Parser
|
|
478
477
|
end
|
479
478
|
|
480
479
|
def quantifier(token)
|
481
|
-
target_node = node.
|
482
|
-
target_node or raise ParserError, "No valid target found for '#{token.text}'"
|
480
|
+
target_node = node.extract_quantifier_target(token.text)
|
483
481
|
|
484
482
|
# in case of chained quantifiers, wrap target in an implicit passive group
|
485
483
|
# description of the problem: https://github.com/ammar/regexp_parser/issues/3
|
@@ -527,6 +525,8 @@ class Regexp::Parser
|
|
527
525
|
end
|
528
526
|
|
529
527
|
def open_set(token)
|
528
|
+
# TODO: this and Quantifier are the only cases where Expression#token
|
529
|
+
# does not match the scanner/lexer output. Fix in v3.0.0.
|
530
530
|
token.token = :character
|
531
531
|
nest(CharacterSet.new(token, active_opts))
|
532
532
|
end
|
@@ -541,7 +541,7 @@ class Regexp::Parser
|
|
541
541
|
|
542
542
|
def range(token)
|
543
543
|
exp = CharacterSet::Range.new(token, active_opts)
|
544
|
-
scope = node.last.
|
544
|
+
scope = node.last.instance_of?(CharacterSet::IntersectedSequence) ? node.last : node
|
545
545
|
exp << scope.expressions.pop
|
546
546
|
nest(exp)
|
547
547
|
end
|
@@ -568,28 +568,32 @@ class Regexp::Parser
|
|
568
568
|
end
|
569
569
|
|
570
570
|
def close_completed_character_set_range
|
571
|
-
decrease_nesting if node.
|
571
|
+
decrease_nesting if node.instance_of?(CharacterSet::Range) && node.complete?
|
572
572
|
end
|
573
573
|
|
574
574
|
def active_opts
|
575
575
|
options_stack.last
|
576
576
|
end
|
577
577
|
|
578
|
-
# Assigns referenced expressions to
|
578
|
+
# Assigns referenced expressions to referring expressions, e.g. if there is
|
579
579
|
# an instance of Backreference::Number, its #referenced_expression is set to
|
580
580
|
# the instance of Group::Capture that it refers to via its number.
|
581
581
|
def assign_referenced_expressions
|
582
|
-
# find all
|
583
|
-
targets = { 0 => root }
|
582
|
+
# find all referenceable and referring expressions
|
583
|
+
targets = { 0 => [root] }
|
584
|
+
referrers = []
|
584
585
|
root.each_expression do |exp|
|
585
|
-
exp.
|
586
|
+
if exp.referential?
|
587
|
+
referrers << exp
|
588
|
+
elsif exp.is_a?(Group::Capture)
|
589
|
+
(targets[exp.identifier] ||= []) << exp
|
590
|
+
end
|
586
591
|
end
|
587
|
-
# assign
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
raise(ParserError, "Invalid reference: #{exp.reference}")
|
592
|
+
# assign referenced expressions to referring expressions
|
593
|
+
# (in a second iteration because there might be forward references)
|
594
|
+
referrers.each do |exp|
|
595
|
+
exp.referenced_expressions = targets[exp.reference] ||
|
596
|
+
raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}")
|
593
597
|
end
|
594
598
|
end
|
595
599
|
end # module Regexp::Parser
|
@@ -0,0 +1,63 @@
|
|
1
|
+
class Regexp::Scanner
|
2
|
+
# Base for all scanner validation errors
|
3
|
+
class ValidationError < ScannerError
|
4
|
+
# Centralizes and unifies the handling of validation related errors.
|
5
|
+
def self.for(type, problem, reason = nil)
|
6
|
+
types.fetch(type).new(problem, reason)
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.types
|
10
|
+
@types ||= {
|
11
|
+
backref: InvalidBackrefError,
|
12
|
+
group: InvalidGroupError,
|
13
|
+
group_option: InvalidGroupOption,
|
14
|
+
posix_class: UnknownPosixClassError,
|
15
|
+
property: UnknownUnicodePropertyError,
|
16
|
+
sequence: InvalidSequenceError,
|
17
|
+
}
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
# Invalid sequence format. Used for escape sequences, mainly.
|
22
|
+
class InvalidSequenceError < ValidationError
|
23
|
+
def initialize(what = 'sequence', where = '')
|
24
|
+
super "Invalid #{what} at #{where}"
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
# Invalid group. Used for named groups.
|
29
|
+
class InvalidGroupError < ValidationError
|
30
|
+
def initialize(what, reason)
|
31
|
+
super "Invalid #{what}, #{reason}."
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
# Invalid group option. Used for inline options.
|
36
|
+
# TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
|
37
|
+
class InvalidGroupOption < ValidationError
|
38
|
+
def initialize(option, text)
|
39
|
+
super "Invalid group option #{option} in #{text}"
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
# Invalid back reference. Used for name and number refs/calls.
|
44
|
+
class InvalidBackrefError < ValidationError
|
45
|
+
def initialize(what, reason)
|
46
|
+
super "Invalid back reference #{what}, #{reason}"
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
# The property name was not recognized by the scanner.
|
51
|
+
class UnknownUnicodePropertyError < ValidationError
|
52
|
+
def initialize(name, _)
|
53
|
+
super "Unknown unicode character property name #{name}"
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
# The POSIX class name was not recognized by the scanner.
|
58
|
+
class UnknownPosixClassError < ValidationError
|
59
|
+
def initialize(text, _)
|
60
|
+
super "Unknown POSIX class #{text}"
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|