regexp_parser 2.6.0 → 2.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +5 -5
- data/LICENSE +1 -1
- data/lib/regexp_parser/expression/base.rb +0 -7
- data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
- data/lib/regexp_parser/expression/classes/backreference.rb +5 -10
- data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
- data/lib/regexp_parser/expression/classes/character_set.rb +4 -8
- data/lib/regexp_parser/expression/classes/conditional.rb +2 -20
- data/lib/regexp_parser/expression/classes/escape_sequence.rb +21 -91
- data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
- data/lib/regexp_parser/expression/classes/group.rb +0 -22
- data/lib/regexp_parser/expression/classes/keep.rb +1 -1
- data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
- data/lib/regexp_parser/expression/classes/unicode_property.rb +11 -11
- data/lib/regexp_parser/expression/methods/construct.rb +2 -4
- data/lib/regexp_parser/expression/methods/escape_sequence_char.rb +5 -0
- data/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb +68 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +8 -4
- data/lib/regexp_parser/expression/methods/negative.rb +20 -0
- data/lib/regexp_parser/expression/methods/parts.rb +23 -0
- data/lib/regexp_parser/expression/methods/printing.rb +26 -0
- data/lib/regexp_parser/expression/methods/referenced_expressions.rb +28 -0
- data/lib/regexp_parser/expression/methods/tests.rb +40 -3
- data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
- data/lib/regexp_parser/expression/quantifier.rb +30 -17
- data/lib/regexp_parser/expression/sequence.rb +5 -10
- data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
- data/lib/regexp_parser/expression/shared.rb +37 -20
- data/lib/regexp_parser/expression/subexpression.rb +20 -15
- data/lib/regexp_parser/expression.rb +37 -31
- data/lib/regexp_parser/lexer.rb +76 -36
- data/lib/regexp_parser/parser.rb +107 -103
- data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
- data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
- data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
- data/lib/regexp_parser/scanner/properties/long.csv +29 -0
- data/lib/regexp_parser/scanner/properties/short.csv +3 -0
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +101 -172
- data/lib/regexp_parser/scanner.rb +1171 -1365
- data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
- data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
- data/lib/regexp_parser/syntax/token/escape.rb +3 -1
- data/lib/regexp_parser/syntax/token/meta.rb +9 -2
- data/lib/regexp_parser/syntax/token/unicode_property.rb +35 -1
- data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
- data/lib/regexp_parser/syntax/token.rb +13 -13
- data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
- data/lib/regexp_parser/syntax/versions.rb +3 -1
- data/lib/regexp_parser/syntax.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +6 -6
- data/regexp_parser.gemspec +5 -5
- metadata +17 -8
- data/CHANGELOG.md +0 -601
- data/README.md +0 -503
data/lib/regexp_parser/lexer.rb
CHANGED
@@ -6,57 +6,75 @@ class Regexp::Lexer
|
|
6
6
|
|
7
7
|
OPENING_TOKENS = %i[
|
8
8
|
capture passive lookahead nlookahead lookbehind nlookbehind
|
9
|
-
atomic options options_switch named absence
|
9
|
+
atomic options options_switch named absence open
|
10
10
|
].freeze
|
11
11
|
|
12
12
|
CLOSING_TOKENS = %i[close].freeze
|
13
13
|
|
14
14
|
CONDITION_TOKENS = %i[condition condition_close].freeze
|
15
15
|
|
16
|
-
def self.lex(input, syntax =
|
17
|
-
new.lex(input, syntax, options: options, &block)
|
16
|
+
def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
|
17
|
+
new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
|
18
18
|
end
|
19
19
|
|
20
|
-
def lex(input, syntax =
|
21
|
-
syntax = Regexp::Syntax.for(syntax)
|
20
|
+
def lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
|
21
|
+
syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT
|
22
22
|
|
23
|
+
self.block = block
|
24
|
+
self.collect_tokens = collect_tokens
|
23
25
|
self.tokens = []
|
26
|
+
self.prev_token = nil
|
27
|
+
self.preprev_token = nil
|
24
28
|
self.nesting = 0
|
25
29
|
self.set_nesting = 0
|
26
30
|
self.conditional_nesting = 0
|
27
31
|
self.shift = 0
|
28
32
|
|
29
|
-
|
30
|
-
Regexp::Scanner.scan(input, options: options) do |type, token, text, ts, te|
|
33
|
+
Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te|
|
31
34
|
type, token = *syntax.normalize(type, token)
|
32
35
|
syntax.check! type, token
|
33
36
|
|
34
37
|
ascend(type, token)
|
35
38
|
|
36
|
-
if
|
37
|
-
|
38
|
-
|
39
|
+
if (last = prev_token) &&
|
40
|
+
type == :quantifier &&
|
41
|
+
(
|
42
|
+
(last.type == :literal && (parts = break_literal(last))) ||
|
43
|
+
(last.token == :codepoint_list && (parts = break_codepoint_list(last)))
|
44
|
+
)
|
45
|
+
emit(parts[0])
|
46
|
+
last = parts[1]
|
39
47
|
end
|
40
48
|
|
41
49
|
current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
|
42
50
|
nesting, set_nesting, conditional_nesting)
|
43
51
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
52
|
+
if type == :conditional && CONDITION_TOKENS.include?(token)
|
53
|
+
current = merge_condition(current, last)
|
54
|
+
elsif last
|
55
|
+
last.next = current
|
56
|
+
current.previous = last
|
57
|
+
emit(last)
|
58
|
+
end
|
49
59
|
|
50
|
-
|
51
|
-
|
60
|
+
self.preprev_token = last
|
61
|
+
self.prev_token = current
|
52
62
|
|
53
63
|
descend(type, token)
|
54
64
|
end
|
55
65
|
|
56
|
-
if
|
57
|
-
|
66
|
+
emit(prev_token) if prev_token
|
67
|
+
|
68
|
+
collect_tokens ? tokens : nil
|
69
|
+
end
|
70
|
+
|
71
|
+
def emit(token)
|
72
|
+
if block
|
73
|
+
# TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
|
74
|
+
res = block.call(token)
|
75
|
+
tokens << res if collect_tokens
|
58
76
|
else
|
59
|
-
tokens
|
77
|
+
tokens << token
|
60
78
|
end
|
61
79
|
end
|
62
80
|
|
@@ -66,27 +84,37 @@ class Regexp::Lexer
|
|
66
84
|
|
67
85
|
private
|
68
86
|
|
69
|
-
attr_accessor :
|
87
|
+
attr_accessor :block,
|
88
|
+
:collect_tokens, :tokens, :prev_token, :preprev_token,
|
89
|
+
:nesting, :set_nesting, :conditional_nesting, :shift
|
70
90
|
|
71
91
|
def ascend(type, token)
|
92
|
+
return unless CLOSING_TOKENS.include?(token)
|
93
|
+
|
72
94
|
case type
|
73
95
|
when :group, :assertion
|
74
|
-
self.nesting = nesting - 1
|
96
|
+
self.nesting = nesting - 1
|
75
97
|
when :set
|
76
|
-
self.set_nesting = set_nesting - 1
|
98
|
+
self.set_nesting = set_nesting - 1
|
77
99
|
when :conditional
|
78
|
-
self.conditional_nesting = conditional_nesting - 1
|
100
|
+
self.conditional_nesting = conditional_nesting - 1
|
101
|
+
else
|
102
|
+
raise "unhandled nesting type #{type}"
|
79
103
|
end
|
80
104
|
end
|
81
105
|
|
82
106
|
def descend(type, token)
|
107
|
+
return unless OPENING_TOKENS.include?(token)
|
108
|
+
|
83
109
|
case type
|
84
110
|
when :group, :assertion
|
85
|
-
self.nesting = nesting + 1
|
111
|
+
self.nesting = nesting + 1
|
86
112
|
when :set
|
87
|
-
self.set_nesting = set_nesting + 1
|
113
|
+
self.set_nesting = set_nesting + 1
|
88
114
|
when :conditional
|
89
|
-
self.conditional_nesting = conditional_nesting + 1
|
115
|
+
self.conditional_nesting = conditional_nesting + 1
|
116
|
+
else
|
117
|
+
raise "unhandled nesting type #{type}"
|
90
118
|
end
|
91
119
|
end
|
92
120
|
|
@@ -96,34 +124,46 @@ class Regexp::Lexer
|
|
96
124
|
lead, last, _ = token.text.partition(/.\z/mu)
|
97
125
|
return if lead.empty?
|
98
126
|
|
99
|
-
|
100
|
-
tokens << Regexp::Token.new(:literal, :literal, lead,
|
127
|
+
token_1 = Regexp::Token.new(:literal, :literal, lead,
|
101
128
|
token.ts, (token.te - last.length),
|
102
129
|
nesting, set_nesting, conditional_nesting)
|
103
|
-
|
130
|
+
token_2 = Regexp::Token.new(:literal, :literal, last,
|
104
131
|
(token.ts + lead.length), token.te,
|
105
132
|
nesting, set_nesting, conditional_nesting)
|
133
|
+
|
134
|
+
token_1.previous = preprev_token
|
135
|
+
token_1.next = token_2
|
136
|
+
token_2.previous = token_1 # .next will be set by #lex
|
137
|
+
[token_1, token_2]
|
106
138
|
end
|
107
139
|
|
140
|
+
# if a codepoint list is followed by a quantifier, that quantifier applies
|
141
|
+
# to the last codepoint, e.g. /\u{61 62 63}{3}/ =~ 'abccc'
|
142
|
+
# c.f. #break_literal.
|
108
143
|
def break_codepoint_list(token)
|
109
144
|
lead, _, tail = token.text.rpartition(' ')
|
110
145
|
return if lead.empty?
|
111
146
|
|
112
|
-
|
113
|
-
tokens << Regexp::Token.new(:escape, :codepoint_list, lead + '}',
|
147
|
+
token_1 = Regexp::Token.new(:escape, :codepoint_list, lead + '}',
|
114
148
|
token.ts, (token.te - tail.length),
|
115
149
|
nesting, set_nesting, conditional_nesting)
|
116
|
-
|
150
|
+
token_2 = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
|
117
151
|
(token.ts + lead.length + 1), (token.te + 3),
|
118
152
|
nesting, set_nesting, conditional_nesting)
|
119
153
|
|
120
154
|
self.shift = shift + 3 # one space less, but extra \, u, {, and }
|
155
|
+
|
156
|
+
token_1.previous = preprev_token
|
157
|
+
token_1.next = token_2
|
158
|
+
token_2.previous = token_1 # .next will be set by #lex
|
159
|
+
[token_1, token_2]
|
121
160
|
end
|
122
161
|
|
123
|
-
def merge_condition(current)
|
124
|
-
|
125
|
-
Regexp::Token.new(:conditional, :condition, last.text + current.text,
|
162
|
+
def merge_condition(current, last)
|
163
|
+
token = Regexp::Token.new(:conditional, :condition, last.text + current.text,
|
126
164
|
last.ts, current.te, nesting, set_nesting, conditional_nesting)
|
165
|
+
token.previous = preprev_token # .next will be set by #lex
|
166
|
+
token
|
127
167
|
end
|
128
168
|
|
129
169
|
end # module Regexp::Lexer
|
data/lib/regexp_parser/parser.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
require_relative 'error'
|
2
|
+
require_relative 'expression'
|
3
3
|
|
4
4
|
class Regexp::Parser
|
5
5
|
include Regexp::Expression
|
@@ -18,11 +18,11 @@ class Regexp::Parser
|
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
21
|
-
def self.parse(input, syntax =
|
21
|
+
def self.parse(input, syntax = nil, options: nil, &block)
|
22
22
|
new.parse(input, syntax, options: options, &block)
|
23
23
|
end
|
24
24
|
|
25
|
-
def parse(input, syntax =
|
25
|
+
def parse(input, syntax = nil, options: nil, &block)
|
26
26
|
root = Root.construct(options: extract_options(input, options))
|
27
27
|
|
28
28
|
self.root = root
|
@@ -35,7 +35,7 @@ class Regexp::Parser
|
|
35
35
|
|
36
36
|
self.captured_group_counts = Hash.new(0)
|
37
37
|
|
38
|
-
Regexp::Lexer.scan(input, syntax, options: options) do |token|
|
38
|
+
Regexp::Lexer.scan(input, syntax, options: options, collect_tokens: false) do |token|
|
39
39
|
parse_token(token)
|
40
40
|
end
|
41
41
|
|
@@ -232,7 +232,7 @@ class Regexp::Parser
|
|
232
232
|
node << Backreference::NameRecursionLevel.new(token, active_opts)
|
233
233
|
when :name_call
|
234
234
|
node << Backreference::NameCall.new(token, active_opts)
|
235
|
-
when :number, :number_ref
|
235
|
+
when :number, :number_ref # TODO: split in v3.0.0
|
236
236
|
node << Backreference::Number.new(token, active_opts)
|
237
237
|
when :number_recursion_ref
|
238
238
|
node << Backreference::NumberRecursionLevel.new(token, active_opts).tap do |exp|
|
@@ -272,9 +272,9 @@ class Regexp::Parser
|
|
272
272
|
nest_conditional(Conditional::Expression.new(token, active_opts))
|
273
273
|
when :condition
|
274
274
|
conditional_nesting.last.condition = Conditional::Condition.new(token, active_opts)
|
275
|
-
conditional_nesting.last.add_sequence(active_opts)
|
275
|
+
conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
|
276
276
|
when :separator
|
277
|
-
conditional_nesting.last.add_sequence(active_opts)
|
277
|
+
conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
|
278
278
|
self.node = conditional_nesting.last.branches.last
|
279
279
|
when :close
|
280
280
|
conditional_nesting.pop
|
@@ -322,6 +322,7 @@ class Regexp::Parser
|
|
322
322
|
|
323
323
|
when :control
|
324
324
|
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
325
|
+
# TODO: emit :meta_control_sequence token in v3.0.0
|
325
326
|
node << EscapeSequence::MetaControl.new(token, active_opts)
|
326
327
|
else
|
327
328
|
node << EscapeSequence::Control.new(token, active_opts)
|
@@ -329,6 +330,7 @@ class Regexp::Parser
|
|
329
330
|
|
330
331
|
when :meta_sequence
|
331
332
|
if token.text =~ /\A\\M-\\[Cc]/
|
333
|
+
# TODO: emit :meta_control_sequence token in v3.0.0:
|
332
334
|
node << EscapeSequence::MetaControl.new(token, active_opts)
|
333
335
|
else
|
334
336
|
node << EscapeSequence::Meta.new(token, active_opts)
|
@@ -349,11 +351,7 @@ class Regexp::Parser
|
|
349
351
|
when :comment
|
350
352
|
node << Comment.new(token, active_opts)
|
351
353
|
when :whitespace
|
352
|
-
|
353
|
-
node.last.merge(WhiteSpace.new(token, active_opts))
|
354
|
-
else
|
355
|
-
node << WhiteSpace.new(token, active_opts)
|
356
|
-
end
|
354
|
+
node << WhiteSpace.new(token, active_opts)
|
357
355
|
else
|
358
356
|
raise UnknownTokenError.new('FreeSpace', token)
|
359
357
|
end
|
@@ -379,98 +377,99 @@ class Regexp::Parser
|
|
379
377
|
end
|
380
378
|
|
381
379
|
def sequence_operation(klass, token)
|
382
|
-
unless node.
|
380
|
+
unless node.instance_of?(klass)
|
383
381
|
operator = klass.new(token, active_opts)
|
384
|
-
sequence = operator.add_sequence(active_opts)
|
382
|
+
sequence = operator.add_sequence(active_opts, { ts: token.ts })
|
385
383
|
sequence.expressions = node.expressions
|
386
384
|
node.expressions = []
|
387
385
|
nest(operator)
|
388
386
|
end
|
389
|
-
node.add_sequence(active_opts)
|
387
|
+
node.add_sequence(active_opts, { ts: token.te })
|
390
388
|
end
|
391
389
|
|
392
390
|
def posixclass(token)
|
393
391
|
node << PosixClass.new(token, active_opts)
|
394
392
|
end
|
395
393
|
|
396
|
-
|
397
|
-
UPTokens = Regexp::Syntax::Token::
|
394
|
+
UP = Regexp::Expression::Property
|
395
|
+
UPTokens = Regexp::Syntax::Token::Property
|
398
396
|
|
399
397
|
def property(token)
|
400
398
|
case token.token
|
401
|
-
when :alnum; node << Alnum.new(token, active_opts)
|
402
|
-
when :alpha; node << Alpha.new(token, active_opts)
|
403
|
-
when :ascii; node << Ascii.new(token, active_opts)
|
404
|
-
when :blank; node << Blank.new(token, active_opts)
|
405
|
-
when :cntrl; node << Cntrl.new(token, active_opts)
|
406
|
-
when :digit; node << Digit.new(token, active_opts)
|
407
|
-
when :graph; node << Graph.new(token, active_opts)
|
408
|
-
when :lower; node << Lower.new(token, active_opts)
|
409
|
-
when :print; node << Print.new(token, active_opts)
|
410
|
-
when :punct; node << Punct.new(token, active_opts)
|
411
|
-
when :space; node << Space.new(token, active_opts)
|
412
|
-
when :upper; node << Upper.new(token, active_opts)
|
413
|
-
when :word; node << Word.new(token, active_opts)
|
414
|
-
when :xdigit; node << Xdigit.new(token, active_opts)
|
415
|
-
when :xposixpunct; node << XPosixPunct.new(token, active_opts)
|
399
|
+
when :alnum; node << UP::Alnum.new(token, active_opts)
|
400
|
+
when :alpha; node << UP::Alpha.new(token, active_opts)
|
401
|
+
when :ascii; node << UP::Ascii.new(token, active_opts)
|
402
|
+
when :blank; node << UP::Blank.new(token, active_opts)
|
403
|
+
when :cntrl; node << UP::Cntrl.new(token, active_opts)
|
404
|
+
when :digit; node << UP::Digit.new(token, active_opts)
|
405
|
+
when :graph; node << UP::Graph.new(token, active_opts)
|
406
|
+
when :lower; node << UP::Lower.new(token, active_opts)
|
407
|
+
when :print; node << UP::Print.new(token, active_opts)
|
408
|
+
when :punct; node << UP::Punct.new(token, active_opts)
|
409
|
+
when :space; node << UP::Space.new(token, active_opts)
|
410
|
+
when :upper; node << UP::Upper.new(token, active_opts)
|
411
|
+
when :word; node << UP::Word.new(token, active_opts)
|
412
|
+
when :xdigit; node << UP::Xdigit.new(token, active_opts)
|
413
|
+
when :xposixpunct; node << UP::XPosixPunct.new(token, active_opts)
|
416
414
|
|
417
415
|
# only in Oniguruma (old rubies)
|
418
|
-
when :newline; node << Newline.new(token, active_opts)
|
419
|
-
|
420
|
-
when :any; node << Any.new(token, active_opts)
|
421
|
-
when :assigned; node << Assigned.new(token, active_opts)
|
422
|
-
|
423
|
-
when :letter; node << Letter::Any.new(token, active_opts)
|
424
|
-
when :cased_letter; node << Letter::Cased.new(token, active_opts)
|
425
|
-
when :uppercase_letter; node << Letter::Uppercase.new(token, active_opts)
|
426
|
-
when :lowercase_letter; node << Letter::Lowercase.new(token, active_opts)
|
427
|
-
when :titlecase_letter; node << Letter::Titlecase.new(token, active_opts)
|
428
|
-
when :modifier_letter; node << Letter::Modifier.new(token, active_opts)
|
429
|
-
when :other_letter; node << Letter::Other.new(token, active_opts)
|
430
|
-
|
431
|
-
when :mark; node << Mark::Any.new(token, active_opts)
|
432
|
-
when :combining_mark; node << Mark::Combining.new(token, active_opts)
|
433
|
-
when :nonspacing_mark; node << Mark::Nonspacing.new(token, active_opts)
|
434
|
-
when :spacing_mark; node << Mark::Spacing.new(token, active_opts)
|
435
|
-
when :enclosing_mark; node << Mark::Enclosing.new(token, active_opts)
|
436
|
-
|
437
|
-
when :number; node << Number::Any.new(token, active_opts)
|
438
|
-
when :decimal_number; node << Number::Decimal.new(token, active_opts)
|
439
|
-
when :letter_number; node << Number::Letter.new(token, active_opts)
|
440
|
-
when :other_number; node << Number::Other.new(token, active_opts)
|
441
|
-
|
442
|
-
when :punctuation; node << Punctuation::Any.new(token, active_opts)
|
443
|
-
when :connector_punctuation; node << Punctuation::Connector.new(token, active_opts)
|
444
|
-
when :dash_punctuation; node << Punctuation::Dash.new(token, active_opts)
|
445
|
-
when :open_punctuation; node << Punctuation::Open.new(token, active_opts)
|
446
|
-
when :close_punctuation; node << Punctuation::Close.new(token, active_opts)
|
447
|
-
when :initial_punctuation; node << Punctuation::Initial.new(token, active_opts)
|
448
|
-
when :final_punctuation; node << Punctuation::Final.new(token, active_opts)
|
449
|
-
when :other_punctuation; node << Punctuation::Other.new(token, active_opts)
|
450
|
-
|
451
|
-
when :separator; node << Separator::Any.new(token, active_opts)
|
452
|
-
when :space_separator; node << Separator::Space.new(token, active_opts)
|
453
|
-
when :line_separator; node << Separator::Line.new(token, active_opts)
|
454
|
-
when :paragraph_separator; node << Separator::Paragraph.new(token, active_opts)
|
455
|
-
|
456
|
-
when :symbol; node << Symbol::Any.new(token, active_opts)
|
457
|
-
when :math_symbol; node << Symbol::Math.new(token, active_opts)
|
458
|
-
when :currency_symbol; node << Symbol::Currency.new(token, active_opts)
|
459
|
-
when :modifier_symbol; node << Symbol::Modifier.new(token, active_opts)
|
460
|
-
when :other_symbol; node << Symbol::Other.new(token, active_opts)
|
461
|
-
|
462
|
-
when :other; node << Codepoint::Any.new(token, active_opts)
|
463
|
-
when :control; node << Codepoint::Control.new(token, active_opts)
|
464
|
-
when :format; node << Codepoint::Format.new(token, active_opts)
|
465
|
-
when :surrogate; node << Codepoint::Surrogate.new(token, active_opts)
|
466
|
-
when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
|
467
|
-
when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
|
468
|
-
|
469
|
-
when *UPTokens::Age; node << Age.new(token, active_opts)
|
470
|
-
when *UPTokens::Derived; node << Derived.new(token, active_opts)
|
471
|
-
when *UPTokens::Emoji; node << Emoji.new(token, active_opts)
|
472
|
-
when *UPTokens::
|
473
|
-
when *UPTokens::
|
416
|
+
when :newline; node << UP::Newline.new(token, active_opts)
|
417
|
+
|
418
|
+
when :any; node << UP::Any.new(token, active_opts)
|
419
|
+
when :assigned; node << UP::Assigned.new(token, active_opts)
|
420
|
+
|
421
|
+
when :letter; node << UP::Letter::Any.new(token, active_opts)
|
422
|
+
when :cased_letter; node << UP::Letter::Cased.new(token, active_opts)
|
423
|
+
when :uppercase_letter; node << UP::Letter::Uppercase.new(token, active_opts)
|
424
|
+
when :lowercase_letter; node << UP::Letter::Lowercase.new(token, active_opts)
|
425
|
+
when :titlecase_letter; node << UP::Letter::Titlecase.new(token, active_opts)
|
426
|
+
when :modifier_letter; node << UP::Letter::Modifier.new(token, active_opts)
|
427
|
+
when :other_letter; node << UP::Letter::Other.new(token, active_opts)
|
428
|
+
|
429
|
+
when :mark; node << UP::Mark::Any.new(token, active_opts)
|
430
|
+
when :combining_mark; node << UP::Mark::Combining.new(token, active_opts)
|
431
|
+
when :nonspacing_mark; node << UP::Mark::Nonspacing.new(token, active_opts)
|
432
|
+
when :spacing_mark; node << UP::Mark::Spacing.new(token, active_opts)
|
433
|
+
when :enclosing_mark; node << UP::Mark::Enclosing.new(token, active_opts)
|
434
|
+
|
435
|
+
when :number; node << UP::Number::Any.new(token, active_opts)
|
436
|
+
when :decimal_number; node << UP::Number::Decimal.new(token, active_opts)
|
437
|
+
when :letter_number; node << UP::Number::Letter.new(token, active_opts)
|
438
|
+
when :other_number; node << UP::Number::Other.new(token, active_opts)
|
439
|
+
|
440
|
+
when :punctuation; node << UP::Punctuation::Any.new(token, active_opts)
|
441
|
+
when :connector_punctuation; node << UP::Punctuation::Connector.new(token, active_opts)
|
442
|
+
when :dash_punctuation; node << UP::Punctuation::Dash.new(token, active_opts)
|
443
|
+
when :open_punctuation; node << UP::Punctuation::Open.new(token, active_opts)
|
444
|
+
when :close_punctuation; node << UP::Punctuation::Close.new(token, active_opts)
|
445
|
+
when :initial_punctuation; node << UP::Punctuation::Initial.new(token, active_opts)
|
446
|
+
when :final_punctuation; node << UP::Punctuation::Final.new(token, active_opts)
|
447
|
+
when :other_punctuation; node << UP::Punctuation::Other.new(token, active_opts)
|
448
|
+
|
449
|
+
when :separator; node << UP::Separator::Any.new(token, active_opts)
|
450
|
+
when :space_separator; node << UP::Separator::Space.new(token, active_opts)
|
451
|
+
when :line_separator; node << UP::Separator::Line.new(token, active_opts)
|
452
|
+
when :paragraph_separator; node << UP::Separator::Paragraph.new(token, active_opts)
|
453
|
+
|
454
|
+
when :symbol; node << UP::Symbol::Any.new(token, active_opts)
|
455
|
+
when :math_symbol; node << UP::Symbol::Math.new(token, active_opts)
|
456
|
+
when :currency_symbol; node << UP::Symbol::Currency.new(token, active_opts)
|
457
|
+
when :modifier_symbol; node << UP::Symbol::Modifier.new(token, active_opts)
|
458
|
+
when :other_symbol; node << UP::Symbol::Other.new(token, active_opts)
|
459
|
+
|
460
|
+
when :other; node << UP::Codepoint::Any.new(token, active_opts)
|
461
|
+
when :control; node << UP::Codepoint::Control.new(token, active_opts)
|
462
|
+
when :format; node << UP::Codepoint::Format.new(token, active_opts)
|
463
|
+
when :surrogate; node << UP::Codepoint::Surrogate.new(token, active_opts)
|
464
|
+
when :private_use; node << UP::Codepoint::PrivateUse.new(token, active_opts)
|
465
|
+
when :unassigned; node << UP::Codepoint::Unassigned.new(token, active_opts)
|
466
|
+
|
467
|
+
when *UPTokens::Age; node << UP::Age.new(token, active_opts)
|
468
|
+
when *UPTokens::Derived; node << UP::Derived.new(token, active_opts)
|
469
|
+
when *UPTokens::Emoji; node << UP::Emoji.new(token, active_opts)
|
470
|
+
when *UPTokens::Enumerated; node << UP::Enumerated.new(token, active_opts)
|
471
|
+
when *UPTokens::Script; node << UP::Script.new(token, active_opts)
|
472
|
+
when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts)
|
474
473
|
|
475
474
|
else
|
476
475
|
raise UnknownTokenError.new('UnicodeProperty', token)
|
@@ -478,8 +477,7 @@ class Regexp::Parser
|
|
478
477
|
end
|
479
478
|
|
480
479
|
def quantifier(token)
|
481
|
-
target_node = node.
|
482
|
-
target_node or raise ParserError, "No valid target found for '#{token.text}'"
|
480
|
+
target_node = node.extract_quantifier_target(token.text)
|
483
481
|
|
484
482
|
# in case of chained quantifiers, wrap target in an implicit passive group
|
485
483
|
# description of the problem: https://github.com/ammar/regexp_parser/issues/3
|
@@ -527,6 +525,8 @@ class Regexp::Parser
|
|
527
525
|
end
|
528
526
|
|
529
527
|
def open_set(token)
|
528
|
+
# TODO: this and Quantifier are the only cases where Expression#token
|
529
|
+
# does not match the scanner/lexer output. Fix in v3.0.0.
|
530
530
|
token.token = :character
|
531
531
|
nest(CharacterSet.new(token, active_opts))
|
532
532
|
end
|
@@ -541,7 +541,7 @@ class Regexp::Parser
|
|
541
541
|
|
542
542
|
def range(token)
|
543
543
|
exp = CharacterSet::Range.new(token, active_opts)
|
544
|
-
scope = node.last.
|
544
|
+
scope = node.last.instance_of?(CharacterSet::IntersectedSequence) ? node.last : node
|
545
545
|
exp << scope.expressions.pop
|
546
546
|
nest(exp)
|
547
547
|
end
|
@@ -568,28 +568,32 @@ class Regexp::Parser
|
|
568
568
|
end
|
569
569
|
|
570
570
|
def close_completed_character_set_range
|
571
|
-
decrease_nesting if node.
|
571
|
+
decrease_nesting if node.instance_of?(CharacterSet::Range) && node.complete?
|
572
572
|
end
|
573
573
|
|
574
574
|
def active_opts
|
575
575
|
options_stack.last
|
576
576
|
end
|
577
577
|
|
578
|
-
# Assigns referenced expressions to
|
578
|
+
# Assigns referenced expressions to referring expressions, e.g. if there is
|
579
579
|
# an instance of Backreference::Number, its #referenced_expression is set to
|
580
580
|
# the instance of Group::Capture that it refers to via its number.
|
581
581
|
def assign_referenced_expressions
|
582
|
-
# find all
|
583
|
-
targets = { 0 => root }
|
582
|
+
# find all referenceable and referring expressions
|
583
|
+
targets = { 0 => [root] }
|
584
|
+
referrers = []
|
584
585
|
root.each_expression do |exp|
|
585
|
-
exp.
|
586
|
+
if exp.referential?
|
587
|
+
referrers << exp
|
588
|
+
elsif exp.is_a?(Group::Capture)
|
589
|
+
(targets[exp.identifier] ||= []) << exp
|
590
|
+
end
|
586
591
|
end
|
587
|
-
# assign
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
raise(ParserError, "Invalid reference: #{exp.reference}")
|
592
|
+
# assign referenced expressions to referring expressions
|
593
|
+
# (in a second iteration because there might be forward references)
|
594
|
+
referrers.each do |exp|
|
595
|
+
exp.referenced_expressions = targets[exp.reference] ||
|
596
|
+
raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}")
|
593
597
|
end
|
594
598
|
end
|
595
599
|
end # module Regexp::Parser
|
@@ -0,0 +1,63 @@
|
|
1
|
+
class Regexp::Scanner
|
2
|
+
# Base for all scanner validation errors
|
3
|
+
class ValidationError < ScannerError
|
4
|
+
# Centralizes and unifies the handling of validation related errors.
|
5
|
+
def self.for(type, problem, reason = nil)
|
6
|
+
types.fetch(type).new(problem, reason)
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.types
|
10
|
+
@types ||= {
|
11
|
+
backref: InvalidBackrefError,
|
12
|
+
group: InvalidGroupError,
|
13
|
+
group_option: InvalidGroupOption,
|
14
|
+
posix_class: UnknownPosixClassError,
|
15
|
+
property: UnknownUnicodePropertyError,
|
16
|
+
sequence: InvalidSequenceError,
|
17
|
+
}
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
# Invalid sequence format. Used for escape sequences, mainly.
|
22
|
+
class InvalidSequenceError < ValidationError
|
23
|
+
def initialize(what = 'sequence', where = '')
|
24
|
+
super "Invalid #{what} at #{where}"
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
# Invalid group. Used for named groups.
|
29
|
+
class InvalidGroupError < ValidationError
|
30
|
+
def initialize(what, reason)
|
31
|
+
super "Invalid #{what}, #{reason}."
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
# Invalid groupOption. Used for inline options.
|
36
|
+
# TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
|
37
|
+
class InvalidGroupOption < ValidationError
|
38
|
+
def initialize(option, text)
|
39
|
+
super "Invalid group option #{option} in #{text}"
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
# Invalid back reference. Used for name a number refs/calls.
|
44
|
+
class InvalidBackrefError < ValidationError
|
45
|
+
def initialize(what, reason)
|
46
|
+
super "Invalid back reference #{what}, #{reason}"
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
# The property name was not recognized by the scanner.
|
51
|
+
class UnknownUnicodePropertyError < ValidationError
|
52
|
+
def initialize(name, _)
|
53
|
+
super "Unknown unicode character property name #{name}"
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
# The POSIX class name was not recognized by the scanner.
|
58
|
+
class UnknownPosixClassError < ValidationError
|
59
|
+
def initialize(text, _)
|
60
|
+
super "Unknown POSIX class #{text}"
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|