regexp_parser 2.1.1 → 2.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +6 -5
- data/LICENSE +1 -1
- data/Rakefile +6 -70
- data/lib/regexp_parser/error.rb +1 -1
- data/lib/regexp_parser/expression/base.rb +76 -0
- data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
- data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
- data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +18 -3
- data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -7
- data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +4 -8
- data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
- data/lib/regexp_parser/expression/classes/conditional.rb +2 -6
- data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
- data/lib/regexp_parser/expression/classes/free_space.rb +4 -4
- data/lib/regexp_parser/expression/classes/group.rb +10 -22
- data/lib/regexp_parser/expression/classes/keep.rb +2 -0
- data/lib/regexp_parser/expression/classes/literal.rb +1 -5
- data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
- data/lib/regexp_parser/expression/classes/root.rb +3 -6
- data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +10 -11
- data/lib/regexp_parser/expression/methods/construct.rb +41 -0
- data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +9 -5
- data/lib/regexp_parser/expression/methods/negative.rb +20 -0
- data/lib/regexp_parser/expression/methods/parts.rb +23 -0
- data/lib/regexp_parser/expression/methods/printing.rb +26 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
- data/lib/regexp_parser/expression/methods/tests.rb +47 -1
- data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
- data/lib/regexp_parser/expression/quantifier.rb +55 -24
- data/lib/regexp_parser/expression/sequence.rb +11 -31
- data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
- data/lib/regexp_parser/expression/shared.rb +111 -0
- data/lib/regexp_parser/expression/subexpression.rb +26 -18
- data/lib/regexp_parser/expression.rb +37 -155
- data/lib/regexp_parser/lexer.rb +81 -39
- data/lib/regexp_parser/parser.rb +135 -173
- data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
- data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
- data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
- data/lib/regexp_parser/scanner/properties/long.csv +651 -0
- data/lib/regexp_parser/scanner/properties/short.csv +249 -0
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +127 -185
- data/lib/regexp_parser/scanner.rb +1185 -1402
- data/lib/regexp_parser/syntax/any.rb +2 -7
- data/lib/regexp_parser/syntax/base.rb +91 -66
- data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
- data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
- data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
- data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
- data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
- data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
- data/lib/regexp_parser/syntax/token/escape.rb +33 -0
- data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
- data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
- data/lib/regexp_parser/syntax/token/meta.rb +20 -0
- data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
- data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
- data/lib/regexp_parser/syntax/token/unicode_property.rb +751 -0
- data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
- data/lib/regexp_parser/syntax/token.rb +45 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +17 -34
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
- data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
- data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
- data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions.rb +4 -2
- data/lib/regexp_parser/syntax.rb +2 -2
- data/lib/regexp_parser/token.rb +9 -20
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +6 -8
- data/regexp_parser.gemspec +20 -22
- metadata +49 -171
- data/CHANGELOG.md +0 -494
- data/README.md +0 -479
- data/lib/regexp_parser/scanner/properties/long.yml +0 -594
- data/lib/regexp_parser/scanner/properties/short.yml +0 -237
- data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
- data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
- data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
- data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
- data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
- data/lib/regexp_parser/syntax/tokens.rb +0 -45
- data/spec/expression/base_spec.rb +0 -104
- data/spec/expression/clone_spec.rb +0 -152
- data/spec/expression/conditional_spec.rb +0 -89
- data/spec/expression/free_space_spec.rb +0 -27
- data/spec/expression/methods/match_length_spec.rb +0 -161
- data/spec/expression/methods/match_spec.rb +0 -25
- data/spec/expression/methods/strfregexp_spec.rb +0 -224
- data/spec/expression/methods/tests_spec.rb +0 -99
- data/spec/expression/methods/traverse_spec.rb +0 -161
- data/spec/expression/options_spec.rb +0 -128
- data/spec/expression/subexpression_spec.rb +0 -50
- data/spec/expression/to_h_spec.rb +0 -26
- data/spec/expression/to_s_spec.rb +0 -108
- data/spec/lexer/all_spec.rb +0 -22
- data/spec/lexer/conditionals_spec.rb +0 -53
- data/spec/lexer/delimiters_spec.rb +0 -68
- data/spec/lexer/escapes_spec.rb +0 -14
- data/spec/lexer/keep_spec.rb +0 -10
- data/spec/lexer/literals_spec.rb +0 -64
- data/spec/lexer/nesting_spec.rb +0 -99
- data/spec/lexer/refcalls_spec.rb +0 -60
- data/spec/parser/all_spec.rb +0 -43
- data/spec/parser/alternation_spec.rb +0 -88
- data/spec/parser/anchors_spec.rb +0 -17
- data/spec/parser/conditionals_spec.rb +0 -179
- data/spec/parser/errors_spec.rb +0 -30
- data/spec/parser/escapes_spec.rb +0 -121
- data/spec/parser/free_space_spec.rb +0 -130
- data/spec/parser/groups_spec.rb +0 -108
- data/spec/parser/keep_spec.rb +0 -6
- data/spec/parser/options_spec.rb +0 -28
- data/spec/parser/posix_classes_spec.rb +0 -8
- data/spec/parser/properties_spec.rb +0 -115
- data/spec/parser/quantifiers_spec.rb +0 -68
- data/spec/parser/refcalls_spec.rb +0 -117
- data/spec/parser/set/intersections_spec.rb +0 -127
- data/spec/parser/set/ranges_spec.rb +0 -111
- data/spec/parser/sets_spec.rb +0 -178
- data/spec/parser/types_spec.rb +0 -18
- data/spec/scanner/all_spec.rb +0 -18
- data/spec/scanner/anchors_spec.rb +0 -21
- data/spec/scanner/conditionals_spec.rb +0 -128
- data/spec/scanner/delimiters_spec.rb +0 -52
- data/spec/scanner/errors_spec.rb +0 -67
- data/spec/scanner/escapes_spec.rb +0 -64
- data/spec/scanner/free_space_spec.rb +0 -165
- data/spec/scanner/groups_spec.rb +0 -61
- data/spec/scanner/keep_spec.rb +0 -10
- data/spec/scanner/literals_spec.rb +0 -39
- data/spec/scanner/meta_spec.rb +0 -18
- data/spec/scanner/options_spec.rb +0 -36
- data/spec/scanner/properties_spec.rb +0 -64
- data/spec/scanner/quantifiers_spec.rb +0 -25
- data/spec/scanner/refcalls_spec.rb +0 -55
- data/spec/scanner/sets_spec.rb +0 -151
- data/spec/scanner/types_spec.rb +0 -14
- data/spec/spec_helper.rb +0 -16
- data/spec/support/runner.rb +0 -42
- data/spec/support/shared_examples.rb +0 -77
- data/spec/support/warning_extractor.rb +0 -60
- data/spec/syntax/syntax_spec.rb +0 -48
- data/spec/syntax/syntax_token_map_spec.rb +0 -23
- data/spec/syntax/versions/1.8.6_spec.rb +0 -17
- data/spec/syntax/versions/1.9.1_spec.rb +0 -10
- data/spec/syntax/versions/1.9.3_spec.rb +0 -9
- data/spec/syntax/versions/2.0.0_spec.rb +0 -13
- data/spec/syntax/versions/2.2.0_spec.rb +0 -9
- data/spec/syntax/versions/aliases_spec.rb +0 -37
- data/spec/token/token_spec.rb +0 -85
- /data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
data/lib/regexp_parser/parser.rb
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
1
|
+
require_relative 'error'
|
|
2
|
+
require_relative 'expression'
|
|
3
3
|
|
|
4
4
|
class Regexp::Parser
|
|
5
5
|
include Regexp::Expression
|
|
@@ -18,12 +18,12 @@ class Regexp::Parser
|
|
|
18
18
|
end
|
|
19
19
|
end
|
|
20
20
|
|
|
21
|
-
def self.parse(input, syntax =
|
|
21
|
+
def self.parse(input, syntax = nil, options: nil, &block)
|
|
22
22
|
new.parse(input, syntax, options: options, &block)
|
|
23
23
|
end
|
|
24
24
|
|
|
25
|
-
def parse(input, syntax =
|
|
26
|
-
root = Root.
|
|
25
|
+
def parse(input, syntax = nil, options: nil, &block)
|
|
26
|
+
root = Root.construct(options: extract_options(input, options))
|
|
27
27
|
|
|
28
28
|
self.root = root
|
|
29
29
|
self.node = root
|
|
@@ -35,10 +35,13 @@ class Regexp::Parser
|
|
|
35
35
|
|
|
36
36
|
self.captured_group_counts = Hash.new(0)
|
|
37
37
|
|
|
38
|
-
Regexp::Lexer.scan(input, syntax, options: options) do |token|
|
|
38
|
+
Regexp::Lexer.scan(input, syntax, options: options, collect_tokens: false) do |token|
|
|
39
39
|
parse_token(token)
|
|
40
40
|
end
|
|
41
41
|
|
|
42
|
+
# Trigger recursive setting of #nesting_level, which reflects how deep
|
|
43
|
+
# a node is in the tree. Do this at the end to account for tree rewrites.
|
|
44
|
+
root.nesting_level = 0
|
|
42
45
|
assign_referenced_expressions
|
|
43
46
|
|
|
44
47
|
if block_given?
|
|
@@ -197,11 +200,11 @@ class Regexp::Parser
|
|
|
197
200
|
end
|
|
198
201
|
|
|
199
202
|
def captured_group_count_at_level
|
|
200
|
-
captured_group_counts[node
|
|
203
|
+
captured_group_counts[node]
|
|
201
204
|
end
|
|
202
205
|
|
|
203
206
|
def count_captured_group
|
|
204
|
-
captured_group_counts[node
|
|
207
|
+
captured_group_counts[node] += 1
|
|
205
208
|
end
|
|
206
209
|
|
|
207
210
|
def close_group
|
|
@@ -229,10 +232,18 @@ class Regexp::Parser
|
|
|
229
232
|
node << Backreference::NameRecursionLevel.new(token, active_opts)
|
|
230
233
|
when :name_call
|
|
231
234
|
node << Backreference::NameCall.new(token, active_opts)
|
|
232
|
-
when :number, :number_ref
|
|
235
|
+
when :number, :number_ref # TODO: split in v3.0.0
|
|
233
236
|
node << Backreference::Number.new(token, active_opts)
|
|
234
237
|
when :number_recursion_ref
|
|
235
|
-
node << Backreference::NumberRecursionLevel.new(token, active_opts)
|
|
238
|
+
node << Backreference::NumberRecursionLevel.new(token, active_opts).tap do |exp|
|
|
239
|
+
# TODO: should split off new token number_recursion_rel_ref and new
|
|
240
|
+
# class NumberRelativeRecursionLevel in v3.0.0 to get rid of this
|
|
241
|
+
if exp.text =~ /[<'][+-]/
|
|
242
|
+
assign_effective_number(exp)
|
|
243
|
+
else
|
|
244
|
+
exp.effective_number = exp.number
|
|
245
|
+
end
|
|
246
|
+
end
|
|
236
247
|
when :number_call
|
|
237
248
|
node << Backreference::NumberCall.new(token, active_opts)
|
|
238
249
|
when :number_rel_ref
|
|
@@ -251,6 +262,8 @@ class Regexp::Parser
|
|
|
251
262
|
def assign_effective_number(exp)
|
|
252
263
|
exp.effective_number =
|
|
253
264
|
exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
|
|
265
|
+
exp.effective_number > 0 ||
|
|
266
|
+
raise(ParserError, "Invalid reference: #{exp.reference}")
|
|
254
267
|
end
|
|
255
268
|
|
|
256
269
|
def conditional(token)
|
|
@@ -259,9 +272,9 @@ class Regexp::Parser
|
|
|
259
272
|
nest_conditional(Conditional::Expression.new(token, active_opts))
|
|
260
273
|
when :condition
|
|
261
274
|
conditional_nesting.last.condition = Conditional::Condition.new(token, active_opts)
|
|
262
|
-
conditional_nesting.last.add_sequence(active_opts)
|
|
275
|
+
conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
|
|
263
276
|
when :separator
|
|
264
|
-
conditional_nesting.last.add_sequence(active_opts)
|
|
277
|
+
conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
|
|
265
278
|
self.node = conditional_nesting.last.branches.last
|
|
266
279
|
when :close
|
|
267
280
|
conditional_nesting.pop
|
|
@@ -286,17 +299,9 @@ class Regexp::Parser
|
|
|
286
299
|
def nest(exp)
|
|
287
300
|
nesting.push(exp)
|
|
288
301
|
node << exp
|
|
289
|
-
update_transplanted_subtree(exp, node)
|
|
290
302
|
self.node = exp
|
|
291
303
|
end
|
|
292
304
|
|
|
293
|
-
# subtrees are transplanted to build Alternations, Intersections, Ranges
|
|
294
|
-
def update_transplanted_subtree(exp, new_parent)
|
|
295
|
-
exp.nesting_level = new_parent.nesting_level + 1
|
|
296
|
-
exp.respond_to?(:each) &&
|
|
297
|
-
exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
|
|
298
|
-
end
|
|
299
|
-
|
|
300
305
|
def escape(token)
|
|
301
306
|
case token.token
|
|
302
307
|
|
|
@@ -317,6 +322,7 @@ class Regexp::Parser
|
|
|
317
322
|
|
|
318
323
|
when :control
|
|
319
324
|
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
|
325
|
+
# TODO: emit :meta_control_sequence token in v3.0.0
|
|
320
326
|
node << EscapeSequence::MetaControl.new(token, active_opts)
|
|
321
327
|
else
|
|
322
328
|
node << EscapeSequence::Control.new(token, active_opts)
|
|
@@ -324,6 +330,7 @@ class Regexp::Parser
|
|
|
324
330
|
|
|
325
331
|
when :meta_sequence
|
|
326
332
|
if token.text =~ /\A\\M-\\[Cc]/
|
|
333
|
+
# TODO: emit :meta_control_sequence token in v3.0.0:
|
|
327
334
|
node << EscapeSequence::MetaControl.new(token, active_opts)
|
|
328
335
|
else
|
|
329
336
|
node << EscapeSequence::Meta.new(token, active_opts)
|
|
@@ -344,11 +351,7 @@ class Regexp::Parser
|
|
|
344
351
|
when :comment
|
|
345
352
|
node << Comment.new(token, active_opts)
|
|
346
353
|
when :whitespace
|
|
347
|
-
|
|
348
|
-
node.last.merge(WhiteSpace.new(token, active_opts))
|
|
349
|
-
else
|
|
350
|
-
node << WhiteSpace.new(token, active_opts)
|
|
351
|
-
end
|
|
354
|
+
node << WhiteSpace.new(token, active_opts)
|
|
352
355
|
else
|
|
353
356
|
raise UnknownTokenError.new('FreeSpace', token)
|
|
354
357
|
end
|
|
@@ -374,98 +377,99 @@ class Regexp::Parser
|
|
|
374
377
|
end
|
|
375
378
|
|
|
376
379
|
def sequence_operation(klass, token)
|
|
377
|
-
unless node.
|
|
380
|
+
unless node.instance_of?(klass)
|
|
378
381
|
operator = klass.new(token, active_opts)
|
|
379
|
-
sequence = operator.add_sequence(active_opts)
|
|
382
|
+
sequence = operator.add_sequence(active_opts, { ts: token.ts })
|
|
380
383
|
sequence.expressions = node.expressions
|
|
381
384
|
node.expressions = []
|
|
382
385
|
nest(operator)
|
|
383
386
|
end
|
|
384
|
-
node.add_sequence(active_opts)
|
|
387
|
+
node.add_sequence(active_opts, { ts: token.te })
|
|
385
388
|
end
|
|
386
389
|
|
|
387
390
|
def posixclass(token)
|
|
388
391
|
node << PosixClass.new(token, active_opts)
|
|
389
392
|
end
|
|
390
393
|
|
|
391
|
-
|
|
392
|
-
UPTokens = Regexp::Syntax::Token::
|
|
394
|
+
UP = Regexp::Expression::Property
|
|
395
|
+
UPTokens = Regexp::Syntax::Token::Property
|
|
393
396
|
|
|
394
397
|
def property(token)
|
|
395
398
|
case token.token
|
|
396
|
-
when :alnum; node << Alnum.new(token, active_opts)
|
|
397
|
-
when :alpha; node << Alpha.new(token, active_opts)
|
|
398
|
-
when :ascii; node << Ascii.new(token, active_opts)
|
|
399
|
-
when :blank; node << Blank.new(token, active_opts)
|
|
400
|
-
when :cntrl; node << Cntrl.new(token, active_opts)
|
|
401
|
-
when :digit; node << Digit.new(token, active_opts)
|
|
402
|
-
when :graph; node << Graph.new(token, active_opts)
|
|
403
|
-
when :lower; node << Lower.new(token, active_opts)
|
|
404
|
-
when :print; node << Print.new(token, active_opts)
|
|
405
|
-
when :punct; node << Punct.new(token, active_opts)
|
|
406
|
-
when :space; node << Space.new(token, active_opts)
|
|
407
|
-
when :upper; node << Upper.new(token, active_opts)
|
|
408
|
-
when :word; node << Word.new(token, active_opts)
|
|
409
|
-
when :xdigit; node << Xdigit.new(token, active_opts)
|
|
410
|
-
when :xposixpunct; node << XPosixPunct.new(token, active_opts)
|
|
399
|
+
when :alnum; node << UP::Alnum.new(token, active_opts)
|
|
400
|
+
when :alpha; node << UP::Alpha.new(token, active_opts)
|
|
401
|
+
when :ascii; node << UP::Ascii.new(token, active_opts)
|
|
402
|
+
when :blank; node << UP::Blank.new(token, active_opts)
|
|
403
|
+
when :cntrl; node << UP::Cntrl.new(token, active_opts)
|
|
404
|
+
when :digit; node << UP::Digit.new(token, active_opts)
|
|
405
|
+
when :graph; node << UP::Graph.new(token, active_opts)
|
|
406
|
+
when :lower; node << UP::Lower.new(token, active_opts)
|
|
407
|
+
when :print; node << UP::Print.new(token, active_opts)
|
|
408
|
+
when :punct; node << UP::Punct.new(token, active_opts)
|
|
409
|
+
when :space; node << UP::Space.new(token, active_opts)
|
|
410
|
+
when :upper; node << UP::Upper.new(token, active_opts)
|
|
411
|
+
when :word; node << UP::Word.new(token, active_opts)
|
|
412
|
+
when :xdigit; node << UP::Xdigit.new(token, active_opts)
|
|
413
|
+
when :xposixpunct; node << UP::XPosixPunct.new(token, active_opts)
|
|
411
414
|
|
|
412
415
|
# only in Oniguruma (old rubies)
|
|
413
|
-
when :newline; node << Newline.new(token, active_opts)
|
|
414
|
-
|
|
415
|
-
when :any; node << Any.new(token, active_opts)
|
|
416
|
-
when :assigned; node << Assigned.new(token, active_opts)
|
|
417
|
-
|
|
418
|
-
when :letter; node << Letter::Any.new(token, active_opts)
|
|
419
|
-
when :cased_letter; node << Letter::Cased.new(token, active_opts)
|
|
420
|
-
when :uppercase_letter; node << Letter::Uppercase.new(token, active_opts)
|
|
421
|
-
when :lowercase_letter; node << Letter::Lowercase.new(token, active_opts)
|
|
422
|
-
when :titlecase_letter; node << Letter::Titlecase.new(token, active_opts)
|
|
423
|
-
when :modifier_letter; node << Letter::Modifier.new(token, active_opts)
|
|
424
|
-
when :other_letter; node << Letter::Other.new(token, active_opts)
|
|
425
|
-
|
|
426
|
-
when :mark; node << Mark::Any.new(token, active_opts)
|
|
427
|
-
when :combining_mark; node << Mark::Combining.new(token, active_opts)
|
|
428
|
-
when :nonspacing_mark; node << Mark::Nonspacing.new(token, active_opts)
|
|
429
|
-
when :spacing_mark; node << Mark::Spacing.new(token, active_opts)
|
|
430
|
-
when :enclosing_mark; node << Mark::Enclosing.new(token, active_opts)
|
|
431
|
-
|
|
432
|
-
when :number; node << Number::Any.new(token, active_opts)
|
|
433
|
-
when :decimal_number; node << Number::Decimal.new(token, active_opts)
|
|
434
|
-
when :letter_number; node << Number::Letter.new(token, active_opts)
|
|
435
|
-
when :other_number; node << Number::Other.new(token, active_opts)
|
|
436
|
-
|
|
437
|
-
when :punctuation; node << Punctuation::Any.new(token, active_opts)
|
|
438
|
-
when :connector_punctuation; node << Punctuation::Connector.new(token, active_opts)
|
|
439
|
-
when :dash_punctuation; node << Punctuation::Dash.new(token, active_opts)
|
|
440
|
-
when :open_punctuation; node << Punctuation::Open.new(token, active_opts)
|
|
441
|
-
when :close_punctuation; node << Punctuation::Close.new(token, active_opts)
|
|
442
|
-
when :initial_punctuation; node << Punctuation::Initial.new(token, active_opts)
|
|
443
|
-
when :final_punctuation; node << Punctuation::Final.new(token, active_opts)
|
|
444
|
-
when :other_punctuation; node << Punctuation::Other.new(token, active_opts)
|
|
445
|
-
|
|
446
|
-
when :separator; node << Separator::Any.new(token, active_opts)
|
|
447
|
-
when :space_separator; node << Separator::Space.new(token, active_opts)
|
|
448
|
-
when :line_separator; node << Separator::Line.new(token, active_opts)
|
|
449
|
-
when :paragraph_separator; node << Separator::Paragraph.new(token, active_opts)
|
|
450
|
-
|
|
451
|
-
when :symbol; node << Symbol::Any.new(token, active_opts)
|
|
452
|
-
when :math_symbol; node << Symbol::Math.new(token, active_opts)
|
|
453
|
-
when :currency_symbol; node << Symbol::Currency.new(token, active_opts)
|
|
454
|
-
when :modifier_symbol; node << Symbol::Modifier.new(token, active_opts)
|
|
455
|
-
when :other_symbol; node << Symbol::Other.new(token, active_opts)
|
|
456
|
-
|
|
457
|
-
when :other; node << Codepoint::Any.new(token, active_opts)
|
|
458
|
-
when :control; node << Codepoint::Control.new(token, active_opts)
|
|
459
|
-
when :format; node << Codepoint::Format.new(token, active_opts)
|
|
460
|
-
when :surrogate; node << Codepoint::Surrogate.new(token, active_opts)
|
|
461
|
-
when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
|
|
462
|
-
when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
|
|
463
|
-
|
|
464
|
-
when *UPTokens::Age; node << Age.new(token, active_opts)
|
|
465
|
-
when *UPTokens::Derived; node << Derived.new(token, active_opts)
|
|
466
|
-
when *UPTokens::Emoji; node << Emoji.new(token, active_opts)
|
|
467
|
-
when *UPTokens::
|
|
468
|
-
when *UPTokens::
|
|
416
|
+
when :newline; node << UP::Newline.new(token, active_opts)
|
|
417
|
+
|
|
418
|
+
when :any; node << UP::Any.new(token, active_opts)
|
|
419
|
+
when :assigned; node << UP::Assigned.new(token, active_opts)
|
|
420
|
+
|
|
421
|
+
when :letter; node << UP::Letter::Any.new(token, active_opts)
|
|
422
|
+
when :cased_letter; node << UP::Letter::Cased.new(token, active_opts)
|
|
423
|
+
when :uppercase_letter; node << UP::Letter::Uppercase.new(token, active_opts)
|
|
424
|
+
when :lowercase_letter; node << UP::Letter::Lowercase.new(token, active_opts)
|
|
425
|
+
when :titlecase_letter; node << UP::Letter::Titlecase.new(token, active_opts)
|
|
426
|
+
when :modifier_letter; node << UP::Letter::Modifier.new(token, active_opts)
|
|
427
|
+
when :other_letter; node << UP::Letter::Other.new(token, active_opts)
|
|
428
|
+
|
|
429
|
+
when :mark; node << UP::Mark::Any.new(token, active_opts)
|
|
430
|
+
when :combining_mark; node << UP::Mark::Combining.new(token, active_opts)
|
|
431
|
+
when :nonspacing_mark; node << UP::Mark::Nonspacing.new(token, active_opts)
|
|
432
|
+
when :spacing_mark; node << UP::Mark::Spacing.new(token, active_opts)
|
|
433
|
+
when :enclosing_mark; node << UP::Mark::Enclosing.new(token, active_opts)
|
|
434
|
+
|
|
435
|
+
when :number; node << UP::Number::Any.new(token, active_opts)
|
|
436
|
+
when :decimal_number; node << UP::Number::Decimal.new(token, active_opts)
|
|
437
|
+
when :letter_number; node << UP::Number::Letter.new(token, active_opts)
|
|
438
|
+
when :other_number; node << UP::Number::Other.new(token, active_opts)
|
|
439
|
+
|
|
440
|
+
when :punctuation; node << UP::Punctuation::Any.new(token, active_opts)
|
|
441
|
+
when :connector_punctuation; node << UP::Punctuation::Connector.new(token, active_opts)
|
|
442
|
+
when :dash_punctuation; node << UP::Punctuation::Dash.new(token, active_opts)
|
|
443
|
+
when :open_punctuation; node << UP::Punctuation::Open.new(token, active_opts)
|
|
444
|
+
when :close_punctuation; node << UP::Punctuation::Close.new(token, active_opts)
|
|
445
|
+
when :initial_punctuation; node << UP::Punctuation::Initial.new(token, active_opts)
|
|
446
|
+
when :final_punctuation; node << UP::Punctuation::Final.new(token, active_opts)
|
|
447
|
+
when :other_punctuation; node << UP::Punctuation::Other.new(token, active_opts)
|
|
448
|
+
|
|
449
|
+
when :separator; node << UP::Separator::Any.new(token, active_opts)
|
|
450
|
+
when :space_separator; node << UP::Separator::Space.new(token, active_opts)
|
|
451
|
+
when :line_separator; node << UP::Separator::Line.new(token, active_opts)
|
|
452
|
+
when :paragraph_separator; node << UP::Separator::Paragraph.new(token, active_opts)
|
|
453
|
+
|
|
454
|
+
when :symbol; node << UP::Symbol::Any.new(token, active_opts)
|
|
455
|
+
when :math_symbol; node << UP::Symbol::Math.new(token, active_opts)
|
|
456
|
+
when :currency_symbol; node << UP::Symbol::Currency.new(token, active_opts)
|
|
457
|
+
when :modifier_symbol; node << UP::Symbol::Modifier.new(token, active_opts)
|
|
458
|
+
when :other_symbol; node << UP::Symbol::Other.new(token, active_opts)
|
|
459
|
+
|
|
460
|
+
when :other; node << UP::Codepoint::Any.new(token, active_opts)
|
|
461
|
+
when :control; node << UP::Codepoint::Control.new(token, active_opts)
|
|
462
|
+
when :format; node << UP::Codepoint::Format.new(token, active_opts)
|
|
463
|
+
when :surrogate; node << UP::Codepoint::Surrogate.new(token, active_opts)
|
|
464
|
+
when :private_use; node << UP::Codepoint::PrivateUse.new(token, active_opts)
|
|
465
|
+
when :unassigned; node << UP::Codepoint::Unassigned.new(token, active_opts)
|
|
466
|
+
|
|
467
|
+
when *UPTokens::Age; node << UP::Age.new(token, active_opts)
|
|
468
|
+
when *UPTokens::Derived; node << UP::Derived.new(token, active_opts)
|
|
469
|
+
when *UPTokens::Emoji; node << UP::Emoji.new(token, active_opts)
|
|
470
|
+
when *UPTokens::Enumerated; node << UP::Enumerated.new(token, active_opts)
|
|
471
|
+
when *UPTokens::Script; node << UP::Script.new(token, active_opts)
|
|
472
|
+
when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts)
|
|
469
473
|
|
|
470
474
|
else
|
|
471
475
|
raise UnknownTokenError.new('UnicodeProperty', token)
|
|
@@ -473,86 +477,39 @@ class Regexp::Parser
|
|
|
473
477
|
end
|
|
474
478
|
|
|
475
479
|
def quantifier(token)
|
|
476
|
-
target_node = node.
|
|
477
|
-
target_node or raise ParserError, "No valid target found for '#{token.text}'"
|
|
480
|
+
target_node = node.extract_quantifier_target(token.text)
|
|
478
481
|
|
|
479
482
|
# in case of chained quantifiers, wrap target in an implicit passive group
|
|
480
483
|
# description of the problem: https://github.com/ammar/regexp_parser/issues/3
|
|
481
484
|
# rationale for this solution: https://github.com/ammar/regexp_parser/pull/69
|
|
482
485
|
if target_node.quantified?
|
|
483
|
-
|
|
484
|
-
:
|
|
485
|
-
:
|
|
486
|
-
|
|
487
|
-
target_node.
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
target_node.set_level,
|
|
491
|
-
target_node.conditional_level
|
|
486
|
+
new_group = Group::Passive.construct(
|
|
487
|
+
token: :passive,
|
|
488
|
+
ts: target_node.ts,
|
|
489
|
+
level: target_node.level,
|
|
490
|
+
set_level: target_node.set_level,
|
|
491
|
+
conditional_level: target_node.conditional_level,
|
|
492
|
+
options: active_opts,
|
|
492
493
|
)
|
|
493
|
-
new_group = Group::Passive.new(new_token, active_opts)
|
|
494
494
|
new_group.implicit = true
|
|
495
495
|
new_group << target_node
|
|
496
|
-
|
|
496
|
+
increase_group_level(target_node)
|
|
497
497
|
node.expressions[node.expressions.index(target_node)] = new_group
|
|
498
498
|
target_node = new_group
|
|
499
499
|
end
|
|
500
500
|
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
|
|
504
|
-
when :zero_or_one_reluctant
|
|
505
|
-
target_node.quantify(:zero_or_one, token.text, 0, 1, :reluctant)
|
|
506
|
-
when :zero_or_one_possessive
|
|
507
|
-
target_node.quantify(:zero_or_one, token.text, 0, 1, :possessive)
|
|
508
|
-
|
|
509
|
-
when :zero_or_more
|
|
510
|
-
target_node.quantify(:zero_or_more, token.text, 0, -1, :greedy)
|
|
511
|
-
when :zero_or_more_reluctant
|
|
512
|
-
target_node.quantify(:zero_or_more, token.text, 0, -1, :reluctant)
|
|
513
|
-
when :zero_or_more_possessive
|
|
514
|
-
target_node.quantify(:zero_or_more, token.text, 0, -1, :possessive)
|
|
515
|
-
|
|
516
|
-
when :one_or_more
|
|
517
|
-
target_node.quantify(:one_or_more, token.text, 1, -1, :greedy)
|
|
518
|
-
when :one_or_more_reluctant
|
|
519
|
-
target_node.quantify(:one_or_more, token.text, 1, -1, :reluctant)
|
|
520
|
-
when :one_or_more_possessive
|
|
521
|
-
target_node.quantify(:one_or_more, token.text, 1, -1, :possessive)
|
|
522
|
-
|
|
523
|
-
when :interval
|
|
524
|
-
interval(target_node, token)
|
|
525
|
-
|
|
526
|
-
else
|
|
501
|
+
unless token.token =~ /\A(?:zero_or_one|zero_or_more|one_or_more|interval)
|
|
502
|
+
(?:_greedy|_reluctant|_possessive)?\z/x
|
|
527
503
|
raise UnknownTokenError.new('Quantifier', token)
|
|
528
504
|
end
|
|
505
|
+
|
|
506
|
+
target_node.quantify(token, active_opts)
|
|
529
507
|
end
|
|
530
508
|
|
|
531
|
-
def
|
|
509
|
+
def increase_group_level(exp)
|
|
532
510
|
exp.level += 1
|
|
533
|
-
exp.
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
def interval(target_node, token)
|
|
537
|
-
text = token.text
|
|
538
|
-
mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
|
|
539
|
-
case mchr
|
|
540
|
-
when '?'
|
|
541
|
-
range_text = text[0...-1]
|
|
542
|
-
mode = :reluctant
|
|
543
|
-
when '+'
|
|
544
|
-
range_text = text[0...-1]
|
|
545
|
-
mode = :possessive
|
|
546
|
-
else
|
|
547
|
-
range_text = text
|
|
548
|
-
mode = :greedy
|
|
549
|
-
end
|
|
550
|
-
|
|
551
|
-
range = range_text.gsub(/\{|\}/, '').split(',', 2)
|
|
552
|
-
min = range[0].empty? ? 0 : range[0]
|
|
553
|
-
max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
|
|
554
|
-
|
|
555
|
-
target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
|
|
511
|
+
exp.quantifier.level += 1 if exp.quantifier
|
|
512
|
+
exp.terminal? || exp.each { |subexp| increase_group_level(subexp) }
|
|
556
513
|
end
|
|
557
514
|
|
|
558
515
|
def set(token)
|
|
@@ -568,6 +525,8 @@ class Regexp::Parser
|
|
|
568
525
|
end
|
|
569
526
|
|
|
570
527
|
def open_set(token)
|
|
528
|
+
# TODO: this and Quantifier are the only cases where Expression#token
|
|
529
|
+
# does not match the scanner/lexer output. Fix in v3.0.0.
|
|
571
530
|
token.token = :character
|
|
572
531
|
nest(CharacterSet.new(token, active_opts))
|
|
573
532
|
end
|
|
@@ -582,7 +541,7 @@ class Regexp::Parser
|
|
|
582
541
|
|
|
583
542
|
def range(token)
|
|
584
543
|
exp = CharacterSet::Range.new(token, active_opts)
|
|
585
|
-
scope = node.last.
|
|
544
|
+
scope = node.last.instance_of?(CharacterSet::IntersectedSequence) ? node.last : node
|
|
586
545
|
exp << scope.expressions.pop
|
|
587
546
|
nest(exp)
|
|
588
547
|
end
|
|
@@ -609,26 +568,29 @@ class Regexp::Parser
|
|
|
609
568
|
end
|
|
610
569
|
|
|
611
570
|
def close_completed_character_set_range
|
|
612
|
-
decrease_nesting if node.
|
|
571
|
+
decrease_nesting if node.instance_of?(CharacterSet::Range) && node.complete?
|
|
613
572
|
end
|
|
614
573
|
|
|
615
574
|
def active_opts
|
|
616
575
|
options_stack.last
|
|
617
576
|
end
|
|
618
577
|
|
|
619
|
-
# Assigns referenced expressions to
|
|
578
|
+
# Assigns referenced expressions to referring expressions, e.g. if there is
|
|
620
579
|
# an instance of Backreference::Number, its #referenced_expression is set to
|
|
621
580
|
# the instance of Group::Capture that it refers to via its number.
|
|
622
581
|
def assign_referenced_expressions
|
|
623
|
-
|
|
624
|
-
|
|
582
|
+
# find all referenceable and referring expressions
|
|
583
|
+
targets = { 0 => root }
|
|
584
|
+
referrers = []
|
|
625
585
|
root.each_expression do |exp|
|
|
626
586
|
exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
|
|
587
|
+
referrers << exp if exp.referential?
|
|
627
588
|
end
|
|
628
|
-
# assign
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
589
|
+
# assign reference expression to referring expressions
|
|
590
|
+
# (in a second iteration because there might be forward references)
|
|
591
|
+
referrers.each do |exp|
|
|
592
|
+
exp.referenced_expression = targets[exp.reference] ||
|
|
593
|
+
raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}")
|
|
632
594
|
end
|
|
633
595
|
end
|
|
634
596
|
end # module Regexp::Parser
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
class Regexp::Scanner
|
|
2
|
+
# Base for all scanner validation errors
|
|
3
|
+
class ValidationError < ScannerError
|
|
4
|
+
# Centralizes and unifies the handling of validation related errors.
|
|
5
|
+
def self.for(type, problem, reason = nil)
|
|
6
|
+
types.fetch(type).new(problem, reason)
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
def self.types
|
|
10
|
+
@types ||= {
|
|
11
|
+
backref: InvalidBackrefError,
|
|
12
|
+
group: InvalidGroupError,
|
|
13
|
+
group_option: InvalidGroupOption,
|
|
14
|
+
posix_class: UnknownPosixClassError,
|
|
15
|
+
property: UnknownUnicodePropertyError,
|
|
16
|
+
sequence: InvalidSequenceError,
|
|
17
|
+
}
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
# Invalid sequence format. Used for escape sequences, mainly.
|
|
22
|
+
class InvalidSequenceError < ValidationError
|
|
23
|
+
def initialize(what = 'sequence', where = '')
|
|
24
|
+
super "Invalid #{what} at #{where}"
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
# Invalid group. Used for named groups.
|
|
29
|
+
class InvalidGroupError < ValidationError
|
|
30
|
+
def initialize(what, reason)
|
|
31
|
+
super "Invalid #{what}, #{reason}."
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
# Invalid groupOption. Used for inline options.
|
|
36
|
+
# TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
|
|
37
|
+
class InvalidGroupOption < ValidationError
|
|
38
|
+
def initialize(option, text)
|
|
39
|
+
super "Invalid group option #{option} in #{text}"
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
# Invalid back reference. Used for name a number refs/calls.
|
|
44
|
+
class InvalidBackrefError < ValidationError
|
|
45
|
+
def initialize(what, reason)
|
|
46
|
+
super "Invalid back reference #{what}, #{reason}"
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
# The property name was not recognized by the scanner.
|
|
51
|
+
class UnknownUnicodePropertyError < ValidationError
|
|
52
|
+
def initialize(name, _)
|
|
53
|
+
super "Unknown unicode character property name #{name}"
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
# The POSIX class name was not recognized by the scanner.
|
|
58
|
+
class UnknownPosixClassError < ValidationError
|
|
59
|
+
def initialize(text, _)
|
|
60
|
+
super "Unknown POSIX class #{text}"
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
end
|