regexp_parser 2.6.0 → 2.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +5 -5
- data/LICENSE +1 -1
- data/lib/regexp_parser/expression/base.rb +0 -7
- data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
- data/lib/regexp_parser/expression/classes/backreference.rb +17 -3
- data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
- data/lib/regexp_parser/expression/classes/character_set.rb +4 -8
- data/lib/regexp_parser/expression/classes/conditional.rb +2 -6
- data/lib/regexp_parser/expression/classes/escape_sequence.rb +3 -1
- data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
- data/lib/regexp_parser/expression/classes/group.rb +0 -22
- data/lib/regexp_parser/expression/classes/keep.rb +1 -1
- data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
- data/lib/regexp_parser/expression/classes/unicode_property.rb +11 -11
- data/lib/regexp_parser/expression/methods/construct.rb +2 -4
- data/lib/regexp_parser/expression/methods/match_length.rb +8 -4
- data/lib/regexp_parser/expression/methods/negative.rb +20 -0
- data/lib/regexp_parser/expression/methods/parts.rb +23 -0
- data/lib/regexp_parser/expression/methods/printing.rb +26 -0
- data/lib/regexp_parser/expression/methods/tests.rb +40 -3
- data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
- data/lib/regexp_parser/expression/quantifier.rb +30 -17
- data/lib/regexp_parser/expression/sequence.rb +5 -10
- data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
- data/lib/regexp_parser/expression/shared.rb +37 -20
- data/lib/regexp_parser/expression/subexpression.rb +20 -15
- data/lib/regexp_parser/expression.rb +34 -31
- data/lib/regexp_parser/lexer.rb +76 -36
- data/lib/regexp_parser/parser.rb +101 -100
- data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
- data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
- data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
- data/lib/regexp_parser/scanner/properties/long.csv +29 -0
- data/lib/regexp_parser/scanner/properties/short.csv +3 -0
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +101 -172
- data/lib/regexp_parser/scanner.rb +1132 -1283
- data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
- data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
- data/lib/regexp_parser/syntax/token/escape.rb +3 -1
- data/lib/regexp_parser/syntax/token/meta.rb +9 -2
- data/lib/regexp_parser/syntax/token/unicode_property.rb +35 -1
- data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
- data/lib/regexp_parser/syntax/token.rb +13 -13
- data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
- data/lib/regexp_parser/syntax/versions.rb +3 -1
- data/lib/regexp_parser/syntax.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +6 -6
- data/regexp_parser.gemspec +5 -5
- metadata +14 -8
- data/CHANGELOG.md +0 -601
- data/README.md +0 -503
data/lib/regexp_parser/parser.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
require_relative 'error'
|
2
|
+
require_relative 'expression'
|
3
3
|
|
4
4
|
class Regexp::Parser
|
5
5
|
include Regexp::Expression
|
@@ -18,11 +18,11 @@ class Regexp::Parser
|
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
21
|
-
def self.parse(input, syntax =
|
21
|
+
def self.parse(input, syntax = nil, options: nil, &block)
|
22
22
|
new.parse(input, syntax, options: options, &block)
|
23
23
|
end
|
24
24
|
|
25
|
-
def parse(input, syntax =
|
25
|
+
def parse(input, syntax = nil, options: nil, &block)
|
26
26
|
root = Root.construct(options: extract_options(input, options))
|
27
27
|
|
28
28
|
self.root = root
|
@@ -35,7 +35,7 @@ class Regexp::Parser
|
|
35
35
|
|
36
36
|
self.captured_group_counts = Hash.new(0)
|
37
37
|
|
38
|
-
Regexp::Lexer.scan(input, syntax, options: options) do |token|
|
38
|
+
Regexp::Lexer.scan(input, syntax, options: options, collect_tokens: false) do |token|
|
39
39
|
parse_token(token)
|
40
40
|
end
|
41
41
|
|
@@ -232,7 +232,7 @@ class Regexp::Parser
|
|
232
232
|
node << Backreference::NameRecursionLevel.new(token, active_opts)
|
233
233
|
when :name_call
|
234
234
|
node << Backreference::NameCall.new(token, active_opts)
|
235
|
-
when :number, :number_ref
|
235
|
+
when :number, :number_ref # TODO: split in v3.0.0
|
236
236
|
node << Backreference::Number.new(token, active_opts)
|
237
237
|
when :number_recursion_ref
|
238
238
|
node << Backreference::NumberRecursionLevel.new(token, active_opts).tap do |exp|
|
@@ -272,9 +272,9 @@ class Regexp::Parser
|
|
272
272
|
nest_conditional(Conditional::Expression.new(token, active_opts))
|
273
273
|
when :condition
|
274
274
|
conditional_nesting.last.condition = Conditional::Condition.new(token, active_opts)
|
275
|
-
conditional_nesting.last.add_sequence(active_opts)
|
275
|
+
conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
|
276
276
|
when :separator
|
277
|
-
conditional_nesting.last.add_sequence(active_opts)
|
277
|
+
conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
|
278
278
|
self.node = conditional_nesting.last.branches.last
|
279
279
|
when :close
|
280
280
|
conditional_nesting.pop
|
@@ -322,6 +322,7 @@ class Regexp::Parser
|
|
322
322
|
|
323
323
|
when :control
|
324
324
|
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
325
|
+
# TODO: emit :meta_control_sequence token in v3.0.0
|
325
326
|
node << EscapeSequence::MetaControl.new(token, active_opts)
|
326
327
|
else
|
327
328
|
node << EscapeSequence::Control.new(token, active_opts)
|
@@ -329,6 +330,7 @@ class Regexp::Parser
|
|
329
330
|
|
330
331
|
when :meta_sequence
|
331
332
|
if token.text =~ /\A\\M-\\[Cc]/
|
333
|
+
# TODO: emit :meta_control_sequence token in v3.0.0
|
332
334
|
node << EscapeSequence::MetaControl.new(token, active_opts)
|
333
335
|
else
|
334
336
|
node << EscapeSequence::Meta.new(token, active_opts)
|
@@ -349,11 +351,7 @@ class Regexp::Parser
|
|
349
351
|
when :comment
|
350
352
|
node << Comment.new(token, active_opts)
|
351
353
|
when :whitespace
|
352
|
-
|
353
|
-
node.last.merge(WhiteSpace.new(token, active_opts))
|
354
|
-
else
|
355
|
-
node << WhiteSpace.new(token, active_opts)
|
356
|
-
end
|
354
|
+
node << WhiteSpace.new(token, active_opts)
|
357
355
|
else
|
358
356
|
raise UnknownTokenError.new('FreeSpace', token)
|
359
357
|
end
|
@@ -379,98 +377,99 @@ class Regexp::Parser
|
|
379
377
|
end
|
380
378
|
|
381
379
|
def sequence_operation(klass, token)
|
382
|
-
unless node.
|
380
|
+
unless node.instance_of?(klass)
|
383
381
|
operator = klass.new(token, active_opts)
|
384
|
-
sequence = operator.add_sequence(active_opts)
|
382
|
+
sequence = operator.add_sequence(active_opts, { ts: token.ts })
|
385
383
|
sequence.expressions = node.expressions
|
386
384
|
node.expressions = []
|
387
385
|
nest(operator)
|
388
386
|
end
|
389
|
-
node.add_sequence(active_opts)
|
387
|
+
node.add_sequence(active_opts, { ts: token.te })
|
390
388
|
end
|
391
389
|
|
392
390
|
def posixclass(token)
|
393
391
|
node << PosixClass.new(token, active_opts)
|
394
392
|
end
|
395
393
|
|
396
|
-
|
397
|
-
UPTokens = Regexp::Syntax::Token::
|
394
|
+
UP = Regexp::Expression::Property
|
395
|
+
UPTokens = Regexp::Syntax::Token::Property
|
398
396
|
|
399
397
|
def property(token)
|
400
398
|
case token.token
|
401
|
-
when :alnum; node << Alnum.new(token, active_opts)
|
402
|
-
when :alpha; node << Alpha.new(token, active_opts)
|
403
|
-
when :ascii; node << Ascii.new(token, active_opts)
|
404
|
-
when :blank; node << Blank.new(token, active_opts)
|
405
|
-
when :cntrl; node << Cntrl.new(token, active_opts)
|
406
|
-
when :digit; node << Digit.new(token, active_opts)
|
407
|
-
when :graph; node << Graph.new(token, active_opts)
|
408
|
-
when :lower; node << Lower.new(token, active_opts)
|
409
|
-
when :print; node << Print.new(token, active_opts)
|
410
|
-
when :punct; node << Punct.new(token, active_opts)
|
411
|
-
when :space; node << Space.new(token, active_opts)
|
412
|
-
when :upper; node << Upper.new(token, active_opts)
|
413
|
-
when :word; node << Word.new(token, active_opts)
|
414
|
-
when :xdigit; node << Xdigit.new(token, active_opts)
|
415
|
-
when :xposixpunct; node << XPosixPunct.new(token, active_opts)
|
399
|
+
when :alnum; node << UP::Alnum.new(token, active_opts)
|
400
|
+
when :alpha; node << UP::Alpha.new(token, active_opts)
|
401
|
+
when :ascii; node << UP::Ascii.new(token, active_opts)
|
402
|
+
when :blank; node << UP::Blank.new(token, active_opts)
|
403
|
+
when :cntrl; node << UP::Cntrl.new(token, active_opts)
|
404
|
+
when :digit; node << UP::Digit.new(token, active_opts)
|
405
|
+
when :graph; node << UP::Graph.new(token, active_opts)
|
406
|
+
when :lower; node << UP::Lower.new(token, active_opts)
|
407
|
+
when :print; node << UP::Print.new(token, active_opts)
|
408
|
+
when :punct; node << UP::Punct.new(token, active_opts)
|
409
|
+
when :space; node << UP::Space.new(token, active_opts)
|
410
|
+
when :upper; node << UP::Upper.new(token, active_opts)
|
411
|
+
when :word; node << UP::Word.new(token, active_opts)
|
412
|
+
when :xdigit; node << UP::Xdigit.new(token, active_opts)
|
413
|
+
when :xposixpunct; node << UP::XPosixPunct.new(token, active_opts)
|
416
414
|
|
417
415
|
# only in Oniguruma (old rubies)
|
418
|
-
when :newline; node << Newline.new(token, active_opts)
|
419
|
-
|
420
|
-
when :any; node << Any.new(token, active_opts)
|
421
|
-
when :assigned; node << Assigned.new(token, active_opts)
|
422
|
-
|
423
|
-
when :letter; node << Letter::Any.new(token, active_opts)
|
424
|
-
when :cased_letter; node << Letter::Cased.new(token, active_opts)
|
425
|
-
when :uppercase_letter; node << Letter::Uppercase.new(token, active_opts)
|
426
|
-
when :lowercase_letter; node << Letter::Lowercase.new(token, active_opts)
|
427
|
-
when :titlecase_letter; node << Letter::Titlecase.new(token, active_opts)
|
428
|
-
when :modifier_letter; node << Letter::Modifier.new(token, active_opts)
|
429
|
-
when :other_letter; node << Letter::Other.new(token, active_opts)
|
430
|
-
|
431
|
-
when :mark; node << Mark::Any.new(token, active_opts)
|
432
|
-
when :combining_mark; node << Mark::Combining.new(token, active_opts)
|
433
|
-
when :nonspacing_mark; node << Mark::Nonspacing.new(token, active_opts)
|
434
|
-
when :spacing_mark; node << Mark::Spacing.new(token, active_opts)
|
435
|
-
when :enclosing_mark; node << Mark::Enclosing.new(token, active_opts)
|
436
|
-
|
437
|
-
when :number; node << Number::Any.new(token, active_opts)
|
438
|
-
when :decimal_number; node << Number::Decimal.new(token, active_opts)
|
439
|
-
when :letter_number; node << Number::Letter.new(token, active_opts)
|
440
|
-
when :other_number; node << Number::Other.new(token, active_opts)
|
441
|
-
|
442
|
-
when :punctuation; node << Punctuation::Any.new(token, active_opts)
|
443
|
-
when :connector_punctuation; node << Punctuation::Connector.new(token, active_opts)
|
444
|
-
when :dash_punctuation; node << Punctuation::Dash.new(token, active_opts)
|
445
|
-
when :open_punctuation; node << Punctuation::Open.new(token, active_opts)
|
446
|
-
when :close_punctuation; node << Punctuation::Close.new(token, active_opts)
|
447
|
-
when :initial_punctuation; node << Punctuation::Initial.new(token, active_opts)
|
448
|
-
when :final_punctuation; node << Punctuation::Final.new(token, active_opts)
|
449
|
-
when :other_punctuation; node << Punctuation::Other.new(token, active_opts)
|
450
|
-
|
451
|
-
when :separator; node << Separator::Any.new(token, active_opts)
|
452
|
-
when :space_separator; node << Separator::Space.new(token, active_opts)
|
453
|
-
when :line_separator; node << Separator::Line.new(token, active_opts)
|
454
|
-
when :paragraph_separator; node << Separator::Paragraph.new(token, active_opts)
|
455
|
-
|
456
|
-
when :symbol; node << Symbol::Any.new(token, active_opts)
|
457
|
-
when :math_symbol; node << Symbol::Math.new(token, active_opts)
|
458
|
-
when :currency_symbol; node << Symbol::Currency.new(token, active_opts)
|
459
|
-
when :modifier_symbol; node << Symbol::Modifier.new(token, active_opts)
|
460
|
-
when :other_symbol; node << Symbol::Other.new(token, active_opts)
|
461
|
-
|
462
|
-
when :other; node << Codepoint::Any.new(token, active_opts)
|
463
|
-
when :control; node << Codepoint::Control.new(token, active_opts)
|
464
|
-
when :format; node << Codepoint::Format.new(token, active_opts)
|
465
|
-
when :surrogate; node << Codepoint::Surrogate.new(token, active_opts)
|
466
|
-
when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
|
467
|
-
when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
|
468
|
-
|
469
|
-
when *UPTokens::Age; node << Age.new(token, active_opts)
|
470
|
-
when *UPTokens::Derived; node << Derived.new(token, active_opts)
|
471
|
-
when *UPTokens::Emoji; node << Emoji.new(token, active_opts)
|
472
|
-
when *UPTokens::
|
473
|
-
when *UPTokens::
|
416
|
+
when :newline; node << UP::Newline.new(token, active_opts)
|
417
|
+
|
418
|
+
when :any; node << UP::Any.new(token, active_opts)
|
419
|
+
when :assigned; node << UP::Assigned.new(token, active_opts)
|
420
|
+
|
421
|
+
when :letter; node << UP::Letter::Any.new(token, active_opts)
|
422
|
+
when :cased_letter; node << UP::Letter::Cased.new(token, active_opts)
|
423
|
+
when :uppercase_letter; node << UP::Letter::Uppercase.new(token, active_opts)
|
424
|
+
when :lowercase_letter; node << UP::Letter::Lowercase.new(token, active_opts)
|
425
|
+
when :titlecase_letter; node << UP::Letter::Titlecase.new(token, active_opts)
|
426
|
+
when :modifier_letter; node << UP::Letter::Modifier.new(token, active_opts)
|
427
|
+
when :other_letter; node << UP::Letter::Other.new(token, active_opts)
|
428
|
+
|
429
|
+
when :mark; node << UP::Mark::Any.new(token, active_opts)
|
430
|
+
when :combining_mark; node << UP::Mark::Combining.new(token, active_opts)
|
431
|
+
when :nonspacing_mark; node << UP::Mark::Nonspacing.new(token, active_opts)
|
432
|
+
when :spacing_mark; node << UP::Mark::Spacing.new(token, active_opts)
|
433
|
+
when :enclosing_mark; node << UP::Mark::Enclosing.new(token, active_opts)
|
434
|
+
|
435
|
+
when :number; node << UP::Number::Any.new(token, active_opts)
|
436
|
+
when :decimal_number; node << UP::Number::Decimal.new(token, active_opts)
|
437
|
+
when :letter_number; node << UP::Number::Letter.new(token, active_opts)
|
438
|
+
when :other_number; node << UP::Number::Other.new(token, active_opts)
|
439
|
+
|
440
|
+
when :punctuation; node << UP::Punctuation::Any.new(token, active_opts)
|
441
|
+
when :connector_punctuation; node << UP::Punctuation::Connector.new(token, active_opts)
|
442
|
+
when :dash_punctuation; node << UP::Punctuation::Dash.new(token, active_opts)
|
443
|
+
when :open_punctuation; node << UP::Punctuation::Open.new(token, active_opts)
|
444
|
+
when :close_punctuation; node << UP::Punctuation::Close.new(token, active_opts)
|
445
|
+
when :initial_punctuation; node << UP::Punctuation::Initial.new(token, active_opts)
|
446
|
+
when :final_punctuation; node << UP::Punctuation::Final.new(token, active_opts)
|
447
|
+
when :other_punctuation; node << UP::Punctuation::Other.new(token, active_opts)
|
448
|
+
|
449
|
+
when :separator; node << UP::Separator::Any.new(token, active_opts)
|
450
|
+
when :space_separator; node << UP::Separator::Space.new(token, active_opts)
|
451
|
+
when :line_separator; node << UP::Separator::Line.new(token, active_opts)
|
452
|
+
when :paragraph_separator; node << UP::Separator::Paragraph.new(token, active_opts)
|
453
|
+
|
454
|
+
when :symbol; node << UP::Symbol::Any.new(token, active_opts)
|
455
|
+
when :math_symbol; node << UP::Symbol::Math.new(token, active_opts)
|
456
|
+
when :currency_symbol; node << UP::Symbol::Currency.new(token, active_opts)
|
457
|
+
when :modifier_symbol; node << UP::Symbol::Modifier.new(token, active_opts)
|
458
|
+
when :other_symbol; node << UP::Symbol::Other.new(token, active_opts)
|
459
|
+
|
460
|
+
when :other; node << UP::Codepoint::Any.new(token, active_opts)
|
461
|
+
when :control; node << UP::Codepoint::Control.new(token, active_opts)
|
462
|
+
when :format; node << UP::Codepoint::Format.new(token, active_opts)
|
463
|
+
when :surrogate; node << UP::Codepoint::Surrogate.new(token, active_opts)
|
464
|
+
when :private_use; node << UP::Codepoint::PrivateUse.new(token, active_opts)
|
465
|
+
when :unassigned; node << UP::Codepoint::Unassigned.new(token, active_opts)
|
466
|
+
|
467
|
+
when *UPTokens::Age; node << UP::Age.new(token, active_opts)
|
468
|
+
when *UPTokens::Derived; node << UP::Derived.new(token, active_opts)
|
469
|
+
when *UPTokens::Emoji; node << UP::Emoji.new(token, active_opts)
|
470
|
+
when *UPTokens::Enumerated; node << UP::Enumerated.new(token, active_opts)
|
471
|
+
when *UPTokens::Script; node << UP::Script.new(token, active_opts)
|
472
|
+
when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts)
|
474
473
|
|
475
474
|
else
|
476
475
|
raise UnknownTokenError.new('UnicodeProperty', token)
|
@@ -478,8 +477,7 @@ class Regexp::Parser
|
|
478
477
|
end
|
479
478
|
|
480
479
|
def quantifier(token)
|
481
|
-
target_node = node.
|
482
|
-
target_node or raise ParserError, "No valid target found for '#{token.text}'"
|
480
|
+
target_node = node.extract_quantifier_target(token.text)
|
483
481
|
|
484
482
|
# in case of chained quantifiers, wrap target in an implicit passive group
|
485
483
|
# description of the problem: https://github.com/ammar/regexp_parser/issues/3
|
@@ -527,6 +525,8 @@ class Regexp::Parser
|
|
527
525
|
end
|
528
526
|
|
529
527
|
def open_set(token)
|
528
|
+
# TODO: this and Quantifier are the only cases where Expression#token
|
529
|
+
# does not match the scanner/lexer output. Fix in v3.0.0.
|
530
530
|
token.token = :character
|
531
531
|
nest(CharacterSet.new(token, active_opts))
|
532
532
|
end
|
@@ -541,7 +541,7 @@ class Regexp::Parser
|
|
541
541
|
|
542
542
|
def range(token)
|
543
543
|
exp = CharacterSet::Range.new(token, active_opts)
|
544
|
-
scope = node.last.
|
544
|
+
scope = node.last.instance_of?(CharacterSet::IntersectedSequence) ? node.last : node
|
545
545
|
exp << scope.expressions.pop
|
546
546
|
nest(exp)
|
547
547
|
end
|
@@ -568,28 +568,29 @@ class Regexp::Parser
|
|
568
568
|
end
|
569
569
|
|
570
570
|
def close_completed_character_set_range
|
571
|
-
decrease_nesting if node.
|
571
|
+
decrease_nesting if node.instance_of?(CharacterSet::Range) && node.complete?
|
572
572
|
end
|
573
573
|
|
574
574
|
def active_opts
|
575
575
|
options_stack.last
|
576
576
|
end
|
577
577
|
|
578
|
-
# Assigns referenced expressions to
|
578
|
+
# Assigns referenced expressions to referring expressions, e.g. if there is
|
579
579
|
# an instance of Backreference::Number, its #referenced_expression is set to
|
580
580
|
# the instance of Group::Capture that it refers to via its number.
|
581
581
|
def assign_referenced_expressions
|
582
|
-
# find all
|
582
|
+
# find all referenceable and referring expressions
|
583
583
|
targets = { 0 => root }
|
584
|
+
referrers = []
|
584
585
|
root.each_expression do |exp|
|
585
586
|
exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
|
587
|
+
referrers << exp if exp.referential?
|
586
588
|
end
|
587
|
-
# assign
|
588
|
-
|
589
|
-
|
590
|
-
|
589
|
+
# assign referenced expressions to referring expressions
|
590
|
+
# (in a second iteration because there might be forward references)
|
591
|
+
referrers.each do |exp|
|
591
592
|
exp.referenced_expression = targets[exp.reference] ||
|
592
|
-
raise(ParserError, "Invalid reference
|
593
|
+
raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}")
|
593
594
|
end
|
594
595
|
end
|
595
596
|
end # module Regexp::Parser
|
@@ -0,0 +1,63 @@
|
|
1
|
+
class Regexp::Scanner
|
2
|
+
# Base for all scanner validation errors
|
3
|
+
class ValidationError < ScannerError
|
4
|
+
# Centralizes and unifies the handling of validation related errors.
|
5
|
+
def self.for(type, problem, reason = nil)
|
6
|
+
types.fetch(type).new(problem, reason)
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.types
|
10
|
+
@types ||= {
|
11
|
+
backref: InvalidBackrefError,
|
12
|
+
group: InvalidGroupError,
|
13
|
+
group_option: InvalidGroupOption,
|
14
|
+
posix_class: UnknownPosixClassError,
|
15
|
+
property: UnknownUnicodePropertyError,
|
16
|
+
sequence: InvalidSequenceError,
|
17
|
+
}
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
# Invalid sequence format. Used for escape sequences, mainly.
|
22
|
+
class InvalidSequenceError < ValidationError
|
23
|
+
def initialize(what = 'sequence', where = '')
|
24
|
+
super "Invalid #{what} at #{where}"
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
# Invalid group. Used for named groups.
|
29
|
+
class InvalidGroupError < ValidationError
|
30
|
+
def initialize(what, reason)
|
31
|
+
super "Invalid #{what}, #{reason}."
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
# Invalid group option. Used for inline options.
|
36
|
+
# TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
|
37
|
+
class InvalidGroupOption < ValidationError
|
38
|
+
def initialize(option, text)
|
39
|
+
super "Invalid group option #{option} in #{text}"
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
# Invalid back reference. Used for name and number refs/calls.
|
44
|
+
class InvalidBackrefError < ValidationError
|
45
|
+
def initialize(what, reason)
|
46
|
+
super "Invalid back reference #{what}, #{reason}"
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
# The property name was not recognized by the scanner.
|
51
|
+
class UnknownUnicodePropertyError < ValidationError
|
52
|
+
def initialize(name, _)
|
53
|
+
super "Unknown unicode character property name #{name}"
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
# The POSIX class name was not recognized by the scanner.
|
58
|
+
class UnknownPosixClassError < ValidationError
|
59
|
+
def initialize(text, _)
|
60
|
+
super "Unknown POSIX class #{text}"
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
@@ -7,6 +7,8 @@ age=12.0,age=12.0
|
|
7
7
|
age=12.1,age=12.1
|
8
8
|
age=13.0,age=13.0
|
9
9
|
age=14.0,age=14.0
|
10
|
+
age=15.0,age=15.0
|
11
|
+
age=15.1,age=15.1
|
10
12
|
age=2.0,age=2.0
|
11
13
|
age=2.1,age=2.1
|
12
14
|
age=3.0,age=3.0
|
@@ -97,6 +99,7 @@ emojimodifierbase,emoji_modifier_base
|
|
97
99
|
emojipresentation,emoji_presentation
|
98
100
|
enclosingmark,enclosing_mark
|
99
101
|
ethiopic,ethiopic
|
102
|
+
extendedpictographic,extended_pictographic
|
100
103
|
extender,extender
|
101
104
|
finalpunctuation,final_punctuation
|
102
105
|
format,format
|
@@ -106,6 +109,19 @@ gothic,gothic
|
|
106
109
|
grantha,grantha
|
107
110
|
graph,graph
|
108
111
|
graphemebase,grapheme_base
|
112
|
+
graphemeclusterbreak=control,grapheme_cluster_break=control
|
113
|
+
graphemeclusterbreak=cr,grapheme_cluster_break=cr
|
114
|
+
graphemeclusterbreak=extend,grapheme_cluster_break=extend
|
115
|
+
graphemeclusterbreak=l,grapheme_cluster_break=l
|
116
|
+
graphemeclusterbreak=lf,grapheme_cluster_break=lf
|
117
|
+
graphemeclusterbreak=lv,grapheme_cluster_break=lv
|
118
|
+
graphemeclusterbreak=lvt,grapheme_cluster_break=lvt
|
119
|
+
graphemeclusterbreak=prepend,grapheme_cluster_break=prepend
|
120
|
+
graphemeclusterbreak=regionalindicator,grapheme_cluster_break=regional_indicator
|
121
|
+
graphemeclusterbreak=spacingmark,grapheme_cluster_break=spacingmark
|
122
|
+
graphemeclusterbreak=t,grapheme_cluster_break=t
|
123
|
+
graphemeclusterbreak=v,grapheme_cluster_break=v
|
124
|
+
graphemeclusterbreak=zwj,grapheme_cluster_break=zwj
|
109
125
|
graphemeextend,grapheme_extend
|
110
126
|
graphemelink,grapheme_link
|
111
127
|
greek,greek
|
@@ -121,11 +137,14 @@ hebrew,hebrew
|
|
121
137
|
hexdigit,hex_digit
|
122
138
|
hiragana,hiragana
|
123
139
|
hyphen,hyphen
|
140
|
+
idcompatmathcontinue,id_compat_math_continue
|
141
|
+
idcompatmathstart,id_compat_math_start
|
124
142
|
idcontinue,id_continue
|
125
143
|
ideographic,ideographic
|
126
144
|
idsbinaryoperator,ids_binary_operator
|
127
145
|
idstart,id_start
|
128
146
|
idstrinaryoperator,ids_trinary_operator
|
147
|
+
idsunaryoperator,ids_unary_operator
|
129
148
|
imperialaramaic,imperial_aramaic
|
130
149
|
inadlam,in_adlam
|
131
150
|
inaegeannumbers,in_aegean_numbers
|
@@ -139,6 +158,7 @@ inancientsymbols,in_ancient_symbols
|
|
139
158
|
inarabic,in_arabic
|
140
159
|
inarabicextendeda,in_arabic_extended_a
|
141
160
|
inarabicextendedb,in_arabic_extended_b
|
161
|
+
inarabicextendedc,in_arabic_extended_c
|
142
162
|
inarabicmathematicalalphabeticsymbols,in_arabic_mathematical_alphabetic_symbols
|
143
163
|
inarabicpresentationformsa,in_arabic_presentation_forms_a
|
144
164
|
inarabicpresentationformsb,in_arabic_presentation_forms_b
|
@@ -186,6 +206,8 @@ incjkunifiedideographsextensiond,in_cjk_unified_ideographs_extension_d
|
|
186
206
|
incjkunifiedideographsextensione,in_cjk_unified_ideographs_extension_e
|
187
207
|
incjkunifiedideographsextensionf,in_cjk_unified_ideographs_extension_f
|
188
208
|
incjkunifiedideographsextensiong,in_cjk_unified_ideographs_extension_g
|
209
|
+
incjkunifiedideographsextensionh,in_cjk_unified_ideographs_extension_h
|
210
|
+
incjkunifiedideographsextensioni,in_cjk_unified_ideographs_extension_i
|
189
211
|
incombiningdiacriticalmarks,in_combining_diacritical_marks
|
190
212
|
incombiningdiacriticalmarksextended,in_combining_diacritical_marks_extended
|
191
213
|
incombiningdiacriticalmarksforsymbols,in_combining_diacritical_marks_for_symbols
|
@@ -205,10 +227,12 @@ incyrillic,in_cyrillic
|
|
205
227
|
incyrillicextendeda,in_cyrillic_extended_a
|
206
228
|
incyrillicextendedb,in_cyrillic_extended_b
|
207
229
|
incyrillicextendedc,in_cyrillic_extended_c
|
230
|
+
incyrillicextendedd,in_cyrillic_extended_d
|
208
231
|
incyrillicsupplement,in_cyrillic_supplement
|
209
232
|
indeseret,in_deseret
|
210
233
|
indevanagari,in_devanagari
|
211
234
|
indevanagariextended,in_devanagari_extended
|
235
|
+
indevanagariextendeda,in_devanagari_extended_a
|
212
236
|
indingbats,in_dingbats
|
213
237
|
indivesakuru,in_dives_akuru
|
214
238
|
indogra,in_dogra
|
@@ -268,6 +292,7 @@ inipaextensions,in_ipa_extensions
|
|
268
292
|
initialpunctuation,initial_punctuation
|
269
293
|
injavanese,in_javanese
|
270
294
|
inkaithi,in_kaithi
|
295
|
+
inkaktoviknumerals,in_kaktovik_numerals
|
271
296
|
inkanaextendeda,in_kana_extended_a
|
272
297
|
inkanaextendedb,in_kana_extended_b
|
273
298
|
inkanasupplement,in_kana_supplement
|
@@ -276,6 +301,7 @@ inkangxiradicals,in_kangxi_radicals
|
|
276
301
|
inkannada,in_kannada
|
277
302
|
inkatakana,in_katakana
|
278
303
|
inkatakanaphoneticextensions,in_katakana_phonetic_extensions
|
304
|
+
inkawi,in_kawi
|
279
305
|
inkayahli,in_kayah_li
|
280
306
|
inkharoshthi,in_kharoshthi
|
281
307
|
inkhitansmallscript,in_khitan_small_script
|
@@ -339,6 +365,7 @@ inmyanmar,in_myanmar
|
|
339
365
|
inmyanmarextendeda,in_myanmar_extended_a
|
340
366
|
inmyanmarextendedb,in_myanmar_extended_b
|
341
367
|
innabataean,in_nabataean
|
368
|
+
innagmundari,in_nag_mundari
|
342
369
|
innandinagari,in_nandinagari
|
343
370
|
innewa,in_newa
|
344
371
|
innewtailue,in_new_tai_lue
|
@@ -457,6 +484,7 @@ joincontrol,join_control
|
|
457
484
|
kaithi,kaithi
|
458
485
|
kannada,kannada
|
459
486
|
katakana,katakana
|
487
|
+
kawi,kawi
|
460
488
|
kayahli,kayah_li
|
461
489
|
kharoshthi,kharoshthi
|
462
490
|
khitansmallscript,khitan_small_script
|
@@ -503,6 +531,7 @@ mro,mro
|
|
503
531
|
multani,multani
|
504
532
|
myanmar,myanmar
|
505
533
|
nabataean,nabataean
|
534
|
+
nagmundari,nag_mundari
|
506
535
|
nandinagari,nandinagari
|
507
536
|
newa,newa
|
508
537
|
newline,newline
|
@@ -57,6 +57,7 @@ emod,emoji_modifier
|
|
57
57
|
epres,emoji_presentation
|
58
58
|
ethi,ethiopic
|
59
59
|
ext,extender
|
60
|
+
extpict,extended_pictographic
|
60
61
|
geor,georgian
|
61
62
|
glag,glagolitic
|
62
63
|
gong,gunjala_gondi
|
@@ -85,6 +86,7 @@ ideo,ideographic
|
|
85
86
|
ids,id_start
|
86
87
|
idsb,ids_binary_operator
|
87
88
|
idst,ids_trinary_operator
|
89
|
+
idsu,ids_unary_operator
|
88
90
|
ital,old_italic
|
89
91
|
java,javanese
|
90
92
|
joinc,join_control
|
@@ -133,6 +135,7 @@ mtei,meetei_mayek
|
|
133
135
|
mult,multani
|
134
136
|
mymr,myanmar
|
135
137
|
n,number
|
138
|
+
nagm,nag_mundari
|
136
139
|
nand,nandinagari
|
137
140
|
narb,old_north_arabian
|
138
141
|
nbat,nabataean
|
@@ -17,10 +17,10 @@
|
|
17
17
|
text = copy(data, ts-1, te)
|
18
18
|
type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
|
19
19
|
|
20
|
-
name =
|
20
|
+
name = text[3..-2].gsub(/[\^\s_\-]/, '').downcase
|
21
21
|
|
22
22
|
token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
|
23
|
-
|
23
|
+
raise ValidationError.for(:property, name) unless token
|
24
24
|
|
25
25
|
self.emit(type, token.to_sym, text)
|
26
26
|
|