RubyGems - regexp_parser - Versions diffs - 2.6.0 → 2.9.2 - Mend

regexp_parser 2.6.0 → 2.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54)

checksums.yaml +4 -4
data/Gemfile +5 -5
data/LICENSE +1 -1
data/lib/regexp_parser/expression/base.rb +0 -7
data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
data/lib/regexp_parser/expression/classes/backreference.rb +17 -3
data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
data/lib/regexp_parser/expression/classes/character_set.rb +4 -8
data/lib/regexp_parser/expression/classes/conditional.rb +2 -6
data/lib/regexp_parser/expression/classes/escape_sequence.rb +3 -1
data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
data/lib/regexp_parser/expression/classes/group.rb +0 -22
data/lib/regexp_parser/expression/classes/keep.rb +1 -1
data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
data/lib/regexp_parser/expression/classes/unicode_property.rb +11 -11
data/lib/regexp_parser/expression/methods/construct.rb +2 -4
data/lib/regexp_parser/expression/methods/match_length.rb +8 -4
data/lib/regexp_parser/expression/methods/negative.rb +20 -0
data/lib/regexp_parser/expression/methods/parts.rb +23 -0
data/lib/regexp_parser/expression/methods/printing.rb +26 -0
data/lib/regexp_parser/expression/methods/tests.rb +40 -3
data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
data/lib/regexp_parser/expression/quantifier.rb +30 -17
data/lib/regexp_parser/expression/sequence.rb +5 -10
data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
data/lib/regexp_parser/expression/shared.rb +37 -20
data/lib/regexp_parser/expression/subexpression.rb +20 -15
data/lib/regexp_parser/expression.rb +34 -31
data/lib/regexp_parser/lexer.rb +76 -36
data/lib/regexp_parser/parser.rb +101 -100
data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
data/lib/regexp_parser/scanner/properties/long.csv +29 -0
data/lib/regexp_parser/scanner/properties/short.csv +3 -0
data/lib/regexp_parser/scanner/property.rl +2 -2
data/lib/regexp_parser/scanner/scanner.rl +101 -172
data/lib/regexp_parser/scanner.rb +1132 -1283
data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
data/lib/regexp_parser/syntax/token/escape.rb +3 -1
data/lib/regexp_parser/syntax/token/meta.rb +9 -2
data/lib/regexp_parser/syntax/token/unicode_property.rb +35 -1
data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
data/lib/regexp_parser/syntax/token.rb +13 -13
data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
data/lib/regexp_parser/syntax/versions.rb +3 -1
data/lib/regexp_parser/syntax.rb +1 -1
data/lib/regexp_parser/version.rb +1 -1
data/lib/regexp_parser.rb +6 -6
data/regexp_parser.gemspec +5 -5
metadata +14 -8
data/CHANGELOG.md +0 -601
data/README.md +0 -503

data/lib/regexp_parser/parser.rb CHANGED Viewed

@@ -1,5 +1,5 @@
-require 'regexp_parser/error'
-require 'regexp_parser/expression'
+require_relative 'error'
+require_relative 'expression'
 class Regexp::Parser
   include Regexp::Expression
@@ -18,11 +18,11 @@ class Regexp::Parser
     end
   end
-  def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
+  def self.parse(input, syntax = nil, options: nil, &block)
     new.parse(input, syntax, options: options, &block)
   end
-  def parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
+  def parse(input, syntax = nil, options: nil, &block)
     root = Root.construct(options: extract_options(input, options))
     self.root = root
@@ -35,7 +35,7 @@ class Regexp::Parser
     self.captured_group_counts = Hash.new(0)
-    Regexp::Lexer.scan(input, syntax, options: options) do |token|
+    Regexp::Lexer.scan(input, syntax, options: options, collect_tokens: false) do |token|
       parse_token(token)
     end
@@ -232,7 +232,7 @@ class Regexp::Parser
       node << Backreference::NameRecursionLevel.new(token, active_opts)
     when :name_call
       node << Backreference::NameCall.new(token, active_opts)
-    when :number, :number_ref
+    when :number, :number_ref # TODO: split in v3.0.0
       node << Backreference::Number.new(token, active_opts)
     when :number_recursion_ref
       node << Backreference::NumberRecursionLevel.new(token, active_opts).tap do |exp|
@@ -272,9 +272,9 @@ class Regexp::Parser
       nest_conditional(Conditional::Expression.new(token, active_opts))
     when :condition
       conditional_nesting.last.condition = Conditional::Condition.new(token, active_opts)
-      conditional_nesting.last.add_sequence(active_opts)
+      conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
     when :separator
-      conditional_nesting.last.add_sequence(active_opts)
+      conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
       self.node = conditional_nesting.last.branches.last
     when :close
       conditional_nesting.pop
@@ -322,6 +322,7 @@ class Regexp::Parser
     when :control
       if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
+        # TODO: emit :meta_control_sequence token in v3.0.0
         node << EscapeSequence::MetaControl.new(token, active_opts)
       else
         node << EscapeSequence::Control.new(token, active_opts)
@@ -329,6 +330,7 @@ class Regexp::Parser
     when :meta_sequence
       if token.text =~ /\A\\M-\\[Cc]/
+        # TODO: emit :meta_control_sequence token in v3.0.0
         node << EscapeSequence::MetaControl.new(token, active_opts)
       else
         node << EscapeSequence::Meta.new(token, active_opts)
@@ -349,11 +351,7 @@ class Regexp::Parser
     when :comment
       node << Comment.new(token, active_opts)
     when :whitespace
-      if node.last.is_a?(WhiteSpace)
-        node.last.merge(WhiteSpace.new(token, active_opts))
-      else
-        node << WhiteSpace.new(token, active_opts)
-      end
+      node << WhiteSpace.new(token, active_opts)
     else
       raise UnknownTokenError.new('FreeSpace', token)
     end
@@ -379,98 +377,99 @@ class Regexp::Parser
   end
   def sequence_operation(klass, token)
-    unless node.is_a?(klass)
+    unless node.instance_of?(klass)
       operator = klass.new(token, active_opts)
-      sequence = operator.add_sequence(active_opts)
+      sequence = operator.add_sequence(active_opts, { ts: token.ts })
       sequence.expressions = node.expressions
       node.expressions = []
       nest(operator)
     end
-    node.add_sequence(active_opts)
+    node.add_sequence(active_opts, { ts: token.te })
   end
   def posixclass(token)
     node << PosixClass.new(token, active_opts)
   end
-  include Regexp::Expression::UnicodeProperty
-  UPTokens = Regexp::Syntax::Token::UnicodeProperty
+  UP = Regexp::Expression::Property
+  UPTokens = Regexp::Syntax::Token::Property
   def property(token)
     case token.token
-    when :alnum;                  node << Alnum.new(token, active_opts)
-    when :alpha;                  node << Alpha.new(token, active_opts)
-    when :ascii;                  node << Ascii.new(token, active_opts)
-    when :blank;                  node << Blank.new(token, active_opts)
-    when :cntrl;                  node << Cntrl.new(token, active_opts)
-    when :digit;                  node << Digit.new(token, active_opts)
-    when :graph;                  node << Graph.new(token, active_opts)
-    when :lower;                  node << Lower.new(token, active_opts)
-    when :print;                  node << Print.new(token, active_opts)
-    when :punct;                  node << Punct.new(token, active_opts)
-    when :space;                  node << Space.new(token, active_opts)
-    when :upper;                  node << Upper.new(token, active_opts)
-    when :word;                   node << Word.new(token, active_opts)
-    when :xdigit;                 node << Xdigit.new(token, active_opts)
-    when :xposixpunct;            node << XPosixPunct.new(token, active_opts)
+    when :alnum;                  node << UP::Alnum.new(token, active_opts)
+    when :alpha;                  node << UP::Alpha.new(token, active_opts)
+    when :ascii;                  node << UP::Ascii.new(token, active_opts)
+    when :blank;                  node << UP::Blank.new(token, active_opts)
+    when :cntrl;                  node << UP::Cntrl.new(token, active_opts)
+    when :digit;                  node << UP::Digit.new(token, active_opts)
+    when :graph;                  node << UP::Graph.new(token, active_opts)
+    when :lower;                  node << UP::Lower.new(token, active_opts)
+    when :print;                  node << UP::Print.new(token, active_opts)
+    when :punct;                  node << UP::Punct.new(token, active_opts)
+    when :space;                  node << UP::Space.new(token, active_opts)
+    when :upper;                  node << UP::Upper.new(token, active_opts)
+    when :word;                   node << UP::Word.new(token, active_opts)
+    when :xdigit;                 node << UP::Xdigit.new(token, active_opts)
+    when :xposixpunct;            node << UP::XPosixPunct.new(token, active_opts)
     # only in Oniguruma (old rubies)
-    when :newline;                node << Newline.new(token, active_opts)
-    when :any;                    node << Any.new(token, active_opts)
-    when :assigned;               node << Assigned.new(token, active_opts)
-    when :letter;                 node << Letter::Any.new(token, active_opts)
-    when :cased_letter;           node << Letter::Cased.new(token, active_opts)
-    when :uppercase_letter;       node << Letter::Uppercase.new(token, active_opts)
-    when :lowercase_letter;       node << Letter::Lowercase.new(token, active_opts)
-    when :titlecase_letter;       node << Letter::Titlecase.new(token, active_opts)
-    when :modifier_letter;        node << Letter::Modifier.new(token, active_opts)
-    when :other_letter;           node << Letter::Other.new(token, active_opts)
-    when :mark;                   node << Mark::Any.new(token, active_opts)
-    when :combining_mark;         node << Mark::Combining.new(token, active_opts)
-    when :nonspacing_mark;        node << Mark::Nonspacing.new(token, active_opts)
-    when :spacing_mark;           node << Mark::Spacing.new(token, active_opts)
-    when :enclosing_mark;         node << Mark::Enclosing.new(token, active_opts)
-    when :number;                 node << Number::Any.new(token, active_opts)
-    when :decimal_number;         node << Number::Decimal.new(token, active_opts)
-    when :letter_number;          node << Number::Letter.new(token, active_opts)
-    when :other_number;           node << Number::Other.new(token, active_opts)
-    when :punctuation;            node << Punctuation::Any.new(token, active_opts)
-    when :connector_punctuation;  node << Punctuation::Connector.new(token, active_opts)
-    when :dash_punctuation;       node << Punctuation::Dash.new(token, active_opts)
-    when :open_punctuation;       node << Punctuation::Open.new(token, active_opts)
-    when :close_punctuation;      node << Punctuation::Close.new(token, active_opts)
-    when :initial_punctuation;    node << Punctuation::Initial.new(token, active_opts)
-    when :final_punctuation;      node << Punctuation::Final.new(token, active_opts)
-    when :other_punctuation;      node << Punctuation::Other.new(token, active_opts)
-    when :separator;              node << Separator::Any.new(token, active_opts)
-    when :space_separator;        node << Separator::Space.new(token, active_opts)
-    when :line_separator;         node << Separator::Line.new(token, active_opts)
-    when :paragraph_separator;    node << Separator::Paragraph.new(token, active_opts)
-    when :symbol;                 node << Symbol::Any.new(token, active_opts)
-    when :math_symbol;            node << Symbol::Math.new(token, active_opts)
-    when :currency_symbol;        node << Symbol::Currency.new(token, active_opts)
-    when :modifier_symbol;        node << Symbol::Modifier.new(token, active_opts)
-    when :other_symbol;           node << Symbol::Other.new(token, active_opts)
-    when :other;                  node << Codepoint::Any.new(token, active_opts)
-    when :control;                node << Codepoint::Control.new(token, active_opts)
-    when :format;                 node << Codepoint::Format.new(token, active_opts)
-    when :surrogate;              node << Codepoint::Surrogate.new(token, active_opts)
-    when :private_use;            node << Codepoint::PrivateUse.new(token, active_opts)
-    when :unassigned;             node << Codepoint::Unassigned.new(token, active_opts)
-    when *UPTokens::Age;          node << Age.new(token, active_opts)
-    when *UPTokens::Derived;      node << Derived.new(token, active_opts)
-    when *UPTokens::Emoji;        node << Emoji.new(token, active_opts)
-    when *UPTokens::Script;       node << Script.new(token, active_opts)
-    when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts)
+    when :newline;                node << UP::Newline.new(token, active_opts)
+    when :any;                    node << UP::Any.new(token, active_opts)
+    when :assigned;               node << UP::Assigned.new(token, active_opts)
+    when :letter;                 node << UP::Letter::Any.new(token, active_opts)
+    when :cased_letter;           node << UP::Letter::Cased.new(token, active_opts)
+    when :uppercase_letter;       node << UP::Letter::Uppercase.new(token, active_opts)
+    when :lowercase_letter;       node << UP::Letter::Lowercase.new(token, active_opts)
+    when :titlecase_letter;       node << UP::Letter::Titlecase.new(token, active_opts)
+    when :modifier_letter;        node << UP::Letter::Modifier.new(token, active_opts)
+    when :other_letter;           node << UP::Letter::Other.new(token, active_opts)
+    when :mark;                   node << UP::Mark::Any.new(token, active_opts)
+    when :combining_mark;         node << UP::Mark::Combining.new(token, active_opts)
+    when :nonspacing_mark;        node << UP::Mark::Nonspacing.new(token, active_opts)
+    when :spacing_mark;           node << UP::Mark::Spacing.new(token, active_opts)
+    when :enclosing_mark;         node << UP::Mark::Enclosing.new(token, active_opts)
+    when :number;                 node << UP::Number::Any.new(token, active_opts)
+    when :decimal_number;         node << UP::Number::Decimal.new(token, active_opts)
+    when :letter_number;          node << UP::Number::Letter.new(token, active_opts)
+    when :other_number;           node << UP::Number::Other.new(token, active_opts)
+    when :punctuation;            node << UP::Punctuation::Any.new(token, active_opts)
+    when :connector_punctuation;  node << UP::Punctuation::Connector.new(token, active_opts)
+    when :dash_punctuation;       node << UP::Punctuation::Dash.new(token, active_opts)
+    when :open_punctuation;       node << UP::Punctuation::Open.new(token, active_opts)
+    when :close_punctuation;      node << UP::Punctuation::Close.new(token, active_opts)
+    when :initial_punctuation;    node << UP::Punctuation::Initial.new(token, active_opts)
+    when :final_punctuation;      node << UP::Punctuation::Final.new(token, active_opts)
+    when :other_punctuation;      node << UP::Punctuation::Other.new(token, active_opts)
+    when :separator;              node << UP::Separator::Any.new(token, active_opts)
+    when :space_separator;        node << UP::Separator::Space.new(token, active_opts)
+    when :line_separator;         node << UP::Separator::Line.new(token, active_opts)
+    when :paragraph_separator;    node << UP::Separator::Paragraph.new(token, active_opts)
+    when :symbol;                 node << UP::Symbol::Any.new(token, active_opts)
+    when :math_symbol;            node << UP::Symbol::Math.new(token, active_opts)
+    when :currency_symbol;        node << UP::Symbol::Currency.new(token, active_opts)
+    when :modifier_symbol;        node << UP::Symbol::Modifier.new(token, active_opts)
+    when :other_symbol;           node << UP::Symbol::Other.new(token, active_opts)
+    when :other;                  node << UP::Codepoint::Any.new(token, active_opts)
+    when :control;                node << UP::Codepoint::Control.new(token, active_opts)
+    when :format;                 node << UP::Codepoint::Format.new(token, active_opts)
+    when :surrogate;              node << UP::Codepoint::Surrogate.new(token, active_opts)
+    when :private_use;            node << UP::Codepoint::PrivateUse.new(token, active_opts)
+    when :unassigned;             node << UP::Codepoint::Unassigned.new(token, active_opts)
+    when *UPTokens::Age;          node << UP::Age.new(token, active_opts)
+    when *UPTokens::Derived;      node << UP::Derived.new(token, active_opts)
+    when *UPTokens::Emoji;        node << UP::Emoji.new(token, active_opts)
+    when *UPTokens::Enumerated;   node << UP::Enumerated.new(token, active_opts)
+    when *UPTokens::Script;       node << UP::Script.new(token, active_opts)
+    when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts)
     else
       raise UnknownTokenError.new('UnicodeProperty', token)
@@ -478,8 +477,7 @@ class Regexp::Parser
   end
   def quantifier(token)
-    target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
-    target_node or raise ParserError, "No valid target found for '#{token.text}'"
+    target_node = node.extract_quantifier_target(token.text)
     # in case of chained quantifiers, wrap target in an implicit passive group
     # description of the problem: https://github.com/ammar/regexp_parser/issues/3
@@ -527,6 +525,8 @@ class Regexp::Parser
   end
   def open_set(token)
+    # TODO: this and Quantifier are the only cases where Expression#token
+    # does not match the scanner/lexer output. Fix in v3.0.0.
     token.token = :character
     nest(CharacterSet.new(token, active_opts))
   end
@@ -541,7 +541,7 @@ class Regexp::Parser
   def range(token)
     exp = CharacterSet::Range.new(token, active_opts)
-    scope = node.last.is_a?(CharacterSet::IntersectedSequence) ? node.last : node
+    scope = node.last.instance_of?(CharacterSet::IntersectedSequence) ? node.last : node
     exp << scope.expressions.pop
     nest(exp)
   end
@@ -568,28 +568,29 @@ class Regexp::Parser
   end
   def close_completed_character_set_range
-    decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
+    decrease_nesting if node.instance_of?(CharacterSet::Range) && node.complete?
   end
   def active_opts
     options_stack.last
   end
-  # Assigns referenced expressions to refering expressions, e.g. if there is
+  # Assigns referenced expressions to referring expressions, e.g. if there is
   # an instance of Backreference::Number, its #referenced_expression is set to
   # the instance of Group::Capture that it refers to via its number.
   def assign_referenced_expressions
-    # find all referencable expressions
+    # find all referenceable and referring expressions
     targets = { 0 => root }
+    referrers = []
     root.each_expression do |exp|
       exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
+      referrers << exp if exp.referential?
     end
-    # assign them to any refering expressions
-    root.each_expression do |exp|
-      next unless exp.respond_to?(:reference)
+    # assign referenced expressions to referring expressions
+    # (in a second iteration because there might be forward references)
+    referrers.each do |exp|
       exp.referenced_expression = targets[exp.reference] ||
-        raise(ParserError, "Invalid reference: #{exp.reference}")
+        raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}")
     end
   end
 end # class Regexp::Parser

data/lib/regexp_parser/scanner/errors/premature_end_error.rb ADDED Viewed

@@ -0,0 +1,8 @@
+class Regexp::Scanner
+  # Unexpected end of pattern
+  class PrematureEndError < ScannerError
+    def initialize(where = '')
+      super "Premature end of pattern at #{where}"
+    end
+  end
+end

data/lib/regexp_parser/scanner/errors/scanner_error.rb ADDED Viewed

@@ -0,0 +1,6 @@
+require_relative '../../../regexp_parser/error'
+class Regexp::Scanner
+  # General scanner error (catch all)
+  class ScannerError < Regexp::Parser::Error; end
+end

data/lib/regexp_parser/scanner/errors/validation_error.rb ADDED Viewed

@@ -0,0 +1,63 @@
+class Regexp::Scanner
+  # Base for all scanner validation errors
+  class ValidationError < ScannerError
+    # Centralizes and unifies the handling of validation related errors.
+    def self.for(type, problem, reason = nil)
+      types.fetch(type).new(problem, reason)
+    end
+    def self.types
+      @types ||= {
+        backref:      InvalidBackrefError,
+        group:        InvalidGroupError,
+        group_option: InvalidGroupOption,
+        posix_class:  UnknownPosixClassError,
+        property:     UnknownUnicodePropertyError,
+        sequence:     InvalidSequenceError,
+      }
+    end
+  end
+  # Invalid sequence format. Used for escape sequences, mainly.
+  class InvalidSequenceError < ValidationError
+    def initialize(what = 'sequence', where = '')
+      super "Invalid #{what} at #{where}"
+    end
+  end
+  # Invalid group. Used for named groups.
+  class InvalidGroupError < ValidationError
+    def initialize(what, reason)
+      super "Invalid #{what}, #{reason}."
+    end
+  end
+  # Invalid group option. Used for inline options.
+  # TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
+  class InvalidGroupOption < ValidationError
+    def initialize(option, text)
+      super "Invalid group option #{option} in #{text}"
+    end
+  end
+  # Invalid back reference. Used for name and number refs/calls.
+  class InvalidBackrefError < ValidationError
+    def initialize(what, reason)
+      super "Invalid back reference #{what}, #{reason}"
+    end
+  end
+  # The property name was not recognized by the scanner.
+  class UnknownUnicodePropertyError < ValidationError
+    def initialize(name, _)
+      super "Unknown unicode character property name #{name}"
+    end
+  end
+  # The POSIX class name was not recognized by the scanner.
+  class UnknownPosixClassError < ValidationError
+    def initialize(text, _)
+      super "Unknown POSIX class #{text}"
+    end
+  end
+end

data/lib/regexp_parser/scanner/properties/long.csv CHANGED Viewed

@@ -7,6 +7,8 @@ age=12.0,age=12.0
 age=12.1,age=12.1
 age=13.0,age=13.0
 age=14.0,age=14.0
+age=15.0,age=15.0
+age=15.1,age=15.1
 age=2.0,age=2.0
 age=2.1,age=2.1
 age=3.0,age=3.0
@@ -97,6 +99,7 @@ emojimodifierbase,emoji_modifier_base
 emojipresentation,emoji_presentation
 enclosingmark,enclosing_mark
 ethiopic,ethiopic
+extendedpictographic,extended_pictographic
 extender,extender
 finalpunctuation,final_punctuation
 format,format
@@ -106,6 +109,19 @@ gothic,gothic
 grantha,grantha
 graph,graph
 graphemebase,grapheme_base
+graphemeclusterbreak=control,grapheme_cluster_break=control
+graphemeclusterbreak=cr,grapheme_cluster_break=cr
+graphemeclusterbreak=extend,grapheme_cluster_break=extend
+graphemeclusterbreak=l,grapheme_cluster_break=l
+graphemeclusterbreak=lf,grapheme_cluster_break=lf
+graphemeclusterbreak=lv,grapheme_cluster_break=lv
+graphemeclusterbreak=lvt,grapheme_cluster_break=lvt
+graphemeclusterbreak=prepend,grapheme_cluster_break=prepend
+graphemeclusterbreak=regionalindicator,grapheme_cluster_break=regional_indicator
+graphemeclusterbreak=spacingmark,grapheme_cluster_break=spacingmark
+graphemeclusterbreak=t,grapheme_cluster_break=t
+graphemeclusterbreak=v,grapheme_cluster_break=v
+graphemeclusterbreak=zwj,grapheme_cluster_break=zwj
 graphemeextend,grapheme_extend
 graphemelink,grapheme_link
 greek,greek
@@ -121,11 +137,14 @@ hebrew,hebrew
 hexdigit,hex_digit
 hiragana,hiragana
 hyphen,hyphen
+idcompatmathcontinue,id_compat_math_continue
+idcompatmathstart,id_compat_math_start
 idcontinue,id_continue
 ideographic,ideographic
 idsbinaryoperator,ids_binary_operator
 idstart,id_start
 idstrinaryoperator,ids_trinary_operator
+idsunaryoperator,ids_unary_operator
 imperialaramaic,imperial_aramaic
 inadlam,in_adlam
 inaegeannumbers,in_aegean_numbers
@@ -139,6 +158,7 @@ inancientsymbols,in_ancient_symbols
 inarabic,in_arabic
 inarabicextendeda,in_arabic_extended_a
 inarabicextendedb,in_arabic_extended_b
+inarabicextendedc,in_arabic_extended_c
 inarabicmathematicalalphabeticsymbols,in_arabic_mathematical_alphabetic_symbols
 inarabicpresentationformsa,in_arabic_presentation_forms_a
 inarabicpresentationformsb,in_arabic_presentation_forms_b
@@ -186,6 +206,8 @@ incjkunifiedideographsextensiond,in_cjk_unified_ideographs_extension_d
 incjkunifiedideographsextensione,in_cjk_unified_ideographs_extension_e
 incjkunifiedideographsextensionf,in_cjk_unified_ideographs_extension_f
 incjkunifiedideographsextensiong,in_cjk_unified_ideographs_extension_g
+incjkunifiedideographsextensionh,in_cjk_unified_ideographs_extension_h
+incjkunifiedideographsextensioni,in_cjk_unified_ideographs_extension_i
 incombiningdiacriticalmarks,in_combining_diacritical_marks
 incombiningdiacriticalmarksextended,in_combining_diacritical_marks_extended
 incombiningdiacriticalmarksforsymbols,in_combining_diacritical_marks_for_symbols
@@ -205,10 +227,12 @@ incyrillic,in_cyrillic
 incyrillicextendeda,in_cyrillic_extended_a
 incyrillicextendedb,in_cyrillic_extended_b
 incyrillicextendedc,in_cyrillic_extended_c
+incyrillicextendedd,in_cyrillic_extended_d
 incyrillicsupplement,in_cyrillic_supplement
 indeseret,in_deseret
 indevanagari,in_devanagari
 indevanagariextended,in_devanagari_extended
+indevanagariextendeda,in_devanagari_extended_a
 indingbats,in_dingbats
 indivesakuru,in_dives_akuru
 indogra,in_dogra
@@ -268,6 +292,7 @@ inipaextensions,in_ipa_extensions
 initialpunctuation,initial_punctuation
 injavanese,in_javanese
 inkaithi,in_kaithi
+inkaktoviknumerals,in_kaktovik_numerals
 inkanaextendeda,in_kana_extended_a
 inkanaextendedb,in_kana_extended_b
 inkanasupplement,in_kana_supplement
@@ -276,6 +301,7 @@ inkangxiradicals,in_kangxi_radicals
 inkannada,in_kannada
 inkatakana,in_katakana
 inkatakanaphoneticextensions,in_katakana_phonetic_extensions
+inkawi,in_kawi
 inkayahli,in_kayah_li
 inkharoshthi,in_kharoshthi
 inkhitansmallscript,in_khitan_small_script
@@ -339,6 +365,7 @@ inmyanmar,in_myanmar
 inmyanmarextendeda,in_myanmar_extended_a
 inmyanmarextendedb,in_myanmar_extended_b
 innabataean,in_nabataean
+innagmundari,in_nag_mundari
 innandinagari,in_nandinagari
 innewa,in_newa
 innewtailue,in_new_tai_lue
@@ -457,6 +484,7 @@ joincontrol,join_control
 kaithi,kaithi
 kannada,kannada
 katakana,katakana
+kawi,kawi
 kayahli,kayah_li
 kharoshthi,kharoshthi
 khitansmallscript,khitan_small_script
@@ -503,6 +531,7 @@ mro,mro
 multani,multani
 myanmar,myanmar
 nabataean,nabataean
+nagmundari,nag_mundari
 nandinagari,nandinagari
 newa,newa
 newline,newline

data/lib/regexp_parser/scanner/properties/short.csv CHANGED Viewed

@@ -57,6 +57,7 @@ emod,emoji_modifier
 epres,emoji_presentation
 ethi,ethiopic
 ext,extender
+extpict,extended_pictographic
 geor,georgian
 glag,glagolitic
 gong,gunjala_gondi
@@ -85,6 +86,7 @@ ideo,ideographic
 ids,id_start
 idsb,ids_binary_operator
 idst,ids_trinary_operator
+idsu,ids_unary_operator
 ital,old_italic
 java,javanese
 joinc,join_control
@@ -133,6 +135,7 @@ mtei,meetei_mayek
 mult,multani
 mymr,myanmar
 n,number
+nagm,nag_mundari
 nand,nandinagari
 narb,old_north_arabian
 nbat,nabataean

data/lib/regexp_parser/scanner/property.rl CHANGED Viewed

@@ -17,10 +17,10 @@
       text = copy(data, ts-1, te)
       type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
-      name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
+      name = text[3..-2].gsub(/[\^\s_\-]/, '').downcase
       token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
-      validation_error(:property, name) unless token
+      raise ValidationError.for(:property, name) unless token
       self.emit(type, token.to_sym, text)