RubyGems - regexp_parser - Versions diffs - 2.2.1 → 2.4.0 - Mend

regexp_parser 2.2.1 → 2.4.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Files changed (48) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +70 -6
data/Gemfile +2 -1
data/README.md +23 -9
data/Rakefile +1 -56
data/lib/regexp_parser/error.rb +1 -1
data/lib/regexp_parser/expression/base.rb +9 -57
data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -2
data/lib/regexp_parser/expression/classes/character_set.rb +2 -2
data/lib/regexp_parser/expression/classes/conditional.rb +2 -2
data/lib/regexp_parser/expression/classes/free_space.rb +1 -1
data/lib/regexp_parser/expression/classes/group.rb +6 -6
data/lib/regexp_parser/expression/methods/tests.rb +10 -1
data/lib/regexp_parser/expression/quantifier.rb +40 -23
data/lib/regexp_parser/expression/sequence.rb +2 -2
data/lib/regexp_parser/expression/sequence_operation.rb +2 -2
data/lib/regexp_parser/expression/shared.rb +81 -0
data/lib/regexp_parser/expression/subexpression.rb +11 -7
data/lib/regexp_parser/expression.rb +1 -0
data/lib/regexp_parser/lexer.rb +1 -1
data/lib/regexp_parser/parser.rb +12 -60
data/lib/regexp_parser/scanner/properties/long.csv +18 -0
data/lib/regexp_parser/scanner/properties/short.csv +4 -0
data/lib/regexp_parser/scanner/property.rl +1 -1
data/lib/regexp_parser/scanner/scanner.rl +42 -31
data/lib/regexp_parser/scanner.rb +729 -797
data/lib/regexp_parser/syntax/any.rb +2 -5
data/lib/regexp_parser/syntax/base.rb +91 -64
data/lib/regexp_parser/syntax/token/quantifier.rb +4 -4
data/lib/regexp_parser/syntax/token/unicode_property.rb +26 -5
data/lib/regexp_parser/syntax/version_lookup.rb +20 -29
data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
data/lib/regexp_parser/syntax/versions/3.1.0.rb +3 -9
data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
data/lib/regexp_parser/syntax/versions.rb +1 -1
data/lib/regexp_parser/version.rb +1 -1
metadata +4 -2

data/lib/regexp_parser/expression/shared.rb ADDED Viewed

@@ -0,0 +1,81 @@
+module Regexp::Expression
+  module Shared
+    def self.included(mod)
+      mod.class_eval do
+        attr_accessor :type, :token, :text, :ts, :te,
+                      :level, :set_level, :conditional_level,
+                      :options, :quantifier
+        attr_reader   :nesting_level
+      end
+    end
+    def init_from_token_and_options(token, options = {})
+      self.type              = token.type
+      self.token             = token.token
+      self.text              = token.text
+      self.ts                = token.ts
+      self.te                = token.te
+      self.level             = token.level
+      self.set_level         = token.set_level
+      self.conditional_level = token.conditional_level
+      self.nesting_level     = 0
+      self.options           = options || {}
+    end
+    private :init_from_token_and_options
+    def initialize_copy(orig)
+      self.text       = orig.text.dup         if orig.text
+      self.options    = orig.options.dup      if orig.options
+      self.quantifier = orig.quantifier.clone if orig.quantifier
+      super
+    end
+    def starts_at
+      ts
+    end
+    def base_length
+      to_s(:base).length
+    end
+    def full_length
+      to_s.length
+    end
+    def to_s(format = :full)
+      "#{parts.join}#{quantifier_affix(format)}"
+    end
+    alias :to_str :to_s
+    def parts
+      [text.dup]
+    end
+    def quantifier_affix(expression_format)
+      quantifier.to_s if quantified? && expression_format != :base
+    end
+    def quantified?
+      !quantifier.nil?
+    end
+    def offset
+      [starts_at, full_length]
+    end
+    def coded_offset
+      '@%d+%d' % offset
+    end
+    def terminal?
+      !respond_to?(:expressions)
+    end
+    def nesting_level=(lvl)
+      @nesting_level = lvl
+      quantifier && quantifier.nesting_level = lvl
+      terminal? || each { |subexp| subexp.nesting_level = lvl + 1 }
+    end
+  end
+end

data/lib/regexp_parser/expression/subexpression.rb CHANGED Viewed

@@ -5,9 +5,8 @@ module Regexp::Expression
     attr_accessor :expressions
     def initialize(token, options = {})
-      super
       self.expressions = []
+      super
     end
     # Override base method to clone the expressions as well.
@@ -43,16 +42,21 @@ module Regexp::Expression
       ts + to_s.length
     end
-    def to_s(format = :full)
-      # Note: the format does not get passed down to subexpressions.
-      "#{expressions.join}#{quantifier_affix(format)}"
+    def parts
+      expressions
     end
     def to_h
-      attributes.merge({
+      attributes.merge(
         text:        to_s(:base),
         expressions: expressions.map(&:to_h)
-      })
+      )
+    end
+    private
+    def intersperse(expressions, separator)
+      expressions.flat_map { |exp| [exp, separator] }.slice(0...-1)
     end
   end
 end

data/lib/regexp_parser/expression.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 require 'regexp_parser/error'
+require 'regexp_parser/expression/shared'
 require 'regexp_parser/expression/base'
 require 'regexp_parser/expression/quantifier'
 require 'regexp_parser/expression/subexpression'

data/lib/regexp_parser/lexer.rb CHANGED Viewed

@@ -18,7 +18,7 @@ class Regexp::Lexer
   end
   def lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
-    syntax = Regexp::Syntax.new(syntax)
+    syntax = Regexp::Syntax.for(syntax)
     self.tokens = []
     self.nesting = 0

data/lib/regexp_parser/parser.rb CHANGED Viewed

@@ -39,6 +39,9 @@ class Regexp::Parser
       parse_token(token)
     end
+    # Trigger recursive setting of #nesting_level, which reflects how deep
+    # a node is in the tree. Do this at the end to account for tree rewrites.
+    root.nesting_level = 0
     assign_referenced_expressions
     if block_given?
@@ -286,17 +289,9 @@ class Regexp::Parser
   def nest(exp)
     nesting.push(exp)
     node << exp
-    update_transplanted_subtree(exp, node)
     self.node = exp
   end
-  # subtrees are transplanted to build Alternations, Intersections, Ranges
-  def update_transplanted_subtree(exp, new_parent)
-    exp.nesting_level = new_parent.nesting_level + 1
-    exp.respond_to?(:each) &&
-      exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
-  end
   def escape(token)
     case token.token
@@ -483,7 +478,7 @@ class Regexp::Parser
       new_token = Regexp::Token.new(
         :group,
         :passive,
-        '', # text
+        '', # text (none because this group is implicit)
         target_node.ts,
         nil, # te (unused)
         target_node.level,
@@ -493,66 +488,23 @@ class Regexp::Parser
       new_group = Group::Passive.new(new_token, active_opts)
       new_group.implicit = true
       new_group << target_node
-      increase_level(target_node)
+      increase_group_level(target_node)
       node.expressions[node.expressions.index(target_node)] = new_group
       target_node = new_group
     end
-    case token.token
-    when :zero_or_one
-      target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
-    when :zero_or_one_reluctant
-      target_node.quantify(:zero_or_one, token.text, 0, 1, :reluctant)
-    when :zero_or_one_possessive
-      target_node.quantify(:zero_or_one, token.text, 0, 1, :possessive)
-    when :zero_or_more
-      target_node.quantify(:zero_or_more, token.text, 0, -1, :greedy)
-    when :zero_or_more_reluctant
-      target_node.quantify(:zero_or_more, token.text, 0, -1, :reluctant)
-    when :zero_or_more_possessive
-      target_node.quantify(:zero_or_more, token.text, 0, -1, :possessive)
-    when :one_or_more
-      target_node.quantify(:one_or_more, token.text, 1, -1, :greedy)
-    when :one_or_more_reluctant
-      target_node.quantify(:one_or_more, token.text, 1, -1, :reluctant)
-    when :one_or_more_possessive
-      target_node.quantify(:one_or_more, token.text, 1, -1, :possessive)
-    when :interval
-      interval(target_node, token)
-    else
+    unless token.token =~ /\A(?:zero_or_one|zero_or_more|one_or_more|interval)
+                             (?:_greedy|_reluctant|_possessive)?\z/x
       raise UnknownTokenError.new('Quantifier', token)
     end
+    target_node.quantify(token, active_opts)
   end
-  def increase_level(exp)
+  def increase_group_level(exp)
     exp.level += 1
-    exp.respond_to?(:each) && exp.each { |subexp| increase_level(subexp) }
-  end
-  def interval(target_node, token)
-    text = token.text
-    mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
-    case mchr
-    when '?'
-      range_text = text[0...-1]
-      mode = :reluctant
-    when '+'
-      range_text = text[0...-1]
-      mode = :possessive
-    else
-      range_text = text
-      mode = :greedy
-    end
-    range = range_text.gsub(/\{|\}/, '').split(',', 2)
-    min = range[0].empty? ? 0 : range[0]
-    max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
-    target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
+    exp.quantifier.level += 1 if exp.quantifier
+    exp.terminal? || exp.each { |subexp| increase_group_level(subexp) }
   end
   def set(token)

data/lib/regexp_parser/scanner/properties/long.csv CHANGED Viewed

@@ -6,6 +6,7 @@ age=11.0,age=11.0
 age=12.0,age=12.0
 age=12.1,age=12.1
 age=13.0,age=13.0
+age=14.0,age=14.0
 age=2.0,age=2.0
 age=2.1,age=2.1
 age=3.0,age=3.0
@@ -72,6 +73,7 @@ coptic,coptic
 cuneiform,cuneiform
 currencysymbol,currency_symbol
 cypriot,cypriot
+cyprominoan,cypro_minoan
 cyrillic,cyrillic
 dash,dash
 dashpunctuation,dash_punctuation
@@ -136,6 +138,7 @@ inancientgreeknumbers,in_ancient_greek_numbers
 inancientsymbols,in_ancient_symbols
 inarabic,in_arabic
 inarabicextendeda,in_arabic_extended_a
+inarabicextendedb,in_arabic_extended_b
 inarabicmathematicalalphabeticsymbols,in_arabic_mathematical_alphabetic_symbols
 inarabicpresentationformsa,in_arabic_presentation_forms_a
 inarabicpresentationformsb,in_arabic_presentation_forms_b
@@ -197,6 +200,7 @@ incuneiform,in_cuneiform
 incuneiformnumbersandpunctuation,in_cuneiform_numbers_and_punctuation
 incurrencysymbols,in_currency_symbols
 incypriotsyllabary,in_cypriot_syllabary
+incyprominoan,in_cypro_minoan
 incyrillic,in_cyrillic
 incyrillicextendeda,in_cyrillic_extended_a
 incyrillicextendedb,in_cyrillic_extended_b
@@ -223,6 +227,7 @@ inenclosedideographicsupplement,in_enclosed_ideographic_supplement
 inethiopic,in_ethiopic
 inethiopicextended,in_ethiopic_extended
 inethiopicextendeda,in_ethiopic_extended_a
+inethiopicextendedb,in_ethiopic_extended_b
 inethiopicsupplement,in_ethiopic_supplement
 ingeneralpunctuation,in_general_punctuation
 ingeometricshapes,in_geometric_shapes
@@ -264,6 +269,7 @@ initialpunctuation,initial_punctuation
 injavanese,in_javanese
 inkaithi,in_kaithi
 inkanaextendeda,in_kana_extended_a
+inkanaextendedb,in_kana_extended_b
 inkanasupplement,in_kana_supplement
 inkanbun,in_kanbun
 inkangxiradicals,in_kangxi_radicals
@@ -285,6 +291,8 @@ inlatinextendedb,in_latin_extended_b
 inlatinextendedc,in_latin_extended_c
 inlatinextendedd,in_latin_extended_d
 inlatinextendede,in_latin_extended_e
+inlatinextendedf,in_latin_extended_f
+inlatinextendedg,in_latin_extended_g
 inlepcha,in_lepcha
 inletterlikesymbols,in_letterlike_symbols
 inlimbu,in_limbu
@@ -349,6 +357,7 @@ inoldpersian,in_old_persian
 inoldsogdian,in_old_sogdian
 inoldsoutharabian,in_old_south_arabian
 inoldturkic,in_old_turkic
+inolduyghur,in_old_uyghur
 inopticalcharacterrecognition,in_optical_character_recognition
 inoriya,in_oriya
 inornamentaldingbats,in_ornamental_dingbats
@@ -413,6 +422,7 @@ intaixuanjingsymbols,in_tai_xuan_jing_symbols
 intakri,in_takri
 intamil,in_tamil
 intamilsupplement,in_tamil_supplement
+intangsa,in_tangsa
 intangut,in_tangut
 intangutcomponents,in_tangut_components
 intangutsupplement,in_tangut_supplement
@@ -422,15 +432,18 @@ inthai,in_thai
 intibetan,in_tibetan
 intifinagh,in_tifinagh
 intirhuta,in_tirhuta
+intoto,in_toto
 intransportandmapsymbols,in_transport_and_map_symbols
 inugaritic,in_ugaritic
 inunifiedcanadianaboriginalsyllabics,in_unified_canadian_aboriginal_syllabics
 inunifiedcanadianaboriginalsyllabicsextended,in_unified_canadian_aboriginal_syllabics_extended
+inunifiedcanadianaboriginalsyllabicsextendeda,in_unified_canadian_aboriginal_syllabics_extended_a
 invai,in_vai
 invariationselectors,in_variation_selectors
 invariationselectorssupplement,in_variation_selectors_supplement
 invedicextensions,in_vedic_extensions
 inverticalforms,in_vertical_forms
+invithkuqi,in_vithkuqi
 inwancho,in_wancho
 inwarangciti,in_warang_citi
 inyezidi,in_yezidi
@@ -438,6 +451,7 @@ inyijinghexagramsymbols,in_yijing_hexagram_symbols
 inyiradicals,in_yi_radicals
 inyisyllables,in_yi_syllables
 inzanabazarsquare,in_zanabazar_square
+inznamennymusicalnotation,in_znamenny_musical_notation
 javanese,javanese
 joincontrol,join_control
 kaithi,kaithi
@@ -509,6 +523,7 @@ oldpersian,old_persian
 oldsogdian,old_sogdian
 oldsoutharabian,old_south_arabian
 oldturkic,old_turkic
+olduyghur,old_uyghur
 openpunctuation,open_punctuation
 oriya,oriya
 osage,osage
@@ -573,6 +588,7 @@ taitham,tai_tham
 taiviet,tai_viet
 takri,takri
 tamil,tamil
+tangsa,tangsa
 tangut,tangut
 telugu,telugu
 terminalpunctuation,terminal_punctuation
@@ -582,6 +598,7 @@ tibetan,tibetan
 tifinagh,tifinagh
 tirhuta,tirhuta
 titlecaseletter,titlecase_letter
+toto,toto
 ugaritic,ugaritic
 unassigned,unassigned
 unifiedideograph,unified_ideograph
@@ -591,6 +608,7 @@ uppercase,uppercase
 uppercaseletter,uppercase_letter
 vai,vai
 variationselector,variation_selector
+vithkuqi,vithkuqi
 wancho,wancho
 warangciti,warang_citi
 whitespace,white_space

data/lib/regexp_parser/scanner/properties/short.csv CHANGED Viewed

@@ -31,6 +31,7 @@ cn,unassigned
 co,private_use
 combiningmark,mark
 copt,coptic
+cpmn,cypro_minoan
 cprt,cypriot
 cs,surrogate
 cwcf,changes_when_casefolded
@@ -154,6 +155,7 @@ orkh,old_turkic
 orya,oriya
 osge,osage
 osma,osmanya
+ougr,old_uyghur
 oupper,other_uppercase
 p,punctuation
 palm,palmyrene
@@ -219,9 +221,11 @@ tglg,tagalog
 thaa,thaana
 tibt,tibetan
 tirh,tirhuta
+tnsa,tangsa
 ugar,ugaritic
 uideo,unified_ideograph
 vaii,vai
+vith,vithkuqi
 vs,variation_selector
 wara,warang_citi
 wcho,wancho

data/lib/regexp_parser/scanner/property.rl CHANGED Viewed

@@ -20,7 +20,7 @@
       name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
       token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
-      raise UnknownUnicodePropertyError.new(name) unless token
+      validation_error(:property, name) unless token
       self.emit(type, token.to_sym, text)

data/lib/regexp_parser/scanner/scanner.rl CHANGED Viewed

@@ -28,13 +28,7 @@
   comment               = ('#' . [^\n]* . '\n'?);
-  class_name_posix      = 'alnum' | 'alpha' | 'blank' |
-                          'cntrl' | 'digit' | 'graph' |
-                          'lower' | 'print' | 'punct' |
-                          'space' | 'upper' | 'xdigit' |
-                          'word'  | 'ascii';
-  class_posix           = ('[:' . '^'? . class_name_posix . ':]');
+  class_posix           = ('[:' . '^'? . [^\[\]]* . ':]');
   # these are not supported in ruby at the moment
@@ -74,8 +68,7 @@
   quantity_maximum      = ',' . (digit+);
   quantity_range        = (digit+) . ',' . (digit+);
   quantifier_interval   = range_open . ( quantity_exact | quantity_minimum |
-                          quantity_maximum | quantity_range ) . range_close .
-                          quantifier_mode?;
+                          quantity_maximum | quantity_range ) . range_close;
   quantifiers           = quantifier_greedy | quantifier_reluctant |
                           quantifier_possessive | quantifier_interval;
@@ -223,24 +216,28 @@
       fcall character_set;
     };
-    class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error)  {
+    class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
       text = copy(data, ts, te)
       type = :posixclass
       class_name = text[2..-3]
-      if class_name[0].chr == '^'
+      if class_name[0] == '^'
         class_name = class_name[1..-1]
         type = :nonposixclass
       end
+      unless self.class.posix_classes.include?(class_name)
+        validation_error(:posix_class, text)
+      end
       emit(type, class_name.to_sym, text)
     };
     # These are not supported in ruby at the moment. Enable them if they are.
-    # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error)  {
+    # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
     #   emit(:set, :collation, copy(data, ts, te))
     # };
-    # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error)  {
+    # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
     #   emit(:set, :equivalent, copy(data, ts, te))
     # };
@@ -323,7 +320,7 @@
     codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
       text = copy(data, ts-1, te)
-      if text[2].chr == '{'
+      if text[2] == '{'
         emit(:escape, :codepoint_list, text)
       else
         emit(:escape, :codepoint,      text)
@@ -419,12 +416,12 @@
     backslash . anchor_char > (backslashed, 3) {
       case text = copy(data, ts, te)
-      when '\\A'; emit(:anchor, :bos,                text)
-      when '\\z'; emit(:anchor, :eos,                text)
-      when '\\Z'; emit(:anchor, :eos_ob_eol,         text)
-      when '\\b'; emit(:anchor, :word_boundary,      text)
-      when '\\B'; emit(:anchor, :nonword_boundary,   text)
-      when '\\G'; emit(:anchor, :match_start,        text)
+      when '\A';  emit(:anchor, :bos,                text)
+      when '\z';  emit(:anchor, :eos,                text)
+      when '\Z';  emit(:anchor, :eos_ob_eol,         text)
+      when '\b';  emit(:anchor, :word_boundary,      text)
+      when '\B';  emit(:anchor, :nonword_boundary,   text)
+      when '\G';  emit(:anchor, :match_start,        text)
       end
     };
@@ -477,7 +474,7 @@
     group_open . group_options >group_opened {
       text = copy(data, ts, te)
       if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
-        raise InvalidGroupOption.new($1 || "-#{$2}", text)
+        validation_error(:group_option, $1 || "-#{$2}", text)
       end
       emit_options(text)
     };
@@ -605,7 +602,7 @@
       end
     };
-    quantifier_interval  {
+    quantifier_interval {
       emit(:quantifier, :interval, copy(data, ts, te))
     };
@@ -686,6 +683,7 @@ class Regexp::Scanner
   end
   # Invalid groupOption. Used for inline options.
+  # TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
   class InvalidGroupOption < ValidationError
     def initialize(option, text)
       super "Invalid group option #{option} in #{text}"
@@ -706,6 +704,13 @@ class Regexp::Scanner
     end
   end
+  # The POSIX class name was not recognized by the scanner.
+  class UnknownPosixClassError < ValidationError
+    def initialize(text)
+      super "Unknown POSIX class #{text}"
+    end
+  end
   # Scans the given regular expression text, or Regexp object and collects the
   # emitted token into an array that gets returned at the end. If a block is
   # given, it gets called for each emitted token.
@@ -771,6 +776,11 @@ class Regexp::Scanner
     File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
   end
+  def self.posix_classes
+    %w[alnum alpha ascii blank cntrl digit graph
+       lower print punct space upper word xdigit]
+  end
   # Emits an array with the details of the scanned pattern
   def emit(type, token, text)
     #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
@@ -873,15 +883,16 @@ class Regexp::Scanner
   # Centralizes and unifies the handling of validation related
   # errors.
-  def validation_error(type, what, reason)
-    case type
-    when :group
-      error = InvalidGroupError.new(what, reason)
-    when :backref
-      error = InvalidBackrefError.new(what, reason)
-    when :sequence
-      error = InvalidSequenceError.new(what, reason)
-    end
+  def validation_error(type, what, reason = nil)
+    error =
+      case type
+      when :backref      then InvalidBackrefError.new(what, reason)
+      when :group        then InvalidGroupError.new(what, reason)
+      when :group_option then InvalidGroupOption.new(what, reason)
+      when :posix_class  then UnknownPosixClassError.new(what)
+      when :property     then UnknownUnicodePropertyError.new(what)
+      when :sequence     then InvalidSequenceError.new(what, reason)
+      end
     raise error # unless @@config.validation_ignore
   end