RubyGems - srl_ruby - Versions diffs - 0.0.1 - Mend

srl_ruby 0.0.1

Files changed (44) hide show

checksums.yaml +7 -0
data/.rspec +4 -0
data/.rubocop.yml +3 -0
data/.yardopts +6 -0
data/Gemfile +6 -0
data/LICENSE.txt +21 -0
data/README.md +66 -0
data/Rakefile +16 -0
data/bin/srl_ruby +58 -0
data/lib/regex/abstract_method.rb +35 -0
data/lib/regex/alternation.rb +27 -0
data/lib/regex/anchor.rb +45 -0
data/lib/regex/atomic_expression.rb +16 -0
data/lib/regex/capturing_group.rb +51 -0
data/lib/regex/char_class.rb +38 -0
data/lib/regex/char_range.rb +51 -0
data/lib/regex/char_shorthand.rb +50 -0
data/lib/regex/character.rb +204 -0
data/lib/regex/compound_expression.rb +57 -0
data/lib/regex/concatenation.rb +29 -0
data/lib/regex/expression.rb +60 -0
data/lib/regex/lookaround.rb +50 -0
data/lib/regex/match_option.rb +34 -0
data/lib/regex/monadic_expression.rb +28 -0
data/lib/regex/multiplicity.rb +91 -0
data/lib/regex/non_capturing_group.rb +27 -0
data/lib/regex/polyadic_expression.rb +60 -0
data/lib/regex/quantifiable.rb +22 -0
data/lib/regex/repetition.rb +29 -0
data/lib/regex/wildcard.rb +23 -0
data/lib/srl_ruby/ast_builder.rb +384 -0
data/lib/srl_ruby/grammar.rb +106 -0
data/lib/srl_ruby/regex_repr.rb +13 -0
data/lib/srl_ruby/tokenizer.rb +147 -0
data/lib/srl_ruby/version.rb +3 -0
data/lib/srl_ruby.rb +4 -0
data/spec/integration_spec.rb +451 -0
data/spec/regex/character_spec.rb +166 -0
data/spec/regex/multiplicity_spec.rb +79 -0
data/spec/spec_helper.rb +16 -0
data/spec/srl_ruby/srl_ruby_spec.rb +7 -0
data/spec/srl_ruby/tokenizer_spec.rb +147 -0
data/srl_ruby.gemspec +58 -0
metadata +150 -0

data/lib/regex/character.rb ADDED Viewed

@@ -0,0 +1,204 @@
+# File: character.rb
+require_relative 'atomic_expression' # Access the superclass
module Regex # This module is used as a namespace
  # A regular expression that matches one specific character.
  # The character is identified by its codepoint; when it was built from a
  # String, that original text (the lexeme) is remembered as well.
  class Character < AtomicExpression
    # Two-character escape sequences (digrams) that have a special meaning,
    # mapped to the codepoint they stand for.
    DigramSequences = {
      "\\a" => 0x7, # alarm
      "\\n" => 0xA, # newline
      "\\r" => 0xD, # carriage return
      "\\t" => 0x9, # tab
      "\\e" => 0x1B, # escape
      "\\f" => 0xC, # form feed
      "\\v" => 0xB, # vertical feed
      # Single octal digit literals
      "\\0" => 0,
      "\\1" => 1,
      "\\2" => 2,
      "\\3" => 3,
      "\\4" => 4,
      "\\5" => 5,
      "\\6" => 6,
      "\\7" => 7
    }.freeze

    # A list of regex meta-characters (not referenced inside this class).
    MetaChars = '\^$+?.'.freeze

    # The integer value that uniquely identifies the character.
    attr_reader(:codepoint)

    # The initial text representation of the character (nil when the
    # character was created from an Integer).
    attr_reader(:lexeme)

    # Constructor.
    # [aValue] Either a String (a single character or an escape sequence)
    # or an Integer codepoint.
    # Examples:
    # Initializing with codepoint value...
    # Regex::Character.new(0x3a3) # Represents: Σ
    # (Unicode GREEK CAPITAL LETTER SIGMA)
    # Regex::Character.new(931)   # Also represents: Σ (931 dec == 3a3 hex)
    #
    # Initializing with a single character string
    # Regex::Character.new(?\u03a3) # Also represents: Σ
    # Regex::Character.new('Σ')   # Obviously, represents a Σ
    #
    # Initializing with an escape sequence string
    # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
    # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B),
    # \f (form feed, 0xC), \v (vertical feed, 0xB)
    # \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal)
    # \xXX (hex)
    # Any other escaped character will be treated as a literal character
    # Regex::Character.new('\n')    # Represents a newline
    # Regex::Character.new('\u03a3')  # Represents a Σ
    # Raises a StandardError for any other kind of argument.
    def initialize(aValue)
      case aValue
      when String
        # A one-character String is taken literally, anything else must be
        # an escape sequence.
        @codepoint = if aValue.size == 1
                       self.class.char2codepoint(aValue)
                     else
                       self.class.esc2codepoint(aValue)
                     end
        @lexeme = aValue
      when Integer
        @codepoint = aValue
      else
        raise StandardError, "Cannot initialize a Character with a '#{aValue}'."
      end
    end

    # Conversion method that returns a character given a codepoint (integer)
    # value.
    # Example:
    # Regex::Character.codepoint2char(0x3a3) # Returns: Σ
    # (the Unicode GREEK CAPITAL LETTER SIGMA)
    def self.codepoint2char(aCodepoint)
      [aCodepoint].pack('U') # Remark: chr() fails with codepoints > 256
    end

    # Conversion method that returns the codepoint for the given single
    # character.
    # Example:
    # Regex::Character.char2codepoint('Σ') # Returns: 0x3a3
    def self.char2codepoint(aChar)
      aChar.ord
    end

    # Conversion method that returns the codepoint for the given escape
    # sequence (a String).
    # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
    # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed,
    # 0xC), \v (vertical feed, 0xB)
    # \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal)
    # \xXX (hex)
    # Any other escaped character will be treated as a literal character
    # Raises a StandardError when the text does not start with a backslash
    # or is not a supported numeric escape.
    # Example:
    # Regex::Character.esc2codepoint('\n') # Returns: 0xa
    def self.esc2codepoint(anEscapeSequence)
      unless anEscapeSequence[0] == '\\'
        # The backslash is doubled so that it really shows up in the message.
        msg = "Escape sequence #{anEscapeSequence} does not begin with a backslash (\\)."
        raise StandardError, msg
      end

      # Two characters: backslash + one character; longer: numeric escape.
      return digram2codepoint(anEscapeSequence) if anEscapeSequence.length == 2

      esc_number2codepoint(anEscapeSequence)
    end

    # Return the character as a String object
    def char
      self.class.codepoint2char(@codepoint)
    end

    # Returns true iff this Character and parameter 'other' represent the
    # same character.
    # [other] any Object. The way the equality is tested depends on its class
    # Example:
    # newOne = Character.new(?\u03a3)
    # newOne == newOne  # true. Identity
    # newOne == Character.new(?\u03a3)  # true. Both have same codepoint
    # newOne == ?\u03a3 # true. The single character String matches the char.
    # newOne == 0x03a3  # true. The Integer is compared to the codepoint value.
    # Any other Object is converted with to_s and compared as a String.
    def ==(other)
      case other
      when Character
        # Compare codepoints rather than lexemes: '\n' and 0xA denote the
        # same character although their text representations differ.
        codepoint == other.codepoint
      when Integer
        codepoint == other
      when String
        other.size == 1 && char == other
      else
        # Unknown type: try with a conversion
        self == other.to_s # Recursive call
      end
    end

    # Return a plain English description of the character
    def explain
      "the character '#{to_str}'"
    end

    protected

    # Conversion method re-definition.
    # Purpose: Return the String representation of the expression.
    # If the Character was initially built from a text (the lexeme), then a
    # copy of the lexeme is returned.
    # Otherwise the character corresponding to the codepoint is returned.
    def text_repr
      return char if lexeme.nil?

      lexeme.dup
    end

    # Conversion method that returns a codepoint for the given two characters
    # (digram) escape sequence.
    # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
    # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B),
    # \f (form feed, 0xC), \v (vertical feed, 0xB) and \0 .. \7
    # Any other escape sequence will return the codepoint of the escaped
    # character (the backslash is then a 'dummy').
    # [aDigram] A sequence of two characters that starts with a backslash.
    def self.digram2codepoint(aDigram)
      DigramSequences.fetch(aDigram) { char2codepoint(aDigram[-1]) }
    end
    private_class_method :digram2codepoint

    # Conversion method that returns a codepoint for the given complex
    # escape sequence.
    # [anEscapeSequence] A String with the format:
    # \uXXXX where XXXX is a 4 hex digits integer value,
    # \u{X...} X 1 or more hex digits
    # \ooo (1..3 octal digits literal)
    # \xXX (1..2 hex digits literal)
    # Raises a StandardError when the text has none of these formats.
    def self.esc_number2codepoint(anEscapeSequence)
      # \A and \z anchor the whole text; ^ and $ would only anchor one line.
      pattern = /\A\\(?:(?:[uxX]\{?(?<hexa>\h+)\}?)|(?<octal>[0-7]{1,3}))\z/
      matching = pattern.match(anEscapeSequence)
      raise StandardError, "Unsupported escape sequence #{anEscapeSequence}." unless matching

      # Octal literal case?
      return matching[:octal].oct if matching[:octal]

      matching[:hexa].hex
    end
    private_class_method :esc_number2codepoint
  end # class
end # module
+# End of file

data/lib/regex/compound_expression.rb ADDED Viewed

@@ -0,0 +1,57 @@
+# File: compound_expression.rb
+require_relative 'expression' # Access the superclass
module Regex # Namespace of the regular expression classes
  # Abstract class. A part of a regular expression that owns child
  # sub-expressions.
  class CompoundExpression < Expression
    # Redefined method.
    # Always false, because a compound expression may have one or more children.
    def atomic?
      false
    end

    protected

    # Abstract method. Subclasses return the text representation of their
    # child(ren), if any.
    def all_child_text
      abstract_method
    end
  end # class
end # module
+# End of file

data/lib/regex/concatenation.rb ADDED Viewed

@@ -0,0 +1,29 @@
+# File: concatenation.rb
+require_relative 'polyadic_expression' # Access the superclass
module Regex # Namespace of the regular expression classes
  # An n-ary matching operator.
  # It succeeds when every child succeeds to match the subject text, in the
  # same serial arrangement as the children of this concatenation.
  class Concatenation < PolyadicExpression
    # Constructor.
    # [theChildren] the sub-expressions to concatenate, given as individual
    # arguments.
    def initialize(*theChildren)
      super(theChildren)
    end

    protected

    # Conversion method re-definition.
    # Purpose: Return the String representation of the concatenated
    # expressions, i.e. the texts of all children put one after the other.
    def text_repr
      children.each_with_object(''.dup) do |sub_expression, text|
        text << sub_expression.to_str
      end
    end
  end # class
end # module
+# End of file

data/lib/regex/expression.rb ADDED Viewed

@@ -0,0 +1,60 @@
+# File: expression.rb
+require_relative 'abstract_method'
module Regex # This module is used as a namespace
  # Abstract class. The generalization of any valid regular (sub)expression.
  class Expression
    # Optional anchor rendered in front of the expression text
    # (any object that answers to_str).
    attr_accessor :begin_anchor

    # Optional anchor rendered after the expression text
    # (any object that answers to_str).
    attr_accessor :end_anchor

    # Constructor
    def initialize
      @local_options = {}
    end

    # Abstract method. Return true iff the expression is atomic
    # (= may not have any child).
    def atomic?
      abstract_method
    end

    # Abstract method. Return the number of values that match this expression.
    # [_parent_options] an Hash of matching options. They are overridden
    #   by options with same name that are bound to this object.
    def cardinality(_parent_options)
      abstract_method
    end

    # Determine the matching options to apply to this object, given the options
    # coming from the parent
    # and options that are local to this object. Local options take precedence.
    # @param theParentOptions [Hash] matching options. They are overridden
    # by options with same name that are bound to this object.
    # @return [Hash] a new Hash; the argument is left untouched.
    def options(theParentOptions)
      # Subclasses may skip this constructor, leaving @local_options unset:
      # an absent value simply means "no local option".
      theParentOptions.merge(@local_options || {})
    end

    # Template method.
    # Purpose: Return the String representation of the expression, that is
    # the text provided by the subclass (text_repr) surrounded by the
    # begin/end anchors, if any.
    def to_str
      "#{prefix}#{text_repr}#{suffix}"
    end

    protected

    # Text of the begin anchor, or an empty String when there is none.
    def prefix
      begin_anchor ? begin_anchor.to_str : ''
    end

    # Text of the end anchor, or an empty String when there is none.
    def suffix
      end_anchor ? end_anchor.to_str : ''
    end
  end # class
end # module
+# End of file

data/lib/regex/lookaround.rb ADDED Viewed

@@ -0,0 +1,50 @@
+# File: lookaround.rb
+########################
+# TODO: make it a binary expression
+########################
+require_relative 'polyadic_expression' # Access the superclass
module Regex # This module is used as a namespace
  # Lookaround is a zero-width assertion just like the start and end of line
  # anchors.
  # The difference is that lookarounds will actually match characters, but only
  # return the result of the match: match or no match.
  # That is why they are called "assertions". They do not consume characters
  # from the subject, but only assert whether a match is possible or not.
  class Lookaround < PolyadicExpression
    # The "direction" of the lookaround. Can be ahead or behind. It specifies
    # the relative position of the expression to match compared to
    # the current 'position' in the subject text.
    attr_reader(:dir)

    # The kind indicates whether the assertion is positive
    # (succeeds when there is a match) or negative
    # (assertion succeeds when there is NO match).
    attr_reader(:kind)

    # Constructor.
    # [assertedExpression]  A sub-expression to match.
    # [theDir]  One of the following values: [ :ahead, :behind ]
    # [theKind] One of the following values: [ :positive, :negative ]
    def initialize(assertedExpression, theDir, theKind)
      super([assertedExpression])
      @dir = theDir
      @kind = theKind
    end

    # Conversion method re-definition.
    # Purpose: Return the String representation of the lookaround.
    # With a single child, that child is the asserted expression: (?=child).
    # With two children, the text of the first child comes first and the
    # second child is the asserted expression: first(?=second).
    def to_str
      # The constructor stores one child only; a second one can be added
      # afterwards through the inherited children list.
      if children.size > 1
        subject = children[0]
        asserted = children[1]
      else
        subject = nil
        asserted = children[0]
      end
      leading_text = subject ? subject.to_str : ''
      dir_syntax = dir == :ahead ? '' : '<'
      kind_syntax = kind == :positive ? '=' : '!'
      "#{leading_text}(?#{dir_syntax}#{kind_syntax}#{asserted.to_str})"
    end
  end # class
end # module
+# End of file

data/lib/regex/match_option.rb ADDED Viewed

@@ -0,0 +1,34 @@
+# File: match_option.rb
module Regex # Namespace of the regular expression classes
  # An option that influences the way a regular (sub)expression performs
  # its matching.
  class MatchOption
    # The symbolic name of the option
    attr_reader(:name)

    # An indicator that tells whether the option is turned on or off
    attr_reader(:setting)

    # Constructor.
    # [theName] the symbolic name of the option
    # [theSetting] the on/off indicator
    def initialize(theName, theSetting)
      @name, @setting = theName, theSetting
    end

    # Equality operator.
    # Two options are equal when they are the very same object, or when both
    # are MatchOption objects with equal name and equal setting.
    def ==(other)
      return true if equal?(other)
      return false unless other.kind_of?(MatchOption)

      (name == other.name) && (setting == other.setting)
    end
  end # class
end # module
+# End of file

data/lib/regex/monadic_expression.rb ADDED Viewed

@@ -0,0 +1,28 @@
+# File: monadic_expression.rb
+require_relative 'compound_expression' # Access the superclass
module Regex # Namespace of the regular expression classes
  # Abstract class. A part of a regular expression that can have at most
  # one child sub-expression.
  class MonadicExpression < CompoundExpression
    # The (optional) child sub-expression
    attr_reader(:child)

    # Constructor.
    # [theChild] the child sub-expression; may be nil.
    def initialize(theChild)
      super()
      @child = theChild
    end

    protected

    # Return the text representation of the child, or an empty String when
    # there is no child.
    def all_child_text
      return '' if child.nil?

      child.to_str
    end
  end # class
end # module
+# End of file

data/lib/regex/multiplicity.rb ADDED Viewed

@@ -0,0 +1,91 @@
+# File: multiplicity.rb
module SRL
  module Regex # This module is used as a namespace
    # The multiplicity specifies by how much a given expression can be repeated.
    class Multiplicity
      # The lowest acceptable repetition count
      attr_reader(:lower_bound)

      # The highest possible repetition count (an Integer or :more)
      attr_reader(:upper_bound)

      # An indicator that specifies how to repeat (:greedy, :lazy, :possessive)
      attr_reader(:policy)

      # @param aLowerBound [Integer] non-negative repetition count
      # @param anUpperBound [Integer, Symbol] non-negative integer or :more symbol
      # @param aPolicy [Symbol] One of: (:greedy, :lazy, :possessive)
      # @raise [StandardError] when one of the arguments is invalid
      def initialize(aLowerBound, anUpperBound, aPolicy)
        @lower_bound = valid_lower_bound(aLowerBound)
        @upper_bound = valid_upper_bound(anUpperBound)
        @policy = valid_policy(aPolicy)
      end

      # Purpose: Return the String representation of the multiplicity,
      # i.e. the quantifier followed by the policy marker.
      # Examples: '*', '+?', '{2,5}', '{3}+'
      def to_str
        count_text + policy_text
      end

      private

      # The quantifier part. The shorthands '*', '+' and '?' are used
      # whenever the bounds allow it, otherwise the {n}, {n,} or {n,m} form.
      def count_text
        if upper_bound == :more
          case lower_bound
          when 0 then '*'
          when 1 then '+'
          else "{#{lower_bound},}"
          end
        elsif upper_bound == lower_bound
          "{#{lower_bound}}"
        elsif lower_bound.zero? && upper_bound == 1
          '?'
        else
          "{#{lower_bound},#{upper_bound}}"
        end
      end

      # The marker appended to the quantifier for the repetition policy.
      def policy_text
        case policy
        when :greedy then ''
        when :lazy then '?'
        when :possessive then '+'
        end
      end

      # Validation method. Return the validated lower bound value.
      # A repetition count must be a non-negative Integer.
      def valid_lower_bound(aLowerBound)
        err_msg = "Invalid lower bound of repetition count #{aLowerBound}"
        raise StandardError, err_msg unless repetition_count?(aLowerBound)

        aLowerBound
      end

      # Validation method. Return the validated upper bound value.
      # The upper bound is either a non-negative Integer or the symbol :more.
      def valid_upper_bound(anUpperBound)
        err_msg = "Invalid upper bound of repetition count #{anUpperBound}"
        unless repetition_count?(anUpperBound) || (anUpperBound == :more)
          raise StandardError, err_msg
        end

        anUpperBound
      end

      # Validation method. Return the validated policy value.
      def valid_policy(aPolicy)
        err_msg = "Invalid repetition policy '#{aPolicy}'."
        valid_policies = %i[greedy lazy possessive]
        raise StandardError, err_msg unless valid_policies.include? aPolicy

        aPolicy
      end

      # True when the argument can be used as a repetition count. Negative
      # counts are refused: they would be rendered as an invalid quantifier.
      def repetition_count?(aValue)
        aValue.kind_of?(Integer) && aValue >= 0
      end
    end # class
  end # module
end # module
+# End of file

data/lib/regex/non_capturing_group.rb ADDED Viewed

@@ -0,0 +1,27 @@
+# File: non_capturing_group.rb
+require_relative 'monadic_expression' # Access the superclass
module Regex # Namespace of the regular expression classes
  # A non-capturing group: a pure grouping of sub-expressions, rendered
  # with the (?: ... ) syntax.
  class NonCapturingGroup < MonadicExpression
    # Constructor.
    # [aChildExpression]  The sub-expression to put inside the group.
    def initialize(aChildExpression)
      super
    end

    protected

    # Conversion method re-definition.
    # Purpose: Return the String representation of the group, that is the
    # child text enclosed between '(?:' and ')'.
    def text_repr
      "(?:#{all_child_text})"
    end
  end # class
end # module
+# End of file

data/lib/regex/polyadic_expression.rb ADDED Viewed

@@ -0,0 +1,60 @@
+# File: polyadic_expression.rb
+require_relative 'compound_expression' # Access the superclass
module Regex # This module is used as a namespace
  # Abstract class. An element that is part of a regular expression &
  # that has its own child sub-expressions.
  class PolyadicExpression < CompoundExpression
    # The aggregation of child elements
    attr_reader(:children)

    # Constructor.
    # [theChildren] an Array with the child sub-expressions. The Array is
    # stored as is (no copy is made).
    def initialize(theChildren)
      super()
      @children = theChildren
    end

    # Append the given child to the list of children.
    # Returns self, so that calls can be chained.
    # TODO: assess whether to defer to a subclass NAryExpression
    def <<(aChild)
      @children << aChild
      self
    end

    # Build a depth-first visitor of the descendants of this expression.
    # The visitor is implemented as an Enumerator: it yields every child
    # before the descendants of that child, siblings being visited in the
    # order of the children list. The expression itself is not yielded.
    # The traversal works on copies, so the children lists stay intact and
    # the visitor can be enumerated more than once.
    def df_visitor
      Enumerator.new do |yielder|
        # LIFO queue of the nodes still to visit. The lists are reversed so
        # that pop delivers the first child first.
        pending = children.reverse
        until pending.empty?
          node = pending.pop
          yielder << node # Return the visited child
          next if node.atomic?

          pending.concat(sub_expressions_of(node).reverse)
        end
      end
    end

    private

    # The direct sub-expressions of the given node, as an Array.
    # Nodes expose them either through a 'children' list or through a
    # single (possibly nil) 'child'.
    def sub_expressions_of(aNode)
      return aNode.children if aNode.respond_to?(:children)
      return [aNode.child].compact if aNode.respond_to?(:child)

      []
    end
  end # class
end # module
+# End of file

data/lib/regex/quantifiable.rb ADDED Viewed

@@ -0,0 +1,22 @@
+# File: quantifiable.rb
+require_relative 'multiplicity'
module Regex # This module is used as a namespace
  # Mix-in for the expressions that can carry a quantifier.
  module Quantifiable
    # The quantifier bound to the expression (nil when there is none).
    attr_accessor :quantifier

    # Return true iff a quantifier was assigned to this object.
    def quantified?
      !quantifier.nil?
    end
  end # module
end # module
+# End of file

data/lib/regex/repetition.rb ADDED Viewed

@@ -0,0 +1,29 @@
+# File: repetition.rb
+require_relative 'monadic_expression' # Access the superclass
module Regex # Namespace of the regular expression classes
  # A unary matching operator.
  # It succeeds when the specified repetition of the child expression
  # succeeds to match the subject text in the same serial arrangement.
  class Repetition < MonadicExpression
    # The object that tells how the child expression is repeated.
    attr_reader(:multiplicity)

    # Constructor.
    # [childExpressionToRepeat] the sub-expression to repeat
    # [aMultiplicity] the repetition specification; it must answer to_str
    def initialize(childExpressionToRepeat, aMultiplicity)
      super(childExpressionToRepeat)
      @multiplicity = aMultiplicity
    end

    protected

    # Conversion method re-definition.
    # Purpose: Return the String representation of the repetition, i.e. the
    # text of the child immediately followed by the text of the multiplicity.
    def text_repr
      repeated_text = all_child_text
      repeated_text + multiplicity.to_str
    end
  end # class
end # module
+# End of file