RubyGems - dhaka - Versions diffs - 2.0.1 → 2.1.0 - Mend

dhaka 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

data/Rakefile +64 -0
data/lib/dhaka.rb +12 -0
data/lib/dot/dot.rb +29 -0
data/lib/evaluator/evaluator.rb +35 -26
data/lib/grammar/grammar.rb +42 -17
data/lib/grammar/grammar_symbol.rb +4 -3
data/lib/grammar/production.rb +9 -3
data/lib/lexer/compiled_lexer.rb +46 -0
data/lib/lexer/dfa.rb +71 -0
data/lib/lexer/lexeme.rb +33 -0
data/lib/lexer/lexer.rb +61 -0
data/lib/lexer/lexer_run.rb +66 -0
data/lib/lexer/regex_grammar.rb +368 -0
data/lib/lexer/regex_parser.rb +1888 -0
data/lib/lexer/regex_tokenizer.rb +14 -0
data/lib/lexer/specification.rb +69 -0
data/lib/lexer/state.rb +45 -0
data/lib/lexer/state_machine.rb +37 -0
data/lib/parser/action.rb +3 -3
data/lib/parser/compiled_parser.rb +11 -3
data/lib/parser/parse_result.rb +3 -5
data/lib/parser/parse_tree.rb +6 -17
data/lib/parser/parser.rb +15 -14
data/lib/parser/parser_run.rb +4 -2
data/lib/parser/parser_state.rb +16 -8
data/lib/tokenizer/tokenizer.rb +5 -3
data/test/arithmetic_precedence/arithmetic_precedence_lexer_specification.rb +23 -0
data/test/arithmetic_precedence/arithmetic_precedence_parser_test.rb +4 -2
data/test/chittagong/chittagong_driver.rb +12 -13
data/test/chittagong/chittagong_driver_test.rb +18 -11
data/test/chittagong/chittagong_evaluator.rb +7 -16
data/test/chittagong/chittagong_evaluator_test.rb +7 -4
data/test/chittagong/chittagong_grammar.rb +0 -6
data/test/chittagong/chittagong_lexer.rb +109 -0
data/test/chittagong/chittagong_lexer_specification.rb +39 -0
data/test/chittagong/{chittagong_tokenizer_test.rb → chittagong_lexer_test.rb} +12 -6
data/test/chittagong/chittagong_parser.rb +879 -0
data/test/chittagong/chittagong_parser_test.rb +8 -10
data/test/chittagong/chittagong_test.rb +17 -13
data/test/compiled_parser_test.rb +7 -2
data/test/evaluator_test.rb +0 -1
data/test/grammar_test.rb +19 -1
data/test/lexer_test.rb +215 -0
data/test/parse_result_test.rb +8 -8
data/test/parser_state_test.rb +0 -12
metadata +21 -5
data/test/arithmetic_precedence/arithmetic_precedence_tokenizer.rb +0 -39
data/test/chittagong/chittagong_tokenizer.rb +0 -88

data/lib/lexer/dfa.rb ADDED

@@ -0,0 +1,71 @@
+module Dhaka
+  module LexerSupport
+    # Raised by DFA.new when a regular expression pattern (for example one taken
+    # from a LexerSpecification) cannot be tokenized or parsed.
+    class InvalidRegexException < StandardError
+    end
+    class DFA < StateMachine #:nodoc:
+      def initialize(regex) # Parses +regex+ into an AST; raises InvalidRegexException if tokenizing or parsing reports an error.
+        @regex = regex
+        tokenize_result = RegexTokenizer.tokenize(@regex)
+        raise InvalidRegexException.new(tokenize_error_message(tokenize_result)) if tokenize_result.has_error?
+        parse_result = RegexParser.parse(tokenize_result)
+        raise InvalidRegexException.new(parse_error_message(parse_result)) if parse_result.has_error?
+        ast = parse_result # the parse result itself is used as the AST root
+        ast.calculate_follow_sets
+        super(ItemSet.new(ast.first)) # hands an ItemSet of the AST's first positions to the superclass constructor
+      end
+      def tokenize_error_message(tokenize_result) # Names the unexpected character and shows a copy of the regex with '>>>' inserted before it.
+        "Invalid character #{@regex[tokenize_result.unexpected_char_index].chr}: #{@regex.dup.insert(tokenize_result.unexpected_char_index, '>>>')}"
+      end
+      def parse_error_message(parse_result) # Reports either a premature end of the regex or the unexpected token, marked with '>>>' at its input position.
+        unexpected_token = parse_result.unexpected_token
+        if unexpected_token.symbol_name == END_SYMBOL_NAME
+          "Unexpected end of regex."
+        else
+          "Unexpected token #{parse_result.unexpected_token.symbol_name}: #{@regex.dup.insert(parse_result.unexpected_token.input_position, '>>>')}"
+        end
+      end
+      def dest_key_for key, char # Returns an ItemSet holding the union of the follow sets of every position in +key+ whose character is +char+.
+        result = ItemSet.new
+        key.each do |position|
+          result.merge(position.follow_set) if position.character == char
+        end
+        result
+      end
+      def new_state_for_key key # Creates the State for +key+; its second argument is @regex when +key+ holds an accepting position, nil otherwise.
+         accepting = key.detect {|position| position.accepting}
+         State.new(self, accepting && @regex)
+      end
+      def transition_characters key # Returns the Set of characters of all non-accepting positions in +key+.
+        result = Set.new
+        key.each do |node|
+          result << node.character unless node.accepting
+        end
+        result
+      end
+      def matches(string) # Returns whether following the transitions for each byte of +string+ from @start_state ends in an accepting state.
+        curr_state = @start_state
+        string.unpack("C*").each do |i| # "C*" yields the string's bytes as unsigned integers
+          dest_state = curr_state.transitions[i.chr]
+          return false unless dest_state # no transition on this character: the string cannot match
+          curr_state = dest_state
+        end
+        return curr_state.accepting?
+      end
+    end
+  end
+end

data/lib/lexer/lexeme.rb ADDED

@@ -0,0 +1,33 @@
+module Dhaka
+  # Represents a portion of the input string that has been recognized as matching a given lexer pattern.
+  class Lexeme
+    # The pattern matched by this lexeme; nil until one is assigned.
+    attr_accessor :pattern
+    # +input_position+ is the index in the input stream that this lexeme starts at.
+    attr_reader :input_position
+    attr_reader :characters # The characters collected so far through << and concat.
+    def initialize(input_position) #:nodoc:
+      @input_position = input_position
+      @characters = []
+    end
+    # The substring of the input stream that this lexeme is comprised of (the collected characters joined together).
+    def value
+      characters.join
+    end
+    def accepted? #:nodoc:
+      pattern # truthy once a pattern has been assigned; note this returns the pattern itself, not true/false
+    end
+    def << char #:nodoc:
+      @characters << char
+    end
+    def concat chars #:nodoc:
+      @characters.concat chars
+    end
+  end
+end

data/lib/lexer/lexer.rb ADDED

@@ -0,0 +1,61 @@
+module Dhaka
+  # The lexer generator. To generate a lexer from a lexer specification +MyLexerSpecification+:
+  #   lexer = Dhaka::Lexer.new(MyLexerSpecification)
+  #
+  # To compile this lexer as +MyLexer+ to a string of Ruby source:
+  #   lexer.compile_to_ruby_source_as(:MyLexer)
+  class Lexer < LexerSupport::StateMachine
+    attr_reader :specification # The specification this lexer was created from.
+    # Creates a new lexer from a given specification. One DFA is built per pattern in +specification.items+.
+    def initialize(specification)
+      dfas           = {}
+      @specification = specification
+      specification.items.each do |pattern, item| # +item+ is not used here; only the pattern is needed to build the DFA
+        dfas[pattern] = LexerSupport::DFA.new(pattern)
+      end
+      super(ItemSet.new(dfas.values.collect{|dfa| dfa.start_state})) # passes an ItemSet of every DFA's start state to the superclass
+    end
+    # Compiles the lexer to Ruby code that when executed, reloads all the states and actions of the lexer
+    # into a class named +lexer_class_name+. Returns the generated source as a String.
+    def compile_to_ruby_source_as lexer_class_name
+      result  =   "class #{lexer_class_name} < Dhaka::CompiledLexer\n\n"
+      result <<   "  self.specification = #{specification.name}\n\n"
+      result <<   "  start_with #{start_state.object_id}\n\n"
+      @states.each do |key, state| # each state contributes its own compiled source fragment
+        result << "#{state.compile_to_ruby_source}\n\n"
+      end
+      result <<   "end"
+      result
+    end
+    # Returns a LexerRun that tokenizes +input+.
+    def lex input
+      LexerRun.new(self, input)
+    end
+    def action_for_pattern pattern #:nodoc:
+      @specification.items[pattern].action
+    end
+    private
+      def new_state_for_key key # Creates the State for +key+, carrying the pattern of the minimum (Array#min) specification item among its accepting states, or nil if none accept.
+        item = key.select {|state| state.accepting?}.collect {|state| @specification.items[state.pattern]}.min # NOTE(review): relies on the items being comparable; presumably ordered by priority - confirm
+        LexerSupport::State.new(self, item && item.pattern)
+      end
+      def transition_characters states # Returns every distinct character on which some state in +states+ has a transition.
+        states.collect{|state| state.transitions.keys}.flatten.uniq
+      end
+      def dest_key_for states, char # Returns an ItemSet of the states reached from +states+ by a transition on +char+.
+        result = ItemSet.new
+        states.each do |state|
+          dest_state = state.transitions[char]
+          result << dest_state if dest_state
+        end
+        result
+      end
+  end
+end

data/lib/lexer/lexer_run.rb ADDED

@@ -0,0 +1,66 @@
+module Dhaka
+  # Represents a run of a lexer on a given input string.
+  class LexerRun
+    include Enumerable
+    attr_reader :current_lexeme # The Lexeme currently being assembled.
+    def initialize lexer, input
+      @lexer, @input          = lexer, input
+      @input_position         = 0
+      @not_yet_accepted_chars = [] # characters consumed since the last accepting state was reached
+    end
+    # Constructs a token of type +symbol_name+ from the +current_lexeme+ (its joined characters and input position).
+    def create_token(symbol_name)
+      Token.new(symbol_name, @current_lexeme.characters.join, @current_lexeme.input_position)
+    end
+    # Yields each token as it is recognized, then a final END_SYMBOL_NAME token. Returns a TokenizerErrorResult if an error occurs during tokenization.
+    def each
+      reset_and_rewind
+      loop do
+        c = curr_char
+        break if (c == "\0" && @not_yet_accepted_chars.empty? && !@current_lexeme.accepted?) # end of input with nothing pending
+        dest_state  = @curr_state.transitions[c]
+        unless dest_state # no transition on c: either emit the accepted lexeme or report an error
+          return TokenizerErrorResult.new(@input_position) unless @current_lexeme.accepted?
+          token = get_token
+          yield token if token # an action that returns nil or false produces no token
+          reset_and_rewind
+        else
+          @curr_state = dest_state
+          if @curr_state.accepting?
+            @current_lexeme.pattern = @curr_state.pattern
+            @current_lexeme.concat @not_yet_accepted_chars # pending characters are now part of the accepted lexeme
+            @not_yet_accepted_chars = []
+            @current_lexeme << c
+          else
+            @not_yet_accepted_chars << c
+          end
+          advance
+        end
+      end
+      yield Token.new(END_SYMBOL_NAME, nil, nil)
+    end
+    private
+      def reset_and_rewind # Moves the input position back over the unaccepted characters and starts a fresh lexeme at the lexer's start state.
+        @input_position -= @not_yet_accepted_chars.size
+        @current_lexeme = Lexeme.new(@input_position)
+        @curr_state     = @lexer.start_state
+        @not_yet_accepted_chars = []
+      end
+      def curr_char # Returns the current input character as a string, or "\0" when the position is past the end of the input.
+        (@input[@input_position] || 0).chr # NOTE(review): a literal NUL in the input looks the same as end of input to +each+ - confirm this is acceptable
+      end
+      def advance
+        @input_position += 1
+      end
+      def get_token # Evaluates the action block registered for the current lexeme's pattern with this run as +self+.
+        instance_eval(&@lexer.action_for_pattern(@current_lexeme.pattern))
+      end
+  end
+end

data/lib/lexer/regex_grammar.rb ADDED

@@ -0,0 +1,368 @@
+module Dhaka
+  module LexerSupport #:nodoc:all
+    DIGITS                  = ('0'..'9').to_a
+    LOWERCASE_LETTERS       = ('a'..'z').to_a
+    UPPERCASE_LETTERS       = ('A'..'Z').to_a
+    LETTERS                 = LOWERCASE_LETTERS + UPPERCASE_LETTERS
+    WHITESPACE              = [" ", "\n", "\t"]
+    SYMBOLS                 = %w| ~ ` ! @ # % & _ = : ; " ' < , > - |
+    CLASSES                 = {'d' => DIGITS, 'w' => LETTERS, 's' => WHITESPACE} # expansions of the \d, \w and \s escapes; note \w covers letters only here
+    OPERATOR_CHARACTERS     = {'(' => 'open_parenth', ')' => 'close_parenth', '[' => 'open_square_bracket', # operator character => name of the production for its backslash-escaped form
+                              ']' => 'close_square_bracket', '+' => 'plus', '*' => 'asterisk',
+                              '?' => 'question_mark', '.' => 'period', '\\' => 'back_slash',
+                              '|' => 'pipe', '{' => 'left_curly_brace', '}' => 'right_curly_brace',
+                              '/' => 'forward_slash', '^' => 'caret', '$' => 'dollar'}
+    SET_OPERATOR_CHARACTERS = %w| - ^ [ ] \\ | # characters that need a backslash escape inside a [...] set
+    ALL_CHARACTERS          = DIGITS + LETTERS + SYMBOLS + WHITESPACE + OPERATOR_CHARACTERS.keys # the character universe used by '.' and by negated sets
+    class RegexGrammar < Dhaka::Grammar # Grammar of the supported regex syntax; each production's block builds an AST node, a character or a list of characters.
+      for_symbol(Dhaka::START_SYMBOL_NAME) do # the whole regex, wrapped in a RootNode
+        regex                         %w| Disjunction |                         do RootNode.new(child_nodes[0]) end
+      end
+      for_symbol('Disjunction') do # alternatives separated by '|'
+        disjunction                   %w| Alternative \| Disjunction |          do OrNode.new(child_nodes[0], child_nodes[2]) end
+        alternative                   %w| Alternative |                         do child_nodes[0] end
+      end
+      for_symbol('Alternative') do # a sequence of one or more terms
+        concatenation                 %w| Alternative Term |                    do CatNode.new(child_nodes[0], child_nodes[1]) end
+        term                          %w| Term |                                do child_nodes[0] end
+      end
+      for_symbol('Term') do # an atom with an optional *, + or ? suffix
+        zero_or_more                  %w| Atom * |                              do ZeroOrMoreNode.new(child_nodes[0]) end
+        one_or_more                   %w| Atom + |                              do OneOrMoreNode.new(child_nodes[0]) end
+        zero_or_one                   %w| Atom ? |                              do ZeroOrOneNode.new(child_nodes[0]) end
+        atom                          %w| Atom |                                do child_nodes[0] end
+      end
+      for_symbol('Atom') do # a group, a single character, '.', a character set, an escape class or an escaped operator
+        group                         %w| ( Disjunction ) |                     do child_nodes[1] end
+        char                          %w| Character |                           do LeafNode.new(child_nodes[0]) end
+        anything                      %w| . |                                   do OrNode.new(*(ALL_CHARACTERS - ["\n"]).collect {|char| LeafNode.new(char)}) end # any known character except newline
+        positive_set                  %w| [ SetContents ] |                     do OrNode.new(*child_nodes[1].collect{|char| LeafNode.new(char)}) end
+        negative_set                  %w| [ ^ SetContents ] |                   do OrNode.new(*(ALL_CHARACTERS - child_nodes[2]).collect {|char| LeafNode.new(char)}) end # every known character not listed in the set
+        CLASSES.each do |char, expansion| # one production per escape class, e.g. character_class_d for \d
+          send("character_class_#{char}", ['\\', char]) do
+            OrNode.new(*CLASSES[char].collect {|c| LeafNode.new(c)})
+          end
+        end
+        OPERATOR_CHARACTERS.each do |char, method_name| # one production per backslash-escaped operator character
+          send(method_name, ['\\', char]) do
+            LeafNode.new(char)
+          end
+        end
+      end
+      for_symbol('Character') do # an unescaped letter, digit, whitespace or symbol character
+        letter_character              %w| Letter |                              do child_nodes[0] end
+        digit_character               %w| Digit |                               do child_nodes[0] end
+        white_space_character         %w| Whitespace |                          do child_nodes[0] end
+        symbol_character              %w| Symbol |                              do child_nodes[0] end
+      end
+      for_symbol('SetContents') do # one or more set items, accumulated into a single array of characters
+        single_item                   %w| SetItem |                             do child_nodes[0] end
+        multiple_items                %w| SetContents SetItem |                 do child_nodes[0].concat child_nodes[1] end
+      end
+      for_symbol('SetItem') do # a single set character, or a lowercase, uppercase or digit range expanded to an array
+        single_char_item              %w| SetCharacter |                        do [child_nodes[0]] end
+        lower_case_letter_range       %w| LowercaseLetter - LowercaseLetter |   do (child_nodes[0]..child_nodes[2]).to_a end
+        upper_case_letter_range       %w| UppercaseLetter - UppercaseLetter |   do (child_nodes[0]..child_nodes[2]).to_a end
+        digit_range                   %w| Digit - Digit |                       do (child_nodes[0]..child_nodes[2]).to_a end
+      end
+      for_symbol('Letter') do
+        lower_case_letter             %w| LowercaseLetter |                     do child_nodes[0] end
+        upper_case_letter             %w| UppercaseLetter |                     do child_nodes[0] end
+      end
+      for_symbol('LowercaseLetter') do # one production per lowercase letter
+        LOWERCASE_LETTERS.each do |letter|
+          send("lower_char_letter_#{letter}", letter) do # NOTE(review): "lower_char_letter_" differs from the "upper_case_letter_" naming below; renaming would change the production name - confirm nothing depends on it
+            letter
+          end
+        end
+      end
+      for_symbol('UppercaseLetter') do # one production per uppercase letter
+        UPPERCASE_LETTERS.each do |letter|
+          send("upper_case_letter_#{letter}", letter) do
+            letter
+          end
+        end
+      end
+      for_symbol('Digit') do # one production per digit
+        DIGITS.each do |digit|
+          send("digit_#{digit}", digit) do
+            digit
+          end
+        end
+      end
+      for_symbol('Whitespace') do # one production per whitespace character
+        WHITESPACE.each do |whitespace_char|
+          send("whitespace_#{whitespace_char[0]}", whitespace_char) do # NOTE(review): String#[] with an Integer index gives a byte value on Ruby 1.8 but a one-character string on 1.9+, so this name depends on the Ruby version
+            whitespace_char
+          end
+        end
+      end
+      for_symbol('Symbol') do # one production per symbol character
+        SYMBOLS.each do |symbol_char|
+          send("symbol_char_#{symbol_char[0]}", symbol_char) do
+            symbol_char
+          end
+        end
+      end
+      for_symbol('SetCharacter') do # characters allowed inside a [...] set
+        (ALL_CHARACTERS - SET_OPERATOR_CHARACTERS).each do |char| # these appear unescaped inside a set
+          send("set_character_#{char[0]}", char) do
+            char
+          end
+        end
+        SET_OPERATOR_CHARACTERS.each do |char| # these must be preceded by a backslash inside a set
+          send("set_operator_character_#{char[0]}", ['\\', char]) do
+            char
+          end
+        end
+      end
+    end
+    class ASTNode # Base class of the regex AST nodes.
+      def accepting # Nodes are non-accepting unless a subclass overrides this.
+        false
+      end
+    end
+    class BinaryNode < ASTNode # A node with a left and a right subtree.
+      attr_reader :left, :right
+      def initialize left, right
+        @left, @right = left, right
+      end
+      def to_dot(graph) # Adds this node, an edge to each subtree, and then both subtrees to +graph+; uses +label+, which subclasses define.
+        graph.node(self, :label => label)
+        graph.edge(self, left)
+        graph.edge(self, right)
+        left.to_dot(graph)
+        right.to_dot(graph)
+      end
+      def calculate_follow_sets # Recurses into both subtrees; subclasses add their own contribution after calling super.
+        left.calculate_follow_sets
+        right.calculate_follow_sets
+      end
+    end
+    class OrNode < ASTNode # Alternation over any number of child nodes.
+      attr_reader :children
+      def initialize(*children)
+        @children = children
+      end
+      def label
+        "|"
+      end
+      def nullable # Nullable when at least one alternative is nullable.
+        children.any? {|child| child.nullable}
+      end
+      def first # Union of the children's first sets.
+        children.inject(Set.new([])) do |result, child|
+          result | child.first
+        end
+      end
+      def last # Union of the children's last sets.
+        children.inject(Set.new([])) do |result, child|
+          result | child.last
+        end
+      end
+      def to_dot(graph) # Adds this node and, for each child, an edge plus the child's own subtree to +graph+.
+        graph.node(self, :label => label)
+        children.each do |child|
+          graph.edge(self, child)
+          child.to_dot(graph)
+        end
+      end
+      def calculate_follow_sets # Alternation adds no follow relations of its own; it only recurses into the children.
+        children.each do |child|
+          child.calculate_follow_sets
+        end
+      end
+    end
+    class CatNode < BinaryNode # Concatenation: +left+ followed by +right+.
+      def label
+        "cat"
+      end
+      def nullable # Nullable only when both sides are nullable.
+        left.nullable && right.nullable
+      end
+      def first # Includes right's first set only when left can match the empty string.
+        left.nullable ? (left.first | right.first) : left.first
+      end
+      def last # Includes left's last set only when right can match the empty string.
+        right.nullable ? (left.last | right.last) : right.last
+      end
+      def calculate_follow_sets
+        super
+        left.last.each do |leaf_node| # every last position of left may be followed by any first position of right
+          leaf_node.follow_set.merge right.first
+        end
+      end
+    end
+    class UnaryNode < ASTNode # A node with a single child; nullable, first and last default to the child's values.
+      attr_reader :child
+      def initialize child
+        @child = child
+      end
+      def to_dot(graph) # Adds this node, an edge to the child, and the child's subtree to +graph+; uses +label+, which subclasses define.
+        graph.node(self, :label => label)
+        graph.edge(self, child)
+        child.to_dot(graph)
+      end
+      def nullable
+        child.nullable
+      end
+      def first
+        child.first
+      end
+      def last
+        child.last
+      end
+      def calculate_follow_sets # Recurses into the child; subclasses add their own contribution after calling super.
+        child.calculate_follow_sets
+      end
+    end
+    class RootNode < CatNode # Root of a regex AST: the parsed pattern concatenated with a fresh AcceptingNode.
+      def initialize(left)
+        super(left, AcceptingNode.new()) # the AcceptingNode becomes the right-hand side of the concatenation
+      end
+      def label
+        "start"
+      end
+      def head_node?
+        true
+      end
+    end
+    class ZeroOrMoreNode < UnaryNode # The child repeated zero or more times ('*').
+      def label
+        "*"
+      end
+      def nullable # Always nullable: zero repetitions match the empty string.
+        true
+      end
+      def calculate_follow_sets
+        super
+        last.each do |leaf_node| # repetition: each last position may be followed by any first position
+          leaf_node.follow_set.merge first
+        end
+      end
+    end
+    class ZeroOrOneNode < UnaryNode # The child made optional ('?'); first, last and follow-set handling are inherited from UnaryNode.
+      def label
+        "?"
+      end
+      def nullable # Always nullable: the child may be absent.
+        true
+      end
+    end
+    class OneOrMoreNode < UnaryNode # The child repeated one or more times ('+'); nullable is inherited, so it follows the child.
+      def label
+        "+"
+      end
+      def calculate_follow_sets
+        super
+        last.each do |leaf_node| # repetition: each last position may be followed by any first position
+          leaf_node.follow_set.merge first
+        end
+      end
+    end
+    class LeafNode < ASTNode # A single character position in the regex, together with the set of positions that may follow it.
+      attr_reader :character, :follow_set
+      def initialize character
+        @character = character
+        @follow_set = Set.new # filled in by the calculate_follow_sets methods of enclosing nodes
+      end
+      def to_dot(graph)
+        graph.node(self, :label => character)
+      end
+      def nullable # A leaf always consumes one character.
+        false
+      end
+      def first # A leaf is its own only first position.
+        Set.new([self])
+      end
+      def last # A leaf is its own only last position.
+        Set.new([self])
+      end
+      def calculate_follow_sets # Intentionally empty: a leaf has no children to recurse into.
+      end
+    end
+    class AcceptingNode < ASTNode # End marker that RootNode appends after the pattern. NOTE(review): nullable and last are not defined here; RootNode#last would call nullable on this node - confirm it is never invoked.
+      def accepting # Overrides the ASTNode default so this position marks acceptance.
+        true
+      end
+      def character # Returns nil: the end marker stands for no input character.
+      end
+      def first # The marker is its own only first position.
+        Set.new([self])
+      end
+      def calculate_follow_sets # Intentionally empty: nothing follows the end marker.
+      end
+      def to_dot(graph)
+        graph.node(self, :label => '#')
+      end
+    end
+  end
+end