RubyGems - dhaka - Versions diffs - 1.0.0 → 2.0.0 - Mend

dhaka 1.0.0 → 2.0.0

Files changed (32) hide show

data/lib/dhaka.rb +1 -4
data/lib/evaluator/evaluator.rb +65 -15
data/lib/grammar/grammar.rb +30 -0
data/lib/grammar/grammar_symbol.rb +1 -1
data/lib/grammar/production.rb +1 -1
data/lib/parser/action.rb +1 -3
data/lib/parser/parse_result.rb +9 -7
data/lib/parser/parse_tree.rb +9 -2
data/lib/parser/parser.rb +7 -0
data/lib/parser/parser_run.rb +12 -19
data/lib/parser/token.rb +10 -7
data/lib/tokenizer/tokenizer.rb +90 -17
data/test/all_tests.rb +7 -6
data/test/arithmetic_evaluator_test.rb +20 -20
data/test/arithmetic_precedence_evaluator.rb +1 -1
data/test/arithmetic_precedence_parser_test.rb +7 -7
data/test/arithmetic_precedence_tokenizer.rb +3 -9
data/test/arithmetic_test_methods.rb +2 -2
data/test/arithmetic_tokenizer.rb +3 -9
data/test/arithmetic_tokenizer_test.rb +14 -10
data/test/bracket_tokenizer.rb +1 -1
data/test/chittagong_driver_test.rb +261 -0
data/test/chittagong_evaluator.rb +218 -47
data/test/chittagong_evaluator_test.rb +18 -20
data/test/chittagong_grammar.rb +61 -15
data/test/chittagong_parser_test.rb +24 -12
data/test/chittagong_test.rb +148 -6
data/test/chittagong_tokenizer.rb +33 -21
data/test/chittagong_tokenizer_test.rb +16 -8
data/test/compiled_parser_test.rb +14 -12
data/test/parser_test.rb +16 -16
metadata +3 -2

data/lib/dhaka.rb CHANGED

@@ -1,5 +1,5 @@
 #--
-# Copyright (c) 2006 Mushfeq Khan
+# Copyright (c) 2006, 2007 Mushfeq Khan
 #
 # Permission is hereby granted, free of charge, to any person obtaining
 # a copy of this software and associated documentation files (the
@@ -21,9 +21,6 @@
 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #++
-# An introduction to Dhaka and annotated examples can be found at the project homepage http://dhaka.rubyforge.org
-#
-# Further examples can be found in the test suites included with the gem.
 module Dhaka
 end

data/lib/evaluator/evaluator.rb CHANGED

@@ -8,6 +8,54 @@ module Dhaka
   # An evaluation rule for a given production named +bar+ is defined by calling +for_bar+ with
   # a block that performs the evaluation. For detailed examples, see the evaluators in the
   # test suite.
+  #
+  # The following is an evaluator for arithmetic expressions. When a syntax tree node is encountered that
+  # corresponds to the production named +addition+, the block passed to +for_addition+ is invoked. The +evaluate+
+  # method is then recursively called on the child nodes, in this case the operands to the addition operation. The
+  # result is obtained by adding the evaluation results of the child nodes.
+  #
+  #    class ArithmeticPrecedenceEvaluator < Dhaka::Evaluator
+  #
+  #      self.grammar = ArithmeticPrecedenceGrammar
+  #
+  #      define_evaluation_rules do
+  #
+  #        for_subtraction do
+  #          evaluate(child_nodes[0]) - evaluate(child_nodes[2])
+  #        end
+  #
+  #        for_addition do
+  #          evaluate(child_nodes[0]) + evaluate(child_nodes[2])
+  #        end
+  #
+  #        for_division do
+  #          evaluate(child_nodes[0]).to_f/evaluate(child_nodes[2])
+  #        end
+  #
+  #        for_multiplication do
+  #          evaluate(child_nodes[0]) * evaluate(child_nodes[2])
+  #        end
+  #
+  #        for_literal do
+  #          child_nodes[0].token.value.to_i
+  #        end
+  #
+  #        for_parenthetized_expression do
+  #          evaluate(child_nodes[1])
+  #        end
+  #
+  #        for_negated_expression do
+  #          -evaluate(child_nodes[1])
+  #        end
+  #
+  #        for_power do
+  #          evaluate(child_nodes[0])**evaluate(child_nodes[2])
+  #        end
+  #
+  #      end
+  #
+  #    end
   class Evaluator
@@ -18,17 +66,29 @@ module Dhaka
     def evaluate node
       @node_stack ||= []
       @node_stack << node.child_nodes
-      proc = self.class.actions[node.production.name]
-      result = self.instance_eval(&proc)
+      result = self.send(node.production.name)
       @node_stack.pop
       result
     end
+    # Performs the pass-through calculations for nodes with only one child_node for which an
+    # evaluation rule is not explicitly defined. Will probably be deprecated in future versions.
+    def method_missing(method_name)
+      evaluate(child_nodes[0])
+    end
     # Returns the array of child nodes of the node being currently evaluated.
     def child_nodes
       @node_stack[-1]
     end
+    # Evaluation rules are defined within a block passed to this method.
+    def self.define_evaluation_rules
+      self.actions = []
+      yield
+      check_definitions
+    end
     private
     def self.inherited(evaluator)
@@ -37,29 +97,19 @@ module Dhaka
       end
     end
-    def self.define_evaluation_rules
-      default_action = Proc.new { evaluate(child_nodes[0]) }
-      self.actions = Hash.new { |hash, key| default_action }
-      yield
-      check_definitions
-    end
     def self.method_missing(method_name, &blk)
       if method_name.to_s =~ /^for_*/
         rule_name = method_name.to_s[4..-1]
-        self.for_rule_named(rule_name, &blk)
+        self.actions << rule_name
+        self.send(:define_method, rule_name, &blk)
       end
     end
     def self.check_definitions
-      non_trivial_productions_with_rules_undefined = self.grammar.productions.select {|production| production.expansion.size != 1}.collect {|production| production.name} - self.actions.keys
+      non_trivial_productions_with_rules_undefined = self.grammar.productions.select {|production| production.expansion.size != 1}.collect {|production| production.name} - self.actions
       raise EvaluatorDefinitionError.new(non_trivial_productions_with_rules_undefined) unless non_trivial_productions_with_rules_undefined.empty?
     end
-    def self.for_rule_named(name, &blk)
-      self.actions[name] = blk
-    end
   end
   class EvaluatorDefinitionError < StandardError #:nodoc:

data/lib/grammar/grammar.rb CHANGED

@@ -60,6 +60,36 @@ module Dhaka
   end
   # This class is subclassed when specifying a grammar. Note that subclasses of this class may not be further subclassed.
+  #
+  # The following is a grammar specification for simple arithmetic. Familiarity with Yacc helps, but the short version is
+  # that precedences for symbols are specified in ascending order of binding strength, with equal-strength symbols
+  # on the same level. Production rules are specified for each symbol by specifying the name of the production (used when
+  # encoding the Evaluator) and the expansion for that particular production. For example, the production named
+  # +addition+ expands the symbol <tt>'E'</tt> to the list of symbols <tt>['E', '+', 'E']</tt>.
+  #
+  #  class ArithmeticPrecedenceGrammar < Dhaka::Grammar
+  #    precedences do
+  #      left ['+', '-']
+  #      left ['*', '/']
+  #      nonassoc ['^']
+  #    end
+  #
+  #    for_symbol(Dhaka::START_SYMBOL_NAME) do
+  #      expression ['E']
+  #    end
+  #
+  #    for_symbol('E') do
+  #      addition ['E', '+', 'E']
+  #      subtraction ['E', '-', 'E']
+  #      multiplication ['E', '*', 'E']
+  #      division ['E', '/', 'E']
+  #      power ['E', '^', 'E']
+  #      literal ['n']
+  #      parenthetized_expression ['(', 'E', ')']
+  #      negated_expression ['-', 'E'], :prec => '*'
+  #    end
+  #  end
+  #
   class Grammar
     # Used for defining the productions for the symbol with name +symbol+. The block +blk+ is

data/lib/grammar/grammar_symbol.rb CHANGED

@@ -13,7 +13,7 @@ module Dhaka
     def terminal
       !non_terminal
     end
-    def to_s
+    def to_s #:nodoc:
       name
     end
     def <=> other

data/lib/grammar/production.rb CHANGED

@@ -23,7 +23,7 @@ module Dhaka
       @precedence
     end
-    def to_s
+    def to_s #:nodoc:
       "#{@name} #{@symbol} ::= #{@expansion.join(' ')}"
     end

data/lib/parser/action.rb CHANGED

@@ -36,9 +36,7 @@ module Dhaka
         node_stack << composite_node
         unless composite_node.head_node?
-          [production.symbol.name, current_token.grammar_symbol.name]
-        else
-          []
+          @symbol_queue += [@current_token.symbol_name, production.symbol.name]
         end
       end
     end

data/lib/parser/parse_result.rb CHANGED

@@ -2,22 +2,24 @@ module Dhaka
   # Returned on successful parsing of the input token stream.
   class ParseSuccessResult
     # Contains the parse result.
-    attr_accessor :syntax_tree
-    def initialize(syntax_tree) #:nodoc:
-      @syntax_tree = syntax_tree
+    attr_accessor :parse_tree
+    def initialize(parse_tree) #:nodoc:
+      @parse_tree = parse_tree
     end
     # This is false.
     def has_error?
       false
     end
+    # Deprecated. Use the +parse_tree+ accessor.
+    alias syntax_tree parse_tree
   end
   # Returned on unsuccessful parsing of the input token stream.
   class ParseErrorResult
-    # The index of the token that caused the parse error.
-    attr_reader :bad_token_index
-    def initialize(bad_token_index) #:nodoc:
-      @bad_token_index = bad_token_index
+    # The token that caused the parse error.
+    attr_reader :unexpected_token
+    def initialize(unexpected_token) #:nodoc:
+      @unexpected_token = unexpected_token
     end
     # This is true.
     def has_error?

data/lib/parser/parse_tree.rb CHANGED

@@ -7,7 +7,10 @@ module Dhaka
       @child_nodes = []
     end
     def linearize #:nodoc:
-      child_nodes.collect {|child_node| child_node.linearize}.flatten + [production.name]
+      child_nodes.collect {|child_node| child_node.linearize}.flatten + [self]
+    end
+    def tokens
+      child_nodes.collect{|child_node| child_node.tokens}.flatten
     end
     def to_s #:nodoc:
       "CompositeNode: #{production.symbol} --> [#{child_nodes.join(", ")}]"
@@ -33,7 +36,7 @@ module Dhaka
     def dot_name #:nodoc:
       "Node#{object_id}"
     end
   end
   # These are leaf nodes of syntax trees. They contain tokens.
@@ -45,6 +48,9 @@ module Dhaka
     def linearize #:nodoc:
       []
     end
+    def tokens
+      [token]
+    end
     def to_s #:nodoc:
       "LeafNode: #{token}"
     end
@@ -60,5 +66,6 @@ module Dhaka
     def dot_name #:nodoc:
       "Node#{object_id}"
     end
   end
 end

data/lib/parser/parser.rb CHANGED

@@ -3,6 +3,13 @@ require 'set'
 require 'logger'
 module Dhaka
+  # The parser generator. To generate a parser from a grammar specification +ArithmeticPrecedenceGrammar+, one would
+  # write:
+  #   parser = Dhaka::Parser.new(ArithmeticPrecedenceGrammar)
+  #
+  # To compile this parser to Ruby source as +ArithmeticPrecedenceParser+:
+  #   parser.compile_to_ruby_source_as(:ArithmeticPrecedenceParser)
+  # which returns a string of Ruby code.
   class Parser
     include ParserMethods
     attr_reader :grammar, :start_state

data/lib/parser/parser_run.rb CHANGED

@@ -6,36 +6,29 @@ module Dhaka
       @node_stack = []
       @state_stack = [start_state]
       @token_stream = token_stream
-      @current_token_index = 0
+      @symbol_queue = []
     end
     def run
-      for_each_token_with_end do
-        error = execute_action current_token.grammar_symbol.name
+      token_stream.each do |token|
+        @current_token = token
+        @symbol_queue << @current_token.symbol_name
+        error = execute_actions
         return error if error
-        node_stack << ParseTreeLeafNode.new(current_token)
-        @current_token_index += 1
+        node_stack << ParseTreeLeafNode.new(@current_token)
       end
       ParseSuccessResult.new(node_stack[0])
     end
     private
-    attr_reader :state_stack, :token_stream, :node_stack, :current_token
+    attr_reader :state_stack, :token_stream, :node_stack
-    def for_each_token_with_end
-      token_stream.each do |@current_token|
-        yield
-      end
-      @current_token = Token.new(@grammar.end_symbol, nil)
-      yield
-    end
-    def execute_action symbol_name
-      action = state_stack[-1].actions[symbol_name]
-      return ParseErrorResult.new(@current_token_index) unless action
-      self.instance_eval(&action.action_code).each do |symbol_name|
-        execute_action symbol_name
+    def execute_actions
+      while symbol_name = @symbol_queue.pop
+        action = state_stack[-1].actions[symbol_name]
+        return ParseErrorResult.new(@current_token) unless action
+        self.instance_eval(&action.action_code)
       end
       nil
     end

data/lib/parser/token.rb CHANGED

@@ -1,17 +1,20 @@
 module Dhaka
   # Represents a portion of the input character stream that is mapped by the tokenizer
-  # to a symbol in the grammar.
+  # to a symbol in the grammar. The attribute +input_position+ contains the start index position of the original
+  # string input that this token came from. It can be used to report errors by indicating the specific portion
+  # of the input where the error occurred.
   class Token
-    attr_accessor :grammar_symbol, :value
-    def initialize(grammar_symbol, value)
-      @grammar_symbol = grammar_symbol
+    attr_accessor :symbol_name, :value, :input_position
+    def initialize(symbol_name, value, input_position)
+      @symbol_name = symbol_name
       @value = value
+      @input_position = input_position
     end
-    def to_s
-      "#{@grammar_symbol.name}"
+    def to_s #:nodoc:
+      "#{symbol_name}"
     end
     def == other
-      (grammar_symbol == other.grammar_symbol) && (value == other.value)
+      (symbol_name == other.symbol_name) && (value == other.value)
     end
   end
 end

data/lib/tokenizer/tokenizer.rb CHANGED

@@ -2,17 +2,35 @@ module Dhaka
   # Reserved constant used to identify the idle state of the tokenizer.
   TOKENIZER_IDLE_STATE = :idle_state
-  # Raised when the tokenizer encounters a character that has no corresponding action in
-  # its current state.
-  class UnrecognizedInputCharacterException < StandardError
-    attr_reader :input, :char_index
-    def initialize(input, char_index)
-      @input = input
-      @char_index = char_index
+  # Returned on successful tokenizing of the input stream. Supports iteration by including Enumerable, so it can
+  # be passed in directly to the parser.
+  class TokenizerSuccessResult
+    include Enumerable
+    def initialize(tokens)
+      @tokens = tokens
+    end
+    # Returns false.
+    def has_error?
+      false
+    end
+    def each
+      @tokens.each do |token|
+        yield token
+      end
+    end
+  end
+  # Returned when tokenizing fails due to an unexpected character in the input stream.
+  class TokenizerErrorResult
+    # The index of the character that caused the error.
+    attr_reader :unexpected_char_index
+    def initialize(unexpected_char_index)
+      @unexpected_char_index = unexpected_char_index
     end
-    def to_s
-      "Unrecognized character '#{input[char_index].chr}' encountered while tokenizing:\n #{input} at index #{char_index}"
+    # Returns true.
+    def has_error?
+      true
     end
   end
@@ -45,7 +63,52 @@ module Dhaka
   #
   # Tokenizers are state machines that are specified pretty much by hand. Each state of a tokenizer is identified
   # by a Ruby symbol. The constant Dhaka::TOKENIZER_IDLE_STATE is reserved for the idle state of the tokenizer (the one
-  # that it starts in).
+  # that it starts in).
+  #
+  # The following is a tokenizer for arithmetic expressions with integer terms. The tokenizer starts in the idle state
+  # creating single-character tokens for all characters except digits and whitespace. It shifts to
+  # <tt>:get_integer_literal</tt> when it encounters a digit character and creates a token on the stack on which it
+  # accumulates the value of the literal. When it again encounters a non-digit character, it shifts back to idle.
+  # Whitespace is treated as a delimiter, but not shifted as a token.
+  #
+  #  class ArithmeticPrecedenceTokenizer < Dhaka::Tokenizer
+  #
+  #    digits = ('0'..'9').to_a
+  #    parenths = ['(', ')']
+  #    operators = ['-', '+', '/', '*', '^']
+  #    functions = ['h', 'l']
+  #    arg_separator = [',']
+  #    whitespace = [' ']
+  #
+  #    all_characters = digits + parenths + operators + functions + arg_separator + whitespace
+  #
+  #    for_state Dhaka::TOKENIZER_IDLE_STATE do
+  #      for_characters(all_characters - (digits + whitespace)) do
+  #        create_token(curr_char, nil)
+  #        advance
+  #      end
+  #      for_characters digits do
+  #        create_token('n', '')
+  #        switch_to :get_integer_literal
+  #      end
+  #      for_character whitespace do
+  #        advance
+  #      end
+  #    end
+  #
+  #    for_state :get_integer_literal do
+  #      for_characters all_characters - digits do
+  #        switch_to Dhaka::TOKENIZER_IDLE_STATE
+  #      end
+  #      for_characters digits do
+  #        curr_token.value += curr_char
+  #        advance
+  #      end
+  #    end
+  #
+  #  end
   class Tokenizer
     # Define the action for the state named +state_name+.
@@ -53,13 +116,11 @@ module Dhaka
       states[state_name].instance_eval(&blk)
     end
-    # Tokenizes a string +input+ and returns an array of Token-s.
+    # Tokenizes a string +input+ and returns a TokenizerErrorResult on failure or a TokenizerSuccessResult on success.
     def self.tokenize(input)
       self.new(input).run
     end
-    # A slot that can be used to accumulate characters when processing multi-character tokens.
-    attr_accessor :accumulator
     # The tokens shifted so far.
     attr_reader :tokens
@@ -80,6 +141,17 @@ module Dhaka
       @curr_char_index += 1
     end
+    # The token currently on top of the stack.
+    def curr_token
+      tokens[-1]
+    end
+    # Push a new token on to the stack with symbol corresponding to +symbol_name+ and a value of +value+.
+    def create_token(symbol_name, value)
+      new_token = Dhaka::Token.new(symbol_name, value, @curr_char_index)
+      tokens << new_token
+    end
     # Change the active state of the tokenizer to the state identified by the symbol +state_name+.
     def switch_to state_name
       @current_state = self.class.states[state_name]
@@ -88,16 +160,17 @@ module Dhaka
     def run #:nodoc:
       while curr_char
         blk = @current_state.actions[curr_char]
-        raise UnrecognizedInputCharacterException.new(@input, @curr_char_index) unless blk
+        return TokenizerErrorResult.new(@curr_char_index) unless blk
         instance_eval(&blk)
       end
-      tokens
+      tokens << Dhaka::Token.new(Dhaka::END_SYMBOL_NAME, nil, nil)
+      return TokenizerSuccessResult.new(tokens)
     end
     private
     def self.inherited(tokenizer)
       class << tokenizer
-        attr_accessor :states
+        attr_accessor :states, :grammar
       end
       tokenizer.states = Hash.new {|hash, key| hash[key] = TokenizerState.new}
     end