RubyGems - dhaka - Versions diffs - 1.0.0 → 2.0.0 - Mend

dhaka 1.0.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

data/lib/dhaka.rb +1 -4
data/lib/evaluator/evaluator.rb +65 -15
data/lib/grammar/grammar.rb +30 -0
data/lib/grammar/grammar_symbol.rb +1 -1
data/lib/grammar/production.rb +1 -1
data/lib/parser/action.rb +1 -3
data/lib/parser/parse_result.rb +9 -7
data/lib/parser/parse_tree.rb +9 -2
data/lib/parser/parser.rb +7 -0
data/lib/parser/parser_run.rb +12 -19
data/lib/parser/token.rb +10 -7
data/lib/tokenizer/tokenizer.rb +90 -17
data/test/all_tests.rb +7 -6
data/test/arithmetic_evaluator_test.rb +20 -20
data/test/arithmetic_precedence_evaluator.rb +1 -1
data/test/arithmetic_precedence_parser_test.rb +7 -7
data/test/arithmetic_precedence_tokenizer.rb +3 -9
data/test/arithmetic_test_methods.rb +2 -2
data/test/arithmetic_tokenizer.rb +3 -9
data/test/arithmetic_tokenizer_test.rb +14 -10
data/test/bracket_tokenizer.rb +1 -1
data/test/chittagong_driver_test.rb +261 -0
data/test/chittagong_evaluator.rb +218 -47
data/test/chittagong_evaluator_test.rb +18 -20
data/test/chittagong_grammar.rb +61 -15
data/test/chittagong_parser_test.rb +24 -12
data/test/chittagong_test.rb +148 -6
data/test/chittagong_tokenizer.rb +33 -21
data/test/chittagong_tokenizer_test.rb +16 -8
data/test/compiled_parser_test.rb +14 -12
data/test/parser_test.rb +16 -16
metadata +3 -2

data/lib/dhaka.rb CHANGED

@@ -1,5 +1,5 @@
 #--
-# Copyright (c) 2006 Mushfeq Khan
+# Copyright (c) 2006, 2007 Mushfeq Khan
 #
 # Permission is hereby granted, free of charge, to any person obtaining
 # a copy of this software and associated documentation files (the
@@ -21,9 +21,6 @@
 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #++
-# An introduction to Dhaka and annotated examples can be found at the project homepage http://dhaka.rubyforge.org
-#
-# Further examples can be found in the test suites included with the gem.
 module Dhaka
 end

data/lib/evaluator/evaluator.rb CHANGED

@@ -8,6 +8,54 @@ module Dhaka
   # An evaluation rule for a given production named +bar+ is defined by calling +for_bar+ with
   # a block that performs the evaluation. For detailed examples, see the evaluators in the
   # test suite.
+  #
+  # The following is an evaluator for arithmetic expressions. When a syntax tree node is encountered that
+  # corresponds to the production named +addition+, the block passed to +for_addition+ is invoked. The +evaluate+
+  # method is then recursively called on the child nodes, in this case the operands to the addition operation. The
+  # result is obtained by adding the evaluation results of the child nodes.
+  #
+  #    class ArithmeticPrecedenceEvaluator < Dhaka::Evaluator
+  #
+  #      self.grammar = ArithmeticPrecedenceGrammar
+  #
+  #      define_evaluation_rules do
+  #
+  #        for_subtraction do
+  #          evaluate(child_nodes[0]) - evaluate(child_nodes[2])
+  #        end
+  #
+  #        for_addition do
+  #          evaluate(child_nodes[0]) + evaluate(child_nodes[2])
+  #        end
+  #
+  #        for_division do
+  #          evaluate(child_nodes[0]).to_f/evaluate(child_nodes[2])
+  #        end
+  #
+  #        for_multiplication do
+  #          evaluate(child_nodes[0]) * evaluate(child_nodes[2])
+  #        end
+  #
+  #        for_literal do
+  #          child_nodes[0].token.value.to_i
+  #        end
+  #
+  #        for_parenthetized_expression do
+  #          evaluate(child_nodes[1])
+  #        end
+  #
+  #        for_negated_expression do
+  #          -evaluate(child_nodes[1])
+  #        end
+  #
+  #        for_power do
+  #          evaluate(child_nodes[0])**evaluate(child_nodes[2])
+  #        end
+  #
+  #      end
+  #
+  #    end
   class Evaluator
@@ -18,17 +66,29 @@ module Dhaka
     def evaluate node
       @node_stack ||= []
       @node_stack << node.child_nodes
-      proc = self.class.actions[node.production.name]
-      result = self.instance_eval(&proc)
+      result = self.send(node.production.name)
       @node_stack.pop
       result
     end
+    # Performs the pass-through calculations for nodes with only one child_node for which an
+    # evaluation rule is not explicitly defined. Will probably be deprecated in future versions.
+    def method_missing(method_name)
+      evaluate(child_nodes[0])
+    end
     # Returns the array of child nodes of the node being currently evaluated.
     def child_nodes
       @node_stack[-1]
     end
+    # Evaluation rules are defined within a block passed to this method.
+    def self.define_evaluation_rules
+      self.actions = []
+      yield
+      check_definitions
+    end
     private
     def self.inherited(evaluator)
@@ -37,29 +97,19 @@ module Dhaka
       end
     end
-    def self.define_evaluation_rules
-      default_action = Proc.new { evaluate(child_nodes[0]) }
-      self.actions = Hash.new { |hash, key| default_action }
-      yield
-      check_definitions
-    end
     def self.method_missing(method_name, &blk)
       if method_name.to_s =~ /^for_*/
         rule_name = method_name.to_s[4..-1]
-        self.for_rule_named(rule_name, &blk)
+        self.actions << rule_name
+        self.send(:define_method, rule_name, &blk)
       end
     end
     def self.check_definitions
-      non_trivial_productions_with_rules_undefined = self.grammar.productions.select {|production| production.expansion.size != 1}.collect {|production| production.name} - self.actions.keys
+      non_trivial_productions_with_rules_undefined = self.grammar.productions.select {|production| production.expansion.size != 1}.collect {|production| production.name} - self.actions
       raise EvaluatorDefinitionError.new(non_trivial_productions_with_rules_undefined) unless non_trivial_productions_with_rules_undefined.empty?
     end
-    def self.for_rule_named(name, &blk)
-      self.actions[name] = blk
-    end
   end
   class EvaluatorDefinitionError < StandardError #:nodoc:

data/lib/grammar/grammar.rb CHANGED

@@ -60,6 +60,36 @@ module Dhaka
   end
   # This class is subclassed when specifying a grammar. Note that subclasses of this class may not be further subclassed.
+  #
+  # The following is a grammar specification for simple arithmetic. Familiarity with Yacc helps, but the short version is
+  # that precedences for symbols are specified in ascending order of binding strength, with equal-strength symbols
+  # on the same level. Production rules are specified for each symbol by specifying the name of the production (used when
+  # encoding the Evaluator) and the expansion for that particular production. For example, the production named
+  # +addition+ expands the symbol <tt>'E'</tt> to the list of symbols <tt>['E', '+', 'E']</tt>.
+  #
+  #  class ArithmeticPrecedenceGrammar < Dhaka::Grammar
+  #    precedences do
+  #      left ['+', '-']
+  #      left ['*', '/']
+  #      nonassoc ['^']
+  #    end
+  #
+  #    for_symbol(Dhaka::START_SYMBOL_NAME) do
+  #      expression ['E']
+  #    end
+  #
+  #    for_symbol('E') do
+  #      addition ['E', '+', 'E']
+  #      subtraction ['E', '-', 'E']
+  #      multiplication ['E', '*', 'E']
+  #      division ['E', '/', 'E']
+  #      power ['E', '^', 'E']
+  #      literal ['n']
+  #      parenthetized_expression ['(', 'E', ')']
+  #      negated_expression ['-', 'E'], :prec => '*'
+  #    end
+  #  end
+  #
   class Grammar
     # Used for defining the productions for the symbol with name +symbol+. The block +blk+ is

data/lib/grammar/grammar_symbol.rb CHANGED

@@ -13,7 +13,7 @@ module Dhaka
     def terminal
       !non_terminal
     end
-    def to_s
+    def to_s #:nodoc:
       name
     end
     def <=> other

data/lib/grammar/production.rb CHANGED

@@ -23,7 +23,7 @@ module Dhaka
       @precedence
     end
-    def to_s
+    def to_s #:nodoc:
       "#{@name} #{@symbol} ::= #{@expansion.join(' ')}"
     end

data/lib/parser/action.rb CHANGED

@@ -36,9 +36,7 @@ module Dhaka
         node_stack << composite_node
         unless composite_node.head_node?
-          [production.symbol.name, current_token.grammar_symbol.name]
-        else
-          []
+          @symbol_queue += [@current_token.symbol_name, production.symbol.name]
         end
       end
     end

data/lib/parser/parse_result.rb CHANGED

@@ -2,22 +2,24 @@ module Dhaka
   # Returned on successful parsing of the input token stream.
   class ParseSuccessResult
     # Contains the parse result.
-    attr_accessor :syntax_tree
-    def initialize(syntax_tree) #:nodoc:
-      @syntax_tree = syntax_tree
+    attr_accessor :parse_tree
+    def initialize(parse_tree) #:nodoc:
+      @parse_tree = parse_tree
     end
     # This is false.
     def has_error?
       false
     end
+    # Deprecated. Use the +parse_tree+ accessor.
+    alias syntax_tree parse_tree
   end
   # Returned on unsuccessful parsing of the input token stream.
   class ParseErrorResult
-    # The index of the token that caused the parse error.
-    attr_reader :bad_token_index
-    def initialize(bad_token_index) #:nodoc:
-      @bad_token_index = bad_token_index
+    # The token that caused the parse error.
+    attr_reader :unexpected_token
+    def initialize(unexpected_token) #:nodoc:
+      @unexpected_token = unexpected_token
     end
     # This is true.
     def has_error?

data/lib/parser/parse_tree.rb CHANGED

@@ -7,7 +7,10 @@ module Dhaka
       @child_nodes = []
     end
     def linearize #:nodoc:
-      child_nodes.collect {|child_node| child_node.linearize}.flatten + [production.name]
+      child_nodes.collect {|child_node| child_node.linearize}.flatten + [self]
+    end
+    def tokens
+      child_nodes.collect{|child_node| child_node.tokens}.flatten
     end
     def to_s #:nodoc:
       "CompositeNode: #{production.symbol} --> [#{child_nodes.join(", ")}]"
@@ -33,7 +36,7 @@ module Dhaka
     def dot_name #:nodoc:
       "Node#{object_id}"
     end
   end
   # These are leaf nodes of syntax trees. They contain tokens.
@@ -45,6 +48,9 @@ module Dhaka
     def linearize #:nodoc:
       []
     end
+    def tokens
+      [token]
+    end
     def to_s #:nodoc:
       "LeafNode: #{token}"
     end
@@ -60,5 +66,6 @@ module Dhaka
     def dot_name #:nodoc:
       "Node#{object_id}"
     end
   end
 end

data/lib/parser/parser.rb CHANGED

@@ -3,6 +3,13 @@ require 'set'
 require 'logger'
 module Dhaka
+  # The parser generator. To generate a parser from a grammar specification +ArithmeticPrecedenceGrammar+, one would
+  # write:
+  #   parser = Dhaka::Parser.new(ArithmeticPrecedenceGrammar)
+  #
+  # To compile this parser to Ruby source as +ArithmeticPrecedenceParser+:
+  #   parser.compile_to_ruby_source_as(:ArithmeticPrecedenceParser)
+  # which returns a string of Ruby code.
   class Parser
     include ParserMethods
     attr_reader :grammar, :start_state

data/lib/parser/parser_run.rb CHANGED

@@ -6,36 +6,29 @@ module Dhaka
       @node_stack = []
       @state_stack = [start_state]
       @token_stream = token_stream
-      @current_token_index = 0
+      @symbol_queue = []
     end
     def run
-      for_each_token_with_end do
-        error = execute_action current_token.grammar_symbol.name
+      token_stream.each do |token|
+        @current_token = token
+        @symbol_queue << @current_token.symbol_name
+        error = execute_actions
         return error if error
-        node_stack << ParseTreeLeafNode.new(current_token)
-        @current_token_index += 1
+        node_stack << ParseTreeLeafNode.new(@current_token)
       end
       ParseSuccessResult.new(node_stack[0])
     end
     private
-    attr_reader :state_stack, :token_stream, :node_stack, :current_token
+    attr_reader :state_stack, :token_stream, :node_stack
-    def for_each_token_with_end
-      token_stream.each do |@current_token|
-        yield
-      end
-      @current_token = Token.new(@grammar.end_symbol, nil)
-      yield
-    end
-    def execute_action symbol_name
-      action = state_stack[-1].actions[symbol_name]
-      return ParseErrorResult.new(@current_token_index) unless action
-      self.instance_eval(&action.action_code).each do |symbol_name|
-        execute_action symbol_name
+    def execute_actions
+      while symbol_name = @symbol_queue.pop
+        action = state_stack[-1].actions[symbol_name]
+        return ParseErrorResult.new(@current_token) unless action
+        self.instance_eval(&action.action_code)
       end
       nil
     end

data/lib/parser/token.rb CHANGED

@@ -1,17 +1,20 @@
 module Dhaka
   # Represents a portion of the input character stream that is mapped by the tokenizer
-  # to a symbol in the grammar.
+  # to a symbol in the grammar. The attribute +input_position+ contains the start index position of the original
+  # string input that this token came from. It can be used to report errors by indicating the specific portion
+  # of the input where the error occurred.
   class Token
-    attr_accessor :grammar_symbol, :value
-    def initialize(grammar_symbol, value)
-      @grammar_symbol = grammar_symbol
+    attr_accessor :symbol_name, :value, :input_position
+    def initialize(symbol_name, value, input_position)
+      @symbol_name = symbol_name
       @value = value
+      @input_position = input_position
     end
-    def to_s
-      "#{@grammar_symbol.name}"
+    def to_s #:nodoc:
+      "#{symbol_name}"
     end
     def == other
-      (grammar_symbol == other.grammar_symbol) && (value == other.value)
+      (symbol_name == other.symbol_name) && (value == other.value)
     end
   end
 end

data/lib/tokenizer/tokenizer.rb CHANGED

@@ -2,17 +2,35 @@ module Dhaka
   # Reserved constant used to identify the idle state of the tokenizer.
   TOKENIZER_IDLE_STATE = :idle_state
-  # Raised when the tokenizer encounters a character that has no corresponding action in
-  # its current state.
-  class UnrecognizedInputCharacterException < StandardError
-    attr_reader :input, :char_index
-    def initialize(input, char_index)
-      @input = input
-      @char_index = char_index
+  # Returned on successful tokenizing of the input stream. Supports iteration by including Enumerable, so it can
+  # be passed in directly to the parser.
+  class TokenizerSuccessResult
+    include Enumerable
+    def initialize(tokens)
+      @tokens = tokens
+    end
+    # Returns false.
+    def has_error?
+      false
+    end
+    def each
+      @tokens.each do |token|
+        yield token
+      end
+    end
+  end
+  # Returned when tokenizing fails due to an unexpected character in the input stream.
+  class TokenizerErrorResult
+    # The index of the character that caused the error.
+    attr_reader :unexpected_char_index
+    def initialize(unexpected_char_index)
+      @unexpected_char_index = unexpected_char_index
     end
-    def to_s
-      "Unrecognized character '#{input[char_index].chr}' encountered while tokenizing:\n #{input} at index #{char_index}"
+    # Returns true.
+    def has_error?
+      true
     end
   end
@@ -45,7 +63,52 @@ module Dhaka
   #
   # Tokenizers are state machines that are specified pretty much by hand. Each state of a tokenizer is identified
   # by a Ruby symbol. The constant Dhaka::TOKENIZER_IDLE_STATE is reserved for the idle state of the tokenizer (the one
-  # that it starts in).
+  # that it starts in).
+  #
+  # The following is a tokenizer for arithmetic expressions with integer terms. The tokenizer starts in the idle state
+  # creating single-character tokens for all characters except digits and whitespace. It shifts to
+  # <tt>:get_integer_literal</tt> when it encounters a digit character and creates a token on the stack on which it
+  # accumulates the value of the literal. When it again encounters a non-digit character, it shifts back to idle.
+  # Whitespace is treated as a delimiter, but not shifted as a token.
+  #
+  #  class ArithmeticPrecedenceTokenizer < Dhaka::Tokenizer
+  #
+  #    digits = ('0'..'9').to_a
+  #    parenths = ['(', ')']
+  #    operators = ['-', '+', '/', '*', '^']
+  #    functions = ['h', 'l']
+  #    arg_separator = [',']
+  #    whitespace = [' ']
+  #
+  #    all_characters = digits + parenths + operators + functions + arg_separator + whitespace
+  #
+  #    for_state Dhaka::TOKENIZER_IDLE_STATE do
+  #      for_characters(all_characters - (digits + whitespace)) do
+  #        create_token(curr_char, nil)
+  #        advance
+  #      end
+  #      for_characters digits do
+  #        create_token('n', '')
+  #        switch_to :get_integer_literal
+  #      end
+  #      for_character whitespace do
+  #        advance
+  #      end
+  #    end
+  #
+  #    for_state :get_integer_literal do
+  #      for_characters all_characters - digits do
+  #        switch_to Dhaka::TOKENIZER_IDLE_STATE
+  #      end
+  #      for_characters digits do
+  #        curr_token.value += curr_char
+  #        advance
+  #      end
+  #    end
+  #
+  #  end
   class Tokenizer
     # Define the action for the state named +state_name+.
@@ -53,13 +116,11 @@ module Dhaka
       states[state_name].instance_eval(&blk)
     end
-    # Tokenizes a string +input+ and returns an array of Token-s.
+    # Tokenizes a string +input+ and returns a TokenizerErrorResult on failure or a TokenizerSuccessResult on success.
     def self.tokenize(input)
       self.new(input).run
     end
-    # A slot that can be used to accumulate characters when processing multi-character tokens.
-    attr_accessor :accumulator
     # The tokens shifted so far.
     attr_reader :tokens
@@ -80,6 +141,17 @@ module Dhaka
       @curr_char_index += 1
     end
+    # The token currently on top of the stack.
+    def curr_token
+      tokens[-1]
+    end
+    # Push a new token on to the stack with symbol corresponding to +symbol_name+ and a value of +value+.
+    def create_token(symbol_name, value)
+      new_token = Dhaka::Token.new(symbol_name, value, @curr_char_index)
+      tokens << new_token
+    end
     # Change the active state of the tokenizer to the state identified by the symbol +state_name+.
     def switch_to state_name
       @current_state = self.class.states[state_name]
@@ -88,16 +160,17 @@ module Dhaka
     def run #:nodoc:
       while curr_char
         blk = @current_state.actions[curr_char]
-        raise UnrecognizedInputCharacterException.new(@input, @curr_char_index) unless blk
+        return TokenizerErrorResult.new(@curr_char_index) unless blk
         instance_eval(&blk)
       end
-      tokens
+      tokens << Dhaka::Token.new(Dhaka::END_SYMBOL_NAME, nil, nil)
+      return TokenizerSuccessResult.new(tokens)
     end
     private
     def self.inherited(tokenizer)
       class << tokenizer
-        attr_accessor :states
+        attr_accessor :states, :grammar
       end
       tokenizer.states = Hash.new {|hash, key| hash[key] = TokenizerState.new}
     end