dhaka 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. data/Rakefile +64 -0
  2. data/lib/dhaka.rb +12 -0
  3. data/lib/dot/dot.rb +29 -0
  4. data/lib/evaluator/evaluator.rb +35 -26
  5. data/lib/grammar/grammar.rb +42 -17
  6. data/lib/grammar/grammar_symbol.rb +4 -3
  7. data/lib/grammar/production.rb +9 -3
  8. data/lib/lexer/compiled_lexer.rb +46 -0
  9. data/lib/lexer/dfa.rb +71 -0
  10. data/lib/lexer/lexeme.rb +33 -0
  11. data/lib/lexer/lexer.rb +61 -0
  12. data/lib/lexer/lexer_run.rb +66 -0
  13. data/lib/lexer/regex_grammar.rb +368 -0
  14. data/lib/lexer/regex_parser.rb +1888 -0
  15. data/lib/lexer/regex_tokenizer.rb +14 -0
  16. data/lib/lexer/specification.rb +69 -0
  17. data/lib/lexer/state.rb +45 -0
  18. data/lib/lexer/state_machine.rb +37 -0
  19. data/lib/parser/action.rb +3 -3
  20. data/lib/parser/compiled_parser.rb +11 -3
  21. data/lib/parser/parse_result.rb +3 -5
  22. data/lib/parser/parse_tree.rb +6 -17
  23. data/lib/parser/parser.rb +15 -14
  24. data/lib/parser/parser_run.rb +4 -2
  25. data/lib/parser/parser_state.rb +16 -8
  26. data/lib/tokenizer/tokenizer.rb +5 -3
  27. data/test/arithmetic_precedence/arithmetic_precedence_lexer_specification.rb +23 -0
  28. data/test/arithmetic_precedence/arithmetic_precedence_parser_test.rb +4 -2
  29. data/test/chittagong/chittagong_driver.rb +12 -13
  30. data/test/chittagong/chittagong_driver_test.rb +18 -11
  31. data/test/chittagong/chittagong_evaluator.rb +7 -16
  32. data/test/chittagong/chittagong_evaluator_test.rb +7 -4
  33. data/test/chittagong/chittagong_grammar.rb +0 -6
  34. data/test/chittagong/chittagong_lexer.rb +109 -0
  35. data/test/chittagong/chittagong_lexer_specification.rb +39 -0
  36. data/test/chittagong/{chittagong_tokenizer_test.rb → chittagong_lexer_test.rb} +12 -6
  37. data/test/chittagong/chittagong_parser.rb +879 -0
  38. data/test/chittagong/chittagong_parser_test.rb +8 -10
  39. data/test/chittagong/chittagong_test.rb +17 -13
  40. data/test/compiled_parser_test.rb +7 -2
  41. data/test/evaluator_test.rb +0 -1
  42. data/test/grammar_test.rb +19 -1
  43. data/test/lexer_test.rb +215 -0
  44. data/test/parse_result_test.rb +8 -8
  45. data/test/parser_state_test.rb +0 -12
  46. metadata +21 -5
  47. data/test/arithmetic_precedence/arithmetic_precedence_tokenizer.rb +0 -39
  48. data/test/chittagong/chittagong_tokenizer.rb +0 -88
@@ -0,0 +1,14 @@
1
+ module Dhaka
2
+ module LexerSupport
3
+ class RegexTokenizer < Tokenizer
4
+
5
+ for_state TOKENIZER_IDLE_STATE do
6
+ for_characters(ALL_CHARACTERS) do
7
+ create_token(curr_char, nil)
8
+ advance
9
+ end
10
+ end
11
+
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,69 @@
1
+ module Dhaka
2
+ # Abstract base class for lexer specifications.
3
+ #
4
+ # Use this to specify the transformations that will be performed when the lexer recognizes a given pattern. Actions are listed in
5
+ # descending order of priority. For example in the following lexer specification:
6
+ #
7
+ # class LexerSpec < Dhaka::LexerSpecification
8
+ # for_pattern 'zz' do
9
+ # "recognized two zs"
10
+ # end
11
+ #
12
+ # for_pattern '\w(\w|\d)*' do
13
+ # "recognized word token #{current_lexeme.value}"
14
+ # end
15
+ #
16
+ # for_pattern '(\d)+(\.\d+)?' do
17
+ # "recognized number #{current_lexeme.value}"
18
+ # end
19
+ #
20
+ # for_pattern ' +' do
21
+ # #ignores whitespace
22
+ # end
23
+ #
24
+ # for_pattern "\n+" do
25
+ # "recognized newline"
26
+ # end
27
+ # end
28
+ #
29
+ # the pattern 'zz' takes precedence over the pattern immediately below it, so the lexer will announce that it has recognized two
30
+ # 'z's instead of a word token.
31
+ #
32
+ # The patterns are <i>not</i> Ruby regular expressions - a lot of operators featured in Ruby's regular expression engine are not yet supported.
33
+ # See http://dhaka.rubyforge.org/regex_grammar.html for the current syntax.
34
+
35
+ class LexerSpecification
36
+ class << self
37
+ # Associates +blk+ as the action to be performed when a lexer recognizes +pattern+. When Lexer#lex is invoked,
38
+ # it creates a LexerRun object that provides the context for +blk+ to be evaluated in. Methods available in this block
39
+ # are LexerRun#current_lexeme and LexerRun#create_token.
40
+ def for_pattern(pattern, &blk)
41
+ items[pattern] = LexerSpecificationItem.new(pattern, priority, blk)
42
+ self.priority += 1
43
+ end
44
+
45
+ private
46
+ def inherited(specification)
47
+ class << specification
48
+ attr_accessor :items, :priority
49
+ end
50
+ specification.items = {}
51
+ specification.priority = 0
52
+ end
53
+
54
+ end
55
+ end
56
+
57
+ class LexerSpecificationItem #:nodoc:
58
+ include Comparable
59
+ attr_reader :pattern, :action, :priority
60
+ def initialize(pattern, priority, action)
61
+ @pattern, @priority, @action = pattern, priority, action
62
+ end
63
+
64
+ def <=> other
65
+ priority <=> other.priority
66
+ end
67
+ end
68
+ end
69
+
@@ -0,0 +1,45 @@
1
+ module Dhaka
2
+ module LexerSupport
3
+ class State
4
+ attr_reader :transitions, :pattern
5
+ def initialize state_machine, pattern
6
+ @state_machine = state_machine
7
+ @pattern = pattern
8
+ @transitions = {}
9
+ end
10
+
11
+ def accepting?
12
+ pattern
13
+ end
14
+
15
+ def for_characters *characters, &blk
16
+ dest_state = @state_machine.instance_eval(&blk)
17
+ characters.each do |char|
18
+ transitions[char] = dest_state
19
+ end
20
+ end
21
+
22
+ def recognize pattern
23
+ @pattern = pattern
24
+ end
25
+
26
+ def compile_to_ruby_source
27
+ result = " at_state(#{object_id}) {\n"
28
+ result << " recognize(#{pattern.inspect})\n" if accepting?
29
+ transition_keys_by_destination_state = Hash.new {|hash, key| hash[key] = []}
30
+ transitions.each do |key, dest_state|
31
+ transition_keys_by_destination_state[dest_state.object_id] << key
32
+ end
33
+
34
+ transition_keys_by_destination_state.keys.each do |state_id|
35
+ transition_keys = transition_keys_by_destination_state[state_id].collect {|transition_key| "#{transition_key.inspect}"}.join(', ')
36
+ result << " for_characters(#{transition_keys}) { switch_to #{state_id} }\n"
37
+ end
38
+
39
+ result << " }"
40
+ result
41
+ end
42
+ end
43
+ end
44
+ end
45
+
@@ -0,0 +1,37 @@
1
+ module Dhaka
2
+ module LexerSupport
3
+ class StateMachine
4
+ attr_reader :start_state
5
+
6
+ def initialize start_key
7
+ @states = Hash.new do |hash, key|
8
+ new_state = new_state_for_key key
9
+ hash[key] = new_state
10
+ transition_characters(key).each do |char|
11
+ dest_key = dest_key_for(key, char)
12
+ dest_state = hash[dest_key]
13
+ new_state.transitions[char] = dest_state
14
+ end
15
+ new_state
16
+ end
17
+ @start_state = @states[start_key]
18
+ end
19
+
20
+ def to_dot
21
+ Dot::Digraph.new(:fontsize => 10, :shape => :circle, :size => 5) do |g|
22
+ start = 'Start'
23
+ g.node(start, :label => start)
24
+ g.edge(start, @start_state)
25
+ @states.values.each do |state|
26
+ state_attributes = {}
27
+ state_attributes.merge!(:shape => :doublecircle, :label => state.pattern) if state.accepting?
28
+ g.node(state, state_attributes)
29
+ state.transitions.each do |transition_key, dest_state|
30
+ g.edge(state, dest_state, :label => transition_key)
31
+ end
32
+ end
33
+ end.to_dot
34
+ end
35
+ end
36
+ end
37
+ end
@@ -31,12 +31,12 @@ module Dhaka
31
31
  @action_code = Proc.new do
32
32
  composite_node = ParseTreeCompositeNode.new(production)
33
33
 
34
- production.expansion.each do |symbol|
34
+ production.expansion.each do |symbol|
35
35
  state_stack.pop
36
36
  composite_node.child_nodes.unshift(node_stack.pop)
37
37
  end
38
38
 
39
- node_stack << composite_node
39
+ node_stack << composite_node.instance_eval(&production.action)
40
40
 
41
41
  unless composite_node.head_node?
42
42
  @symbol_queue.concat [@current_token.symbol_name, production.symbol.name]
@@ -45,7 +45,7 @@ module Dhaka
45
45
  end
46
46
 
47
47
  def compile_to_ruby_source
48
- "reduce_with '#{production.name}'"
48
+ "reduce_with #{production.name.inspect}"
49
49
  end
50
50
 
51
51
  def to_s
@@ -1,14 +1,22 @@
1
1
  module Dhaka
2
- # This is the superclass of all compiled Parsers. It is only used by generated code.
2
+ # Abstract base class of all compiled Parsers. It is only used by generated code.
3
3
  class CompiledParser
4
4
 
5
5
  class << self
6
6
  private
7
7
  def inherited(compiled_parser)
8
8
  class << compiled_parser
9
- attr_accessor :states, :grammar, :start_state_id
9
+ attr_accessor :states, :grammar, :start_state_id, :shift_actions, :reduce_actions
10
10
  end
11
- compiled_parser.states = Hash.new {|hash, state_id| hash[state_id] = ParserState.new(compiled_parser, {}, state_id)}
11
+ compiled_parser.states = Hash.new do |hash, state_id|
12
+ hash[state_id] = ParserState.new(compiled_parser, {}, state_id)
13
+ end
14
+ compiled_parser.shift_actions = Hash.new do |hash, state_id|
15
+ hash[state_id] = ShiftAction.new(compiled_parser.states[state_id])
16
+ end
17
+ compiled_parser.reduce_actions = Hash.new do |hash, production_name|
18
+ hash[production_name] = ReduceAction.new(compiled_parser.grammar.production_named(production_name))
19
+ end
12
20
  end
13
21
 
14
22
  def at_state x, &blk
@@ -16,11 +16,9 @@ module Dhaka
16
16
 
17
17
  # Returns the dot representation of the parse tree
18
18
  def to_dot
19
- result = []
20
- result << ["digraph x {", %(node [fontsize="10" shape=box size="5"])]
21
- result << parse_tree.to_dot
22
- result << ['}']
23
- result.join("\n")
19
+ Dot::Digraph.new(:fontsize => 10, :shape => :box, :size => 5) do |g|
20
+ parse_tree.to_dot(g)
21
+ end.to_dot
24
22
  end
25
23
 
26
24
  # Deprecated. Use the +parse_tree+ accessor.
@@ -20,25 +20,18 @@ module Dhaka
20
20
  end
21
21
 
22
22
  # Returns the dot representation of this node.
23
- def to_dot
24
- result = []
25
- label = production
26
- result << %(#{dot_name} [label="#{label}"])
23
+ def to_dot graph
24
+ graph.node(self, :label => production)
27
25
  child_nodes.each do |child|
28
- result << "#{dot_name} -> #{child.dot_name}"
29
- result << "#{child.to_dot}"
26
+ graph.edge(self, child)
27
+ child.to_dot(graph)
30
28
  end
31
- result.join("\n")
32
29
  end
33
30
 
34
31
  def head_node? #:nodoc:
35
32
  production.symbol.name == START_SYMBOL_NAME
36
33
  end
37
34
 
38
- def dot_name #:nodoc:
39
- "Node#{object_id}"
40
- end
41
-
42
35
  end
43
36
 
44
37
  # These are leaf nodes of syntax trees. They contain tokens.
@@ -62,16 +55,12 @@ module Dhaka
62
55
  end
63
56
 
64
57
  # Returns the dot representation of this node.
65
- def to_dot
66
- %(#{dot_name} [label="#{token.to_s}"])
58
+ def to_dot(graph)
59
+ graph.node(self, :label => token)
67
60
  end
68
61
 
69
62
  def head_node? #:nodoc:
70
63
  false
71
64
  end
72
-
73
- def dot_name #:nodoc:
74
- "Node#{object_id}"
75
- end
76
65
  end
77
66
  end
@@ -14,10 +14,12 @@ module Dhaka
14
14
  # and the log level is WARN. Shift-reduce conflicts are reported at WARN and reduce-reduce conflicts
15
15
  # at ERROR. You may pass in your own logger. Logging at DEBUG shows a lot of progress output.
16
16
  def initialize(grammar, logger = nil)
17
- @logger = logger || default_logger
18
- @transitions = Hash.new {|hash, state| hash[state] = {}}
19
- @grammar = grammar
20
- @channels = []
17
+ @shift_actions = Hash.new {|hash, state| hash[state] = ShiftAction.new(state)}
18
+ @reduce_actions = Hash.new {|hash, production| hash[production] = ReduceAction.new(production)}
19
+ @logger = logger || default_logger
20
+ @transitions = Hash.new {|hash, state| hash[state] = {}}
21
+ @grammar = grammar
22
+ @channels = []
21
23
  @states = Hash.new do |hash, kernel|
22
24
  channels, closure = grammar.closure(kernel)
23
25
  @channels.concat channels.to_a
@@ -51,15 +53,14 @@ module Dhaka
51
53
  # options hash, lookaheads are not written out to the parser states, which is helpful when there are dozens
52
54
  # of lookahead symbols for every item in every state.
53
55
  def to_dot(options = {})
54
- result = ["digraph x {", %(node [fontsize="10" shape=box size="5"])]
55
- result.concat states.collect { |state| state.to_dot(options) }
56
- states.each do |state|
57
- @transitions[state].each do |symbol, dest_state|
58
- result << %(#{state.unique_name} -> #{dest_state.unique_name} [label="#{symbol.name}"])
56
+ Dot::Digraph.new(:fontsize => 10, :shape => :box, :size => 5) do |g|
57
+ states.each do |state|
58
+ g.node(state, :label => state.items.values.collect{|item| item.to_s(options)}.join("\n"))
59
+ @transitions[state].each do |symbol, dest_state|
60
+ g.edge(state, dest_state, :label => symbol.name)
61
+ end
59
62
  end
60
- end
61
- result << ['}']
62
- result.join("\n")
63
+ end.to_dot
63
64
  end
64
65
 
65
66
  def inspect
@@ -97,7 +98,7 @@ module Dhaka
97
98
  def generate_shift_actions
98
99
  @states.values.each do |state|
99
100
  @transitions[state].keys.each do |symbol|
100
- state.actions[symbol.name] = ShiftAction.new(@transitions[state][symbol])
101
+ state.actions[symbol.name] = @shift_actions[@transitions[state][symbol]]
101
102
  end
102
103
  end
103
104
  end
@@ -112,7 +113,7 @@ module Dhaka
112
113
 
113
114
  def create_reduction_actions_for_item_and_state item, state
114
115
  item.lookaheadset.each do |lookahead|
115
- new_action = ReduceAction.new(item.production)
116
+ new_action = @reduce_actions[item.production]
116
117
  if existing_action = state.actions[lookahead.name]
117
118
  if ReduceAction === existing_action
118
119
  message = ReduceReduceConflict.new(state, lookahead, new_action).resolve
@@ -10,14 +10,16 @@ module Dhaka
10
10
  end
11
11
 
12
12
  def run
13
- token_stream.each do |token|
13
+ tokenize_result = token_stream.each do |token|
14
14
  @current_token = token
15
15
  @symbol_queue << @current_token.symbol_name
16
16
  error = execute_actions
17
17
  return error if error
18
18
  node_stack << ParseTreeLeafNode.new(@current_token)
19
+ state_stack.last
19
20
  end
20
- ParseSuccessResult.new(node_stack.first)
21
+ return tokenize_result if TokenizerErrorResult === tokenize_result
22
+ ParseSuccessResult.new(node_stack.first) if node_stack.first.head_node?
21
23
  end
22
24
 
23
25
  private
@@ -29,23 +29,31 @@ module Dhaka
29
29
  "State#{id}"
30
30
  end
31
31
 
32
- def to_dot(options = {})
33
- %(#{unique_name} [label="#{items.values.collect{|item| item.to_s(options)}.join('\n')}"])
34
- end
35
-
36
32
  def compile_to_ruby_source
37
33
  result = " at_state(#{id}) {\n"
34
+
35
+ symbol_names_by_action = Hash.new {|hash, key| hash[key] = []}
38
36
  actions.each do |symbol_name, action|
39
- result << " for_symbol('#{symbol_name}') { #{action.compile_to_ruby_source} }\n"
37
+ symbol_names_by_action[action] << symbol_name
38
+ end
39
+
40
+ symbol_names_by_action.keys.each do |action|
41
+ symbol_names = symbol_names_by_action[action].collect {|symbol_name| "#{symbol_name.inspect}"}.join(', ')
42
+ result << " for_symbols(#{symbol_names}) { #{action.compile_to_ruby_source} }\n"
40
43
  end
44
+
41
45
  result << " }"
42
46
  result
43
47
  end
44
-
45
- def for_symbol symbol_name, &blk
46
- actions[symbol_name] = @parser.instance_eval(&blk)
48
+
49
+ def for_symbols *symbol_names, &blk
50
+ symbol_names.each do |symbol_name|
51
+ actions[symbol_name] = @parser.instance_eval(&blk)
52
+ end
47
53
  end
48
54
 
55
+ alias :for_symbol :for_symbols
56
+
49
57
  def to_s(options = {})
50
58
  items.values.collect{|item| item.to_s(options)}.join("\n")
51
59
  end
@@ -60,10 +60,9 @@ module Dhaka
60
60
 
61
61
  end
62
62
 
63
- # This class contains a DSL for specifying tokenizers. Subclass it to implement tokenizers for specific grammars.
64
- # Subclasses of this class may not be further subclassed.
63
+ # This abstract class contains a DSL for hand-coding tokenizers. Subclass it to implement tokenizers for specific grammars.
65
64
  #
66
- # Tokenizers are state machines that are specified pretty much by hand. Each state of a tokenizer is identified
65
+ # Tokenizers are state machines. Each state of a tokenizer is identified
67
66
  # by a Ruby symbol. The constant Dhaka::TOKENIZER_IDLE_STATE is reserved for the idle state of the tokenizer (the one
68
67
  # that it starts in).
69
68
  #
@@ -109,6 +108,9 @@ module Dhaka
109
108
  # end
110
109
  #
111
110
  # end
111
+ #
112
+ # For languages where the lexical structure is very complicated, it may be too tedious to implement a Tokenizer by hand.
113
+ # In such cases, it's a lot easier to write a LexerSpecification using regular expressions and create a Lexer from that.
112
114
  class Tokenizer
113
115
  class << self
114
116
  # Define the action for the state named +state_name+.
@@ -0,0 +1,23 @@
1
+ class ArithmeticPrecedenceLexerSpecification < Dhaka::LexerSpecification
2
+
3
+ for_pattern '\s' do
4
+ # ignore whitespace
5
+ end
6
+
7
+ %w| - h l , |.each do |char|
8
+ for_pattern char do
9
+ create_token(char)
10
+ end
11
+ end
12
+
13
+ %w| ( ) + / * ^ |.each do |char|
14
+ for_pattern "\\#{char}" do
15
+ create_token(char)
16
+ end
17
+ end
18
+
19
+ for_pattern '\d+' do
20
+ create_token('n')
21
+ end
22
+
23
+ end