RubyGems - dhaka - Versions diffs - 2.0.1 → 2.1.0 - Mend

dhaka 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

data/Rakefile +64 -0
data/lib/dhaka.rb +12 -0
data/lib/dot/dot.rb +29 -0
data/lib/evaluator/evaluator.rb +35 -26
data/lib/grammar/grammar.rb +42 -17
data/lib/grammar/grammar_symbol.rb +4 -3
data/lib/grammar/production.rb +9 -3
data/lib/lexer/compiled_lexer.rb +46 -0
data/lib/lexer/dfa.rb +71 -0
data/lib/lexer/lexeme.rb +33 -0
data/lib/lexer/lexer.rb +61 -0
data/lib/lexer/lexer_run.rb +66 -0
data/lib/lexer/regex_grammar.rb +368 -0
data/lib/lexer/regex_parser.rb +1888 -0
data/lib/lexer/regex_tokenizer.rb +14 -0
data/lib/lexer/specification.rb +69 -0
data/lib/lexer/state.rb +45 -0
data/lib/lexer/state_machine.rb +37 -0
data/lib/parser/action.rb +3 -3
data/lib/parser/compiled_parser.rb +11 -3
data/lib/parser/parse_result.rb +3 -5
data/lib/parser/parse_tree.rb +6 -17
data/lib/parser/parser.rb +15 -14
data/lib/parser/parser_run.rb +4 -2
data/lib/parser/parser_state.rb +16 -8
data/lib/tokenizer/tokenizer.rb +5 -3
data/test/arithmetic_precedence/arithmetic_precedence_lexer_specification.rb +23 -0
data/test/arithmetic_precedence/arithmetic_precedence_parser_test.rb +4 -2
data/test/chittagong/chittagong_driver.rb +12 -13
data/test/chittagong/chittagong_driver_test.rb +18 -11
data/test/chittagong/chittagong_evaluator.rb +7 -16
data/test/chittagong/chittagong_evaluator_test.rb +7 -4
data/test/chittagong/chittagong_grammar.rb +0 -6
data/test/chittagong/chittagong_lexer.rb +109 -0
data/test/chittagong/chittagong_lexer_specification.rb +39 -0
data/test/chittagong/{chittagong_tokenizer_test.rb → chittagong_lexer_test.rb} +12 -6
data/test/chittagong/chittagong_parser.rb +879 -0
data/test/chittagong/chittagong_parser_test.rb +8 -10
data/test/chittagong/chittagong_test.rb +17 -13
data/test/compiled_parser_test.rb +7 -2
data/test/evaluator_test.rb +0 -1
data/test/grammar_test.rb +19 -1
data/test/lexer_test.rb +215 -0
data/test/parse_result_test.rb +8 -8
data/test/parser_state_test.rb +0 -12
metadata +21 -5
data/test/arithmetic_precedence/arithmetic_precedence_tokenizer.rb +0 -39
data/test/chittagong/chittagong_tokenizer.rb +0 -88

data/Rakefile ADDED

@@ -0,0 +1,64 @@
+require 'rake/rdoctask'
+require 'rake/gempackagetask'
+require 'rake/testtask'
+require 'rubygems'
+Rake::RDocTask.new do |rdoc|
+  rdoc.rdoc_files.include('README', 'lib/**/*.rb')
+  rdoc.rdoc_files.exclude("lib/lexer/regex_parser.rb")
+  rdoc.main = "README"
+  rdoc.rdoc_dir = '../doc'
+end
+spec = Gem::Specification.new do |s|
+  s.name = "dhaka"
+  s.author = "Mushfeq Khan"
+  s.email = "mushfeq dot khan at gmail dot com"
+  s.version = ENV['VERSION'] || "0.0.0"
+  s.platform = Gem::Platform::RUBY
+  s.summary = "An LALR1 parser generator written in Ruby"
+  s.files = Dir.glob("{lib,test}/**/*").select {|file| file.include?('.rb') || file.include?('.txt')} + ['Rakefile']
+  s.require_path = 'lib'
+  s.autorequire = 'dhaka'
+  s.has_rdoc = true
+end
+Rake::GemPackageTask.new(spec) do |pkg|
+	pkg.package_dir = "../gems"
+end
+Rake::TestTask.new do |t|
+  t.libs << "test"
+  t.test_files = FileList['test/**/*test.rb']
+  t.verbose = true
+end
+task :generate_regex_parser do
+  require 'lib/dhaka'
+  File.open('lib/lexer/regex_parser.rb', 'w') do |file|
+    file << Dhaka::Parser.new(Dhaka::LexerSupport::RegexGrammar).compile_to_ruby_source_as('Dhaka::LexerSupport::RegexParser')
+  end
+end
+task :gem => [:test, :generate_regex_parser]
+task :default => :test
+task :test => [:generate_chittagong_parser, :generate_chittagong_lexer]
+task :generate_chittagong_parser do
+  require 'lib/dhaka'
+  require 'test/chittagong/chittagong_grammar'
+  require 'test/fake_logger'
+  File.open('test/chittagong/chittagong_parser.rb', 'w') do |file|
+    file << Dhaka::Parser.new(ChittagongGrammar, FakeLogger.new).compile_to_ruby_source_as(:ChittagongParser)
+  end
+end
+task :generate_chittagong_lexer do
+  require 'lib/dhaka'
+  require 'test/chittagong/chittagong_lexer_specification'
+  File.open('test/chittagong/chittagong_lexer.rb', 'w') do |file|
+    file << Dhaka::Lexer.new(ChittagongLexerSpecification).compile_to_ruby_source_as(:ChittagongLexer)
+  end
+end

data/lib/dhaka.rb CHANGED

@@ -26,6 +26,7 @@ require 'logger'
 require 'delegate'
 %w[
+dot/dot
 grammar/grammar_symbol
 grammar/production
 grammar/closure_hash
@@ -45,4 +46,15 @@ parser/parser
 parser/compiled_parser
 tokenizer/tokenizer
 evaluator/evaluator
+lexer/regex_grammar
+lexer/regex_tokenizer
+lexer/regex_parser
+lexer/state_machine
+lexer/dfa
+lexer/state
+lexer/specification
+lexer/lexeme
+lexer/lexer_run
+lexer/lexer
+lexer/compiled_lexer
 ].each {|path| require File.join(File.dirname(__FILE__), path)}

data/lib/dot/dot.rb ADDED

@@ -0,0 +1,29 @@
+module Dhaka
+  module Dot #:nodoc:
+    class Digraph #:nodoc:
+      def initialize(node_attributes = {})
+        @result = ["digraph x {"]
+        @result << %(node #{dotify_hash(node_attributes)})
+        yield(self)
+        @result << '}'
+      end
+      def node(obj, attributes = {})
+        @result << "#{obj.object_id} #{dotify_hash(attributes)}"
+      end
+      def edge(src, dest, attributes = {})
+        @result << "#{src.object_id} -> #{dest.object_id} #{dotify_hash(attributes)}"
+      end
+      def dotify_hash hash
+        sorted_key_value_pairs = hash.collect {|key, value| [key.to_s, value.to_s]}.sort
+        hash.empty? ? "" : '[' + sorted_key_value_pairs.collect {|key, value| "#{key}=#{value.to_s.inspect}"}.join(' ') + ']'
+      end
+      def to_dot
+        @result.join("\n")
+      end
+    end
+  end
+end

data/lib/evaluator/evaluator.rb CHANGED

@@ -1,14 +1,14 @@
 module Dhaka
-  # This is the abstract base evaluator class. It is not directly instantiated.
-  # When defining an evaluator for a specific grammar, we subclass it. e.g. for FooGrammar
-  # we create a FooEvaluator that subclasses Evaluator. Note that FooEvaluator may not
-  # be further subclassed.
+  # Abstract base class for evaluators.
+  #
+  # Defining an evaluator is an easy way to perform syntax-directed evaluation without having to generate an abstract
+  # syntax tree representation of the input.
   #
   # An evaluation rule for a given production named +bar+ is defined by calling +for_bar+ with
   # a block that performs the evaluation. For detailed examples, see the evaluators in the
   # test suite.
   #
-  # The following is an evaluator for arithmetic expressions. When a syntax tree node is encountered that
+  # The following is an evaluator for arithmetic expressions. When a parse tree node is encountered that
   # corresponds to the production named +addition+, the block passed to +for_addition+ is invoked. The +evaluate+
   # method is then recursively called on the child nodes, in this case the operands to the addition operation. The
   # result is obtained by adding the evaluation results of the child nodes.
@@ -54,28 +54,18 @@ module Dhaka
   #      end
   #
   #    end
   class Evaluator
     class << self
-      def inherited(evaluator)
-        class << evaluator
-          attr_accessor :grammar, :actions
-        end
-      end
-      def method_missing(method_name, *args, &blk)
-        name = method_name.to_s
-        if name =~ /^for_(.+)$/
-          rule_name = $1
-          actions   << rule_name
-          send(:define_method, rule_name, &blk)
-        else
-          super
-        end
-      end
-      # Evaluation rules are defined within a block passed to this method.
+      # Defining evaluation rules within a block passed to this method tells the evaluator to carry out a
+      # rudimentary check of your definitions and define default evaluation rules for pass-through
+      # productions (i.e. productions with expansions consisting of exactly one grammar symbol). The
+      # default evaluation rule for such productions is to simply return the result of calling +evaluate+
+      # on the unique child node. If you neglect to define a rule for a non-pass-through production (one
+      # where the expansion consists of multiple symbols), the evaluator will raise an exception
+      # at loading time, listing all the productions that absolutely need to be defined before you can
+      # continue.
       def define_evaluation_rules
-        self.actions = []
         yield
         check_definitions
       end
@@ -93,9 +83,28 @@ module Dhaka
         non_trivial_productions_with_rules_undefined = filter[grammar.productions.select {|production| production.expansion.size != 1}]
         raise EvaluatorDefinitionError.new(non_trivial_productions_with_rules_undefined) unless non_trivial_productions_with_rules_undefined.empty?
       end
+      def inherited(evaluator)
+        class << evaluator
+          attr_accessor :grammar, :actions
+        end
+        evaluator.actions = []
+      end
+      def method_missing(method_name, *args, &blk)
+        name = method_name.to_s
+        if name =~ /^for_(.+)$/
+          rule_name = $1
+          raise "Attempted to define evaluation rule for non-existent production '#{rule_name}'" unless grammar.production_named(rule_name)
+          actions   << rule_name
+          send(:define_method, rule_name, &blk)
+        else
+          super
+        end
+      end
     end
-    # Evaluate a syntax tree node.
+    # Evaluate a parse tree node.
     def evaluate node
       @node_stack ||= []
       @node_stack << node.child_nodes
@@ -104,7 +113,7 @@ module Dhaka
       result
     end
-    # Returns the array of child nodes of the node being currently evaluated.
+    # Returns the array of child nodes of the node being evaluated currently.
     def child_nodes
       @node_stack.last
     end

data/lib/grammar/grammar.rb CHANGED

@@ -6,7 +6,6 @@ module Dhaka
   # Productions for specific grammar symbols are defined in the context of this class.
   class ProductionBuilder
     # +symbol+ is the grammar symbol that productions are being defined for.
     def initialize(grammar, symbol)
       @grammar = grammar
@@ -18,14 +17,15 @@ module Dhaka
     # set to the precedence of the grammar symbol corresponding to that name.
     #
     # See the arithmetic precedence grammar in the test suites for an example.
-    def method_missing(production_name, expansion, options = {})
+    def method_missing(production_name, expansion, options = {}, &blk)
       expansion_symbols = expansion.collect {|name| @grammar.symbols[name]}
-      production_args   = [@symbol, expansion_symbols, production_name.to_s]
+      production_args   = [@symbol, expansion_symbols, production_name.to_s, blk, @grammar.production_index]
       if precedence_symbol_name = options[:prec]
         production_args << @grammar.symbol_for_name(precedence_symbol_name).precedence
       end
       production = Production.new(*production_args)
+      @grammar.production_index += 1
       @symbol.nullable = true if expansion_symbols.empty?
       @grammar.productions_by_symbol[production.symbol] << production
@@ -35,9 +35,9 @@ module Dhaka
   end
   # The precedence builder defines three methods, +left+, +right+ and +nonassoc+. These accept arrays of grammar
-  # symbols all of which have the same precedence level and associativity. This works almost exactly like Yacc.
+  # symbols all of which have the same precedence level and associativity.
   #
-  # See the arithmetic precedence grammar in the test suites for an example.
+  # See the arithmetic precedence grammar in the test suites for an example of how this works.
   class PrecedenceBuilder
     def initialize(grammar) #:nodoc:
       @grammar          = grammar
@@ -60,11 +60,11 @@ module Dhaka
     end
   end
-  # This class is subclassed when specifying a grammar. Note that subclasses of this class may not be further subclassed.
+  # Abstract base class for grammar specifications.
   #
-  # The following is a grammar specification for simple arithmetic. Familiarity with Yacc helps, but the short version is
-  # that precedences for symbols are specified in ascending order of binding strength, with equal-strength symbols
-  # on the same level. Production rules are specified for each symbol by specifying the name of the production (used when
+  # The following is a grammar specification for simple arithmetic. Precedences are specified as in Yacc -
+  # in ascending order of binding strength, with equal-strength symbols on the same level.
+  # Production rules are specified for each symbol by specifying the name of the production (used when
   # encoding the Evaluator) and the expansion for that particular production. For example, the production named
   # +addition+ expands the symbol <tt>'E'</tt> to the list of symbols <tt>['E', '+', 'E']</tt>.
   #
@@ -91,9 +91,14 @@ module Dhaka
   #    end
   #  end
   #
+  # In the above grammar, the symbols <tt>+</tt> and <tt>-</tt> are declared as being +left+-associative, meaning that
+  # 1 + 2 + 3 is parsed as (1 + 2) + 3 as opposed to 1 + (2 + 3) (+right+-associativity). The symbol <tt>^</tt> is declared
+  # +nonassoc+ which means that expressions such as 2 ^ 3 ^ 4 are not allowed (non-associative). <tt>+</tt> and <tt>-</tt> are listed
+  # before <tt>^</tt> which means that they bind lower, and an expression such as 2 + 3 ^ 5 will always be parsed as
+  # 2 + (3 ^ 5) and not (2 + 3) ^ 5.
   class Grammar
     class << self
-      # Used for defining the productions for the symbol with name +symbol+. The block +blk+ is
+      # Used for defining the Production-s for the symbol with name +symbol+. The block +blk+ is
       # evaluated in the context of a ProductionBuilder.
       def for_symbol symbol, &blk
         symbol              = symbols[symbol]
@@ -116,15 +121,16 @@ module Dhaka
         end
       end
-      def productions
+      # Returns a list of all the Production-s in this grammar.
+      def productions
         productions_by_name.values
       end
-      def productions_for_symbol(symbol)
+      def productions_for_symbol(symbol) #:nodoc:
         productions_by_symbol[symbol]
       end
-      def closure(kernel)
+      def closure(kernel) #:nodoc:
         channels = Set.new
         result = compute_closure(kernel) do |hash, item|
@@ -138,11 +144,11 @@ module Dhaka
         [channels, result]
       end
-      def passive_channel(start_item, end_item)
+      def passive_channel(start_item, end_item) #:nodoc:
         PassiveChannel.new(self, start_item, end_item)
       end
-      def first(given_symbol)
+      def first(given_symbol) #:nodoc:
         cached_result = __first_cache[given_symbol]
         return cached_result if cached_result
         result = compute_closure([given_symbol]) do |hash, symbol|
@@ -159,23 +165,41 @@ module Dhaka
         result
       end
+      # Returns the Production identified by +name+.
       def production_named(name)
         productions_by_name[name]
       end
+      # Returns the set of terminal symbols in the grammar.
       def terminal_symbols
         symbols.values.select {|symbol| symbol.terminal}
       end
+      # Returns the set of non-terminal symbols in the grammar.
       def non_terminal_symbols
         symbols.values.select {|symbol| symbol.non_terminal}
       end
+      # Export the grammar to a BNF-like format
+      def to_bnf
+        result = []
+        last_symbol = nil
+        productions.sort.each do |production|
+          if production.symbol != last_symbol
+            result << ""
+            result << "#{production.symbol.name.inspect} :"
+            last_symbol = production.symbol
+          end
+          result << "  | #{production.expansion.collect{|symbol| symbol.name.inspect}.join(' ')}"
+        end
+        result.join("\n")
+      end
       private
         def inherited(grammar)
           class << grammar
-            attr_accessor :symbols, :productions_by_symbol, :productions_by_name, :start_symbol, :end_symbol, :__first_cache
+            attr_accessor :symbols, :productions_by_symbol, :productions_by_name, :start_symbol, :end_symbol, :__first_cache, :production_index
           end
           grammar.symbols               = Hash.new {|hash, name| hash[name] = GrammarSymbol.new(name)}
           grammar.productions_by_symbol = Hash.new {|hash, name| hash[name] = Set.new([])}
@@ -183,6 +207,7 @@ module Dhaka
           grammar.end_symbol            = grammar.symbols[END_SYMBOL_NAME]
           grammar.start_symbol          = grammar.symbols[START_SYMBOL_NAME]
           grammar.__first_cache         = {}
+          grammar.production_index      = 0
         end
         def spontaneous_channel(start_item, end_item)

data/lib/grammar/grammar_symbol.rb CHANGED

@@ -1,8 +1,9 @@
 module Dhaka
   # Each grammar symbol is uniquely identified by a string name. The name of a symbol can
-  # be anything and need not correspond to its character representation. For example, an ampersand in the
-  # character stream could be tokenized as a symbol with a name 'whatever'. In general, it's best to choose
-  # symbol names that are descriptive.
+  # be anything (except the two reserved names <tt>'\_Start_'</tt> and <tt>'\_End_'</tt>) and need not
+  # correspond to its character representation. For example, an ampersand in the input string could
+  # be tokenized as a symbol with a name 'AND_OP'. You never have to directly instantiate a
+  # GrammarSymbol. It is done implicitly for you when you define a Grammar.
   class GrammarSymbol
     attr_reader :name
     attr_accessor :non_terminal, :nullable, :precedence, :associativity

data/lib/grammar/production.rb CHANGED

@@ -1,13 +1,16 @@
 module Dhaka
   class Production
+    include Comparable
+    attr_reader :symbol, :expansion, :name, :action, :priority
-    attr_reader :symbol, :expansion, :name
-    def initialize(symbol, expansion, name, precedence = nil)
+    def initialize(symbol, expansion, name, action, priority, precedence = nil)
       @symbol     = symbol
       @expansion  = expansion
       @name       = name
       @precedence = precedence
+      @action     = action || proc { self }
+      @priority   = priority
     end
     def precedence
@@ -26,5 +29,8 @@ module Dhaka
       "#{name} #{symbol} ::= #{expansion.join(' ')}"
     end
+    def <=> other
+      priority <=> other.priority
+    end
   end
 end

data/lib/lexer/compiled_lexer.rb ADDED

@@ -0,0 +1,46 @@
+module Dhaka
+  # Abstract base class of all compiled Lexers. It is only used by generated code.
+  class CompiledLexer
+    class << self
+      # Returns a LexerRun that tokenizes +input+.
+      def lex input
+        LexerRun.new(self, input)
+      end
+      def start_state #:nodoc:
+        states[start_state_id]
+      end
+      def action_for_pattern pattern #:nodoc:
+        specification.items[pattern].action
+      end
+      private
+        def inherited(lexer)
+          class << lexer
+            attr_accessor :states, :specification, :start_state_id
+          end
+          lexer.states = Hash.new do |hash, state_id|
+                           hash[state_id] = LexerSupport::State.new(lexer, nil)
+                         end
+        end
+        def at_state x, &blk
+          states[x].instance_eval(&blk)
+        end
+        def start_with start_state_id
+          self.start_state_id = start_state_id
+        end
+        def switch_to dest_state_id
+          states[dest_state_id]
+        end
+        def inspect
+          "<Dhaka::CompiledLexer specification : #{specification}>"
+        end
+    end
+  end
+end