RubyGems - kanocc - Versions diffs - 0.1.0 - Mend

kanocc 0.1.0

Files changed (12) hide show

data/lib/kanocc/earley.rb ADDED Viewed

@@ -0,0 +1,322 @@
+#
+#  Copyright 2008 Christian Surlykke
+#
+#  This file is part of Kanocc.
+#
+#  Kanocc is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License, version 3
+#  as published by the Free Software Foundation.
+#
+#  Kanocc is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License, version 3 for more details.
+#
+#  You should have received a copy of the GNU General Public License,
+#  version 3 along with Kanocc.  If not, see <http://www.gnu.org/licenses/>.
+#
+require 'kanocc/grammar_rule'
+require 'kanocc/token'
+require 'logger'
+module Kanocc
+  #
+  # Parser for Kanocc based on Earley's algorithm. For a description see:
+  # Alfred V. Aho, Jeffrey D. Ullman, The Theory of Parsing, Translation and  Compiling,
+  # or try a web search engine of your choice with 'Earley parsing'
+  #
+  # Earley's parser will parse according to any context-free grammar using O(n*n*n) time
+  # and O(n*n) space, n being the length of input. If the grammar is unambiguous, time/space
+  # complexity is O(n*n)/O(n*n).
+  # As of yet (version 0.1) the implementation is surely not optimal,
+  # so time/space complexity is probably worse.
+  #
+  # Christian Surlykke 2007.
+  #
+  class EarleyParser
+    attr_accessor :kanocc, :logger
+    ErrorRule = GrammarRule.new(Error, [], nil)
+    def initialize(kanocc, options = {})
+      @kanocc = kanocc
+      @logger = options[:logger] || Logger.new
+    end
+    #
+    # Sets up the parser, creating itemlist 0.
+    #
+    def startsymbol=(startSymbol)
+      @start_symbol = startSymbol
+      @itemLists = [ItemList.new(nil, 0)]
+      @inputPos = 0
+      @recoveryPoints = []
+      @itemLists[0].add_all(@start_symbol.rules.map{|rule| Item.new(rule, 0)})
+      predict_and_complete(0)
+    end
+    def prepare
+      @itemLists = @itemLists[0..0]
+      @inputPos = 0
+      if @recoveryPoints.size > 0 and @recoveryPoints[0] == 0
+        @recoveryPoints = [0]
+      else
+        @recoveryPoints = []
+      end
+      @logger.info("Itemlist 0:\n" + @itemLists[0].inspect) unless not @logger
+    end
+    def scan(token_match)
+      token_match[:matches].each do |match|
+        if match[:token]
+	  symbol = match[:token]
+        else
+          symbol = match[:literal]
+        end
+	items = @itemLists[@inputPos - 1].find_matching(symbol)
+	@itemLists[@inputPos].add_all(items.map{|item| item.move})
+      end
+    end
+    def predict_and_complete(pos)
+      item_list = @itemLists[pos]
+      prev_size = 0
+      while prev_size < item_list.size do
+        prev_size = item_list.size
+	item_list.each do |item|
+	  if item.rule.rhs.length <= item.dot
+            # complete
+	    item_list.add_all(@itemLists[item.j].find_matching(item.rule.lhs).map{|item| item.move})
+          elsif (nont = item.rule.rhs[item.dot]).respond_to?(:rules)
+            # predict
+	    item_list.add_all(nont.rules.map {|rule| Item.new(rule, @inputPos)})
+	  end
+        end
+      end
+    end
+    def add_recovery_points(pos)
+      if @recoveryPoints[-1] != pos
+	@itemLists[pos].each do |item|
+	  if Error == item.rule.rhs[item.dot]
+	    @recoveryPoints.push(pos)
+	    break
+	  end
+	end
+      end
+    end
+    #
+    # Consume and parse next input symbol
+    #
+    def consume(token_match)
+      @inputPos += 1
+      @itemLists.push(ItemList.new(token_match, @inputPos))
+      # scan, predict and complete until no more can be added
+      scan(token_match)
+      if @itemLists[@inputPos].size == 0
+        @logger.debug("Found no items matching #{token_match} in itemlist #{@inputPos - 1}")
+        @logger.debug("@recoveryPoints = " + @recoveryPoints.inspect)
+        for i in 1..@recoveryPoints.length do
+          if @recoveryPoints[-i] < @inputPos
+            @itemLists[@inputPos - 1].add(Item.new(ErrorRule, @recoveryPoints[-i]))
+            predict_and_complete(@inputPos - 1)
+	    scan(token_match)
+	    break if @itemLists[@inputPos].size > 0
+          end
+        end
+      end
+      predict_and_complete(@inputPos)
+      add_recovery_points(@inputPos)
+      @logger.info("Itemlist #{@inputPos}:\n" + @itemLists[@inputPos].inspect) if @logger
+    end
+    #
+    # Signal to the parser that end of input is reached
+    #
+    def eof
+      top_item = find_full_items(@start_symbol, @inputPos).find_all {|item| item.j == 0}.max
+      if top_item
+        translate(top_item, @inputPos)
+      else
+        raise(KanoccException, "It didn't parse")
+      end
+    end
+    def translate(element, pos)
+      @logger.debug("translate: " + element.inspect + " on " + pos.inspect)
+      if element.class == Item
+        translate_helper(element, pos)
+        @kanocc.report_reduction(element.rule)
+      else  # Its a token or a string
+	@kanocc.report_token(@itemLists[pos].inputSymbol, element)
+      end
+    end
+    def translate_helper(item, pos)
+      @logger.debug("translateHelper: " + item.inspect + " on " + pos.inspect)
+      return if item.dot == 0
+      if item.rule.rhs[item.dot - 1].respond_to?("rules")
+        # Assume item is of form [A --> aB*c, k] in itemlist i
+        # Must then find item of form [B --> x*, j] in itemlist i so
+        # that there exists item of form [A --> a*Bc, k] on itemlist j
+        # First: Items of form [B --> x*, j] on list i
+        candidates = find_full_items(item.rule.rhs[item.dot - 1], pos)
+        # Then: Those for which item of form [A --> a*Bc, k] exists
+        # on list j
+        candidates = candidates.find_all {|subItem|
+          @itemLists[subItem.j].find_item(item.rule, item.dot - 1, item.j)
+        }
+        # Precedence: We pick the posibility with the higest precedence
+        sub_item = candidates.max
+        prev_item = @itemLists[sub_item.j].find_item(item.rule, item.dot - 1, item.j)
+        prev_list = sub_item.j
+      else
+        prev_item = @itemLists[pos - 1].find_item(item.rule, item.dot - 1, item.j)
+        prev_list = pos - 1
+        sub_item = item.rule.rhs[item.dot - 1]
+      end
+      translate_helper(prev_item, prev_list)
+      translate(sub_item, pos)
+    end
+    def find_full_items(nonterminal, inputPos)
+      @itemLists[inputPos].find_all do |item|
+        item.rule.lhs == nonterminal and item.dot >= item.rule.rhs.length
+      end
+    end
+  end
+  class ItemList
+    attr_reader :inputSymbol
+    attr_accessor :items
+    def initialize(inputSymbol, inputPos)
+      @inputPos = inputPos
+      @inputSymbol = inputSymbol
+      @items = Hash.new
+    end
+    def copy
+      res = clone
+      res.items = @items.clone
+      return res
+    end
+    def size
+      return @items.size
+    end
+    def find_all(&b)
+      return @items.keys.find_all(&b)
+    end
+    def find_item(rule, dot, j)
+      return @items.keys.find{ |item|
+        item.rule == rule and
+        item.dot == dot and
+        item.j == j
+      }
+    end
+    def each_matching(inputSymbol)
+      find_matching(inputSymbol).each do |item|
+        yield(item)
+      end
+    end
+    def find_matching(inputSymbol)
+      @items.keys.find_all do |item|
+        inputSymbol === item.symbol_after_dot or inputSymbol == item.symbol_after_dot
+      end
+    end
+    def contains(item)
+      return @items[item]
+    end
+    def add(item)
+      @items.store(item, true)
+    end
+    def add_all(items)
+      items.each {|item| @items.store(item, true)}
+    end
+    def each
+      @items.keys.each do |item|
+        yield item
+      end
+    end
+    def inspect
+      return "[" + @inputSymbol.inspect + "\n " +
+                   @items.keys.map{|item| item.inspect}.join("\n  ") + "]\n"
+    end
+  end
+  class Item
+    attr_reader :rule, :j, :dot
+    @@items = Hash.new
+    def Item.new(rule, j, dot = 0)
+      unless (item = @@items[[rule,j,dot]])
+        item = super(rule, j, dot)
+        @@items.store([rule, j, dot], item)
+      end
+      return item
+    end
+    def symbol_after_dot
+      return @dot < @rule.rhs.size  ? @rule.rhs[@dot] : nil
+    end
+    def initialize(rule, j, dot = 0)
+      @rule = rule
+      @j = j
+      @dot = dot
+    end
+    def move
+      return Item.new(@rule, @j, @dot + 1)
+    end
+    def inspect
+      return "[" +
+      @rule.lhs.inspect + " --> " +
+       (@rule.rhs.slice(0, dot) +
+      [Dot.new] +
+      @rule.rhs.slice(dot, @rule.rhs.length - dot)).map{|symbol| symbol.inspect}.join(" ") +
+              " ; " + @j.to_s + "]"
+    end
+    def <=>(other)
+      res = @rule.prec <=> other.rule.prec;
+      if res == 0 and @rule.operator_prec and other.rule.operator_prec
+         res = other.rule.operator_prec <=> @rule.operator_prec
+      end
+      if res == 0
+        res = @j <=> other.j
+      end
+      return res
+    end
+  end
+  # Just for Item inspect
+  class Dot
+    def inspect
+      return "*"
+    end
+  end
+end

data/lib/kanocc/grammar_rule.rb ADDED Viewed

@@ -0,0 +1,50 @@
+#
+#  Copyright 2008 Christian Surlykke
+#
+#  This file is part of Kanocc.
+#
+#  Kanocc is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License, version 3
+#  as published by the Free Software Foundation.
+#
+#  Kanocc is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License, version 3 for more details.
+#
+#  You should have received a copy of the GNU General Public License,
+#  version 3 along with Kanocc.  If not, see <http://www.gnu.org/licenses/>.
+#
+module Kanocc
+  class GrammarRule
+    attr_reader :lhs, :rhs, :method, :operator_prec
+    attr_accessor :prec
+    def initialize(lhs, rhs, method)
+      @lhs = lhs
+      @rhs = rhs
+      @method = method
+      if (operator =rhs.find {|s| s.is_a?(String) or s.is_a?(Token)})
+        @operator_prec = Nonterminal.operator_precedence(operator)
+      end
+      @prec = 0
+      @logger.debug("#{lhs} --> #{rhs.map {|gs| gs.is_a?(Symbol) ? gs.to_s : gs}.join}, #prec = #{@prec}, method = #{method}") unless not @logger
+    end
+    def operator_prec
+      unless @operator_prec_calculated
+          operator = rhs.find {|s| s.is_a?(String) or s.is_a?(Token)}
+          if operator
+            @operator_prec = lhs.operator_precedence(operator)
+          end
+          @operator_prec_calculated = true
+      end
+      @operator_prec
+    end
+    def inspect
+      return lhs.inspect + " ::= " + rhs.map{|gs| gs.inspect}.join(" ")
+    end
+  end
+end

data/lib/kanocc/nonterminal.rb ADDED Viewed

@@ -0,0 +1,176 @@
+#
+#  Copyright 2008 Christian Surlykke
+#
+#  This file is part of Kanocc.
+#
+#  Kanocc is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License, version 3
+#  as published by the Free Software Foundation.
+#
+#  Kanocc is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License, version 3 for more details.
+#
+#  You should have received a copy of the GNU General Public License,
+#  version 3 along with Kanocc.  If not, see <http://www.gnu.org/licenses/>.
+#
+require 'kanocc/grammar_rule'
+module Kanocc
+  class Nonterminal
+    @@rules = Hash.new
+    @@last_rule = Hash.new
+    @@derives_right = Hash.new
+    @@operator_precedence = Hash.new
+    @@method_names = Hash.new
+    Left = 1
+    Right = 2
+    def Nonterminal.derives_right
+      @@derives_right[self] = true
+    end
+    def Nonterminal.derives_right?
+      return @@derives_right[self]
+    end
+    def Nonterminal.set_operator_precedence(operator, precedence)
+      raise "Precedence must be an integer" unless precedence.class == Fixnum
+      @@operator_precedence[self] ||= Hash.new
+      if is_an_operator?(operator)
+        @@operator_precedence[self][operator] = precedence
+      elsif is_an_array_of_operators(operator)
+        operator.each {|o| @@operator_precedence[self][o] = precedence}
+      else
+        raise "Operator must be a string, a token or an array of those"
+      end
+    end
+    def Nonterminal.operator_precedence(operator)
+      (@@operator_precedence[self] and @@operator_precedence[self][operator]) or 0
+    end
+    def Nonterminal.is_an_array_of_operators(arr)
+       arr.is_a?(Array) and
+       arr.collect{|o| is_an_operator?(o)}.inject {|b1, b2| b1 and b2 }
+    end
+    def Nonterminal.is_an_operator?(operator)
+        operator.is_a?(String) or operator.is_a?(Token)
+    end
+    def Nonterminal.rules
+      rules = @@rules[self]
+      return rules ? rules : []
+    end
+    def Nonterminal.add_rule(rule)
+      @@rules[self] ||= []
+      @@rules[self].push(rule)
+      @@last_rule[self] = rule
+    end
+    def Nonterminal.is_a_grammarsymbol?(x)
+      x.is_a?(String) or (x.respond_to?("is_a_kanocc_grammarsymbol?") and x.is_a_kanocc_grammarsymbol?)
+    end
+    def Nonterminal.is_a_kanocc_grammarsymbol?
+      return true
+    end
+    def Nonterminal.rule(*rhs, &block)
+      for pos in 0..rhs.length - 1 do
+        unless is_a_grammarsymbol?(rhs[pos])
+          raise "Problem with rule: #{rhs.inspect}, element:#{pos.to_s} - #{rhs[pos].inspect}\nElements of a rule must be Strings, Tokens or Nonterminals"
+        end
+      end
+      if block_given?
+        method_name = generate_method_name(*rhs)
+        define_method(method_name.to_sym, &block)
+        add_rule(GrammarRule.new(self, rhs, method_name.to_sym))
+      else
+        add_rule(GrammarRule.new(self, rhs, nil))
+      end
+    end
+    def Nonterminal.zm(symbols, sep = nil)
+      list_class = new_list_class
+      list_class.rule() {@elements = []}
+      list_class.rule(om(symbols, sep)) {@elements = @rhs[0].elements}
+      return list_class
+    end
+    def Nonterminal.om(symbols, sep = nil)
+      symbols = [symbols] unless symbols.is_a? Array
+      list_class = new_list_class
+      list_class.rule(*symbols) {@elements = @rhs}
+      if sep
+        list_class.rule(list_class, sep, *symbols) {@elements = @rhs[0].elements + @rhs[2..@rhs.length]}
+      else
+        list_class.rule(list_class, *symbols) {@elements = @rhs[0].elements + @rhs[1..@rhs.length]}
+      end
+      return list_class
+    end
+    @@listClassNumber = 0
+    def Nonterminal.new_list_class
+      list_class = Class.new(List)
+      @@listClassNumber += 1
+      def list_class.inspect
+        return "anonList_#{@@listClassNumber}"
+      end
+      return list_class
+    end
+    def Nonterminal.generate_method_name(*args)
+      method_name = self.name + " --> " + args.map {|a| a.inspect}.join(' ')
+      @@method_names[self] ||= []
+      i = 1
+      while @@method_names[self].member?(method_name) do
+        method_name += ' ';
+      end
+      @@method_names[self].push(method_name)
+      return method_name
+    end
+    def Nonterminal.prec(p)
+      raise "Call to prec not preceded by rule" unless @@last_rule[self]
+      @@last_rule[self].prec = p
+    end
+    def Nonterminal.show_method_names
+      @@method_names[self].each{|mn| puts mn.inspect} if @@method_names[self]
+    end
+    def inspect
+      self.class.name
+    end
+  end
+  class List < Nonterminal
+    attr_reader :elements
+        protected
+    # Assumes @rhs[0] is a Kanocc::List and that rhs.length > 1
+    def collect(strip_separator = false)
+      @elements = @rhs[0].elements
+      if strip_separator
+        @elements = @elements + @rhs[2..@rhs.length]
+      else
+        @elements = @elements + @rhs[1..@rhs.length]
+      end
+    end
+  end
+  #
+  # Nonterminal used in rules to mark where the parser may recover from
+  # input it cannot match.
+  #
+  class Error < Nonterminal
+    attr_reader :text
+    def initialize
+      super()
+      # Placeholder text; nothing in this file assigns a different value.
+      @text = 'FIXME'
+    end
+  end
+end