RubyGems - whittle - Versions diffs - 0.0.1 - Mend

whittle 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

data/.gitignore +4 -0
data/.rspec +1 -0
data/Gemfile +4 -0
data/LICENSE +20 -0
data/README.md +468 -0
data/Rakefile +1 -0
data/lib/whittle/error.rb +9 -0
data/lib/whittle/errors/grammar_error.rb +9 -0
data/lib/whittle/errors/parse_error.rb +35 -0
data/lib/whittle/errors/unconsumed_input_error.rb +9 -0
data/lib/whittle/parser.rb +343 -0
data/lib/whittle/rule.rb +239 -0
data/lib/whittle/rule_set.rb +118 -0
data/lib/whittle/version.rb +3 -0
data/lib/whittle.rb +8 -0
data/spec/spec_helper.rb +4 -0
data/spec/unit/parser/empty_rule_spec.rb +21 -0
data/spec/unit/parser/empty_string_spec.rb +17 -0
data/spec/unit/parser/error_reporting_spec.rb +55 -0
data/spec/unit/parser/grouped_expr_spec.rb +27 -0
data/spec/unit/parser/multiple_precedence_spec.rb +33 -0
data/spec/unit/parser/noop_spec.rb +23 -0
data/spec/unit/parser/pass_through_parser_spec.rb +17 -0
data/spec/unit/parser/precedence_spec.rb +26 -0
data/spec/unit/parser/self_referential_expr_spec.rb +26 -0
data/spec/unit/parser/skipped_tokens_spec.rb +28 -0
data/spec/unit/parser/sum_parser_spec.rb +23 -0
data/spec/unit/parser/typecast_parser_spec.rb +17 -0
data/whittle.gemspec +27 -0
metadata +104 -0

data/lib/whittle/parser.rb ADDED Viewed

@@ -0,0 +1,343 @@
+# Whittle: A little LALR(1) parser in pure ruby, without a generator.
+#
+# Copyright (c) Chris Corbyn, 2011
+module Whittle
+  # Parsers are created by subclassing the Parser class and defining a context-free grammar.
+  #
+  # Unlike other LALR(1) parsers, Whittle does not rely on code-generation, instead it
+  # synthesizes a parse table from the grammar at runtime, on the first parse.
+  #
+  # While Whittle's implementation works a little differently to yacc/bison and ruby parser
+  # generators like racc and citrus, the parseable grammars are the same.  LALR(1) parsers are
+  # very powerful and it is generally said that the languages they cannot parse are difficult
+  # for humans to understand.
+  #
+  # You should refer to the README for a full description of how to use the parser,
+  # but a quick example follows.
+  #
+  # @example A simple Whittle Parser
+  #
+  #   class Calculator < Whittle::Parser
+  #     rule(:wsp) do |r|
+  #       r[/\s+/] # skip whitespace
+  #     end
+  #
+  #     rule(:int) do |r|
+  #       r[/[0-9]+/].as { |i| Integer(i) }
+  #     end
+  #
+  #     rule("+") % :left
+  #     rule("-") % :left
+  #     rule("/") % :left
+  #     rule("*") % :left
+  #
+  #     rule(:expr) do |r|
+  #       r[:expr, "+", :expr].as { |left, _, right| left + right }
+  #       r[:expr, "-", :expr].as { |left, _, right| left - right }
+  #       r[:expr, "/", :expr].as { |left, _, right| left / right }
+  #       r[:expr, "*", :expr].as { |left, _, right| left * right }
+  #       r[:int].as(:value)
+  #     end
+  #
+  #     start(:expr)
+  #   end
+  #
+  #   calculator = Calculator.new
+  #   calculator.parse("1 + (2 * 6) - 7")
+  #   # => 6
+  class Parser
+    class << self
+      # Returns a Hash mapping rule names to their RuleSets.
+      #
+      # @return [Hash<String, RuleSet>]
+      #   all rules defined by the parser
+      def rules
+        @rules ||= {}
+      end
+      # Declares a new rule.
+      #
+      # There are two ways to call this method.  The most fundamental way is to pass a Symbol
+      # in the +name+ parameter, along with a block, in which you will add one or more possible
+      # rules.
+      #
+      # @example Specifying multiple rules with a block
+      #
+      #   rule(:expr) do |r|
+      #     r[:expr, "+", :expr].as { |a, _, b| a + b }
+      #     r[:expr, "-", :expr].as { |a, _, b| a - b }
+      #     r[:expr, "/", :expr].as { |a, _, b| a / b }
+      #     r[:expr, "*", :expr].as { |a, _, b| a * b }
+      #     r[:integer].as { |i| Integer(i) }
+      #   end
+      #
+      # Each rule specified in this way defines one of many possibilities to describe the input.
+      # Rules may refer back to themselves, which means in the above, any integer is a valid
+      # expr:
+      #
+      #   42
+      #
+      # Therefore any sum of integers is also a valid expr:
+      #
+      #   42 + 24
+      #
+      # Therefore any multiplication of sums of integers is also a valid expr, and so on.
+      #
+      #   42 + 24 * 7 + 52
+      #
+      # A rule like the above is called a 'nonterminal', because upon recognizing any expr, it
+      # is possible for the rule to continue collecting input and becoming a larger expr.
+      #
+      # In subtle contrast, a rule like the following:
+      #
+      #   rule("+") do |r|
+      #     r["+"].as { |plus| plus }
+      #   end
+      #
+      # Is called a 'terminal' token, since upon recognizing a "+", the parser cannot
+      # add further input to the "+" itself... it is the tip of a branch in the parse tree; the
+      # branch terminates here, and subsequently the rule is terminal.
+      #
+      # There is a shorthand way to write the above rule:
+      #
+      #   rule("+")
+      #
+      # Not given a block, #rule treats the name parameter as a literal token.
+      #
+      # Note that nonterminal rules are composed of other nonterminal rules and/or terminal
+      # rules.  Terminal rules contain one, and only one Regexp pattern or fixed string.
+      #
+      # @param [Symbol, String] name
+      #   the name of the ruleset (note that one ruleset can contain multiple rules)
+      #
+      # @return [RuleSet, Rule]
+      #   the newly created RuleSet if a block was given, otherwise a rule representing a
+      #   terminal token for the input string +name+.
+      def rule(name)
+        rules[name] = RuleSet.new(name)
+        if block_given?
+          rules[name].tap { |r| yield r }
+        else
+          rules[name][name].as(:value)
+        end
+      end
+      # Declares the most general rule that can be used to describe an entire input.
+      #
+      # Called without any arguments, returns the current start rule.
+      #
+      # @param [Symbol] name
+      #   the name of a rule defined in the parser (does not need to be defined beforehand)
+      #
+      # @return [Symbol]
+      #   the new (or current) start rule
+      def start(name = nil)
+        @start = name unless name.nil?
+        @start
+      end
+      # Returns the numeric value for the initial state (the state ID associated with the start
+      # rule).
+      #
+      # In most LALR(1) parsers, this would be zero, but for implementation reasons, this will
+      # be an unpredictably large (or small) number.
+      #
+      # @return [Fixnum]
+      #   the ID for the initial state in the parse table
+      def initial_state
+        prepare_start_rule
+        [rules[start], 0].hash
+      end
+      # Returns the entire parse table used to interpret input into the parser.
+      #
+      # You should not need to call this method, though you may wish to inspect its contents
+      # during debugging.
+      #
+      # Note that the token +nil+ in the parse table represents "anything" and its action is
+      # always to reduce.
+      #
+      # Shift-reduce conflicts are resolved at runtime and therefore remain in the parse table.
+      #
+      # @return [Hash]
+      #   a 2-dimensional Hash representing states with actions to perform for a given lookahead
+      def parse_table
+        @parse_table ||= begin
+          prepare_start_rule
+          rules[start].build_parse_table(
+            {},
+            self,
+            {
+              :state  => initial_state,
+              :seen   => [],
+              :offset => 0,
+              :prec   => 0
+            }
+          )
+        end
+      end
+      private
+      def prepare_start_rule
+        raise GrammarError, "Undefined start rule #{start.inspect}" unless rules.key?(start)
+        if rules[start].terminal?
+          rule(:*) do |r|
+            r[start].as { |prog| prog }
+          end
+          start(:*)
+        end
+      end
+    end
+    # Alias for class method Parser.rules
+    #
+    # @see Parser.rules
+    def rules
+      self.class.rules
+    end
+    # Accepts input in the form of a String and attempts to parse it according to the grammar.
+    #
+    # The input is scanned using a lexical analysis routine, defined by the #lex method. Each
+    # token detected by the routine is used to pick an action from the parse table.  Each
+    # reduction initially builds a branch in an AST (abstract syntax tree), until all input has
+    # been read and the start rule has been recognized, at which point the AST is evaluated by
+    # invoking the callbacks defined in the grammar in a depth-first fashion.
+    #
+    # If the parser encounters a token it does not recognise, a parse error will be raised,
+    # specifying what was expected, what was received, and on which line the error occurred.
+    #
+    # A successful parse returns the result of evaluating the start rule, whatever that may be.
+    #
+    # @param [String] input
+    #   a complete input string to parse according to the grammar
+    #
+    # @return [Object]
+    #   whatever the grammar defines
+    def parse(input)
+      # +states+ is the stack of parse-table state IDs; +args+ is the parallel stack of
+      # shifted tokens and reduced nodes (each a Hash carrying :args).
+      table  = self.class.parse_table
+      states = [self.class.initial_state]
+      args   = []
+      line   = 1
+      lex(input) do |token|
+        line  = token[:line]
+        # From here on +input+ no longer refers to the source String: it holds the symbol
+        # currently being looked up (the lexed token, or a freshly reduced node).
+        input = token
+        # Keep applying instructions until the current token has been shifted.
+        catch(:shifted) do
+          loop do
+            state = table[states.last]
+            # Look up by symbol name first; the nil key is the catch-all entry of the state.
+            if ins = state[input[:name]] || state[nil]
+              case ins[:action]
+                when :shift
+                  # Move the token's raw value into :args so tokens and reduced nodes
+                  # share the same shape, then consume the token.
+                  input[:args] = [input.delete(:value)]
+                  states << ins[:state]
+                  args << input
+                  throw :shifted
+                when :reduce
+                  # Pop one arg and one state per rule component and replace them with a
+                  # single node; the next iteration looks this node up by its rule name.
+                  size = ins[:rule].components.length
+                  input = {
+                    :rule => ins[:rule],
+                    :name => ins[:rule].name,
+                    :line => line,
+                    :args => args.pop(size)
+                  }
+                  states.pop(size)
+                  args << input
+                  # Only the initial state is left and the lookahead is the end marker:
+                  # the whole input has been recognized, so evaluate the tree.
+                  return accept(args.pop) if states.length == 1 && token[:name] == :$end
+                when :goto
+                  # The reduced node has been placed; resume with the pending lexed token.
+                  input = token
+                  states << ins[:state]
+              end
+            else
+              # No instruction for this symbol. If #error returns instead of raising,
+              # the loop simply runs again.
+              error(state, input, :states => states, :args => args)
+            end
+          end
+        end
+      end
+    end
+    # Accepts a String as input and repeatedly yields terminal tokens found in the grammar.
+    #
+    # The last token yielded is always named :$end and has the value of +nil+.
+    #
+    # You may override this method to define a smarter implementation, should you need to.
+    #
+    # @param [String] input
+    #   the complete input string to lex
+    def lex(input)
+      line   = 1
+      offset = 0
+      ending = input.length
+      until offset == ending do
+        next_token(input, offset, line).tap do |token|
+          raise UnconsumedInputError,
+            "Unmatched input #{input[offset..-1].inspect} on line #{line}" if token.nil?
+          offset += token[:value].length
+          line, token[:line] = token[:line], line
+          yield token unless token[:discarded]
+        end
+      end
+      yield ({ :name => :$end, :line => line, :value => nil })
+    end
+    # Invoked when the parser detects an error.
+    #
+    # The default implementation raises a ParseError specifying the allowed inputs
+    # and the received input, along with a line number.
+    #
+    # You may override this method with your own implementation, which, at least in theory,
+    # can recover from the error and allow the parse to continue, though this is an extremely
+    # advanced topic and requires a good understanding of how LALR(1) parsers operate.
+    #
+    # @param [Hash] state
+    #   the possible actions for the current parser state
+    #
+    # @param [Hash] input
+    #   the received token (or, unlikely, a nonterminal symbol)
+    #
+    # @param [Hash] stack
+    #   the current parse context (arg stack + state stack)
+    def error(state, input, stack)
+      # Goto entries are left out of the list of expected symbols.
+      expected = state.reject { |_, ins| ins[:action] == :goto }.keys
+      choices  = expected.map { |sym| sym.inspect }.join("; or ")
+      received = input[:name].inspect
+      # The heredoc is collapsed onto a single line before being used as the message.
+      message  = <<-ERROR.gsub(/\n\s+/, " ").strip
+        Parse error:
+        expected
+        #{choices}
+        but got
+        #{received}
+        on line
+        #{input[:line]}
+      ERROR
+      raise ParseError.new(message, input[:line], expected, input[:name])
+    end
+    private
+    def next_token(source, offset, line)
+      rules.each do |name, rule|
+        if token = rule.scan(source, offset, line)
+          token[:name] = name
+          return token
+        end
+      end
+      nil
+    end
+    def accept(tree)
+      tree[:rule].action.call(*tree[:args].map { |arg| Hash === arg ? accept(arg) : arg })
+    end
+  end
+end

data/lib/whittle/rule.rb ADDED Viewed

@@ -0,0 +1,239 @@
+# Whittle: A little LALR(1) parser in pure ruby, without a generator.
+#
+# Copyright (c) Chris Corbyn, 2011
+module Whittle
+  # Represents an individual Rule, forming part of an overall RuleSet.
+  class Rule
+    # Default action for every new Rule; it returns nil. #scan flags a token as discarded
+    # when the rule's action is this exact object.
+    NULL_ACTION = Proc.new { }
+    # Action installed by as(:value); it returns its first argument unchanged.
+    DUMP_ACTION = Proc.new { |input| input }
+    # Name of the RuleSet this rule belongs to.
+    attr_reader :name
+    # Proc invoked when the rule is reduced.
+    attr_reader :action
+    # The Symbols, Strings and/or Regexps that make up the rule.
+    attr_reader :components
+    # Associativity (:right unless changed with #%).
+    attr_reader :assoc
+    # Precedence (0 unless changed with #^).
+    attr_reader :prec
+    # Create a new Rule for the RuleSet named +name+.
+    #
+    # The components can either be names of other Rules, or for a terminal Rule,
+    # a single pattern to match in the input string.
+    #
+    # @param [String] name
+    #   the name of the RuleSet to which this Rule belongs
+    #
+    # @param [Object...] components...
+    #   a variable list of components that make up the Rule
+    def initialize(name, *components)
+      @components = components
+      @action     = NULL_ACTION
+      @name       = name
+      @terminal   = components.length == 1 && !components.first.kind_of?(Symbol)
+      @assoc      = :right
+      @prec       = 0
+      @components.each do |c|
+        unless Regexp === c || String === c || Symbol === c
+          raise ArgumentError, "Unsupported rule component #{c.class}"
+        end
+      end
+      pattern = @components.first
+      if @terminal
+        @pattern = if pattern.kind_of?(Regexp)
+          Regexp.new("\\G#{pattern}")
+        else
+          Regexp.new("\\G#{Regexp.escape(pattern)}")
+        end
+      end
+    end
+    # Predicate check for whether or not the Rule represents a terminal symbol.
+    #
+    # A terminal symbol is effectively any rule that directly matches some
+    # pattern in the input string and references no other rules.
+    #
+    # @return [Boolean]
+    #   true if this rule represents a terminal symbol
+    def terminal?
+      # Computed once in #initialize: true only for a single, non-Symbol component.
+      @terminal
+    end
+    # Walks all possible branches from the given rule, building a parse table.
+    #
+    # The parse table is a list of instructions (transitions) that can be looked
+    # up, given the current parser state and the current lookahead token.
+    #
+    # @param [Hash<Fixnum,Hash>] table
+    #   the table to construct for
+    #
+    # @param [Parser] parser
+    #   the Parser containing all the Rules in the grammar
+    #
+    # @param [Hash] context
+    #   a Hash used to track state as the grammar is analyzed
+    def build_parse_table(table, parser, context)
+      # Row of instructions for the state being filled in (created on first visit).
+      state      = table[context[:state]] ||= {}
+      # Component at the current position; nil once the offset is past the last component.
+      sym        = components[context[:offset]]
+      # The RuleSet registered under that component's name, if any.
+      rule       = parser.rules[sym]
+      new_offset = context[:offset] + 1
+      # Reuse the target state already recorded for +sym+ in this row, otherwise derive a
+      # new state ID from this rule and the next offset.
+      new_state  = if state.key?(sym)
+        state[sym][:state]
+      end || [self, new_offset].hash
+      if sym.nil?
+        # End of the rule: record a reduce under the nil (catch-all) key.
+        state[sym] = {
+          :action => :reduce,
+          :rule   => self,
+          :prec   => context[:prec]
+        }
+      else
+        raise GrammarError, "Unreferenced rule #{sym.inspect}" if rule.nil?
+        # A terminal sets the precedence carried forward; a nonterminal leaves it unchanged.
+        new_prec = if rule.terminal?
+          rule.prec
+        else
+          context[:prec]
+        end
+        if rule.terminal?
+          state[sym] = {
+            :action => :shift,
+            :state  => new_state,
+            :prec   => new_prec,
+            :assoc  => rule.assoc
+          }
+        else
+          state[sym] = {
+            :action => :goto,
+            :state  => new_state
+          }
+          # Expand the referenced RuleSet into this same state, starting at its first
+          # component with precedence reset to zero.
+          rule.build_parse_table(
+            table,
+            parser,
+            {
+              :state  => context[:state],
+              :seen   => context[:seen],
+              :offset => 0,
+              :prec   => 0
+            }
+          )
+        end
+        # Continue with the remainder of this rule from the target state.
+        build_parse_table(
+          table,
+          parser,
+          {
+            :state  => new_state,
+            :seen   => context[:seen],
+            :offset => new_offset,
+            :prec   => new_prec
+          }
+        )
+      end
+      # Drop shift entries that lose against a reduce in this row.
+      resolve_conflicts(state)
+    end
+    # Specify how this Rule should be reduced.
+    #
+    # Given a block, the Rule will be reduced by passing the result of reducing
+    # all inputs as arguments to the block.
+    #
+    # Given the Symbol :value, the matched input will be returned verbatim.
+    # Given the Symbol :nothing, nil will be returned; you can use this to
+    # skip whitespace and comments, for example.
+    #
+    # @param [Symbol] preset
+    #   one of the preset actions, :value or :nothing; optional
+    #
+    # @return [Rule]
+    #   returns self
+    def as(preset = nil, &block)
+      tap do
+        case preset
+          when :value   then @action = DUMP_ACTION
+          when :nothing then @action = NULL_ACTION
+          when nil
+            raise ArgumentError, "Rule#as expected a block, not none given" unless block_given?
+            @action = block
+          else
+            raise ArgumentError, "Invalid preset #{preset.inspect} to Rule#as"
+        end
+      end
+    end
+    # Set the associativity of this Rule.
+    #
+    # Accepts values of :left, :right (default) or :nonassoc.
+    #
+    # @param [Symbol] assoc
+    #   one of :left, :right or :nonassoc
+    #
+    # @return [Rule]
+    #   returns self
+    def %(assoc)
+      raise ArgumentError, "Invalid associativity #{assoc.inspect}" \
+        unless [:left, :right, :nonassoc].include?(assoc)
+      tap { @assoc = assoc }
+    end
+    # Set the precedence of this Rule, as an Integer.
+    #
+    # The higher the number, the higher the precedence.
+    #
+    # @param [Fixnum] prec
+    #   the precedence (default is zero)
+    def ^(prec)
+      raise ArgumentError, "Invalid precedence level #{prec.inspect}" \
+        unless prec.respond_to?(:to_i)
+      tap { @prec = prec.to_i }
+    end
+    # Invoked for terminal rules during lexing, ignored for nonterminal rules.
+    #
+    # @param [String] source
+    #   the input String to scan
+    #
+    # @param [Fixnum] offset
+    #   the current index in the search
+    #
+    # @param [Fixnum] line
+    #   the line the lexer was up to when the previous token was matched
+    #
+    # @return [Hash]
+    #   a Hash representing the token, containing :rule, :value, :line and
+    #   :discarded, if the token is to be skipped.
+    #
+    # Returns nil if nothing is matched.
+    def scan(source, offset, line)
+      return nil unless @terminal
+      if match = source.match(@pattern, offset)
+        {
+          :rule      => self,
+          :value     => match[0],
+          # FIXME: Optimize this line count in a cross-platform compatible way
+          :line      => line + ("~" + match[0] + "~").lines.count - 1,
+          :discarded => @action.equal?(NULL_ACTION)
+        }
+      end
+    end
+    private
+    def resolve_conflicts(instructions)
+      if r = instructions.values.detect { |i| i[:action] == :reduce }
+        instructions.reject! do |s, i|
+          ((i[:action] == :shift) &&
+           ((r[:prec] > i[:prec]) ||
+            (r[:prec] == i[:prec] && i[:assoc] == :left)))
+        end
+      end
+    end
+  end
+end

data/lib/whittle/rule_set.rb ADDED Viewed

@@ -0,0 +1,118 @@
+# Whittle: A little LALR(1) parser in pure ruby, without a generator.
+#
+# Copyright (c) Chris Corbyn, 2011
+module Whittle
+  # RuleSets are named collections of Rules.
+  #
+  # When you use the name of a rule in the grammar, you actually refer to the
+  # entire RuleSet and not an individual rule within it (unless of course, it
+  # only contains one Rule)
+  class RuleSet
+    include Enumerable
+    # Create a new RuleSet named +name+.
+    #
+    # @param [Symbol, String] name
+    #   the name of the rule in the grammar
+    def initialize(name)
+      @name  = name
+      @rules = []
+    end
+    # Enumerate all Rules in the set.
+    def each(&block)
+      # Delegates to the underlying Array, so rules are visited in the order they were added.
+      @rules.each(&block)
+    end
+    # Add a new Rule to the set.
+    #
+    # @param [Object...] components...
+    #   a variable list of components (Symbols, Strings, or Regexps)
+    def [](*components)
+      Rule.new(@name, *components).tap do |rule|
+        @rules << rule
+      end
+    end
+    # Invoked during lexing, delegating to each rule in the set.
+    #
+    # @param [String] source
+    #   the complete input string
+    #
+    # @param [Fixnum] offset
+    #   the current index in the search
+    # @param [Fixnum] line
+    #   the current line number
+    #
+    # @return [Hash]
+    #   a Hash representing the found token, or nil
+    def scan(source, offset, line)
+      each do |rule|
+        if token = rule.scan(source, offset, line)
+          return token
+        end
+      end
+      nil
+    end
+    # Recursively builds the parse table into +table+.
+    #
+    # @param [Hash<Fixnum,Hash>] table
+    #   the parse table as constructed so far
+    #
+    # @param [Parser] parser
+    #   the parser containing the grammar
+    #
+    # @param [Hash] context
+    #   a Hash used to track state when building the parse table
+    #
+    # @return [Hash]
+    #   the parse table
+    def build_parse_table(table, parser, context)
+      return table if context[:seen].include?([context[:state], self])
+      context[:seen] << [context[:state], self]
+      table.tap do
+        each do |rule|
+          rule.build_parse_table(table, parser, context)
+        end
+      end
+    end
+    # Predicate test for whether or not this RuleSet references a single
+    # terminal Symbol.
+    #
+    # @return [Boolean]
+    #   true if this rule is a terminal symbol
+    def terminal?
+      @rules.length == 1 && @rules.first.terminal?
+    end
+    # Predicate test for whether or not this RuleSet references a nonterminal Symbol.
+    #
+    # @return [Boolean]
+    #   true if this rule is a nonterminal symbol
+    def nonterminal?
+      !terminal?
+    end
+    # Convenience method to access the precedence of a RuleSet representing a terminal.
+    #
+    # @return [Fixnum]
+    #   the precedence of the terminal Symbol, or zero for nonterminals.
+    def prec
+      terminal? ? @rules.first.prec : 0
+    end
+    # Convenience method to access the associativity of a RuleSet representing a terminal.
+    #
+    # @return [Symbol]
+    #   the associativity of the terminal Symbol.
+    def assoc
+      terminal? ? @rules.first.assoc : :right
+    end
+  end
+end

data/lib/whittle/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module Whittle
+  # Version string for this library.
+  VERSION = "0.0.1"
+end

data/lib/whittle.rb ADDED Viewed

@@ -0,0 +1,8 @@
+require "whittle/version"
+require "whittle/error"
+require "whittle/errors/unconsumed_input_error"
+require "whittle/errors/parse_error"
+require "whittle/errors/grammar_error"
+require "whittle/rule"
+require "whittle/rule_set"
+require "whittle/parser"