dhaka 1.0.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2006 Mushfeq Khan
2
+ # Copyright (c) 2006, 2007 Mushfeq Khan
3
3
  #
4
4
  # Permission is hereby granted, free of charge, to any person obtaining
5
5
  # a copy of this software and associated documentation files (the
@@ -21,9 +21,6 @@
21
21
  # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
22
  #++
23
23
 
24
- # An introduction to Dhaka and annotated examples can be found at the project homepage http://dhaka.rubyforge.org
25
- #
26
- # Further examples can be found in the test suites included with the gem.
27
24
  module Dhaka
28
25
  end
29
26
 
@@ -8,6 +8,54 @@ module Dhaka
8
8
  # An evaluation rule for a given production named +bar+ is defined by calling +for_bar+ with
9
9
  # a block that performs the evaluation. For detailed examples, see the evaluators in the
10
10
  # test suite.
11
+ #
12
+ # The following is an evaluator for arithmetic expressions. When a syntax tree node is encountered that
13
+ # corresponds to the production named +addition+, the block passed to +for_addition+ is invoked. The +evaluate+
14
+ # method is then recursively called on the child nodes, in this case the operands to the addition operation. The
15
+ # result is obtained by adding the evaluation results of the child nodes.
16
+ #
17
+ # class ArithmeticPrecedenceEvaluator < Dhaka::Evaluator
18
+ #
19
+ # self.grammar = ArithmeticPrecedenceGrammar
20
+ #
21
+ # define_evaluation_rules do
22
+ #
23
+ # for_subtraction do
24
+ # evaluate(child_nodes[0]) - evaluate(child_nodes[2])
25
+ # end
26
+ #
27
+ # for_addition do
28
+ # evaluate(child_nodes[0]) + evaluate(child_nodes[2])
29
+ # end
30
+ #
31
+ # for_division do
32
+ # evaluate(child_nodes[0]).to_f/evaluate(child_nodes[2])
33
+ # end
34
+ #
35
+ # for_multiplication do
36
+ # evaluate(child_nodes[0]) * evaluate(child_nodes[2])
37
+ # end
38
+ #
39
+ # for_literal do
40
+ # child_nodes[0].token.value.to_i
41
+ # end
42
+ #
43
+ # for_parenthetized_expression do
44
+ # evaluate(child_nodes[1])
45
+ # end
46
+ #
47
+ # for_negated_expression do
48
+ # -evaluate(child_nodes[1])
49
+ # end
50
+ #
51
+ # for_power do
52
+ # evaluate(child_nodes[0])**evaluate(child_nodes[2])
53
+ # end
54
+ #
55
+ # end
56
+ #
57
+ # end
58
+
11
59
 
12
60
  class Evaluator
13
61
 
@@ -18,17 +66,29 @@ module Dhaka
18
66
  def evaluate node
19
67
  @node_stack ||= []
20
68
  @node_stack << node.child_nodes
21
- proc = self.class.actions[node.production.name]
22
- result = self.instance_eval(&proc)
69
+ result = self.send(node.production.name)
23
70
  @node_stack.pop
24
71
  result
25
72
  end
26
73
 
74
+ # Performs the pass-through calculations for nodes with only one child_node for which an
75
+ # evaluation rule is not explicitly defined. Will probably be deprecated in future versions.
76
+ def method_missing(method_name)
77
+ evaluate(child_nodes[0])
78
+ end
79
+
27
80
  # Returns the array of child nodes of the node being currently evaluated.
28
81
  def child_nodes
29
82
  @node_stack[-1]
30
83
  end
31
84
 
85
+ # Evaluation rules are defined within a block passed to this method.
86
+ def self.define_evaluation_rules
87
+ self.actions = []
88
+ yield
89
+ check_definitions
90
+ end
91
+
32
92
  private
33
93
 
34
94
  def self.inherited(evaluator)
@@ -37,29 +97,19 @@ module Dhaka
37
97
  end
38
98
  end
39
99
 
40
- def self.define_evaluation_rules
41
- default_action = Proc.new { evaluate(child_nodes[0]) }
42
- self.actions = Hash.new { |hash, key| default_action }
43
- yield
44
- check_definitions
45
- end
46
-
47
100
  def self.method_missing(method_name, &blk)
48
101
  if method_name.to_s =~ /^for_*/
49
102
  rule_name = method_name.to_s[4..-1]
50
- self.for_rule_named(rule_name, &blk)
103
+ self.actions << rule_name
104
+ self.send(:define_method, rule_name, &blk)
51
105
  end
52
106
  end
53
107
 
54
108
  def self.check_definitions
55
- non_trivial_productions_with_rules_undefined = self.grammar.productions.select {|production| production.expansion.size != 1}.collect {|production| production.name} - self.actions.keys
109
+ non_trivial_productions_with_rules_undefined = self.grammar.productions.select {|production| production.expansion.size != 1}.collect {|production| production.name} - self.actions
56
110
  raise EvaluatorDefinitionError.new(non_trivial_productions_with_rules_undefined) unless non_trivial_productions_with_rules_undefined.empty?
57
111
  end
58
112
 
59
- def self.for_rule_named(name, &blk)
60
- self.actions[name] = blk
61
- end
62
-
63
113
  end
64
114
 
65
115
  class EvaluatorDefinitionError < StandardError #:nodoc:
@@ -60,6 +60,36 @@ module Dhaka
60
60
  end
61
61
 
62
62
  # This class is subclassed when specifying a grammar. Note that subclasses of this class may not be further subclassed.
63
+ #
64
+ # The following is a grammar specification for simple arithmetic. Familiarity with Yacc helps, but the short version is
65
+ # that precedences for symbols are specified in ascending order of binding strength, with equal-strength symbols
66
+ # on the same level. Production rules are specified for each symbol by specifying the name of the production (used when
67
+ # encoding the Evaluator) and the expansion for that particular production. For example, the production named
68
+ # +addition+ expands the symbol <tt>'E'</tt> to the list of symbols <tt>['E', '+', 'E']</tt>.
69
+ #
70
+ # class ArithmeticPrecedenceGrammar < Dhaka::Grammar
71
+ # precedences do
72
+ # left ['+', '-']
73
+ # left ['*', '/']
74
+ # nonassoc ['^']
75
+ # end
76
+ #
77
+ # for_symbol(Dhaka::START_SYMBOL_NAME) do
78
+ # expression ['E']
79
+ # end
80
+ #
81
+ # for_symbol('E') do
82
+ # addition ['E', '+', 'E']
83
+ # subtraction ['E', '-', 'E']
84
+ # multiplication ['E', '*', 'E']
85
+ # division ['E', '/', 'E']
86
+ # power ['E', '^', 'E']
87
+ # literal ['n']
88
+ # parenthetized_expression ['(', 'E', ')']
89
+ # negated_expression ['-', 'E'], :prec => '*'
90
+ # end
91
+ # end
92
+ #
63
93
  class Grammar
64
94
 
65
95
  # Used for defining the productions for the symbol with name +symbol+. The block +blk+ is
@@ -13,7 +13,7 @@ module Dhaka
13
13
  def terminal
14
14
  !non_terminal
15
15
  end
16
- def to_s
16
+ def to_s #:nodoc:
17
17
  name
18
18
  end
19
19
  def <=> other
@@ -23,7 +23,7 @@ module Dhaka
23
23
  @precedence
24
24
  end
25
25
 
26
- def to_s
26
+ def to_s #:nodoc:
27
27
  "#{@name} #{@symbol} ::= #{@expansion.join(' ')}"
28
28
  end
29
29
 
@@ -36,9 +36,7 @@ module Dhaka
36
36
  node_stack << composite_node
37
37
 
38
38
  unless composite_node.head_node?
39
- [production.symbol.name, current_token.grammar_symbol.name]
40
- else
41
- []
39
+ @symbol_queue += [@current_token.symbol_name, production.symbol.name]
42
40
  end
43
41
  end
44
42
  end
@@ -2,22 +2,24 @@ module Dhaka
2
2
  # Returned on successful parsing of the input token stream.
3
3
  class ParseSuccessResult
4
4
  # Contains the parse result.
5
- attr_accessor :syntax_tree
6
- def initialize(syntax_tree) #:nodoc:
7
- @syntax_tree = syntax_tree
5
+ attr_accessor :parse_tree
6
+ def initialize(parse_tree) #:nodoc:
7
+ @parse_tree = parse_tree
8
8
  end
9
9
  # This is false.
10
10
  def has_error?
11
11
  false
12
12
  end
13
+ # Deprecated. Use the +parse_tree+ accessor.
14
+ alias syntax_tree parse_tree
13
15
  end
14
16
 
15
17
  # Returned on unsuccessful parsing of the input token stream.
16
18
  class ParseErrorResult
17
- # The index of the token that caused the parse error.
18
- attr_reader :bad_token_index
19
- def initialize(bad_token_index) #:nodoc:
20
- @bad_token_index = bad_token_index
19
+ # The token that caused the parse error.
20
+ attr_reader :unexpected_token
21
+ def initialize(unexpected_token) #:nodoc:
22
+ @unexpected_token = unexpected_token
21
23
  end
22
24
  # This is true.
23
25
  def has_error?
@@ -7,7 +7,10 @@ module Dhaka
7
7
  @child_nodes = []
8
8
  end
9
9
  def linearize #:nodoc:
10
- child_nodes.collect {|child_node| child_node.linearize}.flatten + [production.name]
10
+ child_nodes.collect {|child_node| child_node.linearize}.flatten + [self]
11
+ end
12
+ def tokens
13
+ child_nodes.collect{|child_node| child_node.tokens}.flatten
11
14
  end
12
15
  def to_s #:nodoc:
13
16
  "CompositeNode: #{production.symbol} --> [#{child_nodes.join(", ")}]"
@@ -33,7 +36,7 @@ module Dhaka
33
36
  def dot_name #:nodoc:
34
37
  "Node#{object_id}"
35
38
  end
36
-
39
+
37
40
  end
38
41
 
39
42
  # These are leaf nodes of syntax trees. They contain tokens.
@@ -45,6 +48,9 @@ module Dhaka
45
48
  def linearize #:nodoc:
46
49
  []
47
50
  end
51
+ def tokens
52
+ [token]
53
+ end
48
54
  def to_s #:nodoc:
49
55
  "LeafNode: #{token}"
50
56
  end
@@ -60,5 +66,6 @@ module Dhaka
60
66
  def dot_name #:nodoc:
61
67
  "Node#{object_id}"
62
68
  end
69
+
63
70
  end
64
71
  end
@@ -3,6 +3,13 @@ require 'set'
3
3
  require 'logger'
4
4
 
5
5
  module Dhaka
6
+ # The parser generator. To generate a parser from a grammar specification +ArithmeticPrecedenceGrammar+, one would
7
+ # write:
8
+ # parser = Dhaka::Parser.new(ArithmeticPrecedenceGrammar)
9
+ #
10
+ # To compile this parser to Ruby source as +ArithmeticPrecedenceParser+:
11
+ # parser.compile_to_ruby_source_as(:ArithmeticPrecedenceParser)
12
+ # which returns a string of Ruby code.
6
13
  class Parser
7
14
  include ParserMethods
8
15
  attr_reader :grammar, :start_state
@@ -6,36 +6,29 @@ module Dhaka
6
6
  @node_stack = []
7
7
  @state_stack = [start_state]
8
8
  @token_stream = token_stream
9
- @current_token_index = 0
9
+ @symbol_queue = []
10
10
  end
11
11
 
12
12
  def run
13
- for_each_token_with_end do
14
- error = execute_action current_token.grammar_symbol.name
13
+ token_stream.each do |token|
14
+ @current_token = token
15
+ @symbol_queue << @current_token.symbol_name
16
+ error = execute_actions
15
17
  return error if error
16
- node_stack << ParseTreeLeafNode.new(current_token)
17
- @current_token_index += 1
18
+ node_stack << ParseTreeLeafNode.new(@current_token)
18
19
  end
19
20
  ParseSuccessResult.new(node_stack[0])
20
21
  end
21
22
 
22
23
  private
23
24
 
24
- attr_reader :state_stack, :token_stream, :node_stack, :current_token
25
+ attr_reader :state_stack, :token_stream, :node_stack
25
26
 
26
- def for_each_token_with_end
27
- token_stream.each do |@current_token|
28
- yield
29
- end
30
- @current_token = Token.new(@grammar.end_symbol, nil)
31
- yield
32
- end
33
-
34
- def execute_action symbol_name
35
- action = state_stack[-1].actions[symbol_name]
36
- return ParseErrorResult.new(@current_token_index) unless action
37
- self.instance_eval(&action.action_code).each do |symbol_name|
38
- execute_action symbol_name
27
+ def execute_actions
28
+ while symbol_name = @symbol_queue.pop
29
+ action = state_stack[-1].actions[symbol_name]
30
+ return ParseErrorResult.new(@current_token) unless action
31
+ self.instance_eval(&action.action_code)
39
32
  end
40
33
  nil
41
34
  end
@@ -1,17 +1,20 @@
1
1
  module Dhaka
2
2
  # Represents a portion of the input character stream that is mapped by the tokenizer
3
- # to a symbol in the grammar.
3
+ # to a symbol in the grammar. The attribute +input_position+ contains the start index position of the original
4
+ # string input that this token came from. It can be used to report errors by indicating the specific portion
5
+ # of the input where the error occurred.
4
6
  class Token
5
- attr_accessor :grammar_symbol, :value
6
- def initialize(grammar_symbol, value)
7
- @grammar_symbol = grammar_symbol
7
+ attr_accessor :symbol_name, :value, :input_position
8
+ def initialize(symbol_name, value, input_position)
9
+ @symbol_name = symbol_name
8
10
  @value = value
11
+ @input_position = input_position
9
12
  end
10
- def to_s
11
- "#{@grammar_symbol.name}"
13
+ def to_s #:nodoc:
14
+ "#{symbol_name}"
12
15
  end
13
16
  def == other
14
- (grammar_symbol == other.grammar_symbol) && (value == other.value)
17
+ (symbol_name == other.symbol_name) && (value == other.value)
15
18
  end
16
19
  end
17
20
  end
@@ -2,17 +2,35 @@ module Dhaka
2
2
 
3
3
  # Reserved constant used to identify the idle state of the tokenizer.
4
4
  TOKENIZER_IDLE_STATE = :idle_state
5
-
6
- # Raised when the tokenizer encounters a character that has no corresponding action in
7
- # its current state.
8
- class UnrecognizedInputCharacterException < StandardError
9
- attr_reader :input, :char_index
10
- def initialize(input, char_index)
11
- @input = input
12
- @char_index = char_index
5
+
6
+ # Returned on successful tokenizing of the input stream. Supports iteration by including Enumerable, so it can
7
+ # be passed in directly to the parser.
8
+ class TokenizerSuccessResult
9
+ include Enumerable
10
+ def initialize(tokens)
11
+ @tokens = tokens
12
+ end
13
+ # Returns false.
14
+ def has_error?
15
+ false
16
+ end
17
+ def each
18
+ @tokens.each do |token|
19
+ yield token
20
+ end
21
+ end
22
+ end
23
+
24
+ # Returned when tokenizing fails due to an unexpected character in the input stream.
25
+ class TokenizerErrorResult
26
+ # The index of the character that caused the error.
27
+ attr_reader :unexpected_char_index
28
+ def initialize(unexpected_char_index)
29
+ @unexpected_char_index = unexpected_char_index
13
30
  end
14
- def to_s
15
- "Unrecognized character '#{input[char_index].chr}' encountered while tokenizing:\n #{input} at index #{char_index}"
31
+ # Returns true.
32
+ def has_error?
33
+ true
16
34
  end
17
35
  end
18
36
 
@@ -45,7 +63,52 @@ module Dhaka
45
63
  #
46
64
  # Tokenizers are state machines that are specified pretty much by hand. Each state of a tokenizer is identified
47
65
  # by a Ruby symbol. The constant Dhaka::TOKENIZER_IDLE_STATE is reserved for the idle state of the tokenizer (the one
48
- # that it starts in).
66
+ # that it starts in).
67
+ #
68
+ # The following is a tokenizer for arithmetic expressions with integer terms. The tokenizer starts in the idle state
69
+ # creating single-character tokens for all characters except digits and whitespace. It shifts to
70
+ # <tt>:get_integer_literal</tt> when it encounters a digit character and creates a token on the stack on which it
71
+ # accumulates the value of the literal. When it again encounters a non-digit character, it shifts back to idle.
72
+ # Whitespace is treated as a delimiter, but not shifted as a token.
73
+ #
74
+ # class ArithmeticPrecedenceTokenizer < Dhaka::Tokenizer
75
+ #
76
+ # digits = ('0'..'9').to_a
77
+ # parenths = ['(', ')']
78
+ # operators = ['-', '+', '/', '*', '^']
79
+ # functions = ['h', 'l']
80
+ # arg_separator = [',']
81
+ # whitespace = [' ']
82
+ #
83
+ # all_characters = digits + parenths + operators + functions + arg_separator + whitespace
84
+ #
85
+ # for_state Dhaka::TOKENIZER_IDLE_STATE do
86
+ # for_characters(all_characters - (digits + whitespace)) do
87
+ # create_token(curr_char, nil)
88
+ # advance
89
+ # end
90
+ # for_characters digits do
91
+ # create_token('n', '')
92
+ # switch_to :get_integer_literal
93
+ # end
94
+ # for_character whitespace do
95
+ # advance
96
+ # end
97
+ # end
98
+ #
99
+ # for_state :get_integer_literal do
100
+ # for_characters all_characters - digits do
101
+ # switch_to Dhaka::TOKENIZER_IDLE_STATE
102
+ # end
103
+ # for_characters digits do
104
+ # curr_token.value += curr_char
105
+ # advance
106
+ # end
107
+ # end
108
+ #
109
+ # end
110
+
111
+
49
112
  class Tokenizer
50
113
 
51
114
  # Define the action for the state named +state_name+.
@@ -53,13 +116,11 @@ module Dhaka
53
116
  states[state_name].instance_eval(&blk)
54
117
  end
55
118
 
56
- # Tokenizes a string +input+ and returns an array of Token-s.
119
+ # Tokenizes a string +input+ and returns a TokenizerErrorResult on failure or a TokenizerSuccessResult on success.
57
120
  def self.tokenize(input)
58
121
  self.new(input).run
59
122
  end
60
123
 
61
- # A slot that can be used to accumulate characters when processing multi-character tokens.
62
- attr_accessor :accumulator
63
124
  # The tokens shifted so far.
64
125
  attr_reader :tokens
65
126
 
@@ -80,6 +141,17 @@ module Dhaka
80
141
  @curr_char_index += 1
81
142
  end
82
143
 
144
+ # The token currently on top of the stack.
145
+ def curr_token
146
+ tokens[-1]
147
+ end
148
+
149
+ # Push a new token onto the stack with symbol corresponding to +symbol_name+ and a value of +value+.
150
+ def create_token(symbol_name, value)
151
+ new_token = Dhaka::Token.new(symbol_name, value, @curr_char_index)
152
+ tokens << new_token
153
+ end
154
+
83
155
  # Change the active state of the tokenizer to the state identified by the symbol +state_name+.
84
156
  def switch_to state_name
85
157
  @current_state = self.class.states[state_name]
@@ -88,16 +160,17 @@ module Dhaka
88
160
  def run #:nodoc:
89
161
  while curr_char
90
162
  blk = @current_state.actions[curr_char]
91
- raise UnrecognizedInputCharacterException.new(@input, @curr_char_index) unless blk
163
+ return TokenizerErrorResult.new(@curr_char_index) unless blk
92
164
  instance_eval(&blk)
93
165
  end
94
- tokens
166
+ tokens << Dhaka::Token.new(Dhaka::END_SYMBOL_NAME, nil, nil)
167
+ return TokenizerSuccessResult.new(tokens)
95
168
  end
96
169
 
97
170
  private
98
171
  def self.inherited(tokenizer)
99
172
  class << tokenizer
100
- attr_accessor :states
173
+ attr_accessor :states, :grammar
101
174
  end
102
175
  tokenizer.states = Hash.new {|hash, key| hash[key] = TokenizerState.new}
103
176
  end