dhaka 1.0.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2006 Mushfeq Khan
2
+ # Copyright (c) 2006, 2007 Mushfeq Khan
3
3
  #
4
4
  # Permission is hereby granted, free of charge, to any person obtaining
5
5
  # a copy of this software and associated documentation files (the
@@ -21,9 +21,6 @@
21
21
  # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
22
  #++
23
23
 
24
- # An introduction to Dhaka and annotated examples can be found at the project homepage http://dhaka.rubyforge.org
25
- #
26
- # Further examples can be found in the test suites included with the gem.
27
24
  module Dhaka
28
25
  end
29
26
 
@@ -8,6 +8,54 @@ module Dhaka
8
8
  # An evaluation rule for a given production named +bar+ is defined by calling +for_bar+ with
9
9
  # a block that performs the evaluation. For detailed examples, see the evaluators in the
10
10
  # test suite.
11
+ #
12
+ # The following is an evaluator for arithmetic expressions. When a syntax tree node is encountered that
13
+ # corresponds to the production named +addition+, the block passed to +for_addition+ is invoked. The +evaluate+
14
+ # method is then recursively called on the child nodes, in this case the operands to the addition operation. The
15
+ # result is obtained by adding the evaluation results of the child nodes.
16
+ #
17
+ # class ArithmeticPrecedenceEvaluator < Dhaka::Evaluator
18
+ #
19
+ # self.grammar = ArithmeticPrecedenceGrammar
20
+ #
21
+ # define_evaluation_rules do
22
+ #
23
+ # for_subtraction do
24
+ # evaluate(child_nodes[0]) - evaluate(child_nodes[2])
25
+ # end
26
+ #
27
+ # for_addition do
28
+ # evaluate(child_nodes[0]) + evaluate(child_nodes[2])
29
+ # end
30
+ #
31
+ # for_division do
32
+ # evaluate(child_nodes[0]).to_f/evaluate(child_nodes[2])
33
+ # end
34
+ #
35
+ # for_multiplication do
36
+ # evaluate(child_nodes[0]) * evaluate(child_nodes[2])
37
+ # end
38
+ #
39
+ # for_literal do
40
+ # child_nodes[0].token.value.to_i
41
+ # end
42
+ #
43
+ # for_parenthetized_expression do
44
+ # evaluate(child_nodes[1])
45
+ # end
46
+ #
47
+ # for_negated_expression do
48
+ # -evaluate(child_nodes[1])
49
+ # end
50
+ #
51
+ # for_power do
52
+ # evaluate(child_nodes[0])**evaluate(child_nodes[2])
53
+ # end
54
+ #
55
+ # end
56
+ #
57
+ # end
58
+
11
59
 
12
60
  class Evaluator
13
61
 
@@ -18,17 +66,29 @@ module Dhaka
18
66
  def evaluate node
19
67
  @node_stack ||= []
20
68
  @node_stack << node.child_nodes
21
- proc = self.class.actions[node.production.name]
22
- result = self.instance_eval(&proc)
69
+ result = self.send(node.production.name)
23
70
  @node_stack.pop
24
71
  result
25
72
  end
26
73
 
74
+ # Performs the pass-through calculations for nodes with only one child_node for which an
75
+ # evaluation rule is not explicitly defined. Will probably be deprecated in future versions.
76
+ def method_missing(method_name)
77
+ evaluate(child_nodes[0])
78
+ end
79
+
27
80
  # Returns the array of child nodes of the node being currently evaluated.
28
81
  def child_nodes
29
82
  @node_stack[-1]
30
83
  end
31
84
 
85
+ # Evaluation rules are defined within a block passed to this method.
86
+ def self.define_evaluation_rules
87
+ self.actions = []
88
+ yield
89
+ check_definitions
90
+ end
91
+
32
92
  private
33
93
 
34
94
  def self.inherited(evaluator)
@@ -37,29 +97,19 @@ module Dhaka
37
97
  end
38
98
  end
39
99
 
40
- def self.define_evaluation_rules
41
- default_action = Proc.new { evaluate(child_nodes[0]) }
42
- self.actions = Hash.new { |hash, key| default_action }
43
- yield
44
- check_definitions
45
- end
46
-
47
100
  def self.method_missing(method_name, &blk)
48
101
  if method_name.to_s =~ /^for_*/
49
102
  rule_name = method_name.to_s[4..-1]
50
- self.for_rule_named(rule_name, &blk)
103
+ self.actions << rule_name
104
+ self.send(:define_method, rule_name, &blk)
51
105
  end
52
106
  end
53
107
 
54
108
  def self.check_definitions
55
- non_trivial_productions_with_rules_undefined = self.grammar.productions.select {|production| production.expansion.size != 1}.collect {|production| production.name} - self.actions.keys
109
+ non_trivial_productions_with_rules_undefined = self.grammar.productions.select {|production| production.expansion.size != 1}.collect {|production| production.name} - self.actions
56
110
  raise EvaluatorDefinitionError.new(non_trivial_productions_with_rules_undefined) unless non_trivial_productions_with_rules_undefined.empty?
57
111
  end
58
112
 
59
- def self.for_rule_named(name, &blk)
60
- self.actions[name] = blk
61
- end
62
-
63
113
  end
64
114
 
65
115
  class EvaluatorDefinitionError < StandardError #:nodoc:
@@ -60,6 +60,36 @@ module Dhaka
60
60
  end
61
61
 
62
62
  # This class is subclassed when specifying a grammar. Note that subclasses of this class may not be further subclassed.
63
+ #
64
+ # The following is a grammar specification for simple arithmetic. Familiarity with Yacc helps, but the short version is
65
+ # that precedences for symbols are specified in ascending order of binding strength, with equal-strength symbols
66
+ # on the same level. Production rules are specified for each symbol by specifying the name of the production (used when
67
+ # encoding the Evaluator) and the expansion for that particular production. For example, the production named
68
+ # +addition+ expands the symbol <tt>'E'</tt> to the list of symbols <tt>['E', '+', 'E']</tt>.
69
+ #
70
+ # class ArithmeticPrecedenceGrammar < Dhaka::Grammar
71
+ # precedences do
72
+ # left ['+', '-']
73
+ # left ['*', '/']
74
+ # nonassoc ['^']
75
+ # end
76
+ #
77
+ # for_symbol(Dhaka::START_SYMBOL_NAME) do
78
+ # expression ['E']
79
+ # end
80
+ #
81
+ # for_symbol('E') do
82
+ # addition ['E', '+', 'E']
83
+ # subtraction ['E', '-', 'E']
84
+ # multiplication ['E', '*', 'E']
85
+ # division ['E', '/', 'E']
86
+ # power ['E', '^', 'E']
87
+ # literal ['n']
88
+ # parenthetized_expression ['(', 'E', ')']
89
+ # negated_expression ['-', 'E'], :prec => '*'
90
+ # end
91
+ # end
92
+ #
63
93
  class Grammar
64
94
 
65
95
  # Used for defining the productions for the symbol with name +symbol+. The block +blk+ is
@@ -13,7 +13,7 @@ module Dhaka
13
13
  def terminal
14
14
  !non_terminal
15
15
  end
16
- def to_s
16
+ def to_s #:nodoc:
17
17
  name
18
18
  end
19
19
  def <=> other
@@ -23,7 +23,7 @@ module Dhaka
23
23
  @precedence
24
24
  end
25
25
 
26
- def to_s
26
+ def to_s #:nodoc:
27
27
  "#{@name} #{@symbol} ::= #{@expansion.join(' ')}"
28
28
  end
29
29
 
@@ -36,9 +36,7 @@ module Dhaka
36
36
  node_stack << composite_node
37
37
 
38
38
  unless composite_node.head_node?
39
- [production.symbol.name, current_token.grammar_symbol.name]
40
- else
41
- []
39
+ @symbol_queue += [@current_token.symbol_name, production.symbol.name]
42
40
  end
43
41
  end
44
42
  end
@@ -2,22 +2,24 @@ module Dhaka
2
2
  # Returned on successful parsing of the input token stream.
3
3
  class ParseSuccessResult
4
4
  # Contains the parse result.
5
- attr_accessor :syntax_tree
6
- def initialize(syntax_tree) #:nodoc:
7
- @syntax_tree = syntax_tree
5
+ attr_accessor :parse_tree
6
+ def initialize(parse_tree) #:nodoc:
7
+ @parse_tree = parse_tree
8
8
  end
9
9
  # This is false.
10
10
  def has_error?
11
11
  false
12
12
  end
13
+ # Deprecated. Use the +parse_tree+ accessor.
14
+ alias syntax_tree parse_tree
13
15
  end
14
16
 
15
17
  # Returned on unsuccessful parsing of the input token stream.
16
18
  class ParseErrorResult
17
- # The index of the token that caused the parse error.
18
- attr_reader :bad_token_index
19
- def initialize(bad_token_index) #:nodoc:
20
- @bad_token_index = bad_token_index
19
+ # The token that caused the parse error.
20
+ attr_reader :unexpected_token
21
+ def initialize(unexpected_token) #:nodoc:
22
+ @unexpected_token = unexpected_token
21
23
  end
22
24
  # This is true.
23
25
  def has_error?
@@ -7,7 +7,10 @@ module Dhaka
7
7
  @child_nodes = []
8
8
  end
9
9
  def linearize #:nodoc:
10
- child_nodes.collect {|child_node| child_node.linearize}.flatten + [production.name]
10
+ child_nodes.collect {|child_node| child_node.linearize}.flatten + [self]
11
+ end
12
+ def tokens
13
+ child_nodes.collect{|child_node| child_node.tokens}.flatten
11
14
  end
12
15
  def to_s #:nodoc:
13
16
  "CompositeNode: #{production.symbol} --> [#{child_nodes.join(", ")}]"
@@ -33,7 +36,7 @@ module Dhaka
33
36
  def dot_name #:nodoc:
34
37
  "Node#{object_id}"
35
38
  end
36
-
39
+
37
40
  end
38
41
 
39
42
  # These are leaf nodes of syntax trees. They contain tokens.
@@ -45,6 +48,9 @@ module Dhaka
45
48
  def linearize #:nodoc:
46
49
  []
47
50
  end
51
+ def tokens
52
+ [token]
53
+ end
48
54
  def to_s #:nodoc:
49
55
  "LeafNode: #{token}"
50
56
  end
@@ -60,5 +66,6 @@ module Dhaka
60
66
  def dot_name #:nodoc:
61
67
  "Node#{object_id}"
62
68
  end
69
+
63
70
  end
64
71
  end
@@ -3,6 +3,13 @@ require 'set'
3
3
  require 'logger'
4
4
 
5
5
  module Dhaka
6
+ # The parser generator. To generate a parser from a grammar specification +ArithmeticPrecedenceGrammar+, one would
7
+ # write:
8
+ # parser = Dhaka::Parser.new(ArithmeticPrecedenceGrammar)
9
+ #
10
+ # To compile this parser to Ruby source as +ArithmeticPrecedenceParser+:
11
+ # parser.compile_to_ruby_source_as(:ArithmeticPrecedenceParser)
12
+ # which returns a string of Ruby code.
6
13
  class Parser
7
14
  include ParserMethods
8
15
  attr_reader :grammar, :start_state
@@ -6,36 +6,29 @@ module Dhaka
6
6
  @node_stack = []
7
7
  @state_stack = [start_state]
8
8
  @token_stream = token_stream
9
- @current_token_index = 0
9
+ @symbol_queue = []
10
10
  end
11
11
 
12
12
  def run
13
- for_each_token_with_end do
14
- error = execute_action current_token.grammar_symbol.name
13
+ token_stream.each do |token|
14
+ @current_token = token
15
+ @symbol_queue << @current_token.symbol_name
16
+ error = execute_actions
15
17
  return error if error
16
- node_stack << ParseTreeLeafNode.new(current_token)
17
- @current_token_index += 1
18
+ node_stack << ParseTreeLeafNode.new(@current_token)
18
19
  end
19
20
  ParseSuccessResult.new(node_stack[0])
20
21
  end
21
22
 
22
23
  private
23
24
 
24
- attr_reader :state_stack, :token_stream, :node_stack, :current_token
25
+ attr_reader :state_stack, :token_stream, :node_stack
25
26
 
26
- def for_each_token_with_end
27
- token_stream.each do |@current_token|
28
- yield
29
- end
30
- @current_token = Token.new(@grammar.end_symbol, nil)
31
- yield
32
- end
33
-
34
- def execute_action symbol_name
35
- action = state_stack[-1].actions[symbol_name]
36
- return ParseErrorResult.new(@current_token_index) unless action
37
- self.instance_eval(&action.action_code).each do |symbol_name|
38
- execute_action symbol_name
27
+ def execute_actions
28
+ while symbol_name = @symbol_queue.pop
29
+ action = state_stack[-1].actions[symbol_name]
30
+ return ParseErrorResult.new(@current_token) unless action
31
+ self.instance_eval(&action.action_code)
39
32
  end
40
33
  nil
41
34
  end
@@ -1,17 +1,20 @@
1
1
  module Dhaka
2
2
  # Represents a portion of the input character stream that is mapped by the tokenizer
3
- # to a symbol in the grammar.
3
+ # to a symbol in the grammar. The attribute +input_position+ contains the start index position of the original
4
+ # string input that this token came from. It can be used to report errors by indicating the specific portion
5
+ # of the input where the error occurred.
4
6
  class Token
5
- attr_accessor :grammar_symbol, :value
6
- def initialize(grammar_symbol, value)
7
- @grammar_symbol = grammar_symbol
7
+ attr_accessor :symbol_name, :value, :input_position
8
+ def initialize(symbol_name, value, input_position)
9
+ @symbol_name = symbol_name
8
10
  @value = value
11
+ @input_position = input_position
9
12
  end
10
- def to_s
11
- "#{@grammar_symbol.name}"
13
+ def to_s #:nodoc:
14
+ "#{symbol_name}"
12
15
  end
13
16
  def == other
14
- (grammar_symbol == other.grammar_symbol) && (value == other.value)
17
+ (symbol_name == other.symbol_name) && (value == other.value)
15
18
  end
16
19
  end
17
20
  end
@@ -2,17 +2,35 @@ module Dhaka
2
2
 
3
3
  # Reserved constant used to identify the idle state of the tokenizer.
4
4
  TOKENIZER_IDLE_STATE = :idle_state
5
-
6
- # Raised when the tokenizer encounters a character that has no corresponding action in
7
- # its current state.
8
- class UnrecognizedInputCharacterException < StandardError
9
- attr_reader :input, :char_index
10
- def initialize(input, char_index)
11
- @input = input
12
- @char_index = char_index
5
+
6
+ # Returned on successful tokenizing of the input stream. Supports iteration by including Enumerable, so it can
7
+ # be passed in directly to the parser.
8
+ class TokenizerSuccessResult
9
+ include Enumerable
10
+ def initialize(tokens)
11
+ @tokens = tokens
12
+ end
13
+ # Returns false.
14
+ def has_error?
15
+ false
16
+ end
17
+ def each
18
+ @tokens.each do |token|
19
+ yield token
20
+ end
21
+ end
22
+ end
23
+
24
+ # Returned when tokenizing fails due to an unexpected character in the input stream.
25
+ class TokenizerErrorResult
26
+ # The index of the character that caused the error.
27
+ attr_reader :unexpected_char_index
28
+ def initialize(unexpected_char_index)
29
+ @unexpected_char_index = unexpected_char_index
13
30
  end
14
- def to_s
15
- "Unrecognized character '#{input[char_index].chr}' encountered while tokenizing:\n #{input} at index #{char_index}"
31
+ # Returns true.
32
+ def has_error?
33
+ true
16
34
  end
17
35
  end
18
36
 
@@ -45,7 +63,52 @@ module Dhaka
45
63
  #
46
64
  # Tokenizers are state machines that are specified pretty much by hand. Each state of a tokenizer is identified
47
65
  # by a Ruby symbol. The constant Dhaka::TOKENIZER_IDLE_STATE is reserved for the idle state of the tokenizer (the one
48
- # that it starts in).
66
+ # that it starts in).
67
+ #
68
+ # The following is a tokenizer for arithmetic expressions with integer terms. The tokenizer starts in the idle state
69
+ # creating single-character tokens for all characters except digits and whitespace. It shifts to
70
+ # <tt>:get_integer_literal</tt> when it encounters a digit character and creates a token on the stack on which it
71
+ # accumulates the value of the literal. When it again encounters a non-digit character, it shifts back to idle.
72
+ # Whitespace is treated as a delimiter, but not shifted as a token.
73
+ #
74
+ # class ArithmeticPrecedenceTokenizer < Dhaka::Tokenizer
75
+ #
76
+ # digits = ('0'..'9').to_a
77
+ # parenths = ['(', ')']
78
+ # operators = ['-', '+', '/', '*', '^']
79
+ # functions = ['h', 'l']
80
+ # arg_separator = [',']
81
+ # whitespace = [' ']
82
+ #
83
+ # all_characters = digits + parenths + operators + functions + arg_separator + whitespace
84
+ #
85
+ # for_state Dhaka::TOKENIZER_IDLE_STATE do
86
+ # for_characters(all_characters - (digits + whitespace)) do
87
+ # create_token(curr_char, nil)
88
+ # advance
89
+ # end
90
+ # for_characters digits do
91
+ # create_token('n', '')
92
+ # switch_to :get_integer_literal
93
+ # end
94
+ # for_character whitespace do
95
+ # advance
96
+ # end
97
+ # end
98
+ #
99
+ # for_state :get_integer_literal do
100
+ # for_characters all_characters - digits do
101
+ # switch_to Dhaka::TOKENIZER_IDLE_STATE
102
+ # end
103
+ # for_characters digits do
104
+ # curr_token.value += curr_char
105
+ # advance
106
+ # end
107
+ # end
108
+ #
109
+ # end
110
+
111
+
49
112
  class Tokenizer
50
113
 
51
114
  # Define the action for the state named +state_name+.
@@ -53,13 +116,11 @@ module Dhaka
53
116
  states[state_name].instance_eval(&blk)
54
117
  end
55
118
 
56
- # Tokenizes a string +input+ and returns an array of Token-s.
119
+ # Tokenizes a string +input+ and returns a TokenizerErrorResult on failure or a TokenizerSuccessResult on success.
57
120
  def self.tokenize(input)
58
121
  self.new(input).run
59
122
  end
60
123
 
61
- # A slot that can be used to accumulate characters when processing multi-character tokens.
62
- attr_accessor :accumulator
63
124
  # The tokens shifted so far.
64
125
  attr_reader :tokens
65
126
 
@@ -80,6 +141,17 @@ module Dhaka
80
141
  @curr_char_index += 1
81
142
  end
82
143
 
144
+ # The token currently on top of the stack.
145
+ def curr_token
146
+ tokens[-1]
147
+ end
148
+
149
+ # Push a new token on to the stack with symbol corresponding to +symbol_name+ and a value of +value+.
150
+ def create_token(symbol_name, value)
151
+ new_token = Dhaka::Token.new(symbol_name, value, @curr_char_index)
152
+ tokens << new_token
153
+ end
154
+
83
155
  # Change the active state of the tokenizer to the state identified by the symbol +state_name+.
84
156
  def switch_to state_name
85
157
  @current_state = self.class.states[state_name]
@@ -88,16 +160,17 @@ module Dhaka
88
160
  def run #:nodoc:
89
161
  while curr_char
90
162
  blk = @current_state.actions[curr_char]
91
- raise UnrecognizedInputCharacterException.new(@input, @curr_char_index) unless blk
163
+ return TokenizerErrorResult.new(@curr_char_index) unless blk
92
164
  instance_eval(&blk)
93
165
  end
94
- tokens
166
+ tokens << Dhaka::Token.new(Dhaka::END_SYMBOL_NAME, nil, nil)
167
+ return TokenizerSuccessResult.new(tokens)
95
168
  end
96
169
 
97
170
  private
98
171
  def self.inherited(tokenizer)
99
172
  class << tokenizer
100
- attr_accessor :states
173
+ attr_accessor :states, :grammar
101
174
  end
102
175
  tokenizer.states = Hash.new {|hash, key| hash[key] = TokenizerState.new}
103
176
  end