dhaka 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/dhaka.rb +44 -0
- data/lib/evaluator/evaluator.rb +70 -0
- data/lib/grammar/closure_hash.rb +13 -0
- data/lib/grammar/grammar.rb +129 -0
- data/lib/grammar/grammar_symbol.rb +19 -0
- data/lib/grammar/production.rb +14 -0
- data/lib/parser/action.rb +51 -0
- data/lib/parser/channel.rb +51 -0
- data/lib/parser/compiled_parser.rb +35 -0
- data/lib/parser/item.rb +37 -0
- data/lib/parser/parse_result.rb +26 -0
- data/lib/parser/parse_tree.rb +34 -0
- data/lib/parser/parser.rb +125 -0
- data/lib/parser/parser_methods.rb +10 -0
- data/lib/parser/parser_run.rb +35 -0
- data/lib/parser/parser_state.rb +66 -0
- data/lib/parser/token.rb +15 -0
- data/lib/tokenizer/tokenizer.rb +88 -0
- data/test/all_tests.rb +11 -0
- data/test/arithmetic_evaluator.rb +70 -0
- data/test/arithmetic_evaluator_test.rb +55 -0
- data/test/arithmetic_grammar.rb +38 -0
- data/test/arithmetic_grammar_test.rb +11 -0
- data/test/arithmetic_test_methods.rb +11 -0
- data/test/arithmetic_tokenizer.rb +43 -0
- data/test/arithmetic_tokenizer_test.rb +32 -0
- data/test/bracket_grammar.rb +25 -0
- data/test/bracket_tokenizer.rb +17 -0
- data/test/brackets_test.rb +20 -0
- data/test/compiled_arithmetic_parser.rb +252 -0
- data/test/compiled_parser_test.rb +71 -0
- data/test/evaluator_test.rb +8 -0
- data/test/grammar_test.rb +70 -0
- data/test/incomplete_arithmetic_evaluator.rb +60 -0
- data/test/lalr_but_not_slr_grammar.rb +17 -0
- data/test/malformed_grammar.rb +9 -0
- data/test/malformed_grammar_test.rb +9 -0
- data/test/nullable_grammar.rb +18 -0
- data/test/parser_test.rb +168 -0
- data/test/rr_conflict_grammar.rb +23 -0
- data/test/simple_grammar.rb +24 -0
- data/test/sr_conflict_grammar.rb +16 -0
- metadata +87 -0
@@ -0,0 +1,26 @@
|
|
1
|
+
module Dhaka
  # Returned by a parser run that consumed the entire token stream.
  # Wraps the root node of the resulting syntax tree.
  class ParseSuccessResult
    attr_accessor :syntax_tree

    # syntax_tree:: the root parse-tree node produced by the run
    def initialize(syntax_tree)
      self.syntax_tree = syntax_tree
    end

    # A successful result never reports an error.
    def has_error?
      false
    end
  end

  # Returned when the parser encounters a token for which the current
  # state has no action. Records where in the stream parsing failed.
  class ParseErrorResult
    attr_reader :bad_token_index

    # bad_token_index:: index of the offending token in the token stream
    def initialize(bad_token_index)
      @bad_token_index = bad_token_index
    end

    # An error result always reports an error.
    def has_error?
      true
    end
  end
end
|
23
|
+
|
24
|
+
|
25
|
+
|
26
|
+
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Dhaka
  # Interior node of a parse tree: records one production application,
  # with children for each element of the production's right-hand side.
  class ParseTreeCompositeNode
    attr_reader :production, :child_nodes

    def initialize(production)
      @production = production
      @child_nodes = []
    end

    # Post-order flattening: all descendants' production names first,
    # followed by this node's own production name.
    def linearize
      child_nodes.map { |child| child.linearize }.flatten + [production.name]
    end

    def to_s
      "CompositeNode: #{production.symbol} --> [#{child_nodes.join(", ")}]"
    end

    # True only for the node built from the grammar's start production.
    def head_node?
      production.symbol.name == START_SYMBOL_NAME
    end
  end

  # Leaf of a parse tree: wraps a single token from the input stream.
  class ParseTreeLeafNode
    attr_reader :token

    def initialize(token)
      @token = token
    end

    # Leaves contribute nothing to the linearized production sequence.
    def linearize
      []
    end

    def to_s
      "LeafNode: #{token}"
    end

    # A leaf can never be the head of the tree.
    def head_node?
      false
    end
  end
end
|
@@ -0,0 +1,125 @@
|
|
1
|
+
#!/usr/bin/env ruby
require 'set'
module Dhaka
  # Builds an LALR(1) parse table from a Grammar: constructs the item-set
  # automaton lazily, propagates lookaheads through channels, then fills
  # in shift and reduce actions for every state.
  class Parser
    include ParserMethods
    attr_reader :grammar, :start_state

    def initialize(grammar)
      # state -> {grammar symbol -> destination state}
      @transitions = Hash.new {|hash, state| hash[state] = {}}
      @grammar = grammar
      @channels = []
      # Memoizing hash: looking up a kernel item-set lazily creates the
      # corresponding state and, recursively, every state reachable from it.
      @states = Hash.new do |hash, kernel|
        channels, closure = @grammar.closure(kernel)
        @channels += channels.to_a
        new_state = ParserState.new(self, closure)
        # Registered before recursing so cycles in the automaton terminate.
        hash[kernel] = new_state
        new_state.transition_items.each do |symbol, items|
          destination_kernel = ItemSet.new(items.collect{|item| item.next_item})
          destination_state = hash[destination_kernel]
          items.each { |item| @channels << @grammar.passive_channel(item, destination_state.items[item.next_item]) }
          @transitions[new_state][symbol] = destination_state
        end
        new_state
      end
      initialize_states
    end

    # Seeds the automaton from the grammar's start productions (with the
    # end symbol as lookahead), then computes lookaheads and actions.
    def initialize_states
      start_productions = @grammar.productions_for_symbol(@grammar.start_symbol)
      raise NoStartProductionsError.new(@grammar) if start_productions.empty?
      start_items = ItemSet.new(start_productions.collect {|production| Item.new(production, 0)})
      start_items.each {|start_item| start_item.lookaheadset << @grammar.end_symbol}
      @start_state = @states[start_items]
      pump_channels
      generate_shift_actions
      generate_reduce_actions
    end

    # Renders the whole parse table as Ruby source defining a
    # CompiledParser subclass named +parser_class_name+.
    def compile_to_ruby_source_as parser_class_name
      result = "class #{parser_class_name} < Dhaka::CompiledParser\n\n"
      result << " self.grammar = #{@grammar.name}\n\n"
      result << " start_with #{start_state.id}\n\n"
      states.each do |state|
        result << "#{state.compile_to_ruby_source}\n\n"
      end
      result << "end"
      result
    end

    # Graphviz "dot" rendering of the automaton: one box per state, one
    # labelled edge per transition.
    def to_dot
      result = ["digraph x {", "node [fontsize=\"10\" shape=box size=\"5\"]"]
      result += states.collect { |state| state.to_dot }
      states.each { |state|
        @transitions[state].each { |symbol, dest_state|
          result << "#{state.dot_name} -> #{dest_state.dot_name} [label=\"#{symbol.name}\"]"
        }
      }
      result << ['}']
      result.join("\n")
    end
    def states
      @states.values
    end

    # Installs a shift action for every outgoing transition of every
    # state, keyed by the transition symbol's name.
    def generate_shift_actions
      @states.values.each do |state|
        @transitions[state].keys.each { |symbol|
          state.actions[symbol.name] = ShiftAction.new(@transitions[state][symbol])
        }
      end
    end

    # Installs a reduce action for every completed item (dot at the end).
    def generate_reduce_actions
      @states.values.each do |state|
        state.items.values.select{ |item| !item.next_symbol }.each do |item|
          create_reduction_actions_for_item_and_state item, state
        end
      end
    end

    # Adds a reduction on each lookahead of +item+. Any lookahead that
    # already has an action is a shift/reduce or reduce/reduce conflict.
    def create_reduction_actions_for_item_and_state item, state
      item.lookaheadset.each do |lookahead|
        existing_action = state.actions[lookahead.name]
        new_action = ReduceAction.new(item.production)
        if existing_action
          raise ParserConflictError.new(state, existing_action, new_action)
        else
          state.actions[lookahead.name] = new_action
        end
      end
    end


    # Pumps every lookahead channel until a full pass changes nothing.
    # NOTE(review): `pumped || channel.pump` short-circuits the rest of a
    # pass once one channel pumps; the fixpoint is still reached, just
    # over more while-iterations than `channel.pump || pumped` would take.
    def pump_channels
      while true
        break unless @channels.inject(false) do |pumped, channel|
          pumped || channel.pump
        end
      end
    end

  end


  # Raised when two actions compete for the same state/lookahead slot.
  class ParserConflictError < StandardError
    def initialize(state, existing_action, new_action)
      @state = state
      @existing_action = existing_action
      @new_action = new_action
    end
    def to_s
      "Conflict in state #{@state}\n Existing: #{@existing_action}\n New: #{@new_action}"
    end
  end

  # Raised when the grammar defines no productions for its start symbol.
  class NoStartProductionsError < StandardError
    def initialize(grammar)
      @grammar = grammar
    end
    def to_s
      "No start productions defined for #{@grammar.name}"
    end
  end
end
|
125
|
+
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Dhaka
  # Drives one parse of a token stream against a parse table: maintains
  # the state stack and the growing forest of parse-tree nodes.
  class ParserRun
    attr_reader :state_stack, :token_stream, :node_stack
    def initialize(grammar, start_state, token_stream)
      @grammar = grammar
      @node_stack = []
      @state_stack = [start_state]
      @token_stream = token_stream
      @current_token_index = 0
    end
    # Token under the cursor, or nil once the stream is exhausted.
    def current_token
      @token_stream[@current_token_index]
    end
    # Shifts: pushes a leaf node for the current token and moves the cursor.
    def advance
      node_stack << ParseTreeLeafNode.new(current_token)
      @current_token_index += 1
    end
    # Main loop: executes the table action for each token, returning a
    # ParseErrorResult on the first failure, otherwise a success result
    # wrapping node_stack[0].
    # NOTE(review): assumes the stream's final token drives the accept
    # reduction so the stack ends with the head node — confirm against
    # how callers terminate the token stream (e.g. an end-of-input token).
    def run
      while current_token
        error = execute_action current_token.grammar_symbol.name
        return error if error
        self.advance
      end
      ParseSuccessResult.new(node_stack[0])
    end
    # Looks up and runs the action for +symbol_name+ in the topmost
    # state. The action block runs in the context of this run (it can
    # manipulate state_stack/node_stack) and returns a list of symbol
    # names to process next (e.g. goto symbols after a reduction), which
    # are handled recursively. Returns nil on success, an error result
    # when no action exists.
    def execute_action symbol_name
      action = state_stack[-1].actions[symbol_name]
      return ParseErrorResult.new(@current_token_index) unless action
      self.instance_eval(&action.action_code).each do |symbol_name|
        execute_action symbol_name
      end
      nil
    end
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
#!/usr/bin/env ruby
require 'set'
module Dhaka
  # One state of the LALR automaton: a set of items plus the parse
  # actions (shift/reduce) keyed by grammar-symbol name.
  class ParserState

    attr_accessor :items, :actions, :id

    # Monotonically increasing id shared by all states.
    # NOTE(review): a class variable is shared across subclasses;
    # acceptable while ParserState is not subclassed.
    @@state_id = 0

    # Hands out the next unique auto-generated state id.
    def self.next_state_id
      result = @@state_id
      @@state_id += 1
      result
    end

    # parser:: the owning parser (receiver for for_symbol DSL blocks)
    # items::  hash of items making up this state's closure
    # id::     explicit id (used by compiled parsers); auto-assigned when nil
    def initialize(parser, items, id=nil)
      @parser = parser
      @items = items
      @actions = {}
      @id = id || ParserState.next_state_id
    end

    # Groups this state's items by the symbol following the dot; items
    # whose dot is at the end (no next symbol) are omitted.
    def transition_items
      result = Hash.new {|h, k| h[k] = ItemSet.new()}
      @items.values.each do |item|
        (result[item.next_symbol] << item) if item.next_symbol
      end
      result
    end

    def dot_name
      self.to_s
    end

    # Graphviz node declaration for this state, labelled with its items.
    def to_dot
      label = self.items.values.join('\n')
      "#{dot_name} [label=\"#{label}\"]"
    end

    # Emits the at_state/for_symbol DSL source consumed by CompiledParser.
    def compile_to_ruby_source
      result = " at_state(#{@id}) {\n"
      actions.each do |symbol_name, action|
        result << " for_symbol('#{symbol_name}') { #{action.compile_to_ruby_source} }\n"
      end
      result << " }"
      result
    end

    # DSL hook: registers the action produced by evaluating +blk+ in the
    # context of the owning parser.
    def for_symbol symbol_name, &blk
      actions[symbol_name] = @parser.instance_eval(&blk)
    end

    def to_s
      "State#{id}"
    end

  end

  # A Set of parser items usable as a Hash key: equality is set equality
  # and the hash value is order-independent.
  class ItemSet < Set
    def hash
      # XOR-fold the member hashes. Seeding inject with 0 keeps the value
      # identical for non-empty sets (0 ^ h == h) and fixes the empty-set
      # case, where a seedless inject returned nil — not a valid value
      # for Object#hash, and a TypeError when used as a Hash key.
      self.collect{|item| item.hash}.inject(0){|result, hashcode| result ^ hashcode}
    end
    def eql? other
      self == other
    end
  end
end
|
data/lib/parser/token.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
module Dhaka
  # A single unit of tokenizer output: pairs a grammar symbol with the
  # literal value read from the input.
  class Token
    attr_accessor :grammar_symbol, :value

    def initialize(grammar_symbol, value)
      @grammar_symbol = grammar_symbol
      @value = value
    end

    # Tokens render as the name of their grammar symbol.
    def to_s
      grammar_symbol.name.to_s
    end

    # Two tokens are equal when both the symbol and the value match.
    def == other
      grammar_symbol == other.grammar_symbol && value == other.value
    end
  end
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
module Dhaka
  # Raised when the tokenizer reaches a character for which the current
  # state has no registered action.
  class UnrecognizedInputCharacterException < StandardError
    attr_reader :input, :char_index
    def initialize(input, char_index)
      @input = input
      @char_index = char_index
    end
    def to_s
      "Unrecognized character #{input[char_index].chr} encountered while tokenizing:\n #{input}"
    end
  end

  # One state of the tokenizer's state machine: maps characters to the
  # blocks to execute when that character is seen.
  class TokenizerState
    attr_reader :actions

    def initialize
      @actions = {}
    end

    # Registers +blk+ for every character in the +characters+ collection.
    def for_characters(characters, &blk)
      characters.each do |character|
        actions[character] = blk
      end
    end

    # Registers +blk+ for a single character.
    # NOTE(review): keys by character[0] — a Fixnum char code on Ruby
    # 1.8, a one-character String on 1.9+. TokenizerRun#run looks up by
    # String (via String#chr), so the two registration styles only agree
    # on 1.9+ — confirm the target Ruby version.
    def for_character(character, &blk)
      actions[character[0]] = blk
    end

    def to_s
      actions.inspect
    end

  end

  # Class-level DSL for declaring character-driven tokenizers; subclasses
  # declare states with for_state and tokenize input with tokenize.
  class Tokenizer

    # Gives each subclass its own states hash, auto-creating a
    # TokenizerState the first time a state name is referenced.
    def self.inherited(tokenizer)
      class << tokenizer
        attr_accessor :states
      end
      tokenizer.states = Hash.new {|hash, key| hash[key] = TokenizerState.new}
    end

    # DSL entry point: evaluates +blk+ in the context of the named state.
    def self.for_state(state_name, &blk)
      states[state_name].instance_eval(&blk)
    end

    # Tokenizes +input+, returning the accumulated token array.
    def self.tokenize(input)
      TokenizerRun.new(self, input).run
    end
  end

  # One pass of a tokenizer class over a single input string.
  class TokenizerRun

    attr_accessor :accumulator
    attr_reader :tokens
    def initialize(tokenizer, input)
      @tokenizer = tokenizer
      @input = input
      @current_state = tokenizer.states[:idle_state] # every run starts in :idle_state
      @curr_char_index = 0
      @tokens = []
    end

    # Drives the machine: runs the current state's action for each
    # character in the context of this run (actions call advance /
    # switch_to and append to tokens themselves). Raises when no action
    # matches; returns the token array when input is exhausted.
    def run
      while curr_char
        blk = @current_state.actions[curr_char]
        raise UnrecognizedInputCharacterException.new(@input, @curr_char_index) unless blk
        instance_eval(&blk)
      end
      tokens
    end

    # Current character as a one-character String, or nil at end of input.
    def curr_char
      @input[@curr_char_index] and @input[@curr_char_index].chr
    end

    # Moves the cursor to the next input character.
    def advance
      @curr_char_index += 1
    end

    # Switches the machine into the named state.
    def switch_to state_name
      @current_state = @tokenizer.states[state_name]
    end

  end
end
|
data/test/all_tests.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Suite runner: requiring each test file registers its Test::Unit test
# cases, which test/unit autoruns at process exit.
require 'test/unit'

require 'grammar_test'
require 'parser_test'
require 'arithmetic_evaluator_test'
require 'compiled_parser_test'
require 'evaluator_test'
require 'arithmetic_tokenizer_test'
require 'malformed_grammar_test'
require 'brackets_test'
@@ -0,0 +1,70 @@
|
|
1
|
+
require File.dirname(__FILE__)+'/../lib/dhaka'
require 'arithmetic_grammar'

# Evaluator for ArithmeticGrammar parse trees: computes numeric results
# and dispatches function calls to the min/max callables supplied at
# construction time.
class ArithmeticEvaluator < Dhaka::Evaluator

  self.grammar = ArithmeticGrammar

  # Each for_* block corresponds to a grammar production; child_nodes
  # holds the already-evaluated results of its right-hand-side symbols.
  define_evaluation_rules do

    for_subtraction do
      child_nodes[0] - child_nodes[2]
    end

    for_addition do
      child_nodes[0] + child_nodes[2]
    end

    # to_f so integer operands divide without truncating.
    for_division do
      child_nodes[0].to_f/child_nodes[2]
    end

    for_multiplication do
      child_nodes[0] * child_nodes[2]
    end

    # Literal leaves evaluate to the token's value.
    for_getting_literals do
      child_nodes[0].token.value
    end

    for_start_production do
      child_nodes[0]
    end

    # (expr) evaluates to the inner expression (index 1 skips the paren).
    for_unpacking_parenthetized_expression do
      child_nodes[1]
    end

    for_empty_args do
      []
    end

    # Applies an evaluated function to its evaluated argument list.
    for_evaluating_function do
      child_nodes[0].call child_nodes[2]
    end

    for_concatenating_args do
      [child_nodes[0]]+child_nodes[2]
    end

    for_single_args do
      [child_nodes[0]]
    end

    for_min_function do
      @min_function
    end

    for_max_function do
      @max_function
    end

  end

  # min_function/max_function:: callables invoked by the function-call
  # production (e.g. lambdas taking the evaluated argument array).
  def initialize(syntax_tree, min_function, max_function)
    @min_function = min_function
    @max_function = max_function
    super(syntax_tree)
  end

end
|