dhaka 2.0.1 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +64 -0
- data/lib/dhaka.rb +12 -0
- data/lib/dot/dot.rb +29 -0
- data/lib/evaluator/evaluator.rb +35 -26
- data/lib/grammar/grammar.rb +42 -17
- data/lib/grammar/grammar_symbol.rb +4 -3
- data/lib/grammar/production.rb +9 -3
- data/lib/lexer/compiled_lexer.rb +46 -0
- data/lib/lexer/dfa.rb +71 -0
- data/lib/lexer/lexeme.rb +33 -0
- data/lib/lexer/lexer.rb +61 -0
- data/lib/lexer/lexer_run.rb +66 -0
- data/lib/lexer/regex_grammar.rb +368 -0
- data/lib/lexer/regex_parser.rb +1888 -0
- data/lib/lexer/regex_tokenizer.rb +14 -0
- data/lib/lexer/specification.rb +69 -0
- data/lib/lexer/state.rb +45 -0
- data/lib/lexer/state_machine.rb +37 -0
- data/lib/parser/action.rb +3 -3
- data/lib/parser/compiled_parser.rb +11 -3
- data/lib/parser/parse_result.rb +3 -5
- data/lib/parser/parse_tree.rb +6 -17
- data/lib/parser/parser.rb +15 -14
- data/lib/parser/parser_run.rb +4 -2
- data/lib/parser/parser_state.rb +16 -8
- data/lib/tokenizer/tokenizer.rb +5 -3
- data/test/arithmetic_precedence/arithmetic_precedence_lexer_specification.rb +23 -0
- data/test/arithmetic_precedence/arithmetic_precedence_parser_test.rb +4 -2
- data/test/chittagong/chittagong_driver.rb +12 -13
- data/test/chittagong/chittagong_driver_test.rb +18 -11
- data/test/chittagong/chittagong_evaluator.rb +7 -16
- data/test/chittagong/chittagong_evaluator_test.rb +7 -4
- data/test/chittagong/chittagong_grammar.rb +0 -6
- data/test/chittagong/chittagong_lexer.rb +109 -0
- data/test/chittagong/chittagong_lexer_specification.rb +39 -0
- data/test/chittagong/{chittagong_tokenizer_test.rb → chittagong_lexer_test.rb} +12 -6
- data/test/chittagong/chittagong_parser.rb +879 -0
- data/test/chittagong/chittagong_parser_test.rb +8 -10
- data/test/chittagong/chittagong_test.rb +17 -13
- data/test/compiled_parser_test.rb +7 -2
- data/test/evaluator_test.rb +0 -1
- data/test/grammar_test.rb +19 -1
- data/test/lexer_test.rb +215 -0
- data/test/parse_result_test.rb +8 -8
- data/test/parser_state_test.rb +0 -12
- metadata +21 -5
- data/test/arithmetic_precedence/arithmetic_precedence_tokenizer.rb +0 -39
- data/test/chittagong/chittagong_tokenizer.rb +0 -88
@@ -0,0 +1,69 @@
|
|
1
|
+
module Dhaka
  # Abstract base class for lexer specifications.
  #
  # Use this to specify the transformations that will be performed when the lexer recognizes a given pattern. Actions are listed in
  # descending order of priority. For example in the following lexer specification:
  #
  #   class LexerSpec < Dhaka::LexerSpecification
  #     for_pattern 'zz' do
  #       "recognized two zs"
  #     end
  #
  #     for_pattern '\w(\w|\d)*' do
  #       "recognized word token #{current_lexeme.value}"
  #     end
  #
  #     for_pattern '(\d)+(\.\d+)?' do
  #       "recognized number #{current_lexeme.value}"
  #     end
  #
  #     for_pattern ' +' do
  #       #ignores whitespace
  #     end
  #
  #     for_pattern "\n+" do
  #       "recognized newline"
  #     end
  #   end
  #
  # the pattern 'zz' takes precedence over the pattern immediately below it, so the lexer will announce that it has recognized two
  # 'z's instead of a word token.
  #
  # The patterns are <i>not</i> Ruby regular expressions - a lot of operators featured in Ruby's regular expression engine are not yet supported.
  # See http://dhaka.rubyforge.org/regex_grammar.html for the current syntax.
  class LexerSpecification
    class << self
      # Associates +blk+ as the action to be performed when a lexer recognizes +pattern+. When Lexer#lex is invoked,
      # it creates a LexerRun object that provides the context for +blk+ to be evaluated in. Methods available in this block
      # are LexerRun#current_lexeme and LexerRun#create_token.
      #
      # Each call stores a LexerSpecificationItem under +pattern+ carrying the current priority counter and then
      # increments the counter, so earlier declarations get numerically lower priority values. Declaring the same
      # +pattern+ twice replaces the earlier item. Returns the incremented counter.
      def for_pattern(pattern, &blk)
        items[pattern] = LexerSpecificationItem.new(pattern, priority, blk)
        self.priority += 1
      end

      private

      # Gives every direct or indirect subclass its own +items+ hash and +priority+ counter (both start empty / at 0;
      # nothing is copied from the parent specification).
      def inherited(specification)
        # Keep any other +inherited+ hooks further up the ancestor chain working.
        super
        class << specification
          attr_accessor :items, :priority
        end
        specification.items    = {}
        specification.priority = 0
      end
    end
  end

  # A single pattern/action pair together with the priority it was declared at.
  # Items compare by +priority+ only, so the item declared first sorts lowest.
  class LexerSpecificationItem #:nodoc:
    include Comparable

    attr_reader :pattern, :action, :priority

    def initialize(pattern, priority, action)
      @pattern  = pattern
      @priority = priority
      @action   = action
    end

    def <=>(other)
      priority <=> other.priority
    end
  end
end
|
69
|
+
|
data/lib/lexer/state.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
module Dhaka
  module LexerSupport
    # A single state of a lexer state machine: a table of outgoing transitions keyed by character, plus the
    # pattern (if any) that is recognized when the machine stops in this state.
    class State
      attr_reader :transitions, :pattern

      # +state_machine+ is the object in whose context blocks passed to #for_characters are evaluated.
      # +pattern+ is the pattern recognized in this state, or nil for a non-accepting state.
      def initialize(state_machine, pattern)
        @state_machine = state_machine
        @pattern       = pattern
        @transitions   = {}
      end

      # Truthy when this state recognizes a pattern. Note that the pattern itself is returned, not +true+.
      def accepting?
        pattern
      end

      # Evaluates +blk+ once in the context of the state machine and registers its result as the destination
      # state for every character in +characters+.
      def for_characters(*characters, &blk)
        dest_state = @state_machine.instance_eval(&blk)
        characters.each do |char|
          transitions[char] = dest_state
        end
      end

      # Marks this state as accepting +pattern+.
      def recognize(pattern)
        @pattern = pattern
      end

      # Returns Ruby source text describing this state: an +at_state+ block identified by this object's
      # +object_id+, containing a +recognize+ call when the state is accepting and one +for_characters+ call per
      # distinct destination state (characters that lead to the same destination are listed together).
      def compile_to_ruby_source
        result = " at_state(#{object_id}) {\n"
        result << " recognize(#{pattern.inspect})\n" if accepting?

        # Group the transition characters by the object_id of the state they lead to.
        keys_by_destination = Hash.new { |hash, state_id| hash[state_id] = [] }
        transitions.each do |char, dest_state|
          keys_by_destination[dest_state.object_id] << char
        end

        keys_by_destination.each do |state_id, chars|
          transition_keys = chars.collect { |char| char.inspect }.join(', ')
          result << " for_characters(#{transition_keys}) { switch_to #{state_id} }\n"
        end

        result << " }"
        result
      end
    end
  end
end
|
45
|
+
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Dhaka
  module LexerSupport
    # Base class that builds a graph of states reachable from a start key.
    #
    # This class calls three methods it does not define itself, so a subclass is expected to supply them:
    # * +new_state_for_key(key)+ - returns the state object for +key+ (it must respond to +transitions+)
    # * +transition_characters(key)+ - returns the characters that have outgoing transitions from +key+
    # * +dest_key_for(key, char)+ - returns the key of the state reached from +key+ on +char+
    class StateMachine
      attr_reader :start_state

      # Builds every state reachable from +start_key+ and remembers the state for +start_key+ as +start_state+.
      def initialize(start_key)
        @states = Hash.new do |hash, key|
          new_state = new_state_for_key(key)
          # Register the state before following its transitions so that a transition leading back to a key
          # that is still being built finds the existing state instead of recursing forever.
          hash[key] = new_state
          transition_characters(key).each do |char|
            dest_key = dest_key_for(key, char)
            new_state.transitions[char] = hash[dest_key]
          end
          new_state
        end
        @start_state = @states[start_key]
      end

      # Returns the result of Dot::Digraph#to_dot for a graph containing a 'Start' node pointing at the start
      # state, one node per state (accepting states are drawn as double circles labelled with their pattern) and
      # one labelled edge per transition.
      def to_dot
        graph = Dot::Digraph.new(:fontsize => 10, :shape => :circle, :size => 5) do |g|
          start = 'Start'
          g.node(start, :label => start)
          g.edge(start, @start_state)
          @states.each_value do |state|
            state_attributes = {}
            state_attributes.merge!(:shape => :doublecircle, :label => state.pattern) if state.accepting?
            g.node(state, state_attributes)
            state.transitions.each do |transition_key, dest_state|
              g.edge(state, dest_state, :label => transition_key)
            end
          end
        end
        graph.to_dot
      end
    end
  end
end
|
data/lib/parser/action.rb
CHANGED
@@ -31,12 +31,12 @@ module Dhaka
|
|
31
31
|
@action_code = Proc.new do
|
32
32
|
composite_node = ParseTreeCompositeNode.new(production)
|
33
33
|
|
34
|
-
production.expansion.each do |symbol|
|
34
|
+
production.expansion.each do |symbol|
|
35
35
|
state_stack.pop
|
36
36
|
composite_node.child_nodes.unshift(node_stack.pop)
|
37
37
|
end
|
38
38
|
|
39
|
-
node_stack << composite_node
|
39
|
+
node_stack << composite_node.instance_eval(&production.action)
|
40
40
|
|
41
41
|
unless composite_node.head_node?
|
42
42
|
@symbol_queue.concat [@current_token.symbol_name, production.symbol.name]
|
@@ -45,7 +45,7 @@ module Dhaka
|
|
45
45
|
end
|
46
46
|
|
47
47
|
def compile_to_ruby_source
|
48
|
-
"reduce_with
|
48
|
+
"reduce_with #{production.name.inspect}"
|
49
49
|
end
|
50
50
|
|
51
51
|
def to_s
|
@@ -1,14 +1,22 @@
|
|
1
1
|
module Dhaka
|
2
|
-
#
|
2
|
+
# Abstract base class of all compiled Parsers. It is only used by generated code.
|
3
3
|
class CompiledParser
|
4
4
|
|
5
5
|
class << self
|
6
6
|
private
|
7
7
|
def inherited(compiled_parser)
|
8
8
|
class << compiled_parser
|
9
|
-
attr_accessor :states, :grammar, :start_state_id
|
9
|
+
attr_accessor :states, :grammar, :start_state_id, :shift_actions, :reduce_actions
|
10
10
|
end
|
11
|
-
compiled_parser.states
|
11
|
+
compiled_parser.states = Hash.new do |hash, state_id|
|
12
|
+
hash[state_id] = ParserState.new(compiled_parser, {}, state_id)
|
13
|
+
end
|
14
|
+
compiled_parser.shift_actions = Hash.new do |hash, state_id|
|
15
|
+
hash[state_id] = ShiftAction.new(compiled_parser.states[state_id])
|
16
|
+
end
|
17
|
+
compiled_parser.reduce_actions = Hash.new do |hash, production_name|
|
18
|
+
hash[production_name] = ReduceAction.new(compiled_parser.grammar.production_named(production_name))
|
19
|
+
end
|
12
20
|
end
|
13
21
|
|
14
22
|
def at_state x, &blk
|
data/lib/parser/parse_result.rb
CHANGED
@@ -16,11 +16,9 @@ module Dhaka
|
|
16
16
|
|
17
17
|
# Returns the dot representation of the parse tree
|
18
18
|
def to_dot
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
result << ['}']
|
23
|
-
result.join("\n")
|
19
|
+
Dot::Digraph.new(:fontsize => 10, :shape => :box, :size => 5) do |g|
|
20
|
+
parse_tree.to_dot(g)
|
21
|
+
end.to_dot
|
24
22
|
end
|
25
23
|
|
26
24
|
# Deprecated. Use the +parse_tree+ accessor.
|
data/lib/parser/parse_tree.rb
CHANGED
@@ -20,25 +20,18 @@ module Dhaka
|
|
20
20
|
end
|
21
21
|
|
22
22
|
# Returns the dot representation of this node.
|
23
|
-
def to_dot
|
24
|
-
|
25
|
-
label = production
|
26
|
-
result << %(#{dot_name} [label="#{label}"])
|
23
|
+
def to_dot graph
|
24
|
+
graph.node(self, :label => production)
|
27
25
|
child_nodes.each do |child|
|
28
|
-
|
29
|
-
|
26
|
+
graph.edge(self, child)
|
27
|
+
child.to_dot(graph)
|
30
28
|
end
|
31
|
-
result.join("\n")
|
32
29
|
end
|
33
30
|
|
34
31
|
def head_node? #:nodoc:
|
35
32
|
production.symbol.name == START_SYMBOL_NAME
|
36
33
|
end
|
37
34
|
|
38
|
-
def dot_name #:nodoc:
|
39
|
-
"Node#{object_id}"
|
40
|
-
end
|
41
|
-
|
42
35
|
end
|
43
36
|
|
44
37
|
# These are leaf nodes of syntax trees. They contain tokens.
|
@@ -62,16 +55,12 @@ module Dhaka
|
|
62
55
|
end
|
63
56
|
|
64
57
|
# Returns the dot representation of this node.
|
65
|
-
def to_dot
|
66
|
-
|
58
|
+
def to_dot(graph)
|
59
|
+
graph.node(self, :label => token)
|
67
60
|
end
|
68
61
|
|
69
62
|
def head_node? #:nodoc:
|
70
63
|
false
|
71
64
|
end
|
72
|
-
|
73
|
-
def dot_name #:nodoc:
|
74
|
-
"Node#{object_id}"
|
75
|
-
end
|
76
65
|
end
|
77
66
|
end
|
data/lib/parser/parser.rb
CHANGED
@@ -14,10 +14,12 @@ module Dhaka
|
|
14
14
|
# and the log level is WARN. Shift-reduce conflicts are reported at WARN and reduce-reduce conflicts
|
15
15
|
# at ERROR. You may pass in your own logger. Logging at DEBUG shows a lot of progress output.
|
16
16
|
def initialize(grammar, logger = nil)
|
17
|
-
@
|
18
|
-
@
|
19
|
-
@
|
20
|
-
@
|
17
|
+
@shift_actions = Hash.new {|hash, state| hash[state] = ShiftAction.new(state)}
|
18
|
+
@reduce_actions = Hash.new {|hash, production| hash[production] = ReduceAction.new(production)}
|
19
|
+
@logger = logger || default_logger
|
20
|
+
@transitions = Hash.new {|hash, state| hash[state] = {}}
|
21
|
+
@grammar = grammar
|
22
|
+
@channels = []
|
21
23
|
@states = Hash.new do |hash, kernel|
|
22
24
|
channels, closure = grammar.closure(kernel)
|
23
25
|
@channels.concat channels.to_a
|
@@ -51,15 +53,14 @@ module Dhaka
|
|
51
53
|
# options hash, lookaheads are not written out to the parser states, which is helpful when there are dozens
|
52
54
|
# of lookahead symbols for every item in every state.
|
53
55
|
def to_dot(options = {})
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
56
|
+
Dot::Digraph.new(:fontsize => 10, :shape => :box, :size => 5) do |g|
|
57
|
+
states.each do |state|
|
58
|
+
g.node(state, :label => state.items.values.collect{|item| item.to_s(options)}.join("\n"))
|
59
|
+
@transitions[state].each do |symbol, dest_state|
|
60
|
+
g.edge(state, dest_state, :label => symbol.name)
|
61
|
+
end
|
59
62
|
end
|
60
|
-
end
|
61
|
-
result << ['}']
|
62
|
-
result.join("\n")
|
63
|
+
end.to_dot
|
63
64
|
end
|
64
65
|
|
65
66
|
def inspect
|
@@ -97,7 +98,7 @@ module Dhaka
|
|
97
98
|
def generate_shift_actions
|
98
99
|
@states.values.each do |state|
|
99
100
|
@transitions[state].keys.each do |symbol|
|
100
|
-
state.actions[symbol.name] =
|
101
|
+
state.actions[symbol.name] = @shift_actions[@transitions[state][symbol]]
|
101
102
|
end
|
102
103
|
end
|
103
104
|
end
|
@@ -112,7 +113,7 @@ module Dhaka
|
|
112
113
|
|
113
114
|
def create_reduction_actions_for_item_and_state item, state
|
114
115
|
item.lookaheadset.each do |lookahead|
|
115
|
-
new_action =
|
116
|
+
new_action = @reduce_actions[item.production]
|
116
117
|
if existing_action = state.actions[lookahead.name]
|
117
118
|
if ReduceAction === existing_action
|
118
119
|
message = ReduceReduceConflict.new(state, lookahead, new_action).resolve
|
data/lib/parser/parser_run.rb
CHANGED
@@ -10,14 +10,16 @@ module Dhaka
|
|
10
10
|
end
|
11
11
|
|
12
12
|
def run
|
13
|
-
token_stream.each do |token|
|
13
|
+
tokenize_result = token_stream.each do |token|
|
14
14
|
@current_token = token
|
15
15
|
@symbol_queue << @current_token.symbol_name
|
16
16
|
error = execute_actions
|
17
17
|
return error if error
|
18
18
|
node_stack << ParseTreeLeafNode.new(@current_token)
|
19
|
+
state_stack.last
|
19
20
|
end
|
20
|
-
|
21
|
+
return tokenize_result if TokenizerErrorResult === tokenize_result
|
22
|
+
ParseSuccessResult.new(node_stack.first) if node_stack.first.head_node?
|
21
23
|
end
|
22
24
|
|
23
25
|
private
|
data/lib/parser/parser_state.rb
CHANGED
@@ -29,23 +29,31 @@ module Dhaka
|
|
29
29
|
"State#{id}"
|
30
30
|
end
|
31
31
|
|
32
|
-
def to_dot(options = {})
|
33
|
-
%(#{unique_name} [label="#{items.values.collect{|item| item.to_s(options)}.join('\n')}"])
|
34
|
-
end
|
35
|
-
|
36
32
|
def compile_to_ruby_source
|
37
33
|
result = " at_state(#{id}) {\n"
|
34
|
+
|
35
|
+
symbol_names_by_action = Hash.new {|hash, key| hash[key] = []}
|
38
36
|
actions.each do |symbol_name, action|
|
39
|
-
|
37
|
+
symbol_names_by_action[action] << symbol_name
|
38
|
+
end
|
39
|
+
|
40
|
+
symbol_names_by_action.keys.each do |action|
|
41
|
+
symbol_names = symbol_names_by_action[action].collect {|symbol_name| "#{symbol_name.inspect}"}.join(', ')
|
42
|
+
result << " for_symbols(#{symbol_names}) { #{action.compile_to_ruby_source} }\n"
|
40
43
|
end
|
44
|
+
|
41
45
|
result << " }"
|
42
46
|
result
|
43
47
|
end
|
44
|
-
|
45
|
-
def
|
46
|
-
|
48
|
+
|
49
|
+
def for_symbols *symbol_names, &blk
|
50
|
+
symbol_names.each do |symbol_name|
|
51
|
+
actions[symbol_name] = @parser.instance_eval(&blk)
|
52
|
+
end
|
47
53
|
end
|
48
54
|
|
55
|
+
alias :for_symbol :for_symbols
|
56
|
+
|
49
57
|
def to_s(options = {})
|
50
58
|
items.values.collect{|item| item.to_s(options)}.join("\n")
|
51
59
|
end
|
data/lib/tokenizer/tokenizer.rb
CHANGED
@@ -60,10 +60,9 @@ module Dhaka
|
|
60
60
|
|
61
61
|
end
|
62
62
|
|
63
|
-
# This class contains a DSL for
|
64
|
-
# Subclasses of this class may not be further subclassed.
|
63
|
+
# This abstract class contains a DSL for hand-coding tokenizers. Subclass it to implement tokenizers for specific grammars.
|
65
64
|
#
|
66
|
-
# Tokenizers are state machines
|
65
|
+
# Tokenizers are state machines. Each state of a tokenizer is identified
|
67
66
|
# by a Ruby symbol. The constant Dhaka::TOKENIZER_IDLE_STATE is reserved for the idle state of the tokenizer (the one
|
68
67
|
# that it starts in).
|
69
68
|
#
|
@@ -109,6 +108,9 @@ module Dhaka
|
|
109
108
|
# end
|
110
109
|
#
|
111
110
|
# end
|
111
|
+
#
|
112
|
+
# For languages where the lexical structure is very complicated, it may be too tedious to implement a Tokenizer by hand.
|
113
|
+
# In such cases, it's a lot easier to write a LexerSpecification using regular expressions and create a Lexer from that.
|
112
114
|
class Tokenizer
|
113
115
|
class << self
|
114
116
|
# Define the action for the state named +state_name+.
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# Lexer specification used by the arithmetic precedence tests.
# Patterns are declared in priority order: whitespace first, then the single-character tokens, then numbers.
class ArithmeticPrecedenceLexerSpecification < Dhaka::LexerSpecification

  for_pattern '\s' do
    # ignore whitespace
  end

  # Characters that are used directly as patterns; each becomes a token named after itself.
  %w| - h l , |.each do |char|
    for_pattern char do
      create_token(char)
    end
  end

  # Characters that are backslash-escaped when used as patterns; the token name is the bare character.
  %w| ( ) + / * ^ |.each do |char|
    for_pattern "\\#{char}" do
      create_token(char)
    end
  end

  # Any run of digits becomes an 'n' token.
  for_pattern '\d+' do
    create_token('n')
  end

end
|