dhaka 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +64 -0
- data/lib/dhaka.rb +12 -0
- data/lib/dot/dot.rb +29 -0
- data/lib/evaluator/evaluator.rb +35 -26
- data/lib/grammar/grammar.rb +42 -17
- data/lib/grammar/grammar_symbol.rb +4 -3
- data/lib/grammar/production.rb +9 -3
- data/lib/lexer/compiled_lexer.rb +46 -0
- data/lib/lexer/dfa.rb +71 -0
- data/lib/lexer/lexeme.rb +33 -0
- data/lib/lexer/lexer.rb +61 -0
- data/lib/lexer/lexer_run.rb +66 -0
- data/lib/lexer/regex_grammar.rb +368 -0
- data/lib/lexer/regex_parser.rb +1888 -0
- data/lib/lexer/regex_tokenizer.rb +14 -0
- data/lib/lexer/specification.rb +69 -0
- data/lib/lexer/state.rb +45 -0
- data/lib/lexer/state_machine.rb +37 -0
- data/lib/parser/action.rb +3 -3
- data/lib/parser/compiled_parser.rb +11 -3
- data/lib/parser/parse_result.rb +3 -5
- data/lib/parser/parse_tree.rb +6 -17
- data/lib/parser/parser.rb +15 -14
- data/lib/parser/parser_run.rb +4 -2
- data/lib/parser/parser_state.rb +16 -8
- data/lib/tokenizer/tokenizer.rb +5 -3
- data/test/arithmetic_precedence/arithmetic_precedence_lexer_specification.rb +23 -0
- data/test/arithmetic_precedence/arithmetic_precedence_parser_test.rb +4 -2
- data/test/chittagong/chittagong_driver.rb +12 -13
- data/test/chittagong/chittagong_driver_test.rb +18 -11
- data/test/chittagong/chittagong_evaluator.rb +7 -16
- data/test/chittagong/chittagong_evaluator_test.rb +7 -4
- data/test/chittagong/chittagong_grammar.rb +0 -6
- data/test/chittagong/chittagong_lexer.rb +109 -0
- data/test/chittagong/chittagong_lexer_specification.rb +39 -0
- data/test/chittagong/{chittagong_tokenizer_test.rb → chittagong_lexer_test.rb} +12 -6
- data/test/chittagong/chittagong_parser.rb +879 -0
- data/test/chittagong/chittagong_parser_test.rb +8 -10
- data/test/chittagong/chittagong_test.rb +17 -13
- data/test/compiled_parser_test.rb +7 -2
- data/test/evaluator_test.rb +0 -1
- data/test/grammar_test.rb +19 -1
- data/test/lexer_test.rb +215 -0
- data/test/parse_result_test.rb +8 -8
- data/test/parser_state_test.rb +0 -12
- metadata +21 -5
- data/test/arithmetic_precedence/arithmetic_precedence_tokenizer.rb +0 -39
- data/test/chittagong/chittagong_tokenizer.rb +0 -88
@@ -0,0 +1,69 @@
+module Dhaka
+  # Abstract base class for lexer specifications.
+  #
+  # Use this to specify the transformations that will be performed when the lexer recognizes a given pattern. Actions are listed in
+  # descending order of priority. For example in the following lexer specification:
+  #
+  #   class LexerSpec < Dhaka::LexerSpecification
+  #     for_pattern 'zz' do
+  #       "recognized two zs"
+  #     end
+  #
+  #     for_pattern '\w(\w|\d)*' do
+  #       "recognized word token #{current_lexeme.value}"
+  #     end
+  #
+  #     for_pattern '(\d)+(\.\d+)?' do
+  #       "recognized number #{current_lexeme.value}"
+  #     end
+  #
+  #     for_pattern ' +' do
+  #       #ignores whitespace
+  #     end
+  #
+  #     for_pattern "\n+" do
+  #       "recognized newline"
+  #     end
+  #   end
+  #
+  # the pattern 'zz' takes precedence over the pattern immediately below it, so the lexer will announce that it has recognized two
+  # 'z's instead of a word token.
+  #
+  # The patterns are <i>not</i> Ruby regular expressions - a lot of operators featured in Ruby's regular expression engine are not yet supported.
+  # See http://dhaka.rubyforge.org/regex_grammar.html for the current syntax.
+
+  class LexerSpecification
+    class << self
+      # Associates +blk+ as the action to be performed when a lexer recognizes +pattern+. When Lexer#lex is invoked,
+      # it creates a LexerRun object that provides the context for +blk+ to be evaluated in. Methods available in this block
+      # are LexerRun#current_lexeme and LexerRun#create_token.
+      def for_pattern(pattern, &blk)
+        items[pattern] = LexerSpecificationItem.new(pattern, priority, blk)
+        self.priority += 1
+      end
+
+      private
+      def inherited(specification)
+        class << specification
+          attr_accessor :items, :priority
+        end
+        specification.items = {}
+        specification.priority = 0
+      end
+
+    end
+  end
+
+  class LexerSpecificationItem #:nodoc:
+    include Comparable
+    attr_reader :pattern, :action, :priority
+    def initialize(pattern, priority, action)
+      @pattern, @priority, @action = pattern, priority, action
+    end
+
+    def <=> other
+      priority <=> other.priority
+    end
+  end
+end
+
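As a point of reference for the new DSL, here is a minimal sketch (not part of the diff; class, pattern, and token names are hypothetical) showing how the priority rule documented above plays out: a keyword pattern listed before the general identifier pattern wins when both match.

class KeywordAwareLexerSpec < Dhaka::LexerSpecification
  # Listed first, so 'if' is recognized as a keyword rather than an identifier.
  for_pattern 'if' do
    create_token('if')
  end

  # The general identifier pattern comes later and therefore has lower priority.
  for_pattern '\w(\w|\d)*' do
    create_token('identifier')
  end

  for_pattern ' +' do
    # whitespace: no token is created
  end
end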
data/lib/lexer/state.rb ADDED
@@ -0,0 +1,45 @@
+module Dhaka
+  module LexerSupport
+    class State
+      attr_reader :transitions, :pattern
+      def initialize state_machine, pattern
+        @state_machine = state_machine
+        @pattern = pattern
+        @transitions = {}
+      end
+
+      def accepting?
+        pattern
+      end
+
+      def for_characters *characters, &blk
+        dest_state = @state_machine.instance_eval(&blk)
+        characters.each do |char|
+          transitions[char] = dest_state
+        end
+      end
+
+      def recognize pattern
+        @pattern = pattern
+      end
+
+      def compile_to_ruby_source
+        result = "  at_state(#{object_id}) {\n"
+        result << "    recognize(#{pattern.inspect})\n" if accepting?
+        transition_keys_by_destination_state = Hash.new {|hash, key| hash[key] = []}
+        transitions.each do |key, dest_state|
+          transition_keys_by_destination_state[dest_state.object_id] << key
+        end
+
+        transition_keys_by_destination_state.keys.each do |state_id|
+          transition_keys = transition_keys_by_destination_state[state_id].collect {|transition_key| "#{transition_key.inspect}"}.join(', ')
+          result << "    for_characters(#{transition_keys}) { switch_to #{state_id} }\n"
+        end
+
+        result << "  }"
+        result
+      end
+    end
+  end
+end
+
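To see what these DFA states compile down to, here is a hypothetical sketch (not from the diff) that wires one accepting state by hand and prints the table fragment produced by State#compile_to_ruby_source; it assumes `require 'dhaka'` loads the new lexer support files.

require 'dhaka'

# An accepting state for the pattern '\d+'; the state machine argument is unused
# here because for_characters is never called on a hand-wired state.
digits = Dhaka::LexerSupport::State.new(nil, '\d+')
('0'..'9').each { |c| digits.transitions[c] = digits }  # loop back on every digit

puts digits.compile_to_ruby_source
# Prints something like:
#   at_state(<object_id>) {
#     recognize("\\d+")
#     for_characters("0", "1", ..., "9") { switch_to <object_id> }
#   }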
data/lib/lexer/state_machine.rb ADDED
@@ -0,0 +1,37 @@
+module Dhaka
+  module LexerSupport
+    class StateMachine
+      attr_reader :start_state
+
+      def initialize start_key
+        @states = Hash.new do |hash, key|
+          new_state = new_state_for_key key
+          hash[key] = new_state
+          transition_characters(key).each do |char|
+            dest_key = dest_key_for(key, char)
+            dest_state = hash[dest_key]
+            new_state.transitions[char] = dest_state
+          end
+          new_state
+        end
+        @start_state = @states[start_key]
+      end
+
+      def to_dot
+        Dot::Digraph.new(:fontsize => 10, :shape => :circle, :size => 5) do |g|
+          start = 'Start'
+          g.node(start, :label => start)
+          g.edge(start, @start_state)
+          @states.values.each do |state|
+            state_attributes = {}
+            state_attributes.merge!(:shape => :doublecircle, :label => state.pattern) if state.accepting?
+            g.node(state, state_attributes)
+            state.transitions.each do |transition_key, dest_state|
+              g.edge(state, dest_state, :label => transition_key)
+            end
+          end
+        end.to_dot
+      end
+    end
+  end
+end
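The string returned by StateMachine#to_dot (and by the other to_dot methods touched in this release) is plain Graphviz DOT, so rendering it is a one-liner. A usage sketch, where `machine` stands in for an instance of a concrete StateMachine subclass such as the DFA added in data/lib/lexer/dfa.rb:

File.open('lexer_dfa.dot', 'w') { |f| f << machine.to_dot }
system('dot -Tpng lexer_dfa.dot -o lexer_dfa.png')  # requires Graphviz to be installed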
data/lib/parser/action.rb CHANGED
@@ -31,12 +31,12 @@ module Dhaka
       @action_code = Proc.new do
         composite_node = ParseTreeCompositeNode.new(production)
 
-        production.expansion.each do |symbol|
+        production.expansion.each do |symbol|
           state_stack.pop
           composite_node.child_nodes.unshift(node_stack.pop)
         end
 
-        node_stack << composite_node
+        node_stack << composite_node.instance_eval(&production.action)
 
         unless composite_node.head_node?
           @symbol_queue.concat [@current_token.symbol_name, production.symbol.name]
@@ -45,7 +45,7 @@ module Dhaka
     end
 
     def compile_to_ruby_source
-      "reduce_with
+      "reduce_with #{production.name.inspect}"
     end
 
     def to_s
data/lib/parser/compiled_parser.rb CHANGED
@@ -1,14 +1,22 @@
 module Dhaka
-  #
+  # Abstract base class of all compiled Parsers. It is only used by generated code.
   class CompiledParser
 
    class << self
      private
      def inherited(compiled_parser)
        class << compiled_parser
-          attr_accessor :states, :grammar, :start_state_id
+          attr_accessor :states, :grammar, :start_state_id, :shift_actions, :reduce_actions
        end
-        compiled_parser.states
+        compiled_parser.states = Hash.new do |hash, state_id|
+          hash[state_id] = ParserState.new(compiled_parser, {}, state_id)
+        end
+        compiled_parser.shift_actions = Hash.new do |hash, state_id|
+          hash[state_id] = ShiftAction.new(compiled_parser.states[state_id])
+        end
+        compiled_parser.reduce_actions = Hash.new do |hash, production_name|
+          hash[production_name] = ReduceAction.new(compiled_parser.grammar.production_named(production_name))
+        end
      end
 
      def at_state x, &blk
data/lib/parser/parse_result.rb CHANGED
@@ -16,11 +16,9 @@ module Dhaka
 
     # Returns the dot representation of the parse tree
     def to_dot
-
-
-
-      result << ['}']
-      result.join("\n")
+      Dot::Digraph.new(:fontsize => 10, :shape => :box, :size => 5) do |g|
+        parse_tree.to_dot(g)
+      end.to_dot
     end
 
     # Deprecated. Use the +parse_tree+ accessor.
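This change, like the matching ones in parse_tree.rb, parser.rb, and the lexer's state_machine.rb, drops hand-assembled DOT strings in favour of the Digraph builder added in data/lib/dot/dot.rb. A sketch of the pattern, assuming the class lives under Dhaka::Dot as the qualified calls in these hunks suggest (node and edge labels are illustrative):

dot_source = Dhaka::Dot::Digraph.new(:fontsize => 10, :shape => :box, :size => 5) do |g|
  g.node('expression', :label => 'expression')
  g.node('term',       :label => 'term')
  g.edge('expression', 'term', :label => 'child')
end.to_dot
puts dot_source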
data/lib/parser/parse_tree.rb CHANGED
@@ -20,25 +20,18 @@ module Dhaka
     end
 
     # Returns the dot representation of this node.
-    def to_dot
-
-      label = production
-      result << %(#{dot_name} [label="#{label}"])
+    def to_dot graph
+      graph.node(self, :label => production)
       child_nodes.each do |child|
-
-
+        graph.edge(self, child)
+        child.to_dot(graph)
       end
-      result.join("\n")
     end
 
     def head_node? #:nodoc:
       production.symbol.name == START_SYMBOL_NAME
     end
 
-    def dot_name #:nodoc:
-      "Node#{object_id}"
-    end
-
   end
 
   # These are leaf nodes of syntax trees. They contain tokens.
@@ -62,16 +55,12 @@ module Dhaka
     end
 
     # Returns the dot representation of this node.
-    def to_dot
-
+    def to_dot(graph)
+      graph.node(self, :label => token)
     end
 
     def head_node? #:nodoc:
       false
     end
-
-    def dot_name #:nodoc:
-      "Node#{object_id}"
-    end
   end
 end
data/lib/parser/parser.rb CHANGED
@@ -14,10 +14,12 @@ module Dhaka
     # and the log level is WARN. Shift-reduce conflicts are reported at WARN and reduce-reduce conflicts
     # at ERROR. You may pass in your own logger. Logging at DEBUG shows a lot of progress output.
     def initialize(grammar, logger = nil)
-      @
-      @
-      @
-      @
+      @shift_actions = Hash.new {|hash, state| hash[state] = ShiftAction.new(state)}
+      @reduce_actions = Hash.new {|hash, production| hash[production] = ReduceAction.new(production)}
+      @logger = logger || default_logger
+      @transitions = Hash.new {|hash, state| hash[state] = {}}
+      @grammar = grammar
+      @channels = []
       @states = Hash.new do |hash, kernel|
         channels, closure = grammar.closure(kernel)
         @channels.concat channels.to_a
@@ -51,15 +53,14 @@ module Dhaka
     # options hash, lookaheads are not written out to the parser states, which is helpful when there are dozens
     # of lookahead symbols for every item in every state.
     def to_dot(options = {})
-
-
-
-
-
+      Dot::Digraph.new(:fontsize => 10, :shape => :box, :size => 5) do |g|
+        states.each do |state|
+          g.node(state, :label => state.items.values.collect{|item| item.to_s(options)}.join("\n"))
+          @transitions[state].each do |symbol, dest_state|
+            g.edge(state, dest_state, :label => symbol.name)
+          end
         end
-      end
-      result << ['}']
-      result.join("\n")
+      end.to_dot
     end
 
     def inspect
@@ -97,7 +98,7 @@ module Dhaka
     def generate_shift_actions
       @states.values.each do |state|
         @transitions[state].keys.each do |symbol|
-          state.actions[symbol.name] =
+          state.actions[symbol.name] = @shift_actions[@transitions[state][symbol]]
         end
       end
     end
@@ -112,7 +113,7 @@ module Dhaka
 
     def create_reduction_actions_for_item_and_state item, state
       item.lookaheadset.each do |lookahead|
-        new_action =
+        new_action = @reduce_actions[item.production]
         if existing_action = state.actions[lookahead.name]
           if ReduceAction === existing_action
             message = ReduceReduceConflict.new(state, lookahead, new_action).resolve
data/lib/parser/parser_run.rb CHANGED
@@ -10,14 +10,16 @@ module Dhaka
     end
 
     def run
-      token_stream.each do |token|
+      tokenize_result = token_stream.each do |token|
         @current_token = token
         @symbol_queue << @current_token.symbol_name
         error = execute_actions
         return error if error
         node_stack << ParseTreeLeafNode.new(@current_token)
+        state_stack.last
       end
-
+      return tokenize_result if TokenizerErrorResult === tokenize_result
+      ParseSuccessResult.new(node_stack.first) if node_stack.first.head_node?
     end
 
     private
data/lib/parser/parser_state.rb CHANGED
@@ -29,23 +29,31 @@ module Dhaka
       "State#{id}"
     end
 
-    def to_dot(options = {})
-      %(#{unique_name} [label="#{items.values.collect{|item| item.to_s(options)}.join('\n')}"])
-    end
-
     def compile_to_ruby_source
       result = "  at_state(#{id}) {\n"
+
+      symbol_names_by_action = Hash.new {|hash, key| hash[key] = []}
       actions.each do |symbol_name, action|
-
+        symbol_names_by_action[action] << symbol_name
+      end
+
+      symbol_names_by_action.keys.each do |action|
+        symbol_names = symbol_names_by_action[action].collect {|symbol_name| "#{symbol_name.inspect}"}.join(', ')
+        result << "    for_symbols(#{symbol_names}) { #{action.compile_to_ruby_source} }\n"
      end
+
      result << "  }"
      result
    end
-
-    def
-
+
+    def for_symbols *symbol_names, &blk
+      symbol_names.each do |symbol_name|
+        actions[symbol_name] = @parser.instance_eval(&blk)
+      end
     end
 
+    alias :for_symbol :for_symbols
+
     def to_s(options = {})
       items.values.collect{|item| item.to_s(options)}.join("\n")
     end
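Combined with ReduceAction#compile_to_ruby_source from action.rb above, this is what produces the body of generated parsers such as data/test/chittagong/chittagong_parser.rb. The emitted text for a single state looks roughly like the fragment below (the state id, symbol names, and production name are made up for illustration):

at_state(17) {
  for_symbols("+", "-") { reduce_with "addition" }
  for_symbol("end_of_input") { reduce_with "expression" }
}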
data/lib/tokenizer/tokenizer.rb CHANGED
@@ -60,10 +60,9 @@ module Dhaka
 
   end
 
-  # This class contains a DSL for
-  # Subclasses of this class may not be further subclassed.
+  # This abstract class contains a DSL for hand-coding tokenizers. Subclass it to implement tokenizers for specific grammars.
   #
-  # Tokenizers are state machines
+  # Tokenizers are state machines. Each state of a tokenizer is identified
   # by a Ruby symbol. The constant Dhaka::TOKENIZER_IDLE_STATE is reserved for the idle state of the tokenizer (the one
   # that it starts in).
   #
@@ -109,6 +108,9 @@ module Dhaka
   #   end
   #
   # end
+  #
+  # For languages where the lexical structure is very complicated, it may be too tedious to implement a Tokenizer by hand.
+  # In such cases, it's a lot easier to write a LexerSpecification using regular expressions and create a Lexer from that.
   class Tokenizer
     class << self
       # Define the action for the state named +state_name+.
data/test/arithmetic_precedence/arithmetic_precedence_lexer_specification.rb ADDED
@@ -0,0 +1,23 @@
+class ArithmeticPrecedenceLexerSpecification < Dhaka::LexerSpecification
+
+  for_pattern '\s' do
+    # ignore whitespace
+  end
+
+  %w| - h l , |.each do |char|
+    for_pattern char do
+      create_token(char)
+    end
+  end
+
+  %w| ( ) + / * ^ |.each do |char|
+    for_pattern "\\#{char}" do
+      create_token(char)
+    end
+  end
+
+  for_pattern '\d+' do
+    create_token('n')
+  end
+
+end