dhaka 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/dhaka.rb +6 -2
- data/lib/evaluator/evaluator.rb +32 -19
- data/lib/grammar/closure_hash.rb +2 -1
- data/lib/grammar/grammar.rb +52 -25
- data/lib/grammar/grammar_symbol.rb +4 -0
- data/lib/grammar/precedence.rb +1 -1
- data/lib/parser/action.rb +4 -3
- data/lib/parser/channel.rb +4 -3
- data/lib/parser/compiled_parser.rb +2 -0
- data/lib/parser/item.rb +2 -1
- data/lib/parser/parse_result.rb +13 -8
- data/lib/parser/parse_tree.rb +22 -16
- data/lib/parser/parser.rb +29 -27
- data/lib/parser/parser_methods.rb +2 -0
- data/lib/parser/parser_run.rb +1 -1
- data/lib/parser/parser_state.rb +2 -2
- data/lib/parser/token.rb +2 -0
- data/lib/tokenizer/tokenizer.rb +52 -33
- data/test/arithmetic_precedence_tokenizer.rb +8 -4
- data/test/arithmetic_tokenizer.rb +8 -4
- data/test/bracket_tokenizer.rb +1 -1
- data/test/parser_test.rb +11 -1
- metadata +3 -3
data/lib/dhaka.rb
CHANGED
@@ -21,6 +21,12 @@
|
|
21
21
|
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
22
|
#++
|
23
23
|
|
24
|
+
# An introduction to Dhaka and annotated examples can be found at the project homepage http://dhaka.rubyforge.org
|
25
|
+
#
|
26
|
+
# Further examples can be found in the test suites included with the gem.
|
27
|
+
module Dhaka
|
28
|
+
end
|
29
|
+
|
24
30
|
require File.dirname(__FILE__)+'/grammar/grammar_symbol'
|
25
31
|
require File.dirname(__FILE__)+'/grammar/production'
|
26
32
|
require File.dirname(__FILE__)+'/grammar/closure_hash'
|
@@ -41,5 +47,3 @@ require File.dirname(__FILE__)+'/parser/compiled_parser'
|
|
41
47
|
|
42
48
|
require File.dirname(__FILE__)+'/tokenizer/tokenizer'
|
43
49
|
require File.dirname(__FILE__)+'/evaluator/evaluator'
|
44
|
-
|
45
|
-
|
data/lib/evaluator/evaluator.rb
CHANGED
@@ -1,21 +1,43 @@
|
|
1
1
|
module Dhaka
|
2
|
+
|
3
|
+
# This is the abstract base evaluator class. It is not directly instantiated.
|
4
|
+
# When defining an evaluator for a specific grammar, we subclass it. e.g. for FooGrammar
|
5
|
+
# we create a FooEvaluator that subclasses Evaluator. Note that FooEvaluator may not
|
6
|
+
# be further subclassed.
|
7
|
+
#
|
8
|
+
# An evaluation rule for a given production named +bar+ is defined by calling +for_bar+ with
|
9
|
+
# a block that performs the evaluation. For detailed examples, see the evaluators in the
|
10
|
+
# test suite.
|
11
|
+
|
2
12
|
class Evaluator
|
3
13
|
|
14
|
+
# Instantiates a new evaluator with the syntax tree of a parsed expression. Only subclasses
|
15
|
+
# of Evaluator are directly instantiated.
|
4
16
|
def initialize(syntax_tree)
|
5
17
|
@syntax_tree = syntax_tree
|
6
18
|
@node_stack = []
|
7
19
|
end
|
8
20
|
|
21
|
+
# Returns the evaluation result.
|
9
22
|
def result
|
10
23
|
evaluate(@syntax_tree)
|
11
24
|
end
|
12
25
|
|
26
|
+
private
|
27
|
+
|
13
28
|
def child_nodes
|
14
29
|
@node_stack[-1]
|
15
30
|
end
|
16
|
-
|
17
|
-
private
|
18
31
|
|
32
|
+
def evaluate node
|
33
|
+
return node if (ParseTreeLeafNode === node)
|
34
|
+
@node_stack << node.child_nodes.collect {|child_node| evaluate(child_node)}
|
35
|
+
proc = self.class.actions[node.production.name]
|
36
|
+
result = self.instance_eval(&proc)
|
37
|
+
@node_stack.pop
|
38
|
+
result
|
39
|
+
end
|
40
|
+
|
19
41
|
def self.inherited(evaluator)
|
20
42
|
class << evaluator
|
21
43
|
attr_accessor :grammar, :actions
|
@@ -29,34 +51,25 @@ module Dhaka
|
|
29
51
|
check_definitions
|
30
52
|
end
|
31
53
|
|
54
|
+
def self.method_missing(method_name, &blk)
|
55
|
+
if method_name.to_s =~ /^for_*/
|
56
|
+
rule_name = method_name.to_s[4..-1]
|
57
|
+
self.for_rule_named(rule_name, &blk)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
32
61
|
def self.check_definitions
|
33
62
|
non_trivial_productions_with_rules_undefined = self.grammar.productions.select {|production| production.expansion.size != 1}.collect {|production| production.name} - self.actions.keys
|
34
63
|
raise EvaluatorDefinitionError.new(non_trivial_productions_with_rules_undefined) unless non_trivial_productions_with_rules_undefined.empty?
|
35
64
|
end
|
36
65
|
|
37
|
-
def evaluate node
|
38
|
-
return node if (ParseTreeLeafNode === node)
|
39
|
-
@node_stack << node.child_nodes.collect {|child_node| evaluate(child_node)}
|
40
|
-
proc = self.class.actions[node.production.name]
|
41
|
-
result = self.instance_eval(&proc)
|
42
|
-
@node_stack.pop
|
43
|
-
result
|
44
|
-
end
|
45
|
-
|
46
66
|
def self.for_rule_named(name, &blk)
|
47
67
|
self.actions[name] = blk
|
48
68
|
end
|
49
|
-
|
50
|
-
def self.method_missing(method_name, &blk)
|
51
|
-
if method_name.to_s =~ /^for_*/
|
52
|
-
rule_name = method_name.to_s[4..-1]
|
53
|
-
self.for_rule_named(rule_name, &blk)
|
54
|
-
end
|
55
|
-
end
|
56
69
|
|
57
70
|
end
|
58
71
|
|
59
|
-
class EvaluatorDefinitionError < StandardError
|
72
|
+
class EvaluatorDefinitionError < StandardError #:nodoc:
|
60
73
|
def initialize(non_trivial_productions_with_rules_undefined)
|
61
74
|
@non_trivial_productions_with_rules_undefined = non_trivial_productions_with_rules_undefined
|
62
75
|
end
|
data/lib/grammar/closure_hash.rb
CHANGED
data/lib/grammar/grammar.rb
CHANGED
@@ -2,15 +2,24 @@
|
|
2
2
|
require 'set'
|
3
3
|
module Dhaka
|
4
4
|
|
5
|
+
# Reserved name for the start symbol for all grammars.
|
5
6
|
START_SYMBOL_NAME = "_Start_"
|
6
|
-
END_SYMBOL_NAME = "_End_"
|
7
|
+
END_SYMBOL_NAME = "_End_" #:nodoc:
|
7
8
|
|
9
|
+
# Productions for specific grammar symbols are defined in the context of this class.
|
8
10
|
class ProductionBuilder
|
11
|
+
|
12
|
+
# +symbol+ is the grammar symbol that productions are being defined for.
|
9
13
|
def initialize(grammar, symbol)
|
10
14
|
@grammar = grammar
|
11
15
|
@symbol = symbol
|
12
16
|
end
|
13
17
|
|
18
|
+
# Creates a new production for +symbol+ with an expansion of +expansion+. The options hash can include
|
19
|
+
# a directive <tt>:prec</tt>, the value of which is a grammar symbol name. The precedence of the production is then
|
20
|
+
# set to the precedence of the grammar symbol corresponding to that name.
|
21
|
+
#
|
22
|
+
# See the arithmetic precedence grammar in the test suites for an example.
|
14
23
|
def method_missing(production_name, expansion, options = {})
|
15
24
|
expansion_symbols = expansion.collect {|name| @grammar.symbols[name]}
|
16
25
|
if precedence_symbol_name = options[:prec]
|
@@ -25,12 +34,23 @@ module Dhaka
|
|
25
34
|
end
|
26
35
|
end
|
27
36
|
|
37
|
+
# The precedence builder defines three methods, +left+, +right+ and +nonassoc+. These accept arrays of grammar
|
38
|
+
# symbols all of which have the same precedence level and associativity. This works almost exactly like Yacc.
|
39
|
+
#
|
40
|
+
# See the arithmetic precedence grammar in the test suites for an example.
|
28
41
|
class PrecedenceBuilder
|
29
|
-
def initialize(grammar)
|
42
|
+
def initialize(grammar) #:nodoc:
|
30
43
|
@grammar = grammar
|
31
44
|
@precedence_level = 0
|
32
45
|
end
|
33
|
-
|
46
|
+
[:left, :right, :nonassoc].each do |associativity|
|
47
|
+
define_method(associativity) do |symbols|
|
48
|
+
assign_precedences associativity, symbols
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
private
|
53
|
+
def assign_precedences(associativity, symbol_names)
|
34
54
|
symbol_names.each do |symbol_name|
|
35
55
|
symbol = @grammar.symbols[symbol_name]
|
36
56
|
symbol.precedence = Precedence.new(@precedence_level, associativity)
|
@@ -39,8 +59,34 @@ module Dhaka
|
|
39
59
|
end
|
40
60
|
end
|
41
61
|
|
62
|
+
# This class is subclassed when specifying a grammar. Note that subclasses of this class may not be further subclassed.
|
42
63
|
class Grammar
|
43
64
|
|
65
|
+
# Used for defining the productions for the symbol with name +symbol+. The block +blk+ is
|
66
|
+
# evaluated in the context of a ProductionBuilder.
|
67
|
+
def self.for_symbol symbol, &blk
|
68
|
+
symbol = symbols[symbol]
|
69
|
+
symbol.non_terminal = true
|
70
|
+
ProductionBuilder.new(self, symbol).instance_eval(&blk)
|
71
|
+
end
|
72
|
+
|
73
|
+
# Used for defining the precedences and associativities of symbols. The block +blk+ is
|
74
|
+
# evaluated in the context of a PrecedenceBuilder.
|
75
|
+
def self.precedences &blk
|
76
|
+
PrecedenceBuilder.new(self).instance_eval(&blk)
|
77
|
+
end
|
78
|
+
|
79
|
+
# Returns the grammar symbol identified by +name+
|
80
|
+
def self.symbol_for_name(name)
|
81
|
+
if symbols.has_key? name
|
82
|
+
symbols[name]
|
83
|
+
else
|
84
|
+
raise "No symbol with name #{name} found"
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
private
|
89
|
+
|
44
90
|
def self.inherited(grammar)
|
45
91
|
class << grammar
|
46
92
|
attr_accessor :symbols, :productions_by_symbol, :productions_by_name, :start_symbol, :end_symbol, :__first_cache
|
@@ -53,13 +99,10 @@ module Dhaka
|
|
53
99
|
grammar.__first_cache = {}
|
54
100
|
end
|
55
101
|
|
56
|
-
def self.for_symbol symbol, &blk
|
57
|
-
|
58
|
-
symbol.non_terminal = true
|
59
|
-
ProductionBuilder.new(self, symbol).instance_eval(&blk)
|
102
|
+
def self.productions_for_symbol(symbol)
|
103
|
+
productions_by_symbol[symbol]
|
60
104
|
end
|
61
105
|
|
62
|
-
|
63
106
|
def self.productions
|
64
107
|
productions_by_name.values
|
65
108
|
end
|
@@ -68,17 +111,6 @@ module Dhaka
|
|
68
111
|
productions_by_name[name]
|
69
112
|
end
|
70
113
|
|
71
|
-
def self.productions_for_symbol(symbol)
|
72
|
-
productions_by_symbol[symbol]
|
73
|
-
end
|
74
|
-
|
75
|
-
def self.symbol_for_name(name)
|
76
|
-
if symbols.has_key? name
|
77
|
-
symbols[name]
|
78
|
-
else
|
79
|
-
raise "No symbol with name #{name} found"
|
80
|
-
end
|
81
|
-
end
|
82
114
|
|
83
115
|
def self.terminal_symbols
|
84
116
|
symbols.values.select {|symbol| symbol.terminal}
|
@@ -102,10 +134,6 @@ module Dhaka
|
|
102
134
|
return channels, result
|
103
135
|
end
|
104
136
|
|
105
|
-
def self.precedences &blk
|
106
|
-
PrecedenceBuilder.new(self).instance_eval(&blk)
|
107
|
-
end
|
108
|
-
|
109
137
|
def self.first(given_symbol)
|
110
138
|
cached_result = self.__first_cache[given_symbol]
|
111
139
|
return cached_result if cached_result
|
@@ -146,7 +174,6 @@ module Dhaka
|
|
146
174
|
end
|
147
175
|
return closure_hash
|
148
176
|
end
|
149
|
-
|
150
|
-
|
151
177
|
end
|
178
|
+
|
152
179
|
end
|
@@ -1,5 +1,9 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
module Dhaka
|
3
|
+
# Each grammar symbol is uniquely identified by a string name. The name of a symbol can
|
4
|
+
# be anything and need not correspond to its character representation. For example, an ampersand in the
|
5
|
+
# character stream could be tokenized as a symbol with a name 'whatever'. In general, it's best to choose
|
6
|
+
# symbol names that are descriptive.
|
3
7
|
class GrammarSymbol
|
4
8
|
attr_reader :name
|
5
9
|
attr_accessor :non_terminal, :nullable, :precedence, :associativity
|
data/lib/grammar/precedence.rb
CHANGED
data/lib/parser/action.rb
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
module Dhaka
|
2
|
-
|
2
|
+
# Encapsulates code for Parser actions.
|
3
|
+
class Action #:nodoc:
|
3
4
|
attr_reader :action_code
|
4
5
|
end
|
5
6
|
|
6
|
-
class ShiftAction < Action
|
7
|
+
class ShiftAction < Action #:nodoc:
|
7
8
|
attr_reader :destination_state
|
8
9
|
def initialize destination_state
|
9
10
|
@destination_state = destination_state
|
@@ -20,7 +21,7 @@ module Dhaka
|
|
20
21
|
end
|
21
22
|
end
|
22
23
|
|
23
|
-
class ReduceAction < Action
|
24
|
+
class ReduceAction < Action #:nodoc:
|
24
25
|
attr_reader :production
|
25
26
|
def initialize(production)
|
26
27
|
@production = production
|
data/lib/parser/channel.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
module Dhaka
|
3
|
-
|
3
|
+
# Represents channels for pumping of lookaheads between items
|
4
|
+
class Channel #:nodoc:
|
4
5
|
attr_reader :start_item, :end_item
|
5
6
|
def initialize(grammar, start_item, end_item)
|
6
7
|
@grammar = grammar
|
@@ -23,7 +24,7 @@ module Dhaka
|
|
23
24
|
end
|
24
25
|
end
|
25
26
|
|
26
|
-
class SpontaneousChannel < Channel
|
27
|
+
class SpontaneousChannel < Channel #:nodoc:
|
27
28
|
def to_s
|
28
29
|
"Spontaneous " + super.to_s
|
29
30
|
end
|
@@ -40,7 +41,7 @@ module Dhaka
|
|
40
41
|
end
|
41
42
|
end
|
42
43
|
|
43
|
-
class PassiveChannel < Channel
|
44
|
+
class PassiveChannel < Channel #:nodoc:
|
44
45
|
def to_s
|
45
46
|
"Passive " + super.to_s
|
46
47
|
end
|
data/lib/parser/item.rb
CHANGED
data/lib/parser/parse_result.rb
CHANGED
@@ -1,21 +1,26 @@
|
|
1
1
|
module Dhaka
|
2
|
+
# Returned on successful parsing of the input token stream.
|
2
3
|
class ParseSuccessResult
|
3
|
-
|
4
|
-
|
4
|
+
# Contains the parse result.
|
5
|
+
attr_accessor :syntax_tree
|
6
|
+
def initialize(syntax_tree) #:nodoc:
|
5
7
|
@syntax_tree = syntax_tree
|
6
8
|
end
|
7
|
-
|
8
|
-
def has_error?
|
9
|
+
# This is false.
|
10
|
+
def has_error?
|
9
11
|
false
|
10
12
|
end
|
11
13
|
end
|
14
|
+
|
15
|
+
# Returned on unsuccessful parsing of the input token stream.
|
12
16
|
class ParseErrorResult
|
13
|
-
|
14
|
-
|
17
|
+
# The index of the token that caused the parse error.
|
18
|
+
attr_reader :bad_token_index
|
19
|
+
def initialize(bad_token_index) #:nodoc:
|
15
20
|
@bad_token_index = bad_token_index
|
16
21
|
end
|
17
|
-
|
18
|
-
def has_error?
|
22
|
+
# This is true.
|
23
|
+
def has_error?
|
19
24
|
true
|
20
25
|
end
|
21
26
|
end
|
data/lib/parser/parse_tree.rb
CHANGED
@@ -1,20 +1,18 @@
|
|
1
1
|
module Dhaka
|
2
|
-
|
2
|
+
# These are composite nodes of the syntax tree returned by the successful parsing of a token stream.
|
3
|
+
class ParseTreeCompositeNode
|
3
4
|
attr_reader :production, :child_nodes
|
4
|
-
def initialize(production)
|
5
|
+
def initialize(production) #:nodoc:
|
5
6
|
@production = production
|
6
7
|
@child_nodes = []
|
7
8
|
end
|
8
|
-
def linearize
|
9
|
+
def linearize #:nodoc:
|
9
10
|
child_nodes.collect {|child_node| child_node.linearize}.flatten + [production.name]
|
10
11
|
end
|
11
|
-
def to_s
|
12
|
+
def to_s #:nodoc:
|
12
13
|
"CompositeNode: #{production.symbol} --> [#{child_nodes.join(", ")}]"
|
13
14
|
end
|
14
|
-
def dot_name
15
|
-
"Node#{object_id}"
|
16
|
-
end
|
17
|
-
|
15
|
+
# Returns the dot representation of the syntax tree.
|
18
16
|
def to_dot
|
19
17
|
result = []
|
20
18
|
result << ["digraph x {", "node [fontsize=\"10\" shape=box size=\"5\"]"] if head_node?
|
@@ -28,31 +26,39 @@ module Dhaka
|
|
28
26
|
result.join("\n")
|
29
27
|
end
|
30
28
|
|
31
|
-
def head_node?
|
29
|
+
def head_node? #:nodoc:
|
32
30
|
production.symbol.name == START_SYMBOL_NAME
|
33
31
|
end
|
32
|
+
|
33
|
+
def dot_name #:nodoc:
|
34
|
+
"Node#{object_id}"
|
35
|
+
end
|
36
|
+
|
34
37
|
end
|
35
38
|
|
39
|
+
# These are leaf nodes of syntax trees. They contain tokens.
|
36
40
|
class ParseTreeLeafNode
|
37
41
|
attr_reader :token
|
38
|
-
def initialize(token)
|
42
|
+
def initialize(token) #:nodoc:
|
39
43
|
@token = token
|
40
44
|
end
|
41
|
-
def linearize
|
45
|
+
def linearize #:nodoc:
|
42
46
|
[]
|
43
47
|
end
|
44
|
-
def to_s
|
48
|
+
def to_s #:nodoc:
|
45
49
|
"LeafNode: #{token}"
|
46
50
|
end
|
47
|
-
def dot_name
48
|
-
"Node#{object_id}"
|
49
|
-
end
|
51
|
+
# Returns the dot representation of this node.
|
50
52
|
def to_dot
|
51
53
|
label = "#{token}#{' : '+token.value.to_s if token.value}"
|
52
54
|
"#{dot_name} [label=\"#{label}\"]"
|
53
55
|
end
|
54
|
-
def head_node?
|
56
|
+
def head_node? #:nodoc:
|
55
57
|
false
|
56
58
|
end
|
59
|
+
|
60
|
+
def dot_name #:nodoc:
|
61
|
+
"Node#{object_id}"
|
62
|
+
end
|
57
63
|
end
|
58
64
|
end
|
data/lib/parser/parser.rb
CHANGED
@@ -7,6 +7,9 @@ module Dhaka
|
|
7
7
|
include ParserMethods
|
8
8
|
attr_reader :grammar, :start_state
|
9
9
|
|
10
|
+
# Creates a new parser from the given grammar. Messages are logged by default to STDOUT
|
11
|
+
# and the log level is WARN. Shift-reduce conflicts are reported at WARN and reduce-reduce conflicts
|
12
|
+
# at ERROR. You may pass in your own logger. Logging at DEBUG shows a lot of progress output.
|
10
13
|
def initialize(grammar, logger = nil)
|
11
14
|
if logger
|
12
15
|
@logger = logger
|
@@ -34,20 +37,7 @@ module Dhaka
|
|
34
37
|
initialize_states
|
35
38
|
end
|
36
39
|
|
37
|
-
def initialize_states
38
|
-
start_productions = @grammar.productions_for_symbol(@grammar.start_symbol)
|
39
|
-
raise NoStartProductionsError.new(@grammar) if start_productions.empty?
|
40
|
-
start_items = ItemSet.new(start_productions.collect {|production| Item.new(production, 0)})
|
41
|
-
start_items.each {|start_item| start_item.lookaheadset << @grammar.end_symbol}
|
42
|
-
@start_state = @states[start_items]
|
43
|
-
@logger.debug("Pumping #{@channels.size} channels...")
|
44
|
-
pump_channels
|
45
|
-
@logger.debug("Generating shift actions...")
|
46
|
-
generate_shift_actions
|
47
|
-
@logger.debug("Generating reduce actions...")
|
48
|
-
generate_reduce_actions
|
49
|
-
end
|
50
|
-
|
40
|
+
# Returns the Ruby source of the generated parser compiled as +parser_class_name+. This can be written out to a file.
|
51
41
|
def compile_to_ruby_source_as parser_class_name
|
52
42
|
result = "class #{parser_class_name} < Dhaka::CompiledParser\n\n"
|
53
43
|
result << " self.grammar = #{@grammar.name}\n\n"
|
@@ -59,6 +49,9 @@ module Dhaka
|
|
59
49
|
result
|
60
50
|
end
|
61
51
|
|
52
|
+
# Returns the dot representation of the parser. If <tt>:hide_lookaheads</tt> is set to true in the
|
53
|
+
# options hash, lookaheads are not written out to the parser states, which is helpful when there are dozens
|
54
|
+
# of lookahead symbols for every item in every state.
|
62
55
|
def to_dot(options = {})
|
63
56
|
result = ["digraph x {", "node [fontsize=\"10\" shape=box size=\"5\"]"]
|
64
57
|
result += states.collect { |state| state.to_dot(options) }
|
@@ -70,10 +63,28 @@ module Dhaka
|
|
70
63
|
result << ['}']
|
71
64
|
result.join("\n")
|
72
65
|
end
|
66
|
+
|
67
|
+
private :start_state
|
68
|
+
private
|
69
|
+
|
73
70
|
def states
|
74
71
|
@states.values
|
75
72
|
end
|
76
|
-
|
73
|
+
|
74
|
+
def initialize_states
|
75
|
+
start_productions = @grammar.productions_for_symbol(@grammar.start_symbol)
|
76
|
+
raise NoStartProductionsError.new(@grammar) if start_productions.empty?
|
77
|
+
start_items = ItemSet.new(start_productions.collect {|production| Item.new(production, 0)})
|
78
|
+
start_items.each {|start_item| start_item.lookaheadset << @grammar.end_symbol}
|
79
|
+
@start_state = @states[start_items]
|
80
|
+
@logger.debug("Pumping #{@channels.size} channels...")
|
81
|
+
pump_channels
|
82
|
+
@logger.debug("Generating shift actions...")
|
83
|
+
generate_shift_actions
|
84
|
+
@logger.debug("Generating reduce actions...")
|
85
|
+
generate_reduce_actions
|
86
|
+
end
|
87
|
+
|
77
88
|
def generate_shift_actions
|
78
89
|
@states.values.each do |state|
|
79
90
|
@transitions[state].keys.each { |symbol|
|
@@ -157,21 +168,12 @@ module Dhaka
|
|
157
168
|
|
158
169
|
end
|
159
170
|
|
160
|
-
|
161
|
-
class ParserReduceReduceConflictError < StandardError
|
162
|
-
def initialize(message)
|
163
|
-
@message = message
|
164
|
-
end
|
165
|
-
def to_s
|
166
|
-
@message
|
167
|
-
end
|
168
|
-
end
|
169
|
-
|
171
|
+
# Raised when trying to create a Parser for a grammar that has no productions for the start symbol
|
170
172
|
class NoStartProductionsError < StandardError
|
171
|
-
def initialize(grammar)
|
173
|
+
def initialize(grammar) #:nodoc:
|
172
174
|
@grammar = grammar
|
173
175
|
end
|
174
|
-
def to_s
|
176
|
+
def to_s #:nodoc:
|
175
177
|
"No start productions defined for #{@grammar.name}"
|
176
178
|
end
|
177
179
|
end
|
@@ -1,6 +1,8 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
module Dhaka
|
3
|
+
# This module is included both in Parser and CompiledParser.
|
3
4
|
module ParserMethods
|
5
|
+
# +token_stream+ is an Enumerable of Token-s. Returns either a ParseSuccessResult or a ParseErrorResult.
|
4
6
|
def parse token_stream
|
5
7
|
parser_run = ParserRun.new(grammar, start_state, token_stream)
|
6
8
|
parser_run.run
|
data/lib/parser/parser_run.rb
CHANGED
data/lib/parser/parser_state.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
require 'set'
|
3
3
|
module Dhaka
|
4
|
-
class ParserState
|
4
|
+
class ParserState #:nodoc:
|
5
5
|
|
6
6
|
attr_accessor :items, :actions, :id
|
7
7
|
|
@@ -55,7 +55,7 @@ module Dhaka
|
|
55
55
|
|
56
56
|
end
|
57
57
|
|
58
|
-
class ItemSet < Set
|
58
|
+
class ItemSet < Set #:nodoc:
|
59
59
|
def hash
|
60
60
|
self.collect{|item| item.hash}.inject{|result, hashcode| result ^ hashcode}
|
61
61
|
end
|
data/lib/parser/token.rb
CHANGED
data/lib/tokenizer/tokenizer.rb
CHANGED
@@ -1,4 +1,10 @@
|
|
1
1
|
module Dhaka
|
2
|
+
|
3
|
+
# Reserved constant used to identify the idle state of the tokenizer.
|
4
|
+
TOKENIZER_IDLE_STATE = :idle_state
|
5
|
+
|
6
|
+
# Raised when the tokenizer encounters a character that has no corresponding action in
|
7
|
+
# its current state.
|
2
8
|
class UnrecognizedInputCharacterException < StandardError
|
3
9
|
attr_reader :input, :char_index
|
4
10
|
def initialize(input, char_index)
|
@@ -10,6 +16,8 @@ module Dhaka
|
|
10
16
|
end
|
11
17
|
end
|
12
18
|
|
19
|
+
# A tokenizer state encapsulates actions that should be performed upon
|
20
|
+
# encountering each permissible character for that state.
|
13
21
|
class TokenizerState
|
14
22
|
attr_reader :actions
|
15
23
|
|
@@ -17,72 +25,83 @@ module Dhaka
|
|
17
25
|
@actions = {}
|
18
26
|
end
|
19
27
|
|
28
|
+
# Define the action (+blk+) to be performed when encountering any of +characters+ in the token stream.
|
20
29
|
def for_characters(characters, &blk)
|
21
30
|
characters.each do |character|
|
22
31
|
actions[character] = blk
|
23
32
|
end
|
24
33
|
end
|
25
34
|
|
26
|
-
|
27
|
-
actions[character[0]] = blk
|
28
|
-
end
|
35
|
+
alias for_character for_characters
|
29
36
|
|
30
|
-
def to_s
|
37
|
+
def to_s #:nodoc:
|
31
38
|
actions.inspect
|
32
39
|
end
|
33
40
|
|
34
41
|
end
|
35
42
|
|
43
|
+
# This class contains a DSL for specifying tokenizers. Subclass it to implement tokenizers for specific grammars.
|
44
|
+
# Subclasses of this class may not be further subclassed.
|
45
|
+
#
|
46
|
+
# Tokenizers are state machines that are specified pretty much by hand. Each state of a tokenizer is identified
|
47
|
+
# by a Ruby symbol. The constant Dhaka::TOKENIZER_IDLE_STATE is reserved for the idle state of the tokenizer (the one
|
48
|
+
# that it starts in).
|
36
49
|
class Tokenizer
|
37
|
-
|
38
|
-
|
39
|
-
class << tokenizer
|
40
|
-
attr_accessor :states
|
41
|
-
end
|
42
|
-
tokenizer.states = Hash.new {|hash, key| hash[key] = TokenizerState.new}
|
43
|
-
end
|
44
|
-
|
50
|
+
|
51
|
+
# Define the action for the state named +state_name+.
|
45
52
|
def self.for_state(state_name, &blk)
|
46
53
|
states[state_name].instance_eval(&blk)
|
47
54
|
end
|
48
|
-
|
55
|
+
|
56
|
+
# Tokenizes a string +input+ and returns an array of Token-s.
|
49
57
|
def self.tokenize(input)
|
50
|
-
|
58
|
+
self.new(input).run
|
51
59
|
end
|
52
|
-
|
53
|
-
|
54
|
-
class TokenizerRun
|
55
|
-
|
60
|
+
|
61
|
+
# A slot that can be used to accumulate characters when processing multi-character tokens.
|
56
62
|
attr_accessor :accumulator
|
63
|
+
# The tokens shifted so far.
|
57
64
|
attr_reader :tokens
|
58
|
-
|
59
|
-
|
65
|
+
|
66
|
+
def initialize(input) #:nodoc:
|
60
67
|
@input = input
|
61
|
-
@current_state =
|
68
|
+
@current_state = self.class.states[TOKENIZER_IDLE_STATE]
|
62
69
|
@curr_char_index = 0
|
63
70
|
@tokens = []
|
64
71
|
end
|
65
72
|
|
66
|
-
def run
67
|
-
while curr_char
|
68
|
-
blk = @current_state.actions[curr_char]
|
69
|
-
raise UnrecognizedInputCharacterException.new(@input, @curr_char_index) unless blk
|
70
|
-
instance_eval(&blk)
|
71
|
-
end
|
72
|
-
tokens
|
73
|
-
end
|
74
|
-
|
73
|
+
# The character currently being processed.
|
75
74
|
def curr_char
|
76
75
|
@input[@curr_char_index] and @input[@curr_char_index].chr
|
77
76
|
end
|
78
|
-
|
77
|
+
|
78
|
+
# Advance to the next character.
|
79
79
|
def advance
|
80
80
|
@curr_char_index += 1
|
81
81
|
end
|
82
|
-
|
82
|
+
|
83
|
+
# Change the active state of the tokenizer to the state identified by the symbol +state_name+.
|
83
84
|
def switch_to state_name
|
84
|
-
@current_state =
|
85
|
+
@current_state = self.class.states[state_name]
|
86
|
+
end
|
87
|
+
|
88
|
+
def run #:nodoc:
|
89
|
+
while curr_char
|
90
|
+
blk = @current_state.actions[curr_char]
|
91
|
+
raise UnrecognizedInputCharacterException.new(@input, @curr_char_index) unless blk
|
92
|
+
instance_eval(&blk)
|
93
|
+
end
|
94
|
+
tokens
|
95
|
+
end
|
96
|
+
|
97
|
+
private
|
98
|
+
def self.inherited(tokenizer)
|
99
|
+
class << tokenizer
|
100
|
+
attr_accessor :states
|
101
|
+
end
|
102
|
+
tokenizer.states = Hash.new {|hash, key| hash[key] = TokenizerState.new}
|
85
103
|
end
|
86
104
|
|
87
105
|
end
|
106
|
+
|
88
107
|
end
|
@@ -13,7 +13,7 @@ class ArithmeticPrecedenceTokenizer < Dhaka::Tokenizer
|
|
13
13
|
|
14
14
|
all_characters = digits + parenths + operators + functions + arg_separator + whitespace
|
15
15
|
|
16
|
-
for_state
|
16
|
+
for_state Dhaka::TOKENIZER_IDLE_STATE do
|
17
17
|
for_characters(all_characters - (digits + whitespace)) do
|
18
18
|
tokens << Dhaka::Token.new(ArithmeticPrecedenceGrammar.symbol_for_name(curr_char), nil)
|
19
19
|
advance
|
@@ -29,15 +29,19 @@ class ArithmeticPrecedenceTokenizer < Dhaka::Tokenizer
|
|
29
29
|
|
30
30
|
for_state :get_integer_literal do
|
31
31
|
for_characters all_characters - digits do
|
32
|
-
tokens <<
|
33
|
-
switch_to
|
32
|
+
tokens << integer_literal_token(accumulator.to_i)
|
33
|
+
switch_to Dhaka::TOKENIZER_IDLE_STATE
|
34
34
|
end
|
35
35
|
for_characters digits do
|
36
36
|
self.accumulator += curr_char
|
37
37
|
advance
|
38
|
-
tokens <<
|
38
|
+
tokens << integer_literal_token(accumulator.to_i) unless curr_char
|
39
39
|
end
|
40
40
|
end
|
41
41
|
|
42
|
+
def integer_literal_token(value)
|
43
|
+
Dhaka::Token.new(ArithmeticPrecedenceGrammar.symbol_for_name('n'), value)
|
44
|
+
end
|
45
|
+
|
42
46
|
end
|
43
47
|
|
@@ -13,7 +13,7 @@ class ArithmeticTokenizer < Dhaka::Tokenizer
|
|
13
13
|
|
14
14
|
all_characters = digits + parenths + operators + functions + arg_separator + whitespace
|
15
15
|
|
16
|
-
for_state
|
16
|
+
for_state Dhaka::TOKENIZER_IDLE_STATE do
|
17
17
|
for_characters(all_characters - (digits + whitespace)) do
|
18
18
|
tokens << Dhaka::Token.new(ArithmeticGrammar.symbol_for_name(curr_char), nil)
|
19
19
|
advance
|
@@ -29,15 +29,19 @@ class ArithmeticTokenizer < Dhaka::Tokenizer
|
|
29
29
|
|
30
30
|
for_state :get_integer_literal do
|
31
31
|
for_characters all_characters - digits do
|
32
|
-
tokens <<
|
33
|
-
switch_to
|
32
|
+
tokens << integer_literal_token(accumulator.to_i)
|
33
|
+
switch_to Dhaka::TOKENIZER_IDLE_STATE
|
34
34
|
end
|
35
35
|
for_characters digits do
|
36
36
|
self.accumulator += curr_char
|
37
37
|
advance
|
38
|
-
tokens <<
|
38
|
+
tokens << integer_literal_token(accumulator.to_i) unless curr_char
|
39
39
|
end
|
40
40
|
end
|
41
41
|
|
42
|
+
def integer_literal_token(value)
|
43
|
+
Dhaka::Token.new(ArithmeticGrammar.symbol_for_name('n'), value)
|
44
|
+
end
|
45
|
+
|
42
46
|
end
|
43
47
|
|
data/test/bracket_tokenizer.rb
CHANGED
@@ -6,7 +6,7 @@ class BracketTokenizer < Dhaka::Tokenizer
|
|
6
6
|
|
7
7
|
all_characters = ['(', '[', '{', 'B', '}', ']', ')']
|
8
8
|
|
9
|
-
for_state
|
9
|
+
for_state Dhaka::TOKENIZER_IDLE_STATE do
|
10
10
|
for_characters(all_characters) do
|
11
11
|
tokens << Dhaka::Token.new(BracketGrammar.symbol_for_name(curr_char), nil)
|
12
12
|
advance
|
data/test/parser_test.rb
CHANGED
@@ -42,12 +42,17 @@ class ParserTest < Test::Unit::TestCase
|
|
42
42
|
'E ::= E -> - T [)-]'])
|
43
43
|
expected_states[10] = Set.new(['T ::= ( E ) -> [#)-]'])
|
44
44
|
expected_states[11] = Set.new(['_Start_ ::= S # -> [_End_]'])
|
45
|
-
actual_states = Set.new(@parser.states.collect {|state| Set.new(state.items.values.collect {|item| item.to_s})})
|
45
|
+
actual_states = Set.new(@parser.send('states').collect {|state| Set.new(state.items.values.collect {|item| item.to_s})})
|
46
46
|
#write_parser(@parser)
|
47
47
|
expected_states.values.each do |state|
|
48
48
|
assert set_finder(state, actual_states), "expected #{state.to_a}"
|
49
49
|
end
|
50
50
|
end
|
51
|
+
|
52
|
+
def test_parser_can_be_exported_to_dot_format
|
53
|
+
dot_representation = @parser.to_dot
|
54
|
+
end
|
55
|
+
|
51
56
|
def build_tokens(token_symbol_names, grammar)
|
52
57
|
token_symbol_names.collect {|symbol_name| Dhaka::Token.new(grammar.symbol_for_name(symbol_name), nil)}
|
53
58
|
end
|
@@ -71,6 +76,11 @@ class ParserTest < Test::Unit::TestCase
|
|
71
76
|
"start"], syntax_tree.linearize
|
72
77
|
end
|
73
78
|
|
79
|
+
def test_syntax_trees_can_be_exported_to_dot_format
|
80
|
+
syntax_tree = @parser.parse(build_tokens(['(','n','-','(','n','-','n',')',')','-','n','#'], @grammar)).syntax_tree
|
81
|
+
syntax_tree.to_dot
|
82
|
+
end
|
83
|
+
|
74
84
|
def get_linearized_parse_result(input, parser)
|
75
85
|
parser.parse(build_tokens(input, parser.grammar)).syntax_tree.linearize
|
76
86
|
end
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: dhaka
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.0.5
|
7
|
-
date:
|
6
|
+
version: 0.0.6
|
7
|
+
date: 2007-01-07 00:00:00 -05:00
|
8
8
|
summary: An LALR1 parser generator written in Ruby
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -15,7 +15,7 @@ description:
|
|
15
15
|
autorequire: dhaka
|
16
16
|
default_executable:
|
17
17
|
bindir: bin
|
18
|
-
has_rdoc:
|
18
|
+
has_rdoc: true
|
19
19
|
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
20
|
requirements:
|
21
21
|
- - ">"
|