dhaka 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/dhaka.rb +6 -2
- data/lib/evaluator/evaluator.rb +32 -19
- data/lib/grammar/closure_hash.rb +2 -1
- data/lib/grammar/grammar.rb +52 -25
- data/lib/grammar/grammar_symbol.rb +4 -0
- data/lib/grammar/precedence.rb +1 -1
- data/lib/parser/action.rb +4 -3
- data/lib/parser/channel.rb +4 -3
- data/lib/parser/compiled_parser.rb +2 -0
- data/lib/parser/item.rb +2 -1
- data/lib/parser/parse_result.rb +13 -8
- data/lib/parser/parse_tree.rb +22 -16
- data/lib/parser/parser.rb +29 -27
- data/lib/parser/parser_methods.rb +2 -0
- data/lib/parser/parser_run.rb +1 -1
- data/lib/parser/parser_state.rb +2 -2
- data/lib/parser/token.rb +2 -0
- data/lib/tokenizer/tokenizer.rb +52 -33
- data/test/arithmetic_precedence_tokenizer.rb +8 -4
- data/test/arithmetic_tokenizer.rb +8 -4
- data/test/bracket_tokenizer.rb +1 -1
- data/test/parser_test.rb +11 -1
- metadata +3 -3
data/lib/dhaka.rb
CHANGED
@@ -21,6 +21,12 @@
|
|
21
21
|
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
22
|
#++
|
23
23
|
|
24
|
+
# An introduction to Dhaka and annotated examples can be found at the project homepage http://dhaka.rubyforge.org
|
25
|
+
#
|
26
|
+
# Further examples can be found in the test suites included with the gem.
|
27
|
+
module Dhaka
|
28
|
+
end
|
29
|
+
|
24
30
|
require File.dirname(__FILE__)+'/grammar/grammar_symbol'
|
25
31
|
require File.dirname(__FILE__)+'/grammar/production'
|
26
32
|
require File.dirname(__FILE__)+'/grammar/closure_hash'
|
@@ -41,5 +47,3 @@ require File.dirname(__FILE__)+'/parser/compiled_parser'
|
|
41
47
|
|
42
48
|
require File.dirname(__FILE__)+'/tokenizer/tokenizer'
|
43
49
|
require File.dirname(__FILE__)+'/evaluator/evaluator'
|
44
|
-
|
45
|
-
|
data/lib/evaluator/evaluator.rb
CHANGED
@@ -1,21 +1,43 @@
|
|
1
1
|
module Dhaka
|
2
|
+
|
3
|
+
# This is the abstract base evaluator class. It is not directly instantiated.
|
4
|
+
# When defining an evaluator for a specific grammar, we subclass it. e.g. for FooGrammar
|
5
|
+
# we create a FooEvaluator that subclasses Evaluator. Note that FooEvaluator may not
|
6
|
+
# be further subclassed.
|
7
|
+
#
|
8
|
+
# An evaluation rule for a given production named +bar+ is defined by calling +for_bar+ with
|
9
|
+
# a block that performs the evaluation. For detailed examples, see the evaluators in the
|
10
|
+
# test suite.
|
11
|
+
|
2
12
|
class Evaluator
|
3
13
|
|
14
|
+
# Instantiates a new evaluator with the syntax tree of a parsed expression. Only subclasses
|
15
|
+
# of Evaluator are directly instantiated.
|
4
16
|
def initialize(syntax_tree)
|
5
17
|
@syntax_tree = syntax_tree
|
6
18
|
@node_stack = []
|
7
19
|
end
|
8
20
|
|
21
|
+
# Returns the evaluation result.
|
9
22
|
def result
|
10
23
|
evaluate(@syntax_tree)
|
11
24
|
end
|
12
25
|
|
26
|
+
private
|
27
|
+
|
13
28
|
def child_nodes
|
14
29
|
@node_stack[-1]
|
15
30
|
end
|
16
|
-
|
17
|
-
private
|
18
31
|
|
32
|
+
def evaluate node
|
33
|
+
return node if (ParseTreeLeafNode === node)
|
34
|
+
@node_stack << node.child_nodes.collect {|child_node| evaluate(child_node)}
|
35
|
+
proc = self.class.actions[node.production.name]
|
36
|
+
result = self.instance_eval(&proc)
|
37
|
+
@node_stack.pop
|
38
|
+
result
|
39
|
+
end
|
40
|
+
|
19
41
|
def self.inherited(evaluator)
|
20
42
|
class << evaluator
|
21
43
|
attr_accessor :grammar, :actions
|
@@ -29,34 +51,25 @@ module Dhaka
|
|
29
51
|
check_definitions
|
30
52
|
end
|
31
53
|
|
54
|
+
def self.method_missing(method_name, &blk)
|
55
|
+
if method_name.to_s =~ /^for_*/
|
56
|
+
rule_name = method_name.to_s[4..-1]
|
57
|
+
self.for_rule_named(rule_name, &blk)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
32
61
|
def self.check_definitions
|
33
62
|
non_trivial_productions_with_rules_undefined = self.grammar.productions.select {|production| production.expansion.size != 1}.collect {|production| production.name} - self.actions.keys
|
34
63
|
raise EvaluatorDefinitionError.new(non_trivial_productions_with_rules_undefined) unless non_trivial_productions_with_rules_undefined.empty?
|
35
64
|
end
|
36
65
|
|
37
|
-
def evaluate node
|
38
|
-
return node if (ParseTreeLeafNode === node)
|
39
|
-
@node_stack << node.child_nodes.collect {|child_node| evaluate(child_node)}
|
40
|
-
proc = self.class.actions[node.production.name]
|
41
|
-
result = self.instance_eval(&proc)
|
42
|
-
@node_stack.pop
|
43
|
-
result
|
44
|
-
end
|
45
|
-
|
46
66
|
def self.for_rule_named(name, &blk)
|
47
67
|
self.actions[name] = blk
|
48
68
|
end
|
49
|
-
|
50
|
-
def self.method_missing(method_name, &blk)
|
51
|
-
if method_name.to_s =~ /^for_*/
|
52
|
-
rule_name = method_name.to_s[4..-1]
|
53
|
-
self.for_rule_named(rule_name, &blk)
|
54
|
-
end
|
55
|
-
end
|
56
69
|
|
57
70
|
end
|
58
71
|
|
59
|
-
class EvaluatorDefinitionError < StandardError
|
72
|
+
class EvaluatorDefinitionError < StandardError #:nodoc:
|
60
73
|
def initialize(non_trivial_productions_with_rules_undefined)
|
61
74
|
@non_trivial_productions_with_rules_undefined = non_trivial_productions_with_rules_undefined
|
62
75
|
end
|
data/lib/grammar/closure_hash.rb
CHANGED
data/lib/grammar/grammar.rb
CHANGED
@@ -2,15 +2,24 @@
|
|
2
2
|
require 'set'
|
3
3
|
module Dhaka
|
4
4
|
|
5
|
+
# Reserved name for the start symbol for all grammars.
|
5
6
|
START_SYMBOL_NAME = "_Start_"
|
6
|
-
END_SYMBOL_NAME = "_End_"
|
7
|
+
END_SYMBOL_NAME = "_End_" #:nodoc:
|
7
8
|
|
9
|
+
# Productions for specific grammar symbols are defined in the context of this class.
|
8
10
|
class ProductionBuilder
|
11
|
+
|
12
|
+
# +symbol+ is the grammar symbol that productions are being defined for.
|
9
13
|
def initialize(grammar, symbol)
|
10
14
|
@grammar = grammar
|
11
15
|
@symbol = symbol
|
12
16
|
end
|
13
17
|
|
18
|
+
# Creates a new production for +symbol+ with an expansion of +expansion+. The options hash can include
|
19
|
+
# a directive <tt>:prec</tt>, the value of which is a grammar symbol name. The precedence of the production is then
|
20
|
+
# set to the precedence of the grammar symbol corresponding to that name.
|
21
|
+
#
|
22
|
+
# See the arithmetic precedence grammar in the test suites for an example.
|
14
23
|
def method_missing(production_name, expansion, options = {})
|
15
24
|
expansion_symbols = expansion.collect {|name| @grammar.symbols[name]}
|
16
25
|
if precedence_symbol_name = options[:prec]
|
@@ -25,12 +34,23 @@ module Dhaka
|
|
25
34
|
end
|
26
35
|
end
|
27
36
|
|
37
|
+
# The precedence builder defines three methods, +left+, +right+ and +nonassoc+. These accept arrays of grammar
|
38
|
+
# symbols all of which have the same precedence level and associativity. This works almost exactly like Yacc.
|
39
|
+
#
|
40
|
+
# See the arithmetic precedence grammar in the test suites for an example.
|
28
41
|
class PrecedenceBuilder
|
29
|
-
def initialize(grammar)
|
42
|
+
def initialize(grammar) #:nodoc:
|
30
43
|
@grammar = grammar
|
31
44
|
@precedence_level = 0
|
32
45
|
end
|
33
|
-
|
46
|
+
[:left, :right, :nonassoc].each do |associativity|
|
47
|
+
define_method(associativity) do |symbols|
|
48
|
+
assign_precedences associativity, symbols
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
private
|
53
|
+
def assign_precedences(associativity, symbol_names)
|
34
54
|
symbol_names.each do |symbol_name|
|
35
55
|
symbol = @grammar.symbols[symbol_name]
|
36
56
|
symbol.precedence = Precedence.new(@precedence_level, associativity)
|
@@ -39,8 +59,34 @@ module Dhaka
|
|
39
59
|
end
|
40
60
|
end
|
41
61
|
|
62
|
+
# This class is subclassed when specifying a grammar. Note that subclasses of this class may not be further subclassed.
|
42
63
|
class Grammar
|
43
64
|
|
65
|
+
# Used for defining the productions for the symbol with name +symbol+. The block +blk+ is
|
66
|
+
# evaluated in the context of a ProductionBuilder.
|
67
|
+
def self.for_symbol symbol, &blk
|
68
|
+
symbol = symbols[symbol]
|
69
|
+
symbol.non_terminal = true
|
70
|
+
ProductionBuilder.new(self, symbol).instance_eval(&blk)
|
71
|
+
end
|
72
|
+
|
73
|
+
# Used for defining the precedences and associativities of symbols. The block +blk+ is
|
74
|
+
# evaluated in the context of a PrecedenceBuilder.
|
75
|
+
def self.precedences &blk
|
76
|
+
PrecedenceBuilder.new(self).instance_eval(&blk)
|
77
|
+
end
|
78
|
+
|
79
|
+
# Returns the grammar symbol identified by +name+
|
80
|
+
def self.symbol_for_name(name)
|
81
|
+
if symbols.has_key? name
|
82
|
+
symbols[name]
|
83
|
+
else
|
84
|
+
raise "No symbol with name #{name} found"
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
private
|
89
|
+
|
44
90
|
def self.inherited(grammar)
|
45
91
|
class << grammar
|
46
92
|
attr_accessor :symbols, :productions_by_symbol, :productions_by_name, :start_symbol, :end_symbol, :__first_cache
|
@@ -53,13 +99,10 @@ module Dhaka
|
|
53
99
|
grammar.__first_cache = {}
|
54
100
|
end
|
55
101
|
|
56
|
-
def self.for_symbol symbol, &blk
|
57
|
-
symbol = symbols[symbol]
58
|
-
symbol.non_terminal = true
|
59
|
-
ProductionBuilder.new(self, symbol).instance_eval(&blk)
|
102
|
+
def self.productions_for_symbol(symbol)
|
103
|
+
productions_by_symbol[symbol]
|
60
104
|
end
|
61
105
|
|
62
|
-
|
63
106
|
def self.productions
|
64
107
|
productions_by_name.values
|
65
108
|
end
|
@@ -68,17 +111,6 @@ module Dhaka
|
|
68
111
|
productions_by_name[name]
|
69
112
|
end
|
70
113
|
|
71
|
-
def self.productions_for_symbol(symbol)
|
72
|
-
productions_by_symbol[symbol]
|
73
|
-
end
|
74
|
-
|
75
|
-
def self.symbol_for_name(name)
|
76
|
-
if symbols.has_key? name
|
77
|
-
symbols[name]
|
78
|
-
else
|
79
|
-
raise "No symbol with name #{name} found"
|
80
|
-
end
|
81
|
-
end
|
82
114
|
|
83
115
|
def self.terminal_symbols
|
84
116
|
symbols.values.select {|symbol| symbol.terminal}
|
@@ -102,10 +134,6 @@ module Dhaka
|
|
102
134
|
return channels, result
|
103
135
|
end
|
104
136
|
|
105
|
-
def self.precedences &blk
|
106
|
-
PrecedenceBuilder.new(self).instance_eval(&blk)
|
107
|
-
end
|
108
|
-
|
109
137
|
def self.first(given_symbol)
|
110
138
|
cached_result = self.__first_cache[given_symbol]
|
111
139
|
return cached_result if cached_result
|
@@ -146,7 +174,6 @@ module Dhaka
|
|
146
174
|
end
|
147
175
|
return closure_hash
|
148
176
|
end
|
149
|
-
|
150
|
-
|
151
177
|
end
|
178
|
+
|
152
179
|
end
|
data/lib/grammar/grammar_symbol.rb
CHANGED
@@ -1,5 +1,9 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
module Dhaka
|
3
|
+
# Each grammar symbol is uniquely identified by a string name. The name of a symbol can
|
4
|
+
# be anything and need not correspond to its character representation. For example, an ampersand in the
|
5
|
+
# character stream could be tokenized as a symbol with a name 'whatever'. In general, it's best to choose
|
6
|
+
# symbol names that are descriptive.
|
3
7
|
class GrammarSymbol
|
4
8
|
attr_reader :name
|
5
9
|
attr_accessor :non_terminal, :nullable, :precedence, :associativity
|
data/lib/grammar/precedence.rb
CHANGED
data/lib/parser/action.rb
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
module Dhaka
|
2
|
-
|
2
|
+
# Encapsulates code for Parser actions.
|
3
|
+
class Action #:nodoc:
|
3
4
|
attr_reader :action_code
|
4
5
|
end
|
5
6
|
|
6
|
-
class ShiftAction < Action
|
7
|
+
class ShiftAction < Action #:nodoc:
|
7
8
|
attr_reader :destination_state
|
8
9
|
def initialize destination_state
|
9
10
|
@destination_state = destination_state
|
@@ -20,7 +21,7 @@ module Dhaka
|
|
20
21
|
end
|
21
22
|
end
|
22
23
|
|
23
|
-
class ReduceAction < Action
|
24
|
+
class ReduceAction < Action #:nodoc:
|
24
25
|
attr_reader :production
|
25
26
|
def initialize(production)
|
26
27
|
@production = production
|
data/lib/parser/channel.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
module Dhaka
|
3
|
-
|
3
|
+
# Represents channels for pumping of lookaheads between items
|
4
|
+
class Channel #:nodoc:
|
4
5
|
attr_reader :start_item, :end_item
|
5
6
|
def initialize(grammar, start_item, end_item)
|
6
7
|
@grammar = grammar
|
@@ -23,7 +24,7 @@ module Dhaka
|
|
23
24
|
end
|
24
25
|
end
|
25
26
|
|
26
|
-
class SpontaneousChannel < Channel
|
27
|
+
class SpontaneousChannel < Channel #:nodoc:
|
27
28
|
def to_s
|
28
29
|
"Spontaneous " + super.to_s
|
29
30
|
end
|
@@ -40,7 +41,7 @@ module Dhaka
|
|
40
41
|
end
|
41
42
|
end
|
42
43
|
|
43
|
-
class PassiveChannel < Channel
|
44
|
+
class PassiveChannel < Channel #:nodoc:
|
44
45
|
def to_s
|
45
46
|
"Passive " + super.to_s
|
46
47
|
end
|
data/lib/parser/item.rb
CHANGED
data/lib/parser/parse_result.rb
CHANGED
@@ -1,21 +1,26 @@
|
|
1
1
|
module Dhaka
|
2
|
+
# Returned on successful parsing of the input token stream.
|
2
3
|
class ParseSuccessResult
|
3
|
-
|
4
|
-
|
4
|
+
# Contains the parse result.
|
5
|
+
attr_accessor :syntax_tree
|
6
|
+
def initialize(syntax_tree) #:nodoc:
|
5
7
|
@syntax_tree = syntax_tree
|
6
8
|
end
|
7
|
-
|
8
|
-
def has_error?
|
9
|
+
# This is false.
|
10
|
+
def has_error?
|
9
11
|
false
|
10
12
|
end
|
11
13
|
end
|
14
|
+
|
15
|
+
# Returned on unsuccessful parsing of the input token stream.
|
12
16
|
class ParseErrorResult
|
13
|
-
|
14
|
-
|
17
|
+
# The index of the token that caused the parse error.
|
18
|
+
attr_reader :bad_token_index
|
19
|
+
def initialize(bad_token_index) #:nodoc:
|
15
20
|
@bad_token_index = bad_token_index
|
16
21
|
end
|
17
|
-
|
18
|
-
def has_error?
|
22
|
+
# This is true.
|
23
|
+
def has_error?
|
19
24
|
true
|
20
25
|
end
|
21
26
|
end
|
data/lib/parser/parse_tree.rb
CHANGED
@@ -1,20 +1,18 @@
|
|
1
1
|
module Dhaka
|
2
|
-
|
2
|
+
# These are composite nodes of the syntax tree returned by the successful parsing of a token stream.
|
3
|
+
class ParseTreeCompositeNode
|
3
4
|
attr_reader :production, :child_nodes
|
4
|
-
def initialize(production)
|
5
|
+
def initialize(production) #:nodoc:
|
5
6
|
@production = production
|
6
7
|
@child_nodes = []
|
7
8
|
end
|
8
|
-
def linearize
|
9
|
+
def linearize #:nodoc:
|
9
10
|
child_nodes.collect {|child_node| child_node.linearize}.flatten + [production.name]
|
10
11
|
end
|
11
|
-
def to_s
|
12
|
+
def to_s #:nodoc:
|
12
13
|
"CompositeNode: #{production.symbol} --> [#{child_nodes.join(", ")}]"
|
13
14
|
end
|
14
|
-
def dot_name
15
|
-
"Node#{object_id}"
|
16
|
-
end
|
17
|
-
|
15
|
+
# Returns the dot representation of the syntax tree.
|
18
16
|
def to_dot
|
19
17
|
result = []
|
20
18
|
result << ["digraph x {", "node [fontsize=\"10\" shape=box size=\"5\"]"] if head_node?
|
@@ -28,31 +26,39 @@ module Dhaka
|
|
28
26
|
result.join("\n")
|
29
27
|
end
|
30
28
|
|
31
|
-
def head_node?
|
29
|
+
def head_node? #:nodoc:
|
32
30
|
production.symbol.name == START_SYMBOL_NAME
|
33
31
|
end
|
32
|
+
|
33
|
+
def dot_name #:nodoc:
|
34
|
+
"Node#{object_id}"
|
35
|
+
end
|
36
|
+
|
34
37
|
end
|
35
38
|
|
39
|
+
# These are leaf nodes of syntax trees. They contain tokens.
|
36
40
|
class ParseTreeLeafNode
|
37
41
|
attr_reader :token
|
38
|
-
def initialize(token)
|
42
|
+
def initialize(token) #:nodoc:
|
39
43
|
@token = token
|
40
44
|
end
|
41
|
-
def linearize
|
45
|
+
def linearize #:nodoc:
|
42
46
|
[]
|
43
47
|
end
|
44
|
-
def to_s
|
48
|
+
def to_s #:nodoc:
|
45
49
|
"LeafNode: #{token}"
|
46
50
|
end
|
47
|
-
def dot_name
48
|
-
"Node#{object_id}"
|
49
|
-
end
|
51
|
+
# Returns the dot representation of this node.
|
50
52
|
def to_dot
|
51
53
|
label = "#{token}#{' : '+token.value.to_s if token.value}"
|
52
54
|
"#{dot_name} [label=\"#{label}\"]"
|
53
55
|
end
|
54
|
-
def head_node?
|
56
|
+
def head_node? #:nodoc:
|
55
57
|
false
|
56
58
|
end
|
59
|
+
|
60
|
+
def dot_name #:nodoc:
|
61
|
+
"Node#{object_id}"
|
62
|
+
end
|
57
63
|
end
|
58
64
|
end
|
data/lib/parser/parser.rb
CHANGED
@@ -7,6 +7,9 @@ module Dhaka
|
|
7
7
|
include ParserMethods
|
8
8
|
attr_reader :grammar, :start_state
|
9
9
|
|
10
|
+
# Creates a new parser from the given grammar. Messages are logged by default to STDOUT
|
11
|
+
# and the log level is WARN. Shift-reduce conflicts are reported at WARN and reduce-reduce conflicts
|
12
|
+
# at ERROR. You may pass in your own logger. Logging at DEBUG shows a lot of progress output.
|
10
13
|
def initialize(grammar, logger = nil)
|
11
14
|
if logger
|
12
15
|
@logger = logger
|
@@ -34,20 +37,7 @@ module Dhaka
|
|
34
37
|
initialize_states
|
35
38
|
end
|
36
39
|
|
37
|
-
def initialize_states
38
|
-
start_productions = @grammar.productions_for_symbol(@grammar.start_symbol)
|
39
|
-
raise NoStartProductionsError.new(@grammar) if start_productions.empty?
|
40
|
-
start_items = ItemSet.new(start_productions.collect {|production| Item.new(production, 0)})
|
41
|
-
start_items.each {|start_item| start_item.lookaheadset << @grammar.end_symbol}
|
42
|
-
@start_state = @states[start_items]
|
43
|
-
@logger.debug("Pumping #{@channels.size} channels...")
|
44
|
-
pump_channels
|
45
|
-
@logger.debug("Generating shift actions...")
|
46
|
-
generate_shift_actions
|
47
|
-
@logger.debug("Generating reduce actions...")
|
48
|
-
generate_reduce_actions
|
49
|
-
end
|
50
|
-
|
40
|
+
# Returns the Ruby source of the generated parser compiled as +parser_class_name+. This can be written out to a file.
|
51
41
|
def compile_to_ruby_source_as parser_class_name
|
52
42
|
result = "class #{parser_class_name} < Dhaka::CompiledParser\n\n"
|
53
43
|
result << " self.grammar = #{@grammar.name}\n\n"
|
@@ -59,6 +49,9 @@ module Dhaka
|
|
59
49
|
result
|
60
50
|
end
|
61
51
|
|
52
|
+
# Returns the dot representation of the parser. If <tt>:hide_lookaheads</tt> is set to true in the
|
53
|
+
# options hash, lookaheads are not written out to the parser states, which is helpful when there are dozens
|
54
|
+
# of lookahead symbols for every item in every state.
|
62
55
|
def to_dot(options = {})
|
63
56
|
result = ["digraph x {", "node [fontsize=\"10\" shape=box size=\"5\"]"]
|
64
57
|
result += states.collect { |state| state.to_dot(options) }
|
@@ -70,10 +63,28 @@ module Dhaka
|
|
70
63
|
result << ['}']
|
71
64
|
result.join("\n")
|
72
65
|
end
|
66
|
+
|
67
|
+
private :start_state
|
68
|
+
private
|
69
|
+
|
73
70
|
def states
|
74
71
|
@states.values
|
75
72
|
end
|
76
|
-
|
73
|
+
|
74
|
+
def initialize_states
|
75
|
+
start_productions = @grammar.productions_for_symbol(@grammar.start_symbol)
|
76
|
+
raise NoStartProductionsError.new(@grammar) if start_productions.empty?
|
77
|
+
start_items = ItemSet.new(start_productions.collect {|production| Item.new(production, 0)})
|
78
|
+
start_items.each {|start_item| start_item.lookaheadset << @grammar.end_symbol}
|
79
|
+
@start_state = @states[start_items]
|
80
|
+
@logger.debug("Pumping #{@channels.size} channels...")
|
81
|
+
pump_channels
|
82
|
+
@logger.debug("Generating shift actions...")
|
83
|
+
generate_shift_actions
|
84
|
+
@logger.debug("Generating reduce actions...")
|
85
|
+
generate_reduce_actions
|
86
|
+
end
|
87
|
+
|
77
88
|
def generate_shift_actions
|
78
89
|
@states.values.each do |state|
|
79
90
|
@transitions[state].keys.each { |symbol|
|
@@ -157,21 +168,12 @@ module Dhaka
|
|
157
168
|
|
158
169
|
end
|
159
170
|
|
160
|
-
|
161
|
-
class ParserReduceReduceConflictError < StandardError
|
162
|
-
def initialize(message)
|
163
|
-
@message = message
|
164
|
-
end
|
165
|
-
def to_s
|
166
|
-
@message
|
167
|
-
end
|
168
|
-
end
|
169
|
-
|
171
|
+
# Raised when trying to create a Parser for a grammar that has no productions for the start symbol
|
170
172
|
class NoStartProductionsError < StandardError
|
171
|
-
def initialize(grammar)
|
173
|
+
def initialize(grammar) #:nodoc:
|
172
174
|
@grammar = grammar
|
173
175
|
end
|
174
|
-
def to_s
|
176
|
+
def to_s #:nodoc:
|
175
177
|
"No start productions defined for #{@grammar.name}"
|
176
178
|
end
|
177
179
|
end
|
data/lib/parser/parser_methods.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
module Dhaka
|
3
|
+
# This module is included both in Parser and CompiledParser.
|
3
4
|
module ParserMethods
|
5
|
+
# +token_stream+ is an Enumerable of Token-s. Returns either a ParseSuccessResult or a ParseErrorResult.
|
4
6
|
def parse token_stream
|
5
7
|
parser_run = ParserRun.new(grammar, start_state, token_stream)
|
6
8
|
parser_run.run
|
data/lib/parser/parser_run.rb
CHANGED
data/lib/parser/parser_state.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
require 'set'
|
3
3
|
module Dhaka
|
4
|
-
class ParserState
|
4
|
+
class ParserState #:nodoc:
|
5
5
|
|
6
6
|
attr_accessor :items, :actions, :id
|
7
7
|
|
@@ -55,7 +55,7 @@ module Dhaka
|
|
55
55
|
|
56
56
|
end
|
57
57
|
|
58
|
-
class ItemSet < Set
|
58
|
+
class ItemSet < Set #:nodoc:
|
59
59
|
def hash
|
60
60
|
self.collect{|item| item.hash}.inject{|result, hashcode| result ^ hashcode}
|
61
61
|
end
|
data/lib/parser/token.rb
CHANGED
data/lib/tokenizer/tokenizer.rb
CHANGED
@@ -1,4 +1,10 @@
|
|
1
1
|
module Dhaka
|
2
|
+
|
3
|
+
# Reserved constant used to identify the idle state of the tokenizer.
|
4
|
+
TOKENIZER_IDLE_STATE = :idle_state
|
5
|
+
|
6
|
+
# Raised when the tokenizer encounters a character that has no corresponding action in
|
7
|
+
# its current state.
|
2
8
|
class UnrecognizedInputCharacterException < StandardError
|
3
9
|
attr_reader :input, :char_index
|
4
10
|
def initialize(input, char_index)
|
@@ -10,6 +16,8 @@ module Dhaka
|
|
10
16
|
end
|
11
17
|
end
|
12
18
|
|
19
|
+
# A tokenizer state encapsulates actions that should be performed upon
|
20
|
+
# encountering each permissible character for that state.
|
13
21
|
class TokenizerState
|
14
22
|
attr_reader :actions
|
15
23
|
|
@@ -17,72 +25,83 @@ module Dhaka
|
|
17
25
|
@actions = {}
|
18
26
|
end
|
19
27
|
|
28
|
+
# Define the action (+blk+) to be performed when encountering any of +characters+ in the token stream.
|
20
29
|
def for_characters(characters, &blk)
|
21
30
|
characters.each do |character|
|
22
31
|
actions[character] = blk
|
23
32
|
end
|
24
33
|
end
|
25
34
|
|
26
|
-
|
27
|
-
actions[character[0]] = blk
|
28
|
-
end
|
35
|
+
alias for_character for_characters
|
29
36
|
|
30
|
-
def to_s
|
37
|
+
def to_s #:nodoc:
|
31
38
|
actions.inspect
|
32
39
|
end
|
33
40
|
|
34
41
|
end
|
35
42
|
|
43
|
+
# This class contains a DSL for specifying tokenizers. Subclass it to implement tokenizers for specific grammars.
|
44
|
+
# Subclasses of this class may not be further subclassed.
|
45
|
+
#
|
46
|
+
# Tokenizers are state machines that are specified pretty much by hand. Each state of a tokenizer is identified
|
47
|
+
# by a Ruby symbol. The constant Dhaka::TOKENIZER_IDLE_STATE is reserved for the idle state of the tokenizer (the one
|
48
|
+
# that it starts in).
|
36
49
|
class Tokenizer
|
37
|
-
|
38
|
-
|
39
|
-
class << tokenizer
|
40
|
-
attr_accessor :states
|
41
|
-
end
|
42
|
-
tokenizer.states = Hash.new {|hash, key| hash[key] = TokenizerState.new}
|
43
|
-
end
|
44
|
-
|
50
|
+
|
51
|
+
# Define the action for the state named +state_name+.
|
45
52
|
def self.for_state(state_name, &blk)
|
46
53
|
states[state_name].instance_eval(&blk)
|
47
54
|
end
|
48
|
-
|
55
|
+
|
56
|
+
# Tokenizes a string +input+ and returns an array of Token-s.
|
49
57
|
def self.tokenize(input)
|
50
|
-
|
58
|
+
self.new(input).run
|
51
59
|
end
|
52
|
-
|
53
|
-
|
54
|
-
class TokenizerRun
|
55
|
-
|
60
|
+
|
61
|
+
# A slot that can be used to accumulate characters when processing multi-character tokens.
|
56
62
|
attr_accessor :accumulator
|
63
|
+
# The tokens shifted so far.
|
57
64
|
attr_reader :tokens
|
58
|
-
|
59
|
-
|
65
|
+
|
66
|
+
def initialize(input) #:nodoc:
|
60
67
|
@input = input
|
61
|
-
@current_state =
|
68
|
+
@current_state = self.class.states[TOKENIZER_IDLE_STATE]
|
62
69
|
@curr_char_index = 0
|
63
70
|
@tokens = []
|
64
71
|
end
|
65
72
|
|
66
|
-
|
67
|
-
while curr_char
|
68
|
-
blk = @current_state.actions[curr_char]
|
69
|
-
raise UnrecognizedInputCharacterException.new(@input, @curr_char_index) unless blk
|
70
|
-
instance_eval(&blk)
|
71
|
-
end
|
72
|
-
tokens
|
73
|
-
end
|
74
|
-
|
73
|
+
# The character currently being processed.
|
75
74
|
def curr_char
|
76
75
|
@input[@curr_char_index] and @input[@curr_char_index].chr
|
77
76
|
end
|
78
|
-
|
77
|
+
|
78
|
+
# Advance to the next character.
|
79
79
|
def advance
|
80
80
|
@curr_char_index += 1
|
81
81
|
end
|
82
|
-
|
82
|
+
|
83
|
+
# Change the active state of the tokenizer to the state identified by the symbol +state_name+.
|
83
84
|
def switch_to state_name
|
84
|
-
@current_state =
|
85
|
+
@current_state = self.class.states[state_name]
|
86
|
+
end
|
87
|
+
|
88
|
+
def run #:nodoc:
|
89
|
+
while curr_char
|
90
|
+
blk = @current_state.actions[curr_char]
|
91
|
+
raise UnrecognizedInputCharacterException.new(@input, @curr_char_index) unless blk
|
92
|
+
instance_eval(&blk)
|
93
|
+
end
|
94
|
+
tokens
|
95
|
+
end
|
96
|
+
|
97
|
+
private
|
98
|
+
def self.inherited(tokenizer)
|
99
|
+
class << tokenizer
|
100
|
+
attr_accessor :states
|
101
|
+
end
|
102
|
+
tokenizer.states = Hash.new {|hash, key| hash[key] = TokenizerState.new}
|
85
103
|
end
|
86
104
|
|
87
105
|
end
|
106
|
+
|
88
107
|
end
|
data/test/arithmetic_precedence_tokenizer.rb
CHANGED
@@ -13,7 +13,7 @@ class ArithmeticPrecedenceTokenizer < Dhaka::Tokenizer
|
|
13
13
|
|
14
14
|
all_characters = digits + parenths + operators + functions + arg_separator + whitespace
|
15
15
|
|
16
|
-
for_state
|
16
|
+
for_state Dhaka::TOKENIZER_IDLE_STATE do
|
17
17
|
for_characters(all_characters - (digits + whitespace)) do
|
18
18
|
tokens << Dhaka::Token.new(ArithmeticPrecedenceGrammar.symbol_for_name(curr_char), nil)
|
19
19
|
advance
|
@@ -29,15 +29,19 @@ class ArithmeticPrecedenceTokenizer < Dhaka::Tokenizer
|
|
29
29
|
|
30
30
|
for_state :get_integer_literal do
|
31
31
|
for_characters all_characters - digits do
|
32
|
-
tokens <<
|
33
|
-
switch_to
|
32
|
+
tokens << integer_literal_token(accumulator.to_i)
|
33
|
+
switch_to Dhaka::TOKENIZER_IDLE_STATE
|
34
34
|
end
|
35
35
|
for_characters digits do
|
36
36
|
self.accumulator += curr_char
|
37
37
|
advance
|
38
|
-
tokens <<
|
38
|
+
tokens << integer_literal_token(accumulator.to_i) unless curr_char
|
39
39
|
end
|
40
40
|
end
|
41
41
|
|
42
|
+
def integer_literal_token(value)
|
43
|
+
Dhaka::Token.new(ArithmeticPrecedenceGrammar.symbol_for_name('n'), value)
|
44
|
+
end
|
45
|
+
|
42
46
|
end
|
43
47
|
|
data/test/arithmetic_tokenizer.rb
CHANGED
@@ -13,7 +13,7 @@ class ArithmeticTokenizer < Dhaka::Tokenizer
|
|
13
13
|
|
14
14
|
all_characters = digits + parenths + operators + functions + arg_separator + whitespace
|
15
15
|
|
16
|
-
for_state
|
16
|
+
for_state Dhaka::TOKENIZER_IDLE_STATE do
|
17
17
|
for_characters(all_characters - (digits + whitespace)) do
|
18
18
|
tokens << Dhaka::Token.new(ArithmeticGrammar.symbol_for_name(curr_char), nil)
|
19
19
|
advance
|
@@ -29,15 +29,19 @@ class ArithmeticTokenizer < Dhaka::Tokenizer
|
|
29
29
|
|
30
30
|
for_state :get_integer_literal do
|
31
31
|
for_characters all_characters - digits do
|
32
|
-
tokens <<
|
33
|
-
switch_to
|
32
|
+
tokens << integer_literal_token(accumulator.to_i)
|
33
|
+
switch_to Dhaka::TOKENIZER_IDLE_STATE
|
34
34
|
end
|
35
35
|
for_characters digits do
|
36
36
|
self.accumulator += curr_char
|
37
37
|
advance
|
38
|
-
tokens <<
|
38
|
+
tokens << integer_literal_token(accumulator.to_i) unless curr_char
|
39
39
|
end
|
40
40
|
end
|
41
41
|
|
42
|
+
def integer_literal_token(value)
|
43
|
+
Dhaka::Token.new(ArithmeticGrammar.symbol_for_name('n'), value)
|
44
|
+
end
|
45
|
+
|
42
46
|
end
|
43
47
|
|
data/test/bracket_tokenizer.rb
CHANGED
@@ -6,7 +6,7 @@ class BracketTokenizer < Dhaka::Tokenizer
|
|
6
6
|
|
7
7
|
all_characters = ['(', '[', '{', 'B', '}', ']', ')']
|
8
8
|
|
9
|
-
for_state
|
9
|
+
for_state Dhaka::TOKENIZER_IDLE_STATE do
|
10
10
|
for_characters(all_characters) do
|
11
11
|
tokens << Dhaka::Token.new(BracketGrammar.symbol_for_name(curr_char), nil)
|
12
12
|
advance
|
data/test/parser_test.rb
CHANGED
@@ -42,12 +42,17 @@ class ParserTest < Test::Unit::TestCase
|
|
42
42
|
'E ::= E -> - T [)-]'])
|
43
43
|
expected_states[10] = Set.new(['T ::= ( E ) -> [#)-]'])
|
44
44
|
expected_states[11] = Set.new(['_Start_ ::= S # -> [_End_]'])
|
45
|
-
actual_states = Set.new(@parser.states.collect {|state| Set.new(state.items.values.collect {|item| item.to_s})})
|
45
|
+
actual_states = Set.new(@parser.send('states').collect {|state| Set.new(state.items.values.collect {|item| item.to_s})})
|
46
46
|
#write_parser(@parser)
|
47
47
|
expected_states.values.each do |state|
|
48
48
|
assert set_finder(state, actual_states), "expected #{state.to_a}"
|
49
49
|
end
|
50
50
|
end
|
51
|
+
|
52
|
+
def test_parser_can_be_exported_to_dot_format
|
53
|
+
dot_representation = @parser.to_dot
|
54
|
+
end
|
55
|
+
|
51
56
|
def build_tokens(token_symbol_names, grammar)
|
52
57
|
token_symbol_names.collect {|symbol_name| Dhaka::Token.new(grammar.symbol_for_name(symbol_name), nil)}
|
53
58
|
end
|
@@ -71,6 +76,11 @@ class ParserTest < Test::Unit::TestCase
|
|
71
76
|
"start"], syntax_tree.linearize
|
72
77
|
end
|
73
78
|
|
79
|
+
def test_syntax_trees_can_be_exported_to_dot_format
|
80
|
+
syntax_tree = @parser.parse(build_tokens(['(','n','-','(','n','-','n',')',')','-','n','#'], @grammar)).syntax_tree
|
81
|
+
syntax_tree.to_dot
|
82
|
+
end
|
83
|
+
|
74
84
|
def get_linearized_parse_result(input, parser)
|
75
85
|
parser.parse(build_tokens(input, parser.grammar)).syntax_tree.linearize
|
76
86
|
end
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: dhaka
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.0.5
|
7
|
-
date:
|
6
|
+
version: 0.0.6
|
7
|
+
date: 2007-01-07 00:00:00 -05:00
|
8
8
|
summary: An LALR1 parser generator written in Ruby
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -15,7 +15,7 @@ description:
|
|
15
15
|
autorequire: dhaka
|
16
16
|
default_executable:
|
17
17
|
bindir: bin
|
18
|
-
has_rdoc:
|
18
|
+
has_rdoc: true
|
19
19
|
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
20
|
requirements:
|
21
21
|
- - ">"
|