dhaka 2.0.1 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. data/Rakefile +64 -0
  2. data/lib/dhaka.rb +12 -0
  3. data/lib/dot/dot.rb +29 -0
  4. data/lib/evaluator/evaluator.rb +35 -26
  5. data/lib/grammar/grammar.rb +42 -17
  6. data/lib/grammar/grammar_symbol.rb +4 -3
  7. data/lib/grammar/production.rb +9 -3
  8. data/lib/lexer/compiled_lexer.rb +46 -0
  9. data/lib/lexer/dfa.rb +71 -0
  10. data/lib/lexer/lexeme.rb +33 -0
  11. data/lib/lexer/lexer.rb +61 -0
  12. data/lib/lexer/lexer_run.rb +66 -0
  13. data/lib/lexer/regex_grammar.rb +368 -0
  14. data/lib/lexer/regex_parser.rb +1888 -0
  15. data/lib/lexer/regex_tokenizer.rb +14 -0
  16. data/lib/lexer/specification.rb +69 -0
  17. data/lib/lexer/state.rb +45 -0
  18. data/lib/lexer/state_machine.rb +37 -0
  19. data/lib/parser/action.rb +3 -3
  20. data/lib/parser/compiled_parser.rb +11 -3
  21. data/lib/parser/parse_result.rb +3 -5
  22. data/lib/parser/parse_tree.rb +6 -17
  23. data/lib/parser/parser.rb +15 -14
  24. data/lib/parser/parser_run.rb +4 -2
  25. data/lib/parser/parser_state.rb +16 -8
  26. data/lib/tokenizer/tokenizer.rb +5 -3
  27. data/test/arithmetic_precedence/arithmetic_precedence_lexer_specification.rb +23 -0
  28. data/test/arithmetic_precedence/arithmetic_precedence_parser_test.rb +4 -2
  29. data/test/chittagong/chittagong_driver.rb +12 -13
  30. data/test/chittagong/chittagong_driver_test.rb +18 -11
  31. data/test/chittagong/chittagong_evaluator.rb +7 -16
  32. data/test/chittagong/chittagong_evaluator_test.rb +7 -4
  33. data/test/chittagong/chittagong_grammar.rb +0 -6
  34. data/test/chittagong/chittagong_lexer.rb +109 -0
  35. data/test/chittagong/chittagong_lexer_specification.rb +39 -0
  36. data/test/chittagong/{chittagong_tokenizer_test.rb → chittagong_lexer_test.rb} +12 -6
  37. data/test/chittagong/chittagong_parser.rb +879 -0
  38. data/test/chittagong/chittagong_parser_test.rb +8 -10
  39. data/test/chittagong/chittagong_test.rb +17 -13
  40. data/test/compiled_parser_test.rb +7 -2
  41. data/test/evaluator_test.rb +0 -1
  42. data/test/grammar_test.rb +19 -1
  43. data/test/lexer_test.rb +215 -0
  44. data/test/parse_result_test.rb +8 -8
  45. data/test/parser_state_test.rb +0 -12
  46. metadata +21 -5
  47. data/test/arithmetic_precedence/arithmetic_precedence_tokenizer.rb +0 -39
  48. data/test/chittagong/chittagong_tokenizer.rb +0 -88
module Dhaka
  module LexerSupport
    # Tokenizer used internally to feed regular-expression source strings to
    # the regex parser: it turns every input character into its own token.
    # NOTE(review): Tokenizer, TOKENIZER_IDLE_STATE and ALL_CHARACTERS are
    # defined elsewhere in the library (see lib/tokenizer/tokenizer.rb) —
    # confirm their semantics there.
    class RegexTokenizer < Tokenizer

      # Single idle state: for every character, emit a one-character token
      # (the character itself as the symbol name, nil value) and advance.
      for_state TOKENIZER_IDLE_STATE do
        for_characters(ALL_CHARACTERS) do
          create_token(curr_char, nil)
          advance
        end
      end

    end
  end
end
module Dhaka
  # Abstract superclass for lexer specifications.
  #
  # Subclass it and declare, via for_pattern, what should happen when the
  # lexer recognizes each pattern. Declaration order is priority order:
  # earlier patterns win over later ones. For example, in
  #
  #   class LexerSpec < Dhaka::LexerSpecification
  #     for_pattern 'zz' do
  #       "recognized two zs"
  #     end
  #
  #     for_pattern '\w(\w|\d)*' do
  #       "recognized word token #{current_lexeme.value}"
  #     end
  #
  #     for_pattern '(\d)+(\.\d+)?' do
  #       "recognized number #{current_lexeme.value}"
  #     end
  #
  #     for_pattern ' +' do
  #       #ignores whitespace
  #     end
  #
  #     for_pattern "\n+" do
  #       "recognized newline"
  #     end
  #   end
  #
  # the input "zz" is reported as two z's rather than as a word token,
  # because 'zz' is declared before the word pattern.
  #
  # The patterns are <i>not</i> Ruby regular expressions - many operators from
  # Ruby's regular expression engine are not yet supported.
  # See http://dhaka.rubyforge.org/regex_grammar.html for the current syntax.

  class LexerSpecification
    class << self
      # Registers +blk+ as the action to run when the lexer recognizes
      # +pattern+. When Lexer#lex is invoked, it creates a LexerRun object
      # providing the context in which +blk+ is evaluated, so
      # LexerRun#current_lexeme and LexerRun#create_token are available
      # inside the block. The priority counter grows with each declaration.
      def for_pattern(pattern, &blk)
        item = LexerSpecificationItem.new(pattern, priority, blk)
        items[pattern] = item
        self.priority = priority + 1
      end

      private
      # Gives every concrete subclass its own pattern table and priority
      # counter.
      def inherited(subclass)
        subclass.singleton_class.class_eval do
          attr_accessor :items, :priority
        end
        subclass.items = {}
        subclass.priority = 0
      end

    end
  end

  # Value object pairing a pattern with its action and declaration priority.
  class LexerSpecificationItem #:nodoc:
    include Comparable
    attr_reader :pattern, :action, :priority

    def initialize(pattern, priority, action)
      @pattern = pattern
      @priority = priority
      @action = action
    end

    # Orders items by declaration priority (lower means declared earlier).
    def <=>(other)
      priority <=> other.priority
    end
  end
end
module Dhaka
  module LexerSupport
    # A single node in a lexer state machine: holds outgoing character
    # transitions and, for accepting states, the pattern being recognized.
    class State
      attr_reader :transitions, :pattern

      # +state_machine+ is the owning machine; blocks passed to
      # for_characters are evaluated in its context. +pattern+ is non-nil
      # when this state should start out accepting.
      def initialize(state_machine, pattern)
        @state_machine = state_machine
        @pattern = pattern
        @transitions = {}
      end

      # Truthy (the pattern itself) when this is an accepting state.
      def accepting?
        pattern
      end

      # Wires a transition for each of +characters+ to the state returned by
      # +blk+, which is evaluated in the owning state machine's context.
      def for_characters(*characters, &blk)
        target = @state_machine.instance_eval(&blk)
        characters.each { |character| transitions[character] = target }
      end

      # Marks this state as accepting +pattern+.
      def recognize(pattern)
        @pattern = pattern
      end

      # Emits the Ruby DSL source for this state, grouping characters that
      # share a destination into a single for_characters call.
      # NOTE(review): emitted-string indentation reconstructed from the
      # compiled-DSL convention — confirm against a generated lexer file.
      def compile_to_ruby_source
        src = "  at_state(#{object_id}) {\n"
        src << "    recognize(#{pattern.inspect})\n" if accepting?
        # group_by preserves first-encounter order, matching the original
        # Hash-with-default accumulation.
        transitions.group_by { |_, dest| dest.object_id }.each do |dest_id, pairs|
          chars = pairs.map { |character, _| character.inspect }.join(', ')
          src << "    for_characters(#{chars}) { switch_to #{dest_id} }\n"
        end
        src << "  }"
      end
    end
  end
end
module Dhaka
  module LexerSupport
    # Base class for lazily built state machines. Subclasses supply
    # new_state_for_key, transition_characters and dest_key_for; states are
    # materialized on first access, starting from +start_key+.
    class StateMachine
      attr_reader :start_state

      def initialize(start_key)
        # The cache's default block creates a state on first lookup and
        # stores it *before* wiring transitions, so cyclic transition graphs
        # terminate instead of recursing forever.
        @states = Hash.new do |cache, key|
          state = new_state_for_key(key)
          cache[key] = state
          transition_characters(key).each do |character|
            state.transitions[character] = cache[dest_key_for(key, character)]
          end
          state
        end
        @start_state = @states[start_key]
      end

      # Renders the machine as a dot digraph: a synthetic 'Start' node points
      # at the start state, accepting states are double circles labelled with
      # their pattern, and edges are labelled with transition characters.
      def to_dot
        Dot::Digraph.new(:fontsize => 10, :shape => :circle, :size => 5) do |graph|
          start_label = 'Start'
          graph.node(start_label, :label => start_label)
          graph.edge(start_label, @start_state)
          @states.values.each do |state|
            attributes = {}
            attributes = { :shape => :doublecircle, :label => state.pattern } if state.accepting?
            graph.node(state, attributes)
            state.transitions.each do |character, destination|
              graph.edge(state, destination, :label => character)
            end
          end
        end.to_dot
      end
    end
  end
end
@@ -31,12 +31,12 @@ module Dhaka
31
31
  @action_code = Proc.new do
32
32
  composite_node = ParseTreeCompositeNode.new(production)
33
33
 
34
- production.expansion.each do |symbol|
34
+ production.expansion.each do |symbol|
35
35
  state_stack.pop
36
36
  composite_node.child_nodes.unshift(node_stack.pop)
37
37
  end
38
38
 
39
- node_stack << composite_node
39
+ node_stack << composite_node.instance_eval(&production.action)
40
40
 
41
41
  unless composite_node.head_node?
42
42
  @symbol_queue.concat [@current_token.symbol_name, production.symbol.name]
@@ -45,7 +45,7 @@ module Dhaka
45
45
  end
46
46
 
47
47
  def compile_to_ruby_source
48
- "reduce_with '#{production.name}'"
48
+ "reduce_with #{production.name.inspect}"
49
49
  end
50
50
 
51
51
  def to_s
@@ -1,14 +1,22 @@
1
1
  module Dhaka
2
- # This is the superclass of all compiled Parsers. It is only used by generated code.
2
+ # Abstract base class of all compiled Parsers. It is only used by generated code.
3
3
  class CompiledParser
4
4
 
5
5
  class << self
6
6
  private
7
7
  def inherited(compiled_parser)
8
8
  class << compiled_parser
9
- attr_accessor :states, :grammar, :start_state_id
9
+ attr_accessor :states, :grammar, :start_state_id, :shift_actions, :reduce_actions
10
10
  end
11
- compiled_parser.states = Hash.new {|hash, state_id| hash[state_id] = ParserState.new(compiled_parser, {}, state_id)}
11
+ compiled_parser.states = Hash.new do |hash, state_id|
12
+ hash[state_id] = ParserState.new(compiled_parser, {}, state_id)
13
+ end
14
+ compiled_parser.shift_actions = Hash.new do |hash, state_id|
15
+ hash[state_id] = ShiftAction.new(compiled_parser.states[state_id])
16
+ end
17
+ compiled_parser.reduce_actions = Hash.new do |hash, production_name|
18
+ hash[production_name] = ReduceAction.new(compiled_parser.grammar.production_named(production_name))
19
+ end
12
20
  end
13
21
 
14
22
  def at_state x, &blk
@@ -16,11 +16,9 @@ module Dhaka
16
16
 
17
17
  # Returns the dot representation of the parse tree
18
18
  def to_dot
19
- result = []
20
- result << ["digraph x {", %(node [fontsize="10" shape=box size="5"])]
21
- result << parse_tree.to_dot
22
- result << ['}']
23
- result.join("\n")
19
+ Dot::Digraph.new(:fontsize => 10, :shape => :box, :size => 5) do |g|
20
+ parse_tree.to_dot(g)
21
+ end.to_dot
24
22
  end
25
23
 
26
24
  # Deprecated. Use the +parse_tree+ accessor.
@@ -20,25 +20,18 @@ module Dhaka
20
20
  end
21
21
 
22
22
  # Returns the dot representation of this node.
23
- def to_dot
24
- result = []
25
- label = production
26
- result << %(#{dot_name} [label="#{label}"])
23
+ def to_dot graph
24
+ graph.node(self, :label => production)
27
25
  child_nodes.each do |child|
28
- result << "#{dot_name} -> #{child.dot_name}"
29
- result << "#{child.to_dot}"
26
+ graph.edge(self, child)
27
+ child.to_dot(graph)
30
28
  end
31
- result.join("\n")
32
29
  end
33
30
 
34
31
  def head_node? #:nodoc:
35
32
  production.symbol.name == START_SYMBOL_NAME
36
33
  end
37
34
 
38
- def dot_name #:nodoc:
39
- "Node#{object_id}"
40
- end
41
-
42
35
  end
43
36
 
44
37
  # These are leaf nodes of syntax trees. They contain tokens.
@@ -62,16 +55,12 @@ module Dhaka
62
55
  end
63
56
 
64
57
  # Returns the dot representation of this node.
65
- def to_dot
66
- %(#{dot_name} [label="#{token.to_s}"])
58
+ def to_dot(graph)
59
+ graph.node(self, :label => token)
67
60
  end
68
61
 
69
62
  def head_node? #:nodoc:
70
63
  false
71
64
  end
72
-
73
- def dot_name #:nodoc:
74
- "Node#{object_id}"
75
- end
76
65
  end
77
66
  end
@@ -14,10 +14,12 @@ module Dhaka
14
14
  # and the log level is WARN. Shift-reduce conflicts are reported at WARN and reduce-reduce conflicts
15
15
  # at ERROR. You may pass in your own logger. Logging at DEBUG shows a lot of progress output.
16
16
  def initialize(grammar, logger = nil)
17
- @logger = logger || default_logger
18
- @transitions = Hash.new {|hash, state| hash[state] = {}}
19
- @grammar = grammar
20
- @channels = []
17
+ @shift_actions = Hash.new {|hash, state| hash[state] = ShiftAction.new(state)}
18
+ @reduce_actions = Hash.new {|hash, production| hash[production] = ReduceAction.new(production)}
19
+ @logger = logger || default_logger
20
+ @transitions = Hash.new {|hash, state| hash[state] = {}}
21
+ @grammar = grammar
22
+ @channels = []
21
23
  @states = Hash.new do |hash, kernel|
22
24
  channels, closure = grammar.closure(kernel)
23
25
  @channels.concat channels.to_a
@@ -51,15 +53,14 @@ module Dhaka
51
53
  # options hash, lookaheads are not written out to the parser states, which is helpful when there are dozens
52
54
  # of lookahead symbols for every item in every state.
53
55
  def to_dot(options = {})
54
- result = ["digraph x {", %(node [fontsize="10" shape=box size="5"])]
55
- result.concat states.collect { |state| state.to_dot(options) }
56
- states.each do |state|
57
- @transitions[state].each do |symbol, dest_state|
58
- result << %(#{state.unique_name} -> #{dest_state.unique_name} [label="#{symbol.name}"])
56
+ Dot::Digraph.new(:fontsize => 10, :shape => :box, :size => 5) do |g|
57
+ states.each do |state|
58
+ g.node(state, :label => state.items.values.collect{|item| item.to_s(options)}.join("\n"))
59
+ @transitions[state].each do |symbol, dest_state|
60
+ g.edge(state, dest_state, :label => symbol.name)
61
+ end
59
62
  end
60
- end
61
- result << ['}']
62
- result.join("\n")
63
+ end.to_dot
63
64
  end
64
65
 
65
66
  def inspect
@@ -97,7 +98,7 @@ module Dhaka
97
98
  def generate_shift_actions
98
99
  @states.values.each do |state|
99
100
  @transitions[state].keys.each do |symbol|
100
- state.actions[symbol.name] = ShiftAction.new(@transitions[state][symbol])
101
+ state.actions[symbol.name] = @shift_actions[@transitions[state][symbol]]
101
102
  end
102
103
  end
103
104
  end
@@ -112,7 +113,7 @@ module Dhaka
112
113
 
113
114
  def create_reduction_actions_for_item_and_state item, state
114
115
  item.lookaheadset.each do |lookahead|
115
- new_action = ReduceAction.new(item.production)
116
+ new_action = @reduce_actions[item.production]
116
117
  if existing_action = state.actions[lookahead.name]
117
118
  if ReduceAction === existing_action
118
119
  message = ReduceReduceConflict.new(state, lookahead, new_action).resolve
@@ -10,14 +10,16 @@ module Dhaka
10
10
  end
11
11
 
12
12
  def run
13
- token_stream.each do |token|
13
+ tokenize_result = token_stream.each do |token|
14
14
  @current_token = token
15
15
  @symbol_queue << @current_token.symbol_name
16
16
  error = execute_actions
17
17
  return error if error
18
18
  node_stack << ParseTreeLeafNode.new(@current_token)
19
+ state_stack.last
19
20
  end
20
- ParseSuccessResult.new(node_stack.first)
21
+ return tokenize_result if TokenizerErrorResult === tokenize_result
22
+ ParseSuccessResult.new(node_stack.first) if node_stack.first.head_node?
21
23
  end
22
24
 
23
25
  private
@@ -29,23 +29,31 @@ module Dhaka
29
29
  "State#{id}"
30
30
  end
31
31
 
32
- def to_dot(options = {})
33
- %(#{unique_name} [label="#{items.values.collect{|item| item.to_s(options)}.join('\n')}"])
34
- end
35
-
36
32
  def compile_to_ruby_source
37
33
  result = " at_state(#{id}) {\n"
34
+
35
+ symbol_names_by_action = Hash.new {|hash, key| hash[key] = []}
38
36
  actions.each do |symbol_name, action|
39
- result << " for_symbol('#{symbol_name}') { #{action.compile_to_ruby_source} }\n"
37
+ symbol_names_by_action[action] << symbol_name
38
+ end
39
+
40
+ symbol_names_by_action.keys.each do |action|
41
+ symbol_names = symbol_names_by_action[action].collect {|symbol_name| "#{symbol_name.inspect}"}.join(', ')
42
+ result << " for_symbols(#{symbol_names}) { #{action.compile_to_ruby_source} }\n"
40
43
  end
44
+
41
45
  result << " }"
42
46
  result
43
47
  end
44
-
45
- def for_symbol symbol_name, &blk
46
- actions[symbol_name] = @parser.instance_eval(&blk)
48
+
49
+ def for_symbols *symbol_names, &blk
50
+ symbol_names.each do |symbol_name|
51
+ actions[symbol_name] = @parser.instance_eval(&blk)
52
+ end
47
53
  end
48
54
 
55
+ alias :for_symbol :for_symbols
56
+
49
57
  def to_s(options = {})
50
58
  items.values.collect{|item| item.to_s(options)}.join("\n")
51
59
  end
@@ -60,10 +60,9 @@ module Dhaka
60
60
 
61
61
  end
62
62
 
63
- # This class contains a DSL for specifying tokenizers. Subclass it to implement tokenizers for specific grammars.
64
- # Subclasses of this class may not be further subclassed.
63
+ # This abstract class contains a DSL for hand-coding tokenizers. Subclass it to implement tokenizers for specific grammars.
65
64
  #
66
- # Tokenizers are state machines that are specified pretty much by hand. Each state of a tokenizer is identified
65
+ # Tokenizers are state machines. Each state of a tokenizer is identified
67
66
  # by a Ruby symbol. The constant Dhaka::TOKENIZER_IDLE_STATE is reserved for the idle state of the tokenizer (the one
68
67
  # that it starts in).
69
68
  #
@@ -109,6 +108,9 @@ module Dhaka
109
108
  # end
110
109
  #
111
110
  # end
111
+ #
112
+ # For languages where the lexical structure is very complicated, it may be too tedious to implement a Tokenizer by hand.
113
+ # In such cases, it's a lot easier to write a LexerSpecification using regular expressions and create a Lexer from that.
112
114
  class Tokenizer
113
115
  class << self
114
116
  # Define the action for the state named +state_name+.
# Lexer specification for the arithmetic-precedence test grammar.
# Patterns are declared in ascending priority order (LexerSpecification
# assigns priorities in declaration order), so the ordering here matters.
class ArithmeticPrecedenceLexerSpecification < Dhaka::LexerSpecification

  # Whitespace is recognized but the empty block produces no token.
  for_pattern '\s' do
    # ignore whitespace
  end

  # Single-character tokens that are not regex metacharacters: used as-is
  # both as the pattern and as the emitted token symbol.
  %w| - h l , |.each do |char|
    for_pattern char do
      create_token(char)
    end
  end

  # These characters are metacharacters in the pattern syntax, so the
  # pattern is backslash-escaped; the emitted token symbol stays bare.
  %w| ( ) + / * ^ |.each do |char|
    for_pattern "\\#{char}" do
      create_token(char)
    end
  end

  # Integer literals all map to the single grammar symbol 'n'.
  for_pattern '\d+' do
    create_token('n')
  end

end