dhaka 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. data/Rakefile +64 -0
  2. data/lib/dhaka.rb +12 -0
  3. data/lib/dot/dot.rb +29 -0
  4. data/lib/evaluator/evaluator.rb +35 -26
  5. data/lib/grammar/grammar.rb +42 -17
  6. data/lib/grammar/grammar_symbol.rb +4 -3
  7. data/lib/grammar/production.rb +9 -3
  8. data/lib/lexer/compiled_lexer.rb +46 -0
  9. data/lib/lexer/dfa.rb +71 -0
  10. data/lib/lexer/lexeme.rb +33 -0
  11. data/lib/lexer/lexer.rb +61 -0
  12. data/lib/lexer/lexer_run.rb +66 -0
  13. data/lib/lexer/regex_grammar.rb +368 -0
  14. data/lib/lexer/regex_parser.rb +1888 -0
  15. data/lib/lexer/regex_tokenizer.rb +14 -0
  16. data/lib/lexer/specification.rb +69 -0
  17. data/lib/lexer/state.rb +45 -0
  18. data/lib/lexer/state_machine.rb +37 -0
  19. data/lib/parser/action.rb +3 -3
  20. data/lib/parser/compiled_parser.rb +11 -3
  21. data/lib/parser/parse_result.rb +3 -5
  22. data/lib/parser/parse_tree.rb +6 -17
  23. data/lib/parser/parser.rb +15 -14
  24. data/lib/parser/parser_run.rb +4 -2
  25. data/lib/parser/parser_state.rb +16 -8
  26. data/lib/tokenizer/tokenizer.rb +5 -3
  27. data/test/arithmetic_precedence/arithmetic_precedence_lexer_specification.rb +23 -0
  28. data/test/arithmetic_precedence/arithmetic_precedence_parser_test.rb +4 -2
  29. data/test/chittagong/chittagong_driver.rb +12 -13
  30. data/test/chittagong/chittagong_driver_test.rb +18 -11
  31. data/test/chittagong/chittagong_evaluator.rb +7 -16
  32. data/test/chittagong/chittagong_evaluator_test.rb +7 -4
  33. data/test/chittagong/chittagong_grammar.rb +0 -6
  34. data/test/chittagong/chittagong_lexer.rb +109 -0
  35. data/test/chittagong/chittagong_lexer_specification.rb +39 -0
  36. data/test/chittagong/{chittagong_tokenizer_test.rb → chittagong_lexer_test.rb} +12 -6
  37. data/test/chittagong/chittagong_parser.rb +879 -0
  38. data/test/chittagong/chittagong_parser_test.rb +8 -10
  39. data/test/chittagong/chittagong_test.rb +17 -13
  40. data/test/compiled_parser_test.rb +7 -2
  41. data/test/evaluator_test.rb +0 -1
  42. data/test/grammar_test.rb +19 -1
  43. data/test/lexer_test.rb +215 -0
  44. data/test/parse_result_test.rb +8 -8
  45. data/test/parser_state_test.rb +0 -12
  46. metadata +21 -5
  47. data/test/arithmetic_precedence/arithmetic_precedence_tokenizer.rb +0 -39
  48. data/test/chittagong/chittagong_tokenizer.rb +0 -88
@@ -0,0 +1,14 @@
1
+ module Dhaka
2
+ module LexerSupport
3
+ class RegexTokenizer < Tokenizer
4
+
5
+ for_state TOKENIZER_IDLE_STATE do
6
+ for_characters(ALL_CHARACTERS) do
7
+ create_token(curr_char, nil)
8
+ advance
9
+ end
10
+ end
11
+
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,69 @@
1
+ module Dhaka
2
+ # Abstract base class for lexer specifications.
3
+ #
4
+ # Use this to specify the transformations that will be performed when the lexer recognizes a given pattern. Actions are listed in
5
+ # descending order of priority. For example in the following lexer specification:
6
+ #
7
+ # class LexerSpec < Dhaka::LexerSpecification
8
+ # for_pattern 'zz' do
9
+ # "recognized two zs"
10
+ # end
11
+ #
12
+ # for_pattern '\w(\w|\d)*' do
13
+ # "recognized word token #{current_lexeme.value}"
14
+ # end
15
+ #
16
+ # for_pattern '(\d)+(\.\d+)?' do
17
+ # "recognized number #{current_lexeme.value}"
18
+ # end
19
+ #
20
+ # for_pattern ' +' do
21
+ # #ignores whitespace
22
+ # end
23
+ #
24
+ # for_pattern "\n+" do
25
+ # "recognized newline"
26
+ # end
27
+ # end
28
+ #
29
+ # the pattern 'zz' takes precedence over the pattern immediately below it, so the lexer will announce that it has recognized two
30
+ # 'z's instead of a word token.
31
+ #
32
+ # The patterns are <i>not</i> Ruby regular expressions - a lot of operators featured in Ruby's regular expression engine are not yet supported.
33
+ # See http://dhaka.rubyforge.org/regex_grammar.html for the current syntax.
34
+
35
+ class LexerSpecification
36
+ class << self
37
+ # Associates +blk+ as the action to be performed when a lexer recognizes +pattern+. When Lexer#lex is invoked,
38
+ # it creates a LexerRun object that provides the context for +blk+ to be evaluated in. Methods available in this block
39
+ # are LexerRun#current_lexeme and LexerRun#create_token.
40
+ def for_pattern(pattern, &blk)
41
+ items[pattern] = LexerSpecificationItem.new(pattern, priority, blk)
42
+ self.priority += 1
43
+ end
44
+
45
+ private
46
+ def inherited(specification)
47
+ class << specification
48
+ attr_accessor :items, :priority
49
+ end
50
+ specification.items = {}
51
+ specification.priority = 0
52
+ end
53
+
54
+ end
55
+ end
56
+
57
+ class LexerSpecificationItem #:nodoc:
58
+ include Comparable
59
+ attr_reader :pattern, :action, :priority
60
+ def initialize(pattern, priority, action)
61
+ @pattern, @priority, @action = pattern, priority, action
62
+ end
63
+
64
+ def <=> other
65
+ priority <=> other.priority
66
+ end
67
+ end
68
+ end
69
+
@@ -0,0 +1,45 @@
1
+ module Dhaka
2
+ module LexerSupport
3
+ class State
4
+ attr_reader :transitions, :pattern
5
+ def initialize state_machine, pattern
6
+ @state_machine = state_machine
7
+ @pattern = pattern
8
+ @transitions = {}
9
+ end
10
+
11
+ def accepting?
12
+ pattern
13
+ end
14
+
15
+ def for_characters *characters, &blk
16
+ dest_state = @state_machine.instance_eval(&blk)
17
+ characters.each do |char|
18
+ transitions[char] = dest_state
19
+ end
20
+ end
21
+
22
+ def recognize pattern
23
+ @pattern = pattern
24
+ end
25
+
26
+ def compile_to_ruby_source
27
+ result = " at_state(#{object_id}) {\n"
28
+ result << " recognize(#{pattern.inspect})\n" if accepting?
29
+ transition_keys_by_destination_state = Hash.new {|hash, key| hash[key] = []}
30
+ transitions.each do |key, dest_state|
31
+ transition_keys_by_destination_state[dest_state.object_id] << key
32
+ end
33
+
34
+ transition_keys_by_destination_state.keys.each do |state_id|
35
+ transition_keys = transition_keys_by_destination_state[state_id].collect {|transition_key| "#{transition_key.inspect}"}.join(', ')
36
+ result << " for_characters(#{transition_keys}) { switch_to #{state_id} }\n"
37
+ end
38
+
39
+ result << " }"
40
+ result
41
+ end
42
+ end
43
+ end
44
+ end
45
+
@@ -0,0 +1,37 @@
1
+ module Dhaka
2
+ module LexerSupport
3
+ class StateMachine
4
+ attr_reader :start_state
5
+
6
+ def initialize start_key
7
+ @states = Hash.new do |hash, key|
8
+ new_state = new_state_for_key key
9
+ hash[key] = new_state
10
+ transition_characters(key).each do |char|
11
+ dest_key = dest_key_for(key, char)
12
+ dest_state = hash[dest_key]
13
+ new_state.transitions[char] = dest_state
14
+ end
15
+ new_state
16
+ end
17
+ @start_state = @states[start_key]
18
+ end
19
+
20
+ def to_dot
21
+ Dot::Digraph.new(:fontsize => 10, :shape => :circle, :size => 5) do |g|
22
+ start = 'Start'
23
+ g.node(start, :label => start)
24
+ g.edge(start, @start_state)
25
+ @states.values.each do |state|
26
+ state_attributes = {}
27
+ state_attributes.merge!(:shape => :doublecircle, :label => state.pattern) if state.accepting?
28
+ g.node(state, state_attributes)
29
+ state.transitions.each do |transition_key, dest_state|
30
+ g.edge(state, dest_state, :label => transition_key)
31
+ end
32
+ end
33
+ end.to_dot
34
+ end
35
+ end
36
+ end
37
+ end
@@ -31,12 +31,12 @@ module Dhaka
31
31
  @action_code = Proc.new do
32
32
  composite_node = ParseTreeCompositeNode.new(production)
33
33
 
34
- production.expansion.each do |symbol|
34
+ production.expansion.each do |symbol|
35
35
  state_stack.pop
36
36
  composite_node.child_nodes.unshift(node_stack.pop)
37
37
  end
38
38
 
39
- node_stack << composite_node
39
+ node_stack << composite_node.instance_eval(&production.action)
40
40
 
41
41
  unless composite_node.head_node?
42
42
  @symbol_queue.concat [@current_token.symbol_name, production.symbol.name]
@@ -45,7 +45,7 @@ module Dhaka
45
45
  end
46
46
 
47
47
  def compile_to_ruby_source
48
- "reduce_with '#{production.name}'"
48
+ "reduce_with #{production.name.inspect}"
49
49
  end
50
50
 
51
51
  def to_s
@@ -1,14 +1,22 @@
1
1
  module Dhaka
2
- # This is the superclass of all compiled Parsers. It is only used by generated code.
2
+ # Abstract base class of all compiled Parsers. It is only used by generated code.
3
3
  class CompiledParser
4
4
 
5
5
  class << self
6
6
  private
7
7
  def inherited(compiled_parser)
8
8
  class << compiled_parser
9
- attr_accessor :states, :grammar, :start_state_id
9
+ attr_accessor :states, :grammar, :start_state_id, :shift_actions, :reduce_actions
10
10
  end
11
- compiled_parser.states = Hash.new {|hash, state_id| hash[state_id] = ParserState.new(compiled_parser, {}, state_id)}
11
+ compiled_parser.states = Hash.new do |hash, state_id|
12
+ hash[state_id] = ParserState.new(compiled_parser, {}, state_id)
13
+ end
14
+ compiled_parser.shift_actions = Hash.new do |hash, state_id|
15
+ hash[state_id] = ShiftAction.new(compiled_parser.states[state_id])
16
+ end
17
+ compiled_parser.reduce_actions = Hash.new do |hash, production_name|
18
+ hash[production_name] = ReduceAction.new(compiled_parser.grammar.production_named(production_name))
19
+ end
12
20
  end
13
21
 
14
22
  def at_state x, &blk
@@ -16,11 +16,9 @@ module Dhaka
16
16
 
17
17
  # Returns the dot representation of the parse tree
18
18
  def to_dot
19
- result = []
20
- result << ["digraph x {", %(node [fontsize="10" shape=box size="5"])]
21
- result << parse_tree.to_dot
22
- result << ['}']
23
- result.join("\n")
19
+ Dot::Digraph.new(:fontsize => 10, :shape => :box, :size => 5) do |g|
20
+ parse_tree.to_dot(g)
21
+ end.to_dot
24
22
  end
25
23
 
26
24
  # Deprecated. Use the +parse_tree+ accessor.
@@ -20,25 +20,18 @@ module Dhaka
20
20
  end
21
21
 
22
22
  # Returns the dot representation of this node.
23
- def to_dot
24
- result = []
25
- label = production
26
- result << %(#{dot_name} [label="#{label}"])
23
+ def to_dot graph
24
+ graph.node(self, :label => production)
27
25
  child_nodes.each do |child|
28
- result << "#{dot_name} -> #{child.dot_name}"
29
- result << "#{child.to_dot}"
26
+ graph.edge(self, child)
27
+ child.to_dot(graph)
30
28
  end
31
- result.join("\n")
32
29
  end
33
30
 
34
31
  def head_node? #:nodoc:
35
32
  production.symbol.name == START_SYMBOL_NAME
36
33
  end
37
34
 
38
- def dot_name #:nodoc:
39
- "Node#{object_id}"
40
- end
41
-
42
35
  end
43
36
 
44
37
  # These are leaf nodes of syntax trees. They contain tokens.
@@ -62,16 +55,12 @@ module Dhaka
62
55
  end
63
56
 
64
57
  # Returns the dot representation of this node.
65
- def to_dot
66
- %(#{dot_name} [label="#{token.to_s}"])
58
+ def to_dot(graph)
59
+ graph.node(self, :label => token)
67
60
  end
68
61
 
69
62
  def head_node? #:nodoc:
70
63
  false
71
64
  end
72
-
73
- def dot_name #:nodoc:
74
- "Node#{object_id}"
75
- end
76
65
  end
77
66
  end
@@ -14,10 +14,12 @@ module Dhaka
14
14
  # and the log level is WARN. Shift-reduce conflicts are reported at WARN and reduce-reduce conflicts
15
15
  # at ERROR. You may pass in your own logger. Logging at DEBUG shows a lot of progress output.
16
16
  def initialize(grammar, logger = nil)
17
- @logger = logger || default_logger
18
- @transitions = Hash.new {|hash, state| hash[state] = {}}
19
- @grammar = grammar
20
- @channels = []
17
+ @shift_actions = Hash.new {|hash, state| hash[state] = ShiftAction.new(state)}
18
+ @reduce_actions = Hash.new {|hash, production| hash[production] = ReduceAction.new(production)}
19
+ @logger = logger || default_logger
20
+ @transitions = Hash.new {|hash, state| hash[state] = {}}
21
+ @grammar = grammar
22
+ @channels = []
21
23
  @states = Hash.new do |hash, kernel|
22
24
  channels, closure = grammar.closure(kernel)
23
25
  @channels.concat channels.to_a
@@ -51,15 +53,14 @@ module Dhaka
51
53
  # options hash, lookaheads are not written out to the parser states, which is helpful when there are dozens
52
54
  # of lookahead symbols for every item in every state.
53
55
  def to_dot(options = {})
54
- result = ["digraph x {", %(node [fontsize="10" shape=box size="5"])]
55
- result.concat states.collect { |state| state.to_dot(options) }
56
- states.each do |state|
57
- @transitions[state].each do |symbol, dest_state|
58
- result << %(#{state.unique_name} -> #{dest_state.unique_name} [label="#{symbol.name}"])
56
+ Dot::Digraph.new(:fontsize => 10, :shape => :box, :size => 5) do |g|
57
+ states.each do |state|
58
+ g.node(state, :label => state.items.values.collect{|item| item.to_s(options)}.join("\n"))
59
+ @transitions[state].each do |symbol, dest_state|
60
+ g.edge(state, dest_state, :label => symbol.name)
61
+ end
59
62
  end
60
- end
61
- result << ['}']
62
- result.join("\n")
63
+ end.to_dot
63
64
  end
64
65
 
65
66
  def inspect
@@ -97,7 +98,7 @@ module Dhaka
97
98
  def generate_shift_actions
98
99
  @states.values.each do |state|
99
100
  @transitions[state].keys.each do |symbol|
100
- state.actions[symbol.name] = ShiftAction.new(@transitions[state][symbol])
101
+ state.actions[symbol.name] = @shift_actions[@transitions[state][symbol]]
101
102
  end
102
103
  end
103
104
  end
@@ -112,7 +113,7 @@ module Dhaka
112
113
 
113
114
  def create_reduction_actions_for_item_and_state item, state
114
115
  item.lookaheadset.each do |lookahead|
115
- new_action = ReduceAction.new(item.production)
116
+ new_action = @reduce_actions[item.production]
116
117
  if existing_action = state.actions[lookahead.name]
117
118
  if ReduceAction === existing_action
118
119
  message = ReduceReduceConflict.new(state, lookahead, new_action).resolve
@@ -10,14 +10,16 @@ module Dhaka
10
10
  end
11
11
 
12
12
  def run
13
- token_stream.each do |token|
13
+ tokenize_result = token_stream.each do |token|
14
14
  @current_token = token
15
15
  @symbol_queue << @current_token.symbol_name
16
16
  error = execute_actions
17
17
  return error if error
18
18
  node_stack << ParseTreeLeafNode.new(@current_token)
19
+ state_stack.last
19
20
  end
20
- ParseSuccessResult.new(node_stack.first)
21
+ return tokenize_result if TokenizerErrorResult === tokenize_result
22
+ ParseSuccessResult.new(node_stack.first) if node_stack.first.head_node?
21
23
  end
22
24
 
23
25
  private
@@ -29,23 +29,31 @@ module Dhaka
29
29
  "State#{id}"
30
30
  end
31
31
 
32
- def to_dot(options = {})
33
- %(#{unique_name} [label="#{items.values.collect{|item| item.to_s(options)}.join('\n')}"])
34
- end
35
-
36
32
  def compile_to_ruby_source
37
33
  result = " at_state(#{id}) {\n"
34
+
35
+ symbol_names_by_action = Hash.new {|hash, key| hash[key] = []}
38
36
  actions.each do |symbol_name, action|
39
- result << " for_symbol('#{symbol_name}') { #{action.compile_to_ruby_source} }\n"
37
+ symbol_names_by_action[action] << symbol_name
38
+ end
39
+
40
+ symbol_names_by_action.keys.each do |action|
41
+ symbol_names = symbol_names_by_action[action].collect {|symbol_name| "#{symbol_name.inspect}"}.join(', ')
42
+ result << " for_symbols(#{symbol_names}) { #{action.compile_to_ruby_source} }\n"
40
43
  end
44
+
41
45
  result << " }"
42
46
  result
43
47
  end
44
-
45
- def for_symbol symbol_name, &blk
46
- actions[symbol_name] = @parser.instance_eval(&blk)
48
+
49
+ def for_symbols *symbol_names, &blk
50
+ symbol_names.each do |symbol_name|
51
+ actions[symbol_name] = @parser.instance_eval(&blk)
52
+ end
47
53
  end
48
54
 
55
+ alias :for_symbol :for_symbols
56
+
49
57
  def to_s(options = {})
50
58
  items.values.collect{|item| item.to_s(options)}.join("\n")
51
59
  end
@@ -60,10 +60,9 @@ module Dhaka
60
60
 
61
61
  end
62
62
 
63
- # This class contains a DSL for specifying tokenizers. Subclass it to implement tokenizers for specific grammars.
64
- # Subclasses of this class may not be further subclassed.
63
+ # This abstract class contains a DSL for hand-coding tokenizers. Subclass it to implement tokenizers for specific grammars.
65
64
  #
66
- # Tokenizers are state machines that are specified pretty much by hand. Each state of a tokenizer is identified
65
+ # Tokenizers are state machines. Each state of a tokenizer is identified
67
66
  # by a Ruby symbol. The constant Dhaka::TOKENIZER_IDLE_STATE is reserved for the idle state of the tokenizer (the one
68
67
  # that it starts in).
69
68
  #
@@ -109,6 +108,9 @@ module Dhaka
109
108
  # end
110
109
  #
111
110
  # end
111
+ #
112
+ # For languages where the lexical structure is very complicated, it may be too tedious to implement a Tokenizer by hand.
113
+ # In such cases, it's a lot easier to write a LexerSpecification using regular expressions and create a Lexer from that.
112
114
  class Tokenizer
113
115
  class << self
114
116
  # Define the action for the state named +state_name+.
@@ -0,0 +1,23 @@
1
+ class ArithmeticPrecedenceLexerSpecification < Dhaka::LexerSpecification
2
+
3
+ for_pattern '\s' do
4
+ # ignore whitespace
5
+ end
6
+
7
+ %w| - h l , |.each do |char|
8
+ for_pattern char do
9
+ create_token(char)
10
+ end
11
+ end
12
+
13
+ %w| ( ) + / * ^ |.each do |char|
14
+ for_pattern "\\#{char}" do
15
+ create_token(char)
16
+ end
17
+ end
18
+
19
+ for_pattern '\d+' do
20
+ create_token('n')
21
+ end
22
+
23
+ end