dhaka 2.0.1 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. data/Rakefile +64 -0
  2. data/lib/dhaka.rb +12 -0
  3. data/lib/dot/dot.rb +29 -0
  4. data/lib/evaluator/evaluator.rb +35 -26
  5. data/lib/grammar/grammar.rb +42 -17
  6. data/lib/grammar/grammar_symbol.rb +4 -3
  7. data/lib/grammar/production.rb +9 -3
  8. data/lib/lexer/compiled_lexer.rb +46 -0
  9. data/lib/lexer/dfa.rb +71 -0
  10. data/lib/lexer/lexeme.rb +33 -0
  11. data/lib/lexer/lexer.rb +61 -0
  12. data/lib/lexer/lexer_run.rb +66 -0
  13. data/lib/lexer/regex_grammar.rb +368 -0
  14. data/lib/lexer/regex_parser.rb +1888 -0
  15. data/lib/lexer/regex_tokenizer.rb +14 -0
  16. data/lib/lexer/specification.rb +69 -0
  17. data/lib/lexer/state.rb +45 -0
  18. data/lib/lexer/state_machine.rb +37 -0
  19. data/lib/parser/action.rb +3 -3
  20. data/lib/parser/compiled_parser.rb +11 -3
  21. data/lib/parser/parse_result.rb +3 -5
  22. data/lib/parser/parse_tree.rb +6 -17
  23. data/lib/parser/parser.rb +15 -14
  24. data/lib/parser/parser_run.rb +4 -2
  25. data/lib/parser/parser_state.rb +16 -8
  26. data/lib/tokenizer/tokenizer.rb +5 -3
  27. data/test/arithmetic_precedence/arithmetic_precedence_lexer_specification.rb +23 -0
  28. data/test/arithmetic_precedence/arithmetic_precedence_parser_test.rb +4 -2
  29. data/test/chittagong/chittagong_driver.rb +12 -13
  30. data/test/chittagong/chittagong_driver_test.rb +18 -11
  31. data/test/chittagong/chittagong_evaluator.rb +7 -16
  32. data/test/chittagong/chittagong_evaluator_test.rb +7 -4
  33. data/test/chittagong/chittagong_grammar.rb +0 -6
  34. data/test/chittagong/chittagong_lexer.rb +109 -0
  35. data/test/chittagong/chittagong_lexer_specification.rb +39 -0
  36. data/test/chittagong/{chittagong_tokenizer_test.rb → chittagong_lexer_test.rb} +12 -6
  37. data/test/chittagong/chittagong_parser.rb +879 -0
  38. data/test/chittagong/chittagong_parser_test.rb +8 -10
  39. data/test/chittagong/chittagong_test.rb +17 -13
  40. data/test/compiled_parser_test.rb +7 -2
  41. data/test/evaluator_test.rb +0 -1
  42. data/test/grammar_test.rb +19 -1
  43. data/test/lexer_test.rb +215 -0
  44. data/test/parse_result_test.rb +8 -8
  45. data/test/parser_state_test.rb +0 -12
  46. metadata +21 -5
  47. data/test/arithmetic_precedence/arithmetic_precedence_tokenizer.rb +0 -39
  48. data/test/chittagong/chittagong_tokenizer.rb +0 -88
@@ -0,0 +1,64 @@
1
+ require 'rake/rdoctask'
2
+ require 'rake/gempackagetask'
3
+ require 'rake/testtask'
4
+ require 'rubygems'
5
+
6
+ Rake::RDocTask.new do |rdoc|
7
+ rdoc.rdoc_files.include('README', 'lib/**/*.rb')
8
+ rdoc.rdoc_files.exclude("lib/lexer/regex_parser.rb")
9
+ rdoc.main = "README"
10
+ rdoc.rdoc_dir = '../doc'
11
+ end
12
+
13
+ spec = Gem::Specification.new do |s|
14
+ s.name = "dhaka"
15
+ s.author = "Mushfeq Khan"
16
+ s.email = "mushfeq dot khan at gmail dot com"
17
+ s.version = ENV['VERSION'] || "0.0.0"
18
+ s.platform = Gem::Platform::RUBY
19
+ s.summary = "An LALR1 parser generator written in Ruby"
20
+ s.files = Dir.glob("{lib,test}/**/*").select {|file| file.include?('.rb') || file.include?('.txt')} + ['Rakefile']
21
+ s.require_path = 'lib'
22
+ s.autorequire = 'dhaka'
23
+ s.has_rdoc = true
24
+ end
25
+
26
+ Rake::GemPackageTask.new(spec) do |pkg|
27
+ pkg.package_dir = "../gems"
28
+ end
29
+
30
+ Rake::TestTask.new do |t|
31
+ t.libs << "test"
32
+ t.test_files = FileList['test/**/*test.rb']
33
+ t.verbose = true
34
+ end
35
+
36
+ task :generate_regex_parser do
37
+ require 'lib/dhaka'
38
+ File.open('lib/lexer/regex_parser.rb', 'w') do |file|
39
+ file << Dhaka::Parser.new(Dhaka::LexerSupport::RegexGrammar).compile_to_ruby_source_as('Dhaka::LexerSupport::RegexParser')
40
+ end
41
+ end
42
+
43
+ task :gem => [:test, :generate_regex_parser]
44
+
45
+ task :default => :test
46
+
47
+ task :test => [:generate_chittagong_parser, :generate_chittagong_lexer]
48
+
49
+ task :generate_chittagong_parser do
50
+ require 'lib/dhaka'
51
+ require 'test/chittagong/chittagong_grammar'
52
+ require 'test/fake_logger'
53
+ File.open('test/chittagong/chittagong_parser.rb', 'w') do |file|
54
+ file << Dhaka::Parser.new(ChittagongGrammar, FakeLogger.new).compile_to_ruby_source_as(:ChittagongParser)
55
+ end
56
+ end
57
+
58
+ task :generate_chittagong_lexer do
59
+ require 'lib/dhaka'
60
+ require 'test/chittagong/chittagong_lexer_specification'
61
+ File.open('test/chittagong/chittagong_lexer.rb', 'w') do |file|
62
+ file << Dhaka::Lexer.new(ChittagongLexerSpecification).compile_to_ruby_source_as(:ChittagongLexer)
63
+ end
64
+ end
@@ -26,6 +26,7 @@ require 'logger'
26
26
  require 'delegate'
27
27
 
28
28
  %w[
29
+ dot/dot
29
30
  grammar/grammar_symbol
30
31
  grammar/production
31
32
  grammar/closure_hash
@@ -45,4 +46,15 @@ parser/parser
45
46
  parser/compiled_parser
46
47
  tokenizer/tokenizer
47
48
  evaluator/evaluator
49
+ lexer/regex_grammar
50
+ lexer/regex_tokenizer
51
+ lexer/regex_parser
52
+ lexer/state_machine
53
+ lexer/dfa
54
+ lexer/state
55
+ lexer/specification
56
+ lexer/lexeme
57
+ lexer/lexer_run
58
+ lexer/lexer
59
+ lexer/compiled_lexer
48
60
  ].each {|path| require File.join(File.dirname(__FILE__), path)}
@@ -0,0 +1,29 @@
1
+ module Dhaka
2
+ module Dot #:nodoc:
3
+ class Digraph #:nodoc:
4
+ def initialize(node_attributes = {})
5
+ @result = ["digraph x {"]
6
+ @result << %(node #{dotify_hash(node_attributes)})
7
+ yield(self)
8
+ @result << '}'
9
+ end
10
+
11
+ def node(obj, attributes = {})
12
+ @result << "#{obj.object_id} #{dotify_hash(attributes)}"
13
+ end
14
+
15
+ def edge(src, dest, attributes = {})
16
+ @result << "#{src.object_id} -> #{dest.object_id} #{dotify_hash(attributes)}"
17
+ end
18
+
19
+ def dotify_hash hash
20
+ sorted_key_value_pairs = hash.collect {|key, value| [key.to_s, value.to_s]}.sort
21
+ hash.empty? ? "" : '[' + sorted_key_value_pairs.collect {|key, value| "#{key}=#{value.to_s.inspect}"}.join(' ') + ']'
22
+ end
23
+
24
+ def to_dot
25
+ @result.join("\n")
26
+ end
27
+ end
28
+ end
29
+ end
@@ -1,14 +1,14 @@
1
1
  module Dhaka
2
- # This is the abstract base evaluator class. It is not directly instantiated.
3
- # When defining an evaluator for a specific grammar, we subclass it. e.g. for FooGrammar
4
- # we create a FooEvaluator that subclasses Evaluator. Note that FooEvaluator may not
5
- # be further subclassed.
2
+ # Abstract base class for evaluators.
3
+ #
4
+ # Defining an evaluator is an easy way to perform syntax-directed evaluation without having to generate an abstract
5
+ # syntax tree representation of the input.
6
6
  #
7
7
  # An evaluation rule for a given production named +bar+ is defined by calling +for_bar+ with
8
8
  # a block that performs the evaluation. For detailed examples, see the evaluators in the
9
9
  # test suite.
10
10
  #
11
- # The following is an evaluator for arithmetic expressions. When a syntax tree node is encountered that
11
+ # The following is an evaluator for arithmetic expressions. When a parse tree node is encountered that
12
12
  # corresponds to the production named +addition+, the block passed to +for_addition+ is invoked. The +evaluate+
13
13
  # method is then recursively called on the child nodes, in this case the operands to the addition operation. The
14
14
  # result is obtained by adding the evaluation results of the child nodes.
@@ -54,28 +54,18 @@ module Dhaka
54
54
  # end
55
55
  #
56
56
  # end
57
+
57
58
  class Evaluator
58
59
  class << self
59
- def inherited(evaluator)
60
- class << evaluator
61
- attr_accessor :grammar, :actions
62
- end
63
- end
64
-
65
- def method_missing(method_name, *args, &blk)
66
- name = method_name.to_s
67
- if name =~ /^for_(.+)$/
68
- rule_name = $1
69
- actions << rule_name
70
- send(:define_method, rule_name, &blk)
71
- else
72
- super
73
- end
74
- end
75
-
76
- # Evaluation rules are defined within a block passed to this method.
60
+ # Defining evaluation rules within a block passed to this method tells the evaluator to carry out a
61
+ # rudimentary check of your definitions and define default evaluation rules for pass-through
62
+ # productions (i.e. productions with expansions consisting of exactly one grammar symbol). The
63
+ # default evaluation rule for such productions is to simply return the result of calling +evaluate+
64
+ # on the unique child node. If you neglect to define a rule for a non-pass-through production (one
65
+ # where the expansion consists of multiple symbols), the evaluator will raise an exception
66
+ # at loading time, listing all the productions that absolutely need to be defined before you can
67
+ # continue.
77
68
  def define_evaluation_rules
78
- self.actions = []
79
69
  yield
80
70
  check_definitions
81
71
  end
@@ -93,9 +83,28 @@ module Dhaka
93
83
  non_trivial_productions_with_rules_undefined = filter[grammar.productions.select {|production| production.expansion.size != 1}]
94
84
  raise EvaluatorDefinitionError.new(non_trivial_productions_with_rules_undefined) unless non_trivial_productions_with_rules_undefined.empty?
95
85
  end
86
+
87
+ def inherited(evaluator)
88
+ class << evaluator
89
+ attr_accessor :grammar, :actions
90
+ end
91
+ evaluator.actions = []
92
+ end
93
+
94
+ def method_missing(method_name, *args, &blk)
95
+ name = method_name.to_s
96
+ if name =~ /^for_(.+)$/
97
+ rule_name = $1
98
+ raise "Attempted to define evaluation rule for non-existent production '#{rule_name}'" unless grammar.production_named(rule_name)
99
+ actions << rule_name
100
+ send(:define_method, rule_name, &blk)
101
+ else
102
+ super
103
+ end
104
+ end
96
105
  end
97
106
 
98
- # Evaluate a syntax tree node.
107
+ # Evaluate a parse tree node.
99
108
  def evaluate node
100
109
  @node_stack ||= []
101
110
  @node_stack << node.child_nodes
@@ -104,7 +113,7 @@ module Dhaka
104
113
  result
105
114
  end
106
115
 
107
- # Returns the array of child nodes of the node being currently evaluated.
116
+ # Returns the array of child nodes of the node being evaluated currently.
108
117
  def child_nodes
109
118
  @node_stack.last
110
119
  end
@@ -6,7 +6,6 @@ module Dhaka
6
6
 
7
7
  # Productions for specific grammar symbols are defined in the context of this class.
8
8
  class ProductionBuilder
9
-
10
9
  # +symbol+ is the grammar symbol that productions are being defined for.
11
10
  def initialize(grammar, symbol)
12
11
  @grammar = grammar
@@ -18,14 +17,15 @@ module Dhaka
18
17
  # set to the precedence of the grammar symbol corresponding to that name.
19
18
  #
20
19
  # See the arithmetic precedence grammar in the test suites for an example.
21
- def method_missing(production_name, expansion, options = {})
20
+ def method_missing(production_name, expansion, options = {}, &blk)
22
21
  expansion_symbols = expansion.collect {|name| @grammar.symbols[name]}
23
- production_args = [@symbol, expansion_symbols, production_name.to_s]
22
+ production_args = [@symbol, expansion_symbols, production_name.to_s, blk, @grammar.production_index]
24
23
  if precedence_symbol_name = options[:prec]
25
24
  production_args << @grammar.symbol_for_name(precedence_symbol_name).precedence
26
25
  end
27
26
 
28
27
  production = Production.new(*production_args)
28
+ @grammar.production_index += 1
29
29
 
30
30
  @symbol.nullable = true if expansion_symbols.empty?
31
31
  @grammar.productions_by_symbol[production.symbol] << production
@@ -35,9 +35,9 @@ module Dhaka
35
35
  end
36
36
 
37
37
  # The precedence builder defines three methods, +left+, +right+ and +nonassoc+. These accept arrays of grammar
38
- # symbols all of which have the same precedence level and associativity. This works almost exactly like Yacc.
38
+ # symbols all of which have the same precedence level and associativity.
39
39
  #
40
- # See the arithmetic precedence grammar in the test suites for an example.
40
+ # See the arithmetic precedence grammar in the test suites for an example of how this works.
41
41
  class PrecedenceBuilder
42
42
  def initialize(grammar) #:nodoc:
43
43
  @grammar = grammar
@@ -60,11 +60,11 @@ module Dhaka
60
60
  end
61
61
  end
62
62
 
63
- # This class is subclassed when specifying a grammar. Note that subclasses of this class may not be further subclassed.
63
+ # Abstract base class for grammar specifications.
64
64
  #
65
- # The following is a grammar specification for simple arithmetic. Familiarity with Yacc helps, but the short version is
66
- # that precedences for symbols are specified in ascending order of binding strength, with equal-strength symbols
67
- # on the same level. Production rules are specified for each symbol by specifying the name of the production (used when
65
+ # The following is a grammar specification for simple arithmetic. Precedences are specified as in Yacc -
66
+ # in ascending order of binding strength, with equal-strength symbols on the same level.
67
+ # Production rules are specified for each symbol by specifying the name of the production (used when
68
68
  # encoding the Evaluator) and the expansion for that particular production. For example, the production named
69
69
  # +addition+ expands the symbol <tt>'E'</tt> to the list of symbols <tt>['E', '+', 'E']</tt>.
70
70
  #
@@ -91,9 +91,14 @@ module Dhaka
91
91
  # end
92
92
  # end
93
93
  #
94
+ # In the above grammar, the symbols <tt>+</tt> and <tt>-</tt> are declared as being +left+-associative, meaning that
95
+ # 1 + 2 + 3 is parsed as (1 + 2) + 3 as opposed to 1 + (2 + 3) (+right+-associativity). The symbol <tt>^</tt> is declared
96
+ # +nonassoc+, which means that expressions such as 2 ^ 3 ^ 4 are not allowed (non-associative). <tt>+</tt> and <tt>-</tt> are listed
97
+ # before <tt>^</tt>, which means that they bind less tightly, and an expression such as 2 + 3 ^ 5 will always be parsed as
98
+ # 2 + (3 ^ 5) and not (2 + 3) ^ 5.
94
99
  class Grammar
95
100
  class << self
96
- # Used for defining the productions for the symbol with name +symbol+. The block +blk+ is
101
+ # Used for defining the Production-s for the symbol with name +symbol+. The block +blk+ is
97
102
  # evaluated in the context of a ProductionBuilder.
98
103
  def for_symbol symbol, &blk
99
104
  symbol = symbols[symbol]
@@ -116,15 +121,16 @@ module Dhaka
116
121
  end
117
122
  end
118
123
 
119
- def productions
124
+ # Returns a list of all the Production-s in this grammar.
125
+ def productions
120
126
  productions_by_name.values
121
127
  end
122
128
 
123
- def productions_for_symbol(symbol)
129
+ def productions_for_symbol(symbol) #:nodoc:
124
130
  productions_by_symbol[symbol]
125
131
  end
126
132
 
127
- def closure(kernel)
133
+ def closure(kernel) #:nodoc:
128
134
  channels = Set.new
129
135
 
130
136
  result = compute_closure(kernel) do |hash, item|
@@ -138,11 +144,11 @@ module Dhaka
138
144
  [channels, result]
139
145
  end
140
146
 
141
- def passive_channel(start_item, end_item)
147
+ def passive_channel(start_item, end_item) #:nodoc:
142
148
  PassiveChannel.new(self, start_item, end_item)
143
149
  end
144
150
 
145
- def first(given_symbol)
151
+ def first(given_symbol) #:nodoc:
146
152
  cached_result = __first_cache[given_symbol]
147
153
  return cached_result if cached_result
148
154
  result = compute_closure([given_symbol]) do |hash, symbol|
@@ -159,23 +165,41 @@ module Dhaka
159
165
  result
160
166
  end
161
167
 
168
+ # Returns the Production identified by +name+.
162
169
  def production_named(name)
163
170
  productions_by_name[name]
164
171
  end
165
172
 
173
+ # Returns the set of terminal symbols in the grammar.
166
174
  def terminal_symbols
167
175
  symbols.values.select {|symbol| symbol.terminal}
168
176
  end
169
-
177
+
178
+ # Returns the set of non-terminal symbols in the grammar.
170
179
  def non_terminal_symbols
171
180
  symbols.values.select {|symbol| symbol.non_terminal}
172
181
  end
173
182
 
183
+ # Returns the grammar as a string in a BNF-like format.
184
+ def to_bnf
185
+ result = []
186
+ last_symbol = nil
187
+ productions.sort.each do |production|
188
+ if production.symbol != last_symbol
189
+ result << ""
190
+ result << "#{production.symbol.name.inspect} :"
191
+ last_symbol = production.symbol
192
+ end
193
+ result << " | #{production.expansion.collect{|symbol| symbol.name.inspect}.join(' ')}"
194
+ end
195
+ result.join("\n")
196
+ end
197
+
174
198
  private
175
199
 
176
200
  def inherited(grammar)
177
201
  class << grammar
178
- attr_accessor :symbols, :productions_by_symbol, :productions_by_name, :start_symbol, :end_symbol, :__first_cache
202
+ attr_accessor :symbols, :productions_by_symbol, :productions_by_name, :start_symbol, :end_symbol, :__first_cache, :production_index
179
203
  end
180
204
  grammar.symbols = Hash.new {|hash, name| hash[name] = GrammarSymbol.new(name)}
181
205
  grammar.productions_by_symbol = Hash.new {|hash, name| hash[name] = Set.new([])}
@@ -183,6 +207,7 @@ module Dhaka
183
207
  grammar.end_symbol = grammar.symbols[END_SYMBOL_NAME]
184
208
  grammar.start_symbol = grammar.symbols[START_SYMBOL_NAME]
185
209
  grammar.__first_cache = {}
210
+ grammar.production_index = 0
186
211
  end
187
212
 
188
213
  def spontaneous_channel(start_item, end_item)
@@ -1,8 +1,9 @@
1
1
  module Dhaka
2
2
  # Each grammar symbol is uniquely identified by a string name. The name of a symbol can
3
- # be anything and need not correspond to its character representation. For example, an ampersand in the
4
- # character stream could be tokenized as a symbol with a name 'whatever'. In general, it's best to choose
5
- # symbol names that are descriptive.
3
+ # be anything (except the two reserved names <tt>'\_Start_'</tt> and <tt>'\_End_'</tt>) and need not
4
+ # correspond to its character representation. For example, an ampersand in the input string could
5
+ # be tokenized as a symbol with a name 'AND_OP'. You never have to directly instantiate a
6
+ # GrammarSymbol. It is done implicitly for you when you define a Grammar.
6
7
  class GrammarSymbol
7
8
  attr_reader :name
8
9
  attr_accessor :non_terminal, :nullable, :precedence, :associativity
@@ -1,13 +1,16 @@
1
1
  module Dhaka
2
2
  class Production
3
+ include Comparable
4
+
5
+ attr_reader :symbol, :expansion, :name, :action, :priority
3
6
 
4
- attr_reader :symbol, :expansion, :name
5
-
6
- def initialize(symbol, expansion, name, precedence = nil)
7
+ def initialize(symbol, expansion, name, action, priority, precedence = nil)
7
8
  @symbol = symbol
8
9
  @expansion = expansion
9
10
  @name = name
10
11
  @precedence = precedence
12
+ @action = action || proc { self }
13
+ @priority = priority
11
14
  end
12
15
 
13
16
  def precedence
@@ -26,5 +29,8 @@ module Dhaka
26
29
  "#{name} #{symbol} ::= #{expansion.join(' ')}"
27
30
  end
28
31
 
32
+ def <=> other
33
+ priority <=> other.priority
34
+ end
29
35
  end
30
36
  end
@@ -0,0 +1,46 @@
1
+ module Dhaka
2
+ # Abstract base class of all compiled Lexers. It is only used by generated code.
3
+ class CompiledLexer
4
+
5
+ class << self
6
+ # Returns a LexerRun that tokenizes +input+.
7
+ def lex input
8
+ LexerRun.new(self, input)
9
+ end
10
+
11
+ def start_state #:nodoc:
12
+ states[start_state_id]
13
+ end
14
+
15
+ def action_for_pattern pattern #:nodoc:
16
+ specification.items[pattern].action
17
+ end
18
+
19
+ private
20
+ def inherited(lexer)
21
+ class << lexer
22
+ attr_accessor :states, :specification, :start_state_id
23
+ end
24
+ lexer.states = Hash.new do |hash, state_id|
25
+ hash[state_id] = LexerSupport::State.new(lexer, nil)
26
+ end
27
+ end
28
+
29
+ def at_state x, &blk
30
+ states[x].instance_eval(&blk)
31
+ end
32
+
33
+ def start_with start_state_id
34
+ self.start_state_id = start_state_id
35
+ end
36
+
37
+ def switch_to dest_state_id
38
+ states[dest_state_id]
39
+ end
40
+
41
+ def inspect
42
+ "<Dhaka::CompiledLexer specification : #{specification}>"
43
+ end
44
+ end
45
+ end
46
+ end