dhaka 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. data/Rakefile +64 -0
  2. data/lib/dhaka.rb +12 -0
  3. data/lib/dot/dot.rb +29 -0
  4. data/lib/evaluator/evaluator.rb +35 -26
  5. data/lib/grammar/grammar.rb +42 -17
  6. data/lib/grammar/grammar_symbol.rb +4 -3
  7. data/lib/grammar/production.rb +9 -3
  8. data/lib/lexer/compiled_lexer.rb +46 -0
  9. data/lib/lexer/dfa.rb +71 -0
  10. data/lib/lexer/lexeme.rb +33 -0
  11. data/lib/lexer/lexer.rb +61 -0
  12. data/lib/lexer/lexer_run.rb +66 -0
  13. data/lib/lexer/regex_grammar.rb +368 -0
  14. data/lib/lexer/regex_parser.rb +1888 -0
  15. data/lib/lexer/regex_tokenizer.rb +14 -0
  16. data/lib/lexer/specification.rb +69 -0
  17. data/lib/lexer/state.rb +45 -0
  18. data/lib/lexer/state_machine.rb +37 -0
  19. data/lib/parser/action.rb +3 -3
  20. data/lib/parser/compiled_parser.rb +11 -3
  21. data/lib/parser/parse_result.rb +3 -5
  22. data/lib/parser/parse_tree.rb +6 -17
  23. data/lib/parser/parser.rb +15 -14
  24. data/lib/parser/parser_run.rb +4 -2
  25. data/lib/parser/parser_state.rb +16 -8
  26. data/lib/tokenizer/tokenizer.rb +5 -3
  27. data/test/arithmetic_precedence/arithmetic_precedence_lexer_specification.rb +23 -0
  28. data/test/arithmetic_precedence/arithmetic_precedence_parser_test.rb +4 -2
  29. data/test/chittagong/chittagong_driver.rb +12 -13
  30. data/test/chittagong/chittagong_driver_test.rb +18 -11
  31. data/test/chittagong/chittagong_evaluator.rb +7 -16
  32. data/test/chittagong/chittagong_evaluator_test.rb +7 -4
  33. data/test/chittagong/chittagong_grammar.rb +0 -6
  34. data/test/chittagong/chittagong_lexer.rb +109 -0
  35. data/test/chittagong/chittagong_lexer_specification.rb +39 -0
  36. data/test/chittagong/{chittagong_tokenizer_test.rb → chittagong_lexer_test.rb} +12 -6
  37. data/test/chittagong/chittagong_parser.rb +879 -0
  38. data/test/chittagong/chittagong_parser_test.rb +8 -10
  39. data/test/chittagong/chittagong_test.rb +17 -13
  40. data/test/compiled_parser_test.rb +7 -2
  41. data/test/evaluator_test.rb +0 -1
  42. data/test/grammar_test.rb +19 -1
  43. data/test/lexer_test.rb +215 -0
  44. data/test/parse_result_test.rb +8 -8
  45. data/test/parser_state_test.rb +0 -12
  46. metadata +21 -5
  47. data/test/arithmetic_precedence/arithmetic_precedence_tokenizer.rb +0 -39
  48. data/test/chittagong/chittagong_tokenizer.rb +0 -88
@@ -0,0 +1,64 @@
1
+ require 'rake/rdoctask'
2
+ require 'rake/gempackagetask'
3
+ require 'rake/testtask'
4
+ require 'rubygems'
5
+
6
+ Rake::RDocTask.new do |rdoc|
7
+ rdoc.rdoc_files.include('README', 'lib/**/*.rb')
8
+ rdoc.rdoc_files.exclude("lib/lexer/regex_parser.rb")
9
+ rdoc.main = "README"
10
+ rdoc.rdoc_dir = '../doc'
11
+ end
12
+
13
+ spec = Gem::Specification.new do |s|
14
+ s.name = "dhaka"
15
+ s.author = "Mushfeq Khan"
16
+ s.email = "mushfeq dot khan at gmail dot com"
17
+ s.version = ENV['VERSION'] || "0.0.0"
18
+ s.platform = Gem::Platform::RUBY
19
+ s.summary = "An LALR1 parser generator written in Ruby"
20
+ s.files = Dir.glob("{lib,test}/**/*").select {|file| file.include?('.rb') || file.include?('.txt')} + ['Rakefile']
21
+ s.require_path = 'lib'
22
+ s.autorequire = 'dhaka'
23
+ s.has_rdoc = true
24
+ end
25
+
26
+ Rake::GemPackageTask.new(spec) do |pkg|
27
+ pkg.package_dir = "../gems"
28
+ end
29
+
30
+ Rake::TestTask.new do |t|
31
+ t.libs << "test"
32
+ t.test_files = FileList['test/**/*test.rb']
33
+ t.verbose = true
34
+ end
35
+
36
+ task :generate_regex_parser do
37
+ require 'lib/dhaka'
38
+ File.open('lib/lexer/regex_parser.rb', 'w') do |file|
39
+ file << Dhaka::Parser.new(Dhaka::LexerSupport::RegexGrammar).compile_to_ruby_source_as('Dhaka::LexerSupport::RegexParser')
40
+ end
41
+ end
42
+
43
+ task :gem => [:test, :generate_regex_parser]
44
+
45
+ task :default => :test
46
+
47
+ task :test => [:generate_chittagong_parser, :generate_chittagong_lexer]
48
+
49
+ task :generate_chittagong_parser do
50
+ require 'lib/dhaka'
51
+ require 'test/chittagong/chittagong_grammar'
52
+ require 'test/fake_logger'
53
+ File.open('test/chittagong/chittagong_parser.rb', 'w') do |file|
54
+ file << Dhaka::Parser.new(ChittagongGrammar, FakeLogger.new).compile_to_ruby_source_as(:ChittagongParser)
55
+ end
56
+ end
57
+
58
+ task :generate_chittagong_lexer do
59
+ require 'lib/dhaka'
60
+ require 'test/chittagong/chittagong_lexer_specification'
61
+ File.open('test/chittagong/chittagong_lexer.rb', 'w') do |file|
62
+ file << Dhaka::Lexer.new(ChittagongLexerSpecification).compile_to_ruby_source_as(:ChittagongLexer)
63
+ end
64
+ end
@@ -26,6 +26,7 @@ require 'logger'
26
26
  require 'delegate'
27
27
 
28
28
  %w[
29
+ dot/dot
29
30
  grammar/grammar_symbol
30
31
  grammar/production
31
32
  grammar/closure_hash
@@ -45,4 +46,15 @@ parser/parser
45
46
  parser/compiled_parser
46
47
  tokenizer/tokenizer
47
48
  evaluator/evaluator
49
+ lexer/regex_grammar
50
+ lexer/regex_tokenizer
51
+ lexer/regex_parser
52
+ lexer/state_machine
53
+ lexer/dfa
54
+ lexer/state
55
+ lexer/specification
56
+ lexer/lexeme
57
+ lexer/lexer_run
58
+ lexer/lexer
59
+ lexer/compiled_lexer
48
60
  ].each {|path| require File.join(File.dirname(__FILE__), path)}
@@ -0,0 +1,29 @@
1
+ module Dhaka
2
+ module Dot #:nodoc:
3
+ class Digraph #:nodoc:
4
+ def initialize(node_attributes = {})
5
+ @result = ["digraph x {"]
6
+ @result << %(node #{dotify_hash(node_attributes)})
7
+ yield(self)
8
+ @result << '}'
9
+ end
10
+
11
+ def node(obj, attributes = {})
12
+ @result << "#{obj.object_id} #{dotify_hash(attributes)}"
13
+ end
14
+
15
+ def edge(src, dest, attributes = {})
16
+ @result << "#{src.object_id} -> #{dest.object_id} #{dotify_hash(attributes)}"
17
+ end
18
+
19
+ def dotify_hash hash
20
+ sorted_key_value_pairs = hash.collect {|key, value| [key.to_s, value.to_s]}.sort
21
+ hash.empty? ? "" : '[' + sorted_key_value_pairs.collect {|key, value| "#{key}=#{value.to_s.inspect}"}.join(' ') + ']'
22
+ end
23
+
24
+ def to_dot
25
+ @result.join("\n")
26
+ end
27
+ end
28
+ end
29
+ end
@@ -1,14 +1,14 @@
1
1
  module Dhaka
2
- # This is the abstract base evaluator class. It is not directly instantiated.
3
- # When defining an evaluator for a specific grammar, we subclass it. e.g. for FooGrammar
4
- # we create a FooEvaluator that subclasses Evaluator. Note that FooEvaluator may not
5
- # be further subclassed.
2
+ # Abstract base class for evaluators.
3
+ #
4
+ # Defining an evaluator is an easy way to perform syntax-directed evaluation without having to generate an abstract
5
+ # syntax tree representation of the input.
6
6
  #
7
7
  # An evaluation rule for a given production named +bar+ is defined by calling +for_bar+ with
8
8
  # a block that performs the evaluation. For detailed examples, see the evaluators in the
9
9
  # test suite.
10
10
  #
11
- # The following is an evaluator for arithmetic expressions. When a syntax tree node is encountered that
11
+ # The following is an evaluator for arithmetic expressions. When a parse tree node is encountered that
12
12
  # corresponds to the production named +addition+, the block passed to +for_addition+ is invoked. The +evaluate+
13
13
  # method is then recursively called on the child nodes, in this case the operands to the addition operation. The
14
14
  # result is obtained by adding the evaluation results of the child nodes.
@@ -54,28 +54,18 @@ module Dhaka
54
54
  # end
55
55
  #
56
56
  # end
57
+
57
58
  class Evaluator
58
59
  class << self
59
- def inherited(evaluator)
60
- class << evaluator
61
- attr_accessor :grammar, :actions
62
- end
63
- end
64
-
65
- def method_missing(method_name, *args, &blk)
66
- name = method_name.to_s
67
- if name =~ /^for_(.+)$/
68
- rule_name = $1
69
- actions << rule_name
70
- send(:define_method, rule_name, &blk)
71
- else
72
- super
73
- end
74
- end
75
-
76
- # Evaluation rules are defined within a block passed to this method.
60
+ # Defining evaluation rules within a block passed to this method tells the evaluator to carry out a
61
+ # rudimentary check of your definitions and define default evaluation rules for pass-through
62
+ # productions (i.e. productions with expansions consisting of exactly one grammar symbol). The
63
+ # default evaluation rule for such productions is to simply return the result of calling +evaluate+
64
+ # on the unique child node. If you neglect to define a rule for a non-pass-through production (one
65
+ # where the expansion consists of multiple symbols), the evaluator will raise an exception
66
+ # at loading time, listing all the productions that absolutely need to be defined before you can
67
+ # continue.
77
68
  def define_evaluation_rules
78
- self.actions = []
79
69
  yield
80
70
  check_definitions
81
71
  end
@@ -93,9 +83,28 @@ module Dhaka
93
83
  non_trivial_productions_with_rules_undefined = filter[grammar.productions.select {|production| production.expansion.size != 1}]
94
84
  raise EvaluatorDefinitionError.new(non_trivial_productions_with_rules_undefined) unless non_trivial_productions_with_rules_undefined.empty?
95
85
  end
86
+
87
+ def inherited(evaluator)
88
+ class << evaluator
89
+ attr_accessor :grammar, :actions
90
+ end
91
+ evaluator.actions = []
92
+ end
93
+
94
+ def method_missing(method_name, *args, &blk)
95
+ name = method_name.to_s
96
+ if name =~ /^for_(.+)$/
97
+ rule_name = $1
98
+ raise "Attempted to define evaluation rule for non-existent production '#{rule_name}'" unless grammar.production_named(rule_name)
99
+ actions << rule_name
100
+ send(:define_method, rule_name, &blk)
101
+ else
102
+ super
103
+ end
104
+ end
96
105
  end
97
106
 
98
- # Evaluate a syntax tree node.
107
+ # Evaluate a parse tree node.
99
108
  def evaluate node
100
109
  @node_stack ||= []
101
110
  @node_stack << node.child_nodes
@@ -104,7 +113,7 @@ module Dhaka
104
113
  result
105
114
  end
106
115
 
107
- # Returns the array of child nodes of the node being currently evaluated.
116
+ # Returns the array of child nodes of the node being evaluated currently.
108
117
  def child_nodes
109
118
  @node_stack.last
110
119
  end
@@ -6,7 +6,6 @@ module Dhaka
6
6
 
7
7
  # Productions for specific grammar symbols are defined in the context of this class.
8
8
  class ProductionBuilder
9
-
10
9
  # +symbol+ is the grammar symbol that productions are being defined for.
11
10
  def initialize(grammar, symbol)
12
11
  @grammar = grammar
@@ -18,14 +17,15 @@ module Dhaka
18
17
  # set to the precedence of the grammar symbol corresponding to that name.
19
18
  #
20
19
  # See the arithmetic precedence grammar in the test suites for an example.
21
- def method_missing(production_name, expansion, options = {})
20
+ def method_missing(production_name, expansion, options = {}, &blk)
22
21
  expansion_symbols = expansion.collect {|name| @grammar.symbols[name]}
23
- production_args = [@symbol, expansion_symbols, production_name.to_s]
22
+ production_args = [@symbol, expansion_symbols, production_name.to_s, blk, @grammar.production_index]
24
23
  if precedence_symbol_name = options[:prec]
25
24
  production_args << @grammar.symbol_for_name(precedence_symbol_name).precedence
26
25
  end
27
26
 
28
27
  production = Production.new(*production_args)
28
+ @grammar.production_index += 1
29
29
 
30
30
  @symbol.nullable = true if expansion_symbols.empty?
31
31
  @grammar.productions_by_symbol[production.symbol] << production
@@ -35,9 +35,9 @@ module Dhaka
35
35
  end
36
36
 
37
37
  # The precedence builder defines three methods, +left+, +right+ and +nonassoc+. These accept arrays of grammar
38
- # symbols all of which have the same precedence level and associativity. This works almost exactly like Yacc.
38
+ # symbols all of which have the same precedence level and associativity.
39
39
  #
40
- # See the arithmetic precedence grammar in the test suites for an example.
40
+ # See the arithmetic precedence grammar in the test suites for an example of how this works.
41
41
  class PrecedenceBuilder
42
42
  def initialize(grammar) #:nodoc:
43
43
  @grammar = grammar
@@ -60,11 +60,11 @@ module Dhaka
60
60
  end
61
61
  end
62
62
 
63
- # This class is subclassed when specifying a grammar. Note that subclasses of this class may not be further subclassed.
63
+ # Abstract base class for grammar specifications.
64
64
  #
65
- # The following is a grammar specification for simple arithmetic. Familiarity with Yacc helps, but the short version is
66
- # that precedences for symbols are specified in ascending order of binding strength, with equal-strength symbols
67
- # on the same level. Production rules are specified for each symbol by specifying the name of the production (used when
65
+ # The following is a grammar specification for simple arithmetic. Precedences are specified as in Yacc -
66
+ # in ascending order of binding strength, with equal-strength symbols on the same level.
67
+ # Production rules are specified for each symbol by specifying the name of the production (used when
68
68
  # encoding the Evaluator) and the expansion for that particular production. For example, the production named
69
69
  # +addition+ expands the symbol <tt>'E'</tt> to the list of symbols <tt>['E', '+', 'E']</tt>.
70
70
  #
@@ -91,9 +91,14 @@ module Dhaka
91
91
  # end
92
92
  # end
93
93
  #
94
+ # In the above grammar, the symbols <tt>+</tt> and <tt>-</tt> are declared as being +left+-associative, meaning that
95
+ # 1 + 2 + 3 is parsed as (1 + 2) + 3 as opposed to 1 + (2 + 3) (+right+-associativity). The symbol <tt>^</tt> is declared
96
+ # +nonassoc+ which means that expressions such as 2 ^ 3 ^ 4 are not allowed (non-associative). <tt>+</tt> and <tt>-</tt> are listed
97
+ # before <tt>^</tt> which means that they bind lower, and an expression such as 2 + 3 ^ 5 will always be parsed as
98
+ # 2 + (3 ^ 5) and not (2 + 3) ^ 5.
94
99
  class Grammar
95
100
  class << self
96
- # Used for defining the productions for the symbol with name +symbol+. The block +blk+ is
101
+ # Used for defining the Production-s for the symbol with name +symbol+. The block +blk+ is
97
102
  # evaluated in the context of a ProductionBuilder.
98
103
  def for_symbol symbol, &blk
99
104
  symbol = symbols[symbol]
@@ -116,15 +121,16 @@ module Dhaka
116
121
  end
117
122
  end
118
123
 
119
- def productions
124
+ # Returns a list of all the Production-s in this grammar.
125
+ def productions
120
126
  productions_by_name.values
121
127
  end
122
128
 
123
- def productions_for_symbol(symbol)
129
+ def productions_for_symbol(symbol) #:nodoc:
124
130
  productions_by_symbol[symbol]
125
131
  end
126
132
 
127
- def closure(kernel)
133
+ def closure(kernel) #:nodoc:
128
134
  channels = Set.new
129
135
 
130
136
  result = compute_closure(kernel) do |hash, item|
@@ -138,11 +144,11 @@ module Dhaka
138
144
  [channels, result]
139
145
  end
140
146
 
141
- def passive_channel(start_item, end_item)
147
+ def passive_channel(start_item, end_item) #:nodoc:
142
148
  PassiveChannel.new(self, start_item, end_item)
143
149
  end
144
150
 
145
- def first(given_symbol)
151
+ def first(given_symbol) #:nodoc:
146
152
  cached_result = __first_cache[given_symbol]
147
153
  return cached_result if cached_result
148
154
  result = compute_closure([given_symbol]) do |hash, symbol|
@@ -159,23 +165,41 @@ module Dhaka
159
165
  result
160
166
  end
161
167
 
168
+ # Returns the Production identified by +name+.
162
169
  def production_named(name)
163
170
  productions_by_name[name]
164
171
  end
165
172
 
173
+ # Returns the set of terminal symbols in the grammar.
166
174
  def terminal_symbols
167
175
  symbols.values.select {|symbol| symbol.terminal}
168
176
  end
169
-
177
+
178
+ # Returns the set of non-terminal symbols in the grammar.
170
179
  def non_terminal_symbols
171
180
  symbols.values.select {|symbol| symbol.non_terminal}
172
181
  end
173
182
 
183
+ # Export the grammar to a BNF-like format
184
+ def to_bnf
185
+ result = []
186
+ last_symbol = nil
187
+ productions.sort.each do |production|
188
+ if production.symbol != last_symbol
189
+ result << ""
190
+ result << "#{production.symbol.name.inspect} :"
191
+ last_symbol = production.symbol
192
+ end
193
+ result << " | #{production.expansion.collect{|symbol| symbol.name.inspect}.join(' ')}"
194
+ end
195
+ result.join("\n")
196
+ end
197
+
174
198
  private
175
199
 
176
200
  def inherited(grammar)
177
201
  class << grammar
178
- attr_accessor :symbols, :productions_by_symbol, :productions_by_name, :start_symbol, :end_symbol, :__first_cache
202
+ attr_accessor :symbols, :productions_by_symbol, :productions_by_name, :start_symbol, :end_symbol, :__first_cache, :production_index
179
203
  end
180
204
  grammar.symbols = Hash.new {|hash, name| hash[name] = GrammarSymbol.new(name)}
181
205
  grammar.productions_by_symbol = Hash.new {|hash, name| hash[name] = Set.new([])}
@@ -183,6 +207,7 @@ module Dhaka
183
207
  grammar.end_symbol = grammar.symbols[END_SYMBOL_NAME]
184
208
  grammar.start_symbol = grammar.symbols[START_SYMBOL_NAME]
185
209
  grammar.__first_cache = {}
210
+ grammar.production_index = 0
186
211
  end
187
212
 
188
213
  def spontaneous_channel(start_item, end_item)
@@ -1,8 +1,9 @@
1
1
  module Dhaka
2
2
  # Each grammar symbol is uniquely identified by a string name. The name of a symbol can
3
- # be anything and need not correspond to its character representation. For example, an ampersand in the
4
- # character stream could be tokenized as a symbol with a name 'whatever'. In general, it's best to choose
5
- # symbol names that are descriptive.
3
+ # be anything (except the two reserved names <tt>'\_Start_'</tt> and <tt>'\_End_'</tt>) and need not
4
+ # correspond to its character representation. For example, an ampersand in the input string could
5
+ # be tokenized as a symbol with a name 'AND_OP'. You never have to directly instantiate a
6
+ # GrammarSymbol. It is done implicitly for you when you define a Grammar.
6
7
  class GrammarSymbol
7
8
  attr_reader :name
8
9
  attr_accessor :non_terminal, :nullable, :precedence, :associativity
@@ -1,13 +1,16 @@
1
1
  module Dhaka
2
2
  class Production
3
+ include Comparable
4
+
5
+ attr_reader :symbol, :expansion, :name, :action, :priority
3
6
 
4
- attr_reader :symbol, :expansion, :name
5
-
6
- def initialize(symbol, expansion, name, precedence = nil)
7
+ def initialize(symbol, expansion, name, action, priority, precedence = nil)
7
8
  @symbol = symbol
8
9
  @expansion = expansion
9
10
  @name = name
10
11
  @precedence = precedence
12
+ @action = action || proc { self }
13
+ @priority = priority
11
14
  end
12
15
 
13
16
  def precedence
@@ -26,5 +29,8 @@ module Dhaka
26
29
  "#{name} #{symbol} ::= #{expansion.join(' ')}"
27
30
  end
28
31
 
32
+ def <=> other
33
+ priority <=> other.priority
34
+ end
29
35
  end
30
36
  end
@@ -0,0 +1,46 @@
1
+ module Dhaka
2
+ # Abstract base class of all compiled Lexers. It is only used by generated code.
3
+ class CompiledLexer
4
+
5
+ class << self
6
+ # Returns a LexerRun that tokenizes +input+.
7
+ def lex input
8
+ LexerRun.new(self, input)
9
+ end
10
+
11
+ def start_state #:nodoc:
12
+ states[start_state_id]
13
+ end
14
+
15
+ def action_for_pattern pattern #:nodoc:
16
+ specification.items[pattern].action
17
+ end
18
+
19
+ private
20
+ def inherited(lexer)
21
+ class << lexer
22
+ attr_accessor :states, :specification, :start_state_id
23
+ end
24
+ lexer.states = Hash.new do |hash, state_id|
25
+ hash[state_id] = LexerSupport::State.new(lexer, nil)
26
+ end
27
+ end
28
+
29
+ def at_state x, &blk
30
+ states[x].instance_eval(&blk)
31
+ end
32
+
33
+ def start_with start_state_id
34
+ self.start_state_id = start_state_id
35
+ end
36
+
37
+ def switch_to dest_state_id
38
+ states[dest_state_id]
39
+ end
40
+
41
+ def inspect
42
+ "<Dhaka::CompiledLexer specification : #{specification}>"
43
+ end
44
+ end
45
+ end
46
+ end