kanocc 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,15 @@
1
+ Kanocc - Kanocc ain't no compiler-compiler
2
+
3
+ Kanocc is a ruby framework for parsing and translation
4
+
5
+ == Getting Kanocc
6
+
7
+ Kanocc is free software, available under the GPL v3 license.
8
+ You can get it from the rubyforge git repository:
9
+
10
+ git clone git://rubyforge.org/kanocc.git
11
+
12
+ == Documentation and contact information
13
+
14
+ Visit http://kanocc.rubyforge.org where you'll find links to documentation,
15
+ mailing lists and bug tracking.
@@ -0,0 +1,98 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Kanocc - Kanocc ain't no compiler-compiler
4
+ #
5
+ # Copyright 2008 Christian Surlykke
6
+ #
7
+ # This file is part of Kanocc.
8
+ #
9
+ # Kanocc is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU General Public License, version 3
11
+ # as published by the Free Software Foundation.
12
+ #
13
+ # Kanocc is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License, version 3 for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License,
19
+ # version 3 along with Kanocc. If not, see <http://www.gnu.org/licenses/>.
20
+ #
21
+ libdir = File.expand_path(File.join(File.dirname(__FILE__), "..", "lib"))
22
+ $:.unshift(libdir)
23
+ require "kanocc.rb"
24
+ #require 'rubygems'
25
+ #require 'kanocc'
26
+ require "logger"
27
+ #require "breakpoint"
28
+
29
+ # Example use of Kanocc for a small calculator program.
30
+ # It implements the grammar:
31
+ #
32
+ # Program ::= Line*
33
+ # Line ::= Expr "\n"
34
+ # Expr ::= Expr '+' Expr
35
+ # | Expr '-' Expr
36
+ # | Expr '*' Expr
37
+ # | Expr '/' Expr
38
+ # | '(' Expr ')'
39
+ # | Number
40
+ #
41
+ # With the lexical grammar:
42
+ #
43
+ # Number ::= \d+ | 0x[0-9A-F]+, plus the literals '(', ')', '+', '-', '*', '/' and "\n"
44
+
45
+
46
+ # ========== Define a lexical grammar =============
47
# Lexical token for integer literals. Two spellings are recognised; in both
# cases the matched text (@m[0]) is converted to an Integer stored in +val+.
class Number < Kanocc::Token
  attr_reader :val

  # One or more decimal digits.
  pattern(/\d+/) do
    @val = @m[0].to_i
  end

  # "0x" followed by upper-case hexadecimal digits.
  pattern(/0x[0-9A-F]+/) do
    @val = @m[0].hex
  end
end
52
+
53
+ # ========== Define a grammar =====================
54
# Arithmetic expression. Every rule stores the computed value in +val+.
class Expr < Kanocc::Nonterminal
  attr_reader :val

  # Binary operators: the operands are the first and last of the three
  # right-hand-side elements, the operator literal sits between them.
  rule(Expr, '+', Expr) { @val = @rhs.first.val + @rhs.last.val }
  rule(Expr, '-', Expr) { @val = @rhs.first.val - @rhs.last.val }
  rule(Expr, '*', Expr) { @val = @rhs.first.val * @rhs.last.val }
  rule(Expr, '/', Expr) { @val = @rhs.first.val / @rhs.last.val }

  # Parenthesised expression: take the value of the inner Expr.
  rule('(', Expr, ')') { @val = @rhs[1].val }

  # A bare number.
  rule(Number) { @val = @rhs.first.val }

  set_operator_precedence(%w[* /], 2)
end
66
+
67
# One line of input: either an expression, whose value is printed, or
# something unparsable up to the newline, which is echoed back.
class Line < Kanocc::Nonterminal
  rule(Expr, "\n") { p(@rhs.first.val) }

  rule(Kanocc::Error, "\n") do
    # start_pos/end_pos delimit the text covered by this rule in $source.
    length = @rhs.end_pos - @rhs.start_pos
    error_string = $source[@rhs.start_pos, length]
    puts "Sorry - didn't understand: #{error_string.inspect}"
  end
end
74
+
75
# Start symbol: a possibly empty sequence of lines.
class Program < Kanocc::Nonterminal
  # Program ::= Program Line
  rule(Program, Line)
  # Program ::= (empty)
  rule()
end
79
+
80
# Build a parser with 'Program' as the grammar's start symbol.
parser = Kanocc::Kanocc.new(Program)
#parser.logger.level = Logger::INFO

# The input: one expression per line. The second line is deliberately
# malformed so that the Kanocc::Error rule of Line is exercised.
$source = <<-EOF
2 * 3
3 - 3 +
7 - 2 - 1
3 * 2 + 4
4 + 3 * 3
0xFF + 7
EOF
puts "parsing: \n#{$source}"

# Run the parser; output is produced by the semantic actions of Line.
parser.parse($source)
98
+
@@ -0,0 +1,65 @@
1
+ #!/usr/bin/ruby
2
+ #
3
+ # Copyright 2008 Christian Surlykke
4
+ #
5
+ # This file is part of Kanocc.
6
+ #
7
+ # Kanocc is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License, version 3
9
+ # as published by the Free Software Foundation.
10
+ #
11
+ # Kanocc is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License, version 3 for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License,
17
+ # version 3 along with Kanocc. If not, see <http://www.gnu.org/licenses/>.
18
+ #
19
+ require 'logger'
20
+ $:.unshift(File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib')))
21
+ require "kanocc"
22
+
23
+ # ======== Bracket Packing Grammar ========
24
+ # Package ::= '(' 'B' ')'
25
+ # | '{' 'B' '}'
26
+ # | '[' 'B' ']'
27
+ # | '(' PackageList ')'
28
+ # | '{' PackageList '}'
29
+ # | '[' PackageList ']'
30
+ #
31
+ # PackageList ::= Package
32
+ # | PackageList Package
33
+
34
# Forward declaration: Package's rules refer to PackageList, whose own rule
# (which in turn refers to Package) is added when the class is reopened below.
class PackageList < Kanocc::Nonterminal; end
36
+
37
# A bracketed package. +val+ holds the package rendered as text, with any
# missing bracket filled in by the error-correcting rules.
class Package < Kanocc::Nonterminal
  attr_reader :val

  # A single 'B' wrapped in one kind of bracket.
  rule('(', 'B', ')') { @val = '(B)' }
  rule('{', 'B', '}') { @val = '{B}' }
  rule('[', 'B', ']') { @val = '[B]' }

  # A list of packages wrapped in one kind of bracket.
  rule('(', PackageList, ')') { @val = "(#{@rhs[1].val})" }
  rule('{', PackageList, '}') { @val = "{#{@rhs[1].val}}" }
  rule('[', PackageList, ']') { @val = "[#{@rhs[1].val}]" }

  # Some error-correcting rules: one bracket is missing, so the list is the
  # first element (opening bracket missing) or the last one (closing bracket
  # missing). Each rule is followed by a 'prec -2' declaration.
  rule(PackageList, ')') { @val = "(#{@rhs.first.val})" }
  prec(-2)
  rule('(', PackageList) { @val = "(#{@rhs.last.val})" }
  prec(-2)
  rule(PackageList, '}') { @val = "{#{@rhs.first.val}}" }
  prec(-2)
  rule('{', PackageList) { @val = "{#{@rhs.last.val}}" }
  prec(-2)
  rule(PackageList, ']') { @val = "[#{@rhs.first.val}]" }
  prec(-2)
  rule('[', PackageList) { @val = "[#{@rhs.last.val}]" }
  prec(-2)
end
53
+
54
# The actual definition of PackageList: its value is the concatenation of
# the values of the packages matched by om(Package).
class PackageList
  attr_reader :val

  rule(om(Package)) do
    packages = @rhs.first.elements
    @val = packages.map { |package| package.val }.join("")
  end
end
58
+
59
# Set up a parser with Package as start symbol
package_checker = Kanocc::Kanocc.new(Package)

# And go: one well-formed input and two with missing brackets
['[(B)]', '[[B]', '[(B)]](B){{(B)]'].each do |input|
  puts "#{input} becomes " + package_checker.parse(input).val
end
data/lib/kanocc.rb ADDED
@@ -0,0 +1,303 @@
1
+ #
2
+ # Kanocc - Kanocc ain't no compiler-compiler
3
+ #
4
+ # Copyright 2008 Christian Surlykke
5
+ #
6
+ # This file is part of Kanocc.
7
+ #
8
+ # Kanocc is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License, version 3
10
+ # as published by the Free Software Foundation.
11
+ #
12
+ # Kanocc is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License, version 3 for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License,
18
+ # version 3 along with Kanocc. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+
22
+ require 'kanocc/token'
23
+ require 'kanocc/nonterminal'
24
+ require 'kanocc/scanner'
25
+ require 'kanocc/earley'
26
+ require 'logger'
27
+
28
+ # = Kanocc - Kanocc ain't no compiler-compiler
29
+ #
30
+ # Kanocc is a ruby-framework for parsing and translating.
31
+ # Emphasis is on easy, 'scripty' use, and seamless integration with ruby. Performance has been
32
+ # a secondary concern.
33
+ # In its default configuration, Kanocc uses its own lexical scanner and a parser
34
+ # based on Earley's algorithm to allow handling of any context-free grammar. It is possible,
35
+ # however, to plug in other lexical scanners or parsers. See ##FIXMEREF.
36
+ #
37
+ # A simple example.
38
+ #
39
+ # Reading and evaluating reverse polish notated expressions. Consider this grammar:
40
+ #
41
+ # E ::= E E '+'
42
+ # | E E '-'
43
+ # | E E '*'
44
+ # | E E '/'
45
+ # | NUM
46
+ #
47
+ # NUM a sequence of digits
48
+ #
49
+ # In Kanocc you could do it like this:
50
+ #
51
+ # require "kanocc"
52
+ #
53
+ # # ========== Define a lexical grammar =============
54
+ # class NUM < Kanocc::Token
55
+ # attr_reader :val
56
+ # set_pattern(/\d+/) { @val = @m[0].to_i}
57
+ # end
58
+ #
59
+ # # ========== Define a grammar =====================
60
+ # class E < Kanocc::Nonterminal
61
+ # attr_reader :val
62
+ # rule(E, E, "+") { @val = @rhs[0].val + @rhs[1].val}
63
+ # rule(E, E, "-") { @val = @rhs[0].val - @rhs[1].val}
64
+ # rule(E, E, "*") { @val = @rhs[0].val * @rhs[1].val}
65
+ # rule(E, E, "/") { @val = @rhs[0].val / @rhs[1].val}
66
+ # rule(NUM) { @val = @rhs[0].val }
67
+ # end
68
+ #
69
+ # # ========== Set up a parser ======================
70
+ # myParser = Kanocc::Kanocc.new(E)
71
+ #
72
+ # # ========== And try it out =======================
73
+ # puts "3 4 + 2 - = #{myParser.parse("3 4 + 2 -").val}"
74
+ #
75
+ # and you'd get:
76
+ #
77
+ # 3 4 + 2 - = 5
78
+ #
79
+ # For more examples, please refer to the documentation: ##FIXMEREF
80
+ #
81
+ module Kanocc
82
# Front end that connects a scanner and a parser. The scanner produces token
# matches, the parser consumes them and reports tokens and reductions back
# to this object, which maintains a stack of [symbol, start_pos, end_pos]
# entries and runs the semantic actions attached to the rules.
class Kanocc
  # Only readers are generated here; the writers are defined explicitly
  # below because they also hand the shared logger on.
  attr_reader :scanner, :parser, :logger

  # Creates a new instance of Kanocc, with the given start symbol.
  # From the start_symbol, Kanocc will deduce the grammar and the
  # grammar symbols.
  def initialize(start_symbol)
    @start_symbol = start_symbol
    @logger = Logger.new(STDOUT)
    @logger.datetime_format = ""
    @logger.level = Logger::WARN
    @scanner = Scanner.new(:logger => @logger)
    @parser = EarleyParser.new(self, :logger => @logger)
  end

  # Replaces the logger. Passing nil installs a fresh Logger writing to
  # STDOUT. The new logger is passed on to the parser and the scanner when
  # they offer a logger= writer.
  def logger=(logger)
    @logger = logger || Logger.new(STDOUT)
    @parser.logger = @logger if @parser.respond_to?(:logger=)
    @scanner.logger = @logger if @scanner.respond_to?(:logger=)
  end

  # Plugs in another parser and gives it the current logger if it accepts one.
  def parser=(parser)
    @parser = parser
    @parser.logger = @logger if parser.respond_to?(:logger=)
  end

  # Plugs in another scanner and gives it the current logger if it accepts one.
  def scanner=(scanner)
    @scanner = scanner
    @scanner.logger = @logger if scanner.respond_to?(:logger=)
  end

  # Consume input. Kanocc will parse input according to the rules given, and
  # - if parsing succeeds - return an instance of the grammar's start symbol.
  # Input may be a String or an IO object.
  #
  # Raises a RuntimeError if no start symbol was given.
  def parse(input)
    raise "Start symbol not defined" unless @start_symbol
    tell_parser_start_symbol(@start_symbol)
    @parser.prepare
    @stack = []
    @inputPos = 0
    @scanner.each_token(input) do |token_match|
      @logger.info "got #{token_match.inspect} from scanner"
      @inputPos += 1
      @parser.consume(token_match)
    end
    @parser.eof
    # The first stack entry is [symbol, start_pos, end_pos]; return the symbol.
    @stack[0][0]
  end

  # Parses the contents of a file. +file+ is either a String, which is taken
  # to be a path, or an object responding to +read+. A file opened here is
  # always closed again; an object passed in by the caller is left open.
  def parse_file(file)
    input = if file.is_a?(String)
              File.read(File.expand_path(file))
            else
              file.read
            end
    parse(input)
  end

  # Define whitespace. By default, Kanocc will recognize anything that matches
  # /\s/ as whitespace.
  # set_whitespace takes a variable number of arguments, each of which must
  # be a regular expression.
  def set_whitespace(*ws)
    @scanner.set_whitespace(*ws)
  end

  # Define which tokens Kanocc should recognize. If this method is not called
  # Kanocc will scan for those tokens that are mentioned in the grammar.
  # set_tokens takes a variable number of arguments. Each argument must
  # either be a string or a class which is a subclass of Kanocc::Token
  def set_tokens(*tokens)
    @scanner.set_recognized(*tokens)
  end

  # The parser must call this method when it has decided upon a reduction.
  # As argument it should give the rule by which to reduce.
  #
  # The topmost rule.rhs.length stack entries are replaced by one entry for a
  # new instance of rule.lhs. If the rule has a semantic action (rule.method)
  # it is invoked on that instance with @rhs set to the popped symbols.
  def report_reduction(rule)
    @logger.info "Reducing by " + rule.inspect
    rhs_length = rule.rhs.length
    raise "Fatal: stack too short!" if @stack.length < rhs_length
    nonterminal = rule.lhs.new
    stack_part = @stack.slice!(-rhs_length, rhs_length)
    if rhs_length > 0
      # Span from the start of the first symbol to the end of the last.
      start_pos, end_pos = stack_part[0][1], stack_part[-1][2]
    elsif @stack.length > 0
      # Empty rule: zero-width span right after the current stack top.
      start_pos, end_pos = @stack[-1][2], @stack[-1][2]
    else
      start_pos, end_pos = 0, 0
    end
    if rule.method
      rhs = Rhs.new(stack_part.map { |entry| entry[0] }, start_pos, end_pos)
      # @rhs is only meant to be visible while the action runs, so whatever
      # was there before is put back afterwards.
      old_rhs = nonterminal.instance_variable_get('@rhs')
      nonterminal.instance_variable_set('@rhs', rhs)
      nonterminal.send(rule.method)
      nonterminal.instance_variable_set('@rhs', old_rhs)
    end
    @stack.push([nonterminal, start_pos, end_pos])
    show_stack
  end

  # Placeholder without behaviour; report_reduction does this work inline.
  def calculate_start_and_end_pos(rule)
  end

  # Placeholder without behaviour; report_reduction does this work inline.
  def evaluate_semantics_and_pop(rule, nonterminal)
  end

  # The parser must call this method when it consumes a token.
  #
  # +tokenmatch+ is a hash as handed out by the scanner. The keys read here
  # are :matches (a list of hashes with :token or :literal, and for tokens
  # :regexp and optionally :method_name), :string, :start_pos and :length.
  # +element+ is the grammar symbol (a token class or a string literal)
  # which the parser decided the match stands for.
  #
  # Raises KanoccException if none of the matches corresponds to +element+.
  def report_token(tokenmatch, element)
    @logger.info("Pushing token: " + element.inspect)
    match = tokenmatch[:matches].find do |m|
      m[:token] == element || m[:literal] == element
    end
    unless match
      raise KanoccException,
            "No match for #{element.inspect} in #{tokenmatch.inspect}"
    end

    if match[:token]
      token = match[:token].new
      token.m = match[:regexp].match(tokenmatch[:string])
      token.send(match[:method_name]) if match[:method_name]
    else # It's a string literal
      token = match[:literal]
    end

    # end_pos is the position of the first character after the token.
    start_pos = tokenmatch[:start_pos]
    end_pos = start_pos + tokenmatch[:length]

    @stack.push([token, start_pos, end_pos])
    show_stack
  end

  # Gives the parser its start symbol, collects every terminal reachable
  # from it and tells the scanner to recognize those. The rules of all
  # reachable nonterminals are logged at info level.
  def tell_parser_start_symbol(start_symbol)
    @parser.startsymbol = start_symbol
    bag_of_terminals = {}
    find_tokens(start_symbol, bag_of_terminals)
    terminals = bag_of_terminals.keys
    @logger.debug "tokens = " + terminals.inspect
    strings = terminals.find_all { |ter| ter.is_a?(String) }
    @logger.info("Literals: " + strings.inspect)
    tokens = terminals.find_all do |ter|
      ter.is_a?(Class) && ter.ancestors.member?(Token)
    end
    @logger.info("Tokens: " + tokens.inspect)
    @scanner.set_recognized(*(strings + tokens))

    # Show rules. The array is appended to while it is being iterated;
    # Array#each also visits the appended elements, so every nonterminal
    # reachable from the start symbol is listed exactly once.
    @logger.info("Rules:")
    nonterminals = [start_symbol]
    nonterminals.each do |nonterminal|
      nonterminal.rules.each do |rule|
        @logger.info(" " + rule.inspect)
        rule.rhs.each do |gs|
          if gs.is_a?(Class) && gs.ancestors.member?(Nonterminal) &&
             !nonterminals.member?(gs)
            nonterminals.push(gs)
          end
        end
      end
    end
  end

  # Walks the grammar from +nonterminal+ and records every right-hand-side
  # symbol that is not a Nonterminal class as a key of +collectedTokens+.
  # +visited_nonterminals+ prevents endless recursion on recursive grammars.
  def find_tokens(nonterminal, collectedTokens, visited_nonterminals = {})
    return if visited_nonterminals[nonterminal]
    visited_nonterminals[nonterminal] = true
    nonterminal.rules.each do |r|
      r.rhs.each do |gs|
        if gs.is_a?(Class) && gs.ancestors.member?(Nonterminal)
          find_tokens(gs, collectedTokens, visited_nonterminals)
        else
          collectedTokens[gs] = true
        end
      end
    end
  end

  # For debugging
  def show_stack
    @logger.info("Stack: #{@stack.inspect}") if @logger
  end

  # Renders a list of grammar symbols as "[a, b, ...]".
  def show_grammar_symbols(tokens)
    "[" + tokens.map { |token| show_grammar_symbol(token) }.join(", ") + "]"
  end

  # Renders a single grammar symbol instance for debugging output.
  # NOTE(review): relies on the symbol responding to start_pos/end_pos -
  # confirm that Token and Nonterminal provide these.
  def show_grammar_symbol(gs)
    if gs.is_a?(Token)
      "#{gs.class}(#{gs.m[0].inspect}, #{gs.start_pos}, #{gs.end_pos})"
    elsif gs.is_a?(Nonterminal)
      "#{gs.class}(#{gs.start_pos}, #{gs.end_pos})"
    else
      gs.inspect
    end
  end
end
279
+
280
# The right-hand side handed to a semantic action: an Array of the matched
# grammar symbols, extended with the input span they cover.
class Rhs < Array
  attr_accessor :start_pos, :end_pos

  # +arr+ holds the symbols; +start_pos+ and +end_pos+ delimit the span.
  def initialize(arr, start_pos, end_pos)
    @start_pos, @end_pos = start_pos, end_pos
    super(arr)
  end

  # Array#inspect already returns the printable form, so it is used as is;
  # inspecting it a second time would wrap it in quotes and escape it.
  def inspect
    "#{super}, #{start_pos.inspect}, #{end_pos.inspect}"
  end
end
291
+
292
# Carries the details of a parse failure: inputPos, inputSymbol and expected.
# NOTE(review): this derives from Exception rather than StandardError, so a
# bare 'rescue' does not catch it. Kept as is so existing rescue clauses
# behave as before.
class ParseException < Exception
  attr_accessor :inputPos, :inputSymbol, :expected

  def initialize(inputPos, inputSymbol, expected)
    @inputPos, @inputSymbol, @expected = inputPos, inputSymbol, expected
    # Give the exception a message that carries the details; without this
    # call the message is just the class name.
    super("Parse error: inputPos=#{inputPos.inspect}, " \
          "inputSymbol=#{inputSymbol.inspect}, expected=#{expected.inspect}")
  end
end
298
+
299
# General Kanocc error; adds nothing to Exception.
class KanoccException < Exception; end
301
+ end
302
+
303
+