Kanocc 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env ruby
2
+ #require "rubygems"
3
+ $:.unshift("lib")
4
+ require "kanocc.rb"
5
+ require "logger"
6
+ #require "breakpoint"
7
+
8
+ # Example use of Kanocc for a small calculator program.
9
+ # It implements the grammar:
10
+ #
11
+ # Program ::=
12
+ # | Program Expr '\n'
13
+ # Expr ::= Expr '+' Expr
14
+ # | Expr '-' Expr
15
+ # | Expr '*' Expr
16
+ # | Expr '/' Expr
17
+ # | '(' Expr ')'
18
+ # | Number
19
+ #
20
+ # With the lexical grammar:
21
+ #
22
+ # Number ::= \d+, '(', ')', '+', '-', '*', '/' '\n'
23
+
24
+
25
+ # ========== Define a lexical grammar =============
26
# Lexical token for an unsigned decimal integer literal.
# After recognition, +val+ holds the integer value of the matched digits.
class Number < Kanocc::Token
  attr_reader :val
  # Convert with to_i rather than eval: eval would read "010" as octal,
  # fail on "09", and needlessly executes scanned input as Ruby code.
  setPattern(/\d+/) { @val = @m[0].to_i }
end
30
+
31
+ # ========== Define a grammar =====================
32
+ class Expr < Kanocc::Nonterminal
33
+ attr_reader :val
34
+
35
+ rule(Expr, "+", Expr) {@val = @rhs[0].val + @rhs[2].val}
36
+ rule(Expr, "-", Expr) {@val = @rhs[0].val - @rhs[2].val}
37
+ rule(Expr, "*", Expr) {@val = @rhs[0].val * @rhs[2].val}
38
+ rule(Expr, "/", Expr) {@val = @rhs[0].val / @rhs[2].val}
39
+ rule("(", Expr, ")") {@val = @rhs[1].val}
40
+ rule(Number) {@val = @rhs[0].val}
41
+
42
+ setOperatorPrecedence ['*', '/'], 2
43
+ end
44
+
45
+ class Line < Kanocc::Nonterminal
46
+ rule(Expr, "\n") { p @rhs[0].val}
47
+ rule(Kanocc::Error, "\n") do
48
+ puts "Sorry - didn't understand: #{$source[startPos, endPos-startPos].inspect}"
49
+ end
50
+ end
51
+
52
+ class Program < Kanocc::Nonterminal
53
+ rule(Program, Line)
54
+ rule()
55
+ end
56
+
57
+ # Make a parser, give it 'Program' as the grammars startsymbol and run
58
+
59
+ parser = Kanocc::Kanocc.new(Program)
60
+ #parser.logger.level = Logger::DEBUG
61
+ $source = <<-EOF
62
+ 2 * 3
63
+ 3 - 3 +
64
+ 7 - 2 - 1
65
+ 3 * 2 + 4
66
+ 4 + 3 * 3
67
+ EOF
68
+
69
+ parser.parse($source)
@@ -0,0 +1,255 @@
1
+ #
2
+ # Kanocc - Kanocc ain't no compiler-compiler
3
+ #
4
+ require 'kanocc/token'
5
+ require 'kanocc/nonterminal'
6
+ require 'kanocc/scanner'
7
+ require 'kanocc/earley'
8
+ require 'logger'
9
+
10
+ # = Kanocc - Kanocc ain't no compiler-compiler
11
+ #
12
+ # Kanocc is a ruby-framework for parsing and translating.
13
+ # Emphasis is on easy, 'scripty' use, and seamless integration with ruby. Performance has been
14
+ # a secondary concern.
15
+ # In its default configuration, Kanocc uses its own lexical scanner and a parser
16
+ # based on Earley's algorithm to allow handling of any context-free grammar. It is possible,
17
+ # however, to plug in other lexical scanners or parsers. See ##FIXMEREF.
18
+ #
19
+ # A simple example.
20
+ #
21
+ # Reading and evaluating reverse polish notated expressions. Consider this grammar:
22
+ #
23
+ # E ::= E E '+'
24
+ # | E E '-'
25
+ # | E E '*'
26
+ # | E E '/'
27
+ # | NUM
28
+ #
29
+ # NUM a sequence of digits
30
+ #
31
+ # In Kanocc you could do it like this:
32
+ #
33
+ # require "kanocc"
34
+ #
35
+ # # ========== Define a lexical grammar =============
36
+ # class NUM < Kanocc::Token
37
+ # attr_reader :val
38
+ # setPattern(/\d+/) { @val = @m[0].to_i}
39
+ # end
40
+ #
41
+ # # ========== Define a grammar =====================
42
+ # class E < Kanocc::Nonterminal
43
+ # attr_reader :val
44
+ # rule(E, E, "+") { @val = @rhs[0].val + @rhs[1].val}
45
+ # rule(E, E, "-") { @val = @rhs[0].val - @rhs[1].val}
46
+ # rule(E, E, "*") { @val = @rhs[0].val * @rhs[1].val}
47
+ # rule(E, E, "/") { @val = @rhs[0].val / @rhs[1].val}
48
+ # rule(NUM) { @val = @rhs[0].val }
49
+ # end
50
+ #
51
+ # # ========== Set up a parser ======================
52
+ # myParser = Kanocc::Kanocc.new(E)
53
+ #
54
+ # # ========== And try it out =======================
55
+ # puts "3 4 + 2 - = #{myParser.parse("3 4 + 2 -").val}"
56
+ #
57
+ # and you'd get:
58
+ #
59
+ # 3 4 + 2 - = 5
60
+ #
61
+ # For more examples, please refer to the documentation: ##FIXMEREF
62
+ #
63
+ module Kanocc
64
+ class Kanocc
65
+ attr_accessor :scanner, :parser, :logger
66
+
67
+ # Creates a new instance of Kanocc, with the given start symbol.
68
+ # From the startsymbol, Kanocc will deduce the grammar and the
69
+ # grammarsymbols
70
+ #
71
+ def initialize(startSymbol)
72
+ @startSymbol = startSymbol
73
+ @logger = Logger.new(STDOUT)
74
+ @logger.datetime_format = ""
75
+ @logger.level = Logger::WARN
76
+ @scanner = Scanner.new(:logger => @logger)
77
+ @parser = EarleyParser.new(self, :logger => @logger)
78
+ end
79
+
80
# Replaces the logger used by this instance and hands the same logger on to
# the current parser and scanner.
#
# logger:: the Logger to use. When nil, a new Logger writing to STDOUT is
#          created instead.
def logger=(logger)
  @logger = logger || Logger.new(STDOUT)
  # Only collaborators that expose a logger writer receive the new logger
  # (same check as parser= and scanner=).
  @parser.logger = @logger if @parser.respond_to?(:logger=)
  @scanner.logger = @logger if @scanner.respond_to?(:logger=)
end
85
+
86
+ def parser=(parser)
87
+ @parser = parser
88
+ @parser.logger = @logger if parser.respond_to?(:logger=)
89
+ end
90
+
91
+ def scanner=(scanner)
92
+ @scanner = scanner
93
+ @scanner.logger = @logger if scanner.respond_to?(:logger=)
94
+ end
95
+
96
+ # Consume input. Kanocc will parse input according to the rules given, and
97
+ # - if parsing succeeds - return an instance of the grammars start symbol.
98
+ # Input may be a String or an IO object.
99
+ def parse(input)
100
+ raise "Start symbol not defined" unless @startSymbol
101
+ tellParserStartSymbol(@startSymbol)
102
+ @parser.prepare
103
+ @stack = []
104
+ @inputPos = 0
105
+ @scanner.eachToken(input) do |tokens, startPos, endPos|
106
+ @logger.info "got #{show(tokens)} from scanner at #{startPos}, #{endPos}"
107
+ @logger.debug "Consume " + tokens.inspect if @logger
108
+ @inputPos += 1
109
+ @parser.consume(tokens, startPos, endPos)
110
+ end
111
+ @parser.eof
112
+ @stack[0]
113
+ end
114
+
115
# Reads the whole of +file+ and parses it.
#
# file:: either a String, which is taken to be a path (expanded with
#        File.expand_path), or an object responding to +read+.
#
# A file opened here from a path is always closed again, even when reading
# fails. An object passed in by the caller is left open.
# Returns whatever +parse+ returns for the text read.
def parseFile(file)
  input =
    if file.is_a?(String) # Then we assume it's a path
      File.open(File.expand_path(file)) { |opened| opened.read }
    else
      file.read
    end
  parse(input)
end
124
+
125
+ # Define whitespace. By default, Kanocc will recognize anything that matches
126
+ # /\s/ as whitespace.
127
+ # whitespace takes a variable number of arguments, each of which must be a
128
+ # regular expression.
129
+ def setWhitespace(*ws)
130
+ @scanner.setWhitespace(*ws)
131
+ end
132
+
133
+ # Define which tokens Kanocc should recognize. If this method is not called
134
+ # Kanocc will scan for those tokens that are mentioned in the grammar.
135
+ # tokens= takes a variable number of arguments. Each argument must either be
136
+ # a string or a class which is a subclass of Kanocc::Token
137
+ def setTokens(*tokens)
138
+ @scanner.setRecognized(*tokens)
139
+ end
140
+
141
+ # The parser must call this method when it has decided upon a reduction.
142
+ # As arguments it should give the rule, by which to reduce.
143
+ def reportReduction(rule, startPos, endPos)
144
+ @logger.info "Reducing by " + rule.inspect
145
+ nonterminal = rule.lhs.new
146
+ nonterminal.startPos = startPos
147
+ nonterminal.endPos = endPos
148
+ rightHandSide = @stack.slice!(-rule.rhs.length, rule.rhs.length)
149
+ rightHandSide = rightHandSide.map {|e| e.is_a?(List) ? e.elements : e} unless nonterminal.is_a? List
150
+ if rule.method
151
+ oldRhs = nonterminal.instance_variable_get('@rhs')
152
+ nonterminal.instance_variable_set('@rhs', rightHandSide)
153
+ nonterminal.send(rule.method)
154
+ nonterminal.instance_variable_set('@rhs', oldRhs)
155
+ end
156
+ @stack.push(nonterminal)
157
+ showStack
158
+ end
159
+
160
+
161
+ # The parser must call this method when it consumes a token
162
+ # As argument it should give the consumed token and the positions
163
+ # in the input string corresponding to the token. Positions should be given
164
+ # as the position of the first character of the token and the position of the
165
+ # first character after the token.
166
+ def reportToken(token)
167
+ @logger.info("Pushing token: " + token.inspect)
168
+ @stack.push(token)
169
+ if token.respond_to?("__recognize__")
170
+ token.__recognize__
171
+ end
172
+ showStack
173
+ end
174
+
175
+
176
+ def tellParserStartSymbol(startSymbol)
177
+ @parser.startSymbol = startSymbol
178
+ bagOfTerminals = {}
179
+ findTokens(startSymbol, bagOfTerminals)
180
+ @logger.debug "tokens = " + bagOfTerminals.keys.inspect
181
+ strings = bagOfTerminals.keys.find_all{|ter| ter.is_a? String}
182
+ @logger.info("Literals: " + strings.inspect)
183
+ tokens = bagOfTerminals.keys.find_all{|ter| ter.is_a? Class and ter.ancestors.member?(Token)}
184
+ @logger.info("Tokens: " + tokens.inspect)
185
+ @scanner.setRecognized(*(strings + tokens))
186
+
187
+ # Show rules
188
+ @logger.info("Rules:")
189
+ nonterminals = [startSymbol]
190
+ nonterminals.each do |nonterminal|
191
+ nonterminal.rules.each do |rule|
192
+ @logger.info(" " + rule.inspect)
193
+ rule.rhs.each do |gs|
194
+ if gs.is_a? Class and gs.ancestors.member?(Nonterminal) and not nonterminals.member?(gs)
195
+ nonterminals.push(gs)
196
+ end
197
+ end
198
+ end
199
+ end
200
+ end
201
+
202
+ def findTokens(nonterminal, collectedTokens, visitedNonterminals = {})
203
+ unless visitedNonterminals[nonterminal]
204
+ visitedNonterminals[nonterminal] = true
205
+ nonterminal.rules.each do |r|
206
+ r.rhs.each do |gs|
207
+ if gs.is_a?(Class) and gs.ancestors.member?(Nonterminal)
208
+ findTokens(gs, collectedTokens, visitedNonterminals)
209
+ else
210
+ collectedTokens[gs] = true
211
+ end
212
+ end
213
+ end
214
+ end
215
+ end
216
+
217
# Looks up the precedence of the operator of +rule+.
#
# The operator is whatever rule.operator returns; its precedence is asked of
# the rule's left-hand side. Returns 0 when the rule has no operator or when
# no precedence is registered for it.
def operatorPrecedence(rule)
  op = rule.operator
  return 0 unless op

  precedence = rule.lhs.operatorPrecedence(op)
  precedence ? precedence : 0
end
224
+
225
+ # For debugging
226
+ def showStack
227
+ @logger.info("Stack: [" + @stack.map {|gs| show(gs)}.join(", ") + "]" ) if @logger
228
+ end
229
+
230
# Short display form of a grammar symbol, used in log output.
#
# Nonterminal and Token instances are shown by their class name, strings by
# their inspect form. Anything else yields nil.
def show(gs)
  case gs
  when Nonterminal, Token then gs.class.to_s
  when String then gs.inspect
  end
end
237
+
238
+
239
+ end
240
+
241
# Describes a parse failure: where it happened, what was read and what would
# have been acceptable.
#
# Derives from StandardError (not Exception) so that a plain +rescue+
# catches it, and passes a readable message on to the superclass.
class ParseException < StandardError
  attr_accessor :inputPos, :inputSymbol, :expected

  # inputPos::    position in the input at which parsing failed
  # inputSymbol:: the symbol found at that position
  # expected::    the symbols that would have been accepted there
  def initialize(inputPos, inputSymbol, expected)
    @inputPos, @inputSymbol, @expected = inputPos, inputSymbol, expected
    super("Parse error at position #{inputPos.inspect}: " +
          "got #{inputSymbol.inspect}, expected #{expected.inspect}")
  end
end
247
+
248
# General Kanocc failure, e.g. raised when the input cannot be parsed.
# Derives from StandardError (not Exception) so that a plain +rescue+
# catches it.
class KanoccException < StandardError
end
250
+
251
+
252
+
253
+ end
254
+
255
+
@@ -0,0 +1,309 @@
1
+ require 'kanocc/grammar_rule'
2
+ require 'kanocc/token'
3
+ require 'logger'
4
+ module Kanocc
5
+ #
6
+ # Parser for Kanocc based on Earleys algorithm. For a description see:
7
+ # Alfred V. Aho, Jeffrey D. Ullman, The Theory of Parsing, Translation and Compiling,
8
+ # or try a web search engine of your choice with 'Earley parsing'
9
+ #
10
+ # Earley's parser will parse according to any context-free grammar using O(n*n*n) time
11
+ # and O(n*n) space, n being the length of input. If the grammar is unambiguous time/space
12
+ # complexity is O(n*n)/O(n*n).
13
+ # As of yet (version 0.1) the implementation is surely not optimal,
14
+ # so time/space complexity is probably worse.
15
+ #
16
+ # Christian Surlykke 2007.
17
+ #
18
+ class EarleyParser
19
+ attr_accessor :kanocc, :logger
20
+
21
+ ErrorRule = GrammarRule.new(Error, [], nil)
22
+
23
# Creates a parser reporting to the given Kanocc instance.
#
# kanocc::  the object that is told about reductions and consumed tokens
# options:: hash; :logger gives the Logger to use. Without it a Logger
#           writing to STDOUT at WARN level is created (Logger.new needs a
#           log device argument, so it cannot be called bare).
def initialize(kanocc, options = {})
  @kanocc = kanocc
  @logger = options[:logger]
  unless @logger
    @logger = Logger.new(STDOUT)
    @logger.level = Logger::WARN
  end
end
27
+ #
28
+ # Sets up the parser, creating itemlist 0.
29
+ #
30
+ def startSymbol=(startSymbol)
31
+ @startSymbol = startSymbol
32
+ @itemLists = [ItemList.new(nil, 0, 0)]
33
+ @inputPos = 0
34
+ @recoveryPoints = []
35
+ @itemLists[0].addAll(@startSymbol.rules.map{|rule| Item.new(rule, 0)})
36
+ predictAndComplete(0)
37
+ end
38
+
39
+ def prepare
40
+ @itemLists = @itemLists[0..0]
41
+ @inputPos = 0
42
+ if @recoveryPoints.size > 0 and @recoveryPoints[0] == 0
43
+ @recoveryPoints = [0]
44
+ else
45
+ @recoveryPoints = []
46
+ end
47
+ @logger.info("Itemlist 0:\n" + @itemLists[0].inspect) unless not @logger
48
+ end
49
+
50
+ def scan(terminals)
51
+ terminals.each do |terminal|
52
+ @itemLists[@inputPos].addAll(@itemLists[@inputPos - 1].findMatching(terminal).map{|item| item.move})
53
+ end
54
+ end
55
+
56
# Runs the predictor and completer steps of Earley's algorithm on item list
# +pos+ until no new items appear.
#
# pos:: index into @itemLists of the list to close.
#
# Complete: for an item with the dot at the end, every item in the item's
# origin list that waits for its left-hand side is advanced and added.
# Predict: for an item with a nonterminal after the dot, one item per rule of
# that nonterminal is added with origin +pos+. The origin must be the list
# being closed, not @inputPos: during error recovery this method is called
# for list @inputPos - 1.
def predictAndComplete(pos)
  itemList = @itemLists[pos]
  prevSize = 0
  while prevSize < itemList.size do
    prevSize = itemList.size
    itemList.each do |item|
      if item.rule.rhs.length <= item.dot
        # complete
        waiting = @itemLists[item.j].findMatching(item.rule.lhs)
        itemList.addAll(waiting.map { |waitingItem| waitingItem.move })
      elsif (nont = item.rule.rhs[item.dot]).respond_to?(:rules)
        # predict
        itemList.addAll(nont.rules.map { |rule| Item.new(rule, pos) })
      end
    end
  end
end
72
+
73
+ def addRecoveryPoints(pos)
74
+ if @recoveryPoints[-1] != pos
75
+ @itemLists[pos].each do |item|
76
+ if Error == item.rule.rhs[item.dot]
77
+ @recoveryPoints.push(pos)
78
+ break
79
+ end
80
+ end
81
+ end
82
+ end
83
+
84
+ #
85
+ # Consume and parse next input symbol
86
+ #
87
+ def consume(inputSymbols, startPos, endPos)
88
+ @inputPos += 1
89
+ @itemLists.push(ItemList.new(inputSymbols, @inputPos, endPos))
90
+
91
+ # scan, predict and complete until no more can be added
92
+ scan(inputSymbols)
93
+
94
+ if @itemLists[@inputPos].size == 0
95
+ @logger.debug("Found no items matching #{inputSymbols} in itemlist #{@inputPos - 1}")
96
+ @logger.debug("@recoveryPoints = " + @recoveryPoints.inspect)
97
+ for i in 1..@recoveryPoints.length do
98
+ if @recoveryPoints[-i] < @inputPos
99
+ @itemLists[@inputPos - 1].add(Item.new(ErrorRule, @recoveryPoints[-i]))
100
+ predictAndComplete(@inputPos - 1)
101
+ scan(inputSymbols)
102
+ break if @itemLists[@inputPos].size > 0
103
+ end
104
+ end
105
+ end
106
+ predictAndComplete(@inputPos)
107
+ addRecoveryPoints(@inputPos)
108
+ @logger.info("Itemlist #{@inputPos}:\n" + @itemLists[@inputPos].inspect) if @logger
109
+ end
110
+
111
+
112
+ #
113
+ # Signal to the parser that end of input is reached
114
+ #
115
+ def eof
116
+ @logger.debug "--- Parsing done, translating ---"
117
+ topItem = findFullItems(@startSymbol, @inputPos).find_all {|item| item.j == 0}.min
118
+ if topItem
119
+ translate(topItem, @inputPos)
120
+ else
121
+ raise(KanoccException, "It didn't parse")
122
+ end
123
+ end
124
+
125
+ def translate(element, pos)
126
+ @logger.debug("translate: " + element.inspect + ", pos = " + pos.inspect)
127
+ if element.class == Item
128
+ translateHelper(element, pos)
129
+ @kanocc.reportReduction(element.rule,
130
+ @itemLists[element.j].textPos,
131
+ @itemLists[pos].textPos)
132
+ elsif element.class == Class # Its a token class
133
+ @kanocc.reportToken(@itemLists[pos].inputSymbol.find {|sym| sym.is_a? element})
134
+ else # Its a string instance
135
+ @logger.debug @itemLists[pos].inspect
136
+ @kanocc.reportToken(element)
137
+ end
138
+ end
139
+
140
+ def translateHelper(item, pos)
141
+ @logger.debug("translateHelper: " + item.inspect)
142
+ return if item.dot == 0
143
+ if item.rule.rhs[item.dot - 1].respond_to?("rules")
144
+ # Assume item is of form [A --> aB�c, k] in itemlist i
145
+ # Must then find item of form [B --> x�, j] in itemlist i so
146
+ # that there exists item of form [A --> a�Bc, k] on itemlist j
147
+ #
148
+ # First: Items of form [B --> x�, j] on list i
149
+ candidates = findFullItems(item.rule.rhs[item.dot - 1], pos)
150
+ # Then: Those for which item of form [A --> a�Bc, k] exists
151
+ # on list j
152
+ candidates = candidates.find_all {|subItem|
153
+ @itemLists[subItem.j].findItem(item.rule, item.dot - 1, item.j)
154
+ }
155
+ #####
156
+ # Precedence handling is somewhat problematic in Earley parsing.
157
+ # We now have to choose amongst possibly several candidates
158
+ #
159
+ # Last: Pick the one with the rule with the _lowest_ precedence
160
+ # (We are finding reductions top-down, but will evaluate bottom-up, hence
161
+ # this will make the rule with the _highest_ precedence evaluate first.
162
+
163
+
164
+ subItem = candidates.min
165
+ prevItem = @itemLists[subItem.j].findItem(item.rule, item.dot - 1, item.j)
166
+ prevList = subItem.j
167
+ else
168
+ prevItem = @itemLists[pos - 1].findItem(item.rule, item.dot - 1, item.j)
169
+ prevList = pos - 1
170
+ subItem = item.rule.rhs[item.dot - 1]
171
+ end
172
+ translateHelper(prevItem, prevList)
173
+ translate(subItem, pos)
174
+ end
175
+
176
+ def findFullItems(nonterminal, inputPos)
177
+ @itemLists[inputPos].find_all do |item|
178
+ item.rule.lhs == nonterminal and item.dot >= item.rule.rhs.length
179
+ end
180
+ end
181
+
182
+ def operatorPrecedence(rule)
183
+ - (@kanocc.operatorPrecedence(rule))
184
+ end
185
+ end
186
+
187
+ class ItemList
188
+ attr_reader :inputSymbol, :textPos
189
+ attr_accessor :items
190
+
191
+ def initialize(inputSymbol, inputPos, textPos)
192
+ @inputPos = inputPos
193
+ @inputSymbol = inputSymbol
194
+ @textPos = textPos
195
+ @items = Hash.new
196
+ end
197
+
198
+ def copy
199
+ res = clone
200
+ res.items = @items.clone
201
+ return res
202
+ end
203
+
204
+ def size
205
+ return @items.size
206
+ end
207
+
208
+ def find_all(&b)
209
+ return @items.keys.find_all(&b)
210
+ end
211
+
212
+ def findItem(rule, dot, j)
213
+ return @items.keys.find{ |item|
214
+ item.rule == rule and
215
+ item.dot == dot and
216
+ item.j == j
217
+ }
218
+ end
219
+
220
+ def eachMatching(inputSymbol)
221
+ findMatching(inputSymbol).each do |item|
222
+ yield(item)
223
+ end
224
+ end
225
+
226
+ def findMatching(inputSymbol)
227
+ @items.keys.find_all do |item|
228
+ inputSymbol === item.symbolAfterDot or inputSymbol == item.symbolAfterDot
229
+ end
230
+ end
231
+
232
+ def contains(item)
233
+ return @items[item]
234
+ end
235
+
236
+ def add(item)
237
+ @items.store(item, true)
238
+ end
239
+
240
+ def addAll(items)
241
+ items.each {|item| @items.store(item, true)}
242
+ end
243
+
244
+ def each
245
+ @items.keys.each do |item|
246
+ yield item
247
+ end
248
+ end
249
+
250
+ def inspect
251
+ return "[" + @inputSymbol.inspect + "\n " +
252
+ @textPos.to_s + "\n " +
253
+ @items.keys.map{|item| item.inspect}.join("\n ") + "]\n"
254
+ end
255
+ end
256
+
257
+
258
+ class Item
259
+ attr_reader :rule, :j, :dot
260
+ @@items = Hash.new
261
+
262
+ def Item.new(rule, j, dot = 0)
263
+ unless (item = @@items[[rule,j,dot]])
264
+ item = super(rule, j, dot)
265
+ @@items.store([rule, j, dot], item)
266
+ end
267
+ return item
268
+ end
269
+
270
+ def symbolAfterDot
271
+ return @dot < @rule.rhs.size ? @rule.rhs[@dot] : nil
272
+ end
273
+
274
+ def initialize(rule, j, dot = 0)
275
+ @rule = rule
276
+ @j = j
277
+ @dot = dot
278
+ end
279
+
280
+ def move
281
+ return Item.new(@rule, @j, @dot + 1)
282
+ end
283
+
284
+ def inspect
285
+ return "[" +
286
+ @rule.lhs.inspect + " --> " +
287
+ (@rule.rhs.slice(0, dot) +
288
+ [Dot.new] +
289
+ @rule.rhs.slice(dot, @rule.rhs.length - dot)).map{|symbol| symbol.inspect}.join(" ") +
290
+ " ; " + @j.to_s + "]"
291
+ end
292
+
293
+ def <=>(other)
294
+ tmp = (@rule.prec <=> other.rule.prec)
295
+ if tmp == 0
296
+ return other.j <=> @j
297
+ else
298
+ return tmp
299
+ end
300
+ end
301
+ end
302
+
303
+ # Just for Item inspect
304
+ class Dot
305
+ def inspect
306
+ return "�"
307
+ end
308
+ end
309
+ end
@@ -0,0 +1,36 @@
1
+ module Kanocc
2
# One production of a grammar: lhs ::= rhs, together with the name of the
# method (if any) to run on the left-hand side when the rule is reduced.
class GrammarRule
  attr_reader :lhs, :rhs, :method, :argPositions
  # Explicitly assigned precedence; the reader is the +prec+ method below.
  attr_writer :prec

  # lhs::    the nonterminal class the rule belongs to
  # rhs::    array of grammar symbols on the right-hand side
  # method:: name of the semantic-action method, or nil
  def initialize(lhs, rhs, method)
    @lhs = lhs
    @rhs = rhs
    @method = method
    @prec = nil
  end

  # The first element of the right-hand side that is a String or a Token
  # instance, or nil when there is none.
  def operator
    rhs.find { |s| s.is_a?(String) || s.is_a?(Token) }
  end

  # The precedence of a rule is defined as:
  # The given precedence
  # or (if that's not defined) the precedence of the leftmost operator (token)
  # or (if that's not defined) 0.
  def prec
    @prec || ((o = operator) && @lhs.operatorPrecedence(o)) || 0
  end

  def inspect
    lhs.inspect + " ::= " + rhs.map { |gs| gs.inspect }.join(" ")
  end
end
36
+ end
@@ -0,0 +1,158 @@
1
+ require 'kanocc/grammar_rule'
2
+ module Kanocc
3
+ class Nonterminal
4
+ attr_accessor :startPos, :endPos
5
+ @@rules = Hash.new
6
+ @@lastRule = Hash.new
7
+ @@derivesRight = Hash.new
8
+ @@operatorPrecedence = Hash.new
9
+ @@methodNames = Hash.new
10
+
11
+ Left = 1
12
+ Right = 2
13
+
14
+ def Nonterminal.derivesRight
15
+ @@derivesRight[self] = true
16
+ end
17
+
18
+ def Nonterminal.derivesRight?
19
+ return @@derivesRight[self]
20
+ end
21
+
22
# Registers a precedence for one operator, or for each operator in an array,
# on the nonterminal class this is called on.
#
# operator::   a String, a Token, or an Array of those
# precedence:: an Integer
#
# Raises a RuntimeError when precedence is not an Integer or when operator is
# of none of the accepted forms.
def Nonterminal.setOperatorPrecedence(operator, precedence)
  # is_a?(Integer) accepts every integer and does not depend on the Fixnum
  # constant, which newer Rubies no longer define.
  raise "Precedence must be an integer" unless precedence.is_a?(Integer)
  @@operatorPrecedence[self] ||= Hash.new
  if is_an_operator?(operator)
    @@operatorPrecedence[self][operator] = precedence
  elsif is_an_array_of_operators(operator)
    operator.each { |o| @@operatorPrecedence[self][o] = precedence }
  else
    raise "Operator must be a string, a token or an array of those"
  end
end
33
+
34
+ def Nonterminal.operatorPrecedence(operator)
35
+ (@@operatorPrecedence[self] and @@operatorPrecedence[self][operator]) or 0
36
+ end
37
+
38
+ def Nonterminal.is_an_array_of_operators(arr)
39
+ arr.is_a?(Array) and
40
+ arr.collect{|o| is_an_operator?(o)}.inject {|b1, b2| b1 and b2 }
41
+ end
42
+
43
+ def Nonterminal.is_an_operator?(operator)
44
+ operator.is_a?(String) or operator.is_a?(Token)
45
+ end
46
+
47
+ def Nonterminal.rules
48
+ rules = @@rules[self]
49
+ return rules ? rules : []
50
+ end
51
+
52
+ def Nonterminal.addRule(rule)
53
+ @@rules[self] ||= []
54
+ @@rules[self].push(rule)
55
+ @@lastRule[self] = rule
56
+ end
57
+
58
+ def Nonterminal.is_a_grammarsymbol?(x)
59
+ x.is_a?(String) or (x.respond_to?("is_a_kanocc_grammarsymbol?") and x.is_a_kanocc_grammarsymbol?)
60
+ end
61
+
62
+ def Nonterminal.is_a_kanocc_grammarsymbol?
63
+ return true
64
+ end
65
+
66
+ def Nonterminal.rule(*rhs, &block)
67
+ for pos in 0..rhs.length - 1 do
68
+ unless is_a_grammarsymbol?(rhs[pos])
69
+ raise "Problem with rule: #{rhs.inspect}, element:#{pos.to_s} - #{rhs[pos].inspect}\nElements of a rule must be Strings, Tokens or Nonterminals"
70
+ end
71
+ end
72
+
73
+ if block_given?
74
+ methodName = generateMethodName(*rhs)
75
+ define_method(methodName.to_sym, &block)
76
+ addRule(GrammarRule.new(self, rhs, methodName.to_sym))
77
+ else
78
+ addRule(GrammarRule.new(self, rhs, nil))
79
+ end
80
+ end
81
+
82
+ def Nonterminal.zm(symbols, sep = nil)
83
+ listClass = newListClass
84
+ listClass.rule() {@elements = []}
85
+ listClass.rule(om(symbols, sep)) {@elements = @rhs[0].elements}
86
+ return listClass
87
+ end
88
+
89
+ def Nonterminal.om(symbols, sep = nil)
90
+ symbols = [symbols] unless symbols.is_a? Array
91
+ listClass = newListClass
92
+ listClass.rule(*symbols) {@elements = @rhs}
93
+ if sep
94
+ listClass.rule(listClass, sep, *symbols) {@elements = @rhs[0].elements + @rhs[2..@rhs.length]}
95
+ else
96
+ listClass.rule(listClass, *symbols) {@elements = @rhs[0].elements + @rhs[1..@rhs.length]}
97
+ end
98
+ return listClass
99
+ end
100
+
101
+ @@listClassNumber = 0
102
+
103
# Creates a fresh anonymous subclass of List and gives it a numbered
# inspect form ("anonList_<n>").
#
# The number is stored on the new class itself. Reading the shared counter
# from inside inspect would make every list class report the most recently
# assigned number.
def Nonterminal.newListClass
  listClass = Class.new(List)
  @@listClassNumber += 1
  listClass.instance_variable_set(:@listClassNumber, @@listClassNumber)
  def listClass.inspect
    return "anonList_#{@listClassNumber}"
  end
  return listClass
end
111
+
112
+ def Nonterminal.generateMethodName(*args)
113
+ methodName = self.name + " --> " + args.map {|a| a.inspect}.join(' ')
114
+ @@methodNames[self] ||= []
115
+ i = 1
116
+ while @@methodNames[self].member?(methodName) do
117
+ methodName += ' ';
118
+ end
119
+ @@methodNames[self].push(methodName)
120
+ return methodName
121
+ end
122
+
123
+ def Nonterminal.prec(p)
124
+ raise "Call to prec not preceded by rule" unless @@lastRule[self]
125
+ @@lastRule[self].prec = p
126
+ end
127
+
128
+ def Nonterminal.showMethodNames
129
+ @@methodNames[self].each{|mn| puts mn.inspect} if @@methodNames[self]
130
+ end
131
+ end
132
+
133
+
134
+ class List < Nonterminal
135
+ attr_reader :elements
136
+
137
+ protected
138
+ # Assumes @rhs[0] is a Kanocc::List and that rhs.length > 1
139
+ def collect(stripSeparator = false)
140
+ puts "collect with stripSeparator = #{stripSeparator}"
141
+ @elements = @rhs[0].elements
142
+ if stripSeparator
143
+ @elements = @elements + @rhs[2..@rhs.length]
144
+ else
145
+ @elements = @elements + @rhs[1..@rhs.length]
146
+ end
147
+ puts "@elements: " + @elements.inspect
148
+ end
149
+ end
150
+
151
+ class Error < Nonterminal
152
+ attr_reader :text
153
+ def initialize
154
+ super
155
+ @text = "FIXME"
156
+ end
157
+ end
158
+ end
@@ -0,0 +1,152 @@
1
+ require 'stringio'
2
+ require 'strscan'
3
+ require 'logger'
4
+
5
+ module Kanocc
6
+ class Scanner
7
+ attr_accessor :logger
8
+ def initialize(init = {})
9
+ if init[:logger]
10
+ @logger = init[:logger]
11
+ else
12
+ @logger = Logger.new(STDOUT)
13
+ @logger.level = Logger::WARN
14
+ end
15
+ @wsRegs = [/\s/]
16
+ @recognizables = []
17
+ @regexps = []
18
+ end
19
+
20
# Replaces the patterns that the scanner treats as whitespace.
#
# wsRegs:: any number of Regexp objects
#
# Raises a RuntimeError when an argument is not a Regexp. All arguments are
# checked before anything is stored, so a failed call leaves the previous
# whitespace definition in place.
def setWhitespace(*wsRegs)
  unless wsRegs.all? { |wsReg| wsReg.is_a?(Regexp) }
    raise "setWhitespace must be given a list of Regexp's"
  end
  @wsRegs = wsRegs
end
29
+
30
+ def setRecognized(*rec)
31
+ @recognizables = []
32
+ @regexps = []
33
+ rec.each do |r|
34
+ @recognizables << r
35
+ if r.class == Class
36
+ @regexps << r.pattern
37
+ else
38
+ @regexps << Regexp.compile(Regexp.escape(r))
39
+ end
40
+ end
41
+ end
42
+
43
+ def eachToken(input)
44
+ if input.is_a?(IO)
45
+ @input = input.readlines.join("")
46
+ elsif input.is_a?(String)
47
+ @input = input
48
+ else
49
+ raise "Input must be a string or an IO object"
50
+ end
51
+ @stringScanner = StringScanner.new(@input)
52
+ pos = @stringScanner.pos
53
+ while tokens = nextToken do
54
+ @logger.debug("Yielding with #{tokens}, #{pos}, #{@stringScanner.pos}")
55
+ yield(tokens, pos, @stringScanner.pos)
56
+ pos = @stringScanner.pos
57
+ end
58
+ end
59
+
60
+ private
61
+
62
+ def nextToken
63
+
64
+ while true do
65
+ if @stringScanner.pos >= @input.length
66
+ return nil
67
+ end
68
+ tokens = matchToken
69
+
70
+ if tokens.size > 0
71
+ @logger.debug("nextToken returning #{tokens}")
72
+ return tokens
73
+ elsif trimWhitespace
74
+ # Now we've stripped some whitespace, so we go
75
+ # back and try to match a token again
76
+ next
77
+ else
78
+ # We've not been able to recognize a token or whitespace,
79
+ # so we emit the first character of the remaining input as a string literal.
80
+ # With this behavior, lexical scanning cannot fail.
81
+ res = [@stringScanner.scan(/./m)]
82
+ @logger.debug("nextToken returning #{res.inspect}")
83
+ return res
84
+ end
85
+ end
86
+ end
87
+
88
+ def matchToken
89
+ regPoss = findMatchingReg(@regexps)
90
+ @logger.debug("matchToken, regPoss = #{regPoss.inspect}");
91
+ tokens = []
92
+ str = nil
93
+ regPoss.each do |i|
94
+ logger.debug("@recognizables[#{i}] = #{@recognizables[i].inspect}")
95
+ str = @stringScanner.scan(@regexps[i]) unless str
96
+ if @recognizables[i].class == Class
97
+ @logger.debug("Its a class")
98
+ token = @recognizables[i].new(str)
99
+ token.m = token.match(str) # To create a proper match object
100
+ @logger.debug("token: " + token.inspect)
101
+ tokens << token
102
+ @logger.debug("tokens: " + tokens.inspect)
103
+ else
104
+ tokens << str
105
+ end
106
+ end
107
+ @logger.debug("matchToken returning: " + tokens.inspect)
108
+ return tokens
109
+ end
110
+
111
+ def trimWhitespace
112
+ wsPoss = findMatchingReg(@wsRegs)
113
+ if wsPoss.size > 0
114
+ @stringScanner.skip(@wsRegs[wsPoss[0]])
115
+ return true
116
+ else
117
+ return false
118
+ end
119
+ end
120
+
121
# Finds which of the given regexps match longest at the current position of
# @stringScanner, without advancing it.
#
# arrayOfRegs:: array of Regexp
#
# Returns the indexes (in ascending order) of all regexps that share the
# longest non-empty match; an empty array when nothing matches.
def findMatchingReg(arrayOfRegs)
  @logger.debug("findMatchingReg: arrayOfRegs = #{arrayOfRegs}")
  longest = 0
  winners = []
  arrayOfRegs.each_with_index do |regexp, index|
    matched = @stringScanner.match?(regexp) || 0
    # Empty matches and matches shorter than the best so far never count
    next if matched == 0 || matched < longest

    if matched > longest
      longest = matched
      winners = [index]
    else
      winners << index
    end
  end
  return winners
end
136
+ end
137
+ end
138
+
139
+
140
+ ############################################
141
+ # Testing
142
+ #require 'Token'
143
+ #
144
+ #class Number < Token
145
+ # setPattern(/\d+/)
146
+ #end
147
+ #
148
+ #scanner = KanoccScanner.new
149
+ #scanner.setRecognized(Number, "Exit")
150
+ #scanner.setWhitespace(/[ \t]/)
151
+ #
152
+ #scanner.eachTokenDo{|token| print token.inspect, "\n"}
@@ -0,0 +1,40 @@
1
+ module Kanocc
2
+ class Token < Regexp
3
+ attr_reader :str
4
+ attr_accessor :m
5
+
6
+ @@patterns = Hash.new
7
+
8
+ def initialize(str)
9
+ @str = str
10
+ super(@@patterns[self.class])
11
+ end
12
+
13
+ def ===(klass)
14
+ self.class == klass
15
+ end
16
+
17
+ def Token.setPattern(reg, &block)
18
+ @@patterns[self] = reg
19
+ if block_given?
20
+ define_method(:__recognize__, &block)
21
+ end
22
+ end
23
+
24
+ def Token.pattern
25
+ return @@patterns[self]
26
+ end
27
+
28
+ def is_a_kanocc_token?
29
+ return true
30
+ end
31
+
32
+ def Token.is_a_kanocc_grammarsymbol?
33
+ return true
34
+ end
35
+
36
+ def inspect
37
+ self.class.name + "[" + @str + "]"
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,3 @@
1
+ Better handling of blocks
2
+ LR Parsers
3
+ Scanner.eachToken method
metadata ADDED
@@ -0,0 +1,61 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: Kanocc
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Christian Surlykke
8
+ autorequire: kanocc
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-04-12 00:00:00 +02:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description:
17
+ email: ""
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - lib/todo
26
+ - lib/kanocc.rb
27
+ - lib/kanocc
28
+ - lib/kanocc/token.rb
29
+ - lib/kanocc/grammar_rule.rb
30
+ - lib/kanocc/nonterminal.rb
31
+ - lib/kanocc/scanner.rb
32
+ - lib/kanocc/earley.rb
33
+ - examples/calculator.rb
34
+ has_rdoc: false
35
+ homepage: ""
36
+ post_install_message:
37
+ rdoc_options: []
38
+
39
+ require_paths:
40
+ - lib
41
+ required_ruby_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: "0"
46
+ version:
47
+ required_rubygems_version: !ruby/object:Gem::Requirement
48
+ requirements:
49
+ - - ">="
50
+ - !ruby/object:Gem::Version
51
+ version: "0"
52
+ version:
53
+ requirements: []
54
+
55
+ rubyforge_project:
56
+ rubygems_version: 1.0.1
57
+ signing_key:
58
+ specification_version: 2
59
+ summary: Kanocc - Kanocc ain't no compiler-compiler. A framework for syntax directed translation
60
+ test_files: []
61
+