Kanocc 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env ruby
2
+ #require "rubygems"
3
+ $:.unshift("lib")
4
+ require "kanocc.rb"
5
+ require "logger"
6
+ #require "breakpoint"
7
+
8
+ # Example use of Kanocc for a small calculator program.
9
+ # It implements the grammar:
10
+ #
11
+ # Program ::=
12
+ # | Program Expr '\n'
13
+ # Expr ::= Expr '+' Expr
14
+ # | Expr '-' Expr
15
+ # | Expr '*' Expr
16
+ # | Expr '/' Expr
17
+ # | '(' Expr ')'
18
+ # | Number
19
+ #
20
+ # With the lexical grammar:
21
+ #
22
+ # Number ::= \d+, '(', ')', '+', '-', '*', '/', '\n'
23
+
24
+
25
+ # ========== Define a lexical grammar =============
26
# Lexical token for an unsigned run of decimal digits.
# After recognition, +val+ holds the integer value of the matched text.
class Number < Kanocc::Token
  attr_reader :val

  # String#to_i rather than eval: eval reads a leading zero as an octal
  # prefix (so "08" is a syntax error) and executes the text as Ruby source.
  setPattern(/\d+/) { @val = @m[0].to_i }
end
30
+
31
+ # ========== Define a grammar =====================
32
# Arithmetic expressions. Every reduction leaves its result in +val+.
class Expr < Kanocc::Nonterminal
  attr_reader :val

  # The four binary rules share one shape: apply the operator to the values
  # of the left operand (@rhs[0]) and the right operand (@rhs[2]).
  ["+", "-", "*", "/"].each do |op|
    rule(Expr, op, Expr) { @val = @rhs[0].val.send(op, @rhs[2].val) }
  end
  rule("(", Expr, ")") { @val = @rhs[1].val }
  rule(Number) { @val = @rhs[0].val }

  setOperatorPrecedence ['*', '/'], 2
end
44
+
45
# One line of input: either an expression, whose value is printed, or an
# error span, which is echoed back from the global $source.
class Line < Kanocc::Nonterminal
  rule(Expr, "\n") do
    p @rhs[0].val
  end

  rule(Kanocc::Error, "\n") {
    length = endPos - startPos
    puts "Sorry - didn't understand: #{$source[startPos, length].inspect}"
  }
end
51
+
52
+ class Program < Kanocc::Nonterminal
53
+ rule(Program, Line)
54
+ rule()
55
+ end
56
+
57
+ # Make a parser, give it 'Program' as the grammars startsymbol and run
58
+
59
+ parser = Kanocc::Kanocc.new(Program)
60
+ #parser.logger.level = Logger::DEBUG
61
+ $source = <<-EOF
62
+ 2 * 3
63
+ 3 - 3 +
64
+ 7 - 2 - 1
65
+ 3 * 2 + 4
66
+ 4 + 3 * 3
67
+ EOF
68
+
69
+ parser.parse($source)
@@ -0,0 +1,255 @@
1
+ #
2
+ # Kanocc - Kanocc ain't no compiler-compiler
3
+ #
4
+ require 'kanocc/token'
5
+ require 'kanocc/nonterminal'
6
+ require 'kanocc/scanner'
7
+ require 'kanocc/earley'
8
+ require 'logger'
9
+
10
+ # = Kanocc - Kanocc ain't no compiler-compiler
11
+ #
12
+ # Kanocc is a ruby-framework for parsing and translating.
13
+ # Emphasis is on easy, 'scripty' use, and seamless integration with ruby. Performance has been
14
+ # a secondary concern.
15
+ # In its default configuration, Kanocc uses its own lexical scanner and a parser
16
+ # based on Earley's algorithm to allow handling of any context-free grammar. It is possible,
17
+ # however, to plug in other lexical scanners or parsers. See ##FIXMEREF.
18
+ #
19
+ # A simple example.
20
+ #
21
+ # Reading and evaluating reverse polish notated expressions. Consider this grammar:
22
+ #
23
+ # E ::= E E '+'
24
+ # | E E '-'
25
+ # | E E '*'
26
+ # | E E '/'
27
+ # | NUM
28
+ #
29
+ # NUM a sequence of digits
30
+ #
31
+ # In Kanocc you could do it like this:
32
+ #
33
+ # require "kanocc"
34
+ #
35
+ # # ========== Define a lexical grammar =============
36
+ # class NUM < Kanocc::Token
37
+ # attr_reader :val
38
+ # setPattern(/\d+/) { @val = @m[0].to_i}
39
+ # end
40
+ #
41
+ # # ========== Define a grammar =====================
42
+ # class E < Kanocc::Nonterminal
43
+ # attr_reader :val
44
+ # rule(E, E, "+") { @val = @rhs[0].val + @rhs[1].val}
45
+ # rule(E, E, "-") { @val = @rhs[0].val - @rhs[1].val}
46
+ # rule(E, E, "*") { @val = @rhs[0].val * @rhs[1].val}
47
+ # rule(E, E, "/") { @val = @rhs[0].val / @rhs[1].val}
48
+ # rule(NUM) { @val = @rhs[0].val }
49
+ # end
50
+ #
51
+ # # ========== Set up a parser ======================
52
+ # myParser = Kanocc::Kanocc.new(E)
53
+ #
54
+ # # ========== And try it out =======================
55
+ # puts "3 4 + 2 - = #{myParser.parse("3 4 + 2 -").val}"
56
+ #
57
+ # and you'd get:
58
+ #
59
+ # 3 4 + 2 - = 5
60
+ #
61
+ # For more examples, please refer to the documentation: ##FIXMEREF
62
+ #
63
+ module Kanocc
64
+ class Kanocc
65
+ attr_accessor :scanner, :parser, :logger
66
+
67
+ # Creates a new instance of Kanocc, with the given start symbol.
68
+ # From the startsymbol, Kanocc will deduce the grammar and the
69
+ # grammarsymbols
70
+ #
71
+ def initialize(startSymbol)
72
+ @startSymbol = startSymbol
73
+ @logger = Logger.new(STDOUT)
74
+ @logger.datetime_format = ""
75
+ @logger.level = Logger::WARN
76
+ @scanner = Scanner.new(:logger => @logger)
77
+ @parser = EarleyParser.new(self, :logger => @logger)
78
+ end
79
+
80
# Replaces the logger of this instance and passes the same logger on to the
# current parser and scanner when they have a logger= writer.
# A nil argument installs a new Logger writing to STDOUT.
def logger=(logger)
  # The fallback must instantiate the Logger class; the argument is nil here.
  @logger = logger || Logger.new(STDOUT)
  # Probe for the writer, since the writer is what gets called.
  @parser.logger = @logger if @parser.respond_to?(:logger=)
  @scanner.logger = @logger if @scanner.respond_to?(:logger=)
end
85
+
86
# Installs a replacement parser and shares the current logger with it when
# the parser has a logger= writer.
def parser=(parser)
  @parser = parser
  return unless parser.respond_to?(:logger=)

  @parser.logger = @logger
end
90
+
91
# Installs a replacement scanner and shares the current logger with it when
# the scanner has a logger= writer.
def scanner=(scanner)
  @scanner = scanner
  return unless scanner.respond_to?(:logger=)

  @scanner.logger = @logger
end
95
+
96
# Consumes +input+: hands the start symbol to the parser, feeds it every
# token group the scanner yields, signals end of input and returns the
# bottom element of the translation stack.
# Raises a RuntimeError when no start symbol has been set.
def parse(input)
  raise "Start symbol not defined" unless @startSymbol

  tellParserStartSymbol(@startSymbol)
  @parser.prepare
  @stack = []
  @inputPos = 0

  @scanner.eachToken(input) do |found, from, to|
    @logger.info "got #{show(found)} from scanner at #{from}, #{to}"
    @logger.debug("Consume #{found.inspect}") if @logger
    @inputPos += 1
    @parser.consume(found, from, to)
  end

  @parser.eof
  @stack.first
end
114
+
115
# Parses the contents of a file and returns what parse returns.
# +file+ is either a path (String), which is opened and closed here, or an
# object responding to +read+, which is read and left open for the caller.
def parseFile(file)
  input =
    if file.is_a?(String) # Then we assume it's a path
      # File.read closes the handle even when reading raises.
      File.read(File.expand_path(file))
    else
      file.read
    end
  parse(input)
end
124
+
125
+ # Define whitespace. By default, Kanocc will recognize anything that matches
126
+ # /\s/ as whitespace.
127
+ # whitespace takes a variable number of arguments, each of which must be a
128
+ # regular expression.
129
+ def setWhitespace(*ws)
130
+ @scanner.setWhitespace(*ws)
131
+ end
132
+
133
+ # Define which tokens Kanocc should recognize. If this method is not called
134
+ # Kanocc will scan for those tokens that are mentioned in the grammar.
135
+ # tokens= takes a variable number of arguments. Each argument must either be
136
+ # a string or a class which is a subclass of Kanocc::Token
137
+ def setTokens(*tokens)
138
+ @scanner.setRecognized(*tokens)
139
+ end
140
+
141
# The parser must call this method when it has decided upon a reduction,
# passing the rule to reduce by and the start/end positions it covers.
# Pops the rule's right-hand side off the stack, instantiates the rule's
# left-hand side, runs the rule's action (if it has one) with @rhs set to
# the popped symbols, and pushes the new instance.
def reportReduction(rule, startPos, endPos)
  @logger.info("Reducing by #{rule.inspect}")
  arity = rule.rhs.length
  node = rule.lhs.new
  node.startPos = startPos
  node.endPos = endPos

  popped = @stack.slice!(-arity, arity)
  unless node.is_a?(List)
    # Inside an ordinary rule a List is handed over as its element array.
    popped = popped.map { |sym| sym.is_a?(List) ? sym.elements : sym }
  end

  action = rule.method
  if action
    # @rhs is swapped in for the duration of the action only.
    saved = node.instance_variable_get('@rhs')
    node.instance_variable_set('@rhs', popped)
    node.send(action)
    node.instance_variable_set('@rhs', saved)
  end

  @stack.push(node)
  showStack
end
159
+
160
+
161
# The parser must call this method when it consumes a token, passing the
# consumed token. The token is pushed on the stack and, when it responds to
# __recognize__, that hook is run.
def reportToken(token)
  @logger.info("Pushing token: #{token.inspect}")
  @stack.push(token)
  token.__recognize__ if token.respond_to?("__recognize__")
  showStack
end
174
+
175
+
176
# Passes the start symbol to the parser, collects the terminals reachable
# from it, tells the scanner to recognize them (string literals first, then
# Token classes) and logs every rule reachable from the start symbol.
def tellParserStartSymbol(startSymbol)
  @parser.startSymbol = startSymbol

  terminals = {}
  findTokens(startSymbol, terminals)
  found = terminals.keys
  @logger.debug("tokens = #{found.inspect}")

  literals = found.select { |t| t.is_a?(String) }
  @logger.info("Literals: #{literals.inspect}")
  tokenClasses = found.select { |t| t.is_a?(Class) && t.ancestors.member?(Token) }
  @logger.info("Tokens: #{tokenClasses.inspect}")
  @scanner.setRecognized(*literals, *tokenClasses)

  # Show rules
  @logger.info("Rules:")
  # The array doubles as a work list: nonterminals discovered while
  # iterating are appended and visited later in the same loop.
  worklist = [startSymbol]
  worklist.each do |nonterminal|
    nonterminal.rules.each do |rule|
      @logger.info(" #{rule.inspect}")
      rule.rhs.each do |gs|
        next unless gs.is_a?(Class) && gs.ancestors.member?(Nonterminal)
        worklist.push(gs) unless worklist.member?(gs)
      end
    end
  end
end
201
+
202
# Walks the grammar from +nonterminal+ and records, as keys of
# +collectedTokens+, every right-hand-side symbol that is not a Nonterminal
# class. +visitedNonterminals+ keeps the recursion from revisiting a
# nonterminal.
def findTokens(nonterminal, collectedTokens, visitedNonterminals = {})
  return if visitedNonterminals[nonterminal]

  visitedNonterminals[nonterminal] = true
  nonterminal.rules.each do |rule|
    rule.rhs.each do |symbol|
      if symbol.is_a?(Class) && symbol.ancestors.member?(Nonterminal)
        findTokens(symbol, collectedTokens, visitedNonterminals)
      else
        collectedTokens[symbol] = true
      end
    end
  end
end
216
+
217
# Precedence of +rule+: the precedence its left-hand side assigns to the
# rule's operator, or 0 when the rule has no operator or none is assigned.
def operatorPrecedence(rule)
  operator = rule.operator
  return 0 unless operator

  rule.lhs.operatorPrecedence(operator) || 0
end
224
+
225
# For debugging: logs the translation stack, one short label per entry.
def showStack
  return unless @logger

  rendered = @stack.map { |gs| show(gs) }.join(", ")
  @logger.info("Stack: [#{rendered}]")
end
229
+
230
# Short label for a grammar symbol, used in log output:
# the class name for Nonterminal and Token instances, the inspected form
# for strings, a bracketed list for arrays, and nil for anything else.
def show(gs)
  case gs
  when Array
    # parse logs the whole group of tokens the scanner yields at once.
    "[" + gs.map { |element| show(element) }.join(", ") + "]"
  when Nonterminal, Token
    gs.class.to_s
  when String
    gs.inspect
  end
end
237
+
238
+
239
+ end
240
+
241
# Describes a parse failure: the input position, the input symbol found
# there and the symbols that were expected instead.
# Derives from StandardError so that a plain `rescue` catches it.
class ParseException < StandardError
  attr_accessor :inputPos, :inputSymbol, :expected

  def initialize(inputPos, inputSymbol, expected)
    @inputPos, @inputSymbol, @expected = inputPos, inputSymbol, expected
    # Without this call #message would only be the class name.
    super("Unexpected #{inputSymbol.inspect} at position #{inputPos.inspect}, expected #{expected.inspect}")
  end
end
247
+
248
# Error raised by Kanocc itself, e.g. by the Earley parser when the input
# does not parse. Derives from StandardError so a plain `rescue` catches it.
class KanoccException < StandardError
end
250
+
251
+
252
+
253
+ end
254
+
255
+
@@ -0,0 +1,309 @@
1
+ require 'kanocc/grammar_rule'
2
+ require 'kanocc/token'
3
+ require 'logger'
4
+ module Kanocc
5
+ #
6
+ # Parser for Kanocc based on Earleys algorithm. For a description see:
7
+ # Alfred V. Aho, Jeffrey D. Ullman, The Theory of Parsing, Translation and Compiling,
8
+ # or try a web search engine of your choice with 'Earley parsing'
9
+ #
10
+ # Earley's parser will parse according to any context-free grammar using O(n*n*n) time
11
+ # and O(n*n) space, n being the length of input. If the grammar is unambiguous time/space
12
+ # complexity is O(n*n)/O(n*n).
13
+ # As of yet (version 0.1) the implementation is surely not optimal,
14
+ # so time/space complexity is probably worse.
15
+ #
16
+ # Christian Surlykke 2007.
17
+ #
18
+ class EarleyParser
19
+ attr_accessor :kanocc, :logger
20
+
21
+ ErrorRule = GrammarRule.new(Error, [], nil)
22
+
23
# Creates a parser that reports tokens and reductions to +kanocc+.
# options[:logger] supplies the logger; without it one writing to STDOUT
# is created.
def initialize(kanocc, options = {})
  @kanocc = kanocc
  # Logger.new requires a log device; calling it bare raises ArgumentError.
  @logger = options[:logger] || Logger.new(STDOUT)
end
27
+ #
28
+ # Sets up the parser, creating itemlist 0.
29
+ #
30
+ def startSymbol=(startSymbol)
31
+ @startSymbol = startSymbol
32
+ @itemLists = [ItemList.new(nil, 0, 0)]
33
+ @inputPos = 0
34
+ @recoveryPoints = []
35
+ @itemLists[0].addAll(@startSymbol.rules.map{|rule| Item.new(rule, 0)})
36
+ predictAndComplete(0)
37
+ end
38
+
39
# Resets the parser for a new input: keeps only item list 0, rewinds the
# input position and keeps recovery point 0 only if it was the first one.
def prepare
  @itemLists = @itemLists.first(1)
  @inputPos = 0
  @recoveryPoints = @recoveryPoints.first == 0 ? [0] : []
  @logger.info("Itemlist 0:\n#{@itemLists[0].inspect}") if @logger
end
49
+
50
# For each terminal, advances the dot of every item in the previous item
# list that matches it and adds the advanced items to the current list.
def scan(terminals)
  previous = @itemLists[@inputPos - 1]
  current = @itemLists[@inputPos]
  terminals.each do |terminal|
    advanced = previous.findMatching(terminal).map { |item| item.move }
    current.addAll(advanced)
  end
end
55
+
56
# Closes item list +pos+ under completion and prediction, repeating passes
# over the list until a pass adds no new item.
def predictAndComplete(pos)
  itemList = @itemLists[pos]
  prevSize = 0
  while prevSize < itemList.size
    prevSize = itemList.size
    itemList.each do |item|
      if item.rule.rhs.length <= item.dot
        # complete: advance the items in the origin list that wait for this lhs
        waiting = @itemLists[item.j].findMatching(item.rule.lhs)
        itemList.addAll(waiting.map { |pending| pending.move })
      elsif (nont = item.rule.rhs[item.dot]).respond_to?(:rules)
        # predict: new items originate in the list being closed. That is
        # +pos+, which differs from @inputPos when consume re-runs this
        # method for the previous list during error recovery.
        itemList.addAll(nont.rules.map { |rule| Item.new(rule, pos) })
      end
    end
  end
end
72
+
73
# Records +pos+ as a recovery point when item list +pos+ holds an item with
# Error directly after its dot. Does nothing if +pos+ is already the most
# recent recovery point.
def addRecoveryPoints(pos)
  return if @recoveryPoints.last == pos

  @itemLists[pos].each do |item|
    next unless Error == item.rule.rhs[item.dot]

    @recoveryPoints.push(pos)
    break
  end
end
83
+
84
#
# Consume and parse the next group of input symbols: opens a new item list,
# scans the symbols into it and, if nothing matched, retries after inserting
# an Error item at the recovery points, most recent first.
#
def consume(inputSymbols, startPos, endPos)
  @inputPos += 1
  @itemLists.push(ItemList.new(inputSymbols, @inputPos, endPos))

  # scan, predict and complete until no more can be added
  scan(inputSymbols)

  if @itemLists[@inputPos].size == 0
    @logger.debug("Found no items matching #{inputSymbols} in itemlist #{@inputPos - 1}")
    @logger.debug("@recoveryPoints = #{@recoveryPoints.inspect}")
    @recoveryPoints.reverse_each do |point|
      next unless point < @inputPos

      @itemLists[@inputPos - 1].add(Item.new(ErrorRule, point))
      predictAndComplete(@inputPos - 1)
      scan(inputSymbols)
      break if @itemLists[@inputPos].size > 0
    end
  end

  predictAndComplete(@inputPos)
  addRecoveryPoints(@inputPos)
  @logger.info("Itemlist #{@inputPos}:\n#{@itemLists[@inputPos].inspect}") if @logger
end
110
+
111
+
112
#
# Signal to the parser that end of input is reached. Picks the smallest
# completed start-symbol item that began at list 0 and translates it.
# Raises KanoccException when there is no such item.
#
def eof
  @logger.debug "--- Parsing done, translating ---"
  spanning = findFullItems(@startSymbol, @inputPos).select { |item| item.j == 0 }
  topItem = spanning.min
  raise(KanoccException, "It didn't parse") unless topItem

  translate(topItem, @inputPos)
end
124
+
125
# Reports one element of the parse to @kanocc.
# An Item is first expanded by translateHelper and then reported as a
# reduction; a class is reported as the input symbol at +pos+ that is an
# instance of it; anything else is reported as a token as is.
def translate(element, pos)
  @logger.debug("translate: #{element.inspect}, pos = #{pos.inspect}")
  if element.instance_of?(Item)
    translateHelper(element, pos)
    from = @itemLists[element.j].textPos
    to = @itemLists[pos].textPos
    @kanocc.reportReduction(element.rule, from, to)
  elsif element.instance_of?(Class) # Its a token class
    scanned = @itemLists[pos].inputSymbol
    @kanocc.reportToken(scanned.find { |sym| sym.is_a?(element) })
  else # Its a string instance
    @logger.debug @itemLists[pos].inspect
    @kanocc.reportToken(element)
  end
end
139
+
140
# Walks an item backwards, one symbol at a time, reporting the symbols left
# of its dot in left-to-right order.
def translateHelper(item, pos)
  @logger.debug("translateHelper: #{item.inspect}")
  return if item.dot == 0

  rule = item.rule
  before = item.dot - 1 # index of the symbol just left of the dot
  symbol = rule.rhs[before]

  if symbol.respond_to?("rules")
    # item has the form [A --> aB·c, k] in item list i (= pos).
    # We need an item [B --> x·, j] in list i such that an item
    # [A --> a·Bc, k] exists in list j.
    completed = findFullItems(symbol, pos).select do |candidate|
      @itemLists[candidate.j].findItem(rule, before, item.j)
    end
    # Several candidates may remain; take the smallest one under Item#<=>
    # (rule precedence first, then the larger origin j). Reductions are
    # found top-down but evaluated bottom-up, so choosing the lowest
    # precedence here makes the highest precedence evaluate first.
    subItem = completed.min
    prevList = subItem.j
    prevItem = @itemLists[prevList].findItem(rule, before, item.j)
  else
    # A terminal consumed exactly one input position.
    subItem = symbol
    prevList = pos - 1
    prevItem = @itemLists[prevList].findItem(rule, before, item.j)
  end

  translateHelper(prevItem, prevList)
  translate(subItem, pos)
end
175
+
176
# Items in item list +inputPos+ whose rule has +nonterminal+ as left-hand
# side and whose dot has reached the end of the right-hand side.
def findFullItems(nonterminal, inputPos)
  @itemLists[inputPos].find_all do |item|
    rule = item.rule
    rule.lhs == nonterminal && item.dot >= rule.rhs.length
  end
end
181
+
182
# The negated precedence that @kanocc reports for +rule+.
def operatorPrecedence(rule)
  precedence = @kanocc.operatorPrecedence(rule)
  -precedence
end
185
+ end
186
+
187
# The set of items belonging to one input position, together with the input
# symbols consumed to reach it and the matching text position.
# Items are kept as the keys of a Hash, so adding an item twice is a no-op.
class ItemList
  attr_reader :inputSymbol, :textPos
  attr_accessor :items

  def initialize(inputSymbol, inputPos, textPos)
    @inputSymbol = inputSymbol
    @inputPos = inputPos
    @textPos = textPos
    @items = {}
  end

  # A clone of this list with its own copy of the item hash.
  def copy
    duplicate = clone
    duplicate.items = @items.clone
    duplicate
  end

  def size
    @items.size
  end

  def find_all(&b)
    @items.keys.find_all(&b)
  end

  # The item with exactly this rule, dot position and origin, or nil.
  def findItem(rule, dot, j)
    @items.keys.find do |item|
      item.rule == rule && item.dot == dot && item.j == j
    end
  end

  def eachMatching(inputSymbol)
    findMatching(inputSymbol).each { |item| yield(item) }
  end

  # Items whose symbol after the dot matches +inputSymbol+, by === or ==.
  def findMatching(inputSymbol)
    @items.keys.select do |item|
      expected = item.symbolAfterDot
      inputSymbol === expected || inputSymbol == expected
    end
  end

  def contains(item)
    @items[item]
  end

  def add(item)
    @items[item] = true
  end

  def addAll(items)
    items.each { |item| @items[item] = true }
  end

  # Iterates over a snapshot of the keys, so items may be added meanwhile.
  def each
    @items.keys.each { |item| yield item }
  end

  def inspect
    lines = @items.keys.map { |item| item.inspect }.join("\n ")
    "[#{@inputSymbol.inspect}\n #{@textPos}\n #{lines}]\n"
  end
end
256
+
257
+
258
# An Earley item: a rule, a dot position inside its right-hand side and the
# index +j+ of the item list where recognition of the rule began.
# Instances are interned: Item.new returns the same object for the same
# (rule, j, dot) triple.
class Item
  attr_reader :rule, :j, :dot

  @@items = {}

  def self.new(rule, j, dot = 0)
    key = [rule, j, dot]
    @@items[key] ||= super(rule, j, dot)
  end

  def initialize(rule, j, dot = 0)
    @rule = rule
    @j = j
    @dot = dot
  end

  # The grammar symbol right after the dot, or nil when the dot is at the end.
  def symbolAfterDot
    @rule.rhs[@dot] if @dot < @rule.rhs.size
  end

  # The item with the dot advanced by one symbol.
  def move
    Item.new(@rule, @j, @dot + 1)
  end

  def inspect
    rhs = @rule.rhs
    symbols = rhs.slice(0, dot) + [Dot.new] + rhs.slice(dot, rhs.length - dot)
    rendered = symbols.map { |symbol| symbol.inspect }.join(" ")
    "[" + @rule.lhs.inspect + " --> " + rendered + " ; " + @j.to_s + "]"
  end

  # Orders by rule precedence; ties go to the item with the larger j.
  def <=>(other)
    byPrecedence = (@rule.prec <=> other.rule.prec)
    return byPrecedence unless byPrecedence == 0

    other.j <=> @j
  end
end
302
+
303
# Just for Item#inspect: stands for the dot inside an item.
class Dot
  # Returns a middle dot. It is written as an escape because the literal in
  # the source had been mis-encoded into the replacement character.
  def inspect
    "\u00B7"
  end
end
309
+ end
@@ -0,0 +1,36 @@
1
+ module Kanocc
2
# One production of the grammar: a left-hand side, the right-hand-side
# symbols and the name of the method holding its semantic action (or nil).
# Note that the +method+ reader shadows Object#method on instances.
class GrammarRule
  attr_reader :lhs, :rhs, :method, :argPositions
  attr_writer :prec

  # The debug line that used to sit here read @logger, which nothing ever
  # assigns, so it could never run and has been removed.
  def initialize(lhs, rhs, method)
    @lhs = lhs
    @rhs = rhs
    @method = method
  end

  # The first right-hand-side symbol that is a String or a Token instance.
  def operator
    rhs.find { |s| s.is_a?(String) || s.is_a?(Token) }
  end

  # The precedence of a rule is defined as:
  # the precedence given through prec=,
  # or (if that's not defined) the precedence of the leftmost operator,
  # or (if that's not defined) 0.
  def prec
    return @prec if @prec

    op = operator
    (op && @lhs.operatorPrecedence(op)) || 0
  end

  def inspect
    lhs.inspect + " ::= " + rhs.map { |gs| gs.inspect }.join(" ")
  end
end
36
+ end
@@ -0,0 +1,158 @@
1
+ require 'kanocc/grammar_rule'
2
+ module Kanocc
3
+ class Nonterminal
4
+ attr_accessor :startPos, :endPos
5
+ @@rules = Hash.new
6
+ @@lastRule = Hash.new
7
+ @@derivesRight = Hash.new
8
+ @@operatorPrecedence = Hash.new
9
+ @@methodNames = Hash.new
10
+
11
+ Left = 1
12
+ Right = 2
13
+
14
+ def Nonterminal.derivesRight
15
+ @@derivesRight[self] = true
16
+ end
17
+
18
+ def Nonterminal.derivesRight?
19
+ return @@derivesRight[self]
20
+ end
21
+
22
# Declares the precedence of one operator, or of every operator in an array,
# for this nonterminal.
# Raises when +precedence+ is not an integer or when +operator+ is neither
# an operator nor an array of operators.
def Nonterminal.setOperatorPrecedence(operator, precedence)
  # Integer rather than Fixnum: Fixnum was deprecated in Ruby 2.4 and
  # removed in 3.2.
  raise "Precedence must be an integer" unless precedence.is_a?(Integer)
  @@operatorPrecedence[self] ||= Hash.new
  if is_an_operator?(operator)
    @@operatorPrecedence[self][operator] = precedence
  elsif is_an_array_of_operators(operator)
    operator.each { |o| @@operatorPrecedence[self][o] = precedence }
  else
    raise "Operator must be a string, a token or an array of those"
  end
end
33
+
34
+ def Nonterminal.operatorPrecedence(operator)
35
+ (@@operatorPrecedence[self] and @@operatorPrecedence[self][operator]) or 0
36
+ end
37
+
38
+ def Nonterminal.is_an_array_of_operators(arr)
39
+ arr.is_a?(Array) and
40
+ arr.collect{|o| is_an_operator?(o)}.inject {|b1, b2| b1 and b2 }
41
+ end
42
+
43
+ def Nonterminal.is_an_operator?(operator)
44
+ operator.is_a?(String) or operator.is_a?(Token)
45
+ end
46
+
47
+ def Nonterminal.rules
48
+ rules = @@rules[self]
49
+ return rules ? rules : []
50
+ end
51
+
52
+ def Nonterminal.addRule(rule)
53
+ @@rules[self] ||= []
54
+ @@rules[self].push(rule)
55
+ @@lastRule[self] = rule
56
+ end
57
+
58
+ def Nonterminal.is_a_grammarsymbol?(x)
59
+ x.is_a?(String) or (x.respond_to?("is_a_kanocc_grammarsymbol?") and x.is_a_kanocc_grammarsymbol?)
60
+ end
61
+
62
+ def Nonterminal.is_a_kanocc_grammarsymbol?
63
+ return true
64
+ end
65
+
66
# Adds a rule with the given right-hand side to this nonterminal.
# A block, when given, becomes an instance method under a generated name
# and is stored with the rule as its action.
# Raises when an element of +rhs+ is not a grammar symbol.
def Nonterminal.rule(*rhs, &block)
  rhs.each_with_index do |symbol, pos|
    next if is_a_grammarsymbol?(symbol)

    raise "Problem with rule: #{rhs.inspect}, element:#{pos.to_s} - #{symbol.inspect}\nElements of a rule must be Strings, Tokens or Nonterminals"
  end

  action = nil
  if block
    action = generateMethodName(*rhs).to_sym
    define_method(action, &block)
  end
  addRule(GrammarRule.new(self, rhs, action))
end
81
+
82
+ def Nonterminal.zm(symbols, sep = nil)
83
+ listClass = newListClass
84
+ listClass.rule() {@elements = []}
85
+ listClass.rule(om(symbols, sep)) {@elements = @rhs[0].elements}
86
+ return listClass
87
+ end
88
+
89
+ def Nonterminal.om(symbols, sep = nil)
90
+ symbols = [symbols] unless symbols.is_a? Array
91
+ listClass = newListClass
92
+ listClass.rule(*symbols) {@elements = @rhs}
93
+ if sep
94
+ listClass.rule(listClass, sep, *symbols) {@elements = @rhs[0].elements + @rhs[2..@rhs.length]}
95
+ else
96
+ listClass.rule(listClass, *symbols) {@elements = @rhs[0].elements + @rhs[1..@rhs.length]}
97
+ end
98
+ return listClass
99
+ end
100
+
101
+ @@listClassNumber = 0
102
+
103
# Creates a fresh anonymous subclass of List whose inspect is
# "anonList_<n>", n being the running number of list classes created.
def Nonterminal.newListClass
  listClass = Class.new(List)
  @@listClassNumber += 1
  # Capture the number now. Reading @@listClassNumber inside inspect would
  # make every list class report the most recently issued number.
  number = @@listClassNumber
  listClass.define_singleton_method(:inspect) { "anonList_#{number}" }
  listClass
end
111
+
112
# Builds the name of the method holding the action of a rule with
# right-hand side +args+, unique among the names generated for this class.
def Nonterminal.generateMethodName(*args)
  # name is nil for anonymous classes (those made by newListClass) on
  # Ruby 1.9 and later, so coerce it before concatenating.
  methodName = name.to_s + " --> " + args.map { |a| a.inspect }.join(' ')
  @@methodNames[self] ||= []
  # Rules with identical right-hand sides are told apart by padding.
  methodName += ' ' while @@methodNames[self].member?(methodName)
  @@methodNames[self].push(methodName)
  methodName
end
122
+
123
+ def Nonterminal.prec(p)
124
+ raise "Call to prec not preceded by rule" unless @@lastRule[self]
125
+ @@lastRule[self].prec = p
126
+ end
127
+
128
+ def Nonterminal.showMethodNames
129
+ @@methodNames[self].each{|mn| puts mn.inspect} if @@methodNames[self]
130
+ end
131
+ end
132
+
133
+
134
+ class List < Nonterminal
135
+ attr_reader :elements
136
+
137
+ protected
138
+ # Assumes @rhs[0] is a Kanocc::List and that rhs.length > 1
139
+ def collect(stripSeparator = false)
140
+ puts "collect with stripSeparator = #{stripSeparator}"
141
+ @elements = @rhs[0].elements
142
+ if stripSeparator
143
+ @elements = @elements + @rhs[2..@rhs.length]
144
+ else
145
+ @elements = @elements + @rhs[1..@rhs.length]
146
+ end
147
+ puts "@elements: " + @elements.inspect
148
+ end
149
+ end
150
+
151
+ class Error < Nonterminal
152
+ attr_reader :text
153
+ def initialize
154
+ super
155
+ @text = "FIXME"
156
+ end
157
+ end
158
+ end
@@ -0,0 +1,152 @@
1
+ require 'stringio'
2
+ require 'strscan'
3
+ require 'logger'
4
+
5
+ module Kanocc
6
+ class Scanner
7
+ attr_accessor :logger
8
+ def initialize(init = {})
9
+ if init[:logger]
10
+ @logger = init[:logger]
11
+ else
12
+ @logger = Logger.new(STDOUT)
13
+ @logger.level = Logger::WARN
14
+ end
15
+ @wsRegs = [/\s/]
16
+ @recognizables = []
17
+ @regexps = []
18
+ end
19
+
20
# Replaces the whitespace patterns with the given regular expressions.
# Raises, leaving the current patterns untouched, when an argument is not
# a Regexp.
def setWhitespace(*wsRegs)
  unless wsRegs.all? { |wsReg| wsReg.is_a?(Regexp) }
    raise "setWhitespace must be given a list of Regexp's"
  end
  # The original loop appended an undefined local (r) and so always failed.
  @wsRegs = wsRegs
end
29
+
30
# Replaces the set of things to scan for. Each argument is either a class
# (its +pattern+ is used) or a string literal (matched verbatim).
# @recognizables and @regexps are kept index-aligned.
def setRecognized(*rec)
  @recognizables = []
  @regexps = []
  rec.each do |entry|
    @recognizables << entry
    @regexps << (entry.instance_of?(Class) ? entry.pattern : Regexp.compile(Regexp.escape(entry)))
  end
end
42
+
43
# Scans +input+ and yields, for every group of tokens found, the tokens
# together with the scanner position before and after them.
# +input+ is a String or any object responding to +read+ (IO, StringIO, ...).
# Raises a RuntimeError for any other input.
def eachToken(input)
  if input.is_a?(String)
    @input = input
  elsif input.respond_to?(:read)
    # Duck-typed so that StringIO, which is not an IO, is accepted too.
    @input = input.read
  else
    raise "Input must be a string or an IO object"
  end

  @stringScanner = StringScanner.new(@input)
  pos = @stringScanner.pos
  while tokens = nextToken
    @logger.debug("Yielding with #{tokens}, #{pos}, #{@stringScanner.pos}")
    yield(tokens, pos, @stringScanner.pos)
    pos = @stringScanner.pos
  end
end
59
+
60
+ private
61
+
62
# Returns the next group of tokens, or nil at end of input.
# Whitespace is skipped; a character that is neither a token nor whitespace
# is returned as a one-character string literal, so scanning cannot fail.
def nextToken
  loop do
    # eos? rather than pos >= @input.length: pos counts bytes while length
    # counts characters, so multibyte input would be cut short.
    return nil if @stringScanner.eos?

    tokens = matchToken
    if tokens.size > 0
      @logger.debug("nextToken returning #{tokens}")
      return tokens
    end

    # Some whitespace was stripped, so go back and try to match a token again.
    next if trimWhitespace

    # Neither a token nor whitespace: emit the first remaining character.
    res = [@stringScanner.scan(/./m)]
    @logger.debug("nextToken returning #{res.inspect}")
    return res
  end
end
87
+
88
# Consumes the longest match at the current position and returns one entry
# per recognizable that produced it: a new instance (with its match data in
# +m+) for a class, the matched text for a string literal.
# Returns an empty array when nothing matches.
def matchToken
  winners = findMatchingReg(@regexps)
  @logger.debug("matchToken, regPoss = #{winners.inspect}")
  tokens = []
  text = nil
  winners.each do |index|
    recognizable = @recognizables[index]
    @logger.debug("@recognizables[#{index}] = #{recognizable.inspect}")
    # All winners match the same length; the input is consumed only once.
    text ||= @stringScanner.scan(@regexps[index])
    if recognizable.instance_of?(Class)
      @logger.debug("Its a class")
      token = recognizable.new(text)
      token.m = token.match(text) # To create a proper match object
      @logger.debug("token: #{token.inspect}")
      tokens << token
      @logger.debug("tokens: #{tokens.inspect}")
    else
      tokens << text
    end
  end
  @logger.debug("matchToken returning: #{tokens.inspect}")
  tokens
end
110
+
111
# Skips one match of the best-matching whitespace pattern at the current
# position. Returns true if something was skipped, false otherwise.
def trimWhitespace
  best = findMatchingReg(@wsRegs).first
  return false if best.nil?

  @stringScanner.skip(@wsRegs[best])
  true
end
120
+
121
# Indexes of the patterns in +arrayOfRegs+ that give the longest non-empty
# match at the current scanner position, in ascending order.
# The scanner position is not moved.
def findMatchingReg(arrayOfRegs)
  @logger.debug("findMatchingReg: arrayOfRegs = #{arrayOfRegs}")
  lengths = arrayOfRegs.map { |reg| @stringScanner.match?(reg) || 0 }
  longest = lengths.max || 0
  return [] unless longest > 0

  lengths.each_index.select { |i| lengths[i] == longest }
end
136
+ end
137
+ end
138
+
139
+
140
+ ############################################
141
+ # Testing
142
+ #require 'Token'
143
+ #
144
+ #class Number < Token
145
+ # setPattern(/\d+/)
146
+ #end
147
+ #
148
+ #scanner = KanoccScanner.new
149
+ #scanner.setRecognized(Number, "Exit")
150
+ #scanner.setWhitespace(/[ \t]/)
151
+ #
152
+ #scanner.eachTokenDo{|token| print token.inspect, "\n"}
@@ -0,0 +1,40 @@
1
+ module Kanocc
2
# Base class for lexical tokens. A subclass declares its pattern with
# setPattern; an instance is a Regexp built from that pattern which also
# remembers the text it was created for (+str+) and a match object (+m+).
class Token < Regexp
  attr_reader :str
  attr_accessor :m

  # Patterns of all token classes, keyed by class.
  @@patterns = {}

  # Registers +reg+ as the pattern of this class. A block, when given,
  # becomes the instance method __recognize__.
  def self.setPattern(reg, &block)
    @@patterns[self] = reg
    define_method(:__recognize__, &block) if block
  end

  def self.pattern
    @@patterns[self]
  end

  def self.is_a_kanocc_grammarsymbol?
    true
  end

  def initialize(str)
    @str = str
    super(@@patterns[self.class])
  end

  # True when +klass+ is exactly the class of this token. This replaces
  # Regexp#===, so a token compares against token classes, not strings.
  def ===(klass)
    self.class.equal?(klass)
  end

  def is_a_kanocc_token?
    true
  end

  def inspect
    label = self.class.name + "["
    label + @str + "]"
  end
end
40
+ end
@@ -0,0 +1,3 @@
1
+ Better handling of blocks
2
+ LR Parsers
3
+ Scanner.eachToken method
metadata ADDED
@@ -0,0 +1,61 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: Kanocc
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Christian Surlykke
8
+ autorequire: kanocc
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-04-12 00:00:00 +02:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description:
17
+ email: ""
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - lib/todo
26
+ - lib/kanocc.rb
27
+ - lib/kanocc
28
+ - lib/kanocc/token.rb
29
+ - lib/kanocc/grammar_rule.rb
30
+ - lib/kanocc/nonterminal.rb
31
+ - lib/kanocc/scanner.rb
32
+ - lib/kanocc/earley.rb
33
+ - examples/calculator.rb
34
+ has_rdoc: false
35
+ homepage: ""
36
+ post_install_message:
37
+ rdoc_options: []
38
+
39
+ require_paths:
40
+ - lib
41
+ required_ruby_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: "0"
46
+ version:
47
+ required_rubygems_version: !ruby/object:Gem::Requirement
48
+ requirements:
49
+ - - ">="
50
+ - !ruby/object:Gem::Version
51
+ version: "0"
52
+ version:
53
+ requirements: []
54
+
55
+ rubyforge_project:
56
+ rubygems_version: 1.0.1
57
+ signing_key:
58
+ specification_version: 2
59
+ summary: Kanocc - Kanocc ain't no compiler-compiler. A framework for syntax directed translation
60
+ test_files: []
61
+