kanocc 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,322 @@
1
+ #
2
+ # Copyright 2008 Christian Surlykke
3
+ #
4
+ # This file is part of Kanocc.
5
+ #
6
+ # Kanocc is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License, version 3
8
+ # as published by the Free Software Foundation.
9
+ #
10
+ # Kanocc is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License, version 3 for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License,
16
+ # version 3 along with Kanocc. If not, see <http://www.gnu.org/licenses/>.
17
+ #
18
+ require 'kanocc/grammar_rule'
19
+ require 'kanocc/token'
20
+ require 'logger'
21
+ module Kanocc
22
+ #
23
+ # Parser for Kanocc based on Earley's algorithm. For a description see:
24
+ # Alfred V. Aho, Jeffrey D. Ullman, The Theory of Parsing, Translation and Compiling,
25
+ # or try a web search engine of your choice with 'Earley parsing'
26
+ #
27
+ # Earley's parser will parse according to any context-free grammar using O(n*n*n) time
28
+ # and O(n*n) space, n being the length of input. If the grammar is unambiguous time/space
29
+ # complexity is O(n*n)/O(n*n).
30
+ # As of yet (version 0.1) the implementation is surely not optimal,
31
+ # so time/space complexity is probably worse.
32
+ #
33
+ # Christian Surlykke 2007.
34
+ #
35
class EarleyParser
  attr_accessor :kanocc, :logger

  # The empty rule "Error --> (nothing)". An item for this rule is injected
  # into an item list during error recovery (see #consume).
  ErrorRule = GrammarRule.new(Error, [], nil)

  #
  # kanocc::  the object that receives report_reduction / report_token
  #           callbacks while the parse is translated.
  # options:: may hold a :logger. Without one, a logger writing warnings and
  #           above to $stderr is used (Logger.new cannot be called without
  #           a log device).
  #
  def initialize(kanocc, options = {})
    @kanocc = kanocc
    @logger = options[:logger] || default_logger
  end

  #
  # Sets up the parser, creating itemlist 0.
  #
  def startsymbol=(startSymbol)
    @start_symbol = startSymbol
    @itemLists = [ItemList.new(nil, 0)]
    @inputPos = 0
    @recoveryPoints = []
    @itemLists[0].add_all(@start_symbol.rules.map { |rule| Item.new(rule, 0) })
    predict_and_complete(0)
  end

  #
  # Resets the parser for a new input: only itemlist 0 is kept, the input
  # position goes back to 0 and the recovery points are cleared (a recovery
  # point at position 0 survives, as itemlist 0 is retained).
  #
  def prepare
    @itemLists = @itemLists[0..0]
    @inputPos = 0
    @recoveryPoints = (@recoveryPoints.first == 0) ? [0] : []
    @logger.info("Itemlist 0:\n" + @itemLists[0].inspect) if @logger
  end

  #
  # For every match in token_match[:matches], moves the dot over the matched
  # symbol (the :token entry, or else the :literal entry) in the items of the
  # previous item list and adds the moved items to the current item list.
  #
  def scan(token_match)
    token_match[:matches].each do |match|
      symbol = match[:token] || match[:literal]
      items = @itemLists[@inputPos - 1].find_matching(symbol)
      @itemLists[@inputPos].add_all(items.map { |item| item.move })
    end
  end

  #
  # Repeats the predict and complete steps on item list +pos+ until the list
  # stops growing.
  #
  def predict_and_complete(pos)
    item_list = @itemLists[pos]
    prev_size = 0
    while prev_size < item_list.size
      prev_size = item_list.size
      item_list.each do |item|
        if item.rule.rhs.length <= item.dot
          # complete: advance the items on list item.j that wait for this lhs
          waiting = @itemLists[item.j].find_matching(item.rule.lhs)
          item_list.add_all(waiting.map { |waiting_item| waiting_item.move })
        elsif (nont = item.rule.rhs[item.dot]).respond_to?(:rules)
          # predict: the new items originate in the list being processed.
          # This must be +pos+, not @inputPos - during error recovery the
          # list being processed is @inputPos - 1.
          item_list.add_all(nont.rules.map { |rule| Item.new(rule, pos) })
        end
      end
    end
  end

  #
  # Records +pos+ as a recovery point if some item on item list +pos+ has
  # Error as the symbol right after the dot. A position is recorded at most
  # once in a row.
  #
  def add_recovery_points(pos)
    return if @recoveryPoints[-1] == pos
    @itemLists[pos].each do |item|
      next unless Error == item.rule.rhs[item.dot]
      @recoveryPoints.push(pos)
      break
    end
  end

  #
  # Consume and parse next input symbol
  #
  def consume(token_match)
    @inputPos += 1
    @itemLists.push(ItemList.new(token_match, @inputPos))

    # scan, predict and complete until no more can be added
    scan(token_match)
    recover_from_error(token_match) if @itemLists[@inputPos].size == 0

    predict_and_complete(@inputPos)
    add_recovery_points(@inputPos)
    @logger.info("Itemlist #{@inputPos}:\n" + @itemLists[@inputPos].inspect) if @logger
  end

  #
  # Signal to the parser that end of input is reached
  #
  # Picks the greatest (by Item#<=>) completed start symbol item that spans
  # the whole input and translates it. Raises KanoccException if there is no
  # such item.
  #
  def eof
    top_item = find_full_items(@start_symbol, @inputPos).find_all { |item| item.j == 0 }.max
    raise(KanoccException, "It didn't parse") unless top_item
    translate(top_item, @inputPos)
  end

  #
  # Reports +element+, found at input position +pos+, to @kanocc. An Item is
  # first expanded through #translate_helper and then reported as a
  # reduction; anything else is reported as a token together with the input
  # symbol of item list +pos+.
  #
  def translate(element, pos)
    @logger.debug("translate: " + element.inspect + " on " + pos.inspect) if @logger
    if element.class == Item
      translate_helper(element, pos)
      @kanocc.report_reduction(element.rule)
    else # Its a token or a string
      @kanocc.report_token(@itemLists[pos].inputSymbol, element)
    end
  end

  #
  # Translates the symbols to the left of the dot in +item+ (which lives on
  # item list +pos+), leftmost symbol first.
  #
  def translate_helper(item, pos)
    @logger.debug("translateHelper: " + item.inspect + " on " + pos.inspect) if @logger
    return if item.dot == 0

    symbol_before_dot = item.rule.rhs[item.dot - 1]
    if symbol_before_dot.respond_to?("rules")
      # Assume item is of form [A --> aB*c, k] in itemlist i
      # Must then find item of form [B --> x*, j] in itemlist i so
      # that there exists item of form [A --> a*Bc, k] on itemlist j

      # First: Items of form [B --> x*, j] on list i
      candidates = find_full_items(symbol_before_dot, pos)

      # Then: Those for which item of form [A --> a*Bc, k] exists
      # on list j
      candidates = candidates.find_all do |candidate|
        @itemLists[candidate.j].find_item(item.rule, item.dot - 1, item.j)
      end

      # Precedence: We pick the possibility with the highest precedence
      sub_item = candidates.max
      prev_list = sub_item.j
      prev_item = @itemLists[prev_list].find_item(item.rule, item.dot - 1, item.j)
    else
      # The symbol before the dot is a token or a string: it was scanned
      # from the previous item list.
      prev_list = pos - 1
      prev_item = @itemLists[prev_list].find_item(item.rule, item.dot - 1, item.j)
      sub_item = symbol_before_dot
    end
    translate_helper(prev_item, prev_list)
    translate(sub_item, pos)
  end

  #
  # Returns the items on item list +inputPos+ whose rule has +nonterminal+
  # as left hand side and whose dot is at the end of the rule.
  #
  def find_full_items(nonterminal, inputPos)
    @itemLists[inputPos].find_all do |item|
      item.rule.lhs == nonterminal and item.dot >= item.rule.rhs.length
    end
  end

  private

  # Error recovery: walks the recovery points from the most recent one,
  # injects an ErrorRule item originating at that point into the previous
  # item list and rescans, stopping as soon as the token can be scanned.
  def recover_from_error(token_match)
    if @logger
      @logger.debug("Found no items matching #{token_match} in itemlist #{@inputPos - 1}")
      @logger.debug("@recoveryPoints = " + @recoveryPoints.inspect)
    end
    @recoveryPoints.reverse_each do |recovery_point|
      next unless recovery_point < @inputPos
      @itemLists[@inputPos - 1].add(Item.new(ErrorRule, recovery_point))
      predict_and_complete(@inputPos - 1)
      scan(token_match)
      break if @itemLists[@inputPos].size > 0
    end
  end

  # Logger used when the caller supplies none. Restricted to warnings so the
  # per-token info/debug output does not flood $stderr.
  def default_logger
    fallback = Logger.new($stderr)
    fallback.level = Logger::WARN
    fallback
  end
end
199
+
200
#
# The set of items belonging to one input position, together with the input
# symbol that was consumed to reach that position. Items are held as the keys
# of a hash, so adding an item twice has no effect.
#
class ItemList
  attr_reader :inputSymbol
  attr_accessor :items

  def initialize(inputSymbol, inputPos)
    @inputSymbol = inputSymbol
    @inputPos = inputPos
    @items = {}
  end

  # Returns a clone of this list with its own copy of the item hash.
  def copy
    duplicate = clone
    duplicate.items = @items.clone
    duplicate
  end

  # Number of items on the list.
  def size
    @items.size
  end

  # Returns the items for which the given block is true.
  def find_all(&b)
    @items.keys.select(&b)
  end

  # Returns the item with the given rule, dot position and origin j,
  # or nil if the list holds no such item.
  def find_item(rule, dot, j)
    @items.keys.detect do |candidate|
      candidate.rule == rule && candidate.dot == dot && candidate.j == j
    end
  end

  # Yields each item that has inputSymbol right after its dot.
  def each_matching(inputSymbol)
    find_matching(inputSymbol).each { |hit| yield(hit) }
  end

  # Returns the items whose symbol after the dot is matched by inputSymbol,
  # either through === or through ==.
  def find_matching(inputSymbol)
    @items.keys.select do |candidate|
      expected = candidate.symbol_after_dot
      inputSymbol === expected || inputSymbol == expected
    end
  end

  # true if item is on the list, nil otherwise.
  def contains(item)
    @items[item]
  end

  # Puts a single item on the list.
  def add(item)
    @items[item] = true
  end

  # Puts every item of the given collection on the list.
  def add_all(items)
    items.each { |entry| @items[entry] = true }
  end

  # Yields every item that is on the list when the iteration starts; items
  # added while iterating are not visited.
  def each
    snapshot = @items.keys
    snapshot.each { |entry| yield entry }
  end

  def inspect
    listing = @items.keys.map(&:inspect).join("\n ")
    "[#{@inputSymbol.inspect}\n #{listing}]\n"
  end
end
267
+
268
+
269
#
# An Earley item: a grammar rule, the position of the dot within the rule's
# right hand side, and j - the number of the item list where recognition of
# the rule started. Items are interned: Item.new hands back the same object
# every time it is called with the same rule, j and dot.
#
class Item
  attr_reader :rule, :j, :dot
  @@items = {}

  # Returns the interned item for (rule, j, dot), creating it on first use.
  def self.new(rule, j, dot = 0)
    @@items[[rule, j, dot]] ||= super(rule, j, dot)
  end

  def initialize(rule, j, dot = 0)
    @rule = rule
    @j = j
    @dot = dot
  end

  # The grammar symbol right after the dot, nil when the dot is at the end.
  def symbol_after_dot
    @rule.rhs[@dot] if @dot < @rule.rhs.size
  end

  # The item obtained by moving the dot one symbol to the right.
  def move
    Item.new(@rule, @j, @dot + 1)
  end

  def inspect
    rhs = @rule.rhs
    dotted = rhs.slice(0, dot) + [Dot.new] + rhs.slice(dot, rhs.length - dot)
    "[#{@rule.lhs.inspect} --> #{dotted.map { |symbol| symbol.inspect }.join(" ")} ; #{@j}]"
  end

  # Orders items by rule precedence; on a tie by operator precedence
  # (compared in reverse, and only when both rules have one); on a further
  # tie by j.
  def <=>(other)
    by_rule_prec = @rule.prec <=> other.rule.prec
    return by_rule_prec unless by_rule_prec == 0

    if @rule.operator_prec && other.rule.operator_prec
      by_operator_prec = other.rule.operator_prec <=> @rule.operator_prec
      return by_operator_prec unless by_operator_prec == 0
    end

    @j <=> other.j
  end
end
315
+
316
+ # Just for Item inspect
317
# Placeholder object standing for the dot when an Item is inspected;
# it prints as an asterisk.
class Dot
  def inspect
    "*"
  end
end
322
+ end
@@ -0,0 +1,50 @@
1
+ #
2
+ # Copyright 2008 Christian Surlykke
3
+ #
4
+ # This file is part of Kanocc.
5
+ #
6
+ # Kanocc is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License, version 3
8
+ # as published by the Free Software Foundation.
9
+ #
10
+ # Kanocc is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License, version 3 for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License,
16
+ # version 3 along with Kanocc. If not, see <http://www.gnu.org/licenses/>.
17
+ #
18
+ module Kanocc
19
#
# A single grammar rule: lhs ::= rhs.
#
# lhs::    the left hand side; must respond to operator_precedence(operator)
#          if the right hand side contains a String or a Token.
# rhs::    array of grammar symbols making up the right hand side.
# method:: what was given as the rule's semantic action (may be nil). Note
#          that this reader replaces Object#method on rule instances.
# prec::   the rule's precedence, 0 until assigned.
#
class GrammarRule
  attr_reader :lhs, :rhs, :method
  attr_accessor :prec

  def initialize(lhs, rhs, method)
    @lhs = lhs
    @rhs = rhs
    @method = method
    @prec = 0
    # The operator precedence is deliberately not looked up here: it is
    # resolved against lhs on first use, see #operator_prec.
    @operator_prec = nil
    @operator_prec_calculated = false
  end

  #
  # Precedence of the rule's operator - the first String or Token of the
  # right hand side - as given by lhs.operator_precedence. Returns nil when
  # the right hand side holds no String or Token. The value is looked up on
  # the first call and remembered.
  #
  def operator_prec
    unless @operator_prec_calculated
      operator = rhs.find { |s| s.is_a?(String) or s.is_a?(Token) }
      @operator_prec = lhs.operator_precedence(operator) if operator
      @operator_prec_calculated = true
    end
    @operator_prec
  end

  def inspect
    return lhs.inspect + " ::= " + rhs.map { |gs| gs.inspect }.join(" ")
  end
end
50
+ end
@@ -0,0 +1,176 @@
1
+ #
2
+ # Copyright 2008 Christian Surlykke
3
+ #
4
+ # This file is part of Kanocc.
5
+ #
6
+ # Kanocc is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License, version 3
8
+ # as published by the Free Software Foundation.
9
+ #
10
+ # Kanocc is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License, version 3 for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License,
16
+ # version 3 along with Kanocc. If not, see <http://www.gnu.org/licenses/>.
17
+ #
18
+ require 'kanocc/grammar_rule'
19
+ module Kanocc
20
#
# Base class for the nonterminals of a grammar. A grammar is written by
# subclassing Nonterminal and calling the class methods below (rule, prec,
# set_operator_precedence, ...) in the subclass body.
#
# All bookkeeping lives in class variables shared by the whole hierarchy;
# each is a hash keyed by the class that the information belongs to.
#
class Nonterminal
  @@rules = Hash.new
  @@last_rule = Hash.new
  @@derives_right = Hash.new
  @@operator_precedence = Hash.new
  @@method_names = Hash.new
  @@listClassNumber = 0

  Left = 1
  Right = 2

  # Marks this nonterminal as deriving right.
  def self.derives_right
    @@derives_right[self] = true
  end

  # true if derives_right has been called for this nonterminal, else nil.
  def self.derives_right?
    return @@derives_right[self]
  end

  #
  # Assigns +precedence+ to +operator+ for this nonterminal.
  #
  # operator::   a String, a Token or a non-empty array of those.
  # precedence:: any Integer.
  #
  # Raises RuntimeError if either argument has the wrong type.
  #
  def self.set_operator_precedence(operator, precedence)
    # Integer rather than Fixnum: Fixnum no longer exists in current Ruby.
    raise "Precedence must be an integer" unless precedence.is_a?(Integer)
    @@operator_precedence[self] ||= Hash.new
    if is_an_operator?(operator)
      @@operator_precedence[self][operator] = precedence
    elsif is_an_array_of_operators(operator)
      operator.each { |o| @@operator_precedence[self][o] = precedence }
    else
      raise "Operator must be a string, a token or an array of those"
    end
  end

  # The precedence assigned to +operator+ for this nonterminal, 0 if none.
  def self.operator_precedence(operator)
    (@@operator_precedence[self] and @@operator_precedence[self][operator]) or 0
  end

  # true if +arr+ is a non-empty array holding only operators.
  def self.is_an_array_of_operators(arr)
    arr.is_a?(Array) && !arr.empty? && arr.all? { |o| is_an_operator?(o) }
  end

  # true if +operator+ is a String or a Token.
  def self.is_an_operator?(operator)
    operator.is_a?(String) or operator.is_a?(Token)
  end

  # The rules added to this nonterminal so far, in order of definition.
  def self.rules
    @@rules[self] || []
  end

  # Appends +rule+ to this nonterminal's rules and remembers it as the
  # latest rule, which is the one a following call to prec applies to.
  def self.add_rule(rule)
    @@rules[self] ||= []
    @@rules[self].push(rule)
    @@last_rule[self] = rule
  end

  # true if +x+ may appear on the right hand side of a rule.
  def self.is_a_grammarsymbol?(x)
    x.is_a?(String) or (x.respond_to?("is_a_kanocc_grammarsymbol?") and x.is_a_kanocc_grammarsymbol?)
  end

  def self.is_a_kanocc_grammarsymbol?
    return true
  end

  #
  # Defines a rule with this nonterminal as left hand side and +rhs+ as
  # right hand side. When a block is given it is installed as an instance
  # method under a generated name, and that name is stored with the rule.
  #
  # Raises RuntimeError if an element of +rhs+ is not a grammar symbol.
  #
  def self.rule(*rhs, &block)
    rhs.each_with_index do |symbol, pos|
      next if is_a_grammarsymbol?(symbol)
      raise "Problem with rule: #{rhs.inspect}, element:#{pos.to_s} - #{symbol.inspect}\nElements of a rule must be Strings, Tokens or Nonterminals"
    end

    if block_given?
      method_name = generate_method_name(*rhs)
      define_method(method_name.to_sym, &block)
      add_rule(GrammarRule.new(self, rhs, method_name.to_sym))
    else
      add_rule(GrammarRule.new(self, rhs, nil))
    end
  end

  # Returns a new anonymous list class with two rules: the empty right hand
  # side, and a single om(symbols, sep) list.
  def self.zm(symbols, sep = nil)
    list_class = new_list_class
    list_class.rule() { @elements = [] }
    list_class.rule(om(symbols, sep)) { @elements = @rhs[0].elements }
    return list_class
  end

  # Returns a new anonymous list class with two rules: +symbols+ alone, and
  # the list class itself followed by +sep+ (when given) and +symbols+.
  def self.om(symbols, sep = nil)
    symbols = [symbols] unless symbols.is_a? Array
    list_class = new_list_class
    list_class.rule(*symbols) { @elements = @rhs }
    if sep
      list_class.rule(list_class, sep, *symbols) { @elements = @rhs[0].elements + @rhs[2..@rhs.length] }
    else
      list_class.rule(list_class, *symbols) { @elements = @rhs[0].elements + @rhs[1..@rhs.length] }
    end
    return list_class
  end

  # Creates an anonymous subclass of List that inspects as "anonList_<n>",
  # n being a running number.
  def self.new_list_class
    list_class = Class.new(List)
    @@listClassNumber += 1
    # The number is stored on the class itself. Reading the shared counter
    # from inside inspect would make every list class report the number of
    # the most recently created one.
    list_class.instance_variable_set(:@kanocc_list_number, @@listClassNumber)
    def list_class.inspect
      return "anonList_#{@kanocc_list_number}"
    end
    return list_class
  end

  # Builds the name of the method that holds a rule's block:
  # "<class> --> <rhs>", padded with trailing spaces until it is unique for
  # this nonterminal.
  def self.generate_method_name(*args)
    # Anonymous classes (e.g. those made by new_list_class) have no name;
    # fall back on inspect for those.
    class_label = name || inspect
    method_name = class_label + " --> " + args.map { |a| a.inspect }.join(' ')
    @@method_names[self] ||= []
    while @@method_names[self].member?(method_name) do
      method_name += ' '
    end
    @@method_names[self].push(method_name)
    return method_name
  end

  # Sets the precedence of the most recently defined rule to +p+.
  # Raises RuntimeError if no rule has been defined yet.
  def self.prec(p)
    raise "Call to prec not preceded by rule" unless @@last_rule[self]
    @@last_rule[self].prec = p
  end

  # Prints the generated method names of this nonterminal.
  def self.show_method_names
    @@method_names[self].each { |mn| puts mn.inspect } if @@method_names[self]
  end

  def inspect
    self.class.name
  end
end
152
+
153
+
154
#
# Nonterminal that gathers the things it derives in an array, available
# through #elements.
#
class List < Nonterminal
  attr_reader :elements

  protected

  # Assumes @rhs[0] is a Kanocc::List and that rhs.length > 1
  #
  # Sets @elements to the elements of @rhs[0] followed by the remaining
  # members of @rhs. With strip_separator, @rhs[1] is left out.
  def collect(strip_separator = false)
    first_new = strip_separator ? 2 : 1
    @elements = @rhs[0].elements
    @elements = @elements + @rhs[first_new..@rhs.length]
  end
end
168
+
169
#
# The nonterminal used in rules to mark where error recovery may take
# place. Its text is, for now, a fixed placeholder.
#
class Error < Nonterminal
  attr_reader :text

  def initialize
    super()
    @text = "FIXME"
  end
end
176
+ end