kanocc 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,322 @@
1
+ #
2
+ # Copyright 2008 Christian Surlykke
3
+ #
4
+ # This file is part of Kanocc.
5
+ #
6
+ # Kanocc is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License, version 3
8
+ # as published by the Free Software Foundation.
9
+ #
10
+ # Kanocc is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License, version 3 for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License,
16
+ # version 3 along with Kanocc. If not, see <http://www.gnu.org/licenses/>.
17
+ #
18
+ require 'kanocc/grammar_rule'
19
+ require 'kanocc/token'
20
+ require 'logger'
21
+ module Kanocc
22
+ #
23
+ # Parser for Kanocc based on Earley's algorithm. For a description see:
24
+ # Alfred V. Aho, Jeffrey D. Ullman, The Theory of Parsing, Translation and Compiling,
25
+ # or try a web search engine of your choice with 'Earley parsing'
26
+ #
27
+ # Earley's parser will parse according to any context-free grammar using O(n*n*n) time
28
+ # and O(n*n) space, n being the length of input. If the grammar is unambiguous time/space
29
+ # complexity is O(n*n)/O(n*n).
30
+ # As of yet (version 0.1) the implementation is surely not optimal,
31
+ # so time/space complexity is probably worse.
32
+ #
33
+ # Christian Surlykke 2007.
34
+ #
35
class EarleyParser
  attr_accessor :kanocc, :logger

  # Rule "Error --> (nothing)". An item for this rule is injected into an
  # item list when #consume attempts error recovery.
  ErrorRule = GrammarRule.new(Error, [], nil)

  #
  # kanocc::  object receiving the report_reduction / report_token calls
  #           made while translating a successful parse.
  # options:: :logger - logger to write trace output to. When omitted a
  #           Logger on $stderr is created, limited to warnings so that the
  #           debug/info traces emitted while parsing stay silent.
  #
  def initialize(kanocc, options = {})
    @kanocc = kanocc
    @logger = options[:logger] || default_logger
  end

  #
  # Sets up the parser, creating itemlist 0.
  #
  # NOTE(review): recovery points are not computed for itemlist 0 here,
  # although #prepare preserves a recovery point at position 0 - confirm
  # whether that is intended.
  #
  def startsymbol=(startSymbol)
    @start_symbol = startSymbol
    @itemLists = [ItemList.new(nil, 0)]
    @inputPos = 0
    @recoveryPoints = []
    @itemLists[0].add_all(@start_symbol.rules.map { |rule| Item.new(rule, 0) })
    predict_and_complete(0)
  end

  #
  # Resets the parser for a new input: everything but itemlist 0 is
  # discarded, and only a recovery point at position 0 (if any) is kept.
  #
  def prepare
    @itemLists = @itemLists[0..0]
    @inputPos = 0
    @recoveryPoints = @recoveryPoints.first == 0 ? [0] : []
    log(:info) { "Itemlist 0:\n" + @itemLists[0].inspect }
  end

  #
  # For each match in token_match[:matches], advances the items of the
  # previous itemlist that expect the matched token (or literal) and adds
  # them to the current itemlist.
  #
  def scan(token_match)
    previous = @itemLists[@inputPos - 1]
    current = @itemLists[@inputPos]
    token_match[:matches].each do |match|
      symbol = match[:token] || match[:literal]
      current.add_all(previous.find_matching(symbol).map { |item| item.move })
    end
  end

  #
  # Repeats the predict and complete steps on itemlist pos until the list
  # stops growing.
  #
  def predict_and_complete(pos)
    item_list = @itemLists[pos]
    prev_size = 0
    while prev_size < item_list.size
      prev_size = item_list.size
      item_list.each do |item|
        if item.rule.rhs.length <= item.dot
          # complete: advance every item that was waiting for this lhs in
          # the itemlist where the completed item originated
          waiting = @itemLists[item.j].find_matching(item.rule.lhs)
          item_list.add_all(waiting.map { |parent| parent.move })
        elsif (nont = item.rule.rhs[item.dot]).respond_to?(:rules)
          # predict: the new items originate in the list being processed,
          # which is not necessarily the current input position
          item_list.add_all(nont.rules.map { |rule| Item.new(rule, pos) })
        end
      end
    end
  end

  #
  # Registers pos as a recovery point if some item in itemlist pos has
  # Error as the symbol after its dot.
  #
  def add_recovery_points(pos)
    return if @recoveryPoints[-1] == pos

    expecting_error = @itemLists[pos].find_all do |item|
      Error == item.rule.rhs[item.dot]
    end
    @recoveryPoints.push(pos) unless expecting_error.empty?
  end

  #
  # Consume and parse next input symbol
  #
  def consume(token_match)
    @inputPos += 1
    @itemLists.push(ItemList.new(token_match, @inputPos))

    # scan, predict and complete until no more can be added
    scan(token_match)
    recover(token_match) if @itemLists[@inputPos].size == 0

    predict_and_complete(@inputPos)
    add_recovery_points(@inputPos)
    log(:info) { "Itemlist #{@inputPos}:\n" + @itemLists[@inputPos].inspect }
  end

  #
  # Signal to the parser that end of input is reached
  #
  # Translates the best completed start-symbol item spanning the whole
  # input; raises KanoccException when there is none.
  #
  def eof
    spanning = find_full_items(@start_symbol, @inputPos).find_all { |item| item.j == 0 }
    top_item = spanning.max
    raise(KanoccException, "It didn't parse") unless top_item

    translate(top_item, @inputPos)
  end

  #
  # Reports element to @kanocc. For an Item everything its right hand side
  # derives is reported first, followed by the reduction itself. Anything
  # else is reported as a token together with the input symbol of
  # itemlist pos.
  #
  def translate(element, pos)
    log(:debug) { "translate: " + element.inspect + " on " + pos.inspect }
    if element.class == Item
      translate_helper(element, pos)
      @kanocc.report_reduction(element.rule)
    else # It's a token or a string
      @kanocc.report_token(@itemLists[pos].inputSymbol, element)
    end
  end

  #
  # Translates the symbols to the left of the dot in item (found in
  # itemlist pos), leftmost symbol first.
  #
  def translate_helper(item, pos)
    log(:debug) { "translateHelper: " + item.inspect + " on " + pos.inspect }
    return if item.dot == 0

    symbol = item.rule.rhs[item.dot - 1]
    if symbol.respond_to?("rules")
      # Assume item is of form [A --> aB*c, k] in itemlist i
      # Must then find item of form [B --> x*, j] in itemlist i so
      # that there exists item of form [A --> a*Bc, k] on itemlist j

      # First: Items of form [B --> x*, j] on list i
      # Then: Those for which item of form [A --> a*Bc, k] exists
      # on list j
      candidates = find_full_items(symbol, pos).find_all do |candidate|
        @itemLists[candidate.j].find_item(item.rule, item.dot - 1, item.j)
      end

      # Precedence: We pick the possibility with the highest precedence
      sub_item = candidates.max
      unless sub_item
        raise(KanoccException, "No derivation found for #{symbol.inspect} in itemlist #{pos}")
      end
      prev_list = sub_item.j
      prev_item = @itemLists[prev_list].find_item(item.rule, item.dot - 1, item.j)
    else
      prev_list = pos - 1
      prev_item = @itemLists[prev_list].find_item(item.rule, item.dot - 1, item.j)
      sub_item = symbol
    end
    translate_helper(prev_item, prev_list)
    translate(sub_item, pos)
  end

  #
  # Returns the items in itemlist inputPos whose rule has nonterminal as
  # left hand side and whose dot has reached the end of the rule.
  #
  def find_full_items(nonterminal, inputPos)
    @itemLists[inputPos].find_all do |item|
      item.rule.lhs == nonterminal and item.dot >= item.rule.rhs.length
    end
  end

  private

  # Logger used when the caller supplies none.
  def default_logger
    fallback = Logger.new($stderr)
    fallback.level = Logger::WARN
    fallback
  end

  # Sends the message produced by the block to the logger at the given
  # level. Nothing is built or sent when the logger has been set to nil.
  def log(level)
    @logger.send(level, yield) if @logger
  end

  # Error recovery: starting from the most recent recovery point, inserts
  # an Error item into the previous itemlist and rescans token_match,
  # stopping as soon as the current itemlist becomes non-empty.
  def recover(token_match)
    log(:debug) { "Found no items matching #{token_match} in itemlist #{@inputPos - 1}" }
    log(:debug) { "@recoveryPoints = " + @recoveryPoints.inspect }
    @recoveryPoints.reverse_each do |point|
      next unless point < @inputPos

      @itemLists[@inputPos - 1].add(Item.new(ErrorRule, point))
      predict_and_complete(@inputPos - 1)
      scan(token_match)
      break if @itemLists[@inputPos].size > 0
    end
  end
end
199
+
200
#
# The set of Earley items belonging to one input position, together with
# the input symbol that was consumed to reach that position.
#
class ItemList
  attr_reader :inputSymbol
  attr_accessor :items

  def initialize(inputSymbol, inputPos)
    @inputSymbol = inputSymbol
    @inputPos = inputPos
    # Items are stored as hash keys so that adding one twice has no effect.
    @items = {}
  end

  # Returns a clone of this list that has its own item hash.
  def copy
    duplicate = clone
    duplicate.items = @items.clone
    duplicate
  end

  # Number of items in the list.
  def size
    @items.size
  end

  # Returns the items for which the given block is true.
  def find_all(&b)
    @items.keys.select(&b)
  end

  # Returns the item with the given rule, dot position and origin, or nil.
  def find_item(rule, dot, j)
    @items.keys.detect do |candidate|
      candidate.rule == rule && candidate.dot == dot && candidate.j == j
    end
  end

  # Yields every item returned by find_matching(inputSymbol).
  def each_matching(inputSymbol)
    find_matching(inputSymbol).each { |item| yield(item) }
  end

  # Returns the items whose symbol after the dot is matched by inputSymbol,
  # either through === or through ==.
  def find_matching(inputSymbol)
    @items.keys.select do |item|
      expected = item.symbol_after_dot
      inputSymbol === expected || inputSymbol == expected
    end
  end

  # True if item is in the list, nil otherwise.
  def contains(item)
    @items[item]
  end

  # Adds a single item.
  def add(item)
    @items[item] = true
  end

  # Adds every item of the given collection.
  def add_all(items)
    items.each { |item| @items[item] = true }
  end

  # Yields each item. Iterates over a snapshot of the items, so the list
  # may be added to from within the block.
  def each
    @items.keys.each { |item| yield item }
  end

  def inspect
    listing = @items.keys.map { |item| item.inspect }.join("\n ")
    "[#{@inputSymbol.inspect}\n #{listing}]\n"
  end
end
267
+
268
+
269
#
# An Earley item: a rule, the position of the dot within the rule's right
# hand side, and the index j of the itemlist in which the item originated.
#
class Item
  attr_reader :rule, :j, :dot

  # All items created so far, keyed by [rule, j, dot].
  @@items = Hash.new

  # Returns the existing item for (rule, j, dot) when there is one, so
  # that equal items are always the same object; otherwise creates it.
  def self.new(rule, j, dot = 0)
    key = [rule, j, dot]
    @@items[key] ||= super(rule, j, dot)
  end

  def initialize(rule, j, dot = 0)
    @rule = rule
    @j = j
    @dot = dot
  end

  # The grammar symbol right after the dot, nil when the dot is at the end.
  def symbol_after_dot
    @rule.rhs[@dot] if @dot < @rule.rhs.size
  end

  # The item with the dot advanced one symbol.
  def move
    Item.new(@rule, @j, @dot + 1)
  end

  def inspect
    before_dot = @rule.rhs[0...dot]
    after_dot = @rule.rhs[dot..-1]
    symbols = before_dot + [Dot.new] + after_dot
    "[" + @rule.lhs.inspect + " --> " +
      symbols.map { |symbol| symbol.inspect }.join(" ") +
      " ; " + @j.to_s + "]"
  end

  # Orders items by rule precedence. Ties are broken by operator precedence
  # (compared in reverse, and only when both rules have one) and finally by
  # origin.
  def <=>(other)
    by_prec = @rule.prec <=> other.rule.prec
    return by_prec unless by_prec == 0

    mine = @rule.operator_prec
    theirs = mine && other.rule.operator_prec
    if mine && theirs
      by_operator = theirs <=> mine
      return by_operator unless by_operator == 0
    end
    @j <=> other.j
  end
end
315
+
316
# Marker object inserted at the dot position when an Item is inspected;
# it has no other purpose than to print as "*".
class Dot
  def inspect
    "*"
  end
end
322
+ end
@@ -0,0 +1,50 @@
1
+ #
2
+ # Copyright 2008 Christian Surlykke
3
+ #
4
+ # This file is part of Kanocc.
5
+ #
6
+ # Kanocc is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License, version 3
8
+ # as published by the Free Software Foundation.
9
+ #
10
+ # Kanocc is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License, version 3 for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License,
16
+ # version 3 along with Kanocc. If not, see <http://www.gnu.org/licenses/>.
17
+ #
18
+ module Kanocc
19
#
# A single production lhs ::= rhs, together with the name of the method to
# invoke on reduction and the precedence used to choose between
# alternative derivations.
#
class GrammarRule
  attr_reader :lhs, :rhs, :method
  attr_accessor :prec

  #
  # lhs::    the nonterminal on the left hand side of the rule
  # rhs::    array of grammar symbols forming the right hand side
  # method:: name of the method tied to this rule, or nil
  #
  # The operator precedence is deliberately not looked up here: it is
  # resolved against lhs on first use (see #operator_prec).
  #
  def initialize(lhs, rhs, method)
    @lhs = lhs
    @rhs = rhs
    @method = method
    @prec = 0
  end

  #
  # Precedence of the first String or Token in rhs, as given by
  # lhs.operator_precedence; nil when rhs contains no such symbol.
  # The value is computed on the first call and cached thereafter.
  #
  def operator_prec
    unless @operator_prec_calculated
      operator = rhs.find { |s| s.is_a?(String) or s.is_a?(Token) }
      @operator_prec = lhs.operator_precedence(operator) if operator
      @operator_prec_calculated = true
    end
    @operator_prec
  end

  def inspect
    lhs.inspect + " ::= " + rhs.map { |gs| gs.inspect }.join(" ")
  end
end
50
+ end
@@ -0,0 +1,176 @@
1
+ #
2
+ # Copyright 2008 Christian Surlykke
3
+ #
4
+ # This file is part of Kanocc.
5
+ #
6
+ # Kanocc is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License, version 3
8
+ # as published by the Free Software Foundation.
9
+ #
10
+ # Kanocc is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License, version 3 for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License,
16
+ # version 3 along with Kanocc. If not, see <http://www.gnu.org/licenses/>.
17
+ #
18
+ require 'kanocc/grammar_rule'
19
+ module Kanocc
20
#
# Base class of all nonterminals. A grammar is written by subclassing
# Nonterminal and declaring rules with the class methods below. All
# bookkeeping is kept in class variables shared by the whole hierarchy
# and keyed by the declaring class.
#
class Nonterminal
  @@rules = Hash.new
  @@last_rule = Hash.new
  @@derives_right = Hash.new
  @@operator_precedence = Hash.new
  @@method_names = Hash.new
  @@listClassNumber = 0

  Left = 1
  Right = 2

  # Marks this nonterminal as deriving right.
  def self.derives_right
    @@derives_right[self] = true
  end

  # True if derives_right has been called for this nonterminal.
  def self.derives_right?
    @@derives_right[self]
  end

  #
  # Assigns precedence to operator within this nonterminal.
  #
  # operator::   a String, a Token, or a non-empty array of those
  # precedence:: an integer
  #
  # Raises RuntimeError if either argument is of the wrong kind.
  #
  def self.set_operator_precedence(operator, precedence)
    # Integer rather than Fixnum: Fixnum no longer exists in current Ruby
    raise "Precedence must be an integer" unless precedence.is_a?(Integer)

    @@operator_precedence[self] ||= Hash.new
    if is_an_operator?(operator)
      @@operator_precedence[self][operator] = precedence
    elsif is_an_array_of_operators(operator)
      operator.each { |o| @@operator_precedence[self][o] = precedence }
    else
      raise "Operator must be a string, a token or an array of those"
    end
  end

  # Precedence assigned to operator for this nonterminal; 0 when none is.
  def self.operator_precedence(operator)
    (@@operator_precedence[self] and @@operator_precedence[self][operator]) or 0
  end

  # True for a non-empty array consisting solely of operators.
  def self.is_an_array_of_operators(arr)
    arr.is_a?(Array) and !arr.empty? and arr.all? { |o| is_an_operator?(o) }
  end

  # True if operator is a String or a Token.
  def self.is_an_operator?(operator)
    operator.is_a?(String) or operator.is_a?(Token)
  end

  # The rules declared for this nonterminal, in declaration order.
  def self.rules
    @@rules[self] || []
  end

  # Appends rule to this nonterminal and remembers it as the rule a
  # following call to prec applies to.
  def self.add_rule(rule)
    @@rules[self] ||= []
    @@rules[self].push(rule)
    @@last_rule[self] = rule
  end

  # True if x may appear on the right hand side of a rule.
  def self.is_a_grammarsymbol?(x)
    x.is_a?(String) or (x.respond_to?("is_a_kanocc_grammarsymbol?") and x.is_a_kanocc_grammarsymbol?)
  end

  def self.is_a_kanocc_grammarsymbol?
    true
  end

  #
  # Declares the rule self --> rhs. When a block is given it is defined as
  # an instance method under a generated name, and that name is recorded
  # in the rule.
  #
  # Raises RuntimeError if an element of rhs is not a grammar symbol.
  #
  def self.rule(*rhs, &block)
    rhs.each_with_index do |symbol, pos|
      next if is_a_grammarsymbol?(symbol)

      raise "Problem with rule: #{rhs.inspect}, element:#{pos.to_s} - #{symbol.inspect}\nElements of a rule must be Strings, Tokens or Nonterminals"
    end

    if block_given?
      method_name = generate_method_name(*rhs)
      define_method(method_name.to_sym, &block)
      add_rule(GrammarRule.new(self, rhs, method_name.to_sym))
    else
      add_rule(GrammarRule.new(self, rhs, nil))
    end
  end

  # Returns a new list nonterminal deriving zero or more repetitions of
  # symbols, separated by sep when sep is given.
  def self.zm(symbols, sep = nil)
    list_class = new_list_class
    list_class.rule() { @elements = [] }
    list_class.rule(om(symbols, sep)) { @elements = @rhs[0].elements }
    list_class
  end

  # Returns a new list nonterminal deriving one or more repetitions of
  # symbols, separated by sep when sep is given.
  def self.om(symbols, sep = nil)
    symbols = [symbols] unless symbols.is_a? Array
    list_class = new_list_class
    list_class.rule(*symbols) { @elements = @rhs }
    if sep
      list_class.rule(list_class, sep, *symbols) { @elements = @rhs[0].elements + @rhs[2..@rhs.length] }
    else
      list_class.rule(list_class, *symbols) { @elements = @rhs[0].elements + @rhs[1..@rhs.length] }
    end
    list_class
  end

  # Creates an anonymous subclass of List that inspects as "anonList_<n>".
  def self.new_list_class
    list_class = Class.new(List)
    @@listClassNumber += 1
    # Fix the label now; reading the counter from inside inspect would make
    # every list class report the most recently assigned number.
    label = "anonList_#{@@listClassNumber}"
    (class << list_class; self; end).send(:define_method, :inspect) { label }
    list_class
  end

  # Generates a method name, unique within this nonterminal, for the rule
  # with the given right hand side. Uniqueness is obtained by appending
  # spaces.
  def self.generate_method_name(*args)
    # Anonymous classes (such as generated list classes) have no name, so
    # fall back on inspect for those.
    owner = name
    owner = inspect if owner.nil? || owner.empty?
    method_name = owner + " --> " + args.map { |a| a.inspect }.join(' ')
    @@method_names[self] ||= []
    method_name += ' ' while @@method_names[self].member?(method_name)
    @@method_names[self].push(method_name)
    method_name
  end

  # Sets the precedence of the most recently declared rule to p.
  # Raises RuntimeError if no rule has been declared yet.
  def self.prec(p)
    raise "Call to prec not preceded by rule" unless @@last_rule[self]

    @@last_rule[self].prec = p
  end

  # Prints the generated method names of this nonterminal to stdout.
  def self.show_method_names
    @@method_names[self].each { |mn| puts mn.inspect } if @@method_names[self]
  end

  def inspect
    self.class.name
  end
end
152
+
153
+
154
#
# Base class of the list nonterminals; the collected items of a parsed
# list are available through #elements.
#
class List < Nonterminal
  attr_reader :elements

  protected

  # Sets @elements to the elements of the list in @rhs[0] followed by the
  # remaining entries of @rhs, leaving out @rhs[1] when strip_separator is
  # true.
  # Assumes @rhs[0] is a Kanocc::List and that rhs.length > 1
  def collect(strip_separator = false)
    @elements = @rhs[0].elements
    first_new = strip_separator ? 2 : 1
    @elements += @rhs[first_new..@rhs.length]
  end
end
168
+
169
#
# Nonterminal standing for erroneous input; writing it in a rule marks a
# place where the parser may attempt error recovery.
#
class Error < Nonterminal
  def initialize
    super()
    # Placeholder value only; nothing in this class fills in real text.
    @text = "FIXME"
  end

  attr_reader :text
end
176
+ end