rltk 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/rltk/cfg.rb ADDED
@@ -0,0 +1,491 @@
1
+ # Author: Chris Wailes <chris.wailes@gmail.com>
2
+ # Project: Ruby Language Toolkit
3
+ # Date: 2011/03/24
4
+ # Description: This file contains a class representing a context-free
5
+ # grammar.
6
+
7
+ # encoding: utf-8
8
+
9
+ ############
10
+ # Requires #
11
+ ############
12
+
13
+ # Ruby Language Toolkit
14
+ require 'rltk/lexers/ebnf'
15
+
16
+ #######################
17
+ # Classes and Modules #
18
+ #######################
19
+
20
+ module RLTK # :nodoc:
21
+
22
# An exception class that represents a problem with a context-free
# grammar's definition.
#
# It derives from StandardError rather than Exception so that an ordinary
# +rescue+ clause (which only handles StandardError and its descendants)
# is able to catch it. Code that rescues GrammarError or Exception by name
# keeps working unchanged.
class GrammarError < StandardError; end
25
+
26
+ # The CFG class is used to represent context-free grammars. It is used by
27
+ # the RLTK::Parser class to represent the parser's grammar, but can also be
28
+ # used to manipulate arbitrary CFGs.
29
+ class CFG
30
+
31
+ # The start symbol for the grammar.
32
+ attr_reader :start_symbol
33
+
34
+ # The current left-hand side symbol. This is used by the
35
+ # CFG.production method to wrap CFG.clause calls.
36
+ attr_accessor :curr_lhs
37
+
38
+ #################
39
+ # Class Methods #
40
+ #################
41
+
42
# Reports whether _sym_ names a terminal symbol, as used by the CFG class.
# A symbol is treated as a terminal when its string form is unchanged by
# upcasing, so names without any cased characters also qualify. A nil or
# false _sym_ is handed back unchanged.
def self.is_terminal?(sym)
  return sym unless sym

  name = sym.to_s
  name == name.upcase
end
47
+
48
# Reports whether _sym_ names a non-terminal symbol, as used by the CFG
# class. A symbol is treated as a non-terminal when its string form is
# unchanged by downcasing, so names without any cased characters also
# qualify. A nil or false _sym_ is handed back unchanged.
def self.is_nonterminal?(sym)
  return sym unless sym

  name = sym.to_s
  name == name.downcase
end
53
+
54
+ ####################
55
+ # Instance Methods #
56
+ ####################
57
+
58
# Builds an empty grammar. The optional block _callback_ is stored and is
# later invoked whenever expanding an EBNF operator generates a new
# production; when no block is given a no-op proc is stored instead.
def initialize(&callback)
  @curr_lhs           = nil
  @callback           = callback || Proc.new {}
  @lexer              = Lexers::EBNF.new
  @production_counter = -1 # next_id increments before returning, so IDs start at 0.
  @start_symbol       = nil
  @wrapper_symbol     = nil

  # Productions indexed by ID and grouped by left-hand side symbol, plus
  # the buffer that collects the clauses of the production being defined.
  @productions_id    = {}
  @productions_sym   = Hash.new { |table, lhs| table[lhs] = [] }
  @production_buffer = []

  # Symbol tables. :EOS is registered as a terminal from the start.
  @terms    = Hash.new(false).merge!(:EOS => true)
  @nonterms = Hash.new(false)

  # Memoized First and Follow sets.
  @firsts  = {}
  @follows = Hash.new { |table, sym| table[sym] = [] }
end
79
+
80
# Registers _production_ in both lookup tables: under its ID, and in the
# list kept for its left-hand side symbol. Returns that list.
def add_production(production)
  @productions_id[production.id] = production
  @productions_sym[production.lhs] << production
end
84
+
85
# Replaces the EBNF callback with the given block. Calling this without a
# block installs a no-op proc. Returns the proc that was stored.
def callback(&callback)
  @callback = callback.nil? ? Proc.new {} : callback
end
89
+
90
# Defines one production for the left-hand side currently set by
# CFG.production, using _expression_ as its right-hand side. This method
# MUST be called inside a CFG.production block; otherwise a GrammarError is
# raised.
#
# The expression is tokenized by the EBNF lexer. Every terminal and
# non-terminal token is recorded in the matching symbol table, and a symbol
# directly followed by a ?, * or + operator token is replaced by the helper
# symbol returned from get_question, get_star or get_plus. All other token
# types are skipped. The first left-hand side ever seen becomes the start
# symbol if none is set.
#
# Returns the newly created Production.
def clause(expression)
  unless @curr_lhs
    raise GrammarError, 'CFG.clause called outside of CFG.production block.'
  end

  lhs    = @curr_lhs.to_sym
  rhs    = []
  tokens = @lexer.lex(expression)

  # Set this as the start symbol if there isn't one already defined.
  @start_symbol ||= lhs

  tokens.each_with_index do |token, index|
    kind  = token.type
    value = token.value

    # Only grammar symbols contribute to the right-hand side.
    next unless kind == :TERM || kind == :NONTERM

    # Add this symbol to the correct collection.
    table = kind == :TERM ? @terms : @nonterms
    table[value] = true

    # Peek at the following token (if any) to detect an EBNF operator.
    operator = index + 1 < tokens.length ? tokens[index + 1].type : nil

    rhs <<
      case operator
      when :'?' then get_question(value)
      when :'*' then get_star(value)
      when :'+' then get_plus(value)
      else value
      end
  end

  # Make the production and remember it for the enclosing CFG.production.
  production = Production.new(next_id, lhs, rhs)
  @production_buffer << production

  # Make sure the production symbol is collected.
  @nonterms[lhs] = true

  # Add the new production to our collections.
  add_production(production)

  production
end
152
+
153
# Returns the _first_ set for _sentence_, which may be a single symbol or
# an array of symbols.
#
# A single symbol is delegated to first_set_prime. For an array, the First
# sets of the symbols are merged from left to right, stopping at the first
# symbol whose set lacks the empty string; the empty string is included in
# the result only when every symbol can derive it (which is trivially the
# case for an empty array). Returns nil when the array contains a symbol
# the grammar does not know.
def first_set(sentence)
  return first_set_prime(sentence) if sentence.is_a?(Symbol)
  return nil unless sentence.all? { |sym| symbols.include?(sym) }

  collected = []
  nullable  = true

  sentence.each do |sym|
    firsts     = first_set(sym)
    collected |= firsts - [:'ɛ']
    nullable   = firsts.include?(:'ɛ')

    break unless nullable
  end

  nullable ? collected + [:'ɛ'] : collected
end
174
+
175
# Calculates the _first_ set of the single symbol _sym0_. CFG.first_set
# wraps this method to add support for whole sentences.
#
# _seen_lh_sides_ lists symbols already being expanded and is used to stop
# recursion through recursive rules; note that the same array is appended
# to and passed down to the recursive calls. Results are memoized per
# symbol. Returns nil when _sym0_ is not a symbol of this grammar.
def first_set_prime(sym0, seen_lh_sides = [])
  return nil unless symbols.include?(sym0)

  # Memoize the result for later.
  @firsts[sym0] ||=
    if CFG::is_terminal?(sym0)
      # A terminal is the only member of its own First set.
      [sym0]
    else
      set0 = []

      @productions_sym[sym0].each do |production|
        if production.rhs.empty?
          # An empty production contributes the empty string.
          set0 << :'ɛ'
        else
          all_have_empty = true

          production.rhs.each do |sym1|
            set1 = []

            # Grab the First set for the current symbol in this
            # production, unless it is already being expanded.
            unless seen_lh_sides.include?(sym1)
              set1  = first_set_prime(sym1, seen_lh_sides << sym1)
              set0 |= set1 - [:'ɛ']
            end

            all_have_empty = set1.include?(:'ɛ')
            break unless all_have_empty
          end

          # Every symbol of this production can reduce to the empty
          # string, so the production itself can too.
          set0 << :'ɛ' if all_have_empty
        end
      end

      set0.uniq
    end
end
224
+
225
# Returns the _follow_ set for the symbol _sym0_. Symbols that are not
# registered non-terminals yield an empty array.
#
# _seen_lh_sides_ lists left-hand sides already being expanded and is used
# to avoid infinite recursion when mutually recursive rules are
# encountered; the same array is appended to and passed to the recursive
# calls. A result is memoized unless it is empty and was computed inside
# such a recursion.
def follow_set(sym0, seen_lh_sides = [])
  # Use the memoized set if possible.
  return @follows[sym0] if @follows.has_key?(sym0)
  return [] unless @nonterms[sym0]

  set0 = []

  # Add EOS to the start symbol's follow set.
  set0 << :EOS if sym0 == @start_symbol

  @productions_id.values.each do |production|
    lhs = production.lhs
    rhs = production.rhs

    rhs.each_with_index do |sym1, i|
      # Only occurrences of sym0 on a right-hand side matter.
      next unless sym0 == sym1

      if i + 1 < rhs.length
        # Something follows sym0 here: take the First set of the rest.
        set1  = first_set(rhs[(i + 1)..-1])
        set0 |= set1 - [:'ɛ']

        # If the rest can vanish, whatever follows the left-hand side
        # can follow sym0 as well.
        set0 |= follow_set(lhs) if set1.include?(:'ɛ')
      elsif sym0 != lhs && !seen_lh_sides.include?(lhs)
        # sym0 ends the production: inherit the left-hand side's set.
        set0 |= follow_set(lhs, seen_lh_sides << lhs)
      end
    end
  end

  if seen_lh_sides.empty? || !set0.empty?
    # Memoize the result for later.
    @follows[sym0] |= set0
  else
    set0
  end
end
263
+
264
# Builds the productions used to eliminate the + EBNF operator applied to
# _symbol_ and returns the helper non-terminal that stands for "one or
# more _symbol_". The helper's productions are only created the first time
# it is requested:
#
#   token_plus: token | token token_plus
#
# The callback is invoked once per generated production.
#
# NOTE(review): the helper name is derived from the downcased symbol, so a
# terminal and a non-terminal differing only in case share one helper.
def get_plus(symbol)
  new_symbol = :"#{symbol.to_s.downcase}_plus"

  # Nothing to do if the helper productions already exist.
  return new_symbol if @productions_sym.has_key?(new_symbol)

  [[:first, [symbol]], [:second, [symbol, new_symbol]]].each do |position, rhs|
    production = Production.new(next_id, new_symbol, rhs)
    add_production(production)
    @callback.call(production, :+, position)
  end

  # Add the new symbol to the list of nonterminals.
  @nonterms[new_symbol] = true

  new_symbol
end
287
+
288
# Builds the productions used to eliminate the ? EBNF operator applied to
# _symbol_ and returns the helper non-terminal that stands for "zero or
# one _symbol_". The helper's productions are only created the first time
# it is requested:
#
#   nonterm_question: | nonterm
#
# The callback is invoked once per generated production.
#
# NOTE(review): the helper name is derived from the downcased symbol, so a
# terminal and a non-terminal differing only in case share one helper.
def get_question(symbol)
  new_symbol = :"#{symbol.to_s.downcase}_question"

  # Nothing to do if the helper productions already exist.
  return new_symbol if @productions_sym.has_key?(new_symbol)

  # The empty alternative comes first, then the single-symbol one.
  { :first => [], :second => [symbol] }.each do |position, rhs|
    production = Production.new(next_id, new_symbol, rhs)
    add_production(production)
    @callback.call(production, :'?', position)
  end

  # Add the new symbol to the list of nonterminals.
  @nonterms[new_symbol] = true

  new_symbol
end
311
+
312
# Builds the productions used to eliminate the * EBNF operator applied to
# _symbol_ and returns the helper non-terminal that stands for "zero or
# more _symbol_". The helper's productions are only created the first time
# it is requested:
#
#   token_star: | token token_star
#
# The callback is invoked once per generated production.
#
# NOTE(review): the helper name is derived from the downcased symbol, so a
# terminal and a non-terminal differing only in case share one helper.
def get_star(symbol)
  new_symbol = :"#{symbol.to_s.downcase}_star"

  unless @productions_sym.has_key?(new_symbol)
    # 1st (empty) production.
    empty = Production.new(next_id, new_symbol, [])
    add_production(empty)
    @callback.call(empty, :*, :first)

    # 2nd (recursive) production.
    repeat = Production.new(next_id, new_symbol, [symbol, new_symbol])
    add_production(repeat)
    @callback.call(repeat, :*, :second)

    # Add the new symbol to the list of nonterminals.
    @nonterms[new_symbol] = true
  end

  new_symbol
end
335
+
336
# Advances the production counter by one and returns the new value, which
# serves as the ID of the next production to be defined.
def next_id
  @production_counter = @production_counter.succ
end
340
+
341
# Returns an array of all the non-terminal symbols recorded for the
# grammar's definition.
def nonterms
  @nonterms.each_key.to_a
end
346
+
347
# Builds one or more productions with the left-hand side value of _symbol_.
# If _expression_ is specified it is taken as the right-hand side of a
# single production. If _expression_ is nil then _block_ is evaluated in
# the context of this object and is expected to make one or more calls to
# CFG.clause.
#
# Returns a copy of the list of productions collected during this call.
#
# The current left-hand side is cleared in an +ensure+ clause, so an
# exception raised by a clause or by the block cannot leave the grammar
# looking as if it were still inside a production.
def production(symbol, expression = nil, &block)
  @production_buffer = []
  @curr_lhs          = symbol

  if expression
    clause(expression)
  else
    instance_exec(&block)
  end

  @production_buffer.clone
ensure
  @curr_lhs = nil
end
364
+
365
# Returns the grammar's productions. If _by_ is :sym the result is a hash
# keyed by left-hand side symbol whose values are arrays of productions.
# If _by_ is :id the result is a hash keyed by production ID. Any other
# value of _by_ yields nil.
def productions(by = :sym)
  case by
  when :sym then @productions_sym
  when :id  then @productions_id
  end
end
378
+
379
# Sets the start symbol for this grammar to _symbol_ and returns it.
# Raises a GrammarError when _symbol_ is not a non-terminal according to
# CFG.is_nonterminal?.
def start(symbol)
  raise GrammarError, 'Start symbol must be a non-terminal.' unless CFG::is_nonterminal?(symbol)

  @start_symbol = symbol
end
387
+
388
# Returns a new array holding every symbol encountered in the grammar's
# definition: the terminals first, followed by the non-terminals.
def symbols
  [*terms, *nonterms]
end
392
+
393
# Returns an array of all the terminal symbols recorded for the grammar's
# definition.
def terms
  @terms.each_key.to_a
end
398
+
399
# Oddly enough, the Production class represents a production in a
# context-free grammar: an ID, a left-hand side symbol, and a right-hand
# side given as an array of symbols.
class Production
  attr_reader :id
  attr_reader :lhs
  attr_reader :rhs

  # Instantiates a new Production object with the specified ID, and
  # left- and right-hand sides.
  def initialize(id, lhs, rhs)
    @id  = id
    @lhs = lhs
    @rhs = rhs
  end

  # Compares one production to another. Returns true only if the left-
  # and right-hand sides match; the ID is not considered. Objects that
  # do not expose +lhs+ and +rhs+ (nil, for example) are simply unequal
  # instead of causing a NoMethodError.
  def ==(other)
    return false unless other.respond_to?(:lhs) && other.respond_to?(:rhs)

    lhs == other.lhs && rhs == other.rhs
  end

  # Makes a new copy of the production. The right-hand side array is
  # cloned so the copy can be modified independently.
  def copy
    Production.new(@id, @lhs, @rhs.clone)
  end

  # Locates the last terminal in the right-hand side of a production.
  # Returns nil when the right-hand side contains no terminal.
  def last_terminal
    # Scan from the end so the search stops at the first match.
    @rhs.reverse_each.find { |sym| CFG::is_terminal?(sym) }
  end

  # Returns a new Item based on this production, with its dot at index 0.
  def to_item
    Item.new(0, @id, @lhs, @rhs)
  end

  # Returns a string representation of this production. The left-hand
  # side is left-justified in a field _padding_ characters wide.
  def to_s(padding = 0)
    "#{format("%-#{padding}s", @lhs)} -> #{@rhs.map(&:to_s).join(' ')}"
  end
end
441
+
442
# The Item class represents a CFG production with a dot in it. The dot
# marks a position inside the right-hand side.
class Item < Production
  attr_reader :dot

  # Instantiates a new Item object with a dot located before the symbol
  # at index _dot_ of the right-hand side. The remaining arguments
  # (_args_) should be as specified by Production.initialize.
  def initialize(dot, *args)
    super(*args)

    # The dot indicates the NEXT symbol to be read.
    @dot = dot
  end

  # Compares two items. Returns true only if the dot position and the
  # left- and right-hand sides all match; the ID is not considered.
  # Objects that do not expose +dot+, +lhs+ and +rhs+ (a plain
  # Production or nil, for example) are simply unequal instead of
  # causing a NoMethodError.
  def ==(other)
    return false unless other.respond_to?(:dot) && other.respond_to?(:lhs) && other.respond_to?(:rhs)

    dot == other.dot && lhs == other.lhs && rhs == other.rhs
  end

  # Moves the item's dot forward by one if the end of the right-hand
  # side hasn't already been reached. Returns the new dot position, or
  # nil when the dot was not moved.
  def advance
    @dot += 1 if @dot < @rhs.length
  end

  # Tests to see if the dot is at the end of the right-hand side.
  def at_end?
    @dot == @rhs.length
  end

  # Produces a new copy of this item. The right-hand side array is
  # cloned so the copy can be modified independently.
  def copy
    Item.new(@dot, @id, @lhs, @rhs.clone)
  end

  # Returns the symbol located after the dot, or nil when the dot is at
  # the end of the right-hand side.
  def next_symbol
    @rhs[@dot]
  end

  # Returns a string representation of this item, with '·' inserted at
  # the dot position. The left-hand side is left-justified in a field
  # _padding_ characters wide.
  def to_s(padding = 0)
    "#{format("%-#{padding}s", @lhs)} -> #{@rhs.map(&:to_s).insert(@dot, '·').join(' ')}"
  end
end
490
+ end
491
+ end