rltk 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/rltk/cfg.rb ADDED
@@ -0,0 +1,491 @@
1
+ # Author: Chris Wailes <chris.wailes@gmail.com>
2
+ # Project: Ruby Language Toolkit
3
+ # Date: 2011/03/24
4
+ # Description: This file contains a class representing a context-free
5
+ # grammar.
6
+
7
+ # encoding: utf-8
8
+
9
+ ############
10
+ # Requires #
11
+ ############
12
+
13
+ # Ruby Language Toolkit
14
+ require 'rltk/lexers/ebnf'
15
+
16
+ #######################
17
+ # Classes and Modules #
18
+ #######################
19
+
20
+ module RLTK # :nodoc:
21
+
# An exception class that represents a problem with a context-free
# grammar's definition.
#
# It derives from StandardError (not Exception) so that a plain +rescue+
# clause, which only handles StandardError and its descendants, is able
# to catch it.
class GrammarError < StandardError; end
25
+
26
+ # The CFG class is used to represent context-free grammars. It is used by
27
+ # the RLTK::Parser class to represent the parser's grammar, but can also be
28
+ # used to manipulate arbitrary CFGs.
29
+ class CFG
30
+
31
+ # The start symbol for the grammar.
32
+ attr_reader :start_symbol
33
+
34
+ # The current left-hand side symbol. This is used by the
35
+ # CFG.production method to wrap CFG.clause calls.
36
+ attr_accessor :curr_lhs
37
+
38
+ #################
39
+ # Class Methods #
40
+ #################
41
+
# Tests to see if a symbol is a terminal symbol, as used by the CFG
# class. A symbol is a terminal when its string form is unchanged by
# upcasing. A nil or false argument is handed straight back.
def self.is_terminal?(sym)
  return sym unless sym

  name = sym.to_s
  name == name.upcase
end
47
+
# Tests to see if a symbol is a non-terminal symbol, as used by the
# CFG class. A symbol is a non-terminal when its string form is
# unchanged by downcasing. A nil or false argument is handed straight
# back.
def self.is_nonterminal?(sym)
  return sym unless sym

  name = sym.to_s
  name == name.downcase
end
53
+
54
+ ####################
55
+ # Instance Methods #
56
+ ####################
57
+
# Instantiates a new CFG object that uses _callback_ to inform the
# programmer of the generation of new productions due to EBNF
# operators. When no block is supplied a Proc that does nothing is
# installed in its place.
def initialize(&callback)
  @curr_lhs = nil
  @callback = callback.nil? ? Proc.new {} : callback
  @lexer    = Lexers::EBNF.new

  # Starts at -1 and is incremented before use.
  @production_counter = -1
  @start_symbol       = nil
  @wrapper_symbol     = nil

  # Productions indexed by ID, and grouped by left-hand side symbol.
  # Reading an unknown left-hand side creates an empty list for it.
  @productions_id    = {}
  @productions_sym   = Hash.new { |table, lhs| table[lhs] = [] }
  @production_buffer = []

  # Symbol registries. Unknown symbols read as false; :EOS is always
  # registered as a terminal.
  @terms = Hash.new(false)
  @terms[:EOS] = true
  @nonterms = Hash.new(false)

  # Memoization tables for the first and follow sets.
  @firsts  = {}
  @follows = Hash.new { |table, sym| table[sym] = [] }
end
79
+
# Adds _production_ to the appropriate internal data structures: it is
# stored under its ID and appended to the list of productions that
# share its left-hand side. Returns that list.
def add_production(production)
  @productions_id[production.id] = production

  siblings = @productions_sym[production.lhs]
  siblings << production
end
84
+
# Sets the EBNF callback to _callback_. Calling this method without a
# block installs a Proc that does nothing. Returns the installed Proc.
def callback(&callback)
  handler = callback
  handler = Proc.new {} if handler.nil?

  @callback = handler
end
89
+
# This function MUST be called inside a CFG.production block. It will
# make a new production with the left-hand side specified by the
# CFG.production call's argument. This is the function that is
# responsible for removing EBNF symbols from the grammar.
#
# _expression_ is handed to the lexer; the resulting :TERM and :NONTERM
# tokens form the right-hand side, and a symbol immediately followed by
# a ?, * or + token is replaced by the symbol that get_question,
# get_star or get_plus returns for it. Returns the new production.
# Raises a GrammarError when no left-hand side is currently set.
def clause(expression)
  unless @curr_lhs
    raise GrammarError, 'CFG.clause called outside of CFG.production block.'
  end

  lhs    = @curr_lhs.to_sym
  tokens = @lexer.lex(expression)

  # Set this as the start symbol if there isn't one already defined.
  @start_symbol ||= lhs

  rhs = []
  tokens.each_with_index do |token, index|
    kind = token.type

    # Only grammar symbols contribute to the right-hand side; operator
    # tokens are consumed while looking ahead from the symbol before
    # them.
    next unless [:TERM, :NONTERM].include?(kind)

    name = token.value

    # Add this symbol to the correct collection.
    registry = kind == :TERM ? @terms : @nonterms
    registry[name] = true

    # Look one token ahead for an EBNF operator.
    follower = tokens[index + 1]
    operator = follower ? follower.type : nil

    rhs <<
      case operator
      when :'?' then get_question(name)
      when :'*' then get_star(name)
      when :'+' then get_plus(name)
      else name
      end
  end

  # Make the production and remember it for the enclosing
  # CFG.production call.
  production = Production.new(next_id, lhs, rhs)
  @production_buffer << production

  # Make sure the production symbol is collected.
  @nonterms[lhs] = true

  # Add the new production to our collections.
  add_production(production)

  production
end
152
+
# Returns the _first_ set for _sentence_. _Sentence_ may be either a
# single symbol or an array of symbols.
#
# A single Symbol is delegated to first_set_prime. For an array, nil is
# returned if any element is not one of the grammar's symbols;
# otherwise the first sets of the elements are merged from left to
# right, stopping at the first element whose first set lacks the empty
# string (:'ɛ'). :'ɛ' is included in the result only when every element
# can derive the empty string, which is trivially the case for an empty
# array.
def first_set(sentence)
  return first_set_prime(sentence) if sentence.is_a?(Symbol)

  # Build the symbol list once instead of once per element.
  known = symbols
  return nil unless sentence.all? { |sym| known.include?(sym) }

  set0 = []
  all_have_empty = true

  sentence.each do |sym|
    set1 = first_set(sym)
    set0 |= set1 - [:'ɛ']

    # Later symbols only matter while everything before them can
    # vanish.
    all_have_empty = set1.include?(:'ɛ')
    break unless all_have_empty
  end

  all_have_empty ? set0 + [:'ɛ'] : set0
end
174
+
# This function is responsible for calculating the _first_ set of
# individual symbols. CFG.first_set is a wrapper around this function
# to provide support for calculating the _first_ set for sentences.
#
# Returns nil when _sym0_ is not one of the grammar's symbols. The
# first set of a terminal is the terminal itself. For a non-terminal
# the sets of every symbol reachable from it are computed together by
# iterating to a fixed point, so left-recursive and mutually recursive
# rules yield complete results. Every set computed this way is
# memoized, so productions added after a symbol has been queried are
# not reflected in its cached set.
#
# _seen_lh_sides_ is no longer consulted; it is retained only so that
# existing two-argument calls keep working.
def first_set_prime(sym0, seen_lh_sides = [])
  return nil unless symbols.include?(sym0)

  # Use the memoized set if possible.
  return @firsts[sym0] if @firsts[sym0]

  # Collect every symbol reachable from sym0 through right-hand sides.
  reachable = []
  pending   = [sym0]

  until pending.empty?
    sym = pending.shift
    next if reachable.include?(sym)

    reachable << sym
    next if CFG::is_terminal?(sym)

    # fetch avoids creating an empty entry for a symbol without
    # productions.
    @productions_sym.fetch(sym, []).each { |production| pending.concat(production.rhs) }
  end

  # A terminal is the only symbol in its own first set; non-terminals
  # start out empty and grow below.
  table = {}
  reachable.each { |sym| table[sym] = CFG::is_terminal?(sym) ? [sym] : [] }

  # Keep applying the productions until no set grows any further.
  changed = true
  while changed
    changed = false

    reachable.each do |lhs|
      next if CFG::is_terminal?(lhs)

      @productions_sym.fetch(lhs, []).each do |production|
        additions      = []
        all_have_empty = true

        production.rhs.each do |sym1|
          set1 = table[sym1]
          additions |= set1 - [:'ɛ']

          # Symbols further right only contribute while everything
          # before them can derive the empty string.
          all_have_empty = set1.include?(:'ɛ')
          break unless all_have_empty
        end

        # An empty right-hand side, or one made entirely of symbols that
        # can vanish, contributes the empty string.
        additions << :'ɛ' if all_have_empty

        merged = table[lhs] | additions
        next if merged.length == table[lhs].length

        table[lhs] = merged
        changed    = true
      end
    end
  end

  # Memoize the results for later.
  reachable.each { |sym| @firsts[sym] = table[sym] }

  @firsts[sym0]
end
224
+
# Returns the _follow_ set for a given symbol. Symbols that are not
# registered non-terminals have an empty follow set.
#
# The follow sets are computed by iterating over all productions until
# no set grows any further, so recursive and mutually recursive rules
# terminate and yield complete results. :EOS is placed in the start
# symbol's follow set. The sets of all registered non-terminals are
# memoized together, so productions added after the first query are not
# reflected in the cached sets.
#
# _seen_lh_sides_ is no longer consulted; it is retained only so that
# existing two-argument calls keep working.
def follow_set(sym0, seen_lh_sides = [])
  # Use the memoized set if possible.
  return @follows[sym0] if @follows.has_key?(sym0)

  return [] unless @nonterms[sym0]

  table = Hash.new { |h, k| h[k] = [] }

  # Add EOS to the start symbol's follow set.
  table[@start_symbol] << :EOS if @start_symbol

  changed = true
  while changed
    changed = false

    @productions_id.values.each do |production|
      rhs = production.rhs

      rhs.each_with_index do |sym1, i|
        next unless @nonterms[sym1]

        # Whatever can start the remainder of the right-hand side can
        # follow sym1. An empty remainder behaves like the empty string.
        rest     = rhs[(i + 1)..-1]
        trailing = rest.empty? ? [:'ɛ'] : first_set(rest)

        additions = trailing - [:'ɛ']

        # If the remainder can vanish, anything that follows the
        # left-hand side can follow sym1 as well.
        additions |= table[production.lhs] if trailing.include?(:'ɛ')

        merged = table[sym1] | additions
        next if merged.length == table[sym1].length

        table[sym1] = merged
        changed     = true
      end
    end
  end

  # Memoize the results for later.
  @nonterms.each_key { |nonterm| @follows[nonterm] = table[nonterm] }

  @follows[sym0]
end
263
+
# Builds productions used to eliminate the + EBNF operator. Returns the
# generated symbol, which is the downcased name of _symbol_ with
# '_plus' appended. The productions are only created the first time the
# generated symbol is requested.
def get_plus(symbol)
  new_symbol = "#{symbol.to_s.downcase}_plus".to_sym
  return new_symbol if @productions_sym.has_key?(new_symbol)

  # Add the items for the following productions:
  #
  # token_plus: token | token token_plus
  [[:first, [symbol]], [:second, [symbol, new_symbol]]].each do |position, rhs|
    production = Production.new(next_id, new_symbol, rhs)
    add_production(production)

    # Tell the programmer which production was generated.
    @callback.call(production, :+, position)
  end

  # Add the new symbol to the list of nonterminals.
  @nonterms[new_symbol] = true

  new_symbol
end
287
+
# Builds productions used to eliminate the ? EBNF operator. Returns the
# generated symbol, which is the downcased name of _symbol_ with
# '_question' appended. The productions are only created the first time
# the generated symbol is requested.
def get_question(symbol)
  new_symbol = "#{symbol.to_s.downcase}_question".to_sym
  return new_symbol if @productions_sym.has_key?(new_symbol)

  # Add the items for the following productions:
  #
  # nonterm_question: | nonterm
  #
  # The empty production comes first.
  [[:first, []], [:second, [symbol]]].each do |position, rhs|
    production = Production.new(next_id, new_symbol, rhs)
    add_production(production)

    # Tell the programmer which production was generated.
    @callback.call(production, :'?', position)
  end

  # Add the new symbol to the list of nonterminals.
  @nonterms[new_symbol] = true

  new_symbol
end
311
+
# Builds productions used to eliminate the * EBNF operator. Returns the
# generated symbol, which is the downcased name of _symbol_ with
# '_star' appended. The productions are only created the first time the
# generated symbol is requested.
def get_star(symbol)
  new_symbol = "#{symbol.to_s.downcase}_star".to_sym
  return new_symbol if @productions_sym.has_key?(new_symbol)

  # Add the items for the following productions:
  #
  # token_star: | token token_star
  #
  # The empty production comes first.
  [[:first, []], [:second, [symbol, new_symbol]]].each do |position, rhs|
    production = Production.new(next_id, new_symbol, rhs)
    add_production(production)

    # Tell the programmer which production was generated.
    @callback.call(production, :*, position)
  end

  # Add the new symbol to the list of nonterminals.
  @nonterms[new_symbol] = true

  new_symbol
end
335
+
# Returns the ID for the next production to be defined. Each call
# advances the internal production counter by one and returns the new
# value.
def next_id
  @production_counter = @production_counter + 1
end
340
+
# Returns all of the non-terminal symbols used in the grammar's
# definition, as a new array.
def nonterms
  @nonterms.each_key.to_a
end
346
+
# Builds a new production with the left-hand side value of _symbol_.
# If _expression_ is specified it is taken as the right-hand side of
# the production. If _expression_ is nil then _block_ is evaluated in
# the context of this object, and is expected to make one or more calls
# to CFG.clause.
#
# Returns a copy of the list of productions collected during the call.
# The current left-hand side is always cleared again, even when the
# clause or the block raises, so a failed definition cannot leave a
# stale left-hand side behind for later CFG.clause calls.
def production(symbol, expression = nil, &block)
  @production_buffer = []
  @curr_lhs = symbol

  begin
    if expression
      clause(expression)
    else
      instance_exec(&block)
    end
  ensure
    @curr_lhs = nil
  end

  @production_buffer.clone
end
364
+
# If _by_ is :sym, returns a hash of the grammar's productions, using
# the productions' left-hand side symbol as the key. If _by_ is :id,
# returns the hash that maps each production's ID to the production.
# Any other value yields nil.
def productions(by = :sym)
  case by
  when :sym then @productions_sym
  when :id  then @productions_id
  end
end
378
+
# Sets the start symbol for this grammar. Raises a GrammarError, and
# leaves the current start symbol untouched, when _symbol_ is not a
# non-terminal.
def start(symbol)
  raise GrammarError, 'Start symbol must be a non-terminal.' unless CFG::is_nonterminal?(symbol)

  @start_symbol = symbol
end
387
+
# Returns a list of symbols encountered in the grammar's definition:
# the terminals followed by the non-terminals.
def symbols
  terms.dup.concat(nonterms)
end
392
+
# Returns a list of all terminal symbols encountered in the grammar's
# definition, as a new array.
def terms
  @terms.each_key.to_a
end
398
+
# Oddly enough, the Production class represents a production in a
# context-free grammar.
class Production
  # The production's ID, its left-hand side symbol, and the array of
  # symbols that make up its right-hand side.
  attr_reader :id
  attr_reader :lhs
  attr_reader :rhs

  # Instantiates a new Production object with the specified ID,
  # and left- and right-hand sides.
  def initialize(id, lhs, rhs)
    @id  = id
    @lhs = lhs
    @rhs = rhs
  end

  # Compares one production to another. Returns true only if the
  # left- and right-hand sides match; the IDs are not compared.
  # Objects that do not expose both sides (nil, for example) are
  # never equal to a production.
  def ==(other)
    other.respond_to?(:lhs) && other.respond_to?(:rhs) &&
      lhs == other.lhs && rhs == other.rhs
  end

  # Makes a new copy of the production. The right-hand side array is
  # cloned so the copy does not share it with this production.
  def copy
    Production.new(@id, @lhs, @rhs.clone)
  end

  # Locates the last terminal in the right-hand side of a
  # production. Returns nil if the right-hand side has no terminal.
  def last_terminal
    @rhs.reverse_each.find { |sym| CFG::is_terminal?(sym) }
  end

  # Returns a new Item based on this production, with its dot at
  # index 0. The item is given this production's right-hand side array
  # itself, not a copy.
  def to_item
    Item.new(0, @id, @lhs, @rhs)
  end

  # Returns a string representation of this production. The left-hand
  # side is left-justified in a field _padding_ characters wide.
  def to_s(padding = 0)
    "#{format("%-#{padding}s", @lhs)} -> #{@rhs.map(&:to_s).join(' ')}"
  end
end
441
+
# The Item class represents a CFG production with dot in it.
class Item < Production
  # Index of the right-hand side symbol that the dot sits in front of.
  attr_reader :dot

  # Instantiates a new Item object with a dot located before the
  # symbol at index _dot_ of the right-hand side. The remaining
  # arguments (_args_) should be as specified by
  # Production.initialize.
  def initialize(dot, *args)
    super(*args)

    # The Dot indicates the NEXT symbol to be read.
    @dot = dot
  end

  # Compares two items. They are equal when the dot position and both
  # sides match; the IDs are not compared. Objects that do not expose a
  # dot and both sides (a plain Production or nil, for example) are
  # never equal to an item.
  def ==(other)
    [:dot, :lhs, :rhs].all? { |reader| other.respond_to?(reader) } &&
      dot == other.dot && lhs == other.lhs && rhs == other.rhs
  end

  # Moves the item's dot forward by one if the end of the right-hand
  # side hasn't already been reached. Returns the new dot position, or
  # nil if the dot was not moved.
  def advance
    @dot += 1 if @dot < @rhs.length
  end

  # Tests to see if the dot is at the end of the right-hand side.
  def at_end?
    @dot == @rhs.length
  end

  # Produces a new copy of this item. The right-hand side array is
  # cloned so the copy does not share it with this item.
  def copy
    Item.new(@dot, @id, @lhs, @rhs.clone)
  end

  # Returns the symbol located after the dot, or nil when the dot is at
  # the end of the right-hand side.
  def next_symbol
    @rhs[@dot]
  end

  # Returns a string representation of this item, with a '·' marking
  # the dot's position. The left-hand side is left-justified in a field
  # _padding_ characters wide.
  def to_s(padding = 0)
    "#{format("%-#{padding}s", @lhs)} -> #{@rhs.map(&:to_s).insert(@dot, '·').join(' ')}"
  end
end
490
+ end
491
+ end